mirror of
https://github.com/Xevion/the-office.git
synced 2025-12-11 10:08:57 -06:00
finish off "download once" protocol, ignore .html files in data
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -7,6 +7,7 @@ keys.json
|
|||||||
process.py
|
process.py
|
||||||
app.db-journal
|
app.db-journal
|
||||||
test.html
|
test.html
|
||||||
|
app/data/*.html
|
||||||
|
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
|
|||||||
@@ -90,25 +90,32 @@ class Episode(db.Model):
|
|||||||
) # sections of quotes under this episode
|
) # sections of quotes under this episode
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def links(self):
|
def link(self):
|
||||||
return f"http://officequotes.net/no{self.season_id}-{str(self.number).zfill(2)}.php"
|
return f"http://officequotes.net/no{self.season_id}-{str(self.number).zfill(2)}.php"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def path(self):
|
def path(self):
|
||||||
return os.path.join('app', 'data', f'{self.season_id}-{self.number}')
|
return os.path.join('app', 'data', f'{self.season_id}-{self.number}.html')
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def downloaded(self):
|
def downloaded(self):
|
||||||
return os.path.exists(self.path)
|
return os.path.exists(self.path)
|
||||||
|
|
||||||
def download(self):
|
def download(self, force=False):
|
||||||
"""downloads data"""
|
"""downloads data"""
|
||||||
data = requests.get(link).text
|
print(f'Downloading e{self.number}/s{self.season_id} from {self.link}')
|
||||||
open(self.path, "w+", encoding="utf-8").write(data)
|
if not self.downloaded or force:
|
||||||
|
data = requests.get(self.link).text
|
||||||
|
open(self.path, "w+", encoding="utf-8").write(data)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def data(self):
|
||||||
|
return open(self.path, 'r', encoding="utf-8").read()
|
||||||
|
|
||||||
def build(self):
|
def build(self):
|
||||||
"""downloads, processes, and automatically creates Sections and Quotes"""
|
"""downloads, processes, and automatically creates Sections and Quotes"""
|
||||||
soup = BeautifulSoup(data, "html.parser")
|
self.download()
|
||||||
|
soup = BeautifulSoup(self.data, "html.parser")
|
||||||
|
|
||||||
sections = soup.find_all(attrs={"class": "quote"})
|
sections = soup.find_all(attrs={"class": "quote"})
|
||||||
deleted = 0
|
deleted = 0
|
||||||
|
|||||||
Reference in New Issue
Block a user