mirror of
https://github.com/Xevion/the-office.git
synced 2025-12-10 10:08:57 -06:00
finish off "download once" protocol, ignore .html files in data
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -7,6 +7,7 @@ keys.json
|
||||
process.py
|
||||
app.db-journal
|
||||
test.html
|
||||
app/data/*.html
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
|
||||
@@ -90,25 +90,32 @@ class Episode(db.Model):
|
||||
) # sections of quotes under this episode
|
||||
|
||||
@property
|
||||
def links(self):
|
||||
def link(self):
|
||||
return f"http://officequotes.net/no{self.season_id}-{str(self.number).zfill(2)}.php"
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return os.path.join('app', 'data', f'{self.season_id}-{self.number}')
|
||||
return os.path.join('app', 'data', f'{self.season_id}-{self.number}.html')
|
||||
|
||||
@property
|
||||
def downloaded(self):
|
||||
return os.path.exists(self.path)
|
||||
|
||||
def download(self):
|
||||
def download(self, force=False):
|
||||
"""downloads data"""
|
||||
data = requests.get(link).text
|
||||
open(self.path, "w+", encoding="utf-8").write(data)
|
||||
print(f'Downloading e{self.number}/s{self.season_id} from {self.link}')
|
||||
if not self.downloaded or force:
|
||||
data = requests.get(self.link).text
|
||||
open(self.path, "w+", encoding="utf-8").write(data)
|
||||
|
||||
@property
|
||||
def data(self):
|
||||
return open(self.path, 'r', encoding="utf-8").read()
|
||||
|
||||
def build(self):
|
||||
"""downloads, processes, and automatically creates Sections and Quotes"""
|
||||
soup = BeautifulSoup(data, "html.parser")
|
||||
self.download()
|
||||
soup = BeautifulSoup(self.data, "html.parser")
|
||||
|
||||
sections = soup.find_all(attrs={"class": "quote"})
|
||||
deleted = 0
|
||||
|
||||
Reference in New Issue
Block a user