finish off "download once" protocol, ignore .html files in data

This commit is contained in:
Xevion
2020-03-09 02:53:51 -05:00
parent b4d2922163
commit edd5b49dfe
2 changed files with 14 additions and 6 deletions

1
.gitignore vendored
View File

@@ -7,6 +7,7 @@ keys.json
process.py
app.db-journal
test.html
app/data/*.html
# Byte-compiled / optimized / DLL files
__pycache__/

View File

@@ -90,25 +90,32 @@ class Episode(db.Model):
) # sections of quotes under this episode
@property
def links(self):
def link(self):
return f"http://officequotes.net/no{self.season_id}-{str(self.number).zfill(2)}.php"
@property
def path(self):
return os.path.join('app', 'data', f'{self.season_id}-{self.number}')
return os.path.join('app', 'data', f'{self.season_id}-{self.number}.html')
@property
def downloaded(self):
return os.path.exists(self.path)
def download(self):
def download(self, force=False):
"""downloads data"""
data = requests.get(link).text
open(self.path, "w+", encoding="utf-8").write(data)
print(f'Downloading e{self.number}/s{self.season_id} from {self.link}')
if not self.downloaded or force:
data = requests.get(self.link).text
open(self.path, "w+", encoding="utf-8").write(data)
@property
def data(self):
return open(self.path, 'r', encoding="utf-8").read()
def build(self):
"""downloads, processes, and automatically creates Sections and Quotes"""
soup = BeautifulSoup(data, "html.parser")
self.download()
soup = BeautifulSoup(self.data, "html.parser")
sections = soup.find_all(attrs={"class": "quote"})
deleted = 0