From edd5b49dfec62fa918bcd7c78ca2e942d5851aa3 Mon Sep 17 00:00:00 2001 From: Xevion Date: Mon, 9 Mar 2020 02:53:51 -0500 Subject: [PATCH] finish off "download once" protocol, ignore .html files in data --- .gitignore | 1 + app/models.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 288b7e9..1d00bab 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ keys.json process.py app.db-journal test.html +app/data/*.html # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/app/models.py b/app/models.py index 69f6b44..6d0debb 100644 --- a/app/models.py +++ b/app/models.py @@ -90,25 +90,32 @@ class Episode(db.Model): ) # sections of quotes under this episode @property - def links(self): + def link(self): return f"http://officequotes.net/no{self.season_id}-{str(self.number).zfill(2)}.php" @property def path(self): - return os.path.join('app', 'data', f'{self.season_id}-{self.number}') + return os.path.join('app', 'data', f'{self.season_id}-{self.number}.html') @property def downloaded(self): return os.path.exists(self.path) - def download(self): + def download(self, force=False): """downloads data""" - data = requests.get(link).text - open(self.path, "w+", encoding="utf-8").write(data) + print(f'Downloading e{self.number}/s{self.season_id} from {self.link}') + if not self.downloaded or force: + data = requests.get(self.link).text + open(self.path, "w+", encoding="utf-8").write(data) + + @property + def data(self): + return open(self.path, 'r', encoding="utf-8").read() def build(self): """downloads, processes, and automatically creates Sections and Quotes""" - soup = BeautifulSoup(data, "html.parser") + self.download() + soup = BeautifulSoup(self.data, "html.parser") sections = soup.find_all(attrs={"class": "quote"}) deleted = 0