More testing on Episode preprocessing method, getting close to figuring out how to proceed.

This commit is contained in:
Xevion
2020-03-10 19:05:59 -05:00
parent d915a862d6
commit 0dcf1ccd1e

View File

@@ -21,16 +21,30 @@ quotePattern = r"([\w\s\.\',-\[\]\d&\"#]+):(.+)"
with open(os.path.join('app', 'static', 'titles.json'), 'r', encoding="utf-8") as file: with open(os.path.join('app', 'static', 'titles.json'), 'r', encoding="utf-8") as file:
titles = json.load(file) titles = json.load(file)
class Season(db.Model): class Season(db.Model):
"""
Represents a complete season of The Office, complete with a variable number of Episode objects.
As a Database Object, it can be queried to attain all active instantiated Season objects.
"""
id = db.Column(db.Integer, primary_key=True) id = db.Column(db.Integer, primary_key=True)
episodes = db.relationship("Episode", backref="season", lazy="dynamic") episodes = db.relationship("Episode", backref="season", lazy="dynamic")
def __init__(self, **kwargs): def __init__(self, **kwargs):
"""
Instantiates a Season object.
:param kwargs: Requires an `id` parameter 0-9 inclusive, plus any relevant SQLAlchemy database arguments.
"""
assert 0 <= kwargs.get("id") <= 9, "Season ID must be 0-9 inclusive" assert 0 <= kwargs.get("id") <= 9, "Season ID must be 0-9 inclusive"
super(Season, self).__init__(**kwargs) super(Season, self).__init__(**kwargs)
def build(self, rebuild=False): def build(self, rebuild=False):
"""runs build operations on every Episode under this season""" """
:param rebuild:
"""
print(f"Running build() on Season {self.id}") print(f"Running build() on Season {self.id}")
for episode in range(1, episodes[self.id] + 1): for episode in range(1, episodes[self.id] + 1):
ep = Episode.query.filter_by(season_id=self.id, number=episode).first() ep = Episode.query.filter_by(season_id=self.id, number=episode).first()
@@ -46,7 +60,6 @@ class Season(db.Model):
if rebuild: if rebuild:
print(f"Rebuilding Season {self.id}, Episode {episode}") print(f"Rebuilding Season {self.id}, Episode {episode}")
ep.build() ep.build()
pass
def download(self, force=False): def download(self, force=False):
episodes = Episode.query.filter_by(season_id=self.id).all() episodes = Episode.query.filter_by(season_id=self.id).all()
@@ -143,6 +156,11 @@ class Episode(db.Model):
data = requests.get(self.link).text data = requests.get(self.link).text
open(self.HTMLpath, "w+", encoding="utf-8").write(data) open(self.HTMLpath, "w+", encoding="utf-8").write(data)
@staticmethod
def test():
    """
    Debugging helper: runs preprocess() on the first Episode returned by the database.

    :raises IndexError: if the query returns no Episode rows.
    """
    # .first() fetches a single row instead of loading every Episode just to index [0].
    e = Episode.query.first()
    if e is None:
        # Same exception type the original `.all()[0]` raised on an empty result, now with a message.
        raise IndexError("No Episode rows available to preprocess")
    e.preprocess()
def preprocess(self): def preprocess(self):
""" """
Runs pre-processing on this Episode, which creates and automatically builds a JSON file full of the data Runs pre-processing on this Episode, which creates and automatically builds a JSON file full of the data
@@ -153,7 +171,7 @@ class Episode(db.Model):
print(f'Rebuilding s{self.season_id} e{self.number}') print(f'Rebuilding s{self.season_id} e{self.number}')
self.download() self.download()
soup = BeautifulSoup(self.data, "html.parser") soup = BeautifulSoup(self.HTMLdata, "html.parser")
sections = soup.find_all(attrs={"class": "quote"}) sections = soup.find_all(attrs={"class": "quote"})
deleted = 0 deleted = 0
@@ -161,32 +179,37 @@ class Episode(db.Model):
for section in sections: for section in sections:
isNewpeat = False isNewpeat = False
isDeleted = "deleted scene" in section.text isDeleted = "deleted scene" in section.text.lower()
if isDeleted:
print(section)
if isDeleted: if isDeleted:
deleted += 1 deleted += 1
quotes = [] quotes = []
for quote in section.find_all("b"): if not isDeleted:
if "Newpeat" in quote.string: for quote in section.find_all("b"):
quote = quote.next_sibling if "Newpeat" in quote.string:
isNewpeat = True quote = quote.next_sibling
# if quote is None or quote.next_sibling is None: isNewpeat = True
# print("Quote is None or next sibling is None") # if quote is None or quote.next_sibling is None:
# continue # print("Quote is None or next sibling is None")
quotes.append(quote.string + quote.next_sibling.string) # continue
quotes.append(quote.string + quote.next_sibling.string)
else:
paragraph = section.parent.find_all("p")[-1]
for quote in paragraph.find_all("b"):
quotes.append(quote.string + quote.next_sibling.string)
if len(quotes) == 0: if len(quotes) == 0:
print(f"Section found with Zero quotes. Newpeat: {isNewpeat} Deleted: {isDeleted}") print(f"Section found with Zero quotes. Newpeat: {isNewpeat} Deleted: {isDeleted}")
if not (isNewpeat or isDeleted): if not (isNewpeat or isDeleted):
continue continue
sectionData = {'isNewpeat' : isNewpeat, 'isDeleted' : isDeleted, 'quotes' : quotes} sectionData = {'isNewpeat': isNewpeat, 'isDeleted': isDeleted, 'quotes': quotes}
root.append(sectionData) root.append(sectionData)
with open(self.JSONpath, 'w+', encoding='utf-8') as file: with open(self.JSONpath, 'w+', encoding='utf-8') as file:
json.dump(root, file) json.dump(root, file, indent=4)
def build(self): def build(self):
""" """
@@ -257,7 +280,7 @@ class Section(db.Model):
q = Quote( q = Quote(
section=self, section=self,
speaker=quote[:mark], speaker=quote[:mark],
text=quote[mark + 1 :], text=quote[mark + 1:],
section_index=i, section_index=i,
) )
db.session.add(q) db.session.add(q)