Files
the-office/app/models.py
2020-03-10 18:25:24 -05:00

302 lines
10 KiB
Python

import requests
import json
import os
from bs4 import BeautifulSoup
from app import db, login
# Episode counts per season, indexed by season number.
# Index 0 is for Webisodes.
episodes = [5, 6, 22, 23, 14, 26, 24, 24, 24, 23]

# Regex for a raw script line of the form "<speaker>:<text>"; group 1 is the
# speaker and group 2 is the text.
# NOTE(review): inside the character class, `,-\[` forms a range from ',' to
# '[' (which includes ':'), so the hyphen is not a literal here - confirm
# whether that was intended before using this pattern.
quotePattern = r"([\w\s\.\',-\[\]\d&\"#]+):(.+)"

# Episode titles, loaded once at import time from the static JSON file.
with open(
    os.path.join('app', 'static', 'titles.json'), 'r', encoding="utf-8"
) as file:
    titles = json.load(file)
class Season(db.Model):
    """
    represents a Season of the show; its primary key doubles as the season
    number (0-9) and it owns the Episode rows whose season_id points at it
    """

    id = db.Column(db.Integer, primary_key=True)
    episodes = db.relationship("Episode", backref="season", lazy="dynamic")

    def __init__(self, **kwargs):
        """
        creates a Season, checking that the `id` keyword is within 0-9 inclusive
        """
        assert 0 <= kwargs.get("id") <= 9, "Season ID must be 0-9 inclusive"
        super(Season, self).__init__(**kwargs)

    def build(self, rebuild=False):
        """
        runs build operations on every Episode under this season

        Episodes missing from the database are created and built. Episodes
        that already exist are only built again when `rebuild` is True.
        """
        print(f"Running build() on Season {self.id}")
        # `episodes` here is the module-level list of episode counts.
        for episode in range(1, episodes[self.id] + 1):
            ep = Episode.query.filter_by(season_id=self.id, number=episode).first()
            if ep is None:
                # Add the episode, then build
                print(f"Creating new Episode, Season {self.id}, Episode {episode}")
                ep = Episode(season_id=self.id, number=episode)
                db.session.add(ep)
                # Committing early on purpose: the Episode may need to exist
                # in the database while it is being built.
                db.session.commit()
                ep.build()
            elif rebuild:
                print(f"Rebuilding Season {self.id}, Episode {episode}")
                ep.build()

    def download(self, force=False):
        """
        downloads the raw data of every Episode under this season

        `force` is passed through to Episode.download(), which re-downloads
        even when the raw file already exists.
        """
        season_episodes = Episode.query.filter_by(season_id=self.id).all()
        for ep in season_episodes:
            # Previously this called ep.rebuild() and ignored `force`.
            ep.download(force=force)

    @staticmethod
    def create_all(build=True):
        """
        creates new Season objects (ids 1-9) and runs build() on them

        Seasons already present in the database are left untouched. A single
        commit is issued after the loop.
        """
        for i in range(1, 10):
            if Season.query.get(i) is None:
                s = Season(id=i)
                db.session.add(s)
                # Build only the season created in this iteration, so `s`
                # can never be unbound or left over from an earlier one.
                if build:
                    s.build()
        db.session.commit()

    @staticmethod
    def rebuild_all():
        """
        Runs .build(rebuild=True) on all Season objects in database
        """
        for season in Season.query.all():
            # Season has no rebuild() method; build(rebuild=True) is the
            # operation that re-runs the build on existing episodes.
            season.build(rebuild=True)

    @property
    def characters(self, sort=False):
        """
        intended to return a List of all characters in this Season

        Not implemented yet: currently evaluates to None. `sort` has a default
        because a property getter is only ever called with `self`; without it
        every access raised TypeError.
        """
        pass
class Episode(db.Model):
    """
    represents a Episode with underlying Sections (representing a specific cutscene in the show)
    also has some other attributes useful for identify the episode and displaying, as well as countless methods
    aimed at providing easy to access information using the database collection
    """

    id = db.Column(
        db.Integer, primary_key=True
    )  # arbitrary ID, should NOT be relied on to determine episode number or correlating season
    number = db.Column(db.Integer)  # episode number
    title = db.Column(db.String(32))
    season_id = db.Column(
        db.Integer, db.ForeignKey("season.id")
    )  # correlating season number
    built = db.Column(db.Boolean, default=False)
    sections = db.relationship(
        "Section", backref="episode", lazy="dynamic"
    )  # sections of quotes under this episode

    @property
    def link(self):
        """
        returns the officequotes.net URL for this episode (episode number zero-padded to 2 digits)
        """
        return f"http://officequotes.net/no{self.season_id}-{str(self.number).zfill(2)}.php"

    @property
    def HTMLpath(self):
        """
        returns the path of the raw HTML file for this episode
        """
        return os.path.join("app", "data", "raw", f"{self.season_id}-{self.number}.html")

    @property
    def HTMLdata(self):
        """
        returns the contents of the raw HTML file for this episode
        """
        # Context manager so the file handle is closed after reading.
        with open(self.HTMLpath, "r", encoding="utf-8") as html_file:
            return html_file.read()

    @property
    def JSONpath(self):
        """
        returns the path for the JSON file with data for this episode
        @return: a path
        """
        return os.path.join("app", "data", "preprocess", f"{self.season_id}-{self.number}.json")

    @property
    def JSONdata(self):
        """
        Returns the raw (unparsed) JSON text for this episode
        """
        # Context manager so the file handle is closed after reading.
        with open(self.JSONpath, "r", encoding="utf-8") as json_file:
            return json_file.read()

    @property
    def downloaded(self):
        """
        Checks whether the raw episode script data has been downloaded.
        @return: boolean stating the existence (and thus likely properly downloaded) of raw data
        """
        return os.path.exists(self.HTMLpath)

    def download(self, force=False):
        """
        downloads the raw HTML for this episode and writes it to HTMLpath

        Nothing happens when the file already exists, unless `force` is True.
        """
        if not self.downloaded or force:
            print(f"Downloading e{self.number}/s{self.season_id} from {self.link}")
            data = requests.get(self.link).text
            # Context manager so the data is flushed and the handle closed.
            with open(self.HTMLpath, "w+", encoding="utf-8") as html_file:
                html_file.write(data)

    def preprocess(self):
        """
        Runs pre-processing on this Episode, which creates and automatically builds a JSON file full of the data
        required to create a Episode properly, right before the Developer edits a episode and then enters it into the
        database as a full fledged 'processed' episode.

        The JSON written to JSONpath is a list with one entry per kept section, each a dict with the keys
        'isNewpeat', 'isDeleted' and 'quotes'.
        """
        print(f'Pre-processing data for {self}')
        print(f'Rebuilding s{self.season_id} e{self.number}')
        self.download()
        # The raw markup lives in HTMLdata; the model has no `data` attribute.
        soup = BeautifulSoup(self.HTMLdata, "html.parser")
        sections = soup.find_all(attrs={"class": "quote"})
        root = []
        for section in sections:
            isNewpeat = False
            # NOTE(review): this check is case-sensitive, while Section.build()
            # lower-cases before testing for "deleted scene" - confirm which
            # casing the source pages actually use.
            isDeleted = "deleted scene" in section.text
            quotes = []
            for quote in section.find_all("b"):
                if "Newpeat" in quote.string:
                    quote = quote.next_sibling
                    isNewpeat = True
                # No None guard here: a node without a string or without a
                # next sibling raises instead of being skipped.
                quotes.append(quote.string + quote.next_sibling.string)
            if len(quotes) == 0:
                print(f"Section found with Zero quotes. Newpeat: {isNewpeat} Deleted: {isDeleted}")
                # An empty section is only kept when it is a newpeat/deleted marker.
                if not (isNewpeat or isDeleted):
                    continue
            sectionData = {'isNewpeat' : isNewpeat, 'isDeleted' : isDeleted, 'quotes' : quotes}
            root.append(sectionData)
        with open(self.JSONpath, 'w+', encoding='utf-8') as file:
            json.dump(root, file)

    def build(self):
        """
        Marks this episode as built, sets its title from the module-level `titles` data and commits

        Sections and Quotes are not created here.
        """
        self.built = True
        self.title = titles[self.season_id][self.number - 1]
        print(self.title)
        db.session.commit()

    def rebuild(self):
        """
        Clears all sections of this episode, then runs build() again
        """
        print(f'Rebuilding s{self.season_id} e{self.number}')
        self.clear()
        self.build()

    def clear(self):
        """delete all sections relevant to this episode in order to reprocess"""
        sections = Section.query.filter_by(episode_id=self.id).all()
        # Was `len(sections > 0)`, which compares a list with an int and raises TypeError.
        if sections:
            print(f"Clearing {len(sections)} Sections of Ep {self.number} Season {self.season_id}")
            for section in sections:
                # Defer the commit so all deletions go through in one transaction.
                section.clear(commit=False, delete=True)
            self.built = False
            db.session.commit()
        else:
            # The f prefix was missing, so the placeholders were printed literally.
            print(f'No sections for this episode (s{self.season_id}/e{self.number}) could be found.')

    @staticmethod
    def clear_all():
        """
        Runs clear() on every episode in the database
        """
        print('Clearing all episodes in database.')
        for episode in Episode.query.all():
            episode.clear()

    def __repr__(self):
        """returns a debug string with the id, season, episode number and section count"""
        sections = len(Section.query.filter_by(episode_id=self.id).all())
        return f"Episode(id={self.id} s={self.season_id} ep={self.number} sects=[{sections}...])"
class Section(db.Model):
    """represents a Section of Quotes, a specific scene with relevant dialog"""

    id = db.Column(db.Integer, primary_key=True)
    episode_id = db.Column(db.Integer, db.ForeignKey("episode.id"))
    deleted = db.Column(db.Integer, default=-1)
    newpeat = db.Column(db.Boolean, default=False)
    quotes = db.relationship("Quote", backref="section", lazy="dynamic")

    def build(self, quotes, commit=False, reset=False):
        """
        Given an List of unformatted script quotes, automatically creates Quotes assigned to this Section

        Each quote is split at its first colon into speaker and text, both
        stripped of surrounding whitespace. A quote without a colon is stored
        with an empty speaker and the whole line as its text.

        `commit` commits the session once all Quotes are added.
        `reset` is accepted but currently unused.
        Raises Exception when a quote starts with "deleted scene" (any casing).
        """
        for i, quote in enumerate(quotes):
            if quote.lower().startswith("deleted scene"):
                raise Exception(
                    f'Deleted Scene Quote passed to Section Builder: "{quote}"'
                )
            # partition() reports a missing colon explicitly. The former
            # find(":") returned -1 there, which mangled both fields.
            speaker, colon, text = quote.partition(":")
            if not colon:
                speaker, text = "", quote
            q = Quote(
                section=self,
                speaker=speaker.strip(),
                text=text.strip(),
                section_index=i,
            )
            db.session.add(q)
        if commit:
            db.session.commit()

    def clear(self, doprint=False, commit=True, delete=False):
        """
        Delete all quotes relevant to this section.

        `doprint` prints how many quotes are removed, `delete` also deletes
        this Section itself, and `commit` commits the session afterwards.
        """
        quotes = Quote.query.filter_by(section_id=self.id).all()
        if doprint:
            print(f"Clearing {len(quotes)} quotes from Section ID {self.id}")
        for quote in quotes:
            db.session.delete(quote)
        if delete:
            db.session.delete(self)
        if commit:
            db.session.commit()

    def __repr__(self):
        """returns a debug string with the id, season/episode ids and quote count"""
        episode = Episode.query.get(self.episode_id)
        # Report the season of the parent episode. This used to read the
        # episode's own id, and raised when the episode row was missing.
        season = episode.season_id if episode is not None else None
        quotes = len(Quote.query.filter_by(section_id=self.id).all())
        return f"Section(id={self.id} S-EP={season}/{self.episode_id} quotes=[{quotes}...])"
class Quote(db.Model):
    """represents a specific quote by a specific speaker"""

    id = db.Column(db.Integer, primary_key=True)
    section_id = db.Column(
        db.Integer, db.ForeignKey("section.id")
    )  # The section this quote belongs to.
    speaker = db.Column(db.String(32))  # The name of a character
    text = db.Column(
        db.String(512)
    )  # The content of the Quote. Usually a sentence, sometimes more.
    section_index = db.Column(db.Integer)  # The index of this quote in the section

    def __repr__(self):
        """returns a debug string with the speaker and at most the first 50 characters of the text"""
        # A quote with no text yet is shown as empty instead of raising.
        text = self.text or ""
        # The ellipsis appears exactly when the slice below drops characters.
        # The old `> 51` test missed 51-character texts.
        suffix = "..." if len(text) > 50 else ""
        return f"Quote(speaker='{self.speaker}' text='{text[:50]}{suffix}')"