# the-office/server/process.py
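"""Build quote data for The Office.

Fetches episode transcripts from officequotes.net, flattens them to raw
text, parses 'speaker|text' lines into per-episode JSON, and assembles
combined JSON files for search indexing and the site.
"""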
import json
import os
import re
import traceback
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
s = requests.Session()  # reuse one HTTP connection across downloads
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, 'data')  # BASE_DIR is already the server/ directory
def get_raw(season, episode):
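    """Fetch one episode's page (cached as HTML on disk) and flatten its
    quote sections into a raw text file, one section per '-'-separated block."""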
    html_filename = f'{season}-{str(episode).zfill(2)}.html'
    html_filepath = os.path.join(DATA_DIR, 'html', html_filename)
    # If the .html file is already cached on disk, read it
    if os.path.exists(html_filepath):
        with open(html_filepath, 'r', encoding='utf-8') as file:
            page_data = file.read()
    # Otherwise download it and cache it for later use
    else:
        link = f"http://officequotes.net/no{season}-{str(episode).zfill(2)}.php"
        resp = s.get(link)
        if resp.ok:
            page_data = resp.text
            with open(html_filepath, 'w', encoding='utf-8') as file:
                file.write(page_data)
        else:
            raise Exception(f'HTTPError: {resp.status_code} at "{resp.url}"')
    soup = BeautifulSoup(page_data, "html.parser")
    data = []
    sections = soup.find_all(attrs={"class": "quote"})
    for section in sections:
        # Turn <br> tags into newlines so get_text() preserves line breaks
        for br in section.find_all('br'):
            br.replace_with("\n" + br.text)
        for line in section.get_text().split('\n'):
            data.append(line.strip())
        data.append('-')  # separator between quote sections
    if data:
        data.pop()  # drop the trailing separator
    with open(os.path.join(DATA_DIR, 'raw', f'{season}-{str(episode).zfill(2)}.txt'), 'w',
              encoding='utf-8') as file:
        file.write('\n'.join(data))
def episodes():
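    """Yield (season, episode) number pairs for all nine seasons."""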
    # Episode counts per season, seasons 1-9
    ep_nums = [6, 22, 23, 14, 26, 24, 24, 24, 23]
    for season_num, ep_count in enumerate(ep_nums, start=1):
        for episode_num in range(1, ep_count + 1):
            yield season_num, episode_num
def download_all_raw():
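    """Fetch and flatten every episode, logging any failures."""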
    for season_num, episode_num in episodes():
        print(f'{season_num}-{str(episode_num).zfill(2)}')
        try:
            get_raw(season_num, episode_num)
        except Exception as exception:
            print(f'Failed to process Season {season_num} Episode {episode_num} - ({type(exception).__name__})')
            traceback.print_exc()
def process(season, episode):
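    """Parse an episode's raw text ('speaker|text' lines, sections separated
    by '-') into processed JSON, and print some summary statistics."""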
    with open(os.path.join(DATA_DIR, 'raw', f'{season}-{str(episode).zfill(2)}.txt'), 'r',
              encoding='utf-8') as file:
        raw_text = file.read()
    sections = []
    # Scene sections are separated by lines starting with '-'
    for raw_section in re.split('^-', raw_text, flags=re.MULTILINE):
        section = {
            'quotes': []
        }
        section_data = list(raw_section.strip().split('\n'))
        # A leading '!N' line marks the section as deleted scene N
        if section_data[0].startswith('!'):
            section['deleted'] = int(re.search(r'!(\d+)', section_data.pop(0)).group(1))
        for q in section_data:
            if '|' not in q:
                continue  # skip blank or unannotated lines
            quote = q.split('|', 1)
            section['quotes'].append(
                {
                    'speaker': quote[0],
                    'text': quote[1]
                }
            )
        sections.append(section)
    with open(os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json'), 'w',
              encoding='utf-8') as file:
        json.dump(sections, file, indent=4, ensure_ascii=False)
    deleted_sections = 0
    deleted_ids = set()
    quote_count = 0
    speakers = set()
    for section in sections:
        quote_count += len(section['quotes'])
        if 'deleted' in section:
            deleted_sections += 1
            deleted_ids.add(section['deleted'])
        for quote in section['quotes']:
            speakers.add(quote['speaker'])
    print(f'{quote_count} quotes.')
    print(f'{deleted_sections} deleted sections, {len(deleted_ids)} unique deleted-scene ids.')
    print(f'{len(speakers)} Speakers:')
    print(', '.join(speakers))
def generate_algolia():
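    """Flatten all processed episodes into one list of quote records and
    write it to algolia.json for search indexing."""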
    data = []
    quote_num = 0
    for season, episode in episodes():
        try:
            with open(os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json'), 'r',
                      encoding='utf-8') as file:
                episode_data = json.load(file)
        except FileNotFoundError:
            print(f'No JSON data for Season {season} Episode {episode}')
        else:
            for section_num, section in enumerate(episode_data, start=1):
                for quote in section['quotes']:
                    # Tag each quote with enough context to stand alone in the index
                    quote_num += 1
                    quote['quote'] = quote_num
                    quote['section'] = section_num
                    quote['episode'] = episode
                    quote['season'] = season
                    quote['is_deleted'] = 'deleted' in section
                    quote['deleted_section'] = section.get('deleted')
                    data.append(quote)
    with open(os.path.join(DATA_DIR, 'algolia.json'), 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
def get_episode_scenes(season, episode):
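    """Return the processed scene list for an episode, or None if missing."""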
    filepath = os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json')
    if os.path.exists(filepath):
        with open(filepath, 'r', encoding='utf-8') as file:
            return json.load(file)
    else:
        return None
def get_characters(season, episode):
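    """Return an episode's characters with quote counts, most quotes first."""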
    scenes = get_episode_scenes(season, episode)
    if scenes is None:
        return None
    characters = defaultdict(int)
    for scene in scenes:
        for quote in scene['quotes']:
            characters[quote['speaker']] += 1
    # Slugify the name ('Michael Scott' -> 'michael-scott') for use as an id
    characters = [{'name': character, 'appearances': appearances, 'id': character.replace(' ', '-').lower()}
                  for character, appearances in characters.items()]
    return sorted(characters, key=lambda item: item['appearances'], reverse=True)
def generate_final():
"""Merge episode descriptions/titles and quotes into final JSON file."""
    with open(os.path.join(DATA_DIR, 'descriptions.json'), 'r', encoding='utf-8') as file:
        data = json.load(file)
    output = []
    for season_id, season in enumerate(data, start=1):
        output.append({
            'season_id': season_id,
            'episodes': [
                {
                    'title': episode['title'].strip(),
                    'description': episode['description'].strip(),
                    'episode_id': episode_id,
                    'characters': get_characters(season_id, episode_id),
                    'scenes': get_episode_scenes(season_id, episode_id)
                }
                for episode_id, episode in enumerate(season, start=1)
            ]
        })
    with open(os.path.join(DATA_DIR, 'data.json'), 'w', encoding='utf-8') as file:
        json.dump(output, file, ensure_ascii=False, indent=4)
if __name__ == '__main__':
    # generate_algolia()
    # process(3, 10)
    generate_final()