mirror of
https://github.com/Xevion/the-office.git
synced 2025-12-09 16:08:50 -06:00
finish preprocess, build algolia, build final commands, update README, remove obsolete code in process.py
42 README.md
@@ -46,10 +46,10 @@ The data has to be parsed, but due to high irregularity (at least too much for m
inspected and manually processed.

```
python server/cli.py preprocess
--season SEASON    Pre-processes all episodes from a specific season.
--episode EPISODE  Pre-processes a specific episode. Requires SEASON to be specified.
--all              Pre-processes all episodes from every season.
--overwrite        DANGER: Will overwrite files. May result in manually processed files being lost forever.
-s --season SEASON    Pre-processes all episodes from a specific season.
-e --episode EPISODE  Pre-processes a specific episode. Requires SEASON to be specified.
-a --all              Pre-processes all episodes from every season.
-o --overwrite        DANGER: Will overwrite files. May result in manually processed files being lost forever.
```

From then on, once all files have been pre-processed, you will have to begin the long, annoying process of editing them into my custom format.
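For reference, a hypothetical pre-processed file in that custom format, as inferred from the parsing logic in `server/cli.py` (one `Speaker|Text` pair per line, scenes separated by lines beginning with `-`, and a leading `!N` line marking a scene as deleted scene `N`):

```
Michael|I'm not superstitious, but I am a little stitious.
Dwight|Identity theft is not a joke, Jim!
-
!2
Jim|Bears. Beets. Battlestar Galactica.
```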
@@ -91,9 +91,9 @@ they are just the JSON format of the previous stage.

```
python server/cli.py process
--season SEASON    Processes all episodes from a specific season.
--episode EPISODE  Processes a specific episode. Requires SEASON to be specified.
--all              Processes all episodes from all seasons.
-s --season SEASON    Processes all episodes from a specific season.
-e --episode EPISODE  Processes a specific episode. Requires SEASON to be specified.
-a --all              Processes all episodes from all seasons.
```

Now that they're all in individual files, the final commands can be run to compile them into one file, a static
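Based on the `build` command group added to `server/cli.py` in this commit, that compile step would look something like:

```
python server/cli.py build algolia
python server/cli.py build final
```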
@@ -109,6 +109,34 @@ Each command is run with no special arguments (as of now), generating an `algolia

This `data.json` file is loaded by the Flask server and the `algolia.json` can be uploaded to your primary index.
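As a sketch of that upload step using the official Python `algoliasearch` client (not part of this repo; the application ID, API key, and index name below are placeholders):

```python
import json

from algoliasearch.search_client import SearchClient

# Placeholder credentials and index name -- substitute your own.
client = SearchClient.create('YOUR_APP_ID', 'YOUR_ADMIN_API_KEY')
index = client.init_index('the-office-quotes')

# algolia.json is the file generated by `python server/cli.py build algolia`.
with open('algolia.json', 'r', encoding='utf-8') as file:
    quotes = json.load(file)

# The generated records carry no objectID, so have Algolia generate them.
index.save_objects(quotes, {'autoGenerateObjectIDIfNotExist': True})
```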

For every command mentioned, you can read all arguments with `--help`:

```
$ python cli.py preprocess --help
Usage: cli.py preprocess [OPTIONS]

  Pre-processes raw HTML files into mangled custom quote data.

  Custom quote data requires manual inspection and formatting, making it a
  dangerous operation that may overwrite precious quote data.

Options:
  -s, --season INTEGER          Season to be fetched. Without --episode, will
                                download all episodes in a season.

  -e, --episode INTEGER         Specific episode to be fetched. Requires
                                --season to be specified.

  --all                         Fetch all episodes, regardless of previous
                                specifications.

  -o, --overwrite               Overwrite if a file already exists.
  -ss, --silent-skip            Skip missing/existing files silently
  -ssm, --silent-skip-missing   Skip missing files silently
  -sse, --silent-skip-existing  Skip overwrite skips silently
  --help                        Show this message and exit.
```

## Setup

This project was built on Python 3.7 and Node v12.18.3 / npm 6.14.6.
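A minimal setup sketch under those versions (assuming a conventional `requirements.txt` on the server side; neither install step is shown in this commit):

```
pip install -r requirements.txt
cd client && npm install
```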
30 client/package-lock.json (generated)
@@ -1,6 +1,6 @@
{
  "name": "client",
  "version": "0.1.0",
  "name": "TheOfficeQuotes",
  "version": "0.2.0",
  "lockfileVersion": 1,
  "requires": true,
  "dependencies": {
@@ -1171,6 +1171,32 @@
        "to-fast-properties": "^2.0.0"
      }
    },
    "@fortawesome/fontawesome-common-types": {
      "version": "0.2.30",
      "resolved": "https://registry.npmjs.org/@fortawesome/fontawesome-common-types/-/fontawesome-common-types-0.2.30.tgz",
      "integrity": "sha512-TsRwpTuKwFNiPhk1UfKgw7zNPeV5RhNp2Uw3pws+9gDAkPGKrtjR1y2lI3SYn7+YzyfuNknflpBA1LRKjt7hMg=="
    },
    "@fortawesome/fontawesome-svg-core": {
      "version": "1.2.30",
      "resolved": "https://registry.npmjs.org/@fortawesome/fontawesome-svg-core/-/fontawesome-svg-core-1.2.30.tgz",
      "integrity": "sha512-E3sAXATKCSVnT17HYmZjjbcmwihrNOCkoU7dVMlasrcwiJAHxSKeZ+4WN5O+ElgO/FaYgJmASl8p9N7/B/RttA==",
      "requires": {
        "@fortawesome/fontawesome-common-types": "^0.2.30"
      }
    },
    "@fortawesome/free-solid-svg-icons": {
      "version": "5.14.0",
      "resolved": "https://registry.npmjs.org/@fortawesome/free-solid-svg-icons/-/free-solid-svg-icons-5.14.0.tgz",
      "integrity": "sha512-M933RDM8cecaKMWDSk3FRYdnzWGW7kBBlGNGfvqLVwcwhUPNj9gcw+xZMrqBdRqxnSXdl3zWzTCNNGEtFUq67Q==",
      "requires": {
        "@fortawesome/fontawesome-common-types": "^0.2.30"
      }
    },
    "@fortawesome/vue-fontawesome": {
      "version": "0.1.10",
      "resolved": "https://registry.npmjs.org/@fortawesome/vue-fontawesome/-/vue-fontawesome-0.1.10.tgz",
      "integrity": "sha512-b2+SLF31h32LSepVcXe+BQ63yvbq5qmTCy4KfFogCYm2bn68H5sDWUnX+U7MBqnM2aeEk9M7xSoqGnu+wSdY6w=="
    },
    "@hapi/address": {
      "version": "2.1.4",
      "resolved": "https://registry.npm.taobao.org/@hapi/address/download/@hapi/address-2.1.4.tgz?cache=0&sync_timestamp=1593993773437&other_urls=https%3A%2F%2Fregistry.npm.taobao.org%2F%40hapi%2Faddress%2Fdownload%2F%40hapi%2Faddress-2.1.4.tgz",
client/package.json
@@ -1,5 +1,5 @@
{
  "name": "The Office Quotes",
  "name": "TheOfficeQuotes",
  "version": "0.2.0",
  "private": true,
  "scripts": {
@@ -8,6 +8,9 @@
    "lint": "vue-cli-service lint"
  },
  "dependencies": {
    "@fortawesome/fontawesome-svg-core": "^1.2.30",
    "@fortawesome/free-solid-svg-icons": "^5.14.0",
    "@fortawesome/vue-fontawesome": "^0.1.10",
    "algoliasearch": "^4.3.1",
    "axios": ">=0.18.1",
    "bootstrap": "^4.3.1",
4 client/vue.config.js (new file)
@@ -0,0 +1,4 @@
module.exports = {
  indexPath: '../../dist/index.html',
  assetsDir: '../../dist',
};
182 server/cli.py
@@ -5,16 +5,20 @@ CLI entrypoint for fetching, processing and compiling quote data.
"""
import logging
import os
import re
import sys
import time
from typing import List, Tuple
from typing import List, Tuple, Union

import click
import enlighten
import requests
from bs4 import BeautifulSoup

sys.path[0] += '\\..'
from server.process import get_episodes, get_filepath, sleep_from, verify_episode
from server.process import DATA_DIR, get_characters, get_episodes, get_filepath, load_file, \
    save_file, sleep_from, \
    verify_episode

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('cli')
@@ -24,7 +28,6 @@ manager = enlighten.get_manager()

@click.group()
def cli():
    """Base command group."""
    pass

@@ -90,9 +93,8 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
        last_request = time.time()
        if resp.ok:
            # Write data to file
            with open(filepath, 'w', encoding='utf-8') as file:
                file.write(resp.text)
            logger.debug('Successfully fetched.')
            save_file(filepath, resp.text, False)
            logger.debug('Successfully fetched & saved.')
        else:
            logger.error(f'Fetching failed. Erroneous response code {resp.status_code}.')
        pbar.update()
@@ -105,15 +107,18 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
@click.option('-d', '--dry-run', is_flag=True)
def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool, dry_run: bool):
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip missing/existing files silently')
@click.option('-ssm', '--silent-skip-missing', is_flag=True, help='Skip missing files silently')
@click.option('-sse', '--silent-skip-existing', is_flag=True, help='Skip overwrite skips silently')
def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool, silent_skip_missing: bool,
               silent_skip_existing: bool):
    """
    Pre-processes raw HTML files into mangled custom quote data.

    Custom quote data requires manual inspection and formatting, making it a dangerous operation that may overwrite
    precious quote data.
    """
    episodes: List[Tuple[int, int]]

    if all:
@@ -134,6 +139,38 @@ def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_ski
    else:
        logger.info('You must specify which episodes to pre-process.')
        logger.info('Check --help for more information on this command.')
        return

    for season, episode in episodes:
        # Overwrite protection
        save_path = get_filepath(season, episode, 'raw')
        if os.path.exists(save_path) and not overwrite:
            if not (silent_skip or silent_skip_existing):
                logger.info(f'Skipping Season {season}, Episode {episode}: file already exists.')
            continue

        try:
            page_data = load_file(get_filepath(season, episode, 'html'), False)
        except FileNotFoundError:
            if not (silent_skip or silent_skip_missing):
                logger.warning(f'No data for Season {season}, Episode {episode} available. Skipping processing.')
        else:
            soup = BeautifulSoup(page_data, "html.parser")
            data = []

            sections = soup.find_all(attrs={"class": "quote"})
            for section in sections:
                # Convert <br> tags to newlines so each spoken line lands on its own line
                for br in section.find_all('br'):
                    br.replace_with("\n" + br.text)

                for line in section.get_text().split('\n'):
                    data.append(line.strip())

                # Separate scenes with a lone '-' line; the trailing one is popped after the loop
                data.append('-')
            data.pop(-1)

            data = '\n'.join(data)
            save_file(save_path, data, False)


@cli.command('process')
@@ -141,9 +178,8 @@ def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_ski
              help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
def process(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool):
@click.option('-r', '--report', is_flag=True, help='Report quote statistics once processing completed')
def process(season: int, episode: int, all: bool, report: bool):
    """
    Processes manually processed raw quote data into JSON.
    """
@@ -169,24 +205,138 @@ def process(season: int, episode: int, all: bool, overwrite: bool, silent_skip:
        logger.info('Check --help for more information on this command.')
        return

    quote: Union[str, List[str]]
    section_num: int
    for _season, _episode in episodes:
        sections = []
        try:
            preprocessed_data = load_file(get_filepath(_season, _episode, 'raw'))
            # Scenes are separated by lines beginning with '-'
            for section_num, raw_section in enumerate(re.split('^-', preprocessed_data, flags=re.MULTILINE), start=1):
                section = {
                    'quotes': []
                }

                section_data = list(raw_section.strip().split('\n'))
                # A leading '!N' line marks the scene as deleted scene N
                if section_data[0].startswith('!'):
                    section['deleted'] = int(re.search(r'!(\d+)', section_data.pop(0)).group(1))

                for quote in section_data:
                    quote = quote.split('|', 1)
                    section['quotes'].append(
                        {
                            'speaker': quote[0],
                            'text': quote[1]
                        }
                    )
                sections.append(section)
        except FileNotFoundError:
            logger.info(f'Skipped Season {_season}, Episode {_episode}, no file found.')
        except Exception:
            logger.exception(f'Skipped Season {_season}, Episode {_episode}: Malformed data.')
            logger.info(
                f'Last quote seen "{quote if type(quote) is str else "|".join(quote)}" in section {section_num}')
        else:
            # Save processed data
            save_file(get_filepath(_season, _episode, 'processed'), sections, True)

            if report:
                deleted_count = [0, set()]
                quote_count = 0
                speakers = set()

                for section in sections:
                    quote_count += len(section['quotes'])

                    if 'deleted' in section.keys():
                        deleted_count[0] += 1
                        deleted_count[1].add(section['deleted'])

                    for quote in section['quotes']:
                        speakers.add(quote['speaker'])

                logger.debug(f'{quote_count} quotes.')
                logger.debug(f'{deleted_count[0]} different deleted sections, {len(deleted_count[1])} unique.')
                logger.info(f'{len(speakers)} Speakers:')
                logger.info(', '.join(speakers))


@cli.group('build')
def build():
    """Build final data files used by Algolia and the backend API."""
    pass


@build.command('algolia')
def algolia():
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
@click.option('--process', is_flag=True, help='Run processing before building final data.')
def algolia(silent_skip: bool, process: bool):
    """
    Generates algolia.json, an all-encompassing file for Algolia's search index.
    """
    files = list(filter(os.path.exists, [get_filepath(season, episode, 'processed') for season, episode in get_episodes()]))
    data = []
    episode_num_abs, section_num_abs, quote_num_abs = 0, 0, 0
    for season, episode in get_episodes():
        episode_num_abs += 1
        try:
            episode_data = load_file(get_filepath(season, episode, 'processed'), True)
        except FileNotFoundError:
            if not silent_skip:
                logger.warning(f'Skipping Season {season}, Episode {episode}. No episode data file found.')
        else:
            for section_num_rel, section in enumerate(episode_data, start=1):
                section_num_abs += 1
                for quote_num_rel, quote in enumerate(section['quotes'], start=1):
                    quote_num_abs += 1

                    # Relative position
                    quote['quote_rel'] = quote_num_rel
                    quote['section_rel'] = section_num_rel
                    quote['episode_rel'] = episode
                    # Absolute position
                    quote['quote_abs'] = quote_num_abs
                    quote['section_abs'] = section_num_abs
                    quote['episode_abs'] = episode_num_abs

                    quote['season'] = season

                    quote['is_deleted'] = 'deleted' in section.keys()
                    quote['deleted_section'] = section.get('deleted')

                    data.append(quote)

    logger.info(f'Saving {len(data):,} quotes to algolia.json')
    save_file(os.path.join(DATA_DIR, 'algolia.json'), data, True)


@build.command('final')
def final():
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
@click.option('--process', is_flag=True, help='Run processing before building final data.')
def final(silent_skip: bool, process: bool):
    """Generates the latest application static data.json file, used by the backend API."""
    files = list(filter(os.path.exists, [get_filepath(season, episode, 'processed') for season, episode in get_episodes()]))
    descriptions = load_file(os.path.join(DATA_DIR, 'descriptions.json'), True)
    seasons = [{'season_id': season, 'episodes': []} for season in range(1, 10)]
    for season_id, episode_id in get_episodes():
        # Load data file
        try:
            episode_data = load_file(get_filepath(season_id, episode_id, 'processed'), True)
        except FileNotFoundError:
            if not silent_skip:
                logger.warning(f'No data for Season {season_id}, Episode {episode_id} available. Null data inserted.')
            episode_data = None

        description = descriptions[season_id - 1][episode_id - 1]
        seasons[season_id - 1]['episodes'].append(
            {
                'title': description['title'].strip(),
                'description': description['description'].strip(),
                'episode_id': episode_id,
                'characters': get_characters(season_id, episode_id),
                'scenes': episode_data
            }
        )

    logger.info('Saving to data.json')
    save_file(os.path.join(DATA_DIR, 'data.json'), seasons, True)


if __name__ == "__main__":
server/process.py
@@ -1,14 +1,18 @@
"""
process.py

Functions and shortcuts for loading/saving/extracting data for processing quote data.
"""

import json
import os
import re
import time
from collections import defaultdict
from math import ceil
from typing import Iterable, List, Tuple
from typing import Dict, Iterable, List, Tuple, Union

import enlighten
import requests
from bs4 import BeautifulSoup

session = requests.Session()
@@ -31,7 +35,7 @@ def get_filepath(season: int, episode: int, folder: str) -> str:
    return os.path.join(DATA_DIR, get_filename(season, episode, 'json'))


def load_file(filepath: str, json_decode: bool):
def load_file(filepath: str, json_decode: bool = False):
    """Shortcut function for loading file from filepath, with JSON parsing flag."""
    if json_decode:
        with open(filepath, 'r', encoding='utf-8') as file:
@@ -68,7 +72,7 @@ def get_episodes(season: int = None) -> Iterable[Tuple[int, int]]:

def verify_episode(season: int, episode: int = None) -> bool:
    """
    Verifies that a Season or Season + Episode is valid.
    Verifies that a specific Season and/or Episode is valid.
    """
    return 1 <= season <= 9 and (episode is None or 1 <= episode <= episode_counts[season])
@@ -99,149 +103,20 @@ def sleep_from(wait_time: float, moment: float, manager: enlighten.Manager = Non
    return 0


def preprocess(page_data: str) -> List[str]:
    soup = BeautifulSoup(page_data, "html.parser")

    data = []
    sections = soup.find_all(attrs={"class": "quote"})
    for section in sections:
        for br in section.find_all('br'):
            br.replace_with("\n" + br.text)

        for line in section.get_text().split('\n'):
            data.append(line.strip())

        data.append('-')
    data.pop(-1)

    return data


def process(season, episode):
    with open(os.path.join(DATA_DIR, 'raw', f'{season}-{str(episode).zfill(2)}.txt'), 'r',
              encoding='utf-8') as file:

        sections = []
        for s in re.split('^-', file.read(), flags=re.MULTILINE):
            section = {
                'quotes': []
            }

            section_data = list(s.strip().split('\n'))
            if section_data[0].startswith('!'):
                section['deleted'] = int(re.search(r'!(\d+)', section_data.pop(0)).group(1))

            for q in section_data:
                quote = q.split('|', 1)
                print(quote)
                section['quotes'].append(
                    {
                        'speaker': quote[0],
                        'text': quote[1]
                    }
                )
            sections.append(section)

    with open(os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json'), 'w',
              encoding='utf-8') as file:
        json.dump(sections, file, indent=4, ensure_ascii=False)

    deleted_count = [0, set()]
    quote_count = 0
    speakers = set()

    for section in sections:
        quote_count += len(section['quotes'])

        if 'deleted' in section.keys():
            deleted_count[0] += 1
            deleted_count[1].add(section['deleted'])

        for quote in section['quotes']:
            speakers.add(quote['speaker'])

    print(f'{quote_count} quotes.')
    print(f'{deleted_count[0]} different deleted sections, {len(deleted_count[1])} unique.')
    print(f'{len(speakers)} Speakers:')
    print(', '.join(speakers))


def generate_algolia():
    data = []
    quote_num = 0
    for season, episode in episodes():
        try:
            with open(os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json'), 'r',
                      encoding='utf-8') as file:
                episode_data = json.load(file)
        except FileNotFoundError:
            print(f'No JSON data for Season {season} Episode {episode}')
        else:
            for section_num, section in enumerate(episode_data, start=1):
                for quote in section['quotes']:
                    quote_num += 1
                    quote['quote'] = quote_num
                    quote['section'] = section_num
                    quote['episode'] = episode
                    quote['season'] = season

                    quote['is_deleted'] = 'deleted' in section.keys()
                    quote['deleted_section'] = section.get('deleted')

                    data.append(quote)

    with open(os.path.join(DATA_DIR, 'algolia.json'), 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)


def get_episode_scenes(season, episode):
    filepath = os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json')
    if os.path.exists(filepath):
        with open(filepath, 'r', encoding='utf-8') as file:
            return json.load(file)
    else:
        return None


def get_characters(season, episode):
    scenes = get_episode_scenes(season, episode)
    if scenes is None:
        return None
def get_characters(season, episode) -> List[Dict[str, Union[int, str]]]:
    """
    Extracts all characters and their number of appearances from a specific episode.

    Prepared as a list of dictionaries, a format preferable for storage and iteration.
    """
    filepath = get_filepath(season, episode, 'processed')
    if not os.path.exists(filepath):
        return []
    scenes = load_file(filepath, True)

    characters = defaultdict(int)
    for scene in scenes:
        for quote in scene['quotes']:
            characters[quote['speaker']] += 1
        for quote in scene.get('quotes', []):
            characters[quote.get('speaker')] += 1
    characters = [{'name': character, 'appearances': appearances, 'id': '-'.join(character.split(' ')).lower()}
                  for character, appearances in characters.items()]
    return list(sorted(characters, key=lambda item: item['appearances'], reverse=True))


def generate_final():
    """Merge episode descriptions/titles and quotes into final JSON file."""
    with open(os.path.join(DATA_DIR, 'descriptions.json'), 'r', encoding='utf-8') as file:
        data = json.load(file)

    output = []
    for season_id, season in enumerate(data, start=1):
        output.append({
            'season_id': season_id,
            'episodes': [
                {
                    'title': episode['title'].strip(),
                    'description': episode['description'].strip(),
                    'episode_id': episode_id,
                    'characters': get_characters(season_id, episode_id),
                    'scenes': get_episode_scenes(season_id, episode_id)
                }
                for episode_id, episode in enumerate(season, start=1)
            ]
        })

    with open(os.path.join(DATA_DIR, 'data.json'), 'w', encoding='utf-8') as file:
        json.dump(output, file, ensure_ascii=False, indent=4)


# generate_algolia()
# process(3, 10)
generate_final()