diff --git a/README.md b/README.md
index 3fd2f60..9620893 100644
--- a/README.md
+++ b/README.md
@@ -46,10 +46,10 @@ The data has to be parsed, but due to high irregularity (at least too much for m
 inspected and manually processed.
 
 ```python
 server/cli.py preprocess
-  --season SEASON    Pre-processes all episodes from a specific season.
-  --episode EPISODE  Pre-processes a specific episode. Requires SEASON to be specified.
-  --all              Pre-processes all episodes from every season.
-  --overwrite        DANGER: Will overwrite files. May result in manually processed files to be lost forever.
+  -s --season SEASON    Pre-processes all episodes from a specific season.
+  -e --episode EPISODE  Pre-processes a specific episode. Requires SEASON to be specified.
+  -a --all              Pre-processes all episodes from every season.
+  -o --overwrite        DANGER: Will overwrite files. May result in manually processed files being lost forever.
 ```
 
 From then on, once all files have been pre-processed, you will have to begin the long, annoying process of
 editing them into my custom format.
@@ -91,9 +91,9 @@ they are just the JSON format of the previous stage.
 
 ```
 python server/cli.py process
-  --season SEASON    Processes all episodes from a specific season.
-  --epsiode EPISODE  Processes a specific episode. Requires SEASON to be specified.
-  --all              Processes all episodes from all seasons.
+  -s --season SEASON    Processes all episodes from a specific season.
+  -e --episode EPISODE  Processes a specific episode. Requires SEASON to be specified.
+  -a --all              Processes all episodes from all seasons.
 ```
 
 Now that they're all in individual files, the final commands can be run to compile them into one file, a static
@@ -109,6 +109,34 @@
 Each command is run with no special arguments (as of now), generating an `algolia.json` or `data.json` file respectively.
 
 This `data.json` file is loaded by the Flask server and the `algolia.json` can be uploaded to your primary index.
 
+For every command mentioned, you can read all arguments with `--help`:
+
+```
+$ python cli.py preprocess --help
+Usage: cli.py preprocess [OPTIONS]
+
+  Pre-processes raw HTML files into mangled custom quote data.
+
+  Custom quote data requires manual inspection and formatting, making it a
+  dangerous operation that may overwrite precious quote data.
+
+Options:
+  -s, --season INTEGER          Season to be fetched. Without --episode, will
+                                download all episodes in a season.
+
+  -e, --episode INTEGER         Specific episode to be fetched. Requires
+                                --season to be specified.
+
+  --all                         Fetch all episodes, regardless of previous
+                                specifications.
+
+  -o, --overwrite               Overwrite if a file already exists.
+  -ss, --silent-skip            Skip missing/existing files silently
+  -ssm, --silent-skip-missing   Skip missing files silently
+  -sse, --silent-skip-existing  Skip overwrite skips silently
+  --help                        Show this message and exit.
+```
+
 ## Setup
 
 This project was built on Python 3.7 and Node v12.18.3 / npm 6.14.6.
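For reference, the "custom format" the README mentions, produced by `preprocess` and hand-edited before being fed to `process`, can be inferred from the parser in `server/cli.py`: each line is `Speaker|Text`, sections are separated by lines beginning with `-`, and a leading `!N` line marks a section as belonging to deleted scene `N`. A minimal illustrative sample (the dialogue itself is invented):

```
Michael|Would I rather be feared or loved? Easy. Both.
Jim|...
-
!3
Dwight|Identity theft is not a joke, Jim!
Pam|Wait, what?
```
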
diff --git a/client/package-lock.json b/client/package-lock.json
index 2b73a94..f07f4ab 100644
--- a/client/package-lock.json
+++ b/client/package-lock.json
@@ -1,6 +1,6 @@
 {
-  "name": "client",
-  "version": "0.1.0",
+  "name": "TheOfficeQuotes",
+  "version": "0.2.0",
   "lockfileVersion": 1,
   "requires": true,
   "dependencies": {
@@ -1171,6 +1171,32 @@
         "to-fast-properties": "^2.0.0"
       }
     },
+    "@fortawesome/fontawesome-common-types": {
+      "version": "0.2.30",
+      "resolved": "https://registry.npmjs.org/@fortawesome/fontawesome-common-types/-/fontawesome-common-types-0.2.30.tgz",
+      "integrity": "sha512-TsRwpTuKwFNiPhk1UfKgw7zNPeV5RhNp2Uw3pws+9gDAkPGKrtjR1y2lI3SYn7+YzyfuNknflpBA1LRKjt7hMg=="
+    },
+    "@fortawesome/fontawesome-svg-core": {
+      "version": "1.2.30",
+      "resolved": "https://registry.npmjs.org/@fortawesome/fontawesome-svg-core/-/fontawesome-svg-core-1.2.30.tgz",
+      "integrity": "sha512-E3sAXATKCSVnT17HYmZjjbcmwihrNOCkoU7dVMlasrcwiJAHxSKeZ+4WN5O+ElgO/FaYgJmASl8p9N7/B/RttA==",
+      "requires": {
+        "@fortawesome/fontawesome-common-types": "^0.2.30"
+      }
+    },
+    "@fortawesome/free-solid-svg-icons": {
+      "version": "5.14.0",
+      "resolved": "https://registry.npmjs.org/@fortawesome/free-solid-svg-icons/-/free-solid-svg-icons-5.14.0.tgz",
+      "integrity": "sha512-M933RDM8cecaKMWDSk3FRYdnzWGW7kBBlGNGfvqLVwcwhUPNj9gcw+xZMrqBdRqxnSXdl3zWzTCNNGEtFUq67Q==",
+      "requires": {
+        "@fortawesome/fontawesome-common-types": "^0.2.30"
+      }
+    },
+    "@fortawesome/vue-fontawesome": {
+      "version": "0.1.10",
+      "resolved": "https://registry.npmjs.org/@fortawesome/vue-fontawesome/-/vue-fontawesome-0.1.10.tgz",
+      "integrity": "sha512-b2+SLF31h32LSepVcXe+BQ63yvbq5qmTCy4KfFogCYm2bn68H5sDWUnX+U7MBqnM2aeEk9M7xSoqGnu+wSdY6w=="
+    },
     "@hapi/address": {
       "version": "2.1.4",
       "resolved": "https://registry.npm.taobao.org/@hapi/address/download/@hapi/address-2.1.4.tgz?cache=0&sync_timestamp=1593993773437&other_urls=https%3A%2F%2Fregistry.npm.taobao.org%2F%40hapi%2Faddress%2Fdownload%2F%40hapi%2Faddress-2.1.4.tgz",
diff --git a/client/package.json b/client/package.json
index 03cd22b..5499c0e 100644
--- a/client/package.json
+++ b/client/package.json
@@ -1,5 +1,5 @@
 {
-  "name": "The Office Quotes",
+  "name": "TheOfficeQuotes",
   "version": "0.2.0",
   "private": true,
   "scripts": {
@@ -8,6 +8,9 @@
     "lint": "vue-cli-service lint"
   },
   "dependencies": {
+    "@fortawesome/fontawesome-svg-core": "^1.2.30",
+    "@fortawesome/free-solid-svg-icons": "^5.14.0",
+    "@fortawesome/vue-fontawesome": "^0.1.10",
     "algoliasearch": "^4.3.1",
     "axios": ">=0.18.1",
     "bootstrap": "^4.3.1",
diff --git a/client/vue.config.js b/client/vue.config.js
new file mode 100644
index 0000000..a9237fc
--- /dev/null
+++ b/client/vue.config.js
@@ -0,0 +1,4 @@
+module.exports = {
+  indexPath: '../../dist/index.html',
+  assetsDir: '../../dist',
+};
diff --git a/server/cli.py b/server/cli.py
index 901d5d7..dcb085f 100644
--- a/server/cli.py
+++ b/server/cli.py
@@ -5,16 +5,20 @@ CLI entrypoint for fetching, processing and compiling quote data.
 """
 import logging
 import os
+import re
 import sys
 import time
-from typing import List, Tuple
+from typing import List, Tuple, Union
 
 import click
 import enlighten
 import requests
+from bs4 import BeautifulSoup
 
 sys.path[0] += '\\..'
 
-from server.process import get_episodes, get_filepath, sleep_from, verify_episode
+from server.process import DATA_DIR, get_characters, get_episodes, get_filepath, load_file, \
+    save_file, sleep_from, \
+    verify_episode
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger('cli')
@@ -24,7 +28,6 @@
 manager = enlighten.get_manager()
 
 
 @click.group()
 def cli():
-    """Base command group."""
     pass
 
 
@@ -90,9 +93,8 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
             last_request = time.time()
             if resp.ok:
                 # Write data to file
-                with open(filepath, 'w', encoding='utf-8') as file:
-                    file.write(resp.text)
-                logger.debug('Successfully fetched.')
+                save_file(filepath, resp.text, False)
+                logger.debug('Successfully fetched & saved.')
             else:
                 logger.error(f'Fetching failed. Erroneous response code {resp.status_code}.')
             pbar.update()
@@ -105,15 +107,17 @@
 @click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
 @click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
 @click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
-@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
-@click.option('-d', '--dry-run', is_flag=True)
-def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool, dry_run: bool):
+@click.option('-ss', '--silent-skip', is_flag=True, help='Skip missing/existing files silently')
+@click.option('-ssm', '--silent-skip-missing', is_flag=True, help='Skip missing files silently')
+@click.option('-sse', '--silent-skip-existing', is_flag=True, help='Skip overwrite skips silently')
+def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool, silent_skip_missing: bool,
+               silent_skip_existing: bool):
     """
     Pre-processes raw HTML files into mangled custom quote data.
 
     Custom quote data requires manual inspection and formatting, making it a
     dangerous operation that may overwrite precious quote data.
     """
     episodes: List[Tuple[int, int]]
 
     if all:
@@ -134,6 +139,38 @@ def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_ski
     else:
         logger.info('You must specify which episodes to pre-process.')
         logger.info('Check --help for more information on this command.')
+        return
+
+    for season, episode in episodes:
+        # Overwrite protection
+        save_path = get_filepath(season, episode, 'raw')
+        if os.path.exists(save_path) and not overwrite:
+            if not (silent_skip or silent_skip_existing):
+                logger.info(f'Skipping Season {season}, Episode {episode}: file already exists.')
+            continue
+
+        try:
+            page_data = load_file(get_filepath(season, episode, 'html'), False)
+        except FileNotFoundError:
+            if not (silent_skip or silent_skip_missing):
+                logger.warning(f'No data for Season {season}, Episode {episode} available. Skipping processing.')
+        else:
+            soup = BeautifulSoup(page_data, "html.parser")
+            data = []
+
+            sections = soup.find_all(attrs={"class": "quote"})
+            for section in sections:
+                for br in section.find_all('br'):
+                    br.replace_with("\n" + br.text)
+
+                for line in section.get_text().split('\n'):
+                    data.append(line.strip())
+
+                data.append('-')
+            data.pop(-1)
+
+            data = '\n'.join(data)
+            save_file(save_path, data, False)
 
 
 @cli.command('process')
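Since the quote-mangling step above is easiest to understand in isolation, here is a minimal, self-contained sketch of the same logic run against an invented HTML snippet (the real input is a fetched episode page):

```python
from bs4 import BeautifulSoup

sample = '<div class="quote"><b>Michael:</b> No, no, no.<br/><b>Jim:</b> Yes.</div>'
soup = BeautifulSoup(sample, 'html.parser')

data = []
for section in soup.find_all(attrs={'class': 'quote'}):
    # Swap each <br> for a newline so get_text() preserves line boundaries
    for br in section.find_all('br'):
        br.replace_with('\n' + br.text)

    for line in section.get_text().split('\n'):
        data.append(line.strip())

    data.append('-')  # section separator
data.pop(-1)  # drop the trailing separator

print('\n'.join(data))  # -> "Michael: No, no, no.\nJim: Yes."
```
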
@@ -141,9 +178,8 @@
               help='Season to be fetched. Without --episode, will download all episodes in a season.')
 @click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
 @click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
-@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
-@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
-def process(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool):
+@click.option('-r', '--report', is_flag=True, help='Report quote statistics once processing completes')
+def process(season: int, episode: int, all: bool, report: bool):
     """
     Processes manually processed raw quote data into JSON.
     """
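For orientation, a processed episode file is a JSON list of sections, each holding its quotes and an optional deleted-scene marker. The shape follows the parser below; the values here are invented:

```json
[
    {
        "quotes": [
            {"speaker": "Michael", "text": "No, no, no."},
            {"speaker": "Jim", "text": "Yes."}
        ]
    },
    {
        "quotes": [
            {"speaker": "Dwight", "text": "Identity theft is not a joke, Jim!"}
        ],
        "deleted": 3
    }
]
```
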
@@ -169,24 +205,136 @@ def process(season: int, episode: int, all: bool, overwrite: bool, silent_skip:
         logger.info('Check --help for more information on this command.')
         return
 
+    quote: Union[str, List[str]]
+    section_num: int
+    for _season, _episode in episodes:
+        sections = []
+        try:
+            preprocessed_data = load_file(get_filepath(_season, _episode, 'raw'))
+            for section_num, raw_section in enumerate(re.split('^-', preprocessed_data, flags=re.MULTILINE), start=1):
+                section = {
+                    'quotes': []
+                }
+
+                section_data = list(raw_section.strip().split('\n'))
+                if section_data[0].startswith('!'):
+                    section['deleted'] = int(re.search(r'!(\d+)', section_data.pop(0)).group(1))
+
+                for quote in section_data:
+                    quote = quote.split('|', 1)
+                    section['quotes'].append(
+                        {
+                            'speaker': quote[0],
+                            'text': quote[1]
+                        }
+                    )
+                sections.append(section)
+        except FileNotFoundError:
+            logger.info(f'Skipped Season {_season}, Episode {_episode}, no file found.')
+        except Exception:
+            logger.exception(f'Skipped Season {_season}, Episode {_episode}: Malformed data.')
+            logger.info(
+                f'Last quote seen "{quote if type(quote) is str else "|".join(quote)}" in section {section_num}')
+        else:
+            # Save processed data
+            save_file(get_filepath(_season, _episode, 'processed'), sections, True)
+
+            if report:
+                deleted_count = [0, set()]
+                quote_count = 0
+                speakers = set()
+
+                for section in sections:
+                    quote_count += len(section['quotes'])
+
+                    if 'deleted' in section.keys():
+                        deleted_count[0] += 1
+                        deleted_count[1].add(section['deleted'])
+
+                    for quote in section['quotes']:
+                        speakers.add(quote['speaker'])
+
+                logger.debug(f'{quote_count} quotes.')
+                logger.debug(f'{deleted_count[0]} different deleted sections, {len(deleted_count[1])} unique.')
+                logger.info(f'{len(speakers)} Speakers:')
+                logger.info(', '.join(speakers))
 
 
 @cli.group('build')
 def build():
     """Build final data files used by Algolia and the backend API."""
+    pass
 
 
 @build.command('algolia')
-def algolia():
+@click.option('-ss', '--silent-skip', is_flag=True, help='Skip missing files silently')
+def algolia(silent_skip: bool):
     """
     Generates algolia.json, an all-encompassing file for Algolia's search index.
     """
-    files = list(filter(os.path.exists, [get_filepath(season, episode, 'processed') for season, episode in get_episodes()]))
+    data = []
+    episode_num_abs, section_num_abs, quote_num_abs = 0, 0, 0
+    for season, episode in get_episodes():
+        episode_num_abs += 1
+        try:
+            episode_data = load_file(get_filepath(season, episode, 'processed'), True)
+        except FileNotFoundError:
+            if not silent_skip:
+                logger.warning(f'Skipping Season {season}, Episode {episode}. No episode data file found.')
+        else:
+            for section_num_rel, section in enumerate(episode_data, start=1):
+                section_num_abs += 1
+                for quote_num_rel, quote in enumerate(section['quotes'], start=1):
+                    quote_num_abs += 1
+
+                    # Relative position
+                    quote['quote_rel'] = quote_num_rel
+                    quote['section_rel'] = section_num_rel
+                    quote['episode_rel'] = episode
+                    # Absolute position
+                    quote['quote_abs'] = quote_num_abs
+                    quote['section_abs'] = section_num_abs
+                    quote['episode_abs'] = episode_num_abs
+
+                    quote['season'] = season
+
+                    quote['is_deleted'] = 'deleted' in section.keys()
+                    quote['deleted_section'] = section.get('deleted')
+
+                    data.append(quote)
+
+    logger.info(f'Saving {len(data):,} quotes to algolia.json')
+    save_file(os.path.join(DATA_DIR, 'algolia.json'), data, True)
 
 
 @build.command('final')
-def final():
+@click.option('-ss', '--silent-skip', is_flag=True, help='Skip missing files silently')
+def final(silent_skip: bool):
     """Generates the latest application static data.json file, used by the backend API."""
-    files = list(filter(os.path.exists, [get_filepath(season, episode, 'processed') for season, episode in get_episodes()]))
+    descriptions = load_file(os.path.join(DATA_DIR, 'descriptions.json'), True)
+    seasons = [{'season_id': season, 'episodes': []} for season in range(1, 10)]
+    for season_id, episode_id in get_episodes():
+        # Load data file
+        try:
+            episode_data = load_file(get_filepath(season_id, episode_id, 'processed'), True)
+        except FileNotFoundError:
+            if not silent_skip:
+                logger.warning(f'No data for Season {season_id}, Episode {episode_id} available. Null data inserted.')
+            episode_data = None
+
+        description = descriptions[season_id - 1][episode_id - 1]
+        seasons[season_id - 1]['episodes'].append(
+            {
+                'title': description['title'].strip(),
+                'description': description['description'].strip(),
+                'episode_id': episode_id,
+                'characters': get_characters(season_id, episode_id),
+                'scenes': episode_data
+            }
+        )
+
+    logger.info('Saving to data.json')
+    save_file(os.path.join(DATA_DIR, 'data.json'), seasons, True)
 
 
 if __name__ == "__main__":
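Each record in the resulting `algolia.json` is a single quote enriched with the positional fields set by the `algolia` command above. A sample record, with invented values:

```json
{
    "speaker": "Jim",
    "text": "Yes.",
    "quote_rel": 2,
    "section_rel": 1,
    "episode_rel": 10,
    "quote_abs": 4021,
    "section_abs": 512,
    "episode_abs": 48,
    "season": 3,
    "is_deleted": false,
    "deleted_section": null
}
```
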
+""" + import json import os -import re import time from collections import defaultdict from math import ceil -from typing import Iterable, List, Tuple +from typing import Dict, Iterable, List, Tuple, Union import enlighten import requests -from bs4 import BeautifulSoup session = requests.Session() @@ -31,7 +35,7 @@ def get_filepath(season: int, episode: int, folder: str) -> str: return os.path.join(DATA_DIR, get_filename(season, episode, 'json')) -def load_file(filepath: str, json_decode: bool): +def load_file(filepath: str, json_decode: bool = False): """Shortcut function for loading file from filepath, with JSON parsing flag.""" if json_decode: with open(filepath, 'r', encoding='utf-8') as file: @@ -68,7 +72,7 @@ def get_episodes(season: int = None) -> Iterable[Tuple[int, int]]: def verify_episode(season: int, episode: int = None) -> bool: """ - Verifies that a Season or Season + Episode is valid. + Verifies that specific Season and/or Episode is valid. """ return 1 <= season <= 9 and (episode is None or 1 <= episode <= episode_counts[season]) @@ -99,149 +103,20 @@ def sleep_from(wait_time: float, moment: float, manager: enlighten.Manager = Non return 0 -def preprocess(page_data: str) -> List[str]: - soup = BeautifulSoup(page_data, "html.parser") - - data = [] - sections = soup.find_all(attrs={"class": "quote"}) - for section in sections: - for br in section.find_all('br'): - br.replace_with("\n" + br.text) - - for line in section.get_text().split('\n'): - data.append(line.strip()) - - data.append('-') - data.pop(-1) - - return data - - -def process(season, episode): - with open(os.path.join(DATA_DIR, 'raw', f'{season}-{str(episode).zfill(2)}.txt'), 'r', - encoding='utf-8') as file: - - sections = [] - for s in re.split('^-', file.read(), flags=re.MULTILINE): - section = { - 'quotes': [] - } - - section_data = list(s.strip().split('\n')) - if section_data[0].startswith('!'): - section['deleted'] = int(re.search('!(\d+)', section_data.pop(0)).group(1)) - - for q in section_data: - quote = q.split('|', 1) - print(quote) - section['quotes'].append( - { - 'speaker': quote[0], - 'text': quote[1] - } - ) - sections.append(section) - - with open(os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json'), 'w', - encoding='utf-8') as file: - json.dump(sections, file, indent=4, ensure_ascii=False) - - deleted_count = [0, set()] - quote_count = 0 - speakers = set() - - for section in sections: - quote_count += len(section['quotes']) - - if 'deleted' in section.keys(): - deleted_count[0] += 1 - deleted_count[1].add(section['deleted']) - - for quote in section['quotes']: - speakers.add(quote['speaker']) - - print(f'{quote_count} quotes.') - print(f'{deleted_count[0]} different deleted sections, {len(deleted_count[1])} unique.') - print(f'{len(speakers)} Speakers:') - print(', '.join(speakers)) - - -def generate_algolia(): - data = [] - quote_num = 0 - for season, episode in episodes(): - try: - with open(os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json'), 'r', - encoding='utf-8') as file: - episode_data = json.load(file) - except FileNotFoundError: - print(f'No JSON data for Season {season} Episode {episode}') - else: - for section_num, section in enumerate(episode_data, start=1): - for quote in section['quotes']: - quote_num += 1 - quote['quote'] = quote_num - quote['section'] = section_num - quote['episode'] = episode - quote['season'] = season - - quote['is_deleted'] = 'deleted' in section.keys() - quote['deleted_section'] = section.get('deleted') - - 
@@ -99,149 +103,20 @@ def sleep_from(wait_time: float, moment: float, manager: enlighten.Manager = Non
     return 0
 
 
-def preprocess(page_data: str) -> List[str]:
-    soup = BeautifulSoup(page_data, "html.parser")
-
-    data = []
-    sections = soup.find_all(attrs={"class": "quote"})
-    for section in sections:
-        for br in section.find_all('br'):
-            br.replace_with("\n" + br.text)
-
-        for line in section.get_text().split('\n'):
-            data.append(line.strip())
-
-        data.append('-')
-    data.pop(-1)
-
-    return data
-
-
-def process(season, episode):
-    with open(os.path.join(DATA_DIR, 'raw', f'{season}-{str(episode).zfill(2)}.txt'), 'r',
-              encoding='utf-8') as file:
-
-        sections = []
-        for s in re.split('^-', file.read(), flags=re.MULTILINE):
-            section = {
-                'quotes': []
-            }
-
-            section_data = list(s.strip().split('\n'))
-            if section_data[0].startswith('!'):
-                section['deleted'] = int(re.search('!(\d+)', section_data.pop(0)).group(1))
-
-            for q in section_data:
-                quote = q.split('|', 1)
-                print(quote)
-                section['quotes'].append(
-                    {
-                        'speaker': quote[0],
-                        'text': quote[1]
-                    }
-                )
-            sections.append(section)
-
-        with open(os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json'), 'w',
-                  encoding='utf-8') as file:
-            json.dump(sections, file, indent=4, ensure_ascii=False)
-
-        deleted_count = [0, set()]
-        quote_count = 0
-        speakers = set()
-
-        for section in sections:
-            quote_count += len(section['quotes'])
-
-            if 'deleted' in section.keys():
-                deleted_count[0] += 1
-                deleted_count[1].add(section['deleted'])
-
-            for quote in section['quotes']:
-                speakers.add(quote['speaker'])
-
-        print(f'{quote_count} quotes.')
-        print(f'{deleted_count[0]} different deleted sections, {len(deleted_count[1])} unique.')
-        print(f'{len(speakers)} Speakers:')
-        print(', '.join(speakers))
-
-
-def generate_algolia():
-    data = []
-    quote_num = 0
-    for season, episode in episodes():
-        try:
-            with open(os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json'), 'r',
-                      encoding='utf-8') as file:
-                episode_data = json.load(file)
-        except FileNotFoundError:
-            print(f'No JSON data for Season {season} Episode {episode}')
-        else:
-            for section_num, section in enumerate(episode_data, start=1):
-                for quote in section['quotes']:
-                    quote_num += 1
-                    quote['quote'] = quote_num
-                    quote['section'] = section_num
-                    quote['episode'] = episode
-                    quote['season'] = season
-
-                    quote['is_deleted'] = 'deleted' in section.keys()
-                    quote['deleted_section'] = section.get('deleted')
-
-                    data.append(quote)
-
-    with open(os.path.join(DATA_DIR, 'algolia.json'), 'w', encoding='utf-8') as file:
-        json.dump(data, file, ensure_ascii=False, indent=4)
-
-
-def get_episode_scenes(season, episode):
-    filepath = os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json')
-    if os.path.exists(filepath):
-        with open(filepath, 'r', encoding='utf-8') as file:
-            return json.load(file)
-    else:
-        return None
-
-
-def get_characters(season, episode):
-    scenes = get_episode_scenes(season, episode)
-    if scenes is None:
-        return None
+def get_characters(season, episode) -> List[Dict[str, Union[int, str]]]:
+    """
+    Extracts all characters and their number of appearances from a specific episode.
+    Returned as a list of dictionaries, convenient for storage and iteration.
+    """
+    filepath = get_filepath(season, episode, 'processed')
+    if not os.path.exists(filepath):
+        return []
+
+    scenes = load_file(filepath, True)
     characters = defaultdict(int)
     for scene in scenes:
-        for quote in scene['quotes']:
-            characters[quote['speaker']] += 1
+        for quote in scene.get('quotes', []):
+            characters[quote.get('speaker')] += 1
 
     characters = [{'name': character, 'appearances': appearances, 'id': '-'.join(character.split(' ')).lower()}
                   for character, appearances in characters.items()]
     return list(sorted(characters, key=lambda item: item['appearances'], reverse=True))
-
-
-def generate_final():
-    """Merge episode descriptions/titles and quotes into final JSON file."""
-    with open(os.path.join(DATA_DIR, 'descriptions.json'), 'r', encoding='utf-8') as file:
-        data = json.load(file)
-
-    output = []
-    for season_id, season in enumerate(data, start=1):
-        output.append({
-            'season_id': season_id,
-            'episodes': [
-                {
-                    'title': episode['title'].strip(),
-                    'description': episode['description'].strip(),
-                    'episode_id': episode_id,
-                    'characters': get_characters(season_id, episode_id),
-                    'scenes': get_episode_scenes(season_id, episode_id)
-                }
-                for episode_id, episode in enumerate(season, start=1)
-            ]
-        })
-
-    with open(os.path.join(DATA_DIR, 'data.json'), 'w', encoding='utf-8') as file:
-        json.dump(output, file, ensure_ascii=False, indent=4)
-
-
-# generate_algolia()
-# process(3, 10)
-generate_final()
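For reference, the rewritten `get_characters` returns entries shaped like the following (names and counts invented), sorted by appearances in descending order, with `id` derived by hyphenating and lower-casing the name:

```python
[
    {'name': 'Michael', 'appearances': 38, 'id': 'michael'},
    {'name': 'Darryl Philbin', 'appearances': 11, 'id': 'darryl-philbin'},
]
```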