From d6e2ce1df4c88c41fc36d8c26a47b53ad0da2da7 Mon Sep 17 00:00:00 2001 From: Xevion Date: Sat, 8 Aug 2020 03:59:03 -0500 Subject: [PATCH] template preprocess/process/algolia/final commands, cleanup process.py functions removing obsolete funcs, adding save_file & preprocess --- server/cli.py | 79 ++++++++++++++++++++++++++++++++++++++++++----- server/process.py | 65 ++++++++++++-------------------------- 2 files changed, 90 insertions(+), 54 deletions(-) diff --git a/server/cli.py b/server/cli.py index 65b7a23..901d5d7 100644 --- a/server/cli.py +++ b/server/cli.py @@ -3,7 +3,6 @@ cli.py CLI entrypoint for fetching, processing and compiling quote data. """ - import logging import os import sys @@ -39,7 +38,9 @@ def cli(): @click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently') def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, silent_skip: bool): """ - Fetches data from officequotes.net, placing them in unmodified UTF-8 HTML files. + Downloads raw quote pages from 'officequotes.net'. + + Fetches quote pages, placing them in 'html' folder in unmodified UTF-8 HTML files. """ episodes: List[Tuple[int, int]] @@ -98,18 +99,80 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s logger.info('Fetching complete.') +@cli.command('preprocess') +@click.option('-s', '--season', type=int, + help='Season to be fetched. Without --episode, will download all episodes in a season.') +@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.') +@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.') +@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.') +@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently') +@click.option('-d', '--dry-run', is_flag=True) +def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool, dry_run: bool): + """ + Pre-processes raw HTML files into mangled custom quote data. + + Custom quote data requires manual inspection and formatting, making it a dangerous operation that may overwrite + precious quote data. + """ + episodes: List[Tuple[int, int]] + + if all: + episodes = list(get_episodes()) + elif season: + if episode: + if verify_episode(season, episode): + episodes = [(season, episode)] + else: + logger.error(f'Season {season}, Episode {episode} is not a valid combination.') + return + else: + episodes = list(get_episodes(season=season)) + logger.info(f'Preprocessing Season {season}...') + else: + if episode: + logger.info('You must specify more than just an episode.') + else: + logger.info('You must specify which episodes to pre-process.') + logger.info('Check --help for more information on this command.') + + @cli.command('process') -def process(): +@click.option('-s', '--season', type=int, + help='Season to be fetched. Without --episode, will download all episodes in a season.') +@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.') +@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.') +@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.') +@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently') +def process(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool): """ Processes manually processed raw quote data into JSON. """ - pass + episodes: List[Tuple[int, int]] + + if all: + episodes = list(get_episodes()) + elif season: + if episode: + if verify_episode(season, episode): + episodes = [(season, episode)] + else: + logger.error(f'Season {season}, Episode {episode} is not a valid combination.') + return + else: + episodes = list(get_episodes(season=season)) + logger.info(f'Processing Season {season}...') + else: + if episode: + logger.info('You must specify more than just an episode.') + else: + logger.info('You must specify which episodes to process.') + logger.info('Check --help for more information on this command.') + return @cli.group('build') def build(): - """Data building command group.""" - pass + """Build final data files used by Algolia and the backend API.""" @build.command('algolia') @@ -117,13 +180,13 @@ def algolia(): """ Generates algolia.json, a all encompassing file for Algolia's search index. """ - pass + files = list(filter(os.path.exists, [get_filepath(season, episode, 'processed') for season, episode in get_episodes()])) @build.command('final') def final(): """Generates the latest application static data.json file, used by the backend API.""" - pass + files = list(filter(os.path.exists, [get_filepath(season, episode, 'processed') for season, episode in get_episodes()])) if __name__ == "__main__": diff --git a/server/process.py b/server/process.py index fe0fff9..badf0b5 100644 --- a/server/process.py +++ b/server/process.py @@ -2,10 +2,9 @@ import json import os import re import time -import traceback from collections import defaultdict from math import ceil -from typing import Iterable, Tuple +from typing import Iterable, List, Tuple import enlighten import requests @@ -32,16 +31,26 @@ def get_filepath(season: int, episode: int, folder: str) -> str: return os.path.join(DATA_DIR, get_filename(season, episode, 'json')) -def load_file(filepath: str, parse_json: bool): +def load_file(filepath: str, json_decode: bool): """Shortcut function for loading file from filepath, with JSON parsing flag.""" - if parse_json: - with open(filepath, 'r') as file: + if json_decode: + with open(filepath, 'r', encoding='utf-8') as file: return json.load(file) else: - with open(filepath, 'r') as file: + with open(filepath, 'r', encoding='utf-8') as file: return file.read() +def save_file(filepath: str, data, json_encode: bool): + """Shortcut function for saving data to a file, JSON encoding flag.""" + if json_encode: + with open(filepath, 'w', encoding='utf-8') as file: + json.dump(data, file, ensure_ascii=False, indent=4) + else: + with open(filepath, 'w', encoding='utf-8') as file: + file.write(data) + + def get_episodes(season: int = None) -> Iterable[Tuple[int, int]]: """ Yields a list of Episode & Season tuples. @@ -90,26 +99,7 @@ def sleep_from(wait_time: float, moment: float, manager: enlighten.Manager = Non return 0 -def get_raw(season, episode): - html_filename = f'{season}-{str(episode).zfill(2)}.html' - html_filepath = os.path.join(DATA_DIR, 'html', html_filename) - - # If .html file exists, read - if os.path.exists(html_filepath): - # print('Reading from disk...') - with open(html_filepath, 'r', encoding='utf-8') as file: - page_data = file.read() - # If not, write to disk for later usage - else: - link = f"http://officequotes.net/no{season}-{str(episode).zfill(2)}.php" - resp = session.get(link) - if resp.ok: - page_data = resp.text - with open(html_filepath, 'w', encoding='utf-8') as file: - file.write(page_data) - else: - raise Exception(f'HTTPError: {resp.status_code} at "{resp.url}"') - +def preprocess(page_data: str) -> List[str]: soup = BeautifulSoup(page_data, "html.parser") data = [] @@ -117,31 +107,14 @@ def get_raw(season, episode): for section in sections: for br in section.find_all('br'): br.replace_with("\n" + br.text) + for line in section.get_text().split('\n'): data.append(line.strip()) + data.append('-') data.pop(-1) - with open(os.path.join(DATA_DIR, 'raw', f'{season}-{str(episode).zfill(2)}.txt'), 'w', - encoding='utf-8') as file: - file.write('\n'.join(data)) - - -def episodes(): - ep_nums = [6, 22, 23, 14, 26, 24, 24, 24, 23] - for season_num, ep_count in enumerate(ep_nums, start=1): - for episode_num in range(1, ep_count + 1): - yield season_num, episode_num - - -def download_all_raw(): - for season_num, episode_num in episodes(): - print(f'{season_num}-{str(episode_num).zfill(2)}') - try: - get_raw(season_num, episode_num) - except Exception as exception: - print(f'Failed to process Season {season_num} Episode {episode_num} - ({type(exception).__name__})') - traceback.print_exc() + return data def process(season, episode):