Template the preprocess/process/algolia/final commands; clean up process.py by removing obsolete functions and adding save_file & preprocess

This commit is contained in:
Xevion
2020-08-08 03:59:03 -05:00
parent 5c80d47404
commit d6e2ce1df4
2 changed files with 90 additions and 54 deletions

View File

@@ -3,7 +3,6 @@ cli.py
CLI entrypoint for fetching, processing and compiling quote data.
"""
import logging
import os
import sys
@@ -39,7 +38,9 @@ def cli():
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, silent_skip: bool):
"""
Fetches data from officequotes.net, placing them in unmodified UTF-8 HTML files.
Downloads raw quote pages from 'officequotes.net'.
Fetches quote pages, placing them in 'html' folder in unmodified UTF-8 HTML files.
"""
episodes: List[Tuple[int, int]]
@@ -98,18 +99,80 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
logger.info('Fetching complete.')
@cli.command('preprocess')
@click.option('-s', '--season', type=int,
help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
@click.option('-d', '--dry-run', is_flag=True)
def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool, dry_run: bool):
"""
Pre-processes raw HTML files into mangled custom quote data.
Custom quote data requires manual inspection and formatting, making it a dangerous operation that may overwrite
precious quote data.
"""
episodes: List[Tuple[int, int]]
if all:
episodes = list(get_episodes())
elif season:
if episode:
if verify_episode(season, episode):
episodes = [(season, episode)]
else:
logger.error(f'Season {season}, Episode {episode} is not a valid combination.')
return
else:
episodes = list(get_episodes(season=season))
logger.info(f'Preprocessing Season {season}...')
else:
if episode:
logger.info('You must specify more than just an episode.')
else:
logger.info('You must specify which episodes to pre-process.')
logger.info('Check --help for more information on this command.')
@cli.command('process')
def process():
@click.option('-s', '--season', type=int,
help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
def process(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool):
"""
Processes manually processed raw quote data into JSON.
"""
pass
episodes: List[Tuple[int, int]]
if all:
episodes = list(get_episodes())
elif season:
if episode:
if verify_episode(season, episode):
episodes = [(season, episode)]
else:
logger.error(f'Season {season}, Episode {episode} is not a valid combination.')
return
else:
episodes = list(get_episodes(season=season))
logger.info(f'Processing Season {season}...')
else:
if episode:
logger.info('You must specify more than just an episode.')
else:
logger.info('You must specify which episodes to process.')
logger.info('Check --help for more information on this command.')
return
@cli.group('build')
def build():
"""Data building command group."""
pass
"""Build final data files used by Algolia and the backend API."""
@build.command('algolia')
@@ -117,13 +180,13 @@ def algolia():
"""
Generates algolia.json, a all encompassing file for Algolia's search index.
"""
pass
files = list(filter(os.path.exists, [get_filepath(season, episode, 'processed') for season, episode in get_episodes()]))
@build.command('final')
def final():
"""Generates the latest application static data.json file, used by the backend API."""
pass
files = list(filter(os.path.exists, [get_filepath(season, episode, 'processed') for season, episode in get_episodes()]))
if __name__ == "__main__":

View File

@@ -2,10 +2,9 @@ import json
import os
import re
import time
import traceback
from collections import defaultdict
from math import ceil
from typing import Iterable, Tuple
from typing import Iterable, List, Tuple
import enlighten
import requests
@@ -32,16 +31,26 @@ def get_filepath(season: int, episode: int, folder: str) -> str:
return os.path.join(DATA_DIR, get_filename(season, episode, 'json'))
def load_file(filepath: str, parse_json: bool):
def load_file(filepath: str, json_decode: bool):
"""Shortcut function for loading file from filepath, with JSON parsing flag."""
if parse_json:
with open(filepath, 'r') as file:
if json_decode:
with open(filepath, 'r', encoding='utf-8') as file:
return json.load(file)
else:
with open(filepath, 'r') as file:
with open(filepath, 'r', encoding='utf-8') as file:
return file.read()
def save_file(filepath: str, data, json_encode: bool):
"""Shortcut function for saving data to a file, JSON encoding flag."""
if json_encode:
with open(filepath, 'w', encoding='utf-8') as file:
json.dump(data, file, ensure_ascii=False, indent=4)
else:
with open(filepath, 'w', encoding='utf-8') as file:
file.write(data)
def get_episodes(season: int = None) -> Iterable[Tuple[int, int]]:
"""
Yields a list of Episode & Season tuples.
@@ -90,26 +99,7 @@ def sleep_from(wait_time: float, moment: float, manager: enlighten.Manager = Non
return 0
def get_raw(season, episode):
html_filename = f'{season}-{str(episode).zfill(2)}.html'
html_filepath = os.path.join(DATA_DIR, 'html', html_filename)
# If .html file exists, read
if os.path.exists(html_filepath):
# print('Reading from disk...')
with open(html_filepath, 'r', encoding='utf-8') as file:
page_data = file.read()
# If not, write to disk for later usage
else:
link = f"http://officequotes.net/no{season}-{str(episode).zfill(2)}.php"
resp = session.get(link)
if resp.ok:
page_data = resp.text
with open(html_filepath, 'w', encoding='utf-8') as file:
file.write(page_data)
else:
raise Exception(f'HTTPError: {resp.status_code} at "{resp.url}"')
def preprocess(page_data: str) -> List[str]:
soup = BeautifulSoup(page_data, "html.parser")
data = []
@@ -117,31 +107,14 @@ def get_raw(season, episode):
for section in sections:
for br in section.find_all('br'):
br.replace_with("\n" + br.text)
for line in section.get_text().split('\n'):
data.append(line.strip())
data.append('-')
data.pop(-1)
with open(os.path.join(DATA_DIR, 'raw', f'{season}-{str(episode).zfill(2)}.txt'), 'w',
encoding='utf-8') as file:
file.write('\n'.join(data))
def episodes():
ep_nums = [6, 22, 23, 14, 26, 24, 24, 24, 23]
for season_num, ep_count in enumerate(ep_nums, start=1):
for episode_num in range(1, ep_count + 1):
yield season_num, episode_num
def download_all_raw():
for season_num, episode_num in episodes():
print(f'{season_num}-{str(episode_num).zfill(2)}')
try:
get_raw(season_num, episode_num)
except Exception as exception:
print(f'Failed to process Season {season_num} Episode {episode_num} - ({type(exception).__name__})')
traceback.print_exc()
return data
def process(season, episode):