finish preprocess, build algolia, build final commands, update README, remove obsolete code in process.py

2025-12-09 12:08:59 -06:00 · 2020-08-09 17:50:26 -05:00
parent d6e2ce1df4
commit 99b15168e1
6 changed files with 257 additions and 171 deletions
--- a/README.md
+++ b/README.md
@@ -46,10 +46,10 @@ The data has to be parsed, but due to high irregularity (at least too much for m
 inspected and manually processed.

 ```python server/cli.py preprocess
-    --season SEASON     Pre-processes all episodes from a specific season.
-    --episode EPISODE   Pre-processes a specific episode. Requires SEASON to be specified.
-    --all               Pre-processes all episodes from every season.
-    --overwrite         DANGER: Will overwrite files. May result in manually processed files to be lost forever.
+    -s --season SEASON     Pre-processes all episodes from a specific season.
+    -e --episode EPISODE   Pre-processes a specific episode. Requires SEASON to be specified.
+       --all               Pre-processes all episodes from every season.
+    -o --overwrite         DANGER: Will overwrite files. May result in manually processed files to be lost forever.
 ```

 From then on, once all files have been pre-processed, you will have to begin the long, annoying process of editing them into my custom format.
@@ -91,9 +91,9 @@ they are just the JSON format of the previous stage.

 ```
 python server/cli.py process
-    --season SEASON     Processes all episodes from a specific season.
-    --epsiode EPISODE   Processes a specific episode. Requires SEASON to be specified.
-    --all               Processes all episodes from all seasons.
+    -s --season SEASON     Processes all episodes from a specific season.
+    -e --episode EPISODE   Processes a specific episode. Requires SEASON to be specified.
+       --all               Processes all episodes from all seasons.
 ```

 Now that they're all in individual files, the final commands can be ran to compile them into one file, a static
@@ -109,6 +109,34 @@ Each command is ran with no special arguments (as of now), generating a `algolia

 This `data.json` file is loaded by the Flask server and the `algolia.json` can be uploaded to your primary index.

+For every command mentioned, you can read all arguments with `--help`:
+
+```
+$ python cli.py preprocess --help
+Usage: cli.py preprocess [OPTIONS]
+
+  Pre-processes raw HTML files into mangled custom quote data.
+
+  Custom quote data requires manual inspection and formatting, making it a
+  dangerous operation that may overwrite precious quote data.
+
+Options:
+  -s, --season INTEGER          Season to be fetched. Without --episode, will
+                                download all episodes in a season.
+
+  -e, --episode INTEGER         Specific episode to be fetched. Requires
+                                --season to be specified.
+
+  --all                         Fetch all episodes, regardless of previous
+                                specifications.
+
+  -o, --overwrite               Overwrite if a file already exists.
+  -ss, --silent-skip            Skip missing/existing files silently
+  -ssm, --silent-skip-missing   Skip missing files silently
+  -sse, --silent-skip-existing  Skip overwrite skips silently
+  --help                        Show this message and exit.
+```
+
 ## Setup

 This project was built on Python 3.7 and Node v12.18.3 / npm 6.14.6.
--- a/client/package-lock.json
+++ b/client/package-lock.json
@@ -1,6 +1,6 @@
 {
-  "name": "client",
-  "version": "0.1.0",
+  "name": "TheOfficeQuotes",
+  "version": "0.2.0",
  "lockfileVersion": 1,
  "requires": true,
  "dependencies": {
@@ -1171,6 +1171,32 @@
        "to-fast-properties": "^2.0.0"
      }
    },
+    "@fortawesome/fontawesome-common-types": {
+      "version": "0.2.30",
+      "resolved": "https://registry.npmjs.org/@fortawesome/fontawesome-common-types/-/fontawesome-common-types-0.2.30.tgz",
+      "integrity": "sha512-TsRwpTuKwFNiPhk1UfKgw7zNPeV5RhNp2Uw3pws+9gDAkPGKrtjR1y2lI3SYn7+YzyfuNknflpBA1LRKjt7hMg=="
+    },
+    "@fortawesome/fontawesome-svg-core": {
+      "version": "1.2.30",
+      "resolved": "https://registry.npmjs.org/@fortawesome/fontawesome-svg-core/-/fontawesome-svg-core-1.2.30.tgz",
+      "integrity": "sha512-E3sAXATKCSVnT17HYmZjjbcmwihrNOCkoU7dVMlasrcwiJAHxSKeZ+4WN5O+ElgO/FaYgJmASl8p9N7/B/RttA==",
+      "requires": {
+        "@fortawesome/fontawesome-common-types": "^0.2.30"
+      }
+    },
+    "@fortawesome/free-solid-svg-icons": {
+      "version": "5.14.0",
+      "resolved": "https://registry.npmjs.org/@fortawesome/free-solid-svg-icons/-/free-solid-svg-icons-5.14.0.tgz",
+      "integrity": "sha512-M933RDM8cecaKMWDSk3FRYdnzWGW7kBBlGNGfvqLVwcwhUPNj9gcw+xZMrqBdRqxnSXdl3zWzTCNNGEtFUq67Q==",
+      "requires": {
+        "@fortawesome/fontawesome-common-types": "^0.2.30"
+      }
+    },
+    "@fortawesome/vue-fontawesome": {
+      "version": "0.1.10",
+      "resolved": "https://registry.npmjs.org/@fortawesome/vue-fontawesome/-/vue-fontawesome-0.1.10.tgz",
+      "integrity": "sha512-b2+SLF31h32LSepVcXe+BQ63yvbq5qmTCy4KfFogCYm2bn68H5sDWUnX+U7MBqnM2aeEk9M7xSoqGnu+wSdY6w=="
+    },
    "@hapi/address": {
      "version": "2.1.4",
      "resolved": "https://registry.npm.taobao.org/@hapi/address/download/@hapi/address-2.1.4.tgz?cache=0&sync_timestamp=1593993773437&other_urls=https%3A%2F%2Fregistry.npm.taobao.org%2F%40hapi%2Faddress%2Fdownload%2F%40hapi%2Faddress-2.1.4.tgz",
--- a/client/package.json
+++ b/client/package.json
@@ -1,5 +1,5 @@
 {
-    "name": "The Office Quotes",
+    "name": "TheOfficeQuotes",
    "version": "0.2.0",
    "private": true,
    "scripts": {
@@ -8,6 +8,9 @@
        "lint": "vue-cli-service lint"
    },
    "dependencies": {
+        "@fortawesome/fontawesome-svg-core": "^1.2.30",
+        "@fortawesome/free-solid-svg-icons": "^5.14.0",
+        "@fortawesome/vue-fontawesome": "^0.1.10",
        "algoliasearch": "^4.3.1",
        "axios": ">=0.18.1",
        "bootstrap": "^4.3.1",
--- a/client/vue.config.js
+++ b/client/vue.config.js
@@ -0,0 +1,4 @@
+module.exports = {
+  indexPath: '../../dist/index.html', // NOTE(review): presumably resolved against Vue CLI's outputDir - confirm where it lands
+  assetsDir: '../../dist', // NOTE(review): presumably the same base as indexPath - confirm assets end up beside index.html
+};
--- a/server/cli.py
+++ b/server/cli.py
@@ -5,16 +5,20 @@ CLI entrypoint for fetching, processing and compiling quote data.
 """
 import logging
 import os
+import re
 import sys
 import time
-from typing import List, Tuple
+from typing import List, Tuple, Union

 import click
 import enlighten
 import requests
+from bs4 import BeautifulSoup

 sys.path[0] += '\\..'
-from server.process import get_episodes, get_filepath, sleep_from, verify_episode
+from server.process import DATA_DIR, get_characters, get_episodes, get_filepath, load_file, \
+    save_file, sleep_from, \
+    verify_episode

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger('cli')
@@ -24,7 +28,6 @@ manager = enlighten.get_manager()

@click.group()
 def cli():
-    """Base command group."""
    pass


@@ -90,9 +93,8 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
                    last_request = time.time()
                    if resp.ok:
                        # Write data to file
-                        with open(filepath, 'w', encoding='utf-8') as file:
-                            file.write(resp.text)
-                        logger.debug('Successfully fetched.')
+                        save_file(filepath, resp.text, False)
+                        logger.debug('Successfully fetched & saved.')
                    else:
                        logger.error(f'Fetching failed. Erroneous response code {resp.status_code}.')
                pbar.update()
@@ -105,15 +107,18 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
-@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
-@click.option('-d', '--dry-run', is_flag=True)
-def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool, dry_run: bool):
+@click.option('-ss', '--silent-skip', is_flag=True, help='Skip missing/existing files silently')
+@click.option('-ssm', '--silent-skip-missing', is_flag=True, help='Skip missing files silently')
+@click.option('-sse', '--silent-skip-existing', is_flag=True, help='Skip overwrite skips silently')
+def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool, silent_skip_missing: bool,
+               silent_skip_existing: bool):
    """
    Pre-processes raw HTML files into mangled custom quote data.

    Custom quote data requires manual inspection and formatting, making it a dangerous operation that may overwrite
    precious quote data.
    """
+    # The option handling below selects which (season, episode) pairs get pre-processed.
    episodes: List[Tuple[int, int]]

    if all:
@@ -134,6 +139,38 @@ def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_ski
        else:
            logger.info('You must specify which episodes to pre-process.')
        logger.info('Check --help for more information on this command.')
+        return
+
+    for season, episode in episodes:
+        # Overwrite protection: an existing raw file is only replaced when --overwrite was given.
+        save_path = get_filepath(season, episode, 'raw')
+        if os.path.exists(save_path) and not overwrite:
+            if not (silent_skip or silent_skip_existing):
+                logger.info(f'Skipping Season {season}, Episode {episode}, file already exists. Skipping processing.')
+            continue
+
+        try:
+            page_data = load_file(get_filepath(season, episode, 'html'), False)
+        except FileNotFoundError:
+            if not (silent_skip or silent_skip_missing):
+                logger.warning(f'No data for Season {season}, Episode {episode} available. Skipping processing.')
+        else:
+            soup = BeautifulSoup(page_data, "html.parser")
+            data = []
+
+            sections = soup.find_all(attrs={"class": "quote"})
+            for section in sections:
+                for br in section.find_all('br'):
+                    br.replace_with("\n" + br.text)
+
+                for line in section.get_text().split('\n'):
+                    data.append(line.strip())
+
+                data.append('-')
+            data = data[:-1]  # drop the trailing '-' separator; a page without quote sections yields an empty file
+
+            data = '\n'.join(data)
+            save_file(save_path, data, False)


@cli.command('process')
@@ -141,9 +178,8 @@ def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_ski
              help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
-@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
-@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
-def process(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool):
+@click.option('-r', '--report', is_flag=True, help='Report quote statistics once processing completed')
+def process(season: int, episode: int, all: bool, report: bool):
    """
    Processes manually processed raw quote data into JSON.
    """
@@ -169,24 +205,138 @@ def process(season: int, episode: int, all: bool, overwrite: bool, silent_skip:
        logger.info('Check --help for more information on this command.')
        return

+    quote: Union[str, List[str]]
+    section_num: int
+    for _season, _episode in episodes:
+        sections, quote, section_num = [], '', 0  # reset per episode so the error report never sees unbound/stale values
+        try:
+            preprocessed_data = load_file(get_filepath(_season, _episode, 'raw'))
+            for section_num, raw_section in enumerate(re.split('^-', preprocessed_data, flags=re.MULTILINE), start=1):
+                section = {
+                    'quotes': []
+                }
+
+                section_data = list(raw_section.strip().split('\n'))
+                if section_data[0].startswith('!'):
+                    section['deleted'] = int(re.search(r'!(\d+)', section_data.pop(0)).group(1))
+
+                for quote in section_data:
+                    quote = quote.split('|', 1)
+                    section['quotes'].append(
+                        {
+                            'speaker': quote[0],
+                            'text': quote[1]
+                        }
+                    )
+                sections.append(section)
+        except FileNotFoundError:
+            logger.info(f'Skipped Season {_season}, Episode {_episode}, no file found.')
+        except Exception:
+            logger.exception(f'Skipped Season {_season}, Episode {_episode}: Malformed data.')
+            logger.info(
+                f'Last quote seen "{quote if isinstance(quote, str) else "|".join(quote)}" in section {section_num}')
+        else:
+            # Save processed data (only reached when the whole episode parsed cleanly)
+            save_file(get_filepath(_season, _episode, 'processed'), sections, True)
+
+        if report:
+            deleted_count = [0, set()]
+            quote_count = 0
+            speakers = set()
+
+            for section in sections:
+                quote_count += len(section['quotes'])
+
+                if 'deleted' in section:
+                    deleted_count[0] += 1
+                    deleted_count[1].add(section['deleted'])
+
+                for quote in section['quotes']:
+                    speakers.add(quote['speaker'])
+
+            logger.info(f'{quote_count} quotes.')
+            logger.info(f'{deleted_count[0]} different deleted sections, {len(deleted_count[1])} unique.')
+            logger.info(f'{len(speakers)} Speakers:')
+            logger.info(', '.join(sorted(speakers)))
+
+

@cli.group('build')
 def build():
    """Build final data files used by Algolia and the backend API."""
+    pass  # explicit no-op body: the group only hosts the 'algolia' and 'final' subcommands


@build.command('algolia')
-def algolia():
+@click.option('-ss', '--silent-skip', is_flag=True, help='Skip missing files silently')
+@click.option('--process', is_flag=True, help='Run processing before building final data.')
+def algolia(silent_skip: bool, process: bool = False):  # --process must be bound for click; it is not acted on yet
    """
    Generates algolia.json, a all encompassing file for Algolia's search index.
    """
-    files = list(filter(os.path.exists, [get_filepath(season, episode, 'processed') for season, episode in get_episodes()]))
+    data = []
+    episode_num_abs, section_num_abs, quote_num_abs = 0, 0, 0
+    for season, episode in get_episodes():
+        episode_num_abs += 1
+        try:
+            episode_data = load_file(get_filepath(season, episode, 'processed'), True)
+        except FileNotFoundError:
+            if not silent_skip:
+                logger.warning(f'Skipping Season {season}, Episode {episode}. No episode data file found.')
+        else:
+            for section_num_rel, section in enumerate(episode_data, start=1):
+                section_num_abs += 1
+                for quote_num_rel, quote in enumerate(section['quotes'], start=1):
+                    quote_num_abs += 1
+
+                    # Relative position: quote within its section, section within its episode, episode within its season
+                    quote['quote_rel'] = quote_num_rel
+                    quote['section_rel'] = section_num_rel
+                    quote['episode_rel'] = episode
+                    # Absolute position: running counters; episode_abs also counts episodes skipped for missing data
+                    quote['quote_abs'] = quote_num_abs
+                    quote['section_abs'] = section_num_abs
+                    quote['episode_abs'] = episode_num_abs
+
+                    quote['season'] = season
+
+                    quote['is_deleted'] = 'deleted' in section
+                    quote['deleted_section'] = section.get('deleted')
+
+                    data.append(quote)
+
+    logger.info(f'Saving {len(data):,} quotes to algolia.json')
+    save_file(os.path.join(DATA_DIR, 'algolia.json'), data, True)


@build.command('final')
-def final():
+@click.option('-ss', '--silent-skip', is_flag=True, help='Skip missing files silently')
+@click.option('--process', is_flag=True, help='Run processing before building final data.')
+def final(silent_skip: bool, process: bool = False):  # --process must be bound for click; it is not acted on yet
    """Generates the latest application static data.json file, used by the backend API."""
-    files = list(filter(os.path.exists, [get_filepath(season, episode, 'processed') for season, episode in get_episodes()]))
+    descriptions = load_file(os.path.join(DATA_DIR, 'descriptions.json'), True)
+    seasons = [{'season_id': season, 'episodes': []} for season in range(1, 10)]
+    for season_id, episode_id in get_episodes():
+        # Load processed scenes; a missing file keeps the episode entry but stores null scenes
+        try:
+            episode_data = load_file(get_filepath(season_id, episode_id, 'processed'), True)
+        except FileNotFoundError:
+            if not silent_skip:
+                logger.warning(f'No data for Season {season_id}, Episode {episode_id} available. Null data inserted.')
+            episode_data = None
+
+        description = descriptions[season_id - 1][episode_id - 1]
+        seasons[season_id - 1]['episodes'].append(
+            {
+                'title': description['title'].strip(),
+                'description': description['description'].strip(),
+                'episode_id': episode_id,
+                'characters': get_characters(season_id, episode_id),
+                'scenes': episode_data
+            }
+        )
+
+    logger.info('Saving to data.json')
+    save_file(os.path.join(DATA_DIR, 'data.json'), seasons, True)


 if __name__ == "__main__":
--- a/server/process.py
+++ b/server/process.py
@@ -1,14 +1,18 @@
+"""
+process.py
+
+Functions and shortcuts for loading/saving/extracting data for processing quote data.
+"""
+
 import json
 import os
-import re
 import time
 from collections import defaultdict
 from math import ceil
-from typing import Iterable, List, Tuple
+from typing import Dict, Iterable, List, Tuple, Union

 import enlighten
 import requests
-from bs4 import BeautifulSoup

 session = requests.Session()

@@ -31,7 +35,7 @@ def get_filepath(season: int, episode: int, folder: str) -> str:
    return os.path.join(DATA_DIR, get_filename(season, episode, 'json'))


-def load_file(filepath: str, json_decode: bool):
+def load_file(filepath: str, json_decode: bool = False):
    """Shortcut function for loading file from filepath, with JSON parsing flag."""
    if json_decode:
        with open(filepath, 'r', encoding='utf-8') as file:
@@ -68,7 +72,7 @@ def get_episodes(season: int = None) -> Iterable[Tuple[int, int]]:

 def verify_episode(season: int, episode: int = None) -> bool:
    """
-    Verifies that a Season or Season + Episode is valid.
+    Verifies that a specific Season, or Season + Episode when an episode is given, is valid.
    """
    return 1 <= season <= 9 and (episode is None or 1 <= episode <= episode_counts[season])

@@ -99,149 +103,20 @@ def sleep_from(wait_time: float, moment: float, manager: enlighten.Manager = Non
        return 0


-def preprocess(page_data: str) -> List[str]:
-    soup = BeautifulSoup(page_data, "html.parser")
-
-    data = []
-    sections = soup.find_all(attrs={"class": "quote"})
-    for section in sections:
-        for br in section.find_all('br'):
-            br.replace_with("\n" + br.text)
-
-        for line in section.get_text().split('\n'):
-            data.append(line.strip())
-
-        data.append('-')
-    data.pop(-1)
-
-    return data
-
-
-def process(season, episode):
-    with open(os.path.join(DATA_DIR, 'raw', f'{season}-{str(episode).zfill(2)}.txt'), 'r',
-              encoding='utf-8') as file:
-
-        sections = []
-        for s in re.split('^-', file.read(), flags=re.MULTILINE):
-            section = {
-                'quotes': []
-            }
-
-            section_data = list(s.strip().split('\n'))
-            if section_data[0].startswith('!'):
-                section['deleted'] = int(re.search('!(\d+)', section_data.pop(0)).group(1))
-
-            for q in section_data:
-                quote = q.split('|', 1)
-                print(quote)
-                section['quotes'].append(
-                    {
-                        'speaker': quote[0],
-                        'text': quote[1]
-                    }
-                )
-            sections.append(section)
-
-        with open(os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json'), 'w',
-                  encoding='utf-8') as file:
-            json.dump(sections, file, indent=4, ensure_ascii=False)
-
-        deleted_count = [0, set()]
-        quote_count = 0
-        speakers = set()
-
-        for section in sections:
-            quote_count += len(section['quotes'])
-
-            if 'deleted' in section.keys():
-                deleted_count[0] += 1
-                deleted_count[1].add(section['deleted'])
-
-            for quote in section['quotes']:
-                speakers.add(quote['speaker'])
-
-        print(f'{quote_count} quotes.')
-        print(f'{deleted_count[0]} different deleted sections, {len(deleted_count[1])} unique.')
-        print(f'{len(speakers)} Speakers:')
-        print(', '.join(speakers))
-
-
-def generate_algolia():
-    data = []
-    quote_num = 0
-    for season, episode in episodes():
-        try:
-            with open(os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json'), 'r',
-                      encoding='utf-8') as file:
-                episode_data = json.load(file)
-        except FileNotFoundError:
-            print(f'No JSON data for Season {season} Episode {episode}')
-        else:
-            for section_num, section in enumerate(episode_data, start=1):
-                for quote in section['quotes']:
-                    quote_num += 1
-                    quote['quote'] = quote_num
-                    quote['section'] = section_num
-                    quote['episode'] = episode
-                    quote['season'] = season
-
-                    quote['is_deleted'] = 'deleted' in section.keys()
-                    quote['deleted_section'] = section.get('deleted')
-
-                    data.append(quote)
-
-    with open(os.path.join(DATA_DIR, 'algolia.json'), 'w', encoding='utf-8') as file:
-        json.dump(data, file, ensure_ascii=False, indent=4)
-
-
-def get_episode_scenes(season, episode):
-    filepath = os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json')
-    if os.path.exists(filepath):
-        with open(filepath, 'r', encoding='utf-8') as file:
-            return json.load(file)
-    else:
-        return None
-
-
-def get_characters(season, episode):
-    scenes = get_episode_scenes(season, episode)
-    if scenes is None:
-        return None
+def get_characters(season, episode) -> List[Dict[str, Union[int, str]]]:
+    """
+    Extracts all characters and their number of appearances from a specific episode.
+    Returns a list of dicts ('name', 'appearances', 'id'), most appearances first; empty if no processed file exists.
+    """
+    filepath = get_filepath(season, episode, 'processed')
+    if not os.path.exists(filepath):
+        return []
+    scenes = load_file(filepath, True)

    characters = defaultdict(int)
    for scene in scenes:
-        for quote in scene['quotes']:
-            characters[quote['speaker']] += 1
+        for quote in scene.get('quotes', []):
+            characters[quote.get('speaker')] += 1  # NOTE(review): a missing 'speaker' adds a None key that split() below rejects
    characters = [{'name': character, 'appearances': appearances, 'id': '-'.join(character.split(' ')).lower()}
                  for character, appearances in characters.items()]
    return list(sorted(characters, key=lambda item: item['appearances'], reverse=True))
-
-
-def generate_final():
-    """Merge episode descriptions/titles and quotes into final JSON file."""
-    with open(os.path.join(DATA_DIR, 'descriptions.json'), 'r', encoding='utf-8') as file:
-        data = json.load(file)
-
-    output = []
-    for season_id, season in enumerate(data, start=1):
-        output.append({
-            'season_id': season_id,
-            'episodes': [
-                {
-                    'title': episode['title'].strip(),
-                    'description': episode['description'].strip(),
-                    'episode_id': episode_id,
-                    'characters': get_characters(season_id, episode_id),
-                    'scenes': get_episode_scenes(season_id, episode_id)
-                }
-                for episode_id, episode in enumerate(season, start=1)
-            ]
-        })
-
-    with open(os.path.join(DATA_DIR, 'data.json'), 'w', encoding='utf-8') as file:
-        json.dump(output, file, ensure_ascii=False, indent=4)
-
-
-# generate_algolia()
-# process(3, 10)
-generate_final()