Commit latest changes to server-side processing system

This commit is contained in:
Xevion
2022-05-08 19:49:18 -05:00
parent 698beb5943
commit cdc409b9a0
4 changed files with 209 additions and 24 deletions

View File

@@ -8,8 +8,9 @@ import json
import os
from copy import deepcopy
# from flask_caching import cache
import flask_wtf
from flask import current_app, jsonify, request
from flask import current_app, jsonify, request, send_from_directory
from server.helpers import default, get_neighbors
@@ -120,3 +121,8 @@ def api_character_quotes(character: str):
return jsonify(quotes[index: index + 10])
else:
return jsonify(quotes)
@current_app.route('/static/img/<path:filename>')
def custom_static(filename):
    """Serve image files for /static/img/ from the server's data directory."""
    # Delegate to Flask's path-safe file server rather than the default static folder.
    image_directory = './data/img/'
    return send_from_directory(image_directory, filename)

View File

@@ -8,17 +8,18 @@ import os
import re
import sys
import time
from collections import defaultdict
from collections import OrderedDict, defaultdict
from pprint import pprint
from typing import List, Optional, Tuple, Union
from typing import Dict, List, Optional, Tuple, Union
import click
import enlighten
import requests
from bs4 import BeautifulSoup
from lxml import etree
sys.path[0] += '\\..'
from server.helpers import algolia_transform, character_id
from server.helpers import algolia_transform, character_id, clean_string
from server.process import DATA_DIR, get_appearances, get_episodes, get_filepath, load_file, \
save_file, sleep_from, \
verify_episode
@@ -42,7 +43,7 @@ def misc():
@misc.command('characters')
@click.option('-s', '--season', type=int, help='Season to be processed for character names')
@click.option('-e', '--episode', type=int, help='Episode to be processed. Requires --season to be specified.')
@click.option('--all', is_flag=True, help='Process all episodes, regardless of previous specifications.')
@click.option('-a', '--all', is_flag=True, help='Process all episodes, regardless of previous specifications.')
@click.option('-i', '--individual', is_flag=True,
help='List characters from individual episodes instead of just compiling a masterlist')
def characters(season: int, episode: int, all: bool, individual: bool):
@@ -98,7 +99,7 @@ def characters(season: int, episode: int, all: bool, individual: bool):
help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('-d', '--delay', type=float, default=0.5, help='Delay between each request')
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-a', '--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, silent_skip: bool):
@@ -136,7 +137,6 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
with enlighten.Manager() as manager:
with manager.counter(total=len(episodes), desc='Fetching...', unit='episodes') as pbar:
for _season, _episode in episodes:
filepath = get_filepath(_season, _episode, 'html')
# Check if HTML file exists
@@ -167,7 +167,7 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
@click.option('-s', '--season', type=int,
help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-a', '--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip missing/existing files silently')
@click.option('-ssm', '--silent-skip-missing', is_flag=True, help='Skip missing files silently')
@@ -239,7 +239,7 @@ def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_ski
@click.option('-s', '--season', type=int,
help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('--all', 'all_', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-a', '--all', 'all_', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-r', '--report', is_flag=True, help='Report quote statistics once processing completed.')
def process(season: Optional[int], episode: Optional[int], all_: bool, report: bool):
"""
@@ -267,6 +267,9 @@ def process(season: Optional[int], episode: Optional[int], all_: bool, report: b
logger.info('Check --help for more information on this command.')
return
speakers: Dict = load_file(os.path.join(DATA_DIR, 'speakers.json'), True)
speakers = {original: new for original, new in speakers.items() if original != new and type(new) == str}
quote: Union[str, List[str]]
section_num: int
for _season, _episode in episodes:
@@ -284,10 +287,11 @@ def process(season: Optional[int], episode: Optional[int], all_: bool, report: b
for quote in section_data:
quote = quote.split('|', 1)
section['quotes'].append(
{
'speaker': quote[0],
'text': quote[1]
'speaker': clean_string(speakers.get(quote[0], quote[0])),
'text': clean_string(quote[1])
}
)
sections.append(section)
@@ -324,6 +328,81 @@ def process(season: Optional[int], episode: Optional[int], all_: bool, report: b
logger.info(', '.join(speakers))
@cli.command('xml')
@click.option('-s', '--season', type=int, help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('-a', '--all', 'all_', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-r', '--report', is_flag=True, help='Report quote statistics once processing completed.')
def xml(season: Optional[int], episode: Optional[int], all_: bool, report: bool):
    """
    Converts processed JSON quote data into per-episode XML files.

    Builds a <SceneList> document (one <Scene> per section, one <Characters>
    element per quote) from each episode's 'processed' JSON and writes it to
    the episode's 'xml' filepath. Episodes without a processed file are skipped.
    """
    # NOTE(review): `report` is currently unused — TODO confirm intended statistics output.
    episodes: List[Tuple[int, int]]
    if all_:
        episodes = list(get_episodes())
    elif season:
        if episode:
            if verify_episode(season, episode):
                episodes = [(season, episode)]
            else:
                logger.error(f'Season {season}, Episode {episode} is not a valid combination.')
                return
        else:
            episodes = list(get_episodes(season=season))
            logger.info(f'Processing Season {season}...')
    else:
        if episode:
            logger.info('You must specify more than just an episode.')
        else:
            logger.info('You must specify which episodes to process.')
        logger.info('Check --help for more information on this command.')
        return
    for _season, _episode in episodes:
        # Keep the try minimal: only the load can raise FileNotFoundError.
        try:
            processed_data = load_file(get_filepath(_season, _episode, 'processed'), True)
        except FileNotFoundError:
            logger.info(f'Skipped Season {_season}, Episode {_episode}, no file found.')
            continue
        rootElement = etree.Element('SceneList')
        for scene in processed_data:
            sceneElement = etree.Element('Scene')
            for quote in scene['quotes']:
                charactersElement = etree.Element('Characters')
                sceneElement.append(charactersElement)
            rootElement.append(sceneElement)
        # BUG FIX: save_file was called with only the path — the built tree was
        # never serialized or written. Serialize the XML and pass it as the data.
        save_file(get_filepath(_season, _episode, 'xml'),
                  etree.tostring(rootElement, pretty_print=True).decode('utf-8'))
@cli.command('truth')
def truth():
    """Modify"""
    # NOTE(review): empty stub command — the docstring says only "Modify" and
    # there is no body; intended behavior cannot be determined from this file.
    # TODO: implement or remove.
@cli.command('characters')
def characters():
    """Collects all characters from every single processed JSON file."""
    # Ordered mapping used as an insertion-ordered set of speaker names.
    seen_speakers = OrderedDict()
    for _season, _episode in list(get_episodes()):
        try:
            scenes = load_file(get_filepath(_season, _episode, 'processed'), True)
        except FileNotFoundError:
            logger.warning(f"Skipped {_season}-{_episode}, no file found.")
            continue
        for scene in scenes:
            for quote in scene['quotes']:
                seen_speakers[quote['speaker']] = None
    # Identity mapping (name -> name), alphabetically sorted, ready for manual editing.
    speaker_data = OrderedDict((name, name) for name in sorted(seen_speakers.keys()))
    print(f'{len(speaker_data)} speakers identified.')
    pprint(list(speaker_data.keys()))
    save_file(os.path.join(DATA_DIR, 'speakers.json'), speaker_data, True)
@cli.group('build')
def build():
    """Build final data files used by Algolia and the backend API."""
    # Click group: serves only as a namespace for 'build' subcommands;
    # an empty body is intentional.

15
server/data.py Normal file
View File

@@ -0,0 +1,15 @@
"""
data.py
Manages API quote/character data, caching static responses and reloading from disk.
"""
import os
import json
# Absolute directory of this module, so data paths resolve regardless of the
# process's working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Quote data, loaded once at import time and held as module-level state.
with open(os.path.join(BASE_DIR, 'data', 'data.json'), 'r', encoding='utf-8') as file:
    data = json.load(file)
# Character metadata, loaded once at import time.
with open(os.path.join(BASE_DIR, 'data', 'characters.json'), 'r', encoding='utf-8') as file:
    character_data = json.load(file)

View File

@@ -3,8 +3,16 @@ helpers.py
"""
import random
import re
import string
import unicodedata
from collections import OrderedDict
from difflib import SequenceMatcher
from heapq import nlargest as _nlargest
from typing import List, Optional, Tuple
from typing import List, Tuple, Optional
import unidecode
episode_counts = [6, 22, 23, 14, 26, 24, 24, 24, 23]
@@ -55,5 +63,82 @@ def algolia_transform(old_dictionary: dict, key_list: List[Tuple[str, Optional[s
def is_main_character(name: str) -> bool:
    # NOTE(review): stub — annotated to return bool but always returns None
    # (falsy). TODO: implement the check or correct the annotation.
    return None
def character_id(name: str) -> str:
    """Build a URL-style identifier from a name: spaces to hyphens, lowercased."""
    return name.replace(' ', '-').lower()
# Character pool for generated identifiers: A-Z, a-z, 0-9.
alphabet: str = string.ascii_letters + string.digits
def random_id(length: int = 8) -> str:
    """Return a random identifier of ``length`` alphanumeric characters."""
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)
def char_filter(string):
    """Yield characters of ``string``, keeping those whose ASCII
    transliteration is Latin and substituting the transliteration otherwise."""
    latin_pattern = re.compile('[a-zA-Z]+')
    for original_char in unicodedata.normalize('NFC', string):
        ascii_form = unidecode.unidecode(original_char)
        # Keep the original character when it transliterates to Latin letters;
        # otherwise emit the ASCII transliteration itself.
        yield original_char if latin_pattern.match(ascii_form) else ascii_form
def clean_string(string):
    """Return ``string`` with non-Latin characters transliterated via char_filter."""
    filtered_chars = char_filter(string)
    return "".join(filtered_chars)
def get_close_matches_indexes(word, possibilities, n=3, cutoff=0.6):
    """Return the indexes of the best "good enough" matches for *word*.

    Index-returning variant of ``difflib.get_close_matches``.

    word: a sequence (typically a string) for which close matches are desired.
    possibilities: a list of sequences to compare against *word*.
    n: maximum number of matches to return; must be > 0.
    cutoff: similarity threshold in [0, 1]; lower-scoring candidates are dropped.
    """
    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    matcher = SequenceMatcher()
    matcher.set_seq2(word)
    scored = []
    for index, candidate in enumerate(possibilities):
        matcher.set_seq1(candidate)
        # Cheap upper-bound ratios first; compute the full ratio only if they pass.
        if (matcher.real_quick_ratio() >= cutoff
                and matcher.quick_ratio() >= cutoff
                and matcher.ratio() >= cutoff):
            scored.append((matcher.ratio(), index))
    # Keep the n highest-scoring entries, then strip the scores.
    best = _nlargest(n, scored)
    return [index for _score, index in best]
def marked_item_merge(keys: List[str], values: List[int]) -> Tuple[List[str], List[str]]:
    """Add the values of identical keys together, then return both the keys and values.

    Keys that occurred more than once are tracked internally as negative totals
    and rendered as '<total>*' in the returned values; unique keys keep their
    plain stringified value. Key order is first-occurrence order.

    NOTE: for non-empty input the returned keys are a tuple (from zip), despite
    the List annotation — preserved for backward compatibility.
    """
    merge: OrderedDict = OrderedDict()
    for key, value in zip(keys, values):
        # Already inserted, now make/keep it negative
        if key in merge:
            # Keys that haven't been turned over need to be made negative
            if merge[key] > 0:
                merge[key] = -merge[key]
            # And then subtract the value in all cases (keeps the total negative)
            merge[key] -= value
        else:
            # Values that are positive didn't merge with other counts.
            merge[key] = value
    # BUG FIX: zip(*merge.items()) raises ValueError on empty input; return
    # empty keys/values instead of crashing.
    if not merge:
        return [], []
    keys, values = zip(*merge.items())
    # Negative totals were merged: mark them with a trailing '*'.
    values = [f'{-value}*' if value < 0 else str(value) for value in values]
    return keys, values