diff --git a/server/api.py b/server/api.py index c36e4d8..b963d73 100644 --- a/server/api.py +++ b/server/api.py @@ -8,8 +8,9 @@ import json import os from copy import deepcopy +# from flask_caching import cache import flask_wtf -from flask import current_app, jsonify, request +from flask import current_app, jsonify, request, send_from_directory from server.helpers import default, get_neighbors @@ -120,3 +121,8 @@ def api_character_quotes(character: str): return jsonify(quotes[index: index + 10]) else: return jsonify(quotes) + + +@current_app.route('/static/img/<filename>') +def custom_static(filename): + return send_from_directory('./data/img/', filename) diff --git a/server/cli.py b/server/cli.py index 67253b8..39fc16e 100644 --- a/server/cli.py +++ b/server/cli.py @@ -8,17 +8,18 @@ import os import re import sys import time -from collections import defaultdict +from collections import OrderedDict, defaultdict from pprint import pprint -from typing import List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import click import enlighten import requests from bs4 import BeautifulSoup +from lxml import etree sys.path[0] += '\\..' -from server.helpers import algolia_transform, character_id +from server.helpers import algolia_transform, character_id, clean_string from server.process import DATA_DIR, get_appearances, get_episodes, get_filepath, load_file, \ save_file, sleep_from, \ verify_episode @@ -42,7 +43,7 @@ def misc(): @misc.command('characters') @click.option('-s', '--season', type=int, help='Season to be processed for character names') @click.option('-e', '--episode', type=int, help='Episode to be processed. 
Requires --season to be specified.') -@click.option('--all', is_flag=True, help='Process all episodes, regardless of previous specifications.') +@click.option('-a', '--all', is_flag=True, help='Process all episodes, regardless of previous specifications.') @click.option('-i', '--individual', is_flag=True, help='List characters from individual episodes instead of just compiling a masterlist') def characters(season: int, episode: int, all: bool, individual: bool): @@ -90,7 +91,7 @@ def characters(season: int, episode: int, all: bool, individual: bool): # print(master) logger.info( - ', '.join(item['name'] for item in sorted(master.values(), reverse=True, key=lambda item: item['appearances']))) + ', '.join(item['name'] for item in sorted(master.values(), reverse=True, key=lambda item: item['appearances']))) @cli.command('fetch') @@ -98,7 +99,7 @@ def characters(season: int, episode: int, all: bool, individual: bool): help='Season to be fetched. Without --episode, will download all episodes in a season.') @click.option('-e', '--episode', type=int, help='Specific episode to be fetched. 
Requires --season to be specified.') @click.option('-d', '--delay', type=float, default=0.5, help='Delay between each request') -@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.') +@click.option('-a', '--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.') @click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.') @click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently') def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, silent_skip: bool): @@ -136,7 +137,6 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s with enlighten.Manager() as manager: with manager.counter(total=len(episodes), desc='Fetching...', unit='episodes') as pbar: for _season, _episode in episodes: - filepath = get_filepath(_season, _episode, 'html') # Check if HTML file exists @@ -167,7 +167,7 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s @click.option('-s', '--season', type=int, help='Season to be fetched. Without --episode, will download all episodes in a season.') @click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.') -@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.') +@click.option('-a', '--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.') @click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.') @click.option('-ss', '--silent-skip', is_flag=True, help='Skip missing/existing files silently') @click.option('-ssm', '--silent-skip-missing', is_flag=True, help='Skip missing files silently') @@ -239,7 +239,7 @@ def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_ski @click.option('-s', '--season', type=int, help='Season to be fetched. 
Without --episode, will download all episodes in a season.') @click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.') -@click.option('--all', 'all_', is_flag=True, help='Fetch all episodes, regardless of previous specifications.') +@click.option('-a', '--all', 'all_', is_flag=True, help='Fetch all episodes, regardless of previous specifications.') @click.option('-r', '--report', is_flag=True, help='Report quote statistics once processing completed.') def process(season: Optional[int], episode: Optional[int], all_: bool, report: bool): """ @@ -267,6 +267,9 @@ def process(season: Optional[int], episode: Optional[int], all_: bool, report: b logger.info('Check --help for more information on this command.') return + speakers: Dict = load_file(os.path.join(DATA_DIR, 'speakers.json'), True) + speakers = {original: new for original, new in speakers.items() if original != new and type(new) == str} + quote: Union[str, List[str]] section_num: int for _season, _episode in episodes: @@ -284,11 +287,12 @@ def process(season: Optional[int], episode: Optional[int], all_: bool, report: b for quote in section_data: quote = quote.split('|', 1) + section['quotes'].append( - { - 'speaker': quote[0], - 'text': quote[1] - } + { + 'speaker': clean_string(speakers.get(quote[0], quote[0])), + 'text': clean_string(quote[1]) + } ) sections.append(section) except FileNotFoundError: @@ -298,7 +302,7 @@ def process(season: Optional[int], episode: Optional[int], all_: bool, report: b logger.exception(f'Skipped Season {_season}, Episode {_episode}: Malformed data.') if quote: logger.info( - f'Last quote seen "{quote if type(quote) is str else "|".join(quote)}" in section {section_num}') + f'Last quote seen "{quote if type(quote) is str else "|".join(quote)}" in section {section_num}') else: # Save processed data save_file(get_filepath(_season, _episode, 'processed'), sections, True) @@ -324,6 +328,81 @@ def process(season: Optional[int], 
episode: Optional[int], all_: bool, report: b logger.info(', '.join(speakers)) +@cli.command('xml') +@click.option('-s', '--season', type=int, help='Season to be fetched. Without --episode, will download all episodes in a season.') +@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.') +@click.option('-a', '--all', 'all_', is_flag=True, help='Fetch all episodes, regardless of previous specifications.') +@click.option('-r', '--report', is_flag=True, help='Report quote statistics once processing completed.') +def xml(season: Optional[int], episode: Optional[int], all_: bool, report: bool): + """ + Converts processed episode quote data into XML. + """ + episodes: List[Tuple[int, int]] + + if all_: + episodes = list(get_episodes()) + elif season: + if episode: + if verify_episode(season, episode): + episodes = [(season, episode)] + else: + logger.error(f'Season {season}, Episode {episode} is not a valid combination.') + return + else: + episodes = list(get_episodes(season=season)) + logger.info(f'Processing Season {season}...') + else: + if episode: + logger.info('You must specify more than just an episode.') + else: + logger.info('You must specify which episodes to process.') + logger.info('Check --help for more information on this command.') + return + + for _season, _episode in episodes: + try: + processed_data = load_file(get_filepath(_season, _episode, 'processed'), True) + rootElement = etree.Element('SceneList') + for scene in processed_data: + sceneElement = etree.Element('Scene') + for quote in scene['quotes']: + charactersElement = etree.Element('Characters') + sceneElement.append(charactersElement) + + rootElement.append(sceneElement) + + save_file(get_filepath(_season, _episode, 'xml'), etree.tostring(rootElement).decode()) + except FileNotFoundError: + logger.info(f'Skipped Season {_season}, Episode {_episode}, no file found.') + continue + +@cli.command('truth') +def truth(): + """Modify ground-truth data (stub; not yet implemented).""" + + +@cli.command('characters') 
+def characters(): + """Collects all characters from every single processed JSON file.""" + episodes = list(get_episodes()) + speakersList = OrderedDict() + + for _season, _episode in episodes: + try: + processed_data = load_file(get_filepath(_season, _episode, 'processed'), True) + for scene in processed_data: + for quote in scene['quotes']: + speakersList[quote['speaker']] = None + except FileNotFoundError: + logger.warning(f"Skipped {_season}-{_episode}, no file found.") + + speaker_data = OrderedDict([(item, item) for item in sorted(speakersList.keys())]) + print(f'{len(speaker_data)} speakers identified.') + + pprint(list(speaker_data.keys())) + save_file(os.path.join(DATA_DIR, 'speakers.json'), speaker_data, True) + + @cli.group('build') def build(): """Build final data files used by Algolia and the backend API.""" @@ -437,18 +516,18 @@ def final(silent_skip: bool, process_: bool): except FileNotFoundError: if not silent_skip: logger.warning( - f'No data for Season {season_id}, Episode {episode_id} available. Null data inserted.') + f'No data for Season {season_id}, Episode {episode_id} available. Null data inserted.') episode_data = None description = descriptions[season_id - 1][episode_id - 1] seasons[season_id - 1]['episodes'].append( - { - 'title': description['title'].strip(), - 'description': description['description'].strip(), - 'episode_id': episode_id, - 'characters': get_appearances(season_id, episode_id), - 'scenes': episode_data - } + { + 'title': description['title'].strip(), + 'description': description['description'].strip(), + 'episode_id': episode_id, + 'characters': get_appearances(season_id, episode_id), + 'scenes': episode_data + } ) logger.info('Saving to data.json') diff --git a/server/data.py b/server/data.py new file mode 100644 index 0000000..267ef7e --- /dev/null +++ b/server/data.py @@ -0,0 +1,15 @@ +""" +data.py + +Manages API quote/character data, caching static responses and reloading from disk. 
+""" +import os +import json + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +with open(os.path.join(BASE_DIR, 'data', 'data.json'), 'r', encoding='utf-8') as file: + data = json.load(file) + +with open(os.path.join(BASE_DIR, 'data', 'characters.json'), 'r', encoding='utf-8') as file: + character_data = json.load(file) + diff --git a/server/helpers.py b/server/helpers.py index 0b6109f..3d91174 100644 --- a/server/helpers.py +++ b/server/helpers.py @@ -3,8 +3,16 @@ helpers.py """ +import random +import re +import string +import unicodedata +from collections import OrderedDict +from difflib import SequenceMatcher +from heapq import nlargest as _nlargest +from typing import List, Optional, Tuple -from typing import List, Tuple, Optional +import unidecode episode_counts = [6, 22, 23, 14, 26, 24, 24, 24, 23] @@ -55,5 +63,82 @@ def algolia_transform(old_dictionary: dict, key_list: List[Tuple[str, Optional[s def is_main_character(name: str) -> bool: return None + def character_id(name: str) -> str: return '-'.join(name.split(' ')).lower() + + +alphabet: str = string.ascii_letters + string.digits + + +def random_id(length: int = 8) -> str: + """Generate a random {length} character long string.""" + return ''.join(random.choices(alphabet, k=length)) + + +def char_filter(string): + latin = re.compile('[a-zA-Z]+') + for char in unicodedata.normalize('NFC', string): + decoded = unidecode.unidecode(char) + if latin.match(decoded): + yield char + else: + yield decoded + + +def clean_string(string): + return "".join(char_filter(string)) + + +def get_close_matches_indexes(word, possibilities, n=3, cutoff=0.6): + """Use SequenceMatcher to return a list of the indexes of the best + "good enough" matches. word is a sequence for which close matches + are desired (typically a string). + possibilities is a list of sequences against which to match word + (typically a list of strings). + Optional arg n (default 3) is the maximum number of close matches to + return. n must be > 0. 
+ Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities + that don't score at least that similar to word are ignored. + """ + + if not n > 0: + raise ValueError("n must be > 0: %r" % (n,)) + if not 0.0 <= cutoff <= 1.0: + raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) + result = [] + s = SequenceMatcher() + s.set_seq2(word) + for idx, x in enumerate(possibilities): + s.set_seq1(x) + if s.real_quick_ratio() >= cutoff and \ + s.quick_ratio() >= cutoff and \ + s.ratio() >= cutoff: + result.append((s.ratio(), idx)) + + # Move the best scorers to head of list + result = _nlargest(n, result) + + # Strip scores for the best n matches + return [x for score, x in result] + + +def marked_item_merge(keys: List[str], values: List[int]) -> Tuple[List[str], List[str]]: + """Add the values of identical keys together, then return both the keys and values""" + merge = OrderedDict() + for key, value in zip(keys, values): + # Already inserted, now make/keep it negative + if key in merge.keys(): + # Keys that haven't been turned over need to be made negative + if merge[key] > 0: + merge[key] = -merge[key] + + # And then subtract the value in all cases + merge[key] -= value + else: + # Values that are positive didn't merge with other counts. + merge[key] = value + + keys, values = zip(*merge.items()) + values = [f'{-value}*' if value < 0 else str(value) for value in values] + return keys, values