Commit latest changes to server-side processing system

This commit is contained in:
Xevion
2022-05-08 19:49:18 -05:00
parent 698beb5943
commit cdc409b9a0
4 changed files with 209 additions and 24 deletions

View File

@@ -8,8 +8,9 @@ import json
import os
from copy import deepcopy
# from flask_caching import cache
import flask_wtf
from flask import current_app, jsonify, request
from flask import current_app, jsonify, request, send_from_directory
from server.helpers import default, get_neighbors
@@ -120,3 +121,8 @@ def api_character_quotes(character: str):
return jsonify(quotes[index: index + 10])
else:
return jsonify(quotes)
@current_app.route('/static/img/<path:filename>')
def custom_static(filename):
    """Serve image files for /static/img/ from the server's data directory."""
    # Delegate to Flask's path-safe file server rather than the default static folder.
    image_directory = './data/img/'
    return send_from_directory(image_directory, filename)

View File

@@ -8,17 +8,18 @@ import os
import re
import sys
import time
from collections import defaultdict
from collections import OrderedDict, defaultdict
from pprint import pprint
from typing import List, Optional, Tuple, Union
from typing import Dict, List, Optional, Tuple, Union
import click
import enlighten
import requests
from bs4 import BeautifulSoup
from lxml import etree
sys.path[0] += '\\..'
from server.helpers import algolia_transform, character_id
from server.helpers import algolia_transform, character_id, clean_string
from server.process import DATA_DIR, get_appearances, get_episodes, get_filepath, load_file, \
save_file, sleep_from, \
verify_episode
@@ -42,7 +43,7 @@ def misc():
@misc.command('characters')
@click.option('-s', '--season', type=int, help='Season to be processed for character names')
@click.option('-e', '--episode', type=int, help='Episode to be processed. Requires --season to be specified.')
@click.option('--all', is_flag=True, help='Process all episodes, regardless of previous specifications.')
@click.option('-a', '--all', is_flag=True, help='Process all episodes, regardless of previous specifications.')
@click.option('-i', '--individual', is_flag=True,
help='List characters from individual episodes instead of just compiling a masterlist')
def characters(season: int, episode: int, all: bool, individual: bool):
@@ -98,7 +99,7 @@ def characters(season: int, episode: int, all: bool, individual: bool):
help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('-d', '--delay', type=float, default=0.5, help='Delay between each request')
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-a', '--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, silent_skip: bool):
@@ -136,7 +137,6 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
with enlighten.Manager() as manager:
with manager.counter(total=len(episodes), desc='Fetching...', unit='episodes') as pbar:
for _season, _episode in episodes:
filepath = get_filepath(_season, _episode, 'html')
# Check if HTML file exists
@@ -167,7 +167,7 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
@click.option('-s', '--season', type=int,
help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-a', '--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip missing/existing files silently')
@click.option('-ssm', '--silent-skip-missing', is_flag=True, help='Skip missing files silently')
@@ -239,7 +239,7 @@ def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_ski
@click.option('-s', '--season', type=int,
help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('--all', 'all_', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-a', '--all', 'all_', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-r', '--report', is_flag=True, help='Report quote statistics once processing completed.')
def process(season: Optional[int], episode: Optional[int], all_: bool, report: bool):
"""
@@ -267,6 +267,9 @@ def process(season: Optional[int], episode: Optional[int], all_: bool, report: b
logger.info('Check --help for more information on this command.')
return
speakers: Dict = load_file(os.path.join(DATA_DIR, 'speakers.json'), True)
speakers = {original: new for original, new in speakers.items() if original != new and type(new) == str}
quote: Union[str, List[str]]
section_num: int
for _season, _episode in episodes:
@@ -284,10 +287,11 @@ def process(season: Optional[int], episode: Optional[int], all_: bool, report: b
for quote in section_data:
quote = quote.split('|', 1)
section['quotes'].append(
{
'speaker': quote[0],
'text': quote[1]
'speaker': clean_string(speakers.get(quote[0], quote[0])),
'text': clean_string(quote[1])
}
)
sections.append(section)
@@ -324,6 +328,81 @@ def process(season: Optional[int], episode: Optional[int], all_: bool, report: b
logger.info(', '.join(speakers))
@cli.command('xml')
@click.option('-s', '--season', type=int, help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('-a', '--all', 'all_', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-r', '--report', is_flag=True, help='Report quote statistics once processing completed.')
def xml(season: Optional[int], episode: Optional[int], all_: bool, report: bool):
    """
    Converts processed JSON quote data into per-episode XML files.

    Builds a <SceneList> document (one <Scene> per section, one <Characters>
    element per quote) from each episode's 'processed' JSON and writes it to
    the episode's 'xml' filepath. Episodes without a processed file are skipped.
    """
    # NOTE(review): `report` is currently unused — TODO confirm intended statistics output.
    episodes: List[Tuple[int, int]]
    if all_:
        episodes = list(get_episodes())
    elif season:
        if episode:
            if verify_episode(season, episode):
                episodes = [(season, episode)]
            else:
                logger.error(f'Season {season}, Episode {episode} is not a valid combination.')
                return
        else:
            episodes = list(get_episodes(season=season))
            logger.info(f'Processing Season {season}...')
    else:
        if episode:
            logger.info('You must specify more than just an episode.')
        else:
            logger.info('You must specify which episodes to process.')
        logger.info('Check --help for more information on this command.')
        return
    for _season, _episode in episodes:
        # Keep the try minimal: only the load can raise FileNotFoundError.
        try:
            processed_data = load_file(get_filepath(_season, _episode, 'processed'), True)
        except FileNotFoundError:
            logger.info(f'Skipped Season {_season}, Episode {_episode}, no file found.')
            continue
        rootElement = etree.Element('SceneList')
        for scene in processed_data:
            sceneElement = etree.Element('Scene')
            for quote in scene['quotes']:
                charactersElement = etree.Element('Characters')
                sceneElement.append(charactersElement)
            rootElement.append(sceneElement)
        # BUG FIX: save_file was called with only the path — the built tree was
        # never serialized or written. Serialize the XML and pass it as the data.
        save_file(get_filepath(_season, _episode, 'xml'),
                  etree.tostring(rootElement, pretty_print=True).decode('utf-8'))
@cli.command('truth')
def truth():
    """Modify"""
    # NOTE(review): empty stub command — the docstring says only "Modify" and
    # there is no body; intended behavior cannot be determined from this file.
    # TODO: implement or remove.
@cli.command('characters')
def characters():
    """Collects all characters from every single processed JSON file."""
    # Ordered mapping used as an insertion-ordered set of speaker names.
    seen_speakers = OrderedDict()
    for _season, _episode in list(get_episodes()):
        try:
            scenes = load_file(get_filepath(_season, _episode, 'processed'), True)
        except FileNotFoundError:
            logger.warning(f"Skipped {_season}-{_episode}, no file found.")
            continue
        for scene in scenes:
            for quote in scene['quotes']:
                seen_speakers[quote['speaker']] = None
    # Identity mapping (name -> name), alphabetically sorted, ready for manual editing.
    speaker_data = OrderedDict((name, name) for name in sorted(seen_speakers.keys()))
    print(f'{len(speaker_data)} speakers identified.')
    pprint(list(speaker_data.keys()))
    save_file(os.path.join(DATA_DIR, 'speakers.json'), speaker_data, True)
@cli.group('build')
def build():
    """Build final data files used by Algolia and the backend API."""
    # Click group: serves only as a namespace for 'build' subcommands;
    # an empty body is intentional.

15
server/data.py Normal file
View File

@@ -0,0 +1,15 @@
"""
data.py
Manages API quote/character data, caching static responses and reloading from disk.
"""
import os
import json
# Absolute directory of this module, so data paths resolve regardless of the
# process's working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Quote data, loaded once at import time and held as module-level state.
with open(os.path.join(BASE_DIR, 'data', 'data.json'), 'r', encoding='utf-8') as file:
    data = json.load(file)
# Character metadata, loaded once at import time.
with open(os.path.join(BASE_DIR, 'data', 'characters.json'), 'r', encoding='utf-8') as file:
    character_data = json.load(file)

View File

@@ -3,8 +3,16 @@ helpers.py
"""
import random
import re
import string
import unicodedata
from collections import OrderedDict
from difflib import SequenceMatcher
from heapq import nlargest as _nlargest
from typing import List, Optional, Tuple
from typing import List, Tuple, Optional
import unidecode
episode_counts = [6, 22, 23, 14, 26, 24, 24, 24, 23]
@@ -55,5 +63,82 @@ def algolia_transform(old_dictionary: dict, key_list: List[Tuple[str, Optional[s
def is_main_character(name: str) -> bool:
    # NOTE(review): stub — annotated to return bool but always returns None
    # (falsy). TODO: implement the check or correct the annotation.
    return None
def character_id(name: str) -> str:
    """Build a URL-style identifier from a name: spaces to hyphens, lowercased."""
    return name.replace(' ', '-').lower()
# Character pool for generated identifiers: A-Z, a-z, 0-9.
alphabet: str = string.ascii_letters + string.digits
def random_id(length: int = 8) -> str:
    """Return a random identifier of ``length`` alphanumeric characters."""
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)
def char_filter(string):
    """Yield characters of ``string``, keeping those whose ASCII
    transliteration is Latin and substituting the transliteration otherwise."""
    latin_pattern = re.compile('[a-zA-Z]+')
    for original_char in unicodedata.normalize('NFC', string):
        ascii_form = unidecode.unidecode(original_char)
        # Keep the original character when it transliterates to Latin letters;
        # otherwise emit the ASCII transliteration itself.
        yield original_char if latin_pattern.match(ascii_form) else ascii_form
def clean_string(string):
    """Return ``string`` with non-Latin characters transliterated via char_filter."""
    filtered_chars = char_filter(string)
    return "".join(filtered_chars)
def get_close_matches_indexes(word, possibilities, n=3, cutoff=0.6):
    """Return the indexes of the best "good enough" matches for *word*.

    Index-returning variant of ``difflib.get_close_matches``.

    word: a sequence (typically a string) for which close matches are desired.
    possibilities: a list of sequences to compare against *word*.
    n: maximum number of matches to return; must be > 0.
    cutoff: similarity threshold in [0, 1]; lower-scoring candidates are dropped.
    """
    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    matcher = SequenceMatcher()
    matcher.set_seq2(word)
    scored = []
    for index, candidate in enumerate(possibilities):
        matcher.set_seq1(candidate)
        # Cheap upper-bound ratios first; compute the full ratio only if they pass.
        if (matcher.real_quick_ratio() >= cutoff
                and matcher.quick_ratio() >= cutoff
                and matcher.ratio() >= cutoff):
            scored.append((matcher.ratio(), index))
    # Keep the n highest-scoring entries, then strip the scores.
    best = _nlargest(n, scored)
    return [index for _score, index in best]
def marked_item_merge(keys: List[str], values: List[int]) -> Tuple[List[str], List[str]]:
    """Add the values of identical keys together, then return both the keys and values.

    Keys that occurred more than once are tracked internally as negative totals
    and rendered as '<total>*' in the returned values; unique keys keep their
    plain stringified value. Key order is first-occurrence order.

    NOTE: for non-empty input the returned keys are a tuple (from zip), despite
    the List annotation — preserved for backward compatibility.
    """
    merge: OrderedDict = OrderedDict()
    for key, value in zip(keys, values):
        # Already inserted, now make/keep it negative
        if key in merge:
            # Keys that haven't been turned over need to be made negative
            if merge[key] > 0:
                merge[key] = -merge[key]
            # And then subtract the value in all cases (keeps the total negative)
            merge[key] -= value
        else:
            # Values that are positive didn't merge with other counts.
            merge[key] = value
    # BUG FIX: zip(*merge.items()) raises ValueError on empty input; return
    # empty keys/values instead of crashing.
    if not merge:
        return [], []
    keys, values = zip(*merge.items())
    # Negative totals were merged: mark them with a trailing '*'.
    values = [f'{-value}*' if value < 0 else str(value) for value in values]
    return keys, values