mirror of
https://github.com/Xevion/the-office.git
synced 2025-12-11 10:08:57 -06:00
Commit latest changes to server-side processing system
This commit is contained in:
@@ -8,8 +8,9 @@ import json
|
|||||||
import os
|
import os
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
|
|
||||||
|
# from flask_caching import cache
|
||||||
import flask_wtf
|
import flask_wtf
|
||||||
from flask import current_app, jsonify, request
|
from flask import current_app, jsonify, request, send_from_directory
|
||||||
|
|
||||||
from server.helpers import default, get_neighbors
|
from server.helpers import default, get_neighbors
|
||||||
|
|
||||||
@@ -120,3 +121,8 @@ def api_character_quotes(character: str):
|
|||||||
return jsonify(quotes[index: index + 10])
|
return jsonify(quotes[index: index + 10])
|
||||||
else:
|
else:
|
||||||
return jsonify(quotes)
|
return jsonify(quotes)
|
||||||
|
|
||||||
|
|
||||||
|
@current_app.route('/static/img/<path:filename>')
def custom_static(filename):
    """Respond to ``/static/img/<filename>`` with the matching file from ``./data/img/``."""
    image_directory = './data/img/'
    return send_from_directory(image_directory, filename)
|
||||||
|
|||||||
123
server/cli.py
123
server/cli.py
@@ -8,17 +8,18 @@ import os
|
|||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from collections import defaultdict
|
from collections import OrderedDict, defaultdict
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
from typing import List, Optional, Tuple, Union
|
from typing import Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
import click
|
import click
|
||||||
import enlighten
|
import enlighten
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
sys.path[0] += '\\..'
|
sys.path[0] += '\\..'
|
||||||
from server.helpers import algolia_transform, character_id
|
from server.helpers import algolia_transform, character_id, clean_string
|
||||||
from server.process import DATA_DIR, get_appearances, get_episodes, get_filepath, load_file, \
|
from server.process import DATA_DIR, get_appearances, get_episodes, get_filepath, load_file, \
|
||||||
save_file, sleep_from, \
|
save_file, sleep_from, \
|
||||||
verify_episode
|
verify_episode
|
||||||
@@ -42,7 +43,7 @@ def misc():
|
|||||||
@misc.command('characters')
|
@misc.command('characters')
|
||||||
@click.option('-s', '--season', type=int, help='Season to be processed for character names')
|
@click.option('-s', '--season', type=int, help='Season to be processed for character names')
|
||||||
@click.option('-e', '--episode', type=int, help='Episode to be processed. Requires --season to be specified.')
|
@click.option('-e', '--episode', type=int, help='Episode to be processed. Requires --season to be specified.')
|
||||||
@click.option('--all', is_flag=True, help='Process all episodes, regardless of previous specifications.')
|
@click.option('-a', '--all', is_flag=True, help='Process all episodes, regardless of previous specifications.')
|
||||||
@click.option('-i', '--individual', is_flag=True,
|
@click.option('-i', '--individual', is_flag=True,
|
||||||
help='List characters from individual episodes instead of just compiling a masterlist')
|
help='List characters from individual episodes instead of just compiling a masterlist')
|
||||||
def characters(season: int, episode: int, all: bool, individual: bool):
|
def characters(season: int, episode: int, all: bool, individual: bool):
|
||||||
@@ -90,7 +91,7 @@ def characters(season: int, episode: int, all: bool, individual: bool):
|
|||||||
|
|
||||||
# print(master)
|
# print(master)
|
||||||
logger.info(
|
logger.info(
|
||||||
', '.join(item['name'] for item in sorted(master.values(), reverse=True, key=lambda item: item['appearances'])))
|
', '.join(item['name'] for item in sorted(master.values(), reverse=True, key=lambda item: item['appearances'])))
|
||||||
|
|
||||||
|
|
||||||
@cli.command('fetch')
|
@cli.command('fetch')
|
||||||
@@ -98,7 +99,7 @@ def characters(season: int, episode: int, all: bool, individual: bool):
|
|||||||
help='Season to be fetched. Without --episode, will download all episodes in a season.')
|
help='Season to be fetched. Without --episode, will download all episodes in a season.')
|
||||||
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
|
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
|
||||||
@click.option('-d', '--delay', type=float, default=0.5, help='Delay between each request')
|
@click.option('-d', '--delay', type=float, default=0.5, help='Delay between each request')
|
||||||
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
|
@click.option('-a', '--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
|
||||||
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
|
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
|
||||||
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
|
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
|
||||||
def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, silent_skip: bool):
|
def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, silent_skip: bool):
|
||||||
@@ -136,7 +137,6 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
|
|||||||
with enlighten.Manager() as manager:
|
with enlighten.Manager() as manager:
|
||||||
with manager.counter(total=len(episodes), desc='Fetching...', unit='episodes') as pbar:
|
with manager.counter(total=len(episodes), desc='Fetching...', unit='episodes') as pbar:
|
||||||
for _season, _episode in episodes:
|
for _season, _episode in episodes:
|
||||||
|
|
||||||
filepath = get_filepath(_season, _episode, 'html')
|
filepath = get_filepath(_season, _episode, 'html')
|
||||||
|
|
||||||
# Check if HTML file exists
|
# Check if HTML file exists
|
||||||
@@ -167,7 +167,7 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
|
|||||||
@click.option('-s', '--season', type=int,
|
@click.option('-s', '--season', type=int,
|
||||||
help='Season to be fetched. Without --episode, will download all episodes in a season.')
|
help='Season to be fetched. Without --episode, will download all episodes in a season.')
|
||||||
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
|
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
|
||||||
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
|
@click.option('-a', '--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
|
||||||
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
|
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
|
||||||
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip missing/existing files silently')
|
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip missing/existing files silently')
|
||||||
@click.option('-ssm', '--silent-skip-missing', is_flag=True, help='Skip missing files silently')
|
@click.option('-ssm', '--silent-skip-missing', is_flag=True, help='Skip missing files silently')
|
||||||
@@ -239,7 +239,7 @@ def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_ski
|
|||||||
@click.option('-s', '--season', type=int,
|
@click.option('-s', '--season', type=int,
|
||||||
help='Season to be fetched. Without --episode, will download all episodes in a season.')
|
help='Season to be fetched. Without --episode, will download all episodes in a season.')
|
||||||
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
|
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
|
||||||
@click.option('--all', 'all_', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
|
@click.option('-a', '--all', 'all_', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
|
||||||
@click.option('-r', '--report', is_flag=True, help='Report quote statistics once processing completed.')
|
@click.option('-r', '--report', is_flag=True, help='Report quote statistics once processing completed.')
|
||||||
def process(season: Optional[int], episode: Optional[int], all_: bool, report: bool):
|
def process(season: Optional[int], episode: Optional[int], all_: bool, report: bool):
|
||||||
"""
|
"""
|
||||||
@@ -267,6 +267,9 @@ def process(season: Optional[int], episode: Optional[int], all_: bool, report: b
|
|||||||
logger.info('Check --help for more information on this command.')
|
logger.info('Check --help for more information on this command.')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
speakers: Dict = load_file(os.path.join(DATA_DIR, 'speakers.json'), True)
|
||||||
|
speakers = {original: new for original, new in speakers.items() if original != new and type(new) == str}
|
||||||
|
|
||||||
quote: Union[str, List[str]]
|
quote: Union[str, List[str]]
|
||||||
section_num: int
|
section_num: int
|
||||||
for _season, _episode in episodes:
|
for _season, _episode in episodes:
|
||||||
@@ -284,11 +287,12 @@ def process(season: Optional[int], episode: Optional[int], all_: bool, report: b
|
|||||||
|
|
||||||
for quote in section_data:
|
for quote in section_data:
|
||||||
quote = quote.split('|', 1)
|
quote = quote.split('|', 1)
|
||||||
|
|
||||||
section['quotes'].append(
|
section['quotes'].append(
|
||||||
{
|
{
|
||||||
'speaker': quote[0],
|
'speaker': clean_string(speakers.get(quote[0], quote[0])),
|
||||||
'text': quote[1]
|
'text': clean_string(quote[1])
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
sections.append(section)
|
sections.append(section)
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
@@ -298,7 +302,7 @@ def process(season: Optional[int], episode: Optional[int], all_: bool, report: b
|
|||||||
logger.exception(f'Skipped Season {_season}, Episode {_episode}: Malformed data.')
|
logger.exception(f'Skipped Season {_season}, Episode {_episode}: Malformed data.')
|
||||||
if quote:
|
if quote:
|
||||||
logger.info(
|
logger.info(
|
||||||
f'Last quote seen "{quote if type(quote) is str else "|".join(quote)}" in section {section_num}')
|
f'Last quote seen "{quote if type(quote) is str else "|".join(quote)}" in section {section_num}')
|
||||||
else:
|
else:
|
||||||
# Save processed data
|
# Save processed data
|
||||||
save_file(get_filepath(_season, _episode, 'processed'), sections, True)
|
save_file(get_filepath(_season, _episode, 'processed'), sections, True)
|
||||||
@@ -324,6 +328,81 @@ def process(season: Optional[int], episode: Optional[int], all_: bool, report: b
|
|||||||
logger.info(', '.join(speakers))
|
logger.info(', '.join(speakers))
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command('xml')
@click.option('-s', '--season', type=int, help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('-a', '--all', 'all_', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-r', '--report', is_flag=True, help='Report quote statistics once processing completed.')
def xml(season: Optional[int], episode: Optional[int], all_: bool, report: bool):
    """
    Converts processed JSON quote data into XML scene lists.
    """
    # NOTE: ``report`` is accepted for parity with the ``process`` command but
    # is not used by this command yet.
    episodes: List[Tuple[int, int]]

    # Resolve which (season, episode) pairs to convert.
    if all_:
        episodes = list(get_episodes())
    elif season:
        if episode:
            if verify_episode(season, episode):
                episodes = [(season, episode)]
            else:
                logger.error(f'Season {season}, Episode {episode} is not a valid combination.')
                return
        else:
            episodes = list(get_episodes(season=season))
            logger.info(f'Processing Season {season}...')
    else:
        if episode:
            logger.info('You must specify more than just an episode.')
        else:
            logger.info('You must specify which episodes to process.')
        logger.info('Check --help for more information on this command.')
        return

    for _season, _episode in episodes:
        try:
            processed_data = load_file(get_filepath(_season, _episode, 'processed'), True)

            # One <Scene> per processed scene, holding one (currently empty)
            # <Characters> element per quote in that scene.
            rootElement = etree.Element('SceneList')
            for scene in processed_data:
                sceneElement = etree.Element('Scene')
                for quote in scene['quotes']:
                    charactersElement = etree.Element('Characters')
                    sceneElement.append(charactersElement)

                rootElement.append(sceneElement)

            # The tree was previously built and then discarded because
            # save_file() was called without any data. Serialize it to text
            # and hand it over explicitly.
            # NOTE(review): the third argument is assumed to be the
            # "encode as JSON" flag (other callers pass True for JSON data),
            # so False is passed for raw text - confirm against save_file.
            xml_text = etree.tostring(rootElement, encoding='unicode', pretty_print=True)
            save_file(get_filepath(_season, _episode, 'xml'), xml_text, False)
        except FileNotFoundError:
            logger.info(f'Skipped Season {_season}, Episode {_episode}, no file found.')
            continue
|
||||||
|
|
||||||
|
@cli.command('truth')
|
||||||
|
def truth():
|
||||||
|
"""Modify"""
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command('characters')
def characters():
    """Collects all characters from every single processed JSON file."""
    # NOTE(review): this def rebinds the module-level name ``characters`` that
    # the ``misc characters`` command also uses; both commands stay registered
    # because click captures the function at decoration time.
    seen_speakers = set()

    for _season, _episode in get_episodes():
        try:
            scenes = load_file(get_filepath(_season, _episode, 'processed'), True)
        except FileNotFoundError:
            logger.warning(f"Skipped {_season}-{_episode}, no file found.")
            continue

        seen_speakers.update(quote['speaker'] for scene in scenes for quote in scene['quotes'])

    # Every speaker initially maps to itself, in alphabetical order.
    speaker_data = OrderedDict((name, name) for name in sorted(seen_speakers))
    print(f'{len(speaker_data)} speakers identified.')

    pprint(list(speaker_data.keys()))
    save_file(os.path.join(DATA_DIR, 'speakers.json'), speaker_data, True)
|
||||||
|
|
||||||
|
|
||||||
@cli.group('build')
|
@cli.group('build')
|
||||||
def build():
|
def build():
|
||||||
"""Build final data files used by Algolia and the backend API."""
|
"""Build final data files used by Algolia and the backend API."""
|
||||||
@@ -437,18 +516,18 @@ def final(silent_skip: bool, process_: bool):
|
|||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
if not silent_skip:
|
if not silent_skip:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f'No data for Season {season_id}, Episode {episode_id} available. Null data inserted.')
|
f'No data for Season {season_id}, Episode {episode_id} available. Null data inserted.')
|
||||||
episode_data = None
|
episode_data = None
|
||||||
|
|
||||||
description = descriptions[season_id - 1][episode_id - 1]
|
description = descriptions[season_id - 1][episode_id - 1]
|
||||||
seasons[season_id - 1]['episodes'].append(
|
seasons[season_id - 1]['episodes'].append(
|
||||||
{
|
{
|
||||||
'title': description['title'].strip(),
|
'title': description['title'].strip(),
|
||||||
'description': description['description'].strip(),
|
'description': description['description'].strip(),
|
||||||
'episode_id': episode_id,
|
'episode_id': episode_id,
|
||||||
'characters': get_appearances(season_id, episode_id),
|
'characters': get_appearances(season_id, episode_id),
|
||||||
'scenes': episode_data
|
'scenes': episode_data
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info('Saving to data.json')
|
logger.info('Saving to data.json')
|
||||||
|
|||||||
15
server/data.py
Normal file
15
server/data.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
"""
data.py

Loads ``data.json`` and ``characters.json`` from the ``data`` directory next to
this module at import time and exposes them as ``data`` and ``character_data``.
"""
import json
import os

BASE_DIR = os.path.dirname(os.path.abspath(__file__))


def _load_json(filename):
    """Decode the UTF-8 JSON file called ``filename`` inside ``BASE_DIR/data``."""
    with open(os.path.join(BASE_DIR, 'data', filename), 'r', encoding='utf-8') as handle:
        return json.load(handle)


data = _load_json('data.json')
character_data = _load_json('characters.json')
|
||||||
|
|
||||||
@@ -3,8 +3,16 @@ helpers.py
|
|||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
import random
|
||||||
|
import re
|
||||||
|
import string
|
||||||
|
import unicodedata
|
||||||
|
from collections import OrderedDict
|
||||||
|
from difflib import SequenceMatcher
|
||||||
|
from heapq import nlargest as _nlargest
|
||||||
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
from typing import List, Tuple, Optional
|
import unidecode
|
||||||
|
|
||||||
episode_counts = [6, 22, 23, 14, 26, 24, 24, 24, 23]
|
episode_counts = [6, 22, 23, 14, 26, 24, 24, 24, 23]
|
||||||
|
|
||||||
@@ -55,5 +63,82 @@ def algolia_transform(old_dictionary: dict, key_list: List[Tuple[str, Optional[s
|
|||||||
def is_main_character(name: str) -> bool:
|
def is_main_character(name: str) -> bool:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def character_id(name: str) -> str:
|
def character_id(name: str) -> str:
|
||||||
return '-'.join(name.split(' ')).lower()
|
return '-'.join(name.split(' ')).lower()
|
||||||
|
|
||||||
|
|
||||||
|
# Pool of characters random_id() draws from: ASCII letters (both cases) and digits.
alphabet: str = string.ascii_letters + string.digits


def random_id(length: int = 8) -> str:
    """Return ``length`` characters drawn at random (with replacement) from ``alphabet``."""
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)
|
||||||
|
|
||||||
|
|
||||||
|
def char_filter(string):
    """
    Yield ``string`` piece by piece after NFC normalization.

    Each character is passed through ``unidecode``. When the result starts with
    an ASCII letter the original character is yielded unchanged; otherwise the
    ``unidecode`` result is yielded in its place.
    """
    starts_with_letters = re.compile('[a-zA-Z]+')
    normalized = unicodedata.normalize('NFC', string)
    for original in normalized:
        ascii_form = unidecode.unidecode(original)
        yield original if starts_with_letters.match(ascii_form) else ascii_form
|
||||||
|
|
||||||
|
|
||||||
|
def clean_string(string):
    """Return ``string`` rebuilt by concatenating everything ``char_filter`` yields for it."""
    pieces = char_filter(string)
    return "".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
def get_close_matches_indexes(word, possibilities, n=3, cutoff=0.6):
    """Return the positions in ``possibilities`` of the best "good enough" matches.

    ``word`` is the sequence to look up (typically a string) and
    ``possibilities`` is the list of sequences it is compared against
    (typically a list of strings), scored with ``SequenceMatcher``.

    ``n`` (default 3) caps how many positions are returned and must be > 0.
    ``cutoff`` (default 0.6) is a float in [0, 1]; candidates scoring below it
    are dropped.

    Positions are ordered from best to worst score. Raises ``ValueError`` when
    ``n`` or ``cutoff`` is out of range.
    """
    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))

    matcher = SequenceMatcher()
    matcher.set_seq2(word)

    scored = []
    for position, candidate in enumerate(possibilities):
        matcher.set_seq1(candidate)
        # Cheap upper bounds are tried first; each one can rule the candidate
        # out before the full ratio is computed.
        if matcher.real_quick_ratio() < cutoff:
            continue
        if matcher.quick_ratio() < cutoff:
            continue
        if matcher.ratio() < cutoff:
            continue
        scored.append((matcher.ratio(), position))

    # Keep the n highest (score, position) pairs, then drop the scores.
    return [position for _, position in _nlargest(n, scored)]
|
||||||
|
|
||||||
|
|
||||||
|
def marked_item_merge(keys: List[str], values: List[int]) -> Tuple[List[str], List[str]]:
    """Add the values of identical keys together, then return both the keys and values.

    ``keys`` and ``values`` are paired positionally. Each distinct key is
    returned once, in order of first appearance, alongside its summed value
    rendered as a string. A total built from more than one input entry is
    suffixed with ``*``.

    Returns a ``(keys, values)`` pair of lists; both are empty for empty input.
    """
    totals = OrderedDict()
    # Merged keys are tracked explicitly rather than encoded in the sign of the
    # running total, so zero and negative values are labelled correctly.
    merged = set()
    for key, value in zip(keys, values):
        if key in totals:
            merged.add(key)
            totals[key] += value
        else:
            totals[key] = value

    # Built directly from the mapping so empty input yields two empty lists
    # instead of failing to unpack zip(*()).
    merged_keys = list(totals)
    merged_values = [f'{total}*' if key in merged else str(total) for key, total in totals.items()]
    return merged_keys, merged_values
|
||||||
|
|||||||
Reference in New Issue
Block a user