finish preprocess, build algolia, build final commands, update README, remove obsolete code in process.py

Xevion
2020-08-09 17:50:26 -05:00
parent d6e2ce1df4
commit 99b15168e1
6 changed files with 257 additions and 171 deletions

README.md

@@ -46,10 +46,10 @@ The data has to be parsed, but due to high irregularity (at least too much for m
 inspected and manually processed.
 ```python server/cli.py preprocess
---season SEASON      Pre-processes all episodes from a specific season.
---episode EPISODE    Pre-processes a specific episode. Requires SEASON to be specified.
---all                Pre-processes all episodes from every season.
---overwrite          DANGER: Will overwrite files. May result in manually processed files being lost forever.
+-s --season SEASON      Pre-processes all episodes from a specific season.
+-e --episode EPISODE    Pre-processes a specific episode. Requires SEASON to be specified.
+-a --all                Pre-processes all episodes from every season.
+-o --overwrite          DANGER: Will overwrite files. May result in manually processed files being lost forever.
 ```
 From then on, once all files have been pre-processed, you will have to begin the long, annoying process of editing them into my custom format.
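For a concrete picture, the custom format (as inferred from the parser added in `server/cli.py` below; the quote lines here are invented) separates scenes with lines beginning with `-`, marks a deleted scene with a leading `!N` line, and writes each quote as `Speaker|Text`:
```
Michael|Would I rather be feared or loved? Easy. Both.
Jim|Fact: bears eat beets.
-
!2
Dwight|Identity theft is not a joke, Jim!
```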
@@ -91,9 +91,9 @@ they are just the JSON format of the previous stage.
 ```
 python server/cli.py process
---season SEASON      Processes all episodes from a specific season.
---episode EPISODE    Processes a specific episode. Requires SEASON to be specified.
---all                Processes all episodes from all seasons.
+-s --season SEASON      Processes all episodes from a specific season.
+-e --episode EPISODE    Processes a specific episode. Requires SEASON to be specified.
+-a --all                Processes all episodes from all seasons.
 ```
 Now that they're all in individual files, the final commands can be run to compile them into one file, a static
@@ -109,6 +109,34 @@ Each command is run with no special arguments (as of now), generating an `algolia
 This `data.json` file is loaded by the Flask server and the `algolia.json` can be uploaded to your primary index.
+For every command mentioned, you can read all arguments with `--help`:
+```
+$ python cli.py preprocess --help
+Usage: cli.py preprocess [OPTIONS]
+
+  Pre-processes raw HTML files into mangled custom quote data.
+
+  Custom quote data requires manual inspection and formatting, making it a
+  dangerous operation that may overwrite precious quote data.
+
+Options:
+  -s, --season INTEGER          Season to be fetched. Without --episode, will
+                                download all episodes in a season.
+  -e, --episode INTEGER         Specific episode to be fetched. Requires
+                                --season to be specified.
+  --all                         Fetch all episodes, regardless of previous
+                                specifications.
+  -o, --overwrite               Overwrite if a file already exists.
+  -ss, --silent-skip            Skip missing/existing files silently
+  -ssm, --silent-skip-missing   Skip missing files silently
+  -sse, --silent-skip-existing  Skip overwrite skips silently
+  --help                        Show this message and exit.
+```
 ## Setup
 This project was built on Python 3.7 and Node v12.18.3 / npm 6.14.6.
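For a sense of the build output, a single `algolia.json` record produced by the new `build algolia` command would look roughly like this (field names taken from the code in `server/cli.py` below; the quote content and numbers are invented):
```
{
    "speaker": "Pam",
    "text": "Dunder Mifflin, this is Pam.",
    "quote_rel": 1,
    "section_rel": 2,
    "episode_rel": 4,
    "quote_abs": 1204,
    "section_abs": 211,
    "episode_abs": 32,
    "season": 2,
    "is_deleted": false,
    "deleted_section": null
}
```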

client/package-lock.json

@@ -1,6 +1,6 @@
 {
-  "name": "client",
-  "version": "0.1.0",
+  "name": "TheOfficeQuotes",
+  "version": "0.2.0",
   "lockfileVersion": 1,
   "requires": true,
   "dependencies": {
@@ -1171,6 +1171,32 @@
         "to-fast-properties": "^2.0.0"
       }
     },
+    "@fortawesome/fontawesome-common-types": {
+      "version": "0.2.30",
+      "resolved": "https://registry.npmjs.org/@fortawesome/fontawesome-common-types/-/fontawesome-common-types-0.2.30.tgz",
+      "integrity": "sha512-TsRwpTuKwFNiPhk1UfKgw7zNPeV5RhNp2Uw3pws+9gDAkPGKrtjR1y2lI3SYn7+YzyfuNknflpBA1LRKjt7hMg=="
+    },
+    "@fortawesome/fontawesome-svg-core": {
+      "version": "1.2.30",
+      "resolved": "https://registry.npmjs.org/@fortawesome/fontawesome-svg-core/-/fontawesome-svg-core-1.2.30.tgz",
+      "integrity": "sha512-E3sAXATKCSVnT17HYmZjjbcmwihrNOCkoU7dVMlasrcwiJAHxSKeZ+4WN5O+ElgO/FaYgJmASl8p9N7/B/RttA==",
+      "requires": {
+        "@fortawesome/fontawesome-common-types": "^0.2.30"
+      }
+    },
+    "@fortawesome/free-solid-svg-icons": {
+      "version": "5.14.0",
+      "resolved": "https://registry.npmjs.org/@fortawesome/free-solid-svg-icons/-/free-solid-svg-icons-5.14.0.tgz",
+      "integrity": "sha512-M933RDM8cecaKMWDSk3FRYdnzWGW7kBBlGNGfvqLVwcwhUPNj9gcw+xZMrqBdRqxnSXdl3zWzTCNNGEtFUq67Q==",
+      "requires": {
+        "@fortawesome/fontawesome-common-types": "^0.2.30"
+      }
+    },
+    "@fortawesome/vue-fontawesome": {
+      "version": "0.1.10",
+      "resolved": "https://registry.npmjs.org/@fortawesome/vue-fontawesome/-/vue-fontawesome-0.1.10.tgz",
+      "integrity": "sha512-b2+SLF31h32LSepVcXe+BQ63yvbq5qmTCy4KfFogCYm2bn68H5sDWUnX+U7MBqnM2aeEk9M7xSoqGnu+wSdY6w=="
+    },
     "@hapi/address": {
       "version": "2.1.4",
       "resolved": "https://registry.npm.taobao.org/@hapi/address/download/@hapi/address-2.1.4.tgz?cache=0&sync_timestamp=1593993773437&other_urls=https%3A%2F%2Fregistry.npm.taobao.org%2F%40hapi%2Faddress%2Fdownload%2F%40hapi%2Faddress-2.1.4.tgz",

client/package.json

@@ -8,6 +8,9 @@
     "lint": "vue-cli-service lint"
   },
   "dependencies": {
+    "@fortawesome/fontawesome-svg-core": "^1.2.30",
+    "@fortawesome/free-solid-svg-icons": "^5.14.0",
+    "@fortawesome/vue-fontawesome": "^0.1.10",
     "algoliasearch": "^4.3.1",
     "axios": ">=0.18.1",
     "bootstrap": "^4.3.1",

client/vue.config.js (new file, 4 additions)

@@ -0,0 +1,4 @@
+module.exports = {
+  indexPath: '../../dist/index.html',
+  assetsDir: '../../dist',
+};
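For context, `indexPath` and `assetsDir` are standard Vue CLI options: relative to the build output directory, they relocate the generated `index.html` and the compiled static assets, which here appears to funnel the client build into a shared `dist/` directory two levels up.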

server/cli.py

@@ -5,16 +5,20 @@ CLI entrypoint for fetching, processing and compiling quote data.
 """
 import logging
 import os
+import re
 import sys
 import time
-from typing import List, Tuple
+from typing import List, Tuple, Union

 import click
 import enlighten
 import requests
+from bs4 import BeautifulSoup

 sys.path[0] += '\\..'
-from server.process import get_episodes, get_filepath, sleep_from, verify_episode
+from server.process import DATA_DIR, get_characters, get_episodes, get_filepath, load_file, \
+    save_file, sleep_from, \
+    verify_episode

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger('cli')
@@ -24,7 +28,6 @@ manager = enlighten.get_manager()

 @click.group()
 def cli():
-    """Base command group."""
     pass
@@ -90,9 +93,8 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
         last_request = time.time()
         if resp.ok:
             # Write data to file
-            with open(filepath, 'w', encoding='utf-8') as file:
-                file.write(resp.text)
-            logger.debug('Successfully fetched.')
+            save_file(filepath, resp.text, False)
+            logger.debug('Successfully fetched & saved.')
         else:
             logger.error(f'Fetching failed. Erroneous response code {resp.status_code}.')
         pbar.update()
@@ -105,15 +107,18 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
 @click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
 @click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
 @click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
-@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
-@click.option('-d', '--dry-run', is_flag=True)
-def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool, dry_run: bool):
+@click.option('-ss', '--silent-skip', is_flag=True, help='Skip missing/existing files silently')
+@click.option('-ssm', '--silent-skip-missing', is_flag=True, help='Skip missing files silently')
+@click.option('-sse', '--silent-skip-existing', is_flag=True, help='Skip overwrite skips silently')
+def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool, silent_skip_missing: bool,
+               silent_skip_existing: bool):
     """
     Pre-processes raw HTML files into mangled custom quote data.

     Custom quote data requires manual inspection and formatting, making it a dangerous operation that may overwrite
     precious quote data.
     """
+    print(silent_skip_existing)

     episodes: List[Tuple[int, int]]
     if all:
@@ -134,6 +139,38 @@ def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_ski
     else:
         logger.info('You must specify which episodes to pre-process.')
         logger.info('Check --help for more information on this command.')
+        return
+
+    for season, episode in episodes:
+        # Overwrite protection
+        save_path = get_filepath(season, episode, 'raw')
+        if os.path.exists(save_path) and not overwrite:
+            if (not silent_skip) or (not silent_skip_existing):
+                logger.info(f'Skipping Season {season}, Episode {episode}, file already exists. Skipping processing.')
+            continue
+
+        try:
+            page_data = load_file(get_filepath(season, episode, 'html'), False)
+        except FileNotFoundError:
+            if not silent_skip or not silent_skip_missing:
+                logger.warning(f'No data for Season {season}, Episode {episode} available. Skipping processing.')
+        else:
+            soup = BeautifulSoup(page_data, "html.parser")
+
+            data = []
+            sections = soup.find_all(attrs={"class": "quote"})
+            for section in sections:
+                for br in section.find_all('br'):
+                    br.replace_with("\n" + br.text)
+
+                for line in section.get_text().split('\n'):
+                    data.append(line.strip())
+                data.append('-')
+            data.pop(-1)
+
+            data = '\n'.join(data)
+            save_file(save_path, data, False)


 @cli.command('process')
@@ -141,9 +178,8 @@ def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_ski
               help='Season to be fetched. Without --episode, will download all episodes in a season.')
 @click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
 @click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
-@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
-@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
-def process(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool):
+@click.option('-r', '--report', is_flag=True, help='Report quote statistics once processing completed')
+def process(season: int, episode: int, all: bool, report: bool):
     """
     Processes manually processed raw quote data into JSON.
     """
@@ -169,24 +205,138 @@ def process(season: int, episode: int, all: bool, overwrite: bool, silent_skip:
         logger.info('Check --help for more information on this command.')
         return

+    quote: Union[str, List[str]]
+    section_num: int
+
+    for _season, _episode in episodes:
+        sections = []
+        try:
+            preprocessed_data = load_file(get_filepath(_season, _episode, 'raw'))
+            for section_num, raw_section in enumerate(re.split('^-', preprocessed_data, flags=re.MULTILINE), start=1):
+                section = {
+                    'quotes': []
+                }
+                section_data = list(raw_section.strip().split('\n'))
+                if section_data[0].startswith('!'):
+                    section['deleted'] = int(re.search('!(\d+)', section_data.pop(0)).group(1))
+                for quote in section_data:
+                    quote = quote.split('|', 1)
+                    section['quotes'].append(
+                        {
+                            'speaker': quote[0],
+                            'text': quote[1]
+                        }
+                    )
+                sections.append(section)
+        except FileNotFoundError:
+            logger.info(f'Skipped Season {_season}, Episode {_episode}, no file found.')
+        except:
+            logger.exception(f'Skipped Season {_season}, Episode {_episode}: Malformed data.')
+            logger.info(
+                f'Last quote seen "{quote if type(quote) is str else "|".join(quote)}" in section {section_num}')
+        else:
+            # Save processed data
+            save_file(get_filepath(_season, _episode, 'processed'), sections, True)
+
+            if report:
+                deleted_count = [0, set()]
+                quote_count = 0
+                speakers = set()
+
+                for section in sections:
+                    quote_count += len(section['quotes'])
+                    if 'deleted' in section.keys():
+                        deleted_count[0] += 1
+                        deleted_count[1].add(section['deleted'])
+                    for quote in section['quotes']:
+                        speakers.add(quote['speaker'])
+
+                logger.debug(f'{quote_count} quotes.')
+                logger.debug(f'{deleted_count[0]} different deleted sections, {len(deleted_count[1])} unique.')
+                logger.info(f'{len(speakers)} Speakers:')
+                logger.info(', '.join(speakers))

 @cli.group('build')
 def build():
     """Build final data files used by Algolia and the backend API."""
-    pass


 @build.command('algolia')
-def algolia():
+@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
+@click.option('--process', is_flag=True, help='Run processing before building final data.')
+def algolia(silent_skip: bool):
     """
     Generates algolia.json, an all-encompassing file for Algolia's search index.
     """
-    files = list(filter(os.path.exists, [get_filepath(season, episode, 'processed') for season, episode in get_episodes()]))
+    data = []
+    episode_num_abs, section_num_abs, quote_num_abs = 0, 0, 0
+
+    for season, episode in get_episodes():
+        episode_num_abs += 1
+        try:
+            episode_data = load_file(get_filepath(season, episode, 'processed'), True)
+        except FileNotFoundError:
+            if not silent_skip:
+                logger.warning(f'Skipping Season {season}, Episode {episode}. No episode data file found.')
+        else:
+            for section_num_rel, section in enumerate(episode_data, start=1):
+                section_num_abs += 1
+                for quote_num_rel, quote in enumerate(section['quotes'], start=1):
+                    quote_num_abs += 1
+                    # Relative position
+                    quote['quote_rel'] = quote_num_rel
+                    quote['section_rel'] = section_num_rel
+                    quote['episode_rel'] = episode
+                    # Absolute position
+                    quote['quote_abs'] = quote_num_abs
+                    quote['section_abs'] = section_num_abs
+                    quote['episode_abs'] = episode_num_abs
+                    quote['season'] = season
+                    quote['is_deleted'] = 'deleted' in section.keys()
+                    quote['deleted_section'] = section.get('deleted')
+                    data.append(quote)
+
+    logger.info(f'Saving {len(data):,} quotes to algolia.json')
+    save_file(os.path.join(DATA_DIR, 'algolia.json'), data, True)

 @build.command('final')
-def final():
+@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
+@click.option('--process', is_flag=True, help='Run processing before building final data.')
+def final(silent_skip: bool):
     """Generates the latest application static data.json file, used by the backend API."""
-    files = list(filter(os.path.exists, [get_filepath(season, episode, 'processed') for season, episode in get_episodes()]))
+    descriptions = load_file(os.path.join(DATA_DIR, 'descriptions.json'), True)
+    seasons = [{'season_id': season, 'episodes': []} for season in range(1, 10)]
+
+    for season_id, episode_id in get_episodes():
+        # Load data file
+        try:
+            episode_data = load_file(get_filepath(season_id, episode_id, 'processed'), True)
+        except FileNotFoundError:
+            if not silent_skip:
+                logger.warning(f'No data for Season {season_id}, Episode {episode_id} available. Null data inserted.')
+            episode_data = None
+
+        description = descriptions[season_id - 1][episode_id - 1]
+        seasons[season_id - 1]['episodes'].append(
+            {
+                'title': description['title'].strip(),
+                'description': description['description'].strip(),
+                'episode_id': episode_id,
+                'characters': get_characters(season_id, episode_id),
+                'scenes': episode_data
+            }
+        )
+
+    logger.info('Saving to data.json')
+    save_file(os.path.join(DATA_DIR, 'data.json'), seasons, True)


 if __name__ == "__main__":
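Taken together, the intended end-to-end workflow after this commit would presumably look like the following (a sketch; the commands and flags are those defined above, with the manual editing step in between):
```
$ python server/cli.py fetch --all
$ python server/cli.py preprocess --all
# ...manually edit each pre-processed raw file into the custom quote format...
$ python server/cli.py process --all --report
$ python server/cli.py build algolia
$ python server/cli.py build final
```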

server/process.py

@@ -1,14 +1,18 @@
"""
process.py
Functions and shortcuts for loading/saving/extracting data for processing quote data.
"""
import json import json
import os import os
import re
import time import time
from collections import defaultdict from collections import defaultdict
from math import ceil from math import ceil
from typing import Iterable, List, Tuple from typing import Dict, Iterable, List, Tuple, Union
import enlighten import enlighten
import requests import requests
from bs4 import BeautifulSoup
session = requests.Session() session = requests.Session()
@@ -31,7 +35,7 @@ def get_filepath(season: int, episode: int, folder: str) -> str:
     return os.path.join(DATA_DIR, get_filename(season, episode, 'json'))


-def load_file(filepath: str, json_decode: bool):
+def load_file(filepath: str, json_decode: bool = False):
     """Shortcut function for loading file from filepath, with JSON parsing flag."""
     if json_decode:
         with open(filepath, 'r', encoding='utf-8') as file:
@@ -68,7 +72,7 @@ def get_episodes(season: int = None) -> Iterable[Tuple[int, int]]:

 def verify_episode(season: int, episode: int = None) -> bool:
     """
-    Verifies that a Season or Season + Episode is valid.
+    Verifies that a specific Season and/or Episode is valid.
     """
     return 1 <= season <= 9 and (episode is None or 1 <= episode <= episode_counts[season])
@@ -99,149 +103,20 @@ def sleep_from(wait_time: float, moment: float, manager: enlighten.Manager = Non
     return 0


-def preprocess(page_data: str) -> List[str]:
-    soup = BeautifulSoup(page_data, "html.parser")
-
-    data = []
-    sections = soup.find_all(attrs={"class": "quote"})
-    for section in sections:
-        for br in section.find_all('br'):
-            br.replace_with("\n" + br.text)
-
-        for line in section.get_text().split('\n'):
-            data.append(line.strip())
-        data.append('-')
-    data.pop(-1)
-    return data
-
-
-def process(season, episode):
-    with open(os.path.join(DATA_DIR, 'raw', f'{season}-{str(episode).zfill(2)}.txt'), 'r',
-              encoding='utf-8') as file:
-        sections = []
-        for s in re.split('^-', file.read(), flags=re.MULTILINE):
-            section = {
-                'quotes': []
-            }
-            section_data = list(s.strip().split('\n'))
-            if section_data[0].startswith('!'):
-                section['deleted'] = int(re.search('!(\d+)', section_data.pop(0)).group(1))
-            for q in section_data:
-                quote = q.split('|', 1)
-                print(quote)
-                section['quotes'].append(
-                    {
-                        'speaker': quote[0],
-                        'text': quote[1]
-                    }
-                )
-            sections.append(section)
-
-    with open(os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json'), 'w',
-              encoding='utf-8') as file:
-        json.dump(sections, file, indent=4, ensure_ascii=False)
-
-    deleted_count = [0, set()]
-    quote_count = 0
-    speakers = set()
-
-    for section in sections:
-        quote_count += len(section['quotes'])
-        if 'deleted' in section.keys():
-            deleted_count[0] += 1
-            deleted_count[1].add(section['deleted'])
-        for quote in section['quotes']:
-            speakers.add(quote['speaker'])
-
-    print(f'{quote_count} quotes.')
-    print(f'{deleted_count[0]} different deleted sections, {len(deleted_count[1])} unique.')
-    print(f'{len(speakers)} Speakers:')
-    print(', '.join(speakers))
-
-
-def generate_algolia():
-    data = []
-    quote_num = 0
-    for season, episode in episodes():
-        try:
-            with open(os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json'), 'r',
-                      encoding='utf-8') as file:
-                episode_data = json.load(file)
-        except FileNotFoundError:
-            print(f'No JSON data for Season {season} Episode {episode}')
-        else:
-            for section_num, section in enumerate(episode_data, start=1):
-                for quote in section['quotes']:
-                    quote_num += 1
-                    quote['quote'] = quote_num
-                    quote['section'] = section_num
-                    quote['episode'] = episode
-                    quote['season'] = season
-                    quote['is_deleted'] = 'deleted' in section.keys()
-                    quote['deleted_section'] = section.get('deleted')
-                    data.append(quote)
-
-    with open(os.path.join(DATA_DIR, 'algolia.json'), 'w', encoding='utf-8') as file:
-        json.dump(data, file, ensure_ascii=False, indent=4)
-
-
-def get_episode_scenes(season, episode):
-    filepath = os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json')
-    if os.path.exists(filepath):
-        with open(filepath, 'r', encoding='utf-8') as file:
-            return json.load(file)
-    else:
-        return None
-
-
-def get_characters(season, episode):
-    scenes = get_episode_scenes(season, episode)
-    if scenes is None:
-        return None
+def get_characters(season, episode) -> List[Dict[str, Union[int, str]]]:
+    """
+    Extracts all characters and their number of appearances from a specific episode.
+
+    Prepared as a list of dictionaries, the preferable storage/for-loop format.
+    """
+    filepath = get_filepath(season, episode, 'processed')
+    if not os.path.exists(filepath):
+        return []
+    scenes = load_file(filepath, True)

     characters = defaultdict(int)
     for scene in scenes:
-        for quote in scene['quotes']:
-            characters[quote['speaker']] += 1
+        for quote in scene.get('quotes', []):
+            characters[quote.get('speaker')] += 1

     characters = [{'name': character, 'appearances': appearances, 'id': '-'.join(character.split(' ')).lower()}
                   for character, appearances in characters.items()]
     return list(sorted(characters, key=lambda item: item['appearances'], reverse=True))
-
-
-def generate_final():
-    """Merge episode descriptions/titles and quotes into final JSON file."""
-    with open(os.path.join(DATA_DIR, 'descriptions.json'), 'r', encoding='utf-8') as file:
-        data = json.load(file)
-
-    output = []
-    for season_id, season in enumerate(data, start=1):
-        output.append({
-            'season_id': season_id,
-            'episodes': [
-                {
-                    'title': episode['title'].strip(),
-                    'description': episode['description'].strip(),
-                    'episode_id': episode_id,
-                    'characters': get_characters(season_id, episode_id),
-                    'scenes': get_episode_scenes(season_id, episode_id)
-                }
-                for episode_id, episode in enumerate(season, start=1)
-            ]
-        })
-
-    with open(os.path.join(DATA_DIR, 'data.json'), 'w', encoding='utf-8') as file:
-        json.dump(output, file, ensure_ascii=False, indent=4)
-
-
-# generate_algolia()
-# process(3, 10)
-generate_final()
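For reference, the reworked `get_characters` returns a list sorted by appearance count, shaped like this (a sketch; the names and counts here are invented):
```python
[
    {'name': 'Michael Scott', 'appearances': 82, 'id': 'michael-scott'},
    {'name': 'Jim Halpert', 'appearances': 44, 'id': 'jim-halpert'},
    {'name': 'Pam Beesly', 'appearances': 31, 'id': 'pam-beesly'},
]
```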