finish preprocess, build algolia, build final commands, update README, remove obsolete code in process.py

This commit is contained in:
Xevion
2020-08-09 17:50:26 -05:00
parent d6e2ce1df4
commit 99b15168e1
6 changed files with 257 additions and 171 deletions

View File

@@ -5,16 +5,20 @@ CLI entrypoint for fetching, processing and compiling quote data.
"""
import logging
import os
import re
import sys
import time
from typing import List, Tuple
from typing import List, Tuple, Union
import click
import enlighten
import requests
from bs4 import BeautifulSoup
sys.path[0] += '\\..'
from server.process import get_episodes, get_filepath, sleep_from, verify_episode
from server.process import DATA_DIR, get_characters, get_episodes, get_filepath, load_file, \
save_file, sleep_from, \
verify_episode
# Configure logging once at import time. The INFO threshold means the
# logger.debug(...) progress messages emitted by the commands below stay hidden.
logging.basicConfig(level=logging.INFO)
# Module-wide logger shared by every command in this CLI.
logger = logging.getLogger('cli')
@@ -24,7 +28,6 @@ manager = enlighten.get_manager()
# Root of the command tree: sub-commands and sub-groups register themselves
# on this group through the @cli.command(...) / @cli.group(...) decorators.
# The docstring is what click prints as the group's --help text.
@click.group()
def cli():
    """Base command group."""
@@ -90,9 +93,8 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
last_request = time.time()
if resp.ok:
# Write data to file
with open(filepath, 'w', encoding='utf-8') as file:
file.write(resp.text)
logger.debug('Successfully fetched.')
save_file(filepath, resp.text, False)
logger.debug('Successfully fetched & saved.')
else:
logger.error(f'Fetching failed. Erroneous response code {resp.status_code}.')
pbar.update()
@@ -105,15 +107,18 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
@click.option('-d', '--dry-run', is_flag=True)
def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool, dry_run: bool):
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip missing/existing files silently')
@click.option('-ssm', '--silent-skip-missing', is_flag=True, help='Skip missing files silently')
@click.option('-sse', '--silent-skip-existing', is_flag=True, help='Skip overwrite skips silently')
def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool, silent_skip_missing: bool,
silent_skip_existing: bool):
"""
Pre-processes raw HTML files into mangled custom quote data.
Custom quote data requires manual inspection and formatting, making it a dangerous operation that may overwrite
precious quote data.
"""
print(silent_skip_existing)
episodes: List[Tuple[int, int]]
if all:
@@ -134,6 +139,38 @@ def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_ski
else:
logger.info('You must specify which episodes to pre-process.')
logger.info('Check --help for more information on this command.')
return
for season, episode in episodes:
    # Overwrite protection: raw files are hand-edited after pre-processing,
    # so an existing file is only replaced when --overwrite was given.
    save_path = get_filepath(season, episode, 'raw')
    if os.path.exists(save_path) and not overwrite:
        # Either the blanket flag (-ss) or the specific one (-sse) silences
        # this message; previously BOTH had to be set for it to go quiet.
        if not (silent_skip or silent_skip_existing):
            logger.info(f'Skipping Season {season}, Episode {episode}, file already exists. Skipping processing.')
        continue

    try:
        page_data = load_file(get_filepath(season, episode, 'html'), False)
    except FileNotFoundError:
        # Same rule as above, with -ssm as the specific flag.
        if not (silent_skip or silent_skip_missing):
            logger.warning(f'No data for Season {season}, Episode {episode} available. Skipping processing.')
        continue

    soup = BeautifulSoup(page_data, "html.parser")
    data = []

    for section in soup.find_all(attrs={"class": "quote"}):
        # Turn <br> tags into real newlines so get_text() keeps one quote per line.
        for br in section.find_all('br'):
            br.replace_with("\n" + br.text)

        data.extend(line.strip() for line in section.get_text().split('\n'))
        # A lone '-' line separates sections in the raw format.
        data.append('-')

    if not data:
        # No quote sections were found; popping the trailing separator from
        # an empty list used to raise IndexError here. Nothing is written, so
        # a later run is not blocked by the overwrite protection.
        logger.warning(f'No quote sections found for Season {season}, Episode {episode}. Nothing saved.')
        continue

    # Drop the separator that trails the final section.
    data.pop()
    save_file(save_path, '\n'.join(data), False)
@cli.command('process')
@@ -141,9 +178,8 @@ def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_ski
help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
def process(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool):
@click.option('-r', '--report', is_flag=True, help='Report quote statistics once processing completed')
def process(season: int, episode: int, all: bool, report: bool):
"""
Processes manually processed raw quote data into JSON.
"""
@@ -169,24 +205,138 @@ def process(season: int, episode: int, all: bool, overwrite: bool, silent_skip:
logger.info('Check --help for more information on this command.')
return
quote: Union[str, List[str], None]
section_num: int

for _season, _episode in episodes:
    sections = []
    # Reset the progress markers for every episode so the error handler below
    # never reads an unbound name or a stale value from a previous episode.
    quote = None
    section_num = 0

    try:
        preprocessed_data = load_file(get_filepath(_season, _episode, 'raw'))
        # Sections are separated by a line starting with '-'.
        for section_num, raw_section in enumerate(re.split('^-', preprocessed_data, flags=re.MULTILINE), start=1):
            section = {
                'quotes': []
            }

            section_data = raw_section.strip().split('\n')
            # An optional leading '!<number>' line marks the section as deleted;
            # the number is stored under the 'deleted' key.
            if section_data[0].startswith('!'):
                section['deleted'] = int(re.search(r'!(\d+)', section_data.pop(0)).group(1))

            # Every remaining line has the form 'speaker|text'.
            for quote in section_data:
                quote = quote.split('|', 1)
                section['quotes'].append(
                    {
                        'speaker': quote[0],
                        'text': quote[1]
                    }
                )
            sections.append(section)
    except FileNotFoundError:
        logger.info(f'Skipped Season {_season}, Episode {_episode}, no file found.')
        continue
    except Exception:
        # Deliberately broad: any parsing failure means the hand-edited file is
        # malformed. Unlike a bare `except:`, Ctrl-C / SystemExit still propagate.
        logger.exception(f'Skipped Season {_season}, Episode {_episode}: Malformed data.')
        if quote is not None:
            last_quote = quote if isinstance(quote, str) else "|".join(quote)
            logger.info(f'Last quote seen "{last_quote}" in section {section_num}')
        continue

    # Save processed data
    save_file(get_filepath(_season, _episode, 'processed'), sections, True)

    if report:
        # Statistics are only reported for episodes that parsed successfully.
        deleted_sections = 0
        deleted_ids = set()
        quote_count = 0
        speakers = set()

        for section in sections:
            quote_count += len(section['quotes'])

            if 'deleted' in section:
                deleted_sections += 1
                deleted_ids.add(section['deleted'])

            for entry in section['quotes']:
                speakers.add(entry['speaker'])

        # Logged at INFO: the report was explicitly requested and the module
        # configures logging at INFO, which would hide debug-level output.
        logger.info(f'{quote_count} quotes.')
        logger.info(f'{deleted_sections} different deleted sections, {len(deleted_ids)} unique.')
        logger.info(f'{len(speakers)} Speakers:')
        # Sorted so the output is stable between runs.
        logger.info(', '.join(sorted(speakers)))
# Sub-group of `cli`; the `algolia` and `final` commands attach to it via
# @build.command(...). The docstring is shown by click as the --help text.
@cli.group('build')
def build():
    """Build final data files used by Algolia and the backend API."""
@build.command('algolia')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip missing files silently')
@click.option('--process', 'run_process', is_flag=True, help='Run processing before building final data.')
def algolia(silent_skip: bool, run_process: bool = False):
    """
    Generates algolia.json, an all-encompassing file for Algolia's search index.
    """
    # NOTE: the docstring above is what click shows as --help text, so
    # implementation notes are kept in comments.
    #
    # --process is bound to `run_process`: the callback previously had no
    # parameter for it (click raised TypeError on every invocation), and naming
    # it `process` would shadow the module-level `process` command used below.
    if run_process:
        # Run the sibling `process` command with its --all flag set; click
        # fills in the defaults for its remaining options.
        click.get_current_context().invoke(process, all=True)

    data = []
    episode_num_abs = section_num_abs = quote_num_abs = 0

    for season, episode in get_episodes():
        # The absolute episode counter advances for every episode returned by
        # get_episodes(), including those whose data file is missing.
        episode_num_abs += 1

        try:
            episode_data = load_file(get_filepath(season, episode, 'processed'), True)
        except FileNotFoundError:
            if not silent_skip:
                logger.warning(f'Skipping Season {season}, Episode {episode}. No episode data file found.')
            continue

        for section_num_rel, section in enumerate(episode_data, start=1):
            section_num_abs += 1
            for quote_num_rel, quote in enumerate(section['quotes'], start=1):
                quote_num_abs += 1

                # Relative position
                quote['quote_rel'] = quote_num_rel
                quote['section_rel'] = section_num_rel
                quote['episode_rel'] = episode
                # Absolute position
                quote['quote_abs'] = quote_num_abs
                quote['section_abs'] = section_num_abs
                quote['episode_abs'] = episode_num_abs

                quote['season'] = season

                # `deleted_section` is None for sections without a 'deleted' key.
                quote['is_deleted'] = 'deleted' in section
                quote['deleted_section'] = section.get('deleted')

                data.append(quote)

    logger.info(f'Saving {len(data):,} quotes to algolia.json')
    save_file(os.path.join(DATA_DIR, 'algolia.json'), data, True)
@build.command('final')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip missing files silently')
@click.option('--process', 'run_process', is_flag=True, help='Run processing before building final data.')
def final(silent_skip: bool, run_process: bool = False):
    """Generates the latest application static data.json file, used by the backend API."""
    # NOTE: the docstring above is what click shows as --help text, so
    # implementation notes are kept in comments.
    #
    # --process is bound to `run_process`: the callback previously had no
    # parameter for it (click raised TypeError on every invocation), and naming
    # it `process` would shadow the module-level `process` command used below.
    if run_process:
        # Run the sibling `process` command with its --all flag set; click
        # fills in the defaults for its remaining options.
        click.get_current_context().invoke(process, all=True)

    # Indexed below as descriptions[season - 1][episode - 1]; each entry
    # provides 'title' and 'description'.
    descriptions = load_file(os.path.join(DATA_DIR, 'descriptions.json'), True)
    # One bucket per season, for seasons 1 through 9.
    seasons = [{'season_id': season, 'episodes': []} for season in range(1, 10)]

    for season_id, episode_id in get_episodes():
        # Load data file
        try:
            episode_data = load_file(get_filepath(season_id, episode_id, 'processed'), True)
        except FileNotFoundError:
            if not silent_skip:
                logger.warning(f'No data for Season {season_id}, Episode {episode_id} available. Null data inserted.')
            # The episode is still listed, with `scenes` set to None.
            episode_data = None

        description = descriptions[season_id - 1][episode_id - 1]
        seasons[season_id - 1]['episodes'].append(
            {
                'title': description['title'].strip(),
                'description': description['description'].strip(),
                'episode_id': episode_id,
                'characters': get_characters(season_id, episode_id),
                'scenes': episode_data
            }
        )

    logger.info('Saving to data.json')
    save_file(os.path.join(DATA_DIR, 'data.json'), seasons, True)
if __name__ == "__main__":