Template the preprocess/process/algolia/final commands; clean up process.py by removing obsolete functions and adding save_file & preprocess

This commit is contained in:
Xevion
2020-08-08 03:59:03 -05:00
parent 5c80d47404
commit d6e2ce1df4
2 changed files with 90 additions and 54 deletions

View File

@@ -3,7 +3,6 @@ cli.py
CLI entrypoint for fetching, processing and compiling quote data.
"""
import logging
import os
import sys
@@ -39,7 +38,9 @@ def cli():
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, silent_skip: bool):
"""
Fetches data from officequotes.net, placing them in unmodified UTF-8 HTML files.
Downloads raw quote pages from 'officequotes.net'.
Fetches quote pages, placing them in 'html' folder in unmodified UTF-8 HTML files.
"""
episodes: List[Tuple[int, int]]
@@ -98,18 +99,80 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
logger.info('Fetching complete.')
@cli.command('preprocess')
@click.option('-s', '--season', type=int,
help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
@click.option('-d', '--dry-run', is_flag=True)
def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool, dry_run: bool):
"""
Pre-processes raw HTML files into mangled custom quote data.
Custom quote data requires manual inspection and formatting, making it a dangerous operation that may overwrite
precious quote data.
"""
episodes: List[Tuple[int, int]]
if all:
episodes = list(get_episodes())
elif season:
if episode:
if verify_episode(season, episode):
episodes = [(season, episode)]
else:
logger.error(f'Season {season}, Episode {episode} is not a valid combination.')
return
else:
episodes = list(get_episodes(season=season))
logger.info(f'Preprocessing Season {season}...')
else:
if episode:
logger.info('You must specify more than just an episode.')
else:
logger.info('You must specify which episodes to pre-process.')
logger.info('Check --help for more information on this command.')
@cli.command('process')
def process():
@click.option('-s', '--season', type=int,
help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
def process(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool):
"""
Processes manually processed raw quote data into JSON.
"""
pass
episodes: List[Tuple[int, int]]
if all:
episodes = list(get_episodes())
elif season:
if episode:
if verify_episode(season, episode):
episodes = [(season, episode)]
else:
logger.error(f'Season {season}, Episode {episode} is not a valid combination.')
return
else:
episodes = list(get_episodes(season=season))
logger.info(f'Processing Season {season}...')
else:
if episode:
logger.info('You must specify more than just an episode.')
else:
logger.info('You must specify which episodes to process.')
logger.info('Check --help for more information on this command.')
return
@cli.group('build')
def build():
"""Data building command group."""
pass
"""Build final data files used by Algolia and the backend API."""
@build.command('algolia')
@@ -117,13 +180,13 @@ def algolia():
"""
Generates algolia.json, a all encompassing file for Algolia's search index.
"""
pass
files = list(filter(os.path.exists, [get_filepath(season, episode, 'processed') for season, episode in get_episodes()]))
@build.command('final')
def final():
"""Generates the latest application static data.json file, used by the backend API."""
pass
files = list(filter(os.path.exists, [get_filepath(season, episode, 'processed') for season, episode in get_episodes()]))
if __name__ == "__main__":

View File

@@ -2,10 +2,9 @@ import json
import os
import re
import time
import traceback
from collections import defaultdict
from math import ceil
from typing import Iterable, Tuple
from typing import Iterable, List, Tuple
import enlighten
import requests
@@ -32,16 +31,26 @@ def get_filepath(season: int, episode: int, folder: str) -> str:
return os.path.join(DATA_DIR, get_filename(season, episode, 'json'))
def load_file(filepath: str, parse_json: bool):
def load_file(filepath: str, json_decode: bool):
"""Shortcut function for loading file from filepath, with JSON parsing flag."""
if parse_json:
with open(filepath, 'r') as file:
if json_decode:
with open(filepath, 'r', encoding='utf-8') as file:
return json.load(file)
else:
with open(filepath, 'r') as file:
with open(filepath, 'r', encoding='utf-8') as file:
return file.read()
def save_file(filepath: str, data, json_encode: bool):
"""Shortcut function for saving data to a file, JSON encoding flag."""
if json_encode:
with open(filepath, 'w', encoding='utf-8') as file:
json.dump(data, file, ensure_ascii=False, indent=4)
else:
with open(filepath, 'w', encoding='utf-8') as file:
file.write(data)
def get_episodes(season: int = None) -> Iterable[Tuple[int, int]]:
"""
Yields a list of Episode & Season tuples.
@@ -90,26 +99,7 @@ def sleep_from(wait_time: float, moment: float, manager: enlighten.Manager = Non
return 0
def get_raw(season, episode):
html_filename = f'{season}-{str(episode).zfill(2)}.html'
html_filepath = os.path.join(DATA_DIR, 'html', html_filename)
# If .html file exists, read
if os.path.exists(html_filepath):
# print('Reading from disk...')
with open(html_filepath, 'r', encoding='utf-8') as file:
page_data = file.read()
# If not, write to disk for later usage
else:
link = f"http://officequotes.net/no{season}-{str(episode).zfill(2)}.php"
resp = session.get(link)
if resp.ok:
page_data = resp.text
with open(html_filepath, 'w', encoding='utf-8') as file:
file.write(page_data)
else:
raise Exception(f'HTTPError: {resp.status_code} at "{resp.url}"')
def preprocess(page_data: str) -> List[str]:
soup = BeautifulSoup(page_data, "html.parser")
data = []
@@ -117,31 +107,14 @@ def get_raw(season, episode):
for section in sections:
for br in section.find_all('br'):
br.replace_with("\n" + br.text)
for line in section.get_text().split('\n'):
data.append(line.strip())
data.append('-')
data.pop(-1)
with open(os.path.join(DATA_DIR, 'raw', f'{season}-{str(episode).zfill(2)}.txt'), 'w',
encoding='utf-8') as file:
file.write('\n'.join(data))
def episodes():
ep_nums = [6, 22, 23, 14, 26, 24, 24, 24, 23]
for season_num, ep_count in enumerate(ep_nums, start=1):
for episode_num in range(1, ep_count + 1):
yield season_num, episode_num
def download_all_raw():
for season_num, episode_num in episodes():
print(f'{season_num}-{str(episode_num).zfill(2)}')
try:
get_raw(season_num, episode_num)
except Exception as exception:
print(f'Failed to process Season {season_num} Episode {episode_num} - ({type(exception).__name__})')
traceback.print_exc()
return data
def process(season, episode):