mirror of
https://github.com/Xevion/the-office.git
synced 2025-12-10 10:08:57 -06:00
template preprocess/process/algolia/final commands, cleanup process.py functions removing obsolete funcs, adding save_file & preprocess
This commit is contained in:
@@ -3,7 +3,6 @@ cli.py
|
|||||||
|
|
||||||
CLI entrypoint for fetching, processing and compiling quote data.
|
CLI entrypoint for fetching, processing and compiling quote data.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
@@ -39,7 +38,9 @@ def cli():
|
|||||||
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
|
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
|
||||||
def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, silent_skip: bool):
|
def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, silent_skip: bool):
|
||||||
"""
|
"""
|
||||||
Fetches data from officequotes.net, placing them in unmodified UTF-8 HTML files.
|
Downloads raw quote pages from 'officequotes.net'.
|
||||||
|
|
||||||
|
Fetches quote pages, placing them in 'html' folder in unmodified UTF-8 HTML files.
|
||||||
"""
|
"""
|
||||||
episodes: List[Tuple[int, int]]
|
episodes: List[Tuple[int, int]]
|
||||||
|
|
||||||
@@ -98,18 +99,80 @@ def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, s
|
|||||||
logger.info('Fetching complete.')
|
logger.info('Fetching complete.')
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command('preprocess')
@click.option('-s', '--season', type=int,
              help='Season to be pre-processed. Without --episode, will pre-process all episodes in a season.')
@click.option('-e', '--episode', type=int,
              help='Specific episode to be pre-processed. Requires --season to be specified.')
@click.option('--all', is_flag=True, help='Pre-process all episodes, regardless of previous specifications.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
@click.option('-d', '--dry-run', is_flag=True)
def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool, dry_run: bool):
    """
    Pre-processes raw HTML files into mangled custom quote data.

    Custom quote data requires manual inspection and formatting, making it a dangerous operation that may overwrite
    precious quote data.
    """
    # NOTE(review): this command is still a template. It only resolves which episodes were requested;
    # 'overwrite', 'silent_skip' and 'dry_run' are accepted but not acted on yet.
    episodes: List[Tuple[int, int]]

    if all:
        # --all wins over any --season/--episode that was also passed.
        episodes = list(get_episodes())
    elif season and episode:
        if not verify_episode(season, episode):
            logger.error(f'Season {season}, Episode {episode} is not a valid combination.')
            return
        episodes = [(season, episode)]
    elif season:
        episodes = list(get_episodes(season=season))
        logger.info(f'Preprocessing Season {season}...')
    else:
        # Nothing usable was specified: either only --episode, or no selection at all.
        if episode:
            logger.info('You must specify more than just an episode.')
        else:
            logger.info('You must specify which episodes to pre-process.')
        logger.info('Check --help for more information on this command.')
        # Stop here so code added below never runs with 'episodes' unbound.
        return
|
||||||
|
|
||||||
|
|
||||||
@cli.command('process')
|
@cli.command('process')
|
||||||
def process():
|
@click.option('-s', '--season', type=int,
|
||||||
|
help='Season to be fetched. Without --episode, will download all episodes in a season.')
|
||||||
|
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
|
||||||
|
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
|
||||||
|
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
|
||||||
|
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
|
||||||
|
def process(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool):
|
||||||
"""
|
"""
|
||||||
Processes manually processed raw quote data into JSON.
|
Processes manually processed raw quote data into JSON.
|
||||||
"""
|
"""
|
||||||
pass
|
episodes: List[Tuple[int, int]]
|
||||||
|
|
||||||
|
if all:
|
||||||
|
episodes = list(get_episodes())
|
||||||
|
elif season:
|
||||||
|
if episode:
|
||||||
|
if verify_episode(season, episode):
|
||||||
|
episodes = [(season, episode)]
|
||||||
|
else:
|
||||||
|
logger.error(f'Season {season}, Episode {episode} is not a valid combination.')
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
episodes = list(get_episodes(season=season))
|
||||||
|
logger.info(f'Processing Season {season}...')
|
||||||
|
else:
|
||||||
|
if episode:
|
||||||
|
logger.info('You must specify more than just an episode.')
|
||||||
|
else:
|
||||||
|
logger.info('You must specify which episodes to process.')
|
||||||
|
logger.info('Check --help for more information on this command.')
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
@cli.group('build')
|
@cli.group('build')
|
||||||
def build():
|
def build():
|
||||||
"""Data building command group."""
|
"""Build final data files used by Algolia and the backend API."""
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
@build.command('algolia')
|
@build.command('algolia')
|
||||||
@@ -117,13 +180,13 @@ def algolia():
|
|||||||
"""
|
"""
|
||||||
Generates algolia.json, a all encompassing file for Algolia's search index.
|
Generates algolia.json, a all encompassing file for Algolia's search index.
|
||||||
"""
|
"""
|
||||||
pass
|
files = list(filter(os.path.exists, [get_filepath(season, episode, 'processed') for season, episode in get_episodes()]))
|
||||||
|
|
||||||
|
|
||||||
@build.command('final')
|
@build.command('final')
|
||||||
def final():
|
def final():
|
||||||
"""Generates the latest application static data.json file, used by the backend API."""
|
"""Generates the latest application static data.json file, used by the backend API."""
|
||||||
pass
|
files = list(filter(os.path.exists, [get_filepath(season, episode, 'processed') for season, episode in get_episodes()]))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -2,10 +2,9 @@ import json
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
import traceback
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from math import ceil
|
from math import ceil
|
||||||
from typing import Iterable, Tuple
|
from typing import Iterable, List, Tuple
|
||||||
|
|
||||||
import enlighten
|
import enlighten
|
||||||
import requests
|
import requests
|
||||||
@@ -32,16 +31,26 @@ def get_filepath(season: int, episode: int, folder: str) -> str:
|
|||||||
return os.path.join(DATA_DIR, get_filename(season, episode, 'json'))
|
return os.path.join(DATA_DIR, get_filename(season, episode, 'json'))
|
||||||
|
|
||||||
|
|
||||||
def load_file(filepath: str, parse_json: bool):
|
def load_file(filepath: str, json_decode: bool):
|
||||||
"""Shortcut function for loading file from filepath, with JSON parsing flag."""
|
"""Shortcut function for loading file from filepath, with JSON parsing flag."""
|
||||||
if parse_json:
|
if json_decode:
|
||||||
with open(filepath, 'r') as file:
|
with open(filepath, 'r', encoding='utf-8') as file:
|
||||||
return json.load(file)
|
return json.load(file)
|
||||||
else:
|
else:
|
||||||
with open(filepath, 'r') as file:
|
with open(filepath, 'r', encoding='utf-8') as file:
|
||||||
return file.read()
|
return file.read()
|
||||||
|
|
||||||
|
|
||||||
|
def save_file(filepath: str, data, json_encode: bool):
    """
    Shortcut function for saving data to a file, JSON encoding flag.

    :param filepath: Path of the file to write; it is opened as UTF-8 text and overwritten.
    :param data: A JSON-serializable object when json_encode is set, otherwise the string written verbatim.
    :param json_encode: Whether to serialize data as indented JSON (non-ASCII characters kept as-is) before writing.
    :raises TypeError: If json_encode is set and data holds a non-serializable object; the file is left untouched.
    :raises ValueError: If json_encode is set and data contains a circular reference; the file is left untouched.
    """
    # Serialize before opening: mode 'w' truncates the target immediately, so encoding first means a
    # serialization failure can no longer leave an empty or half-written file behind.
    text = json.dumps(data, ensure_ascii=False, indent=4) if json_encode else data

    with open(filepath, 'w', encoding='utf-8') as file:
        file.write(text)
|
||||||
|
|
||||||
|
|
||||||
def get_episodes(season: int = None) -> Iterable[Tuple[int, int]]:
|
def get_episodes(season: int = None) -> Iterable[Tuple[int, int]]:
|
||||||
"""
|
"""
|
||||||
Yields a list of Episode & Season tuples.
|
Yields a list of Episode & Season tuples.
|
||||||
@@ -90,26 +99,7 @@ def sleep_from(wait_time: float, moment: float, manager: enlighten.Manager = Non
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
def get_raw(season, episode):
|
def preprocess(page_data: str) -> List[str]:
|
||||||
html_filename = f'{season}-{str(episode).zfill(2)}.html'
|
|
||||||
html_filepath = os.path.join(DATA_DIR, 'html', html_filename)
|
|
||||||
|
|
||||||
# If .html file exists, read
|
|
||||||
if os.path.exists(html_filepath):
|
|
||||||
# print('Reading from disk...')
|
|
||||||
with open(html_filepath, 'r', encoding='utf-8') as file:
|
|
||||||
page_data = file.read()
|
|
||||||
# If not, write to disk for later usage
|
|
||||||
else:
|
|
||||||
link = f"http://officequotes.net/no{season}-{str(episode).zfill(2)}.php"
|
|
||||||
resp = session.get(link)
|
|
||||||
if resp.ok:
|
|
||||||
page_data = resp.text
|
|
||||||
with open(html_filepath, 'w', encoding='utf-8') as file:
|
|
||||||
file.write(page_data)
|
|
||||||
else:
|
|
||||||
raise Exception(f'HTTPError: {resp.status_code} at "{resp.url}"')
|
|
||||||
|
|
||||||
soup = BeautifulSoup(page_data, "html.parser")
|
soup = BeautifulSoup(page_data, "html.parser")
|
||||||
|
|
||||||
data = []
|
data = []
|
||||||
@@ -117,31 +107,14 @@ def get_raw(season, episode):
|
|||||||
for section in sections:
|
for section in sections:
|
||||||
for br in section.find_all('br'):
|
for br in section.find_all('br'):
|
||||||
br.replace_with("\n" + br.text)
|
br.replace_with("\n" + br.text)
|
||||||
|
|
||||||
for line in section.get_text().split('\n'):
|
for line in section.get_text().split('\n'):
|
||||||
data.append(line.strip())
|
data.append(line.strip())
|
||||||
|
|
||||||
data.append('-')
|
data.append('-')
|
||||||
data.pop(-1)
|
data.pop(-1)
|
||||||
|
|
||||||
with open(os.path.join(DATA_DIR, 'raw', f'{season}-{str(episode).zfill(2)}.txt'), 'w',
|
return data
|
||||||
encoding='utf-8') as file:
|
|
||||||
file.write('\n'.join(data))
|
|
||||||
|
|
||||||
|
|
||||||
def episodes():
|
|
||||||
ep_nums = [6, 22, 23, 14, 26, 24, 24, 24, 23]
|
|
||||||
for season_num, ep_count in enumerate(ep_nums, start=1):
|
|
||||||
for episode_num in range(1, ep_count + 1):
|
|
||||||
yield season_num, episode_num
|
|
||||||
|
|
||||||
|
|
||||||
def download_all_raw():
|
|
||||||
for season_num, episode_num in episodes():
|
|
||||||
print(f'{season_num}-{str(episode_num).zfill(2)}')
|
|
||||||
try:
|
|
||||||
get_raw(season_num, episode_num)
|
|
||||||
except Exception as exception:
|
|
||||||
print(f'Failed to process Season {season_num} Episode {episode_num} - ({type(exception).__name__})')
|
|
||||||
traceback.print_exc()
|
|
||||||
|
|
||||||
|
|
||||||
def process(season, episode):
|
def process(season, episode):
|
||||||
|
|||||||
Reference in New Issue
Block a user