diff --git a/server/cli.py b/server/cli.py
index 1d4c48c..65b7a23 100644
--- a/server/cli.py
+++ b/server/cli.py
@@ -1,4 +1,27 @@
+"""
+cli.py
+
+CLI entrypoint for fetching, processing and compiling quote data.
+"""
+
+import logging
+import os
+import sys
+import time
+from typing import List, Tuple
+
 import click
+import enlighten
+import requests
+
+# Point sys.path[0] at its parent directory (os.pardir works on every OS) so `server.process` is importable.
+sys.path[0] = os.path.join(sys.path[0], os.pardir)
+from server.process import get_episodes, get_filepath, sleep_from, verify_episode
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger('cli')
+logger.setLevel(logging.DEBUG)
+manager = enlighten.get_manager()
 
 
 @click.group()
@@ -8,11 +31,75 @@ def cli():
 
 
 @cli.command('fetch')
-def fetch():
+@click.option('-s', '--season', type=int,
+              help='Season to be fetched. Without --episode, will download all episodes in a season.')
+@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
+@click.option('-d', '--delay', type=float, default=0.5, help='Delay between each request')
+@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
+@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
+@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
+def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, silent_skip: bool):
     """
     Fetches data from officequotes.net, placing them in unmodified UTF-8 HTML files.
     """
-    pass
+    episodes: List[Tuple[int, int]]
+
+    if all:
+        episodes = list(get_episodes())
+    elif season:
+        if episode:
+            if verify_episode(season, episode):
+                episodes = [(season, episode)]
+            else:
+                logger.error(f'Season {season}, Episode {episode} is not a valid combination.')
+                return
+        elif not verify_episode(season):
+            logger.error(f'Season {season} is not a valid season.')
+            return
+        else:
+            episodes = list(get_episodes(season=season))
+            logger.info(f'Fetching Season {season}...')
+    else:
+        if episode:
+            logger.info('You must specify more than just an episode.')
+        else:
+            logger.info('You must specify which episodes to fetch.')
+        logger.info('Check --help for more information on this command.')
+        return
+
+    logger.debug(f'Ready to start fetching {len(episodes)} quote page{"s" if len(episodes) > 1 else ""}')
+    session = requests.Session()
+    last_request = time.time() - delay
+
+    with enlighten.Manager() as manager:
+        with manager.counter(total=len(episodes), desc='Fetching...', unit='episodes') as pbar:
+            for _season, _episode in episodes:
+
+                filepath = get_filepath(_season, _episode, 'html')
+
+                # Check if HTML file exists
+                if not overwrite and os.path.exists(filepath):
+                    if not silent_skip:
+                        logger.debug(f'Skipping Season {_season}, Episode {_episode}: File already exists.')
+                else:
+                    logger.info(f'Fetching Season {_season}, Episode {_episode}...')
+
+                    # Generate link, make request
+                    link = f"http://officequotes.net/no{_season}-{str(_episode).zfill(2)}.php"
+
+                    sleep_from(delay, last_request, manager)  # Sleep at least :delay: seconds.
+
+                    resp = session.get(link)
+                    last_request = time.time()
+                    if resp.ok:
+                        # Write data to file
+                        with open(filepath, 'w', encoding='utf-8') as file:
+                            file.write(resp.text)
+                        logger.debug('Successfully fetched.')
+                    else:
+                        logger.error(f'Fetching failed. Erroneous response code {resp.status_code}.')
+                pbar.update()
+    logger.info('Fetching complete.')
 
 
 @cli.command('process')
diff --git a/server/process.py b/server/process.py
index 5013e5e..fe0fff9 100644
--- a/server/process.py
+++ b/server/process.py
@@ -1,15 +1,99 @@
 import json
 import os
 import re
+import time
 import traceback
 from collections import defaultdict
+from math import ceil
+from typing import Iterable, Tuple
 
+import enlighten
 import requests
 from bs4 import BeautifulSoup
 
-s = requests.Session()
+session = requests.Session()
+
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-DATA_DIR = os.path.join(BASE_DIR, 'server', 'data')
+DATA_DIR = os.path.join(BASE_DIR, 'data')
+
+folder_exts = {'html': 'html', 'processed': 'json', 'raw': 'txt'}
+episode_counts = [6, 22, 23, 14, 26, 24, 24, 24, 23]
+
+
+def get_filename(season: int, episode: int, extension: str) -> str:
+    """Get filename for any given episode in standardized format"""
+    return f'{season}-{str(episode).zfill(2)}.{extension}'
+
+
+def get_filepath(season: int, episode: int, folder: str) -> str:
+    """Get full filepath for an episode's datafile for a given folder."""
+    if folder:
+        return os.path.join(DATA_DIR, folder, get_filename(season, episode, folder_exts.get(folder, 'json')))
+    return os.path.join(DATA_DIR, get_filename(season, episode, 'json'))
+
+
+def load_file(filepath: str, parse_json: bool):
+    """Shortcut function for loading file from filepath, with JSON parsing flag."""
+    if parse_json:
+        with open(filepath, 'r', encoding='utf-8') as file:
+            return json.load(file)
+    else:
+        with open(filepath, 'r', encoding='utf-8') as file:
+            return file.read()
+
+
+def get_episodes(season: int = None) -> Iterable[Tuple[int, int]]:
+    """
+    Yields (season, episode) tuples.
+
+    If a season is specified, only that season's episodes are yielded (nothing if
+    the season is outside 1-9); otherwise every episode of every season is yielded.
+    """
+    if season:
+        if 1 <= season <= 9:
+            # episode_counts is 0-indexed by season; range() excludes its stop, hence the + 1.
+            for episode in range(1, episode_counts[season - 1] + 1):
+                yield season, episode
+    else:
+        for season, ep_count in enumerate(episode_counts, start=1):
+            for episode in range(1, ep_count + 1):
+                yield season, episode
+
+
+def verify_episode(season: int, episode: int = None) -> bool:
+    """
+    Verifies that a Season or Season + Episode is valid.
+    """
+    return 1 <= season <= 9 and (episode is None or 1 <= episode <= episode_counts[season - 1])
+
+
+def sleep_from(wait_time: float, moment: float, manager: enlighten.Manager = None) -> float:
+    """
+    Sleeps for a specific amount of time, accordingly to a previous moment.
+
+    :param wait_time: The minimum amount of time that must be waited since the specified moment.
+    :param moment: Epoch time.
+    :param manager: Progressbar Manager
+    :return: The number of seconds slept, or 0 if no sleep was needed.
+    """
+    passed = time.time() - moment
+    time_slept = wait_time - passed
+    if time_slept > 0.01:
+        if manager:
+            time_slept = round(time_slept, 2)
+            # One progress tick per ~10ms; dividing by the tick count keeps the total equal to time_slept.
+            total = ceil(time_slept * 100)
+            delay = time_slept / total
+            bar = manager.counter(total=total, desc='Sleeping...', leave=False)
+            for _ in range(total):
+                time.sleep(delay)
+                bar.update()
+            bar.close()
+        else:
+            time.sleep(time_slept)
+        return time_slept
+    else:
+        return 0
 
 
 def get_raw(season, episode):
@@ -24,7 +108,7 @@ def get_raw(season, episode):
     # If not, write to disk for later usage
     else:
         link = f"http://officequotes.net/no{season}-{str(episode).zfill(2)}.php"
-        resp = s.get(link)
+        resp = session.get(link)
         if resp.ok:
             page_data = resp.text
             with open(html_filepath, 'w', encoding='utf-8') as file: