clean up basic filename/filepath functions, add rich argument fetch command with progressbars for fetch/delayed requests

2026-01-31 08:26:13 -06:00 · 2020-08-08 01:15:08 -05:00
parent 437d132d18
commit 5c80d47404
2 changed files with 166 additions and 5 deletions
@@ -1,4 +1,26 @@
+"""
+cli.py
+
+CLI entrypoint for fetching, processing and compiling quote data.
+"""
+
+import logging
+import os
+import sys
+import time
+from typing import List, Tuple
+
 import click
+import enlighten
+import requests
+
+# Make the parent of the script directory importable so that `server.process` resolves.
+# os.path.join/os.pardir is used instead of a hard-coded '\\..' so this also works on
+# platforms whose path separator is not a backslash.
+sys.path[0] = os.path.join(sys.path[0], os.pardir)
+from server.process import get_episodes, get_filepath, sleep_from, verify_episode
+
+# The root handler is configured at INFO; the 'cli' logger itself is lowered to DEBUG.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger('cli')
+logger.setLevel(logging.DEBUG)
+# Module-level progress bar manager.
+manager = enlighten.get_manager()


@click.group()
@@ -8,11 +30,72 @@ def cli():


@cli.command('fetch')
-def fetch():
+@click.option('-s', '--season', type=int,
+              help='Season to be fetched. Without --episode, will download all episodes in a season.')
+@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
+@click.option('-d', '--delay', type=float, default=0.5, help='Delay between each request')
+@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
+@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
+@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
+def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, silent_skip: bool):
    """
    Fetches data from officequotes.net, placing them in unmodified UTF-8 HTML files.
    """
-    pass
+    # NOTE: the docstring above is what click prints for `fetch --help`, so parameter
+    # documentation lives in these comments instead:
+    #   season      - season to fetch; alone it selects every episode of that season.
+    #   episode     - single episode to fetch; only meaningful together with season.
+    #   delay       - minimum number of seconds between two consecutive requests.
+    #   all         - fetch every known episode, ignoring season/episode.
+    #   overwrite   - re-download even when the HTML file already exists.
+    #   silent_skip - do not log a message for files that are skipped.
+    episodes: List[Tuple[int, int]]
+
+    if all:
+        episodes = list(get_episodes())
+    elif season:
+        if episode:
+            if not verify_episode(season, episode):
+                logger.error(f'Season {season}, Episode {episode} is not a valid combination.')
+                return
+            episodes = [(season, episode)]
+        else:
+            # Reject an unknown season up front instead of "successfully" fetching nothing.
+            if not verify_episode(season):
+                logger.error(f'Season {season} is not a valid season.')
+                return
+            episodes = list(get_episodes(season=season))
+            logger.info(f'Fetching Season {season}...')
+    else:
+        if episode:
+            logger.info('You must specify more than just an episode.')
+        else:
+            logger.info('You must specify which episodes to fetch.')
+        logger.info('Check --help for more information on this command.')
+        return
+
+    logger.debug(f'Ready to start fetching {len(episodes)} quote page{"s" if len(episodes) != 1 else ""}')
+    session = requests.Session()
+    # Pretend the previous request happened :delay: seconds ago so the first one is not delayed.
+    last_request = time.time() - delay
+
+    with enlighten.Manager() as manager:
+        with manager.counter(total=len(episodes), desc='Fetching...', unit='episodes') as pbar:
+            for _season, _episode in episodes:
+
+                filepath = get_filepath(_season, _episode, 'html')
+
+                # Check if HTML file exists
+                if not overwrite and os.path.exists(filepath):
+                    if not silent_skip:
+                        logger.debug(f'Skipping Season {_season}, Episode {_episode}: File already exists.')
+                    pbar.update()
+                    continue
+
+                logger.info(f'Fetching Season {_season}, Episode {_episode}...')
+
+                # Generate link, make request
+                link = f"http://officequotes.net/no{_season}-{str(_episode).zfill(2)}.php"
+
+                sleep_from(delay, last_request, manager)  # Sleep at least :delay: seconds.
+
+                try:
+                    resp = session.get(link)
+                except requests.RequestException as error:
+                    # A single network failure is logged like a bad status code instead of
+                    # aborting the remaining downloads.
+                    logger.error(f'Fetching failed. Request error: {error}')
+                else:
+                    if resp.ok:
+                        # Write data to file
+                        with open(filepath, 'w', encoding='utf-8') as file:
+                            file.write(resp.text)
+                        logger.debug('Successfully fetched.')
+                    else:
+                        logger.error(f'Fetching failed. Erroneous response code {resp.status_code}.')
+                finally:
+                    # The delay is measured from the end of the attempt, successful or not.
+                    last_request = time.time()
+                pbar.update()
+        logger.info('Fetching complete.')


@cli.command('process')
@@ -1,15 +1,93 @@
 import json
 import os
 import re
+import time
 import traceback
 from collections import defaultdict
+from math import ceil
+from typing import Iterable, Tuple

+import enlighten
 import requests
 from bs4 import BeautifulSoup

-s = requests.Session()
+# Module-level HTTP session used for requests made from this module.
+session = requests.Session()
+
+# Directory that contains this module, and its 'data' subdirectory where episode files live.
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-DATA_DIR = os.path.join(BASE_DIR, 'server', 'data')
+DATA_DIR = os.path.join(BASE_DIR, 'data')
+
+# Maps a data subfolder name to the file extension used inside it (see get_filepath).
+folder_exts = {'html': 'html', 'processed': 'json', 'raw': 'txt'}
+# Number of episodes per season; index 0 holds season 1 (see get_episodes).
+episode_counts = [6, 22, 23, 14, 26, 24, 24, 24, 23]
+
+
+def get_filename(season: int, episode: int, extension: str) -> str:
+    """Build the standardized '<season>-<episode>.<extension>' filename, zero-padding the episode to two digits."""
+    padded_episode = str(episode).zfill(2)
+    return '{}-{}.{}'.format(season, padded_episode, extension)
+
+
+def get_filepath(season: int, episode: int, folder: str) -> str:
+    """
+    Build the full path of an episode's datafile.
+
+    With a folder, the file is placed in that subfolder of DATA_DIR and takes the extension
+    registered for the folder in folder_exts ('json' when the folder is not registered).
+    Without a folder, a '.json' file directly inside DATA_DIR is returned.
+    """
+    if not folder:
+        return os.path.join(DATA_DIR, get_filename(season, episode, 'json'))
+
+    extension = folder_exts.get(folder, 'json')
+    filename = get_filename(season, episode, extension)
+    return os.path.join(DATA_DIR, folder, filename)
+
+
+def load_file(filepath: str, parse_json: bool):
+    """
+    Shortcut function for loading file from filepath, with JSON parsing flag.
+
+    :param filepath: Path of the file to read.
+    :param parse_json: When true, the parsed JSON document is returned instead of the raw text.
+    :return: The decoded JSON value if parse_json is set, otherwise the file contents as a string.
+    """
+    # Read explicitly as UTF-8: the data files are written with encoding='utf-8', and the
+    # platform default encoding is not guaranteed to match.
+    with open(filepath, 'r', encoding='utf-8') as file:
+        if parse_json:
+            return json.load(file)
+        return file.read()
+
+
+def get_episodes(season: int = None) -> Iterable[Tuple[int, int]]:
+    """
+    Yields (season, episode) tuples in ascending order.
+
+    If season is specified, only the episodes of that season are yielded; a season number
+    that has no entry in episode_counts yields nothing.
+    If season is omitted, every episode of every season is yielded.
+    """
+    if season is None:
+        for season_number, ep_count in enumerate(episode_counts, start=1):
+            for episode in range(1, ep_count + 1):
+                yield season_number, episode
+    elif 1 <= season <= len(episode_counts):
+        # range() excludes its upper bound, so add 1 to include the season's final episode.
+        for episode in range(1, episode_counts[season - 1] + 1):
+            yield season, episode
+
+
+def verify_episode(season: int, episode: int = None) -> bool:
+    """
+    Verifies that a Season or Season + Episode is valid.
+
+    :param season: Season number, counted from 1.
+    :param episode: Episode number within the season, counted from 1. When omitted, only the season is checked.
+    :return: True if the season (and the episode, when given) exists according to episode_counts.
+    """
+    if not 1 <= season <= len(episode_counts):
+        return False
+    # episode_counts is zero-indexed while seasons are counted from 1.
+    return episode is None or 1 <= episode <= episode_counts[season - 1]
+
+
+def sleep_from(wait_time: float, moment: float, manager: enlighten.Manager = None) -> float:
+    """
+    Sleeps for a specific amount of time, accordingly to a previous moment.
+
+    :param wait_time: The minimum amount of time that must be waited since the specified moment.
+    :param moment: Epoch time.
+    :param manager: Progressbar Manager. When given, the wait is displayed as a temporary 'Sleeping...' counter.
+    :return: The number of seconds slept, or 0 if the remaining wait was 0.01 seconds or less.
+    """
+    remaining = wait_time - (time.time() - moment)
+    if remaining <= 0.01:
+        return 0
+
+    if not manager:
+        time.sleep(remaining)
+        return remaining
+
+    remaining = round(remaining, 2)
+    # One tick per (roughly) hundredth of a second. The per-tick delay is derived from the
+    # tick count so that all ticks together add up to exactly the remaining wait.
+    total = ceil(remaining * 100)
+    delay = remaining / total
+    bar = manager.counter(total=total, desc='Sleeping...', leave=False)
+    try:
+        for _ in range(total):
+            time.sleep(delay)
+            bar.update()
+    finally:
+        # Always remove the temporary bar, even if the sleep is interrupted.
+        bar.close()
+    return remaining


 def get_raw(season, episode):
@@ -24,7 +102,7 @@ def get_raw(season, episode):
    # If not, write to disk for later usage
    else:
        link = f"http://officequotes.net/no{season}-{str(episode).zfill(2)}.php"
-        resp = s.get(link)
+        resp = session.get(link)
        if resp.ok:
            page_data = resp.text
            with open(html_filepath, 'w', encoding='utf-8') as file: