clean up basic filename/filepath functions, add rich argument fetch command with progressbars for fetch/delayed requests

This commit is contained in:
Xevion
2020-08-08 01:15:08 -05:00
parent 437d132d18
commit 5c80d47404
2 changed files with 166 additions and 5 deletions

View File

@@ -1,4 +1,26 @@
"""
cli.py
CLI entrypoint for fetching, processing and compiling quote data.
"""
import logging
import os
import sys
import time
from typing import List, Tuple
import click
import enlighten
import requests
# Point the first import-path entry at its parent directory so that the
# `server` package below can be imported. The path is built with os.path so it
# works with any path separator, not only the Windows backslash.
# NOTE(review): assumes this file is executed as a script, in which case
# sys.path[0] is the script's own directory -- confirm.
sys.path[0] = os.path.join(sys.path[0], os.pardir)
from server.process import get_episodes, get_filepath, sleep_from, verify_episode

# The root logger is configured at INFO; the 'cli' logger is lowered to DEBUG.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('cli')
logger.setLevel(logging.DEBUG)

# Module-level progressbar manager.
manager = enlighten.get_manager()
@click.group()
@@ -8,11 +30,72 @@ def cli():
@cli.command('fetch')
@click.option('-s', '--season', type=int,
              help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('-d', '--delay', type=float, default=0.5, help='Delay between each request')
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, silent_skip: bool):
    """
    Fetches data from officequotes.net, placing them in unmodified UTF-8 HTML files.

    :param season: Season to fetch; without ``episode`` the whole season is fetched.
    :param episode: Single episode to fetch; only honoured together with ``season``.
    :param delay: Minimum number of seconds between two consecutive requests.
    :param all: Fetch every known episode, ignoring ``season`` and ``episode``.
        (The name shadows the builtin because it is dictated by the ``--all`` option.)
    :param overwrite: Re-download episodes whose HTML file already exists.
    :param silent_skip: Do not log a message for episodes skipped because their file exists.
    """
    # Work out which (season, episode) pairs were requested.
    episodes: List[Tuple[int, int]]
    if all:
        episodes = list(get_episodes())
    elif season:
        if episode:
            if not verify_episode(season, episode):
                logger.error(f'Season {season}, Episode {episode} is not a valid combination.')
                return
            episodes = [(season, episode)]
        else:
            # Reject unknown seasons up front instead of silently fetching nothing.
            if not verify_episode(season):
                logger.error(f'Season {season} is not a valid season.')
                return
            episodes = list(get_episodes(season=season))
            logger.info(f'Fetching Season {season}...')
    else:
        if episode:
            logger.info('You must specify more than just an episode.')
        else:
            logger.info('You must specify which episodes to fetch.')
        logger.info('Check --help for more information on this command.')
        return

    logger.debug(f'Ready to start fetching {len(episodes)} quote page{"s" if len(episodes) != 1 else ""}')

    # Pretend the previous request happened `delay` seconds ago so the first one is not delayed.
    last_request = time.time() - delay

    with requests.Session() as session, enlighten.Manager() as manager:
        with manager.counter(total=len(episodes), desc='Fetching...', unit='episodes') as pbar:
            for _season, _episode in episodes:
                filepath = get_filepath(_season, _episode, 'html')

                # Check if HTML file exists
                if not overwrite and os.path.exists(filepath):
                    if not silent_skip:
                        logger.debug(f'Skipping Season {_season}, Episode {_episode}: File already exists.')
                else:
                    logger.info(f'Fetching Season {_season}, Episode {_episode}...')

                    # Generate link, make request
                    link = f"http://officequotes.net/no{_season}-{str(_episode).zfill(2)}.php"
                    sleep_from(delay, last_request, manager)  # Sleep at least :delay: seconds.
                    try:
                        resp = session.get(link)
                    except requests.RequestException as error:
                        # One unreachable page must not abort the rest of the batch.
                        logger.error(f'Fetching failed. {error}')
                    else:
                        if resp.ok:
                            # Write data to file
                            with open(filepath, 'w', encoding='utf-8') as file:
                                file.write(resp.text)
                            logger.debug('Successfully fetched.')
                        else:
                            logger.error(f'Fetching failed. Erroneous response code {resp.status_code}.')
                    finally:
                        # Failed attempts count towards the rate limit as well.
                        last_request = time.time()

                # Skipped, failed and fetched episodes all advance the bar.
                pbar.update()

    logger.info('Fetching complete.')
@cli.command('process')

View File

@@ -1,15 +1,93 @@
import json
import os
import re
import time
import traceback
from collections import defaultdict
from math import ceil
from typing import Iterable, Tuple
import enlighten
import requests
from bs4 import BeautifulSoup
# HTTP sessions. Both names are bound here; get_raw further down in this file
# contains a `s.get(link)` call as well as a `session.get(link)` call.
s = requests.Session()
session = requests.Session()

# Directory containing this module.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# NOTE: DATA_DIR is assigned twice in a row; the second assignment
# (BASE_DIR/data) overwrites the first and is the value that takes effect.
DATA_DIR = os.path.join(BASE_DIR, 'server', 'data')
DATA_DIR = os.path.join(BASE_DIR, 'data')

# Maps a data sub-folder name to the file extension used inside it
# (looked up by get_filepath, which falls back to 'json' for unknown folders).
folder_exts = {'html': 'html', 'processed': 'json', 'raw': 'txt'}
# Number of episodes per season; nine entries, read as episode_counts[season - 1]
# by get_episodes (seasons are numbered from 1).
episode_counts = [6, 22, 23, 14, 26, 24, 24, 24, 23]
def get_filename(season: int, episode: int, extension: str) -> str:
    """
    Build the standardized file name for an episode.

    The result has the form ``<season>-<episode>.<extension>``, where the
    episode number is left-padded with zeros to at least two characters.
    """
    padded_episode = str(episode).zfill(2)
    return '{}-{}.{}'.format(season, padded_episode, extension)
def get_filepath(season: int, episode: int, folder: str) -> str:
    """
    Build the full path of an episode's data file.

    :param season: Season number.
    :param episode: Episode number.
    :param folder: Sub-folder of DATA_DIR holding the file. Its extension is looked up in
        folder_exts, defaulting to 'json'. A falsy value places the file, with a 'json'
        extension, directly inside DATA_DIR.
    :return: The joined file path.
    """
    if not folder:
        return os.path.join(DATA_DIR, get_filename(season, episode, 'json'))

    extension = folder_exts.get(folder, 'json')
    filename = get_filename(season, episode, extension)
    return os.path.join(DATA_DIR, folder, filename)
def load_file(filepath: str, parse_json: bool):
    """
    Shortcut function for loading a file from a filepath, with a JSON parsing flag.

    The file is decoded as UTF-8, matching the encoding this project uses when
    writing its data files, instead of relying on the platform default encoding.

    :param filepath: Path of the file to read.
    :param parse_json: When true, parse the content as JSON and return the resulting
        object; otherwise return the raw text.
    :return: The parsed JSON value or the file content as a string.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        if parse_json:
            return json.load(file)
        return file.read()
def get_episodes(season: int = None) -> Iterable[Tuple[int, int]]:
    """
    Yields (season, episode) tuples in ascending order.

    If a season is specified, only the episodes of that season are yielded; nothing is
    yielded for a season number that has no entry in episode_counts. Without a season
    (None, or any other falsy value) every episode of every season is yielded.

    :param season: Season number, starting at 1, or None for all seasons.
    """
    if season:
        if 1 <= season <= len(episode_counts):
            # range() excludes its stop value, so add 1 to include the season's last episode.
            for episode in range(1, episode_counts[season - 1] + 1):
                yield season, episode
        return

    for season_number, ep_count in enumerate(episode_counts, start=1):
        for episode in range(1, ep_count + 1):
            yield season_number, episode
def verify_episode(season: int, episode: int = None) -> bool:
    """
    Verifies that a Season or Season + Episode is valid.

    :param season: Season number, starting at 1.
    :param episode: Episode number, starting at 1; when None only the season is checked.
    :return: True if the season exists and, when given, the episode lies within that season.
    """
    if not 1 <= season <= len(episode_counts):
        return False
    if episode is None:
        return True
    # episode_counts is zero-indexed while seasons are numbered from 1.
    return 1 <= episode <= episode_counts[season - 1]
def sleep_from(wait_time: float, moment: float, manager: 'enlighten.Manager' = None) -> float:
    """
    Sleeps until at least a given amount of time has passed since a previous moment.

    :param wait_time: The minimum amount of time that must be waited since the specified moment.
    :param moment: Epoch time.
    :param manager: Progressbar Manager. When given, a temporary 'Sleeping...' counter is
        shown and advanced while waiting.
    :return: The number of seconds slept (rounded to two decimals when a manager is used),
        or 0 when the remaining wait was 0.01 seconds or less and no sleep took place.
    """
    remaining = wait_time - (time.time() - moment)

    # Remaining waits of 10ms or less are skipped entirely.
    if remaining <= 0.01:
        return 0

    if not manager:
        time.sleep(remaining)
        return remaining

    remaining = round(remaining, 2)
    # One progress tick per (roughly) 10ms. The tick length is derived from the tick
    # count so that all ticks together add up to the full remaining time.
    ticks = max(1, ceil(remaining * 100))
    tick_length = remaining / ticks

    bar = manager.counter(total=ticks, desc='Sleeping...', leave=False)
    try:
        for _ in range(ticks):
            time.sleep(tick_length)
            bar.update()
    finally:
        # Remove the temporary bar even if the wait is interrupted.
        bar.close()
    return remaining
def get_raw(season, episode):
@@ -24,7 +102,7 @@ def get_raw(season, episode):
# If not, write to disk for later usage
else:
link = f"http://officequotes.net/no{season}-{str(episode).zfill(2)}.php"
resp = s.get(link)
resp = session.get(link)
if resp.ok:
page_data = resp.text
with open(html_filepath, 'w', encoding='utf-8') as file: