mirror of
https://github.com/Xevion/the-office.git
synced 2025-12-15 02:13:27 -06:00
clean up basic filename/filepath functions, add rich argument fetch command with progressbars for fetch/delayed requests
This commit is contained in:
@@ -1,4 +1,26 @@
|
||||
"""
|
||||
cli.py
|
||||
|
||||
CLI entrypoint for fetching, processing and compiling quote data.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import List, Tuple
|
||||
|
||||
import click
|
||||
import enlighten
|
||||
import requests
|
||||
|
||||
# Point the first import-path entry at its parent directory so that the
# `server` package resolves when this file is run directly as a script.
# os.path.join/os.pardir replaces the hard-coded '\\..' suffix, which only
# forms a valid path on Windows (a backslash is not a separator elsewhere).
sys.path[0] = os.path.join(sys.path[0], os.pardir)
from server.process import get_episodes, get_filepath, sleep_from, verify_episode

# Root logging stays at INFO; this module's own logger is more verbose (DEBUG).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('cli')
logger.setLevel(logging.DEBUG)

# Module-level progress bar manager.
manager = enlighten.get_manager()
|
||||
|
||||
|
||||
@click.group()
|
||||
@@ -8,11 +30,72 @@ def cli():
|
||||
|
||||
|
||||
@cli.command('fetch')
@click.option('-s', '--season', type=int,
              help='Season to be fetched. Without --episode, will download all episodes in a season.')
@click.option('-e', '--episode', type=int, help='Specific episode to be fetched. Requires --season to be specified.')
@click.option('-d', '--delay', type=float, default=0.5, help='Delay between each request')
@click.option('--all', is_flag=True, help='Fetch all episodes, regardless of previous specifications.')
@click.option('-o', '--overwrite', is_flag=True, help='Overwrite if a file already exists.')
@click.option('-ss', '--silent-skip', is_flag=True, help='Skip existing files silently')
def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, silent_skip: bool):
    """
    Fetches data from officequotes.net, placing them in unmodified UTF-8 HTML files.
    """
    # NOTE: the docstring above is also the command's --help text, so implementation
    # notes are kept in comments. The `all` parameter shadows the builtin, but its name
    # is dictated by the --all option and is therefore left unchanged.
    episodes: List[Tuple[int, int]]

    # Resolve which (season, episode) pairs were requested.
    if all:
        episodes = list(get_episodes())
    elif season:
        if episode:
            if verify_episode(season, episode):
                episodes = [(season, episode)]
            else:
                logger.error(f'Season {season}, Episode {episode} is not a valid combination.')
                return
        else:
            episodes = list(get_episodes(season=season))
            # get_episodes yields nothing for a season outside its known range; report
            # that instead of silently "completing" a fetch of zero pages.
            if not episodes:
                logger.error(f'Season {season} is not a valid season.')
                return
            logger.info(f'Fetching Season {season}...')
    else:
        if episode:
            logger.info('You must specify more than just an episode.')
        else:
            logger.info('You must specify which episodes to fetch.')
        logger.info('Check --help for more information on this command.')
        return

    logger.debug(f'Ready to start fetching {len(episodes)} quote page{"s" if len(episodes) > 1 else ""}')
    # Pretend the previous request happened `delay` seconds ago so the first one is not delayed.
    last_request = time.time() - delay

    # The session is a context manager so its connections are released when fetching ends.
    # The local manager gets its own name to avoid shadowing the module-level `manager`.
    with requests.Session() as session, enlighten.Manager() as bar_manager:
        with bar_manager.counter(total=len(episodes), desc='Fetching...', unit='episodes') as pbar:
            for _season, _episode in episodes:
                filepath = get_filepath(_season, _episode, 'html')

                # Check if HTML file exists
                if not overwrite and os.path.exists(filepath):
                    if not silent_skip:
                        logger.debug(f'Skipping Season {_season}, Episode {_episode}: File already exists.')
                else:
                    logger.info(f'Fetching Season {_season}, Episode {_episode}...')

                    # Generate link, make request
                    link = f"http://officequotes.net/no{_season}-{str(_episode).zfill(2)}.php"

                    sleep_from(delay, last_request, bar_manager)  # Sleep at least :delay: seconds.

                    try:
                        resp = session.get(link)
                    except requests.RequestException as error:
                        # A network failure on one page should not abort the whole batch.
                        logger.error(f'Fetching failed. Request error: {error}')
                        resp = None
                    last_request = time.time()

                    if resp is not None:
                        if resp.ok:
                            # Write data to file
                            with open(filepath, 'w', encoding='utf-8') as file:
                                file.write(resp.text)
                            logger.debug('Successfully fetched.')
                        else:
                            logger.error(f'Fetching failed. Erroneous response code {resp.status_code}.')

                # Skipped, failed and fetched episodes all count towards progress.
                pbar.update()

    logger.info('Fetching complete.')
|
||||
|
||||
|
||||
@cli.command('process')
|
||||
|
||||
@@ -1,15 +1,93 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import traceback
|
||||
from collections import defaultdict
|
||||
from math import ceil
|
||||
from typing import Iterable, Tuple
|
||||
|
||||
import enlighten
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# HTTP sessions shared by this module. Both names are kept because get_raw
# refers to `s.get(...)` as well as `session.get(...)`.
s = requests.Session()
session = requests.Session()

# Directory containing this file, and the data directory beneath it.
# (A preceding assignment of DATA_DIR to BASE_DIR/server/data was immediately
# overwritten by this one and has been dropped as a dead store.)
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, 'data')

# Maps a data sub-folder name to the file extension used inside it.
folder_exts = {'html': 'html', 'processed': 'json', 'raw': 'txt'}
# Number of episodes per season; index 0 holds season 1.
episode_counts = [6, 22, 23, 14, 26, 24, 24, 24, 23]
|
||||
|
||||
|
||||
def get_filename(season: int, episode: int, extension: str) -> str:
    """Build the standardized '<season>-<episode>.<extension>' filename, zero-padding the episode to two digits."""
    padded_episode = str(episode).zfill(2)
    return '{}-{}.{}'.format(season, padded_episode, extension)
|
||||
|
||||
|
||||
def get_filepath(season: int, episode: int, folder: str) -> str:
    """
    Return the full path of an episode's data file.

    With a folder, the file lives in DATA_DIR/<folder> and takes that folder's
    extension from folder_exts (falling back to 'json' for unknown folders).
    Without one, it is a 'json' file directly inside DATA_DIR.
    """
    if not folder:
        return os.path.join(DATA_DIR, get_filename(season, episode, 'json'))

    extension = folder_exts.get(folder, 'json')
    filename = get_filename(season, episode, extension)
    return os.path.join(DATA_DIR, folder, filename)
|
||||
|
||||
|
||||
def load_file(filepath: str, parse_json: bool):
    """
    Shortcut function for loading file from filepath, with JSON parsing flag.

    :param filepath: Path of the file to read.
    :param parse_json: When true, the contents are parsed and returned as JSON data;
        otherwise the raw text is returned.
    :return: Parsed JSON data, or the file's text.
    """
    # Read explicitly as UTF-8: this project writes its HTML files with
    # encoding='utf-8', and the platform default encoding is not UTF-8 everywhere.
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file) if parse_json else file.read()
|
||||
|
||||
|
||||
def get_episodes(season: int = None) -> Iterable[Tuple[int, int]]:
    """
    Yields (season, episode) tuples.

    If a season is specified, only that season's episodes are yielded (nothing at all
    when the season is outside the known range). Otherwise every episode of every
    season is yielded, in order.
    """
    if season:
        if 1 <= season <= len(episode_counts):
            # range() excludes its stop value, so add 1 to include the season's last episode.
            for episode in range(1, episode_counts[season - 1] + 1):
                yield season, episode
    else:
        for season_number, ep_count in enumerate(episode_counts, start=1):
            for episode in range(1, ep_count + 1):
                yield season_number, episode
|
||||
|
||||
|
||||
def verify_episode(season: int, episode: int = None) -> bool:
    """
    Verifies that a Season or Season + Episode is valid.

    :param season: Season number, counted from 1.
    :param episode: Optional episode number, counted from 1. When omitted, only the season is checked.
    :return: True if the season (and the episode, when given) exists.
    """
    if not 1 <= season <= len(episode_counts):
        return False
    # episode_counts is 0-indexed while seasons start at 1, hence `season - 1`.
    return episode is None or 1 <= episode <= episode_counts[season - 1]
|
||||
|
||||
|
||||
def sleep_from(wait_time: float, moment: float, manager: 'enlighten.Manager' = None) -> float:
    """
    Sleeps for a specific amount of time, accordingly to a previous moment.

    :param wait_time: The minimum amount of time that must be waited since the specified moment.
    :param moment: Epoch time.
    :param manager: Progressbar Manager. When given, the wait is shown as a temporary progress bar.
    :return: The number of seconds slept (rounded to 2 decimals when a manager is used),
        or 0 if no meaningful wait (more than 0.01 seconds) was left.
    """
    remaining = wait_time - (time.time() - moment)
    if remaining <= 0.01:
        return 0

    if not manager:
        time.sleep(remaining)
        return remaining

    remaining = round(remaining, 2)
    # Roughly one step per hundredth of a second. The step length is derived from the
    # step count so that all steps together add up to exactly `remaining` seconds.
    steps = ceil(remaining * 100)
    step_length = remaining / steps
    bar = manager.counter(total=steps, desc='Sleeping...', leave=False)
    try:
        for _ in range(steps):
            time.sleep(step_length)
            bar.update()
    finally:
        # Always remove the temporary bar, even if the wait is interrupted.
        bar.close()
    return remaining
|
||||
|
||||
|
||||
def get_raw(season, episode):
|
||||
@@ -24,7 +102,7 @@ def get_raw(season, episode):
|
||||
# If not, write to disk for later usage
|
||||
else:
|
||||
link = f"http://officequotes.net/no{season}-{str(episode).zfill(2)}.php"
|
||||
resp = s.get(link)
|
||||
resp = session.get(link)
|
||||
if resp.ok:
|
||||
page_data = resp.text
|
||||
with open(html_filepath, 'w', encoding='utf-8') as file:
|
||||
|
||||
Reference in New Issue
Block a user