mirror of
https://github.com/Xevion/the-office.git
synced 2025-12-10 16:08:52 -06:00
197 lines
6.7 KiB
Python
import json
import os
import re
import traceback
from collections import defaultdict

import requests
from bs4 import BeautifulSoup

s = requests.Session()
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, 'server', 'data')
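
# Not in the original script: a small guard so first runs don't fail when the
# cache directories are missing (assumes the server/data layout used below).
for _subdir in ('html', 'raw', 'processed'):
    os.makedirs(os.path.join(DATA_DIR, _subdir), exist_ok=True)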


def get_raw(season, episode):
    """Fetch an episode transcript page (cached to disk) and dump its quote
    sections as plain text under DATA_DIR/raw."""
    html_filename = f'{season}-{str(episode).zfill(2)}.html'
    html_filepath = os.path.join(DATA_DIR, 'html', html_filename)

    # If the .html file exists, read it from disk
    if os.path.exists(html_filepath):
        # print('Reading from disk...')
        with open(html_filepath, 'r', encoding='utf-8') as file:
            page_data = file.read()
    # Otherwise, fetch it and cache it to disk for later use
    else:
        link = f"http://officequotes.net/no{season}-{str(episode).zfill(2)}.php"
        resp = s.get(link)
        if resp.ok:
            page_data = resp.text
            with open(html_filepath, 'w', encoding='utf-8') as file:
                file.write(page_data)
        else:
            raise Exception(f'HTTPError: {resp.status_code} at "{resp.url}"')

    soup = BeautifulSoup(page_data, "html.parser")

    data = []
    sections = soup.find_all(attrs={"class": "quote"})
    for section in sections:
        # Turn <br> tags into newlines so get_text() preserves line breaks
        for br in section.find_all('br'):
            br.replace_with("\n" + br.text)
        for line in section.get_text().split('\n'):
            data.append(line.strip())
        # Separate sections with a lone '-' line
        data.append('-')
    data.pop(-1)  # drop the trailing separator

    with open(os.path.join(DATA_DIR, 'raw', f'{season}-{str(episode).zfill(2)}.txt'), 'w',
              encoding='utf-8') as file:
        file.write('\n'.join(data))
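
# The annotated raw format that process() expects (reconstructed from its
# parsing logic; get_raw() itself emits plain text, so the 'Speaker|text'
# delimiters and '!N' deleted-scene markers appear to be added by hand):
#
#   Michael|Would I rather be feared or loved?
#   Michael|Easy. Both. I want people to be afraid of how much they love me.
#   -
#   !1
#   Dwight|Identity theft is not a joke, Jim!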


def episodes():
    """Yield every (season, episode) pair; one entry in ep_nums per season."""
    ep_nums = [6, 22, 23, 14, 26, 24, 24, 24, 23]
    for season_num, ep_count in enumerate(ep_nums, start=1):
        for episode_num in range(1, ep_count + 1):
            yield season_num, episode_num
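
# e.g. list(episodes()) yields (1, 1), (1, 2), ..., (1, 6), (2, 1), ...,
# (9, 23): one (season, episode) pair per entry in ep_nums above.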


def download_all_raw():
    for season_num, episode_num in episodes():
        print(f'{season_num}-{str(episode_num).zfill(2)}')
        try:
            get_raw(season_num, episode_num)
        except Exception as exception:
            print(f'Failed to process Season {season_num} Episode {episode_num} - ({type(exception).__name__})')
            traceback.print_exc()


def process(season, episode):
    """Parse an annotated raw transcript into structured per-episode JSON."""
    with open(os.path.join(DATA_DIR, 'raw', f'{season}-{str(episode).zfill(2)}.txt'), 'r',
              encoding='utf-8') as file:
        sections = []
        for raw_section in re.split('^-', file.read(), flags=re.MULTILINE):
            section = {
                'quotes': []
            }

            section_data = raw_section.strip().split('\n')
            # A leading '!N' line marks the section as deleted scene N
            if section_data[0].startswith('!'):
                section['deleted'] = int(re.search(r'!(\d+)', section_data.pop(0)).group(1))

            for q in section_data:
                quote = q.split('|', 1)
                # print(quote)  # debug
                section['quotes'].append(
                    {
                        'speaker': quote[0],
                        'text': quote[1]
                    }
                )
            sections.append(section)

    with open(os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json'), 'w',
              encoding='utf-8') as file:
        json.dump(sections, file, indent=4, ensure_ascii=False)

    # Summary stats: [number of deleted sections, set of unique deleted-scene ids]
    deleted_count = [0, set()]
    quote_count = 0
    speakers = set()

    for section in sections:
        quote_count += len(section['quotes'])

        if 'deleted' in section:
            deleted_count[0] += 1
            deleted_count[1].add(section['deleted'])

        for quote in section['quotes']:
            speakers.add(quote['speaker'])

    print(f'{quote_count} quotes.')
    print(f'{deleted_count[0]} different deleted sections, {len(deleted_count[1])} unique.')
    print(f'{len(speakers)} Speakers:')
    print(', '.join(speakers))
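
# Shape of the per-episode JSON written above: a list of sections, e.g.
#   [{"quotes": [{"speaker": "Michael", "text": "..."}]},
#    {"deleted": 1, "quotes": [...]}]
# ('deleted' appears only on sections marked with a '!N' line).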


def generate_algolia():
    """Flatten every quote into one record list for search indexing."""
    data = []
    quote_num = 0
    for season, episode in episodes():
        try:
            with open(os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json'), 'r',
                      encoding='utf-8') as file:
                episode_data = json.load(file)
        except FileNotFoundError:
            print(f'No JSON data for Season {season} Episode {episode}')
        else:
            for section_num, section in enumerate(episode_data, start=1):
                for quote in section['quotes']:
                    quote_num += 1
                    quote['quote'] = quote_num
                    quote['section'] = section_num
                    quote['episode'] = episode
                    quote['season'] = season

                    quote['is_deleted'] = 'deleted' in section
                    quote['deleted_section'] = section.get('deleted')

                    data.append(quote)

    with open(os.path.join(DATA_DIR, 'algolia.json'), 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
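
# The records above are shaped for an Algolia search index. A minimal upload
# sketch, assuming the official `algoliasearch` client (v2/v3 API) and
# placeholder credentials; the original script only writes the JSON file:
#
#   from algoliasearch.search_client import SearchClient
#
#   client = SearchClient.create('YOUR_APP_ID', 'YOUR_ADMIN_API_KEY')
#   index = client.init_index('office-quotes')
#   with open(os.path.join(DATA_DIR, 'algolia.json'), encoding='utf-8') as f:
#       index.save_objects(json.load(f), {'autoGenerateObjectIDIfNotExist': True})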


def get_episode_scenes(season, episode):
    filepath = os.path.join(DATA_DIR, 'processed', f'{season}-{str(episode).zfill(2)}.json')
    if os.path.exists(filepath):
        with open(filepath, 'r', encoding='utf-8') as file:
            return json.load(file)
    else:
        return None


def get_characters(season, episode):
    """Count quote lines per speaker, most frequent first."""
    scenes = get_episode_scenes(season, episode)
    if scenes is None:
        return None

    characters = defaultdict(int)
    for scene in scenes:
        for quote in scene['quotes']:
            characters[quote['speaker']] += 1
    characters = [{'name': character, 'appearances': appearances, 'id': '-'.join(character.split(' ')).lower()}
                  for character, appearances in characters.items()]
    return sorted(characters, key=lambda item: item['appearances'], reverse=True)
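
# Example return shape (appearance counts illustrative, not real data):
#   [{'name': 'Michael Scott', 'appearances': 42, 'id': 'michael-scott'},
#    {'name': 'Dwight', 'appearances': 17, 'id': 'dwight'}]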


def generate_final():
    """Merge episode descriptions/titles and quotes into final JSON file."""
    with open(os.path.join(DATA_DIR, 'descriptions.json'), 'r', encoding='utf-8') as file:
        data = json.load(file)

    output = []
    for season_id, season in enumerate(data, start=1):
        output.append({
            'season_id': season_id,
            'episodes': [
                {
                    'title': episode['title'].strip(),
                    'description': episode['description'].strip(),
                    'episode_id': episode_id,
                    'characters': get_characters(season_id, episode_id),
                    'scenes': get_episode_scenes(season_id, episode_id)
                }
                for episode_id, episode in enumerate(season, start=1)
            ]
        })

    with open(os.path.join(DATA_DIR, 'data.json'), 'w', encoding='utf-8') as file:
        json.dump(output, file, ensure_ascii=False, indent=4)
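
# Resulting data.json layout, one object per season:
#   [{"season_id": 1,
#     "episodes": [{"title": "...", "description": "...", "episode_id": 1,
#                   "characters": [...], "scenes": [...]}, ...]}, ...]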


if __name__ == '__main__':
    # generate_algolia()
    # process(3, 10)
    generate_final()
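
# Full pipeline sketch (hypothetical driver; the original runs individual
# steps by hand via calls like those above):
#
#   download_all_raw()                  # fetch + cache HTML, emit raw text
#   # ...hand-annotate the raw files with '|' and '!N' markers...
#   for season, episode in episodes():
#       process(season, episode)        # annotated raw -> per-episode JSON
#   generate_algolia()                  # flatten quotes into algolia.json
#   generate_final()                    # merge descriptions into data.json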