From e69afb7bf2c6aa83e92176f981266fde6cd2c610 Mon Sep 17 00:00:00 2001
From: Xevion <xevioni@yandex.com>
Date: Wed, 11 May 2022 02:25:44 -0500
Subject: [PATCH] Last stage of quote building: building app JSON datafiles

---
 server/normalization/main.py | 110 +++++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)

diff --git a/server/normalization/main.py b/server/normalization/main.py
index 6518322..e29f479 100644
--- a/server/normalization/main.py
+++ b/server/normalization/main.py
@@ -34,11 +34,18 @@ def cli():
     pass
 
 
+@cli.group()  # Registers `build` as a sub-group of `cli`; the `app` command attaches to it via @build.command.
+def build() -> None:
+    """The last stage of data processing, building the JSON files used by the application & Algolia indexing."""
+
+
 class Constants:
     SPEAKER_MAPPING_XML = 'speaker_mapping.xml'
     IDENTIFIERS_XML = 'identifiers.xml'
     CHARACTERS_XML = 'characters.xml'
     META_JSON = 'meta.json'
+    EPISODE_DESCRIPTION_JSON = 'episode_descriptions.json'  # Episode titles/descriptions; read by `build app` as [season - 1][episode - 1].
+    CHARACTER_DESCRIPTION_JSON = 'character_descriptions.json'  # Maps character ID -> entry holding at least a 'name'; read by `build app`.
 
 
 class ConstantPaths:
@@ -46,6 +53,8 @@ class ConstantPaths:
     IDENTIFIERS = os.path.join(CHARACTERS_DIR, Constants.IDENTIFIERS_XML)
     CHARACTERS = os.path.join(TRUTH_DIR, Constants.CHARACTERS_XML)
     META = os.path.join(TRUTH_DIR, Constants.META_JSON)
+    EP_DESC = os.path.join(CUR_DIR, Constants.EPISODE_DESCRIPTION_JSON)
+    CHAR_DESC = os.path.join(CUR_DIR, Constants.CHARACTER_DESCRIPTION_JSON)
 
 
 @cli.command('truth')
@@ -528,5 +537,106 @@ def check(verbose: bool) -> None:
     # TODO: Check for character IDs in identifiers.xml that don't look correct (voice--on-phone)
 
 
+@build.command('app')
+@click.option('--path', type=str, default=BUILD_DIR, help='The output path for the application data files.')
+@click.option('--make-dir', is_flag=True, help='Create the output directory if it does not exist.')
+def app(path: str, make_dir: bool) -> None:
+    """Build the data files used by the application."""
+    # Reads both description JSON files and every compiled episode XML, then writes <path>/<season>/<episode>.json.
+    logger.debug('Build process called for "app".')
+    logger.debug(f'Output Directory: "{os.path.relpath(path, os.getcwd())}"')
+
+    with open(ConstantPaths.EP_DESC, 'r', encoding='utf-8') as episode_desc_file:
+        episode_desc = json.load(episode_desc_file)
+
+    with open(ConstantPaths.CHAR_DESC, 'r', encoding='utf-8') as character_desc_file:
+        character_desc = json.load(character_desc_file)
+
+    # Validate the output directory up front and abort (click.BadOptionUsage) instead of logging and carrying on.
+    if not os.path.exists(path):
+        if path != BUILD_DIR and not make_dir:
+            logger.error('The output directory given does not exist.')
+            raise click.BadOptionUsage("path", "Path supplied does not exist.")
+        os.makedirs(path)  # Create the directory that was asked for, which is not necessarily BUILD_DIR.
+        logger.debug('Build directory did not exist; it has been created.')
+    elif not os.path.isdir(path):
+        logger.error("The output directory given is not a directory.")
+        raise click.BadOptionUsage("path", "Path supplied is not a directory.")
+
+    # Only '<season>-<episode>.xml' entries are episodes; anything else found in COMPILE_DIR is ignored.
+    episode_pattern = re.compile(r'(\d+)-(\d+)\.xml')
+    episode_files = [name for name in os.listdir(COMPILE_DIR) if episode_pattern.fullmatch(name)]
+    logger.debug(f'Beginning processing of {len(episode_files)} compiled episode directories.')
+
+    progress = Progress(SpinnerColumn('dots10'), *Progress.get_default_columns(), MofNCompleteColumn(), TimeElapsedColumn())
+
+    all_season_data: List[List[dict]] = [[] for _ in episode_desc]
+
+    no_char_data = OrderedDict()  # Character IDs that have no entry in character_desc, in first-seen order.
+
+    with progress:
+        for episodeFile in progress.track(episode_files, description='Building Episodes', update_period=0.01):
+            with open(os.path.join(COMPILE_DIR, episodeFile), 'r') as ep_file:
+                episode_root: etree.ElementBase = etree.parse(ep_file)
+
+            seasonNum, episodeNum = map(int, episode_pattern.fullmatch(episodeFile).groups())
+            description = episode_desc[seasonNum - 1][episodeNum - 1]
+
+            # Count character appearances
+            characters = Counter()
+            all_characters = episode_root.xpath('./Scene/Quote/Speaker/Characters/Character')
+            for character in all_characters:
+                character_type = character.attrib['type']
+                if character_type in ['main', 'recurring']:
+                    characters[character.text] += 1
+
+            episode_characters: Dict[str, Dict[str, Union[str, int]]] = {}
+            for character_id, count in sorted(characters.items(), key=lambda item: item[1], reverse=True):
+                if character_id in character_desc:
+                    character_name = character_desc[character_id]['name']
+                else:
+                    print(f'No character description: {character_id}')
+                    character_name = f'\"{character_id.capitalize()}\"'
+                    no_char_data[character_id] = None
+
+                episode_characters[character_id] = {
+                    'name': character_name,
+                    'appearances': count
+                }
+
+            scenes = [
+                {
+                    'quotes': [
+                        {
+                            'speaker': quote.xpath('./Speaker/SpeakerText')[0].text,
+                            'text': quote.find('QuoteText').text
+                        }
+                        for quote in scene.xpath('./Quote')]
+                } for scene in episode_root.xpath('./Scene')
+            ]
+
+            all_season_data[seasonNum - 1].append({
+                'title': description['title'],
+                'description': description['description'],
+                'characters': episode_characters,
+                'season_number': seasonNum,
+                'episode_number': episodeNum,
+                "scenes": scenes
+            })
+
+    # Flatten for saving; every episode carries its own numbers, so directory listing order cannot mislabel files.
+    season_episode_data: List[dict] = [episode for season_data in all_season_data for episode in season_data]
+
+    with progress:
+        for episode_data in progress.track(season_episode_data, description='Saving episode data...', update_period=0.1):
+            season_directory = os.path.join(path, f"{episode_data['season_number']:02}")
+            os.makedirs(season_directory, exist_ok=True)
+
+            episode_path = os.path.join(season_directory, f"{episode_data['episode_number']:02}.json")
+
+            with open(episode_path, 'w') as episode_file:
+                json.dump(episode_data, episode_file)
+
+
 if __name__ == '__main__':
     cli()