From 407b456dae6ddfd6413ea6ae47b07842e2be1c66 Mon Sep 17 00:00:00 2001 From: Xevion Date: Sun, 8 May 2022 23:52:00 -0500 Subject: [PATCH] Add normalization.meta command, add command help to truth --- server/normalization/main.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/server/normalization/main.py b/server/normalization/main.py index 5e140c1..19e3b53 100644 --- a/server/normalization/main.py +++ b/server/normalization/main.py @@ -1,3 +1,4 @@ +import json import logging import os import re @@ -35,6 +36,7 @@ class Constants: @cli.command('truth') def truth(): + """Step 1: Builds raw files into truth files.""" logger.info("Processing all raw files into normalized truth files.") speakers = Counter() @@ -103,7 +105,7 @@ def truth(): @cli.command('merge') def merge(): - """Merge all Speaker Mappings from source into one file.""" + """Step 2: Merge all Speaker Mappings from source into one file.""" speakerList = Counter() truth_files: List[str] = os.listdir(os.path.join(TRUTH_DIR, 'episodes')) @@ -172,7 +174,7 @@ def valuify(value: str) -> str: @cli.command('ids') def ids(): - """Builds an XML file for identifying character id mappings""" + """Step 3: Builds an XML file for identifying character id mappings""" logger.info("Building ID Character mapping file...") IDENTIFIER_FILE: str = os.path.join(CHARACTERS_DIR, 'identifiers.xml') @@ -261,12 +263,41 @@ def ids(): identifier_file.write(etree.tostring(root, encoding=str, pretty_print=True)) +@cli.command('meta') +def meta() -> None: + logger.debug('Creating meta.json') + + IDENTIFIERS_FILE = os.path.join(CHARACTERS_DIR, 'identifiers.xml') + with open(IDENTIFIERS_FILE, 'r') as identifiers_file: + speakers: List[str] = etree.parse(identifiers_file).xpath('//SpeakerList/Speaker') + logger.debug(f'{len(speakers)} speakers parsed.') + + meta_data = OrderedDict() + + for speaker in speakers: + characters = speaker.xpath('./Characters/Character') or speaker.xpath('./Character') + for character in characters: + name = character.text + type = character.attrib['type'] + if type == 'null': + type = None + + if type is not None or name not in meta_data.keys(): + meta_data[name] = type + + logger.debug(f'Writing {len(meta_data.keys())} meta values to disk.') + with open(os.path.join(TRUTH_DIR, 'meta.json'), 'w') as meta_file: + json.dump(meta_data, meta_file, indent=4) + logger.debug('Meta file written.') + + @cli.command('all') def all(): """Runs all commands in order one after another.""" truth() merge() ids() + meta() @cli.command('similar')