Improve use of constants in normalization script

This commit is contained in:
Xevion
2022-05-07 00:15:15 -05:00
parent 4009e6c8a7
commit 0be969e5bf

View File

@@ -21,7 +21,9 @@ logger.setLevel(logging.DEBUG)
CUR_DIR = os.path.dirname(os.path.abspath(__file__))
TRUTH_DIR = os.path.join(CUR_DIR, 'truth')
CHARACTERS_DIR = os.path.join(CUR_DIR, 'characters')
EPISODES_DIR = os.path.join(TRUTH_DIR, 'episodes')
RAW_DIR = os.path.abspath(os.path.join(CUR_DIR, '..', 'data', 'raw'))
RAW_FILES = os.listdir(RAW_DIR)
@@ -32,7 +34,15 @@ def cli():
class Constants:
SPEAKER_MAPPING_XML = 'speaker_mapping.xml'
IDENTIFIERS_XML = 'identifiers.xml'
CHARACTERS_XML = 'characters.xml'
META_JSON = 'meta.json'
class ConstantPaths:
SPEAKER_MAPPING = os.path.join(TRUTH_DIR, Constants.SPEAKER_MAPPING_XML)
IDENTIFIERS = os.path.join(CHARACTERS_DIR, Constants.CHARACTERS_XML)
CHARACTERS = os.path.join(TRUTH_DIR, Constants.CHARACTERS_XML)
META = os.path.join(TRUTH_DIR, Constants.META_JSON)
@cli.command('truth')
def truth():
@@ -79,13 +89,13 @@ def truth():
if quote:
logger.info(f'Last quote seen "{quote if type(quote) is str else "|".join(quote)}" in section {section_num}')
else:
truth_path = os.path.join(TRUTH_DIR, 'episodes', truth_filename)
truth_path = os.path.join(EPISODES_DIR, truth_filename)
with open(truth_path, 'w') as truth_file:
etree.indent(root, space=" " * 4)
truth_file.write(etree.tostring(root, encoding=str, pretty_print=True))
logger.debug(f"{len(speakers)} unique speakers identified.")
speaker_mapping_path = os.path.join(TRUTH_DIR, Constants.SPEAKER_MAPPING_XML)
if not os.path.exists(speaker_mapping_path):
root = etree.Element("SpeakerMappings")
@@ -96,7 +106,7 @@ def truth():
destinationElement = etree.SubElement(rootSpeakerElement, "Destination")
destinationElement.text = speaker
with open(speaker_mapping_path, 'w', encoding='utf-8') as speaker_file:
with open(ConstantPaths.SPEAKER_MAPPING, 'w', encoding='utf-8') as speaker_file:
etree.indent(root, space=" " * 4)
speaker_file.write(etree.tostring(root, encoding=str, pretty_print=True))
else:
@@ -108,12 +118,12 @@ def merge():
"""Step 2: Merge all Speaker Mappings from source into one file."""
speakerList = Counter()
truth_files: List[str] = os.listdir(os.path.join(TRUTH_DIR, 'episodes'))
truth_files: List[str] = os.listdir(EPISODES_DIR)
logger.debug(f"{len(truth_files)} truth files available.")
pbar = enlighten.Counter(total=len(truth_files), unit='Files')
for truth_filename in truth_files:
truth_path = os.path.join(TRUTH_DIR, 'episodes', truth_filename)
truth_path = os.path.join(EPISODES_DIR, truth_filename)
with open(truth_path, 'r') as truth_file:
root = etree.parse(truth_file)
for speaker in root.xpath('//SceneList/Scene/Quote/Speaker/text()'):
@@ -123,7 +133,7 @@ def merge():
logger.debug('Speakers acquired from Truth files.')
speakerMapping = OrderedDict()
with open(os.path.join(TRUTH_DIR, Constants.SPEAKER_MAPPING_XML), 'r') as speaker_mapping_file:
with open(ConstantPaths.SPEAKER_MAPPING, 'r') as speaker_mapping_file:
rootMappingElement: etree.ElementBase = etree.parse(speaker_mapping_file)
for mappingElement in rootMappingElement.xpath('//SpeakerMappings/Mapping'):
source, destination = mappingElement.xpath('.//Source/text()')[0], mappingElement.xpath('.//Destination/text()')[0]
@@ -151,7 +161,7 @@ def merge():
logger.debug("Speaker mappings merged. Exporting to `characters.xml`")
with open(os.path.join(TRUTH_DIR, 'characters.xml'), 'w') as character_file:
with open(ConstantPaths.CHARACTERS, 'w') as character_file:
etree.indent(root, space=" " * 4)
character_file.write(etree.tostring(root, encoding=str, pretty_print=True))
@@ -177,9 +187,8 @@ def ids():
"""Step 3: Builds an XML file for identifying character id mappings"""
logger.info("Building ID Character mapping file...")
IDENTIFIER_FILE: str = os.path.join(CHARACTERS_DIR, 'identifiers.xml')
with open(os.path.join(TRUTH_DIR, 'characters.xml'), 'r') as characters_file:
with open(ConstantPaths.CHARACTERS, 'r') as characters_file:
characters: List[str] = etree.parse(characters_file).xpath('//CharacterList/Character/text()')
logger.debug('Characters parsed.')
@@ -190,10 +199,10 @@ def ids():
logger.info('`characters` directory created.')
pre_existing: Dict[str, etree.Element] = None
if os.path.exists(IDENTIFIER_FILE):
if os.path.exists(ConstantPaths.IDENTIFIERS):
logger.debug('Identifier file exists already. Pre-existing Speakers will be kept.')
with open(IDENTIFIER_FILE, 'r') as identifier_file:
with open(ConstantPaths.IDENTIFIERS, 'r') as identifier_file:
preidentifiers: etree.ElementBase = etree.parse(identifier_file)
pre_existing = OrderedDict()
@@ -258,7 +267,7 @@ def ids():
logger.debug(f'Character preserved but not seen: {unseen}')
logger.debug('Exporting identifiers file.')
with open(IDENTIFIER_FILE, 'w') as identifier_file:
with open(ConstantPaths.IDENTIFIERS, 'w') as identifier_file:
etree.indent(root, space=" " * 4)
identifier_file.write(etree.tostring(root, encoding=str, pretty_print=True))
@@ -267,16 +276,13 @@ def ids():
def meta() -> None:
logger.debug('Creating meta.json')
IDENTIFIERS_FILE = os.path.join(CHARACTERS_DIR, 'identifiers.xml')
META_FILE = os.path.join(TRUTH_DIR, 'meta.json')
with open(IDENTIFIERS_FILE, 'r') as identifiers_file:
with open(ConstantPaths.IDENTIFIERS, 'r') as identifiers_file:
speakers: List[str] = etree.parse(identifiers_file).xpath('//SpeakerList/Speaker')
logger.debug(f'{len(speakers)} speakers parsed.')
meta_data: OrderedDict[str, Optional[str]]
if os.path.exists(META_FILE):
with open(META_FILE, 'r') as meta_file:
if os.path.exists(ConstantPaths.META):
with open(ConstantPaths.META, 'r') as meta_file:
meta_data = OrderedDict(json.load(meta_file))
else:
meta_data = OrderedDict()
@@ -293,7 +299,7 @@ def meta() -> None:
meta_data[name] = meta_type
logger.debug(f'Writing {len(meta_data.keys())} meta values to disk.')
with open(META_FILE, 'w') as meta_file:
with open(ConstantPaths.META, 'w') as meta_file:
json.dump(meta_data, meta_file, indent=4)
logger.debug('Meta file written.')
@@ -315,7 +321,7 @@ def all():
@click.option('-r', '--reversed', is_flag=True, help='Reverse the results direction to help readability in the console.')
def similar(text: str, destination: Optional[bool], results: int, reversed: bool, no_merge: bool) -> None:
"""Locates the most similar character name in speaker mappings. Searches <Source> by default."""
with open(os.path.join(TRUTH_DIR, Constants.SPEAKER_MAPPING_XML), 'r') as mapping_file:
with open(ConstantPaths.SPEAKER_MAPPING, 'r') as mapping_file:
root: etree.ElementBase = etree.parse(mapping_file)
mappingType: str = "Source"