Add complete identifier processing command

2026-01-31 04:26:17 -06:00 · 2022-05-08 22:48:31 -05:00
parent 1ac8347e78
commit da00c51151
1 changed file with 86 additions and 8 deletions
@@ -5,7 +5,7 @@ import sys
 import enlighten
 from collections import Counter, OrderedDict
 from pprint import pprint
-from typing import List, Optional
+from typing import List, Optional, Union
 import click
 from lxml import etree
@@ -19,6 +19,7 @@ logger.setLevel(logging.DEBUG)
 CUR_DIR = os.path.dirname(os.path.abspath(__file__))
 TRUTH_DIR = os.path.join(CUR_DIR, 'truth')
 CHARACTERS_DIR = os.path.join(CUR_DIR, 'characters')
 RAW_DIR = os.path.abspath(os.path.join(CUR_DIR, '..', 'data', 'raw'))
 RAW_FILES = os.listdir(RAW_DIR)
@@ -158,14 +159,90 @@ def ids():
    """Builds an XML file for identifying character id mappings"""
    logger.info("Building ID Character mapping file...")
-    with open(os.path.join(TRUTH_DIR, Constants.SPEAKER_MAPPING_XML), 'r') as mapping_file:
+    IDENTIFIER_FILE: str = os.path.join(CHARACTERS_DIR, 'identifiers.xml')
        root: etree.ElementBase = etree.parse(mapping_file)
-    root = etree.Element("IdentifierList")
+    with open(os.path.join(TRUTH_DIR, 'characters.xml'), 'r') as characters_file:
-    # mappings =
+        characters: List[str] = etree.parse(characters_file).xpath('//CharacterList/Character/text()')
-    # for speaker in speakers:
+        logger.debug('Characters parsed.')
    #     if speaker
    logger.debug(f'{len(characters)} characters parsed.')
    if not os.path.exists(CHARACTERS_DIR):
        os.makedirs(CHARACTERS_DIR)
        logger.info('`characters` directory created.')
    pre_existing: Dict[str, etree.Element] = None
    if os.path.exists(IDENTIFIER_FILE):
        logger.debug('Identifier file exists already. Pre-existing Speakers will be kept.')
        with open(IDENTIFIER_FILE, 'r') as identifier_file:
            preidentifiers: etree.ElementBase = etree.parse(identifier_file)
        pre_existing = OrderedDict()
        for speaker in preidentifiers.xpath('//SpeakerList/Speaker'):
            speakerName = speaker.xpath('./RawText/text()')[0]
            pre_existing[speakerName] = speaker
    root = etree.Element('SpeakerList')
    splitPatterns: List[str] = [r'\s*,\s*',
                                r'\s*&\s*',
                                r'\s+and,?(?:\s+|$)',
                                r'\s*[\\/]\s*']
    splitPattern: str = '|'.join(splitPatterns)
    existing_characters_count: int = 0
    new_characters_count: int = 0
    # Speakers are emitted in `characters` order; a pre-existing Speaker element is reused as-is where its name appears, and any left unseen are appended after this loop.
    for speakerName in characters:
        if pre_existing is not None:
            if speakerName in pre_existing.keys():
                root.append(pre_existing[speakerName])
                del pre_existing[speakerName]
                existing_characters_count += 1
                continue
            else:
                logger.debug(f'New speaker: `{speakerName}`')
                new_characters_count += 1
        # New speaker to insert
        speaker_element = etree.SubElement(root, 'Speaker', annotated="false")
        raw_text_element = etree.SubElement(speaker_element, "RawText")
        raw_text_element.text = speakerName
        split_text: List[str] = re.split(splitPattern, speakerName)
        split_text = [split for split in split_text if re.match(r'\w{2,}', split) is not None]
        isCompound: bool = len(split_text) > 1
        isBackground: bool = re.search(r'#\d', speakerName) is not None  # Not fool-proof, but filters some out.
        if isCompound:
            speaker_element.attrib['annotated'] = "true"
            annotated_text_element = etree.SubElement(speaker_element, 'AnnotatedText')
            characters_element = etree.SubElement(speaker_element, 'Characters')
            annotated_text_element.text = speakerName
            for sub_character in split_text:
                subcharacter_element = etree.SubElement(characters_element, 'Character')
                subcharacter_element.text = valuify(sub_character)
                subcharacter_element.attrib['type'] = 'null'
        else:
            character_element = etree.SubElement(speaker_element, 'Character')
            character_element.attrib['type'] = 'background' if isBackground else 'null'
            character_element.text = valuify(speakerName)
    logger.debug(f'{new_characters_count} new speaker elements added. {existing_characters_count} speaker elements preserved.')
    if pre_existing is not None:
        unseen_chars = list(pre_existing.keys())
        if len(unseen_chars) > 0:
            for unseen in unseen_chars:
                root.append(pre_existing[unseen])
                logger.debug(f'Character preserved but not seen: {unseen}')
    logger.debug('Exporting identifiers file.')
    with open(IDENTIFIER_FILE, 'w') as identifier_file:
        etree.indent(root, space=" " * 4)
        identifier_file.write(etree.tostring(root, encoding=str, pretty_print=True))
@cli.command('all')
@@ -191,7 +268,8 @@ def similar(text: str, destination: Optional[bool], results: int, reversed: bool
    if destination:
        mappingType = "Destination"
-    counts: List[int] | List[str] = list(map(int, root.xpath('//SpeakerMappings/Mapping/@count')))  # Parse counts into integers for merge
+    counts: Union[List[int], List[str]] = list(
            map(int, root.xpath('//SpeakerMappings/Mapping/@count')))  # Parse counts into integers for merge
    speakers = root.xpath(f"//SpeakerMappings/Mapping/{mappingType}/text()")
    if not no_merge: speakers, counts = marked_item_merge(speakers, counts)  # Merge identical speakers together
    if results == -1: