mirror of
https://github.com/Xevion/the-office.git
synced 2025-12-11 04:08:48 -06:00
Add complete identifier processing command
This commit is contained in:
@@ -5,7 +5,7 @@ import sys
|
|||||||
import enlighten
|
import enlighten
|
||||||
from collections import Counter, OrderedDict
|
from collections import Counter, OrderedDict
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
from typing import List, Optional
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
import click
|
import click
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
@@ -19,6 +19,7 @@ logger.setLevel(logging.DEBUG)
|
|||||||
|
|
||||||
CUR_DIR = os.path.dirname(os.path.abspath(__file__))
|
CUR_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
TRUTH_DIR = os.path.join(CUR_DIR, 'truth')
|
TRUTH_DIR = os.path.join(CUR_DIR, 'truth')
|
||||||
|
CHARACTERS_DIR = os.path.join(CUR_DIR, 'characters')
|
||||||
RAW_DIR = os.path.abspath(os.path.join(CUR_DIR, '..', 'data', 'raw'))
|
RAW_DIR = os.path.abspath(os.path.join(CUR_DIR, '..', 'data', 'raw'))
|
||||||
RAW_FILES = os.listdir(RAW_DIR)
|
RAW_FILES = os.listdir(RAW_DIR)
|
||||||
|
|
||||||
@@ -158,14 +159,90 @@ def ids():
|
|||||||
"""Builds an XML file for identifying character id mappings"""
|
"""Builds an XML file for identifying character id mappings"""
|
||||||
|
|
||||||
logger.info("Building ID Character mapping file...")
|
logger.info("Building ID Character mapping file...")
|
||||||
with open(os.path.join(TRUTH_DIR, Constants.SPEAKER_MAPPING_XML), 'r') as mapping_file:
|
IDENTIFIER_FILE: str = os.path.join(CHARACTERS_DIR, 'identifiers.xml')
|
||||||
root: etree.ElementBase = etree.parse(mapping_file)
|
|
||||||
|
|
||||||
root = etree.Element("IdentifierList")
|
with open(os.path.join(TRUTH_DIR, 'characters.xml'), 'r') as characters_file:
|
||||||
# mappings =
|
characters: List[str] = etree.parse(characters_file).xpath('//CharacterList/Character/text()')
|
||||||
# for speaker in speakers:
|
logger.debug('Characters parsed.')
|
||||||
# if speaker
|
|
||||||
|
|
||||||
|
logger.debug(f'{len(characters)} characters parsed.')
|
||||||
|
|
||||||
|
if not os.path.exists(CHARACTERS_DIR):
|
||||||
|
os.makedirs(CHARACTERS_DIR)
|
||||||
|
logger.info('`characters` directory created.')
|
||||||
|
|
||||||
|
pre_existing: Dict[str, etree.Element] = None
|
||||||
|
if os.path.exists(IDENTIFIER_FILE):
|
||||||
|
logger.debug('Identifier file exists already. Pre-existing Speakers will be kept.')
|
||||||
|
|
||||||
|
with open(IDENTIFIER_FILE, 'r') as identifier_file:
|
||||||
|
preidentifiers: etree.ElementBase = etree.parse(identifier_file)
|
||||||
|
|
||||||
|
pre_existing = OrderedDict()
|
||||||
|
for speaker in preidentifiers.xpath('//SpeakerList/Speaker'):
|
||||||
|
speakerName = speaker.xpath('./RawText/text()')[0]
|
||||||
|
pre_existing[speakerName] = speaker
|
||||||
|
|
||||||
|
root = etree.Element('SpeakerList')
|
||||||
|
splitPatterns: List[str] = [r'\s*,\s*',
|
||||||
|
r'\s*&\s*',
|
||||||
|
r'\s+and,?(?:\s+|$)',
|
||||||
|
r'\s*[\\/]\s*']
|
||||||
|
splitPattern: str = '|'.join(splitPatterns)
|
||||||
|
|
||||||
|
existing_characters_count: int = 0
|
||||||
|
new_characters_count: int = 0
|
||||||
|
|
||||||
|
# Pre-existing character identifiers are kept at the top, in order.
|
||||||
|
for speakerName in characters:
|
||||||
|
if pre_existing is not None:
|
||||||
|
if speakerName in pre_existing.keys():
|
||||||
|
root.append(pre_existing[speakerName])
|
||||||
|
del pre_existing[speakerName]
|
||||||
|
existing_characters_count += 1
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
logger.debug(f'New speaker: `{speakerName}`')
|
||||||
|
new_characters_count += 1
|
||||||
|
|
||||||
|
# New speaker to insert
|
||||||
|
speaker_element = etree.SubElement(root, 'Speaker', annotated="false")
|
||||||
|
raw_text_element = etree.SubElement(speaker_element, "RawText")
|
||||||
|
raw_text_element.text = speakerName
|
||||||
|
|
||||||
|
split_text: List[str] = re.split(splitPattern, speakerName)
|
||||||
|
split_text = [split for split in split_text if re.match(r'\w{2,}', split) is not None]
|
||||||
|
|
||||||
|
isCompound: bool = len(split_text) > 1
|
||||||
|
isBackground: bool = re.search(r'#\d', speakerName) is not None # Not fool-proof, but filters some out.
|
||||||
|
|
||||||
|
if isCompound:
|
||||||
|
speaker_element.attrib['annotated'] = "true"
|
||||||
|
annotated_text_element = etree.SubElement(speaker_element, 'AnnotatedText')
|
||||||
|
characters_element = etree.SubElement(speaker_element, 'Characters')
|
||||||
|
annotated_text_element.text = speakerName
|
||||||
|
for sub_character in split_text:
|
||||||
|
subcharacter_element = etree.SubElement(characters_element, 'Character')
|
||||||
|
subcharacter_element.text = valuify(sub_character)
|
||||||
|
subcharacter_element.attrib['type'] = 'null'
|
||||||
|
else:
|
||||||
|
character_element = etree.SubElement(speaker_element, 'Character')
|
||||||
|
character_element.attrib['type'] = 'background' if isBackground else 'null'
|
||||||
|
character_element.text = valuify(speakerName)
|
||||||
|
|
||||||
|
logger.debug(f'{new_characters_count} new speaker elements added. {existing_characters_count} speaker elements preserved.')
|
||||||
|
|
||||||
|
if pre_existing is not None:
|
||||||
|
unseen_chars = list(pre_existing.keys())
|
||||||
|
if len(unseen_chars) > 0:
|
||||||
|
for unseen in unseen_chars:
|
||||||
|
root.append(pre_existing[unseen])
|
||||||
|
logger.debug(f'Character preserved but not seen: {unseen}')
|
||||||
|
|
||||||
|
logger.debug('Exporting identifiers file.')
|
||||||
|
with open(IDENTIFIER_FILE, 'w') as identifier_file:
|
||||||
|
etree.indent(root, space=" " * 4)
|
||||||
|
identifier_file.write(etree.tostring(root, encoding=str, pretty_print=True))
|
||||||
|
|
||||||
|
|
||||||
@cli.command('all')
|
@cli.command('all')
|
||||||
@@ -191,7 +268,8 @@ def similar(text: str, destination: Optional[bool], results: int, reversed: bool
|
|||||||
if destination:
|
if destination:
|
||||||
mappingType = "Destination"
|
mappingType = "Destination"
|
||||||
|
|
||||||
counts: List[int] | List[str] = list(map(int, root.xpath('//SpeakerMappings/Mapping/@count'))) # Parse counts into integers for merge
|
counts: Union[List[int], List[str]] = list(
|
||||||
|
map(int, root.xpath('//SpeakerMappings/Mapping/@count'))) # Parse counts into integers for merge
|
||||||
speakers = root.xpath(f"//SpeakerMappings/Mapping/{mappingType}/text()")
|
speakers = root.xpath(f"//SpeakerMappings/Mapping/{mappingType}/text()")
|
||||||
if not no_merge: speakers, counts = marked_item_merge(speakers, counts) # Merge identical speakers together
|
if not no_merge: speakers, counts = marked_item_merge(speakers, counts) # Merge identical speakers together
|
||||||
if results == -1:
|
if results == -1:
|
||||||
|
|||||||
Reference in New Issue
Block a user