mirror of
https://github.com/Xevion/the-office.git
synced 2025-12-07 13:16:49 -06:00
Create CLI-based normalization system based on XML
This commit is contained in:
200
server/normalization/main.py
Normal file
200
server/normalization/main.py
Normal file
@@ -0,0 +1,200 @@
|
|||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import enlighten
|
||||||
|
from collections import Counter, OrderedDict
|
||||||
|
from pprint import pprint
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import click
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
# Extend the first import path entry two directories up so that the
# `server.*` imports below resolve. The separator comes from os.path.join
# instead of hard-coded backslashes, which only formed a valid path on Windows.
sys.path[0] = os.path.join(sys.path[0], '..', '..')
|
||||||
|
from server.helpers import clean_string, get_close_matches_indexes, marked_item_merge
|
||||||
|
|
||||||
|
# Root logging stays at INFO; this module's own logger is raised to DEBUG.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('normalization.main')
logger.setLevel(logging.DEBUG)

# Directory containing this script; every other path is derived from it.
CUR_DIR = os.path.dirname(os.path.abspath(__file__))
# Output location for the generated XML files.
TRUTH_DIR = os.path.join(CUR_DIR, 'truth')
# Input location: <parent of this directory>/data/raw.
RAW_DIR = os.path.abspath(os.path.join(CUR_DIR, os.pardir, 'data', 'raw'))
# Listed once at import time, so RAW_DIR must exist whenever this module loads.
RAW_FILES = os.listdir(RAW_DIR)
|
||||||
|
|
||||||
|
|
||||||
|
@click.group()
def cli():
    # Root command group. The sub-commands in this file register themselves
    # on it through @cli.command(...); the group itself performs no work.
    ...
|
||||||
|
|
||||||
|
|
||||||
|
class Constants:
    """Namespace for file names shared between the commands in this module."""

    # File name (joined onto TRUTH_DIR by the commands) of the speaker mapping XML.
    SPEAKER_MAPPING_XML = 'speaker_mapping.xml'
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command('truth')
def truth():
    """Process all raw files into normalized XML truth files and export the speaker mapping."""
    logger.info("Processing all raw files into normalized truth files.")

    episodes_dir = os.path.join(TRUTH_DIR, 'episodes')
    # Create the output directory up front so a fresh checkout does not fail on the first write.
    os.makedirs(episodes_dir, exist_ok=True)

    speakers = Counter()
    for raw_file in RAW_FILES:
        raw_path = os.path.join(RAW_DIR, raw_file)

        # Swap only the extension; str.replace('txt', 'xml') would also rewrite
        # a 'txt' that happens to appear inside the file's stem.
        truth_filename = os.path.splitext(raw_file)[0] + '.xml'

        with open(raw_path, 'r', encoding='utf-8') as file:
            raw_data = file.read()

        root = etree.Element('SceneList')
        # Reset per file: the error handler below reports these, and they would
        # otherwise be unbound (first file) or stale (later files) when a failure
        # happens before the first quote of the file is reached.
        quote = None
        section_num = 0
        try:
            # Sections are separated by lines that start with '-'.
            for section_num, raw_section in enumerate(re.split('^-', raw_data, flags=re.MULTILINE), start=1):
                sceneElement = etree.SubElement(root, 'Scene')

                scene_data = raw_section.strip().split('\n')
                if scene_data[0].startswith('!'):
                    # Notate that scene is deleted on Scene Element Attributes
                    sceneElement.attrib['deleted'] = re.search(r'!(\d+)', scene_data.pop(0)).group(1)

                # Process quotes in each scene; every line is "<speaker>|<text>".
                for quote in scene_data:
                    quoteElement = etree.SubElement(sceneElement, 'Quote')
                    speaker, text = quote.split('|', 1)
                    speaker, text = clean_string(speaker), clean_string(text)
                    speakers[speaker] += 1

                    if len(speaker) <= 1:
                        raise ValueError("Speaker text had less than two characters.")
                    if len(text) <= 1:
                        raise ValueError("Quote text had less than two characters.")

                    rootSpeakerElement = etree.SubElement(quoteElement, 'Speaker')
                    rootSpeakerElement.text = speaker
                    textElement = etree.SubElement(quoteElement, "Text")
                    textElement.text = text
        except Exception:
            # Any failure while parsing marks the whole file as malformed; it is
            # logged (with traceback) and skipped instead of aborting the run.
            logger.exception(f'Skipped {raw_file}: Malformed data.')
            if quote:
                logger.info(f'Last quote seen "{quote}" in section {section_num}')
        else:
            truth_path = os.path.join(episodes_dir, truth_filename)
            # NOTE(review): written with the platform default encoding, which is
            # also how `merge` reads these files back - change both together.
            with open(truth_path, 'w') as truth_file:
                etree.indent(root, space=" " * 4)
                truth_file.write(etree.tostring(root, encoding=str, pretty_print=True))

    logger.debug(f"{len(speakers)} unique speakers identified.")

    speaker_mapping_path = os.path.join(TRUTH_DIR, Constants.SPEAKER_MAPPING_XML)
    if os.path.exists(speaker_mapping_path):
        # Never overwrite an existing mapping file.
        logger.warning('Skipped exporting speakers; delete "speaker_mapping.xml" prior to export next time.')
        return

    root = etree.Element("SpeakerMappings")
    # Most frequent speakers first; initially every speaker maps to itself.
    for speaker, count in speakers.most_common():
        rootSpeakerElement = etree.SubElement(root, "Mapping", count=str(count))
        sourceElement = etree.SubElement(rootSpeakerElement, "Source")
        sourceElement.text = speaker
        destinationElement = etree.SubElement(rootSpeakerElement, "Destination")
        destinationElement.text = speaker

    with open(speaker_mapping_path, 'w', encoding='utf-8') as speaker_file:
        etree.indent(root, space=" " * 4)
        speaker_file.write(etree.tostring(root, encoding=str, pretty_print=True))
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command('merge')
def merge():
    """Merge all Speaker Mappings from source into one file."""
    episodes_dir = os.path.join(TRUTH_DIR, 'episodes')

    # Count every speaker that appears in the episode truth files.
    speakerList = Counter()
    truth_files: List[str] = os.listdir(episodes_dir)
    pbar = enlighten.Counter(total=len(truth_files), unit='Files')
    for truth_filename in truth_files:
        truth_path = os.path.join(episodes_dir, truth_filename)
        with open(truth_path, 'r') as truth_file:
            tree = etree.parse(truth_file)
        speakerList.update(tree.xpath('//SceneList/Scene/Quote/Speaker/text()'))
        pbar.update()

    # Load the Source -> Destination table. The file is written as UTF-8 by the
    # `truth` command, so it is read back with the same encoding.
    speakerMapping = OrderedDict()
    mapping_path = os.path.join(TRUTH_DIR, Constants.SPEAKER_MAPPING_XML)
    with open(mapping_path, 'r', encoding='utf-8') as speaker_mapping_file:
        mapping_tree = etree.parse(speaker_mapping_file)
    for mappingElement in mapping_tree.xpath('//SpeakerMappings/Mapping'):
        source = mappingElement.xpath('.//Source/text()')[0]
        destination = mappingElement.xpath('.//Destination/text()')[0]
        speakerMapping[source] = destination

    print('Mappings acquired')

    root = etree.Element('CharacterList')
    pbar = enlighten.Counter(total=len(speakerList), unit='Speakers')
    seen = set()

    for speaker in speakerList:
        # Follow the mapping chain until it reaches a name that is unmapped or
        # maps to itself. `visited` guards against cyclic mappings (A -> B -> A),
        # which would otherwise loop forever.
        visited = {speaker}
        while True:
            target = speakerMapping.get(speaker)
            if target is None or target == speaker:
                break
            if target in visited:
                logger.warning(f'Cyclic speaker mapping detected at "{speaker}" -> "{target}"; stopping at "{speaker}".')
                break
            visited.add(target)
            speaker = target

        # Emit each resolved character only once.
        if speaker not in seen:
            seen.add(speaker)
            characterElement = etree.SubElement(root, 'Character')
            characterElement.text = speaker
        pbar.update()

    with open(os.path.join(TRUTH_DIR, 'characters.xml'), 'w') as character_file:
        etree.indent(root, space=" " * 4)
        character_file.write(etree.tostring(root, encoding=str, pretty_print=True))
|
||||||
|
|
||||||
|
@cli.command('ids')
def ids():
    """Builds an XML file for identifying character id mappings"""

    logger.info("Building ID Character mapping file...")
    # The speaker mapping is written as UTF-8 by the `truth` command, so read
    # it with the same encoding instead of the platform default.
    with open(os.path.join(TRUTH_DIR, Constants.SPEAKER_MAPPING_XML), 'r', encoding='utf-8') as mapping_file:
        mapping_tree = etree.parse(mapping_file)

    # Kept under its own name so the parsed mappings are no longer overwritten.
    root = etree.Element("IdentifierList")
    # TODO: this command is unfinished - nothing is derived from `mapping_tree`
    # and `root` is never written out. The original sketch was:
    #   mappings =
    #   for speaker in speakers:
    #       if speaker
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command('all')
def all():
    """Runs all commands in order one after another."""
    # `truth`, `merge` and `ids` are click Command objects, not plain functions.
    # Calling one directly runs Command.main(), which re-parses sys.argv and
    # exits the process, so the chain would never get past the first step.
    # Context.invoke runs each command's callback inside the current invocation.
    ctx = click.get_current_context()
    for command in (truth, merge, ids):
        ctx.invoke(command)
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command('similar')
@click.argument('text')
@click.option('-d', '--destination', is_flag=True, help='Search Destination mapping instead of Source.')
@click.option('-n', '--results', type=int, default=5, help='Specify the number of results to be returned.')
@click.option('--no-merge', is_flag=True, help='Don\'t merge similar items together to make things easier.')
@click.option('-r', '--reversed', is_flag=True, help='Reverse the results direction to help readability in the console.')
def similar(text: str, destination: Optional[bool], results: int, reversed: bool, no_merge: bool) -> None:
    """Locates the most similar character name in speaker mappings. Searches <Source> by default."""
    # The speaker mapping is written as UTF-8 by the `truth` command, so read
    # it with the same encoding instead of the platform default.
    mapping_path = os.path.join(TRUTH_DIR, Constants.SPEAKER_MAPPING_XML)
    with open(mapping_path, 'r', encoding='utf-8') as mapping_file:
        tree = etree.parse(mapping_file)

    mappingType: str = "Destination" if destination else "Source"

    # Parse counts into integers for merge
    counts = [int(raw_count) for raw_count in tree.xpath('//SpeakerMappings/Mapping/@count')]
    speakers = tree.xpath(f"//SpeakerMappings/Mapping/{mappingType}/text()")
    if not no_merge:
        # Merge identical speakers together
        speakers, counts = marked_item_merge(speakers, counts)

    # -1 means "no limit": return a result for every known speaker.
    limit = len(speakers) if results == -1 else results

    resultIndexes: List[int] = get_close_matches_indexes(text, speakers, limit, 0)
    # One numbered line per match: "<rank>. <speaker> (<count>)".
    lines = [
        f'{rank}. {speakers[index]} ({counts[index]})'
        for rank, index in enumerate(resultIndexes, start=1)
    ]
    if reversed:
        lines.reverse()

    print('\n'.join(lines))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: hand control to the click command group.
    cli()
|
||||||
Reference in New Issue
Block a user