diff --git a/Pipfile b/Pipfile index 4c89d5f..5d54375 100644 --- a/Pipfile +++ b/Pipfile @@ -10,6 +10,7 @@ requests = "~=2.24.0" bs4 = "~=0.0.1" beautifulsoup4 = "~=4.9.1" Flask = "~=1.1.2" +coloredlogs = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 2de7ebb..53a228e 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "ba1c0e145f712b73a7cc267fef64a02068e76c3084ec627e3973cdcbe5217a12" + "sha256": "c56576435e88c13e4d8b9cfa57a32a0f50bd7113e24c0bd28a0731a6abe4fd7c" }, "pipfile-spec": 6, "requires": { @@ -70,6 +70,14 @@ "index": "pypi", "version": "==7.1.2" }, + "coloredlogs": { + "hashes": [ + "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", + "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0" + ], + "index": "pypi", + "version": "==15.0.1" + }, "enlighten": { "hashes": [ "sha256:db00dfc4027a2dad2aaa4bff4b5fd8d8ab8376e175a02d02e156992f08062437", @@ -86,6 +94,14 @@ "index": "pypi", "version": "==1.1.4" }, + "humanfriendly": { + "hashes": [ + "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", + "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==10.0" + }, "idna": { "hashes": [ "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6", @@ -164,6 +180,14 @@ "markers": "python_version >= '3.7'", "version": "==2.1.1" }, + "pyreadline3": { + "hashes": [ + "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae", + "sha256:b0efb6516fd4fb07b45949053826a62fa4cb353db5be2bbb4a7aa1fdd1e345fb" + ], + "markers": "python_version >= '3.8' and sys_platform == 'win32'", + "version": "==3.4.1" + }, "requests": { "hashes": [ "sha256:b3559a131db72c33ee969480840fff4bb6dd111de7dd27c8ee1f820f4f00231b", @@ -177,7 +201,7 @@ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.16.0" }, "soupsieve": { diff --git a/server/normalization/main.py b/server/normalization/main.py index e330df2..609fc6b 100644 --- a/server/normalization/main.py +++ b/server/normalization/main.py @@ -4,6 +4,7 @@ import os import re import sys import enlighten +import coloredlogs from collections import Counter, OrderedDict from pprint import pprint from typing import List, Optional, Union @@ -17,6 +18,7 @@ from server.helpers import clean_string, get_close_matches_indexes, marked_item_ logging.basicConfig(level=logging.INFO) logger = logging.getLogger('normalization.main') logger.setLevel(logging.DEBUG) +coloredlogs.install(level=logger.level, logger=logger) CUR_DIR = os.path.dirname(os.path.abspath(__file__)) TRUTH_DIR = os.path.join(CUR_DIR, 'truth') @@ -38,12 +40,14 @@ class Constants: CHARACTERS_XML = 'characters.xml' META_JSON = 'meta.json' + class ConstantPaths: SPEAKER_MAPPING = os.path.join(TRUTH_DIR, Constants.SPEAKER_MAPPING_XML) - IDENTIFIERS = os.path.join(CHARACTERS_DIR, Constants.CHARACTERS_XML) + IDENTIFIERS = os.path.join(CHARACTERS_DIR, Constants.IDENTIFIERS_XML) CHARACTERS = os.path.join(TRUTH_DIR, Constants.CHARACTERS_XML) META = os.path.join(TRUTH_DIR, Constants.META_JSON) + @cli.command('truth') def truth(): """Step 1: Builds raw files into truth files.""" @@ -274,6 +278,7 @@ def ids(): @cli.command('meta') def meta() -> None: + """Creates a meta file for storing each character identifier's meta meaning (main/recurring/background/meta)""" logger.debug('Creating meta.json') with open(ConstantPaths.IDENTIFIERS, 'r') as identifiers_file: @@ -284,6 +289,11 @@ def meta() -> None: if os.path.exists(ConstantPaths.META): with open(ConstantPaths.META, 'r') as meta_file: meta_data = OrderedDict(json.load(meta_file)) + + possible_values = [None, 'main', 'recurring', 'background', 'meta'] + for character_id, character_type in meta_data.items(): + if character_type not in possible_values: + logger.warning(f'Unexpected value for `{character_id}` = `{character_type}`') else: meta_data = OrderedDict()