mirror of
https://github.com/Xevion/the-office.git
synced 2025-12-06 03:16:45 -06:00
Take necessary helpers from server.data.helpers for normalization, Add unidecode to Pipfile
- General cleanup, getting ready to delete server.data
This commit is contained in:
1
Pipfile
1
Pipfile
@@ -14,6 +14,7 @@ coloredlogs = "*"
|
||||
markupsafe = "<2.1.0"
|
||||
flask_cors = "*"
|
||||
flask_wtf = "*"
|
||||
unidecode = "*"
|
||||
|
||||
[dev-packages]
|
||||
|
||||
|
||||
10
Pipfile.lock
generated
10
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "b3345bfd51f4748db74891f88052c39e8711e7eabb4a90ee6b03b66004685efe"
|
||||
"sha256": "1b7d4db98a117f45dc7130e880f5e49ab9b85de8ec8d18864742a364fab852b0"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@@ -257,6 +257,14 @@
|
||||
"markers": "python_version >= '3.0'",
|
||||
"version": "==2.3.2.post1"
|
||||
},
|
||||
"unidecode": {
|
||||
"hashes": [
|
||||
"sha256:8e4352fb93d5a735c788110d2e7ac8e8031eb06ccbfe8d324ab71735015f9342",
|
||||
"sha256:afa04efcdd818a93237574791be9b2817d7077c25a068b00f8cff7baa4e59257"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.3.4"
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:8d7eaa5a82a1cac232164990f04874c594c9453ec55eef02eab885aa02fc17a2",
|
||||
|
||||
@@ -59,11 +59,6 @@ def algolia_transform(old_dictionary: dict, key_list: List[Tuple[str, Optional[s
|
||||
|
||||
return new_dictionary
|
||||
|
||||
|
||||
def is_main_character(name: str) -> bool:
    """Return whether *name* refers to a main character.

    NOTE(review): the original stub returned None despite the declared
    ``-> bool``. Returning False honours the annotation while staying
    identical in every truthiness context (None and False are both falsy).
    TODO: implement the real membership check.
    """
    return False
|
||||
|
||||
|
||||
def character_id(name: str) -> str:
    """Build a slug-style identifier: lowercase the name, spaces become hyphens."""
    lowered = name.lower()
    return '-'.join(lowered.split(' '))
|
||||
|
||||
@@ -76,69 +71,4 @@ def random_id(length: int = 8) -> str:
|
||||
return ''.join(random.choices(alphabet, k=length))
|
||||
|
||||
|
||||
def char_filter(string):
    """Yield each NFC-normalized character of *string*, substituting its
    ASCII transliteration whenever that transliteration is not already a
    Latin letter."""
    latin_match = re.compile('[a-zA-Z]+').match
    for original in unicodedata.normalize('NFC', string):
        ascii_form = unidecode.unidecode(original)
        yield original if latin_match(ascii_form) else ascii_form
|
||||
|
||||
|
||||
def clean_string(string):
    """Collapse char_filter's output back into one ASCII-friendly string."""
    pieces = char_filter(string)
    return "".join(pieces)
|
||||
|
||||
|
||||
def get_close_matches_indexes(word, possibilities, n=3, cutoff=0.6):
    """Use SequenceMatcher to return a list of the indexes of the best
    "good enough" matches. word is a sequence for which close matches
    are desired (typically a string).
    possibilities is a list of sequences against which to match word
    (typically a list of strings).
    Optional arg n (default 3) is the maximum number of close matches to
    return. n must be > 0.
    Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities
    that don't score at least that similar to word are ignored.
    """
    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))

    matcher = SequenceMatcher()
    matcher.set_seq2(word)
    scored = []
    for index, candidate in enumerate(possibilities):
        matcher.set_seq1(candidate)
        # Cheap upper-bound checks precede the expensive exact ratio.
        passes = (matcher.real_quick_ratio() >= cutoff
                  and matcher.quick_ratio() >= cutoff
                  and matcher.ratio() >= cutoff)
        if passes:
            scored.append((matcher.ratio(), index))

    # Keep only the n best scorers, then drop the scores.
    best = _nlargest(n, scored)
    return [index for _, index in best]
|
||||
|
||||
|
||||
def marked_item_merge(keys: List[str], values: List[int]) -> Tuple[List[str], List[str]]:
    """Add the values of identical keys together, then return both the keys and values"""
    totals = OrderedDict()
    for key, value in zip(keys, values):
        if key not in totals:
            # First sighting: store the count as-is (positive = unmerged).
            totals[key] = value
            continue
        # Collision: flip the stored count negative to mark it as merged...
        if totals[key] > 0:
            totals[key] = -totals[key]
        # ...then accumulate (subtracting keeps the negated total growing).
        totals[key] -= value

    keys, values = zip(*totals.items())
    rendered = [f'{-value}*' if value < 0 else str(value) for value in values]
    return keys, rendered
|
||||
|
||||
80
server/normalization/helpers.py
Normal file
80
server/normalization/helpers.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import random
|
||||
import re
|
||||
import string
|
||||
import unicodedata
|
||||
from collections import OrderedDict
|
||||
from difflib import SequenceMatcher
|
||||
from heapq import nlargest as _nlargest
|
||||
from typing import List, Optional, Tuple, Iterator
|
||||
|
||||
import unidecode
|
||||
|
||||
|
||||
def char_filter(s: str) -> Iterator[str]:
    """Returns a generator of characters that are properly converted from their unicode character into their ASCII equivalent."""
    pattern = re.compile('[a-zA-Z]+')
    normalized = unicodedata.normalize('NFC', s)
    for ch in normalized:
        ascii_ch = unidecode.unidecode(ch)
        # Keep the original character when its transliteration is Latin;
        # otherwise emit the transliteration itself.
        yield ch if pattern.match(ascii_ch) else ascii_ch
|
||||
|
||||
|
||||
def clean_string(s: str) -> str:
    """Returns a clean string, devoid of ugly Unicode characters."""
    result = "".join(char_filter(s))
    return result
|
||||
|
||||
|
||||
def get_close_matches_indexes(word, possibilities, n: int = 3, cutoff: float = 0.6) -> List[int]:
    """Use SequenceMatcher to return a list of the indexes of the best
    "good enough" matches.

    Index-returning variant of ``difflib.get_close_matches``.

    word is a sequence for which close matches are desired (typically a
    string). possibilities is a list of sequences against which to match
    word (typically a list of strings).
    Optional arg n (default 3) is the maximum number of close matches to
    return. n must be > 0.
    Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities
    that don't score at least that similar to word are ignored.

    Raises:
        ValueError: if ``n <= 0`` or ``cutoff`` is outside [0.0, 1.0].
    """
    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))

    result = []
    s = SequenceMatcher()
    s.set_seq2(word)
    for idx, x in enumerate(possibilities):
        s.set_seq1(x)
        # Cheap upper-bound checks first; compute the exact ratio only once
        # instead of once in the condition and once more when appending.
        if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff:
            score = s.ratio()
            if score >= cutoff:
                result.append((score, idx))

    # Move the best scorers to head of list, then strip the scores.
    return [idx for _, idx in _nlargest(n, result)]
|
||||
|
||||
|
||||
def marked_item_merge(keys: List[str], values: List[int]) -> Tuple[List[str], List[str]]:
    """Add the values of identical keys together, then return both the keys and values.

    Keys that absorbed duplicates have their totals rendered with a
    trailing ``*`` marker (e.g. ``'4*'``); unmerged counts are plain digits.

    Args:
        keys: item names, possibly containing duplicates.
        values: counts aligned positionally with ``keys``.

    Returns:
        A pair ``(unique_keys, rendered_counts)`` in first-seen key order.
    """
    merge = OrderedDict()
    for key, value in zip(keys, values):
        if key in merge:
            # Keys that haven't been turned over need to be made negative
            # (a negative total marks the key as merged).
            if merge[key] > 0:
                merge[key] = -merge[key]
            # And then subtract the value in all cases, accumulating the
            # negated total.
            merge[key] -= value
        else:
            # Values that are positive didn't merge with other counts.
            merge[key] = value

    # zip(*merge.items()) would raise ValueError on empty input; honour the
    # annotated List return types instead of crashing / returning tuples.
    if not merge:
        return [], []

    out_keys = list(merge.keys())
    out_values = [f'{-value}*' if value < 0 else str(value) for value in merge.values()]
    return out_keys, out_values
||||
@@ -12,9 +12,7 @@ from typing import List, Optional, Union
|
||||
|
||||
import click
|
||||
from lxml import etree
|
||||
|
||||
sys.path[0] += '\\..\\..'
|
||||
from server.helpers import clean_string, get_close_matches_indexes, marked_item_merge
|
||||
from helpers import clean_string, get_close_matches_indexes, marked_item_merge
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger('normalization.main')
|
||||
|
||||
Reference in New Issue
Block a user