Commit latest changes to server-side processing system

2025-12-10 20:08:47 -06:00 · 2022-05-08 19:49:18 -05:00
parent 698beb5943
commit cdc409b9a0
4 changed files with 209 additions and 24 deletions
--- a/server/helpers.py
+++ b/server/helpers.py
@@ -3,8 +3,16 @@ helpers.py


 """
+import random
+import re
+import string
+import unicodedata
+from collections import OrderedDict
+from difflib import SequenceMatcher
+from heapq import nlargest as _nlargest
+from typing import List, Optional, Tuple

-from typing import List, Tuple, Optional
+import unidecode

 episode_counts = [6, 22, 23, 14, 26, 24, 24, 24, 23]

@@ -55,5 +63,82 @@ def algolia_transform(old_dictionary: dict, key_list: List[Tuple[str, Optional[s
 def is_main_character(name: str) -> bool:
+    # NOTE(review): placeholder implementation — ``name`` is ignored and None is
+    # always returned, which does not match the ``-> bool`` annotation. Callers
+    # only ever see a falsy value until real logic is added.
    return None

+
 def character_id(name: str) -> str:
+    """Build an identifier from ``name``: spaces become '-' and the result is lowercased.
+
+    The split is on the single-space character, so consecutive spaces produce
+    consecutive hyphens and other whitespace (tabs, newlines) is left as-is.
+    """
    return '-'.join(name.split(' ')).lower()
+
+
+# Pool of characters random_id() samples from: ASCII letters (both cases), then the digits 0-9.
+alphabet: str = ''.join((string.ascii_letters, string.digits))
+
+
+def random_id(length: int = 8) -> str:
+    """Return ``length`` characters sampled from ``alphabet`` with replacement.
+
+    Sampling uses the ``random`` module, so the result is not suitable as a
+    secret or security token.
+    """
+    sampled = random.choices(alphabet, k=length)
+    return ''.join(sampled)
+
+
+def char_filter(string):
+    """Yield ``string`` piece by piece, transliterating characters that are not letters.
+
+    The input is NFC-normalized first. Each character is run through
+    ``unidecode``: if the transliteration starts with an ASCII letter, the
+    original character is yielded unchanged (e.g. a letter that merely carries
+    an accent); otherwise the transliteration is yielded in its place, which
+    may be more than one character long or empty.
+    """
+    starts_with_letter = re.compile('[a-zA-Z]+').match
+    for original in unicodedata.normalize('NFC', string):
+        transliterated = unidecode.unidecode(original)
+        yield original if starts_with_letter(transliterated) else transliterated
+
+
+def clean_string(string):
+    """Return ``string`` rebuilt by concatenating the pieces ``char_filter`` yields for it."""
+    filtered_pieces = char_filter(string)
+    return "".join(filtered_pieces)
+
+
+def get_close_matches_indexes(word, possibilities, n=3, cutoff=0.6):
+    """Return the positions in ``possibilities`` of the closest matches to ``word``.
+
+    Index-returning counterpart of ``difflib.get_close_matches``: similarity
+    is measured with ``SequenceMatcher`` and the result lists indexes into
+    ``possibilities`` instead of the matched items.
+
+    word is the sequence to look up (typically a string).
+    possibilities is a list of sequences to compare it against (typically a
+    list of strings).
+    n (default 3) caps how many indexes are returned; it must be > 0.
+    cutoff (default 0.6) is a float in [0, 1]; candidates scoring below it
+    are dropped.
+
+    Indexes are ordered best match first; equal scores are ordered by
+    descending index. Raises ValueError when n or cutoff is out of range.
+    """
+
+    if not (n > 0):
+        raise ValueError("n must be > 0: %r" % (n,))
+    if not (0.0 <= cutoff <= 1.0):
+        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
+
+    matcher = SequenceMatcher()
+    matcher.set_seq2(word)
+
+    scored = []
+    for position, candidate in enumerate(possibilities):
+        matcher.set_seq1(candidate)
+        # The two quick ratios are cheap upper bounds on ratio(); bail out on
+        # them first so the expensive comparison only runs for real contenders.
+        if matcher.real_quick_ratio() < cutoff:
+            continue
+        if matcher.quick_ratio() < cutoff:
+            continue
+        similarity = matcher.ratio()
+        if similarity < cutoff:
+            continue
+        scored.append((similarity, position))
+
+    # Keep the n highest (score, index) pairs, then drop the scores.
+    return [position for _, position in _nlargest(n, scored)]
+
+
+def marked_item_merge(keys: List[str], values: List[int]) -> Tuple[List[str], List[str]]:
+    """Sum the values of identical keys and flag every total that came from a merge.
+
+    Keys keep the order of their first appearance. ``keys`` and ``values`` are
+    paired with ``zip``, so surplus items in the longer of the two are ignored.
+
+    Returns:
+        ``(unique_keys, totals)`` as two lists of equal length. Each total is
+        rendered as a string; a total built from more than one input value
+        carries a trailing ``*`` (for example ``'4*'``), a value that was never
+        merged is rendered plainly (``'2'``). Empty input yields ``([], [])``.
+    """
+    totals = OrderedDict()
+    # Remember merged keys explicitly rather than encoding the flag in the sign
+    # of the running total: the sign trick mislabels a lone negative value as
+    # merged and loses the mark when merged values sum to zero.
+    merged = set()
+    for key, value in zip(keys, values):
+        if key in totals:
+            totals[key] += value
+            merged.add(key)
+        else:
+            totals[key] = value
+
+    rendered = [f'{total}*' if key in merged else str(total) for key, total in totals.items()]
+    # Build both results as lists (matching the annotation); this also avoids
+    # the unpacking error ``zip(*items)`` raises when there is nothing to merge.
+    return list(totals), rendered