mirror of
https://github.com/Xevion/the-office.git
synced 2025-12-06 11:16:40 -06:00
Take necessary helpers from server.data.helpers for normalization; add unidecode to Pipfile
- General cleanup, getting ready to delete server.data
This commit is contained in:
1
Pipfile
1
Pipfile
@@ -14,6 +14,7 @@ coloredlogs = "*"
|
|||||||
markupsafe = "<2.1.0"
|
markupsafe = "<2.1.0"
|
||||||
flask_cors = "*"
|
flask_cors = "*"
|
||||||
flask_wtf = "*"
|
flask_wtf = "*"
|
||||||
|
unidecode = "*"
|
||||||
|
|
||||||
[dev-packages]
|
[dev-packages]
|
||||||
|
|
||||||
|
|||||||
10
Pipfile.lock
generated
10
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"_meta": {
|
"_meta": {
|
||||||
"hash": {
|
"hash": {
|
||||||
"sha256": "b3345bfd51f4748db74891f88052c39e8711e7eabb4a90ee6b03b66004685efe"
|
"sha256": "1b7d4db98a117f45dc7130e880f5e49ab9b85de8ec8d18864742a364fab852b0"
|
||||||
},
|
},
|
||||||
"pipfile-spec": 6,
|
"pipfile-spec": 6,
|
||||||
"requires": {
|
"requires": {
|
||||||
@@ -257,6 +257,14 @@
|
|||||||
"markers": "python_version >= '3.0'",
|
"markers": "python_version >= '3.0'",
|
||||||
"version": "==2.3.2.post1"
|
"version": "==2.3.2.post1"
|
||||||
},
|
},
|
||||||
|
"unidecode": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:8e4352fb93d5a735c788110d2e7ac8e8031eb06ccbfe8d324ab71735015f9342",
|
||||||
|
"sha256:afa04efcdd818a93237574791be9b2817d7077c25a068b00f8cff7baa4e59257"
|
||||||
|
],
|
||||||
|
"index": "pypi",
|
||||||
|
"version": "==1.3.4"
|
||||||
|
},
|
||||||
"urllib3": {
|
"urllib3": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:8d7eaa5a82a1cac232164990f04874c594c9453ec55eef02eab885aa02fc17a2",
|
"sha256:8d7eaa5a82a1cac232164990f04874c594c9453ec55eef02eab885aa02fc17a2",
|
||||||
|
|||||||
@@ -59,11 +59,6 @@ def algolia_transform(old_dictionary: dict, key_list: List[Tuple[str, Optional[s
|
|||||||
|
|
||||||
return new_dictionary
|
return new_dictionary
|
||||||
|
|
||||||
|
|
||||||
def is_main_character(name: str) -> bool:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def character_id(name: str) -> str:
|
def character_id(name: str) -> str:
|
||||||
return '-'.join(name.split(' ')).lower()
|
return '-'.join(name.split(' ')).lower()
|
||||||
|
|
||||||
@@ -76,69 +71,4 @@ def random_id(length: int = 8) -> str:
|
|||||||
return ''.join(random.choices(alphabet, k=length))
|
return ''.join(random.choices(alphabet, k=length))
|
||||||
|
|
||||||
|
|
||||||
def char_filter(string):
|
|
||||||
latin = re.compile('[a-zA-Z]+')
|
|
||||||
for char in unicodedata.normalize('NFC', string):
|
|
||||||
decoded = unidecode.unidecode(char)
|
|
||||||
if latin.match(decoded):
|
|
||||||
yield char
|
|
||||||
else:
|
|
||||||
yield decoded
|
|
||||||
|
|
||||||
|
|
||||||
def clean_string(string):
|
|
||||||
return "".join(char_filter(string))
|
|
||||||
|
|
||||||
|
|
||||||
def get_close_matches_indexes(word, possibilities, n=3, cutoff=0.6):
|
|
||||||
"""Use SequenceMatcher to return a list of the indexes of the best
|
|
||||||
"good enough" matches. word is a sequence for which close matches
|
|
||||||
are desired (typically a string).
|
|
||||||
possibilities is a list of sequences against which to match word
|
|
||||||
(typically a list of strings).
|
|
||||||
Optional arg n (default 3) is the maximum number of close matches to
|
|
||||||
return. n must be > 0.
|
|
||||||
Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities
|
|
||||||
that don't score at least that similar to word are ignored.
|
|
||||||
"""
|
|
||||||
|
|
||||||
if not n > 0:
|
|
||||||
raise ValueError("n must be > 0: %r" % (n,))
|
|
||||||
if not 0.0 <= cutoff <= 1.0:
|
|
||||||
raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
|
|
||||||
result = []
|
|
||||||
s = SequenceMatcher()
|
|
||||||
s.set_seq2(word)
|
|
||||||
for idx, x in enumerate(possibilities):
|
|
||||||
s.set_seq1(x)
|
|
||||||
if s.real_quick_ratio() >= cutoff and \
|
|
||||||
s.quick_ratio() >= cutoff and \
|
|
||||||
s.ratio() >= cutoff:
|
|
||||||
result.append((s.ratio(), idx))
|
|
||||||
|
|
||||||
# Move the best scorers to head of list
|
|
||||||
result = _nlargest(n, result)
|
|
||||||
|
|
||||||
# Strip scores for the best n matches
|
|
||||||
return [x for score, x in result]
|
|
||||||
|
|
||||||
|
|
||||||
def marked_item_merge(keys: List[str], values: List[int]) -> Tuple[List[str], List[str]]:
|
|
||||||
"""Add the values of identical keys together, then return both the keys and values"""
|
|
||||||
merge = OrderedDict()
|
|
||||||
for key, value in zip(keys, values):
|
|
||||||
# Already inserted, now make/keep it negative
|
|
||||||
if key in merge.keys():
|
|
||||||
# Keys that haven't been turned over need to be made negative
|
|
||||||
if merge[key] > 0:
|
|
||||||
merge[key] = -merge[key]
|
|
||||||
|
|
||||||
# And then subtract the value in all cases
|
|
||||||
merge[key] -= value
|
|
||||||
else:
|
|
||||||
# Values that are positive didn't merge with other counts.
|
|
||||||
merge[key] = value
|
|
||||||
|
|
||||||
keys, values = zip(*merge.items())
|
|
||||||
values = [f'{-value}*' if value < 0 else str(value) for value in values]
|
|
||||||
return keys, values
|
|
||||||
|
|||||||
80
server/normalization/helpers.py
Normal file
80
server/normalization/helpers.py
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
import random
|
||||||
|
import re
|
||||||
|
import string
|
||||||
|
import unicodedata
|
||||||
|
from collections import OrderedDict
|
||||||
|
from difflib import SequenceMatcher
|
||||||
|
from heapq import nlargest as _nlargest
|
||||||
|
from typing import List, Optional, Tuple, Iterator
|
||||||
|
|
||||||
|
import unidecode
|
||||||
|
|
||||||
|
|
||||||
|
def char_filter(s: str) -> Iterator[str]:
|
||||||
|
"""Returns a generator of characters that are properly converted from their unicode character into their ASCII equivalent."""
|
||||||
|
latin = re.compile('[a-zA-Z]+')
|
||||||
|
for char in unicodedata.normalize('NFC', s):
|
||||||
|
decoded = unidecode.unidecode(char)
|
||||||
|
if latin.match(decoded):
|
||||||
|
yield char
|
||||||
|
else:
|
||||||
|
yield decoded
|
||||||
|
|
||||||
|
|
||||||
|
def clean_string(s: str) -> str:
|
||||||
|
"""Returns a clean string, devoid of ugly Unicode characters."""
|
||||||
|
return "".join(char_filter(s))
|
||||||
|
|
||||||
|
|
||||||
|
def get_close_matches_indexes(word, possibilities, n=3, cutoff=0.6):
|
||||||
|
"""Use SequenceMatcher to return a list of the indexes of the best
|
||||||
|
"good enough" matches. word is a sequence for which close matches
|
||||||
|
are desired (typically a string).
|
||||||
|
possibilities is a list of sequences against which to match word
|
||||||
|
(typically a list of strings).
|
||||||
|
Optional arg n (default 3) is the maximum number of close matches to
|
||||||
|
return. n must be > 0.
|
||||||
|
Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities
|
||||||
|
that don't score at least that similar to word are ignored.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if not n > 0:
|
||||||
|
raise ValueError("n must be > 0: %r" % (n,))
|
||||||
|
if not 0.0 <= cutoff <= 1.0:
|
||||||
|
raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
|
||||||
|
result = []
|
||||||
|
s = SequenceMatcher()
|
||||||
|
s.set_seq2(word)
|
||||||
|
for idx, x in enumerate(possibilities):
|
||||||
|
s.set_seq1(x)
|
||||||
|
if s.real_quick_ratio() >= cutoff and \
|
||||||
|
s.quick_ratio() >= cutoff and \
|
||||||
|
s.ratio() >= cutoff:
|
||||||
|
result.append((s.ratio(), idx))
|
||||||
|
|
||||||
|
# Move the best scorers to head of list
|
||||||
|
result = _nlargest(n, result)
|
||||||
|
|
||||||
|
# Strip scores for the best n matches
|
||||||
|
return [x for score, x in result]
|
||||||
|
|
||||||
|
|
||||||
|
def marked_item_merge(keys: List[str], values: List[int]) -> Tuple[List[str], List[str]]:
|
||||||
|
"""Add the values of identical keys together, then return both the keys and values"""
|
||||||
|
merge = OrderedDict()
|
||||||
|
for key, value in zip(keys, values):
|
||||||
|
# Already inserted, now make/keep it negative
|
||||||
|
if key in merge.keys():
|
||||||
|
# Keys that haven't been turned over need to be made negative
|
||||||
|
if merge[key] > 0:
|
||||||
|
merge[key] = -merge[key]
|
||||||
|
|
||||||
|
# And then subtract the value in all cases
|
||||||
|
merge[key] -= value
|
||||||
|
else:
|
||||||
|
# Values that are positive didn't merge with other counts.
|
||||||
|
merge[key] = value
|
||||||
|
|
||||||
|
keys, values = zip(*merge.items())
|
||||||
|
values = [f'{-value}*' if value < 0 else str(value) for value in values]
|
||||||
|
return keys, values
|
||||||
@@ -12,9 +12,7 @@ from typing import List, Optional, Union
|
|||||||
|
|
||||||
import click
|
import click
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
from helpers import clean_string, get_close_matches_indexes, marked_item_merge
|
||||||
sys.path[0] += '\\..\\..'
|
|
||||||
from server.helpers import clean_string, get_close_matches_indexes, marked_item_merge
|
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
logger = logging.getLogger('normalization.main')
|
logger = logging.getLogger('normalization.main')
|
||||||
|
|||||||
Reference in New Issue
Block a user