mirror of
https://github.com/Xevion/thesaurus-scraper.git
synced 2025-12-09 16:08:52 -06:00
Finished all Word/WordVariant subclasses, raw processing & repr functions
This commit is contained in:
116
scraper/word.py
116
scraper/word.py
@@ -1,11 +1,16 @@
|
||||
from typing import List, Optional
|
||||
from collections import namedtuple
|
||||
from typing import Dict, List, Optional, Set
|
||||
|
||||
Inflection = namedtuple('Inflection', ['display', 'slug'])
|
||||
SimilarWord = namedtuple('SimilarWord', ['term', 'similarity', 'vulgar', 'informal', 'targetTerm', 'targetSlug'])
|
||||
|
||||
|
||||
class Word(object):
    """
    Describes a word on Thesaurus.com uniquely identified by its slug/URL.
    """

    def __init__(self, entry: str, inflections: Optional[List['Inflection']] = None,
                 variants: Optional[List['WordVariant']] = None,
                 pronunciation: Optional['Pronunciation'] = None,
                 examples: Optional[List['ExampleSentence']] = None):
        """
        :param entry: The entry text of the word.
        :param inflections: Inflection tuples (display, slug); defaults to an empty list.
        :param variants: WordVariant instances for this word; defaults to an empty list.
        :param pronunciation: Pronunciation details, or None when unavailable.
        :param examples: ExampleSentence instances; defaults to an empty list.
        """
        self.entry = entry
        self.inflections = inflections or []
        # NOTE(review): this assignment falls on the single line hidden between the two diff hunks;
        # it is reconstructed from the reads of self.variants in get_urls/__repr__ - confirm against the file.
        self.variants = variants or []
        self.pronunciation = pronunciation
        self.examples = examples or []

    def get_urls(self) -> Set[str]:
        """
        Collects the browse URLs of every synonym and antonym across all variants.

        :return: A set of unique 'https://www.thesaurus.com/browse/<targetSlug>' URLs.
        """
        urls = set()
        for variant in self.variants:
            # Walk both lists in place instead of concatenating them into a temporary list.
            for related in (variant.antonyms, variant.synonyms):
                urls.update(f'https://www.thesaurus.com/browse/{word.targetSlug}' for word in related)
        return urls

    @classmethod
    def from_raw(cls, data: dict) -> 'Word':
        """
        Creates a Word instance from raw data extracted from Thesaurus.com

        :param data: A parsed Python dictionary. Must contain 'tunaApiData' holding the keys
                     'entry', 'inflections', 'posTabs' and 'exampleSentences'; 'pronunciation' is optional.
        :return: A Word instance (an instance of the subclass when called on one).
        :raises KeyError: If a required key is missing.
        """
        api = data['tunaApiData']
        inflections = [Inflection(raw['displayForm'], raw['slug']) for raw in api['inflections']]
        variants = [WordVariant.from_raw(api['entry'], variant_raw) for variant_raw in api['posTabs']]

        # 'pronunciation' may be absent or explicitly null; both cases map to None.
        pronunciation_raw = api.get('pronunciation')
        pronunciation = Pronunciation.from_raw(pronunciation_raw) if pronunciation_raw is not None else None

        examples = [ExampleSentence.from_raw(example_raw) for example_raw in api['exampleSentences']]

        # Build through cls so subclasses calling from_raw receive their own type.
        return cls(
            entry=api['entry'],
            inflections=inflections,
            variants=variants,
            pronunciation=pronunciation,
            examples=examples
        )

    def __repr__(self) -> str:
        """
        :return: A summary with the entry and the counts of variants, synonyms and antonyms,
                 each pluralized unless the count is exactly one.
        """
        # Count the number of antonyms and synonyms
        s = sum(len(variant.synonyms) for variant in self.variants)
        a = sum(len(variant.antonyms) for variant in self.variants)
        v = len(self.variants)

        return f'Word("{self.entry}", {v} variant{"s" if v != 1 else ""}, {s} syn{"s" if s != 1 else ""}' \
               f', {a} ant{"s" if a != 1 else ""})'
|
||||
|
||||
|
||||
class WordVariant(object):
    """
    One variant of a word as listed under a Thesaurus.com part-of-speech tab: a definition
    together with its synonyms and antonyms.
    """

    def __init__(self, term: str, definition: str, pos: str, synonyms: List['SimilarWord'],
                 antonyms: List['SimilarWord'],
                 informal: Optional[int],
                 vulgar: int, note: Optional[str], ref_id: Optional[int]) -> None:
        """
        :param term: The term this variant belongs to.
        :param definition: The definition text of this variant.
        :param pos: The part-of-speech value taken from the raw 'pos' field.
        :param synonyms: SimilarWord tuples listed as synonyms.
        :param antonyms: SimilarWord tuples listed as antonyms.
        :param informal: Informal flag as an int, or None when the raw flag was falsy.
        :param vulgar: Vulgar flag as an int.
        :param note: Optional note attached to the variant.
        :param ref_id: Reference id (raw 'thesRid'), or None when unavailable. Stored as ``self.id``.
        """
        self.term = term
        self.definition = definition
        self.pos = pos
        self.synonyms = synonyms
        self.antonyms = antonyms
        self.informal = informal
        self.vulgar = vulgar
        self.note = note
        self.id = ref_id

    @staticmethod
    def basic_word(raw: dict) -> 'SimilarWord':
        """
        Converts one raw synonym/antonym dictionary into a SimilarWord tuple.

        :param raw: Dictionary with the keys 'term', 'similarity', 'isVulgar', 'isInformal',
                    'targetSlug' and 'targetTerm'.
        :return: A SimilarWord with 'similarity' coerced to int.
        :raises KeyError: If a required key is missing.
        """
        return SimilarWord(term=raw['term'],
                           similarity=int(raw['similarity']),
                           vulgar=raw['isVulgar'],
                           informal=raw['isInformal'],
                           targetSlug=raw['targetSlug'],
                           targetTerm=raw['targetTerm'])

    @classmethod
    def from_raw(cls, term: str, data: dict) -> 'WordVariant':
        """
        Creates a WordVariant from one raw part-of-speech tab dictionary.

        :param term: The term the variant belongs to.
        :param data: Dictionary with the keys 'definition', 'pos', 'synonyms', 'antonyms',
                     'isInformal', 'isVulgar' and 'note'; 'thesRid' is optional.
        :return: A WordVariant instance (an instance of the subclass when called on one).
        :raises KeyError: If a required key is missing.
        """
        # A missing or null 'thesRid' both yield None instead of failing inside int().
        raw_id = data.get('thesRid')

        # Build through cls so subclasses calling from_raw receive their own type.
        return cls(
            term=term,
            definition=data['definition'],
            pos=data['pos'],
            synonyms=[cls.basic_word(syn) for syn in data['synonyms']],
            antonyms=[cls.basic_word(ant) for ant in data['antonyms']],
            # Any falsy raw flag (0, '', None, False) is stored as None rather than 0.
            informal=int(data['isInformal']) if data['isInformal'] else None,
            vulgar=int(data['isVulgar']),
            note=data['note'],
            ref_id=int(raw_id) if raw_id is not None else None
        )

    def __repr__(self) -> str:
        """
        :return: The term, part of speech and definition, followed by the synonym/antonym counts.
        """
        return f'WordVariant({self.term}, {self.pos}, {self.definition}, [{len(self.synonyms)}/{len(self.antonyms)}])'
|
||||
|
||||
|
||||
class Pronunciation(object):
    """
    Pronunciation details of a word: audio sources, IPA notation and phonetic spelling.
    """

    def __init__(self, audio: Optional[Dict[str, str]], ipa: Optional[str], spelling: Optional[str]) -> None:
        """
        :param audio: Mapping of audio sources, or None when unavailable.
        :param ipa: IPA notation, or None when unavailable.
        :param spelling: Phonetic spelling (raw 'spell' field), or None when unavailable.
        """
        self.audio = audio
        self.ipa = ipa
        self.spelling = spelling

    @classmethod
    def from_raw(cls, data: dict) -> 'Pronunciation':
        """
        Creates a Pronunciation from a raw dictionary; every field is optional.

        :param data: Dictionary that may contain the keys 'audio', 'ipa' and 'spell'.
        :return: A Pronunciation instance with None for each missing key.
        """
        # Build through cls so subclasses calling from_raw receive their own type.
        return cls(
            audio=data.get('audio'),
            ipa=data.get('ipa'),
            spelling=data.get('spell')
        )

    def __repr__(self) -> str:
        """
        :return: The number of audio sources followed by the IPA and spelling values.
        """
        # audio is Optional; treat None as "no sources" instead of failing on None.keys().
        return f'Pronunciation({len(self.audio or {})} audio sources, {self.ipa}, {self.spelling})'
|
||||
|
||||
|
||||
class ExampleSentence(object):
    """
    An example sentence attached to a word, together with its source and profanity value.
    """

    def __init__(self, id: int, sentence: str, source: Dict[str, str], profanity: int) -> None:
        """
        :param id: Identifier of the sentence (name kept for caller compatibility although it shadows the builtin).
        :param sentence: The sentence text.
        :param source: Source information of the sentence.
        :param profanity: Profanity value taken from the raw data.
        """
        self.id = id
        self.sentence = sentence
        self.source = source
        self.profanity = profanity

    @classmethod
    def from_raw(cls, data: dict) -> 'ExampleSentence':
        """
        Creates an ExampleSentence from a raw dictionary.

        :param data: Dictionary with the keys 'id', 'sentence', 'source' and 'profanity'.
        :return: An ExampleSentence instance (an instance of the subclass when called on one).
        :raises KeyError: If a required key is missing.
        """
        # Build through cls so subclasses calling from_raw receive their own type.
        return cls(
            id=data['id'],
            sentence=data['sentence'],
            source=data['source'],
            profanity=data['profanity']
        )

    def __repr__(self) -> str:
        """
        :return: The id and the first 15 characters of the sentence, followed by '...' only
                 when the sentence was actually cut short.
        """
        # Strictly greater: a sentence of exactly 15 characters is shown in full, so no ellipsis.
        return f'ExampleSentence({self.id}, "{self.sentence[:15]}{"..." if len(self.sentence) > 15 else ""}")'
|
||||
|
||||
Reference in New Issue
Block a user