From 696e96e2ad8627c7593fe70da8803bfe13d22215 Mon Sep 17 00:00:00 2001
From: Xevion
Date: Sun, 14 Mar 2021 10:02:26 -0500
Subject: [PATCH] thesaurus.com javascript JSON parsing with demjson, unicode ready

---
 scraper/helpers.py        | 15 ++++++
 scraper/spiders/spider.py | 60 ++++++++++++++++++++
 scraper/word.py           | 94 +++++++++++++++++++++++++++++++
 3 files changed, 169 insertions(+)
 create mode 100644 scraper/helpers.py
 create mode 100644 scraper/spiders/spider.py
 create mode 100644 scraper/word.py

diff --git a/scraper/helpers.py b/scraper/helpers.py
new file mode 100644
index 0000000..9060376
--- /dev/null
+++ b/scraper/helpers.py
@@ -0,0 +1,15 @@
+import demjson
+
+
+class CustomJSON(demjson.JSON):
+    """
+    A simple override for the demjson.JSON class to map all instances of undefined into 'null'.
+    """
+
+    def encode_undefined(self, state):
+        """
+        Encode a JavaScript undefined value as the JSON literal null.
+
+        :param state: Output accumulator passed in by the encoder; the literal text is appended to it.
+        """
+        state.append('null')
diff --git a/scraper/spiders/spider.py b/scraper/spiders/spider.py
new file mode 100644
index 0000000..c506fee
--- /dev/null
+++ b/scraper/spiders/spider.py
@@ -0,0 +1,60 @@
+import re
+
+import demjson
+import scrapy
+from scrapy.http import Response
+
+from scraper.helpers import CustomJSON
+from scraper.word import Word
+
+# Not used by parse(): this variant captures the object body without its outer braces.
+REGEX = r'window\.INITIAL_STATE\s+=\s+\{([\s\S]+)\};'
+
+# Captures the whole object literal, braces included, that is assigned to window.INITIAL_STATE.
+INITIAL_STATE_PATTERN = re.compile(r'window\.INITIAL_STATE\s+=\s+({[\s\S]+});')
+
+
+class ThesaurusSpider(scrapy.Spider):
+    """
+    Reads the window.INITIAL_STATE object embedded in a Thesaurus.com page's script tags.
+    """
+    name = 'thesaurus'
+    start_urls = ['https://www.thesaurus.com/browse/deny']
+    download_delay = 1
+
+    custom_settings = {
+        'FEED_EXPORT_ENCODING': 'utf-8'
+    }
+
+    def parse(self, response: Response, **kwargs):
+        """
+        Decode the page's INITIAL_STATE object, write it to example.json and print its synonyms.
+
+        Script tags that mention INITIAL_STATE but hold no parsable assignment are skipped.
+
+        :param response: The page response whose script tags are searched.
+        :param kwargs: Accepted but unused.
+        """
+        for script in response.xpath('//script/text()').getall():
+            # Look for the specific script tag we want
+            if 'INITIAL_STATE' not in script:
+                continue
+
+            # Extract the interesting part from the script tag. search() rather than match(), because
+            # the assignment does not have to be the very first thing in the script text.
+            m = INITIAL_STATE_PATTERN.search(script)
+            if m is None:
+                continue
+
+            # Decode it properly, handling annoying unicode escapes and nonsense from the site renderer
+            custom_demjson = CustomJSON(json_options=demjson.json_options(compactly=False))
+            decoded = custom_demjson.decode(m.group(1), encoding='unicode-escape')
+
+            # Write a proper valid JSON file out
+            with open('example.json', 'w', encoding='utf-8') as file:
+                file.write(custom_demjson.encode(decoded))
+
+            data = decoded['searchData']['relatedWordsApiData']['data'][0]
+            # NOTE(review): Word defines no `similarity` attribute yet, so this sort key needs confirming.
+            synonyms = sorted((Word.from_raw(word) for word in data['synonyms']), key=lambda word: word.similarity)
+            print({'synonyms': synonyms})
diff --git a/scraper/word.py b/scraper/word.py
new file mode 100644
index 0000000..8f26064
--- /dev/null
+++ b/scraper/word.py
@@ -0,0 +1,94 @@
+from typing import List, Optional
+
+
+class Word(object):
+    """
+    Describes a word on Thesaurus.com uniquely identified by its slug/URL.
+    """
+    def __init__(self, entry: str, inflections: Optional[List[str]] = None, variants: Optional[List['WordVariant']] = None,
+                 pronunciation: Optional['Pronunciation'] = None, examples: Optional[List['ExampleSentence']] = None):
+        """
+        Store the given fields; list arguments left as None (or empty) become fresh empty lists.
+        """
+        self.entry = entry
+        self.inflections = inflections or []
+        self.variants = variants or []
+        self.pronunciation = pronunciation
+        self.examples = examples or []
+
+    @classmethod
+    def from_raw(cls, data: dict) -> 'Word':
+        """
+        Build an instance from a raw dictionary. Not implemented yet.
+
+        :raises NotImplementedError: Always, instead of silently returning None.
+        """
+        raise NotImplementedError('{}.from_raw is not implemented yet'.format(cls.__name__))
+
+    def __repr__(self) -> str:
+        """Return a string naming the class and its entry."""
+        return '{}({!r})'.format(type(self).__name__, self.entry)
+
+
+class WordVariant(object):
+    """
+    Placeholder for a word variant; it holds no data yet.
+    """
+    def __init__(self) -> None:
+        pass
+
+    @classmethod
+    def from_raw(cls, data: dict) -> 'WordVariant':
+        """
+        Build an instance from a raw dictionary. Not implemented yet.
+
+        :raises NotImplementedError: Always, instead of silently returning None.
+        """
+        raise NotImplementedError('{}.from_raw is not implemented yet'.format(cls.__name__))
+
+    def __repr__(self) -> str:
+        """Return a string naming the class; there are no fields to show."""
+        return '{}()'.format(type(self).__name__)
+
+
+class Pronunciation(object):
+    """
+    Placeholder for a pronunciation; the constructor arguments are accepted but not stored yet.
+    """
+    def __init__(self, term: str, definition: str, pos: str, synonyms: List[str], antonyms: List[str], informal: Optional[int],
+                 vulgar: int, note: Optional[str], id: int) -> None:
+        pass
+
+    @classmethod
+    def from_raw(cls, data: dict) -> 'Pronunciation':
+        """
+        Build an instance from a raw dictionary. Not implemented yet.
+
+        :raises NotImplementedError: Always, instead of silently returning None.
+        """
+        raise NotImplementedError('{}.from_raw is not implemented yet'.format(cls.__name__))
+
+    def __repr__(self) -> str:
+        """Return a string naming the class; no fields are stored to show."""
+        return '{}()'.format(type(self).__name__)
+
+
+class ExampleSentence(object):
+    """
+    Placeholder for an example sentence; it holds no data yet.
+    """
+    def __init__(self) -> None:
+        pass
+
+    @classmethod
+    def from_raw(cls, data: dict) -> 'ExampleSentence':
+        """
+        Build an instance from a raw dictionary. Not implemented yet.
+
+        :raises NotImplementedError: Always, instead of silently returning None.
+        """
+        raise NotImplementedError('{}.from_raw is not implemented yet'.format(cls.__name__))
+
+    def __repr__(self) -> str:
+        """Return a string naming the class; there are no fields to show."""
+        return '{}()'.format(type(self).__name__)