thesaurus.com javascript JSON parsing with demjson, unicode ready

2026-01-31 04:26:17 -06:00 · 2021-03-14 10:02:26 -05:00
parent 1f6f3246ae
commit 696e96e2ad
3 changed files with 106 additions and 0 deletions
@@ -0,0 +1,11 @@
+import demjson
+
+
+class CustomJSON(demjson.JSON):
+    """
+    A simple override for the demjson.JSON class to map all instances of undefined into 'null'.
+    """
+
+    def encode_undefined(self, state):
+        """
+        Emit 'null' in place of an undefined value.
+
+        :param state: output accumulator; the literal 'null' is appended to it. Presumably the encode state
+            object demjson hands to its encode_* hooks -- TODO confirm against the demjson documentation.
+        """
+        state.append('null')
@@ -0,0 +1,38 @@
+import re
+
+import demjson
+import scrapy
+from scrapy.http import Response
+
+from scraper.helpers import CustomJSON
+from scraper.word import Word
+
+REGEX = r'window\.INITIAL_STATE\s+=\s+\{([\s\S]+)\};'
+
+
+class ThesaurusSpider(scrapy.Spider):
+    name = 'thesaurus'
+    start_urls = ['https://www.thesaurus.com/browse/deny']
+    download_delay = 1
+
+    custom_settings = {
+        'FEED_EXPORT_ENCODING': 'utf-8'
+    }
+
+    def parse(self, response: Response, **kwargs):
+        for script in response.xpath('//script/text()').getall():
+            # Look for the specific script tag we want
+            if 'INITIAL_STATE' in script:
+                # Extract the interesting part from the script tag
+                m = re.match(r'window\.INITIAL_STATE\s+=\s+({[\s\S]+});', script)
+
+                # Decode it properly, handling annoying unicode escapes and nonsense from the site renderer
+                custom_demjson = CustomJSON(json_options=demjson.json_options(compactly=False))
+                decoded = custom_demjson.decode(m.group(1), encoding='unicode-escape')
+
+                # Write a proper valid JSON file out
+                with open('example.json', 'w', encoding='utf-8') as file:
+                    file.write(custom_demjson.encode(decoded))
+
+                data = decoded['searchData']['relatedWordsApiData']['data'][0]
+                print({'synonyms': sorted([Word.from_raw(word) for word in data['synonyms']], key=lambda word: word.similarity)})
@@ -0,0 +1,57 @@
+from typing import List, Optional
+
+
+class Word(object):
+    """
+    Describes a word on Thesaurus.com uniquely identified by it's slug/URL.
+    """
+    def __init__(self, entry: str, inflections: Optional[List[str]] = None, variants: Optional[List['WordVariant']] = None,
+                 pronunciation: Optional['Pronunciation'] = None, examples: Optional[List['ExampleSentence']] = None):
+        self.entry = entry
+        self.inflections = inflections or []
+        self.variants = variants or []
+        self.pronunciation = pronunciation
+        self.examples = examples or []
+
+    @classmethod
+    def from_raw(cls, data: dict) -> 'Word':
+        pass
+
+    def __repr__(self) -> str:
+        pass
+
+
+class WordVariant(object):
+    def __init__(self) -> None:
+        pass
+
+    @classmethod
+    def from_raw(cls, data: dict) -> 'WordVariant':
+        pass
+
+    def __repr__(self) -> str:
+        pass
+
+
+class Pronunciation(object):
+    def __init__(self, term: str, definition: str, pos: str, synonyms: List[str], antonyms: List[str], informal: Optional[int],
+                 vulgar: int, note: Optional[str], id: int) -> None:
+        pass
+
+    @classmethod
+    def from_raw(cls, data: dict) -> 'Pronunciation':
+        pass
+
+    def __repr__(self) -> str:
+        pass
+
+class ExampleSentence(object):
+    def __init__(self) -> None:
+        pass
+
+    @classmethod
+    def from_raw(cls, data: dict) -> 'ExampleSentence':
+        pass
+
+    def __repr__(self) -> str:
+        pass