mirror of
https://github.com/Xevion/thesaurus-scraper.git
synced 2026-01-31 08:26:15 -06:00
thesaurus.com javascript JSON parsing with demjson, unicode ready
This commit is contained in:
@@ -0,0 +1,11 @@
|
|||||||
|
import demjson
|
||||||
|
|
||||||
|
|
||||||
|
class CustomJSON(demjson.JSON):
    """
    demjson.JSON subclass that writes JavaScript ``undefined`` values out as ``null``.
    """

    def encode_undefined(self, state):
        """Append the literal ``null`` to the encoder state in place of ``undefined``."""
        replacement = 'null'
        state.append(replacement)
|
||||||
@@ -0,0 +1,38 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
import demjson
|
||||||
|
import scrapy
|
||||||
|
from scrapy.http import Response
|
||||||
|
|
||||||
|
from scraper.helpers import CustomJSON
|
||||||
|
from scraper.word import Word
|
||||||
|
|
||||||
|
# Pattern for the `window.INITIAL_STATE = {...};` assignment. Its capture group
# holds only the text BETWEEN the outer braces (the braces themselves are not
# captured).
# NOTE(review): not referenced in the visible code; ThesaurusSpider.parse uses
# its own inline pattern whose group does include the braces -- confirm whether
# this constant is still needed.
REGEX = r'window\.INITIAL_STATE\s+=\s+\{([\s\S]+)\};'
|
||||||
|
|
||||||
|
|
||||||
|
class ThesaurusSpider(scrapy.Spider):
    """
    Spider that extracts the ``window.INITIAL_STATE`` object embedded in a
    Thesaurus.com page, decodes it with demjson and prints the synonyms found.
    """

    name = 'thesaurus'
    start_urls = ['https://www.thesaurus.com/browse/deny']
    download_delay = 1

    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8'
    }

    def parse(self, response: Response, **kwargs):
        """
        Scan every inline ``<script>`` of the response for the INITIAL_STATE blob.

        For each matching script the decoded state is re-encoded into
        ``example.json`` (relative to the working directory) and the synonyms
        of the first related-words entry are printed, sorted by similarity.
        Scripts that mention INITIAL_STATE but do not match the expected
        assignment are logged and skipped.
        """
        for script in response.xpath('//script/text()').getall():
            # Look for the specific script tag we want
            if 'INITIAL_STATE' not in script:
                continue

            # Extract the interesting part from the script tag
            m = re.match(r'window\.INITIAL_STATE\s+=\s+({[\s\S]+});', script)
            if m is None:
                # re.match is anchored at the start of the text, so a script that
                # merely mentions INITIAL_STATE yields no match; skip it rather
                # than fail on m.group().
                self.logger.warning('INITIAL_STATE script did not match the expected format on %s',
                                    response.url)
                continue

            # Decode it properly, handling annoying unicode escapes and nonsense from the site renderer
            custom_demjson = CustomJSON(json_options=demjson.json_options(compactly=False))
            decoded = custom_demjson.decode(m.group(1), encoding='unicode-escape')

            # Write a proper valid JSON file out
            with open('example.json', 'w', encoding='utf-8') as file:
                file.write(custom_demjson.encode(decoded))

            data = decoded['searchData']['relatedWordsApiData']['data'][0]
            # NOTE(review): relies on Word.from_raw returning objects that expose a
            # `similarity` attribute -- confirm against scraper.word.
            print({'synonyms': sorted([Word.from_raw(word) for word in data['synonyms']], key=lambda word: word.similarity)})
|
||||||
@@ -0,0 +1,57 @@
|
|||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
|
||||||
|
class Word(object):
    """
    Describes a word on Thesaurus.com uniquely identified by its slug/URL.
    """

    def __init__(self, entry: str, inflections: Optional[List[str]] = None, variants: Optional[List['WordVariant']] = None,
                 pronunciation: Optional['Pronunciation'] = None, examples: Optional[List['ExampleSentence']] = None):
        """
        Store the word's fields.

        :param entry: The word itself.
        :param inflections: Inflected forms; a missing or empty value becomes a new empty list.
        :param variants: Word variants; a missing or empty value becomes a new empty list.
        :param pronunciation: Pronunciation data, or None.
        :param examples: Example sentences; a missing or empty value becomes a new empty list.
        """
        self.entry = entry
        self.inflections = inflections or []
        self.variants = variants or []
        self.pronunciation = pronunciation
        self.examples = examples or []

    @classmethod
    def from_raw(cls, data: dict) -> 'Word':
        """Build a Word from raw site data. TODO: not implemented yet; currently returns None."""
        pass

    def __repr__(self) -> str:
        """Return a short debug representation naming the entry."""
        # The previous stub returned None, which makes repr() raise TypeError.
        return '{}(entry={!r})'.format(type(self).__name__, self.entry)
|
||||||
|
|
||||||
|
|
||||||
|
class WordVariant(object):
    """Placeholder for a variant of a Word; it carries no data yet."""

    def __init__(self) -> None:
        pass

    @classmethod
    def from_raw(cls, data: dict) -> 'WordVariant':
        """Build a WordVariant from raw site data. TODO: not implemented yet; currently returns None."""
        pass

    def __repr__(self) -> str:
        """Return a debug representation."""
        # The previous stub returned None, which makes repr() raise TypeError.
        return '{}()'.format(type(self).__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class Pronunciation(object):
    """Value object holding the fields passed to its constructor."""

    def __init__(self, term: str, definition: str, pos: str, synonyms: List[str], antonyms: List[str], informal: Optional[int],
                 vulgar: int, note: Optional[str], id: int) -> None:
        """
        Store every argument on the instance under the same name.

        NOTE(review): the parameter set (definition, synonyms, antonyms, ...) looks
        broader than pronunciation data -- confirm the intended schema.
        """
        # Previously all arguments were discarded; keep them so the instance is usable.
        self.term = term
        self.definition = definition
        self.pos = pos
        self.synonyms = synonyms
        self.antonyms = antonyms
        self.informal = informal
        self.vulgar = vulgar
        self.note = note
        self.id = id

    @classmethod
    def from_raw(cls, data: dict) -> 'Pronunciation':
        """Build a Pronunciation from raw site data. TODO: not implemented yet; currently returns None."""
        pass

    def __repr__(self) -> str:
        """Return a short debug representation with the term, part of speech and id."""
        # The previous stub returned None, which makes repr() raise TypeError.
        return '{}(term={!r}, pos={!r}, id={!r})'.format(type(self).__name__, self.term, self.pos, self.id)
|
||||||
|
|
||||||
|
class ExampleSentence(object):
    """Placeholder for an example sentence attached to a Word; it carries no data yet."""

    def __init__(self) -> None:
        pass

    @classmethod
    def from_raw(cls, data: dict) -> 'ExampleSentence':
        """Build an ExampleSentence from raw site data. TODO: not implemented yet; currently returns None."""
        pass

    def __repr__(self) -> str:
        """Return a debug representation."""
        # The previous stub returned None, which makes repr() raise TypeError.
        return '{}()'.format(type(self).__name__)
|
||||||
Reference in New Issue
Block a user