mirror of
https://github.com/Xevion/thesaurus-scraper.git
synced 2025-12-15 02:13:27 -06:00
thesaurus.com javascript JSON parsing with demjson, unicode ready
This commit is contained in:
38
scraper/spiders/spider.py
Normal file
38
scraper/spiders/spider.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import re
|
||||
|
||||
import demjson
|
||||
import scrapy
|
||||
from scrapy.http import Response
|
||||
|
||||
from scraper.helpers import CustomJSON
|
||||
from scraper.word import Word
|
||||
|
||||
REGEX = r'window\.INITIAL_STATE\s+=\s+\{([\s\S]+)\};'
|
||||
|
||||
|
||||
class ThesaurusSpider(scrapy.Spider):
|
||||
name = 'thesaurus'
|
||||
start_urls = ['https://www.thesaurus.com/browse/deny']
|
||||
download_delay = 1
|
||||
|
||||
custom_settings = {
|
||||
'FEED_EXPORT_ENCODING': 'utf-8'
|
||||
}
|
||||
|
||||
def parse(self, response: Response, **kwargs):
|
||||
for script in response.xpath('//script/text()').getall():
|
||||
# Look for the specific script tag we want
|
||||
if 'INITIAL_STATE' in script:
|
||||
# Extract the interesting part from the script tag
|
||||
m = re.match(r'window\.INITIAL_STATE\s+=\s+({[\s\S]+});', script)
|
||||
|
||||
# Decode it properly, handling annoying unicode escapes and nonsense from the site renderer
|
||||
custom_demjson = CustomJSON(json_options=demjson.json_options(compactly=False))
|
||||
decoded = custom_demjson.decode(m.group(1), encoding='unicode-escape')
|
||||
|
||||
# Write a proper valid JSON file out
|
||||
with open('example.json', 'w', encoding='utf-8') as file:
|
||||
file.write(custom_demjson.encode(decoded))
|
||||
|
||||
data = decoded['searchData']['relatedWordsApiData']['data'][0]
|
||||
print({'synonyms': sorted([Word.from_raw(word) for word in data['synonyms']], key=lambda word: word.similarity)})
|
||||
Reference in New Issue
Block a user