Mirror of https://github.com/Xevion/thesaurus-scraper.git, synced 2025-12-06 11:16:40 -06:00

Working basic spider crawler
TODO: Item generation, data export, option to use WordVariant synonym/antonyms instead of full crawl
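
The TODO's first two points (item generation and data export) map naturally onto Scrapy's built-in mechanisms. A minimal sketch of one way they could be wired up — the ThesaurusItem name and its fields are hypothetical and not from this repo, while FEEDS and FEED_EXPORT_ENCODING are standard Scrapy settings:

import scrapy

# Hypothetical item for the 'item generation' TODO; the fields mirror
# the data the spider already pulls out of searchData.
class ThesaurusItem(scrapy.Item):
    term = scrapy.Field()
    synonyms = scrapy.Field()
    antonyms = scrapy.Field()

# For the 'data export' TODO, Scrapy's feed exports could be enabled in
# custom_settings next to the existing FEED_EXPORT_ENCODING entry:
#     'FEEDS': {'words.json': {'format': 'json'}}
# parse() would then yield ThesaurusItem(...) instances alongside the
# follow-up requests.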
@@ -12,13 +12,17 @@ REGEX = r'window\.INITIAL_STATE\s+=\s+\{([\s\S]+)\};'
 
 class ThesaurusSpider(scrapy.Spider):
     name = 'thesaurus'
-    start_urls = ['https://www.thesaurus.com/browse/deny']
+    start_urls = ['https://www.thesaurus.com/browse/lock%20up']
     download_delay = 1
 
     custom_settings = {
         'FEED_EXPORT_ENCODING': 'utf-8'
     }
 
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.queue = set()
+
     def parse(self, response: Response, **kwargs):
         for script in response.xpath('//script/text()').getall():
             # Look for the specific script tag we want
@@ -31,8 +35,16 @@ class ThesaurusSpider(scrapy.Spider):
             decoded = custom_demjson.decode(m.group(1), encoding='unicode-escape')
 
             # Write a proper valid JSON file out
-            with open('example.json', 'w', encoding='utf-8') as file:
-                file.write(custom_demjson.encode(decoded))
+            # with open('example.json', 'w', encoding='utf-8') as file:
+            #     file.write(custom_demjson.encode(decoded))
 
-            data = decoded['searchData']['relatedWordsApiData']['data'][0]
-            print({'synonyms': sorted([Word.from_raw(word) for word in data['synonyms']], key=lambda word: word.similarity)})
+            raw_data = decoded['searchData']
+            word = Word.from_raw(data=raw_data)
+
+            urls = word.get_urls()
+            new = urls - self.queue
+            self.queue.update(new)
+
+            if len(new) > 0:
+                print(f'Found {len(new)} more URLs.')
+            return response.follow_all(new)
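
parse() leans on a Word helper (Word.from_raw, word.get_urls) that the diff never shows; it presumably lives in another module of the repo. A rough sketch of what it might look like, assuming the relatedWordsApiData layout the removed lines read from — the field names here ('term', 'entry', 'synonyms', 'antonyms') are guesses, not confirmed by the source:

from dataclasses import dataclass, field
from urllib.parse import quote


@dataclass
class Word:
    term: str
    synonyms: list = field(default_factory=list)
    antonyms: list = field(default_factory=list)

    @classmethod
    def from_raw(cls, data: dict) -> 'Word':
        # Same path the removed print statement used:
        # searchData -> relatedWordsApiData -> data[0]
        entry = data['relatedWordsApiData']['data'][0]
        return cls(
            term=entry.get('entry', ''),
            synonyms=[s['term'] for s in entry.get('synonyms', [])],
            antonyms=[a['term'] for a in entry.get('antonyms', [])],
        )

    def get_urls(self) -> set:
        # Returns a set so parse() can diff it against self.queue;
        # quote() percent-encodes multi-word terms, matching the
        # 'lock up' -> 'lock%20up' form in start_urls
        return {
            f'https://www.thesaurus.com/browse/{quote(term)}'
            for term in self.synonyms + self.antonyms
        }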