Mirror of https://github.com/Xevion/thesaurus-scraper.git, synced 2025-12-06 11:16:40 -06:00

Working basic spider crawler
TODO: Item generation, data export, option to use WordVariant synonym/antonyms instead of full crawl
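
The TODO's first two points (item generation and data export) map naturally onto Scrapy's built-in mechanisms. A minimal sketch of one way they could be wired up — the ThesaurusItem name and its fields are hypothetical and not from this repo, while FEEDS and FEED_EXPORT_ENCODING are standard Scrapy settings:

import scrapy

# Hypothetical item for the 'item generation' TODO; the fields mirror
# the data the spider already pulls out of searchData.
class ThesaurusItem(scrapy.Item):
    term = scrapy.Field()
    synonyms = scrapy.Field()
    antonyms = scrapy.Field()

# For the 'data export' TODO, Scrapy's feed exports could be enabled in
# custom_settings next to the existing FEED_EXPORT_ENCODING entry:
#     'FEEDS': {'words.json': {'format': 'json'}}
# parse() would then yield ThesaurusItem(...) instances alongside the
# follow-up requests.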
@@ -12,13 +12,17 @@ REGEX = r'window\.INITIAL_STATE\s+=\s+\{([\s\S]+)\};'
 
 class ThesaurusSpider(scrapy.Spider):
     name = 'thesaurus'
-    start_urls = ['https://www.thesaurus.com/browse/deny']
+    start_urls = ['https://www.thesaurus.com/browse/lock%20up']
     download_delay = 1
 
     custom_settings = {
         'FEED_EXPORT_ENCODING': 'utf-8'
     }
 
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.queue = set()
+
     def parse(self, response: Response, **kwargs):
         for script in response.xpath('//script/text()').getall():
             # Look for the specific script tag we want
@@ -31,8 +35,16 @@ class ThesaurusSpider(scrapy.Spider):
             decoded = custom_demjson.decode(m.group(1), encoding='unicode-escape')
 
             # Write a proper valid JSON file out
-            with open('example.json', 'w', encoding='utf-8') as file:
-                file.write(custom_demjson.encode(decoded))
+            # with open('example.json', 'w', encoding='utf-8') as file:
+            #     file.write(custom_demjson.encode(decoded))
 
-            data = decoded['searchData']['relatedWordsApiData']['data'][0]
-            print({'synonyms': sorted([Word.from_raw(word) for word in data['synonyms']], key=lambda word: word.similarity)})
+            raw_data = decoded['searchData']
+            word = Word.from_raw(data=raw_data)
+
+            urls = word.get_urls()
+            new = urls - self.queue
+            self.queue.update(new)
+
+            if len(new) > 0:
+                print(f'Found {len(new)} more URLs.')
+            return response.follow_all(new)
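
parse() leans on a Word helper (Word.from_raw, word.get_urls) that the diff never shows; it presumably lives in another module of the repo. A rough sketch of what it might look like, assuming the relatedWordsApiData layout the removed lines read from — the field names here ('term', 'entry', 'synonyms', 'antonyms') are guesses, not confirmed by the source:

from dataclasses import dataclass, field
from urllib.parse import quote


@dataclass
class Word:
    term: str
    synonyms: list = field(default_factory=list)
    antonyms: list = field(default_factory=list)

    @classmethod
    def from_raw(cls, data: dict) -> 'Word':
        # Same path the removed print statement used:
        # searchData -> relatedWordsApiData -> data[0]
        entry = data['relatedWordsApiData']['data'][0]
        return cls(
            term=entry.get('entry', ''),
            synonyms=[s['term'] for s in entry.get('synonyms', [])],
            antonyms=[a['term'] for a in entry.get('antonyms', [])],
        )

    def get_urls(self) -> set:
        # Returns a set so parse() can diff it against self.queue;
        # quote() percent-encodes multi-word terms, matching the
        # 'lock up' -> 'lock%20up' form in start_urls
        return {
            f'https://www.thesaurus.com/browse/{quote(term)}'
            for term in self.synonyms + self.antonyms
        }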