Stefano Fiorucci committed on
Commit
82fe524
1 Parent(s): a251941

crawler refactoring

crawler/README.md ADDED
@@ -0,0 +1,15 @@
+ # Twin Peaks crawler
+
+ This crawler downloads texts and metadata from the [Twin Peaks Fandom Wiki](https://twinpeaks.fandom.com/wiki/Twin_Peaks_Wiki). The output format is JSON. The crawler is based on the combination of [Scrapy](https://github.com/scrapy/scrapy) and [fandom-py](https://github.com/NikolajDanger/fandom-py).
+
+ *Several wiki pages are discarded, since they are not related to the Twin Peaks plot and create noise in the Question Answering index.*
+
+ ## Installation
+ - `pip install -r requirements.txt`
+ - copy this folder (if needed, see [stackoverflow](https://stackoverflow.com/questions/7106012/download-a-single-folder-or-directory-from-a-github-repo))
+
+ ## Usage
+ - (if needed, activate the virtual environment)
+ - `cd tpcrawler`
+ - `scrapy crawl tpcrawler`
+ - you can find the downloaded pages in the `data` subfolder
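
The combination described in the new README (Scrapy discovers the wiki pages and extracts each numeric article ID; fandom-py then fetches the clean text through the Fandom API) can be sketched roughly as follows. This is an illustrative, stripped-down version of the spider touched by this commit, not code from the commit itself; the spider name and the start URL are assumptions.

```python
# Illustrative sketch only -- a minimal Scrapy + fandom-py combination.
# Can be run without a full Scrapy project via: scrapy runspider minimal_tpcrawler.py
import re

import fandom
import scrapy

fandom.set_wiki("Twinpeaks")
article_id_pattern = "wgArticleId\"\:([0-9]+)"


class MinimalTpcrawler(scrapy.Spider):
    name = 'minimal_tpcrawler'
    # assumed entry point: the MediaWiki "all pages" listing of the wiki
    start_urls = ['https://twinpeaks.fandom.com/wiki/Special:AllPages']

    def parse(self, response):
        # follow every article link found on the listing page
        for href in response.xpath("//ul[@class='mw-allpages-chunk']/li/a/@href").extract():
            yield scrapy.Request(url=response.urljoin(href), callback=self.parse_page)

    def parse_page(self, response):
        # Scrapy only supplies the raw HTML; the numeric article ID embedded in it
        # is what fandom-py needs in order to return the clean plain text of the page
        article_id = int(re.findall(article_id_pattern, response.text)[0])
        page = fandom.page(pageid=article_id)
        yield {'url': response.url, 'text': page.plain_text}
```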
crawler/data/.gitkeep ADDED
File without changes
crawler/requirements.txt CHANGED
@@ -1,2 +1,2 @@
  fandom-py==0.2.1
- Scrapy==2.5.1
+ Scrapy==2.6.1
crawler/tpcrawler/tpcrawler/spiders/tpcrawler.py CHANGED
@@ -1,14 +1,11 @@
  import scrapy
- from scrapy.utils.response import open_in_browser
  from scrapy.http import TextResponse
  import re
  import fandom
  import json
 
- fandom.set_wiki("Twinpeaks")
-
- article_id_pattern="wgArticleId\"\:([0-9]+)"
- categories_xpath="//div[@class='page-header__categories']/a//text()"
+ # Categories unrelated to Twin Peaks plot
+ # (they make noise in the index)
  excluded_categories=set("""Twin Peaks (2017) crew
  Actors
  Camera and electrical department
@@ -34,7 +31,13 @@ Decades
  Days
  Production timeline""".split("\n"))
 
- #print(excluded_categories)
+ fandom.set_wiki("Twinpeaks")
+
+ article_id_pattern = "wgArticleId\"\:([0-9]+)"
+ categories_xpath = "//div[@class='page-header__categories']/a//text()"
+ wiki_page_href_xpath = "//ul[@class='mw-allpages-chunk']/li/a[not(contains(@class, 'redirect'))]/@href"
+ next_page_href_xpath = "//div[@class='mw-allpages-nav']/a[contains(.,'Next page')]/@href"
+
 
  class Tpcrawler(scrapy.Spider):
      name = 'tpcrawler'
@@ -43,40 +46,39 @@ class Tpcrawler(scrapy.Spider):
 
 
      def parse(self, response):
-         #open_in_browser(response)
-
-         hrefs = response.xpath("//ul[@class='mw-allpages-chunk']/li/a[not(contains(@class, 'redirect'))]/@href").extract()
+         """Start from wiki "all pages" list and open them"""
+         hrefs = response.xpath(wiki_page_href_xpath).extract()
          for href in hrefs:
-             yield scrapy.Request(url=response.urljoin(href), callback=self.parse_page, dont_filter=True,
-                                  meta={'name':href.rpartition('/')[-1],
-                                        'url':response.urljoin(href)})
-
-         next_page = response.xpath("//div[@class='mw-allpages-nav']/a[contains(.,'Next page')]/@href").extract_first()
+             yield scrapy.Request(url=response.urljoin(href),
+                                  callback=self.parse_page, dont_filter=True,
+                                  meta={'name':href.rpartition('/')[-1],
+                                        'url':response.urljoin(href)})
 
+         next_page = response.xpath(next_page_href_xpath).extract_first()
          if next_page:
-             yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse, dont_filter=True)
+             yield scrapy.Request(url=response.urljoin(next_page),
+                                  callback=self.parse, dont_filter=True)
 
      def parse_page(self, response: TextResponse):
+         """
+         Collect all interesting pages IDs
+         and use the Fandom API to crawl them.
+         Save the output in JSON format.
+         """
+
          categories = set(response.xpath(categories_xpath).extract())
-         #print(categories)
+
+         # the wiki page is interesting only if related to plot
+         # (= not contained in excluded categories)
          if len(categories.intersection(excluded_categories))==0:
              name = response.meta['name']
             url = response.meta['url']
             article_id = int(re.findall(article_id_pattern, response.text)[0])
 
-             # once the ID is found, use the fandom API to retrieve only the entry text
-             page = fandom.page(pageid = article_id)
-
-             text = page.plain_text.split('\nAppearances\n')[0].split('\nReferences\n')[0]
-
+             # once the ID is found, use the Fandom API to retrieve the clean page text
+             page = fandom.page(pageid = article_id)
+             text = page.plain_text.split('\nAppearances\n')[0]\
+                 .split('\nReferences\n')[0]
              json_content={'name': name, 'url':url, 'text':text}
-
             with open(f'./data/{name}.json','w', encoding='utf-8') as fout:
-                 json.dump(json_content, fout)
-
-
-
-
-
-
-
+                 json.dump(json_content, fout)
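
Each crawled page ends up as one JSON file in `data/`, with the same keys as the `json_content` dict written by `parse_page` above. One possible way a downstream step (for example, the Question Answering index mentioned in the README) might read these files is sketched below; the `content`/`meta` layout is an assumption for illustration, not something defined in this commit.

```python
# Sketch of consuming the crawler output (not part of this commit).
import json
from pathlib import Path

docs = []
for path in Path("data").glob("*.json"):
    with path.open(encoding="utf-8") as f:
        record = json.load(f)  # {'name': ..., 'url': ..., 'text': ...}
    # hypothetical shape for an indexing step; adapt to the actual QA pipeline
    docs.append({"content": record["text"],
                 "meta": {"name": record["name"], "url": record["url"]}})

print(f"Loaded {len(docs)} wiki pages")
```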