import json
import re

import fandom
import scrapy
from scrapy.http import TextResponse

# Categories unrelated to Twin Peaks plot
# (they make noise in the index)
excluded_categories = set("""Twin Peaks (2017) crew
Actors
Camera and electrical department
Casting department
Catering department
Costume department
Directors
Editors
Location department
Makeup department
Medics
Music department
Producers
Production associates
Special and visual effects department
Stand-ins
Story editors
Stunt department
Transportation department
Writers
Years
Decades
Days
Production timeline""".split("\n"))
fandom.set_wiki("Twinpeaks")
article_id_pattern = "wgArticleId\"\:([0-9]+)"
categories_xpath = "//div[@class='page-header__categories']/a//text()"
wiki_page_href_xpath = "//ul[@class='mw-allpages-chunk']/li/a[not(contains(@class, 'redirect'))]/@href"
next_page_href_xpath = "//div[@class='mw-allpages-nav']/a[contains(.,'Next page')]/@href"


class Tpcrawler(scrapy.Spider):
name = 'tpcrawler'
    allowed_domains = ['twinpeaks.fandom.com']
start_urls = ['https://twinpeaks.fandom.com/wiki/Special:AllPages']
def parse(self, response):
"""Start from wiki "all pages" list and open them"""
hrefs = response.xpath(wiki_page_href_xpath).extract()
for href in hrefs:
yield scrapy.Request(url=response.urljoin(href),
callback=self.parse_page, dont_filter=True,
meta={'name':href.rpartition('/')[-1],
'url':response.urljoin(href)})
next_page = response.xpath(next_page_href_xpath).extract_first()
if next_page:
yield scrapy.Request(url=response.urljoin(next_page),
callback=self.parse, dont_filter=True)
def parse_page(self, response: TextResponse):
"""
        Collect the IDs of all interesting pages
and use the Fandom API to crawl them.
Save the output in JSON format.
"""
categories = set(response.xpath(categories_xpath).extract())
# the wiki page is interesting only if related to plot
# (= not contained in excluded categories)
        if categories.isdisjoint(excluded_categories):
name = response.meta['name']
url = response.meta['url']
article_id = int(re.findall(article_id_pattern, response.text)[0])
# once the ID is found, use the Fandom API to retrieve the clean page text
            page = fandom.page(pageid=article_id)
text = page.plain_text.split('\nAppearances\n')[0]\
.split('\nReferences\n')[0]
            json_content = {'name': name, 'url': url, 'text': text}
            with open(f'./data/{name}.json', 'w', encoding='utf-8') as fout:
json.dump(json_content, fout)
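

# A minimal sketch for running this spider outside a full Scrapy project
# (assumes scrapy is installed and the ./data output directory exists):
#
#   from scrapy.crawler import CrawlerProcess
#   process = CrawlerProcess()
#   process.crawl(Tpcrawler)
#   process.start()
#
# Inside a Scrapy project, the usual `scrapy crawl tpcrawler` works as well.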