import json
import re

import fandom
import scrapy
from scrapy.http import TextResponse

# Categories unrelated to the Twin Peaks plot
# (they only add noise to the index)
excluded_categories = set("""Twin Peaks (2017) crew
Actors
Camera and electrical department
Casting department
Catering department
Costume department
Directors
Editors
Location department
Makeup department
Medics
Music department
Producers
Production associates
Special and visual effects department
Stand-ins
Story editors
Stunt department
Transportation department
Writers
Years
Decades
Days
Production timeline""".split("\n"))

fandom.set_wiki("Twinpeaks")

# Regex for the article ID embedded in the page source, plus the XPaths used
# to navigate the "Special:AllPages" index and read each page's categories.
article_id_pattern = r'wgArticleId":([0-9]+)'
categories_xpath = "//div[@class='page-header__categories']/a//text()"
wiki_page_href_xpath = "//ul[@class='mw-allpages-chunk']/li/a[not(contains(@class, 'redirect'))]/@href"
next_page_href_xpath = "//div[@class='mw-allpages-nav']/a[contains(.,'Next page')]/@href"


class Tpcrawler(scrapy.Spider):
    name = 'tpcrawler'
    allowed_domains = ['twinpeaks.fandom.com']
    start_urls = ['https://twinpeaks.fandom.com/wiki/Special:AllPages']

    def parse(self, response):
        """Start from the wiki's "all pages" list and open every entry."""
        hrefs = response.xpath(wiki_page_href_xpath).extract()
        for href in hrefs:
            yield scrapy.Request(url=response.urljoin(href),
                                 callback=self.parse_page,
                                 dont_filter=True,
                                 meta={'name': href.rpartition('/')[-1],
                                       'url': response.urljoin(href)})

        # Follow the "Next page" link until the index is exhausted.
        next_page = response.xpath(next_page_href_xpath).extract_first()
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page),
                                 callback=self.parse,
                                 dont_filter=True)

    def parse_page(self, response: TextResponse):
        """
        Collect the IDs of all interesting pages, crawl them through the
        Fandom API, and save the output in JSON format.
        """
        categories = set(response.xpath(categories_xpath).extract())
        # A wiki page is interesting only if it is related to the plot,
        # i.e. none of its categories appear in excluded_categories.
        if len(categories.intersection(excluded_categories)) == 0:
            name = response.meta['name']
            url = response.meta['url']
            article_id = int(re.findall(article_id_pattern, response.text)[0])
            # Once the ID is found, use the Fandom API to retrieve the clean
            # page text, dropping the "Appearances" and "References" sections.
            page = fandom.page(pageid=article_id)
            text = page.plain_text.split('\nAppearances\n')[0] \
                                  .split('\nReferences\n')[0]
            json_content = {'name': name, 'url': url, 'text': text}
            with open(f'./data/{name}.json', 'w', encoding='utf-8') as fout:
                json.dump(json_content, fout)
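

# --- Optional runner (a sketch, not part of the original script) -------------
# A minimal way to run the spider as a standalone script; the original may
# instead have been launched with `scrapy crawl tpcrawler` inside a Scrapy
# project or with `scrapy runspider tpcrawler.py`. The ./data directory is
# created here because parse_page() writes one JSON file per article into it.
if __name__ == "__main__":
    import os
    from scrapy.crawler import CrawlerProcess

    os.makedirs("./data", exist_ok=True)
    process = CrawlerProcess()
    process.crawl(Tpcrawler)
    process.start()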