ltu-chat / progremme-spider.py
Stepan
Init
4717959
from bs4 import BeautifulSoup, Comment
import scrapy, re
from boilerpy3 import extractors
class FullHTMLSpider(scrapy.spiders.SitemapSpider):
    """Sitemap-driven spider for English programme pages on www.ltu.se.

    Sitemaps are discovered through the site's ``robots.txt``. Only URLs
    ending in ``/en/education/programme/<slug>`` are routed to ``parse``,
    which emits the page URL together with the text extracted by
    boilerpy3's ``ArticleExtractor``.
    """

    name = "ltu_programme_crawler"
    # allowed_domains = ["www.ltu.se"]  (domain restriction left disabled)

    # Entry point for sitemap discovery.
    sitemap_urls = ["https://www.ltu.se/robots.txt"]

    # A single rule: programme detail pages are handled by ``parse``.
    sitemap_rules = [
        (re.compile(r'\/en\/education\/programme\/[a-zA-Z0-9-]*$'), 'parse'),
    ]

    # Substrings that disqualify a page when they occur anywhere in its URL.
    exclude_patterns = [
        "sdhog-continuing-part-of-study-programme-non-freshmen-120-300-credits",
        "international-orchestra-academy",
    ]

    def parse(self, response):
        """Turn one programme page into an item, or skip it.

        Args:
            response: The downloaded page; its ``url`` and ``text``
                attributes are read.

        Returns:
            A dict with keys ``'url'`` and ``'content'``, or ``None`` when
            the URL contains an excluded pattern or the markup contains
            the literal text ``Discontinued.``.
        """
        page_url = response.url

        # Guard 1: URL-based exclusion list.
        for excluded in self.exclude_patterns:
            if excluded in page_url:
                self.logger.info(
                    "Skipping page due to excluded URL pattern: %s", page_url
                )
                return None

        # Guard 2: pages whose markup contains the "Discontinued." marker.
        markup = response.text
        if "Discontinued." in markup:
            self.logger.info(
                "Skipping page (contains 'Discontinued.'): %s", page_url
            )
            return None

        # Hand the raw HTML to boilerpy3 and keep only the extracted text.
        article_text = extractors.ArticleExtractor().get_content(markup)
        return {'url': page_url, 'content': article_text}