commit df044c6: sanitize web urls

import logging
from pathlib import Path
from urllib.parse import urlparse

import scrapy

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR)


def extract_domain(url):
    """
    Extract the domain (including subdomains) from a given URL.

    Args:
    - url (str): The URL from which the domain needs to be extracted.

    Returns:
    - str: The domain (with subdomains) extracted from the URL.
      For example, 'www.example.com' for the URL 'https://www.example.com/path/to/something'.
    """
    parsed_uri = urlparse(url)
    # The netloc attribute contains the domain name
    domain = parsed_uri.netloc
    return domain


def sanitize_url(url: str) -> str:
    """Prepend https:// and append a trailing slash if either is missing."""
    if not url.startswith("https://"):
        url = "https://" + url
    if not url.endswith("/"):
        url = url + "/"
    return url
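
# Illustrative examples (not from the original source) of the two helpers above:
#   sanitize_url("example.com")                    -> "https://example.com/"
#   sanitize_url("https://example.com")            -> "https://example.com/"
#   extract_domain("https://www.example.com/path") -> "www.example.com"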


class DocsSpider(scrapy.Spider):
    name = "docs"

    def __init__(
        self,
        homepage_url: str,
        save_dir="outputs/",
        target_version=None,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        homepage_url = sanitize_url(homepage_url)
        self.allowed_domains = [extract_domain(homepage_url)]
        self.start_urls = [homepage_url]
        self.base_dir = Path(save_dir)
        self.target_version = target_version

    def parse(self, response):
        parsed_uri = urlparse(response.url)
        # Build the save path from the parsed URL. If the path ends with '/',
        # use 'index.html' as the filename.
        if parsed_uri.path.endswith("/"):
            filepath = (
                self.base_dir
                / parsed_uri.netloc
                / parsed_uri.path.strip("/")
                / "index.html"
            )
        else:
            filepath = self.base_dir / parsed_uri.netloc / parsed_uri.path.strip("/")
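        # For example (illustrative mapping, not from the original source):
        #   https://example.com/docs/       -> outputs/example.com/docs/index.html
        #   https://example.com/docs/a.html -> outputs/example.com/docs/a.html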
        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, "wb") as f:
            f.write(response.body)

        # Follow links to other documentation pages, but only those whose full
        # URL contains the target version (when one was specified).
        for href in response.css("a::attr(href)").getall():
            if self.target_version:
                # A version was specified; only follow links whose expanded URL matches it
                full_url = response.urljoin(href)  # Expand href to a full URL
                if self.target_version in full_url:
                    yield response.follow(href, self.parse)
            else:
                # No version specified; follow all links
                yield response.follow(href, self.parse)
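

# Minimal usage sketch (not part of the original commit): run the spider with
# scrapy's CrawlerProcess. The homepage URL below is a hypothetical example.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={"LOG_LEVEL": "ERROR"})
    process.crawl(
        DocsSpider,
        homepage_url="docs.example.com",  # hypothetical docs site
        save_dir="outputs/",
        target_version=None,  # e.g. "v2.0" to restrict the crawl to one version
    )
    process.start()  # blocks until the crawl finishes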