import logging
from pathlib import Path
from urllib.parse import urlparse

import scrapy

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR)


def extract_domain(url: str) -> str:
    """
    Extract the domain (including subdomains) from a given URL.

    Args:
        url (str): The URL from which the domain needs to be extracted.

    Returns:
        str: The domain (with subdomains) extracted from the URL. For example,
        'www.example.com' for the URL 'https://www.example.com/path/to/something'.
    """
    parsed_uri = urlparse(url)
    # The netloc attribute contains the domain name.
    return parsed_uri.netloc


def sanitize_url(url: str) -> str:
    """Add an https:// scheme and a trailing slash if they are missing."""
    if not url.startswith("https://"):
        url = "https://" + url
    if not url.endswith("/"):
        url = url + "/"
    return url


class DocsSpider(scrapy.Spider):
    name = "docs"

    def __init__(
        self,
        homepage_url: str,
        save_dir="outputs/",
        target_version=None,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        homepage_url = sanitize_url(homepage_url)
        self.allowed_domains = [extract_domain(homepage_url)]
        self.start_urls = [homepage_url]
        self.base_dir = Path(save_dir)
        self.target_version = target_version

    def parse(self, response):
        parsed_uri = urlparse(response.url)
        # Build a file path from the parsed URL. If the path ends with '/',
        # add 'index.html' as the filename.
        if parsed_uri.path.endswith("/"):
            filepath = (
                self.base_dir
                / parsed_uri.netloc
                / parsed_uri.path.strip("/")
                / "index.html"
            )
        else:
            filepath = self.base_dir / parsed_uri.netloc / parsed_uri.path.strip("/")
        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, "wb") as f:
            f.write(response.body)

        # Follow links to other documentation pages, but only if the full URL
        # contains the target version (when one was specified).
        for href in response.css("a::attr(href)").getall():
            if self.target_version:
                # A version was specified; check whether the URL matches it.
                full_url = response.urljoin(href)  # Expand href to a full URL.
                if self.target_version in full_url:
                    yield response.follow(href, self.parse)
            else:
                # No version specified; follow all links.
                yield response.follow(href, self.parse)
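
# --- Example usage ---
# A minimal sketch of running the spider programmatically via Scrapy's
# CrawlerProcess, assuming this module is executed directly. The values
# 'docs.example.com' and '2.0' are placeholders, not part of the spider
# itself. The spider could equally be run from the Scrapy CLI, e.g.:
#   scrapy runspider docs_spider.py -a homepage_url=docs.example.com
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    # Keyword arguments here are forwarded to DocsSpider.__init__.
    process.crawl(
        DocsSpider,
        homepage_url="docs.example.com",
        save_dir="outputs/",
        target_version="2.0",
    )
    process.start()  # Blocks until the crawl finishes.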