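"""Scrapy spider that downloads a documentation site to disk.

Starting from a homepage URL, it saves each crawled page under
save_dir/<domain>/<path> and follows links within the same domain,
optionally only those whose URL contains a target version string.
"""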
import logging
from pathlib import Path
from urllib.parse import urlparse

import scrapy

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR)

def extract_domain(url):
    """
    Extract the domain (including subdomains) from a given URL.

    Args:
    - url (str): The URL from which the domain needs to be extracted.

    Returns:
    - str: The domain (with subdomains) extracted from the URL.
      For example, 'www.example.com' for the URL 'https://www.example.com/path/to/something'.
    """
    parsed_uri = urlparse(url)
    # The netloc attribute contains the domain name
    domain = parsed_uri.netloc
    return domain


def sanitize_url(url: str) -> str:
    """Add https:// and a trailing slash if they are missing."""
    if not url.startswith("https://"):
        url = "https://" + url
    if not url.endswith("/"):
        url = url + "/"
    return url


class DocsSpider(scrapy.Spider):
    name = "docs"

    def __init__(
        self,
        homepage_url: str,
        save_dir="outputs/",
        target_version=None,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        homepage_url = sanitize_url(homepage_url)
        self.allowed_domains = [extract_domain(homepage_url)]
        self.start_urls = [homepage_url]
        self.base_dir = Path(save_dir)
        self.target_version = target_version

    def parse(self, response):
        parsed_uri = urlparse(response.url)
        # Build the output path from the URL. If the path ends with '/', use 'index.html' as the filename.
        if parsed_uri.path.endswith("/"):
            filepath = (
                self.base_dir
                / parsed_uri.netloc
                / parsed_uri.path.strip("/")
                / "index.html"
            )
        else:
            filepath = self.base_dir / parsed_uri.netloc / parsed_uri.path.strip("/")
        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, "wb") as f:
            f.write(response.body)

        # Follow links to other documentation pages, optionally only those whose
        # full URL contains the target version.
        for href in response.css("a::attr(href)").getall():
            if self.target_version:
                # A version was specified: only follow the link if the expanded URL contains it.
                full_url = response.urljoin(href)
                if self.target_version in full_url:
                    yield response.follow(href, self.parse)
            else:
                # No version specified, follow all links.
                yield response.follow(href, self.parse)
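

# A minimal sketch of how this spider could be run programmatically, assuming Scrapy
# is installed and this file is executed directly. The homepage URL, save_dir, and
# target_version values below are illustrative, not taken from the original project.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={"LOG_LEVEL": "ERROR"})
    process.crawl(
        DocsSpider,
        homepage_url="docs.example.com",  # sanitize_url() prepends https:// and appends a trailing slash
        save_dir="outputs/",
        target_version=None,  # e.g. "2.0" to only follow links whose URL contains "2.0"
    )
    process.start()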
