commit df044c6: sanitize web urls

import logging
from pathlib import Path
from urllib.parse import urlparse

import scrapy

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR)


def extract_domain(url):
    """
    Extract the domain (including subdomains) from a given URL.

    Args:
    - url (str): The URL from which the domain needs to be extracted.

    Returns:
    - str: The domain (with subdomains) extracted from the URL.
      For example, 'www.example.com' for the URL 'https://www.example.com/path/to/something'.
    """
    parsed_uri = urlparse(url)
    # The netloc attribute contains the domain name
    domain = parsed_uri.netloc
    return domain


def sanitize_url(url: str) -> str:
    """Prepend https:// and append a trailing slash if either is missing."""
    if not url.startswith("https://"):
        url = "https://" + url
    if not url.endswith("/"):
        url = url + "/"
    return url
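
# Illustrative examples (not from the original source) of the two helpers above:
#   sanitize_url("example.com")                    -> "https://example.com/"
#   sanitize_url("https://example.com")            -> "https://example.com/"
#   extract_domain("https://www.example.com/path") -> "www.example.com"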


class DocsSpider(scrapy.Spider):
    name = "docs"

    def __init__(
        self,
        homepage_url: str,
        save_dir="outputs/",
        target_version=None,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        homepage_url = sanitize_url(homepage_url)
        self.allowed_domains = [extract_domain(homepage_url)]
        self.start_urls = [homepage_url]
        self.base_dir = Path(save_dir)
        self.target_version = target_version

    def parse(self, response):
        parsed_uri = urlparse(response.url)
        # Build the save path from the parsed URL. If the path ends with '/',
        # use 'index.html' as the filename.
        if parsed_uri.path.endswith("/"):
            filepath = (
                self.base_dir
                / parsed_uri.netloc
                / parsed_uri.path.strip("/")
                / "index.html"
            )
        else:
            filepath = self.base_dir / parsed_uri.netloc / parsed_uri.path.strip("/")
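        # For example (illustrative mapping, not from the original source):
        #   https://example.com/docs/       -> outputs/example.com/docs/index.html
        #   https://example.com/docs/a.html -> outputs/example.com/docs/a.html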
        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, "wb") as f:
            f.write(response.body)

        # Follow links to other documentation pages, but only those whose full
        # URL contains the target version (when one was specified).
        for href in response.css("a::attr(href)").getall():
            if self.target_version:
                # A version was specified; only follow links whose expanded URL matches it
                full_url = response.urljoin(href)  # Expand href to a full URL
                if self.target_version in full_url:
                    yield response.follow(href, self.parse)
            else:
                # No version specified; follow all links
                yield response.follow(href, self.parse)
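

# Minimal usage sketch (not part of the original commit): run the spider with
# scrapy's CrawlerProcess. The homepage URL below is a hypothetical example.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={"LOG_LEVEL": "ERROR"})
    process.crawl(
        DocsSpider,
        homepage_url="docs.example.com",  # hypothetical docs site
        save_dir="outputs/",
        target_version=None,  # e.g. "v2.0" to restrict the crawl to one version
    )
    process.start()  # blocks until the crawl finishes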