import logging
from pathlib import Path
from urllib.parse import urlparse

import scrapy

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR)


def extract_domain(url):
    """
    Extract the domain (including subdomains) from a given URL.

    Args:
    - url (str): The URL from which the domain needs to be extracted.

    Returns:
    - str: The domain (with subdomains) extracted from the URL.
           For example, 'www.example.com' for the URL 'https://www.example.com/path/to/something'.

    """
    parsed_uri = urlparse(url)
    # The netloc attribute will contain the domain name
    domain = parsed_uri.netloc
    return domain


def sanitize_url(url: str) -> str:
    """Adds https:// and trailing backslash."""
    if not url.startswith("https://"):
        url = "https://" + url

    if not url.endswith("/"):
        url = url + "/"
    return url
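
# Illustrative example (not from the original source):
#   sanitize_url("docs.scrapy.org") -> "https://docs.scrapy.org/"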


class DocsSpider(scrapy.Spider):
    name = "docs"

    def __init__(
        self,
        homepage_url: str,
        save_dir="outputs/",
        target_version=None,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)

        homepage_url = sanitize_url(homepage_url)

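        # Scrapy's OffsiteMiddleware uses allowed_domains to drop requests
        # that leave the documentation site.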
        self.allowed_domains = [extract_domain(homepage_url)]
        self.start_urls = [homepage_url]
        self.base_dir = Path(save_dir)
        self.target_version = target_version

    def parse(self, response):
        parsed_uri = urlparse(response.url)
        # Create a Path from the parsed URL. If it ends with '/', we add 'index.html' as the filename.
        if parsed_uri.path.endswith("/"):
            filepath = (
                self.base_dir
                / parsed_uri.netloc
                / parsed_uri.path.strip("/")
                / "index.html"
            )
        else:
            filepath = self.base_dir / parsed_uri.netloc / parsed_uri.path.strip("/")
        filepath.parent.mkdir(parents=True, exist_ok=True)

        with open(filepath, "wb") as f:
            f.write(response.body)

        # Follow links to other documentation pages only if they contain the target version in the full URL
        for href in response.css("a::attr(href)").getall():
            if self.target_version:
                # A version was specified, check to see if it's the correct version from url
                full_url = response.urljoin(href)  # Expand href to a full URL
                if self.target_version in full_url:
                    yield response.follow(href, self.parse)
            else:
                # no version specified, follow all links
                yield response.follow(href, self.parse)
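

# Minimal usage sketch (not part of the original file): assumes this module is
# saved as e.g. docs_spider.py; the homepage URL below is illustrative.
# Equivalent CLI: scrapy runspider docs_spider.py -a homepage_url=docs.scrapy.org
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    # Extra keyword arguments are forwarded to DocsSpider.__init__.
    process.crawl(DocsSpider, homepage_url="docs.scrapy.org")
    process.start()  # Blocks until the crawl finishes.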