import logging
from pathlib import Path
from urllib.parse import urlparse

import scrapy

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR)


def extract_domain(url):
    """
    Extract the domain (including subdomains) from a given URL.

    Args:
    - url (str): The URL from which the domain needs to be extracted.

    Returns:
    - str: The domain (with subdomains) extracted from the URL.
           For example, 'www.example.com' for the URL 'https://www.example.com/path/to/something'.

    """
    parsed_uri = urlparse(url)
    # The netloc attribute will contain the domain name
    domain = parsed_uri.netloc
    return domain


def sanitize_url(url: str) -> str:
    """Adds https:// and trailing backslash."""
    if not url.startswith("https://"):
        url = "https://" + url

    if not url.endswith("/"):
        url = url + "/"
    return url
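
# Illustrative example (not from the original source):
#   sanitize_url("docs.scrapy.org") -> "https://docs.scrapy.org/"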


class DocsSpider(scrapy.Spider):
    name = "docs"

    def __init__(
        self,
        homepage_url: str,
        save_dir="outputs/",
        target_version=None,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)

        homepage_url = sanitize_url(homepage_url)

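        # Scrapy's OffsiteMiddleware uses allowed_domains to drop requests
        # that leave the documentation site.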
        self.allowed_domains = [extract_domain(homepage_url)]
        self.start_urls = [homepage_url]
        self.base_dir = Path(save_dir)
        self.target_version = target_version

    def parse(self, response):
        parsed_uri = urlparse(response.url)
        # Create a Path from the parsed URL. If it ends with '/', we add 'index.html' as the filename.
        if parsed_uri.path.endswith("/"):
            filepath = (
                self.base_dir
                / parsed_uri.netloc
                / parsed_uri.path.strip("/")
                / "index.html"
            )
        else:
            filepath = self.base_dir / parsed_uri.netloc / parsed_uri.path.strip("/")
        filepath.parent.mkdir(parents=True, exist_ok=True)

        with open(filepath, "wb") as f:
            f.write(response.body)

        # Follow links to other documentation pages only if they contain the target version in the full URL
        for href in response.css("a::attr(href)").getall():
            if self.target_version:
                # A version was specified, check to see if it's the correct version from url
                full_url = response.urljoin(href)  # Expand href to a full URL
                if self.target_version in full_url:
                    yield response.follow(href, self.parse)
            else:
                # no version specified, follow all links
                yield response.follow(href, self.parse)
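

# Minimal usage sketch (not part of the original file): assumes this module is
# saved as e.g. docs_spider.py; the homepage URL below is illustrative.
# Equivalent CLI: scrapy runspider docs_spider.py -a homepage_url=docs.scrapy.org
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    # Extra keyword arguments are forwarded to DocsSpider.__init__.
    process.crawl(DocsSpider, homepage_url="docs.scrapy.org")
    process.start()  # Blocks until the crawl finishes.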