import re
from typing import List, Optional, Sequence, Union
from urllib.parse import urljoin, urlparse

PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
SUFFIXES_TO_IGNORE = (
    ".css",
    ".js",
    ".ico",
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".svg",
    ".csv",
    ".bz2",
    ".zip",
    ".epub",
)
SUFFIXES_TO_IGNORE_REGEX = (
    "(?!" + "|".join([re.escape(s) + r"[\#'\"]" for s in SUFFIXES_TO_IGNORE]) + ")"
)
PREFIXES_TO_IGNORE_REGEX = (
    "(?!" + "|".join([re.escape(s) for s in PREFIXES_TO_IGNORE]) + ")"
)
DEFAULT_LINK_REGEX = (
    rf"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)[\#'\"]"
)


def find_all_links(
    raw_html: str, *, pattern: Union[str, re.Pattern, None] = None
) -> List[str]:
    """Extract all links from a raw html string.

    Args:
        raw_html: original html.
        pattern: Regex to use for extracting links from raw html.

    Returns:
        List[str]: all links
    """
    pattern = pattern or DEFAULT_LINK_REGEX
    return list(set(re.findall(pattern, raw_html)))


def extract_sub_links(
    raw_html: str,
    url: str,
    *,
    base_url: Optional[str] = None,
    pattern: Union[str, re.Pattern, None] = None,
    prevent_outside: bool = True,
    exclude_prefixes: Sequence[str] = (),
) -> List[str]:
    """Extract all links from a raw html string and convert into absolute paths.

    Args:
        raw_html: original html.
        url: the url of the html.
        base_url: the base url to check for outside links against.
        pattern: Regex to use for extracting links from raw html.
        prevent_outside: If True, ignore external links which are not children
            of the base url.
        exclude_prefixes: Exclude any URLs that start with one of these prefixes.

    Returns:
        List[str]: sub links
    """
    base_url = base_url if base_url is not None else url
    all_links = find_all_links(raw_html, pattern=pattern)
    absolute_paths = set()
    for link in all_links:
        # Some may be absolute links like https://to/path
        if link.startswith("http"):
            absolute_paths.add(link)
        # Some may have omitted the protocol like //to/path
        elif link.startswith("//"):
            absolute_paths.add(f"{urlparse(url).scheme}:{link}")
        else:
            absolute_paths.add(urljoin(url, link))
    res = []
    for path in absolute_paths:
        if any(path.startswith(exclude) for exclude in exclude_prefixes):
            continue
        if prevent_outside and not path.startswith(base_url):
            continue
        res.append(path)
    return res
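

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). The HTML snippet and URLs below
# are hypothetical examples, not part of the module; they just exercise the
# two public functions defined above with their documented signatures.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_html = (
        '<a href="/docs/intro">Intro</a>'
        '<a href="https://example.com/docs/guide">Guide</a>'
        '<a href="mailto:hello@example.com">Contact</a>'
        '<a href="//example.com/style.css">Stylesheet</a>'
    )
    # find_all_links returns the raw href values; the default regex skips
    # ignored prefixes (javascript:, mailto:, #) and asset suffixes
    # (.css, .js, .png, ...), so only the first two hrefs survive here.
    print(find_all_links(sample_html))
    # extract_sub_links resolves relative links against the page URL and,
    # with prevent_outside=True (the default), keeps only links that fall
    # under base_url.
    print(
        extract_sub_links(
            sample_html,
            "https://example.com/docs/",
            base_url="https://example.com/docs",
        )
    )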