Spaces:
Runtime error
Runtime error
import re | |
from typing import List, Optional, Sequence, Union | |
from urllib.parse import urljoin, urlparse | |
# href values starting with one of these are never extracted as links
# (script pseudo-URLs, e-mail links and in-page fragment anchors).
PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")

# href values ending in one of these extensions are never extracted as links.
SUFFIXES_TO_IGNORE = (
    ".css",
    ".js",
    ".ico",
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".svg",
    ".csv",
    ".bz2",
    ".zip",
    ".epub",
)

# Negative lookahead that fails wherever an ignored suffix is immediately
# followed by a link terminator ('#', single quote or double quote).
SUFFIXES_TO_IGNORE_REGEX = "(?!{})".format(
    "|".join(re.escape(suffix) + r"[\#'\"]" for suffix in SUFFIXES_TO_IGNORE)
)

# Negative lookahead that fails when the href value starts with an ignored
# prefix.
PREFIXES_TO_IGNORE_REGEX = "(?!{})".format(
    "|".join(map(re.escape, PREFIXES_TO_IGNORE))
)

# Captures the href value up to (not including) the first '#' or quote.
# The suffix lookahead is re-checked before every consumed character, so a
# value that ends in an ignored suffix produces no match at all.
DEFAULT_LINK_REGEX = (
    r"href=[\"']"
    + PREFIXES_TO_IGNORE_REGEX
    + "((?:"
    + SUFFIXES_TO_IGNORE_REGEX
    + r".)*?)[\#'\"]"
)
def find_all_links(
    raw_html: str, *, pattern: Union[str, re.Pattern, None] = None
) -> List[str]:
    """Extract all unique links from a raw html string.

    Args:
        raw_html: original html.
        pattern: Regex (string or compiled) used to extract links from the
            raw html. Falls back to ``DEFAULT_LINK_REGEX`` when omitted or
            empty.

    Returns:
        List[str]: the distinct links, in order of first appearance in
        ``raw_html``.
    """
    pattern = pattern or DEFAULT_LINK_REGEX
    # dict.fromkeys de-duplicates while keeping first-seen order; going
    # through a set would make the result order vary between interpreter
    # runs because of string hash randomization.
    return list(dict.fromkeys(re.findall(pattern, raw_html)))
def extract_sub_links(
    raw_html: str,
    url: str,
    *,
    base_url: Optional[str] = None,
    pattern: Union[str, re.Pattern, None] = None,
    prevent_outside: bool = True,
    exclude_prefixes: Sequence[str] = (),
) -> List[str]:
    """Extract all links from a raw html string and convert into absolute paths.

    Links that cannot be parsed as URLs are skipped.

    Args:
        raw_html: original html.
        url: the url of the html; relative links are resolved against it.
        base_url: the base url to check for outside links against. Defaults
            to ``url``.
        pattern: Regex to use for extracting links from raw html.
        prevent_outside: If True, ignore external links which are not children
            of the base url. A link counts as a child only when its host
            equals the base url's host and it starts with the base url.
        exclude_prefixes: Exclude any URLs that start with one of these prefixes.

    Returns:
        List[str]: the distinct absolute sub links, in the order the
        underlying links were returned by ``find_all_links``.
    """
    base_url = base_url if base_url is not None else url
    page_scheme = urlparse(url).scheme
    base_netloc = urlparse(base_url).netloc if prevent_outside else ""
    excluded = tuple(exclude_prefixes)

    # Maps each absolute URL to its host. A dict is used as an
    # insertion-ordered set so duplicates collapse without reordering.
    candidates = {}
    for link in find_all_links(raw_html, pattern=pattern):
        try:
            # Decide "absolute" from the parsed scheme: a bare
            # startswith("http") would also catch relative links such as
            # "http-guide.html".
            if urlparse(link).scheme in ("http", "https"):
                absolute = link
            # Some may have omitted the protocol like //to/path
            elif link.startswith("//"):
                absolute = f"{page_scheme}:{link}"
            else:
                absolute = urljoin(url, link)
            netloc = urlparse(absolute).netloc
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): skip it rather
            # than abort the whole extraction.
            continue
        candidates[absolute] = netloc

    sub_links = []
    for absolute, netloc in candidates.items():
        # str.startswith accepts a tuple; an empty tuple never matches.
        if absolute.startswith(excluded):
            continue
        # A prefix test alone would accept "https://example.com.evil.com/"
        # or "https://example.com@evil.com/" for base "https://example.com",
        # so the host must match exactly as well.
        if prevent_outside and (
            netloc != base_netloc or not absolute.startswith(base_url)
        ):
            continue
        sub_links.append(absolute)
    return sub_links