|
import logging |
|
import re |
|
import os |
|
from concurrent.futures import ThreadPoolExecutor, as_completed |
|
from typing import Dict, List, Union |
|
from urllib.parse import urljoin, urlparse |
|
|
|
import requests |
|
from bs4 import BeautifulSoup |
|
|
|
from huggingface_hub import HfApi |
|
|
|
# Shared Hugging Face Hub client; auth token is read from the TOKEN env var
# (falls back to anonymous access when unset).
HF_API = HfApi(token=os.environ.get("TOKEN", None))
|
|
|
|
|
def get_base_url(url: str) -> str:
    """
    Reduce a URL to its scheme and host ("<scheme>://<netloc>").

    Parameters:
    - url (str): The URL to extract the base URL from.

    Returns:
    - str: The base URL, with path, query, and fragment stripped.
    """
    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}"
|
|
|
|
|
def get_domain_name(url: str) -> str:
    """
    Derive a display name from a URL's host.

    Strips a leading "www.", drops the final dot-separated label (the
    top-level domain), and capitalizes the first character of the rest.

    Args:
        url (str): The URL.

    Returns:
        str: The capitalized domain name (e.g. "https://www.example.com"
        -> "Example").
    """
    host = urlparse(url).netloc
    if host.startswith("www."):
        host = host[len("www."):]
    # Keep everything before the last dot: "huggingface.co" -> "huggingface".
    # NOTE: for multi-label TLDs ("bbc.co.uk") only the final label is
    # removed, yielding "Bbc.co".
    name = ".".join(host.split(".")[:-1])
    return name.capitalize()
|
|
|
|
|
def get_favicon(url: str) -> str:
    """
    Fetch the favicon URL declared by a web page.

    Parameters:
    - url (str): The page URL to inspect.

    Returns:
    - str: The resolved (absolute) favicon URL, or a generic placeholder
      icon when the page cannot be fetched, returns a non-200 status, or
      declares no icon.
    """
    # Single placeholder used on every failure path (was duplicated inline).
    fallback = "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        response = requests.get(url, headers=headers, timeout=2)
        if response.status_code != 200:
            return fallback

        soup = BeautifulSoup(response.content, "html.parser")
        # <link rel="icon" / "shortcut icon" / "apple-touch-icon"> tags.
        icon_links = soup.find_all(
            "link", rel=re.compile(r"(shortcut icon|icon|apple-touch-icon)", re.I)
        )
        # <meta> tags whose content ends in ".ico".
        meta_icons = soup.find_all(
            "meta", attrs={"content": re.compile(r".ico$", re.I)}
        )

        for icon in icon_links + meta_icons:
            favicon_url = icon.get("href") or icon.get("content")
            if favicon_url:
                # urljoin resolves every relative form ("favicon.ico",
                # "/fav.ico", "//cdn.example/f.ico") against the page URL
                # and leaves absolute URLs untouched. The previous code
                # only joined paths starting with "/", so bare relative
                # paths were returned unresolved.
                return urljoin(url, favicon_url)
        return fallback
    except requests.Timeout:
        logging.warning(f"Request timed out for {url}")
        return fallback
    except Exception as e:
        logging.warning(f"An error occurred while fetching favicon for {url}: {e}")
        return fallback
|
|
|
|
|
def download_favicons(urls: List[str]) -> Dict[str, str]:
    """
    Resolve favicons for many page URLs concurrently.

    Parameters:
    - urls (List[str]): Page URLs; duplicates are collapsed before fetching.

    Returns:
    - Dict[str, str]: Mapping of each unique URL to its favicon URL (a
      generic placeholder is used when resolution fails).
    """
    fallback = "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
    results: Dict[str, str] = {}
    unique_urls = list(set(urls))
    with ThreadPoolExecutor(max_workers=20) as pool:
        pending = {pool.submit(get_favicon, page): page for page in unique_urls}
        for done in as_completed(pending):
            page = pending[done]
            try:
                results[page] = done.result()
            except Exception as e:
                logging.warning(f"Failed to fetch favicon for {page}: {e}")
                results[page] = fallback
    return results
|
|
|
|
|
def url_exists(url):
    """
    Checks if a URL exists by making a HEAD request.

    Parameters:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL exists (status < 400 after redirects), False
      on client-detectable failure, connection error, or timeout.
    """
    try:
        # Bound the request so a stalled server cannot hang the caller
        # indefinitely (the original had no timeout; the other request in
        # this module, get_favicon, is bounded too).
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.status_code < 400
    except requests.RequestException:
        return False
|
|
|
|
|
def build_dataset_url(dataset_name: str):
    """
    Build the Hugging Face Hub URL for a dataset.

    Returns the URL only when both a HEAD probe of the page and the Hub
    API confirm the dataset exists; otherwise returns None.
    """
    url = f"https://huggingface.co/datasets/{dataset_name}"
    # Short-circuit: skip the API call when the page itself is unreachable.
    if not url_exists(url):
        return None
    if not HF_API.repo_exists(dataset_name, repo_type="dataset"):
        return None
    return url
|
|
|
|
|
def build_model_url(model_name: str):
    """
    Build the Hugging Face Hub URL for a model.

    Returns the URL only when both a HEAD probe of the page and the Hub
    API confirm the model exists; otherwise returns None.
    """
    url = f"https://huggingface.co/{model_name}"
    # Short-circuit: skip the API call when the page itself is unreachable.
    if not url_exists(url):
        return None
    if not HF_API.repo_exists(model_name, repo_type="model"):
        return None
    return url
|
|
|
|
|
def build_text_icon(text: str, url: Union[str, None], icon_url: str):
    """
    Render text as an inline HTML link with a small leading icon.

    Parameters:
    - text (str): The visible label.
    - url (Union[str, None]): Link target; when None the bare text is returned.
    - icon_url (str): Image source for the 16x16 icon.

    Returns:
    - str: An HTML anchor string, or the unmodified text when url is None.
    """
    if url is None:
        return text
    anchor = f'<a href="{url}" target="_blank" style="text-decoration: none; color: inherit; display: inline-flex; align-items: center;">'
    image = f'<img src="{icon_url}" alt="{url}" style="display: inline-block; vertical-align: middle; margin-right: 4px;" width="16" height="16">'
    label = f'<span style="display: inline-block; vertical-align: middle;">{text}</span> </a>'
    return anchor + image + label
|
|
|
|
|
def build_datasets_urls(datasets_names: List[str]) -> Dict[str, str]:
    """
    Build a dictionary of dataset URLs from a list of dataset names.

    Parameters:
    - datasets_names (List[str]): The list of dataset names.

    Returns:
    - Dict[str, str]: Mapping of dataset name to its Hub URL (value is
      None when the dataset cannot be verified to exist).
    """
    urls = {}
    for name in datasets_names:
        urls[name] = build_dataset_url(name)
    return urls
|
|
|
|
|
def build_models_urls(models_names: List[str]) -> Dict[str, str]:
    """
    Build a dictionary of model URLs from a list of model names.

    Parameters:
    - models_names (List[str]): The list of model names.

    Returns:
    - Dict[str, str]: Mapping of model name to its Hub URL (value is
      None when the model cannot be verified to exist).
    """
    urls = {}
    for name in models_names:
        urls[name] = build_model_url(name)
    return urls
|
|