import logging
import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Union
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from huggingface_hub import HfApi

# Authenticated Hub client; token is optional (anonymous access if TOKEN unset).
HF_API = HfApi(token=os.environ.get("TOKEN", None))

# Fallback icon returned whenever a site's favicon cannot be determined.
DEFAULT_FAVICON = "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"


def get_base_url(url: str) -> str:
    """
    Extracts the base URL (scheme + network location) from a given URL.

    Parameters:
    - url (str): The URL to extract the base URL from.

    Returns:
    - str: The base URL, e.g. "https://example.com".
    """
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    return base_url


def get_domain_name(url: str) -> str:
    """
    Get a human-readable domain name from a URL.

    Example: "https://www.example.com/page" -> "Example".

    Args:
        url (str): The URL.

    Returns:
        str: The domain name without "www." and without the final
        dot-separated label (the TLD), with the first letter capitalized.
    """
    parsed_uri = urlparse(url)
    domain = parsed_uri.netloc
    if domain.startswith("www."):
        domain = domain[4:]
    # Remove the last dot-separated label (the TLD): "example.com" -> "example"
    domain = ".".join(domain.split(".")[:-1])
    # First letter in uppercase
    return domain.capitalize()


def get_favicon(url: str) -> str:
    """
    Fetch a page and return the URL of its favicon.

    Looks for <link rel="icon"/"shortcut icon"/"apple-touch-icon"> tags and
    <meta> tags whose content ends in ".ico". Relative favicon paths are
    resolved against the page URL. On any failure (non-200 response, timeout,
    parse error, no icon found) DEFAULT_FAVICON is returned instead.

    Parameters:
    - url (str): The page URL to inspect.

    Returns:
    - str: An absolute favicon URL, or DEFAULT_FAVICON on failure.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        response = requests.get(url, headers=headers, timeout=2)
        if response.status_code != 200:
            # Response was not OK, return default
            return DEFAULT_FAVICON

        soup = BeautifulSoup(response.content, "html.parser")
        # Search for all potential icons including meta tags
        icon_links = soup.find_all(
            "link", rel=re.compile(r"(shortcut icon|icon|apple-touch-icon)", re.I)
        )
        # Dot is escaped so only a literal ".ico" suffix matches
        # (the previous unescaped form also matched e.g. "xico").
        meta_icons = soup.find_all(
            "meta", attrs={"content": re.compile(r"\.ico$", re.I)}
        )

        for icon in icon_links + meta_icons:
            favicon_url = icon.get("href") or icon.get("content")
            if favicon_url:
                # urljoin resolves root-relative ("/favicon.ico"), relative
                # ("favicon.ico") and protocol-relative URLs alike, and
                # leaves already-absolute URLs untouched.
                return urljoin(url, favicon_url)

        # No icon with a usable href/content found, return default
        return DEFAULT_FAVICON
    except requests.Timeout:
        logging.warning(f"Request timed out for {url}")
        return DEFAULT_FAVICON
    except Exception as e:
        logging.warning(f"An error occurred while fetching favicon for {url}: {e}")
        return DEFAULT_FAVICON


def download_favicons(urls: List[str]) -> Dict[str, str]:
    """
    Fetch favicons for a list of page URLs concurrently.

    Parameters:
    - urls (List[str]): Page URLs (duplicates are fetched only once).

    Returns:
    - Dict[str, str]: Mapping of each distinct input URL to its favicon URL
      (DEFAULT_FAVICON when fetching failed).
    """
    favicons = {}
    # Deduplicate so each page is only fetched once.
    urls = list(set(urls))
    with ThreadPoolExecutor(max_workers=20) as executor:
        future_to_url = {executor.submit(get_favicon, url): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                favicons[url] = future.result()
            except Exception as e:
                logging.warning(f"Failed to fetch favicon for {url}: {e}")
                favicons[url] = DEFAULT_FAVICON
    return favicons


def url_exists(url):
    """
    Checks if a URL exists by making a HEAD request.

    Parameters:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL exists, False otherwise.
    """
    try:
        # timeout prevents the check from hanging forever on unresponsive
        # hosts; requests.Timeout subclasses RequestException, so it is
        # caught below and treated as "does not exist".
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.status_code < 400
    except requests.RequestException:
        # In case of network problems, SSL errors, etc.
        return False


def build_dataset_url(dataset_name: str):
    """
    Build the Hugging Face Hub URL for a dataset.

    Parameters:
    - dataset_name (str): The dataset repo id, e.g. "user/dataset".

    Returns:
    - str | None: The dataset URL if it exists on the Hub, otherwise None.
    """
    url = f"https://huggingface.co/datasets/{dataset_name}"
    # Test if the url exists
    if url_exists(url) and HF_API.repo_exists(dataset_name, repo_type="dataset"):
        return url
    else:
        return None


def build_model_url(model_name: str):
    """
    Build the Hugging Face Hub URL for a model.

    Parameters:
    - model_name (str): The model repo id, e.g. "user/model".

    Returns:
    - str | None: The model URL if it exists on the Hub, otherwise None.
    """
    url = f"https://huggingface.co/{model_name}"
    # Test if the url exists
    if url_exists(url) and HF_API.repo_exists(model_name, repo_type="model"):
        return url
    else:
        return None


def build_text_icon(text: str, url: Union[str, None], icon_url: str):
    """
    Render a text label, linked when a URL is available.

    NOTE(review): `icon_url` is currently unused and the f-strings below
    contain no markup — the HTML tags appear to have been lost, so the
    linked form is effectively f"{url}{text} ". Behavior preserved as-is;
    confirm the intended markup against the original template.

    Parameters:
    - text (str): The label text.
    - url (str | None): Target URL; when None, `text` is returned unchanged.
    - icon_url (str): Icon image URL (currently unused — see NOTE above).
    """
    if url is not None:
        return (
            f''
            f'{url}'
            f'{text} '
        )
    else:
        return text


def build_datasets_urls(datasets_names: List[str]) -> Dict[str, str]:
    """
    Build a dictionary of dataset URLs from a list of dataset names.

    Parameters:
    - datasets_names (List[str]): The list of dataset names.

    Returns:
    - Dict[str, str]: A dictionary mapping each dataset name to its URL
      (or None when the dataset does not exist on the Hub).
    """
    return {dataset: build_dataset_url(dataset) for dataset in datasets_names}


def build_models_urls(models_names: List[str]) -> Dict[str, str]:
    """
    Build a dictionary of model URLs from a list of model names.

    Parameters:
    - models_names (List[str]): The list of model names.

    Returns:
    - Dict[str, str]: A dictionary mapping each model name to its URL
      (or None when the model does not exist on the Hub).
    """
    return {model: build_model_url(model) for model in models_names}