# OSainz's picture
# Get token from environment
# 76cf558
import logging
import re
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Union
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from huggingface_hub import HfApi
# Module-wide Hugging Face Hub client. The token is read from the TOKEN
# environment variable; when that variable is unset, None is passed instead.
HF_API = HfApi(token=os.getenv("TOKEN"))
def get_base_url(url: str) -> str:
    """
    Reduce a URL to its scheme and network location.

    Parameters:
    - url (str): The URL to reduce.
    Returns:
    - str: "<scheme>://<netloc>", with path, query and fragment dropped.
    """
    parts = urlparse(url)
    return parts.scheme + "://" + parts.netloc
def get_domain_name(url: str) -> str:
    """
    Get a capitalized display name for the host of a URL.

    The name is the host without a leading "www." and without its last
    dot-separated label (e.g. "https://www.example.com/x" -> "Example").

    Args:
        url (str): The URL.
    Returns:
        str: The capitalized name, or an empty string when the URL has no
        host part (for example when the scheme is missing).
    """
    # `hostname` (unlike `netloc`) excludes credentials and port and is
    # lower-cased, so "WWW." prefixes and "host:port" forms are handled.
    host = urlparse(url).hostname or ""
    if host.startswith("www."):
        host = host[4:]
    labels = host.split(".")
    # Drop the last label only when something remains afterwards; a
    # single-label host such as "localhost" keeps its only label instead of
    # collapsing to an empty string.
    if len(labels) > 1:
        labels = labels[:-1]
    # First letter in uppercase
    return ".".join(labels).capitalize()
# Generic placeholder icon returned whenever no site-specific favicon is found.
_DEFAULT_FAVICON_URL = (
    "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
)


def get_favicon(url: str) -> str:
    """
    Find the favicon URL declared by a web page.

    The page is downloaded (2 second timeout) and searched for <link> tags
    whose rel mentions an icon and for <meta> tags whose content ends in
    ".ico". The first candidate carrying a non-empty href/content wins.

    Parameters:
    - url (str): The page to download and inspect.
    Returns:
    - str: The favicon URL resolved to an absolute URL, or a generic
      placeholder icon when the request fails, the response status is not
      200, or the page declares no usable icon. This function does not raise;
      failures are logged as warnings.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        response = requests.get(url, headers=headers, timeout=2)
        if response.status_code != 200:
            # Response was not OK, return default
            return _DEFAULT_FAVICON_URL

        soup = BeautifulSoup(response.content, "html.parser")
        # Search for all potential icons including meta tags
        icon_links = soup.find_all(
            "link", rel=re.compile(r"(shortcut icon|icon|apple-touch-icon)", re.I)
        )
        # The dot is escaped so only a real ".ico" suffix matches.
        meta_icons = soup.find_all(
            "meta", attrs={"content": re.compile(r"\.ico$", re.I)}
        )

        # Resolve against the final URL so relative references stay correct
        # when the request was redirected.
        page_url = response.url or url
        for icon in icon_links + meta_icons:
            favicon_url = (icon.get("href") or icon.get("content") or "").strip()
            if favicon_url:
                # urljoin leaves absolute URLs untouched and resolves every
                # relative form ("/x.ico", "x.ico", "//cdn/x.ico").
                return urljoin(page_url, favicon_url)

        # No icons found, or none of them had an href/content
        return _DEFAULT_FAVICON_URL
    except requests.Timeout:
        logging.warning(f"Request timed out for {url}")
        return _DEFAULT_FAVICON_URL
    except Exception as e:
        # Deliberate best-effort boundary: any failure falls back to the
        # placeholder rather than propagating to the caller.
        logging.warning(f"An error occurred while fetching favicon for {url}: {e}")
        return _DEFAULT_FAVICON_URL
def download_favicons(urls: List[str]) -> Dict[str, str]:
    """
    Look up the favicons of several pages concurrently.

    Parameters:
    - urls (List[str]): Page URLs; duplicates are collapsed before fetching.
    Returns:
    - Dict[str, str]: Maps each distinct page URL to the value returned by
      get_favicon, or to the generic placeholder icon if that call raised.
    """
    results = {}
    distinct_urls = list(set(urls))
    with ThreadPoolExecutor(max_workers=20) as pool:
        # Remember which page each future belongs to.
        pending = {}
        for page in distinct_urls:
            pending[pool.submit(get_favicon, page)] = page

        # Collect results in completion order.
        for finished in as_completed(pending):
            url = pending[finished]
            try:
                results[url] = finished.result()
            except Exception as e:
                logging.warning(f"Failed to fetch favicon for {url}: {e}")
                results[url] = (
                    "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
                )
    return results
def url_exists(url, timeout=10):
    """
    Checks if a URL exists by making a HEAD request.

    Parameters:
    - url (str): The URL to check.
    - timeout (float): Seconds to wait for the server before giving up.
      Without a timeout the request could block indefinitely.
    Returns:
    - bool: True if the final response (redirects are followed) has a status
      code below 400; False otherwise, including on timeouts, network
      problems and SSL errors.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
    except requests.RequestException:
        # In case of network problems, SSL errors, timeouts, etc.
        return False
    return response.status_code < 400
def build_dataset_url(dataset_name: str):
    """
    Build the Hugging Face URL of a dataset, if the dataset can be found.

    Parameters:
    - dataset_name (str): Dataset repository name appended to the datasets URL.
    Returns:
    - The dataset URL when both the URL check and the Hub repository lookup
      succeed, otherwise None.
    """
    dataset_url = f"https://huggingface.co/datasets/{dataset_name}"
    # The Hub lookup is only attempted when the page itself responds.
    if not url_exists(dataset_url):
        return None
    if not HF_API.repo_exists(dataset_name, repo_type="dataset"):
        return None
    return dataset_url
def build_model_url(model_name: str):
    """
    Build the Hugging Face URL of a model, if the model can be found.

    Parameters:
    - model_name (str): Model repository name appended to the site URL.
    Returns:
    - The model URL when both the URL check and the Hub repository lookup
      succeed, otherwise None.
    """
    model_url = f"https://huggingface.co/{model_name}"
    # The Hub lookup is only attempted when the page itself responds.
    if not url_exists(model_url):
        return None
    if not HF_API.repo_exists(model_name, repo_type="model"):
        return None
    return model_url
def build_text_icon(text: str, url: Union[str, None], icon_url: str):
    """
    Render a text label as an HTML link preceded by a 16x16 icon.

    Parameters:
    - text (str): Label to display. It is inserted unchanged.
      NOTE(review): it is not escaped because callers may pass markup on
      purpose; confirm against callers before escaping it.
    - url (Union[str, None]): Link target, also used as the image alt text.
    - icon_url (str): Image source for the icon.
    Returns:
    - str: The HTML anchor snippet, or `text` unchanged when `url` is None.
    """
    if url is None:
        return text

    from html import escape

    # Escape attribute values so a quote, "<" or "&" inside a URL cannot
    # terminate the attribute and inject extra markup.
    safe_url = escape(str(url), quote=True)
    safe_icon_url = escape(str(icon_url), quote=True)
    return (
        f'<a href="{safe_url}" target="_blank" style="text-decoration: none; color: inherit; display: inline-flex; align-items: center;">'
        f'<img src="{safe_icon_url}" alt="{safe_url}" style="display: inline-block; vertical-align: middle; margin-right: 4px;" width="16" height="16">'
        f'<span style="display: inline-block; vertical-align: middle;">{text}</span> </a>'
    )
def build_datasets_urls(datasets_names: List[str]) -> Dict[str, str]:
    """
    Resolve the URL of every dataset in a list.

    Parameters:
    - datasets_names (List[str]): The dataset names to resolve.
    Returns:
    - Dict[str, str]: Maps each dataset name to the result of
      build_dataset_url for that name.
    """
    urls_by_dataset = {}
    for name in datasets_names:
        urls_by_dataset[name] = build_dataset_url(name)
    return urls_by_dataset
def build_models_urls(models_names: List[str]) -> Dict[str, str]:
    """
    Resolve the URL of every model in a list.

    Parameters:
    - models_names (List[str]): The model names to resolve.
    Returns:
    - Dict[str, str]: Maps each model name to the result of build_model_url
      for that name.
    """
    urls_by_model = {}
    for name in models_names:
        urls_by_model[name] = build_model_url(name)
    return urls_by_model