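"""Utilities for fetching website favicons and building Hugging Face Hub URLs."""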
import logging
import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Union
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from huggingface_hub import HfApi
HF_API = HfApi(token=os.environ.get("TOKEN", None))
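# Fallback icon returned whenever a site's favicon cannot be determined.
# (This constant deduplicates the URL that was repeated throughout the module.)
DEFAULT_FAVICON = "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"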

def get_base_url(url: str) -> str:
    """
    Extracts the base URL from a given URL.
    Parameters:
    - url (str): The URL to extract the base URL from.
    Returns:
    - str: The base URL.
    """
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    return base_url

def get_domain_name(url: str) -> str:
    """
    Get the domain name from a URL.
    Args:
        url (str): The URL.
    Returns:
        str: The domain name, capitalized, without the "www." prefix or the TLD.
    """
    parsed_uri = urlparse(url)
    domain = parsed_uri.netloc
    if domain.startswith("www."):
        domain = domain[4:]
    # Drop the last component (the TLD), e.g. "huggingface.co" -> "huggingface"
    domain = ".".join(domain.split(".")[:-1])
    # Capitalize the first letter
    return domain.capitalize()

def get_favicon(url: str) -> str:
    """
    Fetch the page at `url` and return the URL of its favicon.
    Falls back to a generic icon if the page cannot be fetched or
    declares no usable icon.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        response = requests.get(url, headers=headers, timeout=2)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            # Search for all potential icons, including meta tags
            icon_links = soup.find_all(
                "link", rel=re.compile(r"(shortcut icon|icon|apple-touch-icon)", re.I)
            )
            meta_icons = soup.find_all(
                "meta", attrs={"content": re.compile(r"\.ico$", re.I)}
            )
            for icon in icon_links + meta_icons:
                favicon_url = icon.get("href") or icon.get("content")
                if favicon_url:
                    # Resolve relative icon paths against the page URL;
                    # urljoin leaves absolute URLs untouched.
                    return urljoin(url, favicon_url)
    except requests.Timeout:
        logging.warning(f"Request timed out for {url}")
    except Exception as e:
        logging.warning(f"An error occurred while fetching favicon for {url}: {e}")
    # No usable icon found, or the request failed: return the default.
    return DEFAULT_FAVICON

def download_favicons(urls: List[str]) -> Dict[str, str]:
    """
    Fetch favicons for a list of URLs concurrently.
    Parameters:
    - urls (List[str]): The URLs to fetch favicons for.
    Returns:
    - Dict[str, str]: A mapping from each URL to its favicon URL.
    """
    favicons = {}
    urls = list(set(urls))  # Deduplicate before fetching
    with ThreadPoolExecutor(max_workers=20) as executor:
        future_to_url = {executor.submit(get_favicon, url): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                favicons[url] = future.result()
            except Exception as e:
                logging.warning(f"Failed to fetch favicon for {url}: {e}")
                favicons[url] = DEFAULT_FAVICON
    return favicons

def url_exists(url: str) -> bool:
    """
    Checks if a URL exists by making a HEAD request.
    Parameters:
    - url (str): The URL to check.
    Returns:
    - bool: True if the URL exists, False otherwise.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.status_code < 400
    except requests.RequestException:
        # In case of network problems, SSL errors, timeouts, etc.
        return False

def build_dataset_url(dataset_name: str) -> Union[str, None]:
    """
    Build the URL of a dataset on the Hugging Face Hub.
    Returns None if the dataset does not exist.
    """
    url = f"https://huggingface.co/datasets/{dataset_name}"
    # Test that the URL responds and the repo exists on the Hub
    if url_exists(url) and HF_API.repo_exists(dataset_name, repo_type="dataset"):
        return url
    return None

def build_model_url(model_name: str) -> Union[str, None]:
    """
    Build the URL of a model on the Hugging Face Hub.
    Returns None if the model does not exist.
    """
    url = f"https://huggingface.co/{model_name}"
    # Test that the URL responds and the repo exists on the Hub
    if url_exists(url) and HF_API.repo_exists(model_name, repo_type="model"):
        return url
    return None

def build_text_icon(text: str, url: Union[str, None], icon_url: str) -> str:
    """
    Render `text` as an HTML link to `url`, preceded by a 16x16 icon.
    If `url` is None, return the plain text unchanged.
    """
    if url is None:
        return text
    return (
        f'<a href="{url}" target="_blank" style="text-decoration: none; color: inherit; display: inline-flex; align-items: center;">'
        f'<img src="{icon_url}" alt="{url}" style="display: inline-block; vertical-align: middle; margin-right: 4px;" width="16" height="16">'
        f'<span style="display: inline-block; vertical-align: middle;">{text}</span></a>'
    )

def build_datasets_urls(datasets_names: List[str]) -> Dict[str, Union[str, None]]:
    """
    Build a dictionary of dataset URLs from a list of dataset names.
    Parameters:
    - datasets_names (List[str]): The list of dataset names.
    Returns:
    - Dict[str, Union[str, None]]: A mapping from each dataset name to its URL,
      or None if the dataset does not exist.
    """
    return {dataset: build_dataset_url(dataset) for dataset in datasets_names}

def build_models_urls(models_names: List[str]) -> Dict[str, Union[str, None]]:
    """
    Build a dictionary of model URLs from a list of model names.
    Parameters:
    - models_names (List[str]): The list of model names.
    Returns:
    - Dict[str, Union[str, None]]: A mapping from each model name to its URL,
      or None if the model does not exist.
    """
    return {model: build_model_url(model) for model in models_names}
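
if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module): the page URLs
    # and Hub repo names below are illustrative examples, not fixtures.
    logging.basicConfig(level=logging.INFO)

    pages = ["https://huggingface.co", "https://www.wikipedia.org"]
    favicons = download_favicons(pages)
    for page, icon in favicons.items():
        # Combine the helpers to produce a linked, icon-prefixed HTML label.
        print(build_text_icon(get_domain_name(page), page, icon))

    # Resolve Hub URLs; entries map to None when the repo does not exist.
    print(build_datasets_urls(["squad"]))
    print(build_models_urls(["bert-base-uncased"]))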