File size: 6,108 Bytes
eba8a37
 
76cf558
eba8a37
 
 
 
 
 
 
dee592a
 
76cf558
dee592a
eba8a37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd6f269
 
 
eba8a37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dee592a
eba8a37
 
 
 
 
 
 
 
 
 
 
dee592a
eba8a37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import logging
import re
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Union
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from huggingface_hub import HfApi

HF_API = HfApi(token=os.environ.get("TOKEN", None))


def get_base_url(url: str) -> str:
    """
    Return the scheme + authority portion of a URL.

    Parameters:
    - url (str): The URL to extract the base URL from.

    Returns:
    - str: The base URL, e.g. "https://example.com".
    """
    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}"


def get_domain_name(url: str) -> str:
    """
    Derive a display name for a site from its URL.

    Strips a leading "www.", drops the last dot-separated label (the TLD),
    and capitalizes the first letter of what remains.

    Args:
        url (str): The URL.

    Returns:
        str: The site name, e.g. "Example" for "https://www.example.com".
    """
    netloc = urlparse(url).netloc
    if netloc.startswith("www."):
        netloc = netloc[len("www."):]

    # Drop the final label (the TLD) and rejoin the rest.
    site = ".".join(netloc.split(".")[:-1])
    # Capitalize the first letter (lowercases the remainder).
    return site.capitalize()


def get_favicon(url: str) -> str:
    """
    Fetch the favicon URL for a website.

    Downloads the page at *url* and scans it for <link rel="icon"/shortcut
    icon/apple-touch-icon> tags and <meta> tags whose content ends in ".ico".

    Parameters:
    - url (str): The page URL to inspect.

    Returns:
    - str: An absolute favicon URL, or a generic default icon when the page
      cannot be fetched, times out, or declares no usable icon.
    """
    # Single source of truth for the fallback icon (was repeated five times).
    default_icon = "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        response = requests.get(url, headers=headers, timeout=2)
        if response.status_code != 200:
            # Response was not OK, return default
            return default_icon

        soup = BeautifulSoup(response.content, "html.parser")
        # Search for all potential icons including meta tags
        icon_links = soup.find_all(
            "link", rel=re.compile(r"(shortcut icon|icon|apple-touch-icon)", re.I)
        )
        meta_icons = soup.find_all(
            "meta", attrs={"content": re.compile(r".ico$", re.I)}
        )

        for icon in icon_links + meta_icons:
            favicon_url = icon.get("href") or icon.get("content")
            if favicon_url:
                # Bug fix: resolve ALL relative references against the page
                # URL — not only root-relative "/..." paths. This also covers
                # "favicon.ico" and protocol-relative "//cdn/..." hrefs;
                # urljoin leaves already-absolute URLs untouched.
                return urljoin(url, favicon_url)

        # No icons found (or none carried an href/content), return default
        return default_icon
    except requests.Timeout:
        logging.warning(f"Request timed out for {url}")
        return default_icon
    except Exception as e:
        logging.warning(f"An error occurred while fetching favicon for {url}: {e}")
        return default_icon


def download_favicons(urls: List[str]) -> Dict[str, str]:
    """
    Fetch favicons for many URLs concurrently.

    Parameters:
    - urls (List[str]): Page URLs (duplicates are collapsed).

    Returns:
    - Dict[str, str]: Maps each input URL to its favicon URL; a generic
      default icon is used when a lookup raises.
    """
    results: Dict[str, str] = {}
    unique_urls = list(set(urls))
    with ThreadPoolExecutor(max_workers=20) as pool:
        pending = {pool.submit(get_favicon, u): u for u in unique_urls}
        for done in as_completed(pending):
            source = pending[done]
            try:
                results[source] = done.result()
            except Exception as e:
                logging.warning(f"Failed to fetch favicon for {source}: {e}")
                results[source] = (
                    "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
                )
    return results


def url_exists(url):
    """
    Checks if a URL exists by making a HEAD request.

    Parameters:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL exists (status < 400), False otherwise.
    """
    try:
        # Bug fix: a timeout keeps this from hanging indefinitely on an
        # unresponsive host (the original request had no timeout at all).
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.status_code < 400
    except requests.RequestException:
        # In case of network problems, SSL errors, timeouts, etc.
        return False


def build_dataset_url(dataset_name: str):
    """
    Return the Hugging Face Hub URL for a dataset, or None when it
    does not exist.
    """
    url = f"https://huggingface.co/datasets/{dataset_name}"
    # Require both a reachable page and a Hub API match before linking.
    if not (url_exists(url) and HF_API.repo_exists(dataset_name, repo_type="dataset")):
        return None
    return url


def build_model_url(model_name: str):
    """
    Return the Hugging Face Hub URL for a model, or None when it
    does not exist.
    """
    url = f"https://huggingface.co/{model_name}"
    # Require both a reachable page and a Hub API match before linking.
    if not (url_exists(url) and HF_API.repo_exists(model_name, repo_type="model")):
        return None
    return url


def build_text_icon(text: str, url: Union[str, None], icon_url: str):
    """
    Render *text* as an inline HTML link preceded by a 16x16 icon.

    Parameters:
    - text (str): The visible label.
    - url (str | None): Link target; when None, *text* is returned unchanged.
    - icon_url (str): Image source for the leading icon.

    Returns:
    - str: An HTML snippet, or the bare text when no URL is given.
    """
    if url is None:
        return text
    return (
        f'<a href="{url}" target="_blank" style="text-decoration: none; color: inherit; display: inline-flex; align-items: center;">'
        f'<img src="{icon_url}" alt="{url}" style="display: inline-block; vertical-align: middle; margin-right: 4px;" width="16" height="16">'
        f'<span style="display: inline-block; vertical-align: middle;">{text}</span> </a>'
    )


def build_datasets_urls(datasets_names: List[str]) -> Dict[str, str]:
    """
    Build a dictionary of dataset URLs from a list of dataset names.

    Parameters:
    - datasets_names (List[str]): The list of dataset names.

    Returns:
    - Dict[str, str]: Maps each dataset name to its URL (None if missing).
    """
    urls: Dict[str, str] = {}
    for name in datasets_names:
        urls[name] = build_dataset_url(name)
    return urls


def build_models_urls(models_names: List[str]) -> Dict[str, str]:
    """
    Build a dictionary of model URLs from a list of model names.

    Parameters:
    - models_names (List[str]): The list of model names.

    Returns:
    - Dict[str, str]: Maps each model name to its URL (None if missing).
    """
    urls: Dict[str, str] = {}
    for name in models_names:
        urls[name] = build_model_url(name)
    return urls