import logging
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Union
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Fallback icon returned whenever a site's favicon cannot be resolved.
DEFAULT_FAVICON = (
    "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
)


def get_base_url(url: str) -> str:
    """
    Extracts the base URL (scheme and network location) from a given URL.

    Parameters:
    - url (str): The URL to extract the base URL from.

    Returns:
    - str: The base URL, e.g. "https://example.com".
    """
    parsed_url = urlparse(url)
    return f"{parsed_url.scheme}://{parsed_url.netloc}"


def get_domain_name(url: str) -> str:
    """
    Get a display-friendly domain name from a URL.

    Args:
        url (str): The URL.

    Returns:
        str: The capitalized domain name without the "www." prefix or the
            trailing top-level domain, e.g. "https://www.example.com" -> "Example".
    """
    parsed_uri = urlparse(url)
    domain = parsed_uri.netloc
    if domain.startswith("www."):
        domain = domain[4:]

    # Drop the last dot-separated component (the top-level domain).
    domain = ".".join(domain.split(".")[:-1])

    return domain.capitalize()


def get_favicon(url: str) -> str:
    """
    Fetch the favicon URL for a website.

    The page is downloaded and scanned for <link rel="icon">-style tags and
    <meta> tags whose content attribute points to an .ico file. Relative
    references are resolved against the page URL. If nothing is found or the
    request fails, a generic placeholder icon is returned.

    Parameters:
    - url (str): The URL of the website.

    Returns:
    - str: The favicon URL, or DEFAULT_FAVICON as a fallback.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        response = requests.get(url, headers=headers, timeout=2)
        if response.status_code != 200:
            return DEFAULT_FAVICON

        soup = BeautifulSoup(response.content, "html.parser")

        # <link rel="icon">, <link rel="shortcut icon">, <link rel="apple-touch-icon">
        icon_links = soup.find_all(
            "link", rel=re.compile(r"(shortcut icon|icon|apple-touch-icon)", re.I)
        )
        # <meta> tags whose content attribute ends in ".ico"
        meta_icons = soup.find_all(
            "meta", attrs={"content": re.compile(r"\.ico$", re.I)}
        )

        for icon in icon_links + meta_icons:
            favicon_url = icon.get("href") or icon.get("content")
            if favicon_url:
                # urljoin leaves absolute references untouched and resolves
                # relative ones (e.g. "/favicon.ico") against the page URL.
                return urljoin(url, favicon_url)

        return DEFAULT_FAVICON
    except requests.Timeout:
        logging.warning(f"Request timed out for {url}")
        return DEFAULT_FAVICON
    except Exception as e:
        logging.warning(f"An error occurred while fetching favicon for {url}: {e}")
        return DEFAULT_FAVICON


def download_favicons(urls: List[str]) -> Dict[str, str]:
    """
    Fetch favicons for a list of URLs concurrently.

    Parameters:
    - urls (List[str]): The URLs to fetch favicons for.

    Returns:
    - Dict[str, str]: A mapping from each URL to its favicon URL, with the
        fallback icon used for any URL whose favicon could not be fetched.
    """
    favicons = {}
    urls = list(set(urls))  # Deduplicate before spawning workers.
    with ThreadPoolExecutor(max_workers=20) as executor:
        future_to_url = {executor.submit(get_favicon, url): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                favicons[url] = future.result()
            except Exception as e:
                logging.warning(f"Failed to fetch favicon for {url}: {e}")
                favicons[url] = DEFAULT_FAVICON
    return favicons


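# Illustrative note (not executed): download_favicons returns a dict keyed by
# the original URLs, with DEFAULT_FAVICON filled in for any site that could not
# be reached. The URLs below are placeholders, not addresses this module
# depends on.
#
#     download_favicons(["https://example.com", "https://example.org"])
#     # -> {"https://example.com": "<resolved favicon url>",
#     #     "https://example.org": "<resolved favicon url or DEFAULT_FAVICON>"}

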
def url_exists(url: str) -> bool:
    """
    Checks if a URL exists by making a HEAD request.

    Parameters:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL exists, False otherwise.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.status_code < 400
    except requests.RequestException:
        return False


def build_dataset_url(dataset_name: str) -> Union[str, None]:
    """
    Build the Hugging Face Hub URL for a dataset, or return None if the
    dataset page does not exist.
    """
    url = f"https://huggingface.co/datasets/{dataset_name}"
    if url_exists(url):
        return url
    return None


def build_model_url(model_name: str) -> Union[str, None]:
    """
    Build the Hugging Face Hub URL for a model, or return None if the
    model page does not exist.
    """
    url = f"https://huggingface.co/{model_name}"
    if url_exists(url):
        return url
    return None


def build_text_icon(text: str, url: Union[str, None], icon_url: str) -> str:
    """
    Build an HTML snippet that renders `text` as a link prefixed with a small
    icon, or return the plain text if no URL is available.
    """
    if url is not None:
        return (
            f'<a href="{url}" target="_blank" style="text-decoration: none; color: inherit; display: inline-flex; align-items: center;">'
            f'<img src="{icon_url}" alt="{url}" style="display: inline-block; vertical-align: middle; margin-right: 4px;" width="16" height="16">'
            f'<span style="display: inline-block; vertical-align: middle;">{text}</span> </a>'
        )
    return text


def build_datasets_urls(datasets_names: List[str]) -> Dict[str, Union[str, None]]:
    """
    Build a dictionary of dataset URLs from a list of dataset names.

    Parameters:
    - datasets_names (List[str]): The list of dataset names.

    Returns:
    - Dict[str, Union[str, None]]: A mapping from each dataset name to its
        Hugging Face URL, or None if the dataset page does not exist.
    """
    return {dataset: build_dataset_url(dataset) for dataset in datasets_names}


def build_models_urls(models_names: List[str]) -> Dict[str, Union[str, None]]:
    """
    Build a dictionary of model URLs from a list of model names.

    Parameters:
    - models_names (List[str]): The list of model names.

    Returns:
    - Dict[str, Union[str, None]]: A mapping from each model name to its
        Hugging Face URL, or None if the model page does not exist.
    """
    return {model: build_model_url(model) for model in models_names}


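if __name__ == "__main__":
    # Minimal usage sketch (requires network access). The dataset and model
    # names below are only illustrative examples, not identifiers this module
    # depends on; swap in whatever entries your application uses.
    logging.basicConfig(level=logging.INFO)

    dataset_urls = build_datasets_urls(["squad"])
    model_urls = build_models_urls(["bert-base-uncased"])

    # Fetch favicons only for the pages that actually resolved.
    pages = [u for u in list(dataset_urls.values()) + list(model_urls.values()) if u]
    favicons = download_favicons(pages)

    # Render one linked, icon-prefixed label per entry.
    for name, page_url in {**dataset_urls, **model_urls}.items():
        icon = favicons.get(page_url, DEFAULT_FAVICON) if page_url else DEFAULT_FAVICON
        print(build_text_icon(name, page_url, icon))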