import json
import os

import filelock
import huggingface_hub
import pandas as pd

from utils import (
    build_datasets_urls,
    build_models_urls,
    build_text_icon,
    download_favicons,
    get_base_url,
    get_domain_name,
)


HF_ICON = "https://huggingface.co/front/assets/huggingface_logo.svg"
CROSS_ICON = "https://upload.wikimedia.org/wikipedia/commons/4/4e/Cross.png"

DISABLE_ONLINE_CACHE = False
ONLINE_CACHE = "CONDA-Workshop/RequestCache"
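
# Summary of the caching scheme implemented below: each helper keeps a small JSON
# cache (favicons.json, model_urls.json, dataset_urls.json) next to this script and,
# unless DISABLE_ONLINE_CACHE is set, syncs it with the ONLINE_CACHE dataset repo on
# the Hugging Face Hub. Uploads and downloads authenticate with the TOKEN environment
# variable when it is set, and otherwise fall back to the locally saved Hub token
# (token=True).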


def save_cache(cache_data, cache_file, initial_timestamp):
    """Merge ``cache_data`` with the on-disk cache, write it back and mirror it online."""
    print(f"Saving cache to {cache_file}")

    with filelock.FileLock(f"{cache_file}.lock"):
        # If another process changed the file since we first read it, merge its
        # entries into ours before overwriting.
        current_timestamp = (
            os.path.getmtime(cache_file) if os.path.exists(cache_file) else None
        )
        if current_timestamp is None or initial_timestamp != current_timestamp:
            try:
                with open(cache_file, "r", encoding="utf8") as f:
                    cache_dict = json.load(f)
                    if cache_dict != cache_data:
                        cache_data.update(cache_dict)
            except FileNotFoundError:
                pass

        with open(cache_file, "w", encoding="utf8") as f:
            json.dump(cache_data, f, ensure_ascii=False, indent=4)

        if not DISABLE_ONLINE_CACHE:
            try:
                huggingface_hub.upload_file(
                    repo_id=ONLINE_CACHE,
                    repo_type="dataset",
                    token=os.environ.get("TOKEN") or True,
                    path_in_repo=cache_file,
                    path_or_fileobj=cache_file,
                )
            except Exception as e:
                print(f"Unable to upload {cache_file}: {e}")

    return cache_data


def update_favicon_cache(sources):
    """Return a {base_url: favicon_url} mapping, fetching favicons missing from the cache."""
    favicon_dict = {}
    favicon_file_path = "favicons.json"
    initial_timestamp = None

    # Try to refresh the local cache from the online copy first.
    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=favicon_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download favicons.json: {e}")

    if os.path.exists(favicon_file_path):
        initial_timestamp = os.path.getmtime(favicon_file_path)
        try:
            with open(favicon_file_path, "r", encoding="utf8") as f:
                favicon_dict = json.load(f)
        except FileNotFoundError:
            pass

    missing_domains = [domain for domain in sources if domain not in favicon_dict]

    if missing_domains:
        new_favicon_urls = download_favicons(missing_domains)
        favicon_dict.update(new_favicon_urls)
        favicon_dict = save_cache(
            cache_data=favicon_dict,
            cache_file=favicon_file_path,
            initial_timestamp=initial_timestamp,
        )

    return favicon_dict


def update_model_url_cache(models):
    """Return a {model_name: url} mapping, resolving URLs missing from the cache."""
    models = [x for x in models if x is not None]
    models = list(set(models))

    model_url_dict = {}
    model_url_file_path = "model_urls.json"
    initial_timestamp = None

    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=model_url_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download model_urls.json: {e}")

    if os.path.exists(model_url_file_path):
        initial_timestamp = os.path.getmtime(model_url_file_path)
        try:
            with open(model_url_file_path, "r", encoding="utf8") as f:
                model_url_dict = json.load(f)
        except FileNotFoundError:
            pass

    missing_model_urls = [model for model in models if model not in model_url_dict]

    if missing_model_urls:
        new_model_urls = build_models_urls(missing_model_urls)
        model_url_dict.update(new_model_urls)
        model_url_dict = save_cache(
            cache_data=model_url_dict,
            cache_file=model_url_file_path,
            initial_timestamp=initial_timestamp,
        )

    return model_url_dict


def update_dataset_url_cache(datasets):
    """Return a {dataset_name: url} mapping, resolving URLs missing from the cache."""
    datasets = [x for x in datasets if x is not None]
    datasets = list(set(datasets))

    dataset_url_dict = {}
    dataset_url_file_path = "dataset_urls.json"
    initial_timestamp = None

    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=dataset_url_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download dataset_urls.json: {e}")

    if os.path.exists(dataset_url_file_path):
        initial_timestamp = os.path.getmtime(dataset_url_file_path)
        try:
            with open(dataset_url_file_path, "r", encoding="utf8") as f:
                dataset_url_dict = json.load(f)
        except FileNotFoundError:
            pass

    missing_dataset_urls = [
        dataset for dataset in datasets if dataset not in dataset_url_dict
    ]

    if missing_dataset_urls:
        new_dataset_urls = build_datasets_urls(missing_dataset_urls)
        dataset_url_dict.update(new_dataset_urls)
        dataset_url_dict = save_cache(
            cache_data=dataset_url_dict,
            cache_file=dataset_url_file_path,
            initial_timestamp=initial_timestamp,
        )

    return dataset_url_dict


def get_dataframe():
    """Load the contamination report and decorate it with links and icons for display."""
    # The report is a semicolon-separated CSV with one row per contamination event.
    data = pd.read_csv("contamination_report.csv", delimiter=";", header=0)

    # Refresh the favicon cache for every reference domain.
    favicon_dict = update_favicon_cache([get_base_url(x) for x in data["Reference"]])

    # Refresh the URL caches for contaminated models and for datasets
    # (evaluation datasets plus contaminated corpora).
    model_url_dict = update_model_url_cache(
        data[data["Model or corpus"] == "model"]["Contaminated Source"]
    )
    dataset_url_dict = update_dataset_url_cache(
        list(data["Evaluation Dataset"])
        + list(data[data["Model or corpus"] == "corpus"]["Contaminated Source"])
    )

    data["Reference"] = data["Reference"].apply(
        lambda x: build_text_icon(
            text=get_domain_name(x),
            url=x,
            icon_url=favicon_dict.get(get_base_url(x), ""),
        )
    )

    PR_URL_FORMAT = "https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Report/discussions/{}"
    data["PR"] = data["PR"].apply(
        lambda x: build_text_icon(
            text="",
            url=PR_URL_FORMAT.format(int(x)) if not pd.isna(x) else "no link",
            icon_url=HF_ICON if x == x else CROSS_ICON,  # x == x is False only for NaN
        )
    )

    data["Evaluation Dataset"] = data["Evaluation Dataset"].apply(
        lambda x: build_text_icon(
            text=x,
            url=dataset_url_dict.get(x, ""),
            icon_url=HF_ICON,
        )
    )

    # Append the subset name to the dataset label when present, then drop the column.
    data["Evaluation Dataset"] = data.apply(
        lambda x: x["Evaluation Dataset"] + f" ({x['Subset']})"
        if pd.notna(x["Subset"])
        else x["Evaluation Dataset"],
        axis=1,
    )
    del data["Subset"]

    data["Contaminated Source"] = data.apply(
        lambda x: build_text_icon(
            text=x["Contaminated Source"],
            url=dataset_url_dict.get(x["Contaminated Source"], "")
            if x["Model or corpus"] == "corpus"
            else model_url_dict.get(x["Contaminated Source"], ""),
            icon_url=HF_ICON,
        ),
        axis=1,
    )

    # Convert contamination percentages to fractions; zero (falsy) values pass through.
    data["Train Split"] = data["Train Split"].apply(lambda x: x / 100 if x else x)
    data["Development Split"] = data["Development Split"].apply(
        lambda x: x / 100 if x else x
    )
    data["Test Split"] = data["Test Split"].apply(lambda x: x / 100 if x else x)

    return data
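

if __name__ == "__main__":
    # Minimal manual-test sketch, not part of the deployed Space: it assumes
    # contamination_report.csv and utils.py sit next to this file and that network
    # access is available for the favicon/URL caches. It simply builds the table
    # once so the output can be inspected locally.
    df = get_dataframe()
    print(df.head())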