Spaces:

CONDA-Workshop
/

Data-Contamination-Database

Sleeping

File size: 9,638 Bytes

import json
import os

import filelock
import huggingface_hub
import pandas as pd

from utils import (
    build_datasets_urls,
    build_models_urls,
    build_text_icon,
    download_favicons,
    get_base_url,
    get_domain_name,
)


# Icon shown next to Hugging Face links (PR, dataset and source cells).
HF_ICON = "https://huggingface.co/front/assets/huggingface_logo.svg"
# Icon shown in the PR cell when a row has no PR number.
CROSS_ICON = "https://upload.wikimedia.org/wikipedia/commons/4/4e/Cross.png"

# When False, the JSON cache files are downloaded from / uploaded to the
# ONLINE_CACHE repository in addition to being kept on local disk.
DISABLE_ONLINE_CACHE = False
# Hub repo id of the shared cache; accessed with repo_type="dataset".
ONLINE_CACHE = "CONDA-Workshop/RequestCache"


def save_cache(cache_data, cache_file, initial_timestamp):
    """Merge ``cache_data`` with the on-disk cache, persist it and upload it.

    The whole read-merge-write-upload sequence runs under a file lock
    (``<cache_file>.lock``) so concurrent savers cannot interleave.

    Args:
        cache_data: Dictionary to persist. It is updated in place with the
            entries found on disk when the file changed since it was read.
        cache_file: Path of the JSON cache file; also used as the path inside
            the online cache repository.
        initial_timestamp: Modification time of ``cache_file`` observed when
            the caller loaded it, or ``None`` if the file did not exist.

    Returns:
        The (possibly merged) ``cache_data`` dictionary.
    """
    print(f"Saving cache to {cache_file}")
    with filelock.FileLock(f"{cache_file}.lock"):
        current_timestamp = (
            os.path.getmtime(cache_file) if os.path.exists(cache_file) else None
        )
        if current_timestamp is None or initial_timestamp != current_timestamp:
            # The file was created or modified after the caller read it:
            # pull in whatever is on disk so those entries are not lost.
            try:
                with open(cache_file, "r", encoding="utf8") as f:
                    cache_dict = json.load(f)
            except FileNotFoundError:
                pass  # Nothing on disk yet; keep the in-memory dictionary
            except ValueError as e:
                # json.JSONDecodeError / UnicodeDecodeError: the file is
                # unreadable. Skip the merge so the write below replaces it
                # instead of aborting the save.
                print(f"Ignoring unreadable cache file {cache_file}: {e}")
            else:
                # On-disk values take precedence for keys present in both.
                cache_data.update(cache_dict)

        # Write the merged dictionary back to the file
        with open(cache_file, "w", encoding="utf8") as f:
            json.dump(cache_data, f, ensure_ascii=False, indent=4)

        if not DISABLE_ONLINE_CACHE:
            # Best effort: a failed upload must not lose the local cache.
            try:
                huggingface_hub.upload_file(
                    repo_id=ONLINE_CACHE,
                    repo_type="dataset",
                    token=os.environ.get("TOKEN") or True,
                    path_in_repo=cache_file,
                    path_or_fileobj=cache_file,
                )
            except Exception as e:
                print(f"Unable to upload {cache_file}: {e}")

    return cache_data


def update_favicon_cache(sources):
    """Return the favicon cache, fetching entries for domains not yet cached.

    Unless ``DISABLE_ONLINE_CACHE`` is set, ``favicons.json`` is first
    downloaded from the online cache repository into the current working
    directory. Missing domains are then resolved with ``download_favicons``
    and the result is persisted through ``save_cache``.

    Args:
        sources: Iterable of hashable domain keys to look up.

    Returns:
        Dictionary loaded from ``favicons.json`` extended with the newly
        downloaded entries.
    """
    favicon_file_path = "favicons.json"

    if not DISABLE_ONLINE_CACHE:
        # Best effort: fall back to the local copy (if any) on failure.
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=favicon_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download favicons.json: {e}")

    # Load the cache and remember its modification time so that save_cache
    # can detect concurrent changes.
    favicon_dict = {}
    initial_timestamp = None
    try:
        initial_timestamp = os.path.getmtime(favicon_file_path)
        with open(favicon_file_path, "r", encoding="utf8") as f:
            favicon_dict = json.load(f)
    except FileNotFoundError:
        pass  # No cache yet, proceed with an empty dictionary
    except ValueError as e:
        # Unreadable/corrupt JSON: start over instead of crashing.
        print(f"Ignoring unreadable cache file {favicon_file_path}: {e}")

    # De-duplicate (order preserved) so each domain is requested only once.
    missing_domains = list(
        dict.fromkeys(domain for domain in sources if domain not in favicon_dict)
    )

    # Download missing favicons in batch
    if missing_domains:
        favicon_dict.update(download_favicons(missing_domains))
        favicon_dict = save_cache(
            cache_data=favicon_dict,
            cache_file=favicon_file_path,
            initial_timestamp=initial_timestamp,
        )

    return favicon_dict


def update_model_url_cache(models):
    """Return the model-URL cache, building URLs for models not yet cached.

    Unless ``DISABLE_ONLINE_CACHE`` is set, ``model_urls.json`` is first
    downloaded from the online cache repository into the current working
    directory. Missing models are then resolved with ``build_models_urls``
    and the result is persisted through ``save_cache``.

    Args:
        models: Iterable of model names; missing values (``None`` / NaN) and
            duplicates are dropped.

    Returns:
        Dictionary loaded from ``model_urls.json`` extended with the newly
        built entries.
    """
    # pd.notna drops None as well as the NaN pandas uses for empty cells,
    # which a plain ``is not None`` test lets through.
    models = list({x for x in models if pd.notna(x)})

    model_url_file_path = "model_urls.json"

    if not DISABLE_ONLINE_CACHE:
        # Best effort: fall back to the local copy (if any) on failure.
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=model_url_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download model_urls.json: {e}")

    # Load the cache and remember its modification time so that save_cache
    # can detect concurrent changes.
    model_url_dict = {}
    initial_timestamp = None
    try:
        initial_timestamp = os.path.getmtime(model_url_file_path)
        with open(model_url_file_path, "r", encoding="utf8") as f:
            model_url_dict = json.load(f)
    except FileNotFoundError:
        pass  # No cache yet, proceed with an empty dictionary
    except ValueError as e:
        # Unreadable/corrupt JSON: start over instead of crashing.
        print(f"Ignoring unreadable cache file {model_url_file_path}: {e}")

    # Determine which model urls still need to be built
    missing_model_urls = [model for model in models if model not in model_url_dict]

    # Build missing model urls in batch
    if missing_model_urls:
        model_url_dict.update(build_models_urls(missing_model_urls))
        model_url_dict = save_cache(
            cache_data=model_url_dict,
            cache_file=model_url_file_path,
            initial_timestamp=initial_timestamp,
        )

    return model_url_dict


def update_dataset_url_cache(datasets):
    """Return the dataset-URL cache, building URLs for datasets not yet cached.

    Unless ``DISABLE_ONLINE_CACHE`` is set, ``dataset_urls.json`` is first
    downloaded from the online cache repository into the current working
    directory. Missing datasets are then resolved with ``build_datasets_urls``
    and the result is persisted through ``save_cache``.

    Args:
        datasets: Iterable of dataset names; missing values (``None`` / NaN)
            and duplicates are dropped.

    Returns:
        Dictionary loaded from ``dataset_urls.json`` extended with the newly
        built entries.
    """
    # pd.notna drops None as well as the NaN pandas uses for empty cells,
    # which a plain ``is not None`` test lets through.
    datasets = list({x for x in datasets if pd.notna(x)})

    dataset_url_file_path = "dataset_urls.json"

    if not DISABLE_ONLINE_CACHE:
        # Best effort: fall back to the local copy (if any) on failure.
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=dataset_url_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download dataset_urls.json: {e}")

    # Load the cache and remember its modification time so that save_cache
    # can detect concurrent changes.
    dataset_url_dict = {}
    initial_timestamp = None
    try:
        initial_timestamp = os.path.getmtime(dataset_url_file_path)
        with open(dataset_url_file_path, "r", encoding="utf8") as f:
            dataset_url_dict = json.load(f)
    except FileNotFoundError:
        pass  # No cache yet, proceed with an empty dictionary
    except ValueError as e:
        # Unreadable/corrupt JSON: start over instead of crashing.
        print(f"Ignoring unreadable cache file {dataset_url_file_path}: {e}")

    # Determine which dataset urls still need to be built
    missing_dataset_urls = [
        dataset for dataset in datasets if dataset not in dataset_url_dict
    ]

    # Build missing dataset urls in batch
    if missing_dataset_urls:
        dataset_url_dict.update(build_datasets_urls(missing_dataset_urls))
        dataset_url_dict = save_cache(
            cache_data=dataset_url_dict,
            cache_file=dataset_url_file_path,
            initial_timestamp=initial_timestamp,
        )

    return dataset_url_dict


def get_dataframe():
    """Load ``contamination_report.csv`` and decorate it for display.

    The favicon, model-URL and dataset-URL caches are refreshed first. Then:

    * ``Reference``, ``PR``, ``Evaluation Dataset`` and ``Contaminated Source``
      cells are replaced by the output of ``build_text_icon``;
    * a non-missing ``Subset`` is appended to ``Evaluation Dataset`` as
      `` (subset)`` and the ``Subset`` column is removed;
    * the three split columns are divided by 100 (presumably converting
      percentages to fractions -- confirm against the consumer).

    Returns:
        The transformed ``pandas.DataFrame``.
    """
    # The report is a semicolon-separated CSV with a header row.
    data = pd.read_csv("contamination_report.csv", delimiter=";", header=0)

    # Refresh the caches used to resolve icons and links.
    favicon_dict = update_favicon_cache([get_base_url(x) for x in data["Reference"]])

    is_model = data["Model or corpus"] == "model"
    is_corpus = data["Model or corpus"] == "corpus"
    model_url_dict = update_model_url_cache(data[is_model]["Contaminated Source"])
    # Dataset URLs are needed both for evaluation datasets and corpus sources.
    dataset_url_dict = update_dataset_url_cache(
        list(data["Evaluation Dataset"]) + list(data[is_corpus]["Contaminated Source"])
    )

    # Reference: domain name linked to the reference, with the site favicon.
    data["Reference"] = data["Reference"].apply(
        lambda x: build_text_icon(
            text=get_domain_name(x),
            url=x,
            icon_url=favicon_dict.get(get_base_url(x), ""),
        )
    )

    pr_url_format = "https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Report/discussions/{}"

    def _pr_cell(pr):
        # One missing-value check drives both the URL and the icon so the two
        # can never disagree.
        if pd.isna(pr):
            return build_text_icon(text="", url="no link", icon_url=CROSS_ICON)
        return build_text_icon(
            text="", url=pr_url_format.format(int(pr)), icon_url=HF_ICON
        )

    data["PR"] = data["PR"].apply(_pr_cell)

    data["Evaluation Dataset"] = data["Evaluation Dataset"].apply(
        lambda x: build_text_icon(
            text=x,
            url=dataset_url_dict.get(x, ""),
            icon_url=HF_ICON,
        )
    )

    # Fold the subset name into the dataset cell, then drop the column.
    data["Evaluation Dataset"] = data.apply(
        lambda row: (
            row["Evaluation Dataset"] + f" ({row['Subset']})"
            if pd.notna(row["Subset"])
            else row["Evaluation Dataset"]
        ),
        axis=1,
    )
    del data["Subset"]

    # "Contaminated Source" links to a dataset URL for corpus rows and to a
    # model URL for every other row.
    data["Contaminated Source"] = data.apply(
        lambda row: build_text_icon(
            text=row["Contaminated Source"],
            url=dataset_url_dict.get(row["Contaminated Source"], "")
            if row["Model or corpus"] == "corpus"
            else model_url_dict.get(row["Contaminated Source"], ""),
            icon_url=HF_ICON,
        ),
        axis=1,
    )

    # Falsy values (e.g. 0) are passed through unchanged; NaN stays NaN.
    for column in ("Train Split", "Development Split", "Test Split"):
        data[column] = data[column].apply(lambda x: x / 100 if x else x)

    return data