import json
import os

import filelock
import huggingface_hub
import pandas as pd

from utils import (
    build_datasets_urls,
    build_models_urls,
    build_text_icon,
    download_favicons,
    get_base_url,
    get_domain_name,
)


HF_ICON = "https://huggingface.co/front/assets/huggingface_logo.svg"
CROSS_ICON = "https://upload.wikimedia.org/wikipedia/commons/4/4e/Cross.png"

DISABLE_ONLINE_CACHE = False
ONLINE_CACHE = "CONDA-Workshop/RequestCache"


def save_cache(cache_data, cache_file, initial_timestamp):
    """Merge `cache_data` with any concurrent on-disk updates and write it back to `cache_file`.

    `initial_timestamp` is the file's modification time recorded when the cache was
    first read; if the file has changed since then, it is re-read and merged before writing.
    """
    print(f"Saving cache to {cache_file}")
    # Acquire lock before reading and updating the file to prevent race conditions
    with filelock.FileLock(f"{cache_file}.lock"):
        # Check if the file has been modified since the initial read
        current_timestamp = (
            os.path.getmtime(cache_file) if os.path.exists(cache_file) else None
        )
        if current_timestamp is None or initial_timestamp != current_timestamp:
            # File has been modified or created since initial read, re-read the file
            try:
                with open(cache_file, "r", encoding="utf8") as f:
                    cache_dict = json.load(f)
                    # Merge entries added by other processes into the current dictionary
                    if cache_dict != cache_data:
                        cache_data.update(cache_dict)

            except FileNotFoundError:
                pass  # If the file doesn't exist at this point, continue with the current dictionary

        # Write the updated dictionary back to the file
        with open(cache_file, "w", encoding="utf8") as f:
            json.dump(cache_data, f, ensure_ascii=False, indent=4)

        if not DISABLE_ONLINE_CACHE:
            try:
                huggingface_hub.upload_file(
                    repo_id=ONLINE_CACHE,
                    repo_type="dataset",
                    token=os.environ.get("TOKEN") or True,
                    path_in_repo=cache_file,
                    path_or_fileobj=cache_file,
                )
            except Exception as e:
                print(f"Unable to upload {cache_file}: {e}")

    return cache_data


def update_favicon_cache(sources):
    """Return a {base_url: favicon_url} mapping, fetching any favicons missing from the cache."""
    favicon_dict = {}
    favicon_file_path = "favicons.json"
    initial_timestamp = None

    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=favicon_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download favicons.json: {e}")

    # Attempt to load the favicon dictionary and record its last modification time
    if os.path.exists(favicon_file_path):
        initial_timestamp = os.path.getmtime(favicon_file_path)
        try:
            with open(favicon_file_path, "r", encoding="utf8") as f:
                favicon_dict = json.load(f)
        except FileNotFoundError:
            pass  # File not found, proceed with an empty dictionary

    # Determine which favicons need to be downloaded
    missing_domains = [domain for domain in sources if domain not in favicon_dict]

    # Download missing favicons in batch
    if missing_domains:
        new_favicon_urls = download_favicons(missing_domains)
        favicon_dict.update(new_favicon_urls)
        favicon_dict = save_cache(
            cache_data=favicon_dict,
            cache_file=favicon_file_path,
            initial_timestamp=initial_timestamp,
        )

    return favicon_dict


def update_model_url_cache(models):
    """Return a {model_name: url} mapping, resolving any model URLs missing from the cache."""
    # Drop missing values and de-duplicate the model names
    models = [x for x in models if x is not None]
    models = list(set(models))

    model_url_dict = {}
    model_url_file_path = "model_urls.json"
    initial_timestamp = None

    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=model_url_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download model_urls.json: {e}")

    # Attempt to load the model url dictionary and record its last modification time
    if os.path.exists(model_url_file_path):
        initial_timestamp = os.path.getmtime(model_url_file_path)
        try:
            with open(model_url_file_path, "r", encoding="utf8") as f:
                model_url_dict = json.load(f)
        except FileNotFoundError:
            pass  # File not found, proceed with an empty dictionary

    # Determine which model urls need to be downloaded
    missing_model_urls = [model for model in models if model not in model_url_dict]

    # Download missing model urls in batch
    if missing_model_urls:
        new_model_urls = build_models_urls(missing_model_urls)
        model_url_dict.update(new_model_urls)
        model_url_dict = save_cache(
            cache_data=model_url_dict,
            cache_file=model_url_file_path,
            initial_timestamp=initial_timestamp,
        )

    return model_url_dict


def update_dataset_url_cache(datasets):
    """Return a {dataset_name: url} mapping, resolving any dataset URLs missing from the cache."""
    # Drop missing values and de-duplicate the dataset names
    datasets = [x for x in datasets if x is not None]
    datasets = list(set(datasets))

    dataset_url_dict = {}
    dataset_url_file_path = "dataset_urls.json"
    initial_timestamp = None

    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=dataset_url_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download dataset_urls.json: {e}")

    # Attempt to load the dataset url dictionary and record its last modification time
    if os.path.exists(dataset_url_file_path):
        initial_timestamp = os.path.getmtime(dataset_url_file_path)
        try:
            with open(dataset_url_file_path, "r", encoding="utf8") as f:
                dataset_url_dict = json.load(f)
        except FileNotFoundError:
            pass  # File not found, proceed with an empty dictionary

    # Determine which dataset urls need to be downloaded
    missing_dataset_urls = [
        dataset for dataset in datasets if dataset not in dataset_url_dict
    ]

    # Download missing dataset urls in batch
    if missing_dataset_urls:
        new_dataset_urls = build_datasets_urls(missing_dataset_urls)
        dataset_url_dict.update(new_dataset_urls)
        dataset_url_dict = save_cache(
            cache_data=dataset_url_dict,
            cache_file=dataset_url_file_path,
            initial_timestamp=initial_timestamp,
        )

    return dataset_url_dict


def get_dataframe():
    """Load contamination_report.csv and enrich it with links and icons for display."""
    # Load the contamination_report.csv file
    data = pd.read_csv("contamination_report.csv", delimiter=";", header=0)

    # Update the favicon cache for every reference domain
    favicon_dict = update_favicon_cache([get_base_url(x) for x in data["Reference"]])

    # Update the model url dictionary
    model_url_dict = update_model_url_cache(
        data[data["Model or corpus"] == "model"]["Contaminated Source"]
    )

    # Update the dataset url dictionary
    dataset_url_dict = update_dataset_url_cache(
        list(data["Evaluation Dataset"])
        + list(data[data["Model or corpus"] == "corpus"]["Contaminated Source"])
    )

    # Add favicon icons and links to the Reference column
    data["Reference"] = data["Reference"].apply(
        lambda x: build_text_icon(
            text=get_domain_name(x),
            url=x,
            icon_url=favicon_dict.get(get_base_url(x), ""),
        )
    )

    PR_URL_FORMAT = "https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Report/discussions/{}"
    data["PR"] = data["PR"].apply(
        lambda x: build_text_icon(
            text="",
            url=PR_URL_FORMAT.format(int(x)) if not pd.isna(x) else "no link",
            icon_url=HF_ICON if x == x else CROSS_ICON,
        )
    )

    data["Evaluation Dataset"] = data["Evaluation Dataset"].apply(
        lambda x: build_text_icon(
            text=x,
            url=dataset_url_dict.get(x, ""),
            icon_url=HF_ICON,
        )
    )

    data["Evaluation Dataset"] = data.apply(
        lambda x: x["Evaluation Dataset"] + f" ({x['Subset']})" if pd.notna(x["Subset"]) else x["Evaluation Dataset"],
        axis=1,
    )

    del data["Subset"]

    # For "Contaminated Source" use build_dataset_url if "Model or corpus" is "corpus" and build_model_url if "Model or corpus" is "model"
    data["Contaminated Source"] = data.apply(
        lambda x: build_text_icon(
            text=x["Contaminated Source"],
            url=dataset_url_dict.get(x["Contaminated Source"], "")
            if x["Model or corpus"] == "corpus"
            else model_url_dict.get(x["Contaminated Source"], ""),
            icon_url=HF_ICON,
        ),
        axis=1,
    )

    data["Train Split"] = data["Train Split"].apply(lambda x: x/100 if x else x)
    data["Development Split"] = data["Development Split"].apply(lambda x: x/100 if x else x)
    data["Test Split"] = data["Test Split"].apply(lambda x: x/100 if x else x)

    return data
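

# A minimal usage sketch (not part of the original script), assuming
# contamination_report.csv and the cache files are available locally:
# it builds the enriched table and prints the first few rows for a quick check.
if __name__ == "__main__":
    df = get_dataframe()
    print(df.head())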