Iker committed on
Commit eba8a37 β€’ 1 Parent(s): 1751f3a

Initial commit

Files changed (7)
  1. README.md +9 -4
  2. app.py +181 -0
  3. contamination_report.csv +4 -0
  4. dataset.py +260 -0
  5. markdown.py +69 -0
  6. requirements.txt +7 -0
  7. utils.py +181 -0
README.md CHANGED
@@ -1,13 +1,18 @@
  ---
- title: Data Contamination Report
- emoji: πŸƒ
+ title: πŸπŸ’¨ Data Contamination Report
+ emoji: 🏭
  colorFrom: green
  colorTo: blue
  sdk: gradio
- sdk_version: 4.21.0
+ python_version: 3.10
+ sdk_version: 4.19.1
  app_file: app.py
+ app_port: 7860
+ fullWidth: true
  pinned: false
- license: apache-2.0
+ license: mit
+ suggested_hardware: cpu-upgrade
+
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,181 @@
+ import gradio as gr
+ import pandas as pd
+
+ from dataset import get_dataframe
+ from markdown import GUIDELINES, PANEL_MARKDOWN
+
+
+ df = get_dataframe()
+
+
+ def filter_dataframe(dataframe, eval_dataset, cont_source, checkboxes):
+     """
+     Filter the dataframe based on the provided evaluation dataset, contaminated source, and checkboxes.
+
+     Args:
+         dataframe (pandas.DataFrame): The input dataframe to filter.
+         eval_dataset (str): The evaluation dataset to filter by.
+         cont_source (str): The contaminated source to filter by.
+         checkboxes (list): The checkboxes to filter by.
+
+     Returns:
+         pandas.DataFrame: The filtered dataframe.
+     """
+     if isinstance(eval_dataset, str):
+         dataframe = dataframe[
+             dataframe["Evaluation Dataset"].str.contains(eval_dataset)
+         ]
+     if isinstance(cont_source, str):
+         dataframe = dataframe[
+             dataframe["Contaminated Source"].str.contains(cont_source)
+         ]
+     if isinstance(checkboxes, list) and "Exclude model-based evidences" in checkboxes:
+         dataframe = dataframe[dataframe["Approach"] != "model-based"]
+     if isinstance(checkboxes, list) and "Show only contaminated" in checkboxes:
+         dataframe = dataframe[
+             (dataframe["Train Split"] > 0.0)
+             | (dataframe["Development Split"] > 0.0)
+             | (dataframe["Test Split"] > 0.0)
+         ]
+
+     return dataframe
+
+
+ def filter_dataframe_corpus(*args, **kwargs) -> pd.DataFrame:
+     """
+     Filter the dataframe for corpus contamination.
+
+     Returns:
+         pandas.DataFrame: The filtered dataframe for corpus contamination.
+     """
+     # Keep rows in which the "Model or corpus" column equals "corpus"
+     filtered_df = df[df["Model or corpus"] == "corpus"]
+     filtered_df = filtered_df.drop(columns=["Model or corpus"])
+     return filter_dataframe(filtered_df, *args, **kwargs)
+
+
+ def filter_dataframe_model(*args, **kwargs) -> pd.DataFrame:
+     """
+     Filter the dataframe for model contamination.
+
+     Returns:
+         pandas.DataFrame: The filtered dataframe for model contamination.
+     """
+     # Keep rows in which the "Model or corpus" column equals "model"
+     filtered_df = df[df["Model or corpus"] == "model"]
+     filtered_df = filtered_df.drop(columns=["Model or corpus"])
+     return filter_dataframe(filtered_df, *args, **kwargs)
+
+
+ theme = gr.themes.Soft(
+     primary_hue="emerald",
+     secondary_hue="red",
+     text_size="sm",
+     spacing_size="sm",
+     font=[
+         gr.themes.GoogleFont("Poppins"),
+         gr.themes.GoogleFont("Poppins"),
+         gr.themes.GoogleFont("Poppins"),
+         gr.themes.GoogleFont("Poppins"),
+     ],
+ ).set(block_background_fill="*neutral_50", block_background_fill_dark="*neutral_950")
+
+
+ with gr.Blocks(
+     theme=theme,
+     title="πŸ’¨ Data Contamination Report",
+     analytics_enabled=False,
+ ) as demo:
+     gr.Markdown(PANEL_MARKDOWN)
+     with gr.Tab("Corpus contamination") as tab_corpus:
+         with gr.Row(variant="compact"):
+             with gr.Column():
+                 eval_dataset_corpus = gr.Textbox(
+                     placeholder="Evaluation dataset",
+                     label="Evaluation dataset",
+                     value="",
+                 )
+                 cont_corpora = gr.Textbox(
+                     placeholder="Pre-training corpora",
+                     label="Pre-training corpora",
+                     value="",
+                 )
+             with gr.Column():
+                 checkboxes_corpus = gr.CheckboxGroup(
+                     ["Exclude model-based evidences", "Show only contaminated"],
+                     label="Search options",
+                     value=[],
+                 )
+
+         filter_corpus_btn = gr.Button("Filter")
+
+         corpus_dataframe = gr.DataFrame(
+             value=filter_dataframe_corpus(
+                 eval_dataset_corpus, cont_corpora, checkboxes_corpus
+             ).style.format(precision=2),
+             headers=df.columns.to_list(),
+             datatype=[
+                 "markdown",
+                 "markdown",
+                 "number",
+                 "number",
+                 "number",
+                 "str",
+                 "markdown",
+                 "markdown",
+             ],
+         )
+
+     with gr.Tab("Model contamination") as tab_model:
+         with gr.Row(variant="compact"):
+             with gr.Column():
+                 eval_dataset_model = gr.Textbox(
+                     placeholder="Evaluation dataset",
+                     label="Evaluation dataset",
+                     value="",
+                 )
+                 cont_model = gr.Textbox(
+                     placeholder="Model", label="Model", value=""
+                 )
+             with gr.Column():
+                 checkboxes_model = gr.CheckboxGroup(
+                     ["Exclude model-based evidences", "Show only contaminated"],
+                     label="Search options",
+                     value=[],
+                 )
+
+         filter_model_btn = gr.Button("Filter")
+
+         model_dataframe = gr.DataFrame(
+             value=filter_dataframe_model(
+                 eval_dataset_model, cont_model, checkboxes_model
+             ),
+             headers=df.columns.to_list(),
+             datatype=[
+                 "markdown",
+                 "markdown",
+                 "number",
+                 "number",
+                 "number",
+                 "str",
+                 "markdown",
+                 "markdown",
+             ],
+         )
+
+     filter_corpus_btn.click(
+         filter_dataframe_corpus,
+         inputs=[eval_dataset_corpus, cont_corpora, checkboxes_corpus],
+         outputs=corpus_dataframe,
+     )
+     filter_model_btn.click(
+         filter_dataframe_model,
+         inputs=[eval_dataset_model, cont_model, checkboxes_model],
+         outputs=model_dataframe,
+     )
+
+     with gr.Tab("Guidelines") as tab_guidelines:
+         gr.Markdown(GUIDELINES)
+
+
+ demo.launch()
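For context on how the two Filter buttons behave, here is a minimal sketch (not part of the commit) of the `filter_dataframe` logic applied to a toy table; the column names come from contamination_report.csv, while the rows are invented for illustration:

```python
import pandas as pd

# Toy table with the same column names as contamination_report.csv; the rows are made up.
toy = pd.DataFrame(
    {
        "Evaluation Dataset": ["conll2003", "xnli"],
        "Contaminated Source": ["some/corpus", "some/model"],
        "Train Split": [1.0, 0.0],
        "Development Split": [1.0, 0.0],
        "Test Split": [1.0, 0.0],
        "Approach": ["data-based", "model-based"],
    }
)

# Same logic as filter_dataframe: substring match on the dataset name,
# then keep only rows where at least one split shows contamination.
out = toy[toy["Evaluation Dataset"].str.contains("conll")]
out = out[
    (out["Train Split"] > 0.0)
    | (out["Development Split"] > 0.0)
    | (out["Test Split"] > 0.0)
]
print(out)  # only the conll2003 row remains
```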
contamination_report.csv ADDED
@@ -0,0 +1,4 @@
+ Evaluation Dataset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Citation;PR Link
+ conll2003;google/gemma-7b;model;1.0;1.0;1.0;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;
+ conll2003;EleutherAI/the_pile_deduplicated;corpus;1.0;1.0;1.0;data-based;https://aclanthology.org/2023.findings-emnlp.722/;www.google.com
+ Test;lololol;corpus;1.0;1.0;1.0;data-based;https://arxiv.org/abs/2310.03668;
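The report is a semicolon-delimited CSV whose first row is the header. A short sketch (not part of the commit) of how it is parsed; dataset.py below uses the same `pd.read_csv` call:

```python
import pandas as pd

report = pd.read_csv("contamination_report.csv", delimiter=";", header=0)
print(report.columns.to_list())
# ['Evaluation Dataset', 'Contaminated Source', 'Model or corpus', 'Train Split',
#  'Development Split', 'Test Split', 'Approach', 'Citation', 'PR Link']
print(len(report))  # 3 data rows in this initial commit
```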
dataset.py ADDED
@@ -0,0 +1,260 @@
+ import json
+ import os
+
+ import filelock
+ import huggingface_hub
+ import pandas as pd
+
+ from utils import (
+     build_datasets_urls,
+     build_models_urls,
+     build_text_icon,
+     download_favicons,
+     get_base_url,
+     get_domain_name,
+ )
+
+
+ HF_ICON = "https://huggingface.co/front/assets/huggingface_logo.svg"
+ CROSS_ICON = "https://upload.wikimedia.org/wikipedia/commons/4/4e/Cross.png"
+
+ DISABLE_ONLINE_CACHE = False
+ ONLINE_CACHE = "CONDA-Workshop/RequestCache"
+
+
+ def save_cache(cache_data, cache_file, initial_timestamp):
+     print(f"Saving cache to {cache_file}")
+     # Acquire the lock before reading and updating the file to prevent race conditions
+     with filelock.FileLock(f"{cache_file}.lock"):
+         # Check if the file has been modified since the initial read
+         current_timestamp = (
+             os.path.getmtime(cache_file) if os.path.exists(cache_file) else None
+         )
+         if current_timestamp is None or initial_timestamp != current_timestamp:
+             # The file has been modified or created since the initial read, so re-read it
+             try:
+                 with open(cache_file, "r", encoding="utf8") as f:
+                     # Update the dictionary with newly added entries
+                     cache_dict = json.load(f)
+                     # Merge the on-disk entries if they differ from the in-memory data
+                     if cache_dict != cache_data:
+                         cache_data.update(cache_dict)
+
+             except FileNotFoundError:
+                 pass  # If the file doesn't exist at this point, continue with the current dictionary
+
+         # Write the updated dictionary back to the file
+         with open(cache_file, "w", encoding="utf8") as f:
+             json.dump(cache_data, f, ensure_ascii=False, indent=4)
+
+     if not DISABLE_ONLINE_CACHE:
+         try:
+             huggingface_hub.upload_file(
+                 repo_id=ONLINE_CACHE,
+                 repo_type="dataset",
+                 token=os.environ.get("TOKEN") or True,
+                 path_in_repo=cache_file,
+                 path_or_fileobj=cache_file,
+             )
+         except Exception as e:
+             print(f"Unable to upload {cache_file}: {e}")
+
+     return cache_data
+
+
+ def update_favicon_cache(sources):
+     # Load the favicon dictionary if it exists
+     favicon_dict = {}
+     favicon_file_path = "favicons.json"
+     initial_timestamp = None
+
+     if not DISABLE_ONLINE_CACHE:
+         try:
+             huggingface_hub.hf_hub_download(
+                 repo_id=ONLINE_CACHE,
+                 repo_type="dataset",
+                 token=os.environ.get("TOKEN") or True,
+                 filename=favicon_file_path,
+                 local_dir=os.getcwd(),
+             )
+         except Exception as e:
+             print(f"Unable to download favicons.json: {e}")
+
+     # Attempt to load the favicon dictionary and record its last modification time
+     if os.path.exists(favicon_file_path):
+         initial_timestamp = os.path.getmtime(favicon_file_path)
+         try:
+             with open(favicon_file_path, "r", encoding="utf8") as f:
+                 favicon_dict = json.load(f)
+         except FileNotFoundError:
+             pass  # File not found, proceed with an empty dictionary
+
+     # Determine which favicons need to be downloaded
+     missing_domains = [domain for domain in sources if domain not in favicon_dict]
+
+     # Download missing favicons in batch
+     if missing_domains:
+         new_favicon_urls = download_favicons(missing_domains)
+         favicon_dict.update(new_favicon_urls)
+         favicon_dict = save_cache(
+             cache_data=favicon_dict,
+             cache_file=favicon_file_path,
+             initial_timestamp=initial_timestamp,
+         )
+
+     return favicon_dict
+
+
+ def update_model_url_cache(models):
+     models = [x for x in models if x is not None]
+     models = list(set(models))
+
+     # Load the model url dictionary if it exists
+     model_url_dict = {}
+     model_url_file_path = "model_urls.json"
+     initial_timestamp = None
+
+     if not DISABLE_ONLINE_CACHE:
+         try:
+             huggingface_hub.hf_hub_download(
+                 repo_id=ONLINE_CACHE,
+                 repo_type="dataset",
+                 token=os.environ.get("TOKEN") or True,
+                 filename=model_url_file_path,
+                 local_dir=os.getcwd(),
+             )
+         except Exception as e:
+             print(f"Unable to download model_urls.json: {e}")
+
+     # Attempt to load the model url dictionary and record its last modification time
+     if os.path.exists(model_url_file_path):
+         initial_timestamp = os.path.getmtime(model_url_file_path)
+         try:
+             with open(model_url_file_path, "r", encoding="utf8") as f:
+                 model_url_dict = json.load(f)
+         except FileNotFoundError:
+             pass  # File not found, proceed with an empty dictionary
+
+     # Determine which model urls need to be downloaded
+     missing_model_urls = [model for model in models if model not in model_url_dict]
+
+     # Download missing model urls in batch
+     if missing_model_urls:
+         new_model_urls = build_models_urls(missing_model_urls)
+         model_url_dict.update(new_model_urls)
+         model_url_dict = save_cache(
+             cache_data=model_url_dict,
+             cache_file=model_url_file_path,
+             initial_timestamp=initial_timestamp,
+         )
+
+     return model_url_dict
+
+
+ def update_dataset_url_cache(datasets):
+     datasets = [x for x in datasets if x is not None]
+     datasets = list(set(datasets))
+
+     # Load the dataset url dictionary if it exists
+     dataset_url_dict = {}
+     dataset_url_file_path = "dataset_urls.json"
+     initial_timestamp = None
+
+     if not DISABLE_ONLINE_CACHE:
+         try:
+             huggingface_hub.hf_hub_download(
+                 repo_id=ONLINE_CACHE,
+                 repo_type="dataset",
+                 token=os.environ.get("TOKEN") or True,
+                 filename=dataset_url_file_path,
+                 local_dir=os.getcwd(),
+             )
+         except Exception as e:
+             print(f"Unable to download dataset_urls.json: {e}")
+
+     # Attempt to load the dataset url dictionary and record its last modification time
+     if os.path.exists(dataset_url_file_path):
+         initial_timestamp = os.path.getmtime(dataset_url_file_path)
+         try:
+             with open(dataset_url_file_path, "r", encoding="utf8") as f:
+                 dataset_url_dict = json.load(f)
+         except FileNotFoundError:
+             pass  # File not found, proceed with an empty dictionary
+
+     # Determine which dataset urls need to be downloaded
+     missing_dataset_urls = [
+         dataset for dataset in datasets if dataset not in dataset_url_dict
+     ]
+
+     # Download missing dataset urls in batch
+     if missing_dataset_urls:
+         new_dataset_urls = build_datasets_urls(missing_dataset_urls)
+         dataset_url_dict.update(new_dataset_urls)
+         dataset_url_dict = save_cache(
+             cache_data=dataset_url_dict,
+             cache_file=dataset_url_file_path,
+             initial_timestamp=initial_timestamp,
+         )
+
+     return dataset_url_dict
+
+
+ def get_dataframe():
+     # Load the contamination_report.csv file
+     data = pd.read_csv("contamination_report.csv", delimiter=";", header=0)
+
+     # Load the favicon dictionary if it exists
+     favicon_dict = {}
+
+     # Update the favicon dictionary
+     favicon_dict = update_favicon_cache([get_base_url(x) for x in data["Citation"]])
+
+     # Update the model url dictionary
+     model_url_dict = update_model_url_cache(
+         data[data["Model or corpus"] == "model"]["Contaminated Source"]
+     )
+
+     # Update the dataset url dictionary
+     dataset_url_dict = update_dataset_url_cache(
+         list(data["Evaluation Dataset"])
+         + list(data[data["Model or corpus"] == "corpus"]["Contaminated Source"])
+     )
+
+     # Add favicon URLs to the dataframe in a vectorized manner
+     data["Citation"] = data["Citation"].apply(
+         lambda x: build_text_icon(
+             text=get_domain_name(x),
+             url=x,
+             icon_url=favicon_dict.get(get_base_url(x), ""),
+         )
+     )
+
+     # NaN PR links fail the `x == x` check, so they fall back to "no link" and the cross icon
+     data["PR Link"] = data["PR Link"].apply(
+         lambda x: build_text_icon(
+             text="",
+             url=x if x == x else "no link",
+             icon_url=HF_ICON if x == x else CROSS_ICON,
+         )
+     )
+
+     data["Evaluation Dataset"] = data["Evaluation Dataset"].apply(
+         lambda x: build_text_icon(
+             text=x,
+             url=dataset_url_dict.get(x, ""),
+             icon_url=HF_ICON,
+         )
+     )
+
+     # For "Contaminated Source", link to a dataset URL when "Model or corpus" is "corpus"
+     # and to a model URL when it is "model"
+     data["Contaminated Source"] = data.apply(
+         lambda x: build_text_icon(
+             text=x["Contaminated Source"],
+             url=dataset_url_dict.get(x["Contaminated Source"], "")
+             if x["Model or corpus"] == "corpus"
+             else model_url_dict.get(x["Contaminated Source"], ""),
+             icon_url=HF_ICON,
+         ),
+         axis=1,
+     )
+
+     return data
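`get_dataframe` is the only function app.py calls; the rest of the module maintains three JSON caches (favicons.json, model_urls.json, dataset_urls.json) that can be mirrored to the CONDA-Workshop/RequestCache dataset. A rough local-usage sketch, assuming contamination_report.csv is in the working directory and network access is available:

```python
import dataset

# Skip the Hub download/upload of the JSON caches while experimenting locally.
dataset.DISABLE_ONLINE_CACHE = True

df = dataset.get_dataframe()
# Link columns now hold small HTML snippets produced by utils.build_text_icon,
# which gr.DataFrame renders as markdown cells.
print(df[["Evaluation Dataset", "Contaminated Source", "Approach"]].head())
```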
markdown.py ADDED
@@ -0,0 +1,69 @@
+ GUIDELINES = """
+ # Contribution Guidelines
+
+ The πŸ’¨Data Contamination Report is a community-driven project and we welcome contributions from everyone. The objective of this project is to provide a comprehensive list of data contamination cases, for both models and datasets. We aim to provide a tool for the community to avoid evaluating models on contaminated datasets. We also expect to generate a dataset that will help researchers develop algorithms to automatically detect contaminated datasets in the future.
+
+ If you wish to contribute to the project by reporting a data contamination case, please open a pull request in the [βœ‹Community Tab](https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Report/discussions). Your pull request should edit the [contamination_report.csv](https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Report/blob/main/contamination_report.csv) file and add a new row with the details of the contamination case. Please fill the following template with the details of the contamination case. ***Pull Requests that do not follow the template won't be accepted.***
+
+ # Template for reporting data contamination
+
+ ```markdown
+ ## What are you reporting:
+ - [ ] Evaluation dataset(s) found in a pre-training corpus. (e.g. COPA found in ThePile)
+ - [ ] Evaluation dataset(s) found in a pre-trained model. (e.g. FLAN T5 has been trained on ANLI)
+
+ **Evaluation dataset(s)**: Name(s) of the evaluation dataset(s). If available in the HuggingFace Hub please write the path (e.g. `uonlp/CulturaX`), otherwise provide a link to a paper, GitHub or dataset-card.
+
+ **Contaminated model(s)**: Name of the model(s) (if any) that have been contaminated with the evaluation dataset. If available in the HuggingFace Hub please list the corresponding paths (e.g. `allenai/OLMo-7B`).
+
+ **Contaminated corpora**: Name of the corpora used to pretrain models (if any) that have been contaminated with the evaluation dataset. If available in the HuggingFace Hub please write the path (e.g. `CohereForAI/aya_dataset`).
+
+ **Contaminated split(s)**: If the dataset has Train, Development and/or Test splits please report the contaminated split(s). You can report a percentage of the dataset contaminated.
+
+
+ ## Briefly describe your method to detect data contamination
+
+ - [ ] Data-based approach
+ - [ ] Model-based approach
+
+ Description of your method, 3-4 sentences. Evidence of data contamination (Read below):
+
+ #### Data-based approaches
+ Data-based approaches identify evidence of data contamination in a pre-training corpus by directly examining the dataset for instances of the evaluation data. This method involves algorithmically searching through a large pre-training dataset to find occurrences of the evaluation data. You should provide evidence of data contamination in the form: "dataset X appears in line N of corpus Y," "dataset X appears N times in corpus Y," or "N examples from dataset X appear in corpus Y."
+
+ #### Model-based approaches
+
+ Model-based approaches, on the other hand, use heuristic algorithms to infer the presence of data contamination in a pre-trained model. These methods do not directly analyze the data but instead assess the model's behavior to predict data contamination. Examples include prompting the model to reproduce elements of an evaluation dataset to demonstrate memorization (e.g. https://hitz-zentroa.github.io/lm-contamination/blog/), or using perplexity measures to estimate data contamination. You should provide evidence of data contamination in the form of evaluation results of the algorithm from research papers, screenshots of model outputs that demonstrate memorization of a pre-training dataset, or any other form of evaluation that substantiates the method's effectiveness in detecting data contamination. You can provide a confidence score for your predictions.
+
+ ## Citation
+
+ Is there a paper that reports the data contamination or describes the method used to detect data contamination?
+
+ URL: `https://aclanthology.org/2023.findings-emnlp.722/`
+ Citation: `@inproceedings{...`
+ ```
+ ---
+
+ ### How to update the contamination_report.csv file
+
+ The [contamination_report.csv](https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Report/blob/main/contamination_report.csv) file is a CSV file with `;` delimiters. You will need to update the following columns:
+ - Evaluation Dataset: Name of the contaminated evaluation dataset. If available in the HuggingFace Hub please write the path (e.g. `uonlp/CulturaX`), otherwise provide the name of the dataset.
+ - Contaminated Source: Name of the model that has been trained with the evaluation dataset, or name of the pre-training corpus that contains the evaluation dataset. If available in the HuggingFace Hub please write the path (e.g. `allenai/OLMo-7B`), otherwise provide the name of the model/corpus.
+ - Model or corpus: Whether the Contaminated Source is a `model` or a `corpus`.
+ - Train Split: Percentage of the train split contaminated. 0 means no contamination. 1 means that the split has been fully contaminated. If the dataset doesn't have splits, you can consider that the full dataset is a train or test split.
+ - Development Split: Percentage of the development split contaminated. 0 means no contamination. 1 means that the split has been fully contaminated.
+ - Test Split: Percentage of the test split contaminated. 0 means no contamination. 1 means that the split has been fully contaminated. If the dataset doesn't have splits, you can consider that the full dataset is a train or test split.
+ - Approach: data-based or model-based approach. See above for more information.
+ - Citation: If there is a paper or any other resource describing how you detected this contamination case, provide the URL.
+ - PR Link: Leave it blank; we will update it after you create the Pull Request.
+ """.strip()
+
+
+ PANEL_MARKDOWN = """
+ # Data Contamination Report
+ The πŸ’¨Data Contamination Report aims to track instances of data contamination in pre-trained models and corpora.
+ This effort is part of [The 1st Workshop on Data Contamination (CONDA)](https://conda-workshop.github.io/) that will be held at ACL 2024.
+ """.strip()
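To make the column description above concrete, here is a hedged sketch of appending a report row programmatically; every value is a placeholder, and real entries should be added through a Pull Request as the guidelines describe:

```python
import csv

new_row = {
    "Evaluation Dataset": "my_org/my_eval_dataset",       # placeholder
    "Contaminated Source": "my_org/my_pretrained_model",  # placeholder
    "Model or corpus": "model",
    "Train Split": 0.0,
    "Development Split": 0.0,
    "Test Split": 0.87,  # e.g. 87% of the test split found in the pre-training data
    "Approach": "data-based",
    "Citation": "https://example.org/my-paper",           # placeholder
    "PR Link": "",  # left blank; updated by the maintainers after the PR is created
}

with open("contamination_report.csv", "a", newline="", encoding="utf8") as f:
    writer = csv.DictWriter(f, fieldnames=list(new_row.keys()), delimiter=";")
    writer.writerow(new_row)
```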
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio
+ setuptools
+ filelock
+ pandas
+ beautifulsoup4
+ requests
+ huggingface_hub
utils.py ADDED
@@ -0,0 +1,181 @@
+ import logging
+ import re
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from typing import Dict, List, Union
+ from urllib.parse import urljoin, urlparse
+
+ import requests
+ from bs4 import BeautifulSoup
+
+
+ def get_base_url(url: str) -> str:
+     """
+     Extracts the base URL from a given URL.
+
+     Parameters:
+     - url (str): The URL to extract the base URL from.
+
+     Returns:
+     - str: The base URL.
+     """
+     parsed_url = urlparse(url)
+     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+     return base_url
+
+
+ def get_domain_name(url: str) -> str:
+     """
+     Get the domain name from a URL.
+
+     Args:
+         url (str): The URL.
+
+     Returns:
+         str: The domain name.
+     """
+
+     parsed_uri = urlparse(url)
+     domain = "{uri.netloc}".format(uri=parsed_uri)
+     if domain.startswith("www."):
+         domain = domain[4:]
+     # First letter in uppercase
+     return domain.capitalize()
+
+
+ def get_favicon(url: str) -> str:
+     headers = {
+         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
+     }
+     try:
+         response = requests.get(url, headers=headers, timeout=2)
+         if response.status_code == 200:
+             soup = BeautifulSoup(response.content, "html.parser")
+             # Search for all potential icons including meta tags
+             icon_links = soup.find_all(
+                 "link", rel=re.compile(r"(shortcut icon|icon|apple-touch-icon)", re.I)
+             )
+             meta_icons = soup.find_all(
+                 "meta", attrs={"content": re.compile(r".ico$", re.I)}
+             )
+             icons = icon_links + meta_icons
+
+             if icons:
+                 for icon in icons:
+                     favicon_url = icon.get("href") or icon.get("content")
+                     if favicon_url:
+                         if favicon_url.startswith("/"):
+                             favicon_url = urljoin(url, favicon_url)
+                         return favicon_url
+                 # Icons were found, but none has an href or content, so return the default icon
+                 return "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
+             else:
+                 # No icons found, return the default icon
+                 return "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
+         else:
+             # Response was not OK, return the default icon
+             return (
+                 "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
+             )
+     except requests.Timeout:
+         logging.warning(f"Request timed out for {url}")
+         return "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
+     except Exception as e:
+         logging.warning(f"An error occurred while fetching favicon for {url}: {e}")
+         return "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
+
+
+ def download_favicons(urls: List[str]) -> Dict[str, str]:
+     favicons = {}
+     urls = list(set(urls))
+     with ThreadPoolExecutor(max_workers=20) as executor:
+         future_to_url = {executor.submit(get_favicon, url): url for url in urls}
+         for future in as_completed(future_to_url):
+             url = future_to_url[future]
+             try:
+                 favicon_url = future.result()
+                 favicons[url] = favicon_url
+             except Exception as e:
+                 logging.warning(f"Failed to fetch favicon for {url}: {e}")
+                 favicons[url] = (
+                     "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
+                 )
+     return favicons
+
+
+ def url_exists(url):
+     """
+     Checks if a URL exists by making a HEAD request.
+
+     Parameters:
+     - url (str): The URL to check.
+
+     Returns:
+     - bool: True if the URL exists, False otherwise.
+     """
+     try:
+         response = requests.head(url, allow_redirects=True)
+         return response.status_code < 400
+     except requests.RequestException:
+         # In case of network problems, SSL errors, etc.
+         return False
+
+
+ def build_dataset_url(dataset_name: str):
+     """
+     Build the Hugging Face Hub URL for a dataset, returning None if it does not exist.
+     """
+     url = f"https://huggingface.co/datasets/{dataset_name}"
+     # Test if the url exists
+     if url_exists(url):
+         return url
+     else:
+         return None
+
+
+ def build_model_url(model_name: str):
+     """
+     Build the Hugging Face Hub URL for a model, returning None if it does not exist.
+     """
+     url = f"https://huggingface.co/{model_name}"
+     # Test if the url exists
+     if url_exists(url):
+         return url
+     else:
+         return None
+
+
+ def build_text_icon(text: str, url: Union[str, None], icon_url: str):
+     if url is not None:
+         return (
+             f'<a href="{url}" target="_blank" style="text-decoration: none; color: inherit; display: inline-flex; align-items: center;">'
+             f'<img src="{icon_url}" alt="{url}" style="display: inline-block; vertical-align: middle; margin-right: 4px;" width="16" height="16">'
+             f'<span style="display: inline-block; vertical-align: middle;">{text}</span> </a>'
+         )
+     else:
+         return text
+
+
+ def build_datasets_urls(datasets_names: List[str]) -> Dict[str, str]:
+     """
+     Build a dictionary of dataset URLs from a list of dataset names.
+
+     Parameters:
+     - datasets_names (List[str]): The list of dataset names.
+
+     Returns:
+     - Dict[str, str]: A dictionary of dataset URLs.
+     """
+     return {dataset: build_dataset_url(dataset) for dataset in datasets_names}
+
+
+ def build_models_urls(models_names: List[str]) -> Dict[str, str]:
+     """
+     Build a dictionary of model URLs from a list of model names.
+
+     Parameters:
+     - models_names (List[str]): The list of model names.
+
+     Returns:
+     - Dict[str, str]: A dictionary of model URLs.
+     """
+     return {model: build_model_url(model) for model in models_names}
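A small usage sketch (not part of the commit) tying the helpers together; it needs network access because `build_model_url` checks the URL with a HEAD request, and the model name is simply one of the entries in the report above:

```python
from utils import build_model_url, build_text_icon, get_domain_name

url = build_model_url("google/gemma-7b")  # the Hub URL if it exists, otherwise None
html = build_text_icon(
    text="google/gemma-7b",
    url=url,
    icon_url="https://huggingface.co/front/assets/huggingface_logo.svg",
)
# `html` is the <a><img><span> snippet that the report table renders as a markdown cell.

print(get_domain_name("https://www.example.org/paper"))  # -> "Example.org"
```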