Small changes

Files changed:
- app.py +3 -1
- contamination_report.csv +1 -4
- dataset.py +14 -3
- utils.py +3 -0
app.py CHANGED

@@ -38,8 +38,10 @@ def filter_dataframe(dataframe, eval_dataset, cont_source, checkboxes):
         (dataframe["Test Split"] > 0.0)
     ]
 
+    dataframe = dataframe.sort_values("Test Split", ascending=False)
+
     return dataframe.style.format(
-        {"Train Split": "{:.1%}", "Development Split": "{:.1%}", "Test Split": "{:.1%}"}
+        {"Train Split": "{:.1%}", "Development Split": "{:.1%}", "Test Split": "{:.1%}"}, na_rep="Unknown"
     )
 
 
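The table is now sorted by "Test Split" in descending order before styling, and missing split values render as "Unknown" instead of "nan". A minimal sketch of the effect on a made-up frame (only the column names and the formatter dict come from the diff; the rows are invented):

import pandas as pd

# Invented rows, same columns as the leaderboard
df = pd.DataFrame(
    {
        "Train Split": [0.10, None, 0.05],
        "Development Split": [0.02, 0.50, None],
        "Test Split": [0.01, 1.00, 0.30],
    }
)

# Sort by "Test Split" (descending) before styling, as in the new code
df = df.sort_values("Test Split", ascending=False)

# na_rep="Unknown" makes NaN cells render as "Unknown" rather than "nan"
styler = df.style.format(
    {"Train Split": "{:.1%}", "Development Split": "{:.1%}", "Test Split": "{:.1%}"},
    na_rep="Unknown",
)
print(styler.to_html())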
contamination_report.csv CHANGED

@@ -1,4 +1 @@
-Evaluation Dataset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;
-conll2003;google/gemma-7b;model;1.0;1.0;1.0;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;
-conll2003;EleutherAI/the_pile_deduplicated;corpus;1.0;1.0;1.0;data-based;https://aclanthology.org/2023.findings-emnlp.722/;www.google.com
-Test;lololol;corpus;1.0;1.0;1.0;data-based;https://arxiv.org/abs/2310.03668;
+Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
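The report now carries explicit Subset, Reference, and PR columns, and the example rows are removed. How the file is loaded is not part of this diff, so the read_csv call below is only an assumption about a semicolon-separated file with this header:

import pandas as pd

# Assumed loader: a ";"-separated file whose header matches the new schema
data = pd.read_csv("contamination_report.csv", sep=";")
expected = [
    "Evaluation Dataset", "Subset", "Contaminated Source", "Model or corpus",
    "Train Split", "Development Split", "Test Split", "Approach", "Reference", "PR",
]
assert list(data.columns) == expected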
dataset.py CHANGED

@@ -207,7 +207,7 @@ def get_dataframe():
     favicon_dict = {}
 
     # Update the favicon dictionary
-    favicon_dict = update_favicon_cache([get_base_url(x) for x in data["
+    favicon_dict = update_favicon_cache([get_base_url(x) for x in data["Reference"]])
 
     # Update the model url dictionary
     model_url_dict = update_model_url_cache(
@@ -221,7 +221,7 @@ def get_dataframe():
     )
 
     # Add favicons URLs to the dataframe in a vectorized manner
-    data["
+    data["Reference"] = data["Reference"].apply(
         lambda x: build_text_icon(
             text=get_domain_name(x),
             url=x,
@@ -229,7 +229,7 @@ def get_dataframe():
         )
     )
 
-    data["PR
+    data["PR"] = data["PR"].apply(
         lambda x: build_text_icon(
             text="",
             url=x if x == x else "no link",
@@ -245,6 +245,13 @@ def get_dataframe():
         )
     )
 
+    data["Evaluation Dataset"] = data.apply(
+        lambda x: x["Evaluation Dataset"] + f" ({x['Subset']})" if pd.notna(x["Subset"]) else x["Evaluation Dataset"],
+        axis=1,
+    )
+
+    del data["Subset"]
+
     # For "Contaminated Source" use build_dataset_url if "Model or corpus" is "corpus" and build_model_url if "Model or corpus" is "model"
     data["Contaminated Source"] = data.apply(
         lambda x: build_text_icon(
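The new block appends the subset name, in parentheses, to "Evaluation Dataset" whenever "Subset" is present, then drops the helper column. A toy check of that logic (the two rows are made up; the apply call is the one from the diff):

import pandas as pd

data = pd.DataFrame(
    {"Evaluation Dataset": ["conll2003", "glue"], "Subset": [None, "mnli"]}
)

# Append " (subset)" only when the Subset cell is not NaN/None
data["Evaluation Dataset"] = data.apply(
    lambda x: x["Evaluation Dataset"] + f" ({x['Subset']})" if pd.notna(x["Subset"]) else x["Evaluation Dataset"],
    axis=1,
)
del data["Subset"]

print(data["Evaluation Dataset"].tolist())  # ['conll2003', 'glue (mnli)']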
@@ -257,4 +264,8 @@ def get_dataframe():
         axis=1,
     )
 
+    data["Train Split"] = data["Train Split"].apply(lambda x: x/100 if x else x)
+    data["Development Split"] = data["Development Split"].apply(lambda x: x/100 if x else x)
+    data["Test Split"] = data["Test Split"].apply(lambda x: x/100 if x else x)
+
     return data
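These three lines rescale the splits to fractions, presumably because the new report stores them as 0-100 percentages, so that app.py's "{:.1%}" formatter displays them correctly; the "if x" guard passes falsy values (0, None) through untouched, and NaN divided by 100 is still NaN. A small check on invented values:

import pandas as pd

data = pd.DataFrame({"Train Split": [100.0, 12.5, 0.0, None]})
# Same transformation as the diff: percentages -> fractions, falsy values untouched
data["Train Split"] = data["Train Split"].apply(lambda x: x / 100 if x else x)

print(data["Train Split"].tolist())  # [1.0, 0.125, 0.0, nan]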
utils.py CHANGED

@@ -38,6 +38,9 @@ def get_domain_name(url: str) -> str:
     domain = "{uri.netloc}".format(uri=parsed_uri)
     if domain.startswith("www."):
         domain = domain[4:]
+
+    # Remove last domain
+    domain = ".".join(domain.split(".")[:-1])
     # First latter in uppercase
     return domain.capitalize()
 
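The added line strips the last dot-separated component (the TLD) before capitalizing, so e.g. aclanthology.org is shown as "Aclanthology" rather than "Aclanthology.org". A standalone sketch of the modified function; the urlparse setup is assumed, since the hunk only shows the body from line 38 onward:

from urllib.parse import urlparse


def get_domain_name(url: str) -> str:
    # Assumed setup; not shown in the hunk
    parsed_uri = urlparse(url)
    domain = "{uri.netloc}".format(uri=parsed_uri)
    if domain.startswith("www."):
        domain = domain[4:]

    # Drop the last dot-separated component (the TLD)
    domain = ".".join(domain.split(".")[:-1])
    # First letter in uppercase
    return domain.capitalize()


print(get_domain_name("https://aclanthology.org/2023.findings-emnlp.722/"))  # Aclanthology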