Small changes

Files changed:
- app.py +3 -1
- contamination_report.csv +1 -4
- dataset.py +14 -3
- utils.py +3 -0
app.py CHANGED

@@ -38,8 +38,10 @@ def filter_dataframe(dataframe, eval_dataset, cont_source, checkboxes):
         (dataframe["Test Split"] > 0.0)
     ]
 
+    dataframe = dataframe.sort_values("Test Split", ascending=False)
+
     return dataframe.style.format(
-        {"Train Split": "{:.1%}", "Development Split": "{:.1%}", "Test Split": "{:.1%}"}
+        {"Train Split": "{:.1%}", "Development Split": "{:.1%}", "Test Split": "{:.1%}"}, na_rep="Unknown"
     )
 
 
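The table is now sorted by "Test Split" in descending order before styling, and missing split values render as "Unknown" instead of "nan". A minimal sketch of the effect on a made-up frame (only the column names and the formatter dict come from the diff; the rows are invented):

import pandas as pd

# Invented rows, same columns as the leaderboard
df = pd.DataFrame(
    {
        "Train Split": [0.10, None, 0.05],
        "Development Split": [0.02, 0.50, None],
        "Test Split": [0.01, 1.00, 0.30],
    }
)

# Sort by "Test Split" (descending) before styling, as in the new code
df = df.sort_values("Test Split", ascending=False)

# na_rep="Unknown" makes NaN cells render as "Unknown" rather than "nan"
styler = df.style.format(
    {"Train Split": "{:.1%}", "Development Split": "{:.1%}", "Test Split": "{:.1%}"},
    na_rep="Unknown",
)
print(styler.to_html())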
contamination_report.csv CHANGED

@@ -1,4 +1 @@
-Evaluation Dataset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;
-conll2003;google/gemma-7b;model;1.0;1.0;1.0;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;
-conll2003;EleutherAI/the_pile_deduplicated;corpus;1.0;1.0;1.0;data-based;https://aclanthology.org/2023.findings-emnlp.722/;www.google.com
-Test;lololol;corpus;1.0;1.0;1.0;data-based;https://arxiv.org/abs/2310.03668;
+Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
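The report now carries explicit Subset, Reference, and PR columns, and the example rows are removed. How the file is loaded is not part of this diff, so the read_csv call below is only an assumption about a semicolon-separated file with this header:

import pandas as pd

# Assumed loader: a ";"-separated file whose header matches the new schema
data = pd.read_csv("contamination_report.csv", sep=";")
expected = [
    "Evaluation Dataset", "Subset", "Contaminated Source", "Model or corpus",
    "Train Split", "Development Split", "Test Split", "Approach", "Reference", "PR",
]
assert list(data.columns) == expected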
dataset.py CHANGED

@@ -207,7 +207,7 @@ def get_dataframe():
     favicon_dict = {}
 
     # Update the favicon dictionary
-    favicon_dict = update_favicon_cache([get_base_url(x) for x in data["
+    favicon_dict = update_favicon_cache([get_base_url(x) for x in data["Reference"]])
 
     # Update the model url dictionary
     model_url_dict = update_model_url_cache(
@@ -221,7 +221,7 @@ def get_dataframe():
     )
 
     # Add favicons URLs to the dataframe in a vectorized manner
-    data["
+    data["Reference"] = data["Reference"].apply(
         lambda x: build_text_icon(
             text=get_domain_name(x),
             url=x,
@@ -229,7 +229,7 @@ def get_dataframe():
         )
     )
 
-    data["PR
+    data["PR"] = data["PR"].apply(
         lambda x: build_text_icon(
             text="",
             url=x if x == x else "no link",
@@ -245,6 +245,13 @@ def get_dataframe():
         )
     )
 
+    data["Evaluation Dataset"] = data.apply(
+        lambda x: x["Evaluation Dataset"] + f" ({x['Subset']})" if pd.notna(x["Subset"]) else x["Evaluation Dataset"],
+        axis=1,
+    )
+
+    del data["Subset"]
+
     # For "Contaminated Source" use build_dataset_url if "Model or corpus" is "corpus" and build_model_url if "Model or corpus" is "model"
     data["Contaminated Source"] = data.apply(
         lambda x: build_text_icon(
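The new block appends the subset name, in parentheses, to "Evaluation Dataset" whenever "Subset" is present, then drops the helper column. A toy check of that logic (the two rows are made up; the apply call is the one from the diff):

import pandas as pd

data = pd.DataFrame(
    {"Evaluation Dataset": ["conll2003", "glue"], "Subset": [None, "mnli"]}
)

# Append " (subset)" only when the Subset cell is not NaN/None
data["Evaluation Dataset"] = data.apply(
    lambda x: x["Evaluation Dataset"] + f" ({x['Subset']})" if pd.notna(x["Subset"]) else x["Evaluation Dataset"],
    axis=1,
)
del data["Subset"]

print(data["Evaluation Dataset"].tolist())  # ['conll2003', 'glue (mnli)']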
@@ -257,4 +264,8 @@ def get_dataframe():
         axis=1,
     )
 
+    data["Train Split"] = data["Train Split"].apply(lambda x: x/100 if x else x)
+    data["Development Split"] = data["Development Split"].apply(lambda x: x/100 if x else x)
+    data["Test Split"] = data["Test Split"].apply(lambda x: x/100 if x else x)
+
     return data
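These three lines rescale the splits to fractions, presumably because the new report stores them as 0-100 percentages, so that app.py's "{:.1%}" formatter displays them correctly; the "if x" guard passes falsy values (0, None) through untouched, and NaN divided by 100 is still NaN. A small check on invented values:

import pandas as pd

data = pd.DataFrame({"Train Split": [100.0, 12.5, 0.0, None]})
# Same transformation as the diff: percentages -> fractions, falsy values untouched
data["Train Split"] = data["Train Split"].apply(lambda x: x / 100 if x else x)

print(data["Train Split"].tolist())  # [1.0, 0.125, 0.0, nan]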
utils.py CHANGED

@@ -38,6 +38,9 @@ def get_domain_name(url: str) -> str:
     domain = "{uri.netloc}".format(uri=parsed_uri)
     if domain.startswith("www."):
         domain = domain[4:]
+
+    # Remove last domain
+    domain = ".".join(domain.split(".")[:-1])
     # First latter in uppercase
     return domain.capitalize()
 
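The added line strips the last dot-separated component (the TLD) before capitalizing, so e.g. aclanthology.org is shown as "Aclanthology" rather than "Aclanthology.org". A standalone sketch of the modified function; the urlparse setup is assumed, since the hunk only shows the body from line 38 onward:

from urllib.parse import urlparse


def get_domain_name(url: str) -> str:
    # Assumed setup; not shown in the hunk
    parsed_uri = urlparse(url)
    domain = "{uri.netloc}".format(uri=parsed_uri)
    if domain.startswith("www."):
        domain = domain[4:]

    # Drop the last dot-separated component (the TLD)
    domain = ".".join(domain.split(".")[:-1])
    # First letter in uppercase
    return domain.capitalize()


print(get_domain_name("https://aclanthology.org/2023.findings-emnlp.722/"))  # Aclanthology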