# Hugging Face file-viewer header (page residue, not part of the program):
# author: boyiwei | commit 1ed88c3 "added citation" | file size 9.88 kB
import gradio as gr
import pandas as pd
import os
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
from uploads import add_new_eval
# Label passed to the citation textbox in the UI.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# BibTeX entry used as the value of the citation textbox. It is a raw string,
# so any backslash in the entry would be kept verbatim.
CITATION_BUTTON_TEXT = r"""
@article{wei2024evaluating,
title={Evaluating Copyright Takedown Methods for Language Models},
author={Wei, Boyi and Shi, Weijia and Huang, Yangsibo and Smith, Noah A and Zhang, Chiyuan and Zettlemoyer, Luke and Li, Kai and Henderson, Peter},
journal={arXiv preprint arXiv:2406.18664},
year={2024}
}"""
# Hub client; `restart_space` uses it to restart the Space.
api = HfApi()
# Access token read from the TOKEN environment variable (None when unset).
TOKEN = os.environ.get("TOKEN")
# Repo id handed to `api.restart_space`. A plain string: the previous f-string
# prefix had no placeholders to interpolate.
LEADERBOARD_PATH = "boyiwei/CoTaEval_leaderboard"
def restart_space():
    """Request a restart of the repo named by LEADERBOARD_PATH, authenticating with TOKEN."""
    api.restart_space(
        token=TOKEN,
        repo_id=LEADERBOARD_PATH,
    )
def format_floats(x):
    """Render floats as strings with three decimals; return any other value unchanged."""
    return f"{x:.3f}" if isinstance(x, float) else x
# Columns displayed for each dataset, in display order.
_LEADERBOARD_COLUMNS = {
    "news": [
        "model_name", "method", "rouge1", "rougeL", "semantic_sim",
        "LCS(character)", "LCS(word)", "ACS(word)", "Levenshtein Distance",
        "Minhash Similarity", "MMLU", "MT-Bench", "Blocklisted F1",
        "In-Domain F1", "Efficiency",
    ],
    "books": [
        "model_name", "method", "bleu", "rouge1", "rougeL", "semantic_sim",
        "LCS(character)", "LCS(word)", "ACS(word)", "Levenshtein Distance",
        "Minhash Similarity", "MMLU", "MT-Bench", "Blocklisted rougeL",
        "In-Domain rougeL", "Efficiency",
    ],
}


# Function to load data from a given CSV file
def baseline_load_data(model, dataset, setting, criteria):
    """Load one leaderboard table from ``versions/`` and prepare it for display.

    Args:
        model: Model identifier; first component of the CSV file name.
        dataset: ``"news"`` or ``"books"``; selects both the file and the
            set of columns that is kept.
        setting: Evaluation setting; third component of the file name.
        criteria: Aggregation criteria; last component of the file name.

    Returns:
        A DataFrame restricted to the dataset's display columns, in display
        order, with every float cell rendered by ``format_floats``.

    Raises:
        ValueError: If ``dataset`` is not a known dataset (previously this
            surfaced as an ``UnboundLocalError``).
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If the CSV lacks one of the expected columns.
    """
    try:
        column_names = _LEADERBOARD_COLUMNS[dataset]
    except KeyError:
        known = ", ".join(sorted(_LEADERBOARD_COLUMNS))
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected one of: {known}"
        ) from None

    file_path = f'versions/{model}_{dataset}_{setting}_{criteria}.csv'
    df = pd.read_csv(file_path)

    # Keep only the displayed columns (and their order) before formatting so
    # that columns which are dropped anyway are never touched.
    df = df[column_names]

    # `DataFrame.applymap` is deprecated in favour of `DataFrame.map`; use the
    # new name when this pandas version has it, otherwise fall back.
    if hasattr(pd.DataFrame, "map"):
        return df.map(format_floats)
    return df.applymap(format_floats)
def update_dropdowns(setting, dataset, model, criteria):
    """Compute Gradio updates that keep the four selectors mutually consistent.

    Every selector starts out interactive; the first matching rule below then
    pins some of them to a fixed value:

    * setting "memorization": dataset is locked to "news" and model to
      "llama2-7b-chat-hf-newsqa".
    * dataset "books": setting is locked to "rag"; if the newsqa model is
      selected, the model is switched to "llama2-7b-chat-hf".
    * newsqa model: setting is locked to "memorization" and dataset to "news".
    * any other model: setting is locked to "rag".

    Returns:
        The updates as a tuple ordered ``(model, dataset, setting, criteria)``.
        Note that this differs from the parameter order.

    NOTE(review): this function is not wired to any event in the visible part
    of the file, and ``criteria`` is never inspected.
    """
    newsqa_model = "llama2-7b-chat-hf-newsqa"

    setting_update = gr.update(interactive=True)
    dataset_update = gr.update(interactive=True)
    model_update = gr.update(interactive=True)
    criteria_update = gr.update(interactive=True)

    if setting == "memorization":
        dataset_update = gr.update(value="news", interactive=False)
        model_update = gr.update(value=newsqa_model, interactive=False)
    elif dataset == "books":
        setting_update = gr.update(value="rag", interactive=False)
        if model == newsqa_model:
            model_update = gr.update(value="llama2-7b-chat-hf", interactive=True)
    elif model == newsqa_model:
        setting_update = gr.update(value="memorization", interactive=False)
        dataset_update = gr.update(value="news", interactive=False)
    else:
        # Reached only when the model is not the newsqa model.
        setting_update = gr.update(value="rag", interactive=False)

    return model_update, dataset_update, setting_update, criteria_update
def load_data(model, dataset, setting, criteria):
    """Return the leaderboard table for the given selector values.

    This is a direct pass-through to ``baseline_load_data``. An earlier idea,
    appending every extra CSV found under ``versions/{model}-{version}/`` to
    the baseline table, is not implemented here.
    """
    return baseline_load_data(model, dataset, setting, criteria)
# Function for searching in the leaderboard
def search_leaderboard(df, query):
    """Return the rows of ``df`` whose method name contains ``query``.

    Args:
        df: Leaderboard table. The method column may be named ``"Method"``
            or ``"method"``; the tables loaded in this file use the latter.
        query: Text to look for. An empty query returns ``df`` unchanged.

    Returns:
        ``df`` itself for an empty query, otherwise the matching rows.

    Raises:
        KeyError: If ``df`` has neither a ``"Method"`` nor a ``"method"`` column.
    """
    if not query:
        return df

    # Accept both spellings so the function works with the tables loaded here.
    column = "Method" if "Method" in df.columns else "method"

    # regex=False: the query is matched literally, so characters such as "("
    # or "+" cannot raise or match unexpectedly.
    # na=False: rows with a missing method name simply do not match.
    matches = df[column].str.contains(query, regex=False, na=False)
    return df[matches]
# Function to change the version of the leaderboard
def change_version(model, dataset, setting, criteria):
    """Event handler: reload the leaderboard table for the selected values."""
    return load_data(model, dataset, setting, criteria)
# Initialize Gradio app
demo = gr.Blocks()

with demo:
    # Page header.
    gr.Markdown("""
## 🥇 CoTaEval Leaderboard
CoTaEval is a benchmark to evaluate the feasibility and side effects of copyright takedown methods for language models.
Project website: [https://cotaeval.github.io/](https://cotaeval.github.io/).
""")

    # Collapsible citation box with a copy button.
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                show_copy_button=True,
            )

    with gr.Tabs():
        with gr.TabItem("Leaderboard"):
            # The four selectors that together pick one CSV file.
            with gr.Row():
                setting_dropdown = gr.Dropdown(
                    choices=["rag", "memorization"],
                    label="🔄 Select Setting",
                    value="rag",
                )
                dataset_dropdown = gr.Dropdown(
                    choices=['news', 'books'],
                    label="🔄 Select Dataset",
                    value="news",
                )
                model_dropdown = gr.Dropdown(
                    choices=["llama2-7b-chat-hf", "llama2-70b-chat-hf", "dbrx-instruct", "llama2-7b-chat-hf-newsqa"],
                    label="🔄 Select Model",
                    value="llama2-7b-chat-hf",
                )
                criteria_dropdown = gr.Dropdown(
                    choices=['mean', 'max'],
                    label="🔄 Select Criteria",
                    value='mean',
                )

            # Initial table matches the default value of each selector above.
            leaderboard_table = gr.components.Dataframe(
                value=load_data("llama2-7b-chat-hf", "news", "rag", "mean"),
                interactive=True,
                visible=True,
            )

            # NOTE: the selectors are currently not cross-constrained;
            # `update_dropdowns` exists for that purpose but is not wired to
            # any event here.

            # Changing any selector reloads the table. The input order must
            # match the parameter order of `change_version`.
            selector_inputs = [model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown]
            for selector in (setting_dropdown, dataset_dropdown, model_dropdown, criteria_dropdown):
                selector.change(
                    change_version,
                    inputs=selector_inputs,
                    outputs=leaderboard_table,
                )

    # NOTE: a "Submit a new model for evaluation" form backed by
    # `add_new_eval` is currently disabled and therefore not built here.

    gr.Markdown("""
## Links
- [**Website**](https://cotaeval.github.io): The website for CoTaEval Project.
- [**GitHub Repository**](https://github.com/boyiwei/CoTaEval): For source code of evaluating the takedown methods with CoTaEval.
- [**Datasets**](https://huggingface.co/datasets/boyiwei/CoTaEval): Dataset for evaluation and unlearning.
This leaderboard is based on the design of the [TOFU Leaderboard](https://huggingface.co/spaces/locuslab/tofu_leaderboard).
""")
# Restart the Space once per hour (3600 seconds) from a background thread.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()

# CSS snippet limiting the width of <select> elements and letting <option>
# text wrap. It is not passed to Gradio anywhere in this file, so it currently
# has no effect. The CSS comment inside reads "adjust this value as needed".
custom_css = """
<style>
select {
max-width: 200px; /* 根据需要调整这个值 */
}
option {
white-space: normal;
}
</style>
"""

demo.launch(debug=True)