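# Gradio app for the CoTaEval leaderboard Space (repo: boyiwei/CoTaEval_leaderboard).
# It displays copyright-takedown evaluation results loaded from CSV files in versions/.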
import gradio as gr
import pandas as pd
import os
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
from uploads import add_new_eval

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@article{wei2024evaluating,
  title={Evaluating Copyright Takedown Methods for Language Models},
  author={Wei, Boyi and Shi, Weijia and Huang, Yangsibo and Smith, Noah A and Zhang, Chiyuan and Zettlemoyer, Luke and Li, Kai and Henderson, Peter},
  journal={arXiv preprint arXiv:2406.18664},
  year={2024}
}"""

api = HfApi()
TOKEN = os.environ.get("TOKEN", None)
LEADERBOARD_PATH = "boyiwei/CoTaEval_leaderboard"
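# Note: TOKEN is expected to be set as a Space secret in the environment; without it,
# the api.restart_space() call below will fail to authenticate.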

def restart_space():
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)


def format_floats(x):
    if isinstance(x, float):
        return f"{x:.3f}"
    return x
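
# Note: format_floats is applied element-wise (via DataFrame.applymap) when loading CSVs,
# so non-numeric cells such as model_name and method pass through unchanged.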

# Function to load data from a given CSV file
def baseline_load_data(model, dataset, setting, criteria):
    file_path = f'versions/{model}_{dataset}_{setting}_{criteria}.csv'
    df = pd.read_csv(file_path)
    df = df.applymap(format_floats)
    # we only want specific columns and in a specific order
    if dataset == 'news':
        column_names = ["model_name", "method", "rouge1", "rougeL", "semantic_sim",
                        "LCS(character)", "LCS(word)", "ACS(word)", "Levenshtein Distance", "Minhash Similarity",
                        "MMLU", "MT-Bench", "Blocklisted F1", "In-Domain F1", "Efficiency"]
    elif dataset == 'books':
        column_names = ["model_name", "method", "bleu", "rouge1", "rougeL", "semantic_sim",
                        "LCS(character)", "LCS(word)", "ACS(word)", "Levenshtein Distance", "Minhash Similarity",
                        "MMLU", "MT-Bench", "Blocklisted rougeL", "In-Domain rougeL", "Efficiency"]
    df = df[column_names]
    return df
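
# Example (illustrative): baseline_load_data("llama2-7b-chat-hf", "news", "rag", "mean")
# reads versions/llama2-7b-chat-hf_news_rag_mean.csv, matching the default dropdown values below.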

def update_dropdowns(setting, dataset, model, criteria):
    updates = {
        "setting": gr.update(interactive=True),
        "dataset": gr.update(interactive=True),
        "model": gr.update(interactive=True),
        "criteria": gr.update(interactive=True),
    }
    if setting == "memorization":
        updates["dataset"] = gr.update(value="news", interactive=False)
        updates["model"] = gr.update(value="llama2-7b-chat-hf-newsqa", interactive=False)
    elif dataset == "books":
        updates["setting"] = gr.update(value="rag", interactive=False)
        if model == "llama2-7b-chat-hf-newsqa":
            updates["model"] = gr.update(value="llama2-7b-chat-hf", interactive=True)
    elif model == "llama2-7b-chat-hf-newsqa":
        updates["setting"] = gr.update(value="memorization", interactive=False)
        updates["dataset"] = gr.update(value="news", interactive=False)
    elif model != "llama2-7b-chat-hf-newsqa":
        updates["setting"] = gr.update(value="rag", interactive=False)
    return updates["model"], updates["dataset"], updates["setting"], updates["criteria"]
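
# Note: the dropdown .change() handlers that would call update_dropdowns are commented out
# further below, so the cross-dropdown constraints encoded here are currently not enforced.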

def load_data(model, dataset, setting, criteria):
    baseline_df = baseline_load_data(model, dataset, setting, criteria)
    # now for every file in "versions/{model}-{version}/*.csv"
    # if file name is not "model-version.csv", load the file and append it to the dataframe
    # version = version.replace("%", "p")
    # for file in os.listdir(f'versions/{model}-{version}'):
    #     if file == f"{model}-{version}.csv":
    #         continue
    #     df = pd.read_csv(f'versions/{model}-{version}/{file}')
    #     df = df[baseline_df.columns]
    #     baseline_df = pd.concat([baseline_df, df])
    return baseline_df

# Function for searching in the leaderboard
def search_leaderboard(df, query):
    if query == "":
        return df
    else:
        # Use the lowercase "method" column, matching the column names selected in baseline_load_data
        return df[df['method'].str.contains(query)]
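
# Note: no search box is wired to search_leaderboard in the UI below; it filters rows whose
# "method" column contains the query string.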

# Function to change the version of the leaderboard
def change_version(model, dataset, setting, criteria):
    new_df = load_data(model, dataset, setting, criteria)
    return new_df

# Initialize Gradio app
demo = gr.Blocks()
with demo:
    gr.Markdown("""
    ## 🥇 CoTaEval Leaderboard
    CoTaEval is a benchmark to evaluate the feasibility and side effects of copyright takedown methods for language models.
    Project website: [https://cotaeval.github.io/](https://cotaeval.github.io/).
    """)
    with gr.Row():
        with gr.Accordion("Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                show_copy_button=True,
            )  # .style(show_copy_button=True)
    with gr.Tabs():
        with gr.TabItem("Leaderboard"):
            with gr.Row():
                setting_dropdown = gr.Dropdown(
                    choices=["rag", "memorization"],
                    label="Select Setting",
                    value="rag",
                )
                dataset_dropdown = gr.Dropdown(
                    choices=['news', 'books'],
                    label="Select Dataset",
                    value="news",
                )
                model_dropdown = gr.Dropdown(
                    choices=["llama2-7b-chat-hf", "llama2-70b-chat-hf", "dbrx-instruct", "llama2-7b-chat-hf-newsqa"],
                    label="Select Model",
                    value="llama2-7b-chat-hf",
                )
                criteria_dropdown = gr.Dropdown(
                    choices=['mean', 'max'],
                    label="Select Criteria",
                    value='mean',
                )
            leaderboard_table = gr.components.Dataframe(
                value=load_data("llama2-7b-chat-hf", "news", "rag", "mean"),
                interactive=True,
                visible=True,
            )
            # setting_dropdown.change(
            #     update_dropdowns,
            #     inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
            #     outputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown]
            # )
            # dataset_dropdown.change(
            #     update_dropdowns,
            #     inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
            #     outputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown]
            # )
            # model_dropdown.change(
            #     update_dropdowns,
            #     inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
            #     outputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown]
            # )
            setting_dropdown.change(
                change_version,
                inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                outputs=leaderboard_table
            )
            dataset_dropdown.change(
                change_version,
                inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                outputs=leaderboard_table
            )
            model_dropdown.change(
                change_version,
                inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                outputs=leaderboard_table
            )
            criteria_dropdown.change(
                change_version,
                inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                outputs=leaderboard_table
            )
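
            # Every dropdown change reloads the table via change_version, which re-reads the CSV
            # matching the full current (model, dataset, setting, criteria) selection.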

    # with gr.Accordion("Submit a new model for evaluation"):
    #     with gr.Row():
    #         with gr.Column():
    #             method_name_textbox = gr.Textbox(label="Method name")
    #             # llama, phi
    #             model_family_radio = gr.Radio(["llama", "phi"], value="llama", label="Model family")
    #             forget_rate_radio = gr.Radio(["1%", "5%", "10%"], value="10%", label="Forget rate")
    #             url_textbox = gr.Textbox(label="Url to model information")
    #         with gr.Column():
    #             organisation = gr.Textbox(label="Organisation")
    #             mail = gr.Textbox(label="Contact email")
    #             file_output = gr.File()
    #     submit_button = gr.Button("Submit Eval")
    #     submission_result = gr.Markdown()
    #     submit_button.click(
    #         add_new_eval,
    #         [
    #             method_name_textbox,
    #             model_family_radio,
    #             forget_rate_radio,
    #             url_textbox,
    #             file_output,
    #             organisation,
    #         ],
    #         submission_result,
    #     )

    gr.Markdown("""
    ## Links
    - [**Website**](https://cotaeval.github.io): The website for the CoTaEval project.
    - [**GitHub Repository**](https://github.com/boyiwei/CoTaEval): Source code for evaluating takedown methods with CoTaEval.
    - [**Datasets**](https://huggingface.co/datasets/boyiwei/CoTaEval): Datasets for evaluation and unlearning.

    This leaderboard is based on the design of the [TOFU Leaderboard](https://huggingface.co/spaces/locuslab/tofu_leaderboard).
    """)

# scheduler = BackgroundScheduler()
# scheduler.add_job(restart_space, "interval", seconds=1800)
# scheduler.start()
# demo.queue(default_concurrency_limit=40).launch()
# demo.launch()

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
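
# The job above restarts the Space every hour (3600 s); the assumed intent is to pick up
# freshly uploaded result CSVs without a manual redeploy.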

custom_css = """
<style>
select {
    max-width: 200px;  /* adjust this value as needed */
}
option {
    white-space: normal;
}
</style>
"""

# demo.launch(debug=True, custom_css=custom_css)
demo.launch(debug=True)