File size: 9,877 Bytes
b182afd
 
1c6d55d
b182afd
1c6d55d
 
 
 
9e4aab9
1ed88c3
 
 
 
 
1c6d55d
 
 
 
fd71a7a
b182afd
1c6d55d
 
605a986
 
 
 
1c6d55d
a246870
 
1c6d55d
06b4546
1c6d55d
 
c94c38d
 
 
 
 
 
1c6d55d
 
 
b182afd
 
2916d58
30c2633
 
 
 
cb09ce6
30c2633
 
 
 
765f337
30c2633
e701d13
765f337
eab7ed9
8f0d64b
30c2633
 
8f0d64b
e701d13
30c2633
01ef3bc
864023f
30c2633
 
a246870
 
1c6d55d
 
a246870
 
 
 
 
 
 
1c6d55d
 
 
 
 
 
 
 
 
 
a246870
 
1c6d55d
 
 
 
 
b182afd
1c6d55d
 
81618ab
9e4aab9
 
 
1c6d55d
b182afd
1c6d55d
 
 
 
 
 
 
 
b182afd
1c6d55d
 
 
30c2633
d410a83
30c2633
e701d13
1c6d55d
30c2633
 
 
 
1c6d55d
30c2633
765f337
30c2633
eab7ed9
1c6d55d
a246870
 
 
 
 
b182afd
 
e701d13
1c6d55d
b182afd
 
30c2633
4596351
c8f0900
35e9319
c8f0900
35e9319
c8f0900
30c2633
c8f0900
35e9319
c8f0900
35e9319
c8f0900
35e9319
c8f0900
35e9319
c8f0900
35e9319
c8f0900
4596351
35e9319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c6d55d
8e7f358
 
 
 
 
 
 
 
 
 
 
 
1c6d55d
 
 
8e7f358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c6d55d
 
 
 
 
8e7f358
1c6d55d
8e7f358
 
 
1c6d55d
39f3047
1c6d55d
 
 
 
 
 
 
 
 
 
b182afd
1c6d55d
b182afd
30c346e
 
 
 
 
 
 
 
 
 
 
 
 
9ae69bd
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
import gradio as gr
import pandas as pd
import os
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
from uploads import add_new_eval

# Label shown above the read-only citation textbox in the UI.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# BibTeX entry for the CoTaEval paper; rendered with a copy button below.
CITATION_BUTTON_TEXT = r"""
@article{wei2024evaluating,
  title={Evaluating Copyright Takedown Methods for Language Models},
  author={Wei, Boyi and Shi, Weijia and Huang, Yangsibo and Smith, Noah A and Zhang, Chiyuan and Zettlemoyer, Luke and Li, Kai and Henderson, Peter},
  journal={arXiv preprint arXiv:2406.18664},
  year={2024}
}"""

# Hugging Face Hub client used by the scheduled Space restart below.
api = HfApi()
# Write token for the Space; read from the environment (None when run locally).
TOKEN = os.environ.get("TOKEN", None)
LEADERBOARD_PATH = f"boyiwei/CoTaEval_leaderboard"  # repo id of this Space
def restart_space():
    """Restart the hosting Space so the leaderboard reloads fresh data."""
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)

def format_floats(x):
    """Render a float with exactly three decimal places; leave any other value untouched."""
    return f"{x:.3f}" if isinstance(x, float) else x
# Function to load data from a given CSV file
def baseline_load_data(model, dataset, setting, criteria):
    """Load one leaderboard CSV and return it with a fixed column order.

    Args:
        model: model identifier, e.g. "llama2-7b-chat-hf".
        dataset: "news" or "books"; selects which metric columns are kept.
        setting: "rag" or "memorization".
        criteria: "mean" or "max".

    Returns:
        DataFrame restricted to the expected columns, with floats rendered
        to three decimal places as strings.

    Raises:
        ValueError: if `dataset` is not a supported value. (Previously an
            unknown dataset fell through with `column_names` unbound and
            crashed with a NameError.)
        FileNotFoundError: if the corresponding CSV does not exist.
    """
    file_path = f'versions/{model}_{dataset}_{setting}_{criteria}.csv'
    df = pd.read_csv(file_path)
    # NOTE: DataFrame.applymap is deprecated in pandas >= 2.1 (renamed to
    # DataFrame.map); kept here for compatibility with older pandas versions.
    df = df.applymap(format_floats)

    # Keep only the reported metrics, in display order. The two datasets
    # report different utility columns (F1 for news, rougeL for books).
    if dataset == 'news':
        column_names = ["model_name","method","rouge1","rougeL","semantic_sim","LCS(character)","LCS(word)","ACS(word)","Levenshtein Distance","Minhash Similarity", 
                    "MMLU","MT-Bench","Blocklisted F1","In-Domain F1","Efficiency"]
    elif dataset == 'books':
        column_names = ["model_name","method","bleu","rouge1","rougeL","semantic_sim","LCS(character)","LCS(word)","ACS(word)","Levenshtein Distance","Minhash Similarity", 
                    "MMLU","MT-Bench","Blocklisted rougeL","In-Domain rougeL","Efficiency"
                    ]
    else:
        raise ValueError(f"Unknown dataset: {dataset!r} (expected 'news' or 'books')")
    df = df[column_names]

    return df

def update_dropdowns(setting, dataset, model, criteria):
    """Constrain the four selector dropdowns to mutually consistent values.

    The "memorization" setting is only evaluated on the news dataset with
    the llama2-7b-chat-hf-newsqa model, so selecting any one of those
    forces (and locks) the others; every other combination implies "rag".

    NOTE(review): the commented-out `.change()` wiring further down passes
    inputs as [model, dataset, setting, criteria], which does NOT match
    this parameter order (setting, dataset, model, criteria) — confirm
    the ordering before re-enabling that wiring.

    Returns:
        Tuple of gr.update objects in the order (model, dataset, setting,
        criteria), matching the outputs list used by the callers.
    """
    # Start from "everything editable"; specific cases below lock fields.
    updates = {
        "setting": gr.update(interactive=True),
        "dataset": gr.update(interactive=True),
        "model": gr.update(interactive=True),
        "criteria": gr.update(interactive=True),
    }
    
    if setting == "memorization":
        # Memorization is only available for news + the newsqa model.
        updates["dataset"] = gr.update(value="news", interactive=False)
        updates["model"] = gr.update(value="llama2-7b-chat-hf-newsqa", interactive=False)
    elif dataset == "books":
        # Books results only exist in the RAG setting.
        updates["setting"] = gr.update(value="rag", interactive=False)
        if model == "llama2-7b-chat-hf-newsqa":
            # The newsqa model has no books results; fall back to the base model.
            updates["model"] = gr.update(value="llama2-7b-chat-hf", interactive=True)
    elif model == "llama2-7b-chat-hf-newsqa":
        # The newsqa model is only evaluated under memorization on news.
        updates["setting"] = gr.update(value="memorization", interactive=False)
        updates["dataset"] = gr.update(value="news", interactive=False)
    elif model != "llama2-7b-chat-hf-newsqa":
        # All other models are RAG-only.
        updates["setting"] = gr.update(value="rag", interactive=False)
    
    return updates["model"], updates["dataset"], updates["setting"], updates["criteria"]

    

def load_data(model, dataset, setting, criteria):
    """Return the leaderboard table for the selected configuration.

    Currently a thin wrapper around `baseline_load_data`; per-method
    result files used to be appended here as well (see version history).
    """
    return baseline_load_data(model, dataset, setting, criteria)

# Function for searching in the leaderboard
def search_leaderboard(df, query):
    """Filter leaderboard rows whose method name contains `query`.

    Args:
        df: leaderboard DataFrame with a 'method' column (as produced by
            `baseline_load_data`).
        query: substring to match; an empty string returns `df` unchanged.

    Returns:
        The filtered DataFrame (all rows when `query` is empty).
    """
    if not query:
        return df
    # Bug fix: the frames built in this file have a lowercase 'method'
    # column, so the original df['Method'] lookup raised a KeyError.
    # na=False keeps missing values out instead of failing boolean indexing.
    return df[df['method'].str.contains(query, na=False)]

# Function to change the version of the leaderboard
def change_version(model, dataset, setting, criteria):
    """Reload the leaderboard table whenever any selector changes."""
    return load_data(model, dataset, setting, criteria)


# Initialize Gradio app
demo = gr.Blocks()

with demo:
    # Page header: title and one-line project description.
    gr.Markdown("""
    ## ๐Ÿฅ‡ CoTAEval Leaderboard
    CoTaEval is a benchmark to evaluate the feasibility and side effects of copyright takedown methods for language models.
    
    Project website: [https://cotaeval.github.io/](https://cotaeval.github.io/).
    """)

    # Collapsible BibTeX citation with a copy-to-clipboard button.
    with gr.Row():
        with gr.Accordion("๐Ÿ“™ Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                show_copy_button=True,
            ) #.style(show_copy_button=True)

    with gr.Tabs():
        with gr.TabItem("Leaderboard"):
            # Four selectors that together pick which CSV is displayed;
            # see baseline_load_data's versions/{model}_{dataset}_{setting}_{criteria}.csv scheme.
            with gr.Row():
                setting_dropdown = gr.Dropdown(
                    choices = ["rag", "memorization"],
                    label="๐Ÿ”„ Select Setting",
                    value="rag",
                )
                dataset_dropdown = gr.Dropdown(
                    choices = ['news', 'books'],
                    label="๐Ÿ”„ Select Dataset",
                    value="news",
                )
                model_dropdown = gr.Dropdown(
                    choices=["llama2-7b-chat-hf", "llama2-70b-chat-hf", "dbrx-instruct", "llama2-7b-chat-hf-newsqa"],
                    label="๐Ÿ”„ Select Model",
                    value="llama2-7b-chat-hf",
                )
                criteria_dropdown = gr.Dropdown(
                    choices=['mean', 'max'],
                    label = "๐Ÿ”„ Select Criteria",
                    value = 'mean',
                )

            # Main table, pre-populated with the default selector values above.
            leaderboard_table = gr.components.Dataframe(
                value=load_data("llama2-7b-chat-hf", "news", "rag", "mean"),
                interactive=True,
                visible=True,
            )
            
            # NOTE(review): the update_dropdowns wiring is disabled; its inputs
            # order [model, dataset, setting, criteria] does not match the
            # update_dropdowns(setting, dataset, model, criteria) signature —
            # confirm before re-enabling.
            # setting_dropdown.change(
            #     update_dropdowns,
            #     inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
            #     outputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown]
            # )
            
            # dataset_dropdown.change(
            #     update_dropdowns,
            #     inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
            #     outputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown]
            # )

            # model_dropdown.change(
            #     update_dropdowns,
            #     inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
            #     outputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown]
            # )
            
            # Any selector change reloads the table via change_version.
            setting_dropdown.change(
                change_version,
                inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                outputs=leaderboard_table
            )
            
            dataset_dropdown.change(
                change_version,
                inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                outputs=leaderboard_table
            )
            
            model_dropdown.change(
                change_version,
                inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                outputs=leaderboard_table
            )
            
            criteria_dropdown.change(
                change_version,
                inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                outputs=leaderboard_table
            )
    
    # Disabled submission form: would collect method metadata and a results
    # file and pass them to uploads.add_new_eval (imported at the top).
    # with gr.Accordion("Submit a new model for evaluation"):
    #     with gr.Row():
    #         with gr.Column():
    #             method_name_textbox = gr.Textbox(label="Method name")
    #             #llama, phi
    #             model_family_radio = gr.Radio(["llama", "phi"], value="llama", label="Model family")
    #             forget_rate_radio = gr.Radio(["1%", "5%", "10%"], value="10%", label="Forget rate")
    #             url_textbox = gr.Textbox(label="Url to model information")
    #         with gr.Column():
    #             organisation = gr.Textbox(label="Organisation")
    #             mail = gr.Textbox(label="Contact email")
    #             file_output = gr.File()
                


    #     submit_button = gr.Button("Submit Eval")
    #     submission_result = gr.Markdown()
    #     submit_button.click(
    #         add_new_eval,
    #         [
    #             method_name_textbox,
    #             model_family_radio,
    #             forget_rate_radio,
    #             url_textbox,
    #             file_output,
    #             organisation,
    #             mail
    #         ],
    #         submission_result,
    #     )




    # Footer: project links and attribution.
    gr.Markdown("""
    ## Links

    - [**Website**](https://cotaeval.github.io): The website for CoTaEval Project.
    - [**GitHub Repository**](https://github.com/boyiwei/CoTaEval): For source code of evaluating the takedown methods with CoTaEval.
    - [**Datasets**](https://huggingface.co/datasets/boyiwei/CoTaEval): Dataset for evaluation and unlearning.

    This leaderboard is based on the design of the [TOFU Leaderboard](https://huggingface.co/spaces/locuslab/tofu_leaderboard).


    """)

# Earlier launch configuration, kept for reference.
# scheduler = BackgroundScheduler()
# scheduler.add_job(restart_space, "interval", seconds=1800)
# scheduler.start()
# demo.queue(default_concurrency_limit=40).launch()

# demo.launch()
# Restart the Space every hour so the leaderboard picks up new result files.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()

# NOTE(review): custom_css is defined but never applied — demo.launch() below
# does not receive it (and Gradio's launch() takes no CSS argument; styling
# would normally be passed as gr.Blocks(css=...)). Confirm intent before removing.
custom_css = """
<style>
    select {
        max-width: 200px; /* ๆ นๆฎ้œ€่ฆ่ฐƒๆ•ด่ฟ™ไธชๅ€ผ */
    }
    option {
        white-space: normal;
    }
</style>
"""


# demo.launch(debug=True, custom_css=custom_css)
demo.launch(debug=True)