File size: 9,877 Bytes
b182afd
 
1c6d55d
b182afd
1c6d55d
 
 
 
9e4aab9
1ed88c3
 
 
 
 
1c6d55d
 
 
 
fd71a7a
b182afd
1c6d55d
 
605a986
 
 
 
1c6d55d
a246870
 
1c6d55d
06b4546
1c6d55d
 
c94c38d
 
 
 
 
 
1c6d55d
 
 
b182afd
 
2916d58
30c2633
 
 
 
cb09ce6
30c2633
 
 
 
765f337
30c2633
e701d13
765f337
eab7ed9
8f0d64b
30c2633
 
8f0d64b
e701d13
30c2633
01ef3bc
864023f
30c2633
 
a246870
 
1c6d55d
 
a246870
 
 
 
 
 
 
1c6d55d
 
 
 
 
 
 
 
 
 
a246870
 
1c6d55d
 
 
 
 
b182afd
1c6d55d
 
7422514
9e4aab9
 
 
1c6d55d
b182afd
1c6d55d
 
 
 
 
 
 
 
b182afd
1c6d55d
 
 
30c2633
d410a83
30c2633
e701d13
1c6d55d
30c2633
 
 
 
1c6d55d
30c2633
765f337
30c2633
eab7ed9
1c6d55d
a246870
 
 
 
 
b182afd
 
e701d13
1c6d55d
b182afd
 
30c2633
4596351
c8f0900
35e9319
c8f0900
35e9319
c8f0900
30c2633
c8f0900
35e9319
c8f0900
35e9319
c8f0900
35e9319
c8f0900
35e9319
c8f0900
35e9319
c8f0900
4596351
35e9319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c6d55d
8e7f358
 
 
 
 
 
 
 
 
 
 
 
1c6d55d
 
 
8e7f358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c6d55d
 
 
 
 
8e7f358
1c6d55d
8e7f358
 
 
1c6d55d
39f3047
1c6d55d
 
 
 
 
 
 
 
 
 
b182afd
1c6d55d
b182afd
30c346e
 
 
 
 
 
 
 
 
 
 
 
 
9ae69bd
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
import gradio as gr
import pandas as pd
import os
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
from uploads import add_new_eval

# Label displayed on the citation textbox in the UI.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# BibTeX entry displayed (and copyable) in the citation textbox.
CITATION_BUTTON_TEXT = r"""
@article{wei2024evaluating,
  title={Evaluating Copyright Takedown Methods for Language Models},
  author={Wei, Boyi and Shi, Weijia and Huang, Yangsibo and Smith, Noah A and Zhang, Chiyuan and Zettlemoyer, Luke and Li, Kai and Henderson, Peter},
  journal={arXiv preprint arXiv:2406.18664},
  year={2024}
}"""

# Hub client used by `restart_space`.
api = HfApi()
# Access token read from the TOKEN environment variable; None when it is unset.
TOKEN = os.environ.get("TOKEN", None)
# Repo id handed to `HfApi.restart_space` (plain string: no interpolation needed).
LEADERBOARD_PATH = "boyiwei/CoTaEval_leaderboard"


def restart_space():
    """Ask the Hugging Face Hub to restart the Space named by LEADERBOARD_PATH."""
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)

def format_floats(x):
    """Render ``float`` values as strings with three decimal places.

    Any value that is not a ``float`` instance is handed back unchanged.
    """
    return f"{x:.3f}" if isinstance(x, float) else x
def baseline_load_data(model, dataset, setting, criteria):
    """Load one leaderboard table from its CSV file under ``versions/``.

    The file read is ``versions/{model}_{dataset}_{setting}_{criteria}.csv``.

    Args:
        model: Model name, first component of the file name.
        dataset: ``"news"`` or ``"books"``; also selects which columns are kept.
        setting: Setting name, third component of the file name.
        criteria: Criteria name, last component of the file name.

    Returns:
        A DataFrame restricted to the dataset's columns (in the order listed
        below) whose float cells have been converted by ``format_floats``.

    Raises:
        ValueError: If ``dataset`` is neither ``"news"`` nor ``"books"``.
    """
    # Columns to show, and their order, for each supported dataset.
    columns_by_dataset = {
        "news": [
            "model_name", "method", "rouge1", "rougeL", "semantic_sim",
            "LCS(character)", "LCS(word)", "ACS(word)", "Levenshtein Distance",
            "Minhash Similarity", "MMLU", "MT-Bench", "Blocklisted F1",
            "In-Domain F1", "Efficiency",
        ],
        "books": [
            "model_name", "method", "bleu", "rouge1", "rougeL", "semantic_sim",
            "LCS(character)", "LCS(word)", "ACS(word)", "Levenshtein Distance",
            "Minhash Similarity", "MMLU", "MT-Bench", "Blocklisted rougeL",
            "In-Domain rougeL", "Efficiency",
        ],
    }
    # Validate up front: previously an unknown dataset left `column_names`
    # unbound and failed with UnboundLocalError only after the file was read.
    if dataset not in columns_by_dataset:
        raise ValueError(
            f"Unsupported dataset {dataset!r}; expected one of {sorted(columns_by_dataset)}"
        )
    column_names = columns_by_dataset[dataset]

    file_path = f'versions/{model}_{dataset}_{setting}_{criteria}.csv'
    df = pd.read_csv(file_path)

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; check the class so older pandas keeps working.
    if hasattr(pd.DataFrame, "map"):
        df = df.map(format_floats)
    else:
        df = df.applymap(format_floats)

    return df[column_names]

def update_dropdowns(setting, dataset, model, criteria):
    """Build ``gr.update`` objects for the four leaderboard dropdowns.

    Every dropdown starts out interactive; the first matching rule then
    overrides some of them:

    * setting is "memorization": dataset becomes "news" and model becomes
      "llama2-7b-chat-hf-newsqa", both non-interactive.
    * dataset is "books": setting becomes "rag" (non-interactive); a model of
      "llama2-7b-chat-hf-newsqa" is reset to "llama2-7b-chat-hf".
    * model is "llama2-7b-chat-hf-newsqa": setting becomes "memorization" and
      dataset becomes "news", both non-interactive.
    * anything else: setting becomes "rag" (non-interactive).

    ``criteria`` is accepted but never consulted.

    Returns:
        The updates as a tuple ordered (model, dataset, setting, criteria);
        note that this differs from the parameter order.
    """
    newsqa_model = "llama2-7b-chat-hf-newsqa"

    # Defaults: leave every dropdown editable.
    setting_update = gr.update(interactive=True)
    dataset_update = gr.update(interactive=True)
    model_update = gr.update(interactive=True)
    criteria_update = gr.update(interactive=True)

    if setting == "memorization":
        dataset_update = gr.update(value="news", interactive=False)
        model_update = gr.update(value=newsqa_model, interactive=False)
    elif dataset == "books":
        setting_update = gr.update(value="rag", interactive=False)
        if model == newsqa_model:
            model_update = gr.update(value="llama2-7b-chat-hf", interactive=True)
    elif model == newsqa_model:
        setting_update = gr.update(value="memorization", interactive=False)
        dataset_update = gr.update(value="news", interactive=False)
    else:
        # Reached only when the model is not the newsqa variant.
        setting_update = gr.update(value="rag", interactive=False)

    return model_update, dataset_update, setting_update, criteria_update

    

def load_data(model, dataset, setting, criteria):
    """Return the leaderboard table for the given selection.

    At present this forwards straight to ``baseline_load_data``.
    """
    # NOTE: an earlier draft also appended every extra CSV found in a
    # per-version directory ("versions/{model}-{version}/") to the baseline
    # table. That merge step is disabled, so only the baseline file is used.
    return baseline_load_data(model, dataset, setting, criteria)

def search_leaderboard(df, query):
    """Filter leaderboard rows whose method name contains ``query``.

    Args:
        df: Leaderboard table with a ``"method"`` column (a ``"Method"``
            column is accepted as a fallback).
        query: Text to look for. It is matched literally and
            case-sensitively; an empty or ``None`` query disables filtering.

    Returns:
        ``df`` itself when no query is given, otherwise the matching rows.

    Raises:
        KeyError: If the table has neither a ``"method"`` nor a ``"Method"``
            column.
    """
    if not query:
        return df

    # The leaderboard tables name this column "method"; the previous
    # hard-coded "Method" lookup raised KeyError on them.
    column = "method" if "method" in df.columns else "Method"

    # regex=False: treat the query as plain text so characters such as "("
    # cannot raise re.error. na=False: rows with a missing method name
    # simply do not match instead of producing a NaN mask.
    mask = df[column].str.contains(query, regex=False, na=False)
    return df[mask]

def change_version(model, dataset, setting, criteria):
    """Dropdown callback: reload the table for the current selection.

    Returns whatever ``load_data`` produces for the same four arguments.
    """
    return load_data(model, dataset, setting, criteria)


# Initialize Gradio app
demo = gr.Blocks()

with demo:
    # Page header.
    gr.Markdown("""
    ## 🥇 CoTaEval Leaderboard
    CoTaEval is a benchmark to evaluate the feasibility and side effects of copyright takedown methods for language models.
    
    Project website: [https://cotaeval.github.io/](https://cotaeval.github.io/).
    """)

    # Collapsible citation box with a copy button.
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                show_copy_button=True,
            )

    with gr.Tabs():
        with gr.TabItem("Leaderboard"):
            # Selectors that together pick which CSV is displayed.
            with gr.Row():
                setting_dropdown = gr.Dropdown(
                    choices=["rag", "memorization"],
                    label="🔄 Select Setting",
                    value="rag",
                )
                dataset_dropdown = gr.Dropdown(
                    choices=['news', 'books'],
                    label="🔄 Select Dataset",
                    value="news",
                )
                model_dropdown = gr.Dropdown(
                    choices=["llama2-7b-chat-hf", "llama2-70b-chat-hf", "dbrx-instruct", "llama2-7b-chat-hf-newsqa"],
                    label="🔄 Select Model",
                    value="llama2-7b-chat-hf",
                )
                criteria_dropdown = gr.Dropdown(
                    choices=['mean', 'max'],
                    label="🔄 Select Criteria",
                    value='mean',
                )

            # Initial table; the arguments mirror the dropdown defaults above.
            leaderboard_table = gr.components.Dataframe(
                value=load_data("llama2-7b-chat-hf", "news", "rag", "mean"),
                interactive=True,
                visible=True,
            )

            # Disabled: constrain the dropdowns through update_dropdowns.
            # NOTE(review): update_dropdowns takes (setting, dataset, model,
            # criteria), but the inputs below are ordered (model, dataset,
            # setting, criteria); fix the order before re-enabling.
            # for selector in (setting_dropdown, dataset_dropdown, model_dropdown):
            #     selector.change(
            #         update_dropdowns,
            #         inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
            #         outputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown]
            #     )

            # Reload the table whenever any selector changes. The inputs are
            # ordered to match change_version(model, dataset, setting, criteria).
            for selector in (setting_dropdown, dataset_dropdown, model_dropdown, criteria_dropdown):
                selector.change(
                    change_version,
                    inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                    outputs=leaderboard_table,
                )

    # Disabled: submission form for new evaluations.
    # with gr.Accordion("Submit a new model for evaluation"):
    #     with gr.Row():
    #         with gr.Column():
    #             method_name_textbox = gr.Textbox(label="Method name")
    #             #llama, phi
    #             model_family_radio = gr.Radio(["llama", "phi"], value="llama", label="Model family")
    #             forget_rate_radio = gr.Radio(["1%", "5%", "10%"], value="10%", label="Forget rate")
    #             url_textbox = gr.Textbox(label="Url to model information")
    #         with gr.Column():
    #             organisation = gr.Textbox(label="Organisation")
    #             mail = gr.Textbox(label="Contact email")
    #             file_output = gr.File()
    #
    #     submit_button = gr.Button("Submit Eval")
    #     submission_result = gr.Markdown()
    #     submit_button.click(
    #         add_new_eval,
    #         [
    #             method_name_textbox,
    #             model_family_radio,
    #             forget_rate_radio,
    #             url_textbox,
    #             file_output,
    #             organisation,
    #             mail
    #         ],
    #         submission_result,
    #     )

    # Page footer with project links.
    gr.Markdown("""
    ## Links

    - [**Website**](https://cotaeval.github.io): The website for CoTaEval Project.
    - [**GitHub Repository**](https://github.com/boyiwei/CoTaEval): For source code of evaluating the takedown methods with CoTaEval.
    - [**Datasets**](https://huggingface.co/datasets/boyiwei/CoTaEval): Dataset for evaluation and unlearning.

    This leaderboard is based on the design of the [TOFU Leaderboard](https://huggingface.co/spaces/locuslab/tofu_leaderboard).


    """)

# scheduler = BackgroundScheduler()
# scheduler.add_job(restart_space, "interval", seconds=1800)
# scheduler.start()
# demo.queue(default_concurrency_limit=40).launch()

# demo.launch()
# Schedule `restart_space` to run every 3600 seconds using APScheduler's
# BackgroundScheduler, then start the scheduler.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()

# CSS snippet capping the width of <select> elements and letting <option>
# text wrap. The embedded CSS comment is Chinese for "adjust this value as
# needed".
# NOTE: no active code in this file uses this constant; its only mention is
# in a commented-out launch call.
custom_css = """
<style>
    select {
        max-width: 200px; /* 根据需要调整这个值 */
    }
    option {
        white-space: normal;
    }
</style>
"""


# demo.launch(debug=True, custom_css=custom_css)
# Start the Gradio app, passing debug=True to Blocks.launch.
demo.launch(debug=True)