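# Gradio app for the CoTaEval leaderboard Space (repo: boyiwei/CoTaEval_leaderboard).
# It displays copyright-takedown evaluation results loaded from CSV files in versions/.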
import gradio as gr
import pandas as pd
import os
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
from uploads import add_new_eval

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@article{wei2024evaluating,
  title={Evaluating Copyright Takedown Methods for Language Models},
  author={Wei, Boyi and Shi, Weijia and Huang, Yangsibo and Smith, Noah A and Zhang, Chiyuan and Zettlemoyer, Luke and Li, Kai and Henderson, Peter},
  journal={arXiv preprint arXiv:2406.18664},
  year={2024}
}"""

api = HfApi()
TOKEN = os.environ.get("TOKEN", None)
LEADERBOARD_PATH = "boyiwei/CoTaEval_leaderboard"
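# Note: TOKEN is expected to be set as a Space secret in the environment; without it,
# the api.restart_space() call below will fail to authenticate.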

def restart_space():
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)


def format_floats(x):
    if isinstance(x, float):
        return f"{x:.3f}"
    return x
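
# Note: format_floats is applied element-wise (via DataFrame.applymap) when loading CSVs,
# so non-numeric cells such as model_name and method pass through unchanged.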

# Function to load data from a given CSV file
def baseline_load_data(model, dataset, setting, criteria):
    file_path = f'versions/{model}_{dataset}_{setting}_{criteria}.csv'
    df = pd.read_csv(file_path)
    df = df.applymap(format_floats)
    # we only want specific columns and in a specific order
    if dataset == 'news':
        column_names = ["model_name", "method", "rouge1", "rougeL", "semantic_sim",
                        "LCS(character)", "LCS(word)", "ACS(word)", "Levenshtein Distance", "Minhash Similarity",
                        "MMLU", "MT-Bench", "Blocklisted F1", "In-Domain F1", "Efficiency"]
    elif dataset == 'books':
        column_names = ["model_name", "method", "bleu", "rouge1", "rougeL", "semantic_sim",
                        "LCS(character)", "LCS(word)", "ACS(word)", "Levenshtein Distance", "Minhash Similarity",
                        "MMLU", "MT-Bench", "Blocklisted rougeL", "In-Domain rougeL", "Efficiency"]
    df = df[column_names]
    return df
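
# Example (illustrative): baseline_load_data("llama2-7b-chat-hf", "news", "rag", "mean")
# reads versions/llama2-7b-chat-hf_news_rag_mean.csv, matching the default dropdown values below.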

def update_dropdowns(setting, dataset, model, criteria):
    updates = {
        "setting": gr.update(interactive=True),
        "dataset": gr.update(interactive=True),
        "model": gr.update(interactive=True),
        "criteria": gr.update(interactive=True),
    }
    if setting == "memorization":
        updates["dataset"] = gr.update(value="news", interactive=False)
        updates["model"] = gr.update(value="llama2-7b-chat-hf-newsqa", interactive=False)
    elif dataset == "books":
        updates["setting"] = gr.update(value="rag", interactive=False)
        if model == "llama2-7b-chat-hf-newsqa":
            updates["model"] = gr.update(value="llama2-7b-chat-hf", interactive=True)
    elif model == "llama2-7b-chat-hf-newsqa":
        updates["setting"] = gr.update(value="memorization", interactive=False)
        updates["dataset"] = gr.update(value="news", interactive=False)
    elif model != "llama2-7b-chat-hf-newsqa":
        updates["setting"] = gr.update(value="rag", interactive=False)
    return updates["model"], updates["dataset"], updates["setting"], updates["criteria"]
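
# Note: the dropdown .change() handlers that would call update_dropdowns are commented out
# further below, so the cross-dropdown constraints encoded here are currently not enforced.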

def load_data(model, dataset, setting, criteria):
    baseline_df = baseline_load_data(model, dataset, setting, criteria)
    # now for every file in "versions/{model}-{version}/*.csv"
    # if file name is not "model-version.csv", load the file and append it to the dataframe
    # version = version.replace("%", "p")
    # for file in os.listdir(f'versions/{model}-{version}'):
    #     if file == f"{model}-{version}.csv":
    #         continue
    #     df = pd.read_csv(f'versions/{model}-{version}/{file}')
    #     df = df[baseline_df.columns]
    #     baseline_df = pd.concat([baseline_df, df])
    return baseline_df

# Function for searching in the leaderboard
def search_leaderboard(df, query):
    if query == "":
        return df
    else:
        # Use the lowercase "method" column, matching the column names selected in baseline_load_data
        return df[df['method'].str.contains(query)]
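
# Note: no search box is wired to search_leaderboard in the UI below; it filters rows whose
# "method" column contains the query string.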

# Function to change the version of the leaderboard
def change_version(model, dataset, setting, criteria):
    new_df = load_data(model, dataset, setting, criteria)
    return new_df

# Initialize Gradio app
demo = gr.Blocks()
with demo:
    gr.Markdown("""
    ## 🥇 CoTaEval Leaderboard
    CoTaEval is a benchmark to evaluate the feasibility and side effects of copyright takedown methods for language models.
    Project website: [https://cotaeval.github.io/](https://cotaeval.github.io/).
    """)
    with gr.Row():
        with gr.Accordion("Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                show_copy_button=True,
            )  # .style(show_copy_button=True)
    with gr.Tabs():
        with gr.TabItem("Leaderboard"):
            with gr.Row():
                setting_dropdown = gr.Dropdown(
                    choices=["rag", "memorization"],
                    label="Select Setting",
                    value="rag",
                )
                dataset_dropdown = gr.Dropdown(
                    choices=['news', 'books'],
                    label="Select Dataset",
                    value="news",
                )
                model_dropdown = gr.Dropdown(
                    choices=["llama2-7b-chat-hf", "llama2-70b-chat-hf", "dbrx-instruct", "llama2-7b-chat-hf-newsqa"],
                    label="Select Model",
                    value="llama2-7b-chat-hf",
                )
                criteria_dropdown = gr.Dropdown(
                    choices=['mean', 'max'],
                    label="Select Criteria",
                    value='mean',
                )
            leaderboard_table = gr.components.Dataframe(
                value=load_data("llama2-7b-chat-hf", "news", "rag", "mean"),
                interactive=True,
                visible=True,
            )
            # setting_dropdown.change(
            #     update_dropdowns,
            #     inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
            #     outputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown]
            # )
            # dataset_dropdown.change(
            #     update_dropdowns,
            #     inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
            #     outputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown]
            # )
            # model_dropdown.change(
            #     update_dropdowns,
            #     inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
            #     outputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown]
            # )
            setting_dropdown.change(
                change_version,
                inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                outputs=leaderboard_table
            )
            dataset_dropdown.change(
                change_version,
                inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                outputs=leaderboard_table
            )
            model_dropdown.change(
                change_version,
                inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                outputs=leaderboard_table
            )
            criteria_dropdown.change(
                change_version,
                inputs=[model_dropdown, dataset_dropdown, setting_dropdown, criteria_dropdown],
                outputs=leaderboard_table
            )
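
            # Every dropdown change reloads the table via change_version, which re-reads the CSV
            # matching the full current (model, dataset, setting, criteria) selection.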

    # with gr.Accordion("Submit a new model for evaluation"):
    #     with gr.Row():
    #         with gr.Column():
    #             method_name_textbox = gr.Textbox(label="Method name")
    #             # llama, phi
    #             model_family_radio = gr.Radio(["llama", "phi"], value="llama", label="Model family")
    #             forget_rate_radio = gr.Radio(["1%", "5%", "10%"], value="10%", label="Forget rate")
    #             url_textbox = gr.Textbox(label="Url to model information")
    #         with gr.Column():
    #             organisation = gr.Textbox(label="Organisation")
    #             mail = gr.Textbox(label="Contact email")
    #             file_output = gr.File()
    #     submit_button = gr.Button("Submit Eval")
    #     submission_result = gr.Markdown()
    #     submit_button.click(
    #         add_new_eval,
    #         [
    #             method_name_textbox,
    #             model_family_radio,
    #             forget_rate_radio,
    #             url_textbox,
    #             file_output,
    #             organisation,
    #         ],
    #         submission_result,
    #     )

    gr.Markdown("""
    ## Links
    - [**Website**](https://cotaeval.github.io): The website for the CoTaEval project.
    - [**GitHub Repository**](https://github.com/boyiwei/CoTaEval): Source code for evaluating takedown methods with CoTaEval.
    - [**Datasets**](https://huggingface.co/datasets/boyiwei/CoTaEval): Datasets for evaluation and unlearning.

    This leaderboard is based on the design of the [TOFU Leaderboard](https://huggingface.co/spaces/locuslab/tofu_leaderboard).
    """)

# scheduler = BackgroundScheduler()
# scheduler.add_job(restart_space, "interval", seconds=1800)
# scheduler.start()
# demo.queue(default_concurrency_limit=40).launch()
# demo.launch()

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
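
# The job above restarts the Space every hour (3600 s); the assumed intent is to pick up
# freshly uploaded result CSVs without a manual redeploy.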

custom_css = """
<style>
select {
    max-width: 200px;  /* adjust this value as needed */
}
option {
    white-space: normal;
}
</style>
"""

# demo.launch(debug=True, custom_css=custom_css)
demo.launch(debug=True)