"""Gradio leaderboard app: shows published scores and accepts new submissions.

On start-up the results dataset is downloaded and turned into four
leaderboard tables (validation/test x two-stage/sole-planning).  The
submission form scores an uploaded file with ``eval_score`` and pushes the
new entry back to the results dataset.
"""
import datetime
import json
import os
import sys
from email.utils import parseaddr

# The search path entries are resolved against the launch directory, so they
# must be added BEFORE the working directory is switched to this file's folder.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./leaderboard/evaluation")))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./leaderboard")))
os.chdir(os.path.dirname(os.path.abspath(__file__)))
os.environ['CURL_CA_BUNDLE'] = ''

import gradio as gr
import pandas as pd
import numpy as np

from datasets import load_dataset
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# InfoStrings
# from scorer import question_scorer
from content import (
    format_error,
    format_warning,
    format_log,
    TITLE,
    INTRODUCTION_TEXT,
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    model_hyperlink,
)
from eval import eval_score

TOKEN = os.environ.get("TOKEN", None)

OWNER = "osunlp"
DATA_DATASET = f"{OWNER}/TravelPlanner"
EVAL_DATASET = f"{OWNER}/TravelPlannerEval"
RESULTS_DATASET = f"{OWNER}/TravelPlannerPublicResults"

api = HfApi()

# 'scores' = "2024"

os.makedirs("scored", exist_ok=True)

# (split, mode) pairs in the order the leaderboard tables are displayed.
_LEADERBOARD_VIEWS = (
    ("validation", "twostage"),
    ("validation", "soleplanning"),
    ("test", "twostage"),
    ("test", "soleplanning"),
)


def _load_eval_results():
    """Download a fresh copy of the 'scores' config of the results dataset."""
    return load_dataset(
        RESULTS_DATASET,
        'scores',
        token=TOKEN,
        download_mode="force_redownload",
        ignore_verifications=True,
    )


def get_dataframe_from_results(eval_results, split, mode):
    """Build the display table for one ``{split}_{mode}`` results split.

    Args:
        eval_results: Mapping of split name to dataset, as returned by
            ``load_dataset`` for the results repository.
        split: First half of the split key (``"validation"`` or ``"test"``
            in this file).
        mode: Second half of the split key (``"twostage"`` or
            ``"soleplanning"`` in this file).

    Returns:
        A ``pandas.DataFrame`` without the ``Mail`` column, sorted by
        ``Final Pass Rate`` in descending order, in which every column whose
        name contains ``"Rate"`` has been multiplied by 100 and rounded to
        two decimals (presumably fractions shown as percentages).
    """
    local_df = eval_results[f'{split}_{mode}']
    # Contact addresses are stored with the results but never displayed.
    local_df = local_df.remove_columns(["Mail"])
    df = pd.DataFrame(local_df)
    df = df.sort_values(by=["Final Pass Rate"], ascending=False)

    numeric_cols = [c for c in local_df.column_names if "Rate" in c]
    df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
    return df


def _build_leaderboards(results):
    """Return the four leaderboard tables in ``_LEADERBOARD_VIEWS`` order."""
    return tuple(
        get_dataframe_from_results(eval_results=results, split=split, mode=mode)
        for split, mode in _LEADERBOARD_VIEWS
    )


# # Display the results
eval_results = _load_eval_results()
(
    eval_dataframe_val_twostage,
    eval_dataframe_val_soleplanning,
    eval_dataframe_test_twostage,
    eval_dataframe_test_soleplanning,
) = _build_leaderboards(eval_results)


# def restart_space():
#     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)


def load_line_json_data(filename):
    """Read a JSON Lines file and return its records as a list.

    Blank lines (including an entirely empty file) are skipped instead of
    being handed to ``json.loads``, which would raise on them.

    Args:
        filename: Path of a UTF-8 encoded file with one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def _safe_filename(name):
    """Flatten path separators so ``name`` stays a single path component."""
    return name.replace("/", "_").replace("\\", "_")


def add_new_eval(
    val_or_test: str,
    eval_mode: str,
    model: str,
    tooluse_strategy: str,
    planning_strategy: str,
    organization: str,
    mail: str,
    path_to_file: str,
):
    """Score an uploaded submission and append it to the leaderboard dataset.

    The raw file is uploaded to the results repository first, then scored
    with ``eval_score``; the scores are written to a local file under
    ``scored/``, uploaded as well, and finally added as a new row to the
    ``{val_or_test}_{eval_mode without '-'}`` split, which is pushed to the hub.

    Args:
        val_or_test: Split the submission targets.
        eval_mode: Evaluation mode as chosen in the form (``"two-stage"`` or
            ``"sole-planning"``); hyphens are removed to form the split key.
        model: Model name stored in the ``Model`` column.
        tooluse_strategy: Value stored in the ``Tool-use Strategy`` column.
        planning_strategy: Value stored in the ``Planning Strategy`` column.
        organization: Submitting organization; also the top-level folder of
            the uploaded files.
        mail: Contact address; must contain ``"@"`` after parsing.
        path_to_file: Uploaded file object from the form, or ``None`` when
            nothing was attached.  Its ``.name`` attribute is used as the
            path of the file on disk.

    Returns:
        A formatted message for the UI: a warning when the e-mail or file is
        missing, an error when scoring fails, or a success log otherwise.
    """
    # Very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email adress.")

    print("Adding new eval")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    file_path = path_to_file.name
    run_tag = f"{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}"

    # Save submitted file
    api.upload_file(
        repo_id=RESULTS_DATASET,
        path_or_fileobj=file_path,
        path_in_repo=f"{organization}/{run_tag}_raw_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=TOKEN,
    )

    # Compute score.  A broken submission must not surface as an unhandled
    # exception in the UI, so the failure is logged and reported instead.
    # NOTE(review): assumes format_error takes a single message string like
    # format_warning / format_log do -- confirm against content.py.
    try:
        result = eval_score(val_or_test, file_path=file_path, TOKEN=TOKEN)
    except Exception as err:  # UI boundary: report instead of crashing
        print(f"Scoring failed for {organization} / {model}: {err!r}")
        return format_error(
            "The submitted file could not be scored. "
            "Please check that it follows the expected format and try again."
        )

    # The form fields are free text: flatten path separators so the local
    # file always lands directly inside scored/.
    scored_path = os.path.join(
        "scored", _safe_filename(f"{organization}_{run_tag}.jsonl")
    )
    with open(scored_path, "w") as scored_file:
        scored_file.write(json.dumps(result) + "\n")

    # Save scored file
    api.upload_file(
        repo_id=RESULTS_DATASET,
        path_or_fileobj=scored_path,
        path_in_repo=f"{organization}/{model}/{run_tag}_scored_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=TOKEN,
    )

    # Actual submission
    eval_entry = {
        "Model": model,
        "Tool-use Strategy": tooluse_strategy,
        "Planning Strategy": planning_strategy,
        "Organization": organization,
        "Mail": mail,
        "Delivery Rate": result['Delivery Rate'],
        "Commonsense Constraint Micro Pass Rate": result['Commonsense Constraint Micro Pass Rate'],
        "Commonsense Constraint Macro Pass Rate": result['Commonsense Constraint Macro Pass Rate'],
        "Hard Constraint Micro Pass Rate": result['Hard Constraint Micro Pass Rate'],
        "Hard Constraint Macro Pass Rate": result['Hard Constraint Macro Pass Rate'],
        "Final Pass Rate": result['Final Pass Rate'],
    }
    # Split names carry no hyphen ("twostage"), the form values do ("two-stage").
    split_key = f"{val_or_test}_{eval_mode.replace('-', '')}"
    eval_results[split_key] = eval_results[split_key].add_item(eval_entry)

    print(eval_results)

    eval_results.push_to_hub(RESULTS_DATASET, config_name='scores', token=TOKEN)

    return format_log(f"Model: {model} | Tool-use Strategy: {tooluse_strategy} | Planning Strategy: {planning_strategy} | submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed (Validation ~2mins, Test ~7mins).")


def refresh():
    """Re-download the results and rebuild the four leaderboard tables.

    Returns:
        A 4-tuple of DataFrames in the order validation/two-stage,
        validation/sole-planning, test/two-stage, test/sole-planning.
    """
    return _build_leaderboards(_load_eval_results())


# def upload_file(files):
#     file_paths = [file.name for file in files]
#     return file_paths


# NOTE(review): the nesting below was reconstructed from a source whose
# whitespace had been collapsed -- confirm that the Refresh button and the
# Accordion sit at the Blocks level and the Submit button inside the Accordion.
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tab("Results: Validation | Two-Stage "):
        leaderboard_table_val_twostage = gr.components.Dataframe(
            value=eval_dataframe_val_twostage,
            interactive=False,
        )
    with gr.Tab("Results: Validation | Sole-Planning"):
        leaderboard_table_val_soleplanning = gr.components.Dataframe(
            value=eval_dataframe_val_soleplanning,
            interactive=False,
        )
    with gr.Tab("Results: Test | Two-Stage "):
        leaderboard_table_test_twostage = gr.components.Dataframe(
            value=eval_dataframe_test_twostage,
            interactive=False,
        )
    with gr.Tab("Results: Test | Sole-Planning"):
        leaderboard_table_test_soleplanning = gr.components.Dataframe(
            value=eval_dataframe_test_soleplanning,
            interactive=False,
        )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_val_twostage,
            leaderboard_table_val_soleplanning,
            leaderboard_table_test_twostage,
            leaderboard_table_test_soleplanning,
        ],
    )
    with gr.Accordion("Submit a new file for evaluation"):
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
                eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
                model = gr.Textbox(label="Foundation Model")
                tooluse_strategy = gr.Textbox(label="Tool-use Strategy")
                planning_strategy = gr.Textbox(label="Planning Strategy")
            with gr.Column():
                organization = gr.Textbox(label="Organization")
                mail = gr.Textbox(label="Contact email")
                file_output = gr.File()

        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        submit_button.click(
            add_new_eval,
            [
                level_of_test,
                eval_mode,
                model,
                tooluse_strategy,
                planning_strategy,
                organization,
                mail,
                file_output,
            ],
            submission_result,
        )

demo.launch(debug=True)