"""Gradio leaderboard app for TravelPlanner.

Displays the published validation/test results for the two-stage and
sole-planning modes, and lets a user upload a plan file to be scored.
"""
import os
import sys

# The evaluation code is made importable via paths resolved against the launch
# directory, so these appends must happen before the chdir below.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./leaderboard/evaluation")))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./leaderboard")))
os.chdir(os.path.dirname(os.path.abspath(__file__)))
# NOTE(review): an empty CURL_CA_BUNDLE is a common way to turn off TLS
# certificate verification for HTTP clients — confirm this is intentional.
os.environ['CURL_CA_BUNDLE'] = ''
import json
from datetime import datetime
from email.utils import parseaddr

import gradio as gr
import pandas as pd
import numpy as np

from datasets import load_dataset
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# InfoStrings
# from scorer import question_scorer
from content import (
    format_error,
    format_warning,
    format_log,
    TITLE,
    INTRODUCTION_TEXT,
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    model_hyperlink,
)
from eval import eval_score

TOKEN = os.environ.get("TOKEN", None)

OWNER = "osunlp"
DATA_DATASET = f"{OWNER}/TravelPlanner"
EVAL_DATASET = f"{OWNER}/TravelPlannerEval"
RESULTS_DATASET = f"{OWNER}/TravelPlannerPublicResults"

api = HfApi()

# 'scores' = "2024"

os.makedirs("scored", exist_ok=True)


def get_dataframe_from_results(eval_results, split, mode):
    """Build the leaderboard table for one split/mode pair.

    Args:
        eval_results: Mapping of split names to datasets, as returned by
            ``load_dataset``; the entry ``f"{split}_{mode}"`` is used.
        split: Split prefix, e.g. ``"validation"`` or ``"test"``.
        mode: Mode suffix, e.g. ``"twostage"`` or ``"soleplanning"``.

    Returns:
        A ``pandas.DataFrame`` without the ``"Mail"`` column, sorted by
        ``"Final Pass Rate"`` in descending order, in which every column whose
        name contains ``"Rate"`` is multiplied by 100 and rounded to 2 decimals.
    """
    local_df = eval_results[f'{split}_{mode}']
    local_df = local_df.remove_columns(["Mail"])
    df = pd.DataFrame(local_df)
    # Assign the result back instead of calling mask(..., inplace=True) on a
    # column selection: the chained in-place form is not guaranteed to write
    # through to `df` (it is a no-op under pandas Copy-on-Write).
    df['Organization'] = df['Organization'].mask(
        df['Organization'] == 'TravelBench Team', 'TravelPlanner Team'
    )
    df = df.sort_values(by=["Final Pass Rate"], ascending=False)

    numeric_cols = [c for c in local_df.column_names if "Rate" in c]
    df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
    return df


def _load_results():
    """Download the current results dataset, bypassing any local cache."""
    return load_dataset(
        RESULTS_DATASET,
        'scores',
        token=TOKEN,
        download_mode="force_redownload",
        ignore_verifications=True,
    )


def _build_leaderboards(eval_results):
    """Return the four leaderboard frames in display order.

    Order: validation/two-stage, validation/sole-planning, test/two-stage,
    test/sole-planning.
    """
    return tuple(
        get_dataframe_from_results(eval_results=eval_results, split=split, mode=mode)
        for split in ("validation", "test")
        for mode in ("twostage", "soleplanning")
    )


# # Display the results
eval_results = _load_results()
(
    eval_dataframe_val_twostage,
    eval_dataframe_val_soleplanning,
    eval_dataframe_test_twostage,
    eval_dataframe_test_soleplanning,
) = _build_leaderboards(eval_results)


# def restart_space():
#     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)


def load_line_json_data(filename):
    """Read a JSON Lines file and return its records as a list.

    Blank lines (including a completely empty file) are skipped rather than
    being passed to ``json.loads``.

    Args:
        filename: Path of a UTF-8 text file with one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def add_new_eval(
    val_or_test: str,
    eval_mode: str,
    path_to_file: str,
):
    """Score an uploaded submission and produce a downloadable detail report.

    Args:
        val_or_test: Split to evaluate against; forwarded to ``eval_score``.
        eval_mode: Evaluation mode selected in the UI.
            NOTE(review): this value is not forwarded to ``eval_score`` —
            confirm whether the scorer should receive it.
        path_to_file: The uploaded file, either as a path string or as an
            object exposing the path through a ``name`` attribute; ``None``
            when nothing was attached.

    Returns:
        A 2-tuple matching the click handler's outputs: the markdown message
        and the file component update for the detail report.
    """
    print("Adding new eval")

    if path_to_file is None:
        # The handler is wired to two outputs, so return one value for each;
        # the report download stays hidden when there is nothing to report.
        return format_warning("Please attach a file."), gr.File(visible=False)

    # Compute score
    file_path = getattr(path_to_file, "name", path_to_file)
    result, detail_json = eval_score(val_or_test, file_path=file_path, TOKEN=TOKEN)
    print(detail_json)
    print(type(detail_json))

    # The detail report is written next to the app under a timestamped name.
    outputPath = os.path.join('.', datetime.now().strftime('%Y%m%d%H%M%S') + '.json')
    with open(outputPath, 'w', encoding='utf-8') as w:
        json.dump(detail_json, w)

    return format_log(f"{result}"), gr.File(
        label="Download the detailed constraint pass rate reports",
        value=outputPath,
        visible=True,
    )


def refresh():
    """Re-download the results and return fresh frames for the four tables.

    Returns:
        The validation/two-stage, validation/sole-planning, test/two-stage and
        test/sole-planning frames, in that order. The module-level frames are
        not modified.
    """
    return _build_leaderboards(_load_results())


# NOTE(review): the nesting of the layout below was reconstructed from a
# whitespace-collapsed source — confirm it matches the intended UI.
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tab("Results: Validation | Two-Stage "):
        leaderboard_table_val_twostage = gr.components.Dataframe(
            value=eval_dataframe_val_twostage, interactive=False,
        )
    with gr.Tab("Results: Validation | Sole-Planning"):
        leaderboard_table_val_soleplanning = gr.components.Dataframe(
            value=eval_dataframe_val_soleplanning, interactive=False,
        )
    with gr.Tab("Results: Test | Two-Stage "):
        leaderboard_table_test_twostage = gr.components.Dataframe(
            value=eval_dataframe_test_twostage, interactive=False,
        )
    with gr.Tab("Results: Test | Sole-Planning"):
        leaderboard_table_test_soleplanning = gr.components.Dataframe(
            value=eval_dataframe_test_soleplanning, interactive=False,
        )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_val_twostage,
            leaderboard_table_val_soleplanning,
            leaderboard_table_test_twostage,
            leaderboard_table_test_soleplanning,
        ],
    )
    with gr.Accordion("Submit a new file for evaluation"):
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
                eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
                file_input = gr.File(label="Upload file")
                file_output = gr.File(label="Download the detailed constraint pass rate reports", visible=False)
        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        submit_button.click(
            add_new_eval,
            [
                level_of_test,
                eval_mode,
                file_input,
            ],
            [submission_result, file_output]
        )

demo.launch(debug=True)