|
import os |
|
import sys |
|
# Make the leaderboard helper packages importable. ``os.path.abspath``
# resolves a relative path against the current working directory, so both
# entries are anchored to the directory the process was started from and
# must be computed before the ``os.chdir`` call below.
sys.path.append(os.path.abspath("./leaderboard/evaluation"))
sys.path.append(os.path.abspath("./leaderboard"))

# From here on, relative paths resolve against the directory holding this file.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# NOTE(review): an empty CURL_CA_BUNDLE is commonly used to switch off TLS
# certificate verification for HTTP clients -- confirm this is intended.
os.environ['CURL_CA_BUNDLE'] = ''
|
import json |
|
import datetime |
|
from email.utils import parseaddr |
|
|
|
import gradio as gr |
|
import pandas as pd |
|
import numpy as np |
|
|
|
from datasets import load_dataset |
|
from apscheduler.schedulers.background import BackgroundScheduler |
|
from huggingface_hub import HfApi |
|
|
|
|
|
|
|
from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink |
|
from eval import eval_score |
|
|
|
# Access token handed to every Hub call below; ``None`` when the TOKEN
# environment variable is not set.
TOKEN = os.environ.get("TOKEN", None)

# Hub repository identifiers, all derived from the owning organization.
OWNER = "osunlp"
DATA_DATASET = f"{OWNER}/TravelBench"
EVAL_DATASET = f"{OWNER}/TravelBenchEval"

api = HfApi()

YEAR_VERSION = "2024"

# Local scratch directory that receives scored result files before upload.
os.makedirs("scored", exist_ok=True)

# Both loads target the same evaluation repository, so they share
# EVAL_DATASET instead of repeating the repository id as a literal.
query_data_list = load_dataset(EVAL_DATASET, 'validation', token=TOKEN)['validation']
eval_results = load_dataset(EVAL_DATASET, 'scores', token=TOKEN)
|
def get_dataframe_from_results(eval_results, split):
    """Build the leaderboard table for one split.

    Args:
        eval_results: Mapping from split name to a dataset object exposing
            ``remove_columns`` and ``column_names``.
        split: Key of the split to render (e.g. ``"validation"``).

    Returns:
        A ``pandas.DataFrame`` without the ``"Mail"`` column, sorted by
        ``"Final Pass Rate"`` in descending order, in which every column
        whose name contains ``"Rate"`` is multiplied by 100 and rounded to
        two decimals. An empty split yields an empty frame that still
        carries the column headers.
    """
    # Contact addresses are stored with the scores but never displayed.
    public_rows = eval_results[split].remove_columns(["Mail"])

    # Passing the column names explicitly keeps the headers even when the
    # split has no rows; without them an empty split produces a column-less
    # frame and the sort below fails with a KeyError.
    df = pd.DataFrame(public_rows, columns=public_rows.column_names)
    if df.empty:
        return df

    df = df.sort_values(by=["Final Pass Rate"], ascending=False)

    # Rates are stored as fractions; show them as percentages.
    rate_columns = [name for name in public_rows.column_names if "Rate" in name]
    df[rate_columns] = df[rate_columns].multiply(100).round(decimals=2)
    return df
|
|
|
|
|
# Initial leaderboard tables shown when the app starts, one per split.
eval_dataframe_val, eval_dataframe_test = (
    get_dataframe_from_results(eval_results=eval_results, split=split_name)
    for split_name in ("validation", "test")
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_line_json_data(filename):
    """Read a JSON Lines file into a list.

    Args:
        filename: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        The decoded documents in file order. Lines that are empty or contain
        only whitespace are skipped, so an empty file yields ``[]``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        # Iterating the handle streams the file line by line instead of
        # loading it whole; blank lines are ignored rather than handed to
        # json.loads, which rejects an empty string.
        return [json.loads(line) for line in f if line.strip()]
|
|
|
|
|
def _safe_filename_part(value):
    """Return ``value`` as text with every character other than letters, digits, ``.``, ``_`` and ``-`` replaced by ``_``."""
    return "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in str(value))


def add_new_eval(
    val_or_test: str,
    eval_mode: str,
    model: str,
    planning_strategy: str,
    organization: str,
    mail: str,
    path_to_file: str,
):
    """Score an uploaded submission and append it to the leaderboard.

    Steps, in order: check the contact email, upload the raw submission to
    ``EVAL_DATASET``, score it with ``eval_score``, upload the scored result,
    append a row to the module-level ``eval_results`` and push that dataset
    back to the Hub under the ``'scores'`` config.

    Args:
        val_or_test: Name of the split the submission targets; also the key
            of ``eval_results`` that receives the new row.
        eval_mode: Evaluation mode label, used in the stored file names.
        model: Model name recorded in the leaderboard row.
        planning_strategy: Planning strategy recorded in the leaderboard row.
        organization: Submitting organization.
        mail: Contact email; stored in the row's ``"Mail"`` field.
        path_to_file: The uploaded submission, given either as a path string
            or as an object whose ``name`` attribute is the path. ``None``
            means nothing was attached.

    Returns:
        The formatted status message produced by ``format_warning`` (invalid
        email or missing file) or ``format_log`` (success).
    """
    # Only a minimal sanity check: the parsed address must contain an "@".
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email adress.")

    print("Adding new eval")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    # Accept both a file-like wrapper exposing ``.name`` and a plain path.
    file_path = getattr(path_to_file, "name", path_to_file)

    # Keep a copy of the raw submission next to the scores.
    api.upload_file(
        repo_id=EVAL_DATASET,
        path_or_fileobj=file_path,
        path_in_repo=f"{organization}/{val_or_test}_{eval_mode}_{planning_strategy}_raw_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=TOKEN,
    )

    result = eval_score(val_or_test, file_path=file_path, TOKEN=TOKEN)

    # The scratch file name embeds user-typed text; sanitize each part so a
    # value containing path separators cannot escape the "scored" directory
    # or make ``open`` fail on a missing sub-directory.
    scored_name = "_".join(
        _safe_filename_part(part)
        for part in (organization, val_or_test, eval_mode, planning_strategy)
    )
    scored_path = f"scored/{scored_name}.jsonl"
    with open(scored_path, "w", encoding="utf-8") as scored_file:
        scored_file.write(json.dumps(result) + "\n")

    api.upload_file(
        repo_id=EVAL_DATASET,
        path_or_fileobj=scored_path,
        path_in_repo=f"{organization}/{model}/{val_or_test}_{eval_mode}_{planning_strategy}_scored_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=TOKEN,
    )

    # Leaderboard row: submission metadata followed by the scored metrics.
    metric_names = (
        "Delivery Rate",
        "Commonsense Constraint Micro Pass Rate",
        "Commonsense Constraint Macro Pass Rate",
        "Hard Constraint Micro Pass Rate",
        "Hard Constraint Macro Pass Rate",
        "Final Pass Rate",
    )
    eval_entry = {
        "Model": model,
        "Planning Strategy": planning_strategy,
        "Organization": organization,
        "Mail": mail,
        **{name: result[name] for name in metric_names},
    }

    # Updates the module-level dataset in place so later submissions build
    # on this one, then publishes it.
    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
    print(eval_results)
    eval_results.push_to_hub(EVAL_DATASET, config_name='scores', token=TOKEN)

    return format_log(f"Model {model} submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed")
|
|
|
|
|
def refresh():
    """Reload the ``'scores'`` config of ``EVAL_DATASET`` and rebuild both leaderboard tables.

    Returns:
        A ``(validation_dataframe, test_dataframe)`` tuple.
    """
    latest_scores = load_dataset(EVAL_DATASET, 'scores', token=TOKEN)
    return tuple(
        get_dataframe_from_results(eval_results=latest_scores, split=split_name)
        for split_name in ("validation", "test")
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
# NOTE(review): the nesting of the layout below is reconstructed from the
# order of the statements -- confirm it matches the intended page layout.
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # One read-only results table per split.
    with gr.Tab("Results: Validation"):
        leaderboard_table_val = gr.components.Dataframe(
            value=eval_dataframe_val,
            interactive=False,
        )
    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
            value=eval_dataframe_test,
            interactive=False,
        )

    # Reloads the scores and replaces the contents of both tables.
    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        fn=refresh,
        inputs=[],
        outputs=[leaderboard_table_val, leaderboard_table_test],
    )

    with gr.Accordion("Submit a new file for evaluation"):
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
                eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
                model = gr.Textbox(label="Foundation Model")
                planning_strategy = gr.Textbox(label="Planning Strategy")
            with gr.Column():
                organization = gr.Textbox(label="Organization")
                mail = gr.Textbox(label="Contact email")
                file_output = gr.File()

        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        # The input order must match the parameter order of add_new_eval.
        submit_button.click(
            fn=add_new_eval,
            inputs=[
                level_of_test,
                eval_mode,
                model,
                planning_strategy,
                organization,
                mail,
                file_output,
            ],
            outputs=submission_result,
        )

demo.launch(debug=True)
|
|