import os
import json
import datetime
from email.utils import parseaddr
import gradio as gr
import pandas as pd
import numpy as np
from datasets import load_dataset, DatasetDict
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
# InfoStrings
from scorer import question_scorer
from content import (
    format_error,
    format_warning,
    format_log,
    TITLE,
    INTRODUCTION_TEXT,
    model_hyperlink,
)
TOKEN = os.environ.get("TOKEN", None)
OWNER = "stemdataset"
INTERNAL_DATA_DATASET = f"{OWNER}/STEM-Labels-Private"
SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
CONTACT_DATASET = f"{OWNER}/contact_info"
RESULTS_DATASET = f"{OWNER}/results"
LEADERBOARD_PATH = f"{OWNER}/stem-leaderboard"
api = HfApi()
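# Local staging area for scored submissions before they are uploaded to SUBMISSION_DATASET.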
os.makedirs("scored", exist_ok=True)
# Display the results
eval_results = load_dataset(
    RESULTS_DATASET,
    token=TOKEN,
    download_mode="force_redownload",
    verification_mode="no_checks",
)
contact_infos = load_dataset(
    CONTACT_DATASET,
    token=TOKEN,
    download_mode="force_redownload",
    verification_mode="no_checks",
)
def get_dataframe_from_results(eval_results: DatasetDict, split):
    """Turn one split of the results dataset into a display-ready leaderboard DataFrame."""
    local_df = eval_results[split]
    local_df = local_df.map(
        lambda row: {"model": model_hyperlink(row["url"], row["model"])}
    )
    local_df = local_df.remove_columns(["url"])
    local_df = local_df.rename_column("model", "Model Name")
    local_df = local_df.rename_column("model_family", "Model Family")
    local_df = local_df.rename_column("average", "Average")
    local_df = local_df.rename_column("science", "Science")
    local_df = local_df.rename_column("technology", "Technology")
    local_df = local_df.rename_column("engineering", "Engineering")
    local_df = local_df.rename_column("math", "Math")
    local_df = local_df.rename_column("organisation", "Organisation")
    local_df = local_df.rename_column("submit_date", "Submit Date")
    df = pd.DataFrame(local_df)
    df = df[
        [
            "Model Name",
            "Model Family",
            "Science",
            "Technology",
            "Engineering",
            "Math",
            "Average",
            "Organisation",
            "Submit Date",
        ]
    ]
    df = df.sort_values(by=["Average"], ascending=False)
    # Round the scores and render them with exactly one decimal place.
    numeric_cols = ["Science", "Technology", "Engineering", "Math", "Average"]
    df[numeric_cols] = df[numeric_cols].round(decimals=1)
    for col in numeric_cols:
        df[col] = df[col].apply(lambda x: f"{x:.1f}")
    return df

eval_dataframe_test = get_dataframe_from_results(
    eval_results=eval_results, split="basic"
)
# Gold answers
gold_dataset = load_dataset(INTERNAL_DATA_DATASET, token=TOKEN)["labels"]
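# Each gold label provides the question's `subject` and its `answer_idx`, which
# calc_test_acc below compares against the submitted predictions.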
def restart_space():
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)

# One datatype per leaderboard column: Model Name, Model Family, Science,
# Technology, Engineering, Math, Average, Organisation, Submit Date.
TYPES = ["markdown", "str", "number", "number", "number", "number", "number", "str", "str"]
def calc_test_acc(preds: list[int]) -> dict[str, float]:
    """Score predictions against the gold labels, per subject and on average."""
    tmp_accs = {
        "science": [0, 0],
        "technology": [0, 0],
        "engineer": [0, 0],
        "math": [0, 0],
    }
    labels = gold_dataset
    for pred, label in zip(preds, labels):
        subject = label["subject"]
        tmp_accs[subject][1] += 1
        if pred == label["answer_idx"]:
            tmp_accs[subject][0] += 1
    # Per-subject accuracy = correct / total, then the unweighted mean across subjects,
    # all converted to percentages rounded to one decimal.
    accs = {k: v[0] / v[1] for k, v in tmp_accs.items()}
    accs["average"] = np.mean(list(accs.values()))
    accs = {k: round(v * 100, 1) for k, v in accs.items()}
    return accs

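# Submission files are expected to contain one predicted answer index per line,
# in the same order as the gold labels, e.g.:
#   2
#   0
#   3
# calc_test_acc then returns percentages keyed by subject, for example
# (illustrative values only):
#   {"science": 45.2, "technology": 51.0, "engineer": 38.7, "math": 29.4, "average": 41.1}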
def add_new_eval(
    val_or_test: str,
    model: str,
    model_family: str,
    url: str,
    path_to_file: gr.File,
    organisation: str,
    mail: str,
):
    curr_timestamp = datetime.datetime.today()
    # Very basic email validation
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")
    if model == "":
        return format_warning("Please provide a model name.")
    if model_family == "":
        return format_warning("Please provide a model family.")
    print(
        json.dumps(
            {
                "val_or_test": val_or_test,
                "model": model,
                "model_family": model_family,
                "url": url,
                "path_to_file": path_to_file,
                "organisation": organisation,
                "mail": mail,
            },
            indent=2,
            default=str,  # so non-JSON-serializable values (e.g. the uploaded file object) don't break logging
        )
    )
    print("Adding new eval")
    # Check whether this model/organisation combination was already submitted and warn if so.
    if model.lower() in set(
        [m.lower() for m in eval_results["basic"]["model"]]
    ) and organisation.lower() in set(
        [l.lower() for l in eval_results["basic"]["organisation"]]
    ):
        return format_warning("This model has already been submitted.")
    if path_to_file is None:
        return format_warning("Please attach a file.")
    # Save the raw submitted file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=path_to_file.name,
        path_in_repo=f"{organisation}/{model}/{val_or_test}_raw_{curr_timestamp}.txt",
        repo_type="dataset",
        token=TOKEN,
    )
    # Compute the scores
    file_path = path_to_file.name
    with open(f"scored/{organisation}_{model}.json", "w") as scored_file:
        with open(file_path, "r") as f:
            preds = []
            for ix, line in enumerate(f):
                try:
                    pred_idx = int(line.strip())
                except Exception:
                    return format_error(
                        f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file."
                    )
                preds.append(pred_idx)
        stem_scores = calc_test_acc(preds)
        scored_file.write(json.dumps(stem_scores, indent=2))
    # Save the scored file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=f"scored/{organisation}_{model}.json",
        path_in_repo=f"{organisation}/{model}/{val_or_test}_scored_{curr_timestamp}.json",
        repo_type="dataset",
        token=TOKEN,
    )
    # Actual submission: append the entry to the results dataset and push it
    eval_entry = {
        "model": model,
        "model_family": model_family,
        "url": url,
        "organisation": organisation,
        "submit_date": "\n".join(str(curr_timestamp).split(" ")),
        "science": stem_scores["science"],
        "technology": stem_scores["technology"],
        "engineering": stem_scores["engineer"],
        "math": stem_scores["math"],
        "average": stem_scores["average"],
    }
    eval_results["basic"] = eval_results["basic"].add_item(eval_entry)
    print(eval_results)
    eval_results.push_to_hub(RESULTS_DATASET, token=TOKEN)
    # Store the contact information privately
    contact_info = {
        "model": model,
        "model_family": model_family,
        "url": url,
        "organisation": organisation,
        "mail": mail,
        "submit_date": "\n".join(str(curr_timestamp).split(" ")),
    }
    contact_infos["basic"] = contact_infos["basic"].add_item(contact_info)
    contact_infos.push_to_hub(CONTACT_DATASET, token=TOKEN)
    return format_log(
        f"Model {model} submitted by {organisation} successfully.\nPlease refresh the leaderboard and wait a moment for the score to be displayed."
    )

def refresh():
    """Reload the results dataset and rebuild the leaderboard table."""
    eval_results = load_dataset(
        RESULTS_DATASET,
        token=TOKEN,
        download_mode="force_redownload",
        verification_mode="no_checks",
    )
    eval_dataframe_test = get_dataframe_from_results(
        eval_results=eval_results, split="basic"
    )
    return eval_dataframe_test

def upload_file(files):
    file_paths = [file.name for file in files]
    return file_paths

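# Note: this helper maps uploaded files to their local paths; it is not currently
# wired to any component in the Blocks UI below.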
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
            value=eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            wrap=True,
        )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_test,
        ],
    )

    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["test"], value="test", label="Split")
                model_name_textbox = gr.Textbox(label="Model name")
                model_family_textbox = gr.Textbox(label="Model family")
                url_textbox = gr.Textbox(label="URL to model information")
            with gr.Column():
                organisation = gr.Textbox(label="Organisation")
                mail = gr.Textbox(
                    label="Contact email (will be stored privately, and used only if there is an issue with your submission)"
                )
                file_output = gr.File()

        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        submit_button.click(
            add_new_eval,
            [
                level_of_test,
                model_name_textbox,
                model_family_textbox,
                url_textbox,
                file_output,
                organisation,
                mail,
            ],
            submission_result,
        )

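# Restart the Space on a schedule so the leaderboard is rebuilt from the latest
# pushed results (the datasets are only loaded at startup and on manual refresh).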
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.launch(debug=True, server_name="0.0.0.0")