import os
import zipfile
import json
from pathlib import Path
import gradio as gr
import pandas as pd
from email_validator import validate_email, EmailNotValidError
from huggingface_hub import HfApi, HfFileSystem
from utils import (
INTRODUCTION_TEXT,
TITLE,
format_error,
format_log,
format_warning,
model_hyperlink,
read_jsonl,
)
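
# Hub token and dataset locations for the submitted results and grounding task configs.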
TOKEN = os.environ.get("TOKEN", None)
OWNER="Skywork"
REAL_WORLD_RESULTS_FILE = f"hf://datasets/{OWNER}/submitted_results/leaderboard/real_world_result.parquet"
GUI_GROUNDING_RESULTS_FILE = f"hf://datasets/{OWNER}/submitted_results/leaderboard/gui_grounding_result.parquet"
SUBMISSION_DATASET = f"{OWNER}/submitted_results"
GROUNDING_FOLDER = f"hf://datasets/{OWNER}/agent-studio-data/grounding"
class ScoreManager:
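    """Loads leaderboard results from the Hub, scores new submissions, and publishes updates."""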
def __init__(self) -> None:
        self.eval_results: pd.DataFrame
        self.display_eval_results: pd.DataFrame
self.grounding_results: pd.DataFrame
self.display_grounding_results: pd.DataFrame
self.api = HfApi(token=TOKEN)
self.fs = HfFileSystem(token=TOKEN)
self.refresh()
@staticmethod
def calc_real_task_scores(base_path: Path):
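        """Aggregate real-world task results found under ``base_path``.

        Expects per-task JSONL files (gcalendar.jsonl, gmail.jsonl, gdocs.jsonl,
        filesystem.jsonl, desktop_hard.jsonl, vscode.jsonl) and returns, for each app,
        the average score together with the accuracy of the agent's self-evaluation.
        """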
apps = ["filesystem", "google", "GUI"]
scores_per_app = {}
for app in apps:
if app == "google":
data = []
try:
data += read_jsonl((base_path / "gcalendar.jsonl").as_posix())
data += read_jsonl((base_path / "gmail.jsonl").as_posix())
data += read_jsonl((base_path / "gdocs.jsonl").as_posix())
except FileNotFoundError:
print("No google data found")
continue
elif app == "filesystem":
try:
data = read_jsonl((base_path / "filesystem.jsonl").as_posix())
except FileNotFoundError:
print("No filesystem data found")
continue
elif app == "GUI":
data = []
try:
data += read_jsonl((base_path / "desktop_hard.jsonl").as_posix())
data += read_jsonl((base_path / "vscode.jsonl").as_posix())
except FileNotFoundError:
print("No GUI data found")
continue
else:
raise ValueError("Invalid app")
scores = [entry["score"] for entry in data]
tp = 0
fp = 0
tn = 0
fn = 0
for entry in data:
if entry["score"] > 0:
if entry["self_eval"]["score"] > 0:
tp += 1
else:
fp += 1
else:
if entry["self_eval"]["score"] > 0:
fn += 1
else:
tn += 1
score = round(sum(scores) / len(scores) * 100, 1)
accuracy = round((tp + tn) / (tp + tn + fp + fn) * 100, 1)
# print(f"Average score: {score}")
# print(f"Total tasks: {tp + fp + tn + fn}")
# print(f"True positive: {tp}")
# print(f"False positive: {fp}")
# print(f"True negative: {tn}")
# print(f"False negative: {fn}")
# print(f"Accuracy: {accuracy}\n")
scores_per_app[app] = {
"score": score,
"success_tasks": tp + fp,
"total_tasks": tp + fp + tn + fn,
"accuracy": accuracy,
}
return scores_per_app
def calc_gui_grounding_scores(self, base_path: Path):
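        """Score GUI grounding results laid out as ``<os>/<app>/results.jsonl`` under ``base_path``,
        checking task ids against the reference task configurations stored on the Hub."""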
def calc_per_app_grounding_scores(result_dict, task_configs):
total_tasks = len(result_dict)
            task_ids = {task_config["task_id"] for task_config in task_configs}
success = 0
for result in result_dict:
if result["task_id"] not in task_ids:
raise ValueError(f"Task id {result['task_id']} not found!")
if result["score"] == 1.0:
success += 1
return {
"score": success / total_tasks * 100,
"total_tasks": total_tasks,
"success_tasks": success,
}
        scores_per_os = {}
        # Use distinct names so the imported ``os`` module is not shadowed by the per-OS folders.
        for os_dir in base_path.iterdir():
            if not os_dir.is_dir():
                continue
            try:
                scores_per_app = {}
                for app_dir in os_dir.iterdir():
                    if not app_dir.is_dir():
                        continue
                    with self.fs.open(
                        f"{GROUNDING_FOLDER}/{app_dir.relative_to(base_path).as_posix()}/actions.jsonl",
                        "r"
                    ) as f:
                        task_configs = read_jsonl(f)
                    results_dict = read_jsonl((app_dir / "results.jsonl").as_posix())
                    results = calc_per_app_grounding_scores(results_dict, task_configs)
                    scores_per_app[app_dir.name] = results
                scores_per_os[os_dir.name] = scores_per_app
            except FileNotFoundError:
                print(f"No data found for {os_dir.name}")
                continue
        return scores_per_os
@staticmethod
def to_displayed_table(df: pd.DataFrame):
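        """Format a results table for display: hyperlink model names, drop private columns, sort and round."""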
df_display = df.copy()
df_display['model'] = df_display.apply(
lambda row: model_hyperlink(row['url'], row['model']) if \
(row['url'] != "") and (not pd.isna(row['url'])) \
else row['model'],
axis=1
)
df_display = df_display.drop(columns=["url", "organization"])
df_display = df_display.sort_values(by="Average (%) ⬆️", ascending=False)
df_display = df_display.map(lambda x: round(x, 2) if isinstance(x, float) else x)
return df_display
def refresh(self):
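        """Reload both leaderboards from the Hub, falling back to empty tables if no results exist yet."""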
try:
with self.fs.open(REAL_WORLD_RESULTS_FILE, "rb") as f:
self.eval_results = pd.read_parquet(f)
except FileNotFoundError:
self.eval_results = pd.DataFrame(
columns=["model", "agent_type", "Average (%) ⬆️", "filesystem (%)", "google (%)", "GUI (%)", "organization", "url", "model_family"]
)
try:
with self.fs.open(GUI_GROUNDING_RESULTS_FILE, "rb") as f:
self.grounding_results = pd.read_parquet(f)
except FileNotFoundError:
self.grounding_results = pd.DataFrame(
columns=["model", "agent_type", "Average (%) ⬆️", "windows (%)", "linux (%)", "macos (%)", "organization", "url", "model_family"]
)
self.display_eval_results = self.to_displayed_table(self.eval_results)
self.display_grounding_results = self.to_displayed_table(self.grounding_results)
return self.display_eval_results, self.display_grounding_results
def add_new_eval(
self,
dataset_selection: str,
model_name: str,
model_family: str,
agent_type: str,
url: str,
uploaded_file_path: str,
organization: str,
mail: str,
):
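        """Validate a submission, extract and score the uploaded results, and publish them to the Hub."""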
# Mandatory fields
if model_name == "":
return format_error("Model name cannot be empty")
elif model_family == "":
return format_error("Model family cannot be empty")
elif agent_type == "":
return format_error("Agent type cannot be empty")
elif organization == "":
return format_error("Organization cannot be empty")
elif mail == "":
return format_error("Mail cannot be empty")
elif uploaded_file_path == "":
return format_error("File cannot be empty")
        # Check whether this model has already been submitted by the same organization
        if dataset_selection == "Real-world tasks":
            if model_name.lower() in {m.lower() for m in self.eval_results["model"]} \
                    and organization.lower() in {o.lower() for o in self.eval_results["organization"]}:
                return format_warning("This model has already been submitted.")
        else:
            if model_name.lower() in {m.lower() for m in self.grounding_results["model"]} \
                    and organization.lower() in {o.lower() for o in self.grounding_results["organization"]}:
                return format_warning("This model has already been submitted.")
# Check if the email is valid
try:
validate_email(mail, check_deliverability=True)
except EmailNotValidError as e:
return format_error(f"Invalid email")
if url == "":
url = None
self.refresh()
try:
file_path = Path(uploaded_file_path)
results_folder_path = file_path.parent / model_name
with zipfile.ZipFile(file_path, 'r') as zip_file:
zip_file.extractall(results_folder_path)
print(results_folder_path)
contact_info = {
"model": model_name,
"model_family": model_family,
"url": url,
"organization": organization,
"mail": mail,
}
if dataset_selection == "Real-world tasks":
scores = self.calc_real_task_scores(results_folder_path)
if scores == {}:
return format_error("No data found in the zip file, please make sure the file structure is correct.")
eval_entry = {
"model": model_name,
"model_family": model_family,
"agent_type": agent_type,
"url": url,
"organization": organization,
}
                succ = 0
                total = 0
                # Iterate under a distinct name so ``scores`` is not rebound mid-loop
                for app, app_scores in scores.items():
                    eval_entry[f"{app} (%)"] = app_scores["score"]
                    succ += app_scores["success_tasks"]
                    total += app_scores["total_tasks"]
                eval_entry["Average (%) ⬆️"] = succ / total * 100
print(eval_entry)
self.eval_results = pd.concat(
[self.eval_results, pd.DataFrame([eval_entry])],
ignore_index=True
)
self.upload2hub(
results_path=REAL_WORLD_RESULTS_FILE,
results=self.eval_results,
folder_path=results_folder_path,
path_in_repo=f"origin/{organization.lower()}/{model_name.lower()}/real_world",
contact_info=contact_info,
)
elif dataset_selection == "GUI grounding tasks":
scores = self.calc_gui_grounding_scores(results_folder_path)
if scores == {}:
return format_error("No data found in the zip file, please make sure the file structure is correct.")
print(scores)
eval_entry: dict[str, str | float] = {
"model": model_name,
"model_family": model_family,
"agent_type": agent_type,
"url": url,
"organization": organization,
}
succ = 0
total = 0
                for os_name, app_scores in scores.items():
                    succ_per_app = 0
                    total_per_app = 0
                    for app, score in app_scores.items():
                        succ_per_app += score["success_tasks"]
                        total_per_app += score["total_tasks"]
                    succ += succ_per_app
                    total += total_per_app
                    eval_entry[f"{os_name} (%)"] = succ_per_app / total_per_app * 100
print(f"Total: {total}, Success: {succ}")
eval_entry["Average (%) ⬆️"] = succ / total * 100
self.grounding_results = pd.concat(
[self.grounding_results, pd.DataFrame([eval_entry])],
ignore_index=True
)
self.upload2hub(
results_path=GUI_GROUNDING_RESULTS_FILE,
results=self.grounding_results,
folder_path=results_folder_path,
path_in_repo=f"origin/{organization.lower()}/{model_name.lower()}/grounding",
contact_info=contact_info,
)
else:
return format_error("Invalid dataset selection")
except Exception as e:
return format_error(f"Internal Error: {e}")
return format_log("Submitted successfully")
def upload2hub(
self,
results_path: str,
results: pd.DataFrame,
folder_path: Path,
path_in_repo: str,
        contact_info: dict,
) -> None:
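        """Write the updated results parquet and upload the extracted submission folder, including contact info, to the Hub."""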
with self.fs.open(results_path, "wb") as f:
results.to_parquet(f)
with open(folder_path / "contact_info.json", "w") as f:
f.write(json.dumps(contact_info))
self.api.upload_folder(
folder_path=folder_path,
path_in_repo=path_in_repo,
repo_id=SUBMISSION_DATASET,
repo_type="dataset",
)
if __name__ == "__main__":
score_manager = ScoreManager()
iface = gr.Blocks()
with iface:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Tabs(elem_classes="main_tabs") as main_tabs:
with gr.TabItem("🎞️ GUI grounding tasks table", id=0):
leaderboard_gui_grounding_table = gr.components.Dataframe(
value=score_manager.display_grounding_results,
datatype=["str", "str", "number", "number", "number", "number", "str"],
interactive=False,
column_widths=["20%"]
)
with gr.TabItem("🌍 Real-world tasks table", id=1):
leaderboard_real_world_table = gr.components.Dataframe(
value=score_manager.display_eval_results,
datatype=["str", "str", "number", "number", "number", "number", "str"],
interactive=False,
column_widths=["20%"]
)
refresh_button = gr.Button("Refresh")
refresh_button.click(
score_manager.refresh,
inputs=[],
outputs=[
leaderboard_real_world_table,
leaderboard_gui_grounding_table,
],
)
        with gr.Accordion("Submit a new model for evaluation (fields marked with * are required)"):
with gr.Row():
with gr.Column():
dataset_selection = gr.Radio(["GUI grounding tasks", "Real-world tasks"], value="GUI grounding tasks")
model_name_textbox = gr.Textbox(label="Model name*")
model_family_textbox = gr.Textbox(label="Model family*")
agent_type_textbox = gr.Textbox(label="Agent type*")
url_textbox = gr.Textbox(label="Url to model information")
with gr.Column():
organization = gr.Textbox(label="Organization*")
mail = gr.Textbox(label="Contact email* (will be stored privately, & used if there is an issue with your submission)")
file_output = gr.File(label="Upload model output* (one zip file)")
submit_button = gr.Button("Submit Eval")
submission_result = gr.Markdown()
submit_button.click(
score_manager.add_new_eval,
[
dataset_selection,
model_name_textbox,
model_family_textbox,
agent_type_textbox,
url_textbox,
file_output,
organization,
mail
],
submission_result,
)
iface.launch()