import gradio as gr
import pandas as pd
import numpy as np
import json
import os
from pathlib import Path
from huggingface_hub import snapshot_download, HfApi
# =========================
# Basic Config
# =========================
DATASET_REPO = "Fysics-AI/FysicsWorld-Leaderborad-Result"
HF_TOKEN = os.environ.get("HF_TOKEN")
TRACK_TO_CSV = {
"omni-mllm": "omni-mllm.csv",
"image-gen": "image-gen.csv",
"video-gen": "video-gen.csv",
}
# =========================
# Download Dataset (once)
# =========================
LOCAL_DATA_DIR = Path(
snapshot_download(
repo_id=DATASET_REPO,
repo_type="dataset",
token=HF_TOKEN,
)
)
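# snapshot_download mirrors the dataset repo into the local HF cache and returns
# the path of that snapshot; the token is only needed for private or gated repos.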
print("๐Ÿ“‚ Dataset dir:", LOCAL_DATA_DIR)
print("๐Ÿ“„ Files:", [p.name for p in LOCAL_DATA_DIR.iterdir()])
# =========================
# Column Rename Maps (key fix)
# =========================
OMNI_MLLM_RENAME = {
"Task1-1": "Image\nUnderstanding",
"Task1-2": "Video\nUnderstanding",
"Task2-1": "Speech-Driven\nImage Understanding",
"Task2-2": "Image-Audio\nReasoning",
"Task2-3": "Speech-Based\nImage QA",
"Task2-4": "Speech Generation\nfrom Image",
"Task2-5": "Audio Matching\nfrom Image",
"Task3-1": "Speech-Driven\nVideo Understanding",
"Task3-2": "Video-Audio\nReasoning",
"Task3-3": "Speech-Based\nVideo QA",
"Task3-4": "Speech Generation\nfrom Video",
"Task3-5": "Audio Matching\nfrom Video",
"Task3-6": "Next-Action\nPrediction",
}
AUDIO_RENAME = {
"Task1-3": "Audio Reasoning"
}
IMAGE_GEN_RENAME = {
"WIScore": "WIScore",
"SC": "Semantic\nConsistency",
"PQ": "Perceptual\nQuality",
"OR": "Overall\nQuality",
}
VIDEO_GEN_RENAME = {
"Imaging": "Imaging",
"Aesthetic": "Aesthetic",
"Motion": "Motion",
"Temporal": "Temporal",
}
# =========================
# Utils
# =========================
def format_numeric_columns(df, decimals=2):
df = df.copy()
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
df[col] = df[col].map(
lambda x: f"{x:.{decimals}f}" if pd.notnull(x) else ""
)
return df
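# Example (illustrative): with df = pd.DataFrame({"x": [3.14159, None]}),
# format_numeric_columns(df) turns column "x" into ["3.14", ""]; non-numeric
# columns are left untouched.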
def load_csv(filename, sort_key=None, ascending=False):
csv_path = LOCAL_DATA_DIR / filename
df = pd.read_csv(csv_path)
if sort_key and sort_key in df.columns:
df = df.sort_values(sort_key, ascending=ascending)
df = format_numeric_columns(df, decimals=2)
return df
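# Example: load_csv("image-gen.csv", sort_key="Overall") returns the table
# sorted with the highest Overall score first (ascending=False by default).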
# =========================
# Submission Logic (unchanged)
# =========================
api = HfApi()
def parse_submission(file_bytes):
data = json.loads(file_bytes.decode("utf-8"))
required = ["benchmark", "track", "model", "type", "metrics"]
for k in required:
if k not in data:
raise ValueError(f"Missing field: {k}")
if data["benchmark"] != "OmniWorld":
raise ValueError("Invalid benchmark")
if data["track"] not in TRACK_TO_CSV:
raise ValueError("Invalid track")
return data
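# A minimal example of the JSON payload parse_submission expects (the model
# name and metric values here are illustrative, not real results):
# {
#     "benchmark": "OmniWorld",
#     "track": "image-gen",
#     "model": "my-model-v1",
#     "type": "Open-Source",
#     "metrics": {"WIScore": 0.80, "SC": 7.0, "PQ": 7.5, "OR": 6.5, "Overall": 7.5}
# }
# "track" must be a key of TRACK_TO_CSV, and the metric keys should match the
# column headers of the corresponding CSV.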
def append_submission(data):
csv_name = TRACK_TO_CSV[data["track"]]
csv_path = LOCAL_DATA_DIR / csv_name
df = pd.read_csv(csv_path)
if data["model"] in df["Model"].values:
raise ValueError("Model already exists in leaderboard")
row = {
"Model": data["model"],
"Type": data["type"],
}
row.update(data["metrics"])
df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
df.to_csv(csv_path, index=False)
api.upload_file(
path_or_fileobj=str(csv_path),
path_in_repo=csv_name,
repo_id=DATASET_REPO,
repo_type="dataset",
token=HF_TOKEN,
)
def handle_submit(file):
if file is None:
return "โŒ No file uploaded"
try:
data = parse_submission(file)
append_submission(data)
return "โœ… Submission successful! Please refresh leaderboard."
except Exception as e:
return f"โŒ Error: {str(e)}"
# =========================
# Gradio UI
# =========================
with gr.Blocks(
theme=gr.themes.Soft(),
css="""
.container {
max-width: 1200px;
margin: auto;
}
.leaderboard-links a {
display: inline-block;
margin: 0 8px;
padding: 6px 12px;
border-radius: 20px;
background: #f4f4f5;
color: #111827;
text-decoration: none;
font-weight: 500;
font-size: 14px;
}
.leaderboard-links a:hover {
background: #e5e7eb;
}
.description {
max-width: 900px;
margin: 18px auto 30px auto;
font-size: 16px;
line-height: 1.7;
color: #374151;
text-align: center;
}
body, .gradio-container {
font-family:
-apple-system,
BlinkMacSystemFont,
"Segoe UI",
Roboto,
"Helvetica Neue",
Arial,
"Noto Sans",
"Liberation Sans",
sans-serif;
}
/* OmniLLM table: column 1 (Model) */
table th:nth-child(1),
table td:nth-child(1) {
min-width: 220px;
max-width: 220px;
white-space: nowrap;
}
/* Column 2 (Type) */
table th:nth-child(2),
table td:nth-child(2) {
min-width: 120px;
max-width: 120px;
}
.overall-definition {
max-width: 900px;
margin: 30px auto 40px auto;
padding: 22px 28px;
background: #f9fafb;
border: 1px solid #e5e7eb;
border-radius: 14px;
font-size: 15px;
line-height: 1.7;
color: #1f2937;
}
.overall-definition h3 {
text-align: center;
font-size: 22px;
margin-bottom: 16px;
}
.overall-definition strong {
color: #111827;
}
""") as demo:
gr.Markdown(
"""
<h1 style="text-align:center; font-size:42px; margin-bottom:10px;">
๐Ÿ† FysicsWorld Leaderboard
</h1>
<div class="leaderboard-links" style="text-align:center; margin-bottom:12px;">
<a href="https://github.com/Fysics-AI/FysicsWorld" target="_blank"
style="margin: 0 10px;">
🏠 Project Page
</a>
<a href="https://arxiv.org/pdf/2512.12756" target="_blank"
style="margin: 0 10px;">
📖 Paper
</a>
<a href="https://huggingface.co/datasets/Fysics-AI/FysicsWorld" target="_blank"
style="margin: 0 10px;">
🤗 Dataset
</a>
<a href="https://www.modelscope.cn/datasets/Fysics-AI/FysicsWorld" target="_blank"
style="margin: 0 10px;">
👾 ModelScope
</a>
</div>
<div class="description">
We introduce <b><i>FysicsWorld</i></b>, the <b>first</b> unified full-modality benchmark
that supports bidirectional input-output across <i>image, video, audio, and text</i>,
enabling comprehensive any-to-any evaluation across understanding, generation, and reasoning.
Our systematic design spans from uni-modal perception tasks to fusion-dependent reasoning
under strong cross-modal coupling, allowing us to diagnose, with unprecedented clarity,
the limitations and emerging strengths of modern multimodal and omni-modal architectures.
</div>
"""
)
with gr.Tabs():
# ---------- OmniLLM / MLLM ----------
with gr.Tab("๐Ÿง  OmniLLM / MLLM"):
gr.Markdown("Evaluation results for OmniLLM / MLLM models.")
df_omni = load_csv("omni-mllm.csv", sort_key="Overall")
df_omni = df_omni.rename(columns=OMNI_MLLM_RENAME)
omni_table = gr.Dataframe(
value=df_omni,
interactive=False,
wrap=True
)
# ---------- Image Generation ----------
with gr.Tab("๐ŸŽจ Image Generation"):
gr.Markdown("Evaluation results for image generation models.")
df_img = load_csv("image-gen.csv", sort_key="Overall")
df_img = df_img.rename(columns=IMAGE_GEN_RENAME)
image_table = gr.Dataframe(
value=df_img,
interactive=False,
)
# ---------- Video Generation ----------
with gr.Tab("๐ŸŽฌ Video Generation"):
gr.Markdown("Evaluation results for video generation models.")
df_vid = load_csv("video-gen.csv", sort_key="Overall")
df_vid = df_vid.rename(columns=VIDEO_GEN_RENAME)
video_table = gr.Dataframe(
value=df_vid,
interactive=False,
)
# ---------- Audio Reasoning ----------
with gr.Tab("๐ŸŽต Audio Reasoning"):
gr.Markdown("Evaluation results for OmniLLMs, MLLMs, and AudioLLMs.")
df_aud = load_csv("audio-reasoning.csv", sort_key="Task1-3")
df_aud = df_aud.rename(columns=AUDIO_RENAME)
audio_table = gr.Dataframe(
value=df_aud,
interactive=False,
)
# ---------- Refresh ----------
gr.Button("๐Ÿ”„ Refresh All").click(
fn=lambda: (
load_csv("omni-mllm.csv", "Overall").rename(columns=OMNI_MLLM_RENAME),
load_csv("image-gen.csv", "Overall").rename(columns=IMAGE_GEN_RENAME),
load_csv("video-gen.csv", "Overall").rename(columns=VIDEO_GEN_RENAME),
load_csv("audio-reasoning.csv", "Task1-3").rename(columns=AUDIO_RENAME),
),
outputs=[omni_table, image_table, video_table, audio_table],
)
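    # Note: load_csv re-reads the CSVs from the startup snapshot on disk, so
    # refreshing picks up rows appended by this session but not commits pushed
    # to the Hub after the Space started (the snapshot is downloaded only once).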
gr.Markdown(
r"""
### 📊 Overall Score Definition
To facilitate clearer and more consistent comparison across models, we introduce an **Overall** score for each leaderboard track.
**1. OmniLLM / MLLM**
The **Overall** score is computed as the arithmetic mean of all reported task-specific scores.
**2. Image Generation**
The evaluation involves metrics defined on different numerical scales. **WIScore** is used for image generation, while **VIEScore** (averaged over three dimensions) is used for image editing.
The **Overall** score is defined as:
$$
\text{Overall}=\frac{(\text{WIScore}\times 10)+\left(\frac{\sum \text{VIEScore}}{3}\right)}{2}
$$
Scaling WIScore by 10 places the two metrics on a common 0–10 scale, so the **Overall** score draws balanced contributions from image generation and image editing performance.
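For illustration (with made-up scores): a model with a WIScore of 0.80 and VIEScore dimension scores of 7.0, 7.5, and 6.5 obtains Overall = ((0.80 × 10) + 7.0) / 2 = 7.5.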
**3. Video Generation**
The **Overall** score is calculated as the arithmetic mean of all evaluated dimensions, including imaging quality, aesthetics, motion, and temporal consistency.
"""
)
demo.launch()