# FysicsWorld Leaderboard — Hugging Face Space app
# (scraped page-status residue "Spaces: Sleeping" removed)
import gradio as gr
import pandas as pd
import numpy as np
import json
import os
from pathlib import Path
from huggingface_hub import snapshot_download, HfApi

# =========================
# Basic Config
# =========================
# NOTE(review): "Leaderborad" is misspelled, but it is the actual remote
# repo id — do not "fix" it without renaming the dataset repo on the Hub.
DATASET_REPO = "Fysics-AI/FysicsWorld-Leaderborad-Result"
# Token for reading the (possibly private) results dataset and for pushing
# new submissions back; None if the env var is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Submission "track" field -> CSV file inside the dataset snapshot.
TRACK_TO_CSV = {
    "omni-mllm": "omni-mllm.csv",
    "image-gen": "image-gen.csv",
    "video-gen": "video-gen.csv",
}

# =========================
# Download Dataset (once)
# =========================
# One snapshot at import time; all leaderboard reads and submission writes
# below operate on this local directory.
LOCAL_DATA_DIR = Path(
    snapshot_download(
        repo_id=DATASET_REPO,
        repo_type="dataset",
        token=HF_TOKEN,
    )
)

# Startup diagnostics (emoji appear mojibake'd in this file; kept verbatim).
print("๐ Dataset dir:", LOCAL_DATA_DIR)
print("๐ Files:", [p.name for p in LOCAL_DATA_DIR.iterdir()])
# =========================
# Column Rename Maps (key fix point)
# =========================
# Raw CSV task-id columns -> multi-line display headers.  The embedded
# "\n" keeps leaderboard table columns narrow in the Gradio Dataframe.
OMNI_MLLM_RENAME = {
    "Task1-1": "Image\nUnderstanding",
    "Task1-2": "Video\nUnderstanding",
    "Task2-1": "Speech-Driven\nImage Understanding",
    "Task2-2": "Image-Audio\nReasoning",
    "Task2-3": "Speech-Based\nImage QA",
    "Task2-4": "Speech Generation\nfrom Image",
    "Task2-5": "Audio Matching\nfrom Image",
    "Task3-1": "Speech-Driven\nVideo Understanding",
    "Task3-2": "Video-Audio\nReasoning",
    "Task3-3": "Speech-Based\nVideo QA",
    "Task3-4": "Speech Generation\nfrom Video",
    "Task3-5": "Audio Matching\nfrom Video",
    "Task3-6": "Next-Action\nPrediction",
}

# Single task column of audio-reasoning.csv.
AUDIO_RENAME = {
    "Task1-3": "Audio Reasoning"
}

# Image-generation metrics; the identity entry ("WIScore") is a no-op kept
# for completeness/symmetry with the other maps.
IMAGE_GEN_RENAME = {
    "WIScore": "WIScore",
    "SC": "Semantic\nConsistency",
    "PQ": "Perceptual\nQuality",
    "OR": "Overall\nQuality",
}

# Video-generation dimensions; currently all identity mappings (no-ops),
# presumably placeholders for future display names.
VIDEO_GEN_RENAME = {
    "Imaging": "Imaging",
    "Aesthetic": "Aesthetic",
    "Motion": "Motion",
    "Temporal": "Temporal",
}
# =========================
# Utils
# =========================
def format_numeric_columns(df, decimals=2):
    """Return a copy of *df* whose numeric columns are fixed-point strings.

    Each numeric cell is rendered with *decimals* digits after the point;
    NaN/None cells become the empty string.  Non-numeric columns and the
    input frame itself are left untouched.
    """
    out = df.copy()
    template = "{:." + str(decimals) + "f}"
    for column in out.select_dtypes(include=[np.number]).columns:
        out[column] = out[column].map(
            lambda value: template.format(value) if pd.notnull(value) else ""
        )
    return out
def load_csv(filename, sort_key=None, ascending=False):
    """Read *filename* from the downloaded dataset snapshot for display.

    If *sort_key* names an existing column, rows are sorted by it while
    the values are still numeric (descending by default, i.e. best
    first); numeric columns are then formatted to two-decimal strings.
    """
    table = pd.read_csv(LOCAL_DATA_DIR / filename)
    if sort_key and sort_key in table.columns:
        table = table.sort_values(sort_key, ascending=ascending)
    return format_numeric_columns(table, decimals=2)
# =========================
# Submission Logic (disabled)
# =========================
# NOTE(review): the original banner carried a mojibake'd CJK note,
# presumably "not enabled" — consistent with handle_submit never being
# wired into the UI below.
# Unauthenticated client; the write token is passed per-call (upload_file).
api = HfApi()
def parse_submission(file_bytes):
    """Decode and validate an uploaded JSON submission.

    Expects UTF-8 JSON bytes with the fields benchmark/track/model/type/
    metrics.  Raises ValueError on the first missing field or on an
    unexpected benchmark/track value; returns the parsed dict otherwise.
    """
    payload = json.loads(file_bytes.decode("utf-8"))
    for field in ("benchmark", "track", "model", "type", "metrics"):
        if field not in payload:
            raise ValueError(f"Missing field: {field}")
    # NOTE(review): the accepted benchmark id is "OmniWorld" while the UI
    # is branded "FysicsWorld" — confirm which identifier submission files
    # actually carry before changing this literal.
    if payload["benchmark"] != "OmniWorld":
        raise ValueError("Invalid benchmark")
    if payload["track"] not in TRACK_TO_CSV:
        raise ValueError("Invalid track")
    return payload
def append_submission(data):
    """Append a validated submission to its track CSV and push it to the Hub.

    Raises ValueError if the model name is already on the leaderboard;
    KeyError if the track is unknown (parse_submission guards this).
    """
    csv_name = TRACK_TO_CSV[data["track"]]
    # NOTE(review): this rewrites a file inside the snapshot_download
    # cache directory, which can be a read-only/symlinked cache on some
    # deployments — confirm writes succeed in the Space runtime.
    csv_path = LOCAL_DATA_DIR / csv_name
    board = pd.read_csv(csv_path)
    if data["model"] in board["Model"].values:
        raise ValueError("Model already exists in leaderboard")
    # Metric keys become columns; concat fills any missing columns with NaN.
    new_row = {"Model": data["model"], "Type": data["type"], **data["metrics"]}
    board = pd.concat([board, pd.DataFrame([new_row])], ignore_index=True)
    board.to_csv(csv_path, index=False)
    # Mirror the updated CSV back to the results dataset repo.
    api.upload_file(
        path_or_fileobj=str(csv_path),
        path_in_repo=csv_name,
        repo_id=DATASET_REPO,
        repo_type="dataset",
        token=HF_TOKEN,
    )
def handle_submit(file):
    """Validate and record an uploaded submission, returning a status string.

    Never raises: all validation/upload failures are folded into the
    returned message.  NOTE(review): assumes *file* is the raw bytes of
    the upload (parse_submission calls .decode on it) — confirm the
    gradio File component's type if this ever gets wired into the UI.
    """
    if file is None:
        return "โ No file uploaded"
    try:
        append_submission(parse_submission(file))
    except Exception as exc:
        return f"โ Error: {exc}"
    return "โ Submission successful! Please refresh leaderboard."
# =========================
# Gradio UI
# =========================
# Page layout: centered header with external links, one tab per
# leaderboard track, a manual refresh button, and a scoring-methodology
# note.  Tables are built once at import time from the snapshot CSVs.
# NOTE(review): emoji throughout the UI strings appear mojibake'd
# (e.g. "๐"); kept verbatim since they are runtime strings.
with gr.Blocks(
    theme=gr.themes.Soft(),
    # Custom CSS: page width, pill-style link buttons, description text,
    # system font stack, fixed widths for the first two table columns
    # (Model/Type), and the "Overall Score Definition" card.
    css="""
.container {
    max-width: 1200px;
    margin: auto;
}
.leaderboard-links a {
    display: inline-block;
    margin: 0 8px;
    padding: 6px 12px;
    border-radius: 20px;
    background: #f4f4f5;
    color: #111827;
    text-decoration: none;
    font-weight: 500;
    font-size: 14px;
}
.leaderboard-links a:hover {
    background: #e5e7eb;
}
.description {
    max-width: 900px;
    margin: 18px auto 30px auto;
    font-size: 16px;
    line-height: 1.7;
    color: #374151;
    text-align: center;
}
body, .gradio-container {
    font-family:
        -apple-system,
        BlinkMacSystemFont,
        "Segoe UI",
        Roboto,
        "Helvetica Neue",
        Arial,
        "Noto Sans",
        "Liberation Sans",
        sans-serif;
}
/* OmniLLM ่กจๆ ผ๏ผ็ฌฌ 1 ๅ๏ผModel๏ผ */
table th:nth-child(1),
table td:nth-child(1) {
    min-width: 220px;
    max-width: 220px;
    white-space: nowrap;
}
/* ็ฌฌ 2 ๅ๏ผType๏ผ */
table th:nth-child(2),
table td:nth-child(2) {
    min-width: 120px;
    max-width: 120px;
}
.overall-definition {
    max-width: 900px;
    margin: 30px auto 40px auto;
    padding: 22px 28px;
    background: #f9fafb;
    border: 1px solid #e5e7eb;
    border-radius: 14px;
    font-size: 15px;
    line-height: 1.7;
    color: #1f2937;
}
.overall-definition h3 {
    text-align: center;
    font-size: 22px;
    margin-bottom: 16px;
}
.overall-definition strong {
    color: #111827;
}
""") as demo:
    # ---- Header: title, external links, benchmark description ----
    gr.Markdown(
        """
<h1 style="text-align:center; font-size:42px; margin-bottom:10px;">
๐ FysicsWorld Leaderboard
</h1>
<div class="leaderboard-links" style="text-align:center; margin-bottom:12px;">
<a href="https://github.com/Fysics-AI/FysicsWorld" target="_blank"
   style="margin: 0 10px;">
๐ Project Page
</a>
<a href="https://arxiv.org/pdf/2512.12756" target="_blank"
   style="margin: 0 10px;">
๐ Paper
</a>
<a href="https://huggingface.co/datasets/Fysics-AI/FysicsWorld" target="_blank"
   style="margin: 0 10px;">
๐ค Dataset
</a>
<a href="https://www.modelscope.cn/datasets/Fysics-AI/FysicsWorld" target="_blank"
   style="margin: 0 10px;">
๐พ ModelScope
</a>
</div>
<div class="description">
We introduce <b><i>FysicsWorld</i></b>, the <b>first</b> unified full-modality benchmark
that supports bidirectional input-output across <i>image, video, audio, and text</i>,
enabling comprehensive any-to-any evaluation across understanding, generation, and reasoning.
Our systematic design spans uni-modal perception tasks to fusion-dependent reasoning
under strong cross-modal coupling, allowing us to diagnose, with unprecedented clarity,
the limitations and emerging strengths of modern multimodal and omni-modal architectures.
</div>
"""
    )

    with gr.Tabs():
        # ---------- OmniLLM / MLLM ----------
        with gr.Tab("๐ง OmniLLM / MLLM"):
            gr.Markdown("Evaluation results for OmniLLM / MLLM models.")
            # Sort on the raw "Overall" column, then rename task ids to
            # their multi-line display headers.
            df_omni = load_csv("omni-mllm.csv", sort_key="Overall")
            df_omni = df_omni.rename(columns=OMNI_MLLM_RENAME)
            omni_table = gr.Dataframe(
                value=df_omni,
                interactive=False,
                wrap=True
            )
        # ---------- Image Generation ----------
        with gr.Tab("๐จ Image Generation"):
            gr.Markdown("Evaluation results for image generation models.")
            df_img = load_csv("image-gen.csv", sort_key="Overall")
            df_img = df_img.rename(columns=IMAGE_GEN_RENAME)
            image_table = gr.Dataframe(
                value=df_img,
                interactive=False,
            )
        # ---------- Video Generation ----------
        with gr.Tab("๐ฌ Video Generation"):
            gr.Markdown("Evaluation results for video generation models.")
            df_vid = load_csv("video-gen.csv", sort_key="Overall")
            df_vid = df_vid.rename(columns=VIDEO_GEN_RENAME)
            video_table = gr.Dataframe(
                value=df_vid,
                interactive=False,
            )
        # ---------- Audio Reasoning ----------
        with gr.Tab("๐ต Audio Reasoning"):
            gr.Markdown("Evaluation results for OmniLLMs, MLLMs, and AudioLLMs.")
            # audio-reasoning.csv has no "Overall" column; sort on its
            # single raw task column before renaming it for display.
            df_aud = load_csv("audio-reasoning.csv", sort_key="Task1-3")
            df_aud = df_aud.rename(columns=AUDIO_RENAME)
            audio_table = gr.Dataframe(
                value=df_aud,
                interactive=False,
            )

    # ---------- Refresh ----------
    # Re-reads the four CSVs from the local snapshot directory.
    # NOTE(review): this does NOT re-download from the Hub — the snapshot
    # is fetched once at startup, so the button only picks up local edits.
    gr.Button("๐ Refresh All").click(
        fn=lambda: (
            load_csv("omni-mllm.csv", "Overall").rename(columns=OMNI_MLLM_RENAME),
            load_csv("image-gen.csv", "Overall").rename(columns=IMAGE_GEN_RENAME),
            load_csv("video-gen.csv", "Overall").rename(columns=VIDEO_GEN_RENAME),
            load_csv("audio-reasoning.csv", "Task1-3").rename(columns=AUDIO_RENAME),
        ),
        outputs=[omni_table, image_table, video_table, audio_table],
    )

    # ---- Scoring methodology (raw string: contains LaTeX backslashes) ----
    gr.Markdown(
        r"""
### ๐ Overall Score Definition
To facilitate clearer and more consistent comparison across models, we introduce an **Overall** score for each leaderboard track.
**1. OmniLLM / MLLM**
The **Overall** score is computed as the arithmetic mean of all reported task-specific scores.
**2. Image Generation**
The evaluation involves metrics defined on different numerical scales. **WIScore** is used for image generation, while **VIEScore** (averaged over three dimensions) is used for image editing.
The **Overall** score is defined as:
$$
\text{Overall}=\frac{(\text{WIScore}\times 10)+\left(\frac{\sum \text{VIEScore}}{3}\right)}{2}
$$
This normalization-based formulation ensures a balanced contribution from both image generation and image editing performance.
**3. Video Generation**
The **Overall** score is calculated as the arithmetic mean of all evaluated dimensions, including imaging quality, aesthetics, motion, and temporal consistency.
"""
    )

demo.launch()