Upload 2 files
Browse files- app.py +1 -1
- report_generation.jsonl +4 -0
app.py
CHANGED
|
@@ -111,7 +111,7 @@ with gr.Blocks(theme=theme) as app:
|
|
| 111 |
with gr.TabItem("Report Generation"):
|
| 112 |
with gr.Row():
|
| 113 |
with gr.Column(scale=7):
|
| 114 |
-
gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on
|
| 115 |
|
| 116 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 117 |
with gr.TabItem("Leaderboard"):
|
|
|
|
| 111 |
with gr.TabItem("Report Generation"):
|
| 112 |
with gr.Row():
|
| 113 |
with gr.Column(scale=7):
|
| 114 |
+
gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 16 Feb 2026.")
|
| 115 |
|
| 116 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 117 |
with gr.TabItem("Leaderboard"):
|
report_generation.jsonl
CHANGED
|
@@ -56,3 +56,7 @@
|
|
| 56 |
{"Model": "MoonshotAI/Kimi-K2.5", "Category": "Open-weight Instruct", "Overall": 50.4, "Physics": 32.4, "Chemistry": 62.0, "Finance": 37.4, "Consulting": 69.7, "Extraction": 42.6, "Reasoning": 50.7, "Style": 55.6, "Response Characters": 6173, "Input Tokens": 468, "Output Tokens": 14104, "Cost": 5.68}
|
| 57 |
{"Model": "Z-AI/GLM-5 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 43.3, "Physics": 22.4, "Chemistry": 56.6, "Finance": 27.8, "Consulting": 66.3, "Extraction": 36.1, "Reasoning": 41.4, "Style": 53.1, "Response Characters": 5395, "Input Tokens": 461, "Output Tokens": 19499, "Cost": 10.06}
|
| 58 |
{"Model": "Z-AI/GLM-5", "Category": "Open-weight Instruct", "Overall": 42.6, "Physics": 25.7, "Chemistry": 52.2, "Finance": 26.6, "Consulting": 65.9, "Extraction": 33.2, "Reasoning": 41.0, "Style": 53.1, "Response Characters": 5497, "Input Tokens": 459, "Output Tokens": 17179, "Cost": 8.87}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
{"Model": "MoonshotAI/Kimi-K2.5", "Category": "Open-weight Instruct", "Overall": 50.4, "Physics": 32.4, "Chemistry": 62.0, "Finance": 37.4, "Consulting": 69.7, "Extraction": 42.6, "Reasoning": 50.7, "Style": 55.6, "Response Characters": 6173, "Input Tokens": 468, "Output Tokens": 14104, "Cost": 5.68}
|
| 57 |
{"Model": "Z-AI/GLM-5 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 43.3, "Physics": 22.4, "Chemistry": 56.6, "Finance": 27.8, "Consulting": 66.3, "Extraction": 36.1, "Reasoning": 41.4, "Style": 53.1, "Response Characters": 5395, "Input Tokens": 461, "Output Tokens": 19499, "Cost": 10.06}
|
| 58 |
{"Model": "Z-AI/GLM-5", "Category": "Open-weight Instruct", "Overall": 42.6, "Physics": 25.7, "Chemistry": 52.2, "Finance": 26.6, "Consulting": 65.9, "Extraction": 33.2, "Reasoning": 41.0, "Style": 53.1, "Response Characters": 5497, "Input Tokens": 459, "Output Tokens": 17179, "Cost": 8.87}
|
| 59 |
+
{"Model": "Qwen/Qwen3.5-397B-A17B (Thinking)", "Category": "Open-weight Reasoning", "Overall": 44.7, "Physics": 34.9, "Chemistry": 54.9, "Finance": 26.9, "Consulting": 62.1, "Extraction": 37.8, "Reasoning": 45.2, "Style": 64.9, "Response Characters": 5656, "Input Tokens": 496, "Output Tokens": 11851, "Cost": 6.87}
|
| 60 |
+
{"Model": "Qwen/Qwen3.5-397B-A17B", "Category": "Open-weight Instruct", "Overall": 45.1, "Physics": 38.5, "Chemistry": 51.6, "Finance": 27.4, "Consulting": 62.8, "Extraction": 37.1, "Reasoning": 46.0, "Style": 61.6, "Response Characters": 6134, "Input Tokens": 496, "Output Tokens": 11052, "Cost": 6.41}
|
| 61 |
+
{"Model": "MiniMax/MiniMax-M2.5 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 42.2, "Physics": 29.6, "Chemistry": 50.5, "Finance": 24.8, "Consulting": 63.8, "Extraction": 30.2, "Reasoning": 41.5, "Style": 62.6, "Response Characters": 15835, "Input Tokens": 484, "Output Tokens": 20003, "Cost": 3.86}
|
| 62 |
+
{"Model": "MiniMax/MiniMax-M2.5", "Category": "Open-weight Instruct", "Overall": 40.0, "Physics": 29.6, "Chemistry": 43.9, "Finance": 22.8, "Consulting": 63.5, "Extraction": 28.9, "Reasoning": 39.8, "Style": 53.6, "Response Characters": 5547, "Input Tokens": 484, "Output Tokens": 18356, "Cost": 3.55}
|