Upload 2 files
Browse files- app.py +1 -1
- report_generation.jsonl +3 -0
app.py
CHANGED
|
@@ -111,7 +111,7 @@ with gr.Blocks(theme=theme) as app:
|
|
| 111 |
with gr.TabItem("Report Generation"):
|
| 112 |
with gr.Row():
|
| 113 |
with gr.Column(scale=7):
|
| 114 |
-
gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on
|
| 115 |
|
| 116 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 117 |
with gr.TabItem("Leaderboard"):
|
|
|
|
| 111 |
with gr.TabItem("Report Generation"):
|
| 112 |
with gr.Row():
|
| 113 |
with gr.Column(scale=7):
|
| 114 |
+
gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 26 Apr 2026.")
|
| 115 |
|
| 116 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 117 |
with gr.TabItem("Leaderboard"):
|
report_generation.jsonl
CHANGED
|
@@ -75,3 +75,6 @@
|
|
| 75 |
{"Model": "Z-AI/GLM-5.1", "Category": "Open-weight Instruct", "Overall": 38.1, "Physics": 16.9, "Chemistry": 42.2, "Finance": 28.1, "Consulting": 65.1, "Extraction": 32.3, "Reasoning": 35.8, "Style": 59.7, "Response Characters": 4996, "Input Tokens": 472, "Output Tokens": 18489, "Cost": 9.39}
|
| 76 |
{"Model": "MoonshotAI/Kimi-K2.6 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 41.4, "Physics": 25.6, "Chemistry": 49.0, "Finance": 22.0, "Consulting": 69.2, "Extraction": 31.3, "Reasoning": 39.3, "Style": 60.7, "Response Characters": 6924, "Input Tokens": 459, "Output Tokens": 27876, "Cost": 12.53}
|
| 77 |
{"Model": "MoonshotAI/Kimi-K2.6", "Category": "Open-weight Instruct", "Overall": 30.7, "Physics": 13.0, "Chemistry": 29.0, "Finance": 21.9, "Consulting": 58.8, "Extraction": 22.4, "Reasoning": 28.8, "Style": 52.6, "Response Characters": 7107, "Input Tokens": 465, "Output Tokens": 20374, "Cost": 9.17}
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
{"Model": "Z-AI/GLM-5.1", "Category": "Open-weight Instruct", "Overall": 38.1, "Physics": 16.9, "Chemistry": 42.2, "Finance": 28.1, "Consulting": 65.1, "Extraction": 32.3, "Reasoning": 35.8, "Style": 59.7, "Response Characters": 4996, "Input Tokens": 472, "Output Tokens": 18489, "Cost": 9.39}
|
| 76 |
{"Model": "MoonshotAI/Kimi-K2.6 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 41.4, "Physics": 25.6, "Chemistry": 49.0, "Finance": 22.0, "Consulting": 69.2, "Extraction": 31.3, "Reasoning": 39.3, "Style": 60.7, "Response Characters": 6924, "Input Tokens": 459, "Output Tokens": 27876, "Cost": 12.53}
|
| 77 |
{"Model": "MoonshotAI/Kimi-K2.6", "Category": "Open-weight Instruct", "Overall": 30.7, "Physics": 13.0, "Chemistry": 29.0, "Finance": 21.9, "Consulting": 58.8, "Extraction": 22.4, "Reasoning": 28.8, "Style": 52.6, "Response Characters": 7107, "Input Tokens": 465, "Output Tokens": 20374, "Cost": 9.17}
|
| 78 |
+
{"Model": "OpenAI/GPT-5.5 (xhigh)", "Category": "Closed-source Reasoning", "Overall": 58.0, "Physics": 39.5, "Chemistry": 70.2, "Finance": 52.2, "Consulting": 70.0, "Extraction": 52.0, "Reasoning": 60.6, "Style": 72.5, "Response Characters": 6546, "Input Tokens": 467, "Output Tokens": 18362, "Cost": 88.51}
|
| 79 |
+
{"Model": "DeepSeek-AI/DeepSeek-V4-Pro (Thinking)", "Category": "Open-weight Reasoning", "Overall": 45.6, "Physics": 30.5, "Chemistry": 64.0, "Finance": 24.7, "Consulting": 63.4, "Extraction": 39.0, "Reasoning": 44.3, "Style": 59.7, "Response Characters": 5334, "Input Tokens": 461, "Output Tokens": 22351, "Cost": 3.14}
|
| 80 |
+
{"Model": "DeepSeek-AI/DeepSeek-V4-Pro", "Category": "Open-weight Instruct", "Overall": 48.2, "Physics": 33.6, "Chemistry": 63.8, "Finance": 28.3, "Consulting": 67.3, "Extraction": 40.0, "Reasoning": 44.5, "Style": 56.9, "Response Characters": 5138, "Input Tokens": 412, "Output Tokens": 15197, "Cost": 2.14}
|