zhilinw committed on
Commit
9d220e1
·
verified ·
1 Parent(s): c996b9f

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. report_generation.jsonl +1 -0
app.py CHANGED
@@ -111,7 +111,7 @@ with gr.Blocks(theme=theme) as app:
111
  with gr.TabItem("Report Generation"):
112
  with gr.Row():
113
  with gr.Column(scale=7):
114
- gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 26 Apr 2026.")
115
 
116
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
117
  with gr.TabItem("Leaderboard"):
 
111
  with gr.TabItem("Report Generation"):
112
  with gr.Row():
113
  with gr.Column(scale=7):
114
+ gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 1 May 2026.")
115
 
116
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
117
  with gr.TabItem("Leaderboard"):
report_generation.jsonl CHANGED
@@ -78,3 +78,4 @@
78
  {"Model": "OpenAI/GPT-5.5 (xhigh)", "Category": "Closed-source Reasoning", "Overall": 58.0, "Physics": 39.5, "Chemistry": 70.2, "Finance": 52.2, "Consulting": 70.0, "Extraction": 52.0, "Reasoning": 60.6, "Style": 72.5, "Response Characters": 6546, "Input Tokens": 467, "Output Tokens": 18362, "Cost": 88.51}
79
  {"Model": "DeepSeek-AI/DeepSeek-V4-Pro (Thinking)", "Category": "Open-weight Reasoning", "Overall": 45.6, "Physics": 30.5, "Chemistry": 64.0, "Finance": 24.7, "Consulting": 63.4, "Extraction": 39.0, "Reasoning": 44.3, "Style": 59.7, "Response Characters": 5334, "Input Tokens": 461, "Output Tokens": 22351, "Cost": 3.14}
80
  {"Model": "DeepSeek-AI/DeepSeek-V4-Pro", "Category": "Open-weight Instruct", "Overall": 48.2, "Physics": 33.6, "Chemistry": 63.8, "Finance": 28.3, "Consulting": 67.3, "Extraction": 40.0, "Reasoning": 44.5, "Style": 56.9, "Response Characters": 5138, "Input Tokens": 412, "Output Tokens": 15197, "Cost": 2.14}
 
 
78
  {"Model": "OpenAI/GPT-5.5 (xhigh)", "Category": "Closed-source Reasoning", "Overall": 58.0, "Physics": 39.5, "Chemistry": 70.2, "Finance": 52.2, "Consulting": 70.0, "Extraction": 52.0, "Reasoning": 60.6, "Style": 72.5, "Response Characters": 6546, "Input Tokens": 467, "Output Tokens": 18362, "Cost": 88.51}
79
  {"Model": "DeepSeek-AI/DeepSeek-V4-Pro (Thinking)", "Category": "Open-weight Reasoning", "Overall": 45.6, "Physics": 30.5, "Chemistry": 64.0, "Finance": 24.7, "Consulting": 63.4, "Extraction": 39.0, "Reasoning": 44.3, "Style": 59.7, "Response Characters": 5334, "Input Tokens": 461, "Output Tokens": 22351, "Cost": 3.14}
80
  {"Model": "DeepSeek-AI/DeepSeek-V4-Pro", "Category": "Open-weight Instruct", "Overall": 48.2, "Physics": 33.6, "Chemistry": 63.8, "Finance": 28.3, "Consulting": 67.3, "Extraction": 40.0, "Reasoning": 44.5, "Style": 56.9, "Response Characters": 5138, "Input Tokens": 412, "Output Tokens": 15197, "Cost": 2.14}
81
+ {"Model": "xAI/grok-4.3 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 43.3, "Physics": 30.3, "Chemistry": 64.6, "Finance": 19.7, "Consulting": 58.7, "Extraction": 34.7, "Reasoning": 43.9, "Style": 57.3, "Response Characters": 3854, "Input Tokens": 566, "Output Tokens": 18603, "Cost": 7.55}