zhilinw committed on
Commit
38075e2
·
verified ·
1 Parent(s): 64cb0f7

Upload 2 files

Browse files
convert_wo_docs_into_json.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+
4
# Maps each per-model score file (produced by the ProfBench scoring run) to the
# display metadata used when building the leaderboard JSONL:
#   [display name, category, input price ($/1M tokens), output price ($/1M tokens)]
# Insertion order determines row order in the generated output file.
# NOTE(review): a few "*_reasoning_False_*" keys below map to "(Thinking)" /
# "Closed-source Reasoning" labels (claude-opus-4.6, claude-sonnet-4.6,
# claude-opus-4.7) — confirm that is intentional and not a copy-paste slip.
filename_to_args = {
    "gpt-5_reasoning_high_search_0.jsonl": ["OpenAI/GPT-5 (high)", "Closed-source Reasoning", 1.25, 10],
    "gpt-5-mini_reasoning_high_search_0.jsonl": ["OpenAI/GPT-5-mini (high)","Closed-source Reasoning", 0.25, 2],
    "gpt-5-nano_reasoning_high_search_0.jsonl": ["OpenAI/GPT-5-nano (high)","Closed-source Reasoning", 0.05, 0.4],
    "o3_reasoning_medium_search_0.jsonl": ["OpenAI/o3","Closed-source Reasoning", 2, 8],
    "o4-mini_reasoning_medium_search_0.jsonl": ["OpenAI/o4-mini","Closed-source Reasoning", 1.1, 4.4],
    "gemini-2.5-pro_reasoning_high_search_0.jsonl": ["Google/Gemini-2.5-Pro","Closed-source Reasoning", 1.25, 10],
    "gemini-2.5-flash_reasoning_high_search_0.jsonl": ["Google/Gemini-2.5-Flash (Thinking)","Closed-source Reasoning", 0.3, 2.5],
    "gemini-2.5-flash-lite_reasoning_high_search_0.jsonl": ["Google/Gemini-2.5-Flash-Lite (Thinking)","Closed-source Reasoning", 0.1, 0.4],
    "x-ai_grok-4_reasoning_high_search_0.jsonl": ["xAI/grok-4-0709","Closed-source Reasoning", 3, 15],
    "anthropic_claude-sonnet-4_reasoning_high_search_0.jsonl": ["Anthropic/claude-sonnet-4 (Thinking)","Closed-source Reasoning", 3, 15],
    "openai_gpt-oss-120b_reasoning_high_search_0.jsonl": ["OpenAI/gpt-oss-120b (high)", "Open-weight Reasoning", 0.04, 0.4],
    "openai_gpt-oss-20b_reasoning_high_search_0.jsonl": ["OpenAI/gpt-oss-20b (high)", "Open-weight Reasoning", 0.03, 0.14],
    "deepseek_deepseek-chat-v3.1_reasoning_high_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.1 (Thinking)", "Open-weight Reasoning", 0.2, 0.8],
    "qwen_qwen3-235b-a22b-thinking-2507_reasoning_high_search_0.jsonl": ["Qwen/Qwen3-235B-A22B-Thinking-2507", "Open-weight Reasoning", 0.11, 0.6],
    "qwen_qwen3-30b-a3b-thinking-2507_reasoning_high_search_0.jsonl": ["Qwen/Qwen3-30B-A3B-Thinking-2507", "Open-weight Reasoning", 0.08, 0.29],
    "gpt-4.1_reasoning_False_search_0.jsonl":["OpenAI/GPT-4.1", "Closed-source Instruct", 2, 8],
    "gpt-4.1-mini_reasoning_False_search_0.jsonl":["OpenAI/GPT-4.1-mini", "Closed-source Instruct", 0.4, 1.6],
    "gpt-4.1-nano_reasoning_False_search_0.jsonl":["OpenAI/GPT-4.1-nano", "Closed-source Instruct", 0.1, 0.4],
    "gemini-2.5-flash_reasoning_False_search_0.jsonl": ["Google/Gemini-2.5-Flash","Closed-source Instruct", 0.3, 2.5],
    "gemini-2.5-flash-lite_reasoning_False_search_0.jsonl": ["Google/Gemini-2.5-Flash-Lite","Closed-source Instruct", 0.1, 0.4],
    "anthropic_claude-sonnet-4_reasoning_False_search_0.jsonl": ["Anthropic/claude-sonnet-4","Closed-source Instruct", 3, 15],
    "anthropic_claude-3.5-haiku_reasoning_False_search_0.jsonl": ["Anthropic/claude-3.5-haiku", "Closed-source Instruct", 0.8, 4],
    "qwen_qwen3-235b-a22b-2507_reasoning_False_search_0.jsonl": ["Qwen/Qwen3-235B-A22B-Instruct-2507", "Open-weight Instruct", 0.08, 0.55],
    "qwen_qwen3-30b-a3b-instruct-2507_reasoning_False_search_0.jsonl": ["Qwen/Qwen3-30B-A3B-Instruct-2507", "Open-weight Instruct", 0.08, 0.33],
    "deepseek_deepseek-chat-v3.1_reasoning_False_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.1", "Open-weight Instruct", 0.2, 0.8],
    "moonshotai_kimi-k2-0905_reasoning_high_search_0.jsonl": ["MoonshotAI/Kimi-K2-Instruct-0905", "Open-weight Instruct", 0.39, 1.9],
    "meta-llama_llama-4-maverick_reasoning_False_search_0.jsonl": ["Meta/llama-4-maverick", "Open-weight Instruct", 0.15, 0.6],
    "meta-llama_llama-4-scout_reasoning_False_search_0.jsonl": ["Meta/llama-4-scout", "Open-weight Instruct", 0.08, 0.3],
    "x-ai_grok-4-fast_reasoning_high_search_0.jsonl":["xAI/grok-4-fast (Thinking)", "Closed-source Reasoning", 0.2, 0.5],
    "x-ai_grok-4-fast_reasoning_False_search_0.jsonl":["xAI/grok-4-fast", "Closed-source Instruct", 0.2, 0.5],
    "anthropic_claude-haiku-4.5_reasoning_high_search_0.jsonl": ["Anthropic/claude-haiku-4.5 (Thinking)","Closed-source Reasoning", 1, 5],
    "anthropic_claude-haiku-4.5_reasoning_False_search_0.jsonl": ["Anthropic/claude-haiku-4.5","Closed-source Instruct", 1, 5],
    "anthropic_claude-sonnet-4.5_reasoning_high_search_0.jsonl": ["Anthropic/claude-sonnet-4.5 (Thinking)","Closed-source Reasoning", 3, 15],
    "anthropic_claude-sonnet-4.5_reasoning_False_search_0.jsonl": ["Anthropic/claude-sonnet-4.5","Closed-source Instruct", 3, 15],
    "minimax_minimax-m2_reasoning_high_search_0.jsonl": ["MiniMax/M2 (Thinking)","Open-weight Reasoning", 0.15, 0.45],
    "minimax_minimax-m2_reasoning_False_search_0.jsonl": ["MiniMax/M2","Open-weight Instruct", 0.15, 0.45],
    "moonshotai_kimi-k2-thinking_reasoning_high_search_0.jsonl": ["MoonshotAI/Kimi-K2-Thinking","Open-weight Reasoning", 0.55, 2.25],
    "deepseek_deepseek-v3.2-exp_reasoning_high_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.2-Exp (Thinking)", "Open-weight Reasoning", 0.27, 0.4],
    "deepseek_deepseek-v3.2-exp_reasoning_False_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.2-Exp", "Open-weight Instruct", 0.27, 0.4],
    "gemini-3-pro-preview_reasoning_high_search_0.jsonl": ["Google/Gemini-3-Pro-Preview","Closed-source Reasoning", 2, 12],
    "x-ai_grok-4.1-fast_reasoning_high_search_0.jsonl":["xAI/grok-4.1-fast (Thinking)", "Closed-source Reasoning", 0.2, 0.5],
    "x-ai_grok-4.1-fast_reasoning_False_search_0.jsonl":["xAI/grok-4.1-fast", "Closed-source Instruct", 0.2, 0.5],
    "gpt-5.1_reasoning_high_search_0.jsonl": ["OpenAI/GPT-5.1 (high)", "Closed-source Reasoning", 1.25, 10],
    "anthropic_claude-opus-4.5_reasoning_high_search_0.jsonl": ["Anthropic/claude-opus-4.5 (Thinking)","Closed-source Reasoning", 5, 25],
    "deepseek_deepseek-v3.2_reasoning_high_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.2 (Thinking)", "Open-weight Reasoning", 0.27, 0.4],
    "deepseek_deepseek-v3.2_reasoning_False_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.2", "Open-weight Instruct", 0.27, 0.4],
    "gpt-5.2_reasoning_xhigh_search_0.jsonl": ["OpenAI/GPT-5.2 (xhigh)", "Closed-source Reasoning", 1.75, 14],
    "gemini-3-flash-preview_reasoning_high_search_0.jsonl": ["Google/Gemini-3-Flash-Preview (Thinking)","Closed-source Reasoning", 0.5, 3],
    "gemini-3-flash-preview_reasoning_False_search_0.jsonl": ["Google/Gemini-3-Flash-Preview","Closed-source Instruct", 0.5, 3],
    "z-ai_glm-4.7_reasoning_high_search_0.jsonl":["Z-AI/GLM-4.7 (Thinking)", "Open-weight Reasoning", 0.4, 1.5],
    "z-ai_glm-4.7_reasoning_False_search_0.jsonl":["Z-AI/GLM-4.7", "Open-weight Instruct", 0.4, 1.5],
    "minimax_minimax-m2.1_reasoning_high_search_0.jsonl": ["MiniMax/MiniMax-M2.1 (Thinking)", "Open-weight Reasoning", 0.3, 1.2],
    "minimax_minimax-m2.1_reasoning_False_search_0.jsonl": ["MiniMax/MiniMax-M2.1", "Open-weight Instruct", 0.3, 1.2],
    "anthropic_claude-opus-4.6_reasoning_False_search_0.jsonl": ["Anthropic/claude-opus-4.6 (Thinking)","Closed-source Reasoning", 5, 25],
    "moonshotai_kimi-k2.5_reasoning_high_search_0.jsonl": ["MoonshotAI/Kimi-K2.5 (Thinking)","Open-weight Reasoning", 0.45, 2.5],
    "moonshotai_kimi-k2.5_reasoning_False_search_0.jsonl": ["MoonshotAI/Kimi-K2.5","Open-weight Instruct", 0.45, 2.5],
    "z-ai_glm-5_reasoning_high_search_0.jsonl":["Z-AI/GLM-5 (Thinking)", "Open-weight Reasoning", 1.0, 3.2],
    "z-ai_glm-5_reasoning_False_search_0.jsonl":["Z-AI/GLM-5", "Open-weight Instruct", 1.0, 3.2],
    "qwen_qwen3.5-397b-a17b_reasoning_high_search_0.jsonl": ["Qwen/Qwen3.5-397B-A17B (Thinking)", "Open-weight Reasoning", 0.6, 3.6],
    "qwen_qwen3.5-397b-a17b_reasoning_False_search_0.jsonl": ["Qwen/Qwen3.5-397B-A17B", "Open-weight Instruct", 0.6, 3.6],
    "minimax_minimax-m2.5_reasoning_high_search_0.jsonl": ["MiniMax/MiniMax-M2.5 (Thinking)", "Open-weight Reasoning", 0.3, 1.2],
    "minimax_minimax-m2.5_reasoning_False_search_0.jsonl": ["MiniMax/MiniMax-M2.5", "Open-weight Instruct", 0.3, 1.2],
    "anthropic_claude-sonnet-4.6_reasoning_False_search_0.jsonl": ["Anthropic/claude-sonnet-4.6 (Thinking)","Closed-source Reasoning", 3, 15],
    "gemini-3.1-pro-preview_reasoning_high_search_0.jsonl": ["Google/Gemini-3.1-Pro-Preview","Closed-source Reasoning", 2, 12],
    "gpt-5.4_reasoning_xhigh_search_0.jsonl": ["OpenAI/GPT-5.4 (xhigh)", "Closed-source Reasoning", 2.5, 15],
    "gpt-5.3-codex_reasoning_xhigh_search_0.jsonl":["OpenAI/GPT-5.3-Codex (xhigh)", "Closed-source Reasoning", 1.75, 14],
    "gpt-5.3-chat-latest_reasoning_False_search_0.jsonl":["OpenAI/GPT-5.3-Chat", "Closed-source Instruct", 1.75, 14],
    "x-ai_grok-4.20-beta_reasoning_high_search_0.jsonl":["xAI/grok-4.20 Beta (Thinking)", "Closed-source Reasoning", 2, 6],
    "minimax_minimax-m2.7_reasoning_high_search_0.jsonl": ["MiniMax/MiniMax-M2.7 (Thinking)", "Open-weight Reasoning", 0.3, 1.2],
    "minimax_minimax-m2.7_reasoning_False_search_0.jsonl": ["MiniMax/MiniMax-M2.7", "Open-weight Instruct", 0.3, 1.2],
    "google_gemma-4-31b-it_reasoning_high_search_0.jsonl": ["Google/Gemma-4-31B-It (Thinking)", "Open-weight Reasoning", 0.14, 0.4],
    "google_gemma-4-31b-it_reasoning_False_search_0.jsonl": ["Google/Gemma-4-31B-It", "Open-weight Instruct", 0.14, 0.4],
    "anthropic_claude-opus-4.7_reasoning_False_search_0.jsonl": ["Anthropic/claude-opus-4.7 (Thinking)","Closed-source Reasoning", 5, 25],
    "z-ai_glm-5.1_reasoning_high_search_0.jsonl":["Z-AI/GLM-5.1 (Thinking)", "Open-weight Reasoning", 0.95, 3.15],
    "z-ai_glm-5.1_reasoning_False_search_0.jsonl":["Z-AI/GLM-5.1", "Open-weight Instruct", 0.95, 3.15],
    "moonshotai_kimi-k2.6_reasoning_high_search_0.jsonl": ["MoonshotAI/Kimi-K2.6 (Thinking)","Open-weight Reasoning", 0.6, 2.8],
    "moonshotai_kimi-k2.6_reasoning_False_search_0.jsonl": ["MoonshotAI/Kimi-K2.6","Open-weight Instruct", 0.6, 2.8],
}
84
+
85
# Leaderboard header: column names joined with " & " (LaTeX-table style);
# they are split and trimmed into a plain list just below.
names = "Model & Physics & Chemistry & Finance & Consulting & Overall & Extraction & Reasoning & Style & Response Characters & Input Tokens & Output Tokens & Cost "

columns = [field.strip() for field in names.split("&")]

# Destination for the converted leaderboard rows, one JSON object per line.
output_filename = "report_generation.jsonl"

# Directory holding the per-model score files listed in filename_to_args.
folder = "../ProfBench/scores/"
93
+
94
# Convert each model's score file into one leaderboard row and write it to the
# output JSONL. Rows appear in filename_to_args insertion order. Raises
# ValueError if any expected score file is missing.
with open(output_filename, "w") as fw:
    for filename, (model, category, in_cost, out_cost) in filename_to_args.items():
        score_path = os.path.join(folder, filename)
        if not os.path.exists(score_path):
            raise ValueError(filename + " is not found")
        # Despite the .jsonl extension, each score file is read as a single
        # JSON object of aggregated metrics -- TODO confirm against the
        # scoring pipeline that produces these files.
        with open(score_path, "r") as f:
            one_row = json.load(f)

        # Insertion order here fixes the key order of each serialized row.
        new_dp = {
            "Model": model,
            "Category": category,
            "Overall": one_row["Overall"],
            "Physics": one_row["Physics PhD"],
            "Chemistry": one_row["Chemistry PhD"],
            "Finance": one_row["Finance MBA"],
            "Consulting": one_row["Consulting MBA"],
            "Extraction": one_row["Extraction (recall)"],
            "Reasoning": one_row["Reasoning"],
            "Style": one_row["Style"],
            "Response Characters": one_row["response_len_chars"],
            "Input Tokens": one_row["prompt_tokens"],
            "Output Tokens": one_row["completion_tokens"],
        }
        # Cost in dollars: prices are $ per 1M tokens, scaled by 160 --
        # presumably the benchmark's prompt count, with token counts being
        # per-prompt averages. TODO confirm the meaning of 160.
        new_dp["Cost"] = round(
            160 / 1000000 * (in_cost * one_row["prompt_tokens"] + out_cost * one_row["completion_tokens"]),
            2,
        )
        fw.write(json.dumps(new_dp) + "\n")
report_generation.jsonl CHANGED
@@ -73,3 +73,5 @@
73
  {"Model": "Anthropic/claude-opus-4.7 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 61.3, "Physics": 42.9, "Chemistry": 75.5, "Finance": 52.9, "Consulting": 73.9, "Extraction": 56.9, "Reasoning": 62.3, "Style": 63.8, "Response Characters": 10050, "Input Tokens": 736, "Output Tokens": 5668, "Cost": 23.26}
74
  {"Model": "Z-AI/GLM-5.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 41.6, "Physics": 25.7, "Chemistry": 50.1, "Finance": 24.3, "Consulting": 66.1, "Extraction": 32.6, "Reasoning": 41.7, "Style": 61.1, "Response Characters": 4913, "Input Tokens": 470, "Output Tokens": 20217, "Cost": 10.26}
75
  {"Model": "Z-AI/GLM-5.1", "Category": "Open-weight Instruct", "Overall": 38.1, "Physics": 16.9, "Chemistry": 42.2, "Finance": 28.1, "Consulting": 65.1, "Extraction": 32.3, "Reasoning": 35.8, "Style": 59.7, "Response Characters": 4996, "Input Tokens": 472, "Output Tokens": 18489, "Cost": 9.39}
 
 
 
73
  {"Model": "Anthropic/claude-opus-4.7 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 61.3, "Physics": 42.9, "Chemistry": 75.5, "Finance": 52.9, "Consulting": 73.9, "Extraction": 56.9, "Reasoning": 62.3, "Style": 63.8, "Response Characters": 10050, "Input Tokens": 736, "Output Tokens": 5668, "Cost": 23.26}
74
  {"Model": "Z-AI/GLM-5.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 41.6, "Physics": 25.7, "Chemistry": 50.1, "Finance": 24.3, "Consulting": 66.1, "Extraction": 32.6, "Reasoning": 41.7, "Style": 61.1, "Response Characters": 4913, "Input Tokens": 470, "Output Tokens": 20217, "Cost": 10.26}
75
  {"Model": "Z-AI/GLM-5.1", "Category": "Open-weight Instruct", "Overall": 38.1, "Physics": 16.9, "Chemistry": 42.2, "Finance": 28.1, "Consulting": 65.1, "Extraction": 32.3, "Reasoning": 35.8, "Style": 59.7, "Response Characters": 4996, "Input Tokens": 472, "Output Tokens": 18489, "Cost": 9.39}
76
+ {"Model": "MoonshotAI/Kimi-K2.6 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 41.4, "Physics": 25.6, "Chemistry": 49.0, "Finance": 22.0, "Consulting": 69.2, "Extraction": 31.3, "Reasoning": 39.3, "Style": 60.7, "Response Characters": 6924, "Input Tokens": 459, "Output Tokens": 27876, "Cost": 12.53}
77
+ {"Model": "MoonshotAI/Kimi-K2.6", "Category": "Open-weight Instruct", "Overall": 30.7, "Physics": 13.0, "Chemistry": 29.0, "Finance": 21.9, "Consulting": 58.8, "Extraction": 22.4, "Reasoning": 28.8, "Style": 52.6, "Response Characters": 7107, "Input Tokens": 465, "Output Tokens": 20374, "Cost": 9.17}