Upload 2 files
Browse files- convert_wo_docs_into_json.py +134 -0
- report_generation.jsonl +2 -0
convert_wo_docs_into_json.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
|
| 4 |
+
# Maps each per-model score file name to the metadata used when building its
# leaderboard row: [display name, category, input cost, output cost].
# The consumer multiplies the two cost figures by token counts and divides by
# 1,000,000, so they are presumably prices per million tokens -- TODO confirm
# the currency/unit against the pricing source.
# Insertion order matters: rows are written in the order the keys appear here.
# NOTE(review): the claude-opus-4.6, claude-sonnet-4.6 and claude-opus-4.7
# entries pair a "reasoning_False" file with a "(Thinking)" / Reasoning label,
# and kimi-k2-0905 pairs a "reasoning_high" file with an Instruct label --
# confirm these are intentional.
filename_to_args = {
    "gpt-5_reasoning_high_search_0.jsonl": ["OpenAI/GPT-5 (high)", "Closed-source Reasoning", 1.25, 10],
    "gpt-5-mini_reasoning_high_search_0.jsonl": ["OpenAI/GPT-5-mini (high)", "Closed-source Reasoning", 0.25, 2],
    "gpt-5-nano_reasoning_high_search_0.jsonl": ["OpenAI/GPT-5-nano (high)", "Closed-source Reasoning", 0.05, 0.4],
    "o3_reasoning_medium_search_0.jsonl": ["OpenAI/o3", "Closed-source Reasoning", 2, 8],
    "o4-mini_reasoning_medium_search_0.jsonl": ["OpenAI/o4-mini", "Closed-source Reasoning", 1.1, 4.4],
    "gemini-2.5-pro_reasoning_high_search_0.jsonl": ["Google/Gemini-2.5-Pro", "Closed-source Reasoning", 1.25, 10],
    "gemini-2.5-flash_reasoning_high_search_0.jsonl": ["Google/Gemini-2.5-Flash (Thinking)", "Closed-source Reasoning", 0.3, 2.5],
    "gemini-2.5-flash-lite_reasoning_high_search_0.jsonl": ["Google/Gemini-2.5-Flash-Lite (Thinking)", "Closed-source Reasoning", 0.1, 0.4],
    "x-ai_grok-4_reasoning_high_search_0.jsonl": ["xAI/grok-4-0709", "Closed-source Reasoning", 3, 15],
    "anthropic_claude-sonnet-4_reasoning_high_search_0.jsonl": ["Anthropic/claude-sonnet-4 (Thinking)", "Closed-source Reasoning", 3, 15],
    "openai_gpt-oss-120b_reasoning_high_search_0.jsonl": ["OpenAI/gpt-oss-120b (high)", "Open-weight Reasoning", 0.04, 0.4],
    "openai_gpt-oss-20b_reasoning_high_search_0.jsonl": ["OpenAI/gpt-oss-20b (high)", "Open-weight Reasoning", 0.03, 0.14],
    "deepseek_deepseek-chat-v3.1_reasoning_high_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.1 (Thinking)", "Open-weight Reasoning", 0.2, 0.8],
    "qwen_qwen3-235b-a22b-thinking-2507_reasoning_high_search_0.jsonl": ["Qwen/Qwen3-235B-A22B-Thinking-2507", "Open-weight Reasoning", 0.11, 0.6],
    "qwen_qwen3-30b-a3b-thinking-2507_reasoning_high_search_0.jsonl": ["Qwen/Qwen3-30B-A3B-Thinking-2507", "Open-weight Reasoning", 0.08, 0.29],
    "gpt-4.1_reasoning_False_search_0.jsonl": ["OpenAI/GPT-4.1", "Closed-source Instruct", 2, 8],
    "gpt-4.1-mini_reasoning_False_search_0.jsonl": ["OpenAI/GPT-4.1-mini", "Closed-source Instruct", 0.4, 1.6],
    "gpt-4.1-nano_reasoning_False_search_0.jsonl": ["OpenAI/GPT-4.1-nano", "Closed-source Instruct", 0.1, 0.4],
    "gemini-2.5-flash_reasoning_False_search_0.jsonl": ["Google/Gemini-2.5-Flash", "Closed-source Instruct", 0.3, 2.5],
    "gemini-2.5-flash-lite_reasoning_False_search_0.jsonl": ["Google/Gemini-2.5-Flash-Lite", "Closed-source Instruct", 0.1, 0.4],
    "anthropic_claude-sonnet-4_reasoning_False_search_0.jsonl": ["Anthropic/claude-sonnet-4", "Closed-source Instruct", 3, 15],
    "anthropic_claude-3.5-haiku_reasoning_False_search_0.jsonl": ["Anthropic/claude-3.5-haiku", "Closed-source Instruct", 0.8, 4],
    "qwen_qwen3-235b-a22b-2507_reasoning_False_search_0.jsonl": ["Qwen/Qwen3-235B-A22B-Instruct-2507", "Open-weight Instruct", 0.08, 0.55],
    "qwen_qwen3-30b-a3b-instruct-2507_reasoning_False_search_0.jsonl": ["Qwen/Qwen3-30B-A3B-Instruct-2507", "Open-weight Instruct", 0.08, 0.33],
    "deepseek_deepseek-chat-v3.1_reasoning_False_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.1", "Open-weight Instruct", 0.2, 0.8],
    "moonshotai_kimi-k2-0905_reasoning_high_search_0.jsonl": ["MoonshotAI/Kimi-K2-Instruct-0905", "Open-weight Instruct", 0.39, 1.9],
    "meta-llama_llama-4-maverick_reasoning_False_search_0.jsonl": ["Meta/llama-4-maverick", "Open-weight Instruct", 0.15, 0.6],
    "meta-llama_llama-4-scout_reasoning_False_search_0.jsonl": ["Meta/llama-4-scout", "Open-weight Instruct", 0.08, 0.3],
    "x-ai_grok-4-fast_reasoning_high_search_0.jsonl": ["xAI/grok-4-fast (Thinking)", "Closed-source Reasoning", 0.2, 0.5],
    "x-ai_grok-4-fast_reasoning_False_search_0.jsonl": ["xAI/grok-4-fast", "Closed-source Instruct", 0.2, 0.5],
    "anthropic_claude-haiku-4.5_reasoning_high_search_0.jsonl": ["Anthropic/claude-haiku-4.5 (Thinking)", "Closed-source Reasoning", 1, 5],
    "anthropic_claude-haiku-4.5_reasoning_False_search_0.jsonl": ["Anthropic/claude-haiku-4.5", "Closed-source Instruct", 1, 5],
    "anthropic_claude-sonnet-4.5_reasoning_high_search_0.jsonl": ["Anthropic/claude-sonnet-4.5 (Thinking)", "Closed-source Reasoning", 3, 15],
    "anthropic_claude-sonnet-4.5_reasoning_False_search_0.jsonl": ["Anthropic/claude-sonnet-4.5", "Closed-source Instruct", 3, 15],
    "minimax_minimax-m2_reasoning_high_search_0.jsonl": ["MiniMax/M2 (Thinking)", "Open-weight Reasoning", 0.15, 0.45],
    "minimax_minimax-m2_reasoning_False_search_0.jsonl": ["MiniMax/M2", "Open-weight Instruct", 0.15, 0.45],
    "moonshotai_kimi-k2-thinking_reasoning_high_search_0.jsonl": ["MoonshotAI/Kimi-K2-Thinking", "Open-weight Reasoning", 0.55, 2.25],
    "deepseek_deepseek-v3.2-exp_reasoning_high_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.2-Exp (Thinking)", "Open-weight Reasoning", 0.27, 0.4],
    "deepseek_deepseek-v3.2-exp_reasoning_False_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.2-Exp", "Open-weight Instruct", 0.27, 0.4],
    "gemini-3-pro-preview_reasoning_high_search_0.jsonl": ["Google/Gemini-3-Pro-Preview", "Closed-source Reasoning", 2, 12],
    "x-ai_grok-4.1-fast_reasoning_high_search_0.jsonl": ["xAI/grok-4.1-fast (Thinking)", "Closed-source Reasoning", 0.2, 0.5],
    "x-ai_grok-4.1-fast_reasoning_False_search_0.jsonl": ["xAI/grok-4.1-fast", "Closed-source Instruct", 0.2, 0.5],
    "gpt-5.1_reasoning_high_search_0.jsonl": ["OpenAI/GPT-5.1 (high)", "Closed-source Reasoning", 1.25, 10],
    "anthropic_claude-opus-4.5_reasoning_high_search_0.jsonl": ["Anthropic/claude-opus-4.5 (Thinking)", "Closed-source Reasoning", 5, 25],
    "deepseek_deepseek-v3.2_reasoning_high_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.2 (Thinking)", "Open-weight Reasoning", 0.27, 0.4],
    "deepseek_deepseek-v3.2_reasoning_False_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.2", "Open-weight Instruct", 0.27, 0.4],
    "gpt-5.2_reasoning_xhigh_search_0.jsonl": ["OpenAI/GPT-5.2 (xhigh)", "Closed-source Reasoning", 1.75, 14],
    "gemini-3-flash-preview_reasoning_high_search_0.jsonl": ["Google/Gemini-3-Flash-Preview (Thinking)", "Closed-source Reasoning", 0.5, 3],
    "gemini-3-flash-preview_reasoning_False_search_0.jsonl": ["Google/Gemini-3-Flash-Preview", "Closed-source Instruct", 0.5, 3],
    "z-ai_glm-4.7_reasoning_high_search_0.jsonl": ["Z-AI/GLM-4.7 (Thinking)", "Open-weight Reasoning", 0.4, 1.5],
    "z-ai_glm-4.7_reasoning_False_search_0.jsonl": ["Z-AI/GLM-4.7", "Open-weight Instruct", 0.4, 1.5],
    "minimax_minimax-m2.1_reasoning_high_search_0.jsonl": ["MiniMax/MiniMax-M2.1 (Thinking)", "Open-weight Reasoning", 0.3, 1.2],
    "minimax_minimax-m2.1_reasoning_False_search_0.jsonl": ["MiniMax/MiniMax-M2.1", "Open-weight Instruct", 0.3, 1.2],
    "anthropic_claude-opus-4.6_reasoning_False_search_0.jsonl": ["Anthropic/claude-opus-4.6 (Thinking)", "Closed-source Reasoning", 5, 25],
    "moonshotai_kimi-k2.5_reasoning_high_search_0.jsonl": ["MoonshotAI/Kimi-K2.5 (Thinking)", "Open-weight Reasoning", 0.45, 2.5],
    "moonshotai_kimi-k2.5_reasoning_False_search_0.jsonl": ["MoonshotAI/Kimi-K2.5", "Open-weight Instruct", 0.45, 2.5],
    "z-ai_glm-5_reasoning_high_search_0.jsonl": ["Z-AI/GLM-5 (Thinking)", "Open-weight Reasoning", 1.0, 3.2],
    "z-ai_glm-5_reasoning_False_search_0.jsonl": ["Z-AI/GLM-5", "Open-weight Instruct", 1.0, 3.2],
    "qwen_qwen3.5-397b-a17b_reasoning_high_search_0.jsonl": ["Qwen/Qwen3.5-397B-A17B (Thinking)", "Open-weight Reasoning", 0.6, 3.6],
    "qwen_qwen3.5-397b-a17b_reasoning_False_search_0.jsonl": ["Qwen/Qwen3.5-397B-A17B", "Open-weight Instruct", 0.6, 3.6],
    "minimax_minimax-m2.5_reasoning_high_search_0.jsonl": ["MiniMax/MiniMax-M2.5 (Thinking)", "Open-weight Reasoning", 0.3, 1.2],
    "minimax_minimax-m2.5_reasoning_False_search_0.jsonl": ["MiniMax/MiniMax-M2.5", "Open-weight Instruct", 0.3, 1.2],
    "anthropic_claude-sonnet-4.6_reasoning_False_search_0.jsonl": ["Anthropic/claude-sonnet-4.6 (Thinking)", "Closed-source Reasoning", 3, 15],
    "gemini-3.1-pro-preview_reasoning_high_search_0.jsonl": ["Google/Gemini-3.1-Pro-Preview", "Closed-source Reasoning", 2, 12],
    "gpt-5.4_reasoning_xhigh_search_0.jsonl": ["OpenAI/GPT-5.4 (xhigh)", "Closed-source Reasoning", 2.5, 15],
    "gpt-5.3-codex_reasoning_xhigh_search_0.jsonl": ["OpenAI/GPT-5.3-Codex (xhigh)", "Closed-source Reasoning", 1.75, 14],
    "gpt-5.3-chat-latest_reasoning_False_search_0.jsonl": ["OpenAI/GPT-5.3-Chat", "Closed-source Instruct", 1.75, 14],
    "x-ai_grok-4.20-beta_reasoning_high_search_0.jsonl": ["xAI/grok-4.20 Beta (Thinking)", "Closed-source Reasoning", 2, 6],
    "minimax_minimax-m2.7_reasoning_high_search_0.jsonl": ["MiniMax/MiniMax-M2.7 (Thinking)", "Open-weight Reasoning", 0.3, 1.2],
    "minimax_minimax-m2.7_reasoning_False_search_0.jsonl": ["MiniMax/MiniMax-M2.7", "Open-weight Instruct", 0.3, 1.2],
    "google_gemma-4-31b-it_reasoning_high_search_0.jsonl": ["Google/Gemma-4-31B-It (Thinking)", "Open-weight Reasoning", 0.14, 0.4],
    "google_gemma-4-31b-it_reasoning_False_search_0.jsonl": ["Google/Gemma-4-31B-It", "Open-weight Instruct", 0.14, 0.4],
    "anthropic_claude-opus-4.7_reasoning_False_search_0.jsonl": ["Anthropic/claude-opus-4.7 (Thinking)", "Closed-source Reasoning", 5, 25],
    "z-ai_glm-5.1_reasoning_high_search_0.jsonl": ["Z-AI/GLM-5.1 (Thinking)", "Open-weight Reasoning", 0.95, 3.15],
    "z-ai_glm-5.1_reasoning_False_search_0.jsonl": ["Z-AI/GLM-5.1", "Open-weight Instruct", 0.95, 3.15],
    "moonshotai_kimi-k2.6_reasoning_high_search_0.jsonl": ["MoonshotAI/Kimi-K2.6 (Thinking)", "Open-weight Reasoning", 0.6, 2.8],
    "moonshotai_kimi-k2.6_reasoning_False_search_0.jsonl": ["MoonshotAI/Kimi-K2.6", "Open-weight Instruct", 0.6, 2.8],
}
|
| 84 |
+
|
| 85 |
+
# Column headers in "&"-separated form; `columns` is the cleaned list.
# Neither is referenced by the conversion loop below; kept for compatibility.
names = "Model & Physics & Chemistry & Finance & Consulting & Overall & Extraction & Reasoning & Style & Response Characters & Input Tokens & Output Tokens & Cost "

columns = [i.strip() for i in names.split("&")]

# Destination file: one JSON object per line, one line per model.
output_filename = "report_generation.jsonl"

# Directory holding the per-model score files named by `filename_to_args`.
folder = "../ProfBench/scores/"

# Multiplier applied to the token-based cost of a score file.
# NOTE(review): presumably the number of responses generated per model --
# confirm against the benchmark setup.
_COST_SCALE = 160
# The cost figures in `filename_to_args` are divided by this many tokens.
_TOKENS_PER_COST_UNIT = 1000000


def _build_row(one_row, args):
    """Return the leaderboard row for one model.

    Args:
        one_row: dict loaded from the model's score file; must contain the
            score and token-count keys read below (KeyError otherwise).
        args: ``[model name, category, input cost, output cost]`` as stored
            in ``filename_to_args``.

    Returns:
        A dict whose key order is the order the fields are written out in.
    """
    model, category, in_cost, out_cost = args[0], args[1], args[2], args[3]

    new_dp = {}
    new_dp["Model"] = model
    new_dp["Category"] = category

    new_dp["Overall"] = one_row["Overall"]

    new_dp["Physics"] = one_row["Physics PhD"]
    new_dp["Chemistry"] = one_row["Chemistry PhD"]
    new_dp["Finance"] = one_row["Finance MBA"]
    new_dp["Consulting"] = one_row["Consulting MBA"]

    new_dp["Extraction"] = one_row["Extraction (recall)"]
    new_dp["Reasoning"] = one_row["Reasoning"]
    new_dp["Style"] = one_row["Style"]

    new_dp["Response Characters"] = one_row["response_len_chars"]
    new_dp["Input Tokens"] = one_row["prompt_tokens"]
    new_dp["Output Tokens"] = one_row["completion_tokens"]
    # Same arithmetic and evaluation order as before, so rounded values
    # stay bit-for-bit identical.
    new_dp["Cost"] = round(
        _COST_SCALE / _TOKENS_PER_COST_UNIT
        * (in_cost * one_row["prompt_tokens"] + out_cost * one_row["completion_tokens"]),
        2,
    )
    return new_dp


# Build every row BEFORE opening the output file: opening it in "w" mode
# truncates it, so a missing input or missing key part-way through would
# otherwise leave an empty or partial report behind.
rows = []
for filename, args in filename_to_args.items():
    path = folder + filename
    if not os.path.exists(path):
        raise ValueError(filename + " is not found")
    # Each score file is parsed as a single JSON document.
    with open(path, "r", encoding="utf-8") as f:
        one_row = json.load(f)

    # Progress output: the raw metadata, then the model name.
    print(args)
    print(args[0])

    rows.append(_build_row(one_row, args))

with open(output_filename, "w", encoding="utf-8") as fw:
    for new_dp in rows:
        fw.write(json.dumps(new_dp) + '\n')
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
|
report_generation.jsonl
CHANGED
|
@@ -73,3 +73,5 @@
|
|
| 73 |
{"Model": "Anthropic/claude-opus-4.7 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 61.3, "Physics": 42.9, "Chemistry": 75.5, "Finance": 52.9, "Consulting": 73.9, "Extraction": 56.9, "Reasoning": 62.3, "Style": 63.8, "Response Characters": 10050, "Input Tokens": 736, "Output Tokens": 5668, "Cost": 23.26}
|
| 74 |
{"Model": "Z-AI/GLM-5.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 41.6, "Physics": 25.7, "Chemistry": 50.1, "Finance": 24.3, "Consulting": 66.1, "Extraction": 32.6, "Reasoning": 41.7, "Style": 61.1, "Response Characters": 4913, "Input Tokens": 470, "Output Tokens": 20217, "Cost": 10.26}
|
| 75 |
{"Model": "Z-AI/GLM-5.1", "Category": "Open-weight Instruct", "Overall": 38.1, "Physics": 16.9, "Chemistry": 42.2, "Finance": 28.1, "Consulting": 65.1, "Extraction": 32.3, "Reasoning": 35.8, "Style": 59.7, "Response Characters": 4996, "Input Tokens": 472, "Output Tokens": 18489, "Cost": 9.39}
|
|
|
|
|
|
|
|
|
| 73 |
{"Model": "Anthropic/claude-opus-4.7 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 61.3, "Physics": 42.9, "Chemistry": 75.5, "Finance": 52.9, "Consulting": 73.9, "Extraction": 56.9, "Reasoning": 62.3, "Style": 63.8, "Response Characters": 10050, "Input Tokens": 736, "Output Tokens": 5668, "Cost": 23.26}
|
| 74 |
{"Model": "Z-AI/GLM-5.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 41.6, "Physics": 25.7, "Chemistry": 50.1, "Finance": 24.3, "Consulting": 66.1, "Extraction": 32.6, "Reasoning": 41.7, "Style": 61.1, "Response Characters": 4913, "Input Tokens": 470, "Output Tokens": 20217, "Cost": 10.26}
|
| 75 |
{"Model": "Z-AI/GLM-5.1", "Category": "Open-weight Instruct", "Overall": 38.1, "Physics": 16.9, "Chemistry": 42.2, "Finance": 28.1, "Consulting": 65.1, "Extraction": 32.3, "Reasoning": 35.8, "Style": 59.7, "Response Characters": 4996, "Input Tokens": 472, "Output Tokens": 18489, "Cost": 9.39}
|
| 76 |
+
{"Model": "MoonshotAI/Kimi-K2.6 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 41.4, "Physics": 25.6, "Chemistry": 49.0, "Finance": 22.0, "Consulting": 69.2, "Extraction": 31.3, "Reasoning": 39.3, "Style": 60.7, "Response Characters": 6924, "Input Tokens": 459, "Output Tokens": 27876, "Cost": 12.53}
|
| 77 |
+
{"Model": "MoonshotAI/Kimi-K2.6", "Category": "Open-weight Instruct", "Overall": 30.7, "Physics": 13.0, "Chemistry": 29.0, "Finance": 21.9, "Consulting": 58.8, "Extraction": 22.4, "Reasoning": 28.8, "Style": 52.6, "Response Characters": 7107, "Input Tokens": 465, "Output Tokens": 20374, "Cost": 9.17}
|