zhilinw committed on
Commit
38075e2
·
verified ·
1 Parent(s): 64cb0f7

Upload 2 files

Browse files
convert_wo_docs_into_json.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+
4
# Maps each per-model score file (produced by the ProfBench scoring run) to the
# display metadata used when building the leaderboard JSONL:
#   [display name, category, input price ($/1M tokens), output price ($/1M tokens)]
# Insertion order determines row order in the generated output file.
# NOTE(review): a few "*_reasoning_False_*" keys below map to "(Thinking)" /
# "Closed-source Reasoning" labels (claude-opus-4.6, claude-sonnet-4.6,
# claude-opus-4.7) — confirm that is intentional and not a copy-paste slip.
filename_to_args = {
    "gpt-5_reasoning_high_search_0.jsonl": ["OpenAI/GPT-5 (high)", "Closed-source Reasoning", 1.25, 10],
    "gpt-5-mini_reasoning_high_search_0.jsonl": ["OpenAI/GPT-5-mini (high)","Closed-source Reasoning", 0.25, 2],
    "gpt-5-nano_reasoning_high_search_0.jsonl": ["OpenAI/GPT-5-nano (high)","Closed-source Reasoning", 0.05, 0.4],
    "o3_reasoning_medium_search_0.jsonl": ["OpenAI/o3","Closed-source Reasoning", 2, 8],
    "o4-mini_reasoning_medium_search_0.jsonl": ["OpenAI/o4-mini","Closed-source Reasoning", 1.1, 4.4],
    "gemini-2.5-pro_reasoning_high_search_0.jsonl": ["Google/Gemini-2.5-Pro","Closed-source Reasoning", 1.25, 10],
    "gemini-2.5-flash_reasoning_high_search_0.jsonl": ["Google/Gemini-2.5-Flash (Thinking)","Closed-source Reasoning", 0.3, 2.5],
    "gemini-2.5-flash-lite_reasoning_high_search_0.jsonl": ["Google/Gemini-2.5-Flash-Lite (Thinking)","Closed-source Reasoning", 0.1, 0.4],
    "x-ai_grok-4_reasoning_high_search_0.jsonl": ["xAI/grok-4-0709","Closed-source Reasoning", 3, 15],
    "anthropic_claude-sonnet-4_reasoning_high_search_0.jsonl": ["Anthropic/claude-sonnet-4 (Thinking)","Closed-source Reasoning", 3, 15],
    "openai_gpt-oss-120b_reasoning_high_search_0.jsonl": ["OpenAI/gpt-oss-120b (high)", "Open-weight Reasoning", 0.04, 0.4],
    "openai_gpt-oss-20b_reasoning_high_search_0.jsonl": ["OpenAI/gpt-oss-20b (high)", "Open-weight Reasoning", 0.03, 0.14],
    "deepseek_deepseek-chat-v3.1_reasoning_high_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.1 (Thinking)", "Open-weight Reasoning", 0.2, 0.8],
    "qwen_qwen3-235b-a22b-thinking-2507_reasoning_high_search_0.jsonl": ["Qwen/Qwen3-235B-A22B-Thinking-2507", "Open-weight Reasoning", 0.11, 0.6],
    "qwen_qwen3-30b-a3b-thinking-2507_reasoning_high_search_0.jsonl": ["Qwen/Qwen3-30B-A3B-Thinking-2507", "Open-weight Reasoning", 0.08, 0.29],
    "gpt-4.1_reasoning_False_search_0.jsonl":["OpenAI/GPT-4.1", "Closed-source Instruct", 2, 8],
    "gpt-4.1-mini_reasoning_False_search_0.jsonl":["OpenAI/GPT-4.1-mini", "Closed-source Instruct", 0.4, 1.6],
    "gpt-4.1-nano_reasoning_False_search_0.jsonl":["OpenAI/GPT-4.1-nano", "Closed-source Instruct", 0.1, 0.4],
    "gemini-2.5-flash_reasoning_False_search_0.jsonl": ["Google/Gemini-2.5-Flash","Closed-source Instruct", 0.3, 2.5],
    "gemini-2.5-flash-lite_reasoning_False_search_0.jsonl": ["Google/Gemini-2.5-Flash-Lite","Closed-source Instruct", 0.1, 0.4],
    "anthropic_claude-sonnet-4_reasoning_False_search_0.jsonl": ["Anthropic/claude-sonnet-4","Closed-source Instruct", 3, 15],
    "anthropic_claude-3.5-haiku_reasoning_False_search_0.jsonl": ["Anthropic/claude-3.5-haiku", "Closed-source Instruct", 0.8, 4],
    "qwen_qwen3-235b-a22b-2507_reasoning_False_search_0.jsonl": ["Qwen/Qwen3-235B-A22B-Instruct-2507", "Open-weight Instruct", 0.08, 0.55],
    "qwen_qwen3-30b-a3b-instruct-2507_reasoning_False_search_0.jsonl": ["Qwen/Qwen3-30B-A3B-Instruct-2507", "Open-weight Instruct", 0.08, 0.33],
    "deepseek_deepseek-chat-v3.1_reasoning_False_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.1", "Open-weight Instruct", 0.2, 0.8],
    "moonshotai_kimi-k2-0905_reasoning_high_search_0.jsonl": ["MoonshotAI/Kimi-K2-Instruct-0905", "Open-weight Instruct", 0.39, 1.9],
    "meta-llama_llama-4-maverick_reasoning_False_search_0.jsonl": ["Meta/llama-4-maverick", "Open-weight Instruct", 0.15, 0.6],
    "meta-llama_llama-4-scout_reasoning_False_search_0.jsonl": ["Meta/llama-4-scout", "Open-weight Instruct", 0.08, 0.3],
    "x-ai_grok-4-fast_reasoning_high_search_0.jsonl":["xAI/grok-4-fast (Thinking)", "Closed-source Reasoning", 0.2, 0.5],
    "x-ai_grok-4-fast_reasoning_False_search_0.jsonl":["xAI/grok-4-fast", "Closed-source Instruct", 0.2, 0.5],
    "anthropic_claude-haiku-4.5_reasoning_high_search_0.jsonl": ["Anthropic/claude-haiku-4.5 (Thinking)","Closed-source Reasoning", 1, 5],
    "anthropic_claude-haiku-4.5_reasoning_False_search_0.jsonl": ["Anthropic/claude-haiku-4.5","Closed-source Instruct", 1, 5],
    "anthropic_claude-sonnet-4.5_reasoning_high_search_0.jsonl": ["Anthropic/claude-sonnet-4.5 (Thinking)","Closed-source Reasoning", 3, 15],
    "anthropic_claude-sonnet-4.5_reasoning_False_search_0.jsonl": ["Anthropic/claude-sonnet-4.5","Closed-source Instruct", 3, 15],
    "minimax_minimax-m2_reasoning_high_search_0.jsonl": ["MiniMax/M2 (Thinking)","Open-weight Reasoning", 0.15, 0.45],
    "minimax_minimax-m2_reasoning_False_search_0.jsonl": ["MiniMax/M2","Open-weight Instruct", 0.15, 0.45],
    "moonshotai_kimi-k2-thinking_reasoning_high_search_0.jsonl": ["MoonshotAI/Kimi-K2-Thinking","Open-weight Reasoning", 0.55, 2.25],
    "deepseek_deepseek-v3.2-exp_reasoning_high_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.2-Exp (Thinking)", "Open-weight Reasoning", 0.27, 0.4],
    "deepseek_deepseek-v3.2-exp_reasoning_False_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.2-Exp", "Open-weight Instruct", 0.27, 0.4],
    "gemini-3-pro-preview_reasoning_high_search_0.jsonl": ["Google/Gemini-3-Pro-Preview","Closed-source Reasoning", 2, 12],
    "x-ai_grok-4.1-fast_reasoning_high_search_0.jsonl":["xAI/grok-4.1-fast (Thinking)", "Closed-source Reasoning", 0.2, 0.5],
    "x-ai_grok-4.1-fast_reasoning_False_search_0.jsonl":["xAI/grok-4.1-fast", "Closed-source Instruct", 0.2, 0.5],
    "gpt-5.1_reasoning_high_search_0.jsonl": ["OpenAI/GPT-5.1 (high)", "Closed-source Reasoning", 1.25, 10],
    "anthropic_claude-opus-4.5_reasoning_high_search_0.jsonl": ["Anthropic/claude-opus-4.5 (Thinking)","Closed-source Reasoning", 5, 25],
    "deepseek_deepseek-v3.2_reasoning_high_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.2 (Thinking)", "Open-weight Reasoning", 0.27, 0.4],
    "deepseek_deepseek-v3.2_reasoning_False_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.2", "Open-weight Instruct", 0.27, 0.4],
    "gpt-5.2_reasoning_xhigh_search_0.jsonl": ["OpenAI/GPT-5.2 (xhigh)", "Closed-source Reasoning", 1.75, 14],
    "gemini-3-flash-preview_reasoning_high_search_0.jsonl": ["Google/Gemini-3-Flash-Preview (Thinking)","Closed-source Reasoning", 0.5, 3],
    "gemini-3-flash-preview_reasoning_False_search_0.jsonl": ["Google/Gemini-3-Flash-Preview","Closed-source Instruct", 0.5, 3],
    "z-ai_glm-4.7_reasoning_high_search_0.jsonl":["Z-AI/GLM-4.7 (Thinking)", "Open-weight Reasoning", 0.4, 1.5],
    "z-ai_glm-4.7_reasoning_False_search_0.jsonl":["Z-AI/GLM-4.7", "Open-weight Instruct", 0.4, 1.5],
    "minimax_minimax-m2.1_reasoning_high_search_0.jsonl": ["MiniMax/MiniMax-M2.1 (Thinking)", "Open-weight Reasoning", 0.3, 1.2],
    "minimax_minimax-m2.1_reasoning_False_search_0.jsonl": ["MiniMax/MiniMax-M2.1", "Open-weight Instruct", 0.3, 1.2],
    "anthropic_claude-opus-4.6_reasoning_False_search_0.jsonl": ["Anthropic/claude-opus-4.6 (Thinking)","Closed-source Reasoning", 5, 25],
    "moonshotai_kimi-k2.5_reasoning_high_search_0.jsonl": ["MoonshotAI/Kimi-K2.5 (Thinking)","Open-weight Reasoning", 0.45, 2.5],
    "moonshotai_kimi-k2.5_reasoning_False_search_0.jsonl": ["MoonshotAI/Kimi-K2.5","Open-weight Instruct", 0.45, 2.5],
    "z-ai_glm-5_reasoning_high_search_0.jsonl":["Z-AI/GLM-5 (Thinking)", "Open-weight Reasoning", 1.0, 3.2],
    "z-ai_glm-5_reasoning_False_search_0.jsonl":["Z-AI/GLM-5", "Open-weight Instruct", 1.0, 3.2],
    "qwen_qwen3.5-397b-a17b_reasoning_high_search_0.jsonl": ["Qwen/Qwen3.5-397B-A17B (Thinking)", "Open-weight Reasoning", 0.6, 3.6],
    "qwen_qwen3.5-397b-a17b_reasoning_False_search_0.jsonl": ["Qwen/Qwen3.5-397B-A17B", "Open-weight Instruct", 0.6, 3.6],
    "minimax_minimax-m2.5_reasoning_high_search_0.jsonl": ["MiniMax/MiniMax-M2.5 (Thinking)", "Open-weight Reasoning", 0.3, 1.2],
    "minimax_minimax-m2.5_reasoning_False_search_0.jsonl": ["MiniMax/MiniMax-M2.5", "Open-weight Instruct", 0.3, 1.2],
    "anthropic_claude-sonnet-4.6_reasoning_False_search_0.jsonl": ["Anthropic/claude-sonnet-4.6 (Thinking)","Closed-source Reasoning", 3, 15],
    "gemini-3.1-pro-preview_reasoning_high_search_0.jsonl": ["Google/Gemini-3.1-Pro-Preview","Closed-source Reasoning", 2, 12],
    "gpt-5.4_reasoning_xhigh_search_0.jsonl": ["OpenAI/GPT-5.4 (xhigh)", "Closed-source Reasoning", 2.5, 15],
    "gpt-5.3-codex_reasoning_xhigh_search_0.jsonl":["OpenAI/GPT-5.3-Codex (xhigh)", "Closed-source Reasoning", 1.75, 14],
    "gpt-5.3-chat-latest_reasoning_False_search_0.jsonl":["OpenAI/GPT-5.3-Chat", "Closed-source Instruct", 1.75, 14],
    "x-ai_grok-4.20-beta_reasoning_high_search_0.jsonl":["xAI/grok-4.20 Beta (Thinking)", "Closed-source Reasoning", 2, 6],
    "minimax_minimax-m2.7_reasoning_high_search_0.jsonl": ["MiniMax/MiniMax-M2.7 (Thinking)", "Open-weight Reasoning", 0.3, 1.2],
    "minimax_minimax-m2.7_reasoning_False_search_0.jsonl": ["MiniMax/MiniMax-M2.7", "Open-weight Instruct", 0.3, 1.2],
    "google_gemma-4-31b-it_reasoning_high_search_0.jsonl": ["Google/Gemma-4-31B-It (Thinking)", "Open-weight Reasoning", 0.14, 0.4],
    "google_gemma-4-31b-it_reasoning_False_search_0.jsonl": ["Google/Gemma-4-31B-It", "Open-weight Instruct", 0.14, 0.4],
    "anthropic_claude-opus-4.7_reasoning_False_search_0.jsonl": ["Anthropic/claude-opus-4.7 (Thinking)","Closed-source Reasoning", 5, 25],
    "z-ai_glm-5.1_reasoning_high_search_0.jsonl":["Z-AI/GLM-5.1 (Thinking)", "Open-weight Reasoning", 0.95, 3.15],
    "z-ai_glm-5.1_reasoning_False_search_0.jsonl":["Z-AI/GLM-5.1", "Open-weight Instruct", 0.95, 3.15],
    "moonshotai_kimi-k2.6_reasoning_high_search_0.jsonl": ["MoonshotAI/Kimi-K2.6 (Thinking)","Open-weight Reasoning", 0.6, 2.8],
    "moonshotai_kimi-k2.6_reasoning_False_search_0.jsonl": ["MoonshotAI/Kimi-K2.6","Open-weight Instruct", 0.6, 2.8],
}
84
+
85
# Leaderboard header: column names joined with " & " (LaTeX-table style);
# they are split and trimmed into a plain list just below.
names = "Model & Physics & Chemistry & Finance & Consulting & Overall & Extraction & Reasoning & Style & Response Characters & Input Tokens & Output Tokens & Cost "

columns = [field.strip() for field in names.split("&")]

# Destination for the converted leaderboard rows, one JSON object per line.
output_filename = "report_generation.jsonl"

# Directory holding the per-model score files listed in filename_to_args.
folder = "../ProfBench/scores/"
93
+
94
# Convert each model's score file into one leaderboard row and write it to the
# output JSONL. Rows appear in filename_to_args insertion order. Raises
# ValueError if any expected score file is missing.
with open(output_filename, "w") as fw:
    for filename, (model, category, in_cost, out_cost) in filename_to_args.items():
        score_path = os.path.join(folder, filename)
        if not os.path.exists(score_path):
            raise ValueError(filename + " is not found")
        # Despite the .jsonl extension, each score file is read as a single
        # JSON object of aggregated metrics -- TODO confirm against the
        # scoring pipeline that produces these files.
        with open(score_path, "r") as f:
            one_row = json.load(f)

        # Insertion order here fixes the key order of each serialized row.
        new_dp = {
            "Model": model,
            "Category": category,
            "Overall": one_row["Overall"],
            "Physics": one_row["Physics PhD"],
            "Chemistry": one_row["Chemistry PhD"],
            "Finance": one_row["Finance MBA"],
            "Consulting": one_row["Consulting MBA"],
            "Extraction": one_row["Extraction (recall)"],
            "Reasoning": one_row["Reasoning"],
            "Style": one_row["Style"],
            "Response Characters": one_row["response_len_chars"],
            "Input Tokens": one_row["prompt_tokens"],
            "Output Tokens": one_row["completion_tokens"],
        }
        # Cost in dollars: prices are $ per 1M tokens, scaled by 160 --
        # presumably the benchmark's prompt count, with token counts being
        # per-prompt averages. TODO confirm the meaning of 160.
        new_dp["Cost"] = round(
            160 / 1000000 * (in_cost * one_row["prompt_tokens"] + out_cost * one_row["completion_tokens"]),
            2,
        )
        fw.write(json.dumps(new_dp) + "\n")
report_generation.jsonl CHANGED
@@ -73,3 +73,5 @@
73
  {"Model": "Anthropic/claude-opus-4.7 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 61.3, "Physics": 42.9, "Chemistry": 75.5, "Finance": 52.9, "Consulting": 73.9, "Extraction": 56.9, "Reasoning": 62.3, "Style": 63.8, "Response Characters": 10050, "Input Tokens": 736, "Output Tokens": 5668, "Cost": 23.26}
74
  {"Model": "Z-AI/GLM-5.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 41.6, "Physics": 25.7, "Chemistry": 50.1, "Finance": 24.3, "Consulting": 66.1, "Extraction": 32.6, "Reasoning": 41.7, "Style": 61.1, "Response Characters": 4913, "Input Tokens": 470, "Output Tokens": 20217, "Cost": 10.26}
75
  {"Model": "Z-AI/GLM-5.1", "Category": "Open-weight Instruct", "Overall": 38.1, "Physics": 16.9, "Chemistry": 42.2, "Finance": 28.1, "Consulting": 65.1, "Extraction": 32.3, "Reasoning": 35.8, "Style": 59.7, "Response Characters": 4996, "Input Tokens": 472, "Output Tokens": 18489, "Cost": 9.39}
 
 
 
73
  {"Model": "Anthropic/claude-opus-4.7 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 61.3, "Physics": 42.9, "Chemistry": 75.5, "Finance": 52.9, "Consulting": 73.9, "Extraction": 56.9, "Reasoning": 62.3, "Style": 63.8, "Response Characters": 10050, "Input Tokens": 736, "Output Tokens": 5668, "Cost": 23.26}
74
  {"Model": "Z-AI/GLM-5.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 41.6, "Physics": 25.7, "Chemistry": 50.1, "Finance": 24.3, "Consulting": 66.1, "Extraction": 32.6, "Reasoning": 41.7, "Style": 61.1, "Response Characters": 4913, "Input Tokens": 470, "Output Tokens": 20217, "Cost": 10.26}
75
  {"Model": "Z-AI/GLM-5.1", "Category": "Open-weight Instruct", "Overall": 38.1, "Physics": 16.9, "Chemistry": 42.2, "Finance": 28.1, "Consulting": 65.1, "Extraction": 32.3, "Reasoning": 35.8, "Style": 59.7, "Response Characters": 4996, "Input Tokens": 472, "Output Tokens": 18489, "Cost": 9.39}
76
+ {"Model": "MoonshotAI/Kimi-K2.6 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 41.4, "Physics": 25.6, "Chemistry": 49.0, "Finance": 22.0, "Consulting": 69.2, "Extraction": 31.3, "Reasoning": 39.3, "Style": 60.7, "Response Characters": 6924, "Input Tokens": 459, "Output Tokens": 27876, "Cost": 12.53}
77
+ {"Model": "MoonshotAI/Kimi-K2.6", "Category": "Open-weight Instruct", "Overall": 30.7, "Physics": 13.0, "Chemistry": 29.0, "Finance": 21.9, "Consulting": 58.8, "Extraction": 22.4, "Reasoning": 28.8, "Style": 52.6, "Response Characters": 7107, "Input Tokens": 465, "Output Tokens": 20374, "Cost": 9.17}