\n", " | Mean Input Tokens | \n", "Mean Output Tokens | \n", "Mean Reasoning Tokens | \n", "Mean Total Tokens | \n", "Mean Latency (ms) | \n", "Correct Answers | \n", "
---|---|---|---|---|---|---|
Model | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
claude-3-5-haiku-latest | \n", "325.95 | \n", "147.65 | \n", "0.0 | \n", "473.60 | \n", "3407.85 | \n", "2 | \n", "
claude-3-7-sonnet-latest | \n", "325.95 | \n", "270.60 | \n", "0.0 | \n", "596.55 | \n", "5670.65 | \n", "2 | \n", "
claude-3-7-sonnet-latest-with-thinking | \n", "354.95 | \n", "501.15 | \n", "0.0 | \n", "856.10 | \n", "8594.80 | \n", "6 | \n", "
claude-3-opus-latest | \n", "325.95 | \n", "215.80 | \n", "0.0 | \n", "541.75 | \n", "8450.95 | \n", "4 | \n", "
gemini-2.0-flash | \n", "397.15 | \n", "175.25 | \n", "0.0 | \n", "572.40 | \n", "1685.25 | \n", "9 | \n", "
gemini-2.5-pro-preview-05-06 | \n", "369.05 | \n", "417.40 | \n", "2490.3 | \n", "3276.75 | \n", "30308.30 | \n", "12 | \n", "
gpt-4.1 | \n", "306.15 | \n", "168.20 | \n", "0.0 | \n", "474.35 | \n", "4943.70 | \n", "6 | \n", "
gpt-4.1-mini | \n", "318.60 | \n", "168.65 | \n", "0.0 | \n", "487.25 | \n", "3106.65 | \n", "3 | \n", "
o1 | \n", "271.85 | \n", "2893.75 | \n", "2857.6 | \n", "6023.20 | \n", "54333.15 | \n", "8 | \n", "
o4-mini | \n", "320.70 | \n", "3744.95 | \n", "3699.2 | \n", "7764.85 | \n", "42317.45 | \n", "5 | \n", "
\n", " | Mean Input Tokens | \n", "Mean Output Tokens | \n", "Mean Reasoning Tokens | \n", "Mean Total Tokens | \n", "Mean Latency (ms) | \n", "Task Complexity | \n", "
---|---|---|---|---|---|---|
Task ID | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
cca530fc-4052-43b2-b130-b30968d8aa44 | \n", "977.9 | \n", "2173.8 | \n", "3124.7 | \n", "6276.4 | \n", "45423.8 | \n", "100.0 | \n", "
a0c07678-e491-4bbc-8f0b-07405144218f | \n", "226.7 | \n", "1294.1 | \n", "1230.4 | \n", "2751.2 | \n", "21048.4 | \n", "100.0 | \n", "
3f57289b-8c60-48be-bd80-01f8099ca449 | \n", "201.1 | \n", "896.6 | \n", "816.2 | \n", "1913.9 | \n", "16104.3 | \n", "100.0 | \n", "
cabe07ed-9eca-40ea-8ead-410ef5e83f91 | \n", "238.5 | \n", "557.5 | \n", "459.8 | \n", "1255.8 | \n", "10311.2 | \n", "100.0 | \n", "
305ac316-eef6-4446-960a-92d80d542f82 | \n", "206.2 | \n", "2048.7 | \n", "1959.9 | \n", "4214.8 | \n", "31215.1 | \n", "90.0 | \n", "
5a0c1adf-205e-4841-a666-7c3ef95def9d | \n", "213.5 | \n", "1436.8 | \n", "1285.1 | \n", "2935.4 | \n", "20778.0 | \n", "90.0 | \n", "
cf106601-ab4f-4af9-b045-5295fe67b37d | \n", "220.5 | \n", "861.5 | \n", "858.6 | \n", "1940.6 | \n", "14924.4 | \n", "90.0 | \n", "
8e867cd7-cff9-4e6c-867a-ff5ddc2550be | \n", "213.1 | \n", "931.4 | \n", "829.6 | \n", "1974.1 | \n", "14306.3 | \n", "80.0 | \n", "
3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | \n", "411.1 | \n", "550.0 | \n", "723.9 | \n", "1685.0 | \n", "14052.1 | \n", "80.0 | \n", "
a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | \n", "213.3 | \n", "298.5 | \n", "886.6 | \n", "1398.4 | \n", "14061.3 | \n", "80.0 | \n", "
99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | \n", "540.9 | \n", "460.1 | \n", "389.3 | \n", "1390.3 | \n", "9815.4 | \n", "80.0 | \n", "
1f975693-876d-457b-a649-393859e79bf3 | \n", "530.2 | \n", "432.8 | \n", "328.8 | \n", "1291.8 | \n", "9528.9 | \n", "80.0 | \n", "
9d191bce-651d-4746-be2d-7ef8ecadb9c2 | \n", "219.0 | \n", "408.5 | \n", "376.8 | \n", "1004.3 | \n", "7617.2 | \n", "80.0 | \n", "
7bd855d8-463d-4ed5-93ca-5fe35145f733 | \n", "629.6 | \n", "792.6 | \n", "931.1 | \n", "2353.3 | \n", "16474.0 | \n", "70.0 | \n", "
840bfca7-4f7b-481a-8794-c560c340185d | \n", "244.0 | \n", "1518.6 | \n", "1349.9 | \n", "3112.5 | \n", "26725.2 | \n", "60.0 | \n", "
4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | \n", "197.9 | \n", "789.7 | \n", "820.0 | \n", "1807.6 | \n", "13030.2 | \n", "60.0 | \n", "
6f37996b-2ac7-44b0-8e68-6d28256631b4 | \n", "328.3 | \n", "762.8 | \n", "777.7 | \n", "1868.8 | \n", "14833.9 | \n", "50.0 | \n", "
bda648d7-d618-4883-88f4-3466eabd860e | \n", "211.4 | \n", "550.0 | \n", "486.9 | \n", "1248.3 | \n", "11084.1 | \n", "40.0 | \n", "
f918266a-b3e0-4914-865d-4faa564f1aef | \n", "396.4 | \n", "449.5 | \n", "299.4 | \n", "1145.3 | \n", "8643.4 | \n", "0.0 | \n", "
2d83110e-a098-4ebb-9987-066c06fa42d0 | \n", "213.0 | \n", "193.3 | \n", "159.5 | \n", "565.8 | \n", "5660.3 | \n", "0.0 | \n", "
\n", " | Task ID | \n", "Model | \n", "Answer | \n", "Ground Truth | \n", "Semantically Correct | \n", "Correct | \n", "Input Tokens | \n", "Output Tokens | \n", "Reasoning Tokens | \n", "Total Tokens | \n", "Latency | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|
197 | \n", "a0c07678-e491-4bbc-8f0b-07405144218f | \n", "final-assignment-agent-v1 | \n", "Yamasaki, Uehara | \n", "Yoshida, Uehara | \n", "False | \n", "False | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "162751 | \n", "