diff --git "a/src/results/models_2024-10-24-08:08:59.127307.json" "b/src/results/models_2024-10-24-08:08:59.127307.json" new file mode 100644--- /dev/null +++ "b/src/results/models_2024-10-24-08:08:59.127307.json" @@ -0,0 +1,2847 @@ +[ + { + "config": { + "model_name": "ChatGPT-4o-latest (2024-09-03)", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2023/10" + }, + "results": { + "OVERALL": { + "Average Score": 94.56827761, + "Standard Deviation": 0.009435818, + "Rank": 4 + }, + "Geometry": { + "Average Score": 82.306, + "Standard Deviation": null, + "Rank": 5 + }, + "Algebra": { + "Average Score": 91.701, + "Standard Deviation": null, + "Rank": 8 + }, + "Probability": { + "Average Score": 86.681, + "Standard Deviation": null, + "Rank": 4 + }, + "Logical": { + "Average Score": 97.425, + "Standard Deviation": null, + "Rank": 2 + }, + "Social": { + "Average Score": 91.333, + "Standard Deviation": null, + "Rank": 5 + }, + "Chemistry": { + "Average Score": 90.77, + "Standard Deviation": null, + "Rank": 3 + }, + "CPP": { + "Average Score": 100.0, + "Standard Deviation": null, + "Rank": 1 + } + } + }, + { + "config": { + "model_name": "gpt-4o-2024-08-06", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2023/10" + }, + "results": { + "OVERALL": { + "Average Score": 83.58608983, + "Standard Deviation": 4.528687523, + "Rank": 12 + }, + "Geometry": { + "Average Score": 86.632, + "Standard Deviation": null, + "Rank": 2 + }, + "Algebra": { + "Average Score": 95.242, + "Standard Deviation": null, + "Rank": 5 + }, + "Probability": { + "Average Score": 78.89, + "Standard Deviation": null, + "Rank": 8 + }, + "Logical": { + "Average Score": 77.458, + "Standard Deviation": null, + "Rank": 14 + }, + "Social": { + "Average Score": 70.351, + "Standard Deviation": null, + "Rank": 13 + }, + "Chemistry": { + "Average Score": 80.088, + "Standard Deviation": null, + "Rank": 9 + }, + "CPP": { + "Average Score": 92.43090226400756, + "Standard Deviation": null, + "Rank": 2 + } + } + }, + { + "config": { + "model_name": "gpt-4o-2024-05-13", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2023/10" + }, + "results": { + "OVERALL": { + "Average Score": 89.45175971, + "Standard Deviation": 0.030431012, + "Rank": 8 + }, + "Geometry": { + "Average Score": 82.859, + "Standard Deviation": null, + "Rank": 4 + }, + "Algebra": { + "Average Score": 90.056, + "Standard Deviation": null, + "Rank": 9 + }, + "Probability": { + "Average Score": 82.051, + "Standard Deviation": null, + "Rank": 5 + }, + "Logical": { + "Average Score": 86.969, + "Standard Deviation": null, + "Rank": 10 + }, + "Social": { + "Average Score": 67.017, + "Standard Deviation": null, + "Rank": 16 + }, + "Chemistry": { + "Average Score": 84.501, + "Standard Deviation": null, + "Rank": 7 + }, + "CPP": { + "Average Score": 79.1592634699295, + "Standard Deviation": null, + "Rank": 6 + } + } + }, + { + "config": { + "model_name": "gpt-4-turbo-2024-04-09", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2023/12" + }, + "results": { + "OVERALL": { + "Average Score": 89.34848344, + "Standard Deviation": 0.303734513, + "Rank": 9 + }, + "Geometry": { + "Average Score": 79.296, + "Standard Deviation": null, + "Rank": 7 + }, + "Algebra": { + "Average Score": 84.668, + "Standard Deviation": null, + "Rank": 12 + }, + "Probability": { + "Average Score": 77.859, + "Standard Deviation": null, + "Rank": 9 + }, + "Logical": { + "Average Score": 88.359, + "Standard Deviation": null, + "Rank": 9 + }, + "Social": { + "Average Score": 67.671, + "Standard Deviation": null, + "Rank": 15 + }, + "Chemistry": { + "Average Score": 79.61, + "Standard Deviation": null, + "Rank": 11 + }, + "CPP": { + "Average Score": 70.73143363230263, + "Standard Deviation": null, + "Rank": 11 + } + } + }, + { + "config": { + "model_name": "gemini-1.5-pro-001", + "organization": "Google", + "license": "Proprietary", + "knowledge_cutoff": "2023/11" + }, + "results": { + "OVERALL": { + "Average Score": 83.17822062, + "Standard Deviation": 4.166312552, + "Rank": 13 + }, + "Geometry": { + "Average Score": 84.696, + "Standard Deviation": null, + "Rank": 3 + }, + "Algebra": { + "Average Score": 98.832, + "Standard Deviation": null, + "Rank": 3 + }, + "Probability": { + "Average Score": 74.233, + "Standard Deviation": null, + "Rank": 11 + }, + "Logical": { + "Average Score": 77.421, + "Standard Deviation": null, + "Rank": 15 + }, + "Social": { + "Average Score": 70.057, + "Standard Deviation": null, + "Rank": 14 + } + } + }, + { + "config": { + "model_name": "qwen2-72b-instruct", + "organization": "Alibaba", + "license": "Qianwen LICENSE", + "knowledge_cutoff": "2024/09" + }, + "results": { + "OVERALL": { + "Average Score": 80.78104505, + "Standard Deviation": 2.776695545, + "Rank": 15 + }, + "Geometry": { + "Average Score": 70.775, + "Standard Deviation": null, + "Rank": 12 + }, + "Algebra": { + "Average Score": 95.816, + "Standard Deviation": null, + "Rank": 4 + }, + "Probability": { + "Average Score": 80.38, + "Standard Deviation": null, + "Rank": 6 + }, + "Logical": { + "Average Score": 71.975, + "Standard Deviation": null, + "Rank": 20 + }, + "Social": { + "Average Score": 50.407, + "Standard Deviation": null, + "Rank": 20 + }, + "Chemistry": { + "Average Score": 76.621, + "Standard Deviation": null, + "Rank": 13 + }, + "CPP": { + "Average Score": 73.54037778797029, + "Standard Deviation": null, + "Rank": 7 + } + } + }, + { + "config": { + "model_name": "gpt-4o-mini-2024-07-18", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2023/10" + }, + "results": { + "OVERALL": { + "Average Score": 85.99929202, + "Standard Deviation": 2.479470643, + "Rank": 11 + }, + "Geometry": { + "Average Score": 79.42, + "Standard Deviation": null, + "Rank": 6 + }, + "Algebra": { + "Average Score": 89.997, + "Standard Deviation": null, + "Rank": 10 + }, + "Probability": { + "Average Score": 78.89, + "Standard Deviation": null, + "Rank": 7 + }, + "Logical": { + "Average Score": 84.755, + "Standard Deviation": null, + "Rank": 11 + }, + "Social": { + "Average Score": 72.014, + "Standard Deviation": null, + "Rank": 11 + }, + "Chemistry": { + "Average Score": 76.194, + "Standard Deviation": null, + "Rank": 15 + }, + "CPP": { + "Average Score": 88.3877070580296, + "Standard Deviation": null, + "Rank": 3 + } + } + }, + { + "config": { + "model_name": "claude-3.5-sonnet", + "organization": "Anthropic", + "license": "Proprietary", + "knowledge_cutoff": "2024/04" + }, + "results": { + "OVERALL": { + "Average Score": 90.43169444, + "Standard Deviation": 0.123754719, + "Rank": 7 + }, + "Geometry": { + "Average Score": 74.36, + "Standard Deviation": null, + "Rank": 11 + }, + "Algebra": { + "Average Score": 83.137, + "Standard Deviation": null, + "Rank": 14 + }, + "Probability": { + "Average Score": 73.278, + "Standard Deviation": null, + "Rank": 14 + }, + "Logical": { + "Average Score": 88.581, + "Standard Deviation": null, + "Rank": 8 + }, + "Social": { + "Average Score": 97.694, + "Standard Deviation": null, + "Rank": 3 + }, + "Chemistry": { + "Average Score": 86.294, + "Standard Deviation": null, + "Rank": 4 + }, + "CPP": { + "Average Score": 82.37734076815008, + "Standard Deviation": null, + "Rank": 5 + } + } + }, + { + "config": { + "model_name": "claude-3.5-sonnet-20241022", + "organization": "Anthropic", + "license": "Proprietary", + "knowledge_cutoff": "UNKNOW" + }, + "results": { + "OVERALL": { + "Average Score": 82.08873036, + "Standard Deviation": 20.89052134, + "Rank": 14 + }, + "Geometry": { + "Average Score": 74.362, + "Standard Deviation": null, + "Rank": 10 + }, + "Algebra": { + "Average Score": 89.387, + "Standard Deviation": null, + "Rank": 11 + }, + "Probability": { + "Average Score": 73.919, + "Standard Deviation": null, + "Rank": 13 + }, + "Logical": { + "Average Score": 90.514, + "Standard Deviation": null, + "Rank": 7 + }, + "Social": { + "Average Score": 84.505, + "Standard Deviation": null, + "Rank": 7 + }, + "Chemistry": { + "Average Score": 85.611, + "Standard Deviation": null, + "Rank": 6 + } + } + }, + { + "config": { + "model_name": "o1-mini", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2023/10" + }, + "results": { + "OVERALL": { + "Average Score": 97.53705747, + "Standard Deviation": 0.013240268, + "Rank": 2 + }, + "Geometry": { + "Average Score": 100.0, + "Standard Deviation": null, + "Rank": 1 + }, + "Algebra": { + "Average Score": 100.0, + "Standard Deviation": null, + "Rank": 1 + }, + "Probability": { + "Average Score": 100.0, + "Standard Deviation": null, + "Rank": 1 + }, + "Logical": { + "Average Score": 96.558, + "Standard Deviation": null, + "Rank": 3 + }, + "Social": { + "Average Score": 84.884, + "Standard Deviation": null, + "Rank": 6 + }, + "Chemistry": { + "Average Score": 93.717, + "Standard Deviation": null, + "Rank": 2 + } + } + }, + { + "config": { + "model_name": "o1-preview", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2023/10" + }, + "results": { + "OVERALL": { + "Average Score": 93.04608514, + "Standard Deviation": 0.005729293, + "Rank": 5 + }, + "Geometry": { + "Average Score": "N/A", + "Standard Deviation": "N/A", + "Rank": "N/A" + }, + "Algebra": { + "Average Score": 99.212, + "Standard Deviation": null, + "Rank": 2 + }, + "Probability": { + "Average Score": 94.181, + "Standard Deviation": null, + "Rank": 2 + }, + "Logical": { + "Average Score": 100.0, + "Standard Deviation": null, + "Rank": 1 + }, + "Social": { + "Average Score": 96.978, + "Standard Deviation": null, + "Rank": 4 + } + } + }, + { + "config": { + "model_name": "gemini-1.5-flash-001", + "organization": "Google", + "license": "Proprietary", + "knowledge_cutoff": "2023/11" + }, + "results": { + "OVERALL": { + "Average Score": 64.39324213, + "Standard Deviation": 1.348364198, + "Rank": 20 + }, + "Geometry": { + "Average Score": 65.135, + "Standard Deviation": null, + "Rank": 14 + }, + "Algebra": { + "Average Score": 84.28, + "Standard Deviation": null, + "Rank": 13 + }, + "Probability": { + "Average Score": 67.22, + "Standard Deviation": null, + "Rank": 16 + }, + "Logical": { + "Average Score": 71.975, + "Standard Deviation": null, + "Rank": 19 + }, + "Social": { + "Average Score": 60.374, + "Standard Deviation": null, + "Rank": 18 + }, + "Chemistry": { + "Average Score": 79.569, + "Standard Deviation": null, + "Rank": 12 + }, + "CPP": { + "Average Score": 72.1127762005651, + "Standard Deviation": null, + "Rank": 10 + } + } + }, + { + "config": { + "model_name": "gpt4-1106", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2024/04" + }, + "results": { + "OVERALL": { + "Average Score": 88.25145246, + "Standard Deviation": 0.889714647, + "Rank": 10 + }, + "Geometry": { + "Average Score": 61.784, + "Standard Deviation": null, + "Rank": 16 + }, + "Algebra": { + "Average Score": 80.579, + "Standard Deviation": null, + "Rank": 15 + }, + "Probability": { + "Average Score": 70.693, + "Standard Deviation": null, + "Rank": 15 + }, + "Logical": { + "Average Score": 75.513, + "Standard Deviation": null, + "Rank": 16 + }, + "Social": { + "Average Score": 40.498, + "Standard Deviation": null, + "Rank": 26 + }, + "Chemistry": { + "Average Score": 73.251, + "Standard Deviation": null, + "Rank": 16 + }, + "CPP": { + "Average Score": 69.11824072252848, + "Standard Deviation": null, + "Rank": 12 + } + } + }, + { + "config": { + "model_name": "gemma-2-27b-it", + "organization": "Google", + "license": "Gemma License", + "knowledge_cutoff": "2024/06" + }, + "results": { + "OVERALL": { + "Average Score": 71.08619043, + "Standard Deviation": 41.54124623, + "Rank": 19 + }, + "Geometry": { + "Average Score": 56.805, + "Standard Deviation": null, + "Rank": 17 + }, + "Algebra": { + "Average Score": 76.352, + "Standard Deviation": null, + "Rank": 18 + }, + "Probability": { + "Average Score": 65.472, + "Standard Deviation": null, + "Rank": 18 + }, + "Logical": { + "Average Score": 71.976, + "Standard Deviation": null, + "Rank": 18 + }, + "Social": { + "Average Score": 47.308, + "Standard Deviation": null, + "Rank": 22 + }, + "Chemistry": { + "Average Score": 69.606, + "Standard Deviation": null, + "Rank": 20 + }, + "CPP": { + "Average Score": 63.28920072143611, + "Standard Deviation": null, + "Rank": 14 + } + } + }, + { + "config": { + "model_name": "claude-3-opus", + "organization": "Anthropic", + "license": "Proprietary", + "knowledge_cutoff": "2023/08" + }, + "results": { + "OVERALL": { + "Average Score": 79.97608403, + "Standard Deviation": 5.382942441, + "Rank": 16 + }, + "Geometry": { + "Average Score": 56.54, + "Standard Deviation": null, + "Rank": 18 + }, + "Algebra": { + "Average Score": 75.405, + "Standard Deviation": null, + "Rank": 19 + }, + "Probability": { + "Average Score": 67.208, + "Standard Deviation": null, + "Rank": 17 + }, + "Logical": { + "Average Score": 77.458, + "Standard Deviation": null, + "Rank": 13 + }, + "Social": { + "Average Score": 80.318, + "Standard Deviation": null, + "Rank": 9 + }, + "Chemistry": { + "Average Score": 79.694, + "Standard Deviation": null, + "Rank": 10 + }, + "CPP": { + "Average Score": 73.5404403567132, + "Standard Deviation": null, + "Rank": 8 + } + } + }, + { + "config": { + "model_name": "gemma-2-9b-it-simpo", + "organization": "Google", + "license": "Gemma License", + "knowledge_cutoff": "2024/07" + }, + "results": { + "OVERALL": { + "Average Score": "N/A", + "Standard Deviation": "N/A", + "Rank": "N/A" + }, + "Geometry": { + "Average Score": 51.492, + "Standard Deviation": null, + "Rank": 20 + }, + "Algebra": { + "Average Score": 70.836, + "Standard Deviation": null, + "Rank": 20 + }, + "Probability": { + "Average Score": 58.976, + "Standard Deviation": null, + "Rank": 22 + }, + "Logical": { + "Average Score": 62.887, + "Standard Deviation": null, + "Rank": 24 + }, + "Social": { + "Average Score": 70.351, + "Standard Deviation": null, + "Rank": 12 + }, + "Chemistry": { + "Average Score": 85.813, + "Standard Deviation": null, + "Rank": 5 + }, + "CPP": { + "Average Score": 73.43757596214863, + "Standard Deviation": null, + "Rank": 9 + } + } + }, + { + "config": { + "model_name": "qwen1.5-72b-chat", + "organization": "Alibaba", + "license": "Qianwen LICENSE", + "knowledge_cutoff": "2024/03" + }, + "results": { + "OVERALL": { + "Average Score": 62.1296631, + "Standard Deviation": 10.31242823, + "Rank": 21 + }, + "Geometry": { + "Average Score": 47.314, + "Standard Deviation": null, + "Rank": 25 + }, + "Algebra": { + "Average Score": 69.575, + "Standard Deviation": null, + "Rank": 21 + }, + "Probability": { + "Average Score": 49.066, + "Standard Deviation": null, + "Rank": 27 + }, + "Logical": { + "Average Score": 36.931, + "Standard Deviation": null, + "Rank": 36 + }, + "Social": { + "Average Score": 40.498, + "Standard Deviation": null, + "Rank": 27 + }, + "Chemistry": { + "Average Score": 53.127, + "Standard Deviation": null, + "Rank": 25 + }, + "CPP": { + "Average Score": 48.69302376665551, + "Standard Deviation": null, + "Rank": 20 + } + } + }, + { + "config": { + "model_name": "qwen1.5-32b-chat", + "organization": "Alibaba", + "license": "Qianwen LICENSE", + "knowledge_cutoff": "2024/03" + }, + "results": { + "OVERALL": { + "Average Score": 46.27600711, + "Standard Deviation": 4.159365923, + "Rank": 30 + }, + "Geometry": { + "Average Score": 43.846, + "Standard Deviation": null, + "Rank": 27 + }, + "Algebra": { + "Average Score": 63.321, + "Standard Deviation": null, + "Rank": 24 + }, + "Probability": { + "Average Score": 48.15, + "Standard Deviation": null, + "Rank": 28 + }, + "Logical": { + "Average Score": 41.573, + "Standard Deviation": null, + "Rank": 34 + }, + "Social": { + "Average Score": 38.018, + "Standard Deviation": null, + "Rank": 29 + }, + "Chemistry": { + "Average Score": 48.041, + "Standard Deviation": null, + "Rank": 28 + }, + "CPP": { + "Average Score": 45.14284028264288, + "Standard Deviation": null, + "Rank": 24 + } + } + }, + { + "config": { + "model_name": "google-gemma-2-9b-it", + "organization": "Google", + "license": "Proprietary", + "knowledge_cutoff": "2024/06" + }, + "results": { + "OVERALL": { + "Average Score": 59.59324506, + "Standard Deviation": 5.156822857, + "Rank": 23 + }, + "Geometry": { + "Average Score": 51.184, + "Standard Deviation": null, + "Rank": 21 + }, + "Algebra": { + "Average Score": 64.38, + "Standard Deviation": null, + "Rank": 22 + }, + "Probability": { + "Average Score": 63.362, + "Standard Deviation": null, + "Rank": 21 + }, + "Logical": { + "Average Score": 69.422, + "Standard Deviation": null, + "Rank": 21 + }, + "Social": { + "Average Score": 76.113, + "Standard Deviation": null, + "Rank": 10 + }, + "Chemistry": { + "Average Score": 58.379, + "Standard Deviation": null, + "Rank": 22 + }, + "CPP": { + "Average Score": 54.03167523687635, + "Standard Deviation": null, + "Rank": 17 + } + } + }, + { + "config": { + "model_name": "yi-1.5-34b-chat", + "organization": "01 AI", + "license": "Proprietary", + "knowledge_cutoff": "2024/05" + }, + "results": { + "OVERALL": { + "Average Score": 72.39079733, + "Standard Deviation": 98.90928937, + "Rank": 18 + }, + "Geometry": { + "Average Score": 52.638, + "Standard Deviation": null, + "Rank": 19 + }, + "Algebra": { + "Average Score": 64.055, + "Standard Deviation": null, + "Rank": 23 + }, + "Probability": { + "Average Score": 64.137, + "Standard Deviation": null, + "Rank": 20 + }, + "Logical": { + "Average Score": 65.671, + "Standard Deviation": null, + "Rank": 22 + }, + "Social": { + "Average Score": 47.308, + "Standard Deviation": null, + "Rank": 23 + }, + "Chemistry": { + "Average Score": 57.484, + "Standard Deviation": null, + "Rank": 23 + }, + "CPP": { + "Average Score": 52.148798061768964, + "Standard Deviation": null, + "Rank": 18 + } + } + }, + { + "config": { + "model_name": "meta-llama-3.1-70b-instruct", + "organization": "Meta", + "license": "Llama 3.1 Community", + "knowledge_cutoff": "2023/12" + }, + "results": { + "OVERALL": { + "Average Score": 73.27773635, + "Standard Deviation": 5.72723675, + "Rank": 17 + }, + "Geometry": { + "Average Score": 65.135, + "Standard Deviation": null, + "Rank": 15 + }, + "Algebra": { + "Average Score": 80.579, + "Standard Deviation": null, + "Rank": 16 + }, + "Probability": { + "Average Score": 65.472, + "Standard Deviation": null, + "Rank": 19 + }, + "Logical": { + "Average Score": 72.879, + "Standard Deviation": null, + "Rank": 17 + }, + "Social": { + "Average Score": 60.374, + "Standard Deviation": null, + "Rank": 17 + }, + "Chemistry": { + "Average Score": 71.8, + "Standard Deviation": null, + "Rank": 17 + }, + "CPP": { + "Average Score": 84.36815192532764, + "Standard Deviation": null, + "Rank": 4 + } + } + }, + { + "config": { + "model_name": "meta-llama-3.1-8b-instruct", + "organization": "Meta", + "license": "Llama 3.1 Community", + "knowledge_cutoff": "2023/12" + }, + "results": { + "OVERALL": { + "Average Score": 52.8664657, + "Standard Deviation": 3.607384863, + "Rank": 27 + }, + "Geometry": { + "Average Score": 41.384, + "Standard Deviation": null, + "Rank": 29 + }, + "Algebra": { + "Average Score": 62.508, + "Standard Deviation": null, + "Rank": 25 + }, + "Probability": { + "Average Score": 51.889, + "Standard Deviation": null, + "Rank": 25 + }, + "Logical": { + "Average Score": 53.587, + "Standard Deviation": null, + "Rank": 29 + }, + "Social": { + "Average Score": 34.405, + "Standard Deviation": null, + "Rank": 32 + }, + "Chemistry": { + "Average Score": 45.032, + "Standard Deviation": null, + "Rank": 32 + }, + "CPP": { + "Average Score": 44.41846841004584, + "Standard Deviation": null, + "Rank": 26 + } + } + }, + { + "config": { + "model_name": "gpt3.5-turbo-0125", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2021/09" + }, + "results": { + "OVERALL": { + "Average Score": 33.7046204, + "Standard Deviation": 45.16937959, + "Rank": 40 + }, + "Geometry": { + "Average Score": 50.19, + "Standard Deviation": null, + "Rank": 22 + }, + "Algebra": { + "Average Score": 60.978, + "Standard Deviation": null, + "Rank": 26 + }, + "Probability": { + "Average Score": 46.284, + "Standard Deviation": null, + "Rank": 30 + }, + "Logical": { + "Average Score": 20.595, + "Standard Deviation": null, + "Rank": 47 + }, + "Social": { + "Average Score": 24.926, + "Standard Deviation": null, + "Rank": 42 + }, + "Chemistry": { + "Average Score": 42.78, + "Standard Deviation": null, + "Rank": 33 + }, + "CPP": { + "Average Score": 40.46958736582551, + "Standard Deviation": null, + "Rank": 29 + } + } + }, + { + "config": { + "model_name": "llama-3-70b-instruct", + "organization": "Meta", + "license": "Llama 3 Community", + "knowledge_cutoff": "2023/12" + }, + "results": { + "OVERALL": { + "Average Score": 59.24245274, + "Standard Deviation": 4.878897527, + "Rank": 25 + }, + "Geometry": { + "Average Score": 45.249, + "Standard Deviation": null, + "Rank": 26 + }, + "Algebra": { + "Average Score": 60.736, + "Standard Deviation": null, + "Rank": 27 + }, + "Probability": { + "Average Score": 54.515, + "Standard Deviation": null, + "Rank": 23 + }, + "Logical": { + "Average Score": 83.08, + "Standard Deviation": null, + "Rank": 12 + }, + "Social": { + "Average Score": 42.172, + "Standard Deviation": null, + "Rank": 24 + }, + "Chemistry": { + "Average Score": 71.8, + "Standard Deviation": null, + "Rank": 18 + }, + "CPP": { + "Average Score": 65.32140697218945, + "Standard Deviation": null, + "Rank": 13 + } + } + }, + { + "config": { + "model_name": "claude-3-sonnet", + "organization": "Anthropic", + "license": "Proprietary", + "knowledge_cutoff": "2023/08" + }, + "results": { + "OVERALL": { + "Average Score": 61.81320888, + "Standard Deviation": 10.27472205, + "Rank": 22 + }, + "Geometry": { + "Average Score": 50.185, + "Standard Deviation": null, + "Rank": 23 + }, + "Algebra": { + "Average Score": 58.739, + "Standard Deviation": null, + "Rank": 28 + }, + "Probability": { + "Average Score": 54.182, + "Standard Deviation": null, + "Rank": 24 + }, + "Logical": { + "Average Score": 65.118, + "Standard Deviation": null, + "Rank": 23 + }, + "Social": { + "Average Score": 55.325, + "Standard Deviation": null, + "Rank": 19 + }, + "Chemistry": { + "Average Score": 69.778, + "Standard Deviation": null, + "Rank": 19 + }, + "CPP": { + "Average Score": 61.33538592327427, + "Standard Deviation": null, + "Rank": 15 + } + } + }, + { + "config": { + "model_name": "qwen1.5-14b-chat", + "organization": "Alibaba", + "license": "Qianwen LICENSE", + "knowledge_cutoff": "2024/02" + }, + "results": { + "OVERALL": { + "Average Score": 43.97760317, + "Standard Deviation": 3.740375694, + "Rank": 31 + }, + "Geometry": { + "Average Score": 35.5, + "Standard Deviation": null, + "Rank": 31 + }, + "Algebra": { + "Average Score": 57.821, + "Standard Deviation": null, + "Rank": 29 + }, + "Probability": { + "Average Score": 38.886, + "Standard Deviation": null, + "Rank": 34 + }, + "Logical": { + "Average Score": 34.775, + "Standard Deviation": null, + "Rank": 39 + }, + "Social": { + "Average Score": 31.022, + "Standard Deviation": null, + "Rank": 35 + }, + "Chemistry": { + "Average Score": 40.55, + "Standard Deviation": null, + "Rank": 36 + }, + "CPP": { + "Average Score": 38.552779976347026, + "Standard Deviation": null, + "Rank": 31 + } + } + }, + { + "config": { + "model_name": "claude-3-haiku", + "organization": "Anthropic", + "license": "Proprietary", + "knowledge_cutoff": "2023/08" + }, + "results": { + "OVERALL": { + "Average Score": 55.60534246, + "Standard Deviation": 15.07600975, + "Rank": 26 + }, + "Geometry": { + "Average Score": 41.806, + "Standard Deviation": null, + "Rank": 28 + }, + "Algebra": { + "Average Score": 54.298, + "Standard Deviation": null, + "Rank": 31 + }, + "Probability": { + "Average Score": 49.344, + "Standard Deviation": null, + "Rank": 26 + }, + "Logical": { + "Average Score": 61.904, + "Standard Deviation": null, + "Rank": 25 + }, + "Social": { + "Average Score": 50.407, + "Standard Deviation": null, + "Rank": 21 + }, + "Chemistry": { + "Average Score": 61.491, + "Standard Deviation": null, + "Rank": 21 + }, + "CPP": { + "Average Score": 56.40200048817984, + "Standard Deviation": null, + "Rank": 16 + } + } + }, + { + "config": { + "model_name": "claude-2.1", + "organization": "Anthropic", + "license": "Proprietary", + "knowledge_cutoff": "Unknown" + }, + "results": { + "OVERALL": { + "Average Score": 40.35699809, + "Standard Deviation": 2.484317383, + "Rank": 35 + }, + "Geometry": { + "Average Score": 49.899, + "Standard Deviation": null, + "Rank": 24 + }, + "Algebra": { + "Average Score": 53.574, + "Standard Deviation": null, + "Rank": 32 + }, + "Probability": { + "Average Score": 44.011, + "Standard Deviation": null, + "Rank": 32 + }, + "Logical": { + "Average Score": 59.855, + "Standard Deviation": null, + "Rank": 26 + }, + "Social": { + "Average Score": 33.888, + "Standard Deviation": null, + "Rank": 33 + }, + "Chemistry": { + "Average Score": 51.038, + "Standard Deviation": null, + "Rank": 26 + }, + "CPP": { + "Average Score": 47.23672563994903, + "Standard Deviation": null, + "Rank": 21 + } + } + }, + { + "config": { + "model_name": "mistral-8x7b-instruct-v0.1", + "organization": "Mistral", + "license": "Apache 2.0", + "knowledge_cutoff": "2023/12" + }, + "results": { + "OVERALL": { + "Average Score": 43.2937322, + "Standard Deviation": 2.659857412, + "Rank": 32 + }, + "Geometry": { + "Average Score": 32.639, + "Standard Deviation": null, + "Rank": 35 + }, + "Algebra": { + "Average Score": 48.901, + "Standard Deviation": null, + "Rank": 35 + }, + "Probability": { + "Average Score": 44.058, + "Standard Deviation": null, + "Rank": 31 + }, + "Logical": { + "Average Score": 42.194, + "Standard Deviation": null, + "Rank": 32 + }, + "Social": { + "Average Score": 26.702, + "Standard Deviation": null, + "Rank": 41 + }, + "Chemistry": { + "Average Score": 47.192, + "Standard Deviation": null, + "Rank": 29 + }, + "CPP": { + "Average Score": 44.533118241976666, + "Standard Deviation": null, + "Rank": 25 + } + } + }, + { + "config": { + "model_name": "claude-2.0", + "organization": "Anthropic", + "license": "Proprietary", + "knowledge_cutoff": "Unknown" + }, + "results": { + "OVERALL": { + "Average Score": 30.83692551, + "Standard Deviation": 1.816269, + "Rank": 43 + }, + "Geometry": { + "Average Score": 37.452, + "Standard Deviation": null, + "Rank": 30 + }, + "Algebra": { + "Average Score": 48.965, + "Standard Deviation": null, + "Rank": 34 + }, + "Probability": { + "Average Score": 46.284, + "Standard Deviation": null, + "Rank": 29 + }, + "Logical": { + "Average Score": 55.657, + "Standard Deviation": null, + "Rank": 28 + }, + "Social": { + "Average Score": 42.117, + "Standard Deviation": null, + "Rank": 25 + }, + "Chemistry": { + "Average Score": 55.869, + "Standard Deviation": null, + "Rank": 24 + }, + "CPP": { + "Average Score": 50.773143448036464, + "Standard Deviation": null, + "Rank": 19 + } + } + }, + { + "config": { + "model_name": "starling-lm-7b-beta", + "organization": "Nexusflow", + "license": "Apache-2.0", + "knowledge_cutoff": "2024/03" + }, + "results": { + "OVERALL": { + "Average Score": 50.05304991, + "Standard Deviation": 3.017802027, + "Rank": 28 + }, + "Geometry": { + "Average Score": 33.79, + "Standard Deviation": null, + "Rank": 34 + }, + "Algebra": { + "Average Score": 49.685, + "Standard Deviation": null, + "Rank": 33 + }, + "Probability": { + "Average Score": 39.677, + "Standard Deviation": null, + "Rank": 33 + }, + "Logical": { + "Average Score": 47.501, + "Standard Deviation": null, + "Rank": 30 + }, + "Social": { + "Average Score": 37.7, + "Standard Deviation": null, + "Rank": 30 + }, + "Chemistry": { + "Average Score": 40.274, + "Standard Deviation": null, + "Rank": 37 + }, + "CPP": { + "Average Score": 38.27587102395908, + "Standard Deviation": null, + "Rank": 32 + } + } + }, + { + "config": { + "model_name": "gemini-1.0-pro-001", + "organization": "Google", + "license": "Proprietary", + "knowledge_cutoff": "2023/04" + }, + "results": { + "OVERALL": { + "Average Score": 38.71255653, + "Standard Deviation": 8.592349353, + "Rank": 37 + }, + "Geometry": { + "Average Score": 34.596, + "Standard Deviation": null, + "Rank": 33 + }, + "Algebra": { + "Average Score": 48.159, + "Standard Deviation": null, + "Rank": 36 + }, + "Probability": { + "Average Score": 29.585, + "Standard Deviation": null, + "Rank": 43 + }, + "Logical": { + "Average Score": 23.882, + "Standard Deviation": null, + "Rank": 45 + }, + "Social": { + "Average Score": 13.261, + "Standard Deviation": null, + "Rank": 52 + }, + "Chemistry": { + "Average Score": 46.637, + "Standard Deviation": null, + "Rank": 30 + }, + "CPP": { + "Average Score": 45.22204471452975, + "Standard Deviation": null, + "Rank": 23 + } + } + }, + { + "config": { + "model_name": "openchat-3.5-0106", + "organization": "OpenChat", + "license": "Apache-2.0", + "knowledge_cutoff": "2024/01" + }, + "results": { + "OVERALL": { + "Average Score": 40.85094215, + "Standard Deviation": 6.631820541, + "Rank": 34 + }, + "Geometry": { + "Average Score": 29.115, + "Standard Deviation": null, + "Rank": 37 + }, + "Algebra": { + "Average Score": 45.456, + "Standard Deviation": null, + "Rank": 37 + }, + "Probability": { + "Average Score": 38.408, + "Standard Deviation": null, + "Rank": 35 + }, + "Logical": { + "Average Score": 41.678, + "Standard Deviation": null, + "Rank": 33 + }, + "Social": { + "Average Score": 28.236, + "Standard Deviation": null, + "Rank": 40 + }, + "Chemistry": { + "Average Score": 34.68, + "Standard Deviation": null, + "Rank": 39 + }, + "CPP": { + "Average Score": 33.70639271807677, + "Standard Deviation": null, + "Rank": 33 + } + } + }, + { + "config": { + "model_name": "openchat-3.5", + "organization": "OpenChat", + "license": "Apache-2.0", + "knowledge_cutoff": "2023/11" + }, + "results": { + "OVERALL": { + "Average Score": 39.20699952, + "Standard Deviation": 1.576169927, + "Rank": 36 + }, + "Geometry": { + "Average Score": 30.009, + "Standard Deviation": null, + "Rank": 36 + }, + "Algebra": { + "Average Score": 42.04, + "Standard Deviation": null, + "Rank": 39 + }, + "Probability": { + "Average Score": 34.495, + "Standard Deviation": null, + "Rank": 38 + }, + "Logical": { + "Average Score": 35.828, + "Standard Deviation": null, + "Rank": 37 + }, + "Social": { + "Average Score": 33.096, + "Standard Deviation": null, + "Rank": 34 + }, + "Chemistry": { + "Average Score": 36.737, + "Standard Deviation": null, + "Rank": 38 + }, + "CPP": { + "Average Score": 33.020911255646965, + "Standard Deviation": null, + "Rank": 34 + } + } + }, + { + "config": { + "model_name": "command-r-(08-2024)", + "organization": "Cohere", + "license": "CC-BY-NC-4.0", + "knowledge_cutoff": "2024/08" + }, + "results": { + "OVERALL": { + "Average Score": 46.70245901, + "Standard Deviation": 3.665464964, + "Rank": 29 + }, + "Geometry": { + "Average Score": 35.43, + "Standard Deviation": null, + "Rank": 32 + }, + "Algebra": { + "Average Score": 41.852, + "Standard Deviation": null, + "Rank": 40 + }, + "Probability": { + "Average Score": 36.535, + "Standard Deviation": null, + "Rank": 37 + }, + "Logical": { + "Average Score": 25.941, + "Standard Deviation": null, + "Rank": 42 + }, + "Social": { + "Average Score": 30.911, + "Standard Deviation": null, + "Rank": 36 + }, + "Chemistry": { + "Average Score": 41.629, + "Standard Deviation": null, + "Rank": 35 + }, + "CPP": { + "Average Score": 39.61492485677676, + "Standard Deviation": null, + "Rank": 30 + } + } + }, + { + "config": { + "model_name": "gemma-1.1-7b-it", + "organization": "Google", + "license": "Gemma License", + "knowledge_cutoff": "2024/02" + }, + "results": { + "OVERALL": { + "Average Score": 32.61912991, + "Standard Deviation": 17.86038512, + "Rank": 41 + }, + "Geometry": { + "Average Score": 25.149, + "Standard Deviation": null, + "Rank": 41 + }, + "Algebra": { + "Average Score": 40.456, + "Standard Deviation": null, + "Rank": 41 + }, + "Probability": { + "Average Score": 29.307, + "Standard Deviation": null, + "Rank": 44 + }, + "Logical": { + "Average Score": 41.543, + "Standard Deviation": null, + "Rank": 35 + }, + "Social": { + "Average Score": 21.473, + "Standard Deviation": null, + "Rank": 45 + }, + "Chemistry": { + "Average Score": 45.033, + "Standard Deviation": null, + "Rank": 31 + }, + "CPP": { + "Average Score": 42.666504105798204, + "Standard Deviation": null, + "Rank": 27 + } + } + }, + { + "config": { + "model_name": "llama3-8b-instruct", + "organization": "Meta", + "license": "Llama 3 Community", + "knowledge_cutoff": "2023/03" + }, + "results": { + "OVERALL": { + "Average Score": 37.29361351, + "Standard Deviation": 8.841996174, + "Rank": 39 + }, + "Geometry": { + "Average Score": 28.496, + "Standard Deviation": null, + "Rank": 39 + }, + "Algebra": { + "Average Score": 42.117, + "Standard Deviation": null, + "Rank": 38 + }, + "Probability": { + "Average Score": 33.841, + "Standard Deviation": null, + "Rank": 39 + }, + "Logical": { + "Average Score": 57.763, + "Standard Deviation": null, + "Rank": 27 + }, + "Social": { + "Average Score": 35.994, + "Standard Deviation": null, + "Rank": 31 + }, + "Chemistry": { + "Average Score": 50.023, + "Standard Deviation": null, + "Rank": 27 + }, + "CPP": { + "Average Score": 45.35392139264795, + "Standard Deviation": null, + "Rank": 22 + } + } + }, + { + "config": { + "model_name": "gemma-2-2b-it", + "organization": "Google", + "license": "Gemma License", + "knowledge_cutoff": "2024/07" + }, + "results": { + "OVERALL": { + "Average Score": 59.3544514, + "Standard Deviation": 14.50864762, + "Rank": 24 + }, + "Geometry": { + "Average Score": 29.077, + "Standard Deviation": null, + "Rank": 38 + }, + "Algebra": { + "Average Score": 39.677, + "Standard Deviation": null, + "Rank": 42 + }, + "Probability": { + "Average Score": 31.561, + "Standard Deviation": null, + "Rank": 41 + }, + "Logical": { + "Average Score": 43.458, + "Standard Deviation": null, + "Rank": 31 + }, + "Social": { + "Average Score": 39.343, + "Standard Deviation": null, + "Rank": 28 + }, + "Chemistry": { + "Average Score": 31.156, + "Standard Deviation": null, + "Rank": 43 + }, + "CPP": { + "Average Score": 30.53406933106768, + "Standard Deviation": null, + "Rank": 36 + } + } + }, + { + "config": { + "model_name": "starling-lm-7b-alpha", + "organization": "Nexusflow", + "license": "Apache-2.0", + "knowledge_cutoff": "2023/11" + }, + "results": { + "OVERALL": { + "Average Score": 37.94593338, + "Standard Deviation": 1.40532208, + "Rank": 38 + }, + "Geometry": { + "Average Score": 25.519, + "Standard Deviation": null, + "Rank": 40 + }, + "Algebra": { + "Average Score": 38.88, + "Standard Deviation": null, + "Rank": 43 + }, + "Probability": { + "Average Score": 32.068, + "Standard Deviation": null, + "Rank": 40 + }, + "Logical": { + "Average Score": 33.804, + "Standard Deviation": null, + "Rank": 40 + }, + "Social": { + "Average Score": 30.875, + "Standard Deviation": null, + "Rank": 37 + }, + "Chemistry": { + "Average Score": 31.354, + "Standard Deviation": null, + "Rank": 41 + }, + "CPP": { + "Average Score": 30.07926487356878, + "Standard Deviation": null, + "Rank": 37 + } + } + }, + { + "config": { + "model_name": "qwen1.5-4b-chat", + "organization": "Alibaba", + "license": "Qianwen LICENSE", + "knowledge_cutoff": "2024/02" + }, + "results": { + "OVERALL": { + "Average Score": 9.779979052, + "Standard Deviation": 0.925129318, + "Rank": 54 + }, + "Geometry": { + "Average Score": 15.672, + "Standard Deviation": null, + "Rank": 46 + }, + "Algebra": { + "Average Score": 31.21, + "Standard Deviation": null, + "Rank": 44 + }, + "Probability": { + "Average Score": 13.853, + "Standard Deviation": null, + "Rank": 49 + }, + "Logical": { + "Average Score": 13.842, + "Standard Deviation": null, + "Rank": 52 + }, + "Social": { + "Average Score": 20.21, + "Standard Deviation": null, + "Rank": 46 + }, + "Chemistry": { + "Average Score": 14.794, + "Standard Deviation": null, + "Rank": 53 + }, + "CPP": { + "Average Score": 13.21208067122554, + "Standard Deviation": null, + "Rank": 47 + } + } + }, + { + "config": { + "model_name": "command-r-(04-2024)", + "organization": "Cohere", + "license": "CC-BY-NC-4.0", + "knowledge_cutoff": "2024/04" + }, + "results": { + "OVERALL": { + "Average Score": 42.49175095, + "Standard Deviation": 5.556047496, + "Rank": 33 + }, + "Geometry": { + "Average Score": 23.438, + "Standard Deviation": null, + "Rank": 42 + }, + "Algebra": { + "Average Score": 31.204, + "Standard Deviation": null, + "Rank": 45 + }, + "Probability": { + "Average Score": 30.726, + "Standard Deviation": null, + "Rank": 42 + }, + "Logical": { + "Average Score": 35.111, + "Standard Deviation": null, + "Rank": 38 + }, + "Social": { + "Average Score": 30.623, + "Standard Deviation": null, + "Rank": 38 + }, + "Chemistry": { + "Average Score": 42.316, + "Standard Deviation": null, + "Rank": 34 + }, + "CPP": { + "Average Score": 41.346336503003236, + "Standard Deviation": null, + "Rank": 28 + } + } + }, + { + "config": { + "model_name": "vicuna-33b", + "organization": "LMSYS", + "license": "Non-commercial", + "knowledge_cutoff": "2023/08" + }, + "results": { + "OVERALL": { + "Average Score": 27.90851915, + "Standard Deviation": 4.55056913, + "Rank": 44 + }, + "Geometry": { + "Average Score": 16.634, + "Standard Deviation": null, + "Rank": 45 + }, + "Algebra": { + "Average Score": 25.075, + "Standard Deviation": null, + "Rank": 46 + }, + "Probability": { + "Average Score": 20.901, + "Standard Deviation": null, + "Rank": 47 + }, + "Logical": { + "Average Score": 22.962, + "Standard Deviation": null, + "Rank": 46 + }, + "Social": { + "Average Score": 28.487, + "Standard Deviation": null, + "Rank": 39 + }, + "Chemistry": { + "Average Score": 31.156, + "Standard Deviation": null, + "Rank": 42 + }, + "CPP": { + "Average Score": 28.01838653090379, + "Standard Deviation": null, + "Rank": 38 + } + } + }, + { + "config": { + "model_name": "gemma-7b-it", + "organization": "Google", + "license": "Gemma License", + "knowledge_cutoff": "2024/02" + }, + "results": { + "OVERALL": { + "Average Score": 25.25380823, + "Standard Deviation": 3.455163419, + "Rank": 46 + }, + "Geometry": { + "Average Score": 19.626, + "Standard Deviation": null, + "Rank": 43 + }, + "Algebra": { + "Average Score": 23.272, + "Standard Deviation": null, + "Rank": 48 + }, + "Probability": { + "Average Score": 16.98, + "Standard Deviation": null, + "Rank": 48 + }, + "Logical": { + "Average Score": 24.359, + "Standard Deviation": null, + "Rank": 43 + }, + "Social": { + "Average Score": 23.52, + "Standard Deviation": null, + "Rank": 43 + }, + "Chemistry": { + "Average Score": 31.139, + "Standard Deviation": null, + "Rank": 44 + }, + "CPP": { + "Average Score": 28.014658234926813, + "Standard Deviation": null, + "Rank": 39 + } + } + }, + { + "config": { + "model_name": "mistral-7b-instruct-2", + "organization": "Mistral", + "license": "Apache 2.0", + "knowledge_cutoff": "2023/12" + }, + "results": { + "OVERALL": { + "Average Score": 31.49596208, + "Standard Deviation": 11.79471585, + "Rank": 42 + }, + "Geometry": { + "Average Score": 16.847, + "Standard Deviation": null, + "Rank": 44 + }, + "Algebra": { + "Average Score": 23.287, + "Standard Deviation": null, + "Rank": 47 + }, + "Probability": { + "Average Score": 24.868, + "Standard Deviation": null, + "Rank": 45 + }, + "Logical": { + "Average Score": 28.755, + "Standard Deviation": null, + "Rank": 41 + }, + "Social": { + "Average Score": 21.473, + "Standard Deviation": null, + "Rank": 44 + }, + "Chemistry": { + "Average Score": 31.994, + "Standard Deviation": null, + "Rank": 40 + }, + "CPP": { + "Average Score": 31.382959631870822, + "Standard Deviation": null, + "Rank": 35 + } + } + }, + { + "config": { + "model_name": "mistral-7b-instruct-1", + "organization": "Mistral", + "license": "Apache 2.0", + "knowledge_cutoff": "2023/12" + }, + "results": { + "OVERALL": { + "Average Score": 20.53586787, + "Standard Deviation": 2.95650198, + "Rank": 51 + }, + "Geometry": { + "Average Score": 11.019, + "Standard Deviation": null, + "Rank": 50 + }, + "Algebra": { + "Average Score": 20.39, + "Standard Deviation": null, + "Rank": 49 + }, + "Probability": { + "Average Score": 24.279, + "Standard Deviation": null, + "Rank": 46 + }, + "Logical": { + "Average Score": 16.823, + "Standard Deviation": null, + "Rank": 50 + }, + "Social": { + "Average Score": 12.369, + "Standard Deviation": null, + "Rank": 53 + }, + "Chemistry": { + "Average Score": 22.121, + "Standard Deviation": null, + "Rank": 47 + }, + "CPP": { + "Average Score": 18.929093202755805, + "Standard Deviation": null, + "Rank": 42 + } + } + }, + { + "config": { + "model_name": "vicuna-13b", + "organization": "LMSYS", + "license": "Non-commercial", + "knowledge_cutoff": "2023/07" + }, + "results": { + "OVERALL": { + "Average Score": 17.42296198, + "Standard Deviation": 4.480901647, + "Rank": 52 + }, + "Geometry": { + "Average Score": 12.755, + "Standard Deviation": null, + "Rank": 49 + }, + "Algebra": { + "Average Score": 17.974, + "Standard Deviation": null, + "Rank": 50 + }, + "Probability": { + "Average Score": 13.004, + "Standard Deviation": null, + "Rank": 50 + }, + "Logical": { + "Average Score": 16.997, + "Standard Deviation": null, + "Rank": 49 + }, + "Social": { + "Average Score": 14.314, + "Standard Deviation": null, + "Rank": 51 + }, + "Chemistry": { + "Average Score": 25.307, + "Standard Deviation": null, + "Rank": 46 + }, + "CPP": { + "Average Score": 21.840013221590294, + "Standard Deviation": null, + "Rank": 40 + } + } + }, + { + "config": { + "model_name": "zephyr-7b-beta", + "organization": "HuggingFace", + "license": "MIT", + "knowledge_cutoff": "2023/10" + }, + "results": { + "OVERALL": { + "Average Score": 9.323654861, + "Standard Deviation": 0.338544041, + "Rank": 55 + }, + "Geometry": { + "Average Score": 8.222, + "Standard Deviation": null, + "Rank": 51 + }, + "Algebra": { + "Average Score": 13.006, + "Standard Deviation": null, + "Rank": 51 + }, + "Probability": { + "Average Score": 7.573, + "Standard Deviation": null, + "Rank": 55 + }, + "Logical": { + "Average Score": 7.364, + "Standard Deviation": null, + "Rank": 56 + }, + "Social": { + "Average Score": 0.0, + "Standard Deviation": null, + "Rank": 58 + }, + "Chemistry": { + "Average Score": 17.18, + "Standard Deviation": null, + "Rank": 51 + }, + "CPP": { + "Average Score": 18.92902220864132, + "Standard Deviation": null, + "Rank": 43 + } + } + }, + { + "config": { + "model_name": "gemma-1.1-2b-it", + "organization": "Google", + "license": "Gemma License", + "knowledge_cutoff": "2024/02" + }, + "results": { + "OVERALL": { + "Average Score": 22.44740296, + "Standard Deviation": 3.95922917, + "Rank": 49 + }, + "Geometry": { + "Average Score": 12.834, + "Standard Deviation": null, + "Rank": 48 + }, + "Algebra": { + "Average Score": 12.291, + "Standard Deviation": null, + "Rank": 52 + }, + "Probability": { + "Average Score": 8.228, + "Standard Deviation": null, + "Rank": 53 + }, + "Logical": { + "Average Score": 10.822, + "Standard Deviation": null, + "Rank": 54 + }, + "Social": { + "Average Score": 19.303, + "Standard Deviation": null, + "Rank": 47 + }, + "Chemistry": { + "Average Score": 19.892, + "Standard Deviation": null, + "Rank": 48 + }, + "CPP": { + "Average Score": 20.724691953843916, + "Standard Deviation": null, + "Rank": 41 + } + } + }, + { + "config": { + "model_name": "llama2-7b-chat", + "organization": "Meta", + "license": "Llama 2 Community", + "knowledge_cutoff": "2023/07" + }, + "results": { + "OVERALL": { + "Average Score": 23.53840413, + "Standard Deviation": 4.565404574, + "Rank": 47 + }, + "Geometry": { + "Average Score": 5.681, + "Standard Deviation": null, + "Rank": 52 + }, + "Algebra": { + "Average Score": 9.809, + "Standard Deviation": null, + "Rank": 54 + }, + "Probability": { + "Average Score": 8.089, + "Standard Deviation": null, + "Rank": 54 + }, + "Logical": { + "Average Score": 20.474, + "Standard Deviation": null, + "Rank": 48 + }, + "Social": { + "Average Score": 15.968, + "Standard Deviation": null, + "Rank": 48 + }, + "Chemistry": { + "Average Score": 18.153, + "Standard Deviation": null, + "Rank": 50 + }, + "CPP": { + "Average Score": 15.730513733660898, + "Standard Deviation": null, + "Rank": 45 + } + } + }, + { + "config": { + "model_name": "gemma-2b-it", + "organization": "Google", + "license": "Gemma License", + "knowledge_cutoff": "2024/02" + }, + "results": { + "OVERALL": { + "Average Score": 20.86803148, + "Standard Deviation": 4.810898787, + "Rank": 50 + }, + "Geometry": { + "Average Score": 15.137, + "Standard Deviation": null, + "Rank": 47 + }, + "Algebra": { + "Average Score": 10.108, + "Standard Deviation": null, + "Rank": 53 + }, + "Probability": { + "Average Score": 6.688, + "Standard Deviation": null, + "Rank": 56 + }, + "Logical": { + "Average Score": 5.296, + "Standard Deviation": null, + "Rank": 57 + }, + "Social": { + "Average Score": 9.63, + "Standard Deviation": null, + "Rank": 56 + }, + "Chemistry": { + "Average Score": 18.153, + "Standard Deviation": null, + "Rank": 49 + }, + "CPP": { + "Average Score": 17.2715657115764, + "Standard Deviation": null, + "Rank": 44 + } + } + }, + { + "config": { + "model_name": "llama2-13b-chat", + "organization": "Meta", + "license": "Llama 2 Community", + "knowledge_cutoff": "2023/07" + }, + "results": { + "OVERALL": { + "Average Score": 23.34503255, + "Standard Deviation": 4.939571996, + "Rank": 48 + }, + "Geometry": { + "Average Score": 4.017, + "Standard Deviation": null, + "Rank": 54 + }, + "Algebra": { + "Average Score": 7.201, + "Standard Deviation": null, + "Rank": 55 + }, + "Probability": { + "Average Score": 11.451, + "Standard Deviation": null, + "Rank": 51 + }, + "Logical": { + "Average Score": 23.912, + "Standard Deviation": null, + "Rank": 44 + }, + "Social": { + "Average Score": 15.715, + "Standard Deviation": null, + "Rank": 50 + }, + "Chemistry": { + "Average Score": 14.773, + "Standard Deviation": null, + "Rank": 54 + }, + "CPP": { + "Average Score": 13.17258252933903, + "Standard Deviation": null, + "Rank": 48 + } + } + }, + { + "config": { + "model_name": "vicuna-7b", + "organization": "LMSYS", + "license": "Non-commercial", + "knowledge_cutoff": "2023/07" + }, + "results": { + "OVERALL": { + "Average Score": 16.78668722, + "Standard Deviation": 4.782003459, + "Rank": 53 + }, + "Geometry": { + "Average Score": 5.299, + "Standard Deviation": null, + "Rank": 53 + }, + "Algebra": { + "Average Score": 7.014, + "Standard Deviation": null, + "Rank": 56 + }, + "Probability": { + "Average Score": 8.228, + "Standard Deviation": null, + "Rank": 52 + }, + "Logical": { + "Average Score": 11.753, + "Standard Deviation": null, + "Rank": 53 + }, + "Social": { + "Average Score": 11.326, + "Standard Deviation": null, + "Rank": 54 + }, + "Chemistry": { + "Average Score": 15.092, + "Standard Deviation": null, + "Rank": 52 + }, + "CPP": { + "Average Score": 14.255194156624162, + "Standard Deviation": null, + "Rank": 46 + } + } + }, + { + "config": { + "model_name": "koala-13b", + "organization": "UC Berkeley", + "license": "Non-commercial", + "knowledge_cutoff": "2023/04" + }, + "results": { + "OVERALL": { + "Average Score": 8.747324657, + "Standard Deviation": 0.645177403, + "Rank": 56 + }, + "Geometry": { + "Average Score": 0.156, + "Standard Deviation": null, + "Rank": 55 + }, + "Algebra": { + "Average Score": 2.242, + "Standard Deviation": null, + "Rank": 57 + }, + "Probability": { + "Average Score": 3.323, + "Standard Deviation": null, + "Rank": 57 + }, + "Logical": { + "Average Score": 8.156, + "Standard Deviation": null, + "Rank": 55 + }, + "Social": { + "Average Score": 9.649, + "Standard Deviation": null, + "Rank": 55 + }, + "Chemistry": { + "Average Score": 6.672, + "Standard Deviation": null, + "Rank": 55 + }, + "CPP": { + "Average Score": 6.36433272373514, + "Standard Deviation": null, + "Rank": 49 + } + } + }, + { + "config": { + "model_name": "openassistant-pythia-12b", + "organization": "OpenAssistant", + "license": "Non-commercial", + "knowledge_cutoff": "2023/04" + }, + "results": { + "OVERALL": { + "Average Score": 0, + "Standard Deviation": 0, + "Rank": 57 + }, + "Geometry": { + "Average Score": 0.0, + "Standard Deviation": null, + "Rank": 56 + }, + "Algebra": { + "Average Score": 0.0, + "Standard Deviation": null, + "Rank": 58 + }, + "Probability": { + "Average Score": 0.0, + "Standard Deviation": null, + "Rank": 58 + }, + "Logical": { + "Average Score": 0.0, + "Standard Deviation": null, + "Rank": 58 + }, + "Social": { + "Average Score": 1.637, + "Standard Deviation": null, + "Rank": 57 + }, + "Chemistry": { + "Average Score": 0.0, + "Standard Deviation": null, + "Rank": 56 + }, + "CPP": { + "Average Score": 0.0, + "Standard Deviation": null, + "Rank": 50 + } + } + }, + { + "config": { + "model_name": "nemotron-70b", + "organization": "NVIDIA", + "license": "Unknown", + "knowledge_cutoff": "Unknown" + }, + "results": { + "OVERALL": { + "Average Score": 100, + "Standard Deviation": 0, + "Rank": 1 + }, + "Algebra": { + "Average Score": 79.813, + "Standard Deviation": null, + "Rank": 17 + }, + "Geometry": { + "Average Score": 67.014, + "Standard Deviation": null, + "Rank": 13 + }, + "Probability": { + "Average Score": 75.535, + "Standard Deviation": null, + "Rank": 10 + }, + "Logical": { + "Average Score": 92.659, + "Standard Deviation": null, + "Rank": 5 + }, + "Social": { + "Average Score": 99.677, + "Standard Deviation": null, + "Rank": 2 + }, + "Chemistry": { + "Average Score": 76.262, + "Standard Deviation": null, + "Rank": 14 + } + } + }, + { + "config": { + "model_name": "yi-lightning", + "organization": "01 AI", + "license": "Proprietary", + "knowledge_cutoff": "Unknown" + }, + "results": { + "OVERALL": { + "Average Score": 96.84467293, + "Standard Deviation": 0.033152361, + "Rank": 3 + }, + "Geometry": { + "Average Score": 77.667, + "Standard Deviation": null, + "Rank": 8 + }, + "Algebra": { + "Average Score": 93.245, + "Standard Deviation": null, + "Rank": 6 + }, + "Chemistry": { + "Average Score": 100.000, + "Standard Deviation": null, + "Rank": 1 + }, + "Logical": { + "Average Score": 94.660, + "Standard Deviation": null, + "Rank": 4 + }, + "Social": { + "Average Score": 83.236, + "Standard Deviation": null, + "Rank": 8 + }, + "Probability": { + "Average Score": 90.329, + "Standard Deviation": null, + "Rank": 3 + } + } + }, + { + "config": { + "model_name": "glm-4-plus", + "organization": "Zhipu AI", + "license": "Proprietary", + "knowledge_cutoff": "Unknown" + }, + "results": { + "OVERALL": { + "Average Score": 92.48932574, + "Standard Deviation": 0.087973142, + "Rank": 6 + }, + "Geometry": { + "Average Score": 76.965, + "Standard Deviation": null, + "Rank": 9 + }, + "Algebra": { + "Average Score": 91.701, + "Standard Deviation": null, + "Rank": 7 + }, + "Chemistry": { + "Average Score": 83.527, + "Standard Deviation": null, + "Rank": 8 + }, + "Logical": { + "Average Score": 92.348, + "Standard Deviation": null, + "Rank": 6 + }, + "Social": { + "Average Score": 100.0, + "Standard Deviation": null, + "Rank": 1 + }, + "Probability": { + "Average Score": 74.233, + "Standard Deviation": null, + "Rank": 12 + } + } + }, + { + "config": { + "model_name": "llama-3.2-3b-it", + "organization": "Meta", + "license": "Llama 3 Community", + "knowledge_cutoff": "Unknown" + }, + "results": { + "OVERALL": { + "Average Score": 26.58569941, + "Standard Deviation": 4.191042423, + "Rank": 45 + }, + "Algebra": { + "Average Score": 56.545, + "Standard Deviation": null, + "Rank": 30 + }, + "Probability": { + "Average Score": 37.496, + "Standard Deviation": null, + "Rank": 36 + }, + "Logical": { + "Average Score": 15.188, + "Standard Deviation": null, + "Rank": 51 + }, + "Social": { + "Average Score": 15.924, + "Standard Deviation": null, + "Rank": 49 + }, + "Chemistry": { + "Average Score": 30.78, + "Standard Deviation": null, + "Rank": 45 + } + } + } +] \ No newline at end of file