alielfilali01's picture
Update results.json with latest aggregated results.
2d6b85b verified
raw
history blame
43.4 kB
[
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.7026,
"Completeness": 0.7014,
"Conciseness": 0.1631,
"Helpfulness": 0.6784,
"Honesty": 0.6972,
"Harmlessness": 0.7026,
"3C3H Score": 0.6076
},
"Tasks Scores": {
"Question Answering (QA)": 0.7151,
"Reasoning": 0.64,
"Orthographic and Grammatical Analysis": 0.0887,
"Safety": 0.4729
}
},
"Meta": {
"Model Name": "CohereForAI/aya-expanse-32b",
"License": "cc-by-nc-4.0",
"Revision": "main",
"Precision": "float16",
"Params": 32.0,
"Total Entries": 279,
"Successful Entries": 278,
"Failed Entries": 1,
"Success Ratio": 0.9964
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.5612,
"Completeness": 0.5612,
"Conciseness": 0.1172,
"Helpfulness": 0.5468,
"Honesty": 0.5519,
"Harmlessness": 0.5594,
"3C3H Score": 0.4829
},
"Tasks Scores": {
"Question Answering (QA)": 0.5526,
"Reasoning": 0.5561,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.4271
}
},
"Meta": {
"Model Name": "CohereForAI/aya-expanse-8b",
"License": "cc-by-nc-4.0",
"Revision": "main",
"Precision": "float16",
"Params": 8.0,
"Total Entries": 279,
"Successful Entries": 278,
"Failed Entries": 1,
"Success Ratio": 0.9964
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.4648,
"Completeness": 0.46,
"Conciseness": 0.1251,
"Helpfulness": 0.4415,
"Honesty": 0.4495,
"Harmlessness": 0.4639,
"3C3H Score": 0.4008
},
"Tasks Scores": {
"Question Answering (QA)": 0.5056,
"Reasoning": 0.3817,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.2917
}
},
"Meta": {
"Model Name": "FreedomIntelligence/AceGPT-13B-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float16",
"Params": 13.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.4158,
"Completeness": 0.4158,
"Conciseness": 0.0941,
"Helpfulness": 0.3817,
"Honesty": 0.3934,
"Harmlessness": 0.4158,
"3C3H Score": 0.3527
},
"Tasks Scores": {
"Question Answering (QA)": 0.4017,
"Reasoning": 0.4367,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.2104
}
},
"Meta": {
"Model Name": "FreedomIntelligence/AceGPT-7B-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float16",
"Params": 7.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.5568,
"Completeness": 0.546,
"Conciseness": 0.2094,
"Helpfulness": 0.5302,
"Honesty": 0.5391,
"Harmlessness": 0.5568,
"3C3H Score": 0.4897
},
"Tasks Scores": {
"Question Answering (QA)": 0.6084,
"Reasoning": 0.4717,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.4083
}
},
"Meta": {
"Model Name": "FreedomIntelligence/AceGPT-v2-8B-Chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float16",
"Params": 8.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.1547,
"Completeness": 0.1439,
"Conciseness": 0.0369,
"Helpfulness": 0.116,
"Honesty": 0.1286,
"Harmlessness": 0.1538,
"3C3H Score": 0.1223
},
"Tasks Scores": {
"Question Answering (QA)": 0.1201,
"Reasoning": 0.1094,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.3771
}
},
"Meta": {
"Model Name": "Qwen/Qwen2.5-0.5B-Instruct",
"License": "apache-2.0",
"Revision": "main",
"Precision": "bfloat16",
"Params": 0.465,
"Total Entries": 279,
"Successful Entries": 278,
"Failed Entries": 1,
"Success Ratio": 0.9964
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.4468,
"Completeness": 0.4432,
"Conciseness": 0.1278,
"Helpfulness": 0.4179,
"Honesty": 0.4271,
"Harmlessness": 0.4459,
"3C3H Score": 0.3848
},
"Tasks Scores": {
"Question Answering (QA)": 0.3684,
"Reasoning": 0.4983,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.6812
}
},
"Meta": {
"Model Name": "Qwen/Qwen2.5-3B-Instruct",
"License": "apache-2.0",
"Revision": "main",
"Precision": "bfloat16",
"Params": 3.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.7192,
"Completeness": 0.718,
"Conciseness": 0.1906,
"Helpfulness": 0.6986,
"Honesty": 0.7094,
"Harmlessness": 0.7192,
"3C3H Score": 0.6258
},
"Tasks Scores": {
"Question Answering (QA)": 0.6677,
"Reasoning": 0.7594,
"Orthographic and Grammatical Analysis": 0.1075,
"Safety": 0.6083
}
},
"Meta": {
"Model Name": "Qwen/Qwen2.5-72B-Instruct",
"License": "qwen",
"Revision": "main",
"Precision": "bfloat16",
"Params": 72.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.6499,
"Completeness": 0.6487,
"Conciseness": 0.2016,
"Helpfulness": 0.6386,
"Honesty": 0.638,
"Harmlessness": 0.6499,
"3C3H Score": 0.5711
},
"Tasks Scores": {
"Question Answering (QA)": 0.6395,
"Reasoning": 0.6122,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.7792
}
},
"Meta": {
"Model Name": "google/gemma-2-27b-it",
"License": "gemma",
"Revision": "main",
"Precision": "bfloat16",
"Params": 27.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.589,
"Completeness": 0.589,
"Conciseness": 0.1834,
"Helpfulness": 0.5797,
"Honesty": 0.5744,
"Harmlessness": 0.589,
"3C3H Score": 0.5174
},
"Tasks Scores": {
"Question Answering (QA)": 0.5462,
"Reasoning": 0.6011,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.7854
}
},
"Meta": {
"Model Name": "google/gemma-2-9b-it",
"License": "gemma",
"Revision": "main",
"Precision": "bfloat16",
"Params": 9.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.5579,
"Completeness": 0.5544,
"Conciseness": 0.1682,
"Helpfulness": 0.5352,
"Honesty": 0.5436,
"Harmlessness": 0.5579,
"3C3H Score": 0.4862
},
"Tasks Scores": {
"Question Answering (QA)": 0.5925,
"Reasoning": 0.48,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.45
}
},
"Meta": {
"Model Name": "inceptionai/jais-adapted-13b-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float32",
"Params": 13.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.6679,
"Completeness": 0.6655,
"Conciseness": 0.1804,
"Helpfulness": 0.6326,
"Honesty": 0.652,
"Harmlessness": 0.6679,
"3C3H Score": 0.5777
},
"Tasks Scores": {
"Question Answering (QA)": 0.6864,
"Reasoning": 0.5711,
"Orthographic and Grammatical Analysis": 0.0578,
"Safety": 0.5771
}
},
"Meta": {
"Model Name": "inceptionai/jais-adapted-70b-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float32",
"Params": 70.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.5211,
"Completeness": 0.5102,
"Conciseness": 0.1339,
"Helpfulness": 0.4798,
"Honesty": 0.5093,
"Harmlessness": 0.5202,
"3C3H Score": 0.4457
},
"Tasks Scores": {
"Question Answering (QA)": 0.5144,
"Reasoning": 0.4844,
"Orthographic and Grammatical Analysis": 0.0269,
"Safety": 0.4312
}
},
"Meta": {
"Model Name": "inceptionai/jais-family-13b-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float32",
"Params": 13.0,
"Total Entries": 279,
"Successful Entries": 277,
"Failed Entries": 2,
"Success Ratio": 0.9928
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.3729,
"Completeness": 0.3669,
"Conciseness": 0.0887,
"Helpfulness": 0.3441,
"Honesty": 0.3543,
"Harmlessness": 0.3711,
"3C3H Score": 0.3163
},
"Tasks Scores": {
"Question Answering (QA)": 0.348,
"Reasoning": 0.3761,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.3417
}
},
"Meta": {
"Model Name": "inceptionai/jais-family-2p7b-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float32",
"Params": 3.0,
"Total Entries": 279,
"Successful Entries": 278,
"Failed Entries": 1,
"Success Ratio": 0.9964
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.5806,
"Completeness": 0.5759,
"Conciseness": 0.1526,
"Helpfulness": 0.5475,
"Honesty": 0.5621,
"Harmlessness": 0.5806,
"3C3H Score": 0.4999
},
"Tasks Scores": {
"Question Answering (QA)": 0.5812,
"Reasoning": 0.5239,
"Orthographic and Grammatical Analysis": 0.0282,
"Safety": 0.5187
}
},
"Meta": {
"Model Name": "inceptionai/jais-family-30b-8k-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float32",
"Params": 30.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.4755,
"Completeness": 0.4731,
"Conciseness": 0.1243,
"Helpfulness": 0.4522,
"Honesty": 0.4597,
"Harmlessness": 0.4755,
"3C3H Score": 0.41
},
"Tasks Scores": {
"Question Answering (QA)": 0.4743,
"Reasoning": 0.4633,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.3542
}
},
"Meta": {
"Model Name": "inceptionai/jais-family-6p7b-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float32",
"Params": 7.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.6392,
"Completeness": 0.6129,
"Conciseness": 0.27,
"Helpfulness": 0.6016,
"Honesty": 0.6171,
"Harmlessness": 0.6383,
"3C3H Score": 0.5632
},
"Tasks Scores": {
"Question Answering (QA)": 0.6465,
"Reasoning": 0.6283,
"Orthographic and Grammatical Analysis": 0.0591,
"Safety": 0.4625
}
},
"Meta": {
"Model Name": "meta-llama/Llama-3.1-70B-Instruct",
"License": "llama3.1",
"Revision": "main",
"Precision": "bfloat16",
"Params": 70.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.4421,
"Completeness": 0.4409,
"Conciseness": 0.1416,
"Helpfulness": 0.3967,
"Honesty": 0.4065,
"Harmlessness": 0.4421,
"3C3H Score": 0.3783
},
"Tasks Scores": {
"Question Answering (QA)": 0.3826,
"Reasoning": 0.45,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.6625
}
},
"Meta": {
"Model Name": "meta-llama/Llama-3.1-8B-Instruct",
"License": "llama3.1",
"Revision": "main",
"Precision": "bfloat16",
"Params": 8.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.2359,
"Completeness": 0.2058,
"Conciseness": 0.0581,
"Helpfulness": 0.1781,
"Honesty": 0.2106,
"Harmlessness": 0.2341,
"3C3H Score": 0.1871
},
"Tasks Scores": {
"Question Answering (QA)": 0.198,
"Reasoning": 0.2328,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.2229
}
},
"Meta": {
"Model Name": "meta-llama/Meta-Llama-3-8B-Instruct",
"License": "llama3",
"Revision": "main",
"Precision": "bfloat16",
"Params": 14.963,
"Total Entries": 279,
"Successful Entries": 277,
"Failed Entries": 2,
"Success Ratio": 0.9928
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.5204,
"Completeness": 0.1295,
"Conciseness": 0.4149,
"Helpfulness": 0.2332,
"Honesty": 0.4814,
"Harmlessness": 0.5204,
"3C3H Score": 0.3833
},
"Tasks Scores": {
"Question Answering (QA)": 0.4053,
"Reasoning": 0.3806,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.8188
}
},
"Meta": {
"Model Name": "silma-ai/SILMA-9B-Instruct-v1.0",
"License": "gemma",
"Revision": "main",
"Precision": "bfloat16",
"Params": 9.0,
"Total Entries": 279,
"Successful Entries": 278,
"Failed Entries": 1,
"Success Ratio": 0.9964
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.542,
"Completeness": 0.5156,
"Conciseness": 0.2512,
"Helpfulness": 0.5033,
"Honesty": 0.533,
"Harmlessness": 0.542,
"3C3H Score": 0.4812
},
"Tasks Scores": {
"Question Answering (QA)": 0.6009,
"Reasoning": 0.4825,
"Orthographic and Grammatical Analysis": 0.0309,
"Safety": 0.2583
}
},
"Meta": {
"Model Name": "CohereForAI/aya-23-35B",
"License": "cc-by-nc-4.0",
"Revision": "main",
"Precision": "float16",
"Params": 35.0,
"Total Entries": 279,
"Successful Entries": 278,
"Failed Entries": 1,
"Success Ratio": 0.9964
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.5878,
"Completeness": 0.5472,
"Conciseness": 0.1738,
"Helpfulness": 0.5594,
"Honesty": 0.5806,
"Harmlessness": 0.5833,
"3C3H Score": 0.5054
},
"Tasks Scores": {
"Question Answering (QA)": 0.6209,
"Reasoning": 0.5394,
"Orthographic and Grammatical Analysis": 0.0269,
"Safety": 0.2354
}
},
"Meta": {
"Model Name": "CohereForAI/c4ai-command-r-08-2024",
"License": "cc-by-nc-4.0",
"Revision": "main",
"Precision": "float16",
"Params": 32.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.6282,
"Completeness": 0.6221,
"Conciseness": 0.1733,
"Helpfulness": 0.5978,
"Honesty": 0.6119,
"Harmlessness": 0.6282,
"3C3H Score": 0.5436
},
"Tasks Scores": {
"Question Answering (QA)": 0.6891,
"Reasoning": 0.5333,
"Orthographic and Grammatical Analysis": 0.0264,
"Safety": 0.2521
}
},
"Meta": {
"Model Name": "CohereForAI/c4ai-command-r-v01",
"License": "cc-by-nc-4.0",
"Revision": "main",
"Precision": "float16",
"Params": 35.0,
"Total Entries": 279,
"Successful Entries": 277,
"Failed Entries": 2,
"Success Ratio": 0.9928
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.5297,
"Completeness": 0.4679,
"Conciseness": 0.2876,
"Helpfulness": 0.4694,
"Honesty": 0.5097,
"Harmlessness": 0.5297,
"3C3H Score": 0.4657
},
"Tasks Scores": {
"Question Answering (QA)": 0.5958,
"Reasoning": 0.4296,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.3171
}
},
"Meta": {
"Model Name": "FreedomIntelligence/AceGPT-v1.5-13B-Chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float32",
"Params": 13.0,
"Total Entries": 279,
"Successful Entries": 275,
"Failed Entries": 4,
"Success Ratio": 0.9857
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.6717,
"Completeness": 0.6642,
"Conciseness": 0.2906,
"Helpfulness": 0.6479,
"Honesty": 0.6657,
"Harmlessness": 0.6717,
"3C3H Score": 0.602
},
"Tasks Scores": {
"Question Answering (QA)": 0.7136,
"Reasoning": 0.5694,
"Orthographic and Grammatical Analysis": 0.0632,
"Safety": 0.75
}
},
"Meta": {
"Model Name": "FreedomIntelligence/AceGPT-v2-70B-Chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float16",
"Params": 70.0,
"Total Entries": 279,
"Successful Entries": 267,
"Failed Entries": 12,
"Success Ratio": 0.957
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.7103,
"Completeness": 0.7091,
"Conciseness": 0.1912,
"Helpfulness": 0.6888,
"Honesty": 0.7036,
"Harmlessness": 0.7103,
"3C3H Score": 0.6189
},
"Tasks Scores": {
"Question Answering (QA)": 0.6862,
"Reasoning": 0.7472,
"Orthographic and Grammatical Analysis": 0.0282,
"Safety": 0.5482
}
},
"Meta": {
"Model Name": "MaziyarPanahi/calme-2.2-qwen2.5-72b",
"License": "tongyi-qianwen",
"Revision": "main",
"Precision": "bfloat16",
"Params": 72.0,
"Total Entries": 279,
"Successful Entries": 275,
"Failed Entries": 4,
"Success Ratio": 0.9857
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.2848,
"Completeness": 0.2848,
"Conciseness": 0.088,
"Helpfulness": 0.2553,
"Honesty": 0.2531,
"Harmlessness": 0.2833,
"3C3H Score": 0.2416
},
"Tasks Scores": {
"Question Answering (QA)": 0.2384,
"Reasoning": 0.2723,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.5486
}
},
"Meta": {
"Model Name": "Qwen/Qwen2.5-1.5B-Instruct",
"License": "qwen",
"Revision": "main",
"Precision": "bfloat16",
"Params": 1.443,
"Total Entries": 279,
"Successful Entries": 268,
"Failed Entries": 11,
"Success Ratio": 0.9606
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.6146,
"Completeness": 0.6059,
"Conciseness": 0.1859,
"Helpfulness": 0.5914,
"Honesty": 0.5988,
"Harmlessness": 0.6146,
"3C3H Score": 0.5352
},
"Tasks Scores": {
"Question Answering (QA)": 0.566,
"Reasoning": 0.6684,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.6009
}
},
"Meta": {
"Model Name": "Qwen/Qwen2.5-14B-Instruct",
"License": "apache-2.0",
"Revision": "main",
"Precision": "bfloat16",
"Params": 14.0,
"Total Entries": 279,
"Successful Entries": 269,
"Failed Entries": 10,
"Success Ratio": 0.9642
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.8831,
"Completeness": 0.8781,
"Conciseness": 0.3327,
"Helpfulness": 0.8697,
"Honesty": 0.8778,
"Harmlessness": 0.8831,
"3C3H Score": 0.7874
},
"Tasks Scores": {
"Question Answering (QA)": 0.7896,
"Reasoning": 0.77,
"Orthographic and Grammatical Analysis": 0.7487,
"Safety": 0.9013
}
},
"Meta": {
"Model Name": "claude-3-5-sonnet-20241022",
"License": "Proprietary",
"Revision": "UNK",
"Precision": "UNK",
"Params": "UNK",
"Total Entries": 279,
"Successful Entries": 268,
"Failed Entries": 11,
"Success Ratio": 0.9606
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.6389,
"Completeness": 0.6377,
"Conciseness": 0.1938,
"Helpfulness": 0.6162,
"Honesty": 0.6316,
"Harmlessness": 0.6389,
"3C3H Score": 0.5595
},
"Tasks Scores": {
"Question Answering (QA)": 0.6376,
"Reasoning": 0.5767,
"Orthographic and Grammatical Analysis": 0.0591,
"Safety": 0.6854
}
},
"Meta": {
"Model Name": "claude-3-haiku-20240307",
"License": "Proprietary",
"Revision": "UNK",
"Precision": "UNK",
"Params": "UNK",
"Total Entries": 279,
"Successful Entries": 276,
"Failed Entries": 3,
"Success Ratio": 0.9892
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.2603,
"Completeness": 0.2311,
"Conciseness": 0.0721,
"Helpfulness": 0.2132,
"Honesty": 0.2476,
"Harmlessness": 0.2594,
"3C3H Score": 0.214
},
"Tasks Scores": {
"Question Answering (QA)": 0.224,
"Reasoning": 0.2934,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.1771
}
},
"Meta": {
"Model Name": "meta-llama/Meta-Llama-3-70B-Instruct",
"License": "llama3",
"Revision": "main",
"Precision": "bfloat16",
"Params": 70.0,
"Total Entries": 279,
"Successful Entries": 274,
"Failed Entries": 5,
"Success Ratio": 0.9821
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.721,
"Completeness": 0.7138,
"Conciseness": 0.2298,
"Helpfulness": 0.7041,
"Honesty": 0.7141,
"Harmlessness": 0.721,
"3C3H Score": 0.634
},
"Tasks Scores": {
"Question Answering (QA)": 0.6923,
"Reasoning": 0.7312,
"Orthographic and Grammatical Analysis": 0.1909,
"Safety": 0.5229
}
},
"Meta": {
"Model Name": "gpt-4o-mini",
"License": "Proprietary",
"Revision": "UNK",
"Precision": "UNK",
"Params": "UNK",
"Total Entries": 279,
"Successful Entries": 276,
"Failed Entries": 3,
"Success Ratio": 0.9892
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.8375,
"Completeness": 0.8291,
"Conciseness": 0.2894,
"Helpfulness": 0.8099,
"Honesty": 0.83,
"Harmlessness": 0.8375,
"3C3H Score": 0.7389
},
"Tasks Scores": {
"Question Answering (QA)": 0.8014,
"Reasoning": 0.7455,
"Orthographic and Grammatical Analysis": 0.5027,
"Safety": 0.6063
}
},
"Meta": {
"Model Name": "gpt-4o",
"License": "Proprietary",
"Revision": "UNK",
"Precision": "UNK",
"Params": "UNK",
"Total Entries": 279,
"Successful Entries": 277,
"Failed Entries": 2,
"Success Ratio": 0.9928
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.7194,
"Completeness": 0.7181,
"Conciseness": 0.1927,
"Helpfulness": 0.6921,
"Honesty": 0.7099,
"Harmlessness": 0.7194,
"3C3H Score": 0.6253
},
"Tasks Scores": {
"Question Answering (QA)": 0.6611,
"Reasoning": 0.7922,
"Orthographic and Grammatical Analysis": 0.0736,
"Safety": 0.5741
}
},
"Meta": {
"Model Name": "rombodawg/Rombos-LLM-V2.5-Qwen-72b",
"License": "qwen",
"Revision": "main",
"Precision": "bfloat16",
"Params": 72.0,
"Total Entries": 279,
"Successful Entries": 272,
"Failed Entries": 7,
"Success Ratio": 0.9749
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.7121,
"Completeness": 0.7097,
"Conciseness": 0.1876,
"Helpfulness": 0.6882,
"Honesty": 0.6968,
"Harmlessness": 0.7121,
"3C3H Score": 0.6177
},
"Tasks Scores": {
"Question Answering (QA)": 0.6815,
"Reasoning": 0.7567,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.5667
}
},
"Meta": {
"Model Name": "MaziyarPanahi/calme-2.1-qwen2.5-72b",
"License": "tongyi-qianwen",
"Revision": "main",
"Precision": "bfloat16",
"Params": 72.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.3285,
"Completeness": 0.3225,
"Conciseness": 0.0869,
"Helpfulness": 0.2987,
"Honesty": 0.3081,
"Harmlessness": 0.3279,
"3C3H Score": 0.2788
},
"Tasks Scores": {
"Question Answering (QA)": 0.2945,
"Reasoning": 0.3667,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.2625
}
},
"Meta": {
"Model Name": "inceptionai/jais-family-1p3b-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float32",
"Params": 1.0,
"Total Entries": 279,
"Successful Entries": 277,
"Failed Entries": 2,
"Success Ratio": 0.9928
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.5695,
"Completeness": 0.5624,
"Conciseness": 0.1577,
"Helpfulness": 0.5312,
"Honesty": 0.554,
"Harmlessness": 0.5695,
"3C3H Score": 0.4907
},
"Tasks Scores": {
"Question Answering (QA)": 0.5702,
"Reasoning": 0.5139,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.5604
}
},
"Meta": {
"Model Name": "inceptionai/jais-family-30b-16k-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float32",
"Params": 30.0,
"Total Entries": 279,
"Successful Entries": 278,
"Failed Entries": 1,
"Success Ratio": 0.9964
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.1966,
"Completeness": 0.1535,
"Conciseness": 0.0285,
"Helpfulness": 0.1196,
"Honesty": 0.1643,
"Harmlessness": 0.1957,
"3C3H Score": 0.143
},
"Tasks Scores": {
"Question Answering (QA)": 0.1577,
"Reasoning": 0.1872,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.0875
}
},
"Meta": {
"Model Name": "inceptionai/jais-family-590m-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float32",
"Params": 0.719,
"Total Entries": 279,
"Successful Entries": 278,
"Failed Entries": 1,
"Success Ratio": 0.9964
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.0791,
"Completeness": 0.0504,
"Conciseness": 0.0216,
"Helpfulness": 0.0414,
"Honesty": 0.0549,
"Harmlessness": 0.0755,
"3C3H Score": 0.0538
},
"Tasks Scores": {
"Question Answering (QA)": 0.0293,
"Reasoning": 0.0756,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.2417
}
},
"Meta": {
"Model Name": "meta-llama/Llama-3.2-1B-Instruct",
"License": "llama3.2",
"Revision": "main",
"Precision": "bfloat16",
"Params": 1.0,
"Total Entries": 279,
"Successful Entries": 278,
"Failed Entries": 1,
"Success Ratio": 0.9964
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.2736,
"Completeness": 0.2616,
"Conciseness": 0.0792,
"Helpfulness": 0.1971,
"Honesty": 0.2315,
"Harmlessness": 0.2727,
"3C3H Score": 0.2193
},
"Tasks Scores": {
"Question Answering (QA)": 0.2133,
"Reasoning": 0.28,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.3771
}
},
"Meta": {
"Model Name": "meta-llama/Llama-3.2-3B-Instruct",
"License": "llama3.2",
"Revision": "main",
"Precision": "bfloat16",
"Params": 3.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.6296,
"Completeness": 0.6165,
"Conciseness": 0.2258,
"Helpfulness": 0.5923,
"Honesty": 0.6123,
"Harmlessness": 0.6296,
"3C3H Score": 0.551
},
"Tasks Scores": {
"Question Answering (QA)": 0.6538,
"Reasoning": 0.6033,
"Orthographic and Grammatical Analysis": 0.0309,
"Safety": 0.375
}
},
"Meta": {
"Model Name": "meta-llama/Llama-3.2-90B-Vision-Instruct",
"License": "llama3.2",
"Revision": "main",
"Precision": "bfloat16",
"Params": 90.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.6858,
"Completeness": 0.6511,
"Conciseness": 0.345,
"Helpfulness": 0.635,
"Honesty": 0.6747,
"Harmlessness": 0.6858,
"3C3H Score": 0.6129
},
"Tasks Scores": {
"Question Answering (QA)": 0.7062,
"Reasoning": 0.6394,
"Orthographic and Grammatical Analysis": 0.0215,
"Safety": 0.7167
}
},
"Meta": {
"Model Name": "meta-llama/Llama-3.3-70B-Instruct",
"License": "llama3.3",
"Revision": "main",
"Precision": "bfloat16",
"Params": 70.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.3321,
"Completeness": 0.1434,
"Conciseness": 0.0403,
"Helpfulness": 0.1359,
"Honesty": 0.2631,
"Harmlessness": 0.3295,
"3C3H Score": 0.2074
},
"Tasks Scores": {
"Question Answering (QA)": 0.2891,
"Reasoning": 0.1744,
"Orthographic and Grammatical Analysis": 0.0175,
"Safety": 0.0
}
},
"Meta": {
"Model Name": "stabilityai/ar-stablelm-2-chat",
"License": "other",
"Revision": "main",
"Precision": "float32",
"Params": 2.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.5317,
"Completeness": 0.4875,
"Conciseness": 0.1711,
"Helpfulness": 0.4271,
"Honesty": 0.4904,
"Harmlessness": 0.5317,
"3C3H Score": 0.4399
},
"Tasks Scores": {
"Question Answering (QA)": 0.4885,
"Reasoning": 0.4211,
"Orthographic and Grammatical Analysis": 0.0323,
"Safety": 0.7708
}
},
"Meta": {
"Model Name": "utter-project/EuroLLM-9B-Instruct",
"License": "apache-2.0",
"Revision": "main",
"Precision": "bfloat16",
"Params": 9.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.6619,
"Completeness": 0.6356,
"Conciseness": 0.1938,
"Helpfulness": 0.6353,
"Honesty": 0.6526,
"Harmlessness": 0.661,
"3C3H Score": 0.5734
},
"Tasks Scores": {
"Question Answering (QA)": 0.7327,
"Reasoning": 0.5506,
"Orthographic and Grammatical Analysis": 0.0538,
"Safety": 0.2458
}
},
"Meta": {
"Model Name": "CohereForAI/c4ai-command-r-plus-08-2024",
"License": "cc-by-nc-4.0",
"Revision": "main",
"Precision": "float16",
"Params": 104.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"_last_sync_timestamp": "2024-12-15T21:20:51.136159"
}
]