alielfilali01's picture
Create assets/results/results.json
df97369 verified
raw
history blame
19.2 kB
[
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.7026,
"Completeness": 0.7014,
"Conciseness": 0.1631,
"Helpfulness": 0.6784,
"Honesty": 0.6972,
"Harmlessness": 0.7026,
"3C3H Score": 0.6076
},
"Tasks Scores": {
"Question Answering (QA)": 0.7151,
"Reasoning": 0.64,
"Orthographic and Grammatical Analysis": 0.0887,
"Safety": 0.4729
}
},
"Meta": {
"Model Name": "CohereForAI/aya-expanse-32b",
"License": "cc-by-nc-4.0",
"Revision": "main",
"Precision": "float16",
"Params": 32.0,
"Total Entries": 279,
"Successful Entries": 278,
"Failed Entries": 1,
"Success Ratio": 0.9964
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.5612,
"Completeness": 0.5612,
"Conciseness": 0.1172,
"Helpfulness": 0.5468,
"Honesty": 0.5519,
"Harmlessness": 0.5594,
"3C3H Score": 0.4829
},
"Tasks Scores": {
"Question Answering (QA)": 0.5526,
"Reasoning": 0.5561,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.4271
}
},
"Meta": {
"Model Name": "CohereForAI/aya-expanse-8b",
"License": "cc-by-nc-4.0",
"Revision": "main",
"Precision": "float16",
"Params": 8.0,
"Total Entries": 279,
"Successful Entries": 278,
"Failed Entries": 1,
"Success Ratio": 0.9964
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.4648,
"Completeness": 0.46,
"Conciseness": 0.1251,
"Helpfulness": 0.4415,
"Honesty": 0.4495,
"Harmlessness": 0.4639,
"3C3H Score": 0.4008
},
"Tasks Scores": {
"Question Answering (QA)": 0.5056,
"Reasoning": 0.3817,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.2917
}
},
"Meta": {
"Model Name": "FreedomIntelligence/AceGPT-13B-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float16",
"Params": 13.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.4158,
"Completeness": 0.4158,
"Conciseness": 0.0941,
"Helpfulness": 0.3817,
"Honesty": 0.3934,
"Harmlessness": 0.4158,
"3C3H Score": 0.3527
},
"Tasks Scores": {
"Question Answering (QA)": 0.4017,
"Reasoning": 0.4367,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.2104
}
},
"Meta": {
"Model Name": "FreedomIntelligence/AceGPT-7B-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float16",
"Params": 7.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.5568,
"Completeness": 0.546,
"Conciseness": 0.2094,
"Helpfulness": 0.5302,
"Honesty": 0.5391,
"Harmlessness": 0.5568,
"3C3H Score": 0.4897
},
"Tasks Scores": {
"Question Answering (QA)": 0.6084,
"Reasoning": 0.4717,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.4083
}
},
"Meta": {
"Model Name": "FreedomIntelligence/AceGPT-v2-8B-Chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float16",
"Params": 8.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.1547,
"Completeness": 0.1439,
"Conciseness": 0.0369,
"Helpfulness": 0.116,
"Honesty": 0.1286,
"Harmlessness": 0.1538,
"3C3H Score": 0.1223
},
"Tasks Scores": {
"Question Answering (QA)": 0.1201,
"Reasoning": 0.1094,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.3771
}
},
"Meta": {
"Model Name": "Qwen/Qwen2.5-0.5B-Instruct",
"License": "apache-2.0",
"Revision": "main",
"Precision": "bfloat16",
"Params": 0.465,
"Total Entries": 279,
"Successful Entries": 278,
"Failed Entries": 1,
"Success Ratio": 0.9964
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.4468,
"Completeness": 0.4432,
"Conciseness": 0.1278,
"Helpfulness": 0.4179,
"Honesty": 0.4271,
"Harmlessness": 0.4459,
"3C3H Score": 0.3848
},
"Tasks Scores": {
"Question Answering (QA)": 0.3684,
"Reasoning": 0.4983,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.6812
}
},
"Meta": {
"Model Name": "Qwen/Qwen2.5-3B-Instruct",
"License": "apache-2.0",
"Revision": "main",
"Precision": "bfloat16",
"Params": 3.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.7192,
"Completeness": 0.718,
"Conciseness": 0.1906,
"Helpfulness": 0.6986,
"Honesty": 0.7094,
"Harmlessness": 0.7192,
"3C3H Score": 0.6258
},
"Tasks Scores": {
"Question Answering (QA)": 0.6677,
"Reasoning": 0.7594,
"Orthographic and Grammatical Analysis": 0.1075,
"Safety": 0.6083
}
},
"Meta": {
"Model Name": "Qwen/Qwen2.5-72B-Instruct",
"License": "qwen",
"Revision": "main",
"Precision": "bfloat16",
"Params": 72.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.6499,
"Completeness": 0.6487,
"Conciseness": 0.2016,
"Helpfulness": 0.6386,
"Honesty": 0.638,
"Harmlessness": 0.6499,
"3C3H Score": 0.5711
},
"Tasks Scores": {
"Question Answering (QA)": 0.6395,
"Reasoning": 0.6122,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.7792
}
},
"Meta": {
"Model Name": "google/gemma-2-27b-it",
"License": "gemma",
"Revision": "main",
"Precision": "bfloat16",
"Params": 27.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.589,
"Completeness": 0.589,
"Conciseness": 0.1834,
"Helpfulness": 0.5797,
"Honesty": 0.5744,
"Harmlessness": 0.589,
"3C3H Score": 0.5174
},
"Tasks Scores": {
"Question Answering (QA)": 0.5462,
"Reasoning": 0.6011,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.7854
}
},
"Meta": {
"Model Name": "google/gemma-2-9b-it",
"License": "gemma",
"Revision": "main",
"Precision": "bfloat16",
"Params": 9.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.5579,
"Completeness": 0.5544,
"Conciseness": 0.1682,
"Helpfulness": 0.5352,
"Honesty": 0.5436,
"Harmlessness": 0.5579,
"3C3H Score": 0.4862
},
"Tasks Scores": {
"Question Answering (QA)": 0.5925,
"Reasoning": 0.48,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.45
}
},
"Meta": {
"Model Name": "inceptionai/jais-adapted-13b-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float32",
"Params": 13.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.6679,
"Completeness": 0.6655,
"Conciseness": 0.1804,
"Helpfulness": 0.6326,
"Honesty": 0.652,
"Harmlessness": 0.6679,
"3C3H Score": 0.5777
},
"Tasks Scores": {
"Question Answering (QA)": 0.6864,
"Reasoning": 0.5711,
"Orthographic and Grammatical Analysis": 0.0578,
"Safety": 0.5771
}
},
"Meta": {
"Model Name": "inceptionai/jais-adapted-70b-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float32",
"Params": 70.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.5211,
"Completeness": 0.5102,
"Conciseness": 0.1339,
"Helpfulness": 0.4798,
"Honesty": 0.5093,
"Harmlessness": 0.5202,
"3C3H Score": 0.4457
},
"Tasks Scores": {
"Question Answering (QA)": 0.5144,
"Reasoning": 0.4844,
"Orthographic and Grammatical Analysis": 0.0269,
"Safety": 0.4312
}
},
"Meta": {
"Model Name": "inceptionai/jais-family-13b-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float32",
"Params": 13.0,
"Total Entries": 279,
"Successful Entries": 277,
"Failed Entries": 2,
"Success Ratio": 0.9928
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.3729,
"Completeness": 0.3669,
"Conciseness": 0.0887,
"Helpfulness": 0.3441,
"Honesty": 0.3543,
"Harmlessness": 0.3711,
"3C3H Score": 0.3163
},
"Tasks Scores": {
"Question Answering (QA)": 0.348,
"Reasoning": 0.3761,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.3417
}
},
"Meta": {
"Model Name": "inceptionai/jais-family-2p7b-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float32",
"Params": 3.0,
"Total Entries": 279,
"Successful Entries": 278,
"Failed Entries": 1,
"Success Ratio": 0.9964
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.5806,
"Completeness": 0.5759,
"Conciseness": 0.1526,
"Helpfulness": 0.5475,
"Honesty": 0.5621,
"Harmlessness": 0.5806,
"3C3H Score": 0.4999
},
"Tasks Scores": {
"Question Answering (QA)": 0.5812,
"Reasoning": 0.5239,
"Orthographic and Grammatical Analysis": 0.0282,
"Safety": 0.5187
}
},
"Meta": {
"Model Name": "inceptionai/jais-family-30b-8k-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float32",
"Params": 30.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.4755,
"Completeness": 0.4731,
"Conciseness": 0.1243,
"Helpfulness": 0.4522,
"Honesty": 0.4597,
"Harmlessness": 0.4755,
"3C3H Score": 0.41
},
"Tasks Scores": {
"Question Answering (QA)": 0.4743,
"Reasoning": 0.4633,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.3542
}
},
"Meta": {
"Model Name": "inceptionai/jais-family-6p7b-chat",
"License": "apache-2.0",
"Revision": "main",
"Precision": "float32",
"Params": 7.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.6392,
"Completeness": 0.6129,
"Conciseness": 0.27,
"Helpfulness": 0.6016,
"Honesty": 0.6171,
"Harmlessness": 0.6383,
"3C3H Score": 0.5632
},
"Tasks Scores": {
"Question Answering (QA)": 0.6465,
"Reasoning": 0.6283,
"Orthographic and Grammatical Analysis": 0.0591,
"Safety": 0.4625
}
},
"Meta": {
"Model Name": "meta-llama/Llama-3.1-70B-Instruct",
"License": "llama3.1",
"Revision": "main",
"Precision": "bfloat16",
"Params": 70.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.4421,
"Completeness": 0.4409,
"Conciseness": 0.1416,
"Helpfulness": 0.3967,
"Honesty": 0.4065,
"Harmlessness": 0.4421,
"3C3H Score": 0.3783
},
"Tasks Scores": {
"Question Answering (QA)": 0.3826,
"Reasoning": 0.45,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.6625
}
},
"Meta": {
"Model Name": "meta-llama/Llama-3.1-8B-Instruct",
"License": "llama3.1",
"Revision": "main",
"Precision": "bfloat16",
"Params": 8.0,
"Total Entries": 279,
"Successful Entries": 279,
"Failed Entries": 0,
"Success Ratio": 1.0
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.2359,
"Completeness": 0.2058,
"Conciseness": 0.0581,
"Helpfulness": 0.1781,
"Honesty": 0.2106,
"Harmlessness": 0.2341,
"3C3H Score": 0.1871
},
"Tasks Scores": {
"Question Answering (QA)": 0.198,
"Reasoning": 0.2328,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.2229
}
},
"Meta": {
"Model Name": "meta-llama/Meta-Llama-3-8B-Instruct",
"License": "llama3",
"Revision": "main",
"Precision": "bfloat16",
"Params": 14.963,
"Total Entries": 279,
"Successful Entries": 277,
"Failed Entries": 2,
"Success Ratio": 0.9928
}
},
{
"claude-3.5-sonnet Scores": {
"3C3H Scores": {
"Correctness": 0.5204,
"Completeness": 0.1295,
"Conciseness": 0.4149,
"Helpfulness": 0.2332,
"Honesty": 0.4814,
"Harmlessness": 0.5204,
"3C3H Score": 0.3833
},
"Tasks Scores": {
"Question Answering (QA)": 0.4053,
"Reasoning": 0.3806,
"Orthographic and Grammatical Analysis": 0.0,
"Safety": 0.8188
}
},
"Meta": {
"Model Name": "silma-ai/SILMA-9B-Instruct-v1.0",
"License": "gemma",
"Revision": "main",
"Precision": "bfloat16",
"Params": 9.0,
"Total Entries": 279,
"Successful Entries": 278,
"Failed Entries": 1,
"Success Ratio": 0.9964
}
}
]