{
"claude2###human": {
"Alpaca 7B": {
"nq": {
"abstain": 27.0,
"entailment": 21.90920951194924,
"neutral": 49.65358081796437,
"contradiction": 28.437209670086382
},
"msmarco": {
"abstain": 2.0,
"entailment": 59.01711667017789,
"neutral": 19.84909648174954,
"contradiction": 21.13378684807256
},
"dolly": {
"abstain": 13.0,
"entailment": 76.98572340813719,
"neutral": 12.884738186462325,
"contradiction": 10.129538405400474
},
"avg": {
"abstain": 14.000000000000002,
"entailment": 54.57677389363435,
"neutral": 25.933701849399526,
"contradiction": 19.489524256966117
}
},
"GPT-3.5-Turbo": {
"nq": {
"abstain": 1.0,
"entailment": 58.8535769373559,
"neutral": 22.130219091003404,
"contradiction": 19.016203971640692
},
"msmarco": {
"abstain": 20.0,
"entailment": 77.3299637383689,
"neutral": 6.634321975916804,
"contradiction": 16.035714285714285
},
"dolly": {
"abstain": 0.0,
"entailment": 93.69698079698081,
"neutral": 2.682251082251082,
"contradiction": 3.6207681207681204
},
"avg": {
"abstain": 7.000000000000001,
"entailment": 76.64014084432196,
"neutral": 10.716353248415016,
"contradiction": 12.643505907263023
}
},
"Claude 2": {
"nq": {
"abstain": 21.0,
"entailment": 36.24974533202381,
"neutral": 60.93093966511689,
"contradiction": 2.819315002859307
},
"msmarco": {
"abstain": 6.0,
"entailment": 88.95130578641216,
"neutral": 6.450995812697939,
"contradiction": 4.5976984008898905
},
"dolly": {
"abstain": 8.0,
"entailment": 90.86864524364525,
"neutral": 6.670880448054362,
"contradiction": 2.4604743083003955
},
"avg": {
"abstain": 11.666666666666668,
"entailment": 73.90591693421882,
"neutral": 22.768523928901285,
"contradiction": 3.3255591368798907
}
},
"InstructGPT": {
"nq": {
"abstain": 5.0,
"entailment": 20.438596491228072,
"neutral": 25.30701754385965,
"contradiction": 54.254385964912274
},
"msmarco": {
"abstain": 13.0,
"entailment": 65.80729296246537,
"neutral": 13.403575989782887,
"contradiction": 20.78913104775174
},
"dolly": {
"abstain": 1.0,
"entailment": 81.58865825532492,
"neutral": 5.608465608465608,
"contradiction": 12.802876136209468
},
"avg": {
"abstain": 6.333333333333334,
"entailment": 56.029104347609696,
"neutral": 14.68155114952268,
"contradiction": 29.289344502867635
}
},
"Falcon 40B Instruct": {
"nq": {
"abstain": 27.0,
"entailment": 37.96803652968036,
"neutral": 17.123287671232877,
"contradiction": 44.90867579908676
},
"msmarco": {
"abstain": 17.0,
"entailment": 61.28370625358577,
"neutral": 17.053930005737232,
"contradiction": 21.662363740676994
},
"dolly": {
"abstain": 3.0,
"entailment": 78.37657474255414,
"neutral": 13.978295473140834,
"contradiction": 7.645129784305042
},
"avg": {
"abstain": 15.66666666666667,
"entailment": 61.10965231518591,
"neutral": 15.894746448106131,
"contradiction": 22.99560123670796
}
},
"GPT-4": {
"nq": {
"abstain": 0.0,
"entailment": 71.44246031746032,
"neutral": 15.671428571428569,
"contradiction": 12.88611111111111
},
"msmarco": {
"abstain": 13.0,
"entailment": 91.79110724749671,
"neutral": 6.772111143307898,
"contradiction": 1.4367816091954022
},
"dolly": {
"abstain": 8.0,
"entailment": 97.77950310559007,
"neutral": 1.224120082815735,
"contradiction": 0.9963768115942028
},
"avg": {
"abstain": 7.000000000000001,
"entailment": 86.47235357703416,
"neutral": 8.132385570715742,
"contradiction": 5.395260852250098
}
},
"LLaMA 2 70B Chat": {
"nq": {
"abstain": 6.0,
"entailment": 23.619620247386862,
"neutral": 62.5351563421684,
"contradiction": 13.84522341044474
},
"msmarco": {
"abstain": 4.0,
"entailment": 84.80608457890267,
"neutral": 11.166780978062148,
"contradiction": 4.0271344430351785
},
"dolly": {
"abstain": 0.0,
"entailment": 92.75111832611834,
"neutral": 4.0687229437229435,
"contradiction": 3.1801587301587304
},
"avg": {
"abstain": 3.3333333333333335,
"entailment": 67.71289743255467,
"neutral": 25.369613670448583,
"contradiction": 6.917488896996744
}
}
}
}