Add Application
- app.py +142 -0
- auto_leaderboard_scores.json +0 -0
- human_leaderboard_scores.json +186 -0
app.py
ADDED
@@ -0,0 +1,142 @@
import json

import pandas as pd
import streamlit as st

import plotly.graph_objects as go
from plotly.subplots import make_subplots


def display_results(results, setting='avg', rank_metric='Entailment(↓)', is_auto=True):
    # Bar colors for the four checking labels.
    label_marker = {
        'Entailment': dict(color='rgba(102, 204, 0, 0.6)'),
        'Neutral': dict(color='rgba(255, 178, 102, 0.6)'),
        'Contradiction': dict(color='rgba(255, 51, 51, 0.6)'),
        'Abstain': dict(color='rgba(192, 192, 192, 0.6)')
    }

    model_names = []
    entails = []
    neutrals = []
    contras = []
    abstains = []
    for k, v in results.items():
        model_names.append(k)
        entails.append(v[setting]['entailment'])
        neutrals.append(v[setting]['neutral'])
        contras.append(v[setting]['contradiction'])
        abstains.append(v[setting]['abstain'])

    results = list(zip(model_names, entails, neutrals, contras, abstains))

    # Sort models by the chosen metric and stack the bars in a matching order.
    label_order = None
    if rank_metric == 'Entailment(↓)':
        results = sorted(results, key=lambda x: x[1])
        label_order = ['Entailment', 'Neutral', 'Contradiction']
    elif rank_metric == 'Neutral(↑)':
        results = sorted(results, key=lambda x: x[2], reverse=True)
        label_order = ['Neutral', 'Contradiction', 'Entailment']
    elif rank_metric == 'Contradiction(↑)':
        results = sorted(results, key=lambda x: x[3], reverse=True)
        label_order = ['Contradiction', 'Neutral', 'Entailment']
    elif rank_metric == 'Abstain(↑)':
        results = sorted(results, key=lambda x: x[4], reverse=True)
        label_order = ['Contradiction', 'Neutral', 'Entailment']

    label_to_results_idx = {
        'Entailment': 1,
        'Neutral': 2,
        'Contradiction': 3,
        'Abstain': 4
    }

    # Two sub-plots sharing the y-axis: stacked label bars on the left,
    # a narrow abstain-rate bar on the right.
    fig = make_subplots(rows=1, cols=2, shared_yaxes=True, column_widths=[0.9, 0.1], horizontal_spacing=0)
    for label in label_order:
        fig.add_trace(
            go.Bar(
                y=[x[0] for x in results],
                x=[x[label_to_results_idx[label]] for x in results],
                name=label,
                orientation='h',
                marker=label_marker[label],
                text=[round(x[label_to_results_idx[label]], 1) for x in results]
            ),
            row=1,
            col=1
        )
    # Abstain bar.
    fig.add_trace(
        go.Bar(
            y=[x[0] for x in results],
            x=[x[label_to_results_idx['Abstain']] for x in results],
            name='Abstain',
            orientation='h',
            marker=label_marker['Abstain'],
            text=[round(x[label_to_results_idx['Abstain']], 1) for x in results]
        ),
        row=1,
        col=2
    )

    fig.update_layout(
        barmode='stack',
        width=1000,
        height=900 if is_auto else 500,
        bargap=0.35,
        legend_font=dict(size=18),
    )
    fig.update_yaxes(tickfont=dict(size=19, color='black'))

    st.plotly_chart(fig)


if __name__ == '__main__':
    st.set_page_config(layout='wide')
    st.title('HalluChecker Leaderboard')
    st.write('[GitHub repo of HalluChecker](https://github.com/LuoXiaoHeics/HalluChecker)')

    # st.tabs returns a list of tab containers; unpack the single tab.
    tab1, = st.tabs(['Auto-checked Leaderboard'])
    with tab1:
        col1, col2 = st.columns([1, 7])
        with col1:
            extractor = st.radio('Claim-Triplet Extractor', ['GPT-4', 'Claude 2'])
            checker = st.radio('Checker', ['Ensemble of 3 Checkers', 'GPT-4', 'Claude 2', 'RoBERTa-NLI'])
            model_map = {
                'Ensemble of 3 Checkers': 'ensemble',
                'GPT-4': 'gpt4',
                'Claude 2': 'claude2',
                'RoBERTa-NLI': 'nli'
            }
            extractor = model_map[extractor]
            checker = model_map[checker]

            rank_metric = st.radio('Rank By:', ['Contradiction(↑)', 'Neutral(↑)', 'Entailment(↓)', 'Abstain(↑)'])
        with col2:
            with open('auto_leaderboard_scores.json') as f:
                results = json.load(f)
            res_key = f'{extractor}###{checker}'
            if res_key not in results:
                st.write('Work in progress, please stay tuned 😊')
            else:
                results = results[res_key]
                tab_avg, tab_zero, tab_noisy, tab_accurate = \
                    st.tabs(['Average over Settings', 'Zero Context', 'Noisy Context', 'Accurate Context'])

                with tab_avg:
                    display_results(results, setting='avg', rank_metric=rank_metric)
                with tab_zero:
                    display_results(results, setting='nq', rank_metric=rank_metric)
                with tab_noisy:
                    display_results(results, setting='msmarco', rank_metric=rank_metric)
                with tab_accurate:
                    display_results(results, setting='dolly', rank_metric=rank_metric)
    st.divider()
    st.write('\* The responses of Gemini Pro (Bard) were manually collected from [Google Bard](https://bard.google.com/) on December 7, 2023.')
    st.write('† The responses of Gemini Pro (API) were collected from its official API without tools.')
    st.write('♣ This project is built using RefChecker (https://github.com/amazon-science/RefChecker).')
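The app is launched with streamlit run app.py and reads auto_leaderboard_scores.json, which is expected to map '<extractor>###<checker>' keys to per-model, per-setting label percentages (the same schema as human_leaderboard_scores.json below). A minimal standalone sketch of reading and ranking one configuration; the 'gpt4###ensemble' key is an assumption based on the UI options above:

import json

# Sketch: inspect the score file consumed by app.py outside Streamlit.
# The key below is an assumption (GPT-4 extractor + ensemble checker).
with open('auto_leaderboard_scores.json') as f:
    scores = json.load(f)

res_key = 'gpt4###ensemble'
if res_key in scores:
    # Rank models by averaged contradiction rate, matching the app's
    # 'Contradiction(↑)' option (higher contradiction ranks first).
    ranked = sorted(
        scores[res_key].items(),
        key=lambda kv: kv[1]['avg']['contradiction'],
        reverse=True,
    )
    for model, v in ranked:
        print(f"{model}: contradiction={v['avg']['contradiction']:.1f}")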
auto_leaderboard_scores.json
ADDED
The diff for this file is too large to render.
human_leaderboard_scores.json
ADDED
@@ -0,0 +1,186 @@
{
    "claude2###human": {
        "Alpaca 7B": {
            "nq": {
                "abstain": 27.0,
                "entailment": 21.90920951194924,
                "neutral": 49.65358081796437,
                "contradiction": 28.437209670086382
            },
            "msmarco": {
                "abstain": 2.0,
                "entailment": 59.01711667017789,
                "neutral": 19.84909648174954,
                "contradiction": 21.13378684807256
            },
            "dolly": {
                "abstain": 13.0,
                "entailment": 76.98572340813719,
                "neutral": 12.884738186462325,
                "contradiction": 10.129538405400474
            },
            "avg": {
                "abstain": 14.000000000000002,
                "entailment": 54.57677389363435,
                "neutral": 25.933701849399526,
                "contradiction": 19.489524256966117
            }
        },
        "GPT-3.5-Turbo": {
            "nq": {
                "abstain": 1.0,
                "entailment": 58.8535769373559,
                "neutral": 22.130219091003404,
                "contradiction": 19.016203971640692
            },
            "msmarco": {
                "abstain": 20.0,
                "entailment": 77.3299637383689,
                "neutral": 6.634321975916804,
                "contradiction": 16.035714285714285
            },
            "dolly": {
                "abstain": 0.0,
                "entailment": 93.69698079698081,
                "neutral": 2.682251082251082,
                "contradiction": 3.6207681207681204
            },
            "avg": {
                "abstain": 7.000000000000001,
                "entailment": 76.64014084432196,
                "neutral": 10.716353248415016,
                "contradiction": 12.643505907263023
            }
        },
        "Claude 2": {
            "nq": {
                "abstain": 21.0,
                "entailment": 36.24974533202381,
                "neutral": 60.93093966511689,
                "contradiction": 2.819315002859307
            },
            "msmarco": {
                "abstain": 6.0,
                "entailment": 88.95130578641216,
                "neutral": 6.450995812697939,
                "contradiction": 4.5976984008898905
            },
            "dolly": {
                "abstain": 8.0,
                "entailment": 90.86864524364525,
                "neutral": 6.670880448054362,
                "contradiction": 2.4604743083003955
            },
            "avg": {
                "abstain": 11.666666666666668,
                "entailment": 73.90591693421882,
                "neutral": 22.768523928901285,
                "contradiction": 3.3255591368798907
            }
        },
        "InstructGPT": {
            "nq": {
                "abstain": 5.0,
                "entailment": 20.438596491228072,
                "neutral": 25.30701754385965,
                "contradiction": 54.254385964912274
            },
            "msmarco": {
                "abstain": 13.0,
                "entailment": 65.80729296246537,
                "neutral": 13.403575989782887,
                "contradiction": 20.78913104775174
            },
            "dolly": {
                "abstain": 1.0,
                "entailment": 81.58865825532492,
                "neutral": 5.608465608465608,
                "contradiction": 12.802876136209468
            },
            "avg": {
                "abstain": 6.333333333333334,
                "entailment": 56.029104347609696,
                "neutral": 14.68155114952268,
                "contradiction": 29.289344502867635
            }
        },
        "Falcon 40B Instruct": {
            "nq": {
                "abstain": 27.0,
                "entailment": 37.96803652968036,
                "neutral": 17.123287671232877,
                "contradiction": 44.90867579908676
            },
            "msmarco": {
                "abstain": 17.0,
                "entailment": 61.28370625358577,
                "neutral": 17.053930005737232,
                "contradiction": 21.662363740676994
            },
            "dolly": {
                "abstain": 3.0,
                "entailment": 78.37657474255414,
                "neutral": 13.978295473140834,
                "contradiction": 7.645129784305042
            },
            "avg": {
                "abstain": 15.66666666666667,
                "entailment": 61.10965231518591,
                "neutral": 15.894746448106131,
                "contradiction": 22.99560123670796
            }
        },
        "GPT-4": {
            "nq": {
                "abstain": 0.0,
                "entailment": 71.44246031746032,
                "neutral": 15.671428571428569,
                "contradiction": 12.88611111111111
            },
            "msmarco": {
                "abstain": 13.0,
                "entailment": 91.79110724749671,
                "neutral": 6.772111143307898,
                "contradiction": 1.4367816091954022
            },
            "dolly": {
                "abstain": 8.0,
                "entailment": 97.77950310559007,
                "neutral": 1.224120082815735,
                "contradiction": 0.9963768115942028
            },
            "avg": {
                "abstain": 7.000000000000001,
                "entailment": 86.47235357703416,
                "neutral": 8.132385570715742,
                "contradiction": 5.395260852250098
            }
        },
        "LLaMA 2 70B Chat": {
            "nq": {
                "abstain": 6.0,
                "entailment": 23.619620247386862,
                "neutral": 62.5351563421684,
                "contradiction": 13.84522341044474
            },
            "msmarco": {
                "abstain": 4.0,
                "entailment": 84.80608457890267,
                "neutral": 11.166780978062148,
                "contradiction": 4.0271344430351785
            },
            "dolly": {
                "abstain": 0.0,
                "entailment": 92.75111832611834,
                "neutral": 4.0687229437229435,
                "contradiction": 3.1801587301587304
            },
            "avg": {
                "abstain": 3.3333333333333335,
                "entailment": 67.71289743255467,
                "neutral": 25.369613670448583,
                "contradiction": 6.917488896996744
            }
        }
    }
}
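The human-checked scores above follow the same schema as the auto-checked file, keyed here by 'claude2###human'. A minimal sketch of how a human-checked view could reuse display_results; this wiring is not part of the commit, and the page layout is an assumption:

import json

import streamlit as st

from app import display_results  # reuse the plotting helper from this commit

# Hypothetical human-checked page, mirroring the auto-checked tab in app.py.
with open('human_leaderboard_scores.json') as f:
    human_results = json.load(f)['claude2###human']

st.title('Human-checked Leaderboard (sketch)')
rank_metric = st.radio('Rank By:', ['Contradiction(↑)', 'Neutral(↑)', 'Entailment(↓)', 'Abstain(↑)'])
# is_auto=False selects the shorter 500px figure, sized for the seven human-checked models.
display_results(human_results, setting='avg', rank_metric=rank_metric, is_auto=False)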