ramiroluo committed
Commit 91cf45c
1 Parent(s): c1e3607

Add Application

app.py ADDED
@@ -0,0 +1,142 @@
+ import json
+
+ import streamlit as st
+
+ import plotly.graph_objects as go
+ from plotly.subplots import make_subplots
+
+
+ def display_results(results, setting='avg', rank_metric='Entailment(↓)', is_auto=True):
+     label_marker = {
+         'Entailment': dict(color='rgba(102, 204, 0, 0.6)'),
+         'Neutral': dict(color='rgba(255, 178, 102, 0.6)'),
+         'Contradiction': dict(color='rgba(255, 51, 51, 0.6)'),
+         'Abstain': dict(color='rgba(192, 192, 192, 0.6)')
+     }
+
+     # Collect per-model scores for the selected setting.
+     model_names = []
+     entails = []
+     neutrals = []
+     contras = []
+     abstains = []
+     for k, v in results.items():
+         model_names.append(k)
+         entails.append(v[setting]['entailment'])
+         neutrals.append(v[setting]['neutral'])
+         contras.append(v[setting]['contradiction'])
+         abstains.append(v[setting]['abstain'])
+
+     results = list(zip(model_names, entails, neutrals, contras, abstains))
+     label_order = None
+     if rank_metric == 'Entailment(↓)':
+         results = sorted(results, key=lambda x: x[1])
+         label_order = ['Entailment', 'Neutral', 'Contradiction']
+     elif rank_metric == 'Neutral(↑)':
+         results = sorted(results, key=lambda x: x[2], reverse=True)
+         label_order = ['Neutral', 'Contradiction', 'Entailment']
+     elif rank_metric == 'Contradiction(↑)':
+         results = sorted(results, key=lambda x: x[3], reverse=True)
+         label_order = ['Contradiction', 'Neutral', 'Entailment']
+     elif rank_metric == 'Abstain(↑)':
+         results = sorted(results, key=lambda x: x[4], reverse=True)
+         label_order = ['Contradiction', 'Neutral', 'Entailment']
+
+     label_to_results_idx = {
+         'Entailment': 1,
+         'Neutral': 2,
+         'Contradiction': 3,
+         'Abstain': 4
+     }
+
+     # Two columns sharing the y-axis: stacked NLI-label bars on the left,
+     # a narrow abstain bar on the right.
+     fig = make_subplots(rows=1, cols=2, shared_yaxes=True, column_widths=[0.9, 0.1], horizontal_spacing=0)
+     for label in label_order:
+         fig.add_trace(
+             go.Bar(
+                 y=[x[0] for x in results],
+                 x=[x[label_to_results_idx[label]] for x in results],
+                 name=label,
+                 orientation='h',
+                 marker=label_marker[label],
+                 text=[round(x[label_to_results_idx[label]], 1) for x in results]
+             ),
+             row=1,
+             col=1
+         )
+     # Abstain bar
+     fig.add_trace(
+         go.Bar(
+             y=[x[0] for x in results],
+             x=[x[label_to_results_idx['Abstain']] for x in results],
+             name='Abstain',
+             orientation='h',
+             marker=label_marker['Abstain'],
+             text=[round(x[label_to_results_idx['Abstain']], 1) for x in results]
+         ),
+         row=1,
+         col=2
+     )
+
+     fig.update_layout(
+         barmode='stack',
+         width=1000,
+         height=900 if is_auto else 500,
+         bargap=0.35,
+         legend_font=dict(size=18),
+     )
+     fig.update_yaxes(tickfont=dict(size=19, color='black'))
+
+     st.plotly_chart(fig)
+
+
+ if __name__ == '__main__':
+     st.set_page_config(layout='wide')
+     st.title('HalluChecker Leaderboard')
+     st.write('[GitHub repo of HalluChecker](https://github.com/LuoXiaoHeics/HalluChecker)')
+
+     # st.tabs returns a list of containers, so unpack the single tab.
+     tab1, = st.tabs(['Auto-checked Leaderboard'])
+     with tab1:
+         col1, col2 = st.columns([1, 7])
+         with col1:
+             extractor = st.radio('Claim-Triplet Extractor', ['GPT-4', 'Claude 2'])
+             checker = st.radio('Checker', ['Ensemble of 3 Checkers', 'GPT-4', 'Claude 2', 'RoBERTa-NLI'])
+             model_map = {
+                 'Ensemble of 3 Checkers': 'ensemble',
+                 'GPT-4': 'gpt4',
+                 'Claude 2': 'claude2',
+                 'RoBERTa-NLI': 'nli'
+             }
+             extractor = model_map[extractor]
+             checker = model_map[checker]
+
+             rank_metric = st.radio('Rank By:', ['Contradiction(↑)', 'Neutral(↑)', 'Entailment(↓)', 'Abstain(↑)'])
+         with col2:
+             with open('auto_leaderboard_scores.json') as f:
+                 results = json.load(f)
+             res_key = f'{extractor}###{checker}'
+             if res_key not in results:
+                 st.write('Work in progress, please stay tuned 😊')
+             else:
+                 results = results[res_key]
+                 tab_avg, tab_zero, tab_noisy, tab_accurate = \
+                     st.tabs(['Average over Settings', 'Zero Context', 'Noisy Context', 'Accurate Context'])
+
+                 with tab_avg:
+                     display_results(results, setting='avg', rank_metric=rank_metric)
+                 with tab_zero:
+                     display_results(results, setting='nq', rank_metric=rank_metric)
+                 with tab_noisy:
+                     display_results(results, setting='msmarco', rank_metric=rank_metric)
+                 with tab_accurate:
+                     display_results(results, setting='dolly', rank_metric=rank_metric)
+     st.divider()
+     st.write('\* The responses of Gemini Pro (Bard) were manually collected from [Google Bard](https://bard.google.com/) on December 7, 2023.')
+     st.write('† The responses of Gemini Pro (API) were collected from its official API without tools.')
+     st.write('♣ Our project is built on RefChecker (https://github.com/amazon-science/RefChecker).')
+
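The app reads auto_leaderboard_scores.json, a JSON object keyed by f'{extractor}###{checker}' with one block per model and per-setting percentage scores; the human-checked file below follows the same layout. A minimal sketch of the expected shape, where the key, the model name, and all numbers are illustrative placeholders rather than real leaderboard scores:

import json

# Hypothetical score-file entry matching the schema app.py consumes.
# Every name and value below is made up for illustration only.
example = {
    'gpt4###ensemble': {              # f'{extractor}###{checker}'
        'SomeModel 7B': {             # one block per evaluated model
            setting: {
                'abstain': 10.0,      # abstain rate, in percent
                'entailment': 60.0,   # claim-level label ratios, in percent
                'neutral': 25.0,
                'contradiction': 15.0,
            }
            for setting in ('nq', 'msmarco', 'dolly', 'avg')
        }
    }
}
print(json.dumps(example, indent=2))

With a file of this shape next to app.py, the page can be previewed locally with 'streamlit run app.py'.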
auto_leaderboard_scores.json ADDED
The diff for this file is too large to render. See raw diff
 
human_leaderboard_scores.json ADDED
@@ -0,0 +1,186 @@
+ {
+   "claude2###human": {
+     "Alpaca 7B": {
+       "nq": {
+         "abstain": 27.0,
+         "entailment": 21.90920951194924,
+         "neutral": 49.65358081796437,
+         "contradiction": 28.437209670086382
+       },
+       "msmarco": {
+         "abstain": 2.0,
+         "entailment": 59.01711667017789,
+         "neutral": 19.84909648174954,
+         "contradiction": 21.13378684807256
+       },
+       "dolly": {
+         "abstain": 13.0,
+         "entailment": 76.98572340813719,
+         "neutral": 12.884738186462325,
+         "contradiction": 10.129538405400474
+       },
+       "avg": {
+         "abstain": 14.000000000000002,
+         "entailment": 54.57677389363435,
+         "neutral": 25.933701849399526,
+         "contradiction": 19.489524256966117
+       }
+     },
+     "GPT-3.5-Turbo": {
+       "nq": {
+         "abstain": 1.0,
+         "entailment": 58.8535769373559,
+         "neutral": 22.130219091003404,
+         "contradiction": 19.016203971640692
+       },
+       "msmarco": {
+         "abstain": 20.0,
+         "entailment": 77.3299637383689,
+         "neutral": 6.634321975916804,
+         "contradiction": 16.035714285714285
+       },
+       "dolly": {
+         "abstain": 0.0,
+         "entailment": 93.69698079698081,
+         "neutral": 2.682251082251082,
+         "contradiction": 3.6207681207681204
+       },
+       "avg": {
+         "abstain": 7.000000000000001,
+         "entailment": 76.64014084432196,
+         "neutral": 10.716353248415016,
+         "contradiction": 12.643505907263023
+       }
+     },
+     "Claude 2": {
+       "nq": {
+         "abstain": 21.0,
+         "entailment": 36.24974533202381,
+         "neutral": 60.93093966511689,
+         "contradiction": 2.819315002859307
+       },
+       "msmarco": {
+         "abstain": 6.0,
+         "entailment": 88.95130578641216,
+         "neutral": 6.450995812697939,
+         "contradiction": 4.5976984008898905
+       },
+       "dolly": {
+         "abstain": 8.0,
+         "entailment": 90.86864524364525,
+         "neutral": 6.670880448054362,
+         "contradiction": 2.4604743083003955
+       },
+       "avg": {
+         "abstain": 11.666666666666668,
+         "entailment": 73.90591693421882,
+         "neutral": 22.768523928901285,
+         "contradiction": 3.3255591368798907
+       }
+     },
+     "InstructGPT": {
+       "nq": {
+         "abstain": 5.0,
+         "entailment": 20.438596491228072,
+         "neutral": 25.30701754385965,
+         "contradiction": 54.254385964912274
+       },
+       "msmarco": {
+         "abstain": 13.0,
+         "entailment": 65.80729296246537,
+         "neutral": 13.403575989782887,
+         "contradiction": 20.78913104775174
+       },
+       "dolly": {
+         "abstain": 1.0,
+         "entailment": 81.58865825532492,
+         "neutral": 5.608465608465608,
+         "contradiction": 12.802876136209468
+       },
+       "avg": {
+         "abstain": 6.333333333333334,
+         "entailment": 56.029104347609696,
+         "neutral": 14.68155114952268,
+         "contradiction": 29.289344502867635
+       }
+     },
+     "Falcon 40B Instruct": {
+       "nq": {
+         "abstain": 27.0,
+         "entailment": 37.96803652968036,
+         "neutral": 17.123287671232877,
+         "contradiction": 44.90867579908676
+       },
+       "msmarco": {
+         "abstain": 17.0,
+         "entailment": 61.28370625358577,
+         "neutral": 17.053930005737232,
+         "contradiction": 21.662363740676994
+       },
+       "dolly": {
+         "abstain": 3.0,
+         "entailment": 78.37657474255414,
+         "neutral": 13.978295473140834,
+         "contradiction": 7.645129784305042
+       },
+       "avg": {
+         "abstain": 15.66666666666667,
+         "entailment": 61.10965231518591,
+         "neutral": 15.894746448106131,
+         "contradiction": 22.99560123670796
+       }
+     },
+     "GPT-4": {
+       "nq": {
+         "abstain": 0.0,
+         "entailment": 71.44246031746032,
+         "neutral": 15.671428571428569,
+         "contradiction": 12.88611111111111
+       },
+       "msmarco": {
+         "abstain": 13.0,
+         "entailment": 91.79110724749671,
+         "neutral": 6.772111143307898,
+         "contradiction": 1.4367816091954022
+       },
+       "dolly": {
+         "abstain": 8.0,
+         "entailment": 97.77950310559007,
+         "neutral": 1.224120082815735,
+         "contradiction": 0.9963768115942028
+       },
+       "avg": {
+         "abstain": 7.000000000000001,
+         "entailment": 86.47235357703416,
+         "neutral": 8.132385570715742,
+         "contradiction": 5.395260852250098
+       }
+     },
+     "LLaMA 2 70B Chat": {
+       "nq": {
+         "abstain": 6.0,
+         "entailment": 23.619620247386862,
+         "neutral": 62.5351563421684,
+         "contradiction": 13.84522341044474
+       },
+       "msmarco": {
+         "abstain": 4.0,
+         "entailment": 84.80608457890267,
+         "neutral": 11.166780978062148,
+         "contradiction": 4.0271344430351785
+       },
+       "dolly": {
+         "abstain": 0.0,
+         "entailment": 92.75111832611834,
+         "neutral": 4.0687229437229435,
+         "contradiction": 3.1801587301587304
+       },
+       "avg": {
+         "abstain": 3.3333333333333335,
+         "entailment": 67.71289743255467,
+         "neutral": 25.369613670448583,
+         "contradiction": 6.917488896996744
+       }
+     }
+   }
+ }
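Because app.py keeps its Streamlit entry point behind an if __name__ == '__main__' guard, display_results can be imported and reused to render these human-checked scores as well. A minimal sketch under that assumption; the file name human_app.py and the page title are hypothetical placeholders, while 'claude2###human' is the actual top-level key of this file:

import json

import streamlit as st

# Hypothetical companion page (human_app.py) reusing display_results
# from app.py; is_auto=False selects the shorter chart height defined
# in display_results.
from app import display_results

with open('human_leaderboard_scores.json') as f:
    human_results = json.load(f)['claude2###human']

st.set_page_config(layout='wide')
st.title('Human-checked Leaderboard')
display_results(human_results, setting='avg', rank_metric='Entailment(↓)', is_auto=False)

Running 'streamlit run human_app.py' would then show the same stacked-bar chart for the human-checked results.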