File size: 5,388 Bytes
91cf45c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
import json
import pandas as pd
import streamlit as st
import plotly.graph_objects as go
from plotly.subplots import make_subplots
def display_results(results, setting='avg', rank_metric='Entailment(β)', is_auto=True):
    """Render a leaderboard as horizontal stacked bars via Streamlit + Plotly.

    Parameters
    ----------
    results : dict
        Maps model name -> per-setting dict; each setting entry holds the
        scores 'entailment', 'neutral', 'contradiction' and 'abstain'.
    setting : str
        Which setting's scores to plot (e.g. 'avg', 'nq', 'msmarco', 'dolly').
    rank_metric : str
        One of 'Entailment(β)', 'Neutral(β)', 'Contradiction(β)',
        'Abstain(β)'; controls both the model sort order and the stacking
        order of the labels.
    is_auto : bool
        If True, use the taller layout of the auto-checked leaderboard.

    Raises
    ------
    ValueError
        If ``rank_metric`` is not one of the supported metrics.
    """
    label_marker = {
        'Entailment': dict(
            color='rgba(102, 204, 0, 0.6)'),
        'Neutral': dict(
            color='rgba(255, 178, 102, 0.6)'),
        'Contradiction': dict(
            color='rgba(255, 51, 51, 0.6)'),
        'Abstain': dict(
            color='rgba(192, 192, 192, 0.6)')
    }
    # Flatten into (name, entailment, neutral, contradiction, abstain) rows.
    rows = [
        (name,
         v[setting]['entailment'],
         v[setting]['neutral'],
         v[setting]['contradiction'],
         v[setting]['abstain'])
        for name, v in results.items()
    ]
    # Plotly draws horizontal bars bottom-to-top, so the last row ends up on
    # top: sort ascending when higher is better (Entailment) and descending
    # when lower is better (the other metrics).
    if rank_metric == 'Entailment(β)':
        rows.sort(key=lambda r: r[1])
        label_order = ['Entailment', 'Neutral', 'Contradiction']
    elif rank_metric == 'Neutral(β)':
        rows.sort(key=lambda r: r[2], reverse=True)
        label_order = ['Neutral', 'Contradiction', 'Entailment']
    elif rank_metric == 'Contradiction(β)':
        rows.sort(key=lambda r: r[3], reverse=True)
        label_order = ['Contradiction', 'Neutral', 'Entailment']
    elif rank_metric == 'Abstain(β)':
        rows.sort(key=lambda r: r[4], reverse=True)
        label_order = ['Contradiction', 'Neutral', 'Entailment']
    else:
        # Previously this fell through with label_order = None and crashed
        # later with an opaque TypeError; fail fast with a clear message.
        raise ValueError(f'Unsupported rank_metric: {rank_metric!r}')
    label_to_idx = {
        'Entailment': 1,
        'Neutral': 2,
        'Contradiction': 3,
        'Abstain': 4
    }
    model_names = [r[0] for r in rows]
    # Wide left column for the stacked NLI labels, narrow right column for
    # the separate Abstain bar; the two share the model-name y-axis.
    fig = make_subplots(rows=1, cols=2, shared_yaxes=True,
                        column_widths=[0.9, 0.1], horizontal_spacing=0)
    for label in label_order:
        values = [r[label_to_idx[label]] for r in rows]
        fig.add_trace(
            go.Bar(
                y=model_names,
                x=values,
                name=label,
                orientation='h',
                marker=label_marker[label],
                text=[round(val, 1) for val in values]
            ),
            row=1,
            col=1
        )
    # Abstain bar in its own subplot so it does not stack with the others.
    abstain_values = [r[label_to_idx['Abstain']] for r in rows]
    fig.add_trace(
        go.Bar(
            y=model_names,
            x=abstain_values,
            name='Abstain',
            orientation='h',
            marker=label_marker['Abstain'],
            text=[round(val, 1) for val in abstain_values]
        ),
        row=1,
        col=2
    )
    fig.update_layout(
        barmode='stack',
        width=1000,
        height=900 if is_auto else 500,
        bargap=0.35,
        legend_font=dict(size=18),
    )
    fig.update_yaxes(tickfont=dict(size=19, color='black'))
    st.plotly_chart(fig)
if __name__ == '__main__':
    st.set_page_config(layout='wide')
    st.title('HalluChecker Leaderboard')
    st.write('[GitHub repo of HalluChecker](https://github.com/LuoXiaoHeics/HalluChecker)')
    # st.tabs returns a LIST of tab containers; the original bound the list
    # itself to tab1, so `with tab1:` raised (a list has no __enter__).
    # Unpack the single tab instead.
    tab1, = st.tabs(['Auto-checked Leaderboard'])
    with tab1:
        col1, col2 = st.columns([1, 7])
        with col1:
            extractor = st.radio('Claim-Triplet Extractor', ['GPT-4', 'Claude 2'])
            checker = st.radio('Checker', ['Ensemble of 3 Checkers', 'GPT-4', 'Claude 2', 'RoBERTa-NLI'])
            # Map the UI labels to the keys used inside the scores JSON.
            model_map = {
                'Ensemble of 3 Checkers': 'ensemble',
                'GPT-4': 'gpt4',
                'Claude 2': 'claude2',
                'RoBERTa-NLI': 'nli'
            }
            extractor = model_map[extractor]
            checker = model_map[checker]
            rank_metric = st.radio('Rank By:', ['Contradiction(β)', 'Neutral(β)', 'Entailment(β)', 'Abstain(β)'])
        with col2:
            # Close the file deterministically instead of leaking the handle
            # from json.load(open(...)).
            with open('auto_leaderboard_scores.json') as f:
                results = json.load(f)
            # Results are keyed by "<extractor>###<checker>" combination.
            res_key = f'{extractor}###{checker}'
            if res_key not in results:
                st.write('Work in progress, please stay tuned π')
            else:
                results = results[res_key]
                tab_avg, tab_zero, tab_noisy, tab_accurate = \
                    st.tabs(['Average over Settings', 'Zero Context', 'Noisy Context', 'Accurate Context'])
                with tab_avg:
                    display_results(results, setting='avg', rank_metric=rank_metric)
                with tab_zero:
                    display_results(results, setting='nq', rank_metric=rank_metric)
                with tab_noisy:
                    display_results(results, setting='msmarco', rank_metric=rank_metric)
                with tab_accurate:
                    display_results(results, setting='dolly', rank_metric=rank_metric)
    st.divider()
    st.write('\* The responses of Gemini Pro (Bard) are manually collected from [Google Bard](https://bard.google.com/) on December 7, 2023.')
    st.write('β The responses of Gemini Pro (API) are collected from its offical API without tools.')
    st.write('β£ Our project is executed using the tool of RefChecker (https://github.com/amazon-science/RefChecker).')
|