import json

import streamlit as st

import plotly.graph_objects as go
from plotly.subplots import make_subplots
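
# Streamlit app for the HalluChecker leaderboard.
# A minimal launch sketch (assuming this file is saved as app.py): streamlit run app.py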


def display_results(results, setting='avg', rank_metric='Entailment(↓)', is_auto=True):
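    """Render each model's claim-label scores as stacked horizontal bars.

    results maps model name -> setting -> scores keyed by 'entailment',
    'neutral', 'contradiction', and 'abstain'. rank_metric selects the label
    used to sort the models; the arrow in its name indicates the direction.
    """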
    label_marker = {
        'Entailment': dict(
            color='rgba(102, 204, 0, 0.6)'),
        'Neutral': dict(
            color='rgba(255, 178, 102, 0.6)'),
        'Contradiction': dict(
            color='rgba(255, 51, 51, 0.6)'),
        'Abstain': dict(
            color='rgba(192, 192, 192, 0.6)')
    }

    # collect per-label scores for every model under the chosen setting
    model_names = []
    entails = []
    neutrals = []
    contras = []
    abstains = []
    for k, v in results.items():
        model_names.append(k)
        entails.append(v[setting]['entailment'])
        neutrals.append(v[setting]['neutral'])
        contras.append(v[setting]['contradiction'])
        abstains.append(v[setting]['abstain'])

    results = list(zip(model_names, entails, neutrals, contras, abstains))
    # sort models by the chosen metric; the arrow in the metric name gives the direction
    label_order = ['Entailment', 'Neutral', 'Contradiction']  # fallback for unrecognized metrics
    if rank_metric == 'Entailment(↓)':
        results = sorted(results, key=lambda x: x[1])
        label_order = ['Entailment', 'Neutral', 'Contradiction']
    elif rank_metric == 'Neutral(↑)':
        results = sorted(results, key=lambda x: x[2], reverse=True)
        label_order = ['Neutral', 'Contradiction', 'Entailment']
    elif rank_metric == 'Contradiction(↑)':
        results = sorted(results, key=lambda x: x[3], reverse=True)
        label_order = ['Contradiction', 'Neutral', 'Entailment']
    elif rank_metric == 'Abstain(↑)':
        results = sorted(results, key=lambda x: x[4], reverse=True)
        label_order = ['Contradiction', 'Neutral', 'Entailment']

    # map each label to its index in the zipped result tuples
    label_to_results_idx = {
        'Entailment': 1,
        'Neutral': 2,
        'Contradiction': 3,
        'Abstain': 4
    }
    
    # two subplots sharing the y-axis: stacked labels (left) and a narrow Abstain column (right)
    fig = make_subplots(rows=1, cols=2, shared_yaxes=True, column_widths=[0.9, 0.1], horizontal_spacing=0)
    for label in label_order:
        fig.add_trace(
            go.Bar(
                y=[x[0] for x in results],
                x=[x[label_to_results_idx[label]] for x in results],
                name=label,
                orientation='h',
                marker=label_marker[label],
                text=[round(x[label_to_results_idx[label]], 1) for x in results]
            ),
            row=1,
            col=1
        )
    # Abstain is plotted separately in the right-hand subplot so it does not distort the stack
    fig.add_trace(
        go.Bar(
            y=[x[0] for x in results],
            x=[x[label_to_results_idx['Abstain']] for x in results],
            name='Abstain',
            orientation='h',
            marker=label_marker['Abstain'],
            text=[round(x[label_to_results_idx['Abstain']], 1) for x in results]
        ),
        row=1,
        col=2
    )

    # stacked bars; taller layout for the auto-checked leaderboard
    fig.update_layout(
        barmode='stack',
        width=1000,
        height=900 if is_auto else 500,
        bargap=0.35,
        legend_font=dict(size=18),
    )
    fig.update_yaxes(tickfont=dict(size=19, color='black'))

    st.plotly_chart(fig)


if __name__ == '__main__':
    st.set_page_config(layout='wide')
    st.title('HalluChecker Leaderboard')
    st.write('[GitHub repo of HalluChecker](https://github.com/LuoXiaoHeics/HalluChecker)')

    tab1 = st.tabs(['Auto-checked Leaderboard'])[0]  # st.tabs returns a list, even for one tab
    with tab1:
        col1, col2 = st.columns([1, 7])
        with col1:
            extractor = st.radio('Claim-Triplet Extractor', ['GPT-4', 'Claude 2'])
            checker = st.radio('Checker', ['Ensemble of 3 Checkers', 'GPT-4', 'Claude 2', 'RoBERTa-NLI'])
            model_map = {
                'Ensemble of 3 Checkers': 'ensemble',
                'GPT-4': 'gpt4',
                'Claude 2': 'claude2',
                'RoBERTa-NLI': 'nli'
            }
            extractor = model_map[extractor]
            checker = model_map[checker]

            rank_metric = st.radio('Rank By:', ['Contradiction(↑)', 'Neutral(↑)', 'Entailment(↓)', 'Abstain(↑)'])
        with col2:
            with open('auto_leaderboard_scores.json') as f:
                results = json.load(f)
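            # Inferred shape of auto_leaderboard_scores.json (placeholders, not real values):
            # {
            #   "<extractor>###<checker>": {
            #     "<model name>": {
            #       "avg":     {"entailment": ..., "neutral": ..., "contradiction": ..., "abstain": ...},
            #       "nq":      {...},  # zero context
            #       "msmarco": {...},  # noisy context
            #       "dolly":   {...}   # accurate context
            #     }
            #   }
            # }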
            res_key = f'{extractor}###{checker}'
            if res_key not in results:
                st.write('Work in progress, please stay tuned 😊')
            else:
                results = results[res_key]
                tab_avg, tab_zero, tab_noisy, tab_accurate = \
                    st.tabs(['Average over Settings', 'Zero Context', 'Noisy Context', 'Accurate Context'])

                with tab_avg:
                    display_results(results, setting='avg', rank_metric=rank_metric)
                with tab_zero:
                    display_results(results, setting='nq', rank_metric=rank_metric)
                with tab_noisy:
                    display_results(results, setting='msmarco', rank_metric=rank_metric)
                with tab_accurate:
                    display_results(results, setting='dolly', rank_metric=rank_metric)
                st.divider()
                st.write('\* The responses of Gemini Pro (Bard) are manually collected from [Google Bard](https://bard.google.com/) on December 7, 2023.')
                st.write('† The responses of Gemini Pro (API) are collected from its official API without tools.')
                st.write('♣ Our project is built on the RefChecker tool (https://github.com/amazon-science/RefChecker).')