import json

import streamlit as st

import plotly.graph_objects as go
from plotly.subplots import make_subplots
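
# Streamlit app for the HalluChecker leaderboard.
# A minimal launch sketch (assuming this file is saved as app.py): streamlit run app.py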


def display_results(results, setting='avg', rank_metric='Entailment(↓)', is_auto=True):
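    """Render each model's claim-label scores as stacked horizontal bars.

    results maps model name -> setting -> scores keyed by 'entailment',
    'neutral', 'contradiction', and 'abstain'. rank_metric selects the label
    used to sort the models; the arrow in its name indicates the direction.
    """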
    label_marker = {
        'Entailment': dict(
            color='rgba(102, 204, 0, 0.6)'),
        'Neutral': dict(
            color='rgba(255, 178, 102, 0.6)'),
        'Contradiction': dict(
            color='rgba(255, 51, 51, 0.6)'),
        'Abstain': dict(
            color='rgba(192, 192, 192, 0.6)')
    }

    # collect per-label scores for every model under the chosen setting
    model_names = []
    entails = []
    neutrals = []
    contras = []
    abstains = []
    for k, v in results.items():
        model_names.append(k)
        entails.append(v[setting]['entailment'])
        neutrals.append(v[setting]['neutral'])
        contras.append(v[setting]['contradiction'])
        abstains.append(v[setting]['abstain'])

    results = list(zip(model_names, entails, neutrals, contras, abstains))
    # sort models by the chosen metric; the arrow in the metric name gives the direction
    label_order = ['Entailment', 'Neutral', 'Contradiction']  # fallback for unrecognized metrics
    if rank_metric == 'Entailment(↓)':
        results = sorted(results, key=lambda x: x[1])
        label_order = ['Entailment', 'Neutral', 'Contradiction']
    elif rank_metric == 'Neutral(↑)':
        results = sorted(results, key=lambda x: x[2], reverse=True)
        label_order = ['Neutral', 'Contradiction', 'Entailment']
    elif rank_metric == 'Contradiction(↑)':
        results = sorted(results, key=lambda x: x[3], reverse=True)
        label_order = ['Contradiction', 'Neutral', 'Entailment']
    elif rank_metric == 'Abstain(↑)':
        results = sorted(results, key=lambda x: x[4], reverse=True)
        label_order = ['Contradiction', 'Neutral', 'Entailment']

    # map each label to its index in the zipped result tuples
    label_to_results_idx = {
        'Entailment': 1,
        'Neutral': 2,
        'Contradiction': 3,
        'Abstain': 4
    }
    
    # two subplots sharing the y-axis: stacked labels (left) and a narrow Abstain column (right)
    fig = make_subplots(rows=1, cols=2, shared_yaxes=True, column_widths=[0.9, 0.1], horizontal_spacing=0)
    for label in label_order:
        fig.add_trace(
            go.Bar(
                y=[x[0] for x in results],
                x=[x[label_to_results_idx[label]] for x in results],
                name=label,
                orientation='h',
                marker=label_marker[label],
                text=[round(x[label_to_results_idx[label]], 1) for x in results]
            ),
            row=1,
            col=1
        )
    # Abstain is plotted separately in the right-hand subplot so it does not distort the stack
    fig.add_trace(
        go.Bar(
            y=[x[0] for x in results],
            x=[x[label_to_results_idx['Abstain']] for x in results],
            name='Abstain',
            orientation='h',
            marker=label_marker['Abstain'],
            text=[round(x[label_to_results_idx['Abstain']], 1) for x in results]
        ),
        row=1,
        col=2
    )

    # stacked bars; taller layout for the auto-checked leaderboard
    fig.update_layout(
        barmode='stack',
        width=1000,
        height=900 if is_auto else 500,
        bargap=0.35,
        legend_font=dict(size=18),
    )
    fig.update_yaxes(tickfont=dict(size=19, color='black'))

    st.plotly_chart(fig)


if __name__ == '__main__':
    st.set_page_config(layout='wide')
    st.title('HalluChecker Leaderboard')
    st.write('[GitHub repo of HalluChecker](https://github.com/LuoXiaoHeics/HalluChecker)')

    tab1 = st.tabs(['Auto-checked Leaderboard'])[0]  # st.tabs returns a list, even for one tab
    with tab1:
        col1, col2 = st.columns([1, 7])
        with col1:
            extractor = st.radio('Claim-Triplet Extractor', ['GPT-4', 'Claude 2'])
            checker = st.radio('Checker', ['Ensemble of 3 Checkers', 'GPT-4', 'Claude 2', 'RoBERTa-NLI'])
            model_map = {
                'Ensemble of 3 Checkers': 'ensemble',
                'GPT-4': 'gpt4',
                'Claude 2': 'claude2',
                'RoBERTa-NLI': 'nli'
            }
            extractor = model_map[extractor]
            checker = model_map[checker]

            rank_metric = st.radio('Rank By:', ['Contradiction(↑)', 'Neutral(↑)', 'Entailment(↓)', 'Abstain(↑)'])
        with col2:
            with open('auto_leaderboard_scores.json') as f:
                results = json.load(f)
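            # Inferred shape of auto_leaderboard_scores.json (placeholders, not real values):
            # {
            #   "<extractor>###<checker>": {
            #     "<model name>": {
            #       "avg":     {"entailment": ..., "neutral": ..., "contradiction": ..., "abstain": ...},
            #       "nq":      {...},  # zero context
            #       "msmarco": {...},  # noisy context
            #       "dolly":   {...}   # accurate context
            #     }
            #   }
            # }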
            res_key = f'{extractor}###{checker}'
            if res_key not in results:
                st.write('Work in progress, please stay tuned 😊')
            else:
                results = results[res_key]
                tab_avg, tab_zero, tab_noisy, tab_accurate = \
                    st.tabs(['Average over Settings', 'Zero Context', 'Noisy Context', 'Accurate Context'])

                with tab_avg:
                    display_results(results, setting='avg', rank_metric=rank_metric)
                with tab_zero:
                    display_results(results, setting='nq', rank_metric=rank_metric)
                with tab_noisy:
                    display_results(results, setting='msmarco', rank_metric=rank_metric)
                with tab_accurate:
                    display_results(results, setting='dolly', rank_metric=rank_metric)
                st.divider()
                st.write('\* The responses of Gemini Pro (Bard) are manually collected from [Google Bard](https://bard.google.com/) on December 7, 2023.')
                st.write('† The responses of Gemini Pro (API) are collected from its official API without tools.')
                st.write('♣ Our project is built on the RefChecker tool (https://github.com/amazon-science/RefChecker).')