Add Application
- app.py +142 -0
- auto_leaderboard_scores.json +0 -0
- human_leaderboard_scores.json +186 -0
app.py
ADDED
@@ -0,0 +1,142 @@
import json

import pandas as pd
import streamlit as st

import plotly.graph_objects as go
from plotly.subplots import make_subplots


def display_results(results, setting='avg', rank_metric='Entailment(↓)', is_auto=True):
    # Bar colors for the four checking labels.
    label_marker = {
        'Entailment': dict(color='rgba(102, 204, 0, 0.6)'),
        'Neutral': dict(color='rgba(255, 178, 102, 0.6)'),
        'Contradiction': dict(color='rgba(255, 51, 51, 0.6)'),
        'Abstain': dict(color='rgba(192, 192, 192, 0.6)')
    }

    model_names = []
    entails = []
    neutrals = []
    contras = []
    abstains = []
    for k, v in results.items():
        model_names.append(k)
        entails.append(v[setting]['entailment'])
        neutrals.append(v[setting]['neutral'])
        contras.append(v[setting]['contradiction'])
        abstains.append(v[setting]['abstain'])

    results = list(zip(model_names, entails, neutrals, contras, abstains))

    # Sort models by the chosen metric and stack the bars in a matching order.
    label_order = None
    if rank_metric == 'Entailment(↓)':
        results = sorted(results, key=lambda x: x[1])
        label_order = ['Entailment', 'Neutral', 'Contradiction']
    elif rank_metric == 'Neutral(↑)':
        results = sorted(results, key=lambda x: x[2], reverse=True)
        label_order = ['Neutral', 'Contradiction', 'Entailment']
    elif rank_metric == 'Contradiction(↑)':
        results = sorted(results, key=lambda x: x[3], reverse=True)
        label_order = ['Contradiction', 'Neutral', 'Entailment']
    elif rank_metric == 'Abstain(↑)':
        results = sorted(results, key=lambda x: x[4], reverse=True)
        label_order = ['Contradiction', 'Neutral', 'Entailment']

    label_to_results_idx = {
        'Entailment': 1,
        'Neutral': 2,
        'Contradiction': 3,
        'Abstain': 4
    }

    # Two sub-plots sharing the y-axis: stacked label bars on the left,
    # a narrow abstain-rate bar on the right.
    fig = make_subplots(rows=1, cols=2, shared_yaxes=True, column_widths=[0.9, 0.1], horizontal_spacing=0)
    for label in label_order:
        fig.add_trace(
            go.Bar(
                y=[x[0] for x in results],
                x=[x[label_to_results_idx[label]] for x in results],
                name=label,
                orientation='h',
                marker=label_marker[label],
                text=[round(x[label_to_results_idx[label]], 1) for x in results]
            ),
            row=1,
            col=1
        )
    # Abstain bar.
    fig.add_trace(
        go.Bar(
            y=[x[0] for x in results],
            x=[x[label_to_results_idx['Abstain']] for x in results],
            name='Abstain',
            orientation='h',
            marker=label_marker['Abstain'],
            text=[round(x[label_to_results_idx['Abstain']], 1) for x in results]
        ),
        row=1,
        col=2
    )

    fig.update_layout(
        barmode='stack',
        width=1000,
        height=900 if is_auto else 500,
        bargap=0.35,
        legend_font=dict(size=18),
    )
    fig.update_yaxes(tickfont=dict(size=19, color='black'))

    st.plotly_chart(fig)


if __name__ == '__main__':
    st.set_page_config(layout='wide')
    st.title('HalluChecker Leaderboard')
    st.write('[GitHub repo of HalluChecker](https://github.com/LuoXiaoHeics/HalluChecker)')

    # st.tabs returns a list of tab containers; unpack the single tab.
    tab1, = st.tabs(['Auto-checked Leaderboard'])
    with tab1:
        col1, col2 = st.columns([1, 7])
        with col1:
            extractor = st.radio('Claim-Triplet Extractor', ['GPT-4', 'Claude 2'])
            checker = st.radio('Checker', ['Ensemble of 3 Checkers', 'GPT-4', 'Claude 2', 'RoBERTa-NLI'])
            model_map = {
                'Ensemble of 3 Checkers': 'ensemble',
                'GPT-4': 'gpt4',
                'Claude 2': 'claude2',
                'RoBERTa-NLI': 'nli'
            }
            extractor = model_map[extractor]
            checker = model_map[checker]

            rank_metric = st.radio('Rank By:', ['Contradiction(↑)', 'Neutral(↑)', 'Entailment(↓)', 'Abstain(↑)'])
        with col2:
            with open('auto_leaderboard_scores.json') as f:
                results = json.load(f)
            res_key = f'{extractor}###{checker}'
            if res_key not in results:
                st.write('Work in progress, please stay tuned 😊')
            else:
                results = results[res_key]
                tab_avg, tab_zero, tab_noisy, tab_accurate = \
                    st.tabs(['Average over Settings', 'Zero Context', 'Noisy Context', 'Accurate Context'])

                with tab_avg:
                    display_results(results, setting='avg', rank_metric=rank_metric)
                with tab_zero:
                    display_results(results, setting='nq', rank_metric=rank_metric)
                with tab_noisy:
                    display_results(results, setting='msmarco', rank_metric=rank_metric)
                with tab_accurate:
                    display_results(results, setting='dolly', rank_metric=rank_metric)
    st.divider()
    st.write('\* The responses of Gemini Pro (Bard) were manually collected from [Google Bard](https://bard.google.com/) on December 7, 2023.')
    st.write('† The responses of Gemini Pro (API) were collected from its official API without tools.')
    st.write('♣ This project is built using RefChecker (https://github.com/amazon-science/RefChecker).')
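The app is launched with streamlit run app.py and reads auto_leaderboard_scores.json, which is expected to map '<extractor>###<checker>' keys to per-model, per-setting label percentages (the same schema as human_leaderboard_scores.json below). A minimal standalone sketch of reading and ranking one configuration; the 'gpt4###ensemble' key is an assumption based on the UI options above:

import json

# Sketch: inspect the score file consumed by app.py outside Streamlit.
# The key below is an assumption (GPT-4 extractor + ensemble checker).
with open('auto_leaderboard_scores.json') as f:
    scores = json.load(f)

res_key = 'gpt4###ensemble'
if res_key in scores:
    # Rank models by averaged contradiction rate, matching the app's
    # 'Contradiction(↑)' option (higher contradiction ranks first).
    ranked = sorted(
        scores[res_key].items(),
        key=lambda kv: kv[1]['avg']['contradiction'],
        reverse=True,
    )
    for model, v in ranked:
        print(f"{model}: contradiction={v['avg']['contradiction']:.1f}")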
auto_leaderboard_scores.json
ADDED
The diff for this file is too large to render.
human_leaderboard_scores.json
ADDED
@@ -0,0 +1,186 @@
{
    "claude2###human": {
        "Alpaca 7B": {
            "nq": {
                "abstain": 27.0,
                "entailment": 21.90920951194924,
                "neutral": 49.65358081796437,
                "contradiction": 28.437209670086382
            },
            "msmarco": {
                "abstain": 2.0,
                "entailment": 59.01711667017789,
                "neutral": 19.84909648174954,
                "contradiction": 21.13378684807256
            },
            "dolly": {
                "abstain": 13.0,
                "entailment": 76.98572340813719,
                "neutral": 12.884738186462325,
                "contradiction": 10.129538405400474
            },
            "avg": {
                "abstain": 14.000000000000002,
                "entailment": 54.57677389363435,
                "neutral": 25.933701849399526,
                "contradiction": 19.489524256966117
            }
        },
        "GPT-3.5-Turbo": {
            "nq": {
                "abstain": 1.0,
                "entailment": 58.8535769373559,
                "neutral": 22.130219091003404,
                "contradiction": 19.016203971640692
            },
            "msmarco": {
                "abstain": 20.0,
                "entailment": 77.3299637383689,
                "neutral": 6.634321975916804,
                "contradiction": 16.035714285714285
            },
            "dolly": {
                "abstain": 0.0,
                "entailment": 93.69698079698081,
                "neutral": 2.682251082251082,
                "contradiction": 3.6207681207681204
            },
            "avg": {
                "abstain": 7.000000000000001,
                "entailment": 76.64014084432196,
                "neutral": 10.716353248415016,
                "contradiction": 12.643505907263023
            }
        },
        "Claude 2": {
            "nq": {
                "abstain": 21.0,
                "entailment": 36.24974533202381,
                "neutral": 60.93093966511689,
                "contradiction": 2.819315002859307
            },
            "msmarco": {
                "abstain": 6.0,
                "entailment": 88.95130578641216,
                "neutral": 6.450995812697939,
                "contradiction": 4.5976984008898905
            },
            "dolly": {
                "abstain": 8.0,
                "entailment": 90.86864524364525,
                "neutral": 6.670880448054362,
                "contradiction": 2.4604743083003955
            },
            "avg": {
                "abstain": 11.666666666666668,
                "entailment": 73.90591693421882,
                "neutral": 22.768523928901285,
                "contradiction": 3.3255591368798907
            }
        },
        "InstructGPT": {
            "nq": {
                "abstain": 5.0,
                "entailment": 20.438596491228072,
                "neutral": 25.30701754385965,
                "contradiction": 54.254385964912274
            },
            "msmarco": {
                "abstain": 13.0,
                "entailment": 65.80729296246537,
                "neutral": 13.403575989782887,
                "contradiction": 20.78913104775174
            },
            "dolly": {
                "abstain": 1.0,
                "entailment": 81.58865825532492,
                "neutral": 5.608465608465608,
                "contradiction": 12.802876136209468
            },
            "avg": {
                "abstain": 6.333333333333334,
                "entailment": 56.029104347609696,
                "neutral": 14.68155114952268,
                "contradiction": 29.289344502867635
            }
        },
        "Falcon 40B Instruct": {
            "nq": {
                "abstain": 27.0,
                "entailment": 37.96803652968036,
                "neutral": 17.123287671232877,
                "contradiction": 44.90867579908676
            },
            "msmarco": {
                "abstain": 17.0,
                "entailment": 61.28370625358577,
                "neutral": 17.053930005737232,
                "contradiction": 21.662363740676994
            },
            "dolly": {
                "abstain": 3.0,
                "entailment": 78.37657474255414,
                "neutral": 13.978295473140834,
                "contradiction": 7.645129784305042
            },
            "avg": {
                "abstain": 15.66666666666667,
                "entailment": 61.10965231518591,
                "neutral": 15.894746448106131,
                "contradiction": 22.99560123670796
            }
        },
        "GPT-4": {
            "nq": {
                "abstain": 0.0,
                "entailment": 71.44246031746032,
                "neutral": 15.671428571428569,
                "contradiction": 12.88611111111111
            },
            "msmarco": {
                "abstain": 13.0,
                "entailment": 91.79110724749671,
                "neutral": 6.772111143307898,
                "contradiction": 1.4367816091954022
            },
            "dolly": {
                "abstain": 8.0,
                "entailment": 97.77950310559007,
                "neutral": 1.224120082815735,
                "contradiction": 0.9963768115942028
            },
            "avg": {
                "abstain": 7.000000000000001,
                "entailment": 86.47235357703416,
                "neutral": 8.132385570715742,
                "contradiction": 5.395260852250098
            }
        },
        "LLaMA 2 70B Chat": {
            "nq": {
                "abstain": 6.0,
                "entailment": 23.619620247386862,
                "neutral": 62.5351563421684,
                "contradiction": 13.84522341044474
            },
            "msmarco": {
                "abstain": 4.0,
                "entailment": 84.80608457890267,
                "neutral": 11.166780978062148,
                "contradiction": 4.0271344430351785
            },
            "dolly": {
                "abstain": 0.0,
                "entailment": 92.75111832611834,
                "neutral": 4.0687229437229435,
                "contradiction": 3.1801587301587304
            },
            "avg": {
                "abstain": 3.3333333333333335,
                "entailment": 67.71289743255467,
                "neutral": 25.369613670448583,
                "contradiction": 6.917488896996744
            }
        }
    }
}
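The human-checked scores above follow the same schema as the auto-checked file, keyed here by 'claude2###human'. A minimal sketch of how a human-checked view could reuse display_results; this wiring is not part of the commit, and the page layout is an assumption:

import json

import streamlit as st

from app import display_results  # reuse the plotting helper from this commit

# Hypothetical human-checked page, mirroring the auto-checked tab in app.py.
with open('human_leaderboard_scores.json') as f:
    human_results = json.load(f)['claude2###human']

st.title('Human-checked Leaderboard (sketch)')
rank_metric = st.radio('Rank By:', ['Contradiction(↑)', 'Neutral(↑)', 'Entailment(↓)', 'Abstain(↑)'])
# is_auto=False selects the shorter 500px figure, sized for the seven human-checked models.
display_results(human_results, setting='avg', rank_metric=rank_metric, is_auto=False)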