import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO
# Page-level settings: the page title and a wide layout.
st.set_page_config(page_title="FactBench Leaderboard", layout="wide")
# Load the Markdown header that ships next to the app. The encoding is given
# explicitly so the text is decoded the same way on every platform instead of
# depending on the locale's default encoding.
with open("_header.md", "r", encoding="utf-8") as f:
    HEADER_MD = f.read()

# Load the images: the pipeline figure and the project logo.
image = Image.open("factEvalSteps.png")
logo_image = Image.open("Factbench_logo.png")
# Custom CSS for the page.
# NOTE(review): the payload of this call is only a newline, so no CSS is
# injected. Presumably the original <style> block was lost -- confirm and
# restore it from version control.
st.markdown("\n", unsafe_allow_html=True)

# Display title and description.
# NOTE(review): this payload is blank as well; presumably an opening container
# tag belonged here -- confirm.
st.markdown("\n", unsafe_allow_html=True)

# st.image(logo_image, output_format="PNG", width=200)
# Convert the logo to a base64 string and embed it as a data URI, so the image
# is placed through raw HTML rather than st.image.
buffered = BytesIO()
logo_image.save(buffered, format="PNG")
img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
# The encoded image must actually be interpolated into the markup; otherwise
# the conversion above is dead code and the logo never appears.
# NOTE(review): sizing/alignment attributes of the original tag are unknown.
st.markdown(
    f'<img src="data:image/png;base64,{img_data}" alt="FactBench logo">',
    unsafe_allow_html=True,
)

# header_md_text = HEADER_MD  # make some parameters later
# gr.Markdown(header_md_text, elem_classes="markdown-text")

# NOTE(review): blank payload; presumably the title/subtitle markup lived here
# (see the commented-out calls below for the text) -- confirm.
st.markdown("\n", unsafe_allow_html=True)

# st.markdown('FactBench Leaderboard', unsafe_allow_html=True)
# st.markdown('Benchmark for LM Factuality Evaluation', unsafe_allow_html=True)

# NOTE(review): blank payload; presumably a closing container tag -- confirm.
st.markdown("\n", unsafe_allow_html=True)
# Read the per-tier model results.
data_path = "tiered_models_data.csv"
df = pd.read_csv(data_path)

# Rank models inside each tier: higher factuality_score gets the smaller rank,
# and tied scores share the lowest rank of their group (method='min').
scores_by_tier = df.groupby('tier')['factuality_score']
df['rank'] = scores_by_tier.rank(method='min', ascending=False).astype(int)

# Show missing values as '-'.
df.fillna('-', inplace=True)

# Remember each row's position within its tier so the file order can be
# restored after sorting.
df['original_order'] = df.groupby('tier').cumcount()
# Build the three top-level tabs of the page.
st.markdown("\n", unsafe_allow_html=True)
tab_titles = ["Leaderboard", "Benchmark Details", "Submit your models"]
tab1, tab2, tab3 = st.tabs(tab_titles)
# Tab 1: Leaderboard
def _two_decimals(value):
    """Return *value* formatted with two decimals, or as plain text.

    Missing measurements are replaced by the string ``'-'`` when the data is
    loaded (``df.fillna('-')``); applying ``:.2f`` to such a placeholder raises
    ``ValueError``, so anything that cannot be formatted as a float is passed
    through via ``str``.
    """
    try:
        return f'{value:.2f}'
    except (TypeError, ValueError):
        return str(value)


with tab1:
    st.markdown("\nMetrics Explanation\n", unsafe_allow_html=True)

    # NOTE(review): the emoji in this tab were mojibake in the source. The
    # target and siren glyphs are certain from the surviving bytes; the spiral,
    # lock and key are a best-effort restoration -- confirm.
    st.markdown(
        "\n"
        "🎯 Factual Precision measures the ratio of supported units divided by all units averaged over model responses. "
        "🌀 Hallucination Score quantifies the incorrect or inconclusive contents within a model response, as described in the paper. "
        "We also provide statistics on the average length of the response in terms of the number of tokens, "
        "the average verifiable units existing in the model responses (Avg. # Units), "
        "the average number of units labelled as undecidable (Avg. # Undecidable), "
        "and the average number of units labelled as unsupported (Avg. # Unsupported).\n"
        "🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models\n",
        unsafe_allow_html=True,
    )
    st.markdown("\n", unsafe_allow_html=True)

    # Dropdown menu to filter tiers.
    tiers = ['All Tiers', 'Tier 1: Hard', 'Tier 2: Moderate', 'Tier 3: Easy']
    selected_tier = st.selectbox('Select Tier:', tiers)
    show_tier = selected_tier == 'All Tiers'

    # Filter the data based on the selected tier.
    if show_tier:
        filtered_df = df
    else:
        filtered_df = df[df['tier'] == selected_tier]

    # Order rows by score (best first) inside each tier when requested,
    # otherwise keep the order of the data file. Both orderings sort by tier
    # first, so the rows of one tier are always contiguous.
    sort_by_factuality = st.checkbox('Sort by Factual Precision')
    if sort_by_factuality:
        updated_filtered_df = filtered_df.sort_values(
            by=['tier', 'factuality_score'], ascending=[True, False]
        )
    else:
        updated_filtered_df = filtered_df.sort_values(
            by=['tier', 'original_order']
        )

    # Column headers; the 'Tier' column only exists in the all-tiers view.
    headers = [
        'Rank',
        'Model',
        '🎯 Factual Precision',
        '🌀 Hallucination Score',
        'Avg. # Tokens',
        'Avg. # Units',
        'Avg. # Undecidable',
        'Avg. # Unsupported',
    ]
    if show_tier:
        headers.insert(0, 'Tier')

    # Build the table as a list of fragments and join once at the end.
    # NOTE(review): the table markup was missing from the source strings; plain
    # unstyled tags are emitted here -- restore class names/styles if the
    # original used any.
    parts = ['<table>', '<thead>', '<tr>']
    parts.extend(f'<th>{header}</th>' for header in headers)
    parts.extend(['</tr>', '</thead>', '<tbody>'])

    # The tier cell is written once per tier, so it has to span every row of
    # that tier to keep the remaining columns aligned.
    tier_sizes = updated_filtered_df['tier'].value_counts()
    current_tier = None
    for _, row in updated_filtered_df.iterrows():
        parts.append('<tr>')
        if show_tier and row['tier'] != current_tier:
            current_tier = row['tier']
            parts.append(
                f'<td rowspan="{tier_sizes.loc[current_tier]}">{current_tier}</td>'
            )
        # Fill in model and scores.
        parts.extend([
            f"<td>{row['rank']}</td>",
            f"<td>{row['model']}</td>",
            f"<td>{row['factuality_score']}</td>",
            f"<td>{row['hallucination_score']}</td>",
            f"<td>{row['avg_tokens']}</td>",
            f"<td>{row['avg_factual_units']}</td>",
            f"<td>{_two_decimals(row['avg_undecidable_units'])}</td>",
            f"<td>{_two_decimals(row['avg_unsupported_units'])}</td>",
        ])
        parts.append('</tr>')

    # Close the table. No newlines or indentation are put between tags so the
    # Markdown renderer treats the whole table as one HTML block.
    parts.extend(['</tbody>', '</table>'])
    html = ''.join(parts)

    # Display the table.
    st.markdown(html, unsafe_allow_html=True)
    st.markdown('', unsafe_allow_html=True)
# Tab 2: Details
with tab2:
    st.markdown('', unsafe_allow_html=True)
    # st.markdown('', unsafe_allow_html=True)

    # Pipeline figure, stretched to the column width.
    # NOTE(review): newer Streamlit releases deprecate `use_column_width` in
    # favour of `use_container_width`; the original argument is kept so older
    # runtimes keep working -- switch once the minimum version allows it.
    st.image(image, use_column_width=True)

    st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
    st.write(
        "Language models (LMs) are widely used by an increasing number of users, "
        "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
        "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
        "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
    )

    st.markdown('### Content Categorization')
    st.write(
        "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
        "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
        "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
    )

    st.markdown('### Hallucination Prompts & FactBench Dataset')
    # The dash after "topics" was a mis-decoded em dash in the source.
    st.write(
        "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
        "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
        "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
        "regularly updated with new prompts."
    )

    # NOTE(review): blank payload; presumably a closing container tag was lost
    # here -- confirm.
    st.markdown("\n", unsafe_allow_html=True)
# Tab 3: Links
with tab3:
    st.markdown('', unsafe_allow_html=True)

    # Heading for the submission section. The literal is written with explicit
    # "\n" escapes because a single-quoted string cannot span physical lines.
    # NOTE(review): any heading markup around this text appears to be missing
    # -- confirm.
    st.markdown(
        '\nSubmit your model information on our Github\n',
        unsafe_allow_html=True,
    )

    # Links to the evaluation repository and to its issue form.
    st.markdown(
        '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
    st.markdown(
        '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')

    # NOTE(review): blank payload; presumably a closing container tag -- confirm.
    st.markdown("\n", unsafe_allow_html=True)