factbench / app.py
farimafatahi's picture
Update app.py
0ae8f9d verified
raw
history blame
8.44 kB
import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO
# Page-level configuration: browser-tab title and the "wide" layout.
st.set_page_config(page_title="FactBench Leaderboard", layout="wide")

# Open the two image assets used further down the script.
#   image      -> passed to st.image() in the "Benchmark Details" tab
#   logo_image -> re-encoded as PNG/base64 and embedded in the page header
image = Image.open("factEvalSteps.png")
logo_image = Image.open("Factbench_logo.png")
# Global stylesheet for the page: imports the Courier Prime web font and
# styles the title/description/container classes plus every <table>, <th>
# and <td> (including the hover highlight on cells).
PAGE_STYLE = """
<style>
@import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
html, body, [class*="css"] {
font-family: 'Courier Prime', monospace;
background-color: #f9f9f9; /* Light grey background */
}
.title {
font-size: 42px;
font-weight: bold;
text-align: center;
color: #333;
margin-bottom: 5px;
}
.description {
font-size: 22px;
text-align: center;
margin-bottom: 30px;
color: #555;
}
.container {
max-width: 1000px;
margin: 0 auto;
padding: 20px;
}
table {
width: 100%;
border-collapse: collapse;
border-radius: 10px;
overflow: hidden;
}
th, td {
padding: 8px;
text-align: center;
border: 1px solid #ddd;
font-size: 14px;
transition: background-color 0.3s;
}
th {
background-color: #f2f2f2;
font-weight: bold;
}
td:hover {
background-color: #eaeaea;
}
</style>
"""

# unsafe_allow_html is required so the <style> element is emitted as HTML.
st.markdown(PAGE_STYLE, unsafe_allow_html=True)
# Page header: logo, title and one-line description.
# NOTE(review): the opening and closing "container" <div> tags are emitted by
# separate st.markdown calls; whether the elements in between really end up
# nested inside that <div> depends on how Streamlit renders each call -- confirm.
st.markdown('<div class="container">', unsafe_allow_html=True)

# The logo is serialised to PNG in memory and embedded as a base64 data URI
# inside an <img> tag, which the .logo-container rules below size and align.
png_buffer = BytesIO()
logo_image.save(png_buffer, format="PNG")
logo_b64 = base64.b64encode(png_buffer.getvalue()).decode("utf-8")

logo_css = """
<style>
.logo-container {
display: flex;
justify-content: flex-start; /* Aligns to the left */
}
.logo-container img {
width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
max-width: 300px; /* Set a maximum width */
background-color: transparent;
}
</style>
"""
logo_markup = (
    '<div class="logo-container">\n'
    f'<img src="data:image/png;base64,{logo_b64}" alt="FactBench Leaderboard Logo">\n'
    '</div>\n'
)
st.markdown(logo_css + logo_markup, unsafe_allow_html=True)

# Title and description use the .title / .description CSS classes.
for header_html in (
    '<div class="title">FactBench Leaderboard</div>',
    '<div class="description">Benchmark for LM Factuality Evaluation</div>',
):
    st.markdown(header_html, unsafe_allow_html=True)

st.markdown('</div>', unsafe_allow_html=True)
# Read the leaderboard data from the CSV file.
data_path = "tiered_models_data.csv"
df = pd.read_csv(data_path)

# Rank models inside each tier by factuality_score: the highest score gets
# rank 1 and tied scores share the smallest rank of their group
# (method='min'). This must run before the fillna below, while the scores are
# still as read from the CSV.
scores_per_tier = df.groupby('tier')['factuality_score']
rank_in_tier = scores_per_tier.rank(method='min', ascending=False)
df['rank'] = rank_in_tier.astype(int)

# Missing values are displayed as '-' in the rendered table.
df.fillna('-', inplace=True)

# Remember each row's position within its tier (0, 1, 2, ...) so the
# leaderboard can restore the CSV order when it is not sorted by score.
df['original_order'] = df.groupby('tier').cumcount()
# Create tabs
tab1, tab2, tab3 = st.tabs(
    ["Leaderboard", "Benchmark Details", "Submit your models"])


def _format_two_decimals(value):
    """Return *value* formatted with two decimals, or as plain text.

    Missing values in the dataframe are replaced by the string '-' when the
    data is loaded; applying a float format spec to such a placeholder raises,
    so anything that cannot be formatted as a float is returned via str().
    """
    try:
        return f'{value:.2f}'
    except (TypeError, ValueError):
        return str(value)


def _leaderboard_table_html(frame, show_tier):
    """Build the leaderboard ``<table>`` markup for *frame*.

    Args:
        frame: Rows to render. Reads the columns tier, rank, model,
            factuality_score, hallucination_score, avg_tokens,
            avg_factual_units, avg_undecidable_units and
            avg_unsupported_units. When *show_tier* is true the rows must be
            ordered so that each tier's rows are contiguous.
        show_tier: Whether to prepend a "Tier" column whose cell spans all
            rows of that tier.

    Returns:
        The table as a single HTML string.
    """
    headers = [
        'Rank', 'Model', 'Factuality Score', 'Hallucination Score',
        '# Tokens', '# Factual', '# Undecidable', '# Unsupported',
    ]
    if show_tier:
        headers.insert(0, 'Tier')

    # NOTE: the markup is kept flush-left and free of blank lines; Markdown
    # parsers may treat indented text as a code block and may end a raw HTML
    # block at a blank line.
    parts = ['<table>', '<thead>', '<tr>']
    parts.extend(f'<th>{label}</th>' for label in headers)
    parts.extend(['</tr>', '</thead>', '<tbody>'])

    # Number of rows per tier, so the tier cell spans exactly that tier's
    # rows instead of a hard-coded count.
    tier_sizes = frame['tier'].value_counts() if show_tier else None
    current_tier = None
    for _, row in frame.iterrows():
        parts.append('<tr>')
        # The tier cell is emitted once, on the first row of each tier.
        if show_tier and row['tier'] != current_tier:
            current_tier = row['tier']
            span = tier_sizes.get(current_tier, 1)
            parts.append(
                f'<td rowspan="{span}" style="vertical-align: middle;">'
                f'{current_tier}</td>'
            )
        parts.extend([
            f"<td>{row['rank']}</td>",
            f"<td>{row['model']}</td>",
            f"<td>{row['factuality_score']}</td>",
            f"<td>{row['hallucination_score']}</td>",
            f"<td>{row['avg_tokens']}</td>",
            f"<td>{row['avg_factual_units']}</td>",
            f"<td>{_format_two_decimals(row['avg_undecidable_units'])}</td>",
            f"<td>{_format_two_decimals(row['avg_unsupported_units'])}</td>",
            '</tr>',
        ])

    parts.extend(['</tbody>', '</table>'])
    return '\n'.join(parts)


# Tab 1: Leaderboard
with tab1:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
    st.markdown('## Metric Explanation')
    # TODO(Farima): replace this placeholder with the metric explanation.
    st.markdown('@Farima populate here')

    # Dropdown menu to filter tiers
    tiers = ['All Tiers', 'Tier 1: Hard', 'Tier 2: Moderate', 'Tier 3: Easy']
    selected_tier = st.selectbox('Select Tier:', tiers)
    show_all_tiers = selected_tier == 'All Tiers'

    # Filter the data based on the selected tier
    if show_all_tiers:
        filtered_df = df
    else:
        filtered_df = df[df['tier'] == selected_tier]

    # Rows are always grouped by tier; within a tier they are ordered either
    # by descending factuality score or by their original CSV order.
    sort_by_factuality = st.checkbox('Sort by Factuality Score')
    if sort_by_factuality:
        updated_filtered_df = filtered_df.sort_values(
            by=['tier', 'factuality_score'], ascending=[True, False]
        )
    else:
        updated_filtered_df = filtered_df.sort_values(
            by=['tier', 'original_order']
        )

    # The 'Tier' column is only shown when all tiers are listed together.
    html = _leaderboard_table_html(updated_filtered_df, show_all_tiers)

    # Display the table
    st.markdown(html, unsafe_allow_html=True)
    st.markdown('</div>', unsafe_allow_html=True)
# Tab 2: benchmark description -- pipeline diagram followed by three
# heading/paragraph sections.
with tab2:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
    st.markdown('<div class="title">Benchmark Details</div>',
                unsafe_allow_html=True)
    st.image(image, use_column_width=True)

    # (markdown heading, paragraph text) pairs, rendered in this order.
    detail_sections = (
        (
            '### VERIFY: A Pipeline for Factuality Evaluation',
            "Language models (LMs) are widely used by an increasing number of users, "
            "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
            "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
            "a pipeline to evaluate LMs' factual accuracy in real-world user interactions.",
        ),
        (
            '### Content Categorization',
            "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
            "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
            "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.",
        ),
        (
            '### Hallucination Prompts & FactBench Dataset',
            "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
            "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
            "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
            "regularly updated with new prompts.",
        ),
    )
    for section_heading, section_text in detail_sections:
        st.markdown(section_heading)
        st.write(section_text)

    st.markdown('</div>', unsafe_allow_html=True)
# Tab 3: pointers to the GitHub repository for local testing and for
# submitting results or issues.
with tab3:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
    st.markdown(
        '<div class="title">Submit your model information on our Github</div>',
        unsafe_allow_html=True,
    )

    # Each entry is a ready-made Markdown link, rendered on its own line.
    submission_links = (
        '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)',
        '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)',
    )
    for link_markdown in submission_links:
        st.markdown(link_markdown)

    st.markdown('</div>', unsafe_allow_html=True)