import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO

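# Page setup: st.set_page_config must be the first Streamlit call in the script.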
st.set_page_config(
    page_title="FactBench Leaderboard",
    layout="wide"
)

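# Static assets: the evaluation-pipeline figure (shown in the Benchmark Details tab)
# and the FactBench logo.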
image = Image.open("factEvalSteps.png")
logo_image = Image.open("Factbench_logo.png")

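# Global CSS: Courier Prime font plus styling for the title, description,
# page container, and leaderboard table.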
st.markdown(
    """
    <style>
    @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');

    html, body, [class*="css"] {
        font-family: 'Courier Prime', monospace;
        background-color: #f9f9f9; /* Light grey background */
    }

    .title {
        font-size: 42px;
        font-weight: bold;
        text-align: center;
        color: #333;
        margin-bottom: 5px;
    }

    .description {
        font-size: 22px;
        text-align: center;
        margin-bottom: 30px;
        color: #555;
    }

    .container {
        max-width: 1000px;
        margin: 0 auto;
        padding: 20px;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        border-radius: 10px;
        overflow: hidden;
    }

    th, td {
        padding: 8px;
        text-align: center;
        border: 1px solid #ddd;
        font-size: 14px;
        transition: background-color 0.3s;
    }

    th {
        background-color: #f2f2f2;
        font-weight: bold;
    }

    td:hover {
        background-color: #eaeaea;
    }
    </style>
    """,
    unsafe_allow_html=True
)

st.markdown('<div class="container">', unsafe_allow_html=True)

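# Encode the logo as base64 so it can be inlined in the HTML <img> tag below.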
buffered = BytesIO()
logo_image.save(buffered, format="PNG")
img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
st.markdown(
    f"""
    <style>
    .logo-container {{
        display: flex;
        justify-content: flex-start; /* Aligns to the left */
    }}
    .logo-container img {{
        width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
        max-width: 300px; /* Set a maximum width */
        background-color: transparent;
    }}
    </style>
    <div class="logo-container">
        <img src="data:image/png;base64,{img_data}" alt="FactBench Leaderboard Logo">
    </div>
    """,
    unsafe_allow_html=True
)
st.markdown('<div class="title">FactBench Leaderboard</div>',
            unsafe_allow_html=True)
st.markdown('<div class="description">Benchmark for LM Factuality Evaluation</div>',
            unsafe_allow_html=True)
st.markdown('</div>', unsafe_allow_html=True)

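# Leaderboard data: one row per model with its tier, factuality/hallucination scores,
# and average token/unit counts.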
data_path = "tiered_models_data.csv"
df = pd.read_csv(data_path)

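# Rank models within each tier by factuality score (ties share the lowest rank).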
df['rank'] = df.groupby('tier')['factuality_score'].rank(
    ascending=False, method='min').astype(int)

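# Display '-' for any missing values.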
df.fillna('-', inplace=True)

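# Remember each model's original position within its tier for the unsorted view.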
df['original_order'] = df.groupby('tier').cumcount()

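# Three views: the leaderboard table, benchmark details, and submission instructions.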
tab1, tab2, tab3 = st.tabs(
    ["Leaderboard", "Benchmark Details", "Submit your models"])

with tab1:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)

    st.markdown('## Metric Explanation')
    st.markdown('@Farima populate here')

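    # Tier filter: show a single difficulty tier or all tiers grouped together.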
    tiers = ['All Tiers', 'Tier 1: Hard', 'Tier 2: Moderate', 'Tier 3: Easy']
    selected_tier = st.selectbox('Select Tier:', tiers)

    if selected_tier != 'All Tiers':
        filtered_df = df[df['tier'] == selected_tier]
    else:
        filtered_df = df

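    # Optional sorting: order models by factuality score within each tier,
    # otherwise keep the original CSV order.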
    sort_by_factuality = st.checkbox('Sort by Factuality Score')

    if sort_by_factuality:
        updated_filtered_df = filtered_df.sort_values(
            by=['tier', 'factuality_score'], ascending=[True, False]
        )
    else:
        updated_filtered_df = filtered_df.sort_values(
            by=['tier', 'original_order']
        )

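    # Render the leaderboard as raw HTML so it picks up the table CSS defined above;
    # the 'All Tiers' view adds a leading Tier column spanning each tier's rows.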
    if selected_tier == 'All Tiers':
        html = '''
        <table>
            <thead>
                <tr>
                    <th>Tier</th>
                    <th>Rank</th>
                    <th>Model</th>
                    <th>Factuality Score</th>
                    <th>Hallucination Score</th>
                    <th># Tokens</th>
                    <th># Factual</th>
                    <th># Undecidable</th>
                    <th># Unsupported</th>
                </tr>
            </thead>
            <tbody>
        '''
    else:
        html = '''
        <table>
            <thead>
                <tr>
                    <th>Rank</th>
                    <th>Model</th>
                    <th>Factuality Score</th>
                    <th>Hallucination Score</th>
                    <th># Tokens</th>
                    <th># Factual</th>
                    <th># Undecidable</th>
                    <th># Unsupported</th>
                </tr>
            </thead>
            <tbody>
        '''

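    # Emit one <tr> per model row.
    # Note: the ":.2f" formats below assume avg_undecidable_units and
    # avg_unsupported_units are numeric; a value replaced by fillna('-') above
    # would fail to format as a float.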
    current_tier = None
    for _, row in updated_filtered_df.iterrows():
        html += '<tr>'

        if selected_tier == 'All Tiers':
            if row['tier'] != current_tier:
                current_tier = row['tier']
                # The tier cell spans every row belonging to the current tier.
                tier_rows = int((updated_filtered_df['tier'] == current_tier).sum())
                html += f'<td rowspan="{tier_rows}" style="vertical-align: middle;">{current_tier}</td>'

        html += f'''
            <td>{row['rank']}</td>
            <td>{row['model']}</td>
            <td>{row['factuality_score']}</td>
            <td>{row['hallucination_score']}</td>
            <td>{row['avg_tokens']}</td>
            <td>{row['avg_factual_units']}</td>
            <td>{row['avg_undecidable_units']:.2f}</td>
            <td>{row['avg_unsupported_units']:.2f}</td>
        </tr>
        '''

    html += '''
        </tbody>
    </table>
    '''

    st.markdown(html, unsafe_allow_html=True)

    st.markdown('</div>', unsafe_allow_html=True)

with tab2:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)

    st.markdown('<div class="title">Benchmark Details</div>',
                unsafe_allow_html=True)
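    # Note: use_column_width is deprecated in newer Streamlit releases;
    # use_container_width=True is the current equivalent.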
    st.image(image, use_column_width=True)

    st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
    st.write(
        "Language models (LMs) are widely used by an increasing number of users, "
        "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
        "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
        "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
    )

    st.markdown('### Content Categorization')
    st.write(
        "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
        "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
        "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
    )

    st.markdown('### Hallucination Prompts & FactBench Dataset')
    st.write(
        "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
        "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
        "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
        "regularly updated with new prompts."
    )

    st.markdown('</div>', unsafe_allow_html=True)

with tab3:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)

    st.markdown('<div class="title">Submit your model information on our GitHub</div>',
                unsafe_allow_html=True)

    st.markdown(
        '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
    st.markdown(
        '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')

    st.markdown('</div>', unsafe_allow_html=True)