# Spaces: launch/factbench (status: Running)
# File size: 8,439 Bytes

import base64
from html import escape
from io import BytesIO

import pandas as pd
import streamlit as st
from PIL import Image

# Page setup: title "FactBench Leaderboard" with the wide layout
page_settings = {"page_title": "FactBench Leaderboard", "layout": "wide"}
st.set_page_config(**page_settings)

# Open the two images used below: `image` is shown in the Benchmark Details
# tab, `logo_image` is embedded in the page header.
image, logo_image = (
    Image.open(file_name)
    for file_name in ("factEvalSteps.png", "Factbench_logo.png")
)

# Global stylesheet: font import, page background, the .title / .description /
# .container helper classes, and the table look (borders, header, hover).
page_stylesheet = """
    <style>
    @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');

    html, body, [class*="css"] {
        font-family: 'Courier Prime', monospace;
        background-color: #f9f9f9;  /* Light grey background */
    }

    .title {
        font-size: 42px;
        font-weight: bold;
        text-align: center;
        color: #333;
        margin-bottom: 5px;
    }

    .description {
        font-size: 22px;
        text-align: center;
        margin-bottom: 30px;
        color: #555;
    }

    .container {
        max-width: 1000px;  
        margin: 0 auto;  
        padding: 20px;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        border-radius: 10px;
        overflow: hidden;
    }

    th, td {
        padding: 8px;
        text-align: center;
        border: 1px solid #ddd;
        font-size: 14px;
        transition: background-color 0.3s;
    }

    th {
        background-color: #f2f2f2;
        font-weight: bold;
    }

    td:hover {
        background-color: #eaeaea;
    }
    </style>
    """

# Inject the stylesheet; raw HTML has to be explicitly allowed.
st.markdown(page_stylesheet, unsafe_allow_html=True)

# Page header: logo, title and description.

# Encode the logo as base64 PNG so it can be inlined as a data URI.
logo_buffer = BytesIO()
logo_image.save(logo_buffer, format="PNG")
logo_b64 = base64.b64encode(logo_buffer.getvalue()).decode("utf-8")

# NOTE(review): the "container" <div> is opened here and closed in a separate
# st.markdown call at the end of this section -- confirm that it really wraps
# the elements emitted in between.
st.markdown('<div class="container">', unsafe_allow_html=True)

# Left-aligned logo, capped at 300px wide.
st.markdown(
    f"""
    <style>
    .logo-container {{
        display: flex;
        justify-content: flex-start;  /* Aligns to the left */
    }}
    .logo-container img {{
        width: 50%;  /* Adjust this to control the width, e.g., 50% of container width */
        max-width: 300px;  /* Set a maximum width */
        background-color: transparent;
    }}
    </style>
    <div class="logo-container">
        <img src="data:image/png;base64,{logo_b64}" alt="FactBench Leaderboard Logo">
    </div>
    """,
    unsafe_allow_html=True
)

# Title followed by the one-line description, each in its own styled <div>.
header_lines = (
    ("title", "FactBench Leaderboard"),
    ("description", "Benchmark for LM Factuality Evaluation"),
)
for css_class, text in header_lines:
    st.markdown(f'<div class="{css_class}">{text}</div>',
                unsafe_allow_html=True)

st.markdown('</div>', unsafe_allow_html=True)

# Read the per-tier model results from the CSV next to the app
data_path = "tiered_models_data.csv"
df = pd.read_csv(data_path)

# Rank models inside each tier: the highest factuality_score gets rank 1 and
# tied scores share the smallest rank of their group (method='min').
scores_by_tier = df.groupby('tier')['factuality_score']
df['rank'] = scores_by_tier.rank(method='min', ascending=False).astype(int)

# Missing values are displayed as '-'. This runs after ranking (which needs
# the numeric scores) and before original_order is computed.
df.fillna('-', inplace=True)

# Remember each row's position within its tier so the CSV order can be
# restored when the table is not sorted by score.
df['original_order'] = df.groupby('tier').cumcount()

# Build the three top-level tabs, in display order
tab_labels = ["Leaderboard", "Benchmark Details", "Submit your models"]
tab1, tab2, tab3 = st.tabs(tab_labels)

# Tab 1: Leaderboard

# Text shown in a cell whose value is missing.
MISSING_PLACEHOLDER = '-'

# Table layout: (header label, dataframe column, decimal places).
# `None` decimals means the value is shown exactly as stored.
LEADERBOARD_COLUMNS = (
    ('Rank', 'rank', None),
    ('Model', 'model', None),
    ('Factuality Score', 'factuality_score', None),
    ('Hallucination Score', 'hallucination_score', None),
    ('# Tokens', 'avg_tokens', None),
    ('# Factual', 'avg_factual_units', None),
    ('# Undecidable', 'avg_undecidable_units', 2),
    ('# Unsupported', 'avg_unsupported_units', 2),
)


def _format_cell(value, decimals=None):
    """Return the HTML-safe text for a single table cell.

    Args:
        value: Raw cell value (a number, a string, or a missing value).
        decimals: Decimal places to apply when the value is numeric;
            ``None`` shows the value as stored.

    Returns:
        The cell text. Missing values become ``MISSING_PLACEHOLDER``; a
        non-numeric value in a fixed-decimals column (for example an existing
        ``'-'`` placeholder) is shown unformatted instead of raising.
    """
    if pd.isna(value):
        return MISSING_PLACEHOLDER
    if decimals is not None:
        try:
            return f'{float(value):.{decimals}f}'
        except (TypeError, ValueError):
            # Not a number (e.g. the '-' placeholder): show it verbatim.
            pass
    # Escape so that text such as a model name cannot break the markup.
    return escape(str(value))


def _build_leaderboard_html(table_df, show_tier):
    """Render the leaderboard rows as an HTML ``<table>`` string.

    Args:
        table_df: Rows to display, already filtered and sorted. It must hold
            a ``tier`` column plus every column named in
            ``LEADERBOARD_COLUMNS``.
        show_tier: When true, a leading "Tier" column is added whose cell
            spans all rows of that tier.

    Returns:
        The table markup, one tag per line with no blank or indented lines.
    """
    headers = [label for label, _, _ in LEADERBOARD_COLUMNS]
    if show_tier:
        headers.insert(0, 'Tier')

    parts = ['<table>', '<thead>', '<tr>']
    parts.extend(f'<th>{label}</th>' for label in headers)
    parts.extend(['</tr>', '</thead>', '<tbody>'])

    # sort=False keeps the tiers (and the rows inside them) in the order the
    # caller sorted them.
    for tier, tier_rows in table_df.groupby('tier', sort=False):
        records = tier_rows.to_dict('records')
        for position, record in enumerate(records):
            parts.append('<tr>')
            if show_tier and position == 0:
                # The tier cell spans exactly the rows of this tier rather
                # than a fixed number of rows.
                parts.append(
                    f'<td rowspan="{len(records)}" '
                    f'style="vertical-align: middle;">{escape(str(tier))}</td>'
                )
            parts.extend(
                f'<td>{_format_cell(record[column], decimals)}</td>'
                for _, column, decimals in LEADERBOARD_COLUMNS
            )
            parts.append('</tr>')

    parts.extend(['</tbody>', '</table>'])
    return '\n'.join(parts)


with tab1:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)

    st.markdown('## Metric Explanation')
    # TODO(Farima): replace this placeholder with the real metric explanation.
    st.markdown('@Farima populate here')

    # Dropdown menu to filter tiers
    tiers = ['All Tiers', 'Tier 1: Hard', 'Tier 2: Moderate', 'Tier 3: Easy']
    selected_tier = st.selectbox('Select Tier:', tiers)
    show_all_tiers = selected_tier == 'All Tiers'

    # Keep every row for "All Tiers", otherwise only the chosen tier
    filtered_df = df if show_all_tiers else df[df['tier'] == selected_tier]

    sort_by_factuality = st.checkbox('Sort by Factuality Score')

    # Within each tier: best score first when requested, CSV order otherwise
    if sort_by_factuality:
        updated_filtered_df = filtered_df.sort_values(
            by=['tier', 'factuality_score'], ascending=[True, False]
        )
    else:
        updated_filtered_df = filtered_df.sort_values(
            by=['tier', 'original_order']
        )

    # The "Tier" column is only shown when all tiers are listed together
    st.markdown(
        _build_leaderboard_html(updated_filtered_df, show_tier=show_all_tiers),
        unsafe_allow_html=True
    )

    st.markdown('</div>', unsafe_allow_html=True)

# Tab 2: Details
with tab2:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)

    # Heading and the pipeline diagram
    st.markdown('<div class="title">Benchmark Details</div>',
                unsafe_allow_html=True)
    st.image(image, use_column_width=True)

    # Each entry is (markdown heading, paragraph); rendered in this order.
    detail_sections = (
        (
            '### VERIFY: A Pipeline for Factuality Evaluation',
            "Language models (LMs) are widely used by an increasing number of users, "
            "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
            "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
            "a pipeline to evaluate LMs' factual accuracy in real-world user interactions.",
        ),
        (
            '### Content Categorization',
            "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
            "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
            "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.",
        ),
        (
            '### Hallucination Prompts & FactBench Dataset',
            "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
            "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
            "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
            "regularly updated with new prompts.",
        ),
    )
    for heading, paragraph in detail_sections:
        st.markdown(heading)
        st.write(paragraph)

    st.markdown('</div>', unsafe_allow_html=True)

# Tab 3: Links
with tab3:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)

    st.markdown('<div class="title">Submit your model information on our Github</div>',
                unsafe_allow_html=True)

    # Both links point into the same GitHub repository.
    repo_url = 'https://github.com/FarimaFatahi/FactEval'
    st.markdown(f'[Test your model locally!]({repo_url})')
    st.markdown(f'[Submit results or issues!]({repo_url}/issues/new)')

    st.markdown('</div>', unsafe_allow_html=True)