File size: 14,315 Bytes
f7f7ef8
 
33a05be
f7f7ef8
e13106d
f7f7ef8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e13106d
f7f7ef8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2c6410
 
33a05be
 
 
c2c6410
 
 
33a05be
 
 
c2c6410
33a05be
c2c6410
33a05be
 
c2c6410
33a05be
 
c2c6410
33a05be
 
 
c2c6410
 
 
 
 
 
 
 
 
 
 
 
 
814faad
 
 
7d6243e
814faad
 
c2c6410
7d6243e
 
c2c6410
7d6243e
c2c6410
 
 
 
814faad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f7f7ef8
 
33a05be
f7f7ef8
 
 
c2c6410
 
 
33a05be
c2c6410
814faad
 
33a05be
 
06bc5e8
d60330a
06bc5e8
 
583c951
 
 
 
 
 
06bc5e8
 
 
d60330a
f7f7ef8
33a05be
 
b15b106
33a05be
c2c6410
33a05be
 
814faad
33a05be
b15b106
33a05be
c2c6410
33a05be
 
814faad
 
b15b106
814faad
 
 
 
 
33a05be
b15b106
814faad
 
 
 
1820231
814faad
 
 
 
 
 
 
 
 
 
33a05be
e13106d
33a05be
 
 
 
e13106d
 
33a05be
 
 
 
 
 
 
 
 
 
 
f7f7ef8
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import gradio as gr
import pandas as pd
import numpy as np

# Data for Table 1: Robustness Results (unchanged)
robustness_data = {
    "Model Name": [
        "Gemini 2.0 Flash Exp", "Gemini 1.5 Pro 002", "OpenAI GPT-4o", "OpenAI o1", "OpenAI o3-mini",
        "DeepSeek-R1-Distill-Llama-8B", "DeepSeek-R1-Distill-Qwen-14B", "DeepSeek-R1-Distill-Qwen-32B",
        "DeepSeek-R1-Distill-Llama-70B", "DeepSeek-R1", "Meta-Llama-3.1-8B-Instruct",
        "Meta-Llama-3.1-70B-Instruct", "Meta-Llama-3.3-70B-Instruct", "Qwen2.5-7B-Instruct",
        "Qwen2.5-14B-Instruct", "Qwen2.5-32B-Instruct", "Qwen2.5-72B-Instruct", "Qwen2.5-7B-Instruct-1M",
        "Qwen2.5-14B-Instruct-1M", "Nemotron-70B-Instruct-HF", "Phi-3-mini-128k-Instruct",
        "Phi-3-small-128k-Instruct", "Phi-3-medium-128k-Instruct", "Palmyra-Fin-128k-Instruct"
    ],
    "Baseline": [0.95, 0.96, 0.95, 0.97, 0.98, 0.83, 0.95, 0.95, 0.96, 0.94, 0.91, 0.94, 0.95, 0.92, 0.95, 0.95, 0.94, 0.91, 0.95, 0.94, 0.86, 0.88, 0.89, 0.96],
    "Misspelled (Ξ”)": ["0.95 (0.0)", "0.95 (0.0)", "0.94 (↓0.01)", "0.95 (↓0.02)", "0.96 (↓0.02)", "0.85 (↑0.02)", "0.90 (↓0.05)", "0.97 (↑0.02)", "0.97 (↑0.01)", "0.94 (0.0)", "0.90 (↓0.01)", "0.92 (↓0.02)", "0.92 (↓0.03)", "0.91 (↓0.01)", "0.94 (↓0.01)", "0.94 (0.0)", "0.94 (0.0)", "0.91 (0.0)", "0.92 (↓0.03)", "0.94 (0.0)", "0.85 (↓0.01)", "0.84 (↓0.04)", "0.84 (↓0.05)", "0.93 (↓0.03)"],
    "Incomplete (Ξ”)": ["0.95 (0.0)", "0.94 (↓0.02)", "0.94 (↓0.01)", "0.94 (↓0.03)", "0.96 (↓0.02)", "0.82 (↓0.01)", "0.92 (↓0.03)", "0.95 (0.0)", "0.95 (↓0.01)", "0.93 (↓0.01)", "0.86 (↓0.05)", "0.94 (0.0)", "0.93 (↓0.02)", "0.90 (↓0.02)", "0.94 (↓0.01)", "0.93 (↓0.02)", "0.93 (↓0.01)", "0.91 (0.0)", "0.91 (↓0.04)", "0.93 (↓0.01)", "0.78 (↓0.08)", "0.78 (↓0.10)", "0.84 (↓0.05)", "0.92 (↓0.04)"],
    "Out-of-Domain (Ξ”)": ["0.88 (↓0.07)", "0.92 (↓0.04)", "0.92 (↓0.03)", "0.89 (↓0.08)", "0.95 (↓0.03)", "0.87 (↑0.04)", "0.93 (↓0.02)", "0.92 (↓0.03)", "0.94 (↓0.02)", "0.91 (↓0.03)", "0.82 (↓0.09)", "0.87 (↓0.07)", "0.90 (↓0.05)", "0.85 (↓0.07)", "0.94 (↓0.01)", "0.92 (↓0.03)", "0.92 (↓0.02)", "0.86 (↓0.05)", "0.91 (↓0.04)", "0.90 (↓0.04)", "0.79 (↓0.07)", "0.83 (↓0.05)", "0.81 (↓0.08)", "0.90 (↓0.06)"],
    "OCR Context (Ξ”)": ["0.91 (↓0.04)", "0.92 (↓0.04)", "0.95 (0.0)", "0.94 (↓0.03)", "0.90 (↓0.08)", "0.72 (↓0.11)", "0.86 (↓0.09)", "0.89 (↓0.06)", "0.93 (↓0.03)", "0.88 (↓0.06)", "0.80 (↓0.11)", "0.88 (↓0.06)", "0.89 (↓0.06)", "0.80 (↓0.12)", "0.88 (↓0.07)", "0.92 (↓0.03)", "0.91 (↓0.03)", "0.77 (↓0.14)", "0.89 (↓0.06)", "0.91 (↓0.03)", "0.69 (↓0.17)", "0.78 (↓0.10)", "0.72 (↓0.17)", "0.89 (↓0.07)"],
    "Robustness (Ξ”)": ["0.83 (↓0.12)", "0.84 (↓0.12)", "0.85 (↓0.10)", "0.81 (↓0.16)", "0.90 (↓0.08)", "0.64 (↓0.19)", "0.82 (↓0.13)", "0.86 (↓0.09)", "0.89 (↓0.07)", "0.80 (↓0.14)", "0.70 (↓0.21)", "0.80 (↓0.14)", "0.82 (↓0.13)", "0.75 (↓0.17)", "0.86 (↓0.09)", "0.85 (↓0.10)", "0.84 (↓0.10)", "0.74 (↓0.17)", "0.80 (↓0.15)", "0.82 (↓0.12)", "0.58 (↓0.28)", "0.70 (↓0.18)", "0.63 (↓0.26)", "0.83 (↓0.13)"]
}

# Data for Table 2: Context Grounding Results (unchanged)
context_grounding_data = {
    "Model Name": [
        "Gemini 2.0 Flash Exp", "Gemini 1.5 Pro 002", "OpenAI GPT-4o", "OpenAI o1", "OpenAI o3-mini",
        "DeepSeek-R1-Distill-Llama-8B", "DeepSeek-R1-Distill-Qwen-14B", "DeepSeek-R1-Distill-Qwen-32B",
        "DeepSeek-R1-Distill-Llama-70B", "DeepSeek-R1", "Meta-Llama-3.1-8B-Instruct",
        "Meta-Llama-3.1-70B-Instruct", "Meta-Llama-3.3-70B-Instruct", "Qwen2.5-7B-Instruct",
        "Qwen2.5-14B-Instruct", "Qwen2.5-32B-Instruct", "Qwen2.5-72B-Instruct", "Qwen2.5-7B-Instruct-1M",
        "Qwen2.5-14B-Instruct-1M", "Nemotron-70B-Instruct-HF", "Phi-3-mini-128k-Instruct",
        "Phi-3-small-128k-Instruct", "Phi-3-medium-128k-Instruct", "Palmyra-Fin-128k-Instruct"
    ],
    "Irrelevant Ctx": [0.81, 0.74, 0.52, 0.56, 0.67, 0.32, 0.49, 0.54, 0.50, 0.51, 0.67, 0.46, 0.50, 0.75, 0.75, 0.89, 0.69, 0.63, 0.78, 0.52, 0.54, 0.37, 0.36, 0.95],
    "No Ctx": [0.66, 0.64, 0.43, 0.55, 0.51, 0.27, 0.21, 0.24, 0.27, 0.22, 0.63, 0.37, 0.40, 0.64, 0.61, 0.68, 0.60, 0.58, 0.53, 0.48, 0.34, 0.26, 0.25, 0.66],
    "Ctx Grounding QA": [0.77, 0.72, 0.50, 0.57, 0.63, 0.30, 0.36, 0.40, 0.41, 0.39, 0.70, 0.48, 0.47, 0.75, 0.70, 0.82, 0.68, 0.65, 0.69, 0.52, 0.47, 0.34, 0.33, 0.83],
    "Ctx Grounding TG": [0.46, 0.52, 0.25, 0.45, 0.27, 0.25, 0.27, 0.35, 0.22, 0.20, 0.27, 0.37, 0.31, 0.31, 0.55, 0.55, 0.39, 0.29, 0.37, 0.39, 0.24, 0.10, 0.14, 0.65],
    "Ctx Grounding": [0.74, 0.69, 0.47, 0.55, 0.59, 0.30, 0.35, 0.39, 0.38, 0.37, 0.65, 0.47, 0.45, 0.70, 0.68, 0.79, 0.64, 0.60, 0.65, 0.50, 0.44, 0.31, 0.30, 0.80],
    "Robustness": [0.83, 0.84, 0.85, 0.81, 0.90, 0.64, 0.82, 0.86, 0.89, 0.80, 0.70, 0.80, 0.82, 0.75, 0.86, 0.85, 0.84, 0.74, 0.80, 0.82, 0.58, 0.70, 0.63, 0.83],
    "Compliance": [0.76, 0.72, 0.52, 0.59, 0.63, 0.34, 0.40, 0.44, 0.43, 0.41, 0.66, 0.51, 0.49, 0.71, 0.71, 0.80, 0.67, 0.62, 0.68, 0.54, 0.46, 0.35, 0.34, 0.81]
}

# Function to bold the highest score per column (excluding "Model Name") but keep original data for calculations
def format_table(df, original_df=None):
    styled_df = df.copy()
    numeric_columns = [col for col in df.columns if col != "Model Name"]
    
    if original_df is None:
        original_df = df.copy()  # Use the input df as original if none provided
    
    for col in numeric_columns:
        if col in ["Baseline", "Irrelevant Ctx", "No Ctx", "Ctx Grounding QA", "Ctx Grounding TG", "Ctx Grounding", "Robustness", "Compliance"]:
            # Convert string values (e.g., "0.95 (0.0)") to float for comparison, or use direct float values
            if any(" (" in str(x) for x in original_df[col]):
                # Handle string values with deltas (e.g., "0.95 (0.0)")
                values = [float(str(x).split(" (")[0]) for x in original_df[col]]
            else:
                # Handle direct float values
                values = original_df[col].astype(float)
            
            max_value = np.max(values)
            styled_df[col] = original_df[col].apply(lambda x: f"**{x}**" if (float(str(x).split(" (")[0]) if " (" in str(x) else float(x)) == max_value else x)
    
    return styled_df

# Function to extract numeric value from a string (removing bold markup and deltas)
def extract_numeric(value):
    if pd.isna(value):
        return np.nan
    if isinstance(value, str):
        # Remove bold markup (**)
        value = value.replace("**", "")
        # Extract numeric part before the delta (if present)
        if " (" in value:
            return float(value.split(" (")[0])
        return float(value)
    return float(value)

# Function to calculate top 3 models based on combined score (average of numeric columns)
def get_top_3_models(robustness_df, context_grounding_df):
    # Combine numeric columns from both datasets
    numeric_cols_robustness = ["Baseline", "Robustness (Ξ”)"]  # Columns with numeric or string-numeric data
    numeric_cols_context = ["Irrelevant Ctx", "No Ctx", "Ctx Grounding QA", "Ctx Grounding TG", "Ctx Grounding", "Robustness", "Compliance"]  # From context grounding
    
    # Extract numeric values for each column in robustness_df, handling bold markup and deltas
    robustness_scores = pd.DataFrame()
    for col in numeric_cols_robustness:
        robustness_scores[col] = robustness_df[col].apply(extract_numeric)
    
    # Extract numeric values for context_grounding_df (all are already float values, but use extract_numeric for consistency)
    context_scores = pd.DataFrame()
    for col in numeric_cols_context:
        context_scores[col] = context_grounding_df[col].apply(extract_numeric)
    
    # Combine scores by averaging
    combined_scores = (robustness_scores.mean(axis=1) + context_scores.mean(axis=1)) / 2
    
    # Add combined scores to a DataFrame for sorting
    combined_df = pd.DataFrame({
        "Model Name": robustness_df["Model Name"],
        "Combined Score": combined_scores
    })
    
    # Sort by combined score in descending order and get top 3
    top_3 = combined_df.sort_values(by="Combined Score", ascending=False).head(3)
    
    # Format the winners table
    winners_df = pd.DataFrame({
        "Rank": [1, 2, 3],
        "Model Name": top_3["Model Name"],
        "Combined Score": top_3["Combined Score"].round(3)
    })
    
    return winners_df

# Function to create the Gradio interface
def create_leaderboard():
    # Convert data to DataFrames
    robustness_df = pd.DataFrame(robustness_data)
    context_grounding_df = pd.DataFrame(context_grounding_data)

    # Format tables to bold highest scores, but keep original data for calculations
    formatted_robustness_df = format_table(robustness_df, robustness_df.copy())  # Pass original data for calculations
    formatted_context_grounding_df = format_table(context_grounding_df, context_grounding_df.copy())  # Pass original data for calculations

    # Get top 3 winners using the original (unformatted) DataFrames
    winners_df = get_top_3_models(robustness_df, context_grounding_df)

    # Create Gradio interface with a nice theme
    with gr.Blocks(theme=gr.themes.Soft(), title="Financial Model Performance Leaderboard") as demo:
        gr.Markdown(
            """<div style="text-align: center;"><h1>Financial <span style='color: #e6b800;'>Models</span>  <span style='color: #e6b800;'> Performance Leaderboard</span></h1></div>\
            <br>\
            <p>Inspired by the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">πŸ€— Open LLM Leaderboard</a> and <a href="https://huggingface.co/spaces/optimum/llm-perf-leaderboard">πŸ€— Open LLM-Perf Leaderboard πŸ‹οΈ</a>, we evaluate model performance using <a href="https://huggingface.co/papers/2502.06329">FailSafe Long Context QA</a>. This evaluation leverages the <a href="https://huggingface.co/datasets/Writer/FailSafeQA">FailSafeQA dataset</a> to assess how well models handle long-context question answering, ensuring robust and reliable performance in extended-context scenarios.</p>
            <br/>
            <p>FailSafeQA returns three critical measures of model performance for finance, including a novel metric for model compliance:  </p>
            <p><b>LLM Robustness: </b>Uses HELM’s definition to assess a model’s ability to provide a consistent and reliable answer despite perturbations of query and context</p>
            <p> <b>LLM Context Grounding: </b>Assesses a models ability to detect cases where the problem is unanswerable and refrain from producing potentially misleading hallucinations</p>
            <p> <b>LLM Compliance Score:</b>A new metric that quantifies the tradeoff between Robustness and Context Grounding, inspired by the classic precision-recall trade-off. In other words, this compliance metric aims to evaluate a model’s tendency to hallucinate in the presence of missing or incomplete context.</p>
            <p> These scores are combined to determine the top three winners in a leaderboard. </p>
""",
            elem_classes="markdown-text",
        )

        
        with gr.Row():
            with gr.Column():
                with gr.Tab("🎯 Robustness Results"):
                    gr.DataFrame(
                        value=formatted_robustness_df,
                        label="Robustness Results",
                        wrap=True,
                        elem_classes=["custom-table"]
                    )
                with gr.Tab("🧩 Context Grounding Results"):
                    gr.DataFrame(
                        value=formatted_context_grounding_df,
                        label="Context Grounding Results",
                        wrap=True,
                        elem_classes=["custom-table"]
                    )
                with gr.Tab("πŸ… Top 3 Winners"):
                    gr.DataFrame(
                        value=winners_df,
                        label="Top 3 Models",
                        wrap=True,
                        elem_classes=["custom-table"]
                    )
                with gr.Tab("πŸ“ About FailSafe"):
                    gr.HTML("""
                        <div style="padding: 20px;">
                            <h2>About This Leaderboard</h2>
                            <p>This Financial Model Performance Leaderboard compares the performance of various AI models across robustness and context grounding metrics. The data is sourced from evaluations conducted on February 18, 2025, and reflects the models' ability to handle financial tasks under different conditions.</p>
                            <p>For more information or if you would like to submit your model for evaluation, contact us at <a href="mailto:support@writer.com">support@writer.com</a>.</p>
                        </div>
                    """)
                    with gr.Row():
                        submit_btn = gr.Button("Submit Feedback")
                        output = gr.Textbox(label="Feedback Submission Status", placeholder="Your feedback will appear here...")
                    
                    def submit_feedback():
                        return "Thank you for your feedback!"

                    submit_btn.click(fn=submit_feedback, inputs=None, outputs=output)

    # Custom CSS for better table appearance (larger font, spacing, and height)
    demo.css = """
    .custom-table {
        font-size: 16px;  /* Increase font size for readability */
        line-height: 2;   /* Increase line height for longer rows */
        max-height: 600px; /* Set maximum height for scrolling if needed */
        overflow-y: auto;  /* Enable vertical scrolling if content exceeds height */
        border-collapse: collapse;
    }
    .custom-table th, .custom-table td {
        padding: 12px;    /* Increase padding for spacing */
        border: 1px solid #ddd;
    }
    .custom-table th {
        background-color: #f5f5f5;
        font-weight: bold;
    }
    """

    return demo

# Launch the Gradio app
if __name__ == "__main__":
    demo = create_leaderboard()
    demo.launch()