Update app.py
Browse files
app.py
CHANGED
@@ -41,44 +41,57 @@ context_grounding_data = {
|
|
41 |
"Compliance": [0.76, 0.72, 0.52, 0.59, 0.63, 0.34, 0.40, 0.44, 0.43, 0.41, 0.66, 0.51, 0.49, 0.71, 0.71, 0.80, 0.67, 0.62, 0.68, 0.54, 0.46, 0.35, 0.34, 0.81]
|
42 |
}
|
43 |
|
44 |
-
# Function to bold the highest score per column (excluding "Model Name")
|
45 |
-
def format_table(df):
|
46 |
styled_df = df.copy()
|
47 |
numeric_columns = [col for col in df.columns if col != "Model Name"]
|
48 |
|
|
|
|
|
|
|
49 |
for col in numeric_columns:
|
50 |
if col in ["Baseline", "Irrelevant Ctx", "No Ctx", "Ctx Grounding QA", "Ctx Grounding TG", "Ctx Grounding", "Robustness", "Compliance"]:
|
51 |
# Convert string values (e.g., "0.95 (0.0)") to float for comparison, or use direct float values
|
52 |
-
if any(" (" in str(x) for x in
|
53 |
# Handle string values with deltas (e.g., "0.95 (0.0)")
|
54 |
-
values = [float(str(x).split(" (")[0]) for x in
|
55 |
else:
|
56 |
# Handle direct float values
|
57 |
-
values =
|
58 |
|
59 |
max_value = np.max(values)
|
60 |
-
styled_df[col] =
|
61 |
|
62 |
return styled_df
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
# Function to calculate top 3 models based on combined score (average of numeric columns)
|
65 |
def get_top_3_models(robustness_df, context_grounding_df):
|
66 |
# Combine numeric columns from both datasets
|
67 |
numeric_cols_robustness = ["Baseline", "Robustness (Δ)"] # Columns with numeric or string-numeric data
|
68 |
numeric_cols_context = ["Irrelevant Ctx", "No Ctx", "Ctx Grounding QA", "Ctx Grounding TG", "Ctx Grounding", "Robustness", "Compliance"] # From context grounding
|
69 |
|
70 |
-
# Extract numeric values for each column in robustness_df
|
71 |
robustness_scores = pd.DataFrame()
|
72 |
for col in numeric_cols_robustness:
|
73 |
-
|
74 |
-
# Handle string values with deltas (e.g., "0.95 (0.0)")
|
75 |
-
robustness_scores[col] = robustness_df[col].apply(lambda x: float(str(x).split(" (")[0]) if " (" in str(x) else float(x))
|
76 |
-
else:
|
77 |
-
# Handle direct float values
|
78 |
-
robustness_scores[col] = robustness_df[col].astype(float)
|
79 |
|
80 |
-
# Extract numeric values for context_grounding_df (all are already float values)
|
81 |
-
context_scores =
|
|
|
|
|
82 |
|
83 |
# Combine scores by averaging
|
84 |
combined_scores = (robustness_scores.mean(axis=1) + context_scores.mean(axis=1)) / 2
|
@@ -107,11 +120,11 @@ def create_leaderboard():
|
|
107 |
robustness_df = pd.DataFrame(robustness_data)
|
108 |
context_grounding_df = pd.DataFrame(context_grounding_data)
|
109 |
|
110 |
-
# Format tables to bold highest scores
|
111 |
-
|
112 |
-
|
113 |
|
114 |
-
# Get top 3 winners
|
115 |
winners_df = get_top_3_models(robustness_df, context_grounding_df)
|
116 |
|
117 |
# Create Gradio interface with a nice theme
|
@@ -122,14 +135,14 @@ def create_leaderboard():
|
|
122 |
with gr.Column():
|
123 |
with gr.Tab("Robustness Results"):
|
124 |
gr.DataFrame(
|
125 |
-
value=
|
126 |
label="Robustness Results",
|
127 |
wrap=True,
|
128 |
elem_classes=["custom-table"]
|
129 |
)
|
130 |
with gr.Tab("Context Grounding Results"):
|
131 |
gr.DataFrame(
|
132 |
-
value=
|
133 |
label="Context Grounding Results",
|
134 |
wrap=True,
|
135 |
elem_classes=["custom-table"]
|
|
|
41 |
"Compliance": [0.76, 0.72, 0.52, 0.59, 0.63, 0.34, 0.40, 0.44, 0.43, 0.41, 0.66, 0.51, 0.49, 0.71, 0.71, 0.80, 0.67, 0.62, 0.68, 0.54, 0.46, 0.35, 0.34, 0.81]
|
42 |
}
|
43 |
|
44 |
+
# Function to bold the highest score per column (excluding "Model Name") but keep original data for calculations
|
45 |
+
def format_table(df, original_df=None):
    """Return a copy of ``df`` with the top score of each score column bolded.

    Only the known score columns listed below are touched; every other
    column (including "Model Name") is copied through unchanged.  The best
    cell(s) of a column are wrapped in Markdown bold markers (``**...**``);
    ties are all bolded.  Neither ``df`` nor ``original_df`` is modified.

    Args:
        df: Table to display.  Its columns decide which columns are visited.
        original_df: Table the scores are read from.  Defaults to ``df``.
            The rewritten column is aligned to ``df`` by index label.

    Returns:
        A new DataFrame.  Rewritten columns hold the original cell values,
        except that the best cell(s) become ``"**<cell text>**"`` strings.

    Raises:
        ValueError: If a non-blank cell in a score column is not numeric.
    """
    # Reading only, so no defensive copy of the source table is needed.
    source_df = df if original_df is None else original_df
    styled_df = df.copy()
    score_columns = {
        "Baseline",
        "Irrelevant Ctx",
        "No Ctx",
        "Ctx Grounding QA",
        "Ctx Grounding TG",
        "Ctx Grounding",
        "Robustness",
        "Compliance",
    }

    for col in df.columns:
        if col not in score_columns or col not in source_df.columns:
            continue

        cells = source_df[col]
        # Parse every cell exactly once; missing cells become NaN.
        scores = cells.map(_leading_score).astype(float)
        if not scores.notna().any():
            # Empty or all-missing column: there is no maximum to highlight.
            continue

        # Series.max skips NaN, so one missing cell no longer hides the
        # real maximum (np.max would propagate the NaN).
        best = scores.max()

        rendered = []
        for cell, score in zip(cells, scores):
            if score == best:
                # Drop existing markers first so re-formatting an already
                # formatted table does not stack "****".
                plain = str(cell).replace("**", "")
                rendered.append(f"**{plain}**")
            else:
                rendered.append(cell)

        styled_df[col] = pd.Series(rendered, index=cells.index, dtype=object)

    return styled_df


def _leading_score(cell):
    """Return the numeric score at the start of a table cell as a float.

    Accepts plain numbers, numeric strings, strings with a trailing delta
    such as "0.95 (0.0)", and any of those wrapped in ``**`` bold markers.
    Missing values and blank strings yield NaN.
    """
    if isinstance(cell, str):
        head = cell.replace("**", "").partition("(")[0].strip()
        return float(head) if head else float("nan")
    if pd.isna(cell):
        return float("nan")
    return float(cell)
|
66 |
|
67 |
+
# Function to extract numeric value from a string (removing bold markup and deltas)
|
68 |
+
def extract_numeric(value):
    """Convert one leaderboard cell to a float.

    Handles plain numbers, numeric strings, strings carrying a trailing
    delta such as "0.95 (0.0)" (only the part before the parenthesis is
    used), and any of those wrapped in Markdown bold markers (``**``).

    Args:
        value: A scalar cell value (number, string, None or NaN).

    Returns:
        The score as a float, or NaN when the cell is missing or blank.

    Raises:
        ValueError: If a non-blank string does not start with a number.
    """
    if isinstance(value, str):
        # Strip bold markup, then keep only the text before any "(delta)".
        # partition() accepts both "0.95 (0.0)" and "0.95(0.0)".
        head = value.replace("**", "").partition("(")[0].strip()
        # A blank cell is treated as missing so that averages skip it
        # instead of the whole computation failing on float("").
        return float(head) if head else np.nan
    if pd.isna(value):
        return np.nan
    return float(value)
|
79 |
+
|
80 |
# Function to calculate top 3 models based on combined score (average of numeric columns)
|
81 |
def get_top_3_models(robustness_df, context_grounding_df):
|
82 |
# Combine numeric columns from both datasets
|
83 |
numeric_cols_robustness = ["Baseline", "Robustness (Δ)"] # Columns with numeric or string-numeric data
|
84 |
numeric_cols_context = ["Irrelevant Ctx", "No Ctx", "Ctx Grounding QA", "Ctx Grounding TG", "Ctx Grounding", "Robustness", "Compliance"] # From context grounding
|
85 |
|
86 |
+
# Extract numeric values for each column in robustness_df, handling bold markup and deltas
|
87 |
robustness_scores = pd.DataFrame()
|
88 |
for col in numeric_cols_robustness:
|
89 |
+
robustness_scores[col] = robustness_df[col].apply(extract_numeric)
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
+
# Extract numeric values for context_grounding_df (all are already float values, but use extract_numeric for consistency)
|
92 |
+
context_scores = pd.DataFrame()
|
93 |
+
for col in numeric_cols_context:
|
94 |
+
context_scores[col] = context_grounding_df[col].apply(extract_numeric)
|
95 |
|
96 |
# Combine scores by averaging
|
97 |
combined_scores = (robustness_scores.mean(axis=1) + context_scores.mean(axis=1)) / 2
|
|
|
120 |
robustness_df = pd.DataFrame(robustness_data)
|
121 |
context_grounding_df = pd.DataFrame(context_grounding_data)
|
122 |
|
123 |
+
# Format tables to bold highest scores, but keep original data for calculations
|
124 |
+
formatted_robustness_df = format_table(robustness_df, robustness_df.copy()) # Pass original data for calculations
|
125 |
+
formatted_context_grounding_df = format_table(context_grounding_df, context_grounding_df.copy()) # Pass original data for calculations
|
126 |
|
127 |
+
# Get top 3 winners using the original (unformatted) DataFrames
|
128 |
winners_df = get_top_3_models(robustness_df, context_grounding_df)
|
129 |
|
130 |
# Create Gradio interface with a nice theme
|
|
|
135 |
with gr.Column():
|
136 |
with gr.Tab("Robustness Results"):
|
137 |
gr.DataFrame(
|
138 |
+
value=formatted_robustness_df,
|
139 |
label="Robustness Results",
|
140 |
wrap=True,
|
141 |
elem_classes=["custom-table"]
|
142 |
)
|
143 |
with gr.Tab("Context Grounding Results"):
|
144 |
gr.DataFrame(
|
145 |
+
value=formatted_context_grounding_df,
|
146 |
label="Context Grounding Results",
|
147 |
wrap=True,
|
148 |
elem_classes=["custom-table"]
|