pratikbhavsar committed on
Commit
cfec1f3
·
1 Parent(s): 3aeb75b

added sonnet and improved data explorer

Browse files
app.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  import gradio as gr
2
  import promptquality as pq
3
  from dotenv import load_dotenv
@@ -15,8 +20,7 @@ from data_loader import (
15
  )
16
  from tabs.leaderboard import create_leaderboard_tab, filter_leaderboard
17
  from tabs.model_comparison import create_model_comparison_tab, compare_models
18
- from tabs.data_exploration import create_exploration_tab
19
- from chat import filter_and_update_display
20
 
21
 
22
  def create_app():
@@ -35,9 +39,7 @@ def create_app():
35
 
36
  mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
37
 
38
- exp_outputs = create_exploration_tab(
39
- df, MODELS, DATASETS, SCORES, HEADER_CONTENT
40
- )
41
 
42
  # Initial loads
43
  app.load(
@@ -55,8 +57,16 @@ def create_app():
55
  )
56
 
57
  app.load(
58
- fn=lambda: filter_and_update_display(MODELS[0], DATASETS[0], 0, 1, 0),
59
- outputs=exp_outputs,
 
 
 
 
 
 
 
 
60
  )
61
 
62
  return app
 
1
+ # Add this at the top of your script
2
+ import warnings
3
+
4
+ warnings.filterwarnings("ignore")
5
+
6
  import gradio as gr
7
  import promptquality as pq
8
  from dotenv import load_dotenv
 
20
  )
21
  from tabs.leaderboard import create_leaderboard_tab, filter_leaderboard
22
  from tabs.model_comparison import create_model_comparison_tab, compare_models
23
+ from tabs.data_exploration import create_exploration_tab, filter_and_display
 
24
 
25
 
26
  def create_app():
 
39
 
40
  mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
41
 
42
+ exp_outputs = create_exploration_tab(df)
 
 
43
 
44
  # Initial loads
45
  app.load(
 
57
  )
58
 
59
  app.load(
60
+ fn=lambda: filter_and_display(
61
+ MODELS[0],
62
+ DATASETS[0],
63
+ min(SCORES),
64
+ max(SCORES),
65
+ 0,
66
+ 0,
67
+ 0,
68
+ ),
69
+ outputs=exp_outputs[:-1],
70
  )
71
 
72
  return app
chat.py CHANGED
@@ -1,256 +1,723 @@
1
- import gradio as gr
2
- import pandas as pd
3
  import json
4
 
5
 
6
- def get_updated_df(df, df_output):
7
- df = df.iloc[: len(df_output)].copy()
8
- df["response"] = df_output["response"].tolist()
9
- df["rationale"] = df_output["rationale"].tolist()
10
- df["explanation"] = df_output["explanation"].tolist()
11
- df["score"] = df_output["score"].tolist()
12
- cols = [
13
- "conversation",
14
- "tools_langchain",
15
- "n_turns",
16
- "len_query",
17
- "n_tools",
18
- "response",
19
- "rationale",
20
- "explanation",
21
- "score",
22
- ]
23
- return df[cols]
24
-
25
-
26
- def get_chat_and_score_df(model, dataset):
27
- df_output = pd.read_parquet(f"output/{model}/{dataset}.parquet")
28
- df = pd.read_parquet(f"datasets/{dataset}.parquet")
29
- df = get_updated_df(df, df_output)
30
- return df
31
-
32
-
33
- def format_chat_message(role, content, is_response=False):
34
- """Format individual chat messages with alignment based on role."""
35
- role_style = role.lower()
36
- alignment = "flex-end" if role_style == "user" else "flex-start"
37
- max_width = "80%"
38
-
39
- # Clean up any excessive whitespace while preserving intentional line breaks
40
- cleaned_content = "\n".join(line.strip() for line in content.split("\n"))
41
-
42
- background_color = (
43
- "var(--response-bg)" if is_response else f"var(--message-bg-{role_style})"
44
- )
45
 
 
46
  return f"""
47
  <div style="
48
- display: flex;
49
- justify-content: {alignment};
50
- margin: 0.75rem 0;">
51
  <div style="
52
- max-width: {max_width};
 
 
53
  padding: 1rem;
54
- border-radius: 12px;
55
- background-color: {background_color};
56
- border: 1px solid var(--border-color);
57
  box-shadow: 0 1px 2px var(--shadow-color);">
58
  <div style="
59
- font-weight: 600;
60
- color: var(--primary-text);
61
  margin-bottom: 0.5rem;
62
- font-size: 0.9rem;
63
- text-transform: uppercase;">
64
- {role + (" Response" if is_response else "")}
 
 
 
 
65
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  <div style="
67
- color: var(--text-color);
68
- line-height: 1.6;
69
- white-space: pre-wrap;
70
- font-family: {is_response and 'monospace' or 'inherit'};
71
- font-size: {is_response and '0.9rem' or 'inherit'};">
72
- {cleaned_content}
 
 
 
73
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  </div>
75
  </div>
76
  """
77
 
78
 
79
- def format_response(response):
80
- """Format the response data, handling both JSON and text."""
81
  try:
 
 
 
 
 
 
 
 
 
 
 
82
  # Try to parse as JSON
83
- response_data = json.loads(response)
84
- # Format JSON response nicely
85
- formatted_response = json.dumps(response_data, indent=2)
86
- except (json.JSONDecodeError, TypeError):
87
- # If not JSON, use as is
88
- formatted_response = str(response)
89
 
90
- return formatted_response
91
 
 
 
 
92
 
93
- def parse_tool_schema(tool):
94
- """Parse tool schema to extract name, description, and parameters properly."""
95
- name = tool.get("title", "Unnamed Tool")
96
- description = tool.get("description", "No description available")
97
 
98
- parameters = {}
99
- if "properties" in tool:
100
- for param_name, param_data in tool["properties"].items():
101
- param_desc = param_data.get("description", "No description")
102
- param_type = param_data.get("type", "unknown")
103
- parameters[param_name] = f"{param_desc} (Type: {param_type})"
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- return name, description, parameters
 
 
 
 
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
- def format_tool_info(tools):
109
- """Format tool information with improved schema parsing and dark theme support."""
110
- if isinstance(tools, str):
111
- try:
112
- tools = json.loads(tools)
113
- except:
114
- return '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>'
115
 
116
- if not tools:
117
- return '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>'
 
 
118
 
119
- tool_html = ""
120
- for tool in tools:
121
- name, description, parameters = parse_tool_schema(tool)
122
- tool_html += f"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  <div style="
124
- margin: 1rem 0;
125
- padding: 1.5rem;
126
- border-radius: 8px;
127
- background-color: var(--surface-color);
128
- border: 1px solid var(--border-color);">
 
 
 
 
129
  <div style="
130
- font-weight: 600;
 
131
  color: var(--primary-text);
132
- margin-bottom: 0.75rem;
133
- font-size: 1.1rem;">
134
- {name}
135
  </div>
136
  <div style="
137
- color: var(--text-color);
138
- margin-bottom: 1rem;
139
- line-height: 1.5;">
140
- {description}
141
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  <div style="
143
- background-color: var(--surface-color-alt);
144
- padding: 1rem;
145
- border-radius: 4px;
146
- border: 1px solid var(--border-color);">
147
- {format_parameters(parameters)}
 
 
 
 
 
 
148
  </div>
 
 
 
149
  </div>
150
  """
151
- return f"""
152
- <div style="
153
- max-height: 600px;
154
- overflow-y: auto;
155
- padding-right: 0.5rem;">
156
- <style>
157
- :root[data-theme="light"] {{
158
- --surface-color: #f8f9fa;
159
- --surface-color-alt: #ffffff;
160
- --text-color: #202124;
161
- --text-muted: #666666;
162
- --primary-text: #1a73e8;
163
- --border-color: #e9ecef;
164
- --shadow-color: rgba(0,0,0,0.1);
165
- --message-bg-user: #E5F6FD;
166
- --message-bg-assistant: #F7F7F8;
167
- --message-bg-system: #FFF3E0;
168
- --score-high: #1a73e8;
169
- --score-med: #f4b400;
170
- --score-low: #ea4335;
171
- }}
172
-
173
- :root[data-theme="dark"] {{
174
- --surface-color: #1e1e1e;
175
- --surface-color-alt: #2d2d2d;
176
- --text-color: #ffffff;
177
- --text-muted: #a0a0a0;
178
- --primary-text: #60a5fa;
179
- --border-color: #404040;
180
- --shadow-color: rgba(0,0,0,0.3);
181
- --message-bg-user: #2d3748;
182
- --message-bg-assistant: #1a1a1a;
183
- --message-bg-system: #2c2516;
184
- --response-bg: #2a2f3a;
185
- --score-high: #60a5fa;
186
- --score-med: #fbbf24;
187
- --score-low: #ef4444;
188
- }}
189
- </style>
190
- {tool_html}
191
- </div>
192
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
  def format_parameters(parameters):
195
  if not parameters:
196
- return '<div style="color: var(--text-muted);">No parameters</div>'
197
 
198
  params_html = ""
199
  for name, desc in parameters.items():
 
 
 
 
 
 
200
  params_html += f"""
201
- <div style="margin: 0.75rem 0;">
 
 
 
 
202
  <div style="
203
- font-weight: 500;
204
- color: var(--primary-text);
205
- margin-bottom: 0.25rem;">
206
- {name}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  </div>
208
  <div style="
209
  color: var(--text-color);
210
- line-height: 1.4;
211
- font-size: 0.95rem;">
212
- {desc}
 
213
  </div>
214
  </div>
215
  """
216
- return params_html
217
 
218
- def format_metrics(score, rationale, explanation):
219
- """Format metrics display with improved dark theme support."""
220
- score_color = (
221
- "var(--score-high)"
222
- if score >= 0.7
223
- else "var(--score-med)" if score >= 0.4 else "var(--score-low)"
 
 
 
 
 
 
 
224
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  return f"""
226
  <div style="
227
- padding: 1.5rem;
228
  background-color: var(--surface-color);
229
- border-radius: 8px;
230
  border: 1px solid var(--border-color);
231
- box-shadow: 0 2px 4px var(--shadow-color);">
232
- <div style="margin-bottom: 1.5rem;">
233
- <h3 style="
234
- color: var(--text-color);
235
- font-size: 1.1rem;
236
- margin-bottom: 0.5rem;
237
- font-weight: 600;">TSQ Score</h3>
238
- <div style="
239
- font-size: 2rem;
240
- font-weight: 600;
241
- color: {score_color};">
242
- {score:.2f}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  </div>
244
  </div>
245
- <div style="margin-bottom: 1.5rem;">
246
  <h3 style="
247
  color: var(--text-color);
248
  font-size: 1.1rem;
249
- margin-bottom: 0.5rem;
250
- font-weight: 600;">Rationale</h3>
 
 
 
 
 
 
 
 
 
 
 
251
  <div style="
252
  color: var(--text-color);
253
- line-height: 1.5;">
 
 
 
254
  {rationale}
255
  </div>
256
  </div>
@@ -258,93 +725,426 @@ def format_metrics(score, rationale, explanation):
258
  <h3 style="
259
  color: var(--text-color);
260
  font-size: 1.1rem;
261
- margin-bottom: 0.5rem;
262
- font-weight: 600;">Explanation</h3>
 
 
 
 
 
 
 
 
 
 
 
263
  <div style="
264
  color: var(--text-color);
265
- line-height: 1.5;">
 
 
 
266
  {explanation}
267
  </div>
268
  </div>
269
  </div>
270
  """
271
 
272
- def update_chat_display(df, index):
273
- """Update the chat visualization with improved dark theme support."""
274
- if df is None or df.empty or index >= len(df):
275
- return (
276
- '<div style="padding: 1rem; color: var(--text-muted);">No data available</div>',
277
- '<div style="padding: 1rem; color: var(--text-muted);">No metrics available</div>',
278
- '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>',
279
- )
280
 
281
- row = df.iloc[index]
 
 
 
 
 
282
 
283
- messages = json.loads(row["conversation"])
284
- response = row["response"]
285
- formatted_response = format_response(response)
 
 
 
 
 
 
 
 
 
 
286
 
287
- # Create list of all messages including the response
288
- all_messages = [
289
- format_chat_message(msg["role"], msg["content"]) for msg in messages
290
- ]
291
- all_messages.append(
292
- format_chat_message("Assistant", formatted_response, is_response=True)
293
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
- chat_html = f"""
296
- <div style="
297
- background-color: var(--surface-color);
298
- border-radius: 8px;
299
- border: 1px solid var(--border-color);
300
- box-shadow: 0 2px 4px var(--shadow-color);
301
- padding: 1.5rem;">
302
- {"".join(all_messages)}
303
- </div>
304
- """
305
 
306
- metrics_html = format_metrics(row["score"], row["rationale"], row["explanation"])
307
- tool_html = format_tool_info(row["tools_langchain"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
 
309
- return chat_html, metrics_html, tool_html
 
 
 
 
 
 
310
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
312
- def filter_and_update_display(model, dataset, min_score, max_score, current_index):
313
- try:
314
- df_chat = get_chat_and_score_df(model, dataset)
315
- df_chat = df_chat[
316
- (df_chat["score"] >= min_score) & (df_chat["score"] <= max_score)
317
- ]
318
-
319
- if df_chat.empty:
320
- return (
321
- '<div style="padding: 1rem; color: var(--text-muted);">No data available for selected filters</div>',
322
- '<div style="padding: 1rem; color: var(--text-muted);">No metrics available</div>',
323
- '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>',
324
- "0/0",
325
- )
326
-
327
- max_index = len(df_chat) - 1
328
- current_index = min(current_index, max_index)
329
- chat_html, metrics_html, tool_html = update_chat_display(df_chat, current_index)
330
-
331
- index_display = f'<div style="font-weight: 500; color: var(--primary-text);">{current_index + 1}/{len(df_chat)}</div>'
332
- return chat_html, metrics_html, tool_html, index_display
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
  except Exception as e:
335
- error_html = f"""
336
  <div style="
337
- padding: 1rem;
338
  color: var(--score-low);
339
  background-color: var(--surface-color);
340
  border: 1px solid var(--score-low);
341
- border-radius: 4px;">
342
- Error: {str(e)}
 
343
  </div>
344
  """
345
- return (
346
- error_html,
347
- '<div style="padding: 1rem; color: var(--text-muted);">No metrics available</div>',
348
- '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>',
349
- "0/0",
350
- )
 
 
 
1
import html
import json
2
 
3
 
4
def format_user_message(msg):
    """Format a user message as a right-aligned HTML chat bubble.

    Args:
        msg: Message mapping; only its ``"content"`` entry is read. The
            content may be ``None``, a string, a number, or a list whose
            items are dicts carrying a ``"text"`` key, strings, or other
            values (which are stringified).

    Returns:
        An HTML snippet. The message text is HTML-escaped so markup inside
        the conversation data is shown literally instead of being rendered.
    """
    content = msg.get("content", "")

    if content is None:
        content = ""
    elif isinstance(content, list):
        # Multi-part content: keep the text of each part, one per line.
        pieces = []
        for item in content:
            if item is None:
                continue
            if isinstance(item, dict) and "text" in item:
                if item["text"] is not None:
                    pieces.append(str(item["text"]))
            else:
                pieces.append(str(item))
        content = "\n".join(pieces).strip()

    # quote=False: the text lands in element content, never in an attribute.
    content = html.escape(str(content), quote=False)

    # The text sits directly inside the pre-wrap <div> so that template
    # whitespace is not rendered as a leading blank line.
    return f"""
    <div style="
        text-align: right;
        margin-bottom: 1.25rem;
        padding: 0 0.5rem;">
        <div style="
            display: inline-block;
            max-width: 85%;
            background-color: var(--message-bg-user);
            padding: 1rem;
            border-radius: 1rem 0 1rem 1rem;
            color: var(--text-color);
            text-align: left;
            box-shadow: 0 1px 2px var(--shadow-color);">
            <div style="
                font-weight: 500;
                margin-bottom: 0.5rem;
                color: var(--primary-text);
                display: flex;
                align-items: center;">
                <span style="margin-right: 0.5rem;">👤</span>User
            </div>
            <div style="white-space: pre-wrap; line-height: 1.5;">{content}</div>
        </div>
    </div>
    """
59
+
60
+
61
def format_tool_call(tool_name, tool_input):
    """Format a single tool call (name plus arguments) as an HTML card.

    Args:
        tool_name: Name of the tool; ``None`` becomes ``"Unknown Tool"`` and
            any other non-string value is stringified.
        tool_input: Arguments of the call; ``None`` is shown as ``{}``.

    Returns:
        An HTML snippet with the tool name and the arguments pretty-printed
        as JSON. Both are HTML-escaped.
    """
    tool_name = "Unknown Tool" if tool_name is None else str(tool_name)
    if tool_input is None:
        tool_input = {}

    try:
        # default=str stringifies non-serializable values at any nesting
        # depth, not only at the top level of a dict.
        tool_input_json = json.dumps(tool_input, indent=2, default=str)
    except (TypeError, ValueError):
        # Unsupported dict keys or circular references: show the repr-like
        # string form rather than failing the whole display.
        tool_input_json = str(tool_input)

    tool_name = html.escape(tool_name, quote=False)
    tool_input_json = html.escape(tool_input_json, quote=False)

    # The JSON sits directly inside the pre-wrap <div> so that template
    # whitespace is not rendered in front of it.
    return f"""
    <div style="
        background-color: var(--surface-color-alt);
        padding: 0.75rem;
        border-radius: 0.5rem;
        margin-top: 0.75rem;
        border-left: 3px solid var(--primary-text-light);">
        <div style="
            font-weight: 500;
            margin-bottom: 0.5rem;
            font-size: 0.9rem;
            color: var(--primary-text);">
            <span style="margin-right: 0.5rem;">🔧</span>{tool_name}
        </div>
        <div style="
            font-family: monospace;
            font-size: 0.85rem;
            white-space: pre-wrap;">{tool_input_json}</div>
    </div>
    """
111
+
112
+
113
def extract_assistant_content(msg):
    """Extract text content and rendered tool calls from an assistant message.

    Tool calls are collected from two places: ``tool_use`` items inside a
    list-valued ``"content"`` and the message-level ``"tool_calls"`` list.
    The two are handled independently, so a message that carries both a
    ``"content"`` key and ``"tool_calls"`` still shows its tool calls.

    Args:
        msg: Message mapping. ``"content"`` may be ``None``, a string, a
            number, or a list of parts; ``"tool_calls"`` may be ``None`` or
            a list of dicts with ``"name"`` and ``"args"`` entries.

    Returns:
        A ``(text, tool_calls_html)`` tuple. ``text`` is stripped plain
        text (not HTML-escaped); ``tool_calls_html`` is the concatenated
        output of ``format_tool_call``.
    """
    text_pieces = []
    tool_calls_html = ""

    content = msg.get("content")
    if isinstance(content, str):
        text_pieces.append(content)
    elif isinstance(content, (int, float)):
        text_pieces.append(str(content))
    elif isinstance(content, list):
        for item in content:
            if item is None:
                continue
            if isinstance(item, dict):
                if "text" in item:
                    if item["text"] is not None:
                        text_pieces.append(str(item["text"]))
                elif item.get("type") == "tool_use":
                    tool_calls_html += format_tool_call(
                        item.get("name", "Unknown Tool"),
                        item.get("input") or {},
                    )
            else:
                text_pieces.append(str(item))

    for tool_call in msg.get("tool_calls") or []:
        # Skip malformed entries instead of failing on ``.get``.
        if not isinstance(tool_call, dict):
            continue
        tool_calls_html += format_tool_call(
            tool_call.get("name", "Unknown Tool"),
            tool_call.get("args") or {},
        )

    assistant_text = "\n".join(text_pieces).strip()

    # Placeholder text for tool-only turns: always when the message has no
    # "content" key at all, otherwise only when a tool call was rendered.
    if (
        not assistant_text
        and "tool_calls" in msg
        and ("content" not in msg or tool_calls_html)
    ):
        assistant_text = "The assistant used the following tools:"

    return assistant_text, tool_calls_html
167
+
168
+
169
def format_assistant_message(msg):
    """Format an assistant message as a left-aligned HTML chat bubble.

    Args:
        msg: Message mapping, passed to ``extract_assistant_content`` to
            obtain the text and the rendered tool calls.

    Returns:
        An HTML snippet. The assistant text is HTML-escaped; the tool-call
        HTML returned by ``extract_assistant_content`` is inserted as-is.
    """
    assistant_text, tool_calls_html = extract_assistant_content(msg)

    # quote=False: the text lands in element content, never in an attribute.
    assistant_text = html.escape(str(assistant_text), quote=False)

    # The text sits directly inside the pre-wrap <div> so that template
    # whitespace is not rendered as a leading blank line.
    return f"""
    <div style="
        text-align: left;
        margin-bottom: 1.25rem;
        padding: 0 0.5rem;">
        <div style="
            display: inline-block;
            max-width: 85%;
            background-color: var(--message-bg-assistant);
            padding: 1rem;
            border-radius: 0 1rem 1rem 1rem;
            color: var(--text-color);
            text-align: left;
            box-shadow: 0 1px 2px var(--shadow-color);">
            <div style="
                font-weight: 500;
                margin-bottom: 0.5rem;
                color: var(--primary-text);
                display: flex;
                align-items: center;">
                <span style="margin-right: 0.5rem;">🤖</span>Assistant
            </div>
            <div style="white-space: pre-wrap; line-height: 1.5;">{assistant_text}</div>
            {tool_calls_html}
        </div>
    </div>
    """
202
+
203
+
204
def format_system_message(msg):
    """Format a system (or any non-user, non-assistant) message as HTML.

    Args:
        msg: Message mapping; only its ``"content"`` entry is read. The
            content may be ``None``, a string, a number, or a list whose
            items are dicts carrying a ``"text"`` key, strings, or other
            values (which are stringified).

    Returns:
        A centered, italic HTML snippet with the text HTML-escaped.
    """
    content = msg.get("content", "")

    if content is None:
        content = ""
    elif isinstance(content, list):
        # Multi-part content: keep the text of each part, one per line.
        pieces = []
        for item in content:
            if item is None:
                continue
            if isinstance(item, dict) and "text" in item:
                if item["text"] is not None:
                    pieces.append(str(item["text"]))
            else:
                pieces.append(str(item))
        content = "\n".join(pieces).strip()

    # quote=False: the text lands in element content, never in an attribute.
    content = html.escape(str(content), quote=False)

    return f"""
    <div style="
        text-align: center;
        margin-bottom: 1rem;
        padding: 0 0.5rem;">
        <div style="
            display: inline-block;
            max-width: 85%;
            background-color: var(--message-bg-system);
            padding: 0.75rem;
            border-radius: 0.5rem;
            color: var(--text-color);
            text-align: left;
            font-style: italic;
            font-size: 0.9rem;">
            {content}
        </div>
    </div>
    """
247
 
248
 
249
def parse_complex_response(response):
    """Split a model response into plain text and rendered tool calls.

    The response is treated as JSON only when it starts with ``[`` or ``{``
    and actually parses to an object (or a list whose first item is an
    object). Anything else - including ordinary text that merely starts
    with a bracket - is returned unchanged as the text.

    Args:
        response: Raw response; ``None``, a number, a string, or any other
            value (which is stringified).

    Returns:
        A ``(text, tool_calls_html)`` tuple. ``text`` is plain text (not
        HTML-escaped). ``tool_calls_html`` is an HTML card containing the
        HTML-escaped JSON of the ``"tool_calls"`` entry, or ``""``.
    """
    if response is None:
        return "", ""
    if isinstance(response, (int, float)):
        return str(response), ""
    if not isinstance(response, str):
        response = str(response)

    if not response.strip().startswith(("[", "{")):
        return response, ""

    try:
        response_obj = json.loads(response)
    except ValueError:
        # Looks like JSON but is not: show the raw text, not a parse error.
        return response, ""

    # A list response is represented by its first item.
    if isinstance(response_obj, list) and response_obj:
        response_obj = response_obj[0]
    if not isinstance(response_obj, dict):
        return response, ""

    text_pieces = []
    content = response_obj.get("content")
    if isinstance(content, str):
        text_pieces.append(content)
    elif isinstance(content, (int, float)):
        text_pieces.append(str(content))
    elif isinstance(content, list):
        # Only items explicitly typed as text contribute to the text.
        for item in content:
            if (
                isinstance(item, dict)
                and item.get("type") == "text"
                and item.get("text") is not None
            ):
                text_pieces.append(str(item["text"]))

    tool_calls_html = ""
    tool_calls = response_obj.get("tool_calls")
    if tool_calls:
        # tool_calls came out of json.loads, so json.dumps cannot fail here.
        tool_calls_json = html.escape(json.dumps(tool_calls, indent=2), quote=False)
        tool_calls_html = f"""
        <div style="
            background-color: var(--surface-color-alt);
            padding: 0.75rem;
            border-radius: 0.5rem;
            margin-top: 0.75rem;
            border-left: 3px solid var(--primary-text-light);">
            <div style="
                font-weight: 500;
                margin-bottom: 0.5rem;
                font-size: 0.9rem;
                color: var(--primary-text);">
                <span style="margin-right: 0.5rem;">🔧</span>Tool Calls
            </div>
            <div style="
                font-family: monospace;
                font-size: 0.85rem;
                white-space: pre-wrap;">{tool_calls_json}</div>
        </div>
        """

    return "\n".join(text_pieces).strip(), tool_calls_html
339
 
 
 
 
 
 
 
 
340
 
341
def format_final_response(response):
    """Format the model's final response as a highlighted HTML chat bubble.

    Args:
        response: Raw response value, handed to ``parse_complex_response``
            to obtain the display text and the rendered tool calls.

    Returns:
        An HTML snippet. The text is HTML-escaped; the tool-call HTML
        returned by ``parse_complex_response`` is inserted as-is.
    """
    # The former second-pass "basic JSON" fallback was dropped: it ran only
    # when the parsed text equalled the raw response, and every one of its
    # branches then left the text equal to the raw response.
    text_content, tool_calls_html = parse_complex_response(response)

    # quote=False: the text lands in element content, never in an attribute.
    text_content = html.escape(str(text_content), quote=False)

    # The text sits directly inside the pre-wrap <div> so that template
    # whitespace is not rendered as a leading blank line.
    return f"""
    <div style="
        text-align: left;
        margin-bottom: 1.25rem;
        margin-top: 1.5rem;
        padding: 0 0.5rem;">
        <div style="
            display: inline-block;
            max-width: 85%;
            background-color: var(--response-bg);
            padding: 1rem;
            border-radius: 0 1rem 1rem 1rem;
            color: var(--text-color);
            text-align: left;
            box-shadow: 0 1px 2px var(--shadow-color);
            border-left: 4px solid var(--primary-text);">
            <div style="
                font-weight: 500;
                margin-bottom: 0.5rem;
                color: var(--primary-text);
                display: flex;
                align-items: center;">
                <span style="margin-right: 0.5rem;">🤖</span>Final Response
            </div>
            <div style="
                white-space: pre-wrap;
                line-height: 1.5;
                font-family: var(--font-sans);">{text_content}</div>
            {tool_calls_html}
        </div>
    </div>
    """
398
+
399
+
400
def update_chat_display(existing_display, new_message):
    """Return ``existing_display`` with one more formatted message added.

    Args:
        existing_display: HTML string of the chat rendered so far.
        new_message: Message mapping; ``"role"`` selects the formatter
            (``"user"``, ``"assistant"``/``"ai"``, anything else is treated
            as a system message). A missing, ``None`` or non-string role is
            treated as ``"unknown"``.

    Returns:
        The updated HTML. If anything fails, the existing HTML followed by
        an error box (with the HTML-escaped error text) is returned; this
        function does not raise.
    """
    try:
        role = str(new_message.get("role") or "unknown").lower()

        if role == "user":
            message_html = format_user_message(new_message)
        elif role in ("assistant", "ai"):
            message_html = format_assistant_message(new_message)
        else:
            message_html = format_system_message(new_message)

        # The new message goes in front of this separator when it occurs
        # exactly once in the existing HTML.
        # NOTE(review): format_chat_display in this file does not emit this
        # separator, so for its output the message is appended at the end,
        # after the final response - confirm which producer is expected.
        insert_marker = '<div style="padding-top: 0.5rem;margin-top: 1rem;margin-bottom: 1rem;border-top: 1px solid var(--border-color-light);'
        if existing_display.count(insert_marker) == 1:
            before, _, after = existing_display.partition(insert_marker)
            return before + message_html + insert_marker + after

        return existing_display + message_html
    except Exception as e:
        # Display boundary: show the failure inline rather than propagate.
        error_text = html.escape(str(e), quote=False)
        return (
            existing_display
            + f"""
            <div style="
                padding: 1rem;
                color: var(--score-low);
                background-color: var(--surface-color);
                border: 1px solid var(--score-low);
                border-radius: 10px;
                margin-top: 1rem;">
                <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Updating Chat</div>
                <div style="font-family: monospace; white-space: pre-wrap;">{error_text}</div>
            </div>
            """
        )
441
+
442
+
443
def format_chat_display(row):
    """Render one dataset row as an HTML conversation view.

    Args:
        row: Mapping-like row with a ``"conversation"`` entry (JSON string
            encoding a list of message dicts) and a ``"response"`` entry
            (the final model response).

    Returns:
        HTML containing every message followed by the final response. If
        rendering fails, an error box with the HTML-escaped error text and
        the original conversation value is returned; this function does not
        raise.
    """
    try:
        messages = json.loads(row["conversation"])

        rendered = []
        for msg in messages:
            # A missing, None or non-string role counts as "unknown" and is
            # shown as a system message instead of aborting the whole view.
            role = str(msg.get("role") or "unknown").lower()

            if role == "user":
                rendered.append(format_user_message(msg))
            elif role in ("assistant", "ai"):
                rendered.append(format_assistant_message(msg))
            else:
                rendered.append(format_system_message(msg))
        messages_html = "".join(rendered)

        response_html = format_final_response(row["response"])

        return f"""
        <div style="
            padding: 1.5rem;
            background-color: var(--surface-color);
            border-radius: 10px;
            border: 1px solid var(--border-color);
            box-shadow: 0 2px 6px var(--shadow-color);
            height: 100%;
            overflow-y: auto;
            max-height: 600px;
            font-family: var(--font-sans);">
            <div style="
                padding-bottom: 1rem;
                margin-bottom: 1.5rem;
                border-bottom: 1px solid var(--border-color-light);
                display: flex;
                align-items: center;">
                <div style="
                    font-weight: 600;
                    font-size: 1.1rem;
                    color: var(--primary-text);">
                    <span style="margin-right: 0.5rem;">💬</span>Conversation
                </div>
            </div>
            {messages_html}

            {response_html}
        </div>
        """

    except Exception as e:
        # Display boundary: show the failure inline rather than propagate.
        # The conversation lookup may itself be what failed, so guard it.
        try:
            original = str(row["conversation"])
        except (LookupError, TypeError):
            original = "(unavailable)"

        error_text = html.escape(str(e), quote=False)
        original = html.escape(original, quote=False)
        return f"""
        <div style="
            padding: 1.5rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 10px;">
            <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Formatting Chat</div>
            <div style="font-family: monospace; white-space: pre-wrap;">{error_text}</div>
            <div style="margin-top: 1rem; font-family: monospace; font-size: 0.8rem;">
                Original conversation: {original}
            </div>
        </div>
        """
512
+
513
+
514
def _summarize_properties(properties, required, describe_extra):
    """Map each property name to a one-line description, marking required ones."""
    if not isinstance(properties, dict):
        return {}
    required_names = list(required) if isinstance(required, (list, tuple, set)) else []

    summary = {}
    for param_name, param_data in properties.items():
        if not isinstance(param_data, dict):
            param_data = {}
        param_desc = param_data.get("description", "No description")
        param_type = param_data.get("type", "unknown")
        text = f"{param_desc} (Type: {param_type}, {describe_extra(param_name, param_data)})"
        if param_name in required_names:
            text = f"[REQUIRED] {text}"
        summary[param_name] = text
    return summary


def parse_tool_schema(tool):
    """Extract name, description and parameter summaries from a tool schema.

    Two layouts are understood:

    * a schema with a ``"function"`` dict holding ``"name"``,
      ``"description"`` and ``"parameters"`` (``"properties"`` /
      ``"required"``); each parameter summary lists its type and default;
    * a flat schema with ``"title"``, ``"description"``, ``"properties"``
      and ``"required"``; each parameter summary lists its type and title.

    Args:
        tool: The schema dict, or a list whose first element is the schema.
            An empty list or a non-dict value yields the placeholder result.

    Returns:
        A ``(name, description, parameters)`` tuple where ``parameters``
        maps parameter names to description strings; required parameters
        are prefixed with ``"[REQUIRED] "``.
    """
    # Schemas may arrive wrapped in a list; only the first element is used.
    if isinstance(tool, list):
        tool = tool[0] if tool else {}
    if not isinstance(tool, dict):
        tool = {}

    function_data = tool.get("function")
    if isinstance(function_data, dict):
        name = function_data.get("name", "Unnamed Tool")
        description = function_data.get("description", "No description available")

        schema = function_data.get("parameters")
        if not isinstance(schema, dict):
            schema = {}
        parameters = _summarize_properties(
            schema.get("properties"),
            schema.get("required"),
            lambda _name, data: f"Default: {data.get('default', 'None')}",
        )
    else:
        name = tool.get("title", "Unnamed Tool")
        description = tool.get("description", "No description available")
        parameters = _summarize_properties(
            tool.get("properties"),
            tool.get("required"),
            lambda param_name, data: f"Title: {data.get('title', param_name)}",
        )

    return name, description, parameters
573
+
574
 
575
def format_parameters(parameters):
    """Render a mapping of parameter descriptions as an HTML fragment.

    Args:
        parameters: Mapping of parameter name to description string. A
            description containing ``"[REQUIRED]"`` is shown with a
            "Required" badge (the ``"[REQUIRED] "`` prefix is removed from
            the visible text); every other parameter is marked "Optional".

    Returns:
        An HTML string with one block per parameter, or a muted
        "No parameters" placeholder when ``parameters`` is empty or ``None``.
    """
    from html import escape

    if not parameters:
        return '<div style="color: var(--text-muted); font-style: italic;">No parameters</div>'

    blocks = []
    last_index = len(parameters) - 1
    for index, (name, desc) in enumerate(parameters.items()):
        desc = str(desc)
        is_required = "[REQUIRED]" in desc
        param_style = "required" if is_required else "optional"

        # Clean up the description to remove the REQUIRED marker but keep the info
        cleaned_desc = desc.replace("[REQUIRED] ", "") if is_required else desc

        # Only draw the separator between parameters. The last block gets no
        # border/spacing, so no page-wide `div:last-child` CSS rule is needed
        # (an unscoped rule would restyle unrelated elements on the page).
        if index == last_index:
            separator_style = ""
        else:
            separator_style = (
                "margin-bottom: 1.2rem; "
                "padding-bottom: 1.2rem; "
                "border-bottom: 1px solid var(--border-color);"
            )

        badge_background = (
            "rgba(234, 67, 53, 0.1)" if is_required else "rgba(160, 160, 160, 0.1)"
        )
        badge_label = "Required" if is_required else "Optional"

        # Names and descriptions come from dataset content, so escape them
        # before placing them into markup.
        blocks.append(
            f"""
        <div style="{separator_style}">
            <div style="
                display: flex;
                align-items: center;
                justify-content: space-between;
                margin-bottom: 0.5rem;">
                <div style="
                    font-weight: 600;
                    color: var(--primary-text);
                    font-size: 1.05rem;
                    display: flex;
                    align-items: center;">
                    {escape(str(name))}
                </div>
                <div style="
                    font-size: 0.8rem;
                    padding: 0.2rem 0.6rem;
                    border-radius: 12px;
                    background-color: {badge_background};
                    color: var(--{param_style}-color);
                    font-weight: 500;">
                    {badge_label}
                </div>
            </div>
            <div style="
                color: var(--text-color);
                line-height: 1.5;
                font-size: 0.95rem;
                opacity: 0.9;">
                {escape(cleaned_desc)}
            </div>
        </div>
        """
        )

    return "".join(blocks)
640
+
641
+
642
+ def format_metrics(score, rationale, explanation):
643
+ """Format metrics display with improved visual hierarchy and dark theme support."""
644
+ # Determine score color and add emoji indicator
645
+ if score >= 0.7:
646
+ score_color = "var(--score-high)"
647
+ score_emoji = "🟢"
648
+ score_text = "High"
649
+ elif score >= 0.4:
650
+ score_color = "var(--score-med)"
651
+ score_emoji = "🟠"
652
+ score_text = "Medium"
653
+ else:
654
+ score_color = "var(--score-low)"
655
+ score_emoji = "🔴"
656
+ score_text = "Low"
657
+
658
  return f"""
659
  <div style="
660
+ padding: 1.75rem;
661
  background-color: var(--surface-color);
662
+ border-radius: 10px;
663
  border: 1px solid var(--border-color);
664
+ box-shadow: 0 3px 8px var(--shadow-color);">
665
+ <div style="
666
+ display: flex;
667
+ align-items: center;
668
+ margin-bottom: 1.75rem;
669
+ padding-bottom: 1.5rem;
670
+ border-bottom: 1px solid var(--border-color-light);">
671
+ <div style="flex: 1;">
672
+ <h3 style="
673
+ color: var(--text-color);
674
+ font-size: 1.2rem;
675
+ margin-bottom: 0.25rem;
676
+ font-weight: 600;">TSQ Score</h3>
677
+ <div style="
678
+ display: flex;
679
+ align-items: baseline;">
680
+ <div style="
681
+ font-size: 2.5rem;
682
+ font-weight: 700;
683
+ color: {score_color};">
684
+ {score:.2f}
685
+ </div>
686
+ <div style="
687
+ margin-left: 0.75rem;
688
+ font-size: 1rem;
689
+ color: {score_color};
690
+ font-weight: 500;
691
+ display: flex;
692
+ align-items: center;">
693
+ <span style="margin-right: 0.5rem;">{score_emoji}</span>{score_text}
694
+ </div>
695
+ </div>
696
  </div>
697
  </div>
698
+ <div style="margin-bottom: 1.75rem;">
699
  <h3 style="
700
  color: var(--text-color);
701
  font-size: 1.1rem;
702
+ margin-bottom: 0.75rem;
703
+ font-weight: 600;
704
+ display: flex;
705
+ align-items: center;">
706
+ <span style="
707
+ display: inline-block;
708
+ width: 18px;
709
+ height: 18px;
710
+ background-color: var(--primary-text-light);
711
+ border-radius: 4px;
712
+ margin-right: 0.5rem;"></span>
713
+ Rationale
714
+ </h3>
715
  <div style="
716
  color: var(--text-color);
717
+ line-height: 1.6;
718
+ padding-left: 1.5rem;
719
+ border-left: 3px solid var(--primary-text-light);
720
+ font-size: 0.95rem;">
721
  {rationale}
722
  </div>
723
  </div>
 
725
  <h3 style="
726
  color: var(--text-color);
727
  font-size: 1.1rem;
728
+ margin-bottom: 0.75rem;
729
+ font-weight: 600;
730
+ display: flex;
731
+ align-items: center;">
732
+ <span style="
733
+ display: inline-block;
734
+ width: 18px;
735
+ height: 18px;
736
+ background-color: var(--primary-text-light);
737
+ border-radius: 4px;
738
+ margin-right: 0.5rem;"></span>
739
+ Explanation
740
+ </h3>
741
  <div style="
742
  color: var(--text-color);
743
+ line-height: 1.6;
744
+ padding-left: 1.5rem;
745
+ border-left: 3px solid var(--primary-text-light);
746
+ font-size: 0.95rem;">
747
  {explanation}
748
  </div>
749
  </div>
750
  </div>
751
  """
752
 
 
 
 
 
 
 
 
 
753
 
754
def format_metrics_display(row):
    """Format the metrics panel (score, rationale, explanation) as HTML.

    Args:
        row: Mapping-like record (a dict or a DataFrame row) providing the
            ``"score"``, ``"rationale"`` and ``"explanation"`` entries.

    Returns:
        An HTML string for the metrics panel. The score is shown with two
        decimals and banded as High (>= 0.7), Medium (>= 0.4) or Low. If
        anything goes wrong while building the panel (missing key,
        non-numeric score, ...) an error card containing the exception
        message is returned instead of raising.
    """
    from html import escape

    try:
        score = row["score"]
        # Rationale and explanation are free text; escape them so characters
        # such as "<" are displayed instead of being parsed as markup.
        rationale = escape(str(row["rationale"]))
        explanation = escape(str(row["explanation"]))

        # Determine score color and add emoji indicator
        if score >= 0.7:
            score_color = "var(--score-high)"
            score_emoji = "🟢"
            score_text = "High"
        elif score >= 0.4:
            score_color = "var(--score-med)"
            score_emoji = "🟠"
            score_text = "Medium"
        else:
            score_color = "var(--score-low)"
            score_emoji = "🔴"
            score_text = "Low"

        metrics_html = f"""
        <div style="
            padding: 1.5rem;
            background-color: var(--surface-color);
            border-radius: 10px;
            border: 1px solid var(--border-color);
            box-shadow: 0 2px 6px var(--shadow-color);
            height: 100%;
            overflow-y: auto;
            max-height: 600px;">
            <div style="
                padding-bottom: 1rem;
                margin-bottom: 1.5rem;
                border-bottom: 1px solid var(--border-color-light);
                display: flex;
                align-items: center;">
                <div style="
                    font-weight: 600;
                    font-size: 1.1rem;
                    color: var(--primary-text);">
                    <span style="margin-right: 0.5rem;">📊</span>Evaluation Metrics
                </div>
            </div>

            <div style="
                margin-bottom: 1.5rem;
                padding-bottom: 1.5rem;
                border-bottom: 1px solid var(--border-color-light);">
                <div style="
                    display: flex;
                    align-items: center;
                    justify-content: space-between;">
                    <div>
                        <div style="
                            font-weight: 600;
                            margin-bottom: 0.25rem;
                            color: var(--text-color);">
                            TSQ Score
                        </div>
                        <div style="
                            font-size: 2.5rem;
                            font-weight: 700;
                            color: {score_color};
                            display: flex;
                            align-items: center;">
                            {score:.2f}
                            <div style="
                                margin-left: 0.75rem;
                                font-size: 1rem;
                                display: flex;
                                align-items: center;">
                                {score_emoji} <span style="margin-left: 0.25rem;">{score_text}</span>
                            </div>
                        </div>
                    </div>
                </div>
            </div>

            <div style="margin-bottom: 1.5rem;">
                <div style="
                    font-weight: 600;
                    margin-bottom: 0.75rem;
                    color: var(--text-color);
                    display: flex;
                    align-items: center;">
                    <span style="
                        display: inline-block;
                        width: 12px;
                        height: 12px;
                        background-color: var(--primary-text-light);
                        border-radius: 2px;
                        margin-right: 0.5rem;"></span>
                    Rationale
                </div>
                <div style="
                    background-color: var(--surface-color-alt);
                    padding: 1rem;
                    border-radius: 8px;
                    border-left: 3px solid var(--primary-text-light);
                    line-height: 1.5;
                    color: var(--text-color);
                    font-size: 0.95rem;">
                    {rationale}
                </div>
            </div>

            <div>
                <div style="
                    font-weight: 600;
                    margin-bottom: 0.75rem;
                    color: var(--text-color);
                    display: flex;
                    align-items: center;">
                    <span style="
                        display: inline-block;
                        width: 12px;
                        height: 12px;
                        background-color: var(--primary-text-light);
                        border-radius: 2px;
                        margin-right: 0.5rem;"></span>
                    Explanation
                </div>
                <div style="
                    background-color: var(--surface-color-alt);
                    padding: 1rem;
                    border-radius: 8px;
                    border-left: 3px solid var(--primary-text-light);
                    line-height: 1.5;
                    color: var(--text-color);
                    font-size: 0.95rem;">
                    {explanation}
                </div>
            </div>
        </div>
        """
        return metrics_html
    except Exception as e:
        # Display boundary: show the failure in the panel rather than letting
        # one malformed row break the whole view.
        return f"""
        <div style="
            padding: 1.5rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 10px;">
            <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Formatting Metrics</div>
            <div style="font-family: monospace; white-space: pre-wrap;">{escape(str(e))}</div>
        </div>
        """
903
 
 
 
 
 
 
 
 
 
 
 
904
 
905
def format_tool_info(tools_data):
    """Format the tool information panel as HTML.

    Args:
        tools_data: Either a JSON string or an already-decoded value
            describing the available tools: a list of tool schemas, or a
            single schema dict. Both the function-style layout
            (``{"function": {...}}``) and the flat layout
            (``{"title", "properties", "required"}``) are supported.

    Returns:
        An HTML string listing every tool with its description and
        parameters. A "No Tool Information" card is returned when there are
        no tools or the JSON cannot be decoded, and an error card is
        returned if formatting fails for any other reason.
    """
    from html import escape

    empty_state_html = """
        <div style="
            padding: 1.5rem;
            text-align: center;
            color: var(--text-muted);
            background-color: var(--surface-color);
            border-radius: 10px;
            border: 1px solid var(--border-color);
            box-shadow: 0 2px 6px var(--shadow-color);">
            <div style="font-size: 1.5rem; margin-bottom: 0.75rem;">🔍</div>
            <div style="font-weight: 500; margin-bottom: 0.5rem;">No Tool Information</div>
            <div style="font-size: 0.9rem; font-style: italic;">This conversation doesn't use any tools</div>
        </div>
        """

    try:
        if not tools_data or tools_data == "[]":
            return empty_state_html

        if isinstance(tools_data, str):
            try:
                tools = json.loads(tools_data)
            except (TypeError, ValueError):
                # Undecodable payloads are shown as "no tools", not as an error.
                tools = []
        else:
            tools = tools_data

        # A single schema object is treated as a one-element tool list;
        # iterating the dict itself would walk its keys.
        if isinstance(tools, dict):
            tools = [tools]

        if not tools:
            return empty_state_html

        # Format each tool
        tool_items = ""
        for tool in tools:
            # Handle a schema wrapped in a list.
            if isinstance(tool, list) and tool:
                tool = tool[0]

            name = tool.get("title", tool.get("name", "Unnamed Tool"))
            description = tool.get("description", "No description available")

            parameters = {}
            required_params = []

            # Handle different schema formats
            if "function" in tool:
                # Function schema format
                function_data = tool["function"]
                name = function_data.get("name", name)
                description = function_data.get("description", description)

                # "parameters" may be missing or explicitly null.
                schema = function_data.get("parameters") or {}
                for param_name, param_data in (schema.get("properties") or {}).items():
                    parameters[param_name] = {
                        "description": param_data.get("description", "No description"),
                        "type": param_data.get("type", "unknown"),
                        "default": param_data.get("default", "None"),
                    }
                required_params = schema.get("required") or []
            elif "properties" in tool:
                # Original schema format
                for param_name, param_data in tool["properties"].items():
                    parameters[param_name] = {
                        "description": param_data.get("description", "No description"),
                        "type": param_data.get("type", "unknown"),
                        "title": param_data.get("title", param_name),
                    }
                required_params = tool.get("required", [])

            # Format parameters
            if parameters:
                param_blocks = []
                last_index = len(parameters) - 1
                for index, (param_name, param_data) in enumerate(parameters.items()):
                    is_required = param_name in required_params

                    # Separator only between parameters. A trailing <style>
                    # rule placed inside the container would itself be the
                    # last child, so `div:last-child` would never match.
                    if index == last_index:
                        separator_style = ""
                    else:
                        separator_style = (
                            "margin-bottom: 1rem; "
                            "padding-bottom: 1rem; "
                            "border-bottom: 1px solid var(--border-color-light);"
                        )

                    badge_background = (
                        "rgba(234, 67, 53, 0.1)"
                        if is_required
                        else "rgba(160, 160, 160, 0.1)"
                    )
                    badge_color = (
                        "var(--score-low)" if is_required else "var(--text-muted)"
                    )
                    badge_label = "Required" if is_required else "Optional"

                    param_description = escape(
                        str(param_data.get("description", "No description"))
                    )
                    param_type = escape(str(param_data.get("type", "unknown")))

                    # Only function-style schemas carry a "default" entry.
                    default_html = ""
                    if "default" in param_data:
                        default_value = escape(str(param_data["default"]))
                        default_html = (
                            '<div><span style="font-weight: 500;">Default:</span> '
                            f"{default_value}</div>"
                        )

                    param_blocks.append(
                        f"""
                    <div style="{separator_style}">
                        <div style="
                            display: flex;
                            align-items: center;
                            justify-content: space-between;
                            margin-bottom: 0.5rem;">
                            <div style="
                                font-weight: 600;
                                color: var(--primary-text);
                                font-size: 0.95rem;">
                                {escape(str(param_name))}
                            </div>
                            <div style="
                                font-size: 0.75rem;
                                padding: 0.15rem 0.5rem;
                                border-radius: 12px;
                                background-color: {badge_background};
                                color: {badge_color};
                                font-weight: 500;">
                                {badge_label}
                            </div>
                        </div>
                        <div style="
                            color: var(--text-muted);
                            line-height: 1.5;
                            font-size: 0.85rem;
                            margin-bottom: 0.25rem;">
                            {param_description}
                        </div>
                        <div style="
                            display: flex;
                            font-size: 0.8rem;
                            color: var(--text-muted);">
                            <div style="margin-right: 1rem;">
                                <span style="font-weight: 500;">Type:</span> {param_type}
                            </div>
                            {default_html}
                        </div>
                    </div>
                    """
                    )
                params_html = "".join(param_blocks)
            else:
                params_html = """
                    <div style="
                        color: var(--text-muted);
                        font-style: italic;
                        padding: 0.75rem;
                        text-align: center;
                        font-size: 0.9rem;">
                        No parameters
                    </div>
                    """

            tool_items += f"""
            <div style="
                margin-bottom: 1.5rem;
                padding: 1.5rem;
                border-radius: 8px;
                background-color: var(--surface-color-alt);
                border: 1px solid var(--border-color);
                box-shadow: 0 1px 3px var(--shadow-color);">
                <div style="
                    font-weight: 600;
                    color: var(--primary-text);
                    margin-bottom: 0.75rem;
                    font-size: 1.05rem;
                    display: flex;
                    align-items: center;">
                    <span style="margin-right: 8px;">⚙️</span> {escape(str(name))}
                </div>
                <div style="
                    color: var(--text-color);
                    margin-bottom: 1.25rem;
                    line-height: 1.5;
                    font-size: 0.95rem;
                    padding-left: 0.5rem;
                    border-left: 3px solid var(--primary-text-light);">
                    {escape(str(description))}
                </div>
                <div style="
                    font-weight: 600;
                    color: var(--text-color);
                    margin-bottom: 0.75rem;
                    font-size: 0.9rem;">
                    Parameters:
                </div>
                <div class="tool-params">
                    {params_html}
                </div>
            </div>
            """

        full_tools_html = f"""
        <div style="
            padding: 1.5rem;
            background-color: var(--surface-color);
            border-radius: 10px;
            border: 1px solid var(--border-color);
            box-shadow: 0 2px 6px var(--shadow-color);
            height: 100%;
            overflow-y: auto;
            max-height: 600px;">
            <div style="
                padding-bottom: 1rem;
                margin-bottom: 1.5rem;
                border-bottom: 1px solid var(--border-color-light);
                display: flex;
                align-items: center;">
                <div style="
                    font-weight: 600;
                    font-size: 1.1rem;
                    color: var(--primary-text);">
                    <span style="margin-right: 0.5rem;">🛠️</span>Available Tools
                </div>
            </div>
            {tool_items}
        </div>
        """
        return full_tools_html
    except Exception as e:
        # Display boundary: report the failure in the panel instead of raising.
        return f"""
        <div style="
            padding: 1.5rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 10px;">
            <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Formatting Tool Info</div>
            <div style="font-family: monospace; white-space: pre-wrap;">{escape(str(e))}</div>
        </div>
        """
 
 
 
 
 
 
data_loader.py CHANGED
@@ -32,6 +32,8 @@ def load_data():
32
  )
33
  return df
34
 
 
 
35
 
36
  # categories.py
37
  CATEGORIES = {
@@ -602,9 +604,9 @@ HEADER_CONTENT = (
602
 
603
  CARDS = """ <div class="metrics-grid">
604
  <div class="metric-card">
605
- <div class="metric-number metric-blue">17</div>
606
  <div class="metric-label">Total Models</div>
607
- <div class="metric-detail primary">12 Private</div>
608
  <div class="metric-detail primary">5 Open Source</div>
609
  </div>
610
 
@@ -1001,7 +1003,7 @@ METHODOLOGY = """
1001
  <tbody>
1002
  <tr>
1003
  <td>Performance Champion</td>
1004
- <td>Gemini-2.0-flash dominates with 0.938 score at a very affordable cost, excelling in both complex tasks and safety features.</td>
1005
  </tr>
1006
  <tr>
1007
  <td>Price-Performance Paradox</td>
 
32
  )
33
  return df
34
 
35
# Load the leaderboard table once, at import time, so it can be shared by
# importers of this module.
df = load_data()
# Unique values of the "Model" column with surrounding whitespace stripped.
# `x.strip()` assumes every value is a string; a missing (NaN) cell would raise.
# NOTE(review): results.csv ends with a "Dataset Avg" summary row. Unless
# load_data() drops it, that label will also appear in MODELS - confirm.
MODELS = [x.strip() for x in df["Model"].unique().tolist()]
37
 
38
  # categories.py
39
  CATEGORIES = {
 
604
 
605
  CARDS = """ <div class="metrics-grid">
606
  <div class="metric-card">
607
+ <div class="metric-number metric-blue">18</div>
608
  <div class="metric-label">Total Models</div>
609
+ <div class="metric-detail primary">13 Private</div>
610
  <div class="metric-detail primary">5 Open Source</div>
611
  </div>
612
 
 
1003
  <tbody>
1004
  <tr>
1005
  <td>Performance Champion</td>
1006
+ <td>Claude 3.7 Sonnet comes at the top with 0.953 but Gemini-2.0-flash dominates with 0.938 score at a very affordable cost, excelling in both complex tasks and safety features.</td>
1007
  </tr>
1008
  <tr>
1009
  <td>Price-Performance Paradox</td>
output/claude-3-7-sonnet-20250219/BFCL_v3_irrelevance.parquet ADDED
Binary file (52.7 kB). View file
 
output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_base_multi_func_call.parquet ADDED
Binary file (26.5 kB). View file
 
output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_base_single_func_call.parquet ADDED
Binary file (26.6 kB). View file
 
output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_composite.parquet ADDED
Binary file (51.9 kB). View file
 
output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_long_context.parquet ADDED
Binary file (43.2 kB). View file
 
output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_miss_func.parquet ADDED
Binary file (51.1 kB). View file
 
output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_miss_param.parquet ADDED
Binary file (50.5 kB). View file
 
output/claude-3-7-sonnet-20250219/tau_long_context.parquet ADDED
Binary file (46.6 kB). View file
 
output/claude-3-7-sonnet-20250219/toolace_single_func_call_1.parquet ADDED
Binary file (20.8 kB). View file
 
output/claude-3-7-sonnet-20250219/toolace_single_func_call_2.parquet ADDED
Binary file (17.1 kB). View file
 
output/claude-3-7-sonnet-20250219/xlam_multiple_tool_multiple_call.parquet ADDED
Binary file (107 kB). View file
 
output/claude-3-7-sonnet-20250219/xlam_multiple_tool_single_call.parquet ADDED
Binary file (51.2 kB). View file
 
output/claude-3-7-sonnet-20250219/xlam_single_tool_multiple_call.parquet ADDED
Binary file (34.2 kB). View file
 
output/claude-3-7-sonnet-20250219/xlam_single_tool_single_call.parquet ADDED
Binary file (56.5 kB). View file
 
output/claude-3-7-sonnet-20250219/xlam_tool_miss.parquet ADDED
Binary file (66.7 kB). View file
 
requirements.txt CHANGED
@@ -1,5 +1,4 @@
1
- gradio==5.12.0
2
  pandas
3
  matplotlib
4
- plotly
5
- promptquality==0.72.1
 
1
+ gradio==5.18.0
2
  pandas
3
  matplotlib
4
+ plotly
 
results.csv CHANGED
@@ -1,4 +1,5 @@
1
  Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
 
2
  gemini-2.0-flash-001,Private,Normal,Google,0.15,0.6,0.938,0.95,0.93,0.91,0.94,0.9,0.96,0.92,0.95,0.89,0.91,0.98,0.93,0.97,0.98,0.93,0.965
3
  gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
4
  gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
@@ -16,4 +17,4 @@ mistral-small-2409,Private,Normal,Mistral,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,
16
  ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
17
  Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
18
  open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
19
- ,,,,,,,0.83,0.79,0.81,0.78,0.76,0.88,0.80,0.96,0.60,0.81,0.82,0.81,0.92,0.85,0.73,0.80
 
1
  Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
2
+ claude-3-7-sonnet-20250219,Private,Reasoning,Anthropic,3,15,0.953,0.96,0.95,0.92,0.96,1,0.95,0.97,1,0.96,0.94,0.97,0.96,0.99,0.82,0.92,0.975
3
  gemini-2.0-flash-001,Private,Normal,Google,0.15,0.6,0.938,0.95,0.93,0.91,0.94,0.9,0.96,0.92,0.95,0.89,0.91,0.98,0.93,0.97,0.98,0.93,0.965
4
  gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
5
  gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
 
17
  ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
18
  Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
19
  open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
20
+ Dataset Avg,,,,,,,0.83,0.80,0.81,0.79,0.78,0.89,0.81,0.96,0.62,0.81,0.82,0.82,0.92,0.85,0.74,0.81
tabs/data_exploration.py CHANGED
@@ -1,135 +1,674 @@
1
  import gradio as gr
2
- from chat import get_chat_and_score_df, update_chat_display
 
 
 
 
 
 
 
3
 
4
- def create_exploration_tab(df, MODELS, DATASETS, SCORES, HEADER_CONTENT):
5
 
6
- def filter_and_update_display(model, dataset, min_score, max_score, current_index):
7
- try:
8
- df_chat = get_chat_and_score_df(model, dataset)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
- # Filter by score range
11
- df_chat = df_chat[
12
- (df_chat["score"] >= min_score) & (df_chat["score"] <= max_score)
13
- ]
14
-
15
- if df_chat.empty:
16
- return (
17
- "<div>No data available for selected filters</div>",
18
- "<div>No metrics available</div>",
19
- "<div>No tool information available</div>",
20
- "0/0",
21
- )
22
 
23
- max_index = len(df_chat) - 1
24
- current_index = min(current_index, max_index)
25
- chat_html, metrics_html, tool_html = update_chat_display(
26
- df_chat, current_index
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  )
28
 
29
- return (
30
- chat_html,
31
- metrics_html,
32
- tool_html,
33
- f"{current_index + 1}/{len(df_chat)}",
 
 
 
 
34
  )
35
- except Exception as e:
36
- print(f"Error in filter_and_update_display: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  return (
38
- f"<div>Error: {str(e)}</div>",
39
- "<div>No metrics available</div>",
40
- "<div>No tool information available</div>",
41
- "0/0",
42
  )
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  with gr.Tab("Data Exploration"):
45
- gr.HTML(HEADER_CONTENT)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- # All filters in a single row with consistent sizing
48
- with gr.Row(equal_height=True):
49
- explore_model = gr.Dropdown(
50
- choices=MODELS,
51
- value=MODELS[0],
52
- label="Model",
53
- container=True,
54
- scale=1,
55
- )
56
- explore_dataset = gr.Dropdown(
57
- choices=DATASETS,
58
- value=DATASETS[0],
59
- label="Dataset",
60
- container=True,
61
- scale=1,
62
- )
63
- min_score = gr.Slider(
64
- minimum=min(SCORES),
65
- maximum=max(SCORES),
66
- value=min(SCORES),
67
- step=0.1,
68
- label="Minimum Score - TSQ",
69
- container=True,
70
- scale=1,
71
- )
72
- max_score = gr.Slider(
73
- minimum=min(SCORES),
74
- maximum=max(SCORES),
75
- value=max(SCORES),
76
- step=0.1,
77
- label="Maximum Score - TSQ",
78
- container=True,
79
- scale=1,
80
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  # Navigation row
83
  with gr.Row(variant="panel"):
84
- index_display = gr.HTML( # Changed the variable name to index_display
85
- value="0/0", elem_id="index-display", elem_classes="text-center"
86
- )
87
- with gr.Row():
88
- prev_btn = gr.Button("← Previous", size="lg", variant="secondary")
89
- next_btn = gr.Button("Next →", size="lg", variant="secondary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
- # Content area with equal column widths
92
  with gr.Row(equal_height=True):
93
- chat_display = gr.HTML()
94
- metrics_display = gr.HTML()
 
 
 
 
95
  tool_info_display = gr.HTML()
96
 
 
97
  current_index = gr.State(value=0)
98
 
99
- # Update display on filter change
100
- def update_on_filter_change(model, dataset, min_score, max_score):
101
- return filter_and_update_display(model, dataset, min_score, max_score, 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
- for control in [explore_model, explore_dataset, min_score, max_score]:
 
 
 
 
 
 
 
 
 
 
104
  control.change(
105
- update_on_filter_change,
106
- inputs=[explore_model, explore_dataset, min_score, max_score],
 
 
 
 
 
 
 
 
107
  outputs=[
108
  chat_display,
109
  metrics_display,
110
  tool_info_display,
111
  index_display,
112
- ], # Changed to index_display
113
- )
114
-
115
- # Navigation functions
116
- def navigate(direction, current_idx, model, dataset, min_score, max_score):
117
- new_index = current_idx + direction
118
- return (
119
- *filter_and_update_display(
120
- model, dataset, min_score, max_score, new_index
121
- ),
122
- new_index,
123
  )
124
 
 
125
  prev_btn.click(
126
- lambda idx, m, d, min_s, max_s: navigate(-1, idx, m, d, min_s, max_s),
127
  inputs=[
128
  current_index,
129
  explore_model,
130
  explore_dataset,
131
  min_score,
132
  max_score,
 
 
 
133
  ],
134
  outputs=[
135
  chat_display,
@@ -137,17 +676,20 @@ def create_exploration_tab(df, MODELS, DATASETS, SCORES, HEADER_CONTENT):
137
  tool_info_display,
138
  index_display,
139
  current_index,
140
- ], # Changed to index_display
141
  )
142
 
143
  next_btn.click(
144
- lambda idx, m, d, min_s, max_s: navigate(1, idx, m, d, min_s, max_s),
145
  inputs=[
146
  current_index,
147
  explore_model,
148
  explore_dataset,
149
  min_score,
150
  max_score,
 
 
 
151
  ],
152
  outputs=[
153
  chat_display,
@@ -155,12 +697,126 @@ def create_exploration_tab(df, MODELS, DATASETS, SCORES, HEADER_CONTENT):
155
  tool_info_display,
156
  index_display,
157
  current_index,
158
- ], # Changed to index_display
159
  )
160
 
161
- return (
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  chat_display,
163
  metrics_display,
164
  tool_info_display,
165
- index_display, # Changed to index_display
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  )
 
1
  import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ from data_loader import MODELS, DATASETS, SCORES, HEADER_CONTENT
5
+ from chat import (
6
+ format_chat_display,
7
+ format_metrics_display,
8
+ format_tool_info,
9
+ )
10
 
 
11
 
12
def get_updated_df(df, df_output):
    """Attach model outputs to the first ``len(df_output)`` dataset rows.

    Args:
        df: Dataset frame; must contain the conversation, tools_langchain,
            n_turns, len_query and n_tools columns.
        df_output: Model output frame providing the response, rationale,
            explanation and score columns.

    Returns:
        A new frame holding only the dataset columns followed by the output
        columns. Rows are paired by position, not by index label.
    """
    dataset_columns = (
        "conversation",
        "tools_langchain",
        "n_turns",
        "len_query",
        "n_tools",
    )
    output_columns = ("response", "rationale", "explanation", "score")

    # Work on a copy so the caller's dataset frame is never mutated.
    merged = df.iloc[: len(df_output)].copy()
    for name in output_columns:
        # .tolist() drops the output frame's index so values align by position.
        merged[name] = df_output[name].tolist()

    return merged[[*dataset_columns, *output_columns]]
30
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
def get_chat_and_score_df(model, dataset):
    """Load one dataset together with one model's outputs for it.

    Reads ``output/{model}/{dataset}.parquet`` and
    ``datasets/{dataset}.parquet`` (relative to the working directory) and
    combines them with ``get_updated_df``.

    Args:
        model: Name of the model directory under ``output/``.
        dataset: Dataset name (parquet file stem).

    Returns:
        The combined frame produced by ``get_updated_df``.
    """
    model_outputs = pd.read_parquet(f"output/{model}/{dataset}.parquet")
    dataset_rows = pd.read_parquet(f"datasets/{dataset}.parquet")
    return get_updated_df(dataset_rows, model_outputs)
37
+
38
+
39
def on_filter_change(
    model,
    dataset,
    min_score,
    max_score,
    min_n_turns,
    min_len_query,
    min_n_tools,
):
    """Re-render the exploration panels after any filter control changes.

    Always shows the first matching entry (index 0) for the new filters.

    Args:
        model: Selected model name.
        dataset: Selected dataset name.
        min_score: Lower bound on the score (inclusive).
        max_score: Upper bound on the score (inclusive).
        min_n_turns: Minimum number of conversation turns.
        min_len_query: Minimum query length.
        min_n_tools: Minimum number of tools.

    Returns:
        A 4-tuple of HTML strings: (chat, metrics, tool info, index label).
        Never raises; failures are rendered as an error panel instead.
    """
    # Local import: the exception text is untrusted and must be escaped
    # before being embedded in HTML.
    from html import escape

    try:
        # Unpacking doubles as a check that exactly four panels came back.
        chat_html, metrics_html, tool_html, index_html = filter_and_display(
            model,
            dataset,
            min_score,
            max_score,
            min_n_turns,
            min_len_query,
            min_n_tools,
            0,
        )
        return chat_html, metrics_html, tool_html, index_html
    except Exception as e:  # UI boundary: show the failure, don't crash the app
        error_html = f"""
        <div style="padding: 1.5rem; color: var(--score-low);">
            <div style="font-weight: 600;">Filter Error</div>
            <div style="font-family: monospace; background-color: var(--surface-color-alt); padding: 0.5rem; margin-top: 0.5rem;">
                {escape(str(e))}
            </div>
        </div>
        """
        return (
            error_html,
            "<div style='text-align: center;'>No metrics available</div>",
            "<div style='text-align: center;'>No tool information available</div>",
            "<div style='text-align: center;'>0/0</div>",
        )
77
+
78
+
79
def _unwrap(value, cast, default=None):
    """Coerce a Gradio input with ``cast``, unwrapping ``{"value": ...}`` dicts.

    Returns ``default`` when the input is ``None`` or a dict without a
    ``"value"`` key. Errors raised by ``cast`` propagate to the caller.
    """
    if isinstance(value, dict):
        if "value" not in value:
            return default
        return cast(value["value"])
    if value is None:
        return default
    return cast(value)


def _render_entry(
    model,
    dataset,
    min_score,
    max_score,
    min_n_turns,
    min_len_query,
    min_n_tools,
    index,
):
    """Filter the data and render one entry.

    Returns:
        (chat_html, metrics_html, tool_html, index_html, shown_index).
        ``shown_index`` is the index actually displayed after clamping, 0 when
        no row matches, and ``None`` when rendering failed. Never raises.
    """
    # Local import: exception text is escaped before being embedded in HTML.
    from html import escape

    try:
        # Fallbacks are looked up lazily so MODELS/DATASETS/SCORES are only
        # touched when an input is actually missing.
        model_str = _unwrap(model, str)
        if model_str is None:
            model_str = MODELS[0]

        dataset_str = _unwrap(dataset, str)
        if dataset_str is None:
            dataset_str = DATASETS[0]

        min_score_val = _unwrap(min_score, float)
        if min_score_val is None:
            min_score_val = float(min(SCORES))

        max_score_val = _unwrap(max_score, float)
        if max_score_val is None:
            max_score_val = float(max(SCORES))

        min_n_turns_val = _unwrap(min_n_turns, int, 0)
        min_len_query_val = _unwrap(min_len_query, int, 0)
        min_n_tools_val = _unwrap(min_n_tools, int, 0)

        # A malformed index is not worth an error panel; fall back to 0.
        try:
            index_val = _unwrap(index, int, 0)
        except (ValueError, TypeError):
            index_val = 0

        # Get the data
        df_chat = get_chat_and_score_df(model_str, dataset_str)

        # Ensure filter columns exist and are numeric
        for col, default in (
            ("score", 0.0),
            ("n_turns", 0),
            ("len_query", 0),
            ("n_tools", 0),
        ):
            if col in df_chat.columns:
                df_chat[col] = pd.to_numeric(df_chat[col], errors="coerce").fillna(
                    default
                )
            else:
                df_chat[col] = default

        # Apply all filters
        df_filtered = df_chat[
            (df_chat["score"] >= min_score_val)
            & (df_chat["score"] <= max_score_val)
            & (df_chat["n_turns"] >= min_n_turns_val)
            & (df_chat["len_query"] >= min_len_query_val)
            & (df_chat["n_tools"] >= min_n_tools_val)
        ]

        if len(df_filtered) == 0:
            empty_message = """
            <div style="
                padding: 1.5rem;
                text-align: center;
                color: var(--text-muted);
                background-color: var(--surface-color-alt);
                border-radius: 8px;
                border: 1px dashed var(--border-color);
                margin: 1rem 0;">
                <div style="font-size: 2rem; margin-bottom: 1rem;">📭</div>
                <div style="font-weight: 500; margin-bottom: 0.5rem;">No Results Found</div>
                <div style="font-style: italic; font-size: 0.9rem;">Try adjusting your filters to see more data</div>
            </div>
            """
            return (
                empty_message,
                empty_message,
                empty_message,
                "<div style='text-align: center; color: var(--text-muted);'>0/0</div>",
                0,
            )

        # Clamp the requested index into the filtered range
        max_index = len(df_filtered) - 1
        valid_index = max(0, min(index_val, max_index))

        row = df_filtered.iloc[valid_index]

        chat_html = format_chat_display(row)
        metrics_html = format_metrics_display(row)

        # Tool info is optional: a failure here must not hide the chat panel.
        try:
            tool_html = format_tool_info(row["tools_langchain"])
        except Exception as e:
            tool_html = f"""
            <div style="padding: 1rem; background-color: var(--surface-color-alt); border-radius: 8px; color: var(--text-muted);">
                <div style="font-weight: 500; margin-bottom: 0.5rem;">Tool Information Unavailable</div>
                <div style="font-size: 0.9rem;">Error: {escape(str(e))}</div>
            </div>
            """

        index_html = f"""
        <div style="
            display: flex;
            align-items: center;
            justify-content: center;
            font-weight: 500;
            color: var(--primary-text);
            background-color: var(--surface-color-alt);
            padding: 0.5rem 1rem;
            border-radius: 20px;
            font-size: 0.9rem;
            width: fit-content;
            margin: 0 auto;">
            <span style="margin-right: 0.5rem;">📄</span>{valid_index + 1}/{len(df_filtered)}
        </div>
        """

        return chat_html, metrics_html, tool_html, index_html, valid_index

    except Exception as e:  # UI boundary: show the failure, don't crash the app
        error_html = f"""
        <div style="
            padding: 1.5rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 8px;
            margin: 1rem 0;
            display: flex;
            align-items: flex-start;">
            <div style="flex-shrink: 0; margin-right: 1rem; font-size: 1.5rem;">⚠️</div>
            <div>
                <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Occurred</div>
                <div style="
                    font-family: monospace;
                    background-color: var(--surface-color-alt);
                    padding: 1rem;
                    border-radius: 4px;
                    white-space: pre-wrap;
                    font-size: 0.9rem;">
                    {escape(str(e))}
                </div>
            </div>
        </div>
        """
        return (
            error_html,
            "<div style='padding: 1.5rem; color: var(--text-muted); text-align: center;'>No metrics available</div>",
            "<div style='padding: 1.5rem; color: var(--text-muted); text-align: center;'>No tool information available</div>",
            "<div style='text-align: center; color: var(--text-muted);'>0/0</div>",
            None,
        )


def _navigate(step, current_idx, filters):
    """Move ``step`` entries from ``current_idx`` and render the result.

    Returns the four HTML panels plus the index to store in the state. The
    stored index is the one actually shown, so it can never run past either
    end of the filtered results.
    """
    from html import escape

    try:
        idx_val = _unwrap(current_idx, int, 0)
    except (ValueError, TypeError) as e:
        error_html = f"""
        <div style="padding: 1.5rem; color: var(--score-low);">
            <div style="font-weight: 600;">Navigation Error</div>
            <div style="font-family: monospace; background-color: var(--surface-color-alt); padding: 0.5rem; margin-top: 0.5rem;">
                {escape(str(e))}
            </div>
        </div>
        """
        return (
            error_html,
            "<div style='text-align: center;'>No metrics available</div>",
            "<div style='text-align: center;'>No tool information available</div>",
            "<div style='text-align: center;'>0/0</div>",
            0,
        )

    *panels, shown_index = _render_entry(*filters, max(0, idx_val + step))
    if shown_index is None:
        # Rendering failed: stay where we were instead of advancing blindly.
        shown_index = max(0, idx_val)
    return (*panels, shown_index)


def navigate_prev(
    current_idx,
    model,
    dataset,
    min_score,
    max_score,
    min_n_turns,
    min_len_query,
    min_n_tools,
):
    """Show the entry before ``current_idx`` under the current filters.

    Args:
        current_idx: Index currently stored in the navigation state.
        model, dataset, min_score, max_score, min_n_turns, min_len_query,
            min_n_tools: Current filter values, as for ``filter_and_display``.

    Returns:
        (chat_html, metrics_html, tool_html, index_html, new_index), where
        ``new_index`` never drops below 0. Never raises.
    """
    return _navigate(
        -1,
        current_idx,
        (
            model,
            dataset,
            min_score,
            max_score,
            min_n_turns,
            min_len_query,
            min_n_tools,
        ),
    )


def navigate_next(
    current_idx,
    model,
    dataset,
    min_score,
    max_score,
    min_n_turns,
    min_len_query,
    min_n_tools,
):
    """Show the entry after ``current_idx`` under the current filters.

    Args:
        current_idx: Index currently stored in the navigation state.
        model, dataset, min_score, max_score, min_n_turns, min_len_query,
            min_n_tools: Current filter values, as for ``filter_and_display``.

    Returns:
        (chat_html, metrics_html, tool_html, index_html, new_index), where
        ``new_index`` stops at the last matching entry. Never raises.
    """
    return _navigate(
        1,
        current_idx,
        (
            model,
            dataset,
            min_score,
            max_score,
            min_n_turns,
            min_len_query,
            min_n_tools,
        ),
    )


def filter_and_display(
    model,
    dataset,
    min_score,
    max_score,
    min_n_turns,
    min_len_query,
    min_n_tools,
    index=0,
):
    """Combined function to filter data and update display.

    Each argument may be a plain value, ``None`` (falls back to a default), or
    a ``{"value": ...}`` dict.

    Args:
        model: Model name; defaults to ``MODELS[0]``.
        dataset: Dataset name; defaults to ``DATASETS[0]``.
        min_score: Inclusive lower score bound; defaults to ``min(SCORES)``.
        max_score: Inclusive upper score bound; defaults to ``max(SCORES)``.
        min_n_turns: Minimum ``n_turns``; defaults to 0.
        min_len_query: Minimum ``len_query``; defaults to 0.
        min_n_tools: Minimum ``n_tools``; defaults to 0.
        index: Position within the filtered rows; clamped into range.

    Returns:
        A 4-tuple of HTML strings: (chat, metrics, tool info, index label).
        Never raises; failures are rendered as an error panel instead.
    """
    chat_html, metrics_html, tool_html, index_html, _ = _render_entry(
        model,
        dataset,
        min_score,
        max_score,
        min_n_turns,
        min_len_query,
        min_n_tools,
        index,
    )
    return chat_html, metrics_html, tool_html, index_html
392
+
393
+
394
def create_exploration_tab(df):
    """Create an enhanced data exploration tab with better UI and functionality.

    Must be called inside an active ``gr.Blocks`` context. Builds the filter
    controls, the previous/next navigation and the three content panels, and
    wires their events.

    Args:
        df: Not used by this function; kept so existing callers keep working.
            The data shown is loaded via ``get_chat_and_score_df``.

    Returns:
        list: ``[chat_display, metrics_display, tool_info_display,
        index_display]``.
    """

    def slider_maxima(model, dataset):
        """Return (n_turns, len_query, n_tools) slider maxima for one selection."""
        df_chat = get_chat_and_score_df(model, dataset)
        maxima = []
        # Floors keep every slider usable (maximum > minimum) even when a
        # column is all zeros.
        for column, floor in (("n_turns", 1), ("len_query", 10), ("n_tools", 1)):
            values = pd.to_numeric(df_chat[column], errors="coerce").fillna(0)
            peak = values.max()
            # .max() of an empty column is NaN, which int() cannot convert.
            maxima.append(max(floor, 0 if pd.isna(peak) else int(peak)))
        return tuple(maxima)

    # Main UI setup
    with gr.Tab("Data Exploration"):
        # CSS styling
        gr.HTML(
            """
            <style>
                /* Custom styling for the exploration tab */
                :root[data-theme="light"] {
                    --surface-color: #f8f9fa;
                    --surface-color-alt: #ffffff;
                    --text-color: #202124;
                    --text-muted: #666666;
                    --primary-text: #1a73e8;
                    --primary-text-light: rgba(26, 115, 232, 0.3);
                    --border-color: #e9ecef;
                    --border-color-light: #f1f3f5;
                    --shadow-color: rgba(0,0,0,0.05);
                    --message-bg-user: #E5F6FD;
                    --message-bg-assistant: #F7F7F8;
                    --message-bg-system: #FFF3E0;
                    --response-bg: #F0F7FF;
                    --score-high: #1a73e8;
                    --score-med: #f4b400;
                    --score-low: #ea4335;
                }

                :root[data-theme="dark"] {
                    --surface-color: #1e1e1e;
                    --surface-color-alt: #2d2d2d;
                    --text-color: #ffffff;
                    --text-muted: #a0a0a0;
                    --primary-text: #60a5fa;
                    --primary-text-light: rgba(96, 165, 250, 0.3);
                    --border-color: #404040;
                    --border-color-light: #333333;
                    --shadow-color: rgba(0,0,0,0.2);
                    --message-bg-user: #2d3748;
                    --message-bg-assistant: #1a1a1a;
                    --message-bg-system: #2c2516;
                    --response-bg: #1e2a3a;
                    --score-high: #60a5fa;
                    --score-med: #fbbf24;
                    --score-low: #ef4444;
                }

                #exploration-header {
                    margin-bottom: 1.5rem;
                    padding-bottom: 1rem;
                    border-bottom: 1px solid var(--border-color);
                }

                .filter-container {
                    background-color: var(--surface-color);
                    border-radius: 10px;
                    padding: 1rem;
                    margin-bottom: 1.5rem;
                    border: 1px solid var(--border-color);
                    box-shadow: 0 2px 6px var(--shadow-color);
                }

                .navigation-buttons button {
                    min-width: 120px;
                    font-weight: 500;
                }

                .content-panel {
                    margin-top: 1.5rem;
                }

                @media (max-width: 768px) {
                    .filter-row {
                        flex-direction: column;
                    }
                }
            </style>
            """
        )

        # Header
        with gr.Row(elem_id="exploration-header"):
            gr.HTML(HEADER_CONTENT)

        # Filters section
        with gr.Column(elem_classes="filter-container"):
            gr.Markdown("### 🔍 Filter Options")

            with gr.Row(equal_height=True, elem_classes="filter-row"):
                explore_model = gr.Dropdown(
                    choices=MODELS,
                    value=MODELS[0],
                    label="Model",
                    container=True,
                    scale=1,
                    info="Select AI model",
                )
                explore_dataset = gr.Dropdown(
                    choices=DATASETS,
                    value=DATASETS[0],
                    label="Dataset",
                    container=True,
                    scale=1,
                    info="Select evaluation dataset",
                )

            with gr.Row(equal_height=True, elem_classes="filter-row"):
                min_score = gr.Slider(
                    minimum=float(min(SCORES)),
                    maximum=float(max(SCORES)),
                    value=float(min(SCORES)),
                    step=0.1,
                    label="Minimum TSQ Score",
                    container=True,
                    scale=1,
                    info="Filter responses with scores above this threshold",
                )
                max_score = gr.Slider(
                    minimum=float(min(SCORES)),
                    maximum=float(max(SCORES)),
                    value=float(max(SCORES)),
                    step=0.1,
                    label="Maximum TSQ Score",
                    container=True,
                    scale=1,
                    info="Filter responses with scores below this threshold",
                )

            # Initial ranges use the same coercion and floors as later updates.
            n_turns_max, len_query_max, n_tools_max = slider_maxima(
                explore_model.value, explore_dataset.value
            )

            with gr.Row(equal_height=True, elem_classes="filter-row"):
                n_turns_filter = gr.Slider(
                    minimum=0,
                    maximum=n_turns_max,
                    value=0,
                    step=1,
                    label="Minimum Turn Count",
                    container=True,
                    scale=1,
                    info="Filter by minimum number of conversation turns",
                )

                len_query_filter = gr.Slider(
                    minimum=0,
                    maximum=len_query_max,
                    value=0,
                    step=10,
                    label="Minimum Query Length",
                    container=True,
                    scale=1,
                    info="Filter by minimum length of query in characters",
                )

                n_tools_filter = gr.Slider(
                    minimum=0,
                    maximum=n_tools_max,
                    value=0,
                    step=1,
                    label="Minimum Tool Count",
                    container=True,
                    scale=1,
                    info="Filter by minimum number of tools used",
                )

            with gr.Row():
                reset_btn = gr.Button("Reset Filters", size="sm", variant="secondary")

        # Navigation row
        with gr.Row(variant="panel"):
            with gr.Column(scale=1):
                prev_btn = gr.Button(
                    "← Previous",
                    size="lg",
                    variant="secondary",
                    elem_classes="navigation-buttons",
                )

            with gr.Column(scale=1, min_width=100):
                index_display = gr.HTML(
                    value="<div style='text-align: center; color: var(--text-muted);'>0/0</div>",
                    elem_id="index-display",
                )

            with gr.Column(scale=1):
                next_btn = gr.Button(
                    "Next →",
                    size="lg",
                    variant="secondary",
                    elem_classes="navigation-buttons",
                )

        # Content areas
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                chat_display = gr.HTML()
            with gr.Column(scale=1):
                metrics_display = gr.HTML()

        with gr.Row():
            tool_info_display = gr.HTML()

        # State for tracking current index (simple integer state)
        current_index = gr.State(value=0)

        # Shared wiring lists, so every event sees the same component order.
        filter_inputs = [
            explore_model,
            explore_dataset,
            min_score,
            max_score,
            n_turns_filter,
            len_query_filter,
            n_tools_filter,
        ]
        display_outputs = [
            chat_display,
            metrics_display,
            tool_info_display,
            index_display,
        ]
        nav_outputs = display_outputs + [current_index]

        # Reset filters
        def reset_filters():
            return (
                MODELS[0],
                DATASETS[0],
                float(min(SCORES)),
                float(max(SCORES)),
                0,  # n_turns
                0,  # len_query
                0,  # n_tools
            )

        reset_btn.click(reset_filters, outputs=filter_inputs)

        def apply_filters(*filter_values):
            """Render the first match and reset the stored index to 0."""
            # Without the reset, the next "Next"/"Previous" click would
            # continue from the position reached under the old filters.
            return (*on_filter_change(*filter_values), 0)

        # Connect filter changes
        for control in filter_inputs:
            control.change(
                apply_filters,
                inputs=filter_inputs,
                outputs=nav_outputs,
            )

        # Connect navigation buttons with necessary filter parameters
        prev_btn.click(
            navigate_prev,
            inputs=[current_index] + filter_inputs,
            outputs=nav_outputs,
        )

        next_btn.click(
            navigate_next,
            inputs=[current_index] + filter_inputs,
            outputs=nav_outputs,
        )

        def update_slider_ranges(model, dataset):
            """Resize the count/length sliders for a new selection and zero them."""
            n_turns_max, len_query_max, n_tools_max = slider_maxima(model, dataset)
            return (
                gr.update(maximum=n_turns_max, value=0),
                gr.update(maximum=len_query_max, value=0),
                gr.update(maximum=n_tools_max, value=0),
            )

        # Connect model and dataset changes to slider range updates
        for selector in (explore_model, explore_dataset):
            selector.change(
                update_slider_ranges,
                inputs=[explore_model, explore_dataset],
                outputs=[n_turns_filter, len_query_filter, n_tools_filter],
            )

    return display_outputs
747
+
748
+
749
def filter_and_update_display(model, dataset, min_score, max_score, current_index):
    """Filter the dataset by score and render one entry.

    Args:
        model: Model name passed to ``get_chat_and_score_df``.
        dataset: Dataset name passed to ``get_chat_and_score_df``.
        min_score: Inclusive lower score bound.
        max_score: Inclusive upper score bound.
        current_index: Position within the filtered rows; clamped into range.

    Returns:
        A 4-tuple of HTML strings: (chat, metrics, tool info, index label).
        Never raises; failures are rendered as an error panel instead.
    """
    # Local import: exception text is escaped before being embedded in HTML.
    from html import escape

    try:
        df_chat = get_chat_and_score_df(model, dataset)
        df_chat = df_chat[
            (df_chat["score"] >= min_score) & (df_chat["score"] <= max_score)
        ]

        if df_chat.empty:
            return (
                '<div style="padding: 1.5rem; color: var(--text-muted); text-align: center; font-style: italic; background-color: var(--surface-color-alt); border-radius: 8px; border: 1px dashed var(--border-color);">No data available for selected filters</div>',
                '<div style="padding: 1.5rem; color: var(--text-muted); text-align: center; font-style: italic;">No metrics available</div>',
                '<div style="padding: 1.5rem; color: var(--text-muted); text-align: center; font-style: italic;">No tool information available</div>',
                '<div style="font-weight: 500; color: var(--text-muted);">0/0</div>',
            )

        # Clamp on both sides; a negative index would otherwise wrap around.
        max_index = len(df_chat) - 1
        current_index = max(0, min(current_index, max_index))

        # Render with the formatters this module imports (the same calls
        # filter_and_display makes); update_chat_display is not in scope here.
        row = df_chat.iloc[current_index]
        chat_html = format_chat_display(row)
        metrics_html = format_metrics_display(row)
        tool_html = format_tool_info(row["tools_langchain"])

        index_display = f"""
        <div style="
            display: flex;
            align-items: center;
            justify-content: center;
            font-weight: 500;
            color: var(--primary-text);
            background-color: var(--surface-color-alt);
            padding: 0.5rem 1rem;
            border-radius: 20px;
            font-size: 0.9rem;
            width: fit-content;
            margin: 0 auto;">
            <span style="margin-right: 0.25rem;">📄</span>{current_index + 1}/{len(df_chat)}
        </div>
        """
        return chat_html, metrics_html, tool_html, index_display

    except Exception as e:  # UI boundary: show the failure, don't crash the app
        error_html = f"""
        <div style="
            padding: 1.5rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 8px;
            display: flex;
            align-items: flex-start;">
            <div style="
                flex-shrink: 0;
                margin-right: 1rem;
                font-size: 1.5rem;">⚠️</div>
            <div>
                <div style="
                    font-weight: 600;
                    margin-bottom: 0.5rem;">Error Occurred</div>
                <div style="
                    font-family: monospace;
                    background-color: var(--surface-color-alt);
                    padding: 1rem;
                    border-radius: 4px;
                    white-space: pre-wrap;
                    font-size: 0.9rem;">
                    {escape(str(e))}
                </div>
            </div>
        </div>
        """
        return (
            error_html,
            '<div style="padding: 1.5rem; color: var(--text-muted); text-align: center; font-style: italic;">No metrics available</div>',
            '<div style="padding: 1.5rem; color: var(--text-muted); text-align: center; font-style: italic;">No tool information available</div>',
            '<div style="font-weight: 500; color: var(--text-muted);">0/0</div>',
        )
tabs/leaderboard.py CHANGED
@@ -186,6 +186,14 @@ def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
186
  plot1 = gr.Plot()
187
  plot2 = gr.Plot()
188
 
 
 
 
 
 
 
 
 
189
  gr.HTML(METHODOLOGY)
190
 
191
  for input_comp in [model_type, category, sort_by]:
 
186
  plot1 = gr.Plot()
187
  plot2 = gr.Plot()
188
 
189
+ gr.HTML(
190
+ """<div class="note-box">
191
+ <p style="margin: 0; font-size: 1em;">
192
+ Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation.
193
+ </p>
194
+ </div>"""
195
+ )
196
+
197
  gr.HTML(METHODOLOGY)
198
 
199
  for input_comp in [model_type, category, sort_by]:
visualization.py CHANGED
@@ -199,7 +199,7 @@ def get_performance_cost_chart(df, category_name="Overall"):
199
 
200
  ax.set_xscale("log")
201
  ax.set_xlim(0.08, 40)
202
- ax.set_ylim(60, 95)
203
 
204
  ax.set_xlabel(
205
  "I/O Cost per Million Tokens ($)",
@@ -233,7 +233,7 @@ def get_performance_cost_chart(df, category_name="Overall"):
233
  color=colors["text"],
234
  )
235
 
236
- for y1, y2, color in zip([85, 75, 60], [95, 85, 75], colors["performance_bands"]):
237
  ax.axhspan(y1, y2, alpha=0.2, color=color, zorder=1)
238
 
239
  ax.tick_params(axis="both", which="major", labelsize=9, colors=colors["text"])
 
199
 
200
  ax.set_xscale("log")
201
  ax.set_xlim(0.08, 40)
202
+ ax.set_ylim(60, 100)
203
 
204
  ax.set_xlabel(
205
  "I/O Cost per Million Tokens ($)",
 
233
  color=colors["text"],
234
  )
235
 
236
+ for y1, y2, color in zip([85, 75, 60], [100, 85, 75], colors["performance_bands"]):
237
  ax.axhspan(y1, y2, alpha=0.2, color=color, zorder=1)
238
 
239
  ax.tick_params(axis="both", which="major", labelsize=9, colors=colors["text"])