Spaces:

galileo-ai
/

agent-leaderboard

Running on CPU Upgrade

App Files Files Community

pratikbhavsar committed on 13 days ago

Commit

cfec1f3

1 Parent(s): 3aeb75b

added sonnet and improved data explorer

Browse files

Files changed (23) hide show

app.py +17 -7
chat.py +1052 -252
data_loader.py +5 -3
output/claude-3-7-sonnet-20250219/BFCL_v3_irrelevance.parquet +0 -0
output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_base_multi_func_call.parquet +0 -0
output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_base_single_func_call.parquet +0 -0
output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_composite.parquet +0 -0
output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_long_context.parquet +0 -0
output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_miss_func.parquet +0 -0
output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_miss_param.parquet +0 -0
output/claude-3-7-sonnet-20250219/tau_long_context.parquet +0 -0
output/claude-3-7-sonnet-20250219/toolace_single_func_call_1.parquet +0 -0
output/claude-3-7-sonnet-20250219/toolace_single_func_call_2.parquet +0 -0
output/claude-3-7-sonnet-20250219/xlam_multiple_tool_multiple_call.parquet +0 -0
output/claude-3-7-sonnet-20250219/xlam_multiple_tool_single_call.parquet +0 -0
output/claude-3-7-sonnet-20250219/xlam_single_tool_multiple_call.parquet +0 -0
output/claude-3-7-sonnet-20250219/xlam_single_tool_single_call.parquet +0 -0
output/claude-3-7-sonnet-20250219/xlam_tool_miss.parquet +0 -0
requirements.txt +2 -3
results.csv +2 -1
tabs/data_exploration.py +755 -99
tabs/leaderboard.py +8 -0
visualization.py +2 -2

app.py CHANGED Viewed

@@ -1,3 +1,8 @@
 import gradio as gr
 import promptquality as pq
 from dotenv import load_dotenv
@@ -15,8 +20,7 @@ from data_loader import (
 )
 from tabs.leaderboard import create_leaderboard_tab, filter_leaderboard
 from tabs.model_comparison import create_model_comparison_tab, compare_models
-from tabs.data_exploration import create_exploration_tab
-from chat import filter_and_update_display
 def create_app():
@@ -35,9 +39,7 @@ def create_app():
             mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
-            exp_outputs = create_exploration_tab(
-                df, MODELS, DATASETS, SCORES, HEADER_CONTENT
-            )
         # Initial loads
         app.load(
@@ -55,8 +57,16 @@ def create_app():
         )
         app.load(
-            fn=lambda: filter_and_update_display(MODELS[0], DATASETS[0], 0, 1, 0),
-            outputs=exp_outputs,
         )
     return app

+# Add this at the top of your script
+import warnings
+warnings.filterwarnings("ignore")
 import gradio as gr
 import promptquality as pq
 from dotenv import load_dotenv
 )
 from tabs.leaderboard import create_leaderboard_tab, filter_leaderboard
 from tabs.model_comparison import create_model_comparison_tab, compare_models
+from tabs.data_exploration import create_exploration_tab, filter_and_display
 def create_app():
             mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
+            exp_outputs = create_exploration_tab(df)
         # Initial loads
         app.load(
         )
         app.load(
+            fn=lambda: filter_and_display(
+                MODELS[0],
+                DATASETS[0],
+                min(SCORES),
+                max(SCORES),
+                0,
+                0,
+                0,
+            ),
+            outputs=exp_outputs[:-1],
         )
     return app

chat.py CHANGED Viewed

@@ -1,256 +1,723 @@
-import gradio as gr
-import pandas as pd
 import json
-def get_updated_df(df, df_output):
-    df = df.iloc[: len(df_output)].copy()
-    df["response"] = df_output["response"].tolist()
-    df["rationale"] = df_output["rationale"].tolist()
-    df["explanation"] = df_output["explanation"].tolist()
-    df["score"] = df_output["score"].tolist()
-    cols = [
-        "conversation",
-        "tools_langchain",
-        "n_turns",
-        "len_query",
-        "n_tools",
-        "response",
-        "rationale",
-        "explanation",
-        "score",
-    ]
-    return df[cols]
-def get_chat_and_score_df(model, dataset):
-    df_output = pd.read_parquet(f"output/{model}/{dataset}.parquet")
-    df = pd.read_parquet(f"datasets/{dataset}.parquet")
-    df = get_updated_df(df, df_output)
-    return df
-def format_chat_message(role, content, is_response=False):
-    """Format individual chat messages with alignment based on role."""
-    role_style = role.lower()
-    alignment = "flex-end" if role_style == "user" else "flex-start"
-    max_width = "80%"
-    # Clean up any excessive whitespace while preserving intentional line breaks
-    cleaned_content = "\n".join(line.strip() for line in content.split("\n"))
-    background_color = (
-        "var(--response-bg)" if is_response else f"var(--message-bg-{role_style})"
-    )
     return f"""
     <div style="
-        display: flex;
-        justify-content: {alignment};
-        margin: 0.75rem 0;">
         <div style="
-            max-width: {max_width};
             padding: 1rem;
-            border-radius: 12px;
-            background-color: {background_color};
-            border: 1px solid var(--border-color);
             box-shadow: 0 1px 2px var(--shadow-color);">
             <div style="
-                font-weight: 600;
-                color: var(--primary-text);
                 margin-bottom: 0.5rem;
-                font-size: 0.9rem;
-                text-transform: uppercase;">
-                {role + (" Response" if is_response else "")}
             </div>
             <div style="
-                color: var(--text-color);
-                line-height: 1.6;
-                white-space: pre-wrap;
-                font-family: {is_response and 'monospace' or 'inherit'};
-                font-size: {is_response and '0.9rem' or 'inherit'};">
-                {cleaned_content}
             </div>
         </div>
     </div>
     """
-def format_response(response):
-    """Format the response data, handling both JSON and text."""
     try:
         # Try to parse as JSON
-        response_data = json.loads(response)
-        # Format JSON response nicely
-        formatted_response = json.dumps(response_data, indent=2)
-    except (json.JSONDecodeError, TypeError):
-        # If not JSON, use as is
-        formatted_response = str(response)
-    return formatted_response
-def parse_tool_schema(tool):
-    """Parse tool schema to extract name, description, and parameters properly."""
-    name = tool.get("title", "Unnamed Tool")
-    description = tool.get("description", "No description available")
-    parameters = {}
-    if "properties" in tool:
-        for param_name, param_data in tool["properties"].items():
-            param_desc = param_data.get("description", "No description")
-            param_type = param_data.get("type", "unknown")
-            parameters[param_name] = f"{param_desc} (Type: {param_type})"
-    return name, description, parameters
-def format_tool_info(tools):
-    """Format tool information with improved schema parsing and dark theme support."""
-    if isinstance(tools, str):
-        try:
-            tools = json.loads(tools)
-        except:
-            return '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>'
-    if not tools:
-        return '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>'
-    tool_html = ""
-    for tool in tools:
-        name, description, parameters = parse_tool_schema(tool)
-        tool_html += f"""
         <div style="
-            margin: 1rem 0;
-            padding: 1.5rem;
-            border-radius: 8px;
-            background-color: var(--surface-color);
-            border: 1px solid var(--border-color);">
             <div style="
-                font-weight: 600;
                 color: var(--primary-text);
-                margin-bottom: 0.75rem;
-                font-size: 1.1rem;">
-                {name}
             </div>
             <div style="
-                color: var(--text-color);
-                margin-bottom: 1rem;
-                line-height: 1.5;">
-                {description}
             </div>
             <div style="
-                background-color: var(--surface-color-alt);
-                padding: 1rem;
-                border-radius: 4px;
-                border: 1px solid var(--border-color);">
-                {format_parameters(parameters)}
             </div>
         </div>
         """
-    return f"""
-    <div style="
-        max-height: 600px;
-        overflow-y: auto;
-        padding-right: 0.5rem;">
-        <style>
-            :root[data-theme="light"] {{
-                --surface-color: #f8f9fa;
-                --surface-color-alt: #ffffff;
-                --text-color: #202124;
-                --text-muted: #666666;
-                --primary-text: #1a73e8;
-                --border-color: #e9ecef;
-                --shadow-color: rgba(0,0,0,0.1);
-                --message-bg-user: #E5F6FD;
-                --message-bg-assistant: #F7F7F8;
-                --message-bg-system: #FFF3E0;
-                --score-high: #1a73e8;
-                --score-med: #f4b400;
-                --score-low: #ea4335;
-            }}
-            :root[data-theme="dark"] {{
-                --surface-color: #1e1e1e;
-                --surface-color-alt: #2d2d2d;
-                --text-color: #ffffff;
-                --text-muted: #a0a0a0;
-                --primary-text: #60a5fa;
-                --border-color: #404040;
-                --shadow-color: rgba(0,0,0,0.3);
-                --message-bg-user: #2d3748;
-                --message-bg-assistant: #1a1a1a;
-                --message-bg-system: #2c2516;
-                --response-bg: #2a2f3a;
-                --score-high: #60a5fa;
-                --score-med: #fbbf24;
-                --score-low: #ef4444;
-            }}
-        </style>
-        {tool_html}
-    </div>
-    """
 def format_parameters(parameters):
     if not parameters:
-        return '<div style="color: var(--text-muted);">No parameters</div>'
     params_html = ""
     for name, desc in parameters.items():
         params_html += f"""
-        <div style="margin: 0.75rem 0;">
             <div style="
-                font-weight: 500;
-                color: var(--primary-text);
-                margin-bottom: 0.25rem;">
-                {name}
             </div>
             <div style="
                 color: var(--text-color);
-                line-height: 1.4;
-                font-size: 0.95rem;">
-                {desc}
             </div>
         </div>
         """
-    return params_html
-def format_metrics(score, rationale, explanation):
-    """Format metrics display with improved dark theme support."""
-    score_color = (
-        "var(--score-high)"
-        if score >= 0.7
-        else "var(--score-med)" if score >= 0.4 else "var(--score-low)"
     )
     return f"""
     <div style="
-        padding: 1.5rem;
         background-color: var(--surface-color);
-        border-radius: 8px;
         border: 1px solid var(--border-color);
-        box-shadow: 0 2px 4px var(--shadow-color);">
-        <div style="margin-bottom: 1.5rem;">
-            <h3 style="
-                color: var(--text-color);
-                font-size: 1.1rem;
-                margin-bottom: 0.5rem;
-                font-weight: 600;">TSQ Score</h3>
-            <div style="
-                font-size: 2rem;
-                font-weight: 600;
-                color: {score_color};">
-                {score:.2f}
             </div>
         </div>
-        <div style="margin-bottom: 1.5rem;">
             <h3 style="
                 color: var(--text-color);
                 font-size: 1.1rem;
-                margin-bottom: 0.5rem;
-                font-weight: 600;">Rationale</h3>
             <div style="
                 color: var(--text-color);
-                line-height: 1.5;">
                 {rationale}
             </div>
         </div>
@@ -258,93 +725,426 @@ def format_metrics(score, rationale, explanation):
             <h3 style="
                 color: var(--text-color);
                 font-size: 1.1rem;
-                margin-bottom: 0.5rem;
-                font-weight: 600;">Explanation</h3>
             <div style="
                 color: var(--text-color);
-                line-height: 1.5;">
                 {explanation}
             </div>
         </div>
     </div>
     """
-def update_chat_display(df, index):
-    """Update the chat visualization with improved dark theme support."""
-    if df is None or df.empty or index >= len(df):
-        return (
-            '<div style="padding: 1rem; color: var(--text-muted);">No data available</div>',
-            '<div style="padding: 1rem; color: var(--text-muted);">No metrics available</div>',
-            '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>',
-        )
-    row = df.iloc[index]
-    messages = json.loads(row["conversation"])
-    response = row["response"]
-    formatted_response = format_response(response)
-    # Create list of all messages including the response
-    all_messages = [
-        format_chat_message(msg["role"], msg["content"]) for msg in messages
-    ]
-    all_messages.append(
-        format_chat_message("Assistant", formatted_response, is_response=True)
-    )
-    chat_html = f"""
-    <div style="
-        background-color: var(--surface-color);
-        border-radius: 8px;
-        border: 1px solid var(--border-color);
-        box-shadow: 0 2px 4px var(--shadow-color);
-        padding: 1.5rem;">
-        {"".join(all_messages)}
-    </div>
-    """
-    metrics_html = format_metrics(row["score"], row["rationale"], row["explanation"])
-    tool_html = format_tool_info(row["tools_langchain"])
-    return chat_html, metrics_html, tool_html
-def filter_and_update_display(model, dataset, min_score, max_score, current_index):
-    try:
-        df_chat = get_chat_and_score_df(model, dataset)
-        df_chat = df_chat[
-            (df_chat["score"] >= min_score) & (df_chat["score"] <= max_score)
-        ]
-        if df_chat.empty:
-            return (
-                '<div style="padding: 1rem; color: var(--text-muted);">No data available for selected filters</div>',
-                '<div style="padding: 1rem; color: var(--text-muted);">No metrics available</div>',
-                '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>',
-                "0/0",
-            )
-        max_index = len(df_chat) - 1
-        current_index = min(current_index, max_index)
-        chat_html, metrics_html, tool_html = update_chat_display(df_chat, current_index)
-        index_display = f'<div style="font-weight: 500; color: var(--primary-text);">{current_index + 1}/{len(df_chat)}</div>'
-        return chat_html, metrics_html, tool_html, index_display
     except Exception as e:
-        error_html = f"""
         <div style="
-            padding: 1rem;
             color: var(--score-low);
             background-color: var(--surface-color);
             border: 1px solid var(--score-low);
-            border-radius: 4px;">
-            Error: {str(e)}
         </div>
         """
-        return (
-            error_html,
-            '<div style="padding: 1rem; color: var(--text-muted);">No metrics available</div>',
-            '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>',
-            "0/0",
-        )

import html
import json
def format_user_message(msg):
    """Render a user message as a right-aligned HTML chat bubble.

    Args:
        msg: Message mapping. Its ``"content"`` value may be missing,
            ``None``, a string, a number, or a list of parts. List parts may
            be strings, dicts carrying a ``"text"`` key, or any other value
            (which is stringified); ``None`` parts are skipped.

    Returns:
        An HTML fragment. The message text is HTML-escaped so characters such
        as ``<`` and ``&`` are displayed literally instead of being parsed as
        markup.
    """
    content = msg.get("content", "")
    if content is None:
        content = ""
    elif isinstance(content, list):
        # Multi-part content: collect the textual parts, one per line.
        parts = []
        for item in content:
            if item is None:
                continue
            if isinstance(item, dict) and "text" in item:
                if item["text"] is not None:
                    parts.append(str(item["text"]))
            else:
                parts.append(str(item))
        content = "\n".join(parts).strip()
    # The text lands inside an element body, so quotes need no escaping.
    content = html.escape(str(content), quote=False)
    # User message - align right using text-align instead of flex
    return f"""
    <div style="
        text-align: right;
        margin-bottom: 1.25rem;
        padding: 0 0.5rem;">
        <div style="
            display: inline-block;
            max-width: 85%;
            background-color: var(--message-bg-user);
            padding: 1rem;
            border-radius: 1rem 0 1rem 1rem;
            color: var(--text-color);
            text-align: left;
            box-shadow: 0 1px 2px var(--shadow-color);">
            <div style="
                font-weight: 500;
                margin-bottom: 0.5rem;
                color: var(--primary-text);
                display: flex;
                align-items: center;">
                <span style="margin-right: 0.5rem;">👤</span>User
            </div>
            <div style="white-space: pre-wrap; line-height: 1.5;">
                {content}
            </div>
        </div>
    </div>
    """
def format_tool_call(tool_name, tool_input):
    """Render a single tool call (name plus arguments) as an HTML card.

    Args:
        tool_name: Name of the tool; ``None`` is shown as "Unknown Tool" and
            any other non-string value is stringified.
        tool_input: Arguments of the call; ``None`` is treated as ``{}``.

    Returns:
        An HTML fragment with the tool name and the pretty-printed arguments,
        both HTML-escaped.
    """
    tool_name = "Unknown Tool" if tool_name is None else str(tool_name)
    if tool_input is None:
        tool_input = {}
    try:
        tool_input_json = json.dumps(tool_input, indent=2)
    except (TypeError, ValueError):
        try:
            # default=str stringifies non-serializable values at any nesting
            # depth, not only direct values of the top-level dict.
            tool_input_json = (
                json.dumps(tool_input, indent=2, default=str)
                if isinstance(tool_input, dict)
                else str(tool_input)
            )
        except (TypeError, ValueError):
            # Still not serializable (e.g. unsupported dict keys or a
            # circular reference): fall back to the plain string form.
            tool_input_json = str(tool_input)
    # Both values land inside element bodies, so quotes need no escaping.
    tool_name = html.escape(tool_name, quote=False)
    tool_input_json = html.escape(tool_input_json, quote=False)
    return f"""
    <div style="
        background-color: var(--surface-color-alt);
        padding: 0.75rem;
        border-radius: 0.5rem;
        margin-top: 0.75rem;
        border-left: 3px solid var(--primary-text-light);">
        <div style="
            font-weight: 500;
            margin-bottom: 0.5rem;
            font-size: 0.9rem;
            color: var(--primary-text);">
            <span style="margin-right: 0.5rem;">🔧</span>{tool_name}
        </div>
        <div style="
            font-family: monospace;
            font-size: 0.85rem;
            white-space: pre-wrap;">
            {tool_input_json}
        </div>
    </div>
    """
def extract_assistant_content(msg):
    """Extract text content and tool calls from an assistant message.

    Text comes from ``msg["content"]``, which may be a string, a number, or a
    list of parts (strings, dicts with a ``"text"`` key, dicts with
    ``"type": "tool_use"``, or other values that are stringified).

    Tool calls are taken from ``tool_use`` parts in the content list. When the
    content holds none, the message-level ``"tool_calls"`` list is used
    instead, whether or not a ``"content"`` key is present. Entries of that
    list that are not dicts are skipped.

    Args:
        msg: Assistant message mapping.

    Returns:
        A ``(assistant_text, tool_calls_html)`` tuple: the stripped plain text
        (not HTML-escaped) and the concatenated HTML produced by
        ``format_tool_call`` for every tool call found.
    """
    text_parts = []
    tool_calls_html = ""
    content = msg.get("content")
    if isinstance(content, str):
        text_parts.append(content)
    elif isinstance(content, (int, float)):
        text_parts.append(str(content))
    elif isinstance(content, list):
        for item in content:
            if item is None:
                continue
            if isinstance(item, dict):
                if "text" in item:
                    if item["text"] is not None:
                        text_parts.append(str(item["text"]))
                elif item.get("type") == "tool_use":
                    tool_input = item.get("input")
                    tool_calls_html += format_tool_call(
                        item.get("name", "Unknown Tool"),
                        {} if tool_input is None else tool_input,
                    )
            else:
                text_parts.append(str(item))
    assistant_text = "\n".join(text_parts).strip()

    # Only fall back to the message-level list when the content carried no
    # tool_use parts, so a message holding both forms is not rendered twice.
    if not tool_calls_html and "tool_calls" in msg:
        for tool_call in msg["tool_calls"] or []:
            if not isinstance(tool_call, dict):
                continue
            tool_args = tool_call.get("args")
            tool_calls_html += format_tool_call(
                tool_call.get("name", "Unknown Tool"),
                {} if tool_args is None else tool_args,
            )
        # Placeholder caption when the message has tool calls but no text.
        if not assistant_text and (tool_calls_html or "content" not in msg):
            assistant_text = "The assistant used the following tools:"
    return assistant_text, tool_calls_html
def format_assistant_message(msg):
    """Render an assistant message as a left-aligned HTML chat bubble.

    Args:
        msg: Assistant message mapping, passed to
            ``extract_assistant_content`` to obtain its text and tool calls.

    Returns:
        An HTML fragment containing the HTML-escaped assistant text followed
        by the tool-call HTML returned by ``extract_assistant_content``.
    """
    assistant_text, tool_calls_html = extract_assistant_content(msg)
    # Escape the text so markup-like output (e.g. "<thinking>") is shown
    # literally; tool_calls_html is already HTML and is inserted as-is.
    assistant_text = html.escape(str(assistant_text), quote=False)
    return f"""
    <div style="
        text-align: left;
        margin-bottom: 1.25rem;
        padding: 0 0.5rem;">
        <div style="
            display: inline-block;
            max-width: 85%;
            background-color: var(--message-bg-assistant);
            padding: 1rem;
            border-radius: 0 1rem 1rem 1rem;
            color: var(--text-color);
            text-align: left;
            box-shadow: 0 1px 2px var(--shadow-color);">
            <div style="
                font-weight: 500;
                margin-bottom: 0.5rem;
                color: var(--primary-text);
                display: flex;
                align-items: center;">
                <span style="margin-right: 0.5rem;">🤖</span>Assistant
            </div>
            <div style="white-space: pre-wrap; line-height: 1.5;">
                {assistant_text}
            </div>
            {tool_calls_html}
        </div>
    </div>
    """
def format_system_message(msg):
    """Render a system (or any other non-user, non-assistant) message.

    Args:
        msg: Message mapping. Its ``"content"`` value may be missing,
            ``None``, a string, a number, or a list of parts. List parts may
            be strings, dicts carrying a ``"text"`` key, or any other value
            (which is stringified); ``None`` parts are skipped.

    Returns:
        A centered, italic HTML fragment with the message text HTML-escaped.
    """
    content = msg.get("content", "")
    if content is None:
        content = ""
    elif isinstance(content, list):
        # Multi-part content: collect the textual parts, one per line.
        parts = []
        for item in content:
            if item is None:
                continue
            if isinstance(item, dict) and "text" in item:
                if item["text"] is not None:
                    parts.append(str(item["text"]))
            else:
                parts.append(str(item))
        content = "\n".join(parts).strip()
    # The text lands inside an element body, so quotes need no escaping.
    content = html.escape(str(content), quote=False)
    return f"""
    <div style="
        text-align: center;
        margin-bottom: 1rem;
        padding: 0 0.5rem;">
        <div style="
            display: inline-block;
            max-width: 85%;
            background-color: var(--message-bg-system);
            padding: 0.75rem;
            border-radius: 0.5rem;
            color: var(--text-color);
            text-align: left;
            font-style: italic;
            font-size: 0.9rem;">
            {content}
        </div>
    </div>
    """
def parse_complex_response(response):
    """Parse a model response and extract its text and tool calls.

    Handling by input shape:
      * ``None`` gives ``("", "")``; numbers are stringified.
      * Text that does not start with ``[`` or ``{`` is returned unchanged.
      * JSON that fails to parse is returned with an
        "Error parsing response" note appended.
      * For a JSON list, only the first element is inspected.
      * For a JSON object with ``"content"`` and/or ``"tool_calls"``, the text
        is the string content (or the joined ``type == "text"`` parts of a
        content list) and the tool calls are rendered as one HTML card.
      * Any other JSON (non-object, or an object with neither key) is returned
        verbatim as text so that the response is never shown as blank.

    Args:
        response: Raw response; non-string values are converted with ``str``.

    Returns:
        A ``(text_content, tool_calls_html)`` tuple. ``text_content`` is plain
        text (not HTML-escaped); ``tool_calls_html`` is HTML whose embedded
        JSON is escaped, or ``""`` when there are no tool calls.
    """
    if response is None:
        return "", ""
    if isinstance(response, (int, float)):
        return str(response), ""
    if not isinstance(response, str):
        response = str(response)
    # Only attempt JSON parsing for text that looks like an array or object.
    if not response.strip().startswith(("[", "{")):
        return response, ""
    try:
        response_obj = json.loads(response)
    except ValueError as e:
        # If parsing fails, return the original response with error info
        return f"{response}\n\nError parsing response: {str(e)}", ""

    if isinstance(response_obj, list) and response_obj:
        response_obj = response_obj[0]  # Take first item in array
    # Membership tests on a str would be substring checks, so require a dict.
    if not isinstance(response_obj, dict) or not (
        "content" in response_obj or "tool_calls" in response_obj
    ):
        return response, ""

    content = response_obj.get("content")
    if isinstance(content, str):
        text_content = content
    elif isinstance(content, (int, float)):
        text_content = str(content)
    elif isinstance(content, list):
        # Extract only text content from items with type="text"
        text_content = "\n".join(
            str(item["text"])
            for item in content
            if isinstance(item, dict)
            and item.get("type") == "text"
            and item.get("text") is not None
        )
    else:
        text_content = ""

    tool_calls_html = ""
    tool_calls = response_obj.get("tool_calls")
    if tool_calls:
        try:
            tool_calls_json = html.escape(
                json.dumps(tool_calls, indent=2), quote=False
            )
        except (TypeError, ValueError):
            # Fallback if JSON serialization fails
            tool_calls_html = (
                "<div>Tool calls present but could not be formatted.</div>"
            )
        else:
            tool_calls_html = f"""
                    <div style="
                        background-color: var(--surface-color-alt);
                        padding: 0.75rem;
                        border-radius: 0.5rem;
                        margin-top: 0.75rem;
                        border-left: 3px solid var(--primary-text-light);">
                        <div style="
                            font-weight: 500;
                            margin-bottom: 0.5rem;
                            font-size: 0.9rem;
                            color: var(--primary-text);">
                            <span style="margin-right: 0.5rem;">🔧</span>Tool Calls
                        </div>
                        <div style="
                            font-family: monospace;
                            font-size: 0.85rem;
                            white-space: pre-wrap;">
                            {tool_calls_json}
                        </div>
                    </div>
                    """
    return text_content.strip(), tool_calls_html
def format_final_response(response):
    """Render the model's final response as a highlighted HTML chat bubble.

    Args:
        response: Raw response value, passed to ``parse_complex_response`` to
            obtain its display text and tool-call HTML.

    Returns:
        An HTML fragment containing the HTML-escaped response text followed by
        the tool-call HTML returned by ``parse_complex_response``.
    """
    text_content, tool_calls_html = parse_complex_response(response)
    # Escape the text so markup-like output is shown literally;
    # tool_calls_html is already HTML and is inserted as-is.
    text_content = html.escape(str(text_content), quote=False)
    return f"""
    <div style="
        text-align: left;
        margin-bottom: 1.25rem;
        margin-top: 1.5rem;
        padding: 0 0.5rem;">
        <div style="
            display: inline-block;
            max-width: 85%;
            background-color: var(--response-bg);
            padding: 1rem;
            border-radius: 0 1rem 1rem 1rem;
            color: var(--text-color);
            text-align: left;
            box-shadow: 0 1px 2px var(--shadow-color);
            border-left: 4px solid var(--primary-text);">
            <div style="
                font-weight: 500;
                margin-bottom: 0.5rem;
                color: var(--primary-text);
                display: flex;
                align-items: center;">
                <span style="margin-right: 0.5rem;">🤖</span>Final Response
            </div>
            <div style="
                white-space: pre-wrap;
                line-height: 1.5;
                font-family: var(--font-sans);">
                {text_content}
            </div>
            {tool_calls_html}
        </div>
    </div>
    """
def update_chat_display(existing_display, new_message):
    """Update an existing chat display with a new message.

    The message is formatted according to its ``"role"`` ("user",
    "assistant"/"ai", anything else is rendered as a system message; a missing
    or ``None`` role counts as "unknown"). If ``existing_display`` contains
    the insertion marker exactly once, the new message is inserted right
    before it; otherwise the message is appended to the end.

    Args:
        existing_display: HTML string of the current chat display.
        new_message: Message mapping to add.

    Returns:
        The updated HTML string. If formatting fails, the existing display is
        returned with an error box (HTML-escaped exception text) appended.
    """
    try:
        role = str(new_message.get("role") or "unknown").lower()
        if role == "user":
            message_html = format_user_message(new_message)
        elif role in ("assistant", "ai"):
            message_html = format_assistant_message(new_message)
        else:
            message_html = format_system_message(new_message)
        # Find the position to insert the new message (before the Final Response section)
        insert_marker = '<div style="padding-top: 0.5rem;margin-top: 1rem;margin-bottom: 1rem;border-top: 1px solid var(--border-color-light);'
        if existing_display.count(insert_marker) == 1:
            before, _, after = existing_display.partition(insert_marker)
            return before + message_html + insert_marker + after
        # If we can't find a unique insertion point, append to the end
        return existing_display + message_html
    except Exception as e:
        # Display boundary: report the failure inline rather than raising.
        error_text = html.escape(str(e), quote=False)
        return (
            existing_display
            + f"""
        <div style="
            padding: 1rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 10px;
            margin-top: 1rem;">
            <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Updating Chat</div>
            <div style="font-family: monospace; white-space: pre-wrap;">{error_text}</div>
        </div>
        """
        )
def format_chat_display(row):
    """Render a row's conversation and final response as a single HTML panel.

    Args:
        row: Record indexed as ``row["conversation"]`` and ``row["response"]``.
            ``conversation`` is parsed with ``json.loads`` and iterated as a
            sequence of message dicts; each message's ``role`` key selects
            the formatter (user / assistant-or-ai / anything else).

    Returns:
        str: HTML for the conversation panel. If anything raises while
        building it, an error card is returned instead; the error text and
        the raw conversation are HTML-escaped so they cannot break the page.
    """
    import html  # function-scope import keeps this block self-contained

    try:
        # Parse the conversation JSON and render every message in order.
        rendered = []
        for msg in json.loads(row["conversation"]):
            role = msg.get("role", "unknown").lower()
            if role == "user":
                rendered.append(format_user_message(msg))
            elif role in ("assistant", "ai"):
                rendered.append(format_assistant_message(msg))
            else:
                # System or other message types
                rendered.append(format_system_message(msg))
        messages_html = "".join(rendered)

        # The assistant's final response is rendered after the message history.
        response_html = format_final_response(row["response"])

        return f"""
        <div style="
            padding: 1.5rem;
            background-color: var(--surface-color);
            border-radius: 10px;
            border: 1px solid var(--border-color);
            box-shadow: 0 2px 6px var(--shadow-color);
            height: 100%;
            overflow-y: auto;
            max-height: 600px;
            font-family: var(--font-sans);">
            <div style="
                padding-bottom: 1rem;
                margin-bottom: 1.5rem;
                border-bottom: 1px solid var(--border-color-light);
                display: flex;
                align-items: center;">
                <div style="
                    font-weight: 600;
                    font-size: 1.1rem;
                    color: var(--primary-text);">
                    <span style="margin-right: 0.5rem;">💬</span>Conversation
                </div>
            </div>
            {messages_html}
            {response_html}
        </div>
        """
    except Exception as e:
        # The failure may be that "conversation" itself is missing or
        # unreadable, so reading it again must not raise out of the handler.
        try:
            original = str(row["conversation"])
        except Exception:
            original = "(conversation unavailable)"
        return f"""
        <div style="
            padding: 1.5rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 10px;">
            <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Formatting Chat</div>
            <div style="font-family: monospace; white-space: pre-wrap;">{html.escape(str(e))}</div>
            <div style="margin-top: 1rem; font-family: monospace; font-size: 0.8rem;">
                Original conversation: {html.escape(original)}
            </div>
        </div>
        """
def parse_tool_schema(tool):
    """Extract ``(name, description, parameters)`` from a tool schema.

    Two schema shapes are handled:

    * a dict with a ``"function"`` key, whose value holds ``name``,
      ``description`` and ``parameters`` (``properties`` / ``required``);
    * otherwise a dict with ``title``, ``description``, ``properties`` and
      ``required`` at the top level.

    Args:
        tool: The schema dict, or a list whose first element is the schema.

    Returns:
        tuple: ``(name, description, parameters)`` where ``parameters`` maps
        each parameter name to a one-line description string. Names listed
        as required are prefixed with ``"[REQUIRED] "``. An empty list
        yields the placeholder name/description and no parameters.
    """
    # Handle schema wrapped in a list; an empty list has nothing to parse.
    if isinstance(tool, list):
        if not tool:
            return "Unnamed Tool", "No description available", {}
        tool = tool[0]

    is_function_schema = "function" in tool
    if is_function_schema:
        spec = tool["function"]
        name = spec.get("name", "Unnamed Tool")
        # "parameters" may be absent or explicitly null.
        schema = spec.get("parameters") or {}
    else:
        spec = tool
        name = tool.get("title", "Unnamed Tool")
        # Properties and required names sit at the top level in this shape.
        schema = tool
    description = spec.get("description", "No description available")

    parameters = {}
    for param_name, param_data in (schema.get("properties") or {}).items():
        param_desc = param_data.get("description", "No description")
        param_type = param_data.get("type", "unknown")
        if is_function_schema:
            # Function schemas report the default value...
            detail = f"Default: {param_data.get('default', 'None')}"
        else:
            # ...while the top-level shape reports the parameter title.
            detail = f"Title: {param_data.get('title', param_name)}"
        parameters[param_name] = f"{param_desc} (Type: {param_type}, {detail})"

    # Tag required parameters once each, ignoring names with no property entry.
    for param_name in dict.fromkeys(schema.get("required") or []):
        if param_name in parameters:
            parameters[param_name] = f"[REQUIRED] {parameters[param_name]}"

    return name, description, parameters
 def format_parameters(parameters):
     if not parameters:
+        return '<div style="color: var(--text-muted); font-style: italic;">No parameters</div>'
     params_html = ""
     for name, desc in parameters.items():
+        is_required = "[REQUIRED]" in desc
+        param_style = "required" if is_required else "optional"
+        # Clean up the description to remove the REQUIRED marker but keep the info
+        cleaned_desc = desc.replace("[REQUIRED] ", "") if is_required else desc
         params_html += f"""
+        <div style="
+            margin-bottom: 1.2rem;
+            padding-bottom: 1.2rem;
+            border-bottom: 1px solid var(--border-color);
+            last-child: border-bottom: none;">
             <div style="
+                display: flex;
+                align-items: center;
+                justify-content: space-between;
+                margin-bottom: 0.5rem;">
+                <div style="
+                    font-weight: 600;
+                    color: var(--primary-text);
+                    font-size: 1.05rem;
+                    display: flex;
+                    align-items: center;">
+                    {name}
+                </div>
+                <div style="
+                    font-size: 0.8rem;
+                    padding: 0.2rem 0.6rem;
+                    border-radius: 12px;
+                    background-color: {f"rgba(234, 67, 53, 0.1)" if is_required else "rgba(160, 160, 160, 0.1)"};
+                    color: var(--{param_style}-color);
+                    font-weight: 500;">
+                    {f"Required" if is_required else "Optional"}
+                </div>
             </div>
             <div style="
                 color: var(--text-color);
+                line-height: 1.5;
+                font-size: 0.95rem;
+                opacity: 0.9;">
+                {cleaned_desc}
             </div>
         </div>
         """
+    # Remove the border-bottom from the last parameter
+    params_html = params_html.replace("last-child: border-bottom: none;", "")
+    return (
+        params_html
+        + """
+    <style>
+        div:last-child {
+            border-bottom: none !important;
+            margin-bottom: 0 !important;
+            padding-bottom: 0 !important;
+        }
+    </style>
+    """
     )
def format_metrics(score, rationale, explanation):
    """Build the HTML card showing a TSQ score with its rationale and explanation.

    The score is classified as High (>= 0.7), Medium (>= 0.4) or Low, which
    selects the colour variable, emoji and label shown next to the number.

    Args:
        score: Numeric score; formatted with two decimals.
        rationale: Text shown in the "Rationale" section.
        explanation: Text shown in the "Explanation" section.

    Returns:
        str: HTML fragment. ``rationale`` and ``explanation`` are
        HTML-escaped so that characters such as ``<`` display literally.
    """
    import html  # function-scope import keeps this block self-contained

    # Determine score color and add emoji indicator
    if score >= 0.7:
        score_color, score_emoji, score_text = "var(--score-high)", "🟢", "High"
    elif score >= 0.4:
        score_color, score_emoji, score_text = "var(--score-med)", "🟠", "Medium"
    else:
        score_color, score_emoji, score_text = "var(--score-low)", "🔴", "Low"

    rationale_text = html.escape(str(rationale))
    explanation_text = html.escape(str(explanation))

    # The Explanation section gets its own wrapper <div>, mirroring the
    # Rationale section, so every opening <div> has a matching </div>.
    return f"""
    <div style="
        padding: 1.75rem;
        background-color: var(--surface-color);
        border-radius: 10px;
        border: 1px solid var(--border-color);
        box-shadow: 0 3px 8px var(--shadow-color);">
        <div style="
            display: flex;
            align-items: center;
            margin-bottom: 1.75rem;
            padding-bottom: 1.5rem;
            border-bottom: 1px solid var(--border-color-light);">
            <div style="flex: 1;">
                <h3 style="
                    color: var(--text-color);
                    font-size: 1.2rem;
                    margin-bottom: 0.25rem;
                    font-weight: 600;">TSQ Score</h3>
                <div style="
                    display: flex;
                    align-items: baseline;">
                    <div style="
                        font-size: 2.5rem;
                        font-weight: 700;
                        color: {score_color};">
                        {score:.2f}
                    </div>
                    <div style="
                        margin-left: 0.75rem;
                        font-size: 1rem;
                        color: {score_color};
                        font-weight: 500;
                        display: flex;
                        align-items: center;">
                        <span style="margin-right: 0.5rem;">{score_emoji}</span>{score_text}
                    </div>
                </div>
            </div>
        </div>
        <div style="margin-bottom: 1.75rem;">
            <h3 style="
                color: var(--text-color);
                font-size: 1.1rem;
                margin-bottom: 0.75rem;
                font-weight: 600;
                display: flex;
                align-items: center;">
                <span style="
                    display: inline-block;
                    width: 18px;
                    height: 18px;
                    background-color: var(--primary-text-light);
                    border-radius: 4px;
                    margin-right: 0.5rem;"></span>
                Rationale
            </h3>
            <div style="
                color: var(--text-color);
                line-height: 1.6;
                padding-left: 1.5rem;
                border-left: 3px solid var(--primary-text-light);
                font-size: 0.95rem;">
                {rationale_text}
            </div>
        </div>
        <div>
            <h3 style="
                color: var(--text-color);
                font-size: 1.1rem;
                margin-bottom: 0.75rem;
                font-weight: 600;
                display: flex;
                align-items: center;">
                <span style="
                    display: inline-block;
                    width: 18px;
                    height: 18px;
                    background-color: var(--primary-text-light);
                    border-radius: 4px;
                    margin-right: 0.5rem;"></span>
                Explanation
            </h3>
            <div style="
                color: var(--text-color);
                line-height: 1.6;
                padding-left: 1.5rem;
                border-left: 3px solid var(--primary-text-light);
                font-size: 0.95rem;">
                {explanation_text}
            </div>
        </div>
    </div>
    """
def format_metrics_display(row):
    """Render the evaluation-metrics panel (score, rationale, explanation) for a row.

    Args:
        row: Record indexed as ``row["score"]``, ``row["rationale"]`` and
            ``row["explanation"]``. The score is classified as High
            (>= 0.7), Medium (>= 0.4) or Low and shown with two decimals.

    Returns:
        str: HTML for the metrics panel, or an error card if reading or
        formatting the row raises. Rationale, explanation and error text are
        HTML-escaped so they display literally instead of being parsed as
        markup.
    """
    import html  # function-scope import keeps this block self-contained

    try:
        score = row["score"]
        rationale = html.escape(str(row["rationale"]))
        explanation = html.escape(str(row["explanation"]))

        # Determine score color and add emoji indicator
        if score >= 0.7:
            score_color, score_emoji, score_text = "var(--score-high)", "🟢", "High"
        elif score >= 0.4:
            score_color, score_emoji, score_text = "var(--score-med)", "🟠", "Medium"
        else:
            score_color, score_emoji, score_text = "var(--score-low)", "🔴", "Low"

        return f"""
        <div style="
            padding: 1.5rem;
            background-color: var(--surface-color);
            border-radius: 10px;
            border: 1px solid var(--border-color);
            box-shadow: 0 2px 6px var(--shadow-color);
            height: 100%;
            overflow-y: auto;
            max-height: 600px;">
            <div style="
                padding-bottom: 1rem;
                margin-bottom: 1.5rem;
                border-bottom: 1px solid var(--border-color-light);
                display: flex;
                align-items: center;">
                <div style="
                    font-weight: 600;
                    font-size: 1.1rem;
                    color: var(--primary-text);">
                    <span style="margin-right: 0.5rem;">📊</span>Evaluation Metrics
                </div>
            </div>
            <div style="
                margin-bottom: 1.5rem;
                padding-bottom: 1.5rem;
                border-bottom: 1px solid var(--border-color-light);">
                <div style="
                    display: flex;
                    align-items: center;
                    justify-content: space-between;">
                    <div>
                        <div style="
                            font-weight: 600;
                            margin-bottom: 0.25rem;
                            color: var(--text-color);">
                            TSQ Score
                        </div>
                        <div style="
                            font-size: 2.5rem;
                            font-weight: 700;
                            color: {score_color};
                            display: flex;
                            align-items: center;">
                            {score:.2f}
                            <div style="
                                margin-left: 0.75rem;
                                font-size: 1rem;
                                display: flex;
                                align-items: center;">
                                {score_emoji} <span style="margin-left: 0.25rem;">{score_text}</span>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
            <div style="margin-bottom: 1.5rem;">
                <div style="
                    font-weight: 600;
                    margin-bottom: 0.75rem;
                    color: var(--text-color);
                    display: flex;
                    align-items: center;">
                    <span style="
                        display: inline-block;
                        width: 12px;
                        height: 12px;
                        background-color: var(--primary-text-light);
                        border-radius: 2px;
                        margin-right: 0.5rem;"></span>
                    Rationale
                </div>
                <div style="
                    background-color: var(--surface-color-alt);
                    padding: 1rem;
                    border-radius: 8px;
                    border-left: 3px solid var(--primary-text-light);
                    line-height: 1.5;
                    color: var(--text-color);
                    font-size: 0.95rem;">
                    {rationale}
                </div>
            </div>
            <div>
                <div style="
                    font-weight: 600;
                    margin-bottom: 0.75rem;
                    color: var(--text-color);
                    display: flex;
                    align-items: center;">
                    <span style="
                        display: inline-block;
                        width: 12px;
                        height: 12px;
                        background-color: var(--primary-text-light);
                        border-radius: 2px;
                        margin-right: 0.5rem;"></span>
                    Explanation
                </div>
                <div style="
                    background-color: var(--surface-color-alt);
                    padding: 1rem;
                    border-radius: 8px;
                    border-left: 3px solid var(--primary-text-light);
                    line-height: 1.5;
                    color: var(--text-color);
                    font-size: 0.95rem;">
                    {explanation}
                </div>
            </div>
        </div>
        """
    except Exception as e:
        # Any failure (missing key, non-numeric score, ...) becomes an error card.
        return f"""
        <div style="
            padding: 1.5rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 10px;">
            <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Formatting Metrics</div>
            <div style="font-family: monospace; white-space: pre-wrap;">{html.escape(str(e))}</div>
        </div>
        """
def _tool_info_empty_state():
    """Return the placeholder card shown when there are no tools to list."""
    return """
            <div style="
                padding: 1.5rem;
                text-align: center;
                color: var(--text-muted);
                background-color: var(--surface-color);
                border-radius: 10px;
                border: 1px solid var(--border-color);
                box-shadow: 0 2px 6px var(--shadow-color);">
                <div style="font-size: 1.5rem; margin-bottom: 0.75rem;">🔍</div>
                <div style="font-weight: 500; margin-bottom: 0.5rem;">No Tool Information</div>
                <div style="font-size: 0.9rem; font-style: italic;">This conversation doesn't use any tools</div>
            </div>
            """


def _tool_info_parameters(tool):
    """Return ``({param_name: info}, required_names)`` for either schema shape."""
    if "function" in tool:
        # Function schema format: properties/required live under "parameters".
        schema = tool["function"].get("parameters") or {}
    else:
        # Original schema format: properties/required live at the top level.
        schema = tool
    parameters = {}
    for param_name, param_data in (schema.get("properties") or {}).items():
        info = {
            "description": param_data.get("description", "No description"),
            "type": param_data.get("type", "unknown"),
        }
        # Record a default only when the schema declares one, so the
        # "Default:" line is omitted for parameters that have none.
        if "default" in param_data:
            info["default"] = param_data["default"]
        parameters[param_name] = info
    return parameters, schema.get("required") or []


def _tool_info_parameter_html(param_name, info, is_required):
    """Render one parameter row: name, badge, description, type and optional default."""
    import html

    badge_bg = "rgba(234, 67, 53, 0.1)" if is_required else "rgba(160, 160, 160, 0.1)"
    badge_fg = "var(--score-low)" if is_required else "var(--text-muted)"
    badge_label = "Required" if is_required else "Optional"
    default_html = ""
    if "default" in info:
        default_text = html.escape(str(info["default"]))
        default_html = f'<div><span style="font-weight: 500;">Default:</span> {default_text}</div>'
    return f"""
                    <div style="
                        margin-bottom: 1rem;
                        padding-bottom: 1rem;
                        border-bottom: 1px solid var(--border-color-light);">
                        <div style="
                            display: flex;
                            align-items: center;
                            justify-content: space-between;
                            margin-bottom: 0.5rem;">
                            <div style="
                                font-weight: 600;
                                color: var(--primary-text);
                                font-size: 0.95rem;">
                                {html.escape(str(param_name))}
                            </div>
                            <div style="
                                font-size: 0.75rem;
                                padding: 0.15rem 0.5rem;
                                border-radius: 12px;
                                background-color: {badge_bg};
                                color: {badge_fg};
                                font-weight: 500;">
                                {badge_label}
                            </div>
                        </div>
                        <div style="
                            color: var(--text-muted);
                            line-height: 1.5;
                            font-size: 0.85rem;
                            margin-bottom: 0.25rem;">
                            {html.escape(str(info["description"]))}
                        </div>
                        <div style="
                            display: flex;
                            font-size: 0.8rem;
                            color: var(--text-muted);">
                            <div style="margin-right: 1rem;">
                                <span style="font-weight: 500;">Type:</span> {html.escape(str(info["type"]))}
                            </div>
                            {default_html}
                        </div>
                    </div>
                    """


def format_tool_info(tools_data):
    """Render the "Available Tools" panel for a conversation.

    Args:
        tools_data: Either a JSON string or an already-parsed value. The
            parsed value is expected to be a list of tool schema dicts; a
            single schema dict is also accepted and treated as a one-item
            list. Falsy input, the string ``"[]"``, or a string that is not
            valid JSON produce the "No Tool Information" placeholder.

    Returns:
        str: HTML for the tools panel, the placeholder card when there are
        no tools, or an error card if formatting raises. Text taken from the
        schemas (names, descriptions, types, defaults) is HTML-escaped.
    """
    import html  # function-scope import keeps this block self-contained

    try:
        if not tools_data or tools_data == "[]":
            return _tool_info_empty_state()

        if isinstance(tools_data, str):
            try:
                tools = json.loads(tools_data)
            except ValueError:
                # json.JSONDecodeError is a ValueError: unparseable -> no tools.
                tools = []
        else:
            tools = tools_data
        if isinstance(tools, dict):
            # A lone schema dict would otherwise be iterated key by key.
            tools = [tools]
        if not tools:
            return _tool_info_empty_state()

        # Format each tool
        tool_cards = []
        for tool in tools:
            name = tool.get("title", tool.get("name", "Unnamed Tool"))
            description = tool.get("description", "No description available")
            if "function" in tool:
                function_data = tool["function"]
                name = function_data.get("name", name)
                description = function_data.get("description", description)

            parameters, required_params = _tool_info_parameters(tool)
            if parameters:
                params_html = "".join(
                    _tool_info_parameter_html(
                        param_name, info, param_name in required_params
                    )
                    for param_name, info in parameters.items()
                )
            else:
                params_html = """
                <div style="
                    color: var(--text-muted);
                    font-style: italic;
                    padding: 0.75rem;
                    text-align: center;
                    font-size: 0.9rem;">
                    No parameters
                </div>
                """

            tool_cards.append(f"""
            <div style="
                margin-bottom: 1.5rem;
                padding: 1.5rem;
                border-radius: 8px;
                background-color: var(--surface-color-alt);
                border: 1px solid var(--border-color);
                box-shadow: 0 1px 3px var(--shadow-color);">
                <div style="
                    font-weight: 600;
                    color: var(--primary-text);
                    margin-bottom: 0.75rem;
                    font-size: 1.05rem;
                    display: flex;
                    align-items: center;">
                    <span style="margin-right: 8px;">⚙️</span> {html.escape(str(name))}
                </div>
                <div style="
                    color: var(--text-color);
                    margin-bottom: 1.25rem;
                    line-height: 1.5;
                    font-size: 0.95rem;
                    padding-left: 0.5rem;
                    border-left: 3px solid var(--primary-text-light);">
                    {html.escape(str(description))}
                </div>
                <div style="
                    font-weight: 600;
                    color: var(--text-color);
                    margin-bottom: 0.75rem;
                    font-size: 0.9rem;">
                    Parameters:
                </div>
                <div class="tool-params">
                    {params_html}
                </div>
            </div>
            """)
        tool_items = "".join(tool_cards)

        # Emitted once and kept OUTSIDE every .tool-params container: a
        # <style> element placed inside would itself be the last child, so
        # "div:last-child" would never match the final parameter row.
        last_row_css = """
            <style>
                .tool-params > div:last-child {
                    border-bottom: none !important;
                    margin-bottom: 0 !important;
                    padding-bottom: 0 !important;
                }
            </style>
            """

        return f"""
        <div style="
            padding: 1.5rem;
            background-color: var(--surface-color);
            border-radius: 10px;
            border: 1px solid var(--border-color);
            box-shadow: 0 2px 6px var(--shadow-color);
            height: 100%;
            overflow-y: auto;
            max-height: 600px;">
            <div style="
                padding-bottom: 1rem;
                margin-bottom: 1.5rem;
                border-bottom: 1px solid var(--border-color-light);
                display: flex;
                align-items: center;">
                <div style="
                    font-weight: 600;
                    font-size: 1.1rem;
                    color: var(--primary-text);">
                    <span style="margin-right: 0.5rem;">🛠️</span>Available Tools
                </div>
            </div>
            {tool_items}
            {last_row_css}
        </div>
        """
    except Exception as e:
        return f"""
        <div style="
            padding: 1.5rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 10px;">
            <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Formatting Tool Info</div>
            <div style="font-family: monospace; white-space: pre-wrap;">{html.escape(str(e))}</div>
        </div>
        """

data_loader.py CHANGED Viewed

@@ -32,6 +32,8 @@ def load_data():
     )
     return df
 # categories.py
 CATEGORIES = {
@@ -602,9 +604,9 @@ HEADER_CONTENT = (
 CARDS = """        <div class="metrics-grid">
             <div class="metric-card">
-                <div class="metric-number metric-blue">17</div>
                 <div class="metric-label">Total Models</div>
-                <div class="metric-detail primary">12 Private</div>
                 <div class="metric-detail primary">5 Open Source</div>
             </div>
@@ -1001,7 +1003,7 @@ METHODOLOGY = """
             <tbody>
                 <tr>
                     <td>Performance Champion</td>
-                    <td>Gemini-2.0-flash dominates with 0.938 score at a very affordable cost, excelling in both complex tasks and safety features.</td>
                 </tr>
                 <tr>
                     <td>Price-Performance Paradox</td>

     )
     return df
+df = load_data()
+MODELS = [x.strip() for x in df["Model"].unique().tolist()]
 # categories.py
 CATEGORIES = {
 CARDS = """        <div class="metrics-grid">
             <div class="metric-card">
+                <div class="metric-number metric-blue">18</div>
                 <div class="metric-label">Total Models</div>
+                <div class="metric-detail primary">13 Private</div>
                 <div class="metric-detail primary">5 Open Source</div>
             </div>
             <tbody>
                 <tr>
                     <td>Performance Champion</td>
+                    <td>Claude 3.7 Sonnet comes at the top with 0.953 but Gemini-2.0-flash dominates with 0.938 score at a very affordable cost, excelling in both complex tasks and safety features.</td>
                 </tr>
                 <tr>
                     <td>Price-Performance Paradox</td>

output/claude-3-7-sonnet-20250219/BFCL_v3_irrelevance.parquet ADDED Viewed

Binary file (52.7 kB). View file

output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_base_multi_func_call.parquet ADDED Viewed

Binary file (26.5 kB). View file

output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_base_single_func_call.parquet ADDED Viewed

Binary file (26.6 kB). View file

output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_composite.parquet ADDED Viewed

Binary file (51.9 kB). View file

output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_long_context.parquet ADDED Viewed

Binary file (43.2 kB). View file

output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_miss_func.parquet ADDED Viewed

Binary file (51.1 kB). View file

output/claude-3-7-sonnet-20250219/BFCL_v3_multi_turn_miss_param.parquet ADDED Viewed

Binary file (50.5 kB). View file

output/claude-3-7-sonnet-20250219/tau_long_context.parquet ADDED Viewed

Binary file (46.6 kB). View file

output/claude-3-7-sonnet-20250219/toolace_single_func_call_1.parquet ADDED Viewed

Binary file (20.8 kB). View file

output/claude-3-7-sonnet-20250219/toolace_single_func_call_2.parquet ADDED Viewed

Binary file (17.1 kB). View file

output/claude-3-7-sonnet-20250219/xlam_multiple_tool_multiple_call.parquet ADDED Viewed

Binary file (107 kB). View file

output/claude-3-7-sonnet-20250219/xlam_multiple_tool_single_call.parquet ADDED Viewed

Binary file (51.2 kB). View file

output/claude-3-7-sonnet-20250219/xlam_single_tool_multiple_call.parquet ADDED Viewed

Binary file (34.2 kB). View file

output/claude-3-7-sonnet-20250219/xlam_single_tool_single_call.parquet ADDED Viewed

Binary file (56.5 kB). View file

output/claude-3-7-sonnet-20250219/xlam_tool_miss.parquet ADDED Viewed

Binary file (66.7 kB). View file

requirements.txt CHANGED Viewed

@@ -1,5 +1,4 @@
-gradio==5.12.0
 pandas
 matplotlib
-plotly
-promptquality==0.72.1

+gradio==5.18.0
 pandas
 matplotlib
+plotly

results.csv CHANGED Viewed

@@ -1,4 +1,5 @@
 Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
 gemini-2.0-flash-001,Private,Normal,Google,0.15,0.6,0.938,0.95,0.93,0.91,0.94,0.9,0.96,0.92,0.95,0.89,0.91,0.98,0.93,0.97,0.98,0.93,0.965
 gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
 gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
@@ -16,4 +17,4 @@ mistral-small-2409,Private,Normal,Mistral,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,
 ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
 Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
 open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
-,,,,,,,0.83,0.79,0.81,0.78,0.76,0.88,0.80,0.96,0.60,0.81,0.82,0.81,0.92,0.85,0.73,0.80

 Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
+claude-3-7-sonnet-20250219,Private,Reasoning,Anthropic,3,15,0.953,0.96,0.95,0.92,0.96,1,0.95,0.97,1,0.96,0.94,0.97,0.96,0.99,0.82,0.92,0.975
 gemini-2.0-flash-001,Private,Normal,Google,0.15,0.6,0.938,0.95,0.93,0.91,0.94,0.9,0.96,0.92,0.95,0.89,0.91,0.98,0.93,0.97,0.98,0.93,0.965
 gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
 gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
 ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
 Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
 open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
+Dataset Avg,,,,,,,0.83,0.80,0.81,0.79,0.78,0.89,0.81,0.96,0.62,0.81,0.82,0.82,0.92,0.85,0.74,0.81

tabs/data_exploration.py CHANGED Viewed

@@ -1,135 +1,674 @@
 import gradio as gr
-from chat import get_chat_and_score_df, update_chat_display
-def create_exploration_tab(df, MODELS, DATASETS, SCORES, HEADER_CONTENT):
-    def filter_and_update_display(model, dataset, min_score, max_score, current_index):
-        try:
-            df_chat = get_chat_and_score_df(model, dataset)
-            # Filter by score range
-            df_chat = df_chat[
-                (df_chat["score"] >= min_score) & (df_chat["score"] <= max_score)
-            ]
-            if df_chat.empty:
-                return (
-                    "<div>No data available for selected filters</div>",
-                    "<div>No metrics available</div>",
-                    "<div>No tool information available</div>",
-                    "0/0",
-                )
-            max_index = len(df_chat) - 1
-            current_index = min(current_index, max_index)
-            chat_html, metrics_html, tool_html = update_chat_display(
-                df_chat, current_index
             )
-            return (
-                chat_html,
-                metrics_html,
-                tool_html,
-                f"{current_index + 1}/{len(df_chat)}",
             )
-        except Exception as e:
-            print(f"Error in filter_and_update_display: {str(e)}")
             return (
-                f"<div>Error: {str(e)}</div>",
-                "<div>No metrics available</div>",
-                "<div>No tool information available</div>",
-                "0/0",
             )
     with gr.Tab("Data Exploration"):
-        gr.HTML(HEADER_CONTENT)
-        # All filters in a single row with consistent sizing
-        with gr.Row(equal_height=True):
-            explore_model = gr.Dropdown(
-                choices=MODELS,
-                value=MODELS[0],
-                label="Model",
-                container=True,
-                scale=1,
-            )
-            explore_dataset = gr.Dropdown(
-                choices=DATASETS,
-                value=DATASETS[0],
-                label="Dataset",
-                container=True,
-                scale=1,
-            )
-            min_score = gr.Slider(
-                minimum=min(SCORES),
-                maximum=max(SCORES),
-                value=min(SCORES),
-                step=0.1,
-                label="Minimum Score - TSQ",
-                container=True,
-                scale=1,
-            )
-            max_score = gr.Slider(
-                minimum=min(SCORES),
-                maximum=max(SCORES),
-                value=max(SCORES),
-                step=0.1,
-                label="Maximum Score - TSQ",
-                container=True,
-                scale=1,
-            )
         # Navigation row
         with gr.Row(variant="panel"):
-            index_display = gr.HTML(  # Changed the variable name to index_display
-                value="0/0", elem_id="index-display", elem_classes="text-center"
-            )
-            with gr.Row():
-                prev_btn = gr.Button("← Previous", size="lg", variant="secondary")
-                next_btn = gr.Button("Next →", size="lg", variant="secondary")
-        # Content area with equal column widths
         with gr.Row(equal_height=True):
-            chat_display = gr.HTML()
-            metrics_display = gr.HTML()
             tool_info_display = gr.HTML()
         current_index = gr.State(value=0)
-        # Update display on filter change
-        def update_on_filter_change(model, dataset, min_score, max_score):
-            return filter_and_update_display(model, dataset, min_score, max_score, 0)
-        for control in [explore_model, explore_dataset, min_score, max_score]:
             control.change(
-                update_on_filter_change,
-                inputs=[explore_model, explore_dataset, min_score, max_score],
                 outputs=[
                     chat_display,
                     metrics_display,
                     tool_info_display,
                     index_display,
-                ],  # Changed to index_display
-            )
-        # Navigation functions
-        def navigate(direction, current_idx, model, dataset, min_score, max_score):
-            new_index = current_idx + direction
-            return (
-                *filter_and_update_display(
-                    model, dataset, min_score, max_score, new_index
-                ),
-                new_index,
             )
         prev_btn.click(
-            lambda idx, m, d, min_s, max_s: navigate(-1, idx, m, d, min_s, max_s),
             inputs=[
                 current_index,
                 explore_model,
                 explore_dataset,
                 min_score,
                 max_score,
             ],
             outputs=[
                 chat_display,
@@ -137,17 +676,20 @@ def create_exploration_tab(df, MODELS, DATASETS, SCORES, HEADER_CONTENT):
                 tool_info_display,
                 index_display,
                 current_index,
-            ],  # Changed to index_display
         )
         next_btn.click(
-            lambda idx, m, d, min_s, max_s: navigate(1, idx, m, d, min_s, max_s),
             inputs=[
                 current_index,
                 explore_model,
                 explore_dataset,
                 min_score,
                 max_score,
             ],
             outputs=[
                 chat_display,
@@ -155,12 +697,126 @@ def create_exploration_tab(df, MODELS, DATASETS, SCORES, HEADER_CONTENT):
                 tool_info_display,
                 index_display,
                 current_index,
-            ],  # Changed to index_display
         )
-        return (
             chat_display,
             metrics_display,
             tool_info_display,
-            index_display,  # Changed to index_display
         )

 import gradio as gr
+import pandas as pd
+import numpy as np
+from data_loader import MODELS, DATASETS, SCORES, HEADER_CONTENT
+from chat import (
+    format_chat_display,
+    format_metrics_display,
+    format_tool_info,
+)
def get_updated_df(df, df_output):
    """Attach model outputs to the matching dataset rows.

    Rows are paired by position: row ``i`` of ``df_output`` is assumed to
    describe row ``i`` of ``df``.  Both frames are truncated to their common
    length, so a partially generated output file (fewer rows than the
    dataset) and an output file longer than the dataset are both handled
    without a length-mismatch error.

    Args:
        df: Dataset frame; must contain ``conversation``, ``tools_langchain``,
            ``n_turns``, ``len_query`` and ``n_tools``.
        df_output: Model output frame; must contain ``response``,
            ``rationale``, ``explanation`` and ``score``.

    Returns:
        A new DataFrame (the inputs are not modified) restricted to the
        columns used by the exploration view.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    n_rows = min(len(df), len(df_output))
    merged = df.iloc[:n_rows].copy()

    # Copy as plain lists so values are matched by position, not by index label.
    for column in ("response", "rationale", "explanation", "score"):
        merged[column] = df_output[column].iloc[:n_rows].tolist()

    return merged[
        [
            "conversation",
            "tools_langchain",
            "n_turns",
            "len_query",
            "n_tools",
            "response",
            "rationale",
            "explanation",
            "score",
        ]
    ]
def get_chat_and_score_df(model, dataset):
    """Load one model's scored outputs together with the dataset they refer to.

    Reads ``output/<model>/<dataset>.parquet`` and ``datasets/<dataset>.parquet``
    (paths relative to the working directory) and combines them with
    ``get_updated_df``.

    NOTE(review): ``model`` and ``dataset`` are interpolated into the paths
    as-is; confirm callers only pass known model/dataset names.
    """
    scored_outputs = pd.read_parquet(f"output/{model}/{dataset}.parquet")
    dataset_rows = pd.read_parquet(f"datasets/{dataset}.parquet")
    return get_updated_df(dataset_rows, scored_outputs)
def on_filter_change(
    model,
    dataset,
    min_score,
    max_score,
    min_n_turns,
    min_len_query,
    min_n_tools,
):
    """Re-render the exploration panels after any filter control changes.

    The filtered data is always shown from its first row (index 0).

    Returns:
        A 4-tuple of HTML strings: chat panel, metrics panel, tool panel and
        the position indicator.  On failure the first element is an error
        panel and the rest are placeholders.

    NOTE(review): only four values are returned, so a separately stored
    navigation index is not reset by this handler.
    """
    import html  # local import keeps this block self-contained

    try:
        # Unpack so an unexpected result shape is reported as a filter error.
        chat_html, metrics_html, tool_html, index_html = filter_and_display(
            model,
            dataset,
            min_score,
            max_score,
            min_n_turns,
            min_len_query,
            min_n_tools,
            0,
        )
        return chat_html, metrics_html, tool_html, index_html
    except Exception as e:
        # Escape the message: exception text often contains '<', '>' or '&'
        # (e.g. "'<' not supported between ...") which would corrupt the markup.
        error_html = f"""
        <div style="padding: 1.5rem; color: var(--score-low);">
            <div style="font-weight: 600;">Filter Error</div>
            <div style="font-family: monospace; background-color: var(--surface-color-alt); padding: 0.5rem; margin-top: 0.5rem;">
                {html.escape(str(e))}
            </div>
        </div>
        """
        return (
            error_html,
            "<div style='text-align: center;'>No metrics available</div>",
            "<div style='text-align: center;'>No tool information available</div>",
            "<div style='text-align: center;'>0/0</div>",
        )
def _parse_navigation_index(current_idx):
    """Return the stored navigation index as an int, or 0 if it is unusable.

    Accepts either a plain value or a ``{"value": ...}`` payload.
    """
    if isinstance(current_idx, dict):
        current_idx = current_idx.get("value")
    try:
        return int(current_idx) if current_idx is not None else 0
    except (TypeError, ValueError):
        return 0


def _shown_index(index_html, fallback):
    """Return the zero-based row index that ``index_html`` actually displays.

    ``filter_and_display`` renders the position indicator as
    ``<position>/<total>`` (``0/0`` when nothing is shown).  Reading it back
    keeps the stored index equal to the row on screen instead of letting it
    drift past the last row.  ``fallback`` is returned when no counter is found.
    """
    import re

    if not isinstance(index_html, str):
        return fallback
    match = re.search(r"(\d+)/(\d+)", index_html)
    if match is None:
        return fallback
    position, total = int(match.group(1)), int(match.group(2))
    if total == 0:
        return 0
    return max(0, min(position, total) - 1)


def _navigate(
    step,
    current_idx,
    model,
    dataset,
    min_score,
    max_score,
    min_n_turns,
    min_len_query,
    min_n_tools,
):
    """Move ``step`` rows from ``current_idx`` and re-render the panels."""
    import html  # local import keeps this block self-contained

    start_index = _parse_navigation_index(current_idx)
    try:
        requested = max(0, start_index + step)
        chat_html, metrics_html, tool_html, index_html = filter_and_display(
            model,
            dataset,
            min_score,
            max_score,
            min_n_turns,
            min_len_query,
            min_n_tools,
            requested,
        )
        # Store the index that was really displayed: filter_and_display clamps
        # out-of-range requests, and the state must follow that clamp.
        new_index = _shown_index(index_html, requested)
        return chat_html, metrics_html, tool_html, index_html, new_index
    except Exception as e:
        # Escape the message so '<', '>' and '&' cannot corrupt the markup.
        error_html = f"""
        <div style="padding: 1.5rem; color: var(--score-low);">
            <div style="font-weight: 600;">Navigation Error</div>
            <div style="font-family: monospace; background-color: var(--surface-color-alt); padding: 0.5rem; margin-top: 0.5rem;">
                {html.escape(str(e))}
            </div>
        </div>
        """
        return (
            error_html,
            "<div style='text-align: center;'>No metrics available</div>",
            "<div style='text-align: center;'>No tool information available</div>",
            "<div style='text-align: center;'>0/0</div>",
            start_index,
        )


def navigate_prev(
    current_idx,
    model,
    dataset,
    min_score,
    max_score,
    min_n_turns,
    min_len_query,
    min_n_tools,
):
    """Show the previous row of the filtered data.

    Returns:
        A 5-tuple: chat HTML, metrics HTML, tool HTML, position-indicator HTML
        and the new zero-based index to store.
    """
    return _navigate(
        -1,
        current_idx,
        model,
        dataset,
        min_score,
        max_score,
        min_n_turns,
        min_len_query,
        min_n_tools,
    )


def navigate_next(
    current_idx,
    model,
    dataset,
    min_score,
    max_score,
    min_n_turns,
    min_len_query,
    min_n_tools,
):
    """Show the next row of the filtered data.

    The returned index never exceeds the last available row, so repeated
    clicks at the end of the data do not accumulate hidden offsets.

    Returns:
        A 5-tuple: chat HTML, metrics HTML, tool HTML, position-indicator HTML
        and the new zero-based index to store.
    """
    return _navigate(
        1,
        current_idx,
        model,
        dataset,
        min_score,
        max_score,
        min_n_turns,
        min_len_query,
        min_n_tools,
    )
def _resolve_input(raw, cast, fallback):
    """Normalise one UI input to a plain value of the wanted type.

    ``raw`` may be the value itself or a ``{"value": ...}`` payload.  When no
    usable value is present, ``fallback()`` supplies the default; it is a
    zero-argument callable so defaults are computed only when needed.
    """
    if isinstance(raw, dict):
        raw = raw.get("value")
    return cast(fallback() if raw is None else raw)


def filter_and_display(
    model,
    dataset,
    min_score,
    max_score,
    min_n_turns,
    min_len_query,
    min_n_tools,
    index=0,
):
    """Filter one model/dataset pair and render the row at ``index``.

    Args:
        model, dataset: Names used to load the data; default to the first
            entry of ``MODELS`` / ``DATASETS`` when missing.
        min_score, max_score: Inclusive score bounds; default to the extremes
            of ``SCORES``.
        min_n_turns, min_len_query, min_n_tools: Inclusive lower bounds on the
            corresponding columns; default to 0.
        index: Zero-based row position within the filtered data.  Unparsable
            values become 0 and out-of-range values are clamped.

    Returns:
        A 4-tuple of HTML strings: chat panel, metrics panel, tool panel and
        position indicator (``<position>/<total>``, or ``0/0`` when nothing
        can be shown).  Exceptions are not propagated; they are rendered as
        an error panel in the first element.
    """
    import html  # local import keeps this block self-contained

    try:
        model_str = _resolve_input(model, str, lambda: MODELS[0])
        dataset_str = _resolve_input(dataset, str, lambda: DATASETS[0])
        min_score_val = _resolve_input(min_score, float, lambda: min(SCORES))
        max_score_val = _resolve_input(max_score, float, lambda: max(SCORES))
        min_n_turns_val = _resolve_input(min_n_turns, int, lambda: 0)
        min_len_query_val = _resolve_input(min_len_query, int, lambda: 0)
        min_n_tools_val = _resolve_input(min_n_tools, int, lambda: 0)
        # A bad index should still show something, so fall back to the first row.
        try:
            index_val = _resolve_input(index, int, lambda: 0)
        except (ValueError, TypeError):
            index_val = 0

        # Get the data
        df_chat = get_chat_and_score_df(model_str, dataset_str)

        # Ensure filter columns exist and are numeric; unparsable cells
        # take the column default so the comparisons below cannot fail.
        for col, default in [
            ("score", 0.0),
            ("n_turns", 0),
            ("len_query", 0),
            ("n_tools", 0),
        ]:
            if col not in df_chat.columns:
                df_chat[col] = default
            else:
                df_chat[col] = pd.to_numeric(df_chat[col], errors="coerce").fillna(
                    default
                )

        # Apply all filters
        df_filtered = df_chat[
            (df_chat["score"] >= min_score_val)
            & (df_chat["score"] <= max_score_val)
            & (df_chat["n_turns"] >= min_n_turns_val)
            & (df_chat["len_query"] >= min_len_query_val)
            & (df_chat["n_tools"] >= min_n_tools_val)
        ].copy()

        if df_filtered.empty:
            empty_message = """
            <div style="
                padding: 1.5rem;
                text-align: center;
                color: var(--text-muted);
                background-color: var(--surface-color-alt);
                border-radius: 8px;
                border: 1px dashed var(--border-color);
                margin: 1rem 0;">
                <div style="font-size: 2rem; margin-bottom: 1rem;">📭</div>
                <div style="font-weight: 500; margin-bottom: 0.5rem;">No Results Found</div>
                <div style="font-style: italic; font-size: 0.9rem;">Try adjusting your filters to see more data</div>
            </div>
            """
            return (
                empty_message,
                empty_message,
                empty_message,
                "<div style='text-align: center; color: var(--text-muted);'>0/0</div>",
            )

        # Clamp the requested position into the filtered range.
        max_index = len(df_filtered) - 1
        valid_index = max(0, min(index_val, max_index))
        row = df_filtered.iloc[valid_index]

        # Format displays
        chat_html = format_chat_display(row)
        metrics_html = format_metrics_display(row)

        # A tool-rendering failure must not hide the chat and metrics panels.
        try:
            tool_html = format_tool_info(row["tools_langchain"])
        except Exception as e:
            tool_html = f"""
            <div style="padding: 1rem; background-color: var(--surface-color-alt); border-radius: 8px; color: var(--text-muted);">
                <div style="font-weight: 500; margin-bottom: 0.5rem;">Tool Information Unavailable</div>
                <div style="font-size: 0.9rem;">Error: {html.escape(str(e))}</div>
            </div>
            """

        # Index display
        index_html = f"""
        <div style="
            display: flex;
            align-items: center;
            justify-content: center;
            font-weight: 500;
            color: var(--primary-text);
            background-color: var(--surface-color-alt);
            padding: 0.5rem 1rem;
            border-radius: 20px;
            font-size: 0.9rem;
            width: fit-content;
            margin: 0 auto;">
            <span style="margin-right: 0.5rem;">📄</span>{valid_index + 1}/{len(df_filtered)}
        </div>
        """
        return chat_html, metrics_html, tool_html, index_html
    except Exception as e:
        # Escape the message so '<', '>' and '&' cannot corrupt the markup.
        error_html = f"""
        <div style="
            padding: 1.5rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 8px;
            margin: 1rem 0;
            display: flex;
            align-items: flex-start;">
            <div style="flex-shrink: 0; margin-right: 1rem; font-size: 1.5rem;">⚠️</div>
            <div>
                <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Occurred</div>
                <div style="
                    font-family: monospace;
                    background-color: var(--surface-color-alt);
                    padding: 1rem;
                    border-radius: 4px;
                    white-space: pre-wrap;
                    font-size: 0.9rem;">
                    {html.escape(str(e))}
                </div>
            </div>
        </div>
        """
        return (
            error_html,
            "<div style='padding: 1.5rem; color: var(--text-muted); text-align: center;'>No metrics available</div>",
            "<div style='padding: 1.5rem; color: var(--text-muted); text-align: center;'>No tool information available</div>",
            "<div style='text-align: center; color: var(--text-muted);'>0/0</div>",
        )
+def create_exploration_tab(df):
+    """Create an enhanced data exploration tab with better UI and functionality."""
+    # Main UI setup
     with gr.Tab("Data Exploration"):
+        # CSS styling (unchanged)
+        gr.HTML(
+            """
+        <style>
+            /* Custom styling for the exploration tab */
+            :root[data-theme="light"] {
+                --surface-color: #f8f9fa;
+                --surface-color-alt: #ffffff;
+                --text-color: #202124;
+                --text-muted: #666666;
+                --primary-text: #1a73e8;
+                --primary-text-light: rgba(26, 115, 232, 0.3);
+                --border-color: #e9ecef;
+                --border-color-light: #f1f3f5;
+                --shadow-color: rgba(0,0,0,0.05);
+                --message-bg-user: #E5F6FD;
+                --message-bg-assistant: #F7F7F8;
+                --message-bg-system: #FFF3E0;
+                --response-bg: #F0F7FF;
+                --score-high: #1a73e8;
+                --score-med: #f4b400;
+                --score-low: #ea4335;
+            }
+            :root[data-theme="dark"] {
+                --surface-color: #1e1e1e;
+                --surface-color-alt: #2d2d2d;
+                --text-color: #ffffff;
+                --text-muted: #a0a0a0;
+                --primary-text: #60a5fa;
+                --primary-text-light: rgba(96, 165, 250, 0.3);
+                --border-color: #404040;
+                --border-color-light: #333333;
+                --shadow-color: rgba(0,0,0,0.2);
+                --message-bg-user: #2d3748;
+                --message-bg-assistant: #1a1a1a;
+                --message-bg-system: #2c2516;
+                --response-bg: #1e2a3a;
+                --score-high: #60a5fa;
+                --score-med: #fbbf24;
+                --score-low: #ef4444;
+            }
+            #exploration-header {
+                margin-bottom: 1.5rem;
+                padding-bottom: 1rem;
+                border-bottom: 1px solid var(--border-color);
+            }
+            .filter-container {
+                background-color: var(--surface-color);
+                border-radius: 10px;
+                padding: 1rem;
+                margin-bottom: 1.5rem;
+                border: 1px solid var(--border-color);
+                box-shadow: 0 2px 6px var(--shadow-color);
+            }
+            .navigation-buttons button {
+                min-width: 120px;
+                font-weight: 500;
+            }
+            .content-panel {
+                margin-top: 1.5rem;
+            }
+            @media (max-width: 768px) {
+                .filter-row {
+                    flex-direction: column;
+                }
+            }
+        </style>
+        """
+        )
+        # Header
+        with gr.Row(elem_id="exploration-header"):
+            gr.HTML(HEADER_CONTENT)
+        # Filters section
+        with gr.Column(elem_classes="filter-container"):
+            gr.Markdown("### 🔍 Filter Options")
+            with gr.Row(equal_height=True, elem_classes="filter-row"):
+                explore_model = gr.Dropdown(
+                    choices=MODELS,
+                    value=MODELS[0],
+                    label="Model",
+                    container=True,
+                    scale=1,
+                    info="Select AI model",
+                )
+                explore_dataset = gr.Dropdown(
+                    choices=DATASETS,
+                    value=DATASETS[0],
+                    label="Dataset",
+                    container=True,
+                    scale=1,
+                    info="Select evaluation dataset",
+                )
+            with gr.Row(equal_height=True, elem_classes="filter-row"):
+                min_score = gr.Slider(
+                    minimum=float(min(SCORES)),
+                    maximum=float(max(SCORES)),
+                    value=float(min(SCORES)),
+                    step=0.1,
+                    label="Minimum TSQ Score",
+                    container=True,
+                    scale=1,
+                    info="Filter responses with scores above this threshold",
+                )
+                max_score = gr.Slider(
+                    minimum=float(min(SCORES)),
+                    maximum=float(max(SCORES)),
+                    value=float(max(SCORES)),
+                    step=0.1,
+                    label="Maximum TSQ Score",
+                    container=True,
+                    scale=1,
+                    info="Filter responses with scores below this threshold",
+                )
+            # Get the data for initial ranges
+            df_chat = get_chat_and_score_df(explore_model.value, explore_dataset.value)
+            # Ensure columns exist and get ranges
+            n_turns_max = int(df_chat["n_turns"].max())
+            len_query_max = int(df_chat["len_query"].max())
+            n_tools_max = int(df_chat["n_tools"].max())
+            with gr.Row(equal_height=True, elem_classes="filter-row"):
+                n_turns_filter = gr.Slider(
+                    minimum=0,
+                    maximum=n_turns_max,
+                    value=0,
+                    step=1,
+                    label="Minimum Turn Count",
+                    container=True,
+                    scale=1,
+                    info="Filter by minimum number of conversation turns",
+                )
+                len_query_filter = gr.Slider(
+                    minimum=0,
+                    maximum=len_query_max,
+                    value=0,
+                    step=10,
+                    label="Minimum Query Length",
+                    container=True,
+                    scale=1,
+                    info="Filter by minimum length of query in characters",
+                )
+                n_tools_filter = gr.Slider(
+                    minimum=0,
+                    maximum=n_tools_max,
+                    value=0,
+                    step=1,
+                    label="Minimum Tool Count",
+                    container=True,
+                    scale=1,
+                    info="Filter by minimum number of tools used",
+                )
+            with gr.Row():
+                reset_btn = gr.Button("Reset Filters", size="sm", variant="secondary")
         # Navigation row
         with gr.Row(variant="panel"):
+            with gr.Column(scale=1):
+                prev_btn = gr.Button(
+                    "← Previous",
+                    size="lg",
+                    variant="secondary",
+                    elem_classes="navigation-buttons",
+                )
+            with gr.Column(scale=1, min_width=100):
+                index_display = gr.HTML(
+                    value="<div style='text-align: center; color: var(--text-muted);'>0/0</div>",
+                    elem_id="index-display",
+                )
+            with gr.Column(scale=1):
+                next_btn = gr.Button(
+                    "Next →",
+                    size="lg",
+                    variant="secondary",
+                    elem_classes="navigation-buttons",
+                )
+        # Content areas
         with gr.Row(equal_height=True):
+            with gr.Column(scale=1):
+                chat_display = gr.HTML()
+            with gr.Column(scale=1):
+                metrics_display = gr.HTML()
+        with gr.Row():
             tool_info_display = gr.HTML()
+        # State for tracking current index (simple integer state)
         current_index = gr.State(value=0)
+        # Reset filters
+        def reset_filters():
+            return (
+                MODELS[0],
+                DATASETS[0],
+                float(min(SCORES)),
+                float(max(SCORES)),
+                0,  # n_turns
+                0,  # len_query
+                0,  # n_tools
+            )
+        reset_btn.click(
+            reset_filters,
+            outputs=[
+                explore_model,
+                explore_dataset,
+                min_score,
+                max_score,
+                n_turns_filter,
+                len_query_filter,
+                n_tools_filter,
+            ],
+        )
+        # Connect filter changes
+        # Replace the existing filter connections with this:
+        for control in [
+            explore_model,
+            explore_dataset,
+            min_score,
+            max_score,
+            n_turns_filter,
+            len_query_filter,
+            n_tools_filter,
+        ]:
             control.change(
+                on_filter_change,
+                inputs=[
+                    explore_model,
+                    explore_dataset,
+                    min_score,
+                    max_score,
+                    n_turns_filter,
+                    len_query_filter,
+                    n_tools_filter,
+                ],
                 outputs=[
                     chat_display,
                     metrics_display,
                     tool_info_display,
                     index_display,
+                ],
             )
+        # Connect navigation buttons with necessary filter parameters
         prev_btn.click(
+            navigate_prev,
             inputs=[
                 current_index,
                 explore_model,
                 explore_dataset,
                 min_score,
                 max_score,
+                n_turns_filter,
+                len_query_filter,
+                n_tools_filter,
             ],
             outputs=[
                 chat_display,
                 tool_info_display,
                 index_display,
                 current_index,
+            ],
         )
         next_btn.click(
+            navigate_next,
             inputs=[
                 current_index,
                 explore_model,
                 explore_dataset,
                 min_score,
                 max_score,
+                n_turns_filter,
+                len_query_filter,
+                n_tools_filter,
             ],
             outputs=[
                 chat_display,
                 tool_info_display,
                 index_display,
                 current_index,
+            ],
         )
+        def update_slider_ranges(model, dataset):
+            """Recompute the upper bounds of the three filter sliders.
+
+            Loads the chat/score frame for ``model`` and ``dataset`` and derives
+            a maximum for the ``n_turns``, ``len_query`` and ``n_tools`` sliders
+            from the largest numeric value found in the column of the same name.
+
+            Returns:
+                A 3-tuple of ``gr.update(...)`` objects (n_turns, len_query,
+                n_tools), each setting ``maximum`` and resetting ``value`` to 0.
+            """
+            df_chat = get_chat_and_score_df(model, dataset)
+            def column_max(column, floor):
+                # Coerce on a temporary Series so the frame returned by
+                # get_chat_and_score_df is not modified in place.
+                peak = pd.to_numeric(df_chat[column], errors="coerce").max()
+                # .max() is NaN for an empty frame or an all-non-numeric column;
+                # int(NaN) would raise ValueError, so fall back to the floor.
+                if pd.isna(peak):
+                    return floor
+                # The floor keeps the slider range non-degenerate (maximum > 0).
+                return max(floor, int(peak))
+            # Return updated sliders using gr.update()
+            return (
+                gr.update(maximum=column_max("n_turns", 1), value=0),
+                gr.update(maximum=column_max("len_query", 10), value=0),
+                gr.update(maximum=column_max("n_tools", 1), value=0),
+            )
+        # Connect model and dataset changes to slider range updates
+        explore_model.change(
+            update_slider_ranges,
+            inputs=[explore_model, explore_dataset],
+            outputs=[n_turns_filter, len_query_filter, n_tools_filter],
+        )
+        explore_dataset.change(
+            update_slider_ranges,
+            inputs=[explore_model, explore_dataset],
+            outputs=[n_turns_filter, len_query_filter, n_tools_filter],
+        )
+        return [
             chat_display,
             metrics_display,
             tool_info_display,
+            index_display,
+        ]
+def filter_and_update_display(model, dataset, min_score, max_score, current_index):
+    """Filter a model/dataset's chats by score and render the selected sample.
+
+    Args:
+        model: Model identifier passed through to ``get_chat_and_score_df``.
+        dataset: Dataset identifier passed through to ``get_chat_and_score_df``.
+        min_score: Inclusive lower bound on the ``score`` column.
+        max_score: Inclusive upper bound on the ``score`` column.
+        current_index: Zero-based position of the sample to show; it is
+            clamped into the range of the filtered rows.
+
+    Returns:
+        A 4-tuple of HTML strings: (chat, metrics, tool info, position label).
+        Placeholder HTML is returned when no rows match the score range, and
+        an error panel plus placeholders when any exception is raised.
+    """
+    import html  # local import: only needed to escape exception text below
+    # Placeholders shared by the "no data" and "error" outcomes.
+    no_metrics_html = '<div style="padding: 1.5rem; color: var(--text-muted); text-align: center; font-style: italic;">No metrics available</div>'
+    no_tools_html = '<div style="padding: 1.5rem; color: var(--text-muted); text-align: center; font-style: italic;">No tool information available</div>'
+    zero_index_html = '<div style="font-weight: 500; color: var(--text-muted);">0/0</div>'
+    try:
+        df_chat = get_chat_and_score_df(model, dataset)
+        df_chat = df_chat[
+            (df_chat["score"] >= min_score) & (df_chat["score"] <= max_score)
+        ]
+        if df_chat.empty:
+            return (
+                '<div style="padding: 1.5rem; color: var(--text-muted); text-align: center; font-style: italic; background-color: var(--surface-color-alt); border-radius: 8px; border: 1px dashed var(--border-color);">No data available for selected filters</div>',
+                no_metrics_html,
+                no_tools_html,
+                zero_index_html,
+            )
+        # Clamp on both sides: a stale index may exceed the filtered row count,
+        # and a negative one would produce a "0/N" (or negative) position label.
+        current_index = max(0, min(current_index, len(df_chat) - 1))
+        chat_html, metrics_html, tool_html = update_chat_display(df_chat, current_index)
+        index_display = f"""
+        <div style="
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            font-weight: 500;
+            color: var(--primary-text);
+            background-color: var(--surface-color-alt);
+            padding: 0.5rem 1rem;
+            border-radius: 20px;
+            font-size: 0.9rem;
+            width: fit-content;
+            margin: 0 auto;">
+            <span style="margin-right: 0.25rem;">📄</span>{current_index + 1}/{len(df_chat)}
+        </div>
+        """
+        return chat_html, metrics_html, tool_html, index_display
+    except Exception as e:
+        # Exception messages are arbitrary text and often contain "<", ">" or
+        # "&" (e.g. "'<=' not supported between ..."); escape them so they are
+        # shown literally instead of being parsed as markup.
+        error_text = html.escape(str(e))
+        error_html = f"""
+        <div style="
+            padding: 1.5rem;
+            color: var(--score-low);
+            background-color: var(--surface-color);
+            border: 1px solid var(--score-low);
+            border-radius: 8px;
+            display: flex;
+            align-items: flex-start;">
+            <div style="
+                flex-shrink: 0;
+                margin-right: 1rem;
+                font-size: 1.5rem;">⚠️</div>
+            <div>
+                <div style="
+                    font-weight: 600;
+                    margin-bottom: 0.5rem;">Error Occurred</div>
+                <div style="
+                    font-family: monospace;
+                    background-color: var(--surface-color-alt);
+                    padding: 1rem;
+                    border-radius: 4px;
+                    white-space: pre-wrap;
+                    font-size: 0.9rem;">
+                    {error_text}
+                </div>
+            </div>
+        </div>
+        """
+        return (
+            error_html,
+            no_metrics_html,
+            no_tools_html,
+            zero_index_html,
         )

tabs/leaderboard.py CHANGED Viewed

@@ -186,6 +186,14 @@ def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
         plot1 = gr.Plot()
         plot2 = gr.Plot()
         gr.HTML(METHODOLOGY)
         for input_comp in [model_type, category, sort_by]:

         plot1 = gr.Plot()
         plot2 = gr.Plot()
+        gr.HTML(
+            """<div class="note-box">
+                <p style="margin: 0; font-size: 1em;">
+                    Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation.
+                </p>
+            </div>"""
+        )
         gr.HTML(METHODOLOGY)
         for input_comp in [model_type, category, sort_by]:

visualization.py CHANGED Viewed

@@ -199,7 +199,7 @@ def get_performance_cost_chart(df, category_name="Overall"):
     ax.set_xscale("log")
     ax.set_xlim(0.08, 40)
-    ax.set_ylim(60, 95)
     ax.set_xlabel(
         "I/O Cost per Million Tokens ($)",
@@ -233,7 +233,7 @@ def get_performance_cost_chart(df, category_name="Overall"):
         color=colors["text"],
     )
-    for y1, y2, color in zip([85, 75, 60], [95, 85, 75], colors["performance_bands"]):
         ax.axhspan(y1, y2, alpha=0.2, color=color, zorder=1)
     ax.tick_params(axis="both", which="major", labelsize=9, colors=colors["text"])

     ax.set_xscale("log")
     ax.set_xlim(0.08, 40)
+    ax.set_ylim(60, 100)
     ax.set_xlabel(
         "I/O Cost per Million Tokens ($)",
         color=colors["text"],
     )
+    for y1, y2, color in zip([85, 75, 60], [100, 85, 75], colors["performance_bands"]):
         ax.axhspan(y1, y2, alpha=0.2, color=color, zorder=1)
     ax.tick_params(axis="both", which="major", labelsize=9, colors=colors["text"])