Spaces:

Thiresh
/

DataAnalysisAgent

Running

App Files Files Community

Thiresh commited on Sep 14

Commit

f64df95

verified ·

1 Parent(s): ef153c9

Update data_analysis_agent.py

Browse files

Files changed (1) hide show

data_analysis_agent.py +595 -595

data_analysis_agent.py CHANGED Viewed

@@ -1,596 +1,596 @@
-import os, io, re
-import pandas as pd
-import numpy as np
-import streamlit as st
-from openai import OpenAI
-import matplotlib.pyplot as plt
-from typing import List, Any, Optional
-# === Configuration ===
-# Global configuration
-API_BASE_URL = "https://integrate.api.nvidia.com/v1"
-API_KEY = "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" #os.environ.get("NVIDIA_API_KEY")
-# Plot configuration
-DEFAULT_FIGSIZE = (6, 4)
-DEFAULT_DPI = 100
-# Display configuration
-MAX_RESULT_DISPLAY_LENGTH = 300
-class ModelConfig:
-    """Configuration class for different models."""
-    def __init__(self, model_name: str, model_url: str, model_print_name: str,
-                 # QueryUnderstandingTool parameters
-                 query_understanding_temperature: float = 0.1,
-                 query_understanding_max_tokens: int = 5,
-                 # CodeGenerationAgent parameters
-                 code_generation_temperature: float = 0.2,
-                 code_generation_max_tokens: int = 1024,
-                 # ReasoningAgent parameters
-                 reasoning_temperature: float = 0.2,
-                 reasoning_max_tokens: int = 1024,
-                 # DataInsightAgent parameters
-                 insights_temperature: float = 0.2,
-                 insights_max_tokens: int = 512,
-                 reasoning_false: str = "detailed thinking off",
-                 reasoning_true: str = "detailed thinking on"):
-        self.MODEL_NAME = model_name
-        self.MODEL_URL = model_url
-        self.MODEL_PRINT_NAME = model_print_name
-        # Function-specific LLM parameters
-        self.QUERY_UNDERSTANDING_TEMPERATURE = query_understanding_temperature
-        self.QUERY_UNDERSTANDING_MAX_TOKENS = query_understanding_max_tokens
-        self.CODE_GENERATION_TEMPERATURE = code_generation_temperature
-        self.CODE_GENERATION_MAX_TOKENS = code_generation_max_tokens
-        self.REASONING_TEMPERATURE = reasoning_temperature
-        self.REASONING_MAX_TOKENS = reasoning_max_tokens
-        self.INSIGHTS_TEMPERATURE = insights_temperature
-        self.INSIGHTS_MAX_TOKENS = insights_max_tokens
-        self.REASONING_FALSE = reasoning_false
-        self.REASONING_TRUE = reasoning_true
-# Predefined model configurations
-MODEL_CONFIGS = {
-    "llama-3-1-nemotron-ultra-v1": ModelConfig(
-        model_name="nvidia/llama-3.1-nemotron-ultra-253b-v1",
-        model_url="https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1",
-        model_print_name="NVIDIA Llama 3.1 Nemotron Ultra 253B v1",
-        # QueryUnderstandingTool
-        query_understanding_temperature=0.1,
-        query_understanding_max_tokens=5,
-        # CodeGenerationAgent
-        code_generation_temperature=0.2,
-        code_generation_max_tokens=1024,
-        # ReasoningAgent
-        reasoning_temperature=0.6,
-        reasoning_max_tokens=1024,
-        # DataInsightAgent
-        insights_temperature=0.2,
-        insights_max_tokens=512,
-        reasoning_false="detailed thinking off",
-        reasoning_true="detailed thinking on"
-    ),
-    "llama-3-3-nemotron-super-v1-5": ModelConfig(
-        model_name="nvidia/llama-3.3-nemotron-super-49b-v1.5",
-        model_url="https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1_5",
-        model_print_name="NVIDIA Llama 3.3 Nemotron Super 49B v1.5",
-        # QueryUnderstandingTool
-        query_understanding_temperature=0.1,
-        query_understanding_max_tokens=5,
-        # CodeGenerationAgent
-        code_generation_temperature=0.0,
-        code_generation_max_tokens=1024,
-        # ReasoningAgent
-        reasoning_temperature=0.6,
-        reasoning_max_tokens=2048,
-        # DataInsightAgent
-        insights_temperature=0.2,
-        insights_max_tokens=512,
-        reasoning_false="/no_think",
-        reasoning_true=""
-    )
-}
-# Default configuration (can be changed via environment variable or UI)
-DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "llama-3-1-nemotron-ultra-v1")
-Config = MODEL_CONFIGS.get(DEFAULT_MODEL, MODEL_CONFIGS["llama-3-1-nemotron-ultra-v1"])
-# Initialize OpenAI client with configuration
-client = OpenAI(
-    base_url=API_BASE_URL,
-    api_key=API_KEY
-)
-def get_current_config():
-    """Get the current model configuration based on session state."""
-    # Always return the current model from session state
-    if "current_model" in st.session_state:
-        return MODEL_CONFIGS[st.session_state.current_model]
-    return MODEL_CONFIGS[DEFAULT_MODEL]
-# ------------------  QueryUnderstandingTool ---------------------------
-def QueryUnderstandingTool(query: str) -> bool:
-    """Return True if the query seems to request a visualisation based on keywords."""
-    # Use LLM to understand intent instead of keyword matching
-    current_config = get_current_config()
-    # Prepend the instruction to the query
-    full_prompt = f"""You are a query classifier. Your task is to determine if a user query is requesting a data visualization.
-IMPORTANT: Respond with ONLY 'true' or 'false' (lowercase, no quotes, no punctuation).
-Classify as 'true' ONLY if the query explicitly asks for:
-- A plot, chart, graph, visualization, or figure
-- To "show" or "display" data visually
-- To "create" or "generate" a visual representation
-- Words like: plot, chart, graph, visualize, show, display, create, generate, draw
-Classify as 'false' for:
-- Data analysis without visualization requests
-- Statistical calculations, aggregations, filtering, sorting
-- Questions about data content, counts, summaries
-- Requests for tables, dataframes, or text results
-User query: {query}"""
-    messages = [
-        {"role": "system", "content": current_config.REASONING_FALSE},
-        {"role": "user", "content": full_prompt}
-    ]
-    response = client.chat.completions.create(
-        model=current_config.MODEL_NAME,
-        messages=messages,
-        temperature=current_config.QUERY_UNDERSTANDING_TEMPERATURE,
-        max_tokens=current_config.QUERY_UNDERSTANDING_MAX_TOKENS  # We only need a short response
-    )
-    # Extract the response and convert to boolean
-    intent_response = response.choices[0].message.content.strip().lower()
-    return intent_response == "true"
-# === CodeGeneration TOOLS ============================================
-# ------------------  CodeWritingTool ---------------------------------
-def CodeWritingTool(cols: List[str], query: str) -> str:
-    """Generate a prompt for the LLM to write pandas-only code for a data query (no plotting)."""
-    return f"""
-    Given DataFrame `df` with columns:
-    {', '.join(cols)}
-    Write Python code (pandas **only**, no plotting) to answer:
-    "{query}"
-    Rules
-    -----
-    1. Use pandas operations on `df` only.
-    2. Rely only on the columns in the DataFrame.
-    3. Assign the final result to `result`.
-    4. Return your answer inside a single markdown fence that starts with ```python and ends with ```.
-    5. Do not include any explanations, comments, or prose outside the code block.
-    6. Use **df** as the sole data source. **Do not** read files, fetch data, or use Streamlit.
-    7. Do **not** import any libraries (pandas is already imported as pd).
-    8. Handle missing values (`dropna`) before aggregations.
-    Example
-    -----
-    ```python
-    result = df.groupby("some_column")["a_numeric_col"].mean().sort_values(ascending=False)
-    ```
-    """
-# ------------------  PlotCodeGeneratorTool ---------------------------
-def PlotCodeGeneratorTool(cols: List[str], query: str) -> str:
-    """Generate a prompt for the LLM to write pandas + matplotlib code for a plot based on the query and columns."""
-    return f"""
-    Given DataFrame `df` with columns:
-    {', '.join(cols)}
-    Write Python code using pandas **and matplotlib** (as plt) to answer:
-    "{query}"
-    Rules
-    -----
-    1. Use pandas for data manipulation and matplotlib.pyplot (as plt) for plotting.
-    2. Rely only on the columns in the DataFrame.
-    3. Assign the final result (DataFrame, Series, scalar *or* matplotlib Figure) to a variable named `result`.
-    4. Create only ONE relevant plot. Set `figsize={DEFAULT_FIGSIZE}`, add title/labels.
-    5. Return your answer inside a single markdown fence that starts with ```python and ends with ```.
-    6. Do not include any explanations, comments, or prose outside the code block.
-    7. Handle missing values (`dropna`) before plotting/aggregations.
-    """
-# === CodeGenerationAgent ==============================================
-def CodeGenerationAgent(query: str, df: pd.DataFrame, chat_context: Optional[str] = None):
-    """Selects the appropriate code generation tool and gets code from the LLM for the user's query."""
-    should_plot = QueryUnderstandingTool(query)
-    prompt = PlotCodeGeneratorTool(df.columns.tolist(), query) if should_plot else CodeWritingTool(df.columns.tolist(), query)
-    # Prepend the instruction to the query
-    context_section = f"\nConversation context (recent user turns):\n{chat_context}\n" if chat_context else ""
-    full_prompt = f"""You are a senior Python data analyst who writes clean, efficient code.
-    Solve the given problem with optimal pandas operations. Be concise and focused.
-    Your response must contain ONLY a properly-closed ```python code block with no explanations before or after (starts with ```python and ends with ```).
-    Ensure your solution is correct, handles edge cases, and follows best practices for data analysis.
-    If the latest user request references prior results ambiguously (e.g., "it", "that", "same groups"), infer intent from the conversation context and choose the most reasonable interpretation. {context_section}{prompt}"""
-    current_config = get_current_config()
-    messages = [
-        {"role": "system", "content": current_config.REASONING_FALSE},
-        {"role": "user", "content": full_prompt}
-    ]
-    response = client.chat.completions.create(
-        model=current_config.MODEL_NAME,
-        messages=messages,
-        temperature=current_config.CODE_GENERATION_TEMPERATURE,
-        max_tokens=current_config.CODE_GENERATION_MAX_TOKENS
-    )
-    full_response = response.choices[0].message.content
-    code = extract_first_code_block(full_response)
-    return code, should_plot, ""
-# === ExecutionAgent ====================================================
-def ExecutionAgent(code: str, df: pd.DataFrame, should_plot: bool):
-    """Executes the generated code in a controlled environment and returns the result or error message."""
-    # Set up execution environment with all necessary modules
-    env = {
-        "pd": pd,
-        "df": df
-    }
-    if should_plot:
-        plt.rcParams["figure.dpi"] = DEFAULT_DPI  # Set default DPI for all figures
-        env["plt"] = plt
-        env["io"] = io
-    try:
-        # Execute the code in the environment
-        exec(code, {}, env)
-        result = env.get("result", None)
-        # If no result was assigned, return the last expression
-        if result is None:
-            # Try to get the last executed expression
-            if "result" not in env:
-                return "No result was assigned to 'result' variable"
-        return result
-    except Exception as exc:
-        return f"Error executing code: {exc}"
-# === ReasoningCurator TOOL =========================================
-def ReasoningCurator(query: str, result: Any) -> str:
-    """Builds and returns the LLM prompt for reasoning about the result."""
-    is_error = isinstance(result, str) and result.startswith("Error executing code")
-    is_plot = isinstance(result, (plt.Figure, plt.Axes))
-    if is_error:
-        desc = result
-    elif is_plot:
-        title = ""
-        if isinstance(result, plt.Figure):
-            title = result._suptitle.get_text() if result._suptitle else ""
-        elif isinstance(result, plt.Axes):
-            title = result.get_title()
-        desc = f"[Plot Object: {title or 'Chart'}]"
-    else:
-        desc = str(result)[:MAX_RESULT_DISPLAY_LENGTH]
-    if is_plot:
-        prompt = f'''
-        The user asked: "{query}".
-        Below is a description of the plot result:
-        {desc}
-        Explain in 2–3 concise sentences what the chart shows (no code talk).'''
-    else:
-        prompt = f'''
-        The user asked: "{query}".
-        The result value is: {desc}
-        Explain in 2–3 concise sentences what this tells about the data (no mention of charts).'''
-    return prompt
-# === ReasoningAgent (streaming) =========================================
-def ReasoningAgent(query: str, result: Any):
-    """Streams the LLM's reasoning about the result (plot or value) and extracts model 'thinking' and final explanation."""
-    current_config = get_current_config()
-    prompt = ReasoningCurator(query, result)
-    # Streaming LLM call
-    response = client.chat.completions.create(
-        model=current_config.MODEL_NAME,
-        messages=[
-            {"role": "system", "content": current_config.REASONING_TRUE},
-            {"role": "user", "content": "You are an insightful data analyst. " + prompt}
-        ],
-        temperature=current_config.REASONING_TEMPERATURE,
-        max_tokens=current_config.REASONING_MAX_TOKENS,
-        stream=True
-    )
-    # Stream and display thinking
-    thinking_placeholder = st.empty()
-    full_response = ""
-    thinking_content = ""
-    in_think = False
-    for chunk in response:
-        if chunk.choices[0].delta.content is not None:
-            token = chunk.choices[0].delta.content
-            full_response += token
-            # Simple state machine to extract <think>...</think> as it streams
-            if "<think>" in token:
-                in_think = True
-                token = token.split("<think>", 1)[1]
-            if "</think>" in token:
-                token = token.split("</think>", 1)[0]
-                in_think = False
-            if in_think or ("<think>" in full_response and not "</think>" in full_response):
-                thinking_content += token
-                thinking_placeholder.markdown(
-                    f'<details class="thinking" open><summary>🤔 Model Thinking</summary><pre>{thinking_content}</pre></details>',
-                    unsafe_allow_html=True
-                )
-    # After streaming, extract final reasoning (outside <think>...</think>)
-    cleaned = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL).strip()
-    return thinking_content, cleaned
-# === DataFrameSummary TOOL (pandas only) =========================================
-def DataFrameSummaryTool(df: pd.DataFrame) -> str:
-    """Generate a summary prompt string for the LLM based on the DataFrame."""
-    prompt = f"""
-        Given a dataset with {len(df)} rows and {len(df.columns)} columns:
-        Columns: {', '.join(df.columns)}
-        Data types: {df.dtypes.to_dict()}
-        Missing values: {df.isnull().sum().to_dict()}
-        Provide:
-        1. A brief description of what this dataset contains
-        2. 3-4 possible data analysis questions that could be explored
-        Keep it concise and focused."""
-    return prompt
-# === DataInsightAgent (upload-time only) ===============================
-def DataInsightAgent(df: pd.DataFrame) -> str:
-    """Uses the LLM to generate a brief summary and possible questions for the uploaded dataset."""
-    current_config = get_current_config()
-    prompt = DataFrameSummaryTool(df)
-    try:
-        response = client.chat.completions.create(
-            model=current_config.MODEL_NAME,
-            messages=[
-                {"role": "system", "content": current_config.REASONING_FALSE},
-                {"role": "user", "content": "You are a data analyst providing brief, focused insights. " + prompt}
-            ],
-            temperature=current_config.INSIGHTS_TEMPERATURE,
-            max_tokens=current_config.INSIGHTS_MAX_TOKENS
-        )
-        return response.choices[0].message.content
-    except Exception as exc:
-        raise Exception(f"Error generating dataset insights: {exc}")
-# === Helpers ===========================================================
-def extract_first_code_block(text: str) -> str:
-    """Extracts the first Python code block from a markdown-formatted string."""
-    start = text.find("```python")
-    if start == -1:
-        return ""
-    start += len("```python")
-    end = text.find("```", start)
-    if end == -1:
-        return ""
-    return text[start:end].strip()
-# === Main Streamlit App ===============================================
-def main():
-    st.set_page_config(layout="wide")
-    if "plots" not in st.session_state:
-        st.session_state.plots = []
-    if "current_model" not in st.session_state:
-        st.session_state.current_model = DEFAULT_MODEL
-    # Page logo at top right corner, large and clickable
-    st.markdown(
-        """
-        <div style='position: absolute; top: 20px; right: 30px; z-index: 999;'>
-            <a href='https://www.linkedin.com/in/thiresh-sidda/' target='_blank'>
-                <img src='https://ih1.redbubble.net/image.1849728168.3104/raf,360x360,075,t,fafafa:ca443f4786.jpg' alt='Logo' style='height:120px; border-radius:20px; box-shadow:0 2px 12px rgba(0,0,0,0.15);'>
-            </a>
-        </div>
-        """,
-        unsafe_allow_html=True
-    )
-    # Main title centered with large font and GIF
-    st.markdown(
-        """
-        <div style='display: flex; align-items: center; justify-content: center; margin-bottom: 30px;'>
-            <span style='color:#1976D2; font-weight:bold; font-size:3.5em; margin-right:30px;'>Data Analysis Agent</span>
-            <img src='https://cdn.dribbble.com/userupload/23161671/file/original-4c7894556285d8f223ab21fd10554fe4.gif' alt='GIF' style='height:120px;'>
-        </div>
-        """,
-        unsafe_allow_html=True
-    )
-    medium_blue = "#1976D2"  # Medium blue color
-    # Move left panel to sidebar
-    with st.sidebar:
-        st.markdown(f"<span style='color:{medium_blue}; font-weight:bold; font-size:1.5em;'>Insights Generator</span>", unsafe_allow_html=True)
-        available_models = list(MODEL_CONFIGS.keys())
-        model_display_names = {key: MODEL_CONFIGS[key].MODEL_PRINT_NAME for key in available_models}
-        selected_model = st.selectbox(
-            "Select Model",
-            options=available_models,
-            format_func=lambda x: model_display_names[x],
-            index=available_models.index(st.session_state.current_model)
-        )
-        display_config = MODEL_CONFIGS[selected_model]
-        file = st.file_uploader("Choose CSV", type=["csv"], key="csv_uploader")
-        # Update configuration if model changed
-        if selected_model != st.session_state.current_model:
-            st.session_state.current_model = selected_model
-            new_config = MODEL_CONFIGS[selected_model]
-            if "messages" in st.session_state:
-                st.session_state.messages = []
-            if "plots" in st.session_state:
-                st.session_state.plots = []
-            if "df" in st.session_state and file is not None:
-                with st.spinner("Generating dataset insights with new model …"):
-                    try:
-                        st.session_state.insights = DataInsightAgent(st.session_state.df)
-                        st.success(f"Insights updated with {new_config.MODEL_PRINT_NAME}")
-                    except Exception as e:
-                        st.error(f"Error updating insights: {str(e)}")
-                        if "insights" in st.session_state:
-                            del st.session_state.insights
-                st.rerun()
-        if not file and "df" in st.session_state and "current_file" in st.session_state:
-            del st.session_state.df
-            del st.session_state.current_file
-            if "insights" in st.session_state:
-                del st.session_state.insights
-            st.rerun()
-        if file:
-            if ("df" not in st.session_state) or (st.session_state.get("current_file") != file.name):
-                st.session_state.df = pd.read_csv(file)
-                st.session_state.current_file = file.name
-                st.session_state.messages = []
-                with st.spinner("Generating dataset insights …"):
-                    try:
-                        st.session_state.insights = DataInsightAgent(st.session_state.df)
-                    except Exception as e:
-                        st.error(f"Error generating insights: {str(e)}")
-            elif "insights" not in st.session_state:
-                with st.spinner("Generating dataset insights …"):
-                    try:
-                        st.session_state.insights = DataInsightAgent(st.session_state.df)
-                    except Exception as e:
-                        st.error(f"Error generating insights: {str(e)}")
-        if "df" in st.session_state:
-            st.markdown(f"<span style='color:{medium_blue}; font-weight:bold; font-size:1.2em;'>Your Dataset Insights</span>", unsafe_allow_html=True)
-            if "insights" in st.session_state and st.session_state.insights:
-                st.dataframe(st.session_state.df.head())
-                st.markdown(f"<span style='color:{medium_blue};'>{st.session_state.insights}</span>", unsafe_allow_html=True)
-                current_config_left = get_current_config()
-                #st.markdown(f"*<span style='color: grey; font-style: italic;'>Generated with {current_config_left.MODEL_PRINT_NAME}</span>*", unsafe_allow_html=True)
-            else:
-                st.warning("No insights available.")
-        else:
-            st.info("Upload a CSV to begin chatting with your data.")
-    with st.container():
-        st.markdown(
-            f"""
-            <div style='display: flex; align-items: center; justify-content: flex-start; margin-bottom: 10px;'>
-                <span style='color:{medium_blue}; font-weight:bold; font-size:2em; margin-right:20px;'>Chat with your data</span>
-                <img src='https://i.pinimg.com/originals/5f/d5/58/5fd558f8b7a4f9e2138709cbe63c7052.gif' alt='Chat GIF' style='height:48px;'>
-            </div>
-            """,
-            unsafe_allow_html=True
-        )
-        if "df" in st.session_state:
-            current_config_right = get_current_config()
-            st.markdown(f"*<span style='color: grey; font-style: italic;'>Using {current_config_right.MODEL_PRINT_NAME}</span>*", unsafe_allow_html=True)
-        if "messages" not in st.session_state:
-            st.session_state.messages = []
-        clear_col1, clear_col2 = st.columns([9,1])
-        with clear_col2:
-            if st.button("Clear chat"):
-                st.session_state.messages = []
-                st.session_state.plots = []
-                st.rerun()
-        for msg in st.session_state.messages:
-            with st.chat_message(msg["role"]):
-                st.markdown(f"<span style='color:{medium_blue}; font-size:1.1em;'>{msg['content']}</span>", unsafe_allow_html=True)
-                if msg.get("plot_index") is not None:
-                    idx = msg["plot_index"]
-                    if 0 <= idx < len(st.session_state.plots):
-                        st.pyplot(st.session_state.plots[idx], use_container_width=False)
-        if "df" in st.session_state:
-            if user_q := st.chat_input("Ask about your data…"):
-                st.session_state.messages.append({"role": "user", "content": user_q})
-                with st.spinner("Working …"):
-                    recent_user_turns = [m["content"] for m in st.session_state.messages if m["role"] == "user"][-3:]
-                    context_text = "\n".join(recent_user_turns[:-1]) if len(recent_user_turns) > 1 else None
-                    code, should_plot_flag, code_thinking = CodeGenerationAgent(user_q, st.session_state.df, context_text)
-                    result_obj = ExecutionAgent(code, st.session_state.df, should_plot_flag)
-                    raw_thinking, reasoning_txt = ReasoningAgent(user_q, result_obj)
-                    reasoning_txt = reasoning_txt.replace("`", "")
-                is_plot = isinstance(result_obj, (plt.Figure, plt.Axes))
-                plot_idx = None
-                if is_plot:
-                    fig = result_obj.figure if isinstance(result_obj, plt.Axes) else result_obj
-                    st.session_state.plots.append(fig)
-                    plot_idx = len(st.session_state.plots) - 1
-                    header = "Here is the visualization you requested:"
-                elif isinstance(result_obj, (pd.DataFrame, pd.Series)):
-                    header = f"Result: {len(result_obj)} rows" if isinstance(result_obj, pd.DataFrame) else "Result series"
-                else:
-                    header = f"Result: {result_obj}"
-                thinking_html = ""
-                if raw_thinking:
-                    thinking_html = (
-                        '<details class="thinking">'
-                        '<summary>🧠 Reasoning</summary>'
-                        f'<pre>{raw_thinking}</pre>'
-                        '</details>'
-                    )
-                explanation_html = reasoning_txt
-                code_html = (
-                    '<details class="code">'
-                    '<summary>View code</summary>'
-                    '<pre><code class="language-python">'
-                    f'{code}'
-                    '</code></pre>'
-                    '</details>'
-                )
-                assistant_msg = f"{thinking_html}{explanation_html}\n\n{code_html}"
-                st.session_state.messages.append({
-                    "role": "assistant",
-                    "content": assistant_msg,
-                    "plot_index": plot_idx
-                })
-                st.rerun()
-if __name__ == "__main__":
     main()

+import os, io, re
+import pandas as pd
+import numpy as np
+import streamlit as st
+from openai import OpenAI
+import matplotlib.pyplot as plt
+from typing import List, Any, Optional
+# === Configuration ===
+# Global configuration
+API_BASE_URL = "https://integrate.api.nvidia.com/v1"
+API_KEY = os.environ.get("NVIDIA_API_KEY")
+# Plot configuration
+DEFAULT_FIGSIZE = (6, 4)
+DEFAULT_DPI = 100
+# Display configuration
+MAX_RESULT_DISPLAY_LENGTH = 300
+class ModelConfig:
+    """Configuration class for different models."""
+    def __init__(self, model_name: str, model_url: str, model_print_name: str,
+                 # QueryUnderstandingTool parameters
+                 query_understanding_temperature: float = 0.1,
+                 query_understanding_max_tokens: int = 5,
+                 # CodeGenerationAgent parameters
+                 code_generation_temperature: float = 0.2,
+                 code_generation_max_tokens: int = 1024,
+                 # ReasoningAgent parameters
+                 reasoning_temperature: float = 0.2,
+                 reasoning_max_tokens: int = 1024,
+                 # DataInsightAgent parameters
+                 insights_temperature: float = 0.2,
+                 insights_max_tokens: int = 512,
+                 reasoning_false: str = "detailed thinking off",
+                 reasoning_true: str = "detailed thinking on"):
+        self.MODEL_NAME = model_name
+        self.MODEL_URL = model_url
+        self.MODEL_PRINT_NAME = model_print_name
+        # Function-specific LLM parameters
+        self.QUERY_UNDERSTANDING_TEMPERATURE = query_understanding_temperature
+        self.QUERY_UNDERSTANDING_MAX_TOKENS = query_understanding_max_tokens
+        self.CODE_GENERATION_TEMPERATURE = code_generation_temperature
+        self.CODE_GENERATION_MAX_TOKENS = code_generation_max_tokens
+        self.REASONING_TEMPERATURE = reasoning_temperature
+        self.REASONING_MAX_TOKENS = reasoning_max_tokens
+        self.INSIGHTS_TEMPERATURE = insights_temperature
+        self.INSIGHTS_MAX_TOKENS = insights_max_tokens
+        self.REASONING_FALSE = reasoning_false
+        self.REASONING_TRUE = reasoning_true
+# Predefined model configurations
+MODEL_CONFIGS = {
+    "llama-3-1-nemotron-ultra-v1": ModelConfig(
+        model_name="nvidia/llama-3.1-nemotron-ultra-253b-v1",
+        model_url="https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1",
+        model_print_name="NVIDIA Llama 3.1 Nemotron Ultra 253B v1",
+        # QueryUnderstandingTool
+        query_understanding_temperature=0.1,
+        query_understanding_max_tokens=5,
+        # CodeGenerationAgent
+        code_generation_temperature=0.2,
+        code_generation_max_tokens=1024,
+        # ReasoningAgent
+        reasoning_temperature=0.6,
+        reasoning_max_tokens=1024,
+        # DataInsightAgent
+        insights_temperature=0.2,
+        insights_max_tokens=512,
+        reasoning_false="detailed thinking off",
+        reasoning_true="detailed thinking on"
+    ),
+    "llama-3-3-nemotron-super-v1-5": ModelConfig(
+        model_name="nvidia/llama-3.3-nemotron-super-49b-v1.5",
+        model_url="https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1_5",
+        model_print_name="NVIDIA Llama 3.3 Nemotron Super 49B v1.5",
+        # QueryUnderstandingTool
+        query_understanding_temperature=0.1,
+        query_understanding_max_tokens=5,
+        # CodeGenerationAgent
+        code_generation_temperature=0.0,
+        code_generation_max_tokens=1024,
+        # ReasoningAgent
+        reasoning_temperature=0.6,
+        reasoning_max_tokens=2048,
+        # DataInsightAgent
+        insights_temperature=0.2,
+        insights_max_tokens=512,
+        reasoning_false="/no_think",
+        reasoning_true=""
+    )
+}
+# Default configuration (can be changed via environment variable or UI)
+DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "llama-3-1-nemotron-ultra-v1")
+Config = MODEL_CONFIGS.get(DEFAULT_MODEL, MODEL_CONFIGS["llama-3-1-nemotron-ultra-v1"])
+# Initialize OpenAI client with configuration
+client = OpenAI(
+    base_url=API_BASE_URL,
+    api_key=API_KEY
+)
+def get_current_config():
+    """Get the current model configuration based on session state."""
+    # Always return the current model from session state
+    if "current_model" in st.session_state:
+        return MODEL_CONFIGS[st.session_state.current_model]
+    return MODEL_CONFIGS[DEFAULT_MODEL]
+# ------------------  QueryUnderstandingTool ---------------------------
+def QueryUnderstandingTool(query: str) -> bool:
+    """Return True if the query seems to request a visualisation based on keywords."""
+    # Use LLM to understand intent instead of keyword matching
+    current_config = get_current_config()
+    # Prepend the instruction to the query
+    full_prompt = f"""You are a query classifier. Your task is to determine if a user query is requesting a data visualization.
+IMPORTANT: Respond with ONLY 'true' or 'false' (lowercase, no quotes, no punctuation).
+Classify as 'true' ONLY if the query explicitly asks for:
+- A plot, chart, graph, visualization, or figure
+- To "show" or "display" data visually
+- To "create" or "generate" a visual representation
+- Words like: plot, chart, graph, visualize, show, display, create, generate, draw
+Classify as 'false' for:
+- Data analysis without visualization requests
+- Statistical calculations, aggregations, filtering, sorting
+- Questions about data content, counts, summaries
+- Requests for tables, dataframes, or text results
+User query: {query}"""
+    messages = [
+        {"role": "system", "content": current_config.REASONING_FALSE},
+        {"role": "user", "content": full_prompt}
+    ]
+    response = client.chat.completions.create(
+        model=current_config.MODEL_NAME,
+        messages=messages,
+        temperature=current_config.QUERY_UNDERSTANDING_TEMPERATURE,
+        max_tokens=current_config.QUERY_UNDERSTANDING_MAX_TOKENS  # We only need a short response
+    )
+    # Extract the response and convert to boolean
+    intent_response = response.choices[0].message.content.strip().lower()
+    return intent_response == "true"
+# === CodeGeneration TOOLS ============================================
+# ------------------  CodeWritingTool ---------------------------------
+def CodeWritingTool(cols: List[str], query: str) -> str:
+    """Generate a prompt for the LLM to write pandas-only code for a data query (no plotting)."""
+    return f"""
+    Given DataFrame `df` with columns:
+    {', '.join(cols)}
+    Write Python code (pandas **only**, no plotting) to answer:
+    "{query}"
+    Rules
+    -----
+    1. Use pandas operations on `df` only.
+    2. Rely only on the columns in the DataFrame.
+    3. Assign the final result to `result`.
+    4. Return your answer inside a single markdown fence that starts with ```python and ends with ```.
+    5. Do not include any explanations, comments, or prose outside the code block.
+    6. Use **df** as the sole data source. **Do not** read files, fetch data, or use Streamlit.
+    7. Do **not** import any libraries (pandas is already imported as pd).
+    8. Handle missing values (`dropna`) before aggregations.
+    Example
+    -----
+    ```python
+    result = df.groupby("some_column")["a_numeric_col"].mean().sort_values(ascending=False)
+    ```
+    """
+# ------------------  PlotCodeGeneratorTool ---------------------------
+def PlotCodeGeneratorTool(cols: List[str], query: str) -> str:
+    """Generate a prompt for the LLM to write pandas + matplotlib code for a plot based on the query and columns."""
+    return f"""
+    Given DataFrame `df` with columns:
+    {', '.join(cols)}
+    Write Python code using pandas **and matplotlib** (as plt) to answer:
+    "{query}"
+    Rules
+    -----
+    1. Use pandas for data manipulation and matplotlib.pyplot (as plt) for plotting.
+    2. Rely only on the columns in the DataFrame.
+    3. Assign the final result (DataFrame, Series, scalar *or* matplotlib Figure) to a variable named `result`.
+    4. Create only ONE relevant plot. Set `figsize={DEFAULT_FIGSIZE}`, add title/labels.
+    5. Return your answer inside a single markdown fence that starts with ```python and ends with ```.
+    6. Do not include any explanations, comments, or prose outside the code block.
+    7. Handle missing values (`dropna`) before plotting/aggregations.
+    """
+# === CodeGenerationAgent ==============================================
+def CodeGenerationAgent(query: str, df: pd.DataFrame, chat_context: Optional[str] = None):
+    """Selects the appropriate code generation tool and gets code from the LLM for the user's query."""
+    should_plot = QueryUnderstandingTool(query)
+    prompt = PlotCodeGeneratorTool(df.columns.tolist(), query) if should_plot else CodeWritingTool(df.columns.tolist(), query)
+    # Prepend the instruction to the query
+    context_section = f"\nConversation context (recent user turns):\n{chat_context}\n" if chat_context else ""
+    full_prompt = f"""You are a senior Python data analyst who writes clean, efficient code.
+    Solve the given problem with optimal pandas operations. Be concise and focused.
+    Your response must contain ONLY a properly-closed ```python code block with no explanations before or after (starts with ```python and ends with ```).
+    Ensure your solution is correct, handles edge cases, and follows best practices for data analysis.
+    If the latest user request references prior results ambiguously (e.g., "it", "that", "same groups"), infer intent from the conversation context and choose the most reasonable interpretation. {context_section}{prompt}"""
+    current_config = get_current_config()
+    messages = [
+        {"role": "system", "content": current_config.REASONING_FALSE},
+        {"role": "user", "content": full_prompt}
+    ]
+    response = client.chat.completions.create(
+        model=current_config.MODEL_NAME,
+        messages=messages,
+        temperature=current_config.CODE_GENERATION_TEMPERATURE,
+        max_tokens=current_config.CODE_GENERATION_MAX_TOKENS
+    )
+    full_response = response.choices[0].message.content
+    code = extract_first_code_block(full_response)
+    return code, should_plot, ""
+# === ExecutionAgent ====================================================
+def ExecutionAgent(code: str, df: pd.DataFrame, should_plot: bool):
+    """Executes the generated code in a controlled environment and returns the result or error message."""
+    # Set up execution environment with all necessary modules
+    env = {
+        "pd": pd,
+        "df": df
+    }
+    if should_plot:
+        plt.rcParams["figure.dpi"] = DEFAULT_DPI  # Set default DPI for all figures
+        env["plt"] = plt
+        env["io"] = io
+    try:
+        # Execute the code in the environment
+        exec(code, {}, env)
+        result = env.get("result", None)
+        # If no result was assigned, return the last expression
+        if result is None:
+            # Try to get the last executed expression
+            if "result" not in env:
+                return "No result was assigned to 'result' variable"
+        return result
+    except Exception as exc:
+        return f"Error executing code: {exc}"
+# === ReasoningCurator TOOL =========================================
+def ReasoningCurator(query: str, result: Any) -> str:
+    """Builds and returns the LLM prompt for reasoning about the result."""
+    is_error = isinstance(result, str) and result.startswith("Error executing code")
+    is_plot = isinstance(result, (plt.Figure, plt.Axes))
+    if is_error:
+        desc = result
+    elif is_plot:
+        title = ""
+        if isinstance(result, plt.Figure):
+            title = result._suptitle.get_text() if result._suptitle else ""
+        elif isinstance(result, plt.Axes):
+            title = result.get_title()
+        desc = f"[Plot Object: {title or 'Chart'}]"
+    else:
+        desc = str(result)[:MAX_RESULT_DISPLAY_LENGTH]
+    if is_plot:
+        prompt = f'''
+        The user asked: "{query}".
+        Below is a description of the plot result:
+        {desc}
+        Explain in 2–3 concise sentences what the chart shows (no code talk).'''
+    else:
+        prompt = f'''
+        The user asked: "{query}".
+        The result value is: {desc}
+        Explain in 2–3 concise sentences what this tells about the data (no mention of charts).'''
+    return prompt
+# === ReasoningAgent (streaming) =========================================
+def ReasoningAgent(query: str, result: Any):
+    """Streams the LLM's reasoning about the result (plot or value) and extracts model 'thinking' and final explanation."""
+    current_config = get_current_config()
+    prompt = ReasoningCurator(query, result)
+    # Streaming LLM call
+    response = client.chat.completions.create(
+        model=current_config.MODEL_NAME,
+        messages=[
+            {"role": "system", "content": current_config.REASONING_TRUE},
+            {"role": "user", "content": "You are an insightful data analyst. " + prompt}
+        ],
+        temperature=current_config.REASONING_TEMPERATURE,
+        max_tokens=current_config.REASONING_MAX_TOKENS,
+        stream=True
+    )
+    # Stream and display thinking
+    thinking_placeholder = st.empty()
+    full_response = ""
+    thinking_content = ""
+    in_think = False
+    for chunk in response:
+        if chunk.choices[0].delta.content is not None:
+            token = chunk.choices[0].delta.content
+            full_response += token
+            # Simple state machine to extract <think>...</think> as it streams
+            if "<think>" in token:
+                in_think = True
+                token = token.split("<think>", 1)[1]
+            if "</think>" in token:
+                token = token.split("</think>", 1)[0]
+                in_think = False
+            if in_think or ("<think>" in full_response and not "</think>" in full_response):
+                thinking_content += token
+                thinking_placeholder.markdown(
+                    f'<details class="thinking" open><summary>🤔 Model Thinking</summary><pre>{thinking_content}</pre></details>',
+                    unsafe_allow_html=True
+                )
+    # After streaming, extract final reasoning (outside <think>...</think>)
+    cleaned = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL).strip()
+    return thinking_content, cleaned
+# === DataFrameSummary TOOL (pandas only) =========================================
+def DataFrameSummaryTool(df: pd.DataFrame) -> str:
+    """Generate a summary prompt string for the LLM based on the DataFrame."""
+    prompt = f"""
+        Given a dataset with {len(df)} rows and {len(df.columns)} columns:
+        Columns: {', '.join(df.columns)}
+        Data types: {df.dtypes.to_dict()}
+        Missing values: {df.isnull().sum().to_dict()}
+        Provide:
+        1. A brief description of what this dataset contains
+        2. 3-4 possible data analysis questions that could be explored
+        Keep it concise and focused."""
+    return prompt
+# === DataInsightAgent (upload-time only) ===============================
+def DataInsightAgent(df: pd.DataFrame) -> str:
+    """Uses the LLM to generate a brief summary and possible questions for the uploaded dataset."""
+    current_config = get_current_config()
+    prompt = DataFrameSummaryTool(df)
+    try:
+        response = client.chat.completions.create(
+            model=current_config.MODEL_NAME,
+            messages=[
+                {"role": "system", "content": current_config.REASONING_FALSE},
+                {"role": "user", "content": "You are a data analyst providing brief, focused insights. " + prompt}
+            ],
+            temperature=current_config.INSIGHTS_TEMPERATURE,
+            max_tokens=current_config.INSIGHTS_MAX_TOKENS
+        )
+        return response.choices[0].message.content
+    except Exception as exc:
+        raise Exception(f"Error generating dataset insights: {exc}")
+# === Helpers ===========================================================
+def extract_first_code_block(text: str) -> str:
+    """Extracts the first Python code block from a markdown-formatted string."""
+    start = text.find("```python")
+    if start == -1:
+        return ""
+    start += len("```python")
+    end = text.find("```", start)
+    if end == -1:
+        return ""
+    return text[start:end].strip()
+# === Main Streamlit App ===============================================
+def main():
+    st.set_page_config(layout="wide")
+    if "plots" not in st.session_state:
+        st.session_state.plots = []
+    if "current_model" not in st.session_state:
+        st.session_state.current_model = DEFAULT_MODEL
+    # Page logo at top right corner, large and clickable
+    st.markdown(
+        """
+        <div style='position: absolute; top: 20px; right: 30px; z-index: 999;'>
+            <a href='https://www.linkedin.com/in/thiresh-sidda/' target='_blank'>
+                <img src='https://ih1.redbubble.net/image.1849728168.3104/raf,360x360,075,t,fafafa:ca443f4786.jpg' alt='Logo' style='height:120px; border-radius:20px; box-shadow:0 2px 12px rgba(0,0,0,0.15);'>
+            </a>
+        </div>
+        """,
+        unsafe_allow_html=True
+    )
+    # Main title centered with large font and GIF
+    st.markdown(
+        """
+        <div style='display: flex; align-items: center; justify-content: center; margin-bottom: 30px;'>
+            <span style='color:#1976D2; font-weight:bold; font-size:3.5em; margin-right:30px;'>Data Analysis Agent</span>
+            <img src='https://cdn.dribbble.com/userupload/23161671/file/original-4c7894556285d8f223ab21fd10554fe4.gif' alt='GIF' style='height:120px;'>
+        </div>
+        """,
+        unsafe_allow_html=True
+    )
+    medium_blue = "#1976D2"  # Medium blue color
+    # Move left panel to sidebar
+    with st.sidebar:
+        st.markdown(f"<span style='color:{medium_blue}; font-weight:bold; font-size:1.5em;'>Insights Generator</span>", unsafe_allow_html=True)
+        available_models = list(MODEL_CONFIGS.keys())
+        model_display_names = {key: MODEL_CONFIGS[key].MODEL_PRINT_NAME for key in available_models}
+        selected_model = st.selectbox(
+            "Select Model",
+            options=available_models,
+            format_func=lambda x: model_display_names[x],
+            index=available_models.index(st.session_state.current_model)
+        )
+        display_config = MODEL_CONFIGS[selected_model]
+        file = st.file_uploader("Choose CSV", type=["csv"], key="csv_uploader")
+        # Update configuration if model changed
+        if selected_model != st.session_state.current_model:
+            st.session_state.current_model = selected_model
+            new_config = MODEL_CONFIGS[selected_model]
+            if "messages" in st.session_state:
+                st.session_state.messages = []
+            if "plots" in st.session_state:
+                st.session_state.plots = []
+            if "df" in st.session_state and file is not None:
+                with st.spinner("Generating dataset insights with new model …"):
+                    try:
+                        st.session_state.insights = DataInsightAgent(st.session_state.df)
+                        st.success(f"Insights updated with {new_config.MODEL_PRINT_NAME}")
+                    except Exception as e:
+                        st.error(f"Error updating insights: {str(e)}")
+                        if "insights" in st.session_state:
+                            del st.session_state.insights
+                st.rerun()
+        if not file and "df" in st.session_state and "current_file" in st.session_state:
+            del st.session_state.df
+            del st.session_state.current_file
+            if "insights" in st.session_state:
+                del st.session_state.insights
+            st.rerun()
+        if file:
+            if ("df" not in st.session_state) or (st.session_state.get("current_file") != file.name):
+                st.session_state.df = pd.read_csv(file)
+                st.session_state.current_file = file.name
+                st.session_state.messages = []
+                with st.spinner("Generating dataset insights …"):
+                    try:
+                        st.session_state.insights = DataInsightAgent(st.session_state.df)
+                    except Exception as e:
+                        st.error(f"Error generating insights: {str(e)}")
+            elif "insights" not in st.session_state:
+                with st.spinner("Generating dataset insights …"):
+                    try:
+                        st.session_state.insights = DataInsightAgent(st.session_state.df)
+                    except Exception as e:
+                        st.error(f"Error generating insights: {str(e)}")
+        if "df" in st.session_state:
+            st.markdown(f"<span style='color:{medium_blue}; font-weight:bold; font-size:1.2em;'>Your Dataset Insights</span>", unsafe_allow_html=True)
+            if "insights" in st.session_state and st.session_state.insights:
+                st.dataframe(st.session_state.df.head())
+                st.markdown(f"<span style='color:{medium_blue};'>{st.session_state.insights}</span>", unsafe_allow_html=True)
+                current_config_left = get_current_config()
+                #st.markdown(f"*<span style='color: grey; font-style: italic;'>Generated with {current_config_left.MODEL_PRINT_NAME}</span>*", unsafe_allow_html=True)
+            else:
+                st.warning("No insights available.")
+        else:
+            st.info("Upload a CSV to begin chatting with your data.")
+    with st.container():
+        st.markdown(
+            f"""
+            <div style='display: flex; align-items: center; justify-content: flex-start; margin-bottom: 10px;'>
+                <span style='color:{medium_blue}; font-weight:bold; font-size:2em; margin-right:20px;'>Chat with your data</span>
+                <img src='https://i.pinimg.com/originals/5f/d5/58/5fd558f8b7a4f9e2138709cbe63c7052.gif' alt='Chat GIF' style='height:48px;'>
+            </div>
+            """,
+            unsafe_allow_html=True
+        )
+        if "df" in st.session_state:
+            current_config_right = get_current_config()
+            st.markdown(f"*<span style='color: grey; font-style: italic;'>Using {current_config_right.MODEL_PRINT_NAME}</span>*", unsafe_allow_html=True)
+        if "messages" not in st.session_state:
+            st.session_state.messages = []
+        clear_col1, clear_col2 = st.columns([9,1])
+        with clear_col2:
+            if st.button("Clear chat"):
+                st.session_state.messages = []
+                st.session_state.plots = []
+                st.rerun()
+        for msg in st.session_state.messages:
+            with st.chat_message(msg["role"]):
+                st.markdown(f"<span style='color:{medium_blue}; font-size:1.1em;'>{msg['content']}</span>", unsafe_allow_html=True)
+                if msg.get("plot_index") is not None:
+                    idx = msg["plot_index"]
+                    if 0 <= idx < len(st.session_state.plots):
+                        st.pyplot(st.session_state.plots[idx], use_container_width=False)
+        if "df" in st.session_state:
+            if user_q := st.chat_input("Ask about your data…"):
+                st.session_state.messages.append({"role": "user", "content": user_q})
+                with st.spinner("Working …"):
+                    recent_user_turns = [m["content"] for m in st.session_state.messages if m["role"] == "user"][-3:]
+                    context_text = "\n".join(recent_user_turns[:-1]) if len(recent_user_turns) > 1 else None
+                    code, should_plot_flag, code_thinking = CodeGenerationAgent(user_q, st.session_state.df, context_text)
+                    result_obj = ExecutionAgent(code, st.session_state.df, should_plot_flag)
+                    raw_thinking, reasoning_txt = ReasoningAgent(user_q, result_obj)
+                    reasoning_txt = reasoning_txt.replace("`", "")
+                is_plot = isinstance(result_obj, (plt.Figure, plt.Axes))
+                plot_idx = None
+                if is_plot:
+                    fig = result_obj.figure if isinstance(result_obj, plt.Axes) else result_obj
+                    st.session_state.plots.append(fig)
+                    plot_idx = len(st.session_state.plots) - 1
+                    header = "Here is the visualization you requested:"
+                elif isinstance(result_obj, (pd.DataFrame, pd.Series)):
+                    header = f"Result: {len(result_obj)} rows" if isinstance(result_obj, pd.DataFrame) else "Result series"
+                else:
+                    header = f"Result: {result_obj}"
+                thinking_html = ""
+                if raw_thinking:
+                    thinking_html = (
+                        '<details class="thinking">'
+                        '<summary>🧠 Reasoning</summary>'
+                        f'<pre>{raw_thinking}</pre>'
+                        '</details>'
+                    )
+                explanation_html = reasoning_txt
+                code_html = (
+                    '<details class="code">'
+                    '<summary>View code</summary>'
+                    '<pre><code class="language-python">'
+                    f'{code}'
+                    '</code></pre>'
+                    '</details>'
+                )
+                assistant_msg = f"{thinking_html}{explanation_html}\n\n{code_html}"
+                st.session_state.messages.append({
+                    "role": "assistant",
+                    "content": assistant_msg,
+                    "plot_index": plot_idx
+                })
+                st.rerun()
+if __name__ == "__main__":
     main()