import re
import json
import sys
import contextlib
from io import StringIO
import time
import logging
from src.utils.logger import Logger
import textwrap
logger = Logger(__name__, level="INFO", see_time=False, console_log=False)
@contextlib.contextmanager
def stdoutIO(stdout=None):
    """Temporarily redirect sys.stdout so output from exec'd code can be captured."""
    old = sys.stdout
    if stdout is None:
        stdout = StringIO()
    sys.stdout = stdout
    try:
        yield stdout
    finally:
        # Restore stdout even if the executed code raises, so later prints
        # don't keep writing into the dead buffer.
        sys.stdout = old
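# Illustrative usage (a minimal sketch, not executed on import): everything
# printed inside the block lands in the StringIO buffer instead of real stdout.
#
#   with stdoutIO() as s:
#       exec("print(2 + 2)")
#   s.getvalue()  # -> "4\n"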
# Precompile regex patterns for better performance
SENSITIVE_MODULES = re.compile(r"(os|sys|subprocess|dotenv|requests|http|socket|smtplib|ftplib|telnetlib|paramiko)")
IMPORT_PATTERN = re.compile(r"^\s*import\s+(" + SENSITIVE_MODULES.pattern + r").*?(\n|$)", re.MULTILINE)
FROM_IMPORT_PATTERN = re.compile(r"^\s*from\s+(" + SENSITIVE_MODULES.pattern + r").*?(\n|$)", re.MULTILINE)
DYNAMIC_IMPORT_PATTERN = re.compile(r"__import__\s*\(\s*['\"](" + SENSITIVE_MODULES.pattern + r")['\"].*?\)")
ENV_ACCESS_PATTERN = re.compile(r"(os\.getenv|os\.environ|load_dotenv|\.__import__\s*\(\s*['\"]os['\"].*?\.environ)")
FILE_ACCESS_PATTERN = re.compile(r"(open\(|read\(|write\(|file\(|with\s+open)")
# Enhanced API key detection patterns
API_KEY_PATTERNS = [
    # Direct key assignments
    re.compile(r"(?i)(api_?key|access_?token|secret_?key|auth_?token|password|credential|secret)s?\s*=\s*[\"\'][\w\-\+\/\=]{8,}[\"\']"),
    # Function calls with keys
    re.compile(r"(?i)\.set_api_key\(\s*[\"\'][\w\-\+\/\=]{8,}[\"\']"),
    # Dictionary assignments
    re.compile(r"(?i)['\"](?:api_?key|access_?token|secret_?key|auth_?token|password|credential|secret)['\"](?:\s*:\s*)[\"\'][\w\-\+\/\=]{8,}[\"\']"),
    # Common key formats (base64-like, hex)
    re.compile(r"[\"\'](?:[A-Za-z0-9\+\/\=]{32,}|[0-9a-fA-F]{32,})[\"\']"),
    # Bearer token pattern
    re.compile(r"[\"\'](Bearer\s+[\w\-\+\/\=]{8,})[\"\']"),
    # Inline URL with auth
    re.compile(r"https?:\/\/[\w\-\+\/\=]{8,}@")
]
# Network request patterns
NETWORK_REQUEST_PATTERNS = re.compile(r"(requests\.|urllib\.|http\.|\.post\(|\.get\(|\.connect\()")
def check_security_concerns(code_str):
    """Check code for security concerns and return info about what was found"""
    security_concerns = {
        "has_concern": False,
        "messages": [],
        "blocked_imports": False,
        "blocked_dynamic_imports": False,
        "blocked_env_access": False,
        "blocked_file_access": False,
        "blocked_api_keys": False,
        "blocked_network": False
    }

    # Check for sensitive imports
    if IMPORT_PATTERN.search(code_str) or FROM_IMPORT_PATTERN.search(code_str):
        security_concerns["has_concern"] = True
        security_concerns["blocked_imports"] = True
        security_concerns["messages"].append("Sensitive module imports blocked")

    # Check for __import__ bypass technique
    if DYNAMIC_IMPORT_PATTERN.search(code_str):
        security_concerns["has_concern"] = True
        security_concerns["blocked_dynamic_imports"] = True
        security_concerns["messages"].append("Dynamic import of sensitive modules blocked")

    # Check for environment variables access
    if ENV_ACCESS_PATTERN.search(code_str):
        security_concerns["has_concern"] = True
        security_concerns["blocked_env_access"] = True
        security_concerns["messages"].append("Environment variables access blocked")

    # Check for file operations
    if FILE_ACCESS_PATTERN.search(code_str):
        security_concerns["has_concern"] = True
        security_concerns["blocked_file_access"] = True
        security_concerns["messages"].append("File operations blocked")

    # Check for API key patterns
    for pattern in API_KEY_PATTERNS:
        if pattern.search(code_str):
            security_concerns["has_concern"] = True
            security_concerns["blocked_api_keys"] = True
            security_concerns["messages"].append("API key/token usage blocked")
            break

    # Check for network requests
    if NETWORK_REQUEST_PATTERNS.search(code_str):
        security_concerns["has_concern"] = True
        security_concerns["blocked_network"] = True
        security_concerns["messages"].append("Network requests blocked")

    return security_concerns
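# Illustrative usage (not executed on import):
#
#   concerns = check_security_concerns("import os\nprint(os.environ['KEY'])")
#   concerns["blocked_imports"]     # -> True
#   concerns["blocked_env_access"]  # -> True
#   concerns["messages"]            # -> ["Sensitive module imports blocked", ...]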
def clean_code_for_security(code_str, security_concerns):
    """Apply security modifications to the code based on detected concerns"""
    modified_code = code_str

    # Block sensitive imports if needed
    if security_concerns["blocked_imports"]:
        modified_code = IMPORT_PATTERN.sub(r'# BLOCKED: import \1\n', modified_code)
        modified_code = FROM_IMPORT_PATTERN.sub(r'# BLOCKED: from \1\n', modified_code)

    # Block dynamic imports if needed
    if security_concerns["blocked_dynamic_imports"]:
        modified_code = DYNAMIC_IMPORT_PATTERN.sub(r'"BLOCKED_DYNAMIC_IMPORT"', modified_code)

    # Block environment access if needed
    if security_concerns["blocked_env_access"]:
        modified_code = ENV_ACCESS_PATTERN.sub(r'"BLOCKED_ENV_ACCESS"', modified_code)

    # Block file operations if needed
    if security_concerns["blocked_file_access"]:
        modified_code = FILE_ACCESS_PATTERN.sub(r'"BLOCKED_FILE_ACCESS"', modified_code)

    # Block API keys if needed
    if security_concerns["blocked_api_keys"]:
        for pattern in API_KEY_PATTERNS:
            modified_code = pattern.sub(r'"BLOCKED_API_KEY"', modified_code)

    # Block network requests if needed
    if security_concerns["blocked_network"]:
        modified_code = NETWORK_REQUEST_PATTERNS.sub(r'"BLOCKED_NETWORK_REQUEST"', modified_code)

    # Add warning banner if needed
    if security_concerns["has_concern"]:
        security_message = "⚠️ SECURITY WARNING: " + ". ".join(security_concerns["messages"]) + "."
        modified_code = f"print('{security_message}')\n\n" + modified_code

    return modified_code
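# Illustrative usage (not executed on import): the two functions are meant to
# be used together before handing code to exec().
#
#   code = "import requests\nrequests.get('https://example.com')"
#   concerns = check_security_concerns(code)
#   safe_code = clean_code_for_security(code, concerns)
#   # safe_code now starts with a printed security warning, and the import and
#   # the network call are replaced with inert placeholder strings.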
def format_correlation_output(text):
    """Format correlation matrix output for better readability"""
    lines = text.split('\n')
    formatted_lines = []

    for line in lines:
        # Skip empty lines at the beginning
        if not line.strip() and not formatted_lines:
            continue
        if not line.strip():
            formatted_lines.append(line)
            continue

        # Check if this line contains correlation values or variable names
        stripped_line = line.strip()
        parts = stripped_line.split()

        if len(parts) > 1:
            # Check if this is a header line with variable names
            if all(part.replace('_', '').replace('-', '').isalpha() for part in parts):
                # This is a header row with variable names
                formatted_header = f"{'':12}"  # Empty first column for row labels
                for part in parts:
                    formatted_header += f"{part:>12}"
                formatted_lines.append(formatted_header)
            elif any(char.isdigit() for char in stripped_line) and ('.' in stripped_line or '-' in stripped_line):
                # This looks like a correlation line with numbers
                row_name = parts[0] if parts else ""
                values = parts[1:] if len(parts) > 1 else []
                formatted_row = f"{row_name:<12}"
                for value in values:
                    try:
                        val = float(value)
                        formatted_row += f"{val:>12.3f}"
                    except ValueError:
                        formatted_row += f"{value:>12}"
                formatted_lines.append(formatted_row)
            else:
                # Other lines (like titles)
                formatted_lines.append(line)
        else:
            formatted_lines.append(line)

    return '\n'.join(formatted_lines)
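# Illustrative usage (not executed on import): raw pandas corr() text is
# realigned into fixed-width columns, numbers rendered to three decimals.
#
#   raw = "       price  area\nprice  1.0    0.54\narea   0.54   1.0"
#   print(format_correlation_output(raw))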
def format_summary_stats(text):
    """Format summary statistics for better readability"""
    lines = text.split('\n')
    formatted_lines = []

    for line in lines:
        if not line.strip():
            formatted_lines.append(line)
            continue

        # Check if this is a header line with statistical terms only (missing first column)
        stripped_line = line.strip()
        if any(stat in stripped_line.lower() for stat in ['count', 'mean', 'median', 'std', 'min', 'max', '25%', '50%', '75%']):
            parts = stripped_line.split()
            # Check if this is a header row (starts with statistical terms)
            if parts and parts[0].lower() in ['count', 'mean', 'median', 'std', 'min', 'max', '25%', '50%', '75%']:
                # This is a header row - add proper spacing
                formatted_header = f"{'':12}"  # Empty first column for row labels
                for part in parts:
                    formatted_header += f"{part:>15}"
                formatted_lines.append(formatted_header)
            else:
                # This is a data row - format normally
                row_name = parts[0] if parts else ""
                values = parts[1:] if len(parts) > 1 else []
                formatted_row = f"{row_name:<12}"
                for value in values:
                    try:
                        if '.' in value or 'e' in value.lower():
                            val = float(value)
                            if abs(val) >= 1000000:
                                formatted_row += f"{val:>15.2e}"
                            elif abs(val) >= 1:
                                formatted_row += f"{val:>15.2f}"
                            else:
                                formatted_row += f"{val:>15.6f}"
                        else:
                            val = int(value)
                            formatted_row += f"{val:>15}"
                    except ValueError:
                        formatted_row += f"{value:>15}"
                formatted_lines.append(formatted_row)
        else:
            # Other lines (titles, etc.) - keep as is
            formatted_lines.append(line)

    return '\n'.join(formatted_lines)
def clean_print_statements(code_block):
    r"""
    Clean up print() statements that contain unwanted literal '\n' sequences,
    collapsing them into properly formatted calls without the stray newlines.
    """
    # This regex targets print statements, even if they have newlines inside
    return re.sub(r'print\((.*?)(\\n.*?)(.*?)\)', r'print(\1\3)', code_block, flags=re.DOTALL)
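# Illustrative usage (not executed on import): the literal backslash-n inside
# the print call is dropped, the rest of the arguments are preserved.
#
#   clean_print_statements('print("total:\\n", total)')
#   # -> 'print("total:", total)'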
def remove_code_block_from_summary(summary):
    # Use regex to remove fenced code blocks from the summary, then split into lines.
    # DOTALL is required so the pattern can span multi-line code blocks.
    summary = re.sub(r'```python\n(.*?)\n```', '', summary, flags=re.DOTALL)
    return summary.split("\n")
def remove_main_block(code):
    # Match the __main__ block
    pattern = r'(?m)^if\s+__name__\s*==\s*["\']__main__["\']\s*:\s*\n((?:\s+.*\n?)*)'
    match = re.search(pattern, code)
    if match:
        main_block = match.group(1)

        # Dedent the code block inside __main__
        dedented_block = textwrap.dedent(main_block)

        # Remove \n from any print statements in the block (also handling multiline print cases)
        dedented_block = clean_print_statements(dedented_block)

        # Replace the block in the code; use a function so backslashes in the
        # dedented code are not interpreted as regex escape sequences
        cleaned_code = re.sub(pattern, lambda m: dedented_block, code)

        # Optional: Remove leading newlines if any
        cleaned_code = cleaned_code.strip()

        return cleaned_code
    return code
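# Illustrative usage (not executed on import): the guard is stripped and its
# body promoted to module level so the code can run under exec().
#
#   src = 'x = 1\nif __name__ == "__main__":\n    print(x)\n'
#   remove_main_block(src)  # -> 'x = 1\nprint(x)'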
def format_code_block(code_str):
    code_clean = re.sub(r'^```python\n?', '', code_str, flags=re.MULTILINE)
    code_clean = re.sub(r'\n```$', '', code_clean)
    return f'\n{code_clean}\n'
def format_code_backticked_block(code_str):
    code_clean = re.sub(r'^```python\n?', '', code_str, flags=re.MULTILINE)
    code_clean = re.sub(r'\n```$', '', code_clean)

    # Remove reading the csv file if it's already in the context
    modified_code = re.sub(r"df\s*=\s*pd\.read_csv\([\"\'].*?[\"\']\).*?(\n|$)", '', code_clean)

    # Only match assignments at top level (not indented):
    # remove 'df = pd.DataFrame()' if it's at the top level
    modified_code = re.sub(
        r"^df\s*=\s*pd\.DataFrame\(\s*\)\s*(#.*)?$",
        '',
        modified_code,
        flags=re.MULTILINE
    )

    # Remove sample dataframe lines with multiple array values
    modified_code = re.sub(r"^# Sample DataFrames?.*?(\n|$)", '', modified_code, flags=re.MULTILINE | re.IGNORECASE)

    # Remove plt.show() statements
    modified_code = re.sub(r"plt\.show\(\).*?(\n|$)", '', modified_code)

    # Remove the __main__ guard so the code can run at top level
    code_clean = remove_main_block(modified_code)

    return f'```python\n{code_clean}\n```'
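# Illustrative usage (not executed on import): strips the fences, drops the
# redundant df setup line, and re-wraps the result for display.
#
#   block = '```python\ndf = pd.read_csv("data.csv")\nprint(df.shape)\n```'
#   format_code_backticked_block(block)
#   # -> '```python\nprint(df.shape)\n```'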
def execute_code_from_markdown(code_str, dataframe=None):
    import pandas as pd
    import plotly.express as px
    import plotly
    import plotly.graph_objects as go
    import matplotlib.pyplot as plt
    import seaborn as sns
    import numpy as np
    import re
    import traceback
    import sys
    from io import StringIO, BytesIO
    import base64

    # Check for security concerns in the code
    security_concerns = check_security_concerns(code_str)

    # Apply security modifications to the code
    modified_code = clean_code_for_security(code_str, security_concerns)

    # Enhanced print function that detects and formats tabular data
    captured_outputs = []
    original_print = print

    # Set pandas display options for full table display
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', 20)  # Limit to 20 rows instead of unlimited
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', 50)
    pd.set_option('display.expand_frame_repr', False)
    def enhanced_print(*args, **kwargs):
        # Convert all args to strings
        str_args = [str(arg) for arg in args]
        output_text = kwargs.get('sep', ' ').join(str_args)

        # Special case for DataFrames - use pipe delimiter and clean format
        # (check len(args) first so an empty print() doesn't raise IndexError)
        if len(args) == 1 and isinstance(args[0], pd.DataFrame):
            # Format DataFrame with pipe delimiter using to_csv for reliable column separation
            df = args[0]
            csv_buffer = StringIO()
            # Export to CSV with pipe delimiter, preserving index
            df.to_csv(csv_buffer, sep='|', index=True, float_format='%.6g')
            csv_output = csv_buffer.getvalue()

            # Clean up the CSV output - remove quotes and extra formatting
            lines = csv_output.strip().split('\n')
            cleaned_lines = []
            for line in lines:
                # Remove any quotes that might have been added by to_csv
                clean_line = line.replace('"', '')
                # Split by pipe, strip whitespace from each part, then rejoin
                parts = [part.strip() for part in clean_line.split('|')]
                cleaned_lines.append(' | '.join(parts))

            output_text = '\n'.join(cleaned_lines)
            captured_outputs.append(f"<TABLE_START>\n{output_text}\n<TABLE_END>")
            original_print(output_text)
            return
        # Detect if this looks like tabular data (generic approach)
        is_table = False
        lines = output_text.split('\n')

        # 1. Multiple lines with consistent spacing
        if len(lines) > 2:
            # Count lines that look like they have multiple columns (2+ spaces between words)
            multi_column_lines = sum(1 for line in lines if len(line.split()) > 1 and '  ' in line)
            if multi_column_lines >= 2:  # At least 2 lines with multiple columns
                is_table = True

        # Check for pandas DataFrame patterns: lines starting with an index
        # number followed by spaces
        if any(re.search(r'^\s*\d+\s+', line) for line in lines):
            is_table = True

        # Look for table-like structured output with multiple rows of similar format
        if len(lines) >= 3:
            # Sample a few lines to check for consistent structure
            sample_lines = [lines[i] for i in range(min(len(lines), 5)) if lines[i].strip()]
            # Check for consistent whitespace patterns
            if len(sample_lines) >= 2:
                # Get positions of whitespace groups in first line
                whitespace_positions = []
                for i, line in enumerate(sample_lines):
                    if not line.strip():
                        continue
                    positions = [m.start() for m in re.finditer(r'\s{2,}', line)]
                    if i == 0:
                        whitespace_positions = positions
                    elif len(positions) == len(whitespace_positions):
                        # Check if whitespace positions are roughly the same
                        is_similar = all(abs(pos - whitespace_positions[j]) <= 3
                                         for j, pos in enumerate(positions)
                                         if j < len(whitespace_positions))
                        if is_similar:
                            is_table = True

        # 2. Contains common table indicators
        if any(indicator in output_text.lower() for indicator in [
            'count', 'mean', 'std', 'min', 'max', '25%', '50%', '75%',  # Summary stats
            'correlation', 'corr',  # Correlation tables
            'coefficient', 'r-squared', 'p-value',  # Regression tables
        ]):
            is_table = True

        # 3. Has many decimal numbers (likely a data table)
        if output_text.count('.') > 5 and len(lines) > 2:
            is_table = True

        # If we have detected a table, convert space-delimited to pipe-delimited format
        if is_table:
            # Convert the table to pipe-delimited format for better parsing in frontend
            formatted_lines = []
            for line in lines:
                if not line.strip():
                    formatted_lines.append(line)  # Keep empty lines
                    continue
                # Split by multiple spaces and join with pipe delimiter
                parts = re.split(r'\s{2,}', line.strip())
                if parts:
                    formatted_lines.append(" | ".join(parts))
                else:
                    formatted_lines.append(line)

            # Use the pipe-delimited format and mark it for table processing in the UI
            output_text = "\n".join(formatted_lines)
            captured_outputs.append(f"<TABLE_START>\n{output_text}\n<TABLE_END>")
        else:
            captured_outputs.append(output_text)

        # Also use original print for stdout capture
        original_print(*args, **kwargs)
    # Custom matplotlib capture function
    def capture_matplotlib_chart():
        """Capture current matplotlib figure as base64 encoded image"""
        try:
            fig = plt.gcf()  # Get current figure
            if fig.get_axes():  # Check if figure has any plots
                buffer = BytesIO()
                fig.savefig(buffer, format='png', dpi=150, bbox_inches='tight',
                            facecolor='white', edgecolor='none')
                buffer.seek(0)
                img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
                buffer.close()
                plt.close(fig)  # Close the figure to free memory
                return img_base64
            return None
        except Exception:
            return None

    # Store original plt.show function
    original_plt_show = plt.show

    def custom_plt_show(*args, **kwargs):
        """Custom plt.show that captures the chart instead of displaying it"""
        img_base64 = capture_matplotlib_chart()
        if img_base64:
            matplotlib_outputs.append(img_base64)
        # Don't call original show to prevent display
    context = {
        'pd': pd,
        'px': px,
        'go': go,
        'plt': plt,
        'plotly': plotly,
        '__builtins__': __builtins__,
        '__import__': __import__,
        'sns': sns,
        'np': np,
        'json_outputs': [],  # List to store multiple Plotly JSON outputs
        'matplotlib_outputs': [],  # List to store matplotlib chart images as base64
        'print': enhanced_print  # Replace print with our enhanced version
    }

    # Alias matplotlib_outputs so the custom show function can append to it
    matplotlib_outputs = context['matplotlib_outputs']

    # Replace plt.show with our custom function
    plt.show = custom_plt_show

    # Rewrite fig.show() / fig.to_html() calls so Plotly figures are serialized
    # to JSON instead of being displayed
    modified_code = re.sub(
        r'(\w*_?)fig(\w*)\.show\(\)',
        r'json_outputs.append(plotly.io.to_json(\1fig\2, pretty=True))',
        modified_code
    )
    modified_code = re.sub(
        r'(\w*_?)fig(\w*)\.to_html\(.*?\)',
        r'json_outputs.append(plotly.io.to_json(\1fig\2, pretty=True))',
        modified_code
    )

    # Remove reading the csv file if it's already in the context
    modified_code = re.sub(r"df\s*=\s*pd\.read_csv\([\"\'].*?[\"\']\).*?(\n|$)", '', modified_code)

    # Only match assignments at top level (not indented):
    # remove 'df = pd.DataFrame()' if it's at the top level
    modified_code = re.sub(
        r"^df\s*=\s*pd\.DataFrame\(\s*\)\s*(#.*)?$",
        '',
        modified_code,
        flags=re.MULTILINE
    )
    # Custom display function for DataFrames to show head + tail for large datasets
    original_repr = pd.DataFrame.__repr__

    def custom_df_repr(self):
        if len(self) > 15:
            # For large DataFrames, show first 10 and last 5 rows
            head_part = self.head(10)
            tail_part = self.tail(5)
            head_str = head_part.__repr__()
            tail_str = tail_part.__repr__()
            # Extract just the data rows (skip the header from tail)
            tail_lines = tail_str.split('\n')
            tail_data = '\n'.join(tail_lines[1:])  # Skip header line
            return f"{head_str}\n...\n{tail_data}"
        else:
            return original_repr(self)

    # Apply custom representation temporarily
    pd.DataFrame.__repr__ = custom_df_repr
    # If a dataframe is provided, add it to the context
    if dataframe is not None:
        context['df'] = dataframe
        # Remove pd.read_csv() calls since the dataframe is already in the context
        modified_code = re.sub(r"pd\.read_csv\(\s*[\"\'].*?[\"\']\s*\)", '', modified_code)

    # Remove sample dataframe lines with multiple array values
    modified_code = re.sub(r"^# Sample DataFrames?.*?(\n|$)", '', modified_code, flags=re.MULTILINE | re.IGNORECASE)

    # Replace plt.savefig() calls with plt.show() to ensure plots are displayed;
    # plt.show() calls are kept and handled by our custom capture function
    modified_code = re.sub(r'plt\.savefig\([^)]*\)', 'plt.show()', modified_code)

    # Seaborn plots might not have an explicit plt.show(), so add one after
    # each seaborn plotting call
    seaborn_plot_functions = [
        'sns.scatterplot', 'sns.lineplot', 'sns.barplot', 'sns.boxplot', 'sns.violinplot',
        'sns.stripplot', 'sns.swarmplot', 'sns.pointplot', 'sns.catplot', 'sns.relplot',
        'sns.displot', 'sns.histplot', 'sns.kdeplot', 'sns.ecdfplot', 'sns.rugplot',
        'sns.distplot', 'sns.jointplot', 'sns.pairplot', 'sns.FacetGrid', 'sns.PairGrid',
        'sns.heatmap', 'sns.clustermap', 'sns.regplot', 'sns.lmplot', 'sns.residplot'
    ]
    # Add automatic plt.show() after seaborn plots if not already present
    for func in seaborn_plot_functions:
        pattern = rf'({re.escape(func)}\([^)]*\)(?:\.[^(]*\([^)]*\))*)'

        def add_show(match):
            plot_call = match.group(1)
            # Unconditionally append plt.show(); a duplicate call is harmless
            # because the custom show function only captures non-empty figures
            return f'{plot_call}\nplt.show()'

        modified_code = re.sub(pattern, add_show, modified_code)

    # Only add df = pd.read_csv() if no dataframe was provided and the code
    # does not already read a CSV itself
    if dataframe is None and 'pd.read_csv' not in modified_code:
        modified_code = re.sub(
            r'import pandas as pd',
            r'import pandas as pd\n\n# Read Housing.csv\ndf = pd.read_csv("Housing.csv")',
            modified_code
        )
    # Identify code blocks by agent marker comments
    code_blocks = []
    current_block = []
    current_block_name = "unknown"

    for line in modified_code.splitlines():
        # Check if line contains a block identifier comment
        block_match = re.match(r'^# ([a-zA-Z_]+)_agent code start', line)
        if block_match:
            # If we had a previous block, save it
            if current_block:
                code_blocks.append((current_block_name, '\n'.join(current_block)))
            # Start a new block
            current_block_name = block_match.group(1)
            current_block = []
        else:
            current_block.append(line)

    # Add the last block if it exists
    if current_block:
        code_blocks.append((current_block_name, '\n'.join(current_block)))

    # Execute each code block separately
    all_outputs = []
    for block_name, block_code in code_blocks:
        try:
            # Clear captured outputs for each block
            captured_outputs.clear()
            with stdoutIO() as s:
                exec(block_code, context)  # Execute the block

            # Get both stdout and our enhanced captured outputs
            stdout_output = s.getvalue()

            # Combine outputs, preferring our enhanced format when available
            if captured_outputs:
                combined_output = '\n'.join(captured_outputs)
            else:
                combined_output = stdout_output

            all_outputs.append((block_name, combined_output, None))  # None means no error
        except Exception as e:
            # Reset pandas options in case of error
            pd.reset_option('display.max_columns')
            pd.reset_option('display.max_rows')
            pd.reset_option('display.width')
            pd.reset_option('display.max_colwidth')
            pd.reset_option('display.expand_frame_repr')

            # Restore original DataFrame representation in case of error
            pd.DataFrame.__repr__ = original_repr
            # Restore original plt.show
            plt.show = original_plt_show

            error_traceback = traceback.format_exc()
            # Extract error message and error type
            error_message = str(e)
            error_type = type(e).__name__
            error_lines = error_traceback.splitlines()

            # Format error with context of the actual code
            formatted_error = f"Error in {block_name}_agent: {error_message}\n"
            # Add first few lines of traceback
            first_lines = error_lines[:3]
            formatted_error += "\n".join(first_lines) + "\n"

            # Parse problem variables/values from the error message
            problem_vars = []

            # Look for common error patterns
            if "not in index" in error_message:
                # Extract column names for 'not in index' errors
                column_match = re.search(r"\['([^']+)'(?:, '([^']+)')*\] not in index", error_message)
                if column_match:
                    problem_vars = [g for g in column_match.groups() if g is not None]

                # Look for DataFrame accessing operations and list/variable definitions
                potential_lines = []
                code_lines = block_code.splitlines()

                # First, find all DataFrame column access patterns
                df_access_patterns = []
                for i, line in enumerate(code_lines):
                    # Find DataFrame variables from patterns like "df_name[...]" or "df_name.loc[...]"
                    df_matches = re.findall(r'(\w+)(?:\[|\.)(?:loc|iloc|columns|at|iat|\.select)', line)
                    for df_var in df_matches:
                        df_access_patterns.append((i, df_var))
                    # Flag lines referencing variables that might contain column lists
                    if problem_vars and re.search(r'\b(numeric_columns|categorical_columns|columns|features|cols)\b', line):
                        potential_lines.append(i)

                # Identify the most likely problematic lines
                if df_access_patterns:
                    for i, df_var in df_access_patterns:
                        if any(re.search(rf'{df_var}\[.*?\]', line) for line in code_lines):
                            potential_lines.append(i)

                # If no specific lines found yet, look for any DataFrame operations
                if not potential_lines:
                    for i, line in enumerate(code_lines):
                        if re.search(r'(?:corr|drop|groupby|pivot|merge|join|concat|apply|map|filter|loc|iloc)\(', line):
                            potential_lines.append(i)

                # Sort and deduplicate
                potential_lines = sorted(set(potential_lines))
elif "name" in error_message and "is not defined" in error_message:
# Extract variable name for NameError
var_match = re.search(r"name '([^']+)' is not defined", error_message)
if var_match:
problem_vars = [var_match.group(1)]
elif "object has no attribute" in error_message:
# Extract attribute name for AttributeError
attr_match = re.search(r"'([^']+)' object has no attribute '([^']+)'", error_message)
if attr_match:
problem_vars = [f"{attr_match.group(1)}.{attr_match.group(2)}"]
# Scan code for lines containing the problem variables
if problem_vars:
formatted_error += "\nProblem likely in these lines:\n"
code_lines = block_code.splitlines()
problem_lines = []
# First try direct variable references
direct_matches = False
for i, line in enumerate(code_lines):
if any(var in line for var in problem_vars):
direct_matches = True
# Get line and its context (1 line before and after)
start_idx = max(0, i-1)
end_idx = min(len(code_lines), i+2)
for j in range(start_idx, end_idx):
line_prefix = f"{j+1}: "
if j == i: # The line with the problem variable
problem_lines.append(f"{line_prefix}>>> {code_lines[j]} <<<")
else:
problem_lines.append(f"{line_prefix}{code_lines[j]}")
problem_lines.append("") # Empty line between sections
# If no direct matches found but we identified potential problematic lines for DataFrame issues
if not direct_matches and "not in index" in error_message and 'potential_lines' in locals():
for i in potential_lines:
start_idx = max(0, i-1)
end_idx = min(len(code_lines), i+2)
for j in range(start_idx, end_idx):
line_prefix = f"{j+1}: "
if j == i:
problem_lines.append(f"{line_prefix}>>> {code_lines[j]} <<<")
else:
problem_lines.append(f"{line_prefix}{code_lines[j]}")
problem_lines.append("") # Empty line between sections
if problem_lines:
formatted_error += "\n".join(problem_lines)
else:
# Special message for column errors when we can't find the exact reference
if "not in index" in error_message:
formatted_error += (f"Unable to locate direct reference to columns: {', '.join(problem_vars)}\n"
f"Check for variables that might contain these column names (like numeric_columns, "
f"categorical_columns, etc.)\n")
else:
formatted_error += f"Unable to locate lines containing: {', '.join(problem_vars)}\n"
else:
# If we couldn't identify specific variables, check for line numbers in traceback
for line in reversed(error_lines): # Search from the end of traceback
# Look for user code references in the traceback
if ', line ' in line and '<module>' in line:
try:
line_num = int(re.search(r', line (\d+)', line).group(1))
code_lines = block_code.splitlines()
if 0 < line_num <= len(code_lines):
line_idx = line_num - 1
start_idx = max(0, line_idx-2)
end_idx = min(len(code_lines), line_idx+3)
formatted_error += "\nProblem at this location:\n"
for i in range(start_idx, end_idx):
line_prefix = f"{i+1}: "
if i == line_idx:
formatted_error += f"{line_prefix}>>> {code_lines[i]} <<<\n"
else:
formatted_error += f"{line_prefix}{code_lines[i]}\n"
break
except (ValueError, AttributeError, IndexError):
pass
# Add the last few lines of the traceback
formatted_error += "\nFull error details:\n"
last_lines = error_lines[-3:]
formatted_error += "\n".join(last_lines)
all_outputs.append((block_name, None, formatted_error))
    # Reset pandas options after execution
    pd.reset_option('display.max_columns')
    pd.reset_option('display.max_rows')
    pd.reset_option('display.width')
    pd.reset_option('display.max_colwidth')
    pd.reset_option('display.expand_frame_repr')

    # Restore original DataFrame representation
    pd.DataFrame.__repr__ = original_repr
    # Restore original plt.show
    plt.show = original_plt_show

    # Compile all outputs and errors
    output_text = ""
    json_outputs = context.get('json_outputs', [])
    matplotlib_outputs = context.get('matplotlib_outputs', [])
    error_found = False

    for block_name, output, error in all_outputs:
        if error:
            output_text += f"\n\n=== ERROR IN {block_name.upper()}_AGENT ===\n{error}\n"
            error_found = True
        elif output:
            output_text += f"\n\n=== OUTPUT FROM {block_name.upper()}_AGENT ===\n{output}\n"

    if error_found:
        return output_text, [], []
    else:
        return output_text, json_outputs, matplotlib_outputs
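# Illustrative usage (a sketch, not executed on import): run a small snippet
# against a provided dataframe and collect text output plus chart payloads.
#
#   import pandas as pd
#   df = pd.DataFrame({'Values': [10, 20, 30]})
#   text, plotly_jsons, charts = execute_code_from_markdown(
#       "# eda_agent code start\nprint(df['Values'].mean())", dataframe=df)
#   # text contains "=== OUTPUT FROM EDA_AGENT ===" followed by "20.0"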
def format_plan_instructions(plan_instructions):
    """
    Format any plan instructions (JSON string or dict) into markdown sections per agent.
    """
    if "basic_qa_agent" in str(plan_instructions):
        return "**Non-Data Request**: Please ask a data related query, don't waste credits!"

    # Parse input into a dict
    try:
        if isinstance(plan_instructions, str):
            try:
                instructions = json.loads(plan_instructions)
            except json.JSONDecodeError as e:
                # Try to clean the string if it's not valid JSON
                cleaned_str = plan_instructions.strip()
                if cleaned_str.startswith("'") and cleaned_str.endswith("'"):
                    cleaned_str = cleaned_str[1:-1]
                try:
                    instructions = json.loads(cleaned_str)
                except json.JSONDecodeError:
                    raise ValueError(f"Invalid JSON format in plan instructions: {str(e)}")
        elif isinstance(plan_instructions, dict):
            instructions = plan_instructions
        else:
            raise TypeError(f"Unsupported plan instructions type: {type(plan_instructions)}")
    except Exception as e:
        raise ValueError(f"Error processing plan instructions: {str(e)}")

    # logger.log_message(f"Plan instructions: {instructions}", level=logging.INFO)
    markdown_lines = []
    for agent, content in instructions.items():
        if agent != 'basic_qa_agent':
            agent_title = agent.replace('_', ' ').title()
            markdown_lines.append(f"#### {agent_title}")

            if isinstance(content, dict):
                # Handle 'create' key
                create_vals = content.get('create', [])
                if create_vals:
                    markdown_lines.append("- **Create**:")
                    for item in create_vals:
                        markdown_lines.append(f"  - {item}")
                else:
                    markdown_lines.append("- **Create**: None")

                # Handle 'use' key
                use_vals = content.get('use', [])
                if use_vals:
                    markdown_lines.append("- **Use**:")
                    for item in use_vals:
                        markdown_lines.append(f"  - {item}")
                else:
                    markdown_lines.append("- **Use**: None")

                # Handle 'instruction' key
                instr = content.get('instruction')
                if isinstance(instr, str) and instr:
                    markdown_lines.append(f"- **Instruction**: {instr}")
                else:
                    markdown_lines.append("- **Instruction**: None")
            else:
                # Fallback for non-dict content
                markdown_lines.append(f"- {content}")

            markdown_lines.append("")  # blank line between agents
        else:
            markdown_lines.append(f"**Non-Data Request**: {content.get('instruction')}")

    return "\n".join(markdown_lines).strip()
def format_complexity(instructions):
    markdown_lines = []
    complexity = None  # Initialize so non-dict input can't raise NameError below

    # Extract complexity from various possible locations in the structure
    if isinstance(instructions, dict):
        # Case 1: Direct complexity field
        if 'complexity' in instructions:
            complexity = instructions['complexity']
        # Case 2: Complexity in 'plan' object
        elif 'plan' in instructions and isinstance(instructions['plan'], dict):
            if 'complexity' in instructions['plan']:
                complexity = instructions['plan']['complexity']
            else:
                complexity = "unrelated"
        if 'plan' in instructions and isinstance(instructions['plan'], str) and "basic_qa_agent" in instructions['plan']:
            complexity = "unrelated"

    if complexity:
        # Pink color scheme variations
        color_map = {
            "unrelated": "#FFB6B6",     # Light pink
            "basic": "#FF9E9E",         # Medium pink
            "intermediate": "#FF7F7F",  # Main pink
            "advanced": "#FF5F5F"       # Dark pink
        }
        indicator_map = {
            "unrelated": "●",
            "basic": "●",
            "intermediate": "●●",
            "advanced": "●●●"
        }
        color = color_map.get(complexity.lower(), "#FFB6B6")  # Default to light pink
        indicator = indicator_map.get(complexity.lower(), "●")

        # Slightly larger display with pink styling
        markdown_lines.append(f"<div style='color: {color}; border: 2px solid {color}; padding: 2px 8px; border-radius: 12px; display: inline-block; font-size: 14.4px;'>{indicator} {complexity}</div>\n")

    return "\n".join(markdown_lines).strip()
def format_response_to_markdown(api_response, agent_name=None, dataframe=None):
    try:
        markdown = []
        # logger.log_message(f"API response for {agent_name} at {time.strftime('%Y-%m-%d %H:%M:%S')}: {api_response}", level=logging.INFO)

        if isinstance(api_response, dict):
            for key in api_response:
                # Check each agent payload for a rate-limit error; compare in
                # lowercase (the original mixed-case literal could never match
                # a lowercased string)
                if isinstance(api_response[key], dict) and "error" in api_response[key] \
                        and "litellm.ratelimiterror" in str(api_response[key]['error']).lower():
                    return "**Error**: Rate limit exceeded. Please try switching models from the settings."
            # You can add more checks here if needed for other keys

        # Handle error responses
        if isinstance(api_response, dict) and "error" in api_response:
            return f"**Error**: {api_response['error']}"

        if "response" in api_response and isinstance(api_response['response'], str):
            if any(err in api_response['response'].lower() for err in ["auth", "api", "lm"]):
                return "**Error**: Authentication failed. Please check your API key in settings and try again."
            if "model" in api_response['response'].lower():
                return "**Error**: Model configuration error. Please verify your model selection in settings."
        for agent, content in api_response.items():
            agent = agent.split("__")[0] if "__" in agent else agent
            if "memory" in agent or not content:
                continue

            if "complexity" in content:
                markdown.append(f"{format_complexity(content)}\n")

            markdown.append(f"\n## {agent.replace('_', ' ').title()}\n")

            if agent == "analytical_planner":
                logger.log_message(f"Analytical planner content: {content}", level=logging.INFO)
                if 'plan_desc' in content:
                    markdown.append(f"### Reasoning\n{content['plan_desc']}\n")
                    if 'plan_instructions' in content:
                        markdown.append(f"{format_plan_instructions(content['plan_instructions'])}\n")
                else:
                    markdown.append(f"### Reasoning\n{content['rationale']}\n")
            else:
                if "rationale" in content:
                    markdown.append(f"### Reasoning\n{content['rationale']}\n")

            if 'code' in content:
                markdown.append(f"### Code Implementation\n{format_code_backticked_block(content['code'])}\n")

            if 'answer' in content:
                markdown.append(f"### Answer\n{content['answer']}\nPlease ask a query about the data")
            if 'summary' in content:
                summary_text = content['summary']
                summary_text = re.sub(r'```python\n(.*?)\n```', '', summary_text, flags=re.DOTALL)
                markdown.append("### Summary\n")

                # Extract the pre-list intro, then the numbered bullet points
                intro_match = re.split(r'\(\d+\)', summary_text, maxsplit=1)
                if len(intro_match) > 1:
                    intro_text = intro_match[0].strip()
                    rest_text = "(1)" + intro_match[1]  # reattach for bullet parsing
                else:
                    intro_text = summary_text.strip()
                    rest_text = ""

                if intro_text:
                    markdown.append(f"{intro_text}\n")

                # Split bullets at numbered items like (1)...(8)
                bullets = re.split(r'\(\d+\)', rest_text)
                bullets = [b.strip(" ,.\n") for b in bullets if b.strip()]
                for bullet in bullets:
                    markdown.append(f"* {bullet}\n")
            if 'refined_complete_code' in content and 'summary' in content:
                # Initialize so a path that sets neither branch can't raise NameError
                markdown_code = None
                output = None
                json_outputs = []
                matplotlib_outputs = []
                try:
                    if content['refined_complete_code'] is not None and content['refined_complete_code'] != "":
                        clean_code = format_code_block(content['refined_complete_code'])
                        markdown_code = format_code_backticked_block(content['refined_complete_code'])
                        output, json_outputs, matplotlib_outputs = execute_code_from_markdown(clean_code, dataframe)
                    elif "```python" in content['summary']:
                        clean_code = format_code_block(content['summary'])
                        markdown_code = format_code_backticked_block(content['summary'])
                        output, json_outputs, matplotlib_outputs = execute_code_from_markdown(clean_code, dataframe)
                except Exception as e:
                    logger.log_message(f"Error in execute_code_from_markdown: {str(e)}", level=logging.ERROR)
                    markdown_code = f"**Error**: {str(e)}"
                    output = None
                    json_outputs = []
                    matplotlib_outputs = []

                if markdown_code is not None:
                    markdown.append(f"### Refined Complete Code\n{markdown_code}\n")

                if output:
                    markdown.append("### Execution Output\n")
                    markdown.append(f"```output\n{output}\n```\n")

                if json_outputs:
                    markdown.append("### Plotly JSON Outputs\n")
                    for json_output in json_outputs:
                        markdown.append(f"```plotly\n{json_output}\n```\n")

                if matplotlib_outputs:
                    markdown.append("### Matplotlib/Seaborn Charts\n")
                    for img_base64 in matplotlib_outputs:
                        markdown.append(f"```matplotlib\n{img_base64}\n```\n")
        # if agent_name is not None:
        #     if f"memory_{agent_name}" in api_response:
        #         markdown.append(f"### Memory\n{api_response[f'memory_{agent_name}']}\n")

    except Exception as e:
        logger.log_message(f"Error in format_response_to_markdown: {str(e)}", level=logging.ERROR)
        return f"{str(e)}"

    # logger.log_message(f"Generated markdown content for agent '{agent_name}' at {time.strftime('%Y-%m-%d %H:%M:%S')}: {markdown}, length: {len(markdown)}", level=logging.INFO)
    if not markdown or len(markdown) <= 1:
        logger.log_message(
            f"Invalid markdown content for agent '{agent_name}' at {time.strftime('%Y-%m-%d %H:%M:%S')}: "
            f"Content: '{markdown}', Type: {type(markdown)}, Length: {len(markdown) if markdown else 0}, "
            f"API Response: {api_response}",
            level=logging.ERROR
        )
        return " "

    return '\n'.join(markdown)
# Example usage with dummy data
if __name__ == "__main__":
    sample_response = {
        "code_combiner_agent": {
            "reasoning": "Sample reasoning for multiple charts.",
            "refined_complete_code": """
```python
import plotly.express as px
import pandas as pd

# Sample Data
df = pd.DataFrame({'Category': ['A', 'B', 'C'], 'Values': [10, 20, 30]})

# First Chart
fig = px.bar(df, x='Category', y='Values', title='Bar Chart')
fig.show()

# Second Chart
fig2 = px.pie(df, values='Values', names='Category', title='Pie Chart')
fig2.show()
```
"""
        }
    }

    formatted_md = format_response_to_markdown(sample_response)
    print(formatted_md)