import os import networkx as nx import re import json import base64 import tempfile import sys import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns # kept as requested import io import os import re import json import base64 import tempfile import subprocess import logging from io import BytesIO from typing import Dict, Any, List from fastapi import FastAPI, UploadFile, File, HTTPException from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse, HTMLResponse from fastapi import FastAPI from dotenv import load_dotenv import requests import pandas as pd import matplotlib.pyplot as plt import numpy as np # Optional image conversion try: from PIL import Image PIL_READY = True except Exception: PIL_READY = False # Optional PDF libs (use if available) try: import pdfplumber PDFPLUMBER_READY = True except Exception: PDFPLUMBER_READY = False try: from pypdf import PdfReader PYPDF_READY = True except Exception: PYPDF_READY = False # NEW: optional pdfminer text extractor try: from pdfminer.high_level import extract_text as pdfminer_extract_text PDFMINER_READY = True except Exception: PDFMINER_READY = False import zlib # for raw fallback on Flate streams # NEW: DB support import sqlite3 # LangChain / LLM imports from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder from langchain_core.tools import tool from langchain.agents import create_tool_calling_agent, AgentExecutor from langchain_openai import ChatOpenAI from langchain_core.messages import HumanMessage, SystemMessage load_dotenv() logging.basicConfig(level=logging.INFO) log = logging.getLogger(__name__) app = FastAPI(title="TDS Data Analyst Agent") # ---- CORS (needed if prof site calls from browser) ---- from fastapi.middleware.cors import CORSMiddleware app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=False, allow_methods=["*"], allow_headers=["*"], ) # ---- AI Pipe / OpenAI-compatible setup ---- def get_ai_pipe_token() -> str: """ Pick the first defined OPENAI_API_KEY_* from env. Falls back to OPENAI_API_KEY if present. """ for i in range(1, 6): k = os.getenv(f"OPENAI_API_KEY_{i}") if k: return k return os.getenv("OPENAI_API_KEY") OPENAI_API_KEY = get_ai_pipe_token() OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://aipipe.org/openai/v1") if not OPENAI_API_KEY: raise RuntimeError("No OPENAI_API_KEY or OPENAI_API_KEY_{i} found in environment.") # LangChain OpenAI chat model pointing at AI Pipe chat_model = ChatOpenAI( model=os.getenv("OPENAI_MODEL", "gpt-5-mini"), # <- default to gpt-5-mini temperature=0, api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL, timeout=int(os.getenv("LLM_TIMEOUT_SECONDS", "240")), ) LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", 240)) @app.get("/", response_class=HTMLResponse) async def serve_frontend(): """Serve the main HTML interface""" try: with open("index.html", "r", encoding="utf-8") as f: return HTMLResponse(content=f.read()) except FileNotFoundError: return HTMLResponse(content="
Please ensure index.html is in the same directory as app.py
", status_code=404) def parse_keys_and_types(raw_qs: str): """ Parses the key/type section from the questions file. Returns: ordered_keys: list of keys in order cast_lookup: dict key -> casting function """ import re keytype_pattern = r"-\s*`([^`]+)`\s*:\s*(\w+)" keytype_matches = re.findall(keytype_pattern, raw_qs) cast_map = { "number": float, "string": str, "integer": int, "int": int, "float": float } cast_lookup = {key: cast_map.get(t.lower(), str) for key, t in keytype_matches} ordered_keys = [k for k, _ in keytype_matches] return ordered_keys, cast_lookup # ----------------------------- # PDF helpers (updated) # ----------------------------- def _pdf_fallback_text_from_bytes(pdf_bytes: bytes) -> str: """ Very rough, pure-Python fallback: - Try to locate Flate-compressed streams and decompress with zlib. - If nothing decompresses, extract long printable sequences from raw bytes. """ text_chunks = [] # Try Flate streams (stream ... endstream) try: raw = pdf_bytes # find all streams for m in re.finditer(rb"stream\s*([\s\S]*?)\s*endstream", raw, flags=re.MULTILINE): block = m.group(1) # common: streams begin with newline; try zlib for wbits in (15, -15): try: dec = zlib.decompress(block, wbits) try: text_chunks.append(dec.decode("utf-8", "ignore")) except Exception: text_chunks.append(dec.decode("latin-1", "ignore")) break except Exception: continue except Exception: pass combined = "\n".join([t for t in text_chunks if t and t.strip()]) if not combined: # As last resort, pull visible ASCII-ish sequences directly from bytes try: ascii_like = re.findall(rb"[ -~\t\r\n]{5,}", pdf_bytes) # printable combined = "\n".join([a.decode("latin-1", "ignore") for a in ascii_like]) except Exception: combined = "" return combined.strip() def extract_pdf_to_dataframe(pdf_bytes: bytes) -> (pd.DataFrame, str): """ Attempt to extract tables and/or text from a PDF. Return (df, info_text). df is: - the largest detected table (if any), or - a single-column DataFrame with 'text' if only text found. info_text is a brief diagnostic string. """ # 1) pdfplumber: tables + text (try empty password if encrypted) if PDFPLUMBER_READY: try: tables = [] texts = [] # pdfplumber will raise on password; try with and without empty password try: with pdfplumber.open(BytesIO(pdf_bytes)) as pdf: pages_iter = pdf.pages for page in pages_iter: try: t = page.extract_tables() or [] for tbl in t: if not tbl: continue tbl = [row for row in tbl if any(cell is not None and str(cell).strip() for cell in row)] if not tbl: continue header = tbl[0] body = tbl[1:] if len(tbl) > 1 else [] if not header or len(set([str(h).strip() for h in header if h is not None])) != len(header): cols = [f"col_{i}" for i in range(len(header or []))] df_tbl = pd.DataFrame(body, columns=cols if body and len(body[0]) == len(cols) else None) else: df_tbl = pd.DataFrame(body, columns=[str(h).strip() if h is not None else "" for h in header]) if not df_tbl.empty: df_tbl = df_tbl.dropna(how="all", axis=1) if not df_tbl.empty: tables.append(df_tbl) except Exception: pass try: txt = page.extract_text() or "" if txt.strip(): texts.append(txt) except Exception: pass except Exception: # try again with empty password try: with pdfplumber.open(BytesIO(pdf_bytes), password="") as pdf: for page in pdf.pages: try: t = page.extract_tables() or [] for tbl in t: if not tbl: continue tbl = [row for row in tbl if any(cell is not None and str(cell).strip() for cell in row)] if not tbl: continue header = tbl[0] body = tbl[1:] if len(tbl) > 1 else [] if not header or len(set([str(h).strip() for h in header if h is not None])) != len(header): cols = [f"col_{i}" for i in range(len(header or []))] df_tbl = pd.DataFrame(body, columns=cols if body and len(body[0]) == len(cols) else None) else: df_tbl = pd.DataFrame(body, columns=[str(h).strip() if h is not None else "" for h in header]) if not df_tbl.empty: df_tbl = df_tbl.dropna(how="all", axis=1) if not df_tbl.empty: tables.append(df_tbl) except Exception: pass try: txt = page.extract_text() or "" if txt.strip(): texts.append(txt) except Exception: pass except Exception as e2: log.warning("pdfplumber failed (even with empty password): %s", e2) if tables: best = max(tables, key=lambda d: (len(d) * max(1, len(d.columns)))) return best.reset_index(drop=True), "pdfplumber:table" if texts: text_joined = "\n".join(texts).strip() if text_joined: return pd.DataFrame({"text": [text_joined]}), "pdfplumber:text" except Exception as e: log.warning("pdfplumber failed: %s", e) # 2) pypdf text-only fallback (try decrypt with empty password if encrypted) if PYPDF_READY: try: reader = PdfReader(BytesIO(pdf_bytes)) if getattr(reader, "is_encrypted", False): try: reader.decrypt("") # try empty password except Exception: pass pages_text = [] for p in reader.pages: try: t = p.extract_text() or "" if t.strip(): pages_text.append(t) except Exception: continue if pages_text: return pd.DataFrame({"text": ["\n".join(pages_text).strip()]}), "pypdf:text" except Exception as e: log.warning("pypdf failed: %s", e) # 2.5) NEW: pdfminer text-only fallback if PDFMINER_READY: try: txt = pdfminer_extract_text(BytesIO(pdf_bytes)) or "" if txt.strip(): return pd.DataFrame({"text": [txt.strip()]}), "pdfminer:text" except Exception as e: log.warning("pdfminer failed: %s", e) # 3) pure-Python raw fallback fallback_text = _pdf_fallback_text_from_bytes(pdf_bytes) if fallback_text: return pd.DataFrame({"text": [fallback_text]}), "raw:flate-or-ascii" # nothing found raise HTTPException(400, "No extractable content found in PDF (scanned images or unsupported PDF).") # ----------------------------- # Tools # ----------------------------- @tool def scrape_url_to_dataframe(target_url: str) -> Dict[str, Any]: """ Fetch a URL and return data as a DataFrame (supports HTML tables, CSV, Excel, Parquet, JSON, PDF, and plain text). Always returns {"status": "success", "data": [...], "columns": [...]} if fetch works. """ print(f"Scraping URL: {target_url}") try: from io import BytesIO, StringIO from bs4 import BeautifulSoup req_headers = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/138.0.0.0 Safari/537.36" ), "Referer": "https://www.google.com/", } r = requests.get(target_url, headers=req_headers, timeout=20) r.raise_for_status() content_type = r.headers.get("Content-Type", "").lower() frame = None # --- PDF --- if "application/pdf" in content_type or target_url.lower().endswith(".pdf"): df_pdf, _info = extract_pdf_to_dataframe(r.content) frame = df_pdf # --- CSV --- elif "text/csv" in content_type or target_url.lower().endswith(".csv"): frame = pd.read_csv(BytesIO(r.content)) # --- Excel --- elif any(target_url.lower().endswith(ext) for ext in (".xls", ".xlsx")) or "spreadsheetml" in content_type: frame = pd.read_excel(BytesIO(r.content)) # --- Parquet --- elif target_url.lower().endswith(".parquet"): frame = pd.read_parquet(BytesIO(r.content)) # --- JSON --- elif "application/json" in content_type or target_url.lower().endswith(".json"): try: data = r.json() frame = pd.json_normalize(data) except Exception: frame = pd.DataFrame([{"text": r.text}]) # --- HTML / Fallback --- elif "text/html" in content_type or re.search(r'/wiki/|\.org|\.com', target_url, re.IGNORECASE): html_text = r.text # Try HTML tables first try: html_tables = pd.read_html(StringIO(html_text), flavor="bs4") if html_tables: frame = html_tables[0] except ValueError: pass # If no table found, fallback to plain text if frame is None: soup_obj = BeautifulSoup(html_text, "html.parser") page_text = soup_obj.get_text(separator="\n", strip=True) frame = pd.DataFrame({"text": [page_text]}) # --- Unknown type fallback --- else: frame = pd.DataFrame({"text": [r.text]}) # --- Normalize columns --- frame.columns = frame.columns.map(str).str.replace(r'\[.*\]', '', regex=True).str.strip() return { "status": "success", "data": frame.to_dict(orient="records"), "columns": frame.columns.tolist() } except Exception as err: return {"status": "error", "message": str(err)} # ----------------------------- # Utilities for executing code safely # ----------------------------- def clean_llm_output(llm_text: str) -> Dict: """ Extract JSON object from LLM output robustly. Returns dict or {"error": "..."} """ try: if not llm_text: return {"error": "Empty LLM output"} # remove triple-fence markers if present content = re.sub(r"^```(?:json)?\s*", "", llm_text.strip()) content = re.sub(r"\s*```$", "", content) # find outermost JSON object by scanning for balanced braces lbrace = content.find("{") rbrace = content.rfind("}") if lbrace == -1 or rbrace == -1 or rbrace <= lbrace: return {"error": "No JSON object found in LLM output", "raw": content} json_candidate = content[lbrace:rbrace+1] try: return json.loads(json_candidate) except Exception as e: # fallback: try last balanced pair scanning backwards for i in range(rbrace, lbrace, -1): cand = content[lbrace:i+1] try: return json.loads(cand) except Exception: continue return {"error": f"JSON parsing failed: {str(e)}", "raw": json_candidate} except Exception as e: return {"error": str(e)} # --- helper: convert bytes -> data URI for multimodal LLM --- def _bytes_to_data_uri(img_bytes: bytes, mime="image/png") -> str: b64 = base64.b64encode(img_bytes).decode("ascii") return f"data:{mime};base64,{b64}" # ---------- NEW: normalization & numeric coercion helpers ---------- def _normalize_columns_inplace(df: pd.DataFrame): df.columns = ( pd.Index(df.columns) .map(str) .map(lambda s: s.replace("\xa0", " ").strip()) .map(lambda s: re.sub(r"\s+", "_", s.lower())) .map(lambda s: re.sub(r"[^0-9a-z_]", "", s)) ) def _coerce_numeric_columns_inplace(df: pd.DataFrame): for c in list(df.columns): if df[c].dtype == object: s = df[c].astype(str) s2 = s.str.replace(r"[,$%]", "", regex=True).str.replace(r"\s", "", regex=True) coerced = pd.to_numeric(s2, errors="coerce") if coerced.notna().sum() >= max(3, int(0.5 * len(df))): df[c] = coerced def run_vision_extraction(questions_text: str, image_uris: List[str]) -> Dict: """ Send images + instructions to a vision-capable LLM and expect JSON-only output. No Python OCR: the model must visually read the images. """ system_msg = ( "You are a precise data extraction assistant. " "You will be given one or more images (e.g., scanned bank passbooks) and a set of questions. " "Answer ONLY by visually inspecting the images. " "DO NOT suggest Python OCR libraries like pytesseract; they are unavailable. " "Return strictly valid JSON as the final answer. No extra text." ) # Build a multimodal human message: text + images msg_content = [{"type": "text", "text": questions_text}] for uri in image_uris: msg_content.append({"type": "image_url", "image_url": {"url": uri}}) # Call the configured LLM resp = chat_model.invoke([ SystemMessage(content=system_msg), HumanMessage(content=msg_content), ]) text = resp.content if hasattr(resp, "content") else str(resp) parsed = clean_llm_output(text) return parsed SCRAPE_FUNC = r''' from typing import Dict, Any import requests from bs4 import BeautifulSoup import pandas as pd import re def scrape_url_to_dataframe(url: str) -> Dict[str, Any]: try: response = requests.get( url, headers={"User-Agent": "Mozilla/5.0"}, timeout=5 ) response.raise_for_status() except Exception as e: return { "status": "error", "error": str(e), "data": [], "columns": [] } soup = BeautifulSoup(response.text, "html.parser") tables = pd.read_html(response.text) if tables: df = tables[0] # Take first table df.columns = [str(c).strip() for c in df.columns] # Ensure all columns are unique and string df.columns = [str(col) for col in df.columns] return { "status": "success", "data": df.to_dict(orient="records"), "columns": list(df.columns) } else: # Fallback to plain text text_data = soup.get_text(separator="\n", strip=True) # Try to detect possible "keys" from text like Runtime, Genre, etc. detected_cols = set(re.findall(r"\b[A-Z][a-zA-Z ]{2,15}\b", text_data)) df = pd.DataFrame([{}]) # start empty for col in detected_cols: df[col] = None if df.empty: df["text"] = [text_data] return { "status": "success", "data": df.to_dict(orient="records"), "columns": list(df.columns) } ''' def write_and_run_temp_python(code: str, injected_pickle: str = None, timeout: int = 60) -> Dict[str, Any]: """ Write a temp python file which: - provides a safe environment (imports) - loads df/from pickle if provided into df and data variables - defines a robust plot_to_base64() helper that ensures < 100kB (attempts resizing/conversion) - executes the user code (which should populate `results` dict) - prints json.dumps({"status":"success","result":results}) Returns dict with parsed JSON or error details. """ # create file content bootstrap_lines = [ "import json, sys, gc, types, re, math", "import pandas as pd, numpy as np", "import matplotlib", "matplotlib.use('Agg')", "import matplotlib.pyplot as plt", "from io import BytesIO", "import base64", # Avoid hard seaborn dependency in the sandbox "try:\n import seaborn as sns\nexcept Exception:\n sns = None", ] if PIL_READY: bootstrap_lines.append("from PIL import Image") # NEW: loader that can handle dict payloads (tables) or a single df and expose helpers/short names if injected_pickle: bootstrap_lines.append(f"_payload = pd.read_pickle(r'''{injected_pickle}''')\n") bootstrap_lines.append("tables = {}\n") bootstrap_lines.append("df = None\n") bootstrap_lines.append("data = {}\n") bootstrap_lines.append( "def _sanitize(name):\n" " name = re.sub(r'[^0-9a-zA-Z_]+', '_', str(name))\n" " if name and name[0].isdigit():\n" " name = '_' + name\n" " return name\n" ) # helpers bootstrap_lines.append( "def find_column(df, candidates):\n" " cols = [c for c in df.columns]\n" " lower = {c.lower(): c for c in cols}\n" " for cand in candidates:\n" " k = str(cand).lower().strip()\n" " if k in lower:\n" " return lower[k]\n" " # fuzzy contains\n" " for cand in candidates:\n" " k = str(cand).lower().strip()\n" " for c in cols:\n" " if k in c.lower():\n" " return c\n" " return None\n" ) bootstrap_lines.append( "def smart_to_numeric(s):\n" " s = s.astype(str).str.replace(r'[,$%]', '', regex=True).str.replace(r'\\s','', regex=True)\n" " return pd.to_numeric(s, errors='coerce')\n" ) bootstrap_lines.append( "def coerce_numeric_columns(df):\n" " for c in list(df.columns):\n" " if df[c].dtype == object:\n" " coerced = smart_to_numeric(df[c])\n" " if coerced.notna().sum() >= max(3, int(0.5*len(df))):\n" " df[c] = coerced\n" ) bootstrap_lines.append( "def safe_scatter_with_trend(df, x_candidates, y_candidates, title=''):\n" " xcol = find_column(df, x_candidates)\n" " ycol = find_column(df, y_candidates)\n" " if not xcol or not ycol:\n" " plt.figure(); plt.text(0.5,0.5,'Columns not found',ha='center');\n" " return\n" " coerce_numeric_columns(df)\n" " x = df[xcol]\n" " y = df[ycol]\n" " mask = x.notna() & y.notna()\n" " x = x[mask]; y = y[mask]\n" " plt.figure()\n" " if len(x) < 2:\n" " plt.text(0.5,0.5,'Not enough data to plot',ha='center'); plt.xlabel(xcol); plt.ylabel(ycol); plt.title(title); return\n" " plt.scatter(x, y)\n" " # trend line\n" " try:\n" " coeffs = np.polyfit(x, y, 1)\n" " xx = np.linspace(float(np.nanmin(x)), float(np.nanmax(x)), 50)\n" " yy = coeffs[0]*xx + coeffs[1]\n" " plt.plot(xx, yy, 'r:', linewidth=2)\n" " except Exception:\n" " pass\n" " plt.xlabel(xcol); plt.ylabel(ycol); plt.title(title)\n" ) bootstrap_lines.append( "def get_table(name):\n" " # smart lookup in tables by exact key, by short key, or fuzzy\n" " if name in tables: return tables[name]\n" " key = name.lower().strip()\n" " # exact on lower\n" " for k in tables:\n" " if k.lower() == key: return tables[k]\n" " # contains\n" " for k in tables:\n" " if key in k.lower(): return tables[k]\n" " return None\n" ) bootstrap_lines.append( "if isinstance(_payload, dict):\n" " df = _payload.get('__df__')\n" " tables = _payload.get('__tables__', {}) or {}\n" " # expose convenience variables: full key and short key\n" " for _tname, _tdf in list(tables.items()):\n" " try:\n" " globals()[f'{_sanitize(_tname)}_df'] = _tdf\n" " _short = _tname.split('::')[-1]\n" " globals()[f'{_sanitize(_short)}_df'] = _tdf\n" " except Exception:\n" " pass\n" " if df is None and tables:\n" " df = max(tables.values(), key=lambda d: (len(d) * max(1, len(d.columns))))\n" "else:\n" " df = _payload\n" ) bootstrap_lines.append("if isinstance(df, pd.DataFrame):\n data = df.to_dict(orient='records')\nelse:\n data = {}\n") bootstrap_lines.append("all_tables = list(tables.keys())\n") else: # ensure data exists so user code that references data won't break bootstrap_lines.append("tables = {}\n") bootstrap_lines.append("df = None\n") bootstrap_lines.append("data = {}\n") bootstrap_lines.append("def find_column(df, candidates):\n return None\n") bootstrap_lines.append("def smart_to_numeric(s):\n return pd.to_numeric(s, errors='coerce')\n") bootstrap_lines.append("def coerce_numeric_columns(df):\n pass\n") bootstrap_lines.append("def safe_scatter_with_trend(df, x_candidates, y_candidates, title=''):\n plt.figure(); plt.text(0.5,0.5,'No data',ha='center')\n") bootstrap_lines.append("def get_table(name):\n return None\n") # plot_to_base64 helper that tries to reduce size under 100_000 bytes plot_helper = r''' def plot_to_base64(max_bytes=100000): buf = BytesIO() plt.savefig(buf, format='png', bbox_inches='tight', dpi=100) buf.seek(0) img_bytes = buf.getvalue() if len(img_bytes) <= max_bytes: return base64.b64encode(img_bytes).decode('ascii') # try decreasing dpi/figure size iteratively for dpi in [80, 60, 50, 40, 30]: buf = BytesIO() plt.savefig(buf, format='png', bbox_inches='tight', dpi=dpi) buf.seek(0) b = buf.getvalue() if len(b) <= max_bytes: return base64.b64encode(b).decode('ascii') # if Pillow available, try convert to WEBP which is typically smaller try: from PIL import Image buf = BytesIO() plt.savefig(buf, format='png', bbox_inches='tight', dpi=40) buf.seek(0) im = Image.open(buf) out_buf = BytesIO() im.save(out_buf, format='WEBP', quality=80, method=6) out_buf.seek(0) ob = out_buf.getvalue() if len(ob) <= max_bytes: return base64.b64encode(ob).decode('ascii') # try lower quality out_buf = BytesIO() im.save(out_buf, format='WEBP', quality=60, method=6) out_buf.seek(0) ob = out_buf.getvalue() if len(ob) <= max_bytes: return base64.b64encode(ob).decode('ascii') except Exception: pass # as last resort return downsized PNG even if > max_bytes buf = BytesIO() plt.savefig(buf, format='png', bbox_inches='tight', dpi=20) buf.seek(0) return base64.b64encode(buf.getvalue()).decode('ascii') ''' # Build the code to write script_buf = [] script_buf.extend(bootstrap_lines) script_buf.append(plot_helper) script_buf.append(SCRAPE_FUNC) script_buf.append("\nresults = {}\n") script_buf.append(code) # ensure results printed as json script_buf.append("\nprint(json.dumps({'status':'success','result':results}, default=str), flush=True)\n") tmpfile = tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') tmpfile.write("\n".join(script_buf)) tmpfile.flush() tmp_pathname = tmpfile.name tmpfile.close() try: proc = subprocess.run([sys.executable, tmp_pathname], capture_output=True, text=True, timeout=timeout) if proc.returncode != 0: # collect stderr and stdout for debugging return {"status": "error", "message": proc.stderr.strip() or proc.stdout.strip()} # parse stdout as json stdout_text = proc.stdout.strip() try: parsed_json = json.loads(stdout_text) return parsed_json except Exception as e: return {"status": "error", "message": f"Could not parse JSON output: {str(e)}", "raw": stdout_text} except subprocess.TimeoutExpired: return {"status": "error", "message": "Execution timed out"} finally: try: os.unlink(tmp_pathname) if injected_pickle and os.path.exists(injected_pickle): os.unlink(injected_pickle) except Exception: pass # ----------------------------- # LLM agent setup # ----------------------------- # Use the ChatOpenAI `chat_model` defined above. # Tools list for agent (LangChain tool decorator returns metadata for the LLM) tools = [scrape_url_to_dataframe] # we only expose scraping as a tool; agent will still produce code # Prompt: instruct agent to call the tool and output JSON only agent_prompt = ChatPromptTemplate.from_messages([ ("system", """You are a full-stack autonomous data analyst agent. You will receive: - A set of **rules** for this request (these rules may differ depending on whether a dataset is uploaded or not) - One or more **questions** - An optional **dataset preview** You must: 1. Follow the provided rules exactly. 2. Return only a valid JSON object — no extra commentary or formatting. 3. The JSON must contain: - "questions": [ list of original question strings exactly as provided ] - "code": "..." (Python code that creates a dict called `results` with each question string as a key and its computed answer as the value) 4. Your Python code will run in a sandbox with: - pandas, numpy, matplotlib available - A helper function `plot_to_base64(max_bytes=100000)` for generating base64-encoded images under 100KB. - Helpers available: `find_column`, `smart_to_numeric`, `coerce_numeric_columns`, `safe_scatter_with_trend`, and `get_table`. 5. When returning plots, prefer `safe_scatter_with_trend(...)` for scatter + dotted red regression when appropriate. 6. Make sure all variables are defined before use, and the code can run without any undefined references. 7) If image files are provided, DO NOT call any OCR libraries in Python. Rely on the model’s visual inspection to extract text from images. 8) If a database was uploaded, access its tables via the dict `tables` (pandas DataFrames keyed by table name). Convenience variables like `