Muhammad Mustehson committed
Commit a360e3c · Parent(s): 902da82

Update Old Code
Browse files
- .gitignore +5 -1
- app.py +167 -157
- requirements.txt +9 -10
- src/__init__.py +0 -0
- src/client.py +131 -0
- src/models.py +6 -0
- src/pipelines.py +98 -0
- src/prompts.py +22 -0
.gitignore CHANGED
@@ -1 +1,5 @@
-
+.env
+.venv
+__pycache__/
+*.pyc
+*.pyo
app.py CHANGED
@@ -1,103 +1,88 @@
+import logging
 import os
-import
+from typing import Any, Dict, List
+
 import duckdb
-import spaces
-import lancedb
 import gradio as gr
+import lancedb
 import pandas as pd
 import pyarrow as pa
-from
-from langsmith import traceable
-from sentence_transformers import SentenceTransformer
-from langchain_huggingface.llms import HuggingFacePipeline
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
+from dotenv import load_dotenv
 
+from src.client import LLMChain, embed_client
+from src.pipelines import SQLPipeline
 
+load_dotenv()
 
-#----------CONNECT TO DATABASE----------
-md_token = os.getenv('MD_TOKEN')
-conn = duckdb.connect(f"md:my_db?motherduck_token={md_token}", read_only=True)
-#---------------------------------------
+# ======== ENVs ========
+MD_TOKEN = os.getenv("MD_TOKEN")
+HF_TOKEN = os.getenv("HF_TOKEN")
+conn = duckdb.connect(f"md:my_db?motherduck_token={MD_TOKEN}", read_only=True)
+LEVEL = "INFO" if not os.getenv("ENV") == "PROD" else "WARNING"
+EMB_URL = os.getenv("EMB_URL")
+EMB_MODEL = os.getenv("EMB_MODEL")
+TAB_LINES = 8
+# ======================
 
+logging.basicConfig(
+    level=getattr(logging, LEVEL, logging.INFO),
+    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+)
+logger = logging.getLogger(__name__)
 
-#--------------LanceDB-------------
+pipe = SQLPipeline(duckdb=conn, chain=LLMChain())
 
-lance_schema = pa.schema([
-    pa.field("vector", pa.list_(pa.float32())),
-    pa.field("sql-query", pa.utf8())
-])
-try:
-    table = lance_db.create_table(name="SQL-Queries", schema=lance_schema)
-except:
-    table = lance_db.open_table(name="SQL-Queries")
-#---------------------------------------
+def _setup_lancedb() -> lancedb.table.Table:
+    lance_db = lancedb.connect(
+        uri=os.getenv("lancedb_uri"),
+        api_key=os.getenv("lancedb_api_key"),
+        region=os.getenv("lancedb_region"),
+    )
+    lance_schema = pa.schema(
+        [pa.field("vector", pa.list_(pa.float32())), pa.field("sql-query", pa.utf8())]
+    )
+    try:
+        table = lance_db.create_table(name="SQL-Queries", schema=lance_schema)
+    except Exception:
+        table = lance_db.open_table(name="SQL-Queries")
+    return table
 
-#-------LOAD HUGGINGFACE PIPELINE-------
-tokenizer = AutoTokenizer.from_pretrained("defog/llama-3-sqlcoder-8b")
-pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1024, return_full_text=False)
-hf = HuggingFacePipeline(pipeline=pipe)
-#---------------------------------------
+lance_table = _setup_lancedb()
 
-prompt = hub.pull("sql-agent-prompt")
-#---------------------------------------
 
-#-----LOAD EMBEDDING MODEL-----
-embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
-#---------------------------------------
-
-# Get Databases
-def get_schemas():
+def get_schemas() -> List:
     schemas = conn.execute("""
         SELECT DISTINCT schema_name
         FROM information_schema.schemata
        WHERE schema_name NOT IN ('information_schema', 'pg_catalog')
    """).fetchall()
-    return [item[0] for item in schemas]
+    return [item[0] for item in schemas]
 
+
+def get_tables(schema_name: str) -> List:
+    tables = conn.execute(
+        f"SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema_name}'"
+    ).fetchall()
     return [table[0] for table in tables]
 
-def update_tables(schema_name):
+
+def update_tables(schema_name: str):
     tables = get_tables(schema_name)
     return gr.update(choices=tables)
 
-def get_table_schema(table):
-    result = conn.sql(
+
+def get_table_schema(table: str) -> str:
+    result = conn.sql(
+        f"SELECT sql, database_name, schema_name FROM duckdb_tables() where table_name ='{table}';"
+    ).df()
+    ddl_create = result.iloc[0, 0]
+    parent_database = result.iloc[0, 1]
+    schema_name = result.iloc[0, 2]
     full_path = f"{parent_database}.{schema_name}.{table}"
     if schema_name != "main":
         old_path = f"{schema_name}.{table}"
@@ -106,85 +91,81 @@ def get_table_schema(table):
         ddl_create = ddl_create.replace(old_path, full_path)
     return ddl_create
 
-def get_prompt(schema, query_input):
-    return prompt.format(schema=schema, query_input=query_input)
-
-@spaces.GPU(duration=60)
-@traceable()
-def generate_sql(prompt):
-    result = hf.invoke(prompt)
-    return result.strip()
-
-@spaces.GPU(duration=10)
-def embed_query(sql_query):
-    print(f'Creating Embeddings {sql_query}')
-    if sql_query is not None:
-        embeddings = embedding_model.encode(sql_query, normalize_embeddings=True).tolist()
-        return embeddings
-
-def log2lancedb(embeddings, sql_query):
-    data = [{
-        "sql-query": sql_query,
-        "vector": embeddings
-    }]
-    table.add(data)
-    print(f'Added to Lance DB.')
-#---------------------------------------
-
-# Generate SQL
-def text2sql(table, query_input):
+
+def run_pipeline(table: str, query_input: str) -> Dict[str, Any]:
     if table is None:
-        return
-    }
+        return _error_response(
+            query_input=query_input, message="❌ Please select a table/schema."
+        )
-
-    schema = get_table_schema(table)
-    print(f'Schema Generated...')
-    prompt = get_prompt(schema, query_input)
-    print(f'Prompt Generated...')
-
-    try:
-        print(f'Generating SQL... {model.device}')
-        result = generate_sql(prompt)
-        print('SQL Generated...')
-    except Exception as e:
-        return {
-            table_schema: schema,
-            input_prompt: prompt,
-            generated_query: "",
-            result_output: pd.DataFrame([{"error": f"❌ Unable to get the SQL query based on the text. {e}"}])
-        }
+
+    schema = ""
     try:
+        schema = get_table_schema(table=table)
+
+        sql, df = pipe.try_sql_with_retries(
+            user_question=query_input,
+            context=schema,
+        )
+
+        if not sql or df is None:
+            return _error_response(
+                query_input=query_input,
+                schema=schema,
+                message="❌ Unable to generate SQL from the input text.",
+            )
+
+    except Exception as exc:
+        logger.exception("Pipeline execution failed")
+        return _error_response(
+            query_input=query_input, schema=schema, message=f"❌ Pipeline error: {exc}"
+        )
+
     try:
+        sql_str = f"{query_input}\n{sql.get('sql_query', '')}"
+        embeddings = embed_query(sql_str)
+        log2lancedb(embeddings, sql_str)
+
+    except Exception as exc:
+        logger.warning("Embedding/logging failed: %s", exc)
+
+    return {
+        table_schema: schema,
+        input_prompt: query_input,
+        generated_query: sql.get("sql_query", ""),
+        result_output: df,
+    }
+
+
+def _error_response(
+    *,
+    query_input: str,
+    message: str,
+    schema: str = "",
+) -> Dict[str, Any]:
     return {
         table_schema: schema,
-        input_prompt:
-        generated_query:
-        result_output:
+        input_prompt: query_input,
+        generated_query: "",
+        result_output: pd.DataFrame([{"error": message}]),
     }
 
+
+def embed_query(data: str) -> List:
+    logger.info(f"Creating Embeddings {data}")
+    try:
+        results = embed_client.feature_extraction(text=data, model=EMB_MODEL)
+        return results.tolist()
+    except Exception as e:
+        logger.error(f"Unable to generate embedding for the given query: {e}")
+        return []
+
+
+def log2lancedb(embeddings: List, sql_query: str) -> None:
+    data = [{"sql-query": sql_query, "vector": embeddings}]
+    lance_table.add(data)
+    logger.info("Added to Lance DB.")
+
+
 custom_css = """
 .gradio-container {
     background-color: #f0f4f8;
@@ -202,9 +183,11 @@ custom_css = """
 }
 """
 
-with gr.Blocks(
+with gr.Blocks(
+    theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"), css=custom_css
+) as demo:
     gr.Image("logo.png", label=None, show_label=False, container=False, height=100)
 
     gr.Markdown("""
    <div style='text-align: center;'>
        <strong style='font-size: 36px;'>Datajoi SQL Agent</strong>
@@ -214,13 +197,18 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"
     """)
 
     with gr.Row():
+        with gr.Column(scale=1, variant="panel"):
+            schema_dropdown = gr.Dropdown(
+                choices=get_schemas(), label="Select Schema", interactive=True
+            )
+            tables_dropdown = gr.Dropdown(
+                choices=[], label="Available Tables", value=None
+            )
 
         with gr.Column(scale=2):
-            query_input = gr.Textbox(
+            query_input = gr.Textbox(
+                lines=5, label="Text Query", placeholder="Enter your text query here..."
+            )
     with gr.Row():
         with gr.Column(scale=7):
             pass
@@ -229,17 +217,39 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"
 
     with gr.Tabs():
        with gr.Tab("Result"):
-            result_output = gr.DataFrame(
+            result_output = gr.DataFrame(
+                label="Query Results", value=[], interactive=False
+            )
        with gr.Tab("SQL Query"):
-            generated_query = gr.Textbox(
+            generated_query = gr.Textbox(
+                lines=TAB_LINES,
+                label="Generated SQL Query",
+                value="",
+                interactive=False,
+            )
        with gr.Tab("Prompt"):
-            input_prompt = gr.Textbox(
+            input_prompt = gr.Textbox(
+                lines=TAB_LINES,
+                label="Input Prompt",
+                value="",
+                interactive=False,
+            )
        with gr.Tab("Schema"):
-            table_schema = gr.Textbox(
+            table_schema = gr.Textbox(
+                lines=TAB_LINES,
+                label="Table Schema",
+                value="",
+                interactive=False,
+            )
+
+    schema_dropdown.change(
+        update_tables, inputs=schema_dropdown, outputs=tables_dropdown
+    )
+    generate_query_button.click(
+        run_pipeline,
+        inputs=[tables_dropdown, query_input],
+        outputs=[table_schema, input_prompt, generated_query, result_output],
+    )
 
 if __name__ == "__main__":
     demo.launch()
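A note on the wiring above: `run_pipeline` returns a dict keyed by Gradio component objects rather than a positional tuple, so one callback can update the schema, prompt, query, and result views at once. A minimal, self-contained sketch of that pattern (component and function names here are illustrative, not taken from the app):

import gradio as gr

with gr.Blocks() as demo:
    question = gr.Textbox(label="Question")
    sql_box = gr.Textbox(label="SQL")
    status = gr.Textbox(label="Status")

    def answer(q):
        # Returning a dict keyed by components updates each listed output
        # from its own entry, like run_pipeline's return value above.
        return {sql_box: f"SELECT 1 -- for: {q}", status: "ok"}

    question.submit(answer, inputs=question, outputs=[sql_box, status])

if __name__ == "__main__":
    demo.launch()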
requirements.txt CHANGED
@@ -1,10 +1,9 @@
-
-
-
-
-
-
-
-
-
-langchain-huggingface
+huggingface-hub==0.35.0
+duckdb==1.3.2
+pandas==2.3.1
+numpy==2.3.2
+pydantic
+python-dotenv
+gradio
+pyarrow
+lancedb

src/__init__.py ADDED
File without changes
src/client.py ADDED
@@ -0,0 +1,131 @@
+import json
+import logging
+import os
+
+from dotenv import load_dotenv
+from huggingface_hub import InferenceClient
+from pydantic import BaseModel
+
+load_dotenv()
+
+logger = logging.getLogger(__name__)
+
+MAX_RESPONSE_TOKENS = 2048
+TEMPERATURE = 0.9
+
+models = json.loads(os.getenv("MODEL_NAMES"))
+providers = json.loads(os.getenv("PROVIDERS"))
+EMB_MODEL = os.getenv("EMB_MODEL")
+
+
+def _engine_working(engine: InferenceClient) -> bool:
+    try:
+        engine.chat_completion([{"role": "user", "content": "ping"}], max_tokens=1)
+        logger.info("Engine is working.")
+        return True
+    except Exception as e:
+        logger.exception(f"Engine is not working: {e}")
+        return False
+
+
+def _load_llm_client() -> InferenceClient:
+    """
+    Attempts to load the provided model from the Hugging Face endpoint.
+
+    Returns an InferenceClient if successful.
+    Raises an Exception if no model is available.
+    """
+    logger.warning("Loading model...")
+    errors = []
+    for model in models:
+        for provider in providers:
+            if isinstance(model, str):
+                try:
+                    logger.info(f"Checking model: {model} provider: {provider}")
+                    client = InferenceClient(
+                        model=model,
+                        timeout=15,
+                        provider=provider,
+                    )
+                    if _engine_working(client):
+                        logger.info(
+                            f"The model is loaded: {model}, provider: {provider}"
+                        )
+                        return client
+                except Exception as e:
+                    logger.error(
+                        f"Error loading model {model} provider {provider}: {e}"
+                    )
+                    errors.append(str(e))
+    raise Exception(f"Unable to load any provided model: {errors}.")
+
+
+def _load_embedding_client() -> InferenceClient:
+    logger.warning("Loading embedding model...")
+    try:
+        emb_client = InferenceClient(timeout=15, model=EMB_MODEL)
+        return emb_client
+    except Exception as e:
+        logger.error(f"Error loading model {EMB_MODEL}: {e}")
+        raise Exception("Unable to load the embedding model.")
+
+
+_default_client = _load_llm_client()
+embed_client = _load_embedding_client()
+
+
+class LLMChain:
+    def __init__(self, client: InferenceClient = _default_client):
+        self.client = client
+        self.total_tokens = 0
+
+    def run(
+        self,
+        system_prompt: str | None = None,
+        user_prompt: str | None = None,
+        messages: list[dict] | None = None,
+        format_name: str | None = None,
+        response_format: type[BaseModel] | None = None,
+    ) -> str | dict[str, str | int | float | None] | list[str] | None:
+        try:
+            if system_prompt and user_prompt:
+                messages = [
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt},
+                ]
+            elif not messages:
+                raise ValueError(
+                    "Either system_prompt and user_prompt or messages must be provided."
+                )
+
+            llm_response = self.client.chat_completion(
+                messages=messages,
+                max_tokens=MAX_RESPONSE_TOKENS,
+                temperature=TEMPERATURE,
+                response_format=(
+                    {
+                        "type": "json_schema",
+                        "json_schema": {
+                            "name": format_name,
+                            "schema": response_format.model_json_schema(),
+                            "strict": True,
+                        },
+                    }
+                    if format_name and response_format
+                    else None
+                ),
+            )
+            self.total_tokens += llm_response.usage.total_tokens
+            analysis = llm_response.choices[0].message.content
+            if response_format:
+                analysis = json.loads(analysis)
+                fields = list(response_format.model_fields.keys())
+                if len(fields) == 1:
+                    return analysis.get(fields[0])
+                return {field: analysis.get(field) for field in fields}
+
+            return analysis
+
+        except Exception as e:
+            logger.error(f"Error during LLM calls: {e}")
+            return None
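`LLMChain.run` returns raw chat text, or, when `format_name` and a pydantic `response_format` are supplied, requests strict JSON-schema output and unpacks the parsed fields; a model with a single field collapses to that field's value. A hypothetical usage sketch, assuming `MODEL_NAMES`, `PROVIDERS`, and any required Hugging Face credentials are set in the environment:

from pydantic import BaseModel, Field

from src.client import LLMChain


class Answer(BaseModel):
    # Single-field model: run() returns just this field's value.
    text: str = Field(..., description="Short answer.")


chain = LLMChain()
result = chain.run(
    system_prompt="Answer in one sentence.",
    user_prompt="What does DuckDB's DATE_TRUNC do?",
    format_name="answer",
    response_format=Answer,
)
print(result)              # a plain string, because Answer has exactly one field
print(chain.total_tokens)  # cumulative token usage across run() calls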
src/models.py ADDED
@@ -0,0 +1,6 @@
+from pydantic import BaseModel, Field
+
+
+class SQLQueryModel(BaseModel):
+    sql_query: str = Field(..., description="SQL query to execute.")
+    explanation: str = Field(..., description="Short explanation of the SQL query.")
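`SQLQueryModel` is the schema that `LLMChain.run` embeds under the strict `json_schema` response format. A quick way to inspect what the endpoint is asked to conform to:

import json

from src.models import SQLQueryModel

# model_json_schema() is exactly what client.py passes as the "schema" entry;
# both fields are required, so a conforming reply always carries
# "sql_query" and "explanation" keys.
print(json.dumps(SQLQueryModel.model_json_schema(), indent=2))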
src/pipelines.py ADDED
@@ -0,0 +1,98 @@
+import logging
+import os
+
+import pandas as pd
+from duckdb import DuckDBPyConnection
+
+from src.models import SQLQueryModel
+from src.prompts import SQL_PROMPT, USER_PROMPT
+
+logger = logging.getLogger(__name__)
+
+
+SQL_GENERATION_RETRIES = int(os.getenv("SQL_GENERATION_RETRIES", "5"))
+
+
+class SQLPipeline:
+    def __init__(
+        self,
+        duckdb: DuckDBPyConnection,
+        chain,
+    ) -> None:
+        self._duckdb = duckdb
+        self.chain = chain
+
+    def generate_sql(
+        self, user_question: str, context: str, errors: str | None = None
+    ) -> str | dict[str, str | int | float | None] | list[str] | None:
+        """Generate SQL + description."""
+        user_prompt_formatted = USER_PROMPT.format(
+            question=user_question, context=context
+        )
+        if errors:
+            user_prompt_formatted += (
+                " Carefully review the previous error or exception and rewrite the"
+                " SQL so that the error does not occur again. Try a different"
+                f" approach or rewrite the SQL if needed. Last error: {errors}"
+            )
+
+        sql = self.chain.run(
+            system_prompt=SQL_PROMPT,
+            user_prompt=user_prompt_formatted,
+            format_name="sql_query",
+            response_format=SQLQueryModel,
+        )
+        logger.info(f"SQL generated successfully: {sql}")
+        return sql
+
+    def run_query(self, sql_query: str) -> pd.DataFrame | None:
+        """Execute SQL and return a dataframe."""
+        logger.info("Query execution started.")
+        return self._duckdb.query(sql_query).df()
+
+    def try_sql_with_retries(
+        self,
+        user_question: str,
+        context: str,
+        max_retries: int = SQL_GENERATION_RETRIES,
+    ) -> tuple[
+        str | dict[str, str | int | float | None] | list[str] | None,
+        pd.DataFrame | None,
+    ]:
+        """Try SQL generation + execution with retries."""
+        last_error = None
+        all_errors = ""
+
+        # max_retries + 1 total attempts: the first attempt is normal and is
+        # not counted as a retry.
+        for attempt in range(1, max_retries + 2):
+            try:
+                if attempt > 1 and last_error:
+                    logger.info(f"Retrying: {attempt - 1}")
+                # Generate SQL, feeding back any accumulated errors so the
+                # model can correct itself.
+                sql = self.generate_sql(
+                    user_question, context, errors=all_errors or None
+                )
+                if not sql:
+                    return None, None
+
+                # Try executing the query.
+                sql_query_str = sql.get("sql_query") if isinstance(sql, dict) else sql
+                if not isinstance(sql_query_str, str):
+                    raise ValueError(
+                        f"Expected SQL query to be a string, got {type(sql_query_str).__name__}"
+                    )
+                query_df = self.run_query(sql_query_str)
+
+                # Stop retrying once execution succeeds and the result is non-empty.
+                if query_df is not None and not query_df.empty:
+                    return sql, query_df
+
+            except Exception as e:
+                last_error = f"\n[Attempt {attempt - 1}] {type(e).__name__}: {e}"
+                logger.error(f"Error during SQL generation or execution: {last_error}")
+                all_errors += last_error
+
+        logger.error(f"Failed after {max_retries} attempts. Last error: {all_errors}")
+        return None, None
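Because the retry loop feeds every accumulated error back into `generate_sql`, the pipeline can be exercised offline by swapping the chain for a stub. A sketch with an in-memory DuckDB connection (the stub class is hypothetical, not part of the app):

import duckdb

from src.pipelines import SQLPipeline


class StubChain:
    # Stands in for LLMChain: always "generates" the same query.
    def run(self, **kwargs):
        return {"sql_query": "SELECT id, name FROM users", "explanation": "All users."}


conn = duckdb.connect()  # in-memory database
conn.execute("CREATE TABLE users (id INTEGER, name TEXT)")
conn.execute("INSERT INTO users VALUES (1, 'ada'), (2, 'lin')")

pipe = SQLPipeline(duckdb=conn, chain=StubChain())
sql, df = pipe.try_sql_with_retries(
    user_question="List all users",
    context="CREATE TABLE users (id INTEGER, name TEXT)",
)
print(sql["sql_query"])
print(df)  # two rows; a failing query would have been retried with the error text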
src/prompts.py ADDED
@@ -0,0 +1,22 @@
+USER_PROMPT = """User's Text Question:
+{question}
+
+Provided table context information:
+{context}"""
+
+
+SQL_PROMPT = """You are an expert Text-to-SQL assistant. Convert the user's natural-language request into a single, read-only, syntactically valid DuckDB SQL SELECT statement that runs against the provided schema (the schema will be supplied as CREATE TABLE DDL). Use the exact table and column names from the schema.
+
+Return two things:
+1. The SQL statement.
+2. A short natural-language description (1-2 sentences) of what the query returns.
+
+Rules:
+1. Output MUST be a single SELECT query. JOINs, subqueries, aggregations, GROUP BY, ORDER BY, and LIMIT are allowed.
+2. Do NOT generate any DML/DDL (INSERT, UPDATE, DELETE, DROP, etc.) or non-read operations.
+3. Use DuckDB SQL functions and syntax. For date/time grouping, use DATE_TRUNC('unit', column) (e.g., 'month', 'day', 'year').
+4. Prefer explicit column lists. Use SELECT * only if the user explicitly requests all columns.
+5. Make the query robust and maintainable, so it can be reused or adapted for similar analyses.
+6. After execution in the downstream pipeline, if an error occurs (available as `Last Error` with a short description), analyze that error and rewrite the SQL to resolve it while preserving the user's intent. The rewritten query must still be valid DuckDB SQL.
+7. If the user requests a distribution/histogram, return SQL that selects a single numeric column only, so binning can be performed downstream.
+"""
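Rule 3's DATE_TRUNC convention, shown as a runnable DuckDB example of the shape of query the prompt asks for (table and column names are made up for illustration):

import duckdb

conn = duckdb.connect()  # in-memory database
conn.execute("CREATE TABLE orders (order_date DATE, amount DOUBLE)")
conn.execute("INSERT INTO orders VALUES ('2024-01-05', 10.0), ('2024-02-11', 7.5)")

# Read-only, explicit column list, DATE_TRUNC for monthly grouping --
# the style the system prompt instructs the model to produce.
monthly = conn.query("""
    SELECT DATE_TRUNC('month', order_date) AS month,
           SUM(amount) AS total_amount
    FROM orders
    GROUP BY month
    ORDER BY month
""").df()
print(monthly)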