Muhammad Mustehson committed on
Commit
1a436de
·
1 Parent(s): 9f8e201

Initial draft

Browse files
Files changed (16) hide show
  1. .gitignore +18 -0
  2. app.py +430 -52
  3. audits/.gitkeep +0 -0
  4. config.yaml +19 -0
  5. database/.gitkeep +0 -0
  6. logo.png +0 -0
  7. macros/.gitkeep +0 -0
  8. models/.gitkeep +0 -0
  9. pytest.ini +4 -0
  10. requirements.txt +280 -0
  11. seeds/.gitkeep +0 -0
  12. src/__init__.py +0 -0
  13. src/client.py +119 -0
  14. src/models.py +12 -0
  15. src/pipelines.py +128 -0
  16. src/prompts.py +71 -0
.gitignore ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+ .cache
12
+ .pytest_cache
13
+ .env
14
+ pyproject.toml
15
+ uv.lock
16
+
17
+
18
+ *.duckdb
app.py CHANGED
@@ -1,70 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
 
 
 
3
 
 
 
4
 
5
- def respond(
6
- message,
7
- history: list[dict[str, str]],
8
- system_message,
9
- max_tokens,
10
- temperature,
11
- top_p,
12
- hf_token: gr.OAuthToken,
13
- ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  """
15
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  """
17
- client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
18
 
19
- messages = [{"role": "system", "content": system_message}]
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- messages.extend(history)
22
 
23
- messages.append({"role": "user", "content": message})
 
 
 
24
 
25
- response = ""
26
 
27
- for message in client.chat_completion(
28
- messages,
29
- max_tokens=max_tokens,
30
- stream=True,
31
- temperature=temperature,
32
- top_p=top_p,
33
- ):
34
- choices = message.choices
35
- token = ""
36
- if len(choices) and choices[0].delta.content:
37
- token = choices[0].delta.content
 
 
 
 
38
 
39
- response += token
40
- yield response
41
 
 
 
 
 
 
 
 
 
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- chatbot = gr.ChatInterface(
47
- respond,
48
- type="messages",
49
- additional_inputs=[
50
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
51
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
52
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
53
- gr.Slider(
54
- minimum=0.1,
55
- maximum=1.0,
56
- value=0.95,
57
- step=0.05,
58
- label="Top-p (nucleus sampling)",
59
- ),
60
- ],
61
- )
62
 
63
- with gr.Blocks() as demo:
64
- with gr.Sidebar():
65
- gr.LoginButton()
66
- chatbot.render()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  if __name__ == "__main__":
70
- demo.launch()
 
1
+ import io
2
+ import logging
3
+ import os
4
+ import shutil
5
+ import sys
6
+ import tempfile
7
+ import uuid
8
+ from pathlib import Path
9
+ from typing import Tuple
10
+
11
+ import duckdb
12
  import gradio as gr
13
+ import pandas as pd
14
+ import pytest
15
+ import requests
16
+ from dotenv import load_dotenv
17
 
18
+ from src.client import LLMChain
19
+ from src.pipelines import Query2Schema
20
 
21
+ load_dotenv()
22
+
23
+
24
+ LEVEL = "INFO" if not os.getenv("ENV") == "PROD" else "WARNING"
25
+ logging.basicConfig(
26
+ level=getattr(logging, LEVEL, logging.INFO),
27
+ format="%(asctime)s %(levelname)s %(name)s: %(message)s",
28
+ )
29
+ logger = logging.getLogger(__name__)
30
+
31
+ if not Path("/tmp").exists():
32
+ os.mkdir("/tmp")
33
+
34
+
35
def create_conn(url: str, save_path: str):
    """Download a DuckDB database file from ``url`` and open a connection to it.

    Args:
        url: HTTP(S) location of the ``.duckdb`` file.
        save_path: Local path the database file is written to.

    Returns:
        An open ``duckdb`` connection to the downloaded file.

    Raises:
        Exception: Re-raises any download or connection error after logging.
    """
    try:
        # Stream the body so large database files are never held fully in
        # memory; the timeout keeps the app from hanging on a dead endpoint,
        # and the context manager guarantees the connection is released.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(save_path, "wb") as out_file:
                shutil.copyfileobj(response.raw, out_file)

        return duckdb.connect(database=save_path)
    except Exception as e:
        logger.error(f"Error downloading database: {e}")
        raise
47
+
48
+
49
# Download the sample Chinook database on first run; reconnect to the cached
# copy on later runs.  Previously `conn` was only assigned inside the
# not-exists branch, so a pre-existing database file caused a NameError at
# the `Query2Schema(duckdb=conn, ...)` line below.
DB_PATH = "database/chinook.duckdb"
if not Path(DB_PATH).exists():
    conn = create_conn(
        url="https://raw.githubusercontent.com/RandomFractals/duckdb-sql-tools/main/data/chinook/duckdb/chinook.duckdb",
        save_path=DB_PATH,
    )
else:
    conn = duckdb.connect(database=DB_PATH)

# Shared pipeline instance used by all Gradio callbacks below.
pipe = Query2Schema(duckdb=conn, chain=LLMChain())
56
+
57
+
58
def get_tables_names(schema_name):
    """Return every table name in the connected database.

    ``schema_name`` is accepted for dropdown-callback compatibility but is
    not currently used to filter the result — TODO confirm this is intended.
    """
    rows = conn.execute("SELECT table_name FROM information_schema.tables").fetchall()
    names = []
    for row in rows:
        names.append(row[0])
    return names
61
+
62
+
63
def update_table_names(schema_name):
    """Refresh the tables dropdown, defaulting to the first available table."""
    available = get_tables_names(schema_name)
    if available:
        default = available[0]
    else:
        default = None
    return gr.update(choices=available, value=default)
66
+
67
+
68
def update_column_names(table_name):
    """Return a one-column DataFrame listing the columns of ``table_name``.

    The table name is passed as a bound parameter instead of being
    interpolated into the SQL text, which closes the SQL-injection hole the
    previous f-string query opened through the dropdown value.
    """
    rows = conn.execute(
        "SELECT column_name FROM information_schema.columns WHERE table_name = ?",
        [table_name],
    ).fetchall()
    column_names = [row[0] for row in rows]
    return pd.DataFrame(column_names, columns=["Column Names"])
79
+
80
+
81
def get_ddl(table: str) -> str:
    """Return the CREATE TABLE DDL for ``table`` with a fully qualified name.

    The table name is bound as a query parameter (not f-string interpolated)
    to avoid SQL injection, and the table reference inside the emitted DDL is
    rewritten to ``database.schema.table`` so downstream SQLMesh models can
    resolve it unambiguously.
    """
    result = conn.execute(
        "SELECT sql, database_name, schema_name FROM duckdb_tables() WHERE table_name = ?",
        [table],
    ).df()
    ddl_create = result.iloc[0, 0]
    parent_database = result.iloc[0, 1]
    schema_name = result.iloc[0, 2]
    full_path = f"{parent_database}.{schema_name}.{table}"
    # DuckDB emits either "schema.table" or a bare "table" depending on the
    # schema, so pick the matching form before rewriting.
    if schema_name != "main":
        old_path = f"{schema_name}.{table}"
    else:
        old_path = table
    ddl_create = ddl_create.replace(old_path, full_path)
    return ddl_create
95
+
96
+
97
def run_pipeline(table: str, query_input: str) -> Tuple[str, pd.DataFrame]:
    """Generate SQL for a natural-language question and execute it.

    Returns the generated SQL string and the resulting DataFrame.
    Re-raises (after logging) when DDL lookup or SQL generation fails.
    """
    try:
        ddl_context = get_ddl(table=table)
    except Exception as e:
        logger.error(f"Failed to fetch DDL for table {table}: {e}")
        raise
    try:
        sql, df = pipe.try_sql_with_retries(
            user_question=query_input,
            context=ddl_context,
        )
        # The chain may hand back a structured dict; unwrap the query text.
        if isinstance(sql, dict):
            sql = sql.get("sql_query")
        if not sql:
            raise ValueError("SQL generation returned None")
        return sql, df
    except Exception as e:
        logger.error(f"Error generating SQL for table {table}: {e}")
        raise
115
+
116
+
117
def create_mesh_model(sql: str, db_name: str = "chinook") -> Tuple[str, str, str]:
    """Write a SQLMesh FULL model file wrapping ``sql``.

    Args:
        sql: SELECT statement produced by the text-to-SQL pipeline.
        db_name: Catalog the model is created in.

    Returns:
        (model file text, path of the written ``.sql`` file, fully qualified
        model name as ``catalog.model_name``).

    Raises:
        Exception: Re-raises any filesystem error after logging.
    """
    model_name = f"model_{uuid.uuid4().hex[:8]}"

    # Use catalog.model_name format so SQLMesh places the model correctly.
    full_model_name = f"{db_name}.{model_name}"

    MODEL_HEADER = f"""MODEL (
    name {full_model_name},
    kind FULL
);
"""
    try:
        model_dir = Path("models/")
        model_dir.mkdir(parents=True, exist_ok=True)

        model_path = model_dir / f"{model_name}.sql"
        # Strip the fully qualified prefix of the selected catalog.  This was
        # previously hard-coded to "chinook.main." which silently broke any
        # other db_name; SQLMesh resolves unqualified references itself.
        model_text = MODEL_HEADER + "\n" + sql.replace(f"{db_name}.main.", "")
        model_path.write_text(model_text)

        return model_text, str(model_path), full_model_name
    except Exception as e:
        logger.error(f"Error creating SQL Mesh model: {e}")
        raise
140
+
141
+
142
def create_pandera_schema(
    sql: str, user_instruction: str, model_name: str
) -> Tuple[str, str]:
    """Generate a Pandera schema for ``sql`` and write a standalone pytest file.

    The generated file is assembled from four parts: an import header, the
    LLM-produced Pandera ``DataFrameModel`` class, SQLMesh fixtures plus a
    backfill smoke test, and a ``test_schema`` test that validates the model
    output against the schema.

    Args:
        sql: SQL query the schema should validate.
        user_instruction: Natural-language data-quality requirements.
        model_name: Fully qualified SQLMesh model name to test.

    Returns:
        (schema source text, path of the generated test file).

    Raises:
        Exception: Re-raises any generation/IO error after logging.
    """
    # Top-of-file imports for the generated test module; kept unindented so
    # they are valid at module level in the emitted file.
    SCRIPT_HEADER = """
import pandas as pd
import pandera.pandas as pa
from pandera.typing import *

import pytest
from sqlmesh import Context
from datetime import date
from pathlib import Path
import shutil
import duckdb

"""

    # Session-scoped SQLMesh context plus a backfill smoke test that plans,
    # runs, and samples the model for today's date.
    MESH_STR = f"""
@pytest.fixture(scope="session")
def mesh_context():

    context = Context(paths=".", gateway="duckdb")
    yield context

@pytest.fixture
def today_str():
    return date.today().isoformat()

def test_back_fill(mesh_context, today_str):
    mesh_context.plan(skip_backfill=False, auto_apply=True)
    mesh_context.run(start=today_str, end=today_str)

    df = mesh_context.fetchdf("SELECT * FROM {model_name} LIMIT 10")
    assert not df.empty
"""
    try:
        schema = pipe.generate_pandera_schema(
            sql_query=sql, user_instruction=user_instruction
        )
        # The class-name extraction below assumes the generated source starts
        # with "class <Name>(...)" — the second whitespace token up to the
        # first "(" is the class name.  TODO confirm the LLM output always
        # matches this shape.
        test_schema = f"""

def test_schema(mesh_context, today_str):
    df = mesh_context.evaluate(
        "{model_name}",
        start=today_str,
        end=today_str,
        execution_time=today_str,
    )
    {schema.split()[1].split("(")[0].strip()}.validate(df)
"""
        print(schema)

        # delete=False: pytest must reopen the file after this block closes;
        # run_tests() is responsible for removing it afterwards.
        with tempfile.NamedTemporaryFile(
            mode="w",
            prefix="test_",
            suffix=".py",
            delete=False,
            encoding="utf-8",
        ) as f:
            f.write(SCRIPT_HEADER)
            f.write("\n\n")
            f.write(schema)
            f.write("\n\n")
            f.write(MESH_STR)
            f.write("\n\n")
            f.write(test_schema)

        file_path = Path(f.name)

        return schema, str(file_path)
    except Exception as e:
        logger.error(f"Error creating Pandera schema: {e}")
        raise
215
 
 
216
 
217
def create_test_file(
    table_name: str, db_name: str, sql_instruction: str, user_instruction: str
) -> Tuple[str, str, pd.DataFrame, str, str]:
    """Run the full generation chain: SQL -> SQLMesh model -> Pandera test file.

    Returns (test file path, model file path, result DataFrame, model text,
    schema text); re-raises after logging on any failure.
    """
    try:
        generated_sql, result_df = run_pipeline(
            table=table_name, query_input=sql_instruction
        )
        model_text, model_file, model_name = create_mesh_model(
            sql=generated_sql, db_name=db_name
        )
        schema, test_file = create_pandera_schema(
            sql=generated_sql,
            user_instruction=user_instruction,
            model_name=model_name,
        )
    except Exception as e:
        logger.error(f"Error creating test file for table {table_name}: {e}")
        raise
    return test_file, model_file, result_df, model_text, schema
232
 
 
 
233
 
234
def run_tests(
    table_name: str, db_name: str, sql_instruction: str, user_instruction: str
):
    """End-to-end Gradio callback: generate SQL, build a SQLMesh model and a
    Pandera schema, run the generated pytest file, and return the results.

    Returns:
        (pytest output text, result DataFrame, model file text, schema text)
        — always a 4-tuple so it matches the four Gradio output components.
        (The previous error path returned only 2 values, which broke the UI.)
    """
    test_file, model_file, df, model_text, schema = create_test_file(
        table_name=table_name,
        db_name=db_name,
        sql_instruction=sql_instruction,
        user_instruction=user_instruction,
    )

    capture_out = io.StringIO()
    capture_err = io.StringIO()

    old_out = sys.stdout
    old_err = sys.stderr

    sys.stdout = capture_out
    sys.stderr = capture_err

    error_message = None
    try:
        # Exit code is intentionally ignored: the captured report text is
        # what the UI displays, pass or fail.
        pytest.main(
            [
                test_file,
                "-s",
                "--tb=short",
                "--disable-warnings",
                "-o",
                "cache_dir=/tmp",
            ]
        )
    except Exception as e:
        error_message = f"Error running tests: {str(e)}"
    finally:
        # Always restore the real streams, even if pytest itself blows up,
        # and always remove the generated files so reruns don't accumulate.
        sys.stdout = old_out
        sys.stderr = old_err
        for generated in [test_file, model_file]:
            try:
                os.remove(generated)
            except FileNotFoundError:
                pass

    if error_message is not None:
        return error_message, df, model_text, schema

    output = capture_out.getvalue() + "\n" + capture_err.getvalue()

    return output, df, model_text, schema
281
+
282
+
283
# Custom CSS applied to the Gradio app: page background, logo sizing,
# fixed-size buttons, and a scrollable monospace log box (#logs).
custom_css = """
/* --- Overall container --- */
.gradio-container {
    background-color: #f0f4f8; /* light background */
    font-family: 'Arial', sans-serif;
}

/* --- Logo --- */
.logo {
    max-width: 200px;
    margin: 20px auto;
    display: block;
}

/* --- Buttons --- */
.gr-button {
    background-color: #4a90e2 !important; /* primary color */
    font-size: 14px; /* fixed font size */
    padding: 6px 12px !important; /* fixed padding */
    height: 36px !important; /* fixed height */
    min-width: 120px !important; /* fixed width */
}
.gr-button:hover {
    background-color: #3a7bc8 !important;
}

/* --- Logs Textbox --- */
#logs textarea {
    overflow-y: scroll;
    resize: none;
    height: 400px;
    width: 100%;
    font-family: monospace;
    font-size: 13px;
    line-height: 1.4;
}

/* Optional: small spacing between rows */
.gr-row {
    gap: 10px;
}
"""

# UI layout: inputs (schema/table pickers + instructions) on the left,
# tabbed results (logs / model / schema / data) below.
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"), css=custom_css
) as demo:
    gr.Image("logo.png", label=None, show_label=False, container=False, height=100)

    gr.Markdown(
        """
<div style='text-align: center;'>
<strong style='font-size: 36px;'>SQL Test Suite</strong>
<br>
<span style='font-size: 20px;'>Automated testing and schema validation for SQL models with LLM.</span>
</div>
"""
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Schema/table pickers; the column list is shown read-only.
            schema_dropdown = gr.Dropdown(
                choices=["chinook", "northwind"],
                value="chinook",
                label="Select Schema",
                interactive=True,
            )
            tables_dropdown = gr.Dropdown(
                choices=[], label="Available Tables", value=None, interactive=True
            )
            # columns_dropdown = gr.Dropdown(choices=[], label="Available Columns", value=None, interactive=True)
            columns_df = gr.DataFrame(label="Columns", value=[], interactive=False)
            # with gr.Row():
            # generate_result = gr.Button("Run Tests", variant="primary")

        with gr.Column(scale=3):
            with gr.Row():
                sql_instruction = gr.Textbox(
                    lines=3,
                    label="Business Metric Query (Plain English)",
                    placeholder=(
                        "Describe the business question you want to answer.\n"
                        "Example: 'Show me the average sales per month.'\n"
                        "Example: 'Total revenue by product category for last year.'"
                    ),
                )
            with gr.Row():
                user_instruction = gr.Textbox(
                    lines=5,
                    label="Define Data Quality Level",
                    placeholder=(
                        "Describe the validation rule and how strict it should be.\n"
                        "Example: Validate that the incident_zip column contains valid 5-digit ZIP codes.\n"
                    ),
                )
            with gr.Row():
                # The empty wide column pushes the button to the right edge.
                with gr.Column(scale=7):
                    pass
                with gr.Column(scale=1):
                    run_tests_btn = gr.Button("▶ Run Tests", variant="primary")

    with gr.Row():
        with gr.Column():
            with gr.Tabs():
                with gr.Tab("Test Logs"):
                    with gr.Row():
                        with gr.Column():
                            test_logs = gr.Textbox(
                                label="Test Logs",
                                lines=20,
                                max_lines=20,
                                interactive=False,
                                elem_id="logs",
                            )

                with gr.Tab("SQL Model"):
                    with gr.Row():
                        with gr.Column():
                            sql_model = gr.Textbox(
                                label="SQL Model",
                                lines=20,
                                max_lines=20,
                                interactive=False,
                                elem_id="sql_model",
                            )

                with gr.Tab("Schema"):
                    with gr.Row():
                        with gr.Column():
                            result_schema = gr.Textbox(
                                label="Validation Schema",
                                lines=20,
                                max_lines=20,
                                interactive=False,
                            )

                with gr.Tab("Data"):
                    with gr.Row():
                        with gr.Column():
                            result_data = gr.DataFrame(
                                label="Query Result",
                                value=[],
                                interactive=False,
                            )

    # Event wiring: changing the schema refreshes the tables dropdown,
    # choosing a table refreshes the column list, and the button runs the
    # full generate-model-test pipeline.
    schema_dropdown.change(
        update_table_names, inputs=schema_dropdown, outputs=tables_dropdown
    )
    tables_dropdown.change(
        update_column_names, inputs=tables_dropdown, outputs=columns_df
    )
    run_tests_btn.click(
        run_tests,
        inputs=[
            tables_dropdown,
            schema_dropdown,
            sql_instruction,
            user_instruction,
        ],
        outputs=[test_logs, result_data, sql_model, result_schema],
    )
    # Populate the tables dropdown on initial page load.
    demo.load(
        fn=update_table_names, inputs=schema_dropdown, outputs=tables_dropdown
    )

if __name__ == "__main__":
    demo.launch(debug=True)
audits/.gitkeep ADDED
File without changes
config.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gateways:
2
+ duckdb:
3
+ connection:
4
+ type: duckdb
5
+ catalogs:
6
+ local: 'database/chinook.duckdb'
7
+
8
+ default_gateway: duckdb
9
+
10
+ cache_dir: /tmp
11
+ model_defaults:
12
+ dialect: duckdb
13
+
14
+ linter:
15
+ enabled: true
16
+ rules:
17
+ - ambiguousorinvalidcolumn
18
+ - invalidselectstarexpansion
19
+ - noambiguousprojections
database/.gitkeep ADDED
File without changes
logo.png ADDED
macros/.gitkeep ADDED
File without changes
models/.gitkeep ADDED
File without changes
pytest.ini ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ [pytest]
2
+ filterwarnings =
3
+ ignore::DeprecationWarning
4
+ ignore::PendingDeprecationWarning
requirements.txt ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file was autogenerated by uv via the following command:
2
+ # uv pip compile pyproject.toml -o requirements.txt
3
+ aiofiles==24.1.0
4
+ # via gradio
5
+ annotated-doc==0.0.4
6
+ # via fastapi
7
+ annotated-types==0.7.0
8
+ # via pydantic
9
+ anyio==4.12.1
10
+ # via
11
+ # gradio
12
+ # httpx
13
+ # starlette
14
+ astor==0.8.1
15
+ # via sqlmesh
16
+ asttokens==3.0.1
17
+ # via stack-data
18
+ audioop-lts==0.2.2
19
+ # via gradio
20
+ brotli==1.2.0
21
+ # via gradio
22
+ certifi==2026.1.4
23
+ # via
24
+ # httpcore
25
+ # httpx
26
+ # requests
27
+ charset-normalizer==3.4.4
28
+ # via requests
29
+ click==8.3.1
30
+ # via
31
+ # sqlmesh
32
+ # typer
33
+ # typer-slim
34
+ # uvicorn
35
+ colorama==0.4.6
36
+ # via
37
+ # click
38
+ # ipython
39
+ # pytest
40
+ # tqdm
41
+ comm==0.2.3
42
+ # via ipywidgets
43
+ croniter==6.0.0
44
+ # via sqlmesh
45
+ dateparser==1.2.1
46
+ # via sqlmesh
47
+ decorator==5.2.1
48
+ # via ipython
49
+ duckdb==1.4.3
50
+ # via sqlmesh
51
+ executing==2.2.1
52
+ # via stack-data
53
+ fastapi==0.128.0
54
+ # via gradio
55
+ ffmpy==1.0.0
56
+ # via gradio
57
+ filelock==3.20.3
58
+ # via huggingface-hub
59
+ fsspec==2026.1.0
60
+ # via
61
+ # gradio-client
62
+ # huggingface-hub
63
+ gradio==6.3.0
64
+ # via data-test-demo (pyproject.toml)
65
+ gradio-client==2.0.3
66
+ # via gradio
67
+ groovy==0.1.2
68
+ # via gradio
69
+ h11==0.16.0
70
+ # via
71
+ # httpcore
72
+ # uvicorn
73
+ hf-xet==1.2.0
74
+ # via huggingface-hub
75
+ httpcore==1.0.9
76
+ # via httpx
77
+ httpx==0.28.1
78
+ # via
79
+ # gradio
80
+ # gradio-client
81
+ # huggingface-hub
82
+ # safehttpx
83
+ huggingface-hub==1.3.2
84
+ # via
85
+ # gradio
86
+ # gradio-client
87
+ humanize==4.15.0
88
+ # via sqlmesh
89
+ hyperscript==0.3.0
90
+ # via sqlmesh
91
+ idna==3.11
92
+ # via
93
+ # anyio
94
+ # httpx
95
+ # requests
96
+ iniconfig==2.3.0
97
+ # via pytest
98
+ ipython==9.9.0
99
+ # via ipywidgets
100
+ ipython-pygments-lexers==1.1.1
101
+ # via ipython
102
+ ipywidgets==8.1.8
103
+ # via
104
+ # rich
105
+ # sqlmesh
106
+ jedi==0.19.2
107
+ # via ipython
108
+ jinja2==3.1.6
109
+ # via
110
+ # gradio
111
+ # sqlmesh
112
+ json-stream==2.4.1
113
+ # via sqlmesh
114
+ json-stream-rs-tokenizer==0.5.0
115
+ # via json-stream
116
+ jupyterlab-widgets==3.0.16
117
+ # via ipywidgets
118
+ markdown-it-py==4.0.0
119
+ # via rich
120
+ markupsafe==3.0.3
121
+ # via
122
+ # gradio
123
+ # jinja2
124
+ matplotlib-inline==0.2.1
125
+ # via ipython
126
+ mdurl==0.1.2
127
+ # via markdown-it-py
128
+ mypy-extensions==1.1.0
129
+ # via typing-inspect
130
+ numpy==2.4.1
131
+ # via
132
+ # gradio
133
+ # pandas
134
+ orjson==3.11.5
135
+ # via gradio
136
+ packaging==25.0
137
+ # via
138
+ # gradio
139
+ # gradio-client
140
+ # huggingface-hub
141
+ # pandera
142
+ # pytest
143
+ # sqlmesh
144
+ pandas==2.3.3
145
+ # via
146
+ # gradio
147
+ # sqlmesh
148
+ pandera==0.28.1
149
+ # via data-test-demo (pyproject.toml)
150
+ parso==0.8.5
151
+ # via jedi
152
+ pillow==12.1.0
153
+ # via gradio
154
+ pluggy==1.6.0
155
+ # via pytest
156
+ prompt-toolkit==3.0.52
157
+ # via ipython
158
+ pure-eval==0.2.3
159
+ # via stack-data
160
+ pydantic==2.12.5
161
+ # via
162
+ # fastapi
163
+ # gradio
164
+ # pandera
165
+ # sqlmesh
166
+ pydantic-core==2.41.5
167
+ # via pydantic
168
+ pydub==0.25.1
169
+ # via gradio
170
+ pygments==2.19.2
171
+ # via
172
+ # ipython
173
+ # ipython-pygments-lexers
174
+ # pytest
175
+ # rich
176
+ pymysql==1.1.2
177
+ # via sqlmesh
178
+ pytest==9.0.2
179
+ # via data-test-demo (pyproject.toml)
180
+ python-dateutil==2.9.0.post0
181
+ # via
182
+ # croniter
183
+ # dateparser
184
+ # pandas
185
+ python-dotenv==1.2.1
186
+ # via sqlmesh
187
+ python-multipart==0.0.21
188
+ # via gradio
189
+ pytz==2025.2
190
+ # via
191
+ # croniter
192
+ # dateparser
193
+ # pandas
194
+ pyyaml==6.0.3
195
+ # via
196
+ # gradio
197
+ # huggingface-hub
198
+ regex==2026.1.15
199
+ # via dateparser
200
+ requests==2.32.5
201
+ # via sqlmesh
202
+ rich==14.2.0
203
+ # via
204
+ # sqlmesh
205
+ # typer
206
+ ruamel-yaml==0.19.1
207
+ # via sqlmesh
208
+ safehttpx==0.1.7
209
+ # via gradio
210
+ semantic-version==2.10.0
211
+ # via gradio
212
+ shellingham==1.5.4
213
+ # via
214
+ # huggingface-hub
215
+ # typer
216
+ six==1.17.0
217
+ # via python-dateutil
218
+ sqlglot==27.28.1
219
+ # via sqlmesh
220
+ sqlglotrs==0.7.3
221
+ # via sqlglot
222
+ sqlmesh==0.228.4
223
+ # via data-test-demo (pyproject.toml)
224
+ stack-data==0.6.3
225
+ # via ipython
226
+ starlette==0.50.0
227
+ # via
228
+ # fastapi
229
+ # gradio
230
+ tenacity==9.1.2
231
+ # via sqlmesh
232
+ time-machine==3.2.0
233
+ # via sqlmesh
234
+ tomlkit==0.13.3
235
+ # via gradio
236
+ tqdm==4.67.1
237
+ # via huggingface-hub
238
+ traitlets==5.14.3
239
+ # via
240
+ # ipython
241
+ # ipywidgets
242
+ # matplotlib-inline
243
+ typeguard==4.4.4
244
+ # via pandera
245
+ typer==0.21.1
246
+ # via gradio
247
+ typer-slim==0.21.1
248
+ # via huggingface-hub
249
+ typing-extensions==4.15.0
250
+ # via
251
+ # fastapi
252
+ # gradio
253
+ # gradio-client
254
+ # huggingface-hub
255
+ # pandera
256
+ # pydantic
257
+ # pydantic-core
258
+ # typeguard
259
+ # typer
260
+ # typer-slim
261
+ # typing-inspect
262
+ # typing-inspection
263
+ typing-inspect==0.9.0
264
+ # via pandera
265
+ typing-inspection==0.4.2
266
+ # via pydantic
267
+ tzdata==2025.3
268
+ # via
269
+ # pandas
270
+ # tzlocal
271
+ tzlocal==5.3.1
272
+ # via dateparser
273
+ urllib3==2.6.3
274
+ # via requests
275
+ uvicorn==0.40.0
276
+ # via gradio
277
+ wcwidth==0.2.14
278
+ # via prompt-toolkit
279
+ widgetsnbextension==4.0.15
280
+ # via ipywidgets
seeds/.gitkeep ADDED
File without changes
src/__init__.py ADDED
File without changes
src/client.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+
5
+ from dotenv import load_dotenv
6
+ from huggingface_hub import InferenceClient
7
+ from pydantic import BaseModel
8
+
9
+ load_dotenv()
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
# Generation limits applied to every chat completion.
MAX_RESPONSE_TOKENS = 2048
TEMPERATURE = 0.9

# Candidate models/providers come from JSON arrays in the environment, e.g.
# MODEL_NAMES='["org/model-a", "org/model-b"]'.  Fail fast with a clear
# message instead of the opaque TypeError that json.loads(None) raised
# before when a variable was unset.
_models_env = os.getenv("MODEL_NAMES")
_providers_env = os.getenv("PROVIDERS")
if _models_env is None or _providers_env is None:
    raise RuntimeError(
        "MODEL_NAMES and PROVIDERS environment variables must be set to JSON lists."
    )
models = json.loads(_models_env)
providers = json.loads(_providers_env)
18
+
19
+
20
def _engine_working(engine: InferenceClient) -> bool:
    """Probe the inference endpoint with a 1-token request; True on success."""
    probe = [{"role": "user", "content": "ping"}]
    try:
        engine.chat_completion(probe, max_tokens=1)
    except Exception as e:
        logger.exception(f"Engine is not working: {e}")
        return False
    logger.info("Engine is Working.")
    return True
28
+
29
+
30
def _load_llm_client() -> InferenceClient:
    """Return a client for the first (model, provider) pair whose endpoint responds.

    Walks the configured model/provider grid in order, skipping non-string
    model entries, and returns an ``InferenceClient`` for the first working
    combination.  Raises ``Exception`` listing the collected errors when no
    combination is available.
    """
    logger.warning("Loading Model...")
    errors = []
    for model in models:
        for provider in providers:
            if not isinstance(model, str):
                continue
            try:
                logger.info(f"Checking model: {model} provider: {provider}")
                candidate = InferenceClient(
                    model=model,
                    timeout=15,
                    provider=provider,
                )
                if _engine_working(candidate):
                    logger.info(
                        f"The model is loaded : {model} , provider: {provider}"
                    )
                    return candidate
            except Exception as e:
                logger.error(
                    f"Error loading model {model} provider {provider}: {e}"
                )
                errors.append(str(e))
    raise Exception(f"Unable to load any provided model: {errors}.")
60
+
61
+
62
# Eagerly resolve a working client at import time; a failure here aborts
# startup rather than surfacing mid-request.
_default_client = _load_llm_client()
63
+
64
+
65
class LLMChain:
    """Thin wrapper around ``InferenceClient.chat_completion`` with optional
    JSON-schema-constrained output and cumulative token accounting."""

    def __init__(self, client: InferenceClient = _default_client):
        # Note: the default client is resolved once at module import time.
        self.client = client
        # Running total of tokens consumed across all run() calls.
        self.total_tokens = 0

    def run(
        self,
        system_prompt: str | None = None,
        user_prompt: str | None = None,
        messages: list[dict] | None = None,
        format_name: str | None = None,
        response_format: type[BaseModel] | None = None,
    ) -> str | dict[str, str | int | float | None] | list[str] | None:
        """Execute one chat completion.

        Either both ``system_prompt`` and ``user_prompt`` or a prebuilt
        ``messages`` list must be supplied.  When ``format_name`` and
        ``response_format`` (a pydantic model) are given, the completion is
        constrained to that JSON schema and the parsed field value(s) are
        returned; otherwise the raw text content is returned.  Returns None
        on any error (errors are logged, not raised).
        """
        try:
            if system_prompt and user_prompt:
                # Explicit prompts take precedence over any messages arg.
                messages = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ]
            elif not messages:
                raise ValueError(
                    "Either system_prompt and user_prompt or messages must be provided."
                )

            llm_response = self.client.chat_completion(
                messages=messages,
                max_tokens=MAX_RESPONSE_TOKENS,
                temperature=TEMPERATURE,
                # Constrain output to the pydantic model's JSON schema only
                # when both a name and a model are supplied.
                response_format=(
                    {
                        "type": "json_schema",
                        "json_schema": {
                            "name": format_name,
                            "schema": response_format.model_json_schema(),
                            "strict": True,
                        },
                    }
                    if format_name and response_format
                    else None
                ),
            )
            self.total_tokens += llm_response.usage.total_tokens
            analysis = llm_response.choices[0].message.content
            if response_format:
                analysis = json.loads(analysis)
                fields = list(response_format.model_fields.keys())
                # Single-field models are unwrapped to the bare value;
                # multi-field models come back as a dict.
                if len(fields) == 1:
                    return analysis.get(fields[0])
                return {field: analysis.get(field) for field in fields}

            return analysis

        except Exception as e:
            # Deliberate best-effort contract: callers must handle None.
            logger.error(f"Error during LLM calls: {e}")
            return None
src/models.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+
3
+
4
class SQLQueryModel(BaseModel):
    """Structured LLM output: a SQL query plus a short explanation."""

    sql_query: str = Field(..., description="SQL query to execute.")
    explanation: str = Field(..., description="Short explanation of the SQL query.")
7
+
8
+
9
class PanderaSchemaModel(BaseModel):
    """Structured LLM output: Pandera schema source code for validating data."""

    schema_name: str = Field(
        ..., description="Only Pandera schema to validate the data."
    )
src/pipelines.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import logging
3
+ import os
4
+
5
+ import pandas as pd
6
+ from dotenv import load_dotenv
7
+ from duckdb import DuckDBPyConnection
8
+
9
+ from src.models import PanderaSchemaModel, SQLQueryModel
10
+
11
load_dotenv()

logger = logging.getLogger(__name__)


# Number of extra attempts after the first SQL generation (env-overridable).
SQL_GENERATION_RETRIES = int(os.getenv("SQL_GENERATION_RETRIES", "5"))
# Prompt templates are injected via environment variables (.env); each is
# None when unset — TODO confirm callers always run with a populated .env,
# since .format()/chain calls on None would fail at request time.
PANDERA_PROMPT = os.getenv("PANDERA_PROMPT")
PANDERA_USER_PROMPT = os.getenv("PANDERA_USER_PROMPT")
SQL_PROMPT = os.getenv("SQL_PROMPT")
USER_PROMPT = os.getenv("USER_PROMPT")
21
+
22
+
23
class Query2Schema:
    """Text-to-SQL + validation-schema pipeline backed by DuckDB and an LLM chain."""

    def __init__(
        self,
        duckdb: DuckDBPyConnection,
        chain,
    ) -> None:
        # chain must expose run(system_prompt=, user_prompt=, format_name=,
        # response_format=) — see LLMChain in src/client.py.
        self._duckdb = duckdb
        self.chain = chain

    def generate_sql(
        self, user_question: str, context: str, errors: str | None = None
    ) -> str | dict[str, str | int | float | None] | list[str] | None:
        """Generate SQL + description.

        When ``errors`` is supplied (retry path), the accumulated error text
        is appended to the prompt so the model can correct its last attempt.
        Returns whatever the chain produced (dict/str) or None on failure.
        """
        user_prompt_formatted = USER_PROMPT.format(
            question=user_question, context=context
        )
        if errors:
            user_prompt_formatted += f"Carefully review the previous error or\
            exception and rewrite the SQL so that the error does not occur again.\
            Try a different approach or rewrite SQL if needed. Last error: {errors}"

        sql = self.chain.run(
            system_prompt=SQL_PROMPT,
            user_prompt=user_prompt_formatted,
            format_name="sql_query",
            response_format=SQLQueryModel,
        )
        logger.info(f"SQL Generated Successfully: {sql}")
        return sql

    def run_query(self, sql_query: str) -> pd.DataFrame | None:
        """Execute SQL and return the result as a DataFrame."""
        logger.info("Query Execution Started.")
        return self._duckdb.query(sql_query).df()

    def try_sql_with_retries(
        self,
        user_question: str,
        context: str,
        max_retries: int = SQL_GENERATION_RETRIES,
    ) -> tuple[
        str | dict[str, str | int | float | None] | list[str] | None,
        pd.DataFrame | None,
    ]:
        """Try SQL generation + execution with retries.

        The first attempt is not counted as a retry, hence the ``+ 2`` bound.
        Returns (sql, dataframe) on the first attempt that executes and
        yields a non-empty result, or (None, None) when generation fails or
        every attempt errors out.
        """
        last_error = None
        all_errors = ""

        for attempt in range(1, max_retries + 2):
            try:
                # The two branches previously duplicated the generate/check
                # logic; only the error feedback differs between them.
                if attempt > 1 and last_error:
                    logger.info(f"Retrying: {attempt - 1}")
                    sql = self.generate_sql(user_question, context, errors=all_errors)
                else:
                    sql = self.generate_sql(user_question, context)
                if not sql:
                    return None, None

                # Try executing query
                sql_query_str = sql.get("sql_query") if isinstance(sql, dict) else sql
                if not isinstance(sql_query_str, str):
                    raise ValueError(
                        f"Expected SQL query to be a string, got {type(sql_query_str).__name__}"
                    )
                query_df = self.run_query(sql_query_str)

                # Stop retrying as soon as execution succeeds with data.
                if query_df is not None and not query_df.empty:
                    return sql, query_df

            except Exception as e:
                # Attempt 0 is the initial (non-retry) pass.  Bracket typo
                # fixed: the message previously opened with no "[".
                last_error = f"\n[Attempt {attempt - 1}] {type(e).__name__}: {e}"
                logger.error(f"Error during SQL generation or execution: {last_error}")
                all_errors += last_error

        logger.error(f"Failed after {max_retries} attempts. Last error: {all_errors}")
        return None, None

    def generate_pandera_schema(self, sql_query: str, user_instruction: str) -> str:
        """Generate Pandera schema source for ``sql_query``.

        Only the class definitions are kept from the chain output (any
        surrounding prose/imports the LLM emits are stripped via the AST).
        """
        class_lines = []

        schema_str = self.chain.run(
            system_prompt=PANDERA_PROMPT,
            user_prompt=PANDERA_USER_PROMPT.format(
                sql_query=sql_query, instructions=user_instruction
            ),
            format_name="pandera_schema",
            response_format=PanderaSchemaModel,
        )

        # chain.run returns None on failure; ast.parse(None) would raise an
        # opaque TypeError, so fail with an explicit message instead.
        if not isinstance(schema_str, str):
            raise ValueError("Pandera schema generation returned no schema text.")

        parsed = ast.parse(schema_str)

        original_lines = schema_str.splitlines()
        for node in parsed.body:
            if isinstance(node, ast.ClassDef):
                start, end = node.lineno - 1, node.end_lineno
                class_lines.extend(original_lines[start:end])

        return "\n".join(class_lines)
src/prompts.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# User-turn template for the Text-to-SQL chain: carries the natural-language
# question plus the table DDL/context gathered upstream. Placeholders are
# filled via str.format(question=..., context=...).
USER_PROMPT = """User's Text Question:
{question}

Provided table context information:
{context}"""
6
+
7
+
8
# System prompt for SQL generation: constrains the model to a single read-only
# DuckDB SELECT and tells it how to react to a "Last Error" fed back in by the
# retry loop in src/pipelines.py.
SQL_PROMPT = """You are an expert Text-to-SQL assistant. Convert the user's natural-language request into a single, read-only, syntactically valid DuckDB SQL SELECT statement that runs against the provided schema (the schema will be supplied as CREATE TABLE DDL). Use the exact table and column names from the schema.

Return two things:
1. The SQL statement.
2. A short natural-language description (1-2 sentences) of what the query returns.

Rules:
1. Output MUST be a single SELECT query. JOINs, subqueries, aggregations, GROUP BY, ORDER BY, and LIMIT are allowed.
2. Do NOT generate any DML/DDL (INSERT, UPDATE, DELETE, DROP, etc.) or non-read operations.
3. Use DuckDB SQL functions and syntax. For date/time grouping, use DATE_TRUNC('unit', column) (e.g., 'month', 'day', 'year').
4. Prefer explicit column lists. Use SELECT * only if the user explicitly requests all columns.
5. Make the query robust and maintainable, so it can be reused or adapted for similar analyses.
6. After execution in the downstream pipeline, if an error occurs (available as `Last Error` with a short description), analyze that error and rewrite the SQL to resolve it while preserving the user's intent. The rewritten query must still be valid DuckDB SQL.
"""
22
+
23
# System prompt for schema generation: asks the model for a single Pandera
# class (bare Python, no prose) so generate_pandera_schema() can ast-parse the
# reply and extract the ClassDef. The embedded example doubles as the expected
# output shape.
PANDERA_PROMPT = """You are provided with a SQL query which is used to fetch data from a database. Your task is to generate a valid Pandera SchemaModel class that can be used to validate the resulting data from the query.
The generated schema should be **general and simple**, not overly complex. Only validate basic aspects like column types, nullability, and simple value constraints (like positive integers, string patterns, or ranges) since you only have the SQL query and the resulting column names/types.

Follow these guidelines:
1. **Use Pandera SchemaModel**:
   - Each column should have a type hint using `Series[Type]`.
   - Use `pa.Field` to define simple validations.
2. **Validation rules should be simple and reasonable**:
   - `nullable` for optional columns
   - `unique` for IDs if obvious
   - `gt`/`ge`/`lt`/`le` for numeric ranges if reasonable
   - `str_matches`, `str_length` or `str_contains` for string patterns (like ZIP codes or emails)
   - Avoid complex cross-column or statistical checks
3. **Add Config class**:
   - Set `coerce = True` to cast data types automatically
4. **Add optional metadata**:
   - Include `description` for columns if possible
   - Include `title` for columns if it helps
5. **Output only valid Python code**:
   - The output should be a **single Python class definition**.
   - Do not include any explanations, comments, or extra text.
6. **Example Output**:


import pandas as pd
import pandera as pa
from pandera.typing import Series

class CustomerSchema(pa.DataFrameModel):
    customer_id: Series[int] = pa.Field(gt=0, unique=True, nullable=False, description="Unique customer identifier")
    first_name: Series[str] = pa.Field(nullable=False, str_length=(1, 50), description="Customer first name")
    last_name: Series[str] = pa.Field(nullable=False, str_length=(1, 50), description="Customer last name")
    email: Series[str] = pa.Field(nullable=False, str_matches=r"^[\\w\\.-]+@[\\w\\.-]+\\.\\w+$", description="Customer email address")
    age: Series[int] = pa.Field(ge=0, le=120, nullable=True, description="Customer age in years")

    class Config:
        coerce = True

Additional notes:
If the SQL query uses JOIN, only include columns that appear in the SELECT statement.
You may infer basic constraints from column names (e.g., columns ending with _id are likely unique integers).
Avoid domain-specific logic unless it is obvious from the column names or SQL query.
Keep the schema robust but simple, suitable for automated ETL validation."""
66
+
67
# User-turn template for schema generation: supplies the SQL that will produce
# the DataFrame plus the user's extra validation instructions. Filled via
# str.format(sql_query=..., instructions=...).
PANDERA_USER_PROMPT = """SQL Query:
{sql_query}

User Instructions:
{instructions}"""