Spaces:
Running
Running
Commit ·
a62955e
1
Parent(s): 7c2121a
added column smartness
Browse files- ai/signatures.py +96 -42
ai/signatures.py
CHANGED
|
@@ -10,33 +10,79 @@ Consolidated from 8 signatures down to 4 to minimize LLM round-trips:
|
|
| 10 |
import dspy
|
| 11 |
|
| 12 |
|
| 13 |
-
# ── 1. Analyze & Plan
|
| 14 |
|
| 15 |
class AnalyzeAndPlan(dspy.Signature):
|
| 16 |
"""You are an expert SQL analyst with strong business intelligence skills.
|
| 17 |
Given a user question, a database schema, and a DATA PROFILE showing actual
|
| 18 |
values in the database, analyze the question and produce a detailed query plan.
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
Steps:
|
| 35 |
-
1.
|
| 36 |
-
2.
|
| 37 |
-
3. Identify
|
| 38 |
-
4.
|
| 39 |
-
5. Produce
|
| 40 |
|
| 41 |
question = dspy.InputField(desc="The user's natural-language question")
|
| 42 |
schema_info = dspy.InputField(desc="Full database schema with table names, columns, and types")
|
|
@@ -44,38 +90,44 @@ class AnalyzeAndPlan(dspy.Signature):
|
|
| 44 |
data_profile = dspy.InputField(desc="Data profile showing actual values: distinct categorical values, numeric ranges, date ranges")
|
| 45 |
|
| 46 |
intent = dspy.OutputField(desc="What the user wants to know (1 sentence)")
|
| 47 |
-
relevant_tables = dspy.OutputField(desc="Comma-separated list of tables needed")
|
| 48 |
relevant_columns = dspy.OutputField(desc="Comma-separated list of table.column pairs needed")
|
| 49 |
join_conditions = dspy.OutputField(desc="JOIN conditions to use, or 'none'")
|
| 50 |
-
where_conditions = dspy.OutputField(desc="WHERE conditions including status/
|
| 51 |
aggregations = dspy.OutputField(desc="Aggregation functions to apply, or 'none'")
|
| 52 |
group_by = dspy.OutputField(desc="GROUP BY columns, or 'none'")
|
| 53 |
order_by = dspy.OutputField(desc="ORDER BY clause, or 'none'")
|
| 54 |
limit_val = dspy.OutputField(desc="LIMIT value, or 'none'")
|
| 55 |
|
| 56 |
|
| 57 |
-
# ── 2. SQL Generation ─────────────────────────────────────────────────────
|
| 58 |
|
| 59 |
class SQLGeneration(dspy.Signature):
|
| 60 |
"""Generate a valid PostgreSQL SELECT query based on the query plan.
|
| 61 |
The query must be syntactically correct and only reference existing
|
| 62 |
tables and columns from the schema.
|
| 63 |
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
CRITICAL: Output ONLY the raw SQL. No markdown, no explanation, no comments."""
|
| 81 |
|
|
@@ -85,12 +137,14 @@ class SQLGeneration(dspy.Signature):
|
|
| 85 |
|
| 86 |
sql_query = dspy.OutputField(
|
| 87 |
desc="The SIMPLEST valid PostgreSQL SELECT query that correctly answers the question. "
|
| 88 |
-
"Use pre-computed
|
| 89 |
-
"
|
|
|
|
|
|
|
| 90 |
)
|
| 91 |
|
| 92 |
|
| 93 |
-
# ── 3. SQL Self-Critique & Repair
|
| 94 |
|
| 95 |
class SQLCritiqueAndFix(dspy.Signature):
|
| 96 |
"""Evaluate a generated SQL query for correctness against the schema.
|
|
@@ -110,7 +164,7 @@ class SQLCritiqueAndFix(dspy.Signature):
|
|
| 110 |
)
|
| 111 |
|
| 112 |
|
| 113 |
-
# ── 4. Interpret & Insight
|
| 114 |
|
| 115 |
class InterpretAndInsight(dspy.Signature):
|
| 116 |
"""Interpret SQL query results for a non-technical user and generate insights.
|
|
@@ -140,7 +194,7 @@ class InterpretAndInsight(dspy.Signature):
|
|
| 140 |
)
|
| 141 |
|
| 142 |
|
| 143 |
-
# ── 5. SQL Repair
|
| 144 |
|
| 145 |
class SQLRepair(dspy.Signature):
|
| 146 |
"""Given a SQL query that produced a database error, generate a
|
|
|
|
| 10 |
import dspy
|
| 11 |
|
| 12 |
|
| 13 |
+
# ── 1. Analyze & Plan ─────────────────────────────────────────────────────────
|
| 14 |
|
| 15 |
class AnalyzeAndPlan(dspy.Signature):
|
| 16 |
"""You are an expert SQL analyst with strong business intelligence skills.
|
| 17 |
Given a user question, a database schema, and a DATA PROFILE showing actual
|
| 18 |
values in the database, analyze the question and produce a detailed query plan.
|
| 19 |
|
| 20 |
+
──────────────────────────────────────────────────────────────
|
| 21 |
+
RULE 0 — SIMPLICITY FIRST (HIGHEST PRIORITY)
|
| 22 |
+
──────────────────────────────────────────────────────────────
|
| 23 |
+
Always use the SIMPLEST possible query that correctly answers the question.
|
| 24 |
+
- If a pre-computed total/summary column already exists in the schema
|
| 25 |
+
(e.g. total_amount, grand_total, total_price), USE IT DIRECTLY.
|
| 26 |
+
NEVER reconstruct it by summing component columns — that is always WRONG
|
| 27 |
+
because it misses labour, taxes, making charges, and other components.
|
| 28 |
+
- For single-record lookups (e.g. "total amount of PO12345"), just filter
|
| 29 |
+
and SELECT that column. No extra joins, no SUM.
|
| 30 |
+
- Only JOIN tables when the required column does not exist in the primary table.
|
| 31 |
+
- Only aggregate (SUM, COUNT, AVG) when the question genuinely asks for an
|
| 32 |
+
aggregate across multiple rows.
|
| 33 |
+
|
| 34 |
+
──────────────────────────────────────────────────────────────
|
| 35 |
+
RULE 1 — WHICH COLUMN TO USE (CRITICAL — READ CAREFULLY)
|
| 36 |
+
──────────────────────────────────────────────────────────────
|
| 37 |
+
|
| 38 |
+
ORDER-LEVEL QUESTIONS (revenue, AOV, total sales, order value, total amount):
|
| 39 |
+
→ Use: sales_table_v2_sales_order.total_amount
|
| 40 |
+
→ This is the PRE-COMPUTED grand total per order (includes all items,
|
| 41 |
+
gold, diamonds, making charges, labour, taxes).
|
| 42 |
+
→ Examples: "total revenue", "AOV", "average order value", "total sales",
|
| 43 |
+
"how much did customer X spend", "total amount of order SO123".
|
| 44 |
+
→ Formula:
|
| 45 |
+
Revenue = SUM(total_amount) FROM sales_order WHERE status = 'closed'
|
| 46 |
+
AOV = AVG(total_amount) FROM sales_order WHERE status = 'closed'
|
| 47 |
+
OR = SUM(total_amount) / COUNT(DISTINCT so_id) WHERE status = 'closed'
|
| 48 |
+
→ NEVER use line_total from sales_order_line_pricing for these — it is a
|
| 49 |
+
per-line amount and will give wrong results.
|
| 50 |
+
|
| 51 |
+
LINE-ITEM / PRODUCT-LEVEL QUESTIONS (per-product revenue, top products by sales):
|
| 52 |
+
→ Use: sales_table_v2_sales_order_line_pricing.line_total
|
| 53 |
+
→ Use ONLY when the question is about individual product/SKU performance.
|
| 54 |
+
→ Examples: "revenue per product", "top selling products by revenue",
|
| 55 |
+
"which product generates most sales".
|
| 56 |
+
→ JOIN path: sales_order → sales_order_line → sales_order_line_pricing
|
| 57 |
+
→ Still filter by sales_order.status = 'closed'.
|
| 58 |
+
|
| 59 |
+
PURCHASE ORDER TOTALS:
|
| 60 |
+
→ Use: purchase_orders_v6_purchase_order.total_amount
|
| 61 |
+
→ For: "total amount of PO123", "PO value", "purchase order cost".
|
| 62 |
+
→ NEVER sum gold_amount + diamond_amount from PO line tables — that misses labour.
|
| 63 |
+
|
| 64 |
+
──────────────────────────────────────────────────────────────
|
| 65 |
+
RULE 2 — STATUS FILTERING
|
| 66 |
+
──────────────────────────────────────────────────────────────
|
| 67 |
+
For ALL revenue, sales, AOV, and financial metrics:
|
| 68 |
+
→ WHERE status = 'closed' on sales_table_v2_sales_order
|
| 69 |
+
For product catalog or inventory questions: no status filter needed.
|
| 70 |
+
|
| 71 |
+
──────────────────────────────────────────────────────────────
|
| 72 |
+
RULE 3 — DATE FILTERING
|
| 73 |
+
──────────────────────────────────────────────────────────────
|
| 74 |
+
The order_date column is stored as TEXT in 'YYYY-MM-DD' format.
|
| 75 |
+
Use text comparisons for date filters:
|
| 76 |
+
→ "last year" (2024): order_date >= '2024-01-01' AND order_date <= '2024-12-31'
|
| 77 |
+
→ "this year" (2025): order_date >= '2025-01-01' AND order_date <= '2025-12-31'
|
| 78 |
+
→ "last month": use appropriate YYYY-MM-DD range.
|
| 79 |
|
| 80 |
Steps:
|
| 81 |
+
1. Identify: is this ORDER-LEVEL or LINE-ITEM-LEVEL or PO question?
|
| 82 |
+
2. Pick the correct source column per RULE 1 above.
|
| 83 |
+
3. Identify the MINIMUM tables needed (often just one table).
|
| 84 |
+
4. Apply status and date filters as needed.
|
| 85 |
+
5. Produce the simplest correct query plan."""
|
| 86 |
|
| 87 |
question = dspy.InputField(desc="The user's natural-language question")
|
| 88 |
schema_info = dspy.InputField(desc="Full database schema with table names, columns, and types")
|
|
|
|
| 90 |
data_profile = dspy.InputField(desc="Data profile showing actual values: distinct categorical values, numeric ranges, date ranges")
|
| 91 |
|
| 92 |
intent = dspy.OutputField(desc="What the user wants to know (1 sentence)")
|
| 93 |
+
relevant_tables = dspy.OutputField(desc="Comma-separated list of tables needed (minimum necessary)")
|
| 94 |
relevant_columns = dspy.OutputField(desc="Comma-separated list of table.column pairs needed")
|
| 95 |
join_conditions = dspy.OutputField(desc="JOIN conditions to use, or 'none'")
|
| 96 |
+
where_conditions = dspy.OutputField(desc="WHERE conditions including status/date filters, or 'none'")
|
| 97 |
aggregations = dspy.OutputField(desc="Aggregation functions to apply, or 'none'")
|
| 98 |
group_by = dspy.OutputField(desc="GROUP BY columns, or 'none'")
|
| 99 |
order_by = dspy.OutputField(desc="ORDER BY clause, or 'none'")
|
| 100 |
limit_val = dspy.OutputField(desc="LIMIT value, or 'none'")
|
| 101 |
|
| 102 |
|
| 103 |
+
# ── 2. SQL Generation ─────────────────────────────────────────────────────────
|
| 104 |
|
| 105 |
class SQLGeneration(dspy.Signature):
|
| 106 |
"""Generate a valid PostgreSQL SELECT query based on the query plan.
|
| 107 |
The query must be syntactically correct and only reference existing
|
| 108 |
tables and columns from the schema.
|
| 109 |
|
| 110 |
+
CRITICAL RULES:
|
| 111 |
+
|
| 112 |
+
1. USE PRE-COMPUTED TOTALS — NEVER RECONSTRUCT THEM:
|
| 113 |
+
- For order-level metrics (revenue, AOV): use sales_table_v2_sales_order.total_amount
|
| 114 |
+
- For PO totals: use purchase_orders_v6_purchase_order.total_amount
|
| 115 |
+
- NEVER add gold_amount + diamond_amount or any component columns —
|
| 116 |
+
that always gives the WRONG answer (misses labour, taxes, etc.)
|
| 117 |
+
|
| 118 |
+
2. CORRECT FORMULAS:
|
| 119 |
+
- Revenue: SELECT SUM(total_amount) FROM sales_table_v2_sales_order WHERE status = 'closed'
|
| 120 |
+
- AOV: SELECT AVG(total_amount) FROM sales_table_v2_sales_order WHERE status = 'closed'
|
| 121 |
+
- Per-product revenue: SUM(line_total) FROM sales_order_line_pricing
|
| 122 |
+
JOIN sales_order_line JOIN sales_order WHERE status = 'closed'
|
| 123 |
+
|
| 124 |
+
3. DATE FILTERING (order_date is TEXT 'YYYY-MM-DD'):
|
| 125 |
+
- Use: order_date >= 'YYYY-01-01' AND order_date <= 'YYYY-12-31'
|
| 126 |
+
- Do NOT use EXTRACT() or CAST() on order_date
|
| 127 |
+
|
| 128 |
+
4. SIMPLICITY:
|
| 129 |
+
- Single-record lookup = simple WHERE filter, no aggregation
|
| 130 |
+
- Only JOIN when needed, only aggregate when needed
|
| 131 |
|
| 132 |
CRITICAL: Output ONLY the raw SQL. No markdown, no explanation, no comments."""
|
| 133 |
|
|
|
|
| 137 |
|
| 138 |
sql_query = dspy.OutputField(
|
| 139 |
desc="The SIMPLEST valid PostgreSQL SELECT query that correctly answers the question. "
|
| 140 |
+
"Use pre-computed total_amount for order/PO totals. "
|
| 141 |
+
"Use AVG(total_amount) or SUM(total_amount)/COUNT(DISTINCT so_id) for AOV — "
|
| 142 |
+
"NEVER SUM or AVG of line_total for AOV. "
|
| 143 |
+
"Output ONLY raw SQL — no markdown, no explanation, no code fences."
|
| 144 |
)
|
| 145 |
|
| 146 |
|
| 147 |
+
# ── 3. SQL Self-Critique & Repair ─────────────────────────────────────────────
|
| 148 |
|
| 149 |
class SQLCritiqueAndFix(dspy.Signature):
|
| 150 |
"""Evaluate a generated SQL query for correctness against the schema.
|
|
|
|
| 164 |
)
|
| 165 |
|
| 166 |
|
| 167 |
+
# ── 4. Interpret & Insight ────────────────────────────────────────────────────
|
| 168 |
|
| 169 |
class InterpretAndInsight(dspy.Signature):
|
| 170 |
"""Interpret SQL query results for a non-technical user and generate insights.
|
|
|
|
| 194 |
)
|
| 195 |
|
| 196 |
|
| 197 |
+
# ── 5. SQL Repair ─────────────────────────────────────────────────────────────
|
| 198 |
|
| 199 |
class SQLRepair(dspy.Signature):
|
| 200 |
"""Given a SQL query that produced a database error, generate a
|