nilotpaldhar2004 committed on
Commit
4f43f55
·
unverified ·
1 Parent(s): 49d3371

Enhance SQL generation in generate_sql function

Browse files

Refactor SQL generation logic with enhanced regex and improved fallback handling.

Files changed (1) hide show
  1. app.py +50 -52
app.py CHANGED
@@ -81,75 +81,73 @@ def get_schema(db_bytes: bytes) -> str:
81
 
82
 
83
def generate_sql(question: str, schema: str) -> str:
    """Run T5 inference to produce SQL, with rule-based shortcuts first.

    Parameters
    ----------
    question : natural-language question from the user.
    schema   : CREATE TABLE statement(s) describing the database.

    Returns
    -------
    str
        A SQL string targeting the (single) table found in *schema*.
    """
    # Extract table name from schema
    table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
    table_name = table_match.group(1) if table_match else "data"
    quoted = f'"{table_name}"'
    # NOTE(review): this captures every quoted identifier, so the table name
    # itself ends up in the list alongside the columns — confirm intent.
    col_match = re.findall(r'"(\w+)"', schema)

    # ── Rule-based shortcuts (fast + accurate) ────────────────────────────────
    q = question.lower().strip()
    if re.search(r'show.*(first|top).*\d+|first.*\d+.*row|top.*\d+', q):
        n = re.search(r'\d+', q)
        return f'SELECT * FROM {quoted} LIMIT {n.group() if n else 10}'
    if re.search(r'(show|display|get|give).*(first|all).*row|first.*row|show.*row', q):
        return f'SELECT * FROM {quoted} LIMIT 10'
    if re.search(r'count.*(total|all|record|row)|total.*(record|row|count)|how many', q):
        return f'SELECT COUNT(*) FROM {quoted}'
    if re.search(r'show.*(all|every).*row|all.*row|select all', q):
        return f'SELECT * FROM {quoted} LIMIT 50'
    if re.search(r'average|avg', q) and col_match:
        # FIX: the default used to be an unconditional col_match[2], which
        # raised IndexError for schemas with fewer than three quoted
        # identifiers when no pollutant-style column name matched.
        num_col = next(
            (c for c in col_match
             if re.search(r'pm|aqi|no|co|so|o3|benzene|toluene|xylene', c, re.I)),
            col_match[2] if len(col_match) > 2 else col_match[0],
        )
        return f'SELECT AVG("{num_col}") FROM {quoted}'
    if re.search(r'unique|distinct', q) and col_match:
        return f'SELECT COUNT(DISTINCT "{col_match[0]}") FROM {quoted}'
    if re.search(r'group by', q) and col_match:
        return f'SELECT "{col_match[0]}", COUNT(*) FROM {quoted} GROUP BY "{col_match[0]}"'
    if re.search(r'max|maximum|highest', q) and col_match:
        num_col = col_match[1] if len(col_match) > 1 else col_match[0]
        return f'SELECT MAX("{num_col}") FROM {quoted}'
    if re.search(r'min|minimum|lowest', q) and col_match:
        num_col = col_match[1] if len(col_match) > 1 else col_match[0]
        return f'SELECT MIN("{num_col}") FROM {quoted}'

    # ── T5 model fallback ─────────────────────────────────────────────────────
    col_hint = ", ".join(col_match) if col_match else ""
    prompt = f"tables:\n{schema}\ncolumns: {col_hint}\nquery for: {question}"
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(DEVICE)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            num_beams=4,
            early_stopping=True,
        )
    sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    # Fix 1: replace any FROM/JOIN table reference (quoted or unquoted) with correct table
    sql = re.sub(r'\bFROM\s+("?\w+"?)', f'FROM {quoted}', sql, flags=re.IGNORECASE)
    sql = re.sub(r'\bJOIN\s+("?\w+"?)', f'JOIN {quoted}', sql, flags=re.IGNORECASE)

    # Fix 2: strip junk tokens after table name before LIMIT/WHERE/ORDER etc.
    # e.g. FROM "city_day" Datetime LIMIT 10 → FROM "city_day" LIMIT 10
    sql = re.sub(
        r'(FROM\s+"?\w+"?)\s+(?!WHERE|LIMIT|ORDER|GROUP|HAVING|JOIN|LEFT|RIGHT|INNER|ON|AND|OR|\d)(\w+)',
        r'\1',
        sql, flags=re.IGNORECASE
    )

    # Fix 3: fallback if no SELECT at all
    if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
        sql = f'SELECT * FROM {quoted} LIMIT 10'

    return sql
151
 
152
-
153
  def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
154
  """Run SQL against the in-memory SQLite DB."""
155
  with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
 
81
 
82
 
83
def generate_sql(question: str, schema: str) -> str:
    """
    Enhanced Hybrid SQL Engine.

    Priority 1: Smart Regex (Deterministic & Instant)
    Priority 2: T5 Transformer (Probabilistic Fallback)

    Parameters
    ----------
    question : natural-language question from the user.
    schema   : CREATE TABLE statement(s) describing the database.

    Returns
    -------
    str
        A SQL string targeting the (single) table found in *schema*.
    """
    # 1. Context Extraction
    table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
    table_name = table_match.group(1) if table_match else "data"
    quoted = f'"{table_name}"'
    # FIX: exclude the table name itself — the findall captures every quoted
    # identifier, and previously the table leaked into the "column" list and
    # could be picked by the DISTINCT/GROUP BY shortcuts.
    col_match = [c for c in re.findall(r'"(\w+)"', schema) if c != table_name]

    q = question.lower().strip()

    # 2. Smart Column Detection
    # Whole-word match so a short column name (e.g. "no", "co") cannot fire
    # on a substring of an unrelated word in the question.
    target_col = None
    for col in col_match:
        if re.search(rf'\b{re.escape(col.lower())}\b', q):
            target_col = col
            break

    # 3. Enhanced Rule-Based Shortcuts
    # Keyword patterns are word-bounded to avoid substring hits such as
    # "per" inside "temperature" or "top" inside "stop".

    # DISTINCT/UNIQUE COUNT
    if re.search(r'\b(unique|distinct)\b', q):
        col = target_col if target_col else (col_match[0] if col_match else "*")
        return f'SELECT COUNT(DISTINCT "{col}") FROM {quoted}'

    # GROUP BY — only when a real column is known (the old fallback grouped
    # by a literal, nonexistent column called "data").
    if re.search(r'\bgroup\b.*\bby\b|\bper\b|\beach\b', q) and col_match:
        col = target_col if target_col else col_match[0]
        return f'SELECT "{col}", COUNT(*) FROM {quoted} GROUP BY "{col}"'

    # AVERAGE (with semantic fallback for pollution-style numeric columns).
    # FIX: guarded on col_match — previously an empty column list raised
    # IndexError via col_match[0] inside the next() default.
    if re.search(r'\b(average|avg|mean)\b', q) and col_match:
        num_col = target_col or next(
            (c for c in col_match
             if re.search(r'pm|aqi|no|co|so|o3|benzene|val|amt', c, re.I)),
            col_match[2] if len(col_match) > 2 else col_match[0],
        )
        return f'SELECT AVG("{num_col}") FROM {quoted}'

    # TOTAL RECORDS
    if re.search(r'count.*(total|all|record|row)|total.*(record|row|count)|how many', q):
        return f'SELECT COUNT(*) FROM {quoted}'

    # LIMIT/TOP ROWS
    if re.search(r'\b(show|display|get|first|top)\b', q):
        n_match = re.search(r'\d+', q)
        limit = n_match.group() if n_match else 10
        return f'SELECT * FROM {quoted} LIMIT {limit}'

    # 4. T5 Model Fallback
    col_hint = ", ".join(col_match) if col_match else ""
    prompt = f"Translate English to SQL: {question} | Table: {table_name} | Columns: {col_hint}"

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, num_beams=4, early_stopping=True)

    sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    # Post-inference cleaning (Crucial for SQLite stability):
    # force the model's FROM target onto the real table, then strip stray
    # tokens the model sometimes emits right after the table name.
    sql = re.sub(r'\bFROM\s+("?\w+"?)', f'FROM {quoted}', sql, flags=re.IGNORECASE)
    sql = re.sub(r'(FROM\s+"?\w+"?)\s+(?!WHERE|LIMIT|ORDER|GROUP|HAVING|JOIN|ON|AND|OR)(\w+)',
                 r'\1', sql, flags=re.IGNORECASE)

    # Last-resort fallback when the model produced no SELECT at all.
    if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
        sql = f'SELECT * FROM {quoted} LIMIT 10'

    return sql
150
 
 
151
  def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
152
  """Run SQL against the in-memory SQLite DB."""
153
  with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp: