tjhalanigrid committed on
Commit
cf17729
·
1 Parent(s): b70f6fd

Added full project

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +13 -3
  2. app.py +569 -0
  3. db.zip +3 -0
  4. int8_dynamic/meta.json +7 -0
  5. int8_dynamic/model.pt +3 -0
  6. int8_dynamic/tokenizer/merges.txt +0 -0
  7. int8_dynamic/tokenizer/special_tokens_map.json +753 -0
  8. int8_dynamic/tokenizer/tokenizer.json +0 -0
  9. int8_dynamic/tokenizer/tokenizer_config.json +959 -0
  10. int8_dynamic/tokenizer/vocab.json +0 -0
  11. requirements.txt +10 -0
  12. scripts/__pycache__/benchmark_parallel_reward.cpython-310.pyc +0 -0
  13. scripts/__pycache__/benchmark_parallel_reward.cpython-313.pyc +0 -0
  14. scripts/__pycache__/benchmark_quantization.cpython-310.pyc +0 -0
  15. scripts/__pycache__/benchmark_rollout_generation.cpython-310.pyc +0 -0
  16. scripts/__pycache__/quantize_export.cpython-310.pyc +0 -0
  17. scripts/__pycache__/quantized_infer_harness.cpython-310.pyc +0 -0
  18. scripts/benchmark_parallel_reward.py +202 -0
  19. scripts/benchmark_quantization.py +108 -0
  20. scripts/benchmark_rollout_generation.py +66 -0
  21. scripts/error_dashboard.py +99 -0
  22. scripts/evaluate.py +170 -0
  23. scripts/plot_task2.py +58 -0
  24. scripts/plot_task3.py +15 -0
  25. scripts/plot_task3_plotly.py +103 -0
  26. scripts/quantize_export.py +86 -0
  27. scripts/quantized_infer_harness.py +46 -0
  28. src/__pycache__/execution_reward.cpython-310.pyc +0 -0
  29. src/__pycache__/quantization_utils.cpython-310.pyc +0 -0
  30. src/__pycache__/quantized_text2sql_engine.cpython-310.pyc +0 -0
  31. src/__pycache__/schema_encoder.cpython-310.pyc +0 -0
  32. src/__pycache__/schema_utils.cpython-310.pyc +0 -0
  33. src/__pycache__/sql_validator.cpython-310.pyc +0 -0
  34. src/__pycache__/text2sql_engine.cpython-310.pyc +0 -0
  35. src/ask.py +93 -0
  36. src/component_analysis.py +229 -0
  37. src/constrained_decoding.py +1058 -0
  38. src/constrained_decoding_sample.py +516 -0
  39. src/convert_to_hf_dataset.py +8 -0
  40. src/eval_baseline_codet5.py +112 -0
  41. src/eval_both_metrics.py +144 -0
  42. src/eval_rl_fixed.py +756 -0
  43. src/eval_rl_t5.py +279 -0
  44. src/eval_single_model.py +218 -0
  45. src/evaluate_model_codet5.py +392 -0
  46. src/evaluate_model_t5_small_sft.py +179 -0
  47. src/evaluate_rl_bart.py +138 -0
  48. src/evaluate_sft_bart.py +190 -0
  49. src/evaluate_without_constraied.py +503 -0
  50. src/execution_reward copy.py +831 -0
README.md CHANGED
@@ -1,3 +1,13 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Text2sql Demo
3
+ emoji: 📊
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 5.8.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ python_version: 3.10.13
12
+ short_description: 'Text to SQL with RLHF'
13
+ ---
app.py ADDED
@@ -0,0 +1,569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GRADIO DEMO UI - LAZY LOADING EDITION
3
+ NL → SQL → Result Table
4
+ """
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+ import re
9
+ import time
10
+ import os
11
+ import torch
12
+ import sys
13
+ import json
14
+ import subprocess
15
+ import base64
16
+ import io
17
+ from pathlib import Path
18
+ from typing import Iterator
19
+
20
+ # ==========================================
21
+ # RELATIVE PATH RESOLUTION (GLOBAL)
22
+ # ==========================================
23
# Resolve the project root from this file's location; fall back to the current
# working directory when __file__ is undefined (e.g. code pasted into a REPL).
try:
    PROJECT_ROOT = Path(__file__).resolve().parent
except NameError:
    PROJECT_ROOT = Path(".").resolve()

# Prefer the nested layout data/database/<db>/<db>.sqlite when present;
# otherwise use the flattened final_databases/ directory shipped with the app.
if (PROJECT_ROOT / "data" / "database").exists():
    DB_ROOT = PROJECT_ROOT / "data" / "database"
else:
    DB_ROOT = PROJECT_ROOT / "final_databases"
32
+
33
def get_db_path(db_id: str) -> str:
    """Resolve the sqlite file for *db_id*, trying the nested layout first.

    Checks DB_ROOT/<db_id>/<db_id>.sqlite and falls back to the flat
    DB_ROOT/<db_id>.sqlite path (returned even if neither file exists).
    """
    nested = DB_ROOT / db_id / f"{db_id}.sqlite"
    flat = DB_ROOT / f"{db_id}.sqlite"
    chosen = nested if nested.exists() else flat
    return str(chosen)
37
+
38
+ # ==========================================
39
+ # 🔥 CUDA MOCK PATCH FOR MAC (MPS) / CPU
40
+ # ==========================================
41
# When CUDA is unavailable (Mac/MPS or CPU-only Spaces), patch torch.cuda.Event
# with a wall-clock stand-in so downstream timing code keeps working unmodified.
if not torch.cuda.is_available():
    class MockCUDAEvent:
        """Drop-in replacement for torch.cuda.Event backed by time.perf_counter."""
        def __init__(self, enable_timing=False, blocking=False, interprocess=False):
            self.t = 0.0  # last recorded timestamp, in seconds
        def record(self, stream=None):
            self.t = time.perf_counter()
        def elapsed_time(self, end_event):
            # Mirrors CUDA Event semantics: elapsed time in milliseconds.
            return (end_event.t - self.t) * 1000.0

    torch.cuda.Event = MockCUDAEvent
    # NOTE(review): torch.cuda.synchronize typically exists even on CPU-only
    # builds, so this hasattr guard may never install the no-op — confirm on
    # the target torch version.
    if not hasattr(torch.cuda, 'synchronize'):
        torch.cuda.synchronize = lambda: None
53
+
54
+ # ==========================================
55
+ # IMPORTS & ENGINE SETUP
56
+ # ==========================================
57
+ from src.quantized_text2sql_engine import QuantizedText2SQLEngine
58
+ from src.schema_encoder import SchemaEncoder
59
+
60
# Directory holding the quantized-model artifact (int8 dynamic export).
DEFAULT_QUANT_ARTIFACT = str(PROJECT_ROOT / "int8_dynamic")

_ENGINE_CACHE = {}   # (artifact, constrained, workers, cache) -> engine instance
_QUERY_LOG = []      # blocked/failed query records feeding the error dashboard
_PERF_LOG = []       # per-request latency/validator telemetry (trimmed by _perf_log)
_SUCCESS_LOG = []    # successful queries together with their clause breakdown

# Per-clause ok/fail counters feeding the "Clause Telemetry" tab.
_OP_STATS = {
    "SELECT": {"ok": 0, "fail": 0}, "WHERE": {"ok": 0, "fail": 0}, "JOIN": {"ok": 0, "fail": 0},
    "GROUP_BY": {"ok": 0, "fail": 0}, "ORDER_BY": {"ok": 0, "fail": 0}, "HAVING": {"ok": 0, "fail": 0}, "LIMIT": {"ok": 0, "fail": 0},
}
71
+
72
def get_quant_engine(artifact_dir: str, use_constrained: bool = False, exec_workers: int = 8, use_cache: bool = True):
    """Return a memoised QuantizedText2SQLEngine for this configuration.

    Engines are cached per (artifact_dir, use_constrained, exec_workers,
    use_cache) key so repeated UI requests reuse the loaded model. The
    TypeError fallback supports older engine builds whose constructor only
    accepts the artifact directory.
    """
    key = (artifact_dir, bool(use_constrained), int(exec_workers), bool(use_cache))
    if key not in _ENGINE_CACHE:
        try:
            _ENGINE_CACHE[key] = QuantizedText2SQLEngine(artifact_dir, device="cpu", use_constrained=bool(use_constrained), exec_workers=int(exec_workers), use_cache=bool(use_cache))
        except TypeError:
            # Older engine signature: positional artifact dir only.
            _ENGINE_CACHE[key] = QuantizedText2SQLEngine(artifact_dir)
    return _ENGINE_CACHE[key]
80
+
81
# 🚨 LAZY LOADING: We DO NOT load the model here! We only load the fast Schema Encoder.
# quant_engine stays None until the first run_query() call populates it.
quant_engine = None
try:
    schema_encoder = SchemaEncoder(DB_ROOT)
except Exception as e:
    # Without an encoder, relevance checks and schema display degrade gracefully.
    print(f"Warning: SchemaEncoder failed to load: {e}")
    schema_encoder = None

# (question, database id) pairs exposed in the sample-question dropdown.
SAMPLES = [
    ("Show 10 distinct employee first names.", "chinook_1"), ("Which artist has the most albums?", "chinook_1"),
    ("List all the tracks that belong to the 'Rock' genre.", "chinook_1"), ("What are the names of all the cities?", "flight_1"),
    ("Find the flight number and cost of the cheapest flight.", "flight_1"), ("List the airlines that fly out of New York.", "flight_1"),
    ("Which campus was opened between 1935 and 1939?", "csu_1"), ("Count the number of students in each department.", "college_2"),
    ("List the names of all clubs.", "club_1"), ("How many members does each club have?", "club_1"),
    ("Show the names of all cinemas.", "cinema"), ("Which cinema has the most screens?", "cinema")
]
SAMPLE_QUESTIONS = [q[0] for q in SAMPLES]
98
+
99
def explain_sql(sql):
    """Return a short plain-English summary of the clauses used in *sql*.

    Empty/None input yields an empty string; otherwise a base sentence plus
    one bullet per recognised clause, in a fixed order.
    """
    if not sql:
        return ""
    lowered = sql.lower()
    clause_notes = (
        ("join", "\n• It combines data from multiple tables using JOIN."),
        ("where", "\n• It filters rows using a WHERE condition."),
        ("group by", "\n• It groups results using GROUP BY."),
        ("order by", "\n• It sorts the results using ORDER BY."),
        ("limit", "\n• It limits the number of returned rows."),
    )
    summary = "This SQL query retrieves information from the database."
    summary += "".join(note for keyword, note in clause_notes if keyword in lowered)
    return summary
109
+
110
def sql_ops(sql: str) -> list[str]:
    """List the coarse SQL clauses present in *sql*.

    Always includes "SELECT"; the remaining labels are appended in a fixed
    order when the space-padded keyword appears in the lowered query text.
    """
    padded = f" {(sql or '').lower()} "
    found = ["SELECT"]
    for token, label in (
        (" where ", "WHERE"),
        (" join ", "JOIN"),
        (" group by ", "GROUP_BY"),
        (" order by ", "ORDER_BY"),
        (" having ", "HAVING"),
        (" limit ", "LIMIT"),
    ):
        if token in padded:
            found.append(label)
    return found
120
+
121
+ def classify_error(sql: str, error_msg: str | None = None, *, timed_out: bool = False):
122
+ s = (sql or "").lower()
123
+ m = (error_msg or "").lower()
124
+ if timed_out or "interrupted" in m or "timeout" in m: return "timeout"
125
+ if not s.strip().startswith(("select", "with")): return "syntax_error"
126
+ if " join " in f" {s} " and " on " not in f" {s} ": return "missing_join"
127
+ if " where " in f" {s} " and not any(op in s for op in ["=", ">", "<", " in ", " like ", " between ", " is null", " is not null"]): return "wrong_where"
128
+ if ("is null" in s or "is not null" in s) and ("no such column" in m or "misuse" in m): return "null_handling"
129
+ if "no such table" in m: return "missing_table"
130
+ if "no such column" in m: return "missing_column"
131
+ if "ambiguous column name" in m: return "ambiguous_column"
132
+ if "datatype mismatch" in m or "type mismatch" in m: return "type_mismatch"
133
+ if "misuse of aggregate" in m or "misuse of aggregate function" in m: return "wrong_aggregation"
134
+ if "syntax error" in m: return "syntax_error"
135
+ if "near" in m and "syntax error" in m: return "syntax_error"
136
+ if "runtime" in m or "constraint failed" in m: return "runtime_error"
137
+ return "other"
138
+
139
def get_hint(error_type):
    """Map a classify_error() category to a one-line remediation hint.

    Unknown categories (including the internal 'gibberish'/'blocked_dml'
    buckets logged by run_query) fall back to a generic hint.
    """
    hints = {
        "missing_join": "Check JOIN conditions between tables.", "wrong_aggregation": "Use proper aggregation like avg(column).",
        "wrong_where": "Check WHERE condition syntax.", "syntax_error": "Ensure SQL starts with SELECT.",
        "missing_table": "Use only tables from the provided schema.", "missing_column": "Use only columns from the provided schema.",
        "ambiguous_column": "Disambiguate by using table.column.", "timeout": "Query took too long; simplify joins.", "other": "Review SQL logic.",
        # Categories classify_error can emit that previously only got the
        # generic fallback:
        "null_handling": "Use IS NULL / IS NOT NULL on existing columns.",
        "type_mismatch": "Compare columns against values of the matching type.",
        "runtime_error": "Check constraints and data values used by the query.",
    }
    return hints.get(error_type, "Review query.")
147
+
148
def is_relevant_to_schema(question, db_id):
    """Heuristic guard: does *question* mention anything from the db schema?

    Returns True ("let it through") whenever the check cannot be performed:
    no encoder available, schema lookup failure, or a question consisting
    only of stop-words/digits. A naive plural fallback is applied per word
    ('artists' matches schema word 'artist').
    """
    if schema_encoder is None:
        return True
    try:
        raw_schema = schema_encoder.structured_schema(db_id).lower()
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt — narrowed while keeping the fail-open behavior.
        return True
    schema_words = set(re.findall(r'[a-z0-9_]+', raw_schema))
    q_words = re.findall(r'[a-z0-9_]+', question.lower())
    stop_words = {"show", "list", "all", "what", "is", "the", "how", "many", "count", "find", "get", "me", "a", "an", "of", "in", "for", "from", "with", "which", "are", "there", "give", "tell", "details", "info", "data", "everything"}
    meaningful_q_words = [w for w in q_words if w not in stop_words and not w.isdigit()]
    if not meaningful_q_words:
        return True
    for word in meaningful_q_words:
        # Crude singularisation: strip one trailing 's'.
        singular_word = word[:-1] if word.endswith('s') else word
        if word in schema_words or singular_word in schema_words:
            return True
    return False
161
+
162
def run_query(method, sample_q, custom_q, db_id):
    """Full inference pipeline for one UI request.

    Steps: lazily load the engine, validate/sanitise the question, generate
    SQL, post-process it with regex heuristics, execute against sqlite, and
    log telemetry. Always returns a 3-tuple of
    (sql_text, pandas.DataFrame, explanation_text) matching the three output
    widgets bound to the Run button.
    """
    global quant_engine

    # 🚨 LAZY LOADING: We load the heavy AI model ONLY when the button is clicked.
    if quant_engine is None:
        print(f"First request detected! Loading AI model from {DEFAULT_QUANT_ARTIFACT}...", flush=True)
        try:
            quant_engine = get_quant_engine(DEFAULT_QUANT_ARTIFACT, use_constrained=False, exec_workers=8, use_cache=True)
            if quant_engine is None:
                return "-- ❌ ENGINE CRASH", pd.DataFrame(columns=["Error"]), "Failed to load model. Did you move the tokenizer files and add config.json to int8_dynamic/?"
        except Exception as e:
            return f"-- ❌ ENGINE CRASH\n-- {str(e)}", pd.DataFrame(columns=["Error Status"]), f"Critical failure loading model: {e}"

    # Append one record to the module-level error log (read by the dashboard tab).
    def _log(error_type: str, *, question: str, db_id_val: str, sql: str = "", error_msg: str = "") -> None:
        _QUERY_LOG.append({"t": time.time(), "db_id": str(db_id_val), "question": str(question), "sql": str(sql), "error_type": str(error_type), "error_msg": str(error_msg)})

    # Append perf telemetry, trimming the oldest 200 entries past 1000 to bound memory.
    def _perf_log(payload: dict) -> None:
        _PERF_LOG.append(payload)
        if len(_PERF_LOG) > 1000: del _PERF_LOG[:200]

    raw_question = sample_q if method == "💡 Pick a Sample" else custom_q

    # --- Input validation guards (each early-returns the 3-tuple) ---
    if not raw_question or str(raw_question).strip() == "":
        return "-- No input provided", pd.DataFrame(columns=["Warning"]), "⚠️ Please enter a question."
    if not db_id or str(db_id).strip() == "":
        return "-- No database selected", pd.DataFrame(columns=["Warning"]), "⚠️ Please select a database."

    # Light typo repair on common command words before any other check.
    typo_corrections = [(r'\bshaw\b', 'show'), (r'\bshw\b', 'show'), (r'\bsho\b', 'show'), (r'\blsit\b', 'list'), (r'\blis\b', 'list'), (r'\bfidn\b', 'find'), (r'\bfnd\b', 'find'), (r'\bgte\b', 'get')]
    question = str(raw_question)
    for bad, good in typo_corrections: question = re.sub(bad, good, question, flags=re.IGNORECASE)
    q_lower = question.strip().lower()

    # Single-word inputs are treated as gibberish.
    if len(q_lower.split()) < 2:
        _log("gibberish", question=question, db_id_val=str(db_id), error_msg="gibberish filtered")
        return "-- Input Blocked", pd.DataFrame(columns=["Warning"]), "⚠️ Please enter a clear, meaningful natural language question (more than one word)."

    # Block anything that even mentions a DML/DDL verb — read-only demo.
    if re.search(r'\b(delete|update|insert|drop|alter|truncate)\b', q_lower):
        _log("blocked_dml", question=question, db_id_val=str(db_id), error_msg="DML blocked")
        return "-- ❌ BLOCKED: Data Modification", pd.DataFrame(columns=["Security Alert"]), "🛑 Security Alert: Modifying or deleting data is strictly prohibited."

    # Reject questions with no lexical overlap with the selected schema.
    if not is_relevant_to_schema(question, db_id):
        _log("out_of_domain", question=question, db_id_val=str(db_id), error_msg="out of domain")
        return "-- ❌ BLOCKED: Out of Domain", pd.DataFrame(columns=["Domain Alert"]), f"🛑 Relevance Alert: I don't see anything related to your question in the '{db_id}' schema."

    start_time = time.time()
    t0 = time.perf_counter()
    ui_warnings = ""

    # --- Generation: try the richer signature first, fall back for older engines ---
    try:
        try:
            result = quant_engine.ask(question, str(db_id), num_beams=4, max_new_tokens=120, timeout_s=2.0)
        except TypeError:
            result = quant_engine.ask(question, str(db_id))
    except Exception as e:
        _log("backend_crash", question=question, db_id_val=str(db_id), error_msg=str(e))
        return f"-- ❌ BACKEND CRASH\n-- {str(e)}", pd.DataFrame(columns=["Error Status"]), f"❌ CRITICAL BACKEND CRASH:\n{str(e)}"

    final_sql = str(result.get("sql", ""))
    model_sql = final_sql  # keep the untouched model output as a fallback

    # --- Heuristic post-processing: a number in "show/list N ..." questions is
    # a row limit, not a value filter; the model often conflates the two. ---
    num_match = re.search(r'\b(?:show|list|top|limit|get|first|last|sample|of)\s+(?:[a-zA-Z_]+\s+)?(\d+)\b', q_lower)
    if not num_match and q_lower.startswith(("show", "list", "get")):
        num_match = re.search(r'\b(\d+)\b', q_lower)

    if num_match and final_sql:
        limit_val = num_match.group(1)
        # Strip spurious filters that compare against the requested row count.
        final_sql = re.sub(rf"(?i)\s*(?:where|having|and)?\s*count\s*\(\s*\*\s*\)\s*=\s*{limit_val}", "", final_sql)
        final_sql = re.sub(rf"(?i)\s*(?:where|and)\s+[a-zA-Z0-9_.]+\s*=\s*['\"]?{limit_val}['\"]?", "", final_sql)
        # Clean up any WHERE left dangling by the removals above.
        final_sql = re.sub(r"(?i)\s*where\s*$", "", final_sql)
        final_sql = re.sub(r"(?i)\s*where\s+(group by|order by|limit)", r" \1", final_sql)

        # Drop aggregation scaffolding unless the question actually asked for it.
        agg_kws = ["most", "top", "highest", "lowest", "count", "many", "group", "frequent", "popular"]
        if not any(k in q_lower for k in agg_kws):
            final_sql = re.sub(r"(?i)\s*group by\s+[a-zA-Z0-9_.]+\s*order by\s+count\(\*\)\s*(?:desc|asc)?", "", final_sql)
            final_sql = re.sub(r"(?i)\s*order by\s+count\(\*\)\s*(?:desc|asc)?", "", final_sql)
            final_sql = re.sub(r"(?i),\s*count\(\*\)", "", final_sql)
            final_sql = re.sub(r"(?i)count\(\*\)\s*,", "", final_sql)

        # GROUP BY with no aggregate left is meaningless — remove it.
        if "group by" in final_sql.lower() and not re.search(r'(?i)\b(count|sum|avg|max|min)\b\(', final_sql):
            final_sql = re.sub(r"(?i)\s*group by\s+[a-zA-Z0-9_.]+", "", final_sql)

        # Finally enforce the requested row limit.
        if "limit" not in final_sql.lower():
            final_sql = f"{final_sql.strip().rstrip(';')} LIMIT {limit_val}"

    # --- Execution ---
    from src.sql_validator import validate_sql_schema
    db_path = get_db_path(str(db_id))

    # Strict schema validation is advisory only: it feeds the perf block.
    try: strict_valid, _ = validate_sql_schema(final_sql, db_path)
    except Exception: strict_valid = False

    error_msg = None
    rows, cols = [], []
    sqlite_success = False

    try:
        rows, cols = quant_engine._execute_one(final_sql, db_path, timeout_s=2.0)
        sqlite_success = True
    except Exception as e:
        error_msg = str(e)
        sqlite_success = False

    # If the post-processed SQL failed, retry with the raw model output.
    if not sqlite_success and model_sql and model_sql != final_sql:
        try:
            alt_rows, alt_cols = quant_engine._execute_one(model_sql, db_path, timeout_s=2.0)
            final_sql = model_sql
            rows, cols = alt_rows, alt_cols
            sqlite_success = True
            error_msg = None
        except Exception: pass

    valid = sqlite_success

    if error_msg or not valid:
        et = classify_error(final_sql, str(error_msg or ""), timed_out=("interrupted" in str(error_msg or "").lower()))
        _log(et, question=str(question), db_id_val=str(db_id), sql=str(final_sql), error_msg=str(error_msg or "Execution failed"))

    latency = round(time.time() - start_time, 3)
    t1 = time.perf_counter()

    # --- Telemetry for the performance footer and the diagnostics tab ---
    engine_stats_after = quant_engine.stats() if hasattr(quant_engine, 'stats') else {}

    perf = {
        "db_id": str(db_id), "use_constrained_decoding": False, "num_beams": 4,
        "latency_total_ms": round((t1 - t0) * 1000.0, 2), "constraint_ok": bool(strict_valid), "has_error": bool(error_msg),
        "exec_cache_hit_rate": float(engine_stats_after.get("exec_cache_hit_rate", 0.0) or 0.0),
    }
    _perf_log(perf)

    # Rolling statistics over the last 50 requests.
    window = _PERF_LOG[-50:]
    avg_ms = sum(float(x.get("latency_total_ms", 0.0) or 0.0) for x in window) / len(window) if window else 0.0
    constraint_rate = sum(1 for x in window if x.get("constraint_ok")) / len(window) if window else 0.0

    perf_block = (
        "\n\n---\nPerformance (task impact)\n"
        f"- Total latency (ms): {perf['latency_total_ms']}\n"
        f"- Strict Python Validator OK (Task 3): {perf['constraint_ok']}\n"
        f"- Exec cache hit-rate (Task 1/5): {round(perf['exec_cache_hit_rate'], 3)}\n"
        f"- Rolling avg latency last 50 (ms): {round(avg_ms, 2)}\n"
        f"- Rolling constraint rate last 50: {round(constraint_rate, 3)}\n"
    )

    # --- Failure path: show the error, a classified hint, and bump fail counters ---
    if error_msg or not valid:
        display_sql = final_sql if final_sql.strip() else "-- ❌ INVALID SQL"
        explanation = f"{ui_warnings}❌ Error Details:\n\n"
        if error_msg: explanation += f"{error_msg}\n\n"

        error_type = classify_error(final_sql, str(error_msg or ""))
        explanation += f"Error Type: {error_type}\nHint: {get_hint(error_type)}"
        explanation += perf_block
        ops = sql_ops(final_sql)
        for op in ops:
            if op in _OP_STATS: _OP_STATS[op]["fail"] += 1
        return display_sql, pd.DataFrame(columns=["Execution Notice"]), explanation

    # --- Success path: build the result table, explanation, and ok counters ---
    safe_cols = cols if cols else ["Result"]
    explanation = f"{ui_warnings}✅ Query executed successfully\n\nRows returned: {len(rows)}\nExecution Time: {latency} sec\n\n{explain_sql(final_sql)}{perf_block}"

    ops = sql_ops(final_sql)
    for op in ops:
        if op in _OP_STATS: _OP_STATS[op]["ok"] += 1
    _SUCCESS_LOG.append({"t": time.time(), "db_id": str(db_id), "question": question, "sql": final_sql, "ops": ops})

    # Explain under-full result sets so a small table isn't mistaken for a bug.
    limit_match = re.search(r'LIMIT\s+(\d+)', final_sql, re.IGNORECASE)
    if limit_match and len(rows) < int(limit_match.group(1)):
        explanation += f"\n\nℹ️ Query allowed up to {int(limit_match.group(1))} rows but only {len(rows)} matched."

    return final_sql, pd.DataFrame(rows, columns=safe_cols), explanation
330
+
331
def task1_benchmark(n_rollouts: int, max_workers: int) -> Iterator[tuple[str, str]]:
    """Run the Task 1 benchmark script as a subprocess, streaming its output.

    Yields (log_text, plot_html) tuples so the bound Gradio outputs update
    live; the final yield carries the full log plus the generated plot as an
    inline base64 image (or a placeholder if no plot was produced).
    """
    project_root = str(PROJECT_ROOT)
    env = os.environ.copy()
    # Make project imports resolvable inside the child process.
    env["PYTHONPATH"] = project_root + (os.pathsep + env["PYTHONPATH"] if env.get("PYTHONPATH") else "")
    # Headless matplotlib with a writable config dir (Spaces sandboxes /home).
    env.setdefault("MPLBACKEND", "Agg")
    env.setdefault("MPLCONFIGDIR", "/tmp/mplconfig")
    try: os.makedirs(env["MPLCONFIGDIR"], exist_ok=True)
    except Exception: pass

    # -u forces unbuffered child stdout so streaming actually streams.
    cmd = [sys.executable, "-u", "scripts/benchmark_parallel_reward.py", "--n", str(int(n_rollouts)), "--max-workers", str(int(max_workers)), "--skip-profile"]
    proc = subprocess.Popen(cmd, cwd=project_root, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1)
    last_yield = time.perf_counter()
    lines: list[str] = []
    yield "Running Task 1 benchmark...\n", "<i>Running...</i>"

    assert proc.stdout is not None
    for line in proc.stdout:
        lines.append(line)
        now = time.perf_counter()
        # Throttle UI updates to at most twice per second, last 200 lines only.
        if now - last_yield >= 0.5:
            last_yield = now
            yield "".join(lines[-200:]).strip(), "<i>Running...</i>"

    proc.wait()
    out = "".join(lines).strip()

    plot_path = str(PROJECT_ROOT / "results" / "task1_plot.png")
    if os.path.exists(plot_path):
        try:
            # Embed the PNG directly so no static-file route is needed.
            b64 = base64.b64encode(Path(plot_path).read_bytes()).decode("ascii")
            yield out, f"<img src='data:image/png;base64,{b64}' style='max-width: 100%; border: 1px solid #e2e8f0; border-radius: 8px;' />"
            return
        except Exception:
            yield out, f"<pre>{plot_path}</pre>"
            return

    yield out, "<i>No plot generated</i>"
368
+
369
def task2_dashboard_structured():
    """Build the error-dashboard widgets.

    Returns (error-count DataFrame, recent-errors DataFrame, dropdown update)
    derived from the in-memory _QUERY_LOG, most frequent error type first.
    """
    if not _QUERY_LOG:
        return (
            pd.DataFrame(columns=["error_type", "count", "hint"]),
            pd.DataFrame(columns=["time", "db_id", "error_type", "question", "error_msg"]),
            gr.update(choices=[], value=None),
        )

    # Tally error types over the most recent 1000 log entries.
    tallies = {}
    for entry in _QUERY_LOG[-1000:]:
        bucket = entry.get("error_type") or "other"
        tallies[bucket] = tallies.get(bucket, 0) + 1
    ordered = sorted(tallies.items(), key=lambda kv: (-kv[1], kv[0]))
    summary_rows = [{"error_type": et, "count": int(n), "hint": get_hint(et)} for et, n in ordered]
    counts_df = pd.DataFrame(summary_rows)

    # Most recent 100 raw entries, with human-readable timestamps.
    recent_rows = []
    for entry in _QUERY_LOG[-100:]:
        stamp = entry.get("t")
        try:
            stamp_s = time.strftime("%H:%M:%S", time.localtime(float(stamp))) if stamp else ""
        except Exception:
            stamp_s = ""
        recent_rows.append({
            "time": stamp_s,
            "db_id": entry.get("db_id", ""),
            "error_type": entry.get("error_type", ""),
            "question": entry.get("question", ""),
            "error_msg": entry.get("error_msg", ""),
        })
    recent_df = pd.DataFrame(recent_rows)

    dropdown_choices = [str(r["error_type"]) for r in summary_rows]
    initial = dropdown_choices[0] if dropdown_choices else None
    return counts_df, recent_df, gr.update(choices=dropdown_choices, value=initial)
393
+
394
def task2_error_examples(error_type: str) -> str:
    """Show up to three recent logged examples of *error_type*, plus its hint."""
    if not error_type:
        return ""
    hint = get_hint(error_type)
    wanted = str(error_type)
    # Walk the log newest-first, collecting at most three matching entries.
    matches = []
    for entry in reversed(_QUERY_LOG):
        if (entry.get("error_type") or "") == wanted:
            matches.append(entry)
            if len(matches) == 3:
                break
    if not matches:
        return f"Error type: {error_type}\nHint: {hint}\n\nNo examples yet."
    parts = [f"Error type: {error_type}", f"Hint: {hint}", ""]
    for idx, entry in enumerate(matches, 1):
        parts.extend([
            f"Example {idx}",
            f"DB: {entry.get('db_id','')}",
            f"Q: {entry.get('question','')}",
            f"SQL: {entry.get('sql','')}",
            f"Msg: {entry.get('error_msg','')}",
            "",
        ])
    return "\n".join(parts).strip()
403
+
404
+ def _plot_op_stats_html() -> str:
405
+ try:
406
+ import matplotlib.pyplot as plt
407
+ labels = list(_OP_STATS.keys())
408
+ oks = [int(_OP_STATS[k]["ok"]) for k in labels]
409
+ fails = [int(_OP_STATS[k]["fail"]) for k in labels]
410
+
411
+ fig, ax = plt.subplots(figsize=(9, 3.5))
412
+ x = list(range(len(labels)))
413
+ ax.bar(x, oks, label="ok", color="#16a34a")
414
+ ax.bar(x, fails, bottom=oks, label="fail", color="#dc2626")
415
+ ax.set_xticks(x)
416
+ ax.set_xticklabels(labels, rotation=30, ha="right")
417
+ ax.set_title("Success/Failure by SQL operation")
418
+ ax.legend()
419
+ fig.tight_layout()
420
+
421
+ buf = io.BytesIO()
422
+ fig.savefig(buf, format="png", dpi=160)
423
+ plt.close(fig)
424
+ b64 = base64.b64encode(buf.getvalue()).decode("ascii")
425
+ return f"<img src='data:image/png;base64,{b64}' style='max-width: 100%; border: 1px solid #e2e8f0; border-radius: 8px;' />"
426
+ except Exception as e: return f"<pre>Plot error: {e}</pre>"
427
+
428
def task2_ops_table():
    """Summarise the per-clause counters as (DataFrame, plot HTML)."""
    summary = []
    for clause, counters in _OP_STATS.items():
        ok = int(counters.get("ok", 0))
        fail = int(counters.get("fail", 0))
        total = ok + fail
        # Guard against division by zero for clauses never seen yet.
        rate = (ok / total) if total else 0.0
        summary.append({"op": clause, "ok": ok, "fail": fail, "total": total, "success_rate": rate})
    return pd.DataFrame(summary), _plot_op_stats_html()
436
+
437
def toggle_input_method(method, current_sample):
    """Switch widget visibility between sample-picker and free-text modes.

    Returns updates for (sample_dropdown, type_own_warning, custom_question,
    db_id) in that order.
    """
    if method != "💡 Pick a Sample":
        # Custom mode: hide samples, show warning + textbox, unlock the DB picker.
        return (gr.update(visible=False), gr.update(visible=True), gr.update(visible=True), gr.update(interactive=True))
    # Sample mode: lock the DB picker to the sample's paired database.
    matched_db = next((db for q, db in SAMPLES if q == current_sample), "chinook_1")
    return (gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(value=matched_db, interactive=False))
442
+
443
def load_sample(selected_question):
    """Auto-select the database paired with the chosen sample question."""
    if not selected_question:
        return gr.update()
    for question, database in SAMPLES:
        if question == selected_question:
            return gr.update(value=database)
    return gr.update(value="chinook_1")
446
+
447
def clear_inputs():
    """Reset every inference-tab widget back to its initial state.

    Output order matches the clear button binding: input_method,
    sample_dropdown, type_own_warning, custom_question, db_id, final_sql,
    result_table, explanation.
    """
    return (
        gr.update(value="💡 Pick a Sample"),
        gr.update(value=SAMPLE_QUESTIONS[0], visible=True),
        gr.update(visible=False),
        gr.update(value="", visible=False),
        gr.update(value="chinook_1", interactive=False),
        "",
        pd.DataFrame(),
        "",
    )
449
+
450
def update_schema(db_id):
    """Render the database schema as scrollable HTML.

    Table lines matching ``name(cols)`` get a bold uppercase table name with
    lowercase columns; other lines render as plain rows. Returns "" when no
    db is chosen or no encoder is available, and an inline error div on
    failure.
    """
    if not db_id or schema_encoder is None:
        return ""
    try:
        raw_schema = schema_encoder.structured_schema(db_id)
        pieces = ["<div style='max-height: 250px; overflow-y: auto; background: #f8fafc; padding: 12px; border-radius: 8px; border: 1px solid #e2e8f0; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace; font-size: 0.9em; line-height: 1.6;'>"]
        for raw_line in raw_schema.strip().split('\n'):
            entry = raw_line.strip()
            if not entry:
                continue
            table_match = re.search(r'^([a-zA-Z0-9_]+)\s*\((.*)\)', entry)
            if table_match:
                pieces.append(f"<div style='margin-bottom: 8px;'><strong style='color: #0f172a; font-size: 1.05em; font-weight: 800;'>{table_match.group(1).upper()}</strong> <span style='color: #64748b;'>( {table_match.group(2).lower()} )</span></div>")
            else:
                pieces.append(f"<div style='color: #475569;'>{entry}</div>")
        pieces.append("</div>")
        return "".join(pieces)
    except Exception as e:
        return f"<div style='color: red;'>Error loading schema: {str(e)}</div>"
464
+
465
# =========================
# UI LAYOUT
# =========================
with gr.Blocks(title="Text-to-SQL RLHF") as demo:
    # Static header banner.
    gr.HTML("""
    <div style="text-align: center; background-color: #e0e7ff; padding: 20px; border-radius: 10px; margin-bottom: 20px; border: 1px solid #c7d2fe;">
    <h1 style="color: #3730a3; margin-top: 0; margin-bottom: 10px; font-size: 2.2em;"> Text-to-SQL using RLHF + Execution Reward</h1>
    <p style="color: #4f46e5; font-size: 1.1em; margin: 0;">Convert Natural Language to SQL, strictly validated and safely executed on local SQLite databases.</p>
    </div>
    """)

    # Databases selectable in the UI; each must resolve via get_db_path().
    DBS = sorted(["flight_1", "student_assessment", "store_1", "bike_1", "book_2", "chinook_1", "academic", "aircraft", "car_1", "cinema", "club_1", "csu_1", "college_1", "college_2", "company_1", "company_employee", "customer_complaints", "department_store", "employee_hire_evaluation", "museum_visit", "products_for_hire", "restaurant_1", "school_finance", "shop_membership", "small_bank_1", "student_1", "tvshow", "voter_1", "world_1"])

    with gr.Tabs():
        with gr.Tab("Inference"):
            with gr.Row():
                # Left column: input configuration + schema preview.
                with gr.Column(scale=1):
                    gr.Markdown("### 1. Configuration & Input")
                    input_method = gr.Radio(choices=["💡 Pick a Sample", "✍️ Type my own"], value="💡 Pick a Sample", label="How do you want to ask?")
                    sample_dropdown = gr.Dropdown(choices=SAMPLE_QUESTIONS, value=SAMPLE_QUESTIONS[0], label="Select a Sample Question", info="The database will be selected automatically.", visible=True)
                    type_own_warning = gr.Markdown("**⚠️ Please select a Database first, then type your custom question below:**", visible=False)
                    gr.Markdown("---")
                    # Locked while in sample mode; toggle_input_method unlocks it.
                    db_id = gr.Dropdown(choices=DBS, value="chinook_1", label="Select Database", interactive=False)
                    custom_question = gr.Textbox(label="Ask your Custom Question", placeholder="Type your own question here...", lines=3, visible=False)

                    gr.Markdown("#### 📋 Database Structure")
                    gr.HTML("<p style='font-size: 0.85em; color: #64748b; margin-top: -10px; margin-bottom: 5px;'>Use these exact names! Table names are <strong>Dark</strong>, Column names are <span style='color: #94a3b8;'>Light</span>.</p>")
                    schema_display = gr.HTML(value=update_schema("chinook_1"))

                    with gr.Row():
                        clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                        run_btn = gr.Button(" Generate & Run SQL", variant="primary")

                # Right column: SQL, result table, and explanation outputs.
                with gr.Column(scale=2):
                    gr.Markdown("### 2. Execution Results")
                    final_sql = gr.Code(language="sql", label="Final Executed SQL")
                    result_table = gr.Dataframe(label="Query Result Table", interactive=False, wrap=True)
                    explanation = gr.Textbox(label="AI Explanation + Execution Details", lines=8)

        with gr.Tab("Diagnostics"):
            gr.Markdown("## Diagnostics & Telemetry")

            with gr.Accordion("Task 1: Parallel Reward Benchmark", open=False):
                gr.Markdown("*(Simulates the heavy RLHF training workload by running hundreds of complex SQL queries concurrently to test SQLite multi-threading performance.)*")
                t1_n = gr.Number(value=20, precision=0, label="Rollouts (n)")
                t1_workers = gr.Number(value=10, precision=0, label="Max workers")
                t1_run = gr.Button("Run Task 1 benchmark")
                t1_out = gr.Textbox(label="Output", lines=12)
                t1_plot = gr.HTML(label="Plot (if generated)")
                # task1_benchmark is a generator, so the outputs stream live.
                t1_run.click(fn=task1_benchmark, inputs=[t1_n, t1_workers], outputs=[t1_out, t1_plot])

            with gr.Accordion("Task 2: Error Dashboard", open=True):
                gr.Markdown("*(Live telemetry tracking the most common SQL failures. Populates automatically when queries fail in the Inference tab.)*")
                t2_refresh = gr.Button("Refresh dashboard")
                t2_counts = gr.Dataframe(label="Error counts", interactive=False, wrap=True)
                t2_recent = gr.Dataframe(label="Recent errors", interactive=False, wrap=True)
                t2_type = gr.Dropdown(choices=[], value=None, label="Select error type")
                t2_examples = gr.Textbox(label="Examples + hint", lines=10)

                t2_refresh.click(fn=task2_dashboard_structured, inputs=[], outputs=[t2_counts, t2_recent, t2_type])
                t2_type.change(fn=task2_error_examples, inputs=[t2_type], outputs=[t2_examples])

            with gr.Accordion("Task 2: Clause Telemetry", open=False):
                gr.Markdown("*(Analyzes which specific SQL clauses—SELECT, WHERE, JOIN, etc.—are most prone to errors during natural language generation.)*")
                t2_ops_refresh = gr.Button("Refresh SQL-op stats")
                t2_ops_tbl = gr.Dataframe(label="Success/failure by op", interactive=False, wrap=True)
                t2_ops_plot = gr.HTML(label="Op plot")
                t2_ops_refresh.click(fn=task2_ops_table, inputs=[], outputs=[t2_ops_tbl, t2_ops_plot])

    # EVENT BINDING: The .then() forces the diagnostic tab to update live in the background!
    input_method.change(fn=toggle_input_method, inputs=[input_method, sample_dropdown], outputs=[sample_dropdown, type_own_warning, custom_question, db_id])
    sample_dropdown.change(fn=load_sample, inputs=[sample_dropdown], outputs=[db_id])
    db_id.change(fn=update_schema, inputs=[db_id], outputs=[schema_display])

    run_btn.click(
        fn=run_query,
        inputs=[input_method, sample_dropdown, custom_question, db_id],
        outputs=[final_sql, result_table, explanation]
    ).then(
        fn=task2_dashboard_structured, inputs=[], outputs=[t2_counts, t2_recent, t2_type]
    ).then(
        fn=task2_ops_table, inputs=[], outputs=[t2_ops_tbl, t2_ops_plot]
    )

    clear_btn.click(fn=clear_inputs, inputs=[], outputs=[input_method, sample_dropdown, type_own_warning, custom_question, db_id, final_sql, result_table, explanation])
550
+
551
if __name__ == "__main__":
    # Bind address/port are overridable via the standard Gradio env vars.
    server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
    base_port = int(os.environ.get("GRADIO_SERVER_PORT", 7860))
    max_retries = 10

    # Walk forward from the base port until a launch succeeds.
    for port in range(base_port, base_port + max_retries):
        try:
            print(f"Attempting to start Gradio UI on {server_name}:{port}...", flush=True)
            demo.launch(server_name=server_name, server_port=port)
            break  # If successful, exit the loop
        except OSError as e:
            if "Cannot find empty port" in str(e) or "Address already in use" in str(e):
                print(f"⚠️ Port {port} is in use, trying next port...")
                continue
            else:
                # If it's a different OSError, raise it normally
                raise e
    else:
        # for/else: runs only when every candidate port failed without a break.
        print(f"❌ Could not find an open port between {base_port} and {base_port + max_retries - 1}.")
db.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb0e6ef12110c4c9808205cb210a35b5c4412397e15f47ed437e739e161d4213
3
+ size 53803466
int8_dynamic/meta.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "mode": "int8_dynamic",
3
+ "base_model": "Salesforce/codet5-base",
4
+ "adapter_path": "checkpoints/best_rlhf_model_2",
5
+ "created_at_s": 1774418718.320342,
6
+ "estimated_model_bytes": 98804736
7
+ }
int8_dynamic/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f398e044cd49fc84553b746d26ad79beb1dd565d90cf8f6f5e50d27f48d08228
3
+ size 322871519
int8_dynamic/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
int8_dynamic/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,753 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<extra_id_99>",
5
+ "lstrip": true,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<extra_id_98>",
12
+ "lstrip": true,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<extra_id_97>",
19
+ "lstrip": true,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "<extra_id_96>",
26
+ "lstrip": true,
27
+ "normalized": true,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "<extra_id_95>",
33
+ "lstrip": true,
34
+ "normalized": true,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ {
39
+ "content": "<extra_id_94>",
40
+ "lstrip": true,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ {
46
+ "content": "<extra_id_93>",
47
+ "lstrip": true,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false
51
+ },
52
+ {
53
+ "content": "<extra_id_92>",
54
+ "lstrip": true,
55
+ "normalized": true,
56
+ "rstrip": false,
57
+ "single_word": false
58
+ },
59
+ {
60
+ "content": "<extra_id_91>",
61
+ "lstrip": true,
62
+ "normalized": true,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ },
66
+ {
67
+ "content": "<extra_id_90>",
68
+ "lstrip": true,
69
+ "normalized": true,
70
+ "rstrip": false,
71
+ "single_word": false
72
+ },
73
+ {
74
+ "content": "<extra_id_89>",
75
+ "lstrip": true,
76
+ "normalized": true,
77
+ "rstrip": false,
78
+ "single_word": false
79
+ },
80
+ {
81
+ "content": "<extra_id_88>",
82
+ "lstrip": true,
83
+ "normalized": true,
84
+ "rstrip": false,
85
+ "single_word": false
86
+ },
87
+ {
88
+ "content": "<extra_id_87>",
89
+ "lstrip": true,
90
+ "normalized": true,
91
+ "rstrip": false,
92
+ "single_word": false
93
+ },
94
+ {
95
+ "content": "<extra_id_86>",
96
+ "lstrip": true,
97
+ "normalized": true,
98
+ "rstrip": false,
99
+ "single_word": false
100
+ },
101
+ {
102
+ "content": "<extra_id_85>",
103
+ "lstrip": true,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false
107
+ },
108
+ {
109
+ "content": "<extra_id_84>",
110
+ "lstrip": true,
111
+ "normalized": true,
112
+ "rstrip": false,
113
+ "single_word": false
114
+ },
115
+ {
116
+ "content": "<extra_id_83>",
117
+ "lstrip": true,
118
+ "normalized": true,
119
+ "rstrip": false,
120
+ "single_word": false
121
+ },
122
+ {
123
+ "content": "<extra_id_82>",
124
+ "lstrip": true,
125
+ "normalized": true,
126
+ "rstrip": false,
127
+ "single_word": false
128
+ },
129
+ {
130
+ "content": "<extra_id_81>",
131
+ "lstrip": true,
132
+ "normalized": true,
133
+ "rstrip": false,
134
+ "single_word": false
135
+ },
136
+ {
137
+ "content": "<extra_id_80>",
138
+ "lstrip": true,
139
+ "normalized": true,
140
+ "rstrip": false,
141
+ "single_word": false
142
+ },
143
+ {
144
+ "content": "<extra_id_79>",
145
+ "lstrip": true,
146
+ "normalized": true,
147
+ "rstrip": false,
148
+ "single_word": false
149
+ },
150
+ {
151
+ "content": "<extra_id_78>",
152
+ "lstrip": true,
153
+ "normalized": true,
154
+ "rstrip": false,
155
+ "single_word": false
156
+ },
157
+ {
158
+ "content": "<extra_id_77>",
159
+ "lstrip": true,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false
163
+ },
164
+ {
165
+ "content": "<extra_id_76>",
166
+ "lstrip": true,
167
+ "normalized": true,
168
+ "rstrip": false,
169
+ "single_word": false
170
+ },
171
+ {
172
+ "content": "<extra_id_75>",
173
+ "lstrip": true,
174
+ "normalized": true,
175
+ "rstrip": false,
176
+ "single_word": false
177
+ },
178
+ {
179
+ "content": "<extra_id_74>",
180
+ "lstrip": true,
181
+ "normalized": true,
182
+ "rstrip": false,
183
+ "single_word": false
184
+ },
185
+ {
186
+ "content": "<extra_id_73>",
187
+ "lstrip": true,
188
+ "normalized": true,
189
+ "rstrip": false,
190
+ "single_word": false
191
+ },
192
+ {
193
+ "content": "<extra_id_72>",
194
+ "lstrip": true,
195
+ "normalized": true,
196
+ "rstrip": false,
197
+ "single_word": false
198
+ },
199
+ {
200
+ "content": "<extra_id_71>",
201
+ "lstrip": true,
202
+ "normalized": true,
203
+ "rstrip": false,
204
+ "single_word": false
205
+ },
206
+ {
207
+ "content": "<extra_id_70>",
208
+ "lstrip": true,
209
+ "normalized": true,
210
+ "rstrip": false,
211
+ "single_word": false
212
+ },
213
+ {
214
+ "content": "<extra_id_69>",
215
+ "lstrip": true,
216
+ "normalized": true,
217
+ "rstrip": false,
218
+ "single_word": false
219
+ },
220
+ {
221
+ "content": "<extra_id_68>",
222
+ "lstrip": true,
223
+ "normalized": true,
224
+ "rstrip": false,
225
+ "single_word": false
226
+ },
227
+ {
228
+ "content": "<extra_id_67>",
229
+ "lstrip": true,
230
+ "normalized": true,
231
+ "rstrip": false,
232
+ "single_word": false
233
+ },
234
+ {
235
+ "content": "<extra_id_66>",
236
+ "lstrip": true,
237
+ "normalized": true,
238
+ "rstrip": false,
239
+ "single_word": false
240
+ },
241
+ {
242
+ "content": "<extra_id_65>",
243
+ "lstrip": true,
244
+ "normalized": true,
245
+ "rstrip": false,
246
+ "single_word": false
247
+ },
248
+ {
249
+ "content": "<extra_id_64>",
250
+ "lstrip": true,
251
+ "normalized": true,
252
+ "rstrip": false,
253
+ "single_word": false
254
+ },
255
+ {
256
+ "content": "<extra_id_63>",
257
+ "lstrip": true,
258
+ "normalized": true,
259
+ "rstrip": false,
260
+ "single_word": false
261
+ },
262
+ {
263
+ "content": "<extra_id_62>",
264
+ "lstrip": true,
265
+ "normalized": true,
266
+ "rstrip": false,
267
+ "single_word": false
268
+ },
269
+ {
270
+ "content": "<extra_id_61>",
271
+ "lstrip": true,
272
+ "normalized": true,
273
+ "rstrip": false,
274
+ "single_word": false
275
+ },
276
+ {
277
+ "content": "<extra_id_60>",
278
+ "lstrip": true,
279
+ "normalized": true,
280
+ "rstrip": false,
281
+ "single_word": false
282
+ },
283
+ {
284
+ "content": "<extra_id_59>",
285
+ "lstrip": true,
286
+ "normalized": true,
287
+ "rstrip": false,
288
+ "single_word": false
289
+ },
290
+ {
291
+ "content": "<extra_id_58>",
292
+ "lstrip": true,
293
+ "normalized": true,
294
+ "rstrip": false,
295
+ "single_word": false
296
+ },
297
+ {
298
+ "content": "<extra_id_57>",
299
+ "lstrip": true,
300
+ "normalized": true,
301
+ "rstrip": false,
302
+ "single_word": false
303
+ },
304
+ {
305
+ "content": "<extra_id_56>",
306
+ "lstrip": true,
307
+ "normalized": true,
308
+ "rstrip": false,
309
+ "single_word": false
310
+ },
311
+ {
312
+ "content": "<extra_id_55>",
313
+ "lstrip": true,
314
+ "normalized": true,
315
+ "rstrip": false,
316
+ "single_word": false
317
+ },
318
+ {
319
+ "content": "<extra_id_54>",
320
+ "lstrip": true,
321
+ "normalized": true,
322
+ "rstrip": false,
323
+ "single_word": false
324
+ },
325
+ {
326
+ "content": "<extra_id_53>",
327
+ "lstrip": true,
328
+ "normalized": true,
329
+ "rstrip": false,
330
+ "single_word": false
331
+ },
332
+ {
333
+ "content": "<extra_id_52>",
334
+ "lstrip": true,
335
+ "normalized": true,
336
+ "rstrip": false,
337
+ "single_word": false
338
+ },
339
+ {
340
+ "content": "<extra_id_51>",
341
+ "lstrip": true,
342
+ "normalized": true,
343
+ "rstrip": false,
344
+ "single_word": false
345
+ },
346
+ {
347
+ "content": "<extra_id_50>",
348
+ "lstrip": true,
349
+ "normalized": true,
350
+ "rstrip": false,
351
+ "single_word": false
352
+ },
353
+ {
354
+ "content": "<extra_id_49>",
355
+ "lstrip": true,
356
+ "normalized": true,
357
+ "rstrip": false,
358
+ "single_word": false
359
+ },
360
+ {
361
+ "content": "<extra_id_48>",
362
+ "lstrip": true,
363
+ "normalized": true,
364
+ "rstrip": false,
365
+ "single_word": false
366
+ },
367
+ {
368
+ "content": "<extra_id_47>",
369
+ "lstrip": true,
370
+ "normalized": true,
371
+ "rstrip": false,
372
+ "single_word": false
373
+ },
374
+ {
375
+ "content": "<extra_id_46>",
376
+ "lstrip": true,
377
+ "normalized": true,
378
+ "rstrip": false,
379
+ "single_word": false
380
+ },
381
+ {
382
+ "content": "<extra_id_45>",
383
+ "lstrip": true,
384
+ "normalized": true,
385
+ "rstrip": false,
386
+ "single_word": false
387
+ },
388
+ {
389
+ "content": "<extra_id_44>",
390
+ "lstrip": true,
391
+ "normalized": true,
392
+ "rstrip": false,
393
+ "single_word": false
394
+ },
395
+ {
396
+ "content": "<extra_id_43>",
397
+ "lstrip": true,
398
+ "normalized": true,
399
+ "rstrip": false,
400
+ "single_word": false
401
+ },
402
+ {
403
+ "content": "<extra_id_42>",
404
+ "lstrip": true,
405
+ "normalized": true,
406
+ "rstrip": false,
407
+ "single_word": false
408
+ },
409
+ {
410
+ "content": "<extra_id_41>",
411
+ "lstrip": true,
412
+ "normalized": true,
413
+ "rstrip": false,
414
+ "single_word": false
415
+ },
416
+ {
417
+ "content": "<extra_id_40>",
418
+ "lstrip": true,
419
+ "normalized": true,
420
+ "rstrip": false,
421
+ "single_word": false
422
+ },
423
+ {
424
+ "content": "<extra_id_39>",
425
+ "lstrip": true,
426
+ "normalized": true,
427
+ "rstrip": false,
428
+ "single_word": false
429
+ },
430
+ {
431
+ "content": "<extra_id_38>",
432
+ "lstrip": true,
433
+ "normalized": true,
434
+ "rstrip": false,
435
+ "single_word": false
436
+ },
437
+ {
438
+ "content": "<extra_id_37>",
439
+ "lstrip": true,
440
+ "normalized": true,
441
+ "rstrip": false,
442
+ "single_word": false
443
+ },
444
+ {
445
+ "content": "<extra_id_36>",
446
+ "lstrip": true,
447
+ "normalized": true,
448
+ "rstrip": false,
449
+ "single_word": false
450
+ },
451
+ {
452
+ "content": "<extra_id_35>",
453
+ "lstrip": true,
454
+ "normalized": true,
455
+ "rstrip": false,
456
+ "single_word": false
457
+ },
458
+ {
459
+ "content": "<extra_id_34>",
460
+ "lstrip": true,
461
+ "normalized": true,
462
+ "rstrip": false,
463
+ "single_word": false
464
+ },
465
+ {
466
+ "content": "<extra_id_33>",
467
+ "lstrip": true,
468
+ "normalized": true,
469
+ "rstrip": false,
470
+ "single_word": false
471
+ },
472
+ {
473
+ "content": "<extra_id_32>",
474
+ "lstrip": true,
475
+ "normalized": true,
476
+ "rstrip": false,
477
+ "single_word": false
478
+ },
479
+ {
480
+ "content": "<extra_id_31>",
481
+ "lstrip": true,
482
+ "normalized": true,
483
+ "rstrip": false,
484
+ "single_word": false
485
+ },
486
+ {
487
+ "content": "<extra_id_30>",
488
+ "lstrip": true,
489
+ "normalized": true,
490
+ "rstrip": false,
491
+ "single_word": false
492
+ },
493
+ {
494
+ "content": "<extra_id_29>",
495
+ "lstrip": true,
496
+ "normalized": true,
497
+ "rstrip": false,
498
+ "single_word": false
499
+ },
500
+ {
501
+ "content": "<extra_id_28>",
502
+ "lstrip": true,
503
+ "normalized": true,
504
+ "rstrip": false,
505
+ "single_word": false
506
+ },
507
+ {
508
+ "content": "<extra_id_27>",
509
+ "lstrip": true,
510
+ "normalized": true,
511
+ "rstrip": false,
512
+ "single_word": false
513
+ },
514
+ {
515
+ "content": "<extra_id_26>",
516
+ "lstrip": true,
517
+ "normalized": true,
518
+ "rstrip": false,
519
+ "single_word": false
520
+ },
521
+ {
522
+ "content": "<extra_id_25>",
523
+ "lstrip": true,
524
+ "normalized": true,
525
+ "rstrip": false,
526
+ "single_word": false
527
+ },
528
+ {
529
+ "content": "<extra_id_24>",
530
+ "lstrip": true,
531
+ "normalized": true,
532
+ "rstrip": false,
533
+ "single_word": false
534
+ },
535
+ {
536
+ "content": "<extra_id_23>",
537
+ "lstrip": true,
538
+ "normalized": true,
539
+ "rstrip": false,
540
+ "single_word": false
541
+ },
542
+ {
543
+ "content": "<extra_id_22>",
544
+ "lstrip": true,
545
+ "normalized": true,
546
+ "rstrip": false,
547
+ "single_word": false
548
+ },
549
+ {
550
+ "content": "<extra_id_21>",
551
+ "lstrip": true,
552
+ "normalized": true,
553
+ "rstrip": false,
554
+ "single_word": false
555
+ },
556
+ {
557
+ "content": "<extra_id_20>",
558
+ "lstrip": true,
559
+ "normalized": true,
560
+ "rstrip": false,
561
+ "single_word": false
562
+ },
563
+ {
564
+ "content": "<extra_id_19>",
565
+ "lstrip": true,
566
+ "normalized": true,
567
+ "rstrip": false,
568
+ "single_word": false
569
+ },
570
+ {
571
+ "content": "<extra_id_18>",
572
+ "lstrip": true,
573
+ "normalized": true,
574
+ "rstrip": false,
575
+ "single_word": false
576
+ },
577
+ {
578
+ "content": "<extra_id_17>",
579
+ "lstrip": true,
580
+ "normalized": true,
581
+ "rstrip": false,
582
+ "single_word": false
583
+ },
584
+ {
585
+ "content": "<extra_id_16>",
586
+ "lstrip": true,
587
+ "normalized": true,
588
+ "rstrip": false,
589
+ "single_word": false
590
+ },
591
+ {
592
+ "content": "<extra_id_15>",
593
+ "lstrip": true,
594
+ "normalized": true,
595
+ "rstrip": false,
596
+ "single_word": false
597
+ },
598
+ {
599
+ "content": "<extra_id_14>",
600
+ "lstrip": true,
601
+ "normalized": true,
602
+ "rstrip": false,
603
+ "single_word": false
604
+ },
605
+ {
606
+ "content": "<extra_id_13>",
607
+ "lstrip": true,
608
+ "normalized": true,
609
+ "rstrip": false,
610
+ "single_word": false
611
+ },
612
+ {
613
+ "content": "<extra_id_12>",
614
+ "lstrip": true,
615
+ "normalized": true,
616
+ "rstrip": false,
617
+ "single_word": false
618
+ },
619
+ {
620
+ "content": "<extra_id_11>",
621
+ "lstrip": true,
622
+ "normalized": true,
623
+ "rstrip": false,
624
+ "single_word": false
625
+ },
626
+ {
627
+ "content": "<extra_id_10>",
628
+ "lstrip": true,
629
+ "normalized": true,
630
+ "rstrip": false,
631
+ "single_word": false
632
+ },
633
+ {
634
+ "content": "<extra_id_9>",
635
+ "lstrip": true,
636
+ "normalized": true,
637
+ "rstrip": false,
638
+ "single_word": false
639
+ },
640
+ {
641
+ "content": "<extra_id_8>",
642
+ "lstrip": true,
643
+ "normalized": true,
644
+ "rstrip": false,
645
+ "single_word": false
646
+ },
647
+ {
648
+ "content": "<extra_id_7>",
649
+ "lstrip": true,
650
+ "normalized": true,
651
+ "rstrip": false,
652
+ "single_word": false
653
+ },
654
+ {
655
+ "content": "<extra_id_6>",
656
+ "lstrip": true,
657
+ "normalized": true,
658
+ "rstrip": false,
659
+ "single_word": false
660
+ },
661
+ {
662
+ "content": "<extra_id_5>",
663
+ "lstrip": true,
664
+ "normalized": true,
665
+ "rstrip": false,
666
+ "single_word": false
667
+ },
668
+ {
669
+ "content": "<extra_id_4>",
670
+ "lstrip": true,
671
+ "normalized": true,
672
+ "rstrip": false,
673
+ "single_word": false
674
+ },
675
+ {
676
+ "content": "<extra_id_3>",
677
+ "lstrip": true,
678
+ "normalized": true,
679
+ "rstrip": false,
680
+ "single_word": false
681
+ },
682
+ {
683
+ "content": "<extra_id_2>",
684
+ "lstrip": true,
685
+ "normalized": true,
686
+ "rstrip": false,
687
+ "single_word": false
688
+ },
689
+ {
690
+ "content": "<extra_id_1>",
691
+ "lstrip": true,
692
+ "normalized": true,
693
+ "rstrip": false,
694
+ "single_word": false
695
+ },
696
+ {
697
+ "content": "<extra_id_0>",
698
+ "lstrip": true,
699
+ "normalized": true,
700
+ "rstrip": false,
701
+ "single_word": false
702
+ }
703
+ ],
704
+ "bos_token": {
705
+ "content": "<s>",
706
+ "lstrip": false,
707
+ "normalized": true,
708
+ "rstrip": false,
709
+ "single_word": false
710
+ },
711
+ "cls_token": {
712
+ "content": "<s>",
713
+ "lstrip": false,
714
+ "normalized": true,
715
+ "rstrip": false,
716
+ "single_word": false
717
+ },
718
+ "eos_token": {
719
+ "content": "</s>",
720
+ "lstrip": false,
721
+ "normalized": true,
722
+ "rstrip": false,
723
+ "single_word": false
724
+ },
725
+ "mask_token": {
726
+ "content": "<mask>",
727
+ "lstrip": true,
728
+ "normalized": true,
729
+ "rstrip": false,
730
+ "single_word": false
731
+ },
732
+ "pad_token": {
733
+ "content": "<pad>",
734
+ "lstrip": false,
735
+ "normalized": true,
736
+ "rstrip": false,
737
+ "single_word": false
738
+ },
739
+ "sep_token": {
740
+ "content": "</s>",
741
+ "lstrip": false,
742
+ "normalized": true,
743
+ "rstrip": false,
744
+ "single_word": false
745
+ },
746
+ "unk_token": {
747
+ "content": "<unk>",
748
+ "lstrip": false,
749
+ "normalized": true,
750
+ "rstrip": false,
751
+ "single_word": false
752
+ }
753
+ }
int8_dynamic/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
int8_dynamic/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,959 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<pad>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<s>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "32000": {
45
+ "content": "<extra_id_99>",
46
+ "lstrip": true,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "32001": {
53
+ "content": "<extra_id_98>",
54
+ "lstrip": true,
55
+ "normalized": true,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "32002": {
61
+ "content": "<extra_id_97>",
62
+ "lstrip": true,
63
+ "normalized": true,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "32003": {
69
+ "content": "<extra_id_96>",
70
+ "lstrip": true,
71
+ "normalized": true,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "32004": {
77
+ "content": "<extra_id_95>",
78
+ "lstrip": true,
79
+ "normalized": true,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "32005": {
85
+ "content": "<extra_id_94>",
86
+ "lstrip": true,
87
+ "normalized": true,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "32006": {
93
+ "content": "<extra_id_93>",
94
+ "lstrip": true,
95
+ "normalized": true,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "32007": {
101
+ "content": "<extra_id_92>",
102
+ "lstrip": true,
103
+ "normalized": true,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "32008": {
109
+ "content": "<extra_id_91>",
110
+ "lstrip": true,
111
+ "normalized": true,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "32009": {
117
+ "content": "<extra_id_90>",
118
+ "lstrip": true,
119
+ "normalized": true,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "32010": {
125
+ "content": "<extra_id_89>",
126
+ "lstrip": true,
127
+ "normalized": true,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "32011": {
133
+ "content": "<extra_id_88>",
134
+ "lstrip": true,
135
+ "normalized": true,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "32012": {
141
+ "content": "<extra_id_87>",
142
+ "lstrip": true,
143
+ "normalized": true,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "32013": {
149
+ "content": "<extra_id_86>",
150
+ "lstrip": true,
151
+ "normalized": true,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "32014": {
157
+ "content": "<extra_id_85>",
158
+ "lstrip": true,
159
+ "normalized": true,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "32015": {
165
+ "content": "<extra_id_84>",
166
+ "lstrip": true,
167
+ "normalized": true,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "32016": {
173
+ "content": "<extra_id_83>",
174
+ "lstrip": true,
175
+ "normalized": true,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "32017": {
181
+ "content": "<extra_id_82>",
182
+ "lstrip": true,
183
+ "normalized": true,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "32018": {
189
+ "content": "<extra_id_81>",
190
+ "lstrip": true,
191
+ "normalized": true,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "32019": {
197
+ "content": "<extra_id_80>",
198
+ "lstrip": true,
199
+ "normalized": true,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "32020": {
205
+ "content": "<extra_id_79>",
206
+ "lstrip": true,
207
+ "normalized": true,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "32021": {
213
+ "content": "<extra_id_78>",
214
+ "lstrip": true,
215
+ "normalized": true,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "32022": {
221
+ "content": "<extra_id_77>",
222
+ "lstrip": true,
223
+ "normalized": true,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "32023": {
229
+ "content": "<extra_id_76>",
230
+ "lstrip": true,
231
+ "normalized": true,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "32024": {
237
+ "content": "<extra_id_75>",
238
+ "lstrip": true,
239
+ "normalized": true,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "32025": {
245
+ "content": "<extra_id_74>",
246
+ "lstrip": true,
247
+ "normalized": true,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "32026": {
253
+ "content": "<extra_id_73>",
254
+ "lstrip": true,
255
+ "normalized": true,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "32027": {
261
+ "content": "<extra_id_72>",
262
+ "lstrip": true,
263
+ "normalized": true,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "32028": {
269
+ "content": "<extra_id_71>",
270
+ "lstrip": true,
271
+ "normalized": true,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "32029": {
277
+ "content": "<extra_id_70>",
278
+ "lstrip": true,
279
+ "normalized": true,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "32030": {
285
+ "content": "<extra_id_69>",
286
+ "lstrip": true,
287
+ "normalized": true,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "32031": {
293
+ "content": "<extra_id_68>",
294
+ "lstrip": true,
295
+ "normalized": true,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "32032": {
301
+ "content": "<extra_id_67>",
302
+ "lstrip": true,
303
+ "normalized": true,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "32033": {
309
+ "content": "<extra_id_66>",
310
+ "lstrip": true,
311
+ "normalized": true,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "32034": {
317
+ "content": "<extra_id_65>",
318
+ "lstrip": true,
319
+ "normalized": true,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "32035": {
325
+ "content": "<extra_id_64>",
326
+ "lstrip": true,
327
+ "normalized": true,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "32036": {
333
+ "content": "<extra_id_63>",
334
+ "lstrip": true,
335
+ "normalized": true,
336
+ "rstrip": false,
337
+ "single_word": false,
338
+ "special": true
339
+ },
340
+ "32037": {
341
+ "content": "<extra_id_62>",
342
+ "lstrip": true,
343
+ "normalized": true,
344
+ "rstrip": false,
345
+ "single_word": false,
346
+ "special": true
347
+ },
348
+ "32038": {
349
+ "content": "<extra_id_61>",
350
+ "lstrip": true,
351
+ "normalized": true,
352
+ "rstrip": false,
353
+ "single_word": false,
354
+ "special": true
355
+ },
356
+ "32039": {
357
+ "content": "<extra_id_60>",
358
+ "lstrip": true,
359
+ "normalized": true,
360
+ "rstrip": false,
361
+ "single_word": false,
362
+ "special": true
363
+ },
364
+ "32040": {
365
+ "content": "<extra_id_59>",
366
+ "lstrip": true,
367
+ "normalized": true,
368
+ "rstrip": false,
369
+ "single_word": false,
370
+ "special": true
371
+ },
372
+ "32041": {
373
+ "content": "<extra_id_58>",
374
+ "lstrip": true,
375
+ "normalized": true,
376
+ "rstrip": false,
377
+ "single_word": false,
378
+ "special": true
379
+ },
380
+ "32042": {
381
+ "content": "<extra_id_57>",
382
+ "lstrip": true,
383
+ "normalized": true,
384
+ "rstrip": false,
385
+ "single_word": false,
386
+ "special": true
387
+ },
388
+ "32043": {
389
+ "content": "<extra_id_56>",
390
+ "lstrip": true,
391
+ "normalized": true,
392
+ "rstrip": false,
393
+ "single_word": false,
394
+ "special": true
395
+ },
396
+ "32044": {
397
+ "content": "<extra_id_55>",
398
+ "lstrip": true,
399
+ "normalized": true,
400
+ "rstrip": false,
401
+ "single_word": false,
402
+ "special": true
403
+ },
404
+ "32045": {
405
+ "content": "<extra_id_54>",
406
+ "lstrip": true,
407
+ "normalized": true,
408
+ "rstrip": false,
409
+ "single_word": false,
410
+ "special": true
411
+ },
412
+ "32046": {
413
+ "content": "<extra_id_53>",
414
+ "lstrip": true,
415
+ "normalized": true,
416
+ "rstrip": false,
417
+ "single_word": false,
418
+ "special": true
419
+ },
420
+ "32047": {
421
+ "content": "<extra_id_52>",
422
+ "lstrip": true,
423
+ "normalized": true,
424
+ "rstrip": false,
425
+ "single_word": false,
426
+ "special": true
427
+ },
428
+ "32048": {
429
+ "content": "<extra_id_51>",
430
+ "lstrip": true,
431
+ "normalized": true,
432
+ "rstrip": false,
433
+ "single_word": false,
434
+ "special": true
435
+ },
436
+ "32049": {
437
+ "content": "<extra_id_50>",
438
+ "lstrip": true,
439
+ "normalized": true,
440
+ "rstrip": false,
441
+ "single_word": false,
442
+ "special": true
443
+ },
444
+ "32050": {
445
+ "content": "<extra_id_49>",
446
+ "lstrip": true,
447
+ "normalized": true,
448
+ "rstrip": false,
449
+ "single_word": false,
450
+ "special": true
451
+ },
452
+ "32051": {
453
+ "content": "<extra_id_48>",
454
+ "lstrip": true,
455
+ "normalized": true,
456
+ "rstrip": false,
457
+ "single_word": false,
458
+ "special": true
459
+ },
460
+ "32052": {
461
+ "content": "<extra_id_47>",
462
+ "lstrip": true,
463
+ "normalized": true,
464
+ "rstrip": false,
465
+ "single_word": false,
466
+ "special": true
467
+ },
468
+ "32053": {
469
+ "content": "<extra_id_46>",
470
+ "lstrip": true,
471
+ "normalized": true,
472
+ "rstrip": false,
473
+ "single_word": false,
474
+ "special": true
475
+ },
476
+ "32054": {
477
+ "content": "<extra_id_45>",
478
+ "lstrip": true,
479
+ "normalized": true,
480
+ "rstrip": false,
481
+ "single_word": false,
482
+ "special": true
483
+ },
484
+ "32055": {
485
+ "content": "<extra_id_44>",
486
+ "lstrip": true,
487
+ "normalized": true,
488
+ "rstrip": false,
489
+ "single_word": false,
490
+ "special": true
491
+ },
492
+ "32056": {
493
+ "content": "<extra_id_43>",
494
+ "lstrip": true,
495
+ "normalized": true,
496
+ "rstrip": false,
497
+ "single_word": false,
498
+ "special": true
499
+ },
500
+ "32057": {
501
+ "content": "<extra_id_42>",
502
+ "lstrip": true,
503
+ "normalized": true,
504
+ "rstrip": false,
505
+ "single_word": false,
506
+ "special": true
507
+ },
508
+ "32058": {
509
+ "content": "<extra_id_41>",
510
+ "lstrip": true,
511
+ "normalized": true,
512
+ "rstrip": false,
513
+ "single_word": false,
514
+ "special": true
515
+ },
516
+ "32059": {
517
+ "content": "<extra_id_40>",
518
+ "lstrip": true,
519
+ "normalized": true,
520
+ "rstrip": false,
521
+ "single_word": false,
522
+ "special": true
523
+ },
524
+ "32060": {
525
+ "content": "<extra_id_39>",
526
+ "lstrip": true,
527
+ "normalized": true,
528
+ "rstrip": false,
529
+ "single_word": false,
530
+ "special": true
531
+ },
532
+ "32061": {
533
+ "content": "<extra_id_38>",
534
+ "lstrip": true,
535
+ "normalized": true,
536
+ "rstrip": false,
537
+ "single_word": false,
538
+ "special": true
539
+ },
540
+ "32062": {
541
+ "content": "<extra_id_37>",
542
+ "lstrip": true,
543
+ "normalized": true,
544
+ "rstrip": false,
545
+ "single_word": false,
546
+ "special": true
547
+ },
548
+ "32063": {
549
+ "content": "<extra_id_36>",
550
+ "lstrip": true,
551
+ "normalized": true,
552
+ "rstrip": false,
553
+ "single_word": false,
554
+ "special": true
555
+ },
556
+ "32064": {
557
+ "content": "<extra_id_35>",
558
+ "lstrip": true,
559
+ "normalized": true,
560
+ "rstrip": false,
561
+ "single_word": false,
562
+ "special": true
563
+ },
564
+ "32065": {
565
+ "content": "<extra_id_34>",
566
+ "lstrip": true,
567
+ "normalized": true,
568
+ "rstrip": false,
569
+ "single_word": false,
570
+ "special": true
571
+ },
572
+ "32066": {
573
+ "content": "<extra_id_33>",
574
+ "lstrip": true,
575
+ "normalized": true,
576
+ "rstrip": false,
577
+ "single_word": false,
578
+ "special": true
579
+ },
580
+ "32067": {
581
+ "content": "<extra_id_32>",
582
+ "lstrip": true,
583
+ "normalized": true,
584
+ "rstrip": false,
585
+ "single_word": false,
586
+ "special": true
587
+ },
588
+ "32068": {
589
+ "content": "<extra_id_31>",
590
+ "lstrip": true,
591
+ "normalized": true,
592
+ "rstrip": false,
593
+ "single_word": false,
594
+ "special": true
595
+ },
596
+ "32069": {
597
+ "content": "<extra_id_30>",
598
+ "lstrip": true,
599
+ "normalized": true,
600
+ "rstrip": false,
601
+ "single_word": false,
602
+ "special": true
603
+ },
604
+ "32070": {
605
+ "content": "<extra_id_29>",
606
+ "lstrip": true,
607
+ "normalized": true,
608
+ "rstrip": false,
609
+ "single_word": false,
610
+ "special": true
611
+ },
612
+ "32071": {
613
+ "content": "<extra_id_28>",
614
+ "lstrip": true,
615
+ "normalized": true,
616
+ "rstrip": false,
617
+ "single_word": false,
618
+ "special": true
619
+ },
620
+ "32072": {
621
+ "content": "<extra_id_27>",
622
+ "lstrip": true,
623
+ "normalized": true,
624
+ "rstrip": false,
625
+ "single_word": false,
626
+ "special": true
627
+ },
628
+ "32073": {
629
+ "content": "<extra_id_26>",
630
+ "lstrip": true,
631
+ "normalized": true,
632
+ "rstrip": false,
633
+ "single_word": false,
634
+ "special": true
635
+ },
636
+ "32074": {
637
+ "content": "<extra_id_25>",
638
+ "lstrip": true,
639
+ "normalized": true,
640
+ "rstrip": false,
641
+ "single_word": false,
642
+ "special": true
643
+ },
644
+ "32075": {
645
+ "content": "<extra_id_24>",
646
+ "lstrip": true,
647
+ "normalized": true,
648
+ "rstrip": false,
649
+ "single_word": false,
650
+ "special": true
651
+ },
652
+ "32076": {
653
+ "content": "<extra_id_23>",
654
+ "lstrip": true,
655
+ "normalized": true,
656
+ "rstrip": false,
657
+ "single_word": false,
658
+ "special": true
659
+ },
660
+ "32077": {
661
+ "content": "<extra_id_22>",
662
+ "lstrip": true,
663
+ "normalized": true,
664
+ "rstrip": false,
665
+ "single_word": false,
666
+ "special": true
667
+ },
668
+ "32078": {
669
+ "content": "<extra_id_21>",
670
+ "lstrip": true,
671
+ "normalized": true,
672
+ "rstrip": false,
673
+ "single_word": false,
674
+ "special": true
675
+ },
676
+ "32079": {
677
+ "content": "<extra_id_20>",
678
+ "lstrip": true,
679
+ "normalized": true,
680
+ "rstrip": false,
681
+ "single_word": false,
682
+ "special": true
683
+ },
684
+ "32080": {
685
+ "content": "<extra_id_19>",
686
+ "lstrip": true,
687
+ "normalized": true,
688
+ "rstrip": false,
689
+ "single_word": false,
690
+ "special": true
691
+ },
692
+ "32081": {
693
+ "content": "<extra_id_18>",
694
+ "lstrip": true,
695
+ "normalized": true,
696
+ "rstrip": false,
697
+ "single_word": false,
698
+ "special": true
699
+ },
700
+ "32082": {
701
+ "content": "<extra_id_17>",
702
+ "lstrip": true,
703
+ "normalized": true,
704
+ "rstrip": false,
705
+ "single_word": false,
706
+ "special": true
707
+ },
708
+ "32083": {
709
+ "content": "<extra_id_16>",
710
+ "lstrip": true,
711
+ "normalized": true,
712
+ "rstrip": false,
713
+ "single_word": false,
714
+ "special": true
715
+ },
716
+ "32084": {
717
+ "content": "<extra_id_15>",
718
+ "lstrip": true,
719
+ "normalized": true,
720
+ "rstrip": false,
721
+ "single_word": false,
722
+ "special": true
723
+ },
724
+ "32085": {
725
+ "content": "<extra_id_14>",
726
+ "lstrip": true,
727
+ "normalized": true,
728
+ "rstrip": false,
729
+ "single_word": false,
730
+ "special": true
731
+ },
732
+ "32086": {
733
+ "content": "<extra_id_13>",
734
+ "lstrip": true,
735
+ "normalized": true,
736
+ "rstrip": false,
737
+ "single_word": false,
738
+ "special": true
739
+ },
740
+ "32087": {
741
+ "content": "<extra_id_12>",
742
+ "lstrip": true,
743
+ "normalized": true,
744
+ "rstrip": false,
745
+ "single_word": false,
746
+ "special": true
747
+ },
748
+ "32088": {
749
+ "content": "<extra_id_11>",
750
+ "lstrip": true,
751
+ "normalized": true,
752
+ "rstrip": false,
753
+ "single_word": false,
754
+ "special": true
755
+ },
756
+ "32089": {
757
+ "content": "<extra_id_10>",
758
+ "lstrip": true,
759
+ "normalized": true,
760
+ "rstrip": false,
761
+ "single_word": false,
762
+ "special": true
763
+ },
764
+ "32090": {
765
+ "content": "<extra_id_9>",
766
+ "lstrip": true,
767
+ "normalized": true,
768
+ "rstrip": false,
769
+ "single_word": false,
770
+ "special": true
771
+ },
772
+ "32091": {
773
+ "content": "<extra_id_8>",
774
+ "lstrip": true,
775
+ "normalized": true,
776
+ "rstrip": false,
777
+ "single_word": false,
778
+ "special": true
779
+ },
780
+ "32092": {
781
+ "content": "<extra_id_7>",
782
+ "lstrip": true,
783
+ "normalized": true,
784
+ "rstrip": false,
785
+ "single_word": false,
786
+ "special": true
787
+ },
788
+ "32093": {
789
+ "content": "<extra_id_6>",
790
+ "lstrip": true,
791
+ "normalized": true,
792
+ "rstrip": false,
793
+ "single_word": false,
794
+ "special": true
795
+ },
796
+ "32094": {
797
+ "content": "<extra_id_5>",
798
+ "lstrip": true,
799
+ "normalized": true,
800
+ "rstrip": false,
801
+ "single_word": false,
802
+ "special": true
803
+ },
804
+ "32095": {
805
+ "content": "<extra_id_4>",
806
+ "lstrip": true,
807
+ "normalized": true,
808
+ "rstrip": false,
809
+ "single_word": false,
810
+ "special": true
811
+ },
812
+ "32096": {
813
+ "content": "<extra_id_3>",
814
+ "lstrip": true,
815
+ "normalized": true,
816
+ "rstrip": false,
817
+ "single_word": false,
818
+ "special": true
819
+ },
820
+ "32097": {
821
+ "content": "<extra_id_2>",
822
+ "lstrip": true,
823
+ "normalized": true,
824
+ "rstrip": false,
825
+ "single_word": false,
826
+ "special": true
827
+ },
828
+ "32098": {
829
+ "content": "<extra_id_1>",
830
+ "lstrip": true,
831
+ "normalized": true,
832
+ "rstrip": false,
833
+ "single_word": false,
834
+ "special": true
835
+ },
836
+ "32099": {
837
+ "content": "<extra_id_0>",
838
+ "lstrip": true,
839
+ "normalized": true,
840
+ "rstrip": false,
841
+ "single_word": false,
842
+ "special": true
843
+ }
844
+ },
845
+ "additional_special_tokens": [
846
+ "<extra_id_99>",
847
+ "<extra_id_98>",
848
+ "<extra_id_97>",
849
+ "<extra_id_96>",
850
+ "<extra_id_95>",
851
+ "<extra_id_94>",
852
+ "<extra_id_93>",
853
+ "<extra_id_92>",
854
+ "<extra_id_91>",
855
+ "<extra_id_90>",
856
+ "<extra_id_89>",
857
+ "<extra_id_88>",
858
+ "<extra_id_87>",
859
+ "<extra_id_86>",
860
+ "<extra_id_85>",
861
+ "<extra_id_84>",
862
+ "<extra_id_83>",
863
+ "<extra_id_82>",
864
+ "<extra_id_81>",
865
+ "<extra_id_80>",
866
+ "<extra_id_79>",
867
+ "<extra_id_78>",
868
+ "<extra_id_77>",
869
+ "<extra_id_76>",
870
+ "<extra_id_75>",
871
+ "<extra_id_74>",
872
+ "<extra_id_73>",
873
+ "<extra_id_72>",
874
+ "<extra_id_71>",
875
+ "<extra_id_70>",
876
+ "<extra_id_69>",
877
+ "<extra_id_68>",
878
+ "<extra_id_67>",
879
+ "<extra_id_66>",
880
+ "<extra_id_65>",
881
+ "<extra_id_64>",
882
+ "<extra_id_63>",
883
+ "<extra_id_62>",
884
+ "<extra_id_61>",
885
+ "<extra_id_60>",
886
+ "<extra_id_59>",
887
+ "<extra_id_58>",
888
+ "<extra_id_57>",
889
+ "<extra_id_56>",
890
+ "<extra_id_55>",
891
+ "<extra_id_54>",
892
+ "<extra_id_53>",
893
+ "<extra_id_52>",
894
+ "<extra_id_51>",
895
+ "<extra_id_50>",
896
+ "<extra_id_49>",
897
+ "<extra_id_48>",
898
+ "<extra_id_47>",
899
+ "<extra_id_46>",
900
+ "<extra_id_45>",
901
+ "<extra_id_44>",
902
+ "<extra_id_43>",
903
+ "<extra_id_42>",
904
+ "<extra_id_41>",
905
+ "<extra_id_40>",
906
+ "<extra_id_39>",
907
+ "<extra_id_38>",
908
+ "<extra_id_37>",
909
+ "<extra_id_36>",
910
+ "<extra_id_35>",
911
+ "<extra_id_34>",
912
+ "<extra_id_33>",
913
+ "<extra_id_32>",
914
+ "<extra_id_31>",
915
+ "<extra_id_30>",
916
+ "<extra_id_29>",
917
+ "<extra_id_28>",
918
+ "<extra_id_27>",
919
+ "<extra_id_26>",
920
+ "<extra_id_25>",
921
+ "<extra_id_24>",
922
+ "<extra_id_23>",
923
+ "<extra_id_22>",
924
+ "<extra_id_21>",
925
+ "<extra_id_20>",
926
+ "<extra_id_19>",
927
+ "<extra_id_18>",
928
+ "<extra_id_17>",
929
+ "<extra_id_16>",
930
+ "<extra_id_15>",
931
+ "<extra_id_14>",
932
+ "<extra_id_13>",
933
+ "<extra_id_12>",
934
+ "<extra_id_11>",
935
+ "<extra_id_10>",
936
+ "<extra_id_9>",
937
+ "<extra_id_8>",
938
+ "<extra_id_7>",
939
+ "<extra_id_6>",
940
+ "<extra_id_5>",
941
+ "<extra_id_4>",
942
+ "<extra_id_3>",
943
+ "<extra_id_2>",
944
+ "<extra_id_1>",
945
+ "<extra_id_0>"
946
+ ],
947
+ "bos_token": "<s>",
948
+ "clean_up_tokenization_spaces": true,
949
+ "cls_token": "<s>",
950
+ "eos_token": "</s>",
951
+ "errors": "replace",
952
+ "mask_token": "<mask>",
953
+ "model_max_length": 512,
954
+ "pad_token": "<pad>",
955
+ "sep_token": "</s>",
956
+ "tokenizer_class": "RobertaTokenizer",
957
+ "trim_offsets": true,
958
+ "unk_token": "<unk>"
959
+ }
int8_dynamic/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==5.8.0
2
+ pandas
3
+ sqlparse
4
+ transformers
5
+ torch --index-url https://download.pytorch.org/whl/cpu
6
+ peft
7
+ trl
8
+ sentencepiece
9
+ matplotlib
10
+ huggingface_hub
scripts/__pycache__/benchmark_parallel_reward.cpython-310.pyc ADDED
Binary file (6.31 kB). View file
 
scripts/__pycache__/benchmark_parallel_reward.cpython-313.pyc ADDED
Binary file (10.3 kB). View file
 
scripts/__pycache__/benchmark_quantization.cpython-310.pyc ADDED
Binary file (3.79 kB). View file
 
scripts/__pycache__/benchmark_rollout_generation.cpython-310.pyc ADDED
Binary file (2.75 kB). View file
 
scripts/__pycache__/quantize_export.cpython-310.pyc ADDED
Binary file (2.05 kB). View file
 
scripts/__pycache__/quantized_infer_harness.cpython-310.pyc ADDED
Binary file (1.62 kB). View file
 
scripts/benchmark_parallel_reward.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ # Ensure headless-safe matplotlib + writable cache when called from Gradio/subprocess.
3
+ os.environ.setdefault("MPLBACKEND", "Agg")
4
+ os.environ.setdefault("MPLCONFIGDIR", os.environ.get("MPLCONFIGDIR", "/tmp/mplconfig"))
5
+ import time
6
+ import json
7
+ import argparse
8
+ import matplotlib.pyplot as plt
9
+ import numpy as np
10
+ import sys
11
+ from pathlib import Path
12
+
13
+ # ==========================================
14
+ # RELATIVE PATH RESOLUTION
15
+ # ==========================================
16
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
17
+ sys.path.append(str(PROJECT_ROOT))
18
+
19
+ # Dynamically resolve where the databases are kept
20
+ if (PROJECT_ROOT / "data" / "database").exists() and list((PROJECT_ROOT / "data" / "database").rglob("*.sqlite")):
21
+ DB_ROOT = PROJECT_ROOT / "data" / "database"
22
+ else:
23
+ DB_ROOT = PROJECT_ROOT / "final_databases"
24
+
25
+ from src.execution_reward import (
26
+ execution_reward_batch_sequential,
27
+ execution_reward_batch_parallel,
28
+ execution_reward_batch_parallel_by_db,
29
+ execution_reward_timed,
30
+ set_use_cache,
31
+ set_use_schema_validation,
32
+ clear_result_cache
33
+ )
34
+
35
def generate_mock_rollouts(num_rollouts: int = 100, heavy_n: int = 500_000):
    """Generates heavy queries across multiple databases to properly test true concurrency."""
    print(f"\nGenerating {num_rollouts} heavy rollouts to simulate RLHF query workload...", flush=True)

    # Discover every SQLite file under the resolved database root.
    real_dbs = [str(p) for p in DB_ROOT.rglob("*.sqlite")]

    if not real_dbs:
        print(f"❌ CRITICAL ERROR: No real databases found in {DB_ROOT}. Cannot run benchmark.", flush=True)
        sys.exit(1)
    print(f"Found {len(real_dbs)} real SQLite databases in {DB_ROOT}. Distributing workload...", flush=True)

    rollouts = []
    for idx in range(num_rollouts):
        # Round-robin the workload across every available database file.
        db_path = real_dbs[idx % len(real_dbs)]

        # Heavy deterministic CPU-ish query (may be cut off by the 2s timeout depending on machine).
        heavy_sql = f"""
        WITH RECURSIVE cnt(x) AS (
            SELECT 1
            UNION ALL
            SELECT x+1 FROM cnt WHERE x < {heavy_n + (idx % 10_000)}
        )
        SELECT sum(x) FROM cnt;
        """
        clean_sql = heavy_sql.replace("\n", " ").strip()
        # Rollout tuple is (pred_sql, db_path, gold_sql); pred == gold here.
        rollouts.append((clean_sql, db_path, clean_sql))
        if num_rollouts >= 500 and (idx + 1) % 250 == 0:
            print(f"  generated {idx + 1}/{num_rollouts}...", flush=True)

    return rollouts
67
+
68
def profile_bottlenecks(rollouts, sample_size: int = 20, print_every: int = 5):
    """Profiles CPU usage to identify time spent in parsing, planning, and execution.

    Args:
        rollouts: List of (pred_sql, db_path, gold_sql) tuples.
        sample_size: How many rollouts to profile (clamped to len(rollouts)).
        print_every: Progress-print interval; falsy disables progress output.
    """
    # Clamp BEFORE printing so the banner reports the real sample size.
    # (The old banner hard-coded "100 Rollouts" regardless of the input.)
    sample_size = min(int(sample_size), len(rollouts))

    print("\n" + "=" * 65)
    print(f" 🔍 CPU PROFILING: IDENTIFYING BOTTLENECKS ({sample_size} Rollouts)")
    print("=" * 65)

    clear_result_cache()
    set_use_cache(False)  # Disable cache to force real work
    set_use_schema_validation(False)  # CTE-heavy benchmark queries may fail schema validation

    total_parse = 0.0
    total_plan = 0.0
    total_exec = 0.0

    # Profile a small subset by default so the script prints quickly.
    sample_rollouts = rollouts[:sample_size]

    for i, (pred, db, gold) in enumerate(sample_rollouts, 1):
        _, timings = execution_reward_timed(pred, db, gold, measure_plan=True)
        total_parse += timings['parse_s']
        total_plan += timings['plan_s']
        total_exec += timings['exec_s']
        if print_every and (i % int(print_every) == 0 or i == sample_size):
            print(f"  profiled {i}/{sample_size}...", flush=True)

    total_time = total_parse + total_plan + total_exec
    if total_time == 0:
        total_time = 0.0001  # Prevent division by zero when all phases are instant

    # Guard sample_size == 0 too (rollouts may be empty) — previously a ZeroDivisionError.
    denom = max(sample_size, 1)

    print(f"{'Phase':<15} | {'Avg Time (ms)':<15} | {'% of Total CPU':<15}")
    print("-" * 65)
    print(f"{'Regex Parsing':<15} | {(total_parse/denom)*1000:<15.2f} | {(total_parse/total_time)*100:<14.1f}%")
    print(f"{'Query Planning':<15} | {(total_plan/denom)*1000:<15.2f} | {(total_plan/total_time)*100:<14.1f}%")
    print(f"{'DB Execution':<15} | {(total_exec/denom)*1000:<15.2f} | {(total_exec/total_time)*100:<14.1f}%")
    print("="*65 + "\n")
103
+
104
def run_benchmark_for_setting(rollouts, use_cache: bool, max_workers: int):
    """Time one sequential and one parallel pass over *rollouts*; return timings + speedup."""
    set_use_cache(use_cache)
    set_use_schema_validation(False)  # benchmark focuses on execution speed

    # --- Sequential pass ---
    clear_result_cache()
    t0 = time.perf_counter()
    execution_reward_batch_sequential(rollouts)
    sequential_s = time.perf_counter() - t0

    # --- Parallel pass: 1 thread per DB (recommended) ---
    clear_result_cache()
    t0 = time.perf_counter()
    execution_reward_batch_parallel_by_db(rollouts, max_workers=max_workers)
    parallel_s = time.perf_counter() - t0

    return {
        "sequential_s": sequential_s,
        "parallel_s": parallel_s,
        "speedup": sequential_s / parallel_s if parallel_s > 0 else 0,
    }
128
+
129
def print_comparison_table(results):
    """Render the benchmark results as a fixed-width console comparison table.

    Expects results["with_cache"] / results["without_cache"] dicts, each with
    'sequential_s', 'parallel_s' and 'speedup' keys.
    """
    print("="*65)
    print(f"{'Setting':<16} | {'Sequential (s)':<14} | {'Parallel (s)':<14} | {'Speedup':<10}")
    print("-" * 65)
    rows = (("With Cache", "with_cache"), ("Without Cache", "without_cache"))
    for label, key in rows:
        entry = results[key]
        print(
            f"{label:<16} | {entry['sequential_s']:<14.4f} | "
            f"{entry['parallel_s']:<14.4f} | {entry['speedup']:<9.2f}x"
        )
    print("="*65 + "\n")
139
+
140
def plot_results(results, output_path: str):
    """Save a grouped bar chart comparing sequential vs parallel wall time to *output_path*."""
    labels = ['With Cache', 'Without Cache']
    keys = ('with_cache', 'without_cache')
    seq_times = [results[k]['sequential_s'] for k in keys]
    par_times = [results[k]['parallel_s'] for k in keys]

    positions = np.arange(len(labels))
    bar_width = 0.35

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(positions - bar_width / 2, seq_times, bar_width, label='Sequential', color='#4C72B0')
    ax.bar(positions + bar_width / 2, par_times, bar_width, label='Parallel', color='#DD8452')

    ax.set_ylabel('Execution Time (seconds)')
    ax.set_title('Text2SQL Reward Execution: Sequential vs Parallel')
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    ax.legend()

    # Annotate each bar with its numeric height for quick reading.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f', padding=3)

    fig.tight_layout()
    plt.savefig(output_path, dpi=300)
    plt.close()
164
+
165
def main():
    """CLI entry point: generate rollouts, optionally profile, then run both benchmarks."""
    parser = argparse.ArgumentParser(description="Benchmark SQL Execution Reward")
    parser.add_argument("--n", type=int, default=1000, help="Number of rollouts to benchmark")
    parser.add_argument("--max-workers", type=int, default=20, help="Max workers for parallel execution")
    parser.add_argument("--heavy-n", type=int, default=200_000, help="Recursive CTE upper bound (controls heaviness)")
    parser.add_argument("--skip-profile", action="store_true", help="Skip the CPU profiling section for faster startup")
    parser.add_argument("--profile-n", type=int, default=20, help="Number of rollouts to use for CPU profiling")
    args = parser.parse_args()

    os.makedirs(str(PROJECT_ROOT / "results"), exist_ok=True)

    rollouts = generate_mock_rollouts(args.n, heavy_n=args.heavy_n)

    if not args.skip_profile:
        profile_bottlenecks(rollouts, sample_size=args.profile_n)

    print("Starting Main Scalability Benchmarks...")

    print("Running Experiment A: Cache ENABLED...")
    cached_run = run_benchmark_for_setting(rollouts, use_cache=True, max_workers=args.max_workers)

    print("Running Experiment B: Cache DISABLED...")
    uncached_run = run_benchmark_for_setting(rollouts, use_cache=False, max_workers=args.max_workers)

    final_results = {
        "with_cache": cached_run,
        "without_cache": uncached_run,
    }

    # Persist raw numbers alongside the human-readable table and plot.
    json_path = str(PROJECT_ROOT / "results" / "task1_results.json")
    with open(json_path, 'w') as f:
        json.dump(final_results, f, indent=4)

    print_comparison_table(final_results)
    plot_results(final_results, str(PROJECT_ROOT / "results" / "task1_plot.png"))


if __name__ == "__main__":
    main()
scripts/benchmark_quantization.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ import time
7
+ from pathlib import Path
8
+ from typing import Dict, List, Tuple
9
+
10
+ import numpy as np
11
+ import torch
12
+
13
+ from src.execution_reward import execution_reward
14
+ from src.prompting import encode_prompt
15
+ from src.quantization_utils import load_fp32_model, load_quant_artifact
16
+
17
+
18
+ def _load_dev_items(root: Path, n: int, seed: int = 42) -> List[dict]:
19
+ data = json.loads((root / "data" / "dev.json").read_text())
20
+ if n >= len(data):
21
+ return data
22
+ rng = np.random.default_rng(seed)
23
+ idxs = rng.choice(len(data), size=n, replace=False)
24
+ return [data[int(i)] for i in idxs]
25
+
26
+
27
def _bench_variant(name: str, tok, model, items: List[dict], device: str) -> Dict[str, float]:
    """Benchmark one model variant: per-item generation latency plus execution accuracy.

    Returns a dict with sample count, EX accuracy in [0, 1], and mean/p50/p90 latency.
    """
    latencies: List[float] = []
    exact_hits = 0

    # Warm up the tokenizer/encoding path on the first item so it does not
    # inflate the first measured latency.
    if items:
        first = items[0]
        _ = encode_prompt(tok, first["question"], first["db_id"], device=device, max_input_tokens=512).unsqueeze(0)

    for item in items:
        db_id = item["db_id"]
        db_path = str(Path("data") / "database" / db_id / f"{db_id}.sqlite")

        input_ids = encode_prompt(tok, item["question"], db_id, device=device, max_input_tokens=512).unsqueeze(0)
        t0 = time.perf_counter()
        out = model.generate(input_ids=input_ids, max_new_tokens=120, num_beams=8, repetition_penalty=1.2)
        latencies.append(time.perf_counter() - t0)

        pred = tok.decode(out[0], skip_special_tokens=True).strip()
        # A reward of 1.0 means the prediction's execution result matched gold.
        if float(execution_reward(pred, db_path, item["query"])) >= 1.0:
            exact_hits += 1

    if latencies:
        mean = float(np.mean(latencies))
        p50 = float(np.percentile(latencies, 50))
        p90 = float(np.percentile(latencies, 90))
    else:
        mean = p50 = p90 = 0.0

    return {
        "n": float(len(items)),
        "ex": float(exact_hits / max(len(items), 1)),
        "lat_mean_s": mean,
        "lat_p50_s": p50,
        "lat_p90_s": p90,
    }
63
+
64
+
65
def main() -> None:
    """Benchmark the fp32 baseline against optional int8 artifacts and dump a JSON report."""
    p = argparse.ArgumentParser(description="Benchmark fp32 vs quantized artifacts (CPU-focused).")
    p.add_argument("--base_model", default=os.environ.get("BASE_MODEL", "Salesforce/codet5-base"))
    p.add_argument("--adapter", default="", help="Optional adapter for fp32 baseline.")
    p.add_argument("--artifact_int8", default="", help="Artifact dir exported by scripts/quantize_export.py")
    p.add_argument("--artifact_int8_decoder", default="", help="Artifact dir for decoder-only int8")
    p.add_argument("--num_samples", type=int, default=100)
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--out", default="results/task5_quant_bench.json")
    p.add_argument("--local_only", action="store_true")
    args = p.parse_args()

    device = "cpu"
    items = _load_dev_items(Path("."), args.num_samples, args.seed)

    report: Dict[str, Dict[str, float]] = {}

    # Baseline: full-precision model (optionally with a LoRA adapter applied).
    tok, fp32 = load_fp32_model(
        args.base_model,
        adapter_path=args.adapter.strip() or None,
        device=device,
        local_only=args.local_only,
    )
    report["fp32"] = _bench_variant("fp32", tok, fp32, items, device)

    # Optional quantized variants, benchmarked in the same order as before.
    for label, artifact in (
        ("int8_dynamic", args.artifact_int8),
        ("int8_decoder_dynamic", args.artifact_int8_decoder),
    ):
        if artifact:
            tok_q, model_q, _meta = load_quant_artifact(artifact, device=device, local_only=True)
            report[label] = _bench_variant(label, tok_q, model_q, items, device)

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(report, indent=2))
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    torch.set_grad_enabled(False)
    main()
+
scripts/benchmark_rollout_generation.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ import time
7
+ from pathlib import Path
8
+ from typing import List
9
+
10
+ import numpy as np
11
+ import torch
12
+
13
+ from src.prompting import encode_prompt
14
+ from src.quantization_utils import load_fp32_model, load_quant_artifact
15
+
16
+
17
+ def _load_items(root: Path, n: int, seed: int = 42) -> List[dict]:
18
+ data = json.loads((root / "data" / "dev.json").read_text())
19
+ if n >= len(data):
20
+ return data
21
+ rng = np.random.default_rng(seed)
22
+ idxs = rng.choice(len(data), size=n, replace=False)
23
+ return [data[int(i)] for i in idxs]
24
+
25
+
26
def _bench_generate(tok, model, items: List[dict], device: str) -> float:
    """Return total wall-clock seconds to generate one rollout per item (beam=4, 64 new tokens)."""
    start = time.perf_counter()
    for item in items:
        ids = encode_prompt(tok, item["question"], item["db_id"], device=device, max_input_tokens=512).unsqueeze(0)
        _ = model.generate(input_ids=ids, max_new_tokens=64, num_beams=4)
    return time.perf_counter() - start
32
+
33
+
34
def main() -> None:
    """Measure rollout-generation throughput for fp32 and (optionally) a quantized artifact."""
    p = argparse.ArgumentParser(description="Benchmark rollout generation latency for RL loops.")
    p.add_argument("--base_model", default=os.environ.get("BASE_MODEL", "Salesforce/codet5-base"))
    p.add_argument("--adapter", default="")
    p.add_argument("--artifact", default="", help="Quantized artifact dir (optional).")
    p.add_argument("--num_rollouts", type=int, default=128)
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--local_only", action="store_true")
    args = p.parse_args()

    device = "cpu"
    items = _load_items(Path("."), args.num_rollouts, args.seed)

    def report(label: str, elapsed: float) -> None:
        # Throughput guard: avoid division by zero for pathologically fast runs.
        rate = len(items) / max(elapsed, 1e-9)
        print(f"{label}: {elapsed:.2f}s for {len(items)} rollouts ({rate:.2f} rollouts/s)")

    tok, fp32 = load_fp32_model(
        args.base_model,
        adapter_path=args.adapter.strip() or None,
        device=device,
        local_only=args.local_only,
    )
    report("fp32", _bench_generate(tok, fp32, items, device))

    if args.artifact:
        tok_q, model_q, meta = load_quant_artifact(args.artifact, device=device, local_only=True)
        elapsed_q = _bench_generate(tok_q, model_q, items, device)
        report(meta.get("mode", "quant"), elapsed_q)


if __name__ == "__main__":
    torch.set_grad_enabled(False)
    main()
scripts/error_dashboard.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import sys
from collections import Counter

# ==============================
# LOAD LOGS
# ==============================
with open("results/error_logs.json") as f:
    logs = json.load(f)

total_errors = len(logs)

# Bail out early on an empty log: every statistic below divides by the
# error count (previously a ZeroDivisionError / IndexError crash).
if total_errors == 0:
    print("No errors logged — nothing to analyse.")
    sys.exit(0)

# ==============================
# ERROR DISTRIBUTION
# ==============================
error_counts = Counter(e["error_type"] for e in logs)

print("\n" + "="*50)
print("📊 TEXT-to-SQL ERROR DASHBOARD")
print("="*50)

print(f"\n🔢 Total Errors Logged: {total_errors}")

print("\n📊 ERROR DISTRIBUTION:")
print("-"*30)
for error_type, count in error_counts.items():
    percent = (count / total_errors) * 100
    print(f"{error_type:<20} : {count:>4} ({percent:.1f}%)")

# ==============================
# TOP ERROR
# ==============================
top_error = error_counts.most_common(1)[0]

print("\n🔥 MOST COMMON ERROR:")
print("-"*30)
print(f"{top_error[0]} ({top_error[1]} times)")

# ==============================
# SQL OPERATION ANALYSIS
# ==============================
# Count which clauses appear in the failing queries (case-insensitive).
lowered = [e["sql"].lower() for e in logs]
join_count = sum("join" in sql for sql in lowered)
where_count = sum("where" in sql for sql in lowered)
group_count = sum("group by" in sql for sql in lowered)
order_count = sum("order by" in sql for sql in lowered)

print("\n🧠 SQL OPERATION ANALYSIS:")
print("-"*30)
print(f"JOIN used in : {join_count} queries")
print(f"WHERE used in : {where_count} queries")
print(f"GROUP BY used in : {group_count} queries")
print(f"ORDER BY used in : {order_count} queries")

# ==============================
# SAMPLE ERRORS
# ==============================
print("\n🧪 SAMPLE ERROR CASES:")
print("-"*50)

for i, e in enumerate(logs[:3], 1):
    print(f"\nCase {i}:")
    print(f"Q : {e['question']}")
    print(f"SQL : {e['sql']}")
    print(f"Type: {e['error_type']}")

# ==============================
# FINAL INSIGHT
# ==============================
print("\n📌 FINAL INSIGHT:")
print("-"*30)

if top_error[0] == "wrong_column":
    print("⚠️ Model struggles with column selection (schema understanding issue).")
elif top_error[0] == "wrong_table":
    print("⚠️ Model struggles with correct table mapping.")
elif top_error[0] == "syntax_error":
    print("⚠️ Model generates invalid SQL syntax.")
else:
    print("⚠️ Mixed errors — needs general improvement.")

print("\n" + "="*50)
print("✅ DASHBOARD COMPLETE")
print("="*50)
99
+
scripts/evaluate.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import sqlite3
5
+ from contextlib import closing
6
+ from typing import Dict, List
7
+
8
+ import torch
9
+ from datasets import load_dataset
10
+ from peft import PeftModel
11
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
12
+ from trl import AutoModelForSeq2SeqLMWithValueHead
13
+
14
+ import sys
15
+
16
+ PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
17
+ sys.path.append(PROJECT_ROOT)
18
+ from src.execution_reward import execution_reward # noqa: E402
19
+
20
+
21
+ BASE_MODEL = os.environ.get("BASE_MODEL", "t5-small")
22
+ DB_ROOT = os.path.join(PROJECT_ROOT, "data", "database")
23
+
24
+ # Prefer RL best model if present; otherwise fall back.
25
+ RL_DIR = os.path.join(PROJECT_ROOT, "outputs", "rlhf_text2sql", "best_model")
26
+ if not os.path.isdir(RL_DIR):
27
+ RL_DIR = os.path.join(PROJECT_ROOT, "outputs", "rlhf_text2sql")
28
+
29
+ SPLIT = "train[:100]" # quick sanity check
30
+ MAX_NEW_TOKENS = 128
31
+
32
+ PREFIX = "translate English to SQL:"
33
+ MAX_SCHEMA_CHARS = 1500
34
+ MAX_INPUT_TOKENS = 512
35
+
36
+
37
+ os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
38
+ device = "mps" if torch.backends.mps.is_available() else "cpu"
39
+ print("Using device:", device)
40
+
41
+
42
def get_db_path(db_id: str) -> str:
    """Return the on-disk path of the Spider SQLite database for *db_id*."""
    return os.path.join(DB_ROOT, db_id, db_id + ".sqlite")
44
+
45
+
46
+ _SCHEMA_CACHE: Dict[str, str] = {}
47
+
48
+
49
def get_db_schema_text(db_path: str) -> str:
    """Return a compact "table(col, ...) " schema summary for *db_path*, memoised.

    Falls back to an empty string on any database error; results (possibly
    truncated to MAX_SCHEMA_CHARS) are cached per path.
    """
    cached = _SCHEMA_CACHE.get(db_path)
    if cached is not None:
        return cached

    schema_text = ""
    try:
        with closing(sqlite3.connect(db_path)) as conn:
            cur = conn.cursor()
            table_rows = cur.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';"
            ).fetchall()
            pieces = []
            for (table_name,) in table_rows:
                cols = cur.execute(f'PRAGMA table_info("{table_name}")').fetchall()
                col_names = [c[1] for c in cols if c and isinstance(c[1], str)]
                pieces.append(f"{table_name}({', '.join(col_names)}) ")
            schema_text = "".join(pieces)
    except Exception:
        # Best-effort: a missing/corrupt DB simply yields no schema text.
        schema_text = ""

    schema_text = schema_text[:MAX_SCHEMA_CHARS]
    _SCHEMA_CACHE[db_path] = schema_text
    return schema_text
69
+
70
+
71
def encode_prompt(tokenizer, question: str, schema: str) -> torch.Tensor:
    """Tokenise "prefix + schema + question" into at most MAX_INPUT_TOKENS ids.

    Truncation policy: the schema segment absorbs all cuts first; the fixed
    parts (prefix/mid/suffix) are only trimmed if they alone exceed the budget.
    An EOS token, when the tokenizer defines one, is always appended last.
    """
    schema = (schema or "")[:MAX_SCHEMA_CHARS]
    head = f"{PREFIX}\n\nSchema:\n"
    mid = "\n\nQuestion:\n"
    tail = f"{question}\n\nSQL:"

    head_ids = tokenizer.encode(head, add_special_tokens=False)
    schema_ids = tokenizer.encode(schema, add_special_tokens=False)
    mid_ids = tokenizer.encode(mid, add_special_tokens=False)
    tail_ids = tokenizer.encode(tail, add_special_tokens=False)

    eos_id = tokenizer.eos_token_id
    budget = MAX_INPUT_TOKENS - (1 if eos_id is not None else 0)

    fixed_len = len(head_ids) + len(mid_ids) + len(tail_ids)
    if fixed_len > budget:
        # Even with no schema the prompt is too long: trim the question/SQL tail.
        keep = max(0, budget - (len(head_ids) + len(mid_ids)))
        tail_ids = tail_ids[:keep]
        fixed_len = len(head_ids) + len(mid_ids) + len(tail_ids)

    schema_budget = max(0, budget - fixed_len)
    schema_ids = schema_ids[:schema_budget]

    ids = (head_ids + schema_ids + mid_ids + tail_ids)[:budget]
    if eos_id is not None:
        ids = ids + [eos_id]

    return torch.tensor(ids, dtype=torch.long).to(device)
101
+
102
+
103
def load_model_and_tokenizer():
    """Load the best available checkpoint, in order of preference.

    1. PPO-saved value-head model at RL_DIR.
    2. RL_DIR interpreted as a LoRA adapter over BASE_MODEL.
    3. The SFT adapter under checkpoints/sft_adapter.
    """
    # Preferred: the PPO-saved value-head model.
    try:
        tok = AutoTokenizer.from_pretrained(RL_DIR)
        mdl = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(RL_DIR).to(device)
        return tok, mdl
    except Exception:
        pass

    # Fallback: treat RL_DIR as a LoRA adapter on top of the base model.
    tok = AutoTokenizer.from_pretrained(BASE_MODEL)
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token
    base = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)
    try:
        base = PeftModel.from_pretrained(base, RL_DIR)
    except Exception:
        # Final fallback: use SFT adapter (if RL adapter not found)
        sft_dir = os.path.join(PROJECT_ROOT, "checkpoints", "sft_adapter")
        base = PeftModel.from_pretrained(base, sft_dir)
    return tok, base
124
+
125
+
126
def main() -> None:
    """Evaluate the loaded model on a Spider slice with execution-based metrics."""
    tokenizer, model = load_model_and_tokenizer()
    model.eval()

    ds = load_dataset("spider", split=SPLIT)

    correct = 0
    valid = 0

    for i, ex in enumerate(ds, start=1):
        db_path = get_db_path(ex["db_id"])
        prompt = encode_prompt(tokenizer, ex["question"], get_db_schema_text(db_path))

        with torch.no_grad():
            out = model.generate(
                input_ids=prompt.unsqueeze(0),
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=False,
                num_beams=1,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        pred_sql = tokenizer.decode(out[0], skip_special_tokens=True)

        # Reward semantics (from src.execution_reward): > -1.0 means the SQL
        # executed; >= 1.0 means its results matched the gold query.
        reward = execution_reward(pred_sql, db_path, ex["query"])
        if reward > -1.0:
            valid += 1
        if reward >= 1.0:
            correct += 1

        if i % 25 == 0:
            print(f"Evaluated {i}/{len(ds)}")

    n = len(ds)
    print("\nRESULTS")
    print(f"examples: {n}")
    print(f"execution_accuracy: {correct/n:.3f}")
    print(f"valid_sql_rate: {valid/n:.3f}")


if __name__ == "__main__":
    main()
+ main()
scripts/plot_task2.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Render a two-panel error-diagnostic dashboard for Task 2.

Left panel: distribution of failure causes (horizontal bars).
Right panel: SQL clauses present in failed queries (vertical bars).
Output is saved to 'error_diagnostic_plot.png' and shown interactively.
"""
import matplotlib.pyplot as plt
import seaborn as sns

# ==========================================
# 1. EXTRACTED DATA FROM TERMINAL
# ==========================================
# Error Distribution Data (counts sum to 77, matching the plot title)
error_types = ['wrong_column', 'wrong_table', 'ambiguous_column', 'other']
error_counts = [61, 11, 4, 1]

# SQL Operation Analysis Data
sql_ops = ['WHERE', 'JOIN', 'ORDER BY', 'GROUP BY']
op_counts = [55, 36, 20, 14]

# ==========================================
# 2. SET UP THE DASHBOARD LAYOUT
# ==========================================
# Use a clean, modern aesthetic
sns.set_theme(style="whitegrid")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# ==========================================
# 3. PLOT 1: ERROR DISTRIBUTION (Horizontal Bar)
# ==========================================
# NOTE(review): seaborn >= 0.13 warns on `palette=` without `hue=`;
# works today, but may need `hue=` + `legend=False` on upgrade — confirm.
sns.barplot(x=error_counts, y=error_types, ax=ax1, palette="flare")
ax1.set_title('Primary Cause of Failure (Total: 77 Errors)', fontsize=14, pad=15, fontweight='bold')
ax1.set_xlabel('Number of Queries')
ax1.set_ylabel('')

# Add actual numbers next to the bars
for i, v in enumerate(error_counts):
    ax1.text(v + 1.5, i, f"{v}", color='#333333', va='center', fontweight='bold')

# ==========================================
# 4. PLOT 2: SQL OPERATIONS (Vertical Bar)
# ==========================================
sns.barplot(x=sql_ops, y=op_counts, ax=ax2, palette="crest")
ax2.set_title('Clauses Present in Failed Queries', fontsize=14, pad=15, fontweight='bold')
ax2.set_ylabel('Frequency')
ax2.set_xlabel('')

# Add actual numbers on top of the bars
for i, v in enumerate(op_counts):
    ax2.text(i, v + 1, str(v), color='#333333', ha='center', fontweight='bold')

# ==========================================
# 5. RENDER AND SAVE
# ==========================================
plt.suptitle('Text-to-SQL Error Diagnostic Dashboard', fontsize=18, fontweight='heavy', y=1.05)
sns.despine(left=True, bottom=True)  # Removes clunky borders
plt.tight_layout()

# Save the plot as a high-res image for your report!
plt.savefig('error_diagnostic_plot.png', dpi=300, bbox_inches='tight')
print("✅ Plot successfully saved as 'error_diagnostic_plot.png'")

# Display the plot
plt.show()
scripts/plot_task3.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Single bar chart: constraint satisfaction with vs. without constrained decoding."""
import matplotlib.pyplot as plt

# Satisfaction percentages observed in Task 3.
bar_labels = ["Without", "With"]
satisfaction_pct = [0, 88]

plt.figure()
plt.bar(bar_labels, satisfaction_pct)
plt.title("Constraint Satisfaction (Task 3)")
plt.ylabel("Percentage")

# Persist the figure, then show it interactively.
plt.savefig("task3_constraint.png")
plt.show()
scripts/plot_task3_plotly.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Interactive Plotly dashboard comparing FP32 vs. INT8 model variants.

Left panel: execution accuracy per variant. Right panel: grouped latency
profile (P50 / mean / P90). Output is written to
'task5_quantization_dashboard.html' and shown in the browser.
"""
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# ==========================================
# 1. YOUR DATA
# ==========================================
models = ['FP32 (Base)', 'INT8 Dynamic', 'INT8 Decoder-Only']

# Accuracy (multiplied by 100 for percentage)
accuracy = [36.0, 36.0, 38.0]

# Latency metrics (seconds per query)
lat_mean = [3.11, 1.65, 1.66]
lat_p50 = [2.94, 1.54, 1.56]
lat_p90 = [4.64, 2.44, 2.48]

# ==========================================
# 2. SET UP THE SIDE-BY-SIDE LAYOUT
# ==========================================
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=(
        "<b>Model Accuracy (Execution)</b>",
        "<b>Inference Latency Profile</b>"
    ),
    horizontal_spacing=0.1
)

# ==========================================
# 3. LEFT CHART: ACCURACY
# ==========================================
fig.add_trace(go.Bar(
    x=models,
    y=accuracy,
    name="Execution Accuracy",
    marker_color=['#94a3b8', '#38bdf8', '#10b981'],  # Gray, Blue, Green
    text=[f"{val:.1f}%" for val in accuracy],
    textposition='auto',
    textfont=dict(size=14, color='white', family="Arial Black"),
    showlegend=False  # single series; the legend belongs to the latency chart
), row=1, col=1)

# ==========================================
# 4. RIGHT CHART: LATENCY PROFILE
# ==========================================
# P50 Latency
fig.add_trace(go.Bar(
    x=models, y=lat_p50,
    name="Median (P50)",
    marker_color="#ece80a"  # Yellow
), row=1, col=2)

# Mean Latency
fig.add_trace(go.Bar(
    x=models, y=lat_mean,
    name="Mean Latency",
    marker_color="#3b4da9"  # Indigo blue
), row=1, col=2)

# P90 Latency
fig.add_trace(go.Bar(
    x=models, y=lat_p90,
    name="90th Percentile (P90)",
    marker_color="#d974e2"  # Magenta/pink
), row=1, col=2)

# ==========================================
# 5. APPLY ULTRA-MODERN STYLING
# ==========================================
fig.update_layout(
    title=dict(
        text="<b>Task 5: FP32 vs. INT8 Quantization Performance</b>",
        font=dict(size=22, color='#1e293b'),
        x=0.5
    ),
    plot_bgcolor='white',
    paper_bgcolor='white',
    barmode='group',  # group the three latency series side by side per model
    legend=dict(
        orientation="h",
        yanchor="bottom", y=1.05,
        xanchor="center", x=0.8,
        bgcolor='rgba(255,255,255,0.8)'
    ),
    font=dict(family="-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif"),
    margin=dict(t=120, b=60, l=60, r=40)
)

# Style Left Axes
fig.update_yaxes(title_text="<b>Accuracy (%)</b>", range=[0, 45], gridcolor='#f1f5f9', row=1, col=1)
fig.update_xaxes(tickfont=dict(weight='bold'), row=1, col=1)

# Style Right Axes
fig.update_yaxes(title_text="<b>Seconds per Query</b>", gridcolor='#f1f5f9', row=1, col=2)
fig.update_xaxes(tickfont=dict(weight='bold'), row=1, col=2)

# ==========================================
# 6. RENDER AND SAVE
# ==========================================
html_file = "task5_quantization_dashboard.html"
fig.write_html(html_file)
print(f"✅ Interactive Plotly Dashboard saved to: {html_file}")
fig.show()
scripts/quantize_export.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import torch
8
+
9
+ from src.quantization_utils import (
10
+ load_bnb_quantized_model,
11
+ load_fp32_model,
12
+ quantize_dynamic_int8,
13
+ quantize_dynamic_int8_decoder_only,
14
+ save_quant_artifact,
15
+ )
16
+
17
+
18
def main() -> None:
    """CLI entry point: load the requested model variant and export an artifact.

    Supported modes: fp32 (no quantization), int8_dynamic /
    int8_decoder_dynamic (CPU torch dynamic quantization), int8_bnb /
    int4_bnb (bitsandbytes, CUDA).
    """
    parser = argparse.ArgumentParser(
        description="Export quantized Seq2Seq model artifacts for CPU inference."
    )
    parser.add_argument("--base_model", default=os.environ.get("BASE_MODEL", "Salesforce/codet5-base"))
    parser.add_argument("--adapter", default="", help="Optional LoRA adapter directory.")
    parser.add_argument("--out_dir", required=True, help="Output directory for artifact.")
    parser.add_argument(
        "--mode",
        required=True,
        choices=["fp32", "int8_dynamic", "int8_decoder_dynamic", "int8_bnb", "int4_bnb"],
    )
    parser.add_argument("--device", default="cpu", help="cpu|cuda (bnb requires cuda)")
    parser.add_argument("--local_only", action="store_true", help="Do not hit network; use HF cache only.")
    args = parser.parse_args()

    adapter = args.adapter.strip() or None
    out_dir = Path(args.out_dir)
    mode = args.mode

    if mode in ("fp32", "int8_dynamic", "int8_decoder_dynamic"):
        # Dynamic-quant modes always load on CPU; fp32 honors --device.
        load_device = args.device if mode == "fp32" else "cpu"
        tok, model = load_fp32_model(
            args.base_model,
            adapter_path=adapter,
            device=load_device,
            local_only=args.local_only,
        )
        if mode == "int8_dynamic":
            model = quantize_dynamic_int8(model)
        elif mode == "int8_decoder_dynamic":
            model = quantize_dynamic_int8_decoder_only(model)
    elif mode == "int8_bnb":
        # Note: saving bnb quantized weights in a portable way is non-trivial;
        # we still save state_dict for reference.
        tok, model = load_bnb_quantized_model(
            args.base_model,
            adapter_path=adapter,
            device=args.device,
            local_only=args.local_only,
            load_in_8bit=True,
        )
    else:  # int4_bnb (argparse `choices` guarantees this is exhaustive)
        tok, model = load_bnb_quantized_model(
            args.base_model,
            adapter_path=adapter,
            device=args.device,
            local_only=args.local_only,
            load_in_4bit=True,
        )

    save_quant_artifact(
        out_dir,
        mode=mode,
        base_model=args.base_model,
        adapter_path=adapter,
        tokenizer=tok,
        model=model,
    )


if __name__ == "__main__":
    # Export is inference-only; disable autograd globally.
    torch.set_grad_enabled(False)
    main()
86
+
scripts/quantized_infer_harness.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import time
6
+ from pathlib import Path
7
+
8
+ from src.quantized_text2sql_engine import QuantizedText2SQLEngine
9
+
10
+
11
def main() -> None:
    """Run a batch of Spider dev questions through a quantized engine.

    Times the batch end-to-end, writes a JSON throughput report (with a
    10-item result sample) to --out, and echoes it to stdout.
    """
    parser = argparse.ArgumentParser(description="Production-style inference harness for quantized artifacts.")
    parser.add_argument("--artifact", required=True, help="Quant artifact dir from scripts/quantize_export.py")
    parser.add_argument("--num_samples", type=int, default=128)
    parser.add_argument("--out", default="results/task5_quant_infer.json")
    args = parser.parse_args()

    dev_path = Path(".") / "data" / "dev.json"
    dev_examples = json.loads(dev_path.read_text())[: args.num_samples]

    engine = QuantizedText2SQLEngine(args.artifact, device="cpu")
    question_db_pairs = [(ex["question"], ex["db_id"]) for ex in dev_examples]

    start = time.perf_counter()
    results = engine.ask_batch_execute(question_db_pairs)
    elapsed = time.perf_counter() - start

    report = {
        "n": len(results),
        "seconds": elapsed,
        # Guard the denominator so a zero-length batch cannot divide by zero.
        "qps": len(results) / max(elapsed, 1e-9),
        "artifact": args.artifact,
        "meta": engine.meta,
        "results": results[:10],  # sample
    }

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(report, indent=2))
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    main()
46
+
src/__pycache__/execution_reward.cpython-310.pyc ADDED
Binary file (11.2 kB). View file
 
src/__pycache__/quantization_utils.cpython-310.pyc ADDED
Binary file (6.23 kB). View file
 
src/__pycache__/quantized_text2sql_engine.cpython-310.pyc ADDED
Binary file (8.95 kB). View file
 
src/__pycache__/schema_encoder.cpython-310.pyc ADDED
Binary file (1.75 kB). View file
 
src/__pycache__/schema_utils.cpython-310.pyc ADDED
Binary file (3.64 kB). View file
 
src/__pycache__/sql_validator.cpython-310.pyc ADDED
Binary file (5.29 kB). View file
 
src/__pycache__/text2sql_engine.cpython-310.pyc ADDED
Binary file (8.34 kB). View file
 
src/ask.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ TERMINAL CHAT WITH DATABASE
3
+ Run:
4
+ python src/ask.py chinook_1
5
+ """
6
+
7
+ import sys
8
+ from text2sql_engine import get_engine
9
+
10
+
11
+ # -------------------------------
12
+ # Pretty table printer
13
+ # -------------------------------
14
def print_table(cols, rows, limit=20):
    """Pretty-print query results as an aligned ASCII table.

    Shows at most `limit` rows. Column widths adapt to the widest value
    among the displayed rows (minimum 12 characters). Ragged rows are
    tolerated: extra cells are dropped, missing cells are left blank.
    """
    if not rows or not cols:
        print("No results\n")
        return

    cols = [str(c) for c in cols]
    n_cols = len(cols)
    widths = [max(len(c), 12) for c in cols]

    shown = rows[:limit]
    for r in shown:
        # BUG FIX: clamp to n_cols — a row longer than the header used to
        # raise IndexError on widths[i].
        for i, val in enumerate(r[:n_cols]):
            widths[i] = max(widths[i], len(str(val)))

    header = " | ".join(cols[i].ljust(widths[i]) for i in range(n_cols))
    print("\n" + header)
    print("-" * len(header))

    for r in shown:
        # Pad short rows with empty strings so every printed line has n_cols cells.
        cells = [str(r[i]) if i < len(r) else "" for i in range(n_cols)]
        print(" | ".join(cells[i].ljust(widths[i]) for i in range(n_cols)))

    if len(rows) > limit:
        print(f"\n... showing first {limit} rows of {len(rows)}")

    print()
+
39
+
40
+ # -------------------------------
41
+ # Main loop
42
+ # -------------------------------
43
def main():
    """Interactive REPL: translate English questions to SQL for one database.

    Usage: python src/ask.py <db_id>. Type 'exit'/'quit' (or Ctrl-C) to leave.
    """
    if len(sys.argv) < 2:
        print("Usage: python src/ask.py <db_id>")
        return

    db_id = sys.argv[1].strip()

    print("Loading model... (first time takes 20-40s)")
    engine = get_engine()

    print(f"\nConnected to database: {db_id}")
    print("Type 'exit' to quit\n")

    while True:
        try:
            question = input("Ask> ").strip()
            if not question:
                continue
            if question.lower() in ["exit", "quit"]:
                break

            result = engine.ask(question, db_id)
            if result is None:
                print("Model returned no output\n")
                continue

            print("\nGenerated SQL:")
            print(result.get("sql", "<no sql>"))

            if result.get("error"):
                print("\nSQL Error:")
                print(result["error"])
            else:
                print_table(result.get("columns", []), result.get("rows", []))

        except KeyboardInterrupt:
            break
        except Exception as exc:
            # Keep the REPL alive on any single-question failure.
            print("\nRuntime error:", exc, "\n")

    print("\nBye!")


if __name__ == "__main__":
    main()
src/component_analysis.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sqlite3
3
+ import torch
4
+ import re
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np
7
+ from pathlib import Path
8
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
9
+ from peft import PeftModel
10
+
11
+ PROJECT_ROOT = Path(__file__).resolve().parents[1]
12
+ DB_ROOT = PROJECT_ROOT / "data" / "database"
13
+
14
+ # -------------------------------
15
+ # Extract SQL components
16
+ # -------------------------------
17
def extract_components(sql):
    """Return a flag dict marking which major SQL clauses appear in `sql`.

    Matching is case-insensitive substring search; AND/OR are matched with
    surrounding spaces to reduce accidental in-word hits.
    """
    lowered = sql.lower()

    def has(fragment):
        return fragment in lowered

    return {
        "select": has("select"),
        "where": has("where"),
        "group": has("group by"),
        "order": has("order by"),
        "and_or": has(" and ") or has(" or "),
        "join": has("join"),
    }
27
+
28
+ # -------------------------------
29
+ # Fallback Difficulty Estimator
30
+ # -------------------------------
31
def estimate_difficulty(sql):
    """Heuristically bucket a SQL query into easy/medium/hard/extra.

    Fallback used when a dataset example has no 'difficulty' field.
    Buckets: set operations or >2 joins -> extra; 2 joins or GROUP BY with
    conditions -> hard; 1 join or GROUP BY / ORDER BY -> medium; else easy.
    """
    sql = sql.lower()
    # BUG FIX: the previous substring counts matched inside words —
    # count("and") hit "brand", count("or") hit "order"/"score" — inflating
    # the condition count and mis-bucketing queries. Match whole words only.
    joins = len(re.findall(r"\bjoin\b", sql))
    conditions = len(re.findall(r"\b(?:and|or)\b", sql))

    if "intersect" in sql or "except" in sql or "union" in sql or joins > 2:
        return "extra"
    elif joins == 2 or ("group by" in sql and conditions > 0):
        return "hard"
    elif joins == 1 or "group by" in sql or "order by" in sql:
        return "medium"
    else:
        return "easy"
45
+
46
+ # -------------------------------
47
+ # Load schema
48
+ # -------------------------------
49
def load_schema(db_path):
    """Read a SQLite file and return its schema as 'table(col1, col2)' lines.

    Byte-decoding errors in DB text are ignored so malformed text columns
    don't crash schema extraction.
    """
    conn = sqlite3.connect(db_path)
    conn.text_factory = lambda b: b.decode(errors='ignore')
    try:
        cursor = conn.cursor()
        tables = cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table';"
        ).fetchall()

        schema = ""
        for (table,) in tables:
            # BUG FIX: quote the identifier — unquoted names with spaces or
            # reserved keywords made the PRAGMA a syntax error.
            cols = cursor.execute(f'PRAGMA table_info("{table}");').fetchall()
            col_names = [c[1] for c in cols]  # row layout: (cid, name, type, ...)
            schema += f"{table}({', '.join(col_names)})\n"
    finally:
        # BUG FIX: close the connection even when an execute raises.
        conn.close()
    return schema
66
+
67
+ # -------------------------------
68
+ # Prompt
69
+ # -------------------------------
70
def build_prompt(question, schema):
    """Assemble the schema-conditioned translation prompt fed to the model."""
    segments = [
        "Database Schema:",
        schema,
        "",
        "Translate English to SQL:",
        question,
        "SQL:",
        "",  # trailing newline after "SQL:" so generation starts on a new line
    ]
    return "\n".join(segments)
78
+
79
+ # -------------------------------
80
+ # Main
81
+ # -------------------------------
82
def main():
    """Evaluate component-level accuracy of the RL-tuned model on Spider dev.

    Loads the base CodeT5 model, merges the LoRA adapter, generates SQL for
    up to 1000 dev examples, then tracks (a) overall exact-string accuracy
    per difficulty bucket and (b) per-clause (SELECT/WHERE/GROUP/...)
    presence-match accuracy per difficulty. Saves a grouped bar chart to
    component_by_difficulty_plot.png and prints a summary table.
    """
    adapter = "checkpoints/rl_step_1800"
    base_model = "Salesforce/codet5-base"

    # Prefer Apple MPS, then CUDA, then CPU.
    device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")

    print("Loading tokenizer and models...")
    tokenizer = AutoTokenizer.from_pretrained(adapter)
    base = AutoModelForSeq2SeqLM.from_pretrained(base_model).to(device)
    model = PeftModel.from_pretrained(base, adapter).to(device)
    # Fold LoRA weights into the base model for faster inference.
    model = model.merge_and_unload()
    model.eval()

    dev_json = PROJECT_ROOT / "data" / "dev.json"

    with open(dev_json) as f:
        dev = json.load(f)[:1000]  # Adjust number to test more/less

    components_list = ["select", "where", "group", "order", "and_or", "join"]
    difficulties_list = ["easy", "medium", "hard", "extra"]

    # Nested dictionary for components:
    # stats[component][difficulty] -> {"correct": hits, "total": gold occurrences}
    stats = {
        comp: {diff: {"correct": 0, "total": 0} for diff in difficulties_list}
        for comp in components_list
    }

    # 🚀 NEW: Trackers for OVERALL accuracy by difficulty
    overall_correct = {diff: 0 for diff in difficulties_list}
    overall_total = {diff: 0 for diff in difficulties_list}

    print(f"\nRunning grouped evaluation on {len(dev)} examples...\n")

    for i, ex in enumerate(dev, 1):
        question = ex["question"]
        gold_sql = ex["query"]
        db_id = ex["db_id"]

        # Determine difficulty: dataset label if present, else heuristic;
        # unknown labels fall back to "medium".
        difficulty = ex.get("difficulty", estimate_difficulty(gold_sql))
        if difficulty not in difficulties_list:
            difficulty = "medium"

        db_path = DB_ROOT / db_id / f"{db_id}.sqlite"
        schema = load_schema(db_path)
        prompt = build_prompt(question, schema)

        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Deterministic beam search; large max_new_tokens to avoid truncation.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1000,
                num_beams=4,
                do_sample=False
            )

        pred_sql = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # If the model echoed the prompt, keep only the text after "SQL:".
        if "SQL:" in pred_sql:
            pred_sql = pred_sql.split("SQL:")[-1]

        # --- 1. Update Overall Accuracy Trackers ---
        overall_total[difficulty] += 1
        # Simple string match for quick overall accuracy
        # (stricter than execution accuracy: equivalent SQL counts as wrong).
        if pred_sql.strip().lower() == gold_sql.strip().lower():
            overall_correct[difficulty] += 1

        # --- 2. Update Component Stats ---
        pred_comp = extract_components(pred_sql)
        gold_comp = extract_components(gold_sql)

        for comp in components_list:
            if gold_comp[comp]:  # If the gold SQL required this component
                stats[comp][difficulty]["total"] += 1
                if pred_comp[comp]:  # If the model successfully generated it
                    stats[comp][difficulty]["correct"] += 1

        if i % 20 == 0:
            print(f"Processed {i}/{len(dev)}")

    # -------------------------------
    # Plotting (Grouped Bar Chart)
    # -------------------------------
    x = np.arange(len(components_list))
    width = 0.2  # four difficulty bars per component group

    def get_acc(diff):
        # Percentage accuracy per component for one difficulty (0 when unseen).
        return [
            (stats[comp][diff]["correct"] / stats[comp][diff]["total"] * 100) if stats[comp][diff]["total"] > 0 else 0
            for comp in components_list
        ]

    acc_easy = get_acc("easy")
    acc_medium = get_acc("medium")
    acc_hard = get_acc("hard")
    acc_extra = get_acc("extra")

    fig, ax = plt.subplots(figsize=(14, 7))

    # Offset each difficulty's bars around the group center at x.
    bars1 = ax.bar(x - 1.5 * width, acc_easy, width, label='Easy', color='#2ecc71')
    bars2 = ax.bar(x - 0.5 * width, acc_medium, width, label='Medium', color='#f1c40f')
    bars3 = ax.bar(x + 0.5 * width, acc_hard, width, label='Hard', color='#e67e22')
    bars4 = ax.bar(x + 1.5 * width, acc_extra, width, label='Extra', color='#e74c3c')

    ax.set_ylabel('Accuracy (%)', fontsize=12)
    ax.set_title('SQL Component Match Accuracy by Difficulty Level', fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels([c.upper() for c in components_list], fontsize=11)
    ax.legend(title="Query Difficulty")
    ax.set_ylim(0, 115)  # headroom above 100% for the rotated labels

    def autolabel(rects):
        # Annotate each non-zero bar with its rounded percentage.
        for rect in rects:
            height = rect.get_height()
            if height > 0:
                ax.annotate(f'{int(height)}%',
                            xy=(rect.get_x() + rect.get_width() / 2, height),
                            xytext=(0, 3),
                            textcoords="offset points",
                            ha='center', va='bottom', fontsize=8, rotation=90)

    autolabel(bars1)
    autolabel(bars2)
    autolabel(bars3)
    autolabel(bars4)

    ax.yaxis.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig("component_by_difficulty_plot.png", dpi=300)

    # -------------------------------
    # 🚀 Terminal Printout
    # -------------------------------
    print("\n✅ Saved merged plot -> component_by_difficulty_plot.png")

    print("\n========================================")
    print("🏆 OVERALL AVERAGE ACCURACY BY DIFFICULTY")
    print("========================================")
    for diff in difficulties_list:
        if overall_total[diff] > 0:
            avg = round((overall_correct[diff] / overall_total[diff]) * 100, 2)
            print(f"{diff.capitalize():<8}: {avg:>5}% ({overall_correct[diff]}/{overall_total[diff]} queries)")
        else:
            print(f"{diff.capitalize():<8}: N/A (0 queries)")
    print("========================================\n")

if __name__ == "__main__":
    main()
src/constrained_decoding.py ADDED
@@ -0,0 +1,1058 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from __future__ import annotations
2
+
3
+ # import re
4
+ # import threading
5
+ # from dataclasses import dataclass
6
+ # from typing import Dict, Iterable, List, Optional, Sequence, Set
7
+
8
+ # import torch
9
+ # from transformers.generation.logits_process import LogitsProcessor
10
+
11
+ # from schema_constraints import ConstraintGraph, build_constraint_graph
12
+
13
+
14
+ # def _infer_expected_identifier(prefix_text: str) -> Optional[str]:
15
+ # s = re.sub(r"\s+", " ", prefix_text.lower())
16
+ # last_from = s.rfind(" from ")
17
+ # last_join = s.rfind(" join ")
18
+ # last_select = s.rfind(" select ")
19
+ # last_where = s.rfind(" where ")
20
+ # last_on = s.rfind(" on ")
21
+ # last_group = s.rfind(" group by ")
22
+ # last_order = s.rfind(" order by ")
23
+ # last_having = s.rfind(" having ")
24
+
25
+ # last_table_kw = max(last_from, last_join)
26
+ # last_col_kw = max(last_select, last_where, last_on, last_group, last_order, last_having)
27
+
28
+ # if last_table_kw < 0 and last_col_kw < 0:
29
+ # return None
30
+ # if last_table_kw > last_col_kw:
31
+ # return "table"
32
+ # if last_col_kw > last_table_kw:
33
+ # return "column"
34
+ # return None
35
+
36
+
37
+ # class _TrieNode:
38
+ # __slots__ = ("children", "terminal")
39
+
40
+ # def __init__(self) -> None:
41
+ # self.children: Dict[int, _TrieNode] = {}
42
+ # self.terminal: bool = False
43
+
44
+ # def insert(self, token_ids: Sequence[int]) -> None:
45
+ # node: _TrieNode = self
46
+ # for tid in token_ids:
47
+ # tid_i = int(tid)
48
+ # nxt = node.children.get(tid_i)
49
+ # if nxt is None:
50
+ # nxt = _TrieNode()
51
+ # node.children[tid_i] = nxt
52
+ # node = nxt
53
+ # node.terminal = True
54
+
55
+ # def walk(self, prefix: Sequence[int]) -> Optional["_TrieNode"]:
56
+ # node: _TrieNode = self
57
+ # for tid in prefix:
58
+ # node = node.children.get(int(tid)) # type: ignore[assignment]
59
+ # if node is None:
60
+ # return None
61
+ # return node
62
+
63
+
64
+ # def _encode_identifier(tokenizer, name: str) -> List[int]:
65
+ # # Leading space encourages word-start markers (e.g. "Ġ" in RoBERTa BPE).
66
+ # return tokenizer.encode(" " + name, add_special_tokens=False)
67
+
68
+
69
+ # def _build_trie(tokenizer, names: Iterable[str]) -> _TrieNode:
70
+ # trie = _TrieNode()
71
+ # for n in names:
72
+ # if not n:
73
+ # continue
74
+ # try:
75
+ # ids = _encode_identifier(tokenizer, n)
76
+ # except Exception:
77
+ # continue
78
+ # if ids:
79
+ # trie.insert(ids)
80
+ # return trie
81
+
82
+
83
+ # def _allow_always_token_ids(tokenizer) -> torch.Tensor:
84
+ # # Allow common delimiters so the model can end an identifier.
85
+ # toks = [",", ")", "(", "\n", ".", ";"]
86
+ # ids: Set[int] = set()
87
+ # for t in toks:
88
+ # try:
89
+ # for tid in tokenizer.encode(t, add_special_tokens=False):
90
+ # ids.add(int(tid))
91
+ # except Exception:
92
+ # continue
93
+ # return torch.tensor(sorted(ids), dtype=torch.long)
94
+
95
+
96
+ # @dataclass
97
+ # class _PerDbTokenSets:
98
+ # fp: str
99
+ # table_trie: _TrieNode
100
+ # column_trie: _TrieNode
101
+ # allow_always: torch.Tensor
102
+
103
+
104
+ # _DB_TOKENSET_LOCK = threading.Lock()
105
+ # _DB_TOKENSETS: Dict[str, _PerDbTokenSets] = {}
106
+
107
+
108
+ # def _per_db_tokensets(tokenizer, graph: ConstraintGraph) -> _PerDbTokenSets:
109
+ # with _DB_TOKENSET_LOCK:
110
+ # cached = _DB_TOKENSETS.get(graph.db_path)
111
+ # if cached is not None and cached.fp == graph.fingerprint:
112
+ # return cached
113
+
114
+ # out = _PerDbTokenSets(
115
+ # fp=graph.fingerprint,
116
+ # table_trie=_build_trie(tokenizer, graph.tables),
117
+ # column_trie=_build_trie(tokenizer, graph.all_columns),
118
+ # allow_always=_allow_always_token_ids(tokenizer),
119
+ # )
120
+ # with _DB_TOKENSET_LOCK:
121
+ # _DB_TOKENSETS[graph.db_path] = out
122
+ # return out
123
+
124
+
125
+ # class BatchSchemaConstrainedLogitsProcessor(LogitsProcessor):
126
+ # """
127
+ # Schema-aware constrained decoding per item in the generation batch.
128
+ # Uses a tokenizer-based trie so multi-token identifiers can be constrained.
129
+ # """
130
+
131
+ # def __init__(self, tokenizer, db_paths: Sequence[str], *, max_prefix_tokens: int = 48):
132
+ # self.tokenizer = tokenizer
133
+ # self.db_paths = list(db_paths)
134
+ # self.max_prefix_tokens = int(max_prefix_tokens)
135
+
136
+ # self._graphs = [build_constraint_graph(p) for p in self.db_paths]
137
+ # self._token_sets = [_per_db_tokensets(tokenizer, g) for g in self._graphs]
138
+
139
+ # def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
140
+ # if input_ids.dim() != 2 or scores.dim() != 2:
141
+ # return scores
142
+
143
+ # batch = input_ids.size(0)
144
+ # if batch != len(self._graphs):
145
+ # return scores
146
+
147
+ # for i in range(batch):
148
+ # tail_ids = input_ids[i, -self.max_prefix_tokens :].tolist()
149
+ # prefix_text = self.tokenizer.decode(tail_ids, skip_special_tokens=True)
150
+ # expected = _infer_expected_identifier(prefix_text)
151
+ # if expected is None:
152
+ # continue
153
+
154
+ # if expected == "table":
155
+ # m = re.search(r"(?:from|join)\s+([A-Za-z_][A-Za-z0-9_]*)$", prefix_text, flags=re.I)
156
+ # partial = m.group(1) if m else None
157
+ # if partial is None and not re.search(r"(?:from|join)\s*$", prefix_text, flags=re.I):
158
+ # continue
159
+ # trie = self._token_sets[i].table_trie
160
+ # else:
161
+ # m = re.search(
162
+ # r"(?:select|where|on|group by|order by|having)\s+([A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)?)$",
163
+ # prefix_text,
164
+ # flags=re.I,
165
+ # )
166
+ # partial = m.group(1) if m else None
167
+ # if partial is None and not re.search(
168
+ # r"(?:select|where|on|group by|order by|having)\s*$", prefix_text, flags=re.I
169
+ # ):
170
+ # continue
171
+ # trie = self._token_sets[i].column_trie
172
+
173
+ # if not partial:
174
+ # prefix_token_ids: List[int] = []
175
+ # else:
176
+ # try:
177
+ # prefix_token_ids = _encode_identifier(self.tokenizer, partial)
178
+ # except Exception:
179
+ # continue
180
+
181
+ # node = trie.walk(prefix_token_ids)
182
+ # if node is None or node.terminal:
183
+ # continue
184
+
185
+ # allowed_next = sorted(node.children.keys())
186
+ # if not allowed_next:
187
+ # continue
188
+
189
+ # allowed_next_t = torch.tensor(allowed_next, dtype=torch.long, device=scores.device)
190
+ # allow_always = self._token_sets[i].allow_always.to(scores.device)
191
+ # keep = torch.cat([allowed_next_t, allow_always]) if allow_always.numel() else allowed_next_t
192
+
193
+ # kept_scores = scores[i, keep].clone()
194
+ # scores[i, :] = -float("inf")
195
+ # scores[i, keep] = kept_scores
196
+
197
+ # return scores
198
+
199
+
200
+ # # Backwards-compatible names used elsewhere in the repo.
201
+ # class SchemaConstraintGraph:
202
+ # def __init__(self, db_path: str):
203
+ # self._graph = build_constraint_graph(db_path)
204
+ # self.tables = sorted(self._graph.tables)
205
+ # self.columns = sorted(self._graph.all_columns)
206
+
207
+
208
+ # class SchemaConstrainedLogitsProcessor(LogitsProcessor):
209
+ # def __init__(self, tokenizer, schema_graph: SchemaConstraintGraph):
210
+ # self._proc = BatchSchemaConstrainedLogitsProcessor(tokenizer, [schema_graph._graph.db_path])
211
+
212
+ # def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
213
+ # return self._proc(input_ids, scores)
214
+
215
+
216
+
217
+
218
+ # from __future__ import annotations
219
+
220
+ # import re
221
+ # import threading
222
+ # from dataclasses import dataclass
223
+ # from typing import Dict, Iterable, List, Optional, Sequence, Set
224
+
225
+ # import torch
226
+ # from transformers.generation.logits_process import LogitsProcessor
227
+
228
+ # from schema_constraints import ConstraintGraph, build_constraint_graph
229
+
230
+
231
+ # # =========================================================
232
+ # # 🔍 IDENTIFIER TYPE DETECTION
233
+ # # =========================================================
234
+ # def _infer_expected_identifier(prefix_text: str) -> Optional[str]:
235
+ # s = re.sub(r"\s+", " ", prefix_text.lower())
236
+
237
+ # last_from = s.rfind(" from ")
238
+ # last_join = s.rfind(" join ")
239
+ # last_select = s.rfind(" select ")
240
+ # last_where = s.rfind(" where ")
241
+ # last_on = s.rfind(" on ")
242
+ # last_group = s.rfind(" group by ")
243
+ # last_order = s.rfind(" order by ")
244
+ # last_having = s.rfind(" having ")
245
+
246
+ # last_table_kw = max(last_from, last_join)
247
+ # last_col_kw = max(last_select, last_where, last_on, last_group, last_order, last_having)
248
+
249
+ # if last_table_kw < 0 and last_col_kw < 0:
250
+ # return None
251
+ # if last_table_kw > last_col_kw:
252
+ # return "table"
253
+ # if last_col_kw > last_table_kw:
254
+ # return "column"
255
+ # return None
256
+
257
+
258
+ # # =========================================================
259
+ # # 🌳 TRIE STRUCTURE
260
+ # # =========================================================
261
+ # class _TrieNode:
262
+ # __slots__ = ("children", "terminal")
263
+
264
+ # def __init__(self) -> None:
265
+ # self.children: Dict[int, _TrieNode] = {}
266
+ # self.terminal: bool = False
267
+
268
+ # def insert(self, token_ids: Sequence[int]) -> None:
269
+ # node = self
270
+ # for tid in token_ids:
271
+ # tid = int(tid)
272
+ # if tid not in node.children:
273
+ # node.children[tid] = _TrieNode()
274
+ # node = node.children[tid]
275
+ # node.terminal = True
276
+
277
+ # def walk(self, prefix: Sequence[int]) -> Optional["_TrieNode"]:
278
+ # node = self
279
+ # for tid in prefix:
280
+ # node = node.children.get(int(tid))
281
+ # if node is None:
282
+ # return None
283
+ # return node
284
+
285
+
286
+ # # =========================================================
287
+ # # 🔤 TOKEN ENCODING
288
+ # # =========================================================
289
+ # def _encode_identifier(tokenizer, name: str) -> List[int]:
290
+ # return tokenizer.encode(" " + name, add_special_tokens=False)
291
+
292
+
293
+ # def _build_trie(tokenizer, names: Iterable[str]) -> _TrieNode:
294
+ # trie = _TrieNode()
295
+ # for name in names:
296
+ # try:
297
+ # ids = _encode_identifier(tokenizer, name)
298
+ # if ids:
299
+ # trie.insert(ids)
300
+ # except Exception:
301
+ # continue
302
+ # return trie
303
+
304
+
305
+ # def _allow_always_token_ids(tokenizer) -> torch.Tensor:
306
+ # tokens = [",", ")", "(", ".", ";", "\n"]
307
+ # ids: Set[int] = set()
308
+
309
+ # for t in tokens:
310
+ # try:
311
+ # ids.update(tokenizer.encode(t, add_special_tokens=False))
312
+ # except:
313
+ # pass
314
+
315
+ # return torch.tensor(sorted(ids), dtype=torch.long)
316
+
317
+
318
+ # # =========================================================
319
+ # # 📦 PER-DB CACHE
320
+ # # =========================================================
321
+ # @dataclass
322
+ # class _PerDbTokenSets:
323
+ # fp: str
324
+ # table_trie: _TrieNode
325
+ # column_trie: _TrieNode
326
+ # allow_always: torch.Tensor
327
+
328
+
329
+ # _DB_CACHE: Dict[str, _PerDbTokenSets] = {}
330
+ # _DB_LOCK = threading.Lock()
331
+
332
+
333
+ # def _per_db_tokensets(tokenizer, graph: ConstraintGraph) -> _PerDbTokenSets:
334
+ # with _DB_LOCK:
335
+ # cached = _DB_CACHE.get(graph.db_path)
336
+ # if cached and cached.fp == graph.fingerprint:
337
+ # return cached
338
+
339
+ # obj = _PerDbTokenSets(
340
+ # fp=graph.fingerprint,
341
+ # table_trie=_build_trie(tokenizer, graph.tables),
342
+ # column_trie=_build_trie(tokenizer, graph.all_columns),
343
+ # allow_always=_allow_always_token_ids(tokenizer),
344
+ # )
345
+
346
+ # with _DB_LOCK:
347
+ # _DB_CACHE[graph.db_path] = obj
348
+
349
+ # return obj
350
+
351
+
352
+ # # =========================================================
353
+ # # 🚀 MAIN LOGITS PROCESSOR
354
+ # # =========================================================
355
+ # class BatchSchemaConstrainedLogitsProcessor(LogitsProcessor):
356
+ # def __init__(self, tokenizer, db_paths: Sequence[str], max_prefix_tokens: int = 48):
357
+ # self.tokenizer = tokenizer
358
+ # self.db_paths = list(db_paths)
359
+ # self.max_prefix_tokens = max_prefix_tokens
360
+
361
+ # self._graphs = [build_constraint_graph(p) for p in db_paths]
362
+ # self._token_sets = [_per_db_tokensets(tokenizer, g) for g in self._graphs]
363
+
364
+ # # 📊 Metrics (IMPORTANT FOR REPORT)
365
+ # self.total_steps = 0
366
+ # self.constrained_steps = 0
367
+
368
+ # def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
369
+ # batch = input_ids.size(0)
370
+
371
+ # for i in range(batch):
372
+ # self.total_steps += 1
373
+
374
+ # tail_ids = input_ids[i, -self.max_prefix_tokens:].tolist()
375
+ # prefix_text = self.tokenizer.decode(tail_ids, skip_special_tokens=True)
376
+
377
+ # expected = _infer_expected_identifier(prefix_text)
378
+ # if expected is None:
379
+ # continue
380
+
381
+ # self.constrained_steps += 1
382
+
383
+ # # =========================
384
+ # # SELECT TRIE
385
+ # # =========================
386
+ # if expected == "table":
387
+ # trie = self._token_sets[i].table_trie
388
+ # else:
389
+ # trie = self._token_sets[i].column_trie
390
+
391
+ # # =========================
392
+ # # PARTIAL TOKEN MATCH
393
+ # # =========================
394
+ # match = re.search(r"([A-Za-z_][A-Za-z0-9_]*)$", prefix_text)
395
+ # partial = match.group(1) if match else ""
396
+
397
+ # try:
398
+ # prefix_ids = _encode_identifier(self.tokenizer, partial) if partial else []
399
+ # except:
400
+ # continue
401
+
402
+ # node = trie.walk(prefix_ids)
403
+ # if node is None or node.terminal:
404
+ # continue
405
+
406
+ # allowed_next = list(node.children.keys())
407
+ # if not allowed_next:
408
+ # continue
409
+
410
+ # allowed_next = torch.tensor(allowed_next, device=scores.device)
411
+ # allow_always = self._token_sets[i].allow_always.to(scores.device)
412
+
413
+ # keep = torch.cat([allowed_next, allow_always])
414
+
415
+ # kept_scores = scores[i, keep].clone()
416
+ # scores[i, :] = -float("inf")
417
+ # scores[i, keep] = kept_scores
418
+
419
+ # return scores
420
+
421
+ # # =========================================================
422
+ # # 📊 METRICS FOR REPORT
423
+ # # =========================================================
424
+ # def get_constraint_stats(self):
425
+ # if self.total_steps == 0:
426
+ # return 0
427
+ # return self.constrained_steps / self.total_steps
428
+
429
+
430
+ # # =========================================================
431
+ # # 🔁 BACKWARD COMPATIBILITY
432
+ # # =========================================================
433
+ # class SchemaConstraintGraph:
434
+ # def __init__(self, db_path: str):
435
+ # self._graph = build_constraint_graph(db_path)
436
+ # self.tables = sorted(self._graph.tables)
437
+ # self.columns = sorted(self._graph.all_columns)
438
+
439
+
440
+ # class SchemaConstrainedLogitsProcessor(LogitsProcessor):
441
+ # def __init__(self, tokenizer, schema_graph: SchemaConstraintGraph):
442
+ # self.proc = BatchSchemaConstrainedLogitsProcessor(
443
+ # tokenizer, [schema_graph._graph.db_path]
444
+ # )
445
+
446
+ # def __call__(self, input_ids, scores):
447
+ # return self.proc(input_ids, scores)
448
+
449
+
450
+
451
+
452
+
453
+
454
+ # from __future__ import annotations
455
+
456
+ # import re
457
+ # import threading
458
+ # from dataclasses import dataclass
459
+ # from typing import Dict, Iterable, List, Optional, Sequence, Set
460
+
461
+ # import torch
462
+ # from transformers.generation.logits_process import LogitsProcessor
463
+
464
+ # from schema_constraints import ConstraintGraph, build_constraint_graph
465
+
466
+
467
+ # def _infer_expected_identifier(prefix_text: str) -> Optional[str]:
468
+ # s = re.sub(r"\s+", " ", prefix_text.lower())
469
+ # last_from = s.rfind(" from ")
470
+ # last_join = s.rfind(" join ")
471
+ # last_select = s.rfind(" select ")
472
+ # last_where = s.rfind(" where ")
473
+ # last_on = s.rfind(" on ")
474
+ # last_group = s.rfind(" group by ")
475
+ # last_order = s.rfind(" order by ")
476
+ # last_having = s.rfind(" having ")
477
+
478
+ # last_table_kw = max(last_from, last_join)
479
+ # last_col_kw = max(last_select, last_where, last_on, last_group, last_order, last_having)
480
+
481
+ # if last_table_kw < 0 and last_col_kw < 0:
482
+ # return None
483
+ # if last_table_kw > last_col_kw:
484
+ # return "table"
485
+ # if last_col_kw > last_table_kw:
486
+ # return "column"
487
+ # return None
488
+
489
+
490
+ # class _TrieNode:
491
+ # __slots__ = ("children", "terminal")
492
+
493
+ # def __init__(self) -> None:
494
+ # self.children: Dict[int, _TrieNode] = {}
495
+ # self.terminal: bool = False
496
+
497
+ # def insert(self, token_ids: Sequence[int]) -> None:
498
+ # node: _TrieNode = self
499
+ # for tid in token_ids:
500
+ # tid_i = int(tid)
501
+ # nxt = node.children.get(tid_i)
502
+ # if nxt is None:
503
+ # nxt = _TrieNode()
504
+ # node.children[tid_i] = nxt
505
+ # node = nxt
506
+ # node.terminal = True
507
+
508
+ # def walk(self, prefix: Sequence[int]) -> Optional["_TrieNode"]:
509
+ # node: _TrieNode = self
510
+ # for tid in prefix:
511
+ # node = node.children.get(int(tid)) # type: ignore[assignment]
512
+ # if node is None:
513
+ # return None
514
+ # return node
515
+
516
+
517
+ # def _encode_identifier(tokenizer, name: str) -> List[int]:
518
+ # # Leading space encourages word-start markers (e.g. "Ġ" in RoBERTa BPE).
519
+ # return tokenizer.encode(" " + name, add_special_tokens=False)
520
+
521
+
522
+ # def _build_trie(tokenizer, names: Iterable[str]) -> _TrieNode:
523
+ # trie = _TrieNode()
524
+ # for n in names:
525
+ # if not n:
526
+ # continue
527
+ # try:
528
+ # ids = _encode_identifier(tokenizer, n)
529
+ # except Exception:
530
+ # continue
531
+ # if ids:
532
+ # trie.insert(ids)
533
+ # return trie
534
+
535
+
536
+ # def _allow_always_token_ids(tokenizer) -> torch.Tensor:
537
+ # # Allow common delimiters so the model can end an identifier.
538
+ # toks = [",", ")", "(", "\n", ".", ";"]
539
+ # ids: Set[int] = set()
540
+ # for t in toks:
541
+ # try:
542
+ # for tid in tokenizer.encode(t, add_special_tokens=False):
543
+ # ids.add(int(tid))
544
+ # except Exception:
545
+ # continue
546
+ # return torch.tensor(sorted(ids), dtype=torch.long)
547
+
548
+
549
+ # @dataclass
550
+ # class _PerDbTokenSets:
551
+ # fp: str
552
+ # table_trie: _TrieNode
553
+ # column_trie: _TrieNode
554
+ # allow_always: torch.Tensor
555
+
556
+
557
+ # _DB_TOKENSET_LOCK = threading.Lock()
558
+ # _DB_TOKENSETS: Dict[str, _PerDbTokenSets] = {}
559
+
560
+
561
+ # def _per_db_tokensets(tokenizer, graph: ConstraintGraph) -> _PerDbTokenSets:
562
+ # with _DB_TOKENSET_LOCK:
563
+ # cached = _DB_TOKENSETS.get(graph.db_path)
564
+ # if cached is not None and cached.fp == graph.fingerprint:
565
+ # return cached
566
+
567
+ # out = _PerDbTokenSets(
568
+ # fp=graph.fingerprint,
569
+ # table_trie=_build_trie(tokenizer, graph.tables),
570
+ # column_trie=_build_trie(tokenizer, graph.all_columns),
571
+ # allow_always=_allow_always_token_ids(tokenizer),
572
+ # )
573
+ # with _DB_TOKENSET_LOCK:
574
+ # _DB_TOKENSETS[graph.db_path] = out
575
+ # return out
576
+
577
+
578
+ # class BatchSchemaConstrainedLogitsProcessor(LogitsProcessor):
579
+ # """
580
+ # Schema-aware constrained decoding per item in the generation batch.
581
+ # Uses a tokenizer-based trie so multi-token identifiers can be constrained.
582
+ # """
583
+
584
+ # def __init__(self, tokenizer, db_paths: Sequence[str], *, max_prefix_tokens: int = 48):
585
+ # self.tokenizer = tokenizer
586
+ # self.db_paths = list(db_paths)
587
+ # self.max_prefix_tokens = int(max_prefix_tokens)
588
+
589
+ # self._graphs = [build_constraint_graph(p) for p in self.db_paths]
590
+ # self._token_sets = [_per_db_tokensets(tokenizer, g) for g in self._graphs]
591
+
592
+ # def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
593
+ # if input_ids.dim() != 2 or scores.dim() != 2:
594
+ # return scores
595
+
596
+ # batch = input_ids.size(0)
597
+ # if batch != len(self._graphs):
598
+ # return scores
599
+
600
+ # for i in range(batch):
601
+ # tail_ids = input_ids[i, -self.max_prefix_tokens :].tolist()
602
+ # prefix_text = self.tokenizer.decode(tail_ids, skip_special_tokens=True)
603
+ # expected = _infer_expected_identifier(prefix_text)
604
+ # if expected is None:
605
+ # continue
606
+
607
+ # if expected == "table":
608
+ # m = re.search(r"(?:from|join)\s+([A-Za-z_][A-Za-z0-9_]*)$", prefix_text, flags=re.I)
609
+ # partial = m.group(1) if m else None
610
+ # if partial is None and not re.search(r"(?:from|join)\s*$", prefix_text, flags=re.I):
611
+ # continue
612
+ # trie = self._token_sets[i].table_trie
613
+ # else:
614
+ # m = re.search(
615
+ # r"(?:select|where|on|group by|order by|having)\s+([A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)?)$",
616
+ # prefix_text,
617
+ # flags=re.I,
618
+ # )
619
+ # partial = m.group(1) if m else None
620
+ # if partial is None and not re.search(
621
+ # r"(?:select|where|on|group by|order by|having)\s*$", prefix_text, flags=re.I
622
+ # ):
623
+ # continue
624
+ # trie = self._token_sets[i].column_trie
625
+
626
+ # if not partial:
627
+ # prefix_token_ids: List[int] = []
628
+ # else:
629
+ # try:
630
+ # prefix_token_ids = _encode_identifier(self.tokenizer, partial)
631
+ # except Exception:
632
+ # continue
633
+
634
+ # node = trie.walk(prefix_token_ids)
635
+ # if node is None or node.terminal:
636
+ # continue
637
+
638
+ # allowed_next = sorted(node.children.keys())
639
+ # if not allowed_next:
640
+ # continue
641
+
642
+ # allowed_next_t = torch.tensor(allowed_next, dtype=torch.long, device=scores.device)
643
+ # allow_always = self._token_sets[i].allow_always.to(scores.device)
644
+ # keep = torch.cat([allowed_next_t, allow_always]) if allow_always.numel() else allowed_next_t
645
+
646
+ # kept_scores = scores[i, keep].clone()
647
+ # scores[i, :] = -float("inf")
648
+ # scores[i, keep] = kept_scores
649
+
650
+ # return scores
651
+
652
+
653
+ # # Backwards-compatible names used elsewhere in the repo.
654
+ # class SchemaConstraintGraph:
655
+ # def __init__(self, db_path: str):
656
+ # self._graph = build_constraint_graph(db_path)
657
+ # self.tables = sorted(self._graph.tables)
658
+ # self.columns = sorted(self._graph.all_columns)
659
+
660
+
661
+ # class SchemaConstrainedLogitsProcessor(LogitsProcessor):
662
+ # def __init__(self, tokenizer, schema_graph: SchemaConstraintGraph):
663
+ # self._proc = BatchSchemaConstrainedLogitsProcessor(tokenizer, [schema_graph._graph.db_path])
664
+
665
+ # def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
666
+ # return self._proc(input_ids, scores)
667
+
668
+
669
+
670
+
671
+ # from __future__ import annotations
672
+
673
+ # import re
674
+ # import threading
675
+ # from dataclasses import dataclass
676
+ # from typing import Dict, Iterable, List, Optional, Sequence, Set
677
+
678
+ # import torch
679
+ # from transformers.generation.logits_process import LogitsProcessor
680
+
681
+ # from schema_constraints import ConstraintGraph, build_constraint_graph
682
+
683
+
684
+ # # =========================================================
685
+ # # 🔍 IDENTIFIER TYPE DETECTION
686
+ # # =========================================================
687
+ # def _infer_expected_identifier(prefix_text: str) -> Optional[str]:
688
+ # s = re.sub(r"\s+", " ", prefix_text.lower())
689
+
690
+ # last_from = s.rfind(" from ")
691
+ # last_join = s.rfind(" join ")
692
+ # last_select = s.rfind(" select ")
693
+ # last_where = s.rfind(" where ")
694
+ # last_on = s.rfind(" on ")
695
+ # last_group = s.rfind(" group by ")
696
+ # last_order = s.rfind(" order by ")
697
+ # last_having = s.rfind(" having ")
698
+
699
+ # last_table_kw = max(last_from, last_join)
700
+ # last_col_kw = max(last_select, last_where, last_on, last_group, last_order, last_having)
701
+
702
+ # if last_table_kw < 0 and last_col_kw < 0:
703
+ # return None
704
+ # if last_table_kw > last_col_kw:
705
+ # return "table"
706
+ # if last_col_kw > last_table_kw:
707
+ # return "column"
708
+ # return None
709
+
710
+
711
+ # # =========================================================
712
+ # # 🌳 TRIE STRUCTURE
713
+ # # =========================================================
714
+ # class _TrieNode:
715
+ # __slots__ = ("children", "terminal")
716
+
717
+ # def __init__(self) -> None:
718
+ # self.children: Dict[int, _TrieNode] = {}
719
+ # self.terminal: bool = False
720
+
721
+ # def insert(self, token_ids: Sequence[int]) -> None:
722
+ # node = self
723
+ # for tid in token_ids:
724
+ # tid = int(tid)
725
+ # if tid not in node.children:
726
+ # node.children[tid] = _TrieNode()
727
+ # node = node.children[tid]
728
+ # node.terminal = True
729
+
730
+ # def walk(self, prefix: Sequence[int]) -> Optional["_TrieNode"]:
731
+ # node = self
732
+ # for tid in prefix:
733
+ # node = node.children.get(int(tid))
734
+ # if node is None:
735
+ # return None
736
+ # return node
737
+
738
+
739
+ # # =========================================================
740
+ # # 🔤 TOKEN ENCODING
741
+ # # =========================================================
742
+ # def _encode_identifier(tokenizer, name: str) -> List[int]:
743
+ # return tokenizer.encode(" " + name, add_special_tokens=False)
744
+
745
+
746
+ # def _build_trie(tokenizer, names: Iterable[str]) -> _TrieNode:
747
+ # trie = _TrieNode()
748
+ # for name in names:
749
+ # try:
750
+ # ids = _encode_identifier(tokenizer, name)
751
+ # if ids:
752
+ # trie.insert(ids)
753
+ # except Exception:
754
+ # continue
755
+ # return trie
756
+
757
+
758
+ # def _allow_always_token_ids(tokenizer) -> torch.Tensor:
759
+ # tokens = [",", ")", "(", ".", ";", "\n"]
760
+ # ids: Set[int] = set()
761
+
762
+ # for t in tokens:
763
+ # try:
764
+ # ids.update(tokenizer.encode(t, add_special_tokens=False))
765
+ # except:
766
+ # pass
767
+
768
+ # return torch.tensor(sorted(ids), dtype=torch.long)
769
+
770
+
771
+ # # =========================================================
772
+ # # 📦 PER-DB CACHE
773
+ # # =========================================================
774
+ # @dataclass
775
+ # class _PerDbTokenSets:
776
+ # fp: str
777
+ # table_trie: _TrieNode
778
+ # column_trie: _TrieNode
779
+ # allow_always: torch.Tensor
780
+
781
+
782
+ # _DB_CACHE: Dict[str, _PerDbTokenSets] = {}
783
+ # _DB_LOCK = threading.Lock()
784
+
785
+
786
+ # def _per_db_tokensets(tokenizer, graph: ConstraintGraph) -> _PerDbTokenSets:
787
+ # with _DB_LOCK:
788
+ # cached = _DB_CACHE.get(graph.db_path)
789
+ # if cached and cached.fp == graph.fingerprint:
790
+ # return cached
791
+
792
+ # obj = _PerDbTokenSets(
793
+ # fp=graph.fingerprint,
794
+ # table_trie=_build_trie(tokenizer, graph.tables),
795
+ # column_trie=_build_trie(tokenizer, graph.all_columns),
796
+ # allow_always=_allow_always_token_ids(tokenizer),
797
+ # )
798
+
799
+ # with _DB_LOCK:
800
+ # _DB_CACHE[graph.db_path] = obj
801
+
802
+ # return obj
803
+
804
+
805
+ # # =========================================================
806
+ # # 🚀 MAIN LOGITS PROCESSOR
807
+ # # =========================================================
808
+ # class BatchSchemaConstrainedLogitsProcessor(LogitsProcessor):
809
+ # def __init__(self, tokenizer, db_paths: Sequence[str], max_prefix_tokens: int = 48):
810
+ # self.tokenizer = tokenizer
811
+ # self.db_paths = list(db_paths)
812
+ # self.max_prefix_tokens = max_prefix_tokens
813
+
814
+ # self._graphs = [build_constraint_graph(p) for p in db_paths]
815
+ # self._token_sets = [_per_db_tokensets(tokenizer, g) for g in self._graphs]
816
+
817
+ # # 📊 Metrics (IMPORTANT FOR REPORT)
818
+ # self.total_steps = 0
819
+ # self.constrained_steps = 0
820
+
821
+ # def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
822
+ # batch = input_ids.size(0)
823
+
824
+ # for i in range(batch):
825
+ # self.total_steps += 1
826
+
827
+ # tail_ids = input_ids[i, -self.max_prefix_tokens:].tolist()
828
+ # prefix_text = self.tokenizer.decode(tail_ids, skip_special_tokens=True)
829
+
830
+ # expected = _infer_expected_identifier(prefix_text)
831
+ # if expected is None:
832
+ # continue
833
+
834
+ # self.constrained_steps += 1
835
+
836
+ # # =========================
837
+ # # SELECT TRIE
838
+ # # =========================
839
+ # if expected == "table":
840
+ # trie = self._token_sets[i].table_trie
841
+ # else:
842
+ # trie = self._token_sets[i].column_trie
843
+
844
+ # # =========================
845
+ # # PARTIAL TOKEN MATCH
846
+ # # =========================
847
+ # match = re.search(r"([A-Za-z_][A-Za-z0-9_]*)$", prefix_text)
848
+ # partial = match.group(1) if match else ""
849
+
850
+ # try:
851
+ # prefix_ids = _encode_identifier(self.tokenizer, partial) if partial else []
852
+ # except:
853
+ # continue
854
+
855
+ # node = trie.walk(prefix_ids)
856
+ # if node is None or node.terminal:
857
+ # continue
858
+
859
+ # allowed_next = list(node.children.keys())
860
+ # if not allowed_next:
861
+ # continue
862
+
863
+ # allowed_next = torch.tensor(allowed_next, device=scores.device)
864
+ # allow_always = self._token_sets[i].allow_always.to(scores.device)
865
+
866
+ # keep = torch.cat([allowed_next, allow_always])
867
+
868
+ # kept_scores = scores[i, keep].clone()
869
+ # scores[i, :] = -float("inf")
870
+ # scores[i, keep] = kept_scores
871
+
872
+ # return scores
873
+
874
+ # # =========================================================
875
+ # # 📊 METRICS FOR REPORT
876
+ # # =========================================================
877
+ # def get_constraint_stats(self):
878
+ # if self.total_steps == 0:
879
+ # return 0
880
+ # return self.constrained_steps / self.total_steps
881
+
882
+
883
+ # # =========================================================
884
+ # # 🔁 BACKWARD COMPATIBILITY
885
+ # # =========================================================
886
+ # class SchemaConstraintGraph:
887
+ # def __init__(self, db_path: str):
888
+ # self._graph = build_constraint_graph(db_path)
889
+ # self.tables = sorted(self._graph.tables)
890
+ # self.columns = sorted(self._graph.all_columns)
891
+
892
+
893
+ # class SchemaConstrainedLogitsProcessor(LogitsProcessor):
894
+ # def __init__(self, tokenizer, schema_graph: SchemaConstraintGraph):
895
+ # self.proc = BatchSchemaConstrainedLogitsProcessor(
896
+ # tokenizer, [schema_graph._graph.db_path]
897
+ # )
898
+
899
+ # def __call__(self, input_ids, scores):
900
+ # return self.proc(input_ids, scores)
901
+
902
+
903
+
904
+
905
+
906
+
907
+
908
+
909
# --- Active implementation (rewritten after Task 3); everything above is retained dead code ---
910
+
911
+ import re
912
+ import threading
913
+ from functools import lru_cache
914
+
915
+ import torch
916
+ from transformers import LogitsProcessor
917
+
918
+ from src.schema_utils import get_constraint_graph
919
+
920
+
921
+ _TOKEN_CACHE_LOCK = threading.Lock()
922
+ _TOKEN_ID_CACHE = {} # (id(tokenizer), db_path) -> (allowed_ids_tensor, always_allow_ids_tensor)
923
+
924
+
925
+ def _encode_variants(tokenizer, text: str) -> list[int]:
926
+ ids: list[int] = []
927
+ for variant in (text, " " + text):
928
+ try:
929
+ ids.extend(tokenizer.encode(variant, add_special_tokens=False))
930
+ except Exception:
931
+ continue
932
+ # de-dup while keeping order
933
+ seen = set()
934
+ out = []
935
+ for i in ids:
936
+ if int(i) not in seen:
937
+ seen.add(int(i))
938
+ out.append(int(i))
939
+ return out
940
+
941
+
942
+ def _always_allow_ids(tokenizer) -> list[int]:
943
+ """
944
+ Tokens we should never block, otherwise decoding can get stuck or generate garbage:
945
+ - EOS/PAD
946
+ - punctuation/operators needed for SQL formatting
947
+ - digits/quotes
948
+ """
949
+ ids: list[int] = []
950
+ for special in [getattr(tokenizer, "eos_token_id", None), getattr(tokenizer, "pad_token_id", None)]:
951
+ if special is not None:
952
+ ids.append(int(special))
953
+
954
+ # Common SQL punctuation/operators
955
+ pieces = [
956
+ " ", "\n", "\t",
957
+ ",", ".", "(", ")", ";",
958
+ "=", "!=", "<>", "<", ">", "<=", ">=",
959
+ "*", "+", "-", "/", "%",
960
+ "'", '"',
961
+ ]
962
+ for p in pieces:
963
+ ids.extend(_encode_variants(tokenizer, p))
964
+
965
+ # digits
966
+ for d in "0123456789":
967
+ ids.extend(_encode_variants(tokenizer, d))
968
+
969
+ seen = set()
970
+ out = []
971
+ for i in ids:
972
+ if int(i) not in seen:
973
+ seen.add(int(i))
974
+ out.append(int(i))
975
+ return out
976
+
977
+
978
+ def _infer_expected_identifier_tail(tail_text: str):
979
+ """
980
+ Returns ("table"|"column", partial_or_empty) if the tail looks like it's currently
981
+ emitting a table/column identifier. Otherwise returns None.
982
+ """
983
+ t = re.sub(r"\s+", " ", (tail_text or "")).lower()
984
+
985
+ m = re.search(r"(?:from|join)\s+([a-z_][a-z0-9_]*)?$", t)
986
+ if m:
987
+ partial = m.group(1) or ""
988
+ # ensure we are actually after keyword (not elsewhere)
989
+ if re.search(r"(?:from|join)\s*$", t) or partial:
990
+ return "table", partial
991
+
992
+ m = re.search(
993
+ r"(?:select|where|on|group by|order by|having)\s+([a-z_][a-z0-9_]*(?:\.[a-z_][a-z0-9_]*)?)?$",
994
+ t,
995
+ )
996
+ if m:
997
+ partial = m.group(1) or ""
998
+ if re.search(r"(?:select|where|on|group by|order by|having)\s*$", t) or partial:
999
+ return "column", partial
1000
+
1001
+ return None
1002
+
1003
+
1004
+ class SchemaConstrainedLogitsProcessor(LogitsProcessor):
1005
+ def __init__(self, tokenizer, db_path):
1006
+ self.tokenizer = tokenizer
1007
+
1008
+ graph = get_constraint_graph(db_path)
1009
+
1010
+ key = (id(tokenizer), str(db_path))
1011
+ with _TOKEN_CACHE_LOCK:
1012
+ cached = _TOKEN_ID_CACHE.get(key)
1013
+ if cached is None:
1014
+ allowed_tokens = set(graph.get("tables", set())) | set(graph.get("columns", set()))
1015
+
1016
+ sql_keywords = {
1017
+ "select", "from", "where", "join", "on",
1018
+ "group", "by", "order", "limit", "having",
1019
+ "and", "or", "desc", "asc",
1020
+ "count", "avg", "min", "max", "sum",
1021
+ "distinct", "as", "in", "like", "between",
1022
+ "is", "null",
1023
+ }
1024
+ allowed_tokens |= sql_keywords
1025
+
1026
+ allowed_ids: list[int] = []
1027
+ for tok in sorted(allowed_tokens):
1028
+ allowed_ids.extend(_encode_variants(tokenizer, tok))
1029
+ always_ids = _always_allow_ids(tokenizer)
1030
+
1031
+ allowed_ids_t = torch.tensor(sorted(set(allowed_ids)), dtype=torch.long)
1032
+ always_ids_t = torch.tensor(sorted(set(always_ids)), dtype=torch.long)
1033
+ cached = (allowed_ids_t, always_ids_t)
1034
+ with _TOKEN_CACHE_LOCK:
1035
+ _TOKEN_ID_CACHE[key] = cached
1036
+
1037
+ self._allowed_ids_t, self._always_ids_t = cached
1038
+
1039
+ def __call__(self, input_ids, scores):
1040
+ # Decode only a tail window for speed (beam search calls this a lot).
1041
+ try:
1042
+ tail_ids = input_ids[0][-128:]
1043
+ except Exception:
1044
+ tail_ids = input_ids[0]
1045
+ tail = self.tokenizer.decode(tail_ids, skip_special_tokens=True)
1046
+
1047
+ inferred = _infer_expected_identifier_tail(tail)
1048
+ if inferred is None:
1049
+ return scores
1050
+
1051
+ keep = torch.cat([self._allowed_ids_t.to(scores.device), self._always_ids_t.to(scores.device)])
1052
+ if keep.numel() == 0:
1053
+ return scores
1054
+
1055
+ kept_scores = scores[:, keep].clone()
1056
+ scores[:] = -float("inf")
1057
+ scores[:, keep] = kept_scores
1058
+ return scores
src/constrained_decoding_sample.py ADDED
@@ -0,0 +1,516 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from __future__ import annotations
2
+
3
+ # import re
4
+ # import threading
5
+ # from dataclasses import dataclass
6
+ # from typing import Dict, Iterable, List, Optional, Sequence, Set
7
+
8
+ # import torch
9
+ # from transformers.generation.logits_process import LogitsProcessor
10
+
11
+ # from schema_constraints import ConstraintGraph, build_constraint_graph
12
+
13
+
14
+ # def _infer_expected_identifier(prefix_text: str) -> Optional[str]:
15
+ # s = re.sub(r"\s+", " ", prefix_text.lower())
16
+ # last_from = s.rfind(" from ")
17
+ # last_join = s.rfind(" join ")
18
+ # last_select = s.rfind(" select ")
19
+ # last_where = s.rfind(" where ")
20
+ # last_on = s.rfind(" on ")
21
+ # last_group = s.rfind(" group by ")
22
+ # last_order = s.rfind(" order by ")
23
+ # last_having = s.rfind(" having ")
24
+
25
+ # last_table_kw = max(last_from, last_join)
26
+ # last_col_kw = max(last_select, last_where, last_on, last_group, last_order, last_having)
27
+
28
+ # if last_table_kw < 0 and last_col_kw < 0:
29
+ # return None
30
+ # if last_table_kw > last_col_kw:
31
+ # return "table"
32
+ # if last_col_kw > last_table_kw:
33
+ # return "column"
34
+ # return None
35
+
36
+
37
+ # class _TrieNode:
38
+ # __slots__ = ("children", "terminal")
39
+
40
+ # def __init__(self) -> None:
41
+ # self.children: Dict[int, _TrieNode] = {}
42
+ # self.terminal: bool = False
43
+
44
+ # def insert(self, token_ids: Sequence[int]) -> None:
45
+ # node: _TrieNode = self
46
+ # for tid in token_ids:
47
+ # tid_i = int(tid)
48
+ # nxt = node.children.get(tid_i)
49
+ # if nxt is None:
50
+ # nxt = _TrieNode()
51
+ # node.children[tid_i] = nxt
52
+ # node = nxt
53
+ # node.terminal = True
54
+
55
+ # def walk(self, prefix: Sequence[int]) -> Optional["_TrieNode"]:
56
+ # node: _TrieNode = self
57
+ # for tid in prefix:
58
+ # node = node.children.get(int(tid)) # type: ignore[assignment]
59
+ # if node is None:
60
+ # return None
61
+ # return node
62
+
63
+
64
+ # def _encode_identifier(tokenizer, name: str) -> List[int]:
65
+ # # Leading space encourages word-start markers (e.g. "Ġ" in RoBERTa BPE).
66
+ # return tokenizer.encode(" " + name, add_special_tokens=False)
67
+
68
+
69
+ # def _build_trie(tokenizer, names: Iterable[str]) -> _TrieNode:
70
+ # trie = _TrieNode()
71
+ # for n in names:
72
+ # if not n:
73
+ # continue
74
+ # try:
75
+ # ids = _encode_identifier(tokenizer, n)
76
+ # except Exception:
77
+ # continue
78
+ # if ids:
79
+ # trie.insert(ids)
80
+ # return trie
81
+
82
+
83
+ # def _allow_always_token_ids(tokenizer) -> torch.Tensor:
84
+ # # Allow common delimiters so the model can end an identifier.
85
+ # toks = [",", ")", "(", "\n", ".", ";"]
86
+ # ids: Set[int] = set()
87
+ # for t in toks:
88
+ # try:
89
+ # for tid in tokenizer.encode(t, add_special_tokens=False):
90
+ # ids.add(int(tid))
91
+ # except Exception:
92
+ # continue
93
+ # return torch.tensor(sorted(ids), dtype=torch.long)
94
+
95
+
96
+ # @dataclass
97
+ # class _PerDbTokenSets:
98
+ # fp: str
99
+ # table_trie: _TrieNode
100
+ # column_trie: _TrieNode
101
+ # allow_always: torch.Tensor
102
+
103
+
104
+ # _DB_TOKENSET_LOCK = threading.Lock()
105
+ # _DB_TOKENSETS: Dict[str, _PerDbTokenSets] = {}
106
+
107
+
108
+ # def _per_db_tokensets(tokenizer, graph: ConstraintGraph) -> _PerDbTokenSets:
109
+ # with _DB_TOKENSET_LOCK:
110
+ # cached = _DB_TOKENSETS.get(graph.db_path)
111
+ # if cached is not None and cached.fp == graph.fingerprint:
112
+ # return cached
113
+
114
+ # out = _PerDbTokenSets(
115
+ # fp=graph.fingerprint,
116
+ # table_trie=_build_trie(tokenizer, graph.tables),
117
+ # column_trie=_build_trie(tokenizer, graph.all_columns),
118
+ # allow_always=_allow_always_token_ids(tokenizer),
119
+ # )
120
+ # with _DB_TOKENSET_LOCK:
121
+ # _DB_TOKENSETS[graph.db_path] = out
122
+ # return out
123
+
124
+
125
+ # class BatchSchemaConstrainedLogitsProcessor(LogitsProcessor):
126
+ # """
127
+ # Schema-aware constrained decoding per item in the generation batch.
128
+ # Uses a tokenizer-based trie so multi-token identifiers can be constrained.
129
+ # """
130
+
131
+ # def __init__(self, tokenizer, db_paths: Sequence[str], *, max_prefix_tokens: int = 48):
132
+ # self.tokenizer = tokenizer
133
+ # self.db_paths = list(db_paths)
134
+ # self.max_prefix_tokens = int(max_prefix_tokens)
135
+
136
+ # self._graphs = [build_constraint_graph(p) for p in self.db_paths]
137
+ # self._token_sets = [_per_db_tokensets(tokenizer, g) for g in self._graphs]
138
+
139
+ # def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
140
+ # if input_ids.dim() != 2 or scores.dim() != 2:
141
+ # return scores
142
+
143
+ # batch = input_ids.size(0)
144
+ # if batch != len(self._graphs):
145
+ # return scores
146
+
147
+ # for i in range(batch):
148
+ # tail_ids = input_ids[i, -self.max_prefix_tokens :].tolist()
149
+ # prefix_text = self.tokenizer.decode(tail_ids, skip_special_tokens=True)
150
+ # expected = _infer_expected_identifier(prefix_text)
151
+ # if expected is None:
152
+ # continue
153
+
154
+ # if expected == "table":
155
+ # m = re.search(r"(?:from|join)\s+([A-Za-z_][A-Za-z0-9_]*)$", prefix_text, flags=re.I)
156
+ # partial = m.group(1) if m else None
157
+ # if partial is None and not re.search(r"(?:from|join)\s*$", prefix_text, flags=re.I):
158
+ # continue
159
+ # trie = self._token_sets[i].table_trie
160
+ # else:
161
+ # m = re.search(
162
+ # r"(?:select|where|on|group by|order by|having)\s+([A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)?)$",
163
+ # prefix_text,
164
+ # flags=re.I,
165
+ # )
166
+ # partial = m.group(1) if m else None
167
+ # if partial is None and not re.search(
168
+ # r"(?:select|where|on|group by|order by|having)\s*$", prefix_text, flags=re.I
169
+ # ):
170
+ # continue
171
+ # trie = self._token_sets[i].column_trie
172
+
173
+ # if not partial:
174
+ # prefix_token_ids: List[int] = []
175
+ # else:
176
+ # try:
177
+ # prefix_token_ids = _encode_identifier(self.tokenizer, partial)
178
+ # except Exception:
179
+ # continue
180
+
181
+ # node = trie.walk(prefix_token_ids)
182
+ # if node is None or node.terminal:
183
+ # continue
184
+
185
+ # allowed_next = sorted(node.children.keys())
186
+ # if not allowed_next:
187
+ # continue
188
+
189
+ # allowed_next_t = torch.tensor(allowed_next, dtype=torch.long, device=scores.device)
190
+ # allow_always = self._token_sets[i].allow_always.to(scores.device)
191
+ # keep = torch.cat([allowed_next_t, allow_always]) if allow_always.numel() else allowed_next_t
192
+
193
+ # kept_scores = scores[i, keep].clone()
194
+ # scores[i, :] = -float("inf")
195
+ # scores[i, keep] = kept_scores
196
+
197
+ # return scores
198
+
199
+
200
+ # # Backwards-compatible names used elsewhere in the repo.
201
+ # class SchemaConstraintGraph:
202
+ # def __init__(self, db_path: str):
203
+ # self._graph = build_constraint_graph(db_path)
204
+ # self.tables = sorted(self._graph.tables)
205
+ # self.columns = sorted(self._graph.all_columns)
206
+
207
+
208
+ # class SchemaConstrainedLogitsProcessor(LogitsProcessor):
209
+ # def __init__(self, tokenizer, schema_graph: SchemaConstraintGraph):
210
+ # self._proc = BatchSchemaConstrainedLogitsProcessor(tokenizer, [schema_graph._graph.db_path])
211
+
212
+ # def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
213
+ # return self._proc(input_ids, scores)
214
+
215
+
216
+
217
+
218
+ # from __future__ import annotations
219
+
220
+ # import re
221
+ # import threading
222
+ # from dataclasses import dataclass
223
+ # from typing import Dict, Iterable, List, Optional, Sequence, Set
224
+
225
+ # import torch
226
+ # from transformers.generation.logits_process import LogitsProcessor
227
+
228
+ # from schema_constraints import ConstraintGraph, build_constraint_graph
229
+
230
+
231
+ # # =========================================================
232
+ # # 🔍 IDENTIFIER TYPE DETECTION
233
+ # # =========================================================
234
+ # def _infer_expected_identifier(prefix_text: str) -> Optional[str]:
235
+ # s = re.sub(r"\s+", " ", prefix_text.lower())
236
+
237
+ # last_from = s.rfind(" from ")
238
+ # last_join = s.rfind(" join ")
239
+ # last_select = s.rfind(" select ")
240
+ # last_where = s.rfind(" where ")
241
+ # last_on = s.rfind(" on ")
242
+ # last_group = s.rfind(" group by ")
243
+ # last_order = s.rfind(" order by ")
244
+ # last_having = s.rfind(" having ")
245
+
246
+ # last_table_kw = max(last_from, last_join)
247
+ # last_col_kw = max(last_select, last_where, last_on, last_group, last_order, last_having)
248
+
249
+ # if last_table_kw < 0 and last_col_kw < 0:
250
+ # return None
251
+ # if last_table_kw > last_col_kw:
252
+ # return "table"
253
+ # if last_col_kw > last_table_kw:
254
+ # return "column"
255
+ # return None
256
+
257
+
258
+ # # =========================================================
259
+ # # 🌳 TRIE STRUCTURE
260
+ # # =========================================================
261
+ # class _TrieNode:
262
+ # __slots__ = ("children", "terminal")
263
+
264
+ # def __init__(self) -> None:
265
+ # self.children: Dict[int, _TrieNode] = {}
266
+ # self.terminal: bool = False
267
+
268
+ # def insert(self, token_ids: Sequence[int]) -> None:
269
+ # node = self
270
+ # for tid in token_ids:
271
+ # tid = int(tid)
272
+ # if tid not in node.children:
273
+ # node.children[tid] = _TrieNode()
274
+ # node = node.children[tid]
275
+ # node.terminal = True
276
+
277
+ # def walk(self, prefix: Sequence[int]) -> Optional["_TrieNode"]:
278
+ # node = self
279
+ # for tid in prefix:
280
+ # node = node.children.get(int(tid))
281
+ # if node is None:
282
+ # return None
283
+ # return node
284
+
285
+
286
+ # # =========================================================
287
+ # # 🔤 TOKEN ENCODING
288
+ # # =========================================================
289
+ # def _encode_identifier(tokenizer, name: str) -> List[int]:
290
+ # return tokenizer.encode(" " + name, add_special_tokens=False)
291
+
292
+
293
+ # def _build_trie(tokenizer, names: Iterable[str]) -> _TrieNode:
294
+ # trie = _TrieNode()
295
+ # for name in names:
296
+ # try:
297
+ # ids = _encode_identifier(tokenizer, name)
298
+ # if ids:
299
+ # trie.insert(ids)
300
+ # except Exception:
301
+ # continue
302
+ # return trie
303
+
304
+
305
+ # def _allow_always_token_ids(tokenizer) -> torch.Tensor:
306
+ # tokens = [",", ")", "(", ".", ";", "\n"]
307
+ # ids: Set[int] = set()
308
+
309
+ # for t in tokens:
310
+ # try:
311
+ # ids.update(tokenizer.encode(t, add_special_tokens=False))
312
+ # except:
313
+ # pass
314
+
315
+ # return torch.tensor(sorted(ids), dtype=torch.long)
316
+
317
+
318
+ # # =========================================================
319
+ # # 📦 PER-DB CACHE
320
+ # # =========================================================
321
+ # @dataclass
322
+ # class _PerDbTokenSets:
323
+ # fp: str
324
+ # table_trie: _TrieNode
325
+ # column_trie: _TrieNode
326
+ # allow_always: torch.Tensor
327
+
328
+
329
+ # _DB_CACHE: Dict[str, _PerDbTokenSets] = {}
330
+ # _DB_LOCK = threading.Lock()
331
+
332
+
333
+ # def _per_db_tokensets(tokenizer, graph: ConstraintGraph) -> _PerDbTokenSets:
334
+ # with _DB_LOCK:
335
+ # cached = _DB_CACHE.get(graph.db_path)
336
+ # if cached and cached.fp == graph.fingerprint:
337
+ # return cached
338
+
339
+ # obj = _PerDbTokenSets(
340
+ # fp=graph.fingerprint,
341
+ # table_trie=_build_trie(tokenizer, graph.tables),
342
+ # column_trie=_build_trie(tokenizer, graph.all_columns),
343
+ # allow_always=_allow_always_token_ids(tokenizer),
344
+ # )
345
+
346
+ # with _DB_LOCK:
347
+ # _DB_CACHE[graph.db_path] = obj
348
+
349
+ # return obj
350
+
351
+
352
+ # # =========================================================
353
+ # # 🚀 MAIN LOGITS PROCESSOR
354
+ # # =========================================================
355
+ # class BatchSchemaConstrainedLogitsProcessor(LogitsProcessor):
356
+ # def __init__(self, tokenizer, db_paths: Sequence[str], max_prefix_tokens: int = 48):
357
+ # self.tokenizer = tokenizer
358
+ # self.db_paths = list(db_paths)
359
+ # self.max_prefix_tokens = max_prefix_tokens
360
+
361
+ # self._graphs = [build_constraint_graph(p) for p in db_paths]
362
+ # self._token_sets = [_per_db_tokensets(tokenizer, g) for g in self._graphs]
363
+
364
+ # # 📊 Metrics (IMPORTANT FOR REPORT)
365
+ # self.total_steps = 0
366
+ # self.constrained_steps = 0
367
+
368
+ # def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
369
+ # batch = input_ids.size(0)
370
+
371
+ # for i in range(batch):
372
+ # self.total_steps += 1
373
+
374
+ # tail_ids = input_ids[i, -self.max_prefix_tokens:].tolist()
375
+ # prefix_text = self.tokenizer.decode(tail_ids, skip_special_tokens=True)
376
+
377
+ # expected = _infer_expected_identifier(prefix_text)
378
+ # if expected is None:
379
+ # continue
380
+
381
+ # self.constrained_steps += 1
382
+
383
+ # # =========================
384
+ # # SELECT TRIE
385
+ # # =========================
386
+ # if expected == "table":
387
+ # trie = self._token_sets[i].table_trie
388
+ # else:
389
+ # trie = self._token_sets[i].column_trie
390
+
391
+ # # =========================
392
+ # # PARTIAL TOKEN MATCH
393
+ # # =========================
394
+ # match = re.search(r"([A-Za-z_][A-Za-z0-9_]*)$", prefix_text)
395
+ # partial = match.group(1) if match else ""
396
+
397
+ # try:
398
+ # prefix_ids = _encode_identifier(self.tokenizer, partial) if partial else []
399
+ # except:
400
+ # continue
401
+
402
+ # node = trie.walk(prefix_ids)
403
+ # if node is None or node.terminal:
404
+ # continue
405
+
406
+ # allowed_next = list(node.children.keys())
407
+ # if not allowed_next:
408
+ # continue
409
+
410
+ # allowed_next = torch.tensor(allowed_next, device=scores.device)
411
+ # allow_always = self._token_sets[i].allow_always.to(scores.device)
412
+
413
+ # keep = torch.cat([allowed_next, allow_always])
414
+
415
+ # kept_scores = scores[i, keep].clone()
416
+ # scores[i, :] = -float("inf")
417
+ # scores[i, keep] = kept_scores
418
+
419
+ # return scores
420
+
421
+ # # =========================================================
422
+ # # 📊 METRICS FOR REPORT
423
+ # # =========================================================
424
+ # def get_constraint_stats(self):
425
+ # if self.total_steps == 0:
426
+ # return 0
427
+ # return self.constrained_steps / self.total_steps
428
+
429
+
430
+ # # =========================================================
431
+ # # 🔁 BACKWARD COMPATIBILITY
432
+ # # =========================================================
433
+ # class SchemaConstraintGraph:
434
+ # def __init__(self, db_path: str):
435
+ # self._graph = build_constraint_graph(db_path)
436
+ # self.tables = sorted(self._graph.tables)
437
+ # self.columns = sorted(self._graph.all_columns)
438
+
439
+
440
+ # class SchemaConstrainedLogitsProcessor(LogitsProcessor):
441
+ # def __init__(self, tokenizer, schema_graph: SchemaConstraintGraph):
442
+ # self.proc = BatchSchemaConstrainedLogitsProcessor(
443
+ # tokenizer, [schema_graph._graph.db_path]
444
+ # )
445
+
446
+ # def __call__(self, input_ids, scores):
447
+ # return self.proc(input_ids, scores)
448
+
449
+
450
+
451
+
452
+
453
+
454
+
455
+
456
+ # ********* after task 3
457
+
458
+ import re
459
+ import torch
460
+ from transformers import LogitsProcessor
461
+ from src.schema_utils import get_constraint_graph
462
+
463
+
464
+ def _infer_expected_identifier(prefix_text: str):
465
+ s = prefix_text.lower()
466
+
467
+ if " from " in s or " join " in s:
468
+ return "table"
469
+ if any(k in s for k in ["select", "where", "on", "group by", "order by"]):
470
+ return "column"
471
+
472
+ return None
473
+
474
+
475
class SchemaConstrainedLogitsProcessor(LogitsProcessor):
    """Soft schema-aware decoding constraint.

    Once the decoded prefix is long enough and is expected to emit a
    table/column identifier, only token ids belonging to the database
    schema (plus a core set of SQL keywords) keep their scores; every
    other vocabulary entry is masked to -inf.

    NOTE(review): the constraint is derived from ``input_ids[0]`` only,
    so with batched generation every row gets the mask computed from the
    first row's prefix — confirm callers generate with batch size 1.
    """

    def __init__(self, tokenizer, db_path):
        self.tokenizer = tokenizer

        graph = get_constraint_graph(db_path)

        # Schema identifiers the model is allowed to produce.
        self.allowed_tokens = set(graph["tables"]) | set(graph["columns"])

        # Core SQL vocabulary that must always stay available.
        self.sql_keywords = {
            "select", "from", "where", "join", "on",
            "group", "by", "order", "limit",
            "and", "or", "desc", "asc",
            "count", "avg", "min", "max", "sum", "*"
        }

        self.allowed_tokens |= self.sql_keywords

        # Tokenise once up front. The set is kept for backward
        # compatibility; the sorted tensor lets __call__ mask with a single
        # vectorised indexing op instead of a Python loop over every id on
        # every decoding step.
        self.allowed_token_ids = set()
        for token in self.allowed_tokens:
            for tid in tokenizer.encode(token, add_special_tokens=False):
                self.allowed_token_ids.add(tid)
        self._allowed_ids = torch.tensor(
            sorted(self.allowed_token_ids), dtype=torch.long
        )

    def __call__(self, input_ids, scores):
        prefix = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)

        # Soft constraint: let the first few tokens decode freely so the
        # model can produce the opening of the query unconstrained.
        if len(prefix.strip()) < 10:
            return scores

        expected = _infer_expected_identifier(prefix)

        if expected not in ["table", "column"]:
            return scores

        # Vectorised masking: -inf everywhere except the allowed ids.
        mask = torch.full_like(scores, float("-inf"))
        idx = self._allowed_ids.to(scores.device)
        mask[:, idx] = scores[:, idx]

        return mask
src/convert_to_hf_dataset.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
"""Convert the processed training CSV into a HuggingFace ``datasets`` folder."""
from pathlib import Path

import pandas as pd
from datasets import Dataset

# BUGFIX: paths were hard-coded relative to the CWD ("../data/..."), which
# only worked when the script was launched from inside src/. Resolve them
# relative to this file so the script works from any directory.
_PROJECT_ROOT = Path(__file__).resolve().parents[1]


def main():
    """Read data/processed/train.csv and save it as a HF dataset on disk."""
    csv_path = _PROJECT_ROOT / "data" / "processed" / "train.csv"
    out_path = _PROJECT_ROOT / "data" / "processed" / "train"

    df = pd.read_csv(csv_path)
    ds = Dataset.from_pandas(df)
    ds.save_to_disk(str(out_path))
    print("DONE")


if __name__ == "__main__":
    main()
8
+
src/eval_baseline_codet5.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sqlite3
3
+ from pathlib import Path
4
+ import torch
5
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
6
+
7
+ # ---------------- PROMPT (same style as training) ----------------
8
def build_prompt(question, schema):
    """Assemble the translate-to-SQL prompt (same layout used in training)."""
    header = "translate English to SQL:"
    return f"{header}\n\nSchema:\n{schema}\n\nQuestion:\n{question}\n\nSQL:"
18
+
19
+ # ---------------- LOAD SCHEMA ----------------
20
def load_schema(db_path):
    """Return a compact text rendering of the SQLite schema.

    One line per table, formatted ``table(col1, col2, ...)``.

    BUGFIX: the connection is now closed in a ``finally`` block so it is
    released even when a query/PRAGMA raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        tables = cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table';"
        ).fetchall()

        schema = ""
        for (table,) in tables:
            cols = cursor.execute(f"PRAGMA table_info({table});").fetchall()
            col_names = [c[1] for c in cols]
            schema += f"{table}({', '.join(col_names)})\n"

        return schema
    finally:
        conn.close()
36
+
37
+ # ---------------- EXECUTION MATCH ----------------
38
def execution_match(pred_sql, gold_sql, db_path):
    """Execute both queries against *db_path* and compare the result rows.

    Returns False on any execution error (invalid SQL, missing table, ...).

    BUGFIX: the original leaked the connection whenever a query raised
    (the ``except`` path returned before ``conn.close()``); the connection
    is now always closed via ``finally``.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cur = conn.cursor()

        cur.execute(pred_sql)
        pred = cur.fetchall()

        cur.execute(gold_sql)
        gold = cur.fetchall()

        return pred == gold

    except Exception:
        # Any failure counts as a non-match (original behaviour).
        return False
    finally:
        if conn is not None:
            conn.close()
54
+
55
+ # ---------------- MAIN ----------------
56
def main():
    """Evaluate the untuned CodeT5 baseline on the first 100 Spider dev examples.

    For each example: build a schema-augmented prompt, beam-decode SQL,
    and score with execution match against the gold query.
    """
    project_root = Path(__file__).resolve().parents[1]

    dev_json = project_root / "data" / "dev.json"
    db_root = project_root / "data" / "database"

    # Apple-Silicon GPU when available; otherwise CPU.
    device = "mps" if torch.backends.mps.is_available() else "cpu"

    print("Loading BASE CodeT5...")
    tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")
    model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-base").to(device)
    model.eval()

    with open(dev_json) as f:
        dev = json.load(f)[:100]

    correct = 0

    print(f"\nEvaluating {len(dev)} samples...\n")

    for i, ex in enumerate(dev, 1):
        question = ex["question"]
        db_id = ex["db_id"]
        gold_sql = ex["query"]

        db_path = db_root / db_id / f"{db_id}.sqlite"
        schema = load_schema(db_path)

        prompt = build_prompt(question, schema)

        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=80,
                num_beams=4,
                do_sample=False
            )

        pred_sql = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Keep only the text after the "SQL:" marker if the model echoed the prompt.
        if "SQL:" in pred_sql:
            pred_sql = pred_sql.split("SQL:")[-1].strip()

        if execution_match(pred_sql, gold_sql, db_path):
            correct += 1

        if i % 10 == 0:
            # BUGFIX: denominator was hard-coded to 100 even though the dev
            # slice may contain fewer examples.
            print(f"{i}/{len(dev)} | Accuracy: {correct/i:.3f}")

    total = len(dev)
    accuracy = correct / total if total else 0.0

    print("\n=============================")
    # BUGFIX: the old message printed the raw count with a '%' sign and a
    # hard-coded "/ 100" denominator; print count, total and percentage.
    print(f"BASE MODEL ACCURACY: {correct} / {total} = {accuracy:.1%}")
    print("=============================")

if __name__ == "__main__":
    main()
src/eval_both_metrics.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sqlite3
3
+ import torch
4
+ import re
5
+ import time
6
+ import argparse
7
+ from pathlib import Path
8
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
9
+ from peft import PeftModel
10
+
11
+ PROJECT_ROOT = Path(__file__).resolve().parents[1]
12
+ DB_ROOT = PROJECT_ROOT / "data" / "database"
13
+
14
+ # -------------------------------
15
+ # 1. NORMALIZATION FOR EXACT MATCH
16
+ # -------------------------------
17
def normalize_sql(sql):
    """Canonicalise SQL so Exact-Match grading is fair.

    Standardises double quotes to single quotes, collapses whitespace runs
    to single spaces, lower-cases, and drops trailing semicolons.
    """
    unified = re.sub(r"\s+", " ", sql.replace('"', "'"))
    return unified.strip().lower().rstrip(";")
24
+
25
+ # -------------------------------
26
+ # 2. EXECUTION ACCURACY CHECK
27
+ # -------------------------------
28
def check_execution(pred_sql, gold_sql, db_path):
    """Run predicted and gold SQL on the same DB and compare result rows.

    Returns False on any failure (bad SQL, missing table, timeout).
    A progress handler aborts statements running longer than ~5 seconds.

    BUGFIX: the original leaked the connection whenever a query raised;
    the connection is now always closed via ``finally``.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        # Some Spider DBs contain non-UTF8 bytes; ignore them on decode.
        conn.text_factory = lambda b: b.decode(errors='ignore')

        # A non-zero return from the progress handler interrupts SQLite,
        # giving an effective ~5-second wall-clock timeout per statement.
        start_time = time.monotonic()
        def timeout_handler():
            return 1 if (time.monotonic() - start_time) > 5.0 else 0
        conn.set_progress_handler(timeout_handler, 10000)

        cursor = conn.cursor()

        # Predicted result
        cursor.execute(pred_sql)
        pred_res = cursor.fetchall()

        # Gold result
        cursor.execute(gold_sql)
        gold_res = cursor.fetchall()

        return pred_res == gold_res
    except Exception:
        return False
    finally:
        if conn is not None:
            conn.close()
55
+
56
+ # -------------------------------
57
+ # 3. LOAD SCHEMA
58
+ # -------------------------------
59
def load_schema(db_path):
    """Render the SQLite schema as one ``table(col1, col2, ...)`` line per table.

    BUGFIX: the connection is now closed in a ``finally`` block so it is
    released even when a query/PRAGMA raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Some Spider DBs contain non-UTF8 bytes; ignore them on decode.
        conn.text_factory = lambda b: b.decode(errors='ignore')
        cursor = conn.cursor()
        tables = cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
        schema = ""
        for (table,) in tables:
            cols = cursor.execute(f"PRAGMA table_info({table});").fetchall()
            col_names = [c[1] for c in cols]
            schema += f"{table}({', '.join(col_names)})\n"
        return schema
    finally:
        conn.close()
71
+
72
+ # -------------------------------
73
+ # 4. MAIN PIPELINE
74
+ # -------------------------------
75
def main():
    """Evaluate an SFT/RLHF adapter on Spider dev with BOTH metrics.

    Loads a PEFT adapter on top of CodeT5-base, generates SQL for each dev
    example, and reports Exact Match (normalized string equality) and
    Execution Accuracy (result-set equality) side by side.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--adapter", type=str, required=True, help="Path to your SFT or RLHF checkpoint")
    parser.add_argument("--num_samples", type=int, default=1034, help="How many samples to evaluate")
    args = parser.parse_args()

    # Prefer Apple-Silicon GPU, then CUDA, then CPU.
    device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")
    base_model = "Salesforce/codet5-base"

    print(f"\n🚀 Loading Model from: {args.adapter}")
    tokenizer = AutoTokenizer.from_pretrained(args.adapter)
    base = AutoModelForSeq2SeqLM.from_pretrained(base_model).to(device)
    model = PeftModel.from_pretrained(base, args.adapter).to(device)
    # Fold LoRA weights into the base model for plain inference.
    model = model.merge_and_unload()
    model.eval()

    dev_json = PROJECT_ROOT / "data" / "dev.json"
    with open(dev_json) as f:
        dev = json.load(f)[:args.num_samples]

    em_correct = 0
    ex_correct = 0
    # NOTE(review): assumes at least one dev example; total == 0 would
    # divide by zero in the final accuracy computation below.
    total = len(dev)

    print(f"\n📊 Evaluating {total} queries for BOTH Exact Match and Execution Accuracy...\n")

    for i, ex in enumerate(dev, 1):
        question = ex["question"]
        gold_sql = ex["query"]
        db_id = ex["db_id"]
        db_path = DB_ROOT / db_id / f"{db_id}.sqlite"

        # Build the schema-augmented prompt and generate SQL (beam search,
        # deterministic).
        schema = load_schema(db_path)
        prompt = f"Database Schema:\n{schema}\nTranslate English to SQL:\n{question}\nSQL:\n"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=100, num_beams=4, do_sample=False)

        pred_sql = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Strip an echoed prompt: keep only the text after the "SQL:" marker.
        if "SQL:" in pred_sql:
            pred_sql = pred_sql.split("SQL:")[-1].strip()

        # --- METRIC 1: EXACT MATCH (normalized string comparison) ---
        is_em = (normalize_sql(pred_sql) == normalize_sql(gold_sql))
        if is_em:
            em_correct += 1

        # --- METRIC 2: EXECUTION ACCURACY (result-set comparison) ---
        is_ex = check_execution(pred_sql, gold_sql, db_path)
        if is_ex:
            ex_correct += 1

        # Periodic progress report (and always on the final example).
        if i % 50 == 0 or i == total:
            print(f"Progress: {i}/{total} | Current EM: {(em_correct/i)*100:.2f}% | Current EX: {(ex_correct/i)*100:.2f}%")

    # Final results as percentages.
    final_em = (em_correct / total) * 100
    final_ex = (ex_correct / total) * 100

    print("\n==========================================")
    print(f"🎯 FINAL RESULTS FOR: {args.adapter}")
    print("==========================================")
    print(f"Exact Match (EM) Accuracy : {final_em:.2f}%")
    print(f"Execution (EX) Accuracy : {final_ex:.2f}%")
    print("==========================================\n")

if __name__ == "__main__":
    main()
src/eval_rl_fixed.py ADDED
@@ -0,0 +1,756 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import json
2
+ # import subprocess
3
+ # import sys
4
+ # import argparse
5
+ # import random
6
+ # import sqlite3
7
+ # import time
8
+ # import re
9
+ # import os
10
+ # from pathlib import Path
11
+
12
+ # import torch
13
+ # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
14
+ # from peft import PeftModel
15
+
16
+ # from prompting import encode_prompt
17
+
18
+ # # -------------------------------
19
+ # # NORMALIZATION
20
+ # # -------------------------------
21
+ # def normalize_sql(sql):
22
+ # sql = sql.replace('"', "'")
23
+ # sql = re.sub(r"\s+", " ", sql)
24
+ # return sql.strip().lower().rstrip(";")
25
+
26
+
27
+ # # -------------------------------
28
+ # # 🔥 SAFE RESULT NORMALIZATION (FIX)
29
+ # # -------------------------------
30
+ # def normalize_result(res):
31
+ # try:
32
+ # return sorted([str(r) for r in res])
33
+ # except:
34
+ # return []
35
+
36
+
37
+ # # -------------------------------
38
+ # # EXECUTION CHECK (FIXED)
39
+ # # -------------------------------
40
+ # def check_execution(pred_sql, gold_sql, db_path):
41
+ # try:
42
+ # conn = sqlite3.connect(db_path)
43
+ # conn.text_factory = lambda b: b.decode(errors='ignore')
44
+
45
+ # start_time = time.monotonic()
46
+
47
+ # def timeout_handler():
48
+ # return 1 if (time.monotonic() - start_time) > 2.0 else 0
49
+
50
+ # conn.set_progress_handler(timeout_handler, 10000)
51
+
52
+ # cursor = conn.cursor()
53
+
54
+ # cursor.execute(pred_sql)
55
+ # pred_res = cursor.fetchall()
56
+
57
+ # cursor.execute(gold_sql)
58
+ # gold_res = cursor.fetchall()
59
+
60
+ # conn.close()
61
+
62
+ # # 🔥 FIXED COMPARISON
63
+ # return normalize_result(pred_res) == normalize_result(gold_res)
64
+
65
+ # except Exception:
66
+ # return False
67
+
68
+
69
+ # # -------------------------------
70
+ # # SPIDER PARSER
71
+ # # -------------------------------
72
+ # def _parse_spider_accuracy(stdout: str, metric_type: str):
73
+ # for line in stdout.splitlines():
74
+ # if metric_type == "exec" and line.strip().startswith("execution"):
75
+ # try:
76
+ # return float(line.split()[-1])
77
+ # except:
78
+ # pass
79
+ # elif metric_type == "match" and line.strip().startswith("exact"):
80
+ # try:
81
+ # return float(line.split()[-1])
82
+ # except:
83
+ # pass
84
+ # return None
85
+
86
+
87
+ # # -------------------------------
88
+ # # MAIN
89
+ # # -------------------------------
90
+ # def main():
91
+ # parser = argparse.ArgumentParser()
92
+ # parser.add_argument("--adapter", type=str, required=True)
93
+ # parser.add_argument("--num_samples", type=int, default=700)
94
+ # parser.add_argument("--shuffle_dev", action="store_true")
95
+ # parser.add_argument("--shuffle_seed", type=int, default=42)
96
+ # args = parser.parse_args()
97
+
98
+ # project_root = Path(__file__).resolve().parents[1]
99
+ # adapter_dir = project_root / args.adapter
100
+
101
+ # db_root = project_root / "data" / "database"
102
+ # table_json = project_root / "data" / "tables.json"
103
+ # dev_json = project_root / "data" / "dev.json"
104
+
105
+ # pred_path = project_root / "temp_predictions.txt"
106
+ # temp_gold_path = project_root / "temp_gold.sql"
107
+
108
+ # if not adapter_dir.exists():
109
+ # raise FileNotFoundError(f"Missing adapter dir: {adapter_dir}")
110
+
111
+ # device = "mps" if torch.backends.mps.is_available() else (
112
+ # "cuda" if torch.cuda.is_available() else "cpu"
113
+ # )
114
+ # print(f"Using device: {device}")
115
+
116
+ # BASE_MODEL = "Salesforce/codet5-base"
117
+ # tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
118
+
119
+ # if tokenizer.pad_token is None:
120
+ # tokenizer.pad_token = tokenizer.eos_token
121
+
122
+ # print(f"\n📦 Loading Model: {args.adapter}")
123
+
124
+ # base = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)
125
+
126
+ # adapter_for_peft = os.path.relpath(adapter_dir, project_root)
127
+
128
+ # model = PeftModel.from_pretrained(
129
+ # base,
130
+ # adapter_for_peft,
131
+ # local_files_only=True
132
+ # ).to(device)
133
+
134
+ # model = model.merge_and_unload()
135
+ # model.eval()
136
+
137
+ # # -------------------------------
138
+ # # LOAD DATA
139
+ # # -------------------------------
140
+ # with dev_json.open() as f:
141
+ # dev = json.load(f)
142
+
143
+ # if args.shuffle_dev:
144
+ # rng = random.Random(args.shuffle_seed)
145
+ # rng.shuffle(dev)
146
+
147
+ # dev = dev[: args.num_samples]
148
+ # total = len(dev)
149
+
150
+ # gen_kwargs = dict(
151
+ # max_new_tokens=160,
152
+ # num_beams=8,
153
+ # length_penalty=0.8,
154
+ # do_sample=False,
155
+ # early_stopping=True,
156
+ # pad_token_id=tokenizer.pad_token_id,
157
+ # eos_token_id=tokenizer.eos_token_id,
158
+ # )
159
+
160
+ # print(f"\n🚀 Evaluating {total} samples...\n")
161
+
162
+ # em_correct = 0
163
+ # ex_correct = 0
164
+
165
+ # with pred_path.open("w") as out_pred, temp_gold_path.open("w") as out_gold, torch.no_grad():
166
+ # for i, ex in enumerate(dev, start=1):
167
+
168
+ # db_id = ex["db_id"]
169
+ # question = ex["question"]
170
+ # gold_query = ex["query"]
171
+ # db_path = db_root / db_id / f"{db_id}.sqlite"
172
+
173
+ # # -------------------------------
174
+ # # GENERATE SQL
175
+ # # -------------------------------
176
+ # input_ids = encode_prompt(
177
+ # tokenizer,
178
+ # question,
179
+ # db_id,
180
+ # device=device,
181
+ # max_input_tokens=512
182
+ # )
183
+
184
+ # input_ids = input_ids.unsqueeze(0).to(device)
185
+ # attention_mask = (input_ids != tokenizer.pad_token_id).long().to(device)
186
+
187
+ # outputs = model.generate(
188
+ # input_ids=input_ids,
189
+ # attention_mask=attention_mask,
190
+ # **gen_kwargs
191
+ # )
192
+
193
+ # pred_sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
194
+
195
+ # # -------------------------------
196
+ # # SAVE FOR SPIDER EVAL
197
+ # # -------------------------------
198
+ # out_pred.write(f"{pred_sql}\n")
199
+ # out_gold.write(f"{gold_query}\t{db_id}\n")
200
+
201
+ # # -------------------------------
202
+ # # LIVE METRICS
203
+ # # -------------------------------
204
+ # if normalize_sql(pred_sql) == normalize_sql(gold_query):
205
+ # em_correct += 1
206
+
207
+ # if check_execution(pred_sql, gold_query, db_path):
208
+ # ex_correct += 1
209
+
210
+ # if i % 20 == 0 or i == total:
211
+ # print(
212
+ # f"Progress: {i}/{total} | "
213
+ # f"EM: {(em_correct/i)*100:.2f}% | "
214
+ # f"EX: {(ex_correct/i)*100:.2f}%"
215
+ # )
216
+
217
+ # print("\n🚀 Running Official Spider Evaluation...\n")
218
+
219
+ # eval_script = project_root / "spider_eval" / "evaluation.py"
220
+
221
+ # # EXACT MATCH
222
+ # cmd_match = [
223
+ # sys.executable, str(eval_script),
224
+ # "--gold", str(temp_gold_path),
225
+ # "--pred", str(pred_path),
226
+ # "--etype", "match",
227
+ # "--db", str(db_root),
228
+ # "--table", str(table_json),
229
+ # ]
230
+
231
+ # proc_match = subprocess.run(cmd_match, capture_output=True, text=True)
232
+ # exact_acc = _parse_spider_accuracy(proc_match.stdout, "match")
233
+
234
+ # # EXECUTION
235
+ # cmd_exec = [
236
+ # sys.executable, str(eval_script),
237
+ # "--gold", str(temp_gold_path),
238
+ # "--pred", str(pred_path),
239
+ # "--etype", "exec",
240
+ # "--db", str(db_root),
241
+ # "--table", str(table_json),
242
+ # ]
243
+
244
+ # proc_exec = subprocess.run(cmd_exec, capture_output=True, text=True)
245
+ # exec_acc = _parse_spider_accuracy(proc_exec.stdout, "exec")
246
+
247
+ # print("==========================================")
248
+ # print(f"🎯 OFFICIAL SPIDER RESULTS FOR: {args.adapter}")
249
+ # print("==========================================")
250
+
251
+ # print(f"Exact Match Accuracy : {exact_acc*100:.2f}%" if exact_acc else "EM parsing failed")
252
+ # print(f"Execution Accuracy : {exec_acc*100:.2f}%" if exec_acc else "EX parsing failed")
253
+
254
+ # print("==========================================\n")
255
+
256
+
257
+ # if __name__ == "__main__":
258
+ # main()
259
+
260
+
261
+
262
+
263
+
264
+
265
+ # import json
266
+ # import sqlite3
267
+ # import re
268
+ # import time
269
+ # import sys
270
+ # import argparse
271
+ # from pathlib import Path
272
+
273
+ # # ==========================================
274
+ # # PATH SETUP
275
+ # # ==========================================
276
+ # PROJECT_ROOT = Path(__file__).resolve().parents[1]
277
+ # if str(PROJECT_ROOT) not in sys.path:
278
+ # sys.path.insert(0, str(PROJECT_ROOT))
279
+
280
+ # from src.text2sql_engine import get_engine
281
+ # from src.sql_validator import validate_sql_schema
282
+
283
+ # # ==========================================
284
+ # # CONFIG
285
+ # # ==========================================
286
+ # DATA_PATH = PROJECT_ROOT / "data" / "dev.json"
287
+ # DB_ROOT = PROJECT_ROOT / "data" / "database"
288
+
289
+ # # ==========================================
290
+ # # NORMALIZATION
291
+ # # ==========================================
292
+ # def normalize_sql(sql):
293
+ # if not isinstance(sql, str):
294
+ # return ""
295
+ # sql = sql.replace('"', "'")
296
+ # sql = re.sub(r"\s+", " ", sql)
297
+ # return sql.strip().lower().rstrip(";")
298
+
299
+ # def normalize_result(res):
300
+ # try:
301
+ # return sorted([tuple(map(str, r)) for r in res])
302
+ # except:
303
+ # return []
304
+
305
+ # # ==========================================
306
+ # # EXECUTION
307
+ # # ==========================================
308
+ # def execute_sql(db_path, sql):
309
+ # try:
310
+ # conn = sqlite3.connect(db_path)
311
+
312
+ # start = time.time()
313
+ # def timeout():
314
+ # return 1 if (time.time() - start) > 2 else 0
315
+
316
+ # conn.set_progress_handler(timeout, 10000)
317
+
318
+ # cur = conn.cursor()
319
+ # cur.execute(sql)
320
+ # res = cur.fetchall()
321
+
322
+ # conn.close()
323
+ # return res
324
+
325
+ # except Exception:
326
+ # return None
327
+
328
+ # # ==========================================
329
+ # # EVALUATION
330
+ # # ==========================================
331
+ # def evaluate(engine, data, is_constrained=False, debug=False):
332
+
333
+ # attempted = 0
334
+ # total = 0
335
+ # exact_match = 0
336
+ # execution_match = 0
337
+ # constraint_ok = 0
338
+
339
+ # skipped_missing_db = 0
340
+ # skipped_exception = 0
341
+ # skipped_no_sql = 0
342
+
343
+ # total_time = 0
344
+
345
+ # for i, item in enumerate(data, 1):
346
+
347
+ # question = item.get("question", "")
348
+ # gold_sql = item.get("query", "")
349
+ # db_id = item.get("db_id", "")
350
+
351
+ # db_path = DB_ROOT / db_id / f"{db_id}.sqlite"
352
+
353
+ # if not db_path.exists():
354
+ # skipped_missing_db += 1
355
+ # continue
356
+
357
+ # try:
358
+ # start = time.time()
359
+ # result = engine.ask(question, db_id)
360
+ # total_time += (time.time() - start)
361
+ # except Exception:
362
+ # skipped_exception += 1
363
+ # continue
364
+
365
+ # if not isinstance(result, dict):
366
+ # continue
367
+
368
+ # pred_sql = result.get("sql", "")
369
+
370
+ # # DEBUG
371
+ # if debug:
372
+ # print(f"\nQ: {question}")
373
+ # print(f"PRED: {pred_sql}")
374
+ # print(f"GOLD: {gold_sql}")
375
+
376
+ # if not pred_sql:
377
+ # skipped_no_sql += 1
378
+ # continue
379
+
380
+ # attempted += 1
381
+ # total += 1
382
+
383
+ # # CONSTRAINT CHECK
384
+ # if is_constrained:
385
+ # try:
386
+ # is_valid, _ = validate_sql_schema(pred_sql, str(db_path))
387
+ # if is_valid:
388
+ # constraint_ok += 1
389
+ # except:
390
+ # pass
391
+
392
+ # # EXACT MATCH
393
+ # if normalize_sql(pred_sql) == normalize_sql(gold_sql):
394
+ # exact_match += 1
395
+
396
+ # # EXECUTION MATCH
397
+ # pred_res = execute_sql(str(db_path), pred_sql)
398
+ # gold_res = execute_sql(str(db_path), gold_sql)
399
+
400
+ # if pred_res is not None and gold_res is not None:
401
+ # if normalize_result(pred_res) == normalize_result(gold_res):
402
+ # execution_match += 1
403
+
404
+ # # PROGRESS
405
+ # if i % 10 == 0:
406
+ # print(
407
+ # f"[{i}/{len(data)}] "
408
+ # f"EM: {exact_match/max(total,1):.3f} | "
409
+ # f"EX: {execution_match/max(total,1):.3f} | "
410
+ # f"Constraint: {(constraint_ok/max(total,1)) if is_constrained else 0:.3f}"
411
+ # )
412
+
413
+ # avg_latency = total_time / max(attempted, 1)
414
+
415
+ # return {
416
+ # "exact_match": exact_match / total if total > 0 else 0,
417
+ # "execution_accuracy": execution_match / total if total > 0 else 0,
418
+ # "constraint_rate": (constraint_ok / total if (is_constrained and total > 0) else 0),
419
+ # "avg_latency": avg_latency,
420
+ # "total": total,
421
+ # "attempted": attempted,
422
+ # "skipped_missing_db": skipped_missing_db,
423
+ # "skipped_exception": skipped_exception,
424
+ # "skipped_no_sql": skipped_no_sql,
425
+ # }
426
+
427
+ # # ==========================================
428
+ # # MAIN
429
+ # # ==========================================
430
+ # if __name__ == "__main__":
431
+
432
+ # ap = argparse.ArgumentParser()
433
+ # ap.add_argument("--num-samples", type=int, default=100)
434
+ # ap.add_argument("--adapter", type=str, default="checkpoints/best_rlhf_model")
435
+ # ap.add_argument("--debug", action="store_true")
436
+ # args = ap.parse_args()
437
+
438
+ # print(f"\n📥 Loading dataset from {DATA_PATH}...")
439
+
440
+ # with open(str(DATA_PATH)) as f:
441
+ # data = json.load(f)[: args.num_samples]
442
+
443
+ # # ==========================================
444
+ # # 🔴 BASE MODEL
445
+ # # ==========================================
446
+ # print("\n🚀 Running BASE MODEL...\n")
447
+
448
+ # engine_base = get_engine(
449
+ # adapter_path="checkpoints/sft_adapter_codet5" , # 🔥 change this
450
+ # use_lora=True,
451
+ # use_constrained=False
452
+ # )
453
+
454
+ # res_base = evaluate(engine_base, data, is_constrained=False, debug=args.debug)
455
+
456
+ # # ==========================================
457
+ # # 🟡 RLHF (NO CONSTRAINT)
458
+ # # ==========================================
459
+ # print("\n🚀 Running RLHF (NO CONSTRAINT)...\n")
460
+
461
+ # engine_rlhf = get_engine(
462
+ # adapter_path="checkpoints/best_rlhf_model",
463
+ # use_lora=True,
464
+ # use_constrained=False
465
+ # )
466
+
467
+ # res_rlhf = evaluate(engine_rlhf, data, is_constrained=False, debug=args.debug)
468
+
469
+ # # ==========================================
470
+ # # 🟢 RLHF + CONSTRAINT
471
+ # # ==========================================
472
+ # print("\n🚀 Running RLHF + CONSTRAINED...\n")
473
+
474
+ # engine_const = get_engine(
475
+ # adapter_path="checkpoints/best_rlhf_model_2",
476
+ # use_lora=True,
477
+ # use_constrained=True
478
+ # )
479
+
480
+ # res_const = evaluate(engine_const, data, is_constrained=True, debug=args.debug)
481
+
482
+ # # ==========================================
483
+ # # FINAL RESULTS
484
+ # # ==========================================
485
+ # print("\n==========================================")
486
+ # print("🎯 FINAL RESULTS (3-WAY COMPARISON)")
487
+ # print("==========================================")
488
+
489
+ # print(f"Base Model → EM: {res_base['exact_match']*100:.2f}% | "
490
+ # f"EX: {res_base['execution_accuracy']*100:.2f}%")
491
+
492
+ # print(f"RLHF → EM: {res_rlhf['exact_match']*100:.2f}% | "
493
+ # f"EX: {res_rlhf['execution_accuracy']*100:.2f}%")
494
+
495
+ # print(f"RLHF + Constrain → EM: {res_const['exact_match']*100:.2f}% | "
496
+ # f"EX: {res_const['execution_accuracy']*100:.2f}% | "
497
+ # f"Constraint: {res_const['constraint_rate']*100:.2f}%")
498
+
499
+ # print("==========================================\n")
500
+
501
+
502
+ import json
503
+ import argparse
504
+ import sqlite3
505
+ import time
506
+ import re
507
+ import os
508
+ from pathlib import Path
509
+
510
+ import torch
511
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
512
+ from peft import PeftModel
513
+
514
+ # Import handling
515
+ try:
516
+ from prompting import encode_prompt
517
+ from src.sql_validator import validate_sql_schema
518
+ except ImportError:
519
+ import sys
520
+ sys.path.append(str(Path(__file__).resolve().parents[1]))
521
+ from src.prompting import encode_prompt
522
+ from src.sql_validator import validate_sql_schema
523
+
524
+ # =========================================================
525
+ # ERROR LOGGING
526
+ # =========================================================
527
+ ERROR_LOG_FILE = "results/error_logs.json"
528
+
529
def classify_error(sql, error_msg=""):
    """Map a failed SQL query and its database error message to a coarse category."""
    lowered_sql = sql.lower()
    lowered_msg = str(error_msg).lower()

    # Message-based categories, checked in priority order.
    message_categories = (
        ("no such column", "wrong_column"),
        ("no such table", "wrong_table"),
        ("syntax error", "syntax_error"),
        ("ambiguous column", "ambiguous_column"),
    )
    for needle, label in message_categories:
        if needle in lowered_msg:
            return label

    # A JOIN without any ON clause is detected from the SQL text itself.
    if "join" in lowered_sql and " on " not in lowered_sql:
        return "missing_join"

    return "other"
545
+
546
def log_error(question, sql, error, error_type):
    """Append one structured error record to the shared JSON error log.

    Args:
        question: Natural-language question that produced the SQL.
        sql: Predicted SQL string that failed.
        error: Exception or message describing the failure.
        error_type: Category label produced by classify_error().
    """
    os.makedirs(os.path.dirname(ERROR_LOG_FILE), exist_ok=True)

    entry = {
        "question": question,
        "sql": sql,
        "error": str(error),
        "error_type": error_type,
        "timestamp": time.time(),
    }

    # Read-modify-write of the whole log; acceptable for evaluation-sized runs.
    logs = []
    if os.path.exists(ERROR_LOG_FILE):
        try:
            with open(ERROR_LOG_FILE, "r") as f:
                content = f.read().strip()
            if content:
                logs = json.loads(content)
        except (OSError, ValueError):
            # Corrupt or unreadable log: start fresh rather than crash the eval.
            # (Was a bare `except:` — it also swallowed KeyboardInterrupt.)
            logs = []

    logs.append(entry)

    with open(ERROR_LOG_FILE, "w") as f:
        json.dump(logs, f, indent=2)
571
+
572
+ # =========================================================
573
+ # 🔥 FINAL FIX_SQL (BALANCED VERSION)
574
+ # =========================================================
575
def fix_sql(sql):
    """Best-effort repair of a model-generated SQL string.

    Extracts the SELECT/WITH statement from surrounding text, normalizes
    NULL comparisons, cleans stray commas, and applies light column/JOIN
    safety heuristics. Always returns something executable-looking; falls
    back to "SELECT 1" when no SQL can be salvaged.
    """
    if not sql:
        return "SELECT 1"

    s = str(sql).strip()

    # Keep only the text starting at the first SELECT/WITH keyword.
    match = re.search(r"(?i)(select|with)[\s\S]*", s)
    if match:
        s = match.group(0)

    # Drop anything after the first statement terminator.
    s = s.split(";")[0].strip()

    # NULL fixes. "!= null" must be rewritten BEFORE "= null" — otherwise the
    # "=" pattern consumes the tail of "!= null" and leaves invalid "!IS NULL".
    s = re.sub(r'(?i)!=\s*null', 'IS NOT NULL', s)
    s = re.sub(r'(?i)=\s*null', 'IS NULL', s)

    # Fix duplicated / dangling commas.
    s = re.sub(r',\s*,+', ',', s)
    s = re.sub(r'(?i),\s*from', ' FROM', s)

    # Light column safety: many qualified columns usually means a hallucinated
    # projection, so fall back to SELECT *.
    if "select" in s.lower():
        if len(re.findall(r'\w+\.\w+', s)) > 3:
            s = re.sub(r'(?i)select\s+.*?\s+from', 'SELECT * FROM', s)

    # JOIN without ON: add a trivially-true ON so the query still executes.
    if "join" in s.lower() and " on " not in s.lower():
        s = re.sub(r'join\s+(\w+)', r'JOIN \1 ON 1=1', s, flags=re.I)

    # Final guard: only SELECT/WITH statements are considered valid.
    if not s.lower().startswith(("select", "with")):
        return "SELECT 1"

    return s.strip()
610
+
611
+ # =========================================================
612
+ # NORMALIZATION
613
+ # =========================================================
614
def normalize_sql(sql):
    """Collapse whitespace runs and lowercase a SQL string for loose comparison."""
    if not sql:
        return ""
    squeezed = re.sub(r"\s+", " ", str(sql))
    return squeezed.strip().lower()
618
+
619
def normalize_result(res):
    """Normalize database rows for order-insensitive comparison.

    Each row's values are stringified and sorted within the row, and the
    rows themselves are sorted, so result sets compare equal regardless of
    row or column order. Returns [] for empty/None input.
    """
    if not res:
        return []
    try:
        normalized = [tuple(sorted(str(x) for x in row)) for row in res]
        return sorted(normalized)
    except Exception:
        # Rows were not iterable (e.g. scalars): compare their string forms.
        # (Was a bare `except:` — narrowed so KeyboardInterrupt etc. propagate.)
        return sorted([str(r) for r in res])
627
+
628
+ # =========================================================
629
+ # EXECUTION HELPERS
630
+ # =========================================================
631
def is_executable(sql, db_path):
    """Return True if `sql` executes without error against the SQLite DB at `db_path`."""
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        conn.cursor().execute(sql)
        return True
    except Exception:
        # Narrowed from a bare `except:`; any execution failure means "not executable".
        return False
    finally:
        # Original leaked the connection when execute() raised; always close it.
        if conn is not None:
            conn.close()
640
+
641
def check_execution(pred_sql, gold_sql, db_path, question):
    """Execute predicted and gold SQL and compare their normalized result sets.

    Any failure (bad SQL, missing table, decode issue) is classified, appended
    to the error log, and counted as a mismatch (returns False).
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        # Tolerate non-UTF-8 blobs stored in the Spider databases.
        conn.text_factory = lambda b: b.decode(errors='ignore')
        cur = conn.cursor()

        cur.execute(gold_sql)
        gold_res = cur.fetchall()

        cur.execute(pred_sql)
        pred_res = cur.fetchall()

        return normalize_result(pred_res) == normalize_result(gold_res)

    except Exception as e:
        error_type = classify_error(pred_sql, str(e))
        log_error(question, pred_sql, str(e), error_type)
        return False
    finally:
        # Original leaked the connection when a query raised; always close it.
        if conn is not None:
            conn.close()
661
+
662
+ # =========================================================
663
+ # MAIN
664
+ # =========================================================
665
def main():
    """Evaluate a CodeT5+LoRA Text-to-SQL adapter on the Spider dev set.

    Generates 8 beam candidates per question, picks the first candidate that
    executes (execution-guided selection), and reports exact-match, execution
    accuracy, and schema-constraint rate. Errors are appended to the JSON
    error log via check_execution().
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--adapter", type=str, required=True)
    parser.add_argument("--num_samples", type=int, default=700)
    args = parser.parse_args()

    # Resolve the project root whether this script lives in scripts/, src/, or the root.
    project_root = Path(__file__).resolve().parent
    if project_root.name in ["scripts", "src"]:
        project_root = project_root.parent

    db_root = project_root / "data" / "database"
    dev_json = project_root / "data" / "dev.json"

    # NOTE(review): only MPS/CPU are considered here; CUDA is not checked — confirm intended.
    device = "mps" if torch.backends.mps.is_available() else "cpu"

    print(f"Loading model on {device}...")

    tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")
    base_model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-base").to(device)

    # Merge the LoRA adapter into the base weights for faster inference.
    model = PeftModel.from_pretrained(base_model, args.adapter).to(device)
    model = model.merge_and_unload()
    model.eval()

    with open(dev_json, "r") as f:
        dev_data = json.load(f)[:args.num_samples]

    em_correct = 0
    ex_correct = 0
    constraint_ok = 0

    print(f"\n🚀 Evaluating {len(dev_data)} samples...\n")

    for i, ex in enumerate(dev_data, 1):
        db_id = ex["db_id"]
        question = ex["question"]
        gold_query = ex["query"]

        db_path = db_root / db_id / f"{db_id}.sqlite"

        # encode_prompt returns a 1-D token tensor; add the batch dimension.
        input_tensor = encode_prompt(tokenizer, question, db_id, device=device).unsqueeze(0)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_tensor,
                max_new_tokens=128,
                num_beams=8,
                num_return_sequences=8
            )

        best_sql = ""

        # 🔥 EXECUTION-GUIDED SELECTION
        # Take the first beam candidate (in beam order) that actually executes.
        for out in outputs:
            raw_pred = tokenizer.decode(out, skip_special_tokens=True)
            candidate_sql = fix_sql(raw_pred)

            if is_executable(candidate_sql, str(db_path)):
                best_sql = candidate_sql
                break

        # Fall back to the top beam when no candidate executed.
        if not best_sql:
            best_sql = fix_sql(tokenizer.decode(outputs[0], skip_special_tokens=True))

        # Schema-constraint check; validator failures count as invalid.
        try:
            is_valid, _ = validate_sql_schema(best_sql, str(db_path))
        except:
            is_valid = False

        if is_valid:
            constraint_ok += 1

        if normalize_sql(best_sql) == normalize_sql(gold_query):
            em_correct += 1

        if check_execution(best_sql, gold_query, str(db_path), question):
            ex_correct += 1

        if i % 50 == 0:
            print(f"{i}/{len(dev_data)} done")

    print("\n========================================")
    print("🎯 FINAL EVALUATION RESULTS")
    print("========================================")
    print(f"Exact Match (EM): {(em_correct/len(dev_data))*100:.2f}%")
    print(f"Execution Acc (EX): {(ex_correct/len(dev_data))*100:.2f}%")
    print(f"Constraint Rate: {(constraint_ok/len(dev_data))*100:.2f}%")
    print("========================================")
    print(f"Errors logged to: {ERROR_LOG_FILE}")

if __name__ == "__main__":
    main()
src/eval_rl_t5.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import sys
2
+ # import os
3
+ # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
4
+ # import json
5
+
6
+ # import subprocess
7
+
8
+ # import argparse
9
+ # from pathlib import Path
10
+
11
+ # import torch
12
+ # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
13
+ # from peft import PeftModel
14
+
15
+ # # IMPORTANT: must match training prompt format
16
+ # from prompting import build_prompt
17
+ # from schema_utils import get_schema as get_db_schema
18
+
19
+
20
+ # def _parse_exec_accuracy(stdout: str):
21
+ # for line in stdout.splitlines():
22
+ # if line.strip().startswith("execution"):
23
+ # parts = line.split()
24
+ # try:
25
+ # return float(parts[-1])
26
+ # except Exception:
27
+ # return None
28
+ # return None
29
+
30
+
31
+ # def main():
32
+ # parser = argparse.ArgumentParser()
33
+ # parser.add_argument("--adapter", type=str, default="checkpoints/best_rlhf_model")
34
+ # parser.add_argument("--num_samples", type=int, default=200)
35
+ # args = parser.parse_args()
36
+
37
+ # project_root = Path(__file__).resolve().parents[1]
38
+ # adapter_dir = project_root / args.adapter
39
+
40
+ # if not adapter_dir.exists():
41
+ # raise FileNotFoundError(f"Adapter not found: {adapter_dir}")
42
+
43
+ # db_root = project_root / "data" / "database"
44
+ # table_json = project_root / "data" / "tables.json"
45
+ # dev_json = project_root / "data" / "dev.json"
46
+ # gold_sql = project_root / "data" / "dev_gold.sql"
47
+ # pred_path = project_root / "predictions_rl.txt"
48
+
49
+ # device = "mps" if torch.backends.mps.is_available() else "cpu"
50
+
51
+ # # ---- LOAD MODEL (CodeT5 + LoRA) ----
52
+ # base_model = "Salesforce/codet5-base"
53
+
54
+ # tokenizer = AutoTokenizer.from_pretrained(str(adapter_dir))
55
+ # base = AutoModelForSeq2SeqLM.from_pretrained(base_model).to(device)
56
+ # model = PeftModel.from_pretrained(base, str(adapter_dir)).to(device)
57
+
58
+ # # merge LoRA for faster inference
59
+ # model = model.merge_and_unload()
60
+ # model.eval()
61
+ # model.config.use_cache = True
62
+
63
+ # if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
64
+ # tokenizer.pad_token = tokenizer.eos_token
65
+
66
+ # # ---- LOAD DATA ----
67
+ # with dev_json.open() as f:
68
+ # dev = json.load(f)
69
+
70
+ # dev = dev[: args.num_samples]
71
+
72
+ # gen_kwargs = dict(
73
+ # max_new_tokens=120,
74
+ # do_sample=False,
75
+ # num_beams=1,
76
+ # pad_token_id=tokenizer.pad_token_id,
77
+ # eos_token_id=tokenizer.eos_token_id,
78
+ # )
79
+
80
+ # print(f"Generating {len(dev)} predictions...")
81
+
82
+ # with pred_path.open("w") as out_f, torch.no_grad():
83
+ # for i, ex in enumerate(dev, start=1):
84
+ # db_id = ex["db_id"]
85
+ # question = ex["question"]
86
+
87
+ # db_path = db_root / db_id / f"{db_id}.sqlite"
88
+ # schema = get_db_schema(str(db_path))
89
+ # prompt = build_prompt(question, schema, use_schema=True)
90
+
91
+ # inputs = tokenizer(
92
+ # prompt,
93
+ # return_tensors="pt",
94
+ # truncation=True,
95
+ # max_length=512
96
+ # ).to(device)
97
+
98
+ # out = model.generate(**inputs, **gen_kwargs)
99
+ # pred_sql = tokenizer.decode(out[0], skip_special_tokens=True).strip()
100
+
101
+ # out_f.write(f"{pred_sql}\t{db_id}\n")
102
+
103
+ # if i % 20 == 0 or i == len(dev):
104
+ # print(f"{i}/{len(dev)} done")
105
+
106
+ # # ---- SPIDER OFFICIAL EVAL ----
107
+ # eval_script = project_root / "spider_eval" / "evaluation.py"
108
+
109
+ # cmd = [
110
+ # sys.executable,
111
+ # str(eval_script),
112
+ # "--gold",
113
+ # str(gold_sql),
114
+ # "--pred",
115
+ # str(pred_path),
116
+ # "--etype",
117
+ # "exec",
118
+ # "--db",
119
+ # str(db_root),
120
+ # "--table",
121
+ # str(table_json),
122
+ # ]
123
+
124
+ # print("\nRunning Spider execution evaluation...\n")
125
+ # proc = subprocess.run(cmd, capture_output=True, text=True)
126
+
127
+ # if proc.returncode != 0:
128
+ # print(proc.stdout)
129
+ # print(proc.stderr)
130
+ # sys.exit(proc.returncode)
131
+
132
+ # print(proc.stdout)
133
+
134
+ # acc = _parse_exec_accuracy(proc.stdout)
135
+ # if acc is not None:
136
+ # print(f"\nFINAL EXECUTION ACCURACY: {acc*100:.2f}%")
137
+ # else:
138
+ # print("Could not parse execution accuracy")
139
+
140
+
141
+ # if __name__ == "__main__":
142
+ # main()
143
+
144
+
145
+ import json
146
+ import sqlite3
147
+ import argparse
148
+ import time
149
+ from pathlib import Path
150
+ import torch
151
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
152
+ from peft import PeftModel
153
+
154
+ # ---------------- PROMPT (FIXED TO PERFECTLY MATCH RLHF TRAINING) ----------------
155
def build_prompt(question, schema):
    """Build the English→SQL prompt in the exact format used during RLHF training."""
    sections = (
        "translate English to SQL:",
        f"Schema:\n{schema}",
        f"Question:\n{question}",
        "SQL:",
    )
    return "\n\n".join(sections)
157
+
158
+ # ---------------- LOAD SCHEMA (FIXED TO MATCH TRAINING FORMAT) ----------------
159
def load_schema(db_path):
    """Serialize a SQLite database schema as 'table(col1, col2) table2(...)'.

    Space-separated table signatures, matching the schema encoding used by
    the RLHF training script. Returns an empty string for a DB with no tables.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        tables = cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table';"
        ).fetchall()

        parts = []
        for (table,) in tables:
            # Quote the identifier so unusual table names can't break the PRAGMA.
            quoted = table.replace('"', '""')
            cols = cursor.execute(f'PRAGMA table_info("{quoted}");').fetchall()
            col_names = [c[1] for c in cols]  # column name is field 1 of table_info rows
            parts.append(f"{table}({', '.join(col_names)})")

        # Space-separated, not newline-separated, just like the RLHF script.
        return " ".join(parts)
    finally:
        # Original leaked the connection if a PRAGMA raised; always close it.
        conn.close()
176
+
177
+
178
+ # ---------------- EXECUTION CHECK WITH TIMEOUT ----------------
179
def execution_match(pred_sql, gold_sql, db_path):
    """Return True when predicted and gold SQL yield identical row lists.

    Each query is aborted after ~5 seconds via a SQLite progress handler so a
    pathological query cannot hang the evaluation loop. Any execution error
    (including a timeout-triggered abort) counts as a mismatch.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)

        # The handler is polled every 10000 VM instructions; a non-zero
        # return value interrupts the running statement.
        start_time = time.monotonic()

        def timeout_handler():
            return 1 if (time.monotonic() - start_time) > 5.0 else 0

        conn.set_progress_handler(timeout_handler, 10000)

        cur = conn.cursor()

        cur.execute(pred_sql)
        pred = cur.fetchall()

        cur.execute(gold_sql)
        gold = cur.fetchall()

        # Strict, order-sensitive comparison of the fetched rows.
        return pred == gold

    except Exception:
        return False
    finally:
        # Original leaked the connection when a query raised; always close it.
        if conn is not None:
            conn.close()
202
+
203
+
204
+ # ---------------- MAIN ----------------
205
def main():
    """Evaluate a t5-small + LoRA adapter on Spider by execution accuracy.

    Loads the adapter, merges it into the base model, greedily generates SQL
    for up to --num_samples dev questions, and reports the fraction whose
    execution results match the gold query (via execution_match).
    """
    parser = argparse.ArgumentParser()
    # 🎯 Set the default directly to your best RLHF model!
    parser.add_argument("--adapter", type=str, default="checkpoints/rlhf_t5_best")
    parser.add_argument("--num_samples", type=int, default=1000)
    args = parser.parse_args()

    project_root = Path(__file__).resolve().parents[1]

    # Resolve adapter path safely
    adapter_path = project_root / args.adapter

    dev_json = project_root / "data" / "dev.json"
    db_root = project_root / "data" / "database"

    # 🎯 Added CUDA support — prefer MPS, then CUDA, then CPU.
    device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")

    # load model (tokenizer comes from the adapter dir; weights from the hub)
    base_model = "t5-small"
    print(f"Loading Base: {base_model}")
    print(f"Loading Adapter: {adapter_path}")

    tokenizer = AutoTokenizer.from_pretrained(str(adapter_path))
    base = AutoModelForSeq2SeqLM.from_pretrained(base_model).to(device)
    model = PeftModel.from_pretrained(base, str(adapter_path)).to(device)
    # Merge LoRA weights into the base model for faster inference.
    model = model.merge_and_unload()

    with open(dev_json) as f:
        dev = json.load(f)[: args.num_samples]

    correct = 0

    print(f"Evaluating {len(dev)} examples...\n")

    for i, ex in enumerate(dev, 1):
        question = ex["question"]
        db_id = ex["db_id"]
        gold_sql = ex["query"]

        db_path = db_root / db_id / f"{db_id}.sqlite"
        # Schema string must match the RLHF training format (see load_schema).
        schema = load_schema(db_path)

        prompt = build_prompt(question, schema)

        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=80,
                do_sample=False,
                num_beams=4,
            )

        pred_sql = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Strip any echoed prompt: keep only the text after the last "SQL:" marker.
        if "SQL:" in pred_sql:
            pred_sql = pred_sql.split("SQL:")[-1].strip()

        match = execution_match(pred_sql, gold_sql, db_path)

        if match:
            correct += 1

        if i % 10 == 0:
            print(f"{i}/{len(dev)} | Acc: {correct/i:.3f}")

    print("\n=============================")
    print(f"FINAL EXECUTION ACCURACY: {correct/len(dev)*100:.2f}%")
    print("=============================")


if __name__ == "__main__":
    main()
src/eval_single_model.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import subprocess
3
+ import sys
4
+ import argparse
5
+ import random
6
+ import sqlite3
7
+ import time
8
+ import re
9
+ import matplotlib.pyplot as plt
10
+ import numpy as np
11
+ from pathlib import Path
12
+
13
+ import torch
14
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
15
+ from peft import PeftModel
16
+
17
+ # Assuming you have a prompting.py that has encode_prompt
18
+ from prompting import encode_prompt
19
+
20
+ # -------------------------------
21
+ # LIVE CHECK HELPERS
22
+ # -------------------------------
23
def normalize_sql(sql):
    """Canonicalize SQL for loose exact-match: unify quotes, squeeze whitespace,
    lowercase, and drop a trailing semicolon."""
    unified = re.sub(r"\s+", " ", sql.replace('"', "'"))
    return unified.strip().lower().rstrip(";")
27
+
28
def check_execution(pred_sql, gold_sql, db_path):
    """Return True when predicted and gold SQL produce the same rows (order-insensitive).

    A ~2-second progress-handler timeout prevents pathological queries from
    hanging the evaluation loop; any error or timeout counts as a mismatch.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        # Tolerate non-UTF-8 blobs stored in the Spider databases.
        conn.text_factory = lambda b: b.decode(errors='ignore')

        # Abort statements running longer than ~2 seconds: the handler is
        # polled every 10000 VM ops and a non-zero return interrupts the query.
        start_time = time.monotonic()

        def timeout_handler():
            return 1 if (time.monotonic() - start_time) > 2.0 else 0

        conn.set_progress_handler(timeout_handler, 10000)

        cursor = conn.cursor()
        cursor.execute(pred_sql)
        pred_res = cursor.fetchall()

        cursor.execute(gold_sql)
        gold_res = cursor.fetchall()

        # Sorting makes the comparison row-order-insensitive; rows with
        # mutually uncomparable types will raise here and count as a mismatch.
        return sorted(pred_res) == sorted(gold_res)
    except Exception:
        return False
    finally:
        # Original leaked the connection when a query raised; always close it.
        if conn is not None:
            conn.close()
49
+
50
+ # -------------------------------
51
+ # SPIDER PARSER
52
+ # -------------------------------
53
+ def _parse_spider_accuracy(stdout: str, metric_type: str) -> float | None:
54
+ for line in stdout.splitlines():
55
+ if metric_type == "exec" and line.strip().startswith("execution"):
56
+ try: return float(line.split()[-1])
57
+ except: pass
58
+ elif metric_type == "match" and line.strip().startswith("exact"):
59
+ try: return float(line.split()[-1])
60
+ except: pass
61
+ return None
62
+
63
+ # -------------------------------
64
+ # MAIN
65
+ # -------------------------------
66
def main():
    """Evaluate one seq2seq+LoRA checkpoint on Spider and update the comparison plot.

    Generates predictions with live EM/EX tracking, runs the official Spider
    evaluation script twice (match + exec) via subprocess, appends the scores
    to comparison_plots/all_metrics.json, and regenerates the grouped bar chart
    comparing all evaluated models.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--adapter", type=str, required=True, help="Path to your checkpoint")
    parser.add_argument("--base_model", type=str, required=True, help="E.g., facebook/bart-base, t5-small")
    parser.add_argument("--model_name", type=str, required=True, help="Name for the plot label (e.g., 'BART RLHF')")
    parser.add_argument("--num_samples", type=int, default=700)
    args = parser.parse_args()

    project_root = Path(__file__).resolve().parents[1]
    adapter_dir = project_root / args.adapter

    db_root = project_root / "data" / "database"
    table_json = project_root / "data" / "tables.json"
    dev_json = project_root / "data" / "dev.json"

    # Scratch files consumed by the official Spider evaluation script below.
    pred_path = project_root / "temp_predictions.txt"
    temp_gold_path = project_root / "temp_gold.sql"

    # NEW: Plot directory setup
    plot_dir = project_root / "comparison_plots"
    plot_dir.mkdir(parents=True, exist_ok=True)
    results_json_path = plot_dir / "all_metrics.json"

    if not adapter_dir.exists():
        raise FileNotFoundError(f"Missing adapter dir: {adapter_dir}")

    # Prefer MPS, then CUDA, then CPU.
    device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Loading Base Model: {args.base_model} on {device}...")

    tokenizer = AutoTokenizer.from_pretrained(args.base_model)
    # Some bases (e.g. GPT-style tokenizers) ship without a pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    base = AutoModelForSeq2SeqLM.from_pretrained(args.base_model).to(device)
    model = PeftModel.from_pretrained(base, str(adapter_dir)).to(device)
    # Merge LoRA weights into the base model for faster inference.
    model = model.merge_and_unload()
    model.eval()

    with dev_json.open() as f:
        dev = json.load(f)[: args.num_samples]
    total = len(dev)

    gen_kwargs = dict(
        max_new_tokens=160,
        num_beams=4,
        do_sample=False,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    print(f"\n🚀 Generating and live-tracking {total} samples...\n")

    em_correct = 0
    ex_correct = 0

    with pred_path.open("w") as out_pred, temp_gold_path.open("w") as out_gold, torch.no_grad():
        for i, ex in enumerate(dev, start=1):
            db_id = ex["db_id"]
            question = ex["question"]
            gold_query = ex["query"]
            db_path = db_root / db_id / f"{db_id}.sqlite"

            # Generate — encode_prompt returns a 1-D token tensor; add a batch dim.
            input_ids = encode_prompt(tokenizer, question, db_id, device=device, max_input_tokens=512)
            input_ids = input_ids.unsqueeze(0).to(device)
            attention_mask = (input_ids != tokenizer.pad_token_id).long().to(device)

            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, **gen_kwargs)
            pred_sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

            # One prediction per line; gold file is "query<TAB>db_id" (Spider format).
            out_pred.write(f"{pred_sql}\n")
            out_gold.write(f"{gold_query}\t{db_id}\n")

            # --- PRINT FIRST 3 EXAMPLES ---
            if i <= 3:
                print(f"--- 🔍 Example {i} ---")
                print(f"Q : {question}")
                print(f"Gold: {gold_query}")
                print(f"Pred: {pred_sql}")
                print("-" * 25)

            # --- LIVE TRACKING CHECKS (informal; official numbers come from Spider eval) ---
            if normalize_sql(pred_sql) == normalize_sql(gold_query):
                em_correct += 1
            if check_execution(pred_sql, gold_query, db_path):
                ex_correct += 1

            if i % 50 == 0 or i == total:
                print(f"Progress: {i}/{total} | Current EM: {(em_correct/i)*100:.2f}% | Current EX: {(ex_correct/i)*100:.2f}%")

    print("\nRunning Official Spider Evaluations...")
    eval_script = project_root / "spider_eval" / "evaluation.py"

    # Run the official evaluator twice: once for exact match, once for execution.
    proc_match = subprocess.run([sys.executable, str(eval_script), "--gold", str(temp_gold_path), "--pred", str(pred_path), "--etype", "match", "--db", str(db_root), "--table", str(table_json)], capture_output=True, text=True)
    exact_acc = _parse_spider_accuracy(proc_match.stdout, "match")

    proc_exec = subprocess.run([sys.executable, str(eval_script), "--gold", str(temp_gold_path), "--pred", str(pred_path), "--etype", "exec", "--db", str(db_root), "--table", str(table_json)], capture_output=True, text=True)
    exec_acc = _parse_spider_accuracy(proc_exec.stdout, "exec")

    print("\n==========================================")
    print(f"🎯 RESULTS FOR: {args.model_name}")
    print("==========================================")
    # NOTE(review): truthiness check means a parsed 0.0 is treated like a parse
    # failure (reported as 0 either way) — confirm this is acceptable.
    exact_val = exact_acc * 100 if exact_acc else 0
    exec_val = exec_acc * 100 if exec_acc else 0
    print(f"Exact Match : {exact_val:.2f}%")
    print(f"Execution : {exec_val:.2f}%")
    print("==========================================\n")

    # -------------------------------
    # SAVE JSON & GENERATE PLOT
    # -------------------------------
    if results_json_path.exists():
        with open(results_json_path, 'r') as f:
            all_results = json.load(f)
    else:
        all_results = {}

    all_results[args.model_name] = {"EM": exact_val, "EX": exec_val}

    with open(results_json_path, 'w') as f:
        json.dump(all_results, f, indent=4)

    # Grouped bar chart: one EM/EX pair per model recorded so far.
    labels = list(all_results.keys())
    em_vals = [all_results[k]["EM"] for k in labels]
    ex_vals = [all_results[k]["EX"] for k in labels]

    x = np.arange(len(labels))
    width = 0.35

    plt.figure(figsize=(max(8, len(labels) * 1.5), 6))
    plt.bar(x - width/2, em_vals, width, label='Exact Match', color='#3498db')
    plt.bar(x + width/2, ex_vals, width, label='Execution', color='#2ecc71')

    plt.ylabel('Accuracy (%)', fontweight='bold')
    plt.title('Model Comparison: Exact Match vs Execution Accuracy', fontweight='bold', fontsize=14)
    plt.xticks(x, labels, rotation=45, ha="right")
    plt.legend()
    plt.ylim(0, max(max(em_vals, default=0), max(ex_vals, default=0)) + 15)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Attach labels to bars
    for i in range(len(labels)):
        plt.text(x[i] - width/2, em_vals[i] + 1, f"{em_vals[i]:.1f}%", ha='center', fontsize=9)
        plt.text(x[i] + width/2, ex_vals[i] + 1, f"{ex_vals[i]:.1f}%", ha='center', fontsize=9)

    plt.tight_layout()
    plot_path = plot_dir / "accuracy_comparison.png"
    plt.savefig(plot_path, dpi=300)
    print(f"📈 Updated comparison plot saved to: {plot_path}")

if __name__ == "__main__":
    main()
src/evaluate_model_codet5.py ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from __future__ import annotations
2
+
3
+ # import json
4
+ # import subprocess
5
+ # import sys
6
+ # import argparse
7
+ # import sqlite3
8
+ # import random
9
+ # from pathlib import Path
10
+
11
+ # import torch
12
+ # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
13
+ # from peft import PeftModel
14
+
15
+ # from prompting import encode_prompt
16
+
17
+
18
+ # def _parse_exec_accuracy(stdout: str) -> float | None:
19
+ # for line in stdout.splitlines():
20
+ # if line.strip().startswith("execution"):
21
+ # try:
22
+ # return float(line.split()[-1])
23
+ # except:
24
+ # return None
25
+ # return None
26
+
27
+
28
+ # def main():
29
+
30
+ # # ---------------- ARGUMENTS ----------------
31
+ # parser = argparse.ArgumentParser()
32
+ # parser.add_argument("--adapter", type=str, default="checkpoints/sft_adapter_codet5")
33
+ # parser.add_argument("--num_samples", type=int, default=1000)
34
+ # parser.add_argument("--shuffle_dev", action="store_true")
35
+ # parser.add_argument("--shuffle_seed", type=int, default=42)
36
+ # parser.add_argument("--accuracy_log", type=str, default="")
37
+ # args = parser.parse_args()
38
+
39
+ # project_root = Path(__file__).resolve().parents[1]
40
+ # adapter_dir = project_root / args.adapter
41
+
42
+ # db_root = project_root / "data" / "database"
43
+ # table_json = project_root / "data" / "tables.json"
44
+ # dev_json = project_root / "data" / "dev.json"
45
+ # gold_sql = project_root / "data" / "dev_gold.sql"
46
+ # pred_path = project_root / "predictions.txt"
47
+
48
+ # if not adapter_dir.exists():
49
+ # raise FileNotFoundError(f"Missing adapter dir: {adapter_dir}")
50
+
51
+ # # ---------------- DEVICE ----------------
52
+ # device = "mps" if torch.backends.mps.is_available() else (
53
+ # "cuda" if torch.cuda.is_available() else "cpu"
54
+ # )
55
+ # print("Using device:", device)
56
+
57
+ # # ---------------- LOAD MODEL ----------------
58
+ # BASE_MODEL = "Salesforce/codet5-base"
59
+
60
+ # tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
61
+
62
+ # if tokenizer.pad_token is None:
63
+ # tokenizer.pad_token = tokenizer.eos_token
64
+
65
+ # base = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)
66
+ # model = PeftModel.from_pretrained(base, str(adapter_dir)).to(device)
67
+
68
+ # model = model.merge_and_unload()
69
+ # model.eval()
70
+
71
+ # # ---------------- LOAD DATA ----------------
72
+ # with dev_json.open() as f:
73
+ # dev = json.load(f)
74
+
75
+ # if args.shuffle_dev:
76
+ # rng = random.Random(args.shuffle_seed)
77
+ # rng.shuffle(dev)
78
+
79
+ # dev = dev[: args.num_samples]
80
+
81
+ # # ---------------- GENERATION CONFIG ----------------
82
+ # gen_kwargs = dict(
83
+ # max_new_tokens=160,
84
+ # num_beams=4,
85
+ # do_sample=False,
86
+ # early_stopping=True,
87
+ # pad_token_id=tokenizer.pad_token_id,
88
+ # eos_token_id=tokenizer.eos_token_id,
89
+ # )
90
+
91
+ # print("Generating predictions...\n")
92
+
93
+ # correct = 0
94
+ # total = len(dev)
95
+ # accuracy_log_fh = None
96
+
97
+ # if args.accuracy_log:
98
+ # accuracy_log_path = (project_root / args.accuracy_log).resolve()
99
+ # accuracy_log_path.parent.mkdir(parents=True, exist_ok=True)
100
+ # accuracy_log_fh = accuracy_log_path.open("w")
101
+ # print(f"Writing running accuracy log to: {accuracy_log_path}")
102
+
103
+ # with pred_path.open("w") as out_f, torch.no_grad():
104
+
105
+ # for i, ex in enumerate(dev, start=1):
106
+
107
+ # db_id = ex["db_id"]
108
+ # question = ex["question"]
109
+ # gold_query = ex["query"]
110
+
111
+ # input_ids = encode_prompt(
112
+ # tokenizer,
113
+ # question,
114
+ # db_id,
115
+ # device=device,
116
+ # max_input_tokens=512,
117
+ # )
118
+
119
+ # input_ids = input_ids.unsqueeze(0).to(device)
120
+ # attention_mask = (input_ids != tokenizer.pad_token_id).long().to(device)
121
+
122
+ # outputs = model.generate(
123
+ # input_ids=input_ids,
124
+ # attention_mask=attention_mask,
125
+ # **gen_kwargs
126
+ # )
127
+
128
+ # pred_sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
129
+ # out_f.write(f"{pred_sql}\t{db_id}\n")
130
+
131
+ # # ---------------- LIVE EXECUTION CHECK ----------------
132
+ # try:
133
+ # db_path = db_root / db_id / f"{db_id}.sqlite"
134
+
135
+ # conn = sqlite3.connect(db_path)
136
+ # cursor = conn.cursor()
137
+
138
+ # cursor.execute(pred_sql)
139
+ # pred_rows = cursor.fetchall()
140
+
141
+ # cursor.execute(gold_query)
142
+ # gold_rows = cursor.fetchall()
143
+
144
+ # conn.close()
145
+
146
+ # if sorted(pred_rows) == sorted(gold_rows):
147
+ # correct += 1
148
+
149
+ # except Exception:
150
+ # pass # execution failed
151
+
152
+ # # 🔥 PRINT EVERY 10
153
+ # if i % 10 == 0 or i == total:
154
+ # current_acc = correct / i
155
+ # line = f"{i}/{total} | Acc: {current_acc:.3f}"
156
+ # print(line)
157
+ # if accuracy_log_fh is not None:
158
+ # accuracy_log_fh.write(line + "\n")
159
+
160
+ # if accuracy_log_fh is not None:
161
+ # accuracy_log_fh.close()
162
+
163
+ # print("\nGeneration finished.\n")
164
+
165
+ # # ---------------- OFFICIAL SPIDER EVAL ----------------
166
+ # eval_script = project_root / "spider_eval" / "evaluation.py"
167
+
168
+ # cmd = [
169
+ # sys.executable,
170
+ # str(eval_script),
171
+ # "--gold", str(gold_sql),
172
+ # "--pred", str(pred_path),
173
+ # "--etype", "exec",
174
+ # "--db", str(db_root),
175
+ # "--table", str(table_json),
176
+ # ]
177
+
178
+ # print("Running Spider evaluation...")
179
+ # proc = subprocess.run(cmd, capture_output=True, text=True)
180
+
181
+ # print(proc.stdout)
182
+
183
+ # exec_acc = _parse_exec_accuracy(proc.stdout)
184
+ # if exec_acc is not None:
185
+ # print(f"\n🎯 Official Execution Accuracy: {exec_acc*100:.2f}%")
186
+ # else:
187
+ # print("Could not parse accuracy.")
188
+
189
+
190
+ # if __name__ == "__main__":
191
+ # main()
192
+
193
+ import json
194
+ import subprocess
195
+ import sys
196
+ import argparse
197
+ import random
198
+ import sqlite3
199
+ import time
200
+ import re
201
+ from pathlib import Path
202
+
203
+ import torch
204
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
205
+ from peft import PeftModel
206
+
207
+ # Assuming you have a prompting.py that has encode_prompt
208
+ from prompting import encode_prompt
209
+
210
+ # -------------------------------
211
+ # LIVE CHECK HELPERS
212
+ # -------------------------------
213
def normalize_sql(sql):
    """Canonicalize a SQL string for the live exact-match check.

    Double quotes become single quotes, runs of whitespace collapse into a
    single space, and the result is lower-cased with any trailing
    semicolon removed.
    """
    unified = sql.replace('"', "'")
    collapsed = re.sub(r"\s+", " ", unified)
    return collapsed.strip().lower().rstrip(";")
218
+
219
def check_execution(pred_sql, gold_sql, db_path):
    """Execution-accuracy check used by the live progress bar.

    Runs both the predicted and gold queries against the SQLite database at
    *db_path* and compares their result sets order-insensitively.

    Returns True when both queries execute and yield the same rows; False on
    any failure (invalid SQL, missing database, timeout, ...).
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        # Tolerate non-UTF8 text stored in some Spider databases.
        conn.text_factory = lambda b: b.decode(errors='ignore')

        # 2-second timeout so the live tracker doesn't freeze forever: a
        # non-zero return from the progress handler makes SQLite abort the
        # query with OperationalError, caught below.
        start_time = time.monotonic()

        def timeout_handler():
            return 1 if (time.monotonic() - start_time) > 2.0 else 0

        conn.set_progress_handler(timeout_handler, 10000)

        cursor = conn.cursor()
        cursor.execute(pred_sql)
        pred_res = cursor.fetchall()

        cursor.execute(gold_sql)
        gold_res = cursor.fetchall()

        # Simple sorted check for the live tracker (order-insensitive).
        return sorted(pred_res) == sorted(gold_res)
    except Exception:
        return False
    finally:
        # BUGFIX: the original leaked the connection whenever a query raised
        # (close() was only reached on the success path); always close it.
        if conn is not None:
            conn.close()
243
+
244
+ # -------------------------------
245
+ # SPIDER PARSER
246
+ # -------------------------------
247
+ def _parse_spider_accuracy(stdout: str, metric_type: str) -> float | None:
248
+ for line in stdout.splitlines():
249
+ if metric_type == "exec" and line.strip().startswith("execution"):
250
+ try: return float(line.split()[-1])
251
+ except: pass
252
+ elif metric_type == "match" and line.strip().startswith("exact"):
253
+ try: return float(line.split()[-1])
254
+ except: pass
255
+ return None
256
+
257
+ # -------------------------------
258
+ # MAIN
259
+ # -------------------------------
260
def main() -> None:
    """Evaluate a CodeT5 LoRA adapter on the Spider dev set.

    Generates SQL for each dev example with beam search, tracks live
    exact-match (EM) and execution (EX) accuracy, then runs the official
    Spider evaluator twice (match + exec) on the written prediction/gold
    files and prints the final numbers.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--adapter", type=str, required=True, help="Path to your SFT or RLHF checkpoint")
    parser.add_argument("--num_samples", type=int, default=1034, help="Number of samples to evaluate")
    parser.add_argument("--shuffle_dev", action="store_true")
    parser.add_argument("--shuffle_seed", type=int, default=42)
    args = parser.parse_args()

    # Paths are resolved relative to the repository root (parent of src/).
    project_root = Path(__file__).resolve().parents[1]
    adapter_dir = project_root / args.adapter

    db_root = project_root / "data" / "database"
    table_json = project_root / "data" / "tables.json"
    dev_json = project_root / "data" / "dev.json"

    # Scratch files consumed by the official Spider evaluator below.
    pred_path = project_root / "temp_predictions.txt"
    temp_gold_path = project_root / "temp_gold.sql"

    if not adapter_dir.exists():
        raise FileNotFoundError(f"Missing adapter dir: {adapter_dir}")

    # Prefer Apple MPS, then CUDA, then CPU.
    device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    BASE_MODEL = "Salesforce/codet5-base"
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load the frozen base model, apply the LoRA adapter, then merge the
    # adapter weights for faster inference.
    print(f"Loading Model: {args.adapter}...")
    base = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)
    model = PeftModel.from_pretrained(base, str(adapter_dir)).to(device)
    model = model.merge_and_unload()
    model.eval()

    with dev_json.open() as f:
        dev = json.load(f)

    # Optional deterministic shuffle so subsets are representative.
    if args.shuffle_dev:
        rng = random.Random(args.shuffle_seed)
        rng.shuffle(dev)

    dev = dev[: args.num_samples]
    total = len(dev)

    # Deterministic beam-search decoding config.
    gen_kwargs = dict(
        max_new_tokens=160,
        num_beams=4,
        do_sample=False,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    print(f"\n🚀 Generating and live-tracking {total} samples...\n")

    em_correct = 0
    ex_correct = 0

    with pred_path.open("w") as out_pred, temp_gold_path.open("w") as out_gold, torch.no_grad():
        for i, ex in enumerate(dev, start=1):
            db_id = ex["db_id"]
            question = ex["question"]
            gold_query = ex["query"]
            db_path = db_root / db_id / f"{db_id}.sqlite"

            # Generate
            input_ids = encode_prompt(tokenizer, question, db_id, device=device, max_input_tokens=512)
            input_ids = input_ids.unsqueeze(0).to(device)
            attention_mask = (input_ids != tokenizer.pad_token_id).long().to(device)

            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, **gen_kwargs)
            pred_sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

            # Write to files for official spider eval later
            out_pred.write(f"{pred_sql}\n")
            out_gold.write(f"{gold_query}\t{db_id}\n")

            # --- LIVE TRACKING CHECKS ---
            # Approximate EM/EX here; official numbers come from spider_eval.
            if normalize_sql(pred_sql) == normalize_sql(gold_query):
                em_correct += 1
            if check_execution(pred_sql, gold_query, db_path):
                ex_correct += 1

            # Print progress every 50 loops
            if i % 50 == 0 or i == total:
                print(f"Progress: {i}/{total} | Current EM: {(em_correct/i)*100:.2f}% | Current EX: {(ex_correct/i)*100:.2f}%")

    print("\nGeneration finished. Running Official Spider Evaluations for final numbers...\n")

    eval_script = project_root / "spider_eval" / "evaluation.py"

    # 1. RUN EXACT MATCH EVAL
    cmd_match = [
        sys.executable, str(eval_script),
        "--gold", str(temp_gold_path),
        "--pred", str(pred_path),
        "--etype", "match",
        "--db", str(db_root),
        "--table", str(table_json),
    ]
    proc_match = subprocess.run(cmd_match, capture_output=True, text=True)
    exact_acc = _parse_spider_accuracy(proc_match.stdout, "match")

    # 2. RUN EXECUTION EVAL
    cmd_exec = [
        sys.executable, str(eval_script),
        "--gold", str(temp_gold_path),
        "--pred", str(pred_path),
        "--etype", "exec",
        "--db", str(db_root),
        "--table", str(table_json),
    ]
    proc_exec = subprocess.run(cmd_exec, capture_output=True, text=True)
    exec_acc = _parse_spider_accuracy(proc_exec.stdout, "exec")

    print("==========================================")
    print(f"🎯 OFFICIAL SPIDER RESULTS FOR: {args.adapter}")
    print("==========================================")

    if exact_acc is not None:
        print(f"Exact Set Match Accuracy : {exact_acc*100:.2f}%")
    else:
        print("Exact Set Match Accuracy : Could not parse output")

    if exec_acc is not None:
        print(f"Execution Accuracy : {exec_acc*100:.2f}%")
    else:
        print("Execution Accuracy : Could not parse output")
    print("==========================================\n")

if __name__ == "__main__":
    main()
src/evaluate_model_t5_small_sft.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import subprocess
5
+ import sys
6
+ import argparse
7
+ import re
8
+ import sqlite3
9
+ from pathlib import Path
10
+
11
+ import torch
12
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
13
+ from peft import PeftModel
14
+ from prompting import encode_prompt
15
+
16
+
17
+ # ---------------- PARSE ACC ----------------
18
+ def _parse_exec_accuracy(stdout: str) -> float | None:
19
+ for line in stdout.splitlines():
20
+ if line.strip().startswith("execution"):
21
+ try:
22
+ return float(line.split()[-1])
23
+ except:
24
+ return None
25
+ return None
26
+
27
+
28
+ # ---------------- CLEAN SQL ----------------
29
def clean_prediction(pred_sql: str) -> str:
    """Normalize a raw model prediction into a single-line SQL statement.

    Strips any leading "SQL:" prompt echo, unifies quote characters,
    collapses whitespace, and guarantees a terminating semicolon.
    """
    cleaned = pred_sql.strip()

    if "SQL:" in cleaned:
        cleaned = cleaned.split("SQL:")[-1]

    cleaned = re.sub(r"\s+", " ", cleaned.replace('"', "'")).strip()

    return cleaned if cleaned.endswith(";") else cleaned + ";"
42
+
43
+
44
def main() -> None:
    """Evaluate a t5-small LoRA adapter on the Spider dev set.

    Generates SQL with beam search, keeps a live execution-accuracy
    counter while writing predictions to pred.sql, then runs the official
    Spider execution evaluation on the written file.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("--adapter", type=str, default="checkpoints/sft_t5")
    parser.add_argument("--num_samples", type=int, default=1000)
    args = parser.parse_args()

    # Repository root (parent of src/) anchors all data/checkpoint paths.
    project_root = Path(__file__).resolve().parents[1]
    adapter_dir = project_root / args.adapter

    db_root = project_root / "data/database"
    table_json = project_root / "data/tables.json"
    dev_json = project_root / "data/dev.json"
    gold_sql = project_root / "data/dev_gold.sql"
    pred_path = project_root / "pred.sql"

    if not adapter_dir.exists():
        raise FileNotFoundError(f"Missing adapter dir: {adapter_dir}")

    # ---------------- DEVICE ----------------
    # Prefer Apple MPS, then CUDA, then CPU.
    device = "mps" if torch.backends.mps.is_available() else (
        "cuda" if torch.cuda.is_available() else "cpu"
    )
    print("Using device:", device)

    # ---------------- LOAD MODEL ----------------
    BASE_MODEL = "t5-small"

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    base = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)

    # Apply the LoRA adapter and merge its weights for faster inference.
    model = PeftModel.from_pretrained(base, str(adapter_dir)).to(device)
    model = model.merge_and_unload()
    model.eval()

    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    # ---------------- LOAD DATA ----------------
    with dev_json.open() as f:
        dev = json.load(f)[: args.num_samples]

    print("Generating predictions...\n")

    correct = 0
    total = len(dev)

    # ---------------- GENERATE + LIVE EXEC ----------------
    with pred_path.open("w") as out_f, torch.no_grad():

        for i, ex in enumerate(dev, start=1):

            db_id = ex["db_id"]
            question = ex["question"]
            gold_query = ex["query"]

            prompt_ids = encode_prompt(
                tokenizer,
                question,
                db_id,
                device=device,
                max_input_tokens=512,
            )

            input_ids = prompt_ids.unsqueeze(0).to(device)
            attention_mask = (input_ids != tokenizer.pad_token_id).long().to(device)

            # Deterministic beam-search decoding.
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=160,
                num_beams=4,
                do_sample=False,
                early_stopping=True,
            )

            pred_sql = tokenizer.decode(outputs[0], skip_special_tokens=True)
            pred_sql = clean_prediction(pred_sql)

            out_f.write(pred_sql + "\n")

            # -------- LIVE EXECUTION CHECK --------
            # Approximate metric only; the official number comes from
            # spider_eval below.
            try:
                db_path = db_root / db_id / f"{db_id}.sqlite"

                conn = sqlite3.connect(db_path)
                cursor = conn.cursor()

                cursor.execute(pred_sql)
                pred_rows = cursor.fetchall()

                cursor.execute(gold_query)
                gold_rows = cursor.fetchall()

                conn.close()

                # Order-insensitive row comparison.
                if sorted(pred_rows) == sorted(gold_rows):
                    correct += 1

            except Exception:
                pass  # execution failed

            # 🔥 PRINT EVERY 10
            if i % 10 == 0 or i == total:
                current_acc = correct / i
                print(f"{i}/{total} | Acc: {current_acc:.3f}")

    print("\nGeneration finished.\n")

    # ---------------- SPIDER EVAL ----------------
    eval_script = project_root / "spider_eval/evaluation.py"

    cmd = [
        sys.executable,
        str(eval_script),
        "--gold", str(gold_sql),
        "--pred", str(pred_path),
        "--etype", "exec",
        "--db", str(db_root),
        "--table", str(table_json),
    ]

    print("Running Spider evaluation...")
    proc = subprocess.run(cmd, capture_output=True, text=True)

    print(proc.stdout)

    exec_acc = _parse_exec_accuracy(proc.stdout)
    if exec_acc is not None:
        print(f"\n🎯 Official Execution Accuracy: {exec_acc*100:.2f}%")
    else:
        print("Could not parse accuracy.")


if __name__ == "__main__":
    main()
src/evaluate_rl_bart.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import json
3
+ import sqlite3
4
+ import argparse
5
+ import time
6
+ from pathlib import Path
7
+ import torch
8
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
9
+ from peft import PeftModel
10
+
11
+ # ---------------- PROMPT (IDENTICAL TO TRAINING) ----------------
12
def build_prompt(question, schema):
    """Assemble the evaluation prompt.

    NOTE: the wording and line breaks must stay identical to the prompt
    used during training, so the pieces below are concatenated verbatim.
    """
    return (
        "\nDatabase Schema:\n"
        + schema
        + "\n\nTranslate English to SQL:\n"
        + question
        + "\nSQL:\n"
    )
21
+
22
+ # ---------------- LOAD SCHEMA ----------------
23
def load_schema(db_path):
    """Return a compact one-line-per-table description of a SQLite schema.

    Reads table names from sqlite_master and their columns via
    PRAGMA table_info, producing lines like ``table(col1, col2)``.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        tables = cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table';"
        ).fetchall()

        lines = []
        for (table,) in tables:
            # BUGFIX: quote the identifier so table names containing spaces
            # or SQL keywords don't break the PRAGMA.
            cols = cursor.execute(f'PRAGMA table_info("{table}");').fetchall()
            col_names = [c[1] for c in cols]
            lines.append(f"{table}({', '.join(col_names)})\n")

        # join() instead of repeated `+=` avoids quadratic string building.
        return "".join(lines)
    finally:
        # BUGFIX: the original leaked the connection if any query raised.
        conn.close()
39
+
40
+
41
+ # ---------------- EXECUTION CHECK WITH TIMEOUT ----------------
42
def execution_match(pred_sql, gold_sql, db_path):
    """Compare predicted and gold SQL by executing both on the database.

    Returns True when both queries run and produce identical (order-
    sensitive) result lists; False on any failure or timeout.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)

        # --- 5-SECOND TIMEOUT SO EVALUATION DOESN'T FREEZE ---
        # A non-zero return from the progress handler makes SQLite abort the
        # running query with OperationalError, caught below.
        start_time = time.monotonic()

        def timeout_handler():
            return 1 if (time.monotonic() - start_time) > 5.0 else 0

        conn.set_progress_handler(timeout_handler, 10000)

        cur = conn.cursor()

        cur.execute(pred_sql)
        pred = cur.fetchall()

        cur.execute(gold_sql)
        gold = cur.fetchall()

        # NOTE: deliberately order-sensitive, matching the original metric.
        return pred == gold

    except Exception:
        return False
    finally:
        # BUGFIX: the original leaked the connection whenever a query raised;
        # always close it.
        if conn is not None:
            conn.close()
65
+
66
+
67
+ # ---------------- MAIN ----------------
68
def main() -> None:
    """Evaluate a BART LoRA adapter on Spider using execution accuracy only.

    Builds a schema-augmented prompt per example, generates SQL with beam
    search, and counts order-sensitive execution matches against the gold
    query, printing running accuracy every 10 examples.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--adapter", type=str, required=True)
    parser.add_argument("--num_samples", type=int, default=1034)
    args = parser.parse_args()

    # Repository root (parent of src/) anchors the data paths.
    project_root = Path(__file__).resolve().parents[1]

    dev_json = project_root / "data" / "dev.json"
    db_root = project_root / "data" / "database"

    # 🎯 Added CUDA support for Nvidia GPUs
    device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")

    # load model
    base_model = "facebook/bart-base"
    print(f"Loading Base: {base_model}")
    print(f"Loading Adapter: {args.adapter}")

    # Tokenizer is loaded from the adapter directory (it may carry extra
    # tokens); the LoRA weights are merged for faster inference.
    tokenizer = AutoTokenizer.from_pretrained(args.adapter)
    base = AutoModelForSeq2SeqLM.from_pretrained(base_model).to(device)
    model = PeftModel.from_pretrained(base, args.adapter).to(device)
    model = model.merge_and_unload()

    with open(dev_json) as f:
        dev = json.load(f)[: args.num_samples]

    correct = 0

    print(f"Evaluating {len(dev)} examples...\n")

    for i, ex in enumerate(dev, 1):
        question = ex["question"]
        db_id = ex["db_id"]
        gold_sql = ex["query"]

        db_path = db_root / db_id / f"{db_id}.sqlite"
        # Schema is re-read per example; prompt layout must match training.
        schema = load_schema(db_path)

        prompt = build_prompt(question, schema)

        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Deterministic beam-search decoding.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=80,
                do_sample=False,
                num_beams=4,
            )

        pred_sql = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Drop any echoed prompt up to the last "SQL:" marker.
        if "SQL:" in pred_sql:
            pred_sql = pred_sql.split("SQL:")[-1].strip()

        match = execution_match(pred_sql, gold_sql, db_path)

        if match:
            correct += 1

        if i % 10 == 0:
            print(f"{i}/{len(dev)} | Acc: {correct/i:.3f}")

    print("\n=============================")
    print(f"FINAL EXECUTION ACCURACY: {correct/len(dev)*100:.2f}%")
    print("=============================")


if __name__ == "__main__":
    main()
src/evaluate_sft_bart.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import subprocess
5
+ import sys
6
+ import argparse
7
+ import re
8
+ import sqlite3
9
+ from pathlib import Path
10
+
11
+ import torch
12
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
13
+ from peft import PeftModel
14
+ from prompting import encode_prompt
15
+
16
+
17
+ # ---------------- SQL CLEAN ----------------
18
def extract_sql(text: str) -> str:
    """Pull a normalized SQL statement out of raw model output.

    Takes everything after the last "SQL:" marker (if present), keeps the
    text from the first SELECT onward, unifies quote characters, collapses
    whitespace, and guarantees a trailing semicolon.
    """
    candidate = text.strip()

    if "SQL:" in candidate:
        candidate = candidate.split("SQL:")[-1]

    select_match = re.search(r"(SELECT .*?)(?:$)", candidate, re.IGNORECASE | re.DOTALL)
    if select_match is not None:
        candidate = select_match.group(1)

    candidate = re.sub(r"\s+", " ", candidate.replace('"', "'")).strip()

    return candidate if candidate.endswith(";") else candidate + ";"
35
+
36
+
37
+ # ---------------- ROBUST ACC PARSER ----------------
38
def parse_exec_accuracy(stdout: str):
    """Scan Spider-eval output for the execution-accuracy value.

    Returns the last decimal number found on the first line mentioning
    "execution" (case-insensitive) that contains one, or None otherwise.
    """
    for raw_line in stdout.splitlines():
        if "execution" not in raw_line.lower():
            continue
        decimals = re.findall(r"\d+\.\d+", raw_line)
        if decimals:
            return float(decimals[-1])
    return None
45
+
46
+
47
def main() -> None:
    """Evaluate a BART SFT LoRA adapter on Spider.

    Generates SQL with beam search while tracking a live execution-accuracy
    counter, writes ``pred.sql`` in "query<TAB>db_id" format, then runs the
    official Spider execution evaluation and prints the parsed accuracy.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("--adapter", type=str, default="checkpoints/sft_best_bart_2")
    parser.add_argument("--num_samples", type=int, default=1000)
    args = parser.parse_args()

    # Repository root (parent of src/) anchors all data/checkpoint paths.
    project_root = Path(__file__).resolve().parents[1]
    adapter_dir = project_root / args.adapter

    if not adapter_dir.exists():
        raise FileNotFoundError(f"Adapter not found: {adapter_dir}")

    db_root = project_root / "data/database"
    table_json = project_root / "data/tables.json"
    dev_json = project_root / "data/dev.json"
    gold_sql_file = project_root / "data/dev_gold.sql"
    pred_sql_file = project_root / "pred.sql"

    # Prefer Apple MPS, then CUDA, then CPU.
    device = "mps" if torch.backends.mps.is_available() else (
        "cuda" if torch.cuda.is_available() else "cpu"
    )
    print("Using device:", device)

    # -------- LOAD MODEL --------
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

    BASE_MODEL = "facebook/bart-base"
    print(f"Loading base model {BASE_MODEL}...")
    base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)

    # Apply the LoRA adapter and merge its weights for faster inference.
    print("Loading LoRA adapter...")
    model = PeftModel.from_pretrained(base_model, adapter_dir).to(device)
    model = model.merge_and_unload()
    model.eval()

    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    # -------- LOAD DATA --------
    with open(dev_json) as f:
        dev = json.load(f)[: args.num_samples]

    print("Generating SQL predictions...\n")

    correct = 0
    total = len(dev)

    with open(pred_sql_file, "w") as f, torch.no_grad():

        for i, ex in enumerate(dev, 1):

            question = ex["question"]
            db_id = ex["db_id"]
            gold_query = ex["query"]

            prompt_ids = encode_prompt(
                tokenizer,
                question,
                db_id,
                device=device,
                max_input_tokens=512,
            )

            input_ids = prompt_ids.unsqueeze(0).to(device)
            attention_mask = (input_ids != tokenizer.pad_token_id).long().to(device)

            # Deterministic beam-search decoding.
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=160,
                num_beams=4,
                do_sample=False,
            )

            pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
            pred_sql = extract_sql(pred)

            f.write(f"{pred_sql}\t{db_id}\n")

            # -------- LIVE EXECUTION CHECK --------
            # Approximate metric only; the official number comes from
            # spider_eval below.
            try:
                db_path = db_root / db_id / f"{db_id}.sqlite"

                conn = sqlite3.connect(db_path)
                cursor = conn.cursor()

                cursor.execute(pred_sql)
                pred_rows = cursor.fetchall()

                cursor.execute(gold_query)
                gold_rows = cursor.fetchall()

                conn.close()

                # order insensitive comparison
                if sorted(pred_rows) == sorted(gold_rows):
                    correct += 1

            except Exception:
                pass  # execution failed

            if i % 10 == 0 or i == total:
                current_acc = correct / i
                print(f"{i}/{total} | Acc: {current_acc:.3f}")

    print("\nGeneration finished.\n")

    # -------- RUN OFFICIAL SPIDER EVAL --------
    # Prefer the BART-specific evaluator when present.
    eval_script = project_root / "spider_eval/evaluation.py"
    if (project_root / "spider_eval/evaluation_bart.py").exists():
        eval_script = project_root / "spider_eval/evaluation_bart.py"

    cmd = [
        sys.executable,
        str(eval_script),
        "--gold", str(gold_sql_file),
        "--pred", str(pred_sql_file),
        "--etype", "exec",
        "--db", str(db_root),
        "--table", str(table_json),
    ]

    print(f"\nRunning Spider evaluation using {eval_script.name}...")
    proc = subprocess.run(cmd, capture_output=True, text=True, errors="ignore")

    if proc.returncode != 0:
        print("\nSpider evaluation crashed.")
        print(proc.stderr)
        return

    # Show only the tail of the evaluator's (very verbose) output.
    print("\n--- Spider Eval Output ---")
    print("\n".join(proc.stdout.splitlines()[-20:]))

    acc = parse_exec_accuracy(proc.stdout)
    if acc is not None:
        print(f"\n🎯 Official Execution Accuracy: {acc*100:.2f}%")
    else:
        print("\nCould not parse official accuracy.")


if __name__ == "__main__":
    main()
src/evaluate_without_constraied.py ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # *********** code till task 3 ************
3
+
4
+ # import json
5
+ # import subprocess
6
+ # import sys
7
+ # import argparse
8
+ # import random
9
+ # import sqlite3
10
+ # import time
11
+ # import re
12
+ # import os
13
+ # from pathlib import Path
14
+
15
+ # import torch
16
+ # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
17
+ # from peft import PeftModel
18
+
19
+ # from prompting import encode_prompt
20
+
21
+ # # -------------------------------
22
+ # # NORMALIZATION
23
+ # # -------------------------------
24
+ # def normalize_sql(sql):
25
+ # sql = sql.replace('"', "'")
26
+ # sql = re.sub(r"\s+", " ", sql)
27
+ # return sql.strip().lower().rstrip(";")
28
+
29
+
30
+ # # -------------------------------
31
+ # # 🔥 SAFE RESULT NORMALIZATION (FIX)
32
+ # # -------------------------------
33
+ # def normalize_result(res):
34
+ # try:
35
+ # return sorted([str(r) for r in res])
36
+ # except:
37
+ # return []
38
+
39
+
40
+ # # -------------------------------
41
+ # # EXECUTION CHECK (FIXED)
42
+ # # -------------------------------
43
+ # def check_execution(pred_sql, gold_sql, db_path):
44
+ # try:
45
+ # conn = sqlite3.connect(db_path)
46
+ # conn.text_factory = lambda b: b.decode(errors='ignore')
47
+
48
+ # start_time = time.monotonic()
49
+
50
+ # def timeout_handler():
51
+ # return 1 if (time.monotonic() - start_time) > 2.0 else 0
52
+
53
+ # conn.set_progress_handler(timeout_handler, 10000)
54
+
55
+ # cursor = conn.cursor()
56
+
57
+ # cursor.execute(pred_sql)
58
+ # pred_res = cursor.fetchall()
59
+
60
+ # cursor.execute(gold_sql)
61
+ # gold_res = cursor.fetchall()
62
+
63
+ # conn.close()
64
+
65
+ # # 🔥 FIXED COMPARISON
66
+ # return normalize_result(pred_res) == normalize_result(gold_res)
67
+
68
+ # except Exception:
69
+ # return False
70
+
71
+
72
+ # # -------------------------------
73
+ # # SPIDER PARSER
74
+ # # -------------------------------
75
+ # def _parse_spider_accuracy(stdout: str, metric_type: str):
76
+ # for line in stdout.splitlines():
77
+ # if metric_type == "exec" and line.strip().startswith("execution"):
78
+ # try:
79
+ # return float(line.split()[-1])
80
+ # except:
81
+ # pass
82
+ # elif metric_type == "match" and line.strip().startswith("exact"):
83
+ # try:
84
+ # return float(line.split()[-1])
85
+ # except:
86
+ # pass
87
+ # return None
88
+
89
+
90
+ # # -------------------------------
91
+ # # MAIN
92
+ # # -------------------------------
93
+ # def main():
94
+ # parser = argparse.ArgumentParser()
95
+ # parser.add_argument("--adapter", type=str, required=True)
96
+ # parser.add_argument("--num_samples", type=int, default= 500)
97
+ # parser.add_argument("--shuffle_dev", action="store_true")
98
+ # parser.add_argument("--shuffle_seed", type=int, default=42)
99
+ # args = parser.parse_args()
100
+
101
+ # project_root = Path(__file__).resolve().parents[1]
102
+ # adapter_dir = project_root / args.adapter
103
+
104
+ # db_root = project_root / "data" / "database"
105
+ # table_json = project_root / "data" / "tables.json"
106
+ # dev_json = project_root / "data" / "dev.json"
107
+
108
+ # pred_path = project_root / "temp_predictions.txt"
109
+ # temp_gold_path = project_root / "temp_gold.sql"
110
+
111
+ # if not adapter_dir.exists():
112
+ # raise FileNotFoundError(f"Missing adapter dir: {adapter_dir}")
113
+
114
+ # device = "mps" if torch.backends.mps.is_available() else (
115
+ # "cuda" if torch.cuda.is_available() else "cpu"
116
+ # )
117
+ # print(f"Using device: {device}")
118
+
119
+ # BASE_MODEL = "Salesforce/codet5-base"
120
+ # tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
121
+
122
+ # if tokenizer.pad_token is None:
123
+ # tokenizer.pad_token = tokenizer.eos_token
124
+
125
+ # print(f"\n📦 Loading Model: {args.adapter}")
126
+
127
+ # base = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)
128
+
129
+ # adapter_for_peft = os.path.relpath(adapter_dir, project_root)
130
+
131
+ # model = PeftModel.from_pretrained(
132
+ # base,
133
+ # adapter_for_peft,
134
+ # local_files_only=True
135
+ # ).to(device)
136
+
137
+ # model = model.merge_and_unload()
138
+ # model.eval()
139
+
140
+ # # -------------------------------
141
+ # # LOAD DATA
142
+ # # -------------------------------
143
+ # with dev_json.open() as f:
144
+ # dev = json.load(f)
145
+
146
+ # if args.shuffle_dev:
147
+ # rng = random.Random(args.shuffle_seed)
148
+ # rng.shuffle(dev)
149
+
150
+ # dev = dev[: args.num_samples]
151
+ # total = len(dev)
152
+
153
+ # gen_kwargs = dict(
154
+ # max_new_tokens=160,
155
+ # num_beams=8,
156
+ # length_penalty=0.8,
157
+ # do_sample=False,
158
+ # early_stopping=True,
159
+ # pad_token_id=tokenizer.pad_token_id,
160
+ # eos_token_id=tokenizer.eos_token_id,
161
+ # )
162
+
163
+ # print(f"\n🚀 Evaluating {total} samples...\n")
164
+
165
+ # em_correct = 0
166
+ # ex_correct = 0
167
+
168
+ # with pred_path.open("w") as out_pred, temp_gold_path.open("w") as out_gold, torch.no_grad():
169
+ # for i, ex in enumerate(dev, start=1):
170
+
171
+ # db_id = ex["db_id"]
172
+ # question = ex["question"]
173
+ # gold_query = ex["query"]
174
+ # db_path = db_root / db_id / f"{db_id}.sqlite"
175
+
176
+ # # -------------------------------
177
+ # # GENERATE SQL
178
+ # # -------------------------------
179
+ # input_ids = encode_prompt(
180
+ # tokenizer,
181
+ # question,
182
+ # db_id,
183
+ # device=device,
184
+ # max_input_tokens=512
185
+ # )
186
+
187
+ # input_ids = input_ids.unsqueeze(0).to(device)
188
+ # attention_mask = (input_ids != tokenizer.pad_token_id).long().to(device)
189
+
190
+ # outputs = model.generate(
191
+ # input_ids=input_ids,
192
+ # attention_mask=attention_mask,
193
+ # **gen_kwargs
194
+ # )
195
+
196
+ # pred_sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
197
+
198
+ # # -------------------------------
199
+ # # SAVE FOR SPIDER EVAL
200
+ # # -------------------------------
201
+ # out_pred.write(f"{pred_sql}\n")
202
+ # out_gold.write(f"{gold_query}\t{db_id}\n")
203
+
204
+ # # -------------------------------
205
+ # # LIVE METRICS
206
+ # # -------------------------------
207
+ # if normalize_sql(pred_sql) == normalize_sql(gold_query):
208
+ # em_correct += 1
209
+
210
+ # if check_execution(pred_sql, gold_query, db_path):
211
+ # ex_correct += 1
212
+
213
+ # if i % 20 == 0 or i == total:
214
+ # print(
215
+ # f"Progress: {i}/{total} | "
216
+ # f"EM: {(em_correct/i)*100:.2f}% | "
217
+ # f"EX: {(ex_correct/i)*100:.2f}%"
218
+ # )
219
+
220
+ # print("\n🚀 Running Official Spider Evaluation...\n")
221
+
222
+ # eval_script = project_root / "spider_eval" / "evaluation.py"
223
+
224
+ # # EXACT MATCH
225
+ # cmd_match = [
226
+ # sys.executable, str(eval_script),
227
+ # "--gold", str(temp_gold_path),
228
+ # "--pred", str(pred_path),
229
+ # "--etype", "match",
230
+ # "--db", str(db_root),
231
+ # "--table", str(table_json),
232
+ # ]
233
+
234
+ # proc_match = subprocess.run(cmd_match, capture_output=True, text=True)
235
+ # exact_acc = _parse_spider_accuracy(proc_match.stdout, "match")
236
+
237
+ # # EXECUTION
238
+ # cmd_exec = [
239
+ # sys.executable, str(eval_script),
240
+ # "--gold", str(temp_gold_path),
241
+ # "--pred", str(pred_path),
242
+ # "--etype", "exec",
243
+ # "--db", str(db_root),
244
+ # "--table", str(table_json),
245
+ # ]
246
+
247
+ # proc_exec = subprocess.run(cmd_exec, capture_output=True, text=True)
248
+ # exec_acc = _parse_spider_accuracy(proc_exec.stdout, "exec")
249
+
250
+ # print("==========================================")
251
+ # print(f"🎯 OFFICIAL SPIDER RESULTS FOR: {args.adapter}")
252
+ # print("==========================================")
253
+
254
+ # print(f"Exact Match Accuracy : {exact_acc*100:.2f}%" if exact_acc else "EM parsing failed")
255
+ # print(f"Execution Accuracy : {exec_acc*100:.2f}%" if exec_acc else "EX parsing failed")
256
+
257
+ # print("==========================================\n")
258
+
259
+
260
+ # if __name__ == "__main__":
261
+ # main()
262
+
263
+
264
+
265
+
266
+ # *********** for task 2 ****************************************
267
+ import json
268
+ import argparse
269
+ import random
270
+ import sqlite3
271
+ import re
272
+ import os
273
+ from pathlib import Path
274
+ from collections import defaultdict
275
+
276
+ import torch
277
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
278
+ from peft import PeftModel
279
+
280
+ from prompting import encode_prompt
281
+
282
+ # -------------------------------
283
+ # NORMALIZATION
284
+ # -------------------------------
285
def normalize_sql(sql):
    """Canonicalise a SQL string for exact-match (EM) comparison.

    Double quotes become single quotes, whitespace runs collapse to a single
    space, and the result is lower-cased with any trailing semicolon removed.
    """
    canonical = re.sub(r"\s+", " ", sql.replace('"', "'"))
    return canonical.strip().lower().rstrip(";")
289
+
290
def normalize_result(res):
    """Return sorted row strings so result sets compare order-insensitively.

    Falls back to an empty list when *res* is not iterable (e.g. ``None``
    from a failed execution), so callers can compare unconditionally.
    """
    try:
        return sorted(str(row) for row in res)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed.
        return []
295
+
296
+ # -------------------------------
297
+ # STEP 1: EXECUTION
298
+ # -------------------------------
299
def execute_with_error(sql, db_path):
    """Run *sql* against the SQLite database at *db_path*.

    Returns ``(rows, None)`` on success or ``(None, error_message)`` on
    failure. The connection is always closed, even when execution raises
    (the original leaked it on the error path).
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cur = conn.cursor()
        cur.execute(sql)
        return cur.fetchall(), None
    except Exception as e:
        return None, str(e)
    finally:
        if conn is not None:
            conn.close()
309
+
310
+ # -------------------------------
311
+ # STEP 2: ERROR CLASSIFICATION
312
+ # -------------------------------
313
def classify_error(sql, error_msg):
    """Map an execution error message (or its absence) to a coarse category.

    ``None`` means the query ran; otherwise the first matching substring
    rule on the error text wins, with a final lexical heuristic on the SQL.
    """
    if error_msg is None:
        return "correct"

    message = error_msg.lower()
    query = sql.lower()

    substring_rules = (
        ("syntax", "syntax_error"),
        ("no such table", "wrong_table"),
        ("no such column", "wrong_column"),
        ("ambiguous", "missing_join"),
        ("datatype mismatch", "type_error"),
    )
    for needle, label in substring_rules:
        if needle in message:
            return label

    # Heuristic: comparison operators without a WHERE clause suggest a
    # dropped filter condition.
    if "where" not in query and any(op in query for op in ("=", ">", "<")):
        return "missing_where"

    return "other"
334
+
335
+ # -------------------------------
336
+ # STEP 4: HINTS
337
+ # -------------------------------
338
def generate_hint(error_type):
    """Return a one-line repair hint for *error_type* ("" when unknown)."""
    if error_type == "missing_join":
        return "Try using JOIN between related tables."
    if error_type == "wrong_column":
        return "Check column names in schema."
    if error_type == "missing_where":
        return "Add WHERE condition."
    if error_type == "syntax_error":
        return "Fix SQL syntax."
    if error_type == "wrong_table":
        return "Verify table names."
    if error_type == "type_error":
        return "Check data types."
    if error_type == "other":
        return "Review SQL logic."
    return ""
349
+
350
+ # -------------------------------
351
+ # STEP 2 EXTRA: LIGHT ATTRIBUTION
352
+ # -------------------------------
353
def extract_keywords(question):
    """Extract lower-cased word tokens longer than three characters."""
    tokens = re.findall(r"\w+", question.lower())
    return [token for token in tokens if len(token) > 3]
355
+
356
+ # -------------------------------
357
+ # MAIN
358
+ # -------------------------------
359
def main():
    """Run Task-2 error analysis on the Spider dev set.

    Loads a LoRA-adapted CodeT5 model, generates SQL for the first
    ``--num_samples`` dev questions, executes predictions against each
    question's SQLite database, and prints live/final EM-EX metrics plus
    error, hint, attribution and SQL-operation summaries, ending with a few
    fixed adversarial probes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--adapter", type=str, required=True)
    parser.add_argument("--num_samples", type=int, default=200)
    args = parser.parse_args()

    # Paths resolve relative to the project root (parent of this script's dir).
    project_root = Path(__file__).resolve().parents[1]
    db_root = project_root / "data" / "database"
    dev_json = project_root / "data" / "dev.json"

    # Prefer Apple-Silicon GPU when available, else CPU (no CUDA branch here).
    device = "mps" if torch.backends.mps.is_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")
    base = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-base").to(device)

    # Attach the LoRA adapter from a project-root-relative path, offline only.
    model = PeftModel.from_pretrained(
        base,
        os.path.relpath(project_root / args.adapter, project_root),
        local_files_only=True
    ).to(device)

    # Fold adapter weights into the base model for plain inference.
    model = model.merge_and_unload()
    model.eval()

    with open(dev_json) as f:
        dev = json.load(f)

    dev = dev[:args.num_samples]

    # STORAGE: per-category counters and a few capped example lists.
    error_counter = defaultdict(int)
    error_examples = defaultdict(list)
    success_examples = []
    hint_examples = defaultdict(list)
    operation_counter = defaultdict(int)
    attribution_map = defaultdict(list)

    em, ex = 0, 0  # exact-match / execution-match counts

    print(f"\n🚀 Evaluating {len(dev)} samples...\n")

    # NOTE(review): generation runs without torch.no_grad(); consider
    # wrapping the loop to avoid building autograd state during eval.
    for i, sample in enumerate(dev, 1):

        db_id = sample["db_id"]
        q = sample["question"]
        gold = sample["query"]
        db_path = db_root / db_id / f"{db_id}.sqlite"

        input_ids = encode_prompt(tokenizer, q, db_id, device=device).unsqueeze(0)

        out = model.generate(input_ids=input_ids, max_new_tokens=120, num_beams=8)
        pred = tokenizer.decode(out[0], skip_special_tokens=True).strip()

        # operation analysis: count which SQL constructs the prediction uses
        s = pred.lower()
        if "select" in s: operation_counter["SELECT"] += 1
        if "where" in s: operation_counter["WHERE"] += 1
        if "join" in s: operation_counter["JOIN"] += 1
        if "group by" in s: operation_counter["GROUP_BY"] += 1
        if "order by" in s: operation_counter["ORDER_BY"] += 1

        pred_res, err = execute_with_error(pred, db_path)
        gold_res, _ = execute_with_error(gold, db_path)

        error_type = classify_error(pred, err)
        error_counter[error_type] += 1

        # attribution: keep question keywords that co-occur with each error
        if err:
            attribution_map[error_type].append(extract_keywords(q))

        # examples: keep at most 3 predictions per error category
        if len(error_examples[error_type]) < 3:
            error_examples[error_type].append(pred)

        # hints: pair failing predictions with a canned repair suggestion
        if error_type != "correct":
            hint = generate_hint(error_type)
            if len(hint_examples[error_type]) < 3:
                hint_examples[error_type].append((pred, hint))

        # metrics: EM on normalized text, EX on normalized result sets
        if normalize_sql(pred) == normalize_sql(gold):
            em += 1

        # NOTE(review): truthiness check means empty-but-matching result
        # sets are not counted as execution matches — confirm intended.
        if pred_res and gold_res and normalize_result(pred_res) == normalize_result(gold_res):
            ex += 1
            if len(success_examples) < 5:
                success_examples.append(pred)

        if i % 20 == 0:
            print(f"[{i}] EM: {em/i:.2f} | EX: {ex/i:.2f}")

    # -------------------------------
    # OUTPUT: final metrics and summaries
    # -------------------------------
    print("\n🎯 FINAL RESULTS")
    print(f"EM: {em/len(dev)*100:.2f}%")
    print(f"EX: {ex/len(dev)*100:.2f}%")

    print("\n🔥 ERROR SUMMARY")
    for k, v in error_counter.items():
        print(k, ":", v)

    print("\n🔥 ERROR EXAMPLES")
    for k in error_examples:
        print("\n", k)
        for e in error_examples[k]:
            print(" ", e)

    print("\n🔥 HINTS")
    for k in hint_examples:
        print("\n", k)
        for sql, h in hint_examples[k]:
            print(" ", sql)
            print(" →", h)

    print("\n🔥 ATTRIBUTION (KEYWORDS)")
    for k in attribution_map:
        print(k, ":", attribution_map[k][:3])

    print("\n🔥 SQL OPERATIONS")
    for k, v in operation_counter.items():
        print(k, ":", v)

    # -------------------------------
    # ADVERSARIAL: fixed out-of-distribution prompts against the first dev DB
    # -------------------------------
    print("\n🔥 ADVERSARIAL TESTS")

    adv = [
        "Find most expensive product",
        "Top 3 students by marks",
        "Average salary per department"
    ]

    for q in adv:
        inp = encode_prompt(tokenizer, q, dev[0]["db_id"], device=device).unsqueeze(0)
        out = model.generate(input_ids=inp, max_new_tokens=120)
        print("\nQ:", q)
        print("SQL:", tokenizer.decode(out[0], skip_special_tokens=True))
500
+
501
+
502
# Script entry point: run the full evaluation only when executed directly.
if __name__ == "__main__":
    main()
src/execution_reward copy.py ADDED
@@ -0,0 +1,831 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ # from __future__ import annotations
4
+
5
+ # import hashlib
6
+ # import os
7
+ # import queue
8
+ # import re
9
+ # import sqlite3
10
+ # import threading
11
+ # import time
12
+ # from concurrent.futures import ThreadPoolExecutor, as_completed
13
+ # from dataclasses import dataclass
14
+ # from typing import Dict, List, Optional, Sequence, Set, Tuple, Union
15
+
16
+ # # --- CACHE CONTROL ---
17
+ # USE_CACHE = True
18
+ # _REWARD_CACHE: Dict[str, float] = {}
19
+
20
+ # def set_use_cache(enabled: bool):
21
+ # """Dynamically toggle the reward cache for benchmarks."""
22
+ # global USE_CACHE
23
+ # USE_CACHE = enabled
24
+
25
+ # def _normalize_sql(sql: str) -> str:
26
+ # if not isinstance(sql, str):
27
+ # return ""
28
+ # s = sql.strip()
29
+ # if s.startswith("```"):
30
+ # s = re.sub(r"^```[a-zA-Z0-9_+-]*\n?", "", s).strip()
31
+ # s = re.sub(r"\n?```$", "", s).strip()
32
+ # if s.lower().startswith("sql:"):
33
+ # s = s[4:].strip()
34
+ # if ";" in s:
35
+ # s = s.split(";", 1)[0].strip()
36
+ # return s
37
+
38
+ # def _connect_readonly(db_path: str) -> sqlite3.Connection:
39
+ # uri = f"file:{os.path.abspath(db_path)}?mode=ro"
40
+ # conn = sqlite3.connect(uri, uri=True, check_same_thread=False)
41
+ # conn.execute("PRAGMA query_only = ON;")
42
+ # conn.execute("PRAGMA foreign_keys = ON;")
43
+ # return conn
44
+
45
+ # DEFAULT_QUERY_TIMEOUT_S = 2.0
46
+
47
+ # def _with_timeout(conn: sqlite3.Connection, timeout_s: float = DEFAULT_QUERY_TIMEOUT_S) -> None:
48
+ # start = time.monotonic()
49
+ # def _handler() -> int:
50
+ # return 1 if (time.monotonic() - start) > timeout_s else 0
51
+ # conn.set_progress_handler(_handler, 10_000)
52
+
53
+ # def _list_tables(conn: sqlite3.Connection) -> List[str]:
54
+ # try:
55
+ # cur = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';")
56
+ # return [r[0] for r in cur.fetchall() if r and isinstance(r[0], str)]
57
+ # except sqlite3.Error:
58
+ # return []
59
+
60
+ # def _contains_table_name(sql: str, table_names: Sequence[str]) -> bool:
61
+ # s = sql.lower()
62
+ # for t in table_names:
63
+ # tl = t.lower()
64
+ # if not tl:
65
+ # continue
66
+ # if re.search(rf"\b{re.escape(tl)}\b", s):
67
+ # return True
68
+ # return False
69
+
70
+ # def _explain_query_plan(conn: sqlite3.Connection, sql: str) -> bool:
71
+ # try:
72
+ # _with_timeout(conn, timeout_s=DEFAULT_QUERY_TIMEOUT_S)
73
+ # conn.execute(f"EXPLAIN QUERY PLAN {sql}")
74
+ # return True
75
+ # except sqlite3.Error:
76
+ # return False
77
+
78
+ # def _execute(conn: sqlite3.Connection, sql: str, max_rows: int = 1000) -> Tuple[bool, List[Tuple], Optional[str]]:
79
+ # try:
80
+ # _with_timeout(conn, timeout_s=DEFAULT_QUERY_TIMEOUT_S)
81
+ # cur = conn.execute(sql)
82
+ # rows = cur.fetchmany(max_rows)
83
+ # norm_rows = [tuple(r) for r in rows]
84
+ # return True, norm_rows, None
85
+ # except sqlite3.Error as e:
86
+ # return False, [], str(e)
87
+
88
+ # _SQL_KEYWORDS_TO_IGNORE = {
89
+ # "select", "from", "where", "join", "inner", "left", "right", "full", "outer",
90
+ # "on", "group", "by", "order", "limit", "having", "distinct", "union", "intersect",
91
+ # "except", "as", "and", "or", "not", "in", "is", "null", "like", "between", "case",
92
+ # "when", "then", "else", "end", "asc", "desc"
93
+ # }
94
+
95
+ # _SQL_FUNCTIONS_TO_IGNORE = {
96
+ # "count", "avg", "min", "max", "sum", "lower", "upper", "substr", "coalesce",
97
+ # "round", "date", "datetime", "strftime"
98
+ # }
99
+
100
+ # # --- LIGHTWEIGHT PARSING ---
101
+ # def is_valid_select(sql: str):
102
+ # sql = sql.strip().lower()
103
+ # return sql.startswith("select") or sql.startswith("with")
104
+
105
+ # def extract_tables(sql: str) -> List[str]:
106
+ # sql = sql.lower()
107
+ # if "join" not in sql:
108
+ # tables = re.findall(r'from\s+(\w+)', sql)
109
+ # return list(set(tables))
110
+
111
+ # tables = re.findall(r'from\s+([a-zA-Z_][a-zA-Z0-9_]*)', sql)
112
+ # joins = re.findall(r'join\s+([a-zA-Z_][a-zA-Z0-9_]*)', sql)
113
+ # return list(set(tables + joins))
114
+
115
+ # def extract_columns(sql: str) -> List[str]:
116
+ # sql = sql.lower()
117
+ # match = re.search(r'select\s+(.*?)\s+from', sql)
118
+ # if not match:
119
+ # return []
120
+ # cols = match.group(1)
121
+ # if cols.strip() == "*":
122
+ # return ["*"]
123
+ # return [c.strip() for c in cols.split(",")]
124
+
125
+ # def _get_db_tables_and_columns(conn: sqlite3.Connection) -> Tuple[Set[str], Set[str]]:
126
+ # tables = set()
127
+ # columns = set()
128
+ # for t in _list_tables(conn):
129
+ # tl = t.lower()
130
+ # if not tl:
131
+ # continue
132
+ # tables.add(tl)
133
+ # try:
134
+ # cur = conn.execute(f'PRAGMA table_info("{t}")')
135
+ # for row in cur.fetchall():
136
+ # if row and isinstance(row[1], str):
137
+ # columns.add(row[1].lower())
138
+ # except sqlite3.Error:
139
+ # continue
140
+ # return tables, columns
141
+
142
+ # def _safe_results_equal(a: List[Tuple], b: List[Tuple]) -> bool:
143
+ # return a == b
144
+
145
+ # @dataclass
146
+ # class RewardDebugStats:
147
+ # total: int = 0
148
+ # parsed_ok: int = 0
149
+ # table_match: int = 0
150
+ # column_match: int = 0
151
+ # executed_ok: int = 0
152
+ # exact_match: int = 0
153
+
154
+ # _DEBUG = RewardDebugStats()
155
+
156
+ # def reset_debug_metrics() -> None:
157
+ # global _DEBUG
158
+ # _DEBUG = RewardDebugStats()
159
+
160
+ # def get_debug_metrics() -> dict:
161
+ # denom = max(_DEBUG.total, 1)
162
+ # return {
163
+ # "valid_sql_rate": _DEBUG.parsed_ok / denom,
164
+ # "table_match_rate": _DEBUG.table_match / denom,
165
+ # "column_match_rate": _DEBUG.column_match / denom,
166
+ # "execution_accuracy": _DEBUG.exact_match / denom,
167
+ # }
168
+
169
+ # EXECUTION_ERROR = "EXECUTION_ERROR"
170
+
171
+ # _RESULT_CACHE_LOCK = threading.Lock()
172
+ # _RESULT_CACHE: "Dict[str, Union[List[Tuple], str]]" = {}
173
+ # _RESULT_CACHE_MAX = 100_000
174
+
175
+ # def clear_result_cache() -> None:
176
+ # """Clear both DB query cache and reward cache."""
177
+ # with _RESULT_CACHE_LOCK:
178
+ # _RESULT_CACHE.clear()
179
+ # _REWARD_CACHE.clear()
180
+
181
+ # def _db_state_fingerprint(db_path: str) -> str:
182
+ # try:
183
+ # st = os.stat(db_path)
184
+ # return f"{st.st_mtime_ns}:{st.st_size}"
185
+ # except OSError:
186
+ # return "missing"
187
+
188
+ # def _result_cache_key(db_path: str, sql: str) -> str:
189
+ # fp = _db_state_fingerprint(db_path)
190
+ # payload = f"{fp}\0{sql}".encode("utf-8", errors="ignore")
191
+ # return hashlib.sha256(payload).hexdigest()
192
+
193
+ # class _ConnectionPool:
194
+ # def __init__(self, db_path: str, maxsize: int = 1) -> None:
195
+ # self.db_path = db_path
196
+ # self.pool = queue.LifoQueue(maxsize=maxsize)
197
+ # self.lock = threading.Lock()
198
+
199
+ # def acquire(self) -> sqlite3.Connection:
200
+ # try:
201
+ # return self.pool.get_nowait()
202
+ # except queue.Empty:
203
+ # with self.lock:
204
+ # try:
205
+ # return self.pool.get_nowait()
206
+ # except queue.Empty:
207
+ # return _connect_readonly(self.db_path)
208
+
209
+ # def release(self, conn: sqlite3.Connection) -> None:
210
+ # try:
211
+ # self.pool.put_nowait(conn)
212
+ # except queue.Full:
213
+ # try:
214
+ # conn.close()
215
+ # except Exception:
216
+ # pass
217
+
218
+ # _POOL_LOCK = threading.Lock()
219
+ # _POOLS: Dict[str, _ConnectionPool] = {}
220
+
221
+ # def _get_pool(db_path: str) -> _ConnectionPool:
222
+ # with _POOL_LOCK:
223
+ # pool = _POOLS.get(db_path)
224
+ # if pool is None:
225
+ # pool = _ConnectionPool(db_path=db_path, maxsize=1)
226
+ # _POOLS[db_path] = pool
227
+ # return pool
228
+
229
+ # class _PooledConnection:
230
+ # def __init__(self, db_path: str) -> None:
231
+ # self.db_path = db_path
232
+ # self.pool = _get_pool(db_path)
233
+ # self.conn: Optional[sqlite3.Connection] = None
234
+
235
+ # def __enter__(self) -> sqlite3.Connection:
236
+ # self.conn = self.pool.acquire()
237
+ # return self.conn
238
+
239
+ # def __exit__(self, exc_type, exc, tb) -> None:
240
+ # if self.conn is not None:
241
+ # self.pool.release(self.conn)
242
+ # self.conn = None
243
+
244
+ # def _cache_get(key: str) -> Optional[Union[List[Tuple], str]]:
245
+ # with _RESULT_CACHE_LOCK:
246
+ # return _RESULT_CACHE.get(key)
247
+
248
+ # def _cache_put(key: str, value: Union[List[Tuple], str]) -> None:
249
+ # with _RESULT_CACHE_LOCK:
250
+ # if len(_RESULT_CACHE) >= _RESULT_CACHE_MAX:
251
+ # _RESULT_CACHE.clear()
252
+ # _RESULT_CACHE[key] = value
253
+
254
+ # def execute_sql(conn: sqlite3.Connection, sql: str, *, max_rows: int = 1000) -> Union[List[Tuple], str]:
255
+ # try:
256
+ # _with_timeout(conn, timeout_s=DEFAULT_QUERY_TIMEOUT_S)
257
+ # cur = conn.execute(sql)
258
+ # rows = cur.fetchmany(max_rows)
259
+ # return [tuple(r) for r in rows]
260
+ # except Exception:
261
+ # return EXECUTION_ERROR
262
+
263
+ # def execute_sql_cached(db_path: str, sql: str, *, max_rows: int = 1000) -> Union[List[Tuple], str]:
264
+ # if not USE_CACHE:
265
+ # with _PooledConnection(db_path) as conn:
266
+ # return execute_sql(conn, sql, max_rows=max_rows)
267
+
268
+ # key = _result_cache_key(db_path, sql)
269
+ # cached = _cache_get(key)
270
+ # if cached is not None:
271
+ # return cached
272
+ # with _PooledConnection(db_path) as conn:
273
+ # res = execute_sql(conn, sql, max_rows=max_rows)
274
+ # _cache_put(key, res)
275
+ # return res
276
+
277
+ # def execution_reward_timed(
278
+ # pred_sql: str, db_path: str, gold_sql: str, *, measure_plan: bool = False,
279
+ # ) -> Tuple[float, Dict[str, float]]:
280
+ # timings = {"parse_s": 0.0, "plan_s": 0.0, "exec_s": 0.0}
281
+ # t0 = time.perf_counter()
282
+ # sql = _normalize_sql(pred_sql)
283
+ # gold = _normalize_sql(gold_sql)
284
+
285
+ # if not is_valid_select(sql):
286
+ # timings["parse_s"] = time.perf_counter() - t0
287
+ # return 0.0, timings
288
+
289
+ # t1 = time.perf_counter()
290
+ # timings["parse_s"] = t1 - t0
291
+
292
+ # if measure_plan:
293
+ # with _PooledConnection(db_path) as conn:
294
+ # p0 = time.perf_counter()
295
+ # _explain_query_plan(conn, sql)
296
+ # _explain_query_plan(conn, gold)
297
+ # timings["plan_s"] = time.perf_counter() - p0
298
+
299
+ # e0 = time.perf_counter()
300
+ # pred_res = execute_sql_cached(db_path, sql)
301
+ # if pred_res == EXECUTION_ERROR:
302
+ # timings["exec_s"] = time.perf_counter() - e0
303
+ # return 0.0, timings
304
+ # gold_res = execute_sql_cached(db_path, gold)
305
+ # timings["exec_s"] = time.perf_counter() - e0
306
+ # if gold_res == EXECUTION_ERROR:
307
+ # return 0.0, timings
308
+
309
+ # reward = -0.2
310
+ # reward += 0.2
311
+ # if _safe_results_equal(pred_res, gold_res):
312
+ # return 1.0, timings
313
+ # return max(-1.0, min(1.0, reward)), timings
314
+
315
+ # def execution_reward(pred_sql: str, db_path: str, gold_sql: str) -> float:
316
+ # try:
317
+ # sql = _normalize_sql(pred_sql)
318
+ # gold = _normalize_sql(gold_sql)
319
+
320
+ # if not is_valid_select(sql):
321
+ # return -1.0
322
+
323
+ # reward = -0.2
324
+
325
+ # pred_tables = set(extract_tables(sql))
326
+ # gold_tables = set(extract_tables(gold))
327
+
328
+ # if pred_tables == gold_tables and len(gold_tables) > 0:
329
+ # reward += 0.3
330
+
331
+ # pred_cols = set(extract_columns(sql))
332
+ # gold_cols = set(extract_columns(gold))
333
+
334
+ # if gold_cols:
335
+ # overlap = len(pred_cols & gold_cols) / len(gold_cols)
336
+ # reward += 0.3 * overlap
337
+
338
+ # pred_res = execute_sql_cached(db_path, sql)
339
+ # if pred_res == EXECUTION_ERROR:
340
+ # return 0.0
341
+ # reward += 0.2
342
+
343
+ # gold_res = execute_sql_cached(db_path, gold)
344
+ # if gold_res == EXECUTION_ERROR:
345
+ # return 0.0
346
+ # if _safe_results_equal(pred_res, gold_res):
347
+ # return 1.0
348
+
349
+ # return max(-1.0, min(1.0, reward))
350
+
351
+ # except Exception:
352
+ # return 0.0
353
+
354
+ # def cached_execution_reward(pred_sql: str, db_path: str, gold_sql: str) -> float:
355
+ # if not USE_CACHE:
356
+ # return execution_reward(pred_sql, db_path, gold_sql)
357
+
358
+ # key = f"{db_path}|{pred_sql}|{gold_sql}"
359
+ # if key not in _REWARD_CACHE:
360
+ # _REWARD_CACHE[key] = execution_reward(pred_sql, db_path, gold_sql)
361
+ # return _REWARD_CACHE[key]
362
+
363
+ # def execution_reward_batch_sequential(rollouts: Sequence[Tuple[str, str, str]]) -> List[float]:
364
+ # return [cached_execution_reward(pred_sql, db_path, gold_sql) for pred_sql, db_path, gold_sql in rollouts]
365
+
366
+ # def execution_reward_batch_parallel(rollouts: Sequence[Tuple[str, str, str]], *, max_workers: int = 20) -> List[float]:
367
+ # if not rollouts:
368
+ # return []
369
+
370
+ # unique_dbs = {db_path for _, db_path, _ in rollouts}
371
+ # worker_count = max(1, min(max_workers, len(unique_dbs)))
372
+ # results: List[Optional[float]] = [None] * len(rollouts)
373
+
374
+ # with ThreadPoolExecutor(max_workers=worker_count) as executor:
375
+ # futures = {
376
+ # executor.submit(cached_execution_reward, pred_sql, db_path, gold_sql): i
377
+ # for i, (pred_sql, db_path, gold_sql) in enumerate(rollouts)
378
+ # }
379
+ # for fut in as_completed(futures):
380
+ # idx = futures[fut]
381
+ # try:
382
+ # results[idx] = float(fut.result())
383
+ # except Exception:
384
+ # results[idx] = 0.0
385
+
386
+ # return [r if r is not None else 0.0 for r in results]
387
+
388
+ from __future__ import annotations
389
+
390
+ import os
391
+ import re
392
+ import sqlite3
393
+ import threading
394
+ import time
395
+ import json
396
+ from concurrent.futures import ThreadPoolExecutor, as_completed
397
+ from dataclasses import dataclass
398
+ from typing import Dict, List
399
+
400
+ from src.sql_validator import validate_sql_schema
401
+
402
# =========================================================
# 🔥 CONFIG FLAGS
# =========================================================
# Module-wide toggles; flipped at runtime via the set_use_* helpers below.
USE_SCHEMA_VALIDATION = True   # gate schema checks (validate_sql_schema)
USE_CACHE = True               # enable query/reward result memoisation
DEFAULT_QUERY_TIMEOUT_S = 2.0  # per-statement SQLite timeout, in seconds

# Sentinel string returned by execute_sql* when a query fails for any reason.
EXECUTION_ERROR = "EXECUTION_ERROR"

# Memoised rewards keyed by "db_path|pred_sql|gold_sql".
_REWARD_CACHE: Dict[str, float] = {}

# =========================================================
# 🔥 TASK 2: ERROR ANALYSIS + LOGGING
# =========================================================
# JSON file (relative to the working directory) that log_error() appends to.
ERROR_LOG_FILE = "results/error_logs.json"
417
+
418
+
419
def classify_error(sql: str) -> str:
    """Bucket a generated SQL string into a coarse failure category.

    Purely lexical heuristics over the lower-cased query text; the first
    matching rule wins, otherwise "other".
    """
    lowered = sql.lower()

    if "join" in lowered and " on " not in lowered:
        return "missing_join"
    if "where" in lowered and not any(op in lowered for op in ("=", ">", "<")):
        return "wrong_where"
    if "null" in lowered:
        return "null_handling"
    if "group by" in lowered and "count" not in lowered:
        return "wrong_groupby"
    return "other"
435
+
436
+
437
def get_hint(error_type: str) -> str:
    """Return a one-line repair suggestion for an error category."""
    if error_type == "missing_join":
        return "Add proper JOIN condition using ON."
    if error_type == "wrong_where":
        return "Check WHERE clause conditions."
    if error_type == "null_handling":
        return "Handle NULL values using IS NULL."
    if error_type == "wrong_groupby":
        return "Use aggregation functions with GROUP BY."
    if error_type == "other":
        return "Check SQL syntax and logic."
    # Unknown category: generic fallback.
    return "Check query."
446
+
447
+
448
def log_error(question: str, sql: str, error: str, error_type: str):
    """Append one structured error record to ERROR_LOG_FILE.

    The record carries the question, generated SQL, raw error text, the
    classified error type and a wall-clock timestamp. A missing, corrupt or
    non-list log file now starts a fresh list instead of crashing the run
    (the original raised on invalid JSON).
    """
    os.makedirs("results", exist_ok=True)

    entry = {
        "question": question,
        "sql": sql,
        "error": error,
        "error_type": error_type,
        "timestamp": time.time()
    }

    logs = []
    if os.path.exists(ERROR_LOG_FILE):
        try:
            with open(ERROR_LOG_FILE, "r") as f:
                logs = json.load(f)
        except (json.JSONDecodeError, OSError):
            # Corrupt or unreadable log file: restart rather than abort.
            logs = []
        if not isinstance(logs, list):
            logs = []

    logs.append(entry)

    with open(ERROR_LOG_FILE, "w") as f:
        json.dump(logs, f, indent=2)
469
+
470
+ # =========================================================
471
+ # CACHE/VALIDATION TOGGLES (Task 1)
472
+ # =========================================================
473
def set_use_cache(enabled: bool) -> None:
    """Toggle the module-wide result/reward caching flag."""
    global USE_CACHE
    USE_CACHE = True if enabled else False
476
+
477
+
478
def set_use_schema_validation(enabled: bool) -> None:
    """Toggle the module-wide schema-validation flag."""
    global USE_SCHEMA_VALIDATION
    USE_SCHEMA_VALIDATION = True if enabled else False
481
+
482
+
483
+ # =========================================================
484
+ # SQL CLEANING
485
+ # =========================================================
486
def _normalize_sql(sql: str) -> str:
    """Strip markdown fences, a leading "sql:" tag, and extra statements.

    Non-string input yields "". Only the text before the first semicolon is
    kept, so at most one statement survives normalisation.
    """
    if not isinstance(sql, str):
        return ""

    cleaned = sql.strip()

    # Drop a surrounding markdown code fence, e.g. ```sql ... ```
    if cleaned.startswith("```"):
        cleaned = re.sub(r"^```[a-zA-Z0-9_+-]*\n?", "", cleaned).strip()
        cleaned = re.sub(r"\n?```$", "", cleaned).strip()

    # Remove a leading "sql:" prefix (case-insensitive).
    if cleaned.lower().startswith("sql:"):
        cleaned = cleaned[4:].strip()

    # Keep only the first statement.
    head, sep, _ = cleaned.partition(";")
    return head.strip() if sep else cleaned
502
+
503
+
504
+ # =========================================================
505
+ # DB EXECUTION
506
+ # =========================================================
507
+ def _connect_readonly(db_path: str):
508
+ uri = f"file:{os.path.abspath(db_path)}?mode=ro"
509
+ conn = sqlite3.connect(uri, uri=True, check_same_thread=False)
510
+ conn.execute("PRAGMA query_only = ON;")
511
+ conn.execute("PRAGMA foreign_keys = ON;")
512
+ return conn
513
+
514
+
515
def _with_timeout(conn: sqlite3.Connection, timeout_s: float = DEFAULT_QUERY_TIMEOUT_S):
    """Install a progress handler that aborts statements running past *timeout_s*.

    SQLite invokes the handler every 10k VM ops; a non-zero return value
    interrupts the currently running statement.
    """
    deadline = time.monotonic() + timeout_s

    def _abort_if_late():
        return 1 if time.monotonic() > deadline else 0

    conn.set_progress_handler(_abort_if_late, 10_000)
522
+
523
+
524
def execute_sql(conn, sql):
    """Run *sql* on *conn* under the default timeout.

    Returns the fetched rows, or the EXECUTION_ERROR sentinel on any failure
    (syntax error, missing table, timeout interrupt, ...).
    """
    try:
        _with_timeout(conn, timeout_s=DEFAULT_QUERY_TIMEOUT_S)
        return conn.execute(sql).fetchall()
    except Exception:
        # Deliberate catch-all: callers only care about success vs failure.
        return EXECUTION_ERROR
531
+
532
+
533
# Process-wide memo of executed query results, keyed by "db_path|sql".
# Guarded by _RESULT_LOCK because rewards are computed from thread pools.
_RESULT_CACHE = {}
_RESULT_LOCK = threading.Lock()
535
+
536
+
537
def execute_sql_cached(db_path, sql):
    """Execute *sql* against *db_path* on a fresh read-only connection.

    When USE_CACHE is on, results are memoised in the shared _RESULT_CACHE
    under the key "db_path|sql".
    """
    cache_key = f"{db_path}|{sql}"

    if USE_CACHE:
        with _RESULT_LOCK:
            try:
                return _RESULT_CACHE[cache_key]
            except KeyError:
                pass

    connection = _connect_readonly(db_path)
    try:
        rows = execute_sql(connection, sql)
    finally:
        connection.close()

    if USE_CACHE:
        with _RESULT_LOCK:
            _RESULT_CACHE[cache_key] = rows

    return rows
554
+
555
+
556
def execute_sql_cached_conn(conn: sqlite3.Connection, db_path: str, sql: str):
    """Cached execution that reuses an already-open connection.

    Same caching contract as execute_sql_cached(); intended for the
    one-thread-per-DB workloads of Task 1 where each worker owns *conn*.
    """
    cache_key = f"{db_path}|{sql}"

    if USE_CACHE:
        with _RESULT_LOCK:
            try:
                return _RESULT_CACHE[cache_key]
            except KeyError:
                pass

    rows = execute_sql(conn, sql)

    if USE_CACHE:
        with _RESULT_LOCK:
            _RESULT_CACHE[cache_key] = rows

    return rows
574
+
575
+
576
def clear_result_cache() -> None:
    """Drop every memoised execution result and cached reward.

    Note: the original `global` declaration was unnecessary (the dicts are
    mutated in place, never rebound) and has been removed.
    """
    with _RESULT_LOCK:
        for cache in (_RESULT_CACHE, _REWARD_CACHE):
            cache.clear()
581
+
582
+
583
+ # =========================================================
584
+ # SQL PARSING
585
+ # =========================================================
586
def is_valid_select(sql):
    """True when the statement starts with SELECT or WITH (a CTE)."""
    return sql.lower().startswith(("select", "with"))
588
+
589
+
590
def extract_tables(sql):
    """Return every identifier that directly follows a FROM keyword.

    Note: this is a heuristic — JOINed tables are intentionally not matched.
    """
    from_pattern = re.compile(r'from\s+(\w+)')
    return from_pattern.findall(sql.lower())
592
+
593
+
594
def extract_columns(sql):
    """Return the projection list of the first SELECT ... FROM clause.

    Yields ["*"] for a bare star, [] when no SELECT/FROM pair is found.
    """
    projection = re.search(r'select\s+(.*?)\s+from', sql.lower())
    if projection is None:
        return []
    raw = projection.group(1)
    if raw.strip() == "*":
        return ["*"]
    return [column.strip() for column in raw.split(",")]
600
+
601
+
602
def get_sql_operations(sql: str):
    """List the coarse SQL operations present in *sql*, in a fixed order.

    Detection is by case-insensitive substring match, so e.g. "whereabouts"
    would register as WHERE — acceptable for component analysis.
    """
    lowered = sql.lower()
    keyword_to_op = [
        ("select", "SELECT"),
        ("where", "WHERE"),
        ("join", "JOIN"),
        ("group by", "GROUP_BY"),
        ("order by", "ORDER_BY"),
    ]
    return [op for keyword, op in keyword_to_op if keyword in lowered]
613
+
614
+
615
def _explain_query_plan(conn: sqlite3.Connection, sql: str) -> bool:
    """Check whether SQLite can plan *sql* without fully executing it."""
    try:
        _with_timeout(conn, timeout_s=DEFAULT_QUERY_TIMEOUT_S)
        conn.execute(f"EXPLAIN QUERY PLAN {sql}")
    except Exception:
        return False
    return True
622
+
623
+
624
def execution_reward_timed(pred_sql: str, db_path: str, gold_sql: str, measure_plan: bool = False):
    """Compute the execution reward while profiling its phases.

    Returns (reward, timings) where timings has keys parse_s, plan_s, exec_s.
    Used by the Task-1 benchmark to locate bottlenecks.
    """
    timings = {"parse_s": 0.0, "plan_s": 0.0, "exec_s": 0.0}
    parse_start = time.perf_counter()

    pred = _normalize_sql(pred_sql)
    gold = _normalize_sql(gold_sql)

    if not is_valid_select(pred):
        timings["parse_s"] = time.perf_counter() - parse_start
        return 0.0, timings

    timings["parse_s"] = time.perf_counter() - parse_start

    conn = _connect_readonly(db_path)
    try:
        if measure_plan:
            plan_start = time.perf_counter()
            _explain_query_plan(conn, pred)
            _explain_query_plan(conn, gold)
            timings["plan_s"] = time.perf_counter() - plan_start

        exec_start = time.perf_counter()
        pred_rows = execute_sql_cached_conn(conn, db_path, pred)
        if pred_rows == EXECUTION_ERROR:
            timings["exec_s"] = time.perf_counter() - exec_start
            return 0.0, timings

        gold_rows = execute_sql_cached_conn(conn, db_path, gold)
        timings["exec_s"] = time.perf_counter() - exec_start
        if gold_rows == EXECUTION_ERROR:
            return 0.0, timings

        if pred_rows == gold_rows:
            return 1.0, timings
        # Base penalty (-0.2) plus executable bonus (+0.2) nets out to 0.0,
        # and the clamp to [-1, 1] is a no-op — return the constant directly.
        return 0.0, timings
    finally:
        try:
            conn.close()
        except Exception:
            pass
669
+
670
+
671
+ # =========================================================
672
+ # 🔥 FINAL REWARD FUNCTION (TASK 2 INTEGRATED)
673
+ # =========================================================
674
def execution_reward(pred_sql: str, db_path: str, gold_sql: str) -> float:
    """Score a predicted SQL query against the gold query by execution.

    Reward scale:
      -1.0  prediction is not a SELECT/WITH statement
       0.1  prediction fails schema validation or execution (a failing gold
            query yields the same neutral score)
       1.0  predicted and gold result sets match exactly
       0.0  both executed but results differ (base -0.2 + executable +0.2)

    Any unexpected exception is logged via log_error() and scored 0.0.
    """
    try:
        sql = _normalize_sql(pred_sql)
        gold = _normalize_sql(gold_sql)

        if not is_valid_select(sql):
            return -1.0

        reward = -0.2

        # =========================
        # SCHEMA VALIDATION (Task 3)
        # =========================
        if USE_SCHEMA_VALIDATION:
            valid, _ = validate_sql_schema(sql, db_path)
            if not valid:
                error_type = classify_error(sql)
                log_error("UNKNOWN", sql, "schema_invalid", error_type)
                return 0.1

        # =========================
        # EXECUTION
        # =========================
        pred_res = execute_sql_cached(db_path, sql)

        # FIX: compare against the EXECUTION_ERROR sentinel, as every other
        # function in this module does — the original compared against the
        # string literal "EXECUTION_ERROR", which silently never matches if
        # the sentinel is not that exact string.
        if pred_res == EXECUTION_ERROR:
            error_type = classify_error(sql)

            log_error(
                question="UNKNOWN",
                sql=sql,
                error="execution_error",
                error_type=error_type
            )

            print(f"[ERROR] {error_type}")
            print(f"[HINT] {get_hint(error_type)}")

            return 0.1

        reward += 0.2

        gold_res = execute_sql_cached(db_path, gold)

        if gold_res == EXECUTION_ERROR:
            return 0.1

        if pred_res == gold_res:
            return 1.0

        return max(-1.0, min(1.0, reward))

    except Exception as e:
        log_error("UNKNOWN", pred_sql, str(e), "runtime_error")
        return 0.0
729
+
730
+
731
+ # =========================================================
732
+ # BATCH EXECUTION (Task 1)
733
+ # =========================================================
734
def cached_execution_reward(pred_sql: str, db_path: str, gold_sql: str) -> float:
    """Reward computation memoised per (db_path, pred, gold) triple.

    Falls through to a plain execution_reward() call when USE_CACHE is off.
    """
    if not USE_CACHE:
        return float(execution_reward(pred_sql, db_path, gold_sql))

    memo_key = f"{db_path}|{pred_sql}|{gold_sql}"
    try:
        return float(_REWARD_CACHE[memo_key])
    except KeyError:
        value = float(execution_reward(pred_sql, db_path, gold_sql))
        _REWARD_CACHE[memo_key] = value
        return value
743
+
744
+
745
def execution_reward_batch_sequential(rollouts):
    """Score (pred_sql, db_path, gold_sql) rollouts one after another."""
    rewards = []
    for pred, db_path, gold in rollouts:
        rewards.append(cached_execution_reward(pred, db_path, gold))
    return rewards
747
+
748
+
749
def execution_reward_batch_parallel(rollouts, max_workers=10):
    """Score rollouts concurrently; result order matches the input order.

    A worker that raises contributes a neutral 0.0 reward.
    """
    rewards = [0.0] * len(rollouts)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for position, (pred, db_path, gold) in enumerate(rollouts):
            future = pool.submit(cached_execution_reward, pred, db_path, gold)
            pending[future] = position

        for done in as_completed(pending):
            position = pending[done]
            try:
                rewards[position] = done.result()
            except Exception:
                rewards[position] = 0.0

    return rewards
766
+
767
+
768
def execution_reward_batch_parallel_by_db(rollouts, max_workers: int = 20):
    """
    1 thread per DB path. Reuses a single readonly connection per DB worker.
    Preserves input order.

    rollouts: iterable of (pred_sql, db_path, gold_sql) triples.
    Returns a list of float rewards aligned with the input order.

    NOTE(review): the reward logic in _reward_with_conn mirrors
    execution_reward() and must be kept in sync with it; the only intended
    difference is connection reuse (execute_sql_cached_conn) and the absence
    of the console ERROR/HINT prints.
    """
    if not rollouts:
        return []

    # Group rollouts by database so each worker owns exactly one connection.
    # Each entry keeps its original index so output order can be preserved.
    by_db = {}
    for idx, (pred_sql, db_path, gold_sql) in enumerate(rollouts):
        by_db.setdefault(db_path, []).append((idx, pred_sql, gold_sql))

    # Pre-sized result slots; workers write disjoint indices, so no lock needed.
    results = [0.0 for _ in range(len(rollouts))]

    def _reward_with_conn(conn: sqlite3.Connection, pred_sql: str, db_path: str, gold_sql: str) -> float:
        # Same reward scale as execution_reward(): -1.0 invalid statement,
        # 0.1 schema/execution failure, 1.0 exact result match, 0.0 mismatch.
        try:
            sql = _normalize_sql(pred_sql)
            gold = _normalize_sql(gold_sql)

            if not is_valid_select(sql):
                return -1.0

            reward = -0.2

            if USE_SCHEMA_VALIDATION:
                valid, _ = validate_sql_schema(sql, db_path)
                if not valid:
                    error_type = classify_error(sql)
                    log_error("UNKNOWN", sql, "schema_invalid", error_type)
                    return 0.1

            pred_res = execute_sql_cached_conn(conn, db_path, sql)
            if pred_res == EXECUTION_ERROR:
                error_type = classify_error(sql)
                log_error("UNKNOWN", sql, "execution_error", error_type)
                return 0.1

            reward += 0.2
            gold_res = execute_sql_cached_conn(conn, db_path, gold)
            if gold_res == EXECUTION_ERROR:
                return 0.1
            if pred_res == gold_res:
                return 1.0
            return max(-1.0, min(1.0, reward))
        except Exception:
            # Any unexpected failure scores a neutral 0.0 for this rollout.
            return 0.0

    def _worker(db_path: str, items):
        # One read-only connection per database, shared by all its rollouts.
        conn = _connect_readonly(db_path)
        try:
            for idx, pred, gold in items:
                results[idx] = _reward_with_conn(conn, pred, db_path, gold)
        finally:
            try:
                conn.close()
            except Exception:
                pass

    with ThreadPoolExecutor(max_workers=int(max_workers)) as ex:
        futures = [ex.submit(_worker, db_path, items) for db_path, items in by_db.items()]
        # .result() re-raises any worker exception instead of hiding it.
        for fut in as_completed(futures):
            fut.result()

    return results