Spaces:
Sleeping
Sleeping
File size: 2,326 Bytes
9b2cded | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | """Lightweight structural SQL features for the classification head."""
from __future__ import annotations
import re
from typing import List
AGG_FUNCS = ("COUNT", "SUM", "AVG", "MAX", "MIN")
WINDOW_FUNCS = ("ROW_NUMBER", "RANK", "DENSE_RANK", "OVER")
def _upper(sql: str) -> str:
return sql.upper()
def extract_sql_features(student_query: str, correct_query: str = "") -> List[float]:
"""
Rule-based signals that complement semantic embeddings.
Returns a fixed-length float vector.
"""
s = _upper(student_query)
c = _upper(correct_query) if correct_query else ""
has_agg = any(f" {f}(" in s or f"{f}(" in s for f in AGG_FUNCS)
has_group = "GROUP BY" in s
has_join = "JOIN" in s
has_on = " ON " in s
has_where = " WHERE " in s
has_having = " HAVING " in s
has_distinct = "DISTINCT" in s
has_subquery = "(" in s and "SELECT" in s[s.find("(") :]
has_window = "OVER" in s
has_null_eq = "= NULL" in s or "=NULL" in s
has_is_null = "IS NULL" in s or "IS NOT NULL" in s
has_select_star = bool(re.search(r"SELECT\s+\*", s))
has_or = " OR " in s
has_and = " AND " in s
correct_has_distinct = "DISTINCT" in c
correct_has_group = "GROUP BY" in c
correct_has_inner = "INNER JOIN" in c
student_has_left = "LEFT JOIN" in s
return [
float(has_agg),
float(has_agg and not has_group),
float(has_join and not has_on),
float(has_join),
float(has_where and has_having),
float(has_agg and has_where and not has_having),
float(has_distinct),
float(correct_has_distinct and not has_distinct),
float(has_subquery),
float(has_window),
float(has_null_eq),
float(has_is_null),
float(has_select_star),
float(has_or and has_and),
float(correct_has_inner and student_has_left),
float(len(s) / max(len(c), 1)), # length ratio vs reference
]
FEATURE_NAMES = [
"has_aggregate",
"agg_without_group_by",
"join_without_on",
"has_join",
"where_and_having",
"agg_in_where",
"has_distinct",
"missing_distinct_vs_correct",
"has_subquery",
"has_window",
"null_equals",
"is_null_check",
"select_star",
"and_or_mix",
"left_vs_inner_join",
"length_ratio",
]
|