File size: 2,326 Bytes
9b2cded
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""Lightweight structural SQL features for the classification head."""

from __future__ import annotations

import re
from typing import List

AGG_FUNCS = ("COUNT", "SUM", "AVG", "MAX", "MIN")
WINDOW_FUNCS = ("ROW_NUMBER", "RANK", "DENSE_RANK", "OVER")


def _upper(sql: str) -> str:
    return sql.upper()


def extract_sql_features(student_query: str, correct_query: str = "") -> List[float]:
    """
    Rule-based signals that complement semantic embeddings.
    Returns a fixed-length float vector.
    """
    s = _upper(student_query)
    c = _upper(correct_query) if correct_query else ""

    has_agg = any(f" {f}(" in s or f"{f}(" in s for f in AGG_FUNCS)
    has_group = "GROUP BY" in s
    has_join = "JOIN" in s
    has_on = " ON " in s
    has_where = " WHERE " in s
    has_having = " HAVING " in s
    has_distinct = "DISTINCT" in s
    has_subquery = "(" in s and "SELECT" in s[s.find("(") :]
    has_window = "OVER" in s
    has_null_eq = "= NULL" in s or "=NULL" in s
    has_is_null = "IS NULL" in s or "IS NOT NULL" in s
    has_select_star = bool(re.search(r"SELECT\s+\*", s))
    has_or = " OR " in s
    has_and = " AND " in s

    correct_has_distinct = "DISTINCT" in c
    correct_has_group = "GROUP BY" in c
    correct_has_inner = "INNER JOIN" in c
    student_has_left = "LEFT JOIN" in s

    return [
        float(has_agg),
        float(has_agg and not has_group),
        float(has_join and not has_on),
        float(has_join),
        float(has_where and has_having),
        float(has_agg and has_where and not has_having),
        float(has_distinct),
        float(correct_has_distinct and not has_distinct),
        float(has_subquery),
        float(has_window),
        float(has_null_eq),
        float(has_is_null),
        float(has_select_star),
        float(has_or and has_and),
        float(correct_has_inner and student_has_left),
        float(len(s) / max(len(c), 1)),  # length ratio vs reference
    ]


FEATURE_NAMES = [
    "has_aggregate",
    "agg_without_group_by",
    "join_without_on",
    "has_join",
    "where_and_having",
    "agg_in_where",
    "has_distinct",
    "missing_distinct_vs_correct",
    "has_subquery",
    "has_window",
    "null_equals",
    "is_null_check",
    "select_star",
    "and_or_mix",
    "left_vs_inner_join",
    "length_ratio",
]