File size: 3,641 Bytes
f8b106f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
features/schema_handler.py
===========================
Handles messy, real-world database schemas.

Real-world schemas often have:
  - Uppercase/lowercase inconsistencies (e.g., 'AMOUNT' vs 'amount')
  - Leading/trailing spaces (e.g., ' order_Date' instead of 'order_date')
  - Inconsistent naming (e.g., 'customerid' vs 'customer_id')

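This module normalizes column names (surrounding whitespace and letter case),
reports formatting problems, and matches names that differ only in case,
underscores, or spaces.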
"""


class SchemaHandler:
    """
    Normalizes and analyzes messy schema column names.

    The class keeps no instance state: every method works only on the
    arguments passed to it.
    """

    @staticmethod
    def _clean(col: str) -> str:
        """Return ``col`` without surrounding whitespace, in lowercase."""
        return col.strip().lower()

    @staticmethod
    def _match_key(name: str) -> str:
        """Return a comparison key that ignores case, underscores and spaces."""
        return name.strip().lower().replace("_", "").replace(" ", "")

    def normalize_schema(self, schema: list) -> list:
        """
        Clean up a list of column names:
          - Strip leading/trailing whitespace
          - Convert to lowercase

        Whitespace inside a name is left untouched.

        Args:
            schema (list): Raw schema with potentially messy column names.

        Returns:
            list: Cleaned column names, in the same order as the input.
        """
        return [self._clean(col) for col in schema]

    def detect_issues(self, schema: list) -> list:
        """
        Identify which column names have formatting problems.

        Three checks are applied to each column, in this order:
          1. leading or trailing whitespace
          2. uppercase letters
          3. two consecutive spaces inside the (stripped) name

        Args:
            schema (list): List of raw column names.

        Returns:
            list: One description string per detected problem, grouped by
                column in input order. Empty when nothing is wrong.
        """
        issues = []

        for col in schema:
            name = col.strip()

            if col != name:
                issues.append(f"extra whitespace in '{col}'")

            # Padding never contains cased characters, so comparing the
            # stripped name is sufficient.
            if name != name.lower():
                issues.append(f"uppercase letters in '{col}'")

            # Search the stripped name only: leading/trailing padding is
            # already reported as extra whitespace and is not "inside" the
            # name, so it must not be reported a second time here.
            if "  " in name:
                issues.append(f"double spaces in '{col}'")

        return issues

    def find_closest_match(self, target: str, schema: list) -> str:
        """
        Given a column name, find the closest match in the schema.
        Used to help identify what 'customer_id' maps to in a messy schema.

        Two names match when they are equal after stripping surrounding
        whitespace, lowercasing, and removing underscores and spaces.

        Args:
            target (str): The column name to look up.
            schema (list): List of available (possibly messy) column names.

        Returns:
            str: The first matching column exactly as it appears in
                ``schema``, or an empty string if none matches.
        """
        wanted = self._match_key(target)
        # First hit wins; the original (messy) spelling is returned so the
        # caller can use it against the real schema.
        return next(
            (col for col in schema if self._match_key(col) == wanted),
            "",
        )

    def generate_mapping(self, schema: list) -> dict:
        """
        Create a mapping from messy column names to normalized ones.

        Uses the same normalization rule as ``normalize_schema``.

        Args:
            schema (list): Messy schema column names.

        Returns:
            dict: {original: normalized} mapping.
        """
        return {col: self._clean(col) for col in schema}

    def describe_schema_issues(self, schema: list) -> str:
        """
        Return a human-readable description of all schema problems.

        Args:
            schema (list): Raw schema column names.

        Returns:
            str: A single line stating the schema is clean, or a header line
                followed by one indented bullet per issue from
                ``detect_issues``.
        """
        issues = self.detect_issues(schema)
        if not issues:
            return "Schema looks clean — no formatting issues detected."

        report = ["Schema issues detected:"]
        report.extend(f"  - {issue}" for issue in issues)
        return "\n".join(report)