""" features/schema_handler.py =========================== Handles messy, real-world database schemas. Real-world schemas often have: - Uppercase/lowercase inconsistencies (e.g., 'AMOUNT' vs 'amount') - Leading/trailing spaces (e.g., ' order_Date' instead of 'order_date') - Inconsistent naming (e.g., 'customerid' vs 'customer_id') This module normalizes column names to a clean, consistent format. """ class SchemaHandler: """ Normalizes and analyzes messy schema column names. """ def normalize_schema(self, schema: list) -> list: """ Clean up a list of column names: - Strip leading/trailing whitespace - Convert to lowercase Args: schema (list): Raw schema with potentially messy column names. Returns: list: Cleaned list of column names. """ return [col.strip().lower() for col in schema] def detect_issues(self, schema: list) -> list: """ Identify which column names have formatting problems. Args: schema (list): List of raw column names. Returns: list: List of problem descriptions. """ issues = [] for col in schema: problems = [] # Check for leading or trailing spaces if col != col.strip(): problems.append(f"extra whitespace in '{col}'") # Check for uppercase characters (everything should be lowercase for consistency) if col != col.lower() and col.strip() != col.strip().lower(): problems.append(f"uppercase letters in '{col}'") # Check for double spaces inside the name if " " in col: problems.append(f"double spaces in '{col}'") if problems: issues.extend(problems) return issues def find_closest_match(self, target: str, schema: list) -> str: """ Given a column name, find the closest match in the schema. Used to help identify what 'customer_id' maps to in a messy schema. Args: target (str): The column name to look up. schema (list): List of available (possibly messy) column names. Returns: str: The best matching column name, or empty string if none found. """ # Normalize the target for comparison target_clean = target.strip().lower().replace("_", "").replace(" ", "") for col in schema: col_clean = col.strip().lower().replace("_", "").replace(" ", "") if target_clean == col_clean: return col # Return original messy version return "" def generate_mapping(self, schema: list) -> dict: """ Create a mapping from messy column names to normalized ones. Args: schema (list): Messy schema column names. Returns: dict: {original: normalized} mapping. """ return {col: col.strip().lower() for col in schema} def describe_schema_issues(self, schema: list) -> str: """ Return a human-readable description of all schema problems. Args: schema (list): Raw schema column names. Returns: str: Description of detected schema issues. """ issues = self.detect_issues(schema) if not issues: return "Schema looks clean — no formatting issues detected." lines = ["Schema issues detected:"] for issue in issues: lines.append(f" - {issue}") return "\n".join(lines)