File size: 6,285 Bytes
890025a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import re
import pandas as pd
import numpy as np
import streamlit as st

# Define fallback cleaning function

@st.cache_data
def clean_dataframe_fallback(df):
        """Hardcoded data cleaning pipeline"""
    
        """Generic data cleaning pipeline with categorical preservation"""
        df_cleaned = df.copy()


        df_cleaned = df_cleaned.applymap(
        lambda x: re.sub(r"\(.*?\)", "", str(x)) if isinstance(x, str) else x)

        # Remove 'ref.' references
        df_cleaned = df_cleaned.applymap(
            lambda x: re.sub(r"ref\.", "", str(x), flags=re.IGNORECASE) if isinstance(x, str) else x)
        
        # Remove any other special characters except letters, digits, spaces, and dots
        df_cleaned = df_cleaned.applymap(
            lambda x: re.sub(r"[^\w\s\d\.]", "", str(x)).strip() if isinstance(x, str) else x
        )


        # Step 0 - Clean column names first
        df_cleaned.columns = [col.strip().lower().replace(' ', '_') for col in df_cleaned.columns]
        
            # Define measurement units to remove
        measurement_units = {
            'weight': r'\s*(kg|kilograms|lbs|pounds)$',
            'height': r'\s*(cm|centimeters|inches|feet|ft)$'
        }


        # Step 1 - Remove redundant columns
        # Preservation patterns for categorical columns
        preserve_pattern = re.compile(r'(name|brand|model|type|category|region|text|desc|color|size)', re.IGNORECASE)
        preserved_cols = [col for col in df_cleaned.columns if preserve_pattern.search(col)]
        
        # ID pattern detection
        id_pattern = re.compile(r'(_id|id_|num|no|number|identifier|code|idx|row)', re.IGNORECASE)
        id_cols = [col for col in df_cleaned.columns if id_pattern.search(col) and col not in preserved_cols]
        
        # Unique value columns
        unique_cols = [col for col in df_cleaned.columns 
                    if df_cleaned[col].nunique() == len(df_cleaned) 
                    and col not in preserved_cols]
        
        redundant_cols = list(set(id_cols + unique_cols))
        df_cleaned = df_cleaned.drop(columns=redundant_cols)
        print(f"Removed {len(redundant_cols)} redundant columns: {redundant_cols}")

        # Step 2 - Enhanced numeric detection with categorical protection
        for col in df_cleaned.columns:
            if col in preserved_cols:
                print(f"Preserving categorical column: {col}")
                continue  # Skip preserved columns

            if any(unit in col for unit in measurement_units.keys()):  
                pattern = measurement_units.get(col.split('_')[0], r'')
                df_cleaned[col] = df_cleaned[col].astype(str).str.replace(pattern, '', regex=True).str.strip()
                            


            if pd.api.types.is_numeric_dtype(df_cleaned[col]):
                continue
                
            # Strict numeric pattern detection
            non_null_count = df_cleaned[col].dropna().shape[0]
            sample_size = min(100, non_null_count)
            sample = df_cleaned[col].dropna().sample(sample_size, random_state=42)
            numeric_pattern = r'^[-+]?\d*\.?\d+$'  # Full string match
            num_matches = sample.astype(str).str.fullmatch(numeric_pattern).mean()
                
            if num_matches > 0.8:  # High threshold
                # Conservative cleaning
                cleaned = df_cleaned[col].replace(r'[^\d\.\-]', '', regex=True)
                converted = pd.to_numeric(cleaned, errors='coerce')
                success_rate = converted.notna().mean()
                
                if success_rate > 0.9:  # Strict success requirement
                    df_cleaned[col] = converted
                    print(f"Converted {col} to numeric (success: {success_rate:.1%})")

        # Step 3 - Date detection
        date_cols = []
        for col in df_cleaned.select_dtypes(exclude=np.number).columns:
            if col in preserved_cols:
                continue
            try:
                df_cleaned[col] = pd.to_datetime(df_cleaned[col], errors='raise')
                date_cols.append(col)
                print(f"Detected datetime: {col}")
            except:
                pass

# Example manual approach:
        currency_cols = [col for col in df_cleaned.columns if any(keyword in col.lower() for keyword in ["price", "gross", "budget"])]
        for col in currency_cols:
            df_cleaned[col] = df_cleaned[col].astype(str).str.replace(r'[^\d\.]', '', regex=True)  # remove everything except digits & dots
            df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')



        # Step 4 - Missing value handling
        numeric_cols = df_cleaned.select_dtypes(include=np.number).columns
        categorical_cols = df_cleaned.select_dtypes(exclude=np.number).columns
        
        # Numeric imputation
        for col in numeric_cols:
            if df_cleaned[col].isna().any():
                df_cleaned[f'{col}_missing'] = df_cleaned[col].isna().astype(int)
                df_cleaned[col].fillna(df_cleaned[col].median(), inplace=True)
        
        # Categorical imputation
        for col in categorical_cols:
            if df_cleaned[col].isna().any():
                mode_val = df_cleaned[col].mode()[0] if not df_cleaned[col].mode().empty else 'Unknown'
                df_cleaned[col] = df_cleaned[col].fillna(mode_val)

        # Step 5 - Text normalization for non-preserved columns
        text_cols = [col for col in categorical_cols if col not in preserved_cols]
        for col in text_cols:
            df_cleaned[col] = df_cleaned[col].astype(str).apply(lambda x: re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', x)).strip().lower())
            
        
        # Step 6 - Outlier handling (preserve categoricals)
        numeric_cols = df_cleaned.select_dtypes(include=np.number).columns
        for col in numeric_cols:
            if df_cleaned[col].nunique() > 10:
                q1 = df_cleaned[col].quantile(0.05)
                q3 = df_cleaned[col].quantile(0.95)
                df_cleaned[col] = np.clip(df_cleaned[col], q1, q3)

        # Step 7 - Final validation
        df_cleaned = df_cleaned.drop_duplicates().reset_index(drop=True)
        
        return df_cleaned