File size: 7,329 Bytes
e13d87a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
from typing import Dict, List, Any, Optional, Tuple
import pandas as pd
import numpy as np
from pathlib import Path
import os
import chardet
import csv

class CSVHelpers:
    """Helper utilities for CSV preprocessing and analysis."""

    @staticmethod
    def detect_encoding(file_path: str, sample_size: int = 10000) -> str:
        """Detect the character encoding of a file.

        Reads up to ``sample_size`` bytes and runs chardet over the sample.

        Args:
            file_path: Path of the file to inspect.
            sample_size: Maximum number of bytes to sample.

        Returns:
            The detected encoding name, or ``'utf-8'`` when chardet cannot
            decide (it reports ``None`` for empty or undecidable input).
        """
        with open(file_path, 'rb') as f:
            raw_data = f.read(sample_size)
        result = chardet.detect(raw_data)
        # chardet yields {'encoding': None, ...} for empty/ambiguous samples;
        # never propagate None to callers expecting a codec name.
        return result['encoding'] or 'utf-8'

    @staticmethod
    def detect_delimiter(file_path: str, encoding: str = 'utf-8') -> str:
        """Detect the delimiter used in a CSV file.

        Tries common delimiters in priority order and confirms each
        candidate with ``csv.Sniffer``.

        Args:
            file_path: Path of the CSV file.
            encoding: Text encoding used to read the sample.

        Returns:
            The confirmed delimiter, or ``','`` when none can be confirmed.
        """
        with open(file_path, 'r', encoding=encoding) as csvfile:
            sample = csvfile.read(4096)

        sniffer = csv.Sniffer()  # one sniffer serves every candidate
        for delimiter in [',', ';', '\t', '|']:
            if delimiter not in sample:
                continue
            try:
                dialect = sniffer.sniff(sample, delimiters=delimiter)
            except csv.Error:
                # Sniffer could not confirm this candidate; try the next.
                continue
            return dialect.delimiter

        # Default to comma if detection fails
        return ','

    @staticmethod
    def preprocess_csv(file_path: str) -> Tuple[pd.DataFrame, Dict[str, Any]]:
        """Preprocess a CSV file with automatic encoding and delimiter detection.

        Args:
            file_path: Path of the CSV file to load.

        Returns:
            A tuple ``(df, metadata)`` where ``metadata`` records the detected
            encoding/delimiter, original shape, columns, dtypes, per-column
            missing-value counts (non-zero only) and the duplicate-row count.
        """
        # Detection is deliberately best-effort: any failure (unreadable
        # sample, chardet unavailable, undecodable text) falls back to sane
        # defaults instead of aborting the load, so `Exception` — not a bare
        # except — is the right width here.
        try:
            encoding = CSVHelpers.detect_encoding(file_path)
        except Exception:
            encoding = 'utf-8'  # Default to UTF-8 if detection fails

        try:
            delimiter = CSVHelpers.detect_delimiter(file_path, encoding)
        except Exception:
            delimiter = ','  # Default to comma if detection fails

        # Read the CSV with detected parameters
        df = pd.read_csv(file_path, encoding=encoding, delimiter=delimiter, low_memory=False)

        metadata: Dict[str, Any] = {
            "original_shape": df.shape,
            "encoding": encoding,
            "delimiter": delimiter,
            "columns": list(df.columns),
            "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
        }

        # Report only columns that actually have missing values
        missing_counts = df.isna().sum()
        metadata["missing_values"] = {
            col: int(count) for col, count in missing_counts.items() if count > 0
        }

        metadata["duplicate_rows"] = int(df.duplicated().sum())

        return df, metadata

    @staticmethod
    def infer_column_types(df: pd.DataFrame) -> Dict[str, str]:
        """Infer semantic types of columns (beyond pandas dtypes).

        Labels produced: ``datetime``, ``potential_datetime``, ``categorical``,
        ``text``, ``id``, ``binary``, ``integer``, ``float``, ``boolean``,
        ``unknown``.

        Args:
            df: The frame whose columns are classified.

        Returns:
            Mapping of column name to inferred semantic type.
        """
        column_types: Dict[str, str] = {}
        n_rows = len(df)

        for column in df.columns:
            series = df[column]

            # Columns with all missing values carry no signal
            if series.isna().all():
                column_types[column] = "unknown"
                continue

            dtype = series.dtype

            if pd.api.types.is_datetime64_dtype(series):
                column_types[column] = "datetime"

            elif dtype == 'object':
                # String column: does a small sample parse as dates?
                try:
                    sample = series.dropna().head(10)
                    pd.to_datetime(sample)
                    column_types[column] = "potential_datetime"
                except (ValueError, TypeError, OverflowError):
                    # Not dates — categorical when few distinct values.
                    # Guard n_rows: an empty frame would divide by zero.
                    unique_ratio = series.nunique() / n_rows if n_rows else 0.0
                    if unique_ratio < 0.1:  # Less than 10% unique values
                        column_types[column] = "categorical"
                    else:
                        column_types[column] = "text"

            # Bool MUST be tested before numeric: pandas counts bool as a
            # numeric dtype, so testing numeric first made this branch
            # unreachable and mislabelled boolean columns as "binary"/"id".
            elif pd.api.types.is_bool_dtype(dtype):
                column_types[column] = "boolean"

            elif pd.api.types.is_numeric_dtype(dtype):
                # All-unique non-negative values look like an ID column
                if series.nunique() == n_rows and series.min() >= 0:
                    column_types[column] = "id"
                elif series.nunique() <= 2:
                    column_types[column] = "binary"
                elif pd.api.types.is_integer_dtype(dtype):
                    column_types[column] = "integer"
                else:
                    column_types[column] = "float"

            else:
                column_types[column] = "unknown"

        return column_types

    @staticmethod
    def suggest_visualizations(df: pd.DataFrame) -> List[Dict[str, Any]]:
        """Suggest appropriate visualizations based on inferred column types.

        Args:
            df: The frame to suggest charts for.

        Returns:
            A list of suggestion dicts (``chart_type``, axis columns, title).
        """
        suggestions: List[Dict[str, Any]] = []
        column_types = CSVHelpers.infer_column_types(df)

        # `col_type` (not `type`): avoid shadowing the builtin
        numeric_columns = [col for col, col_type in column_types.items()
                           if col_type in ["integer", "float"]]
        # "boolean" included so bool columns still get count bars now that
        # infer_column_types labels them "boolean" rather than "binary"
        categorical_columns = [col for col, col_type in column_types.items()
                               if col_type in ["categorical", "binary", "boolean"]]
        datetime_columns = [col for col, col_type in column_types.items()
                            if col_type in ["datetime", "potential_datetime"]]

        # Histogram for numeric columns
        for col in numeric_columns[:3]:  # Limit to first 3
            suggestions.append({
                "chart_type": "histogram",
                "column": col,
                "title": f"Distribution of {col}"
            })

        # Bar charts for categorical columns
        for col in categorical_columns[:3]:  # Limit to first 3
            suggestions.append({
                "chart_type": "bar",
                "x_column": col,
                "y_column": "count",
                "title": f"Count by {col}"
            })

        # Time series for datetime + numeric combinations
        if datetime_columns and numeric_columns:
            suggestions.append({
                "chart_type": "line",
                "x_column": datetime_columns[0],
                "y_column": numeric_columns[0],
                "title": f"{numeric_columns[0]} over Time"
            })

        # Scatter plots for numeric pairs
        if len(numeric_columns) >= 2:
            suggestions.append({
                "chart_type": "scatter",
                "x_column": numeric_columns[0],
                "y_column": numeric_columns[1],
                "title": f"{numeric_columns[1]} vs {numeric_columns[0]}"
            })

        return suggestions