from typing import Dict, List, Any, Optional, Tuple import pandas as pd import numpy as np from pathlib import Path import os import chardet import csv class CSVHelpers: """Helper utilities for CSV preprocessing and analysis.""" @staticmethod def detect_encoding(file_path: str, sample_size: int = 10000) -> str: """Detect the encoding of a CSV file.""" with open(file_path, 'rb') as f: raw_data = f.read(sample_size) result = chardet.detect(raw_data) return result['encoding'] @staticmethod def detect_delimiter(file_path: str, encoding: str = 'utf-8') -> str: """Detect the delimiter used in a CSV file.""" with open(file_path, 'r', encoding=encoding) as csvfile: sample = csvfile.read(4096) # Check common delimiters for delimiter in [',', ';', '\t', '|']: sniffer = csv.Sniffer() try: if delimiter in sample: dialect = sniffer.sniff(sample, delimiters=delimiter) return dialect.delimiter except: continue # Default to comma if detection fails return ',' @staticmethod def preprocess_csv(file_path: str) -> Tuple[pd.DataFrame, Dict[str, Any]]: """ Preprocess a CSV file with automatic encoding and delimiter detection. Returns the DataFrame and metadata about the preprocessing. """ # Detect encoding try: encoding = CSVHelpers.detect_encoding(file_path) except: encoding = 'utf-8' # Default to UTF-8 if detection fails # Detect delimiter try: delimiter = CSVHelpers.detect_delimiter(file_path, encoding) except: delimiter = ',' # Default to comma if detection fails # Read the CSV with detected parameters df = pd.read_csv(file_path, encoding=encoding, delimiter=delimiter, low_memory=False) # Basic preprocessing metadata = { "original_shape": df.shape, "encoding": encoding, "delimiter": delimiter, "columns": list(df.columns), "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()} } # Handle missing values missing_counts = df.isna().sum() metadata["missing_values"] = {col: int(count) for col, count in missing_counts.items() if count > 0} # Handle duplicate rows duplicates = df.duplicated().sum() metadata["duplicate_rows"] = int(duplicates) return df, metadata @staticmethod def infer_column_types(df: pd.DataFrame) -> Dict[str, str]: """ Infer semantic types of columns (beyond pandas dtypes). Examples: date, categorical, numeric, text, etc. """ column_types = {} for column in df.columns: # Skip columns with all missing values if df[column].isna().all(): column_types[column] = "unknown" continue # Get pandas dtype dtype = df[column].dtype # Check if datetime if pd.api.types.is_datetime64_dtype(df[column]): column_types[column] = "datetime" # Try to convert to datetime if string elif dtype == 'object': try: # Sample non-null values sample = df[column].dropna().head(10) pd.to_datetime(sample) column_types[column] = "potential_datetime" except: # Check if categorical (few unique values) unique_ratio = df[column].nunique() / len(df) if unique_ratio < 0.1: # Less than 10% unique values column_types[column] = "categorical" else: column_types[column] = "text" # Numeric types elif pd.api.types.is_numeric_dtype(dtype): # Check if potential ID column if df[column].nunique() == len(df) and df[column].min() >= 0: column_types[column] = "id" # Check if binary elif df[column].nunique() <= 2: column_types[column] = "binary" # Check if integer elif pd.api.types.is_integer_dtype(dtype): column_types[column] = "integer" else: column_types[column] = "float" # Boolean type elif pd.api.types.is_bool_dtype(dtype): column_types[column] = "boolean" # Fallback else: column_types[column] = "unknown" return column_types @staticmethod def suggest_visualizations(df: pd.DataFrame) -> List[Dict[str, Any]]: """ Suggest appropriate visualizations based on data types. Returns a list of visualization suggestions. """ suggestions = [] column_types = CSVHelpers.infer_column_types(df) numeric_columns = [col for col, type in column_types.items() if type in ["integer", "float"]] categorical_columns = [col for col, type in column_types.items() if type in ["categorical", "binary"]] datetime_columns = [col for col, type in column_types.items() if type in ["datetime", "potential_datetime"]] # Histogram for numeric columns for col in numeric_columns[:3]: # Limit to first 3 suggestions.append({ "chart_type": "histogram", "column": col, "title": f"Distribution of {col}" }) # Bar charts for categorical columns for col in categorical_columns[:3]: # Limit to first 3 suggestions.append({ "chart_type": "bar", "x_column": col, "y_column": "count", "title": f"Count by {col}" }) # Time series for datetime + numeric combinations if datetime_columns and numeric_columns: suggestions.append({ "chart_type": "line", "x_column": datetime_columns[0], "y_column": numeric_columns[0], "title": f"{numeric_columns[0]} over Time" }) # Scatter plots for numeric pairs if len(numeric_columns) >= 2: suggestions.append({ "chart_type": "scatter", "x_column": numeric_columns[0], "y_column": numeric_columns[1], "title": f"{numeric_columns[1]} vs {numeric_columns[0]}" }) return suggestions