File size: 7,329 Bytes
e13d87a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 |
from typing import Dict, List, Any, Optional, Tuple
import pandas as pd
import numpy as np
from pathlib import Path
import os
import chardet
import csv
class CSVHelpers:
    """Helper utilities for CSV preprocessing and analysis.

    All methods are static; the class serves as a namespace for stateless
    helpers that detect file properties, load CSVs defensively, infer
    semantic column types, and suggest basic visualizations.
    """

    @staticmethod
    def detect_encoding(file_path: str, sample_size: int = 10000) -> str:
        """Detect the character encoding of a file.

        Args:
            file_path: Path to the file to inspect.
            sample_size: Number of leading bytes to sample for detection.

        Returns:
            The detected encoding name. Falls back to ``'utf-8'`` when the
            detector has no confident guess (``chardet`` reports ``None``
            for e.g. empty files), so callers always receive a usable str.
        """
        with open(file_path, 'rb') as f:
            raw_data = f.read(sample_size)
        result = chardet.detect(raw_data)
        # chardet returns {'encoding': None} when undecidable; never
        # propagate None where a str is promised.
        return result['encoding'] or 'utf-8'

    @staticmethod
    def detect_delimiter(file_path: str, encoding: str = 'utf-8') -> str:
        """Detect the delimiter used in a delimited text file.

        Only the common delimiters ``,``, ``;``, tab and ``|`` are
        considered.

        Args:
            file_path: Path to the file to inspect.
            encoding: Text encoding used to read the sample.

        Returns:
            The detected delimiter character; ``','`` when the sniffer
            cannot decide.
        """
        with open(file_path, 'r', encoding=encoding) as csvfile:
            sample = csvfile.read(4096)
        # Let the Sniffer weigh all candidates in one pass. The previous
        # loop sniffed each candidate against itself and returned the first
        # one merely *present* in the sample, which misdetected e.g.
        # semicolon-delimited files whose text fields contain commas.
        try:
            dialect = csv.Sniffer().sniff(sample, delimiters=',;\t|')
            return dialect.delimiter
        except csv.Error:
            # Default to comma if detection fails
            return ','

    @staticmethod
    def preprocess_csv(file_path: str) -> Tuple[pd.DataFrame, Dict[str, Any]]:
        """Load a CSV file with automatic encoding and delimiter detection.

        Detection is best-effort: any detection failure falls back to
        sensible defaults (UTF-8, comma) rather than aborting the load.

        Args:
            file_path: Path to the CSV file.

        Returns:
            A ``(df, metadata)`` tuple where ``metadata`` records the
            original shape, detected encoding/delimiter, column names and
            dtypes, per-column missing-value counts (non-zero only), and
            the number of fully duplicated rows.
        """
        # Best-effort detection; `except Exception` (not bare `except:`) so
        # KeyboardInterrupt/SystemExit still propagate.
        try:
            encoding = CSVHelpers.detect_encoding(file_path)
        except Exception:
            encoding = 'utf-8'  # Default to UTF-8 if detection fails
        try:
            delimiter = CSVHelpers.detect_delimiter(file_path, encoding)
        except Exception:
            delimiter = ','  # Default to comma if detection fails

        # Read the CSV with detected parameters
        df = pd.read_csv(file_path, encoding=encoding, delimiter=delimiter, low_memory=False)

        metadata: Dict[str, Any] = {
            "original_shape": df.shape,
            "encoding": encoding,
            "delimiter": delimiter,
            "columns": list(df.columns),
            "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
        }

        # Record (not impute) missing values — only columns that have any.
        missing_counts = df.isna().sum()
        metadata["missing_values"] = {
            col: int(count) for col, count in missing_counts.items() if count > 0
        }

        # Record (not drop) fully duplicated rows.
        metadata["duplicate_rows"] = int(df.duplicated().sum())

        return df, metadata

    @staticmethod
    def infer_column_types(df: pd.DataFrame) -> Dict[str, str]:
        """Infer semantic types of columns (beyond pandas dtypes).

        Labels each column as one of: ``datetime``, ``potential_datetime``,
        ``categorical``, ``text``, ``id``, ``binary``, ``integer``,
        ``float``, ``boolean``, or ``unknown``.

        Args:
            df: DataFrame whose columns are classified.

        Returns:
            Mapping of column name to inferred semantic type.
        """
        column_types: Dict[str, str] = {}
        for column in df.columns:
            # Columns with no observed values cannot be classified.
            if df[column].isna().all():
                column_types[column] = "unknown"
                continue
            dtype = df[column].dtype
            if pd.api.types.is_datetime64_dtype(df[column]):
                column_types[column] = "datetime"
            elif dtype == 'object':
                # Strings: probe a small sample for date-likeness first.
                try:
                    sample = df[column].dropna().head(10)
                    pd.to_datetime(sample)
                    column_types[column] = "potential_datetime"
                except (ValueError, TypeError):
                    # Not dates: few unique values => categorical, else text.
                    # len(df) > 0 is guaranteed here by the all-NaN guard.
                    unique_ratio = df[column].nunique() / len(df)
                    if unique_ratio < 0.1:  # Less than 10% unique values
                        column_types[column] = "categorical"
                    else:
                        column_types[column] = "text"
            elif pd.api.types.is_numeric_dtype(dtype):
                # All-unique non-negative numerics look like an ID column.
                if df[column].nunique() == len(df) and df[column].min() >= 0:
                    column_types[column] = "id"
                elif df[column].nunique() <= 2:
                    column_types[column] = "binary"
                elif pd.api.types.is_integer_dtype(dtype):
                    column_types[column] = "integer"
                else:
                    column_types[column] = "float"
            elif pd.api.types.is_bool_dtype(dtype):
                column_types[column] = "boolean"
            else:
                column_types[column] = "unknown"
        return column_types

    @staticmethod
    def suggest_visualizations(df: pd.DataFrame) -> List[Dict[str, Any]]:
        """Suggest appropriate visualizations based on inferred column types.

        Args:
            df: DataFrame to analyze.

        Returns:
            A list of suggestion dicts, each with at least ``chart_type``
            and ``title`` keys: histograms for up to three numeric columns,
            bar charts for up to three categorical/binary columns, one time
            series if a datetime and a numeric column exist, and one scatter
            plot if at least two numeric columns exist.
        """
        suggestions: List[Dict[str, Any]] = []
        column_types = CSVHelpers.infer_column_types(df)
        # `ctype`, not `type`: avoid shadowing the builtin.
        numeric_columns = [col for col, ctype in column_types.items()
                           if ctype in ["integer", "float"]]
        categorical_columns = [col for col, ctype in column_types.items()
                               if ctype in ["categorical", "binary"]]
        datetime_columns = [col for col, ctype in column_types.items()
                            if ctype in ["datetime", "potential_datetime"]]

        # Histograms for numeric columns (limit to first 3)
        for col in numeric_columns[:3]:
            suggestions.append({
                "chart_type": "histogram",
                "column": col,
                "title": f"Distribution of {col}"
            })
        # Bar charts for categorical columns (limit to first 3)
        for col in categorical_columns[:3]:
            suggestions.append({
                "chart_type": "bar",
                "x_column": col,
                "y_column": "count",
                "title": f"Count by {col}"
            })
        # Time series for datetime + numeric combinations
        if datetime_columns and numeric_columns:
            suggestions.append({
                "chart_type": "line",
                "x_column": datetime_columns[0],
                "y_column": numeric_columns[0],
                "title": f"{numeric_columns[0]} over Time"
            })
        # Scatter plot for the first numeric pair
        if len(numeric_columns) >= 2:
            suggestions.append({
                "chart_type": "scatter",
                "x_column": numeric_columns[0],
                "y_column": numeric_columns[1],
                "title": f"{numeric_columns[1]} vs {numeric_columns[0]}"
            })
        return suggestions
|