ChatCSV / utils /csv_helper.py
Chamin09's picture
Upload 12 files
e13d87a verified
from typing import Dict, List, Any, Optional, Tuple
import pandas as pd
import numpy as np
from pathlib import Path
import os
import chardet
import csv
class CSVHelpers:
"""Helper utilities for CSV preprocessing and analysis."""
@staticmethod
def detect_encoding(file_path: str, sample_size: int = 10000) -> str:
"""Detect the encoding of a CSV file."""
with open(file_path, 'rb') as f:
raw_data = f.read(sample_size)
result = chardet.detect(raw_data)
return result['encoding']
@staticmethod
def detect_delimiter(file_path: str, encoding: str = 'utf-8') -> str:
"""Detect the delimiter used in a CSV file."""
with open(file_path, 'r', encoding=encoding) as csvfile:
sample = csvfile.read(4096)
# Check common delimiters
for delimiter in [',', ';', '\t', '|']:
sniffer = csv.Sniffer()
try:
if delimiter in sample:
dialect = sniffer.sniff(sample, delimiters=delimiter)
return dialect.delimiter
except:
continue
# Default to comma if detection fails
return ','
@staticmethod
def preprocess_csv(file_path: str) -> Tuple[pd.DataFrame, Dict[str, Any]]:
"""
Preprocess a CSV file with automatic encoding and delimiter detection.
Returns the DataFrame and metadata about the preprocessing.
"""
# Detect encoding
try:
encoding = CSVHelpers.detect_encoding(file_path)
except:
encoding = 'utf-8' # Default to UTF-8 if detection fails
# Detect delimiter
try:
delimiter = CSVHelpers.detect_delimiter(file_path, encoding)
except:
delimiter = ',' # Default to comma if detection fails
# Read the CSV with detected parameters
df = pd.read_csv(file_path, encoding=encoding, delimiter=delimiter, low_memory=False)
# Basic preprocessing
metadata = {
"original_shape": df.shape,
"encoding": encoding,
"delimiter": delimiter,
"columns": list(df.columns),
"dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()}
}
# Handle missing values
missing_counts = df.isna().sum()
metadata["missing_values"] = {col: int(count) for col, count in missing_counts.items() if count > 0}
# Handle duplicate rows
duplicates = df.duplicated().sum()
metadata["duplicate_rows"] = int(duplicates)
return df, metadata
@staticmethod
def infer_column_types(df: pd.DataFrame) -> Dict[str, str]:
"""
Infer semantic types of columns (beyond pandas dtypes).
Examples: date, categorical, numeric, text, etc.
"""
column_types = {}
for column in df.columns:
# Skip columns with all missing values
if df[column].isna().all():
column_types[column] = "unknown"
continue
# Get pandas dtype
dtype = df[column].dtype
# Check if datetime
if pd.api.types.is_datetime64_dtype(df[column]):
column_types[column] = "datetime"
# Try to convert to datetime if string
elif dtype == 'object':
try:
# Sample non-null values
sample = df[column].dropna().head(10)
pd.to_datetime(sample)
column_types[column] = "potential_datetime"
except:
# Check if categorical (few unique values)
unique_ratio = df[column].nunique() / len(df)
if unique_ratio < 0.1: # Less than 10% unique values
column_types[column] = "categorical"
else:
column_types[column] = "text"
# Numeric types
elif pd.api.types.is_numeric_dtype(dtype):
# Check if potential ID column
if df[column].nunique() == len(df) and df[column].min() >= 0:
column_types[column] = "id"
# Check if binary
elif df[column].nunique() <= 2:
column_types[column] = "binary"
# Check if integer
elif pd.api.types.is_integer_dtype(dtype):
column_types[column] = "integer"
else:
column_types[column] = "float"
# Boolean type
elif pd.api.types.is_bool_dtype(dtype):
column_types[column] = "boolean"
# Fallback
else:
column_types[column] = "unknown"
return column_types
@staticmethod
def suggest_visualizations(df: pd.DataFrame) -> List[Dict[str, Any]]:
"""
Suggest appropriate visualizations based on data types.
Returns a list of visualization suggestions.
"""
suggestions = []
column_types = CSVHelpers.infer_column_types(df)
numeric_columns = [col for col, type in column_types.items()
if type in ["integer", "float"]]
categorical_columns = [col for col, type in column_types.items()
if type in ["categorical", "binary"]]
datetime_columns = [col for col, type in column_types.items()
if type in ["datetime", "potential_datetime"]]
# Histogram for numeric columns
for col in numeric_columns[:3]: # Limit to first 3
suggestions.append({
"chart_type": "histogram",
"column": col,
"title": f"Distribution of {col}"
})
# Bar charts for categorical columns
for col in categorical_columns[:3]: # Limit to first 3
suggestions.append({
"chart_type": "bar",
"x_column": col,
"y_column": "count",
"title": f"Count by {col}"
})
# Time series for datetime + numeric combinations
if datetime_columns and numeric_columns:
suggestions.append({
"chart_type": "line",
"x_column": datetime_columns[0],
"y_column": numeric_columns[0],
"title": f"{numeric_columns[0]} over Time"
})
# Scatter plots for numeric pairs
if len(numeric_columns) >= 2:
suggestions.append({
"chart_type": "scatter",
"x_column": numeric_columns[0],
"y_column": numeric_columns[1],
"title": f"{numeric_columns[1]} vs {numeric_columns[0]}"
})
return suggestions