# ChatCSV / indexes / csv_index_builder.py
# Chamin09: "Create csv_index_builder.py" (commit 5bc018f, verified)
import copy
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd
class EnhancedCSVReader:
    """Enhanced CSV reader with metadata extraction capabilities.

    Turns a CSV file into a list of document-like dictionaries, each with a
    ``"content"`` string and a ``"metadata"`` dictionary describing the file
    (filename, columns, dtypes, sample values and row count).
    """

    def __init__(self):
        """Initialize the CSV reader (no configuration is stored)."""

    def load_data(
        self, file_path: str, *, max_rows: Optional[int] = 10
    ) -> List[Dict]:
        """Load a CSV file and return row documents plus a schema document.

        Args:
            file_path: Path of the CSV file to read with ``pandas.read_csv``.
            max_rows: Number of leading rows to convert into documents.
                Defaults to 10; ``None`` converts every row.

        Returns:
            One ``{"content", "metadata"}`` dictionary per sampled row (the
            content is ``row.to_string()``), followed by a final schema
            document whose content is ``"CSV Schema: <col>, <col>, ..."``.

        Raises:
            ValueError: If ``max_rows`` is negative.
        """
        if max_rows is not None and max_rows < 0:
            raise ValueError("max_rows must be non-negative or None")

        # Parse the file once; the same frame feeds both the metadata and
        # the row documents.
        df = pd.read_csv(file_path)
        csv_metadata = self._build_metadata(df, Path(file_path).name)

        sampled = df if max_rows is None else df.head(max_rows)

        # Deep-copy the metadata per document: it holds nested lists/dicts,
        # and a shallow copy would leave them shared between documents.
        documents = [
            {
                "content": row.to_string(),
                "metadata": copy.deepcopy(csv_metadata),
            }
            for _, row in sampled.iterrows()
        ]

        # Trailing schema document; labels are stringified so the join does
        # not depend on every column label being a str.
        documents.append(
            {
                "content": f"CSV Schema: {', '.join(map(str, df.columns))}",
                "metadata": copy.deepcopy(csv_metadata),
            }
        )
        return documents

    def _extract_metadata(self, file_path: str) -> Dict:
        """Read the CSV at ``file_path`` and return its metadata dictionary.

        Returns:
            A dictionary with the keys ``filename``, ``columns``, ``dtypes``,
            ``samples`` and ``row_count``.
        """
        return self._build_metadata(
            pd.read_csv(file_path), Path(file_path).name
        )

    @staticmethod
    def _build_metadata(df: "pd.DataFrame", filename: str) -> Dict:
        """Build the metadata dictionary from an already-loaded frame."""
        columns = df.columns.tolist()
        return {
            "filename": filename,
            "columns": columns,
            "dtypes": {col: str(df[col].dtype) for col in columns},
            # First 3 non-null values per column, stringified.
            "samples": {
                col: [str(val) for val in df[col].dropna().head(3).tolist()]
                for col in columns
            },
            "row_count": len(df),
        }