import copy
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd


class EnhancedCSVReader:
    """Enhanced CSV reader with metadata extraction capabilities.

    Each loaded CSV is turned into a list of document-like dictionaries of
    the form ``{"content": str, "metadata": dict}``: one per sampled row,
    followed by a single schema document listing the column names.
    """

    def __init__(self):
        """Initialize the CSV reader (the reader keeps no state)."""

    def load_data(
        self, file_path: str, *, max_rows: Optional[int] = 10
    ) -> List[Dict]:
        """Load a CSV file and return row documents plus a schema document.

        Args:
            file_path: Path of the CSV file to read.
            max_rows: Number of leading rows to convert into documents.
                Defaults to 10. Pass ``None`` to convert every row.

        Returns:
            A list with one document per sampled row (``content`` is the
            row rendered by ``Series.to_string()``), followed by one schema
            document whose ``content`` is ``"CSV Schema: <col>, <col>, ..."``.
            Every document carries its own independent copy of the file
            metadata produced by ``_extract_metadata``.

        Raises:
            Whatever ``pandas.read_csv`` raises for a missing, empty or
            malformed file.
        """
        # Parse the file once and share the frame with the metadata step.
        df = pd.read_csv(file_path)
        csv_metadata = self._extract_metadata(file_path, df=df)

        sampled = df if max_rows is None else df.head(max_rows)

        # Deep-copy the metadata: it contains nested lists/dicts, and a
        # shallow copy would let a change made through one document leak
        # into every other document.
        documents = [
            {
                "content": row.to_string(),
                "metadata": copy.deepcopy(csv_metadata),
            }
            for _, row in sampled.iterrows()
        ]

        # Trailing schema document; str() guards against non-string labels.
        documents.append(
            {
                "content": f"CSV Schema: {', '.join(map(str, df.columns))}",
                "metadata": copy.deepcopy(csv_metadata),
            }
        )
        return documents

    def _extract_metadata(
        self, file_path: str, df: Optional[pd.DataFrame] = None
    ) -> Dict:
        """Extract descriptive metadata for a CSV file.

        Args:
            file_path: Path of the CSV file; its final path component is
                reported as ``filename``.
            df: Already-parsed contents of ``file_path``. When omitted, the
                file is read here with ``pandas.read_csv``.

        Returns:
            A dictionary with the keys:

            * ``filename`` - file name without directories.
            * ``columns`` - list of column labels.
            * ``dtypes`` - mapping of column label to dtype name.
            * ``samples`` - mapping of column label to the string form of
              its first three non-null values.
            * ``row_count`` - number of data rows.
        """
        if df is None:
            df = pd.read_csv(file_path)

        columns = df.columns.tolist()
        dtypes = {col: str(df[col].dtype) for col in columns}

        # First 3 non-null values per column, stringified for portability.
        samples = {
            col: [str(val) for val in df[col].dropna().head(3).tolist()]
            for col in columns
        }

        return {
            "filename": Path(file_path).name,
            "columns": columns,
            "dtypes": dtypes,
            "samples": samples,
            "row_count": len(df),
        }