File size: 1,987 Bytes
5bc018f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import copy
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd
class EnhancedCSVReader:
    """Enhanced CSV reader with metadata extraction capabilities.

    Turns a CSV file into a list of document-like dictionaries, each holding
    a ``"content"`` string and a ``"metadata"`` dictionary describing the
    file (name, columns, dtypes, sample values and row count).
    """

    def __init__(self):
        """Initialize the CSV reader. No state is stored on the instance."""

    def load_data(self, file_path: str, max_rows: int = 10) -> List[Dict]:
        """Load a CSV file and return sampled rows plus a schema document.

        Args:
            file_path: Path of the CSV file to read.
            max_rows: Maximum number of leading rows converted into
                documents. Defaults to 10.

        Returns:
            One document per sampled row (``content`` is the row rendered
            with ``Series.to_string()``), followed by a single schema
            document listing the column names. Every document carries its
            own independent copy of the file metadata.

        Raises:
            ValueError: If ``max_rows`` is negative.
        """
        if max_rows < 0:
            # A negative count would make DataFrame.head() return
            # "all but the last N rows", which is never what a sample
            # size is meant to express.
            raise ValueError("max_rows must be non-negative")

        # Parse the file once and share the frame with the metadata
        # extractor instead of reading the CSV a second time.
        df = pd.read_csv(file_path)
        csv_metadata = self._extract_metadata(file_path, df=df)

        # Deep-copy the metadata per document: it contains nested lists and
        # dicts, so a shallow copy would leave them shared between documents.
        documents = [
            {
                "content": row.to_string(),
                "metadata": copy.deepcopy(csv_metadata),
            }
            for _, row in df.head(max_rows).iterrows()
        ]

        # Trailing schema document describing the column layout.
        documents.append(
            {
                "content": f"CSV Schema: {', '.join(df.columns)}",
                "metadata": copy.deepcopy(csv_metadata),
            }
        )
        return documents

    def _extract_metadata(
        self, file_path: str, df: Optional[pd.DataFrame] = None
    ) -> Dict:
        """Extract descriptive metadata from a CSV file.

        Args:
            file_path: Path of the CSV file; its final component becomes
                the ``"filename"`` entry.
            df: Already-parsed contents of ``file_path``. When omitted the
                file is read with ``pd.read_csv``.

        Returns:
            A dictionary with the keys ``"filename"``, ``"columns"`` (list
            of column names), ``"dtypes"`` (column name -> dtype string),
            ``"samples"`` (column name -> up to three non-null values, as
            strings) and ``"row_count"``.
        """
        if df is None:
            df = pd.read_csv(file_path)

        columns = df.columns.tolist()
        dtypes = {col: str(df[col].dtype) for col in columns}

        # First three non-null values per column, stringified.
        samples = {
            col: [str(val) for val in df[col].dropna().head(3).tolist()]
            for col in columns
        }

        return {
            "filename": Path(file_path).name,
            "columns": columns,
            "dtypes": dtypes,
            "samples": samples,
            "row_count": len(df),
        }
|