|
from typing import Dict, List, Optional |
|
from pathlib import Path |
|
import pandas as pd |
|
|
|
class EnhancedCSVReader:
    """Enhanced CSV reader with metadata extraction capabilities.

    Turns a CSV file into a list of plain ``dict`` documents, each carrying
    the text of one row (or the schema summary) together with file-level
    metadata: file name, column names, dtypes, sample values and row count.
    """

    def __init__(self):
        """Initialize the CSV reader (the reader keeps no state)."""

    def load_data(
        self, file_path: str, *, max_rows: Optional[int] = 10
    ) -> List[Dict]:
        """Load a CSV file and build documents with metadata.

        One document is produced per data row (limited to ``max_rows``),
        followed by a single schema document listing the column names.

        Args:
            file_path: Path of the CSV file to read.
            max_rows: Maximum number of leading rows to turn into documents.
                Defaults to 10; pass ``None`` to emit a document for every
                row. The value is forwarded to ``DataFrame.head``.

        Returns:
            A list of ``{"content": str, "metadata": dict}`` documents; the
            last entry is always the schema document.
        """
        # Function-scope import: ``copy`` is not among the module imports.
        from copy import deepcopy

        # Parse the file once and derive the metadata from the same frame,
        # instead of reading the CSV a second time just for the metadata.
        df = pd.read_csv(file_path)
        csv_metadata = self._metadata_from_frame(df, Path(file_path).name)

        rows = df if max_rows is None else df.head(max_rows)

        # Each document gets a deep copy: the metadata holds nested lists and
        # dicts, and a shallow ``dict.copy()`` would leave them shared, so
        # editing one document's metadata would silently change all others.
        documents = [
            {"content": row.to_string(), "metadata": deepcopy(csv_metadata)}
            for _, row in rows.iterrows()
        ]

        documents.append(
            {
                "content": f"CSV Schema: {', '.join(map(str, df.columns))}",
                "metadata": deepcopy(csv_metadata),
            }
        )
        return documents

    def _extract_metadata(self, file_path: str) -> Dict:
        """Extract useful metadata from a CSV file.

        Args:
            file_path: Path of the CSV file to read.

        Returns:
            A dict with the keys ``filename``, ``columns``, ``dtypes``,
            ``samples`` (up to three non-null values per column, as strings)
            and ``row_count``.
        """
        df = pd.read_csv(file_path)
        return self._metadata_from_frame(df, Path(file_path).name)

    @staticmethod
    def _metadata_from_frame(df: "pd.DataFrame", filename: str) -> Dict:
        """Build the metadata dict for an already-parsed CSV frame."""
        columns = df.columns.tolist()
        return {
            "filename": filename,
            "columns": columns,
            "dtypes": {col: str(df[col].dtype) for col in columns},
            # First three non-null values of each column, stringified.
            "samples": {
                col: [str(val) for val in df[col].dropna().head(3).tolist()]
                for col in columns
            },
            "row_count": len(df),
        }
|
|