File size: 1,729 Bytes
e13d87a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
from typing import Dict, List, Optional
from pathlib import Path
import pandas as pd
from llama_index.readers.file import CSVReader
from llama_index.schema import Document
class EnhancedCSVReader:
    """Enhanced CSV reader with metadata extraction capabilities.

    Wraps a ``CSVReader`` and merges file-level metadata (file name, column
    names, column dtypes, sample values and row count) into the metadata of
    every document the wrapped reader returns.
    """

    def __init__(self):
        self.csv_reader = CSVReader()

    def load_data(self, file_path: str) -> "List[Document]":
        """Load a CSV file and return its documents enriched with metadata.

        Args:
            file_path: Location of the CSV file to read.

        Returns:
            The documents produced by the wrapped ``CSVReader``; each one has
            the dictionary built by ``_extract_metadata`` merged into its
            ``metadata`` mapping.
        """
        import copy  # function-scope import: only this method needs it

        # Load the CSV file
        documents = self.csv_reader.load_data(file_path)

        # Extract the file-level metadata once; it is the same for every doc
        csv_metadata = self._extract_metadata(file_path)

        # Enhance documents with metadata
        for doc in documents:
            # The metadata holds nested lists/dicts. Give each document its
            # own copy so mutating one document's metadata cannot silently
            # change another's.
            doc.metadata.update(copy.deepcopy(csv_metadata))

        return documents

    def _extract_metadata(self, file_path: str) -> Dict:
        """Extract useful metadata from a CSV file.

        Args:
            file_path: Location of the CSV file to inspect.

        Returns:
            A dictionary with the keys ``"filename"`` (base name of the file),
            ``"columns"`` (list of column names), ``"dtypes"`` (column name ->
            dtype rendered as a string), ``"samples"`` (column name -> up to
            three non-null values rendered as strings) and ``"row_count"``
            (number of data rows). When pandas finds no columns to parse
            (e.g. a zero-byte file) the collections are empty and
            ``"row_count"`` is ``0``.
        """
        filename = Path(file_path).name

        try:
            df = pd.read_csv(file_path)
        except pd.errors.EmptyDataError:
            # pandas raises when there is no header row to parse; report an
            # empty table rather than failing the whole load.
            return {
                "filename": filename,
                "columns": [],
                "dtypes": {},
                "samples": {},
                "row_count": 0,
            }

        # Extract column information
        columns = df.columns.tolist()
        dtypes = {col: str(df[col].dtype) for col in columns}

        # Extract sample values (first 3 non-null values per column)
        samples = {
            col: [str(val) for val in df[col].dropna().head(3).tolist()]
            for col in columns
        }

        return {
            "filename": filename,
            "columns": columns,
            "dtypes": dtypes,
            "samples": samples,
            # Basic statistics
            "row_count": len(df),
        }
|