ChatCSV / indexes /csv_index_builder_bk.py
Chamin09's picture
Rename indexes/csv_index_builder.py to indexes/csv_index_builder_bk.py
fd61747 verified
from typing import Dict, List, Optional
from pathlib import Path
import pandas as pd
from llama_index.readers.file import CSVReader
from llama_index.schema import Document
class EnhancedCSVReader:
"""Enhanced CSV reader with metadata extraction capabilities."""
def __init__(self):
self.csv_reader = CSVReader()
def load_data(self, file_path: str) -> List[Document]:
"""Load CSV file and extract documents with metadata."""
# Load the CSV file
documents = self.csv_reader.load_data(file_path)
# Extract and add metadata
csv_metadata = self._extract_metadata(file_path)
# Enhance documents with metadata
for doc in documents:
doc.metadata.update(csv_metadata)
return documents
def _extract_metadata(self, file_path: str) -> Dict:
"""Extract useful metadata from CSV file."""
df = pd.read_csv(file_path)
filename = Path(file_path).name
# Extract column information
columns = df.columns.tolist()
dtypes = {col: str(df[col].dtype) for col in columns}
# Extract sample values (first 3 non-null values per column)
samples = {}
for col in columns:
non_null_values = df[col].dropna().head(3).tolist()
samples[col] = [str(val) for val in non_null_values]
# Basic statistics
row_count = len(df)
return {
"filename": filename,
"columns": columns,
"dtypes": dtypes,
"samples": samples,
"row_count": row_count
}