File size: 1,729 Bytes
e13d87a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from typing import Dict, List, Optional
from pathlib import Path
import pandas as pd
from llama_index.readers.file import CSVReader
from llama_index.schema import Document

class EnhancedCSVReader:
    """CSV reader that attaches file-level metadata to each loaded document.

    Loading itself is delegated to a ``CSVReader``; pandas is used to read the
    same file a second time and summarise its columns, dtypes, sample values
    and row count.
    """

    def __init__(self):
        """Create the wrapped ``CSVReader`` with its default settings."""
        self.csv_reader = CSVReader()

    def load_data(self, file_path: str) -> "List[Document]":
        """Load a CSV file and return its documents enriched with metadata.

        Args:
            file_path: Path of the CSV file to load.

        Returns:
            The documents produced by the wrapped reader. Each document's
            ``metadata`` mapping is updated in place with the keys produced by
            ``_extract_metadata``.
        """
        documents = self.csv_reader.load_data(file_path)

        # Computed once per file. NOTE: every document receives the same
        # list/dict objects as values, not independent copies.
        csv_metadata = self._extract_metadata(file_path)
        for doc in documents:
            doc.metadata.update(csv_metadata)

        return documents

    def _extract_metadata(self, file_path: str) -> Dict:
        """Summarise a CSV file as a metadata dictionary.

        Args:
            file_path: Path of the CSV file to inspect.

        Returns:
            A dict with the keys:

            * ``filename``: final component of ``file_path``.
            * ``columns``: list of column labels.
            * ``dtypes``: column label -> pandas dtype name.
            * ``samples``: column label -> up to three non-null values,
              each converted with ``str``.
            * ``row_count``: number of data rows.

            A file that contains nothing to parse yields empty ``columns``,
            ``dtypes`` and ``samples`` and a ``row_count`` of 0.
        """
        try:
            df = pd.read_csv(file_path)
        except pd.errors.EmptyDataError:
            # pandas raises when the file has no columns to parse (e.g. a
            # zero-byte file). Describe it as an empty table so that loading
            # is not aborted by the metadata step.
            df = pd.DataFrame()

        columns = df.columns.tolist()

        return {
            "filename": Path(file_path).name,
            "columns": columns,
            "dtypes": {col: str(df[col].dtype) for col in columns},
            # First three non-null values per column, stringified.
            "samples": {
                col: [str(val) for val in df[col].dropna().head(3).tolist()]
                for col in columns
            },
            "row_count": len(df),
        }