File size: 1,987 Bytes
5bc018f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from copy import deepcopy
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd

class EnhancedCSVReader:
    """Enhanced CSV reader with metadata extraction capabilities.

    Turns a CSV file into a list of document-like dictionaries, each with a
    ``"content"`` string and a ``"metadata"`` dictionary describing the file
    (filename, columns, dtypes, sample values and row count).
    """

    def __init__(self):
        """Initialize the CSV reader (the reader keeps no state)."""

    def load_data(self, file_path: str, max_rows: Optional[int] = 10) -> List[Dict]:
        """Load a CSV file and return row documents plus one schema document.

        Args:
            file_path: Path of the CSV file to read.
            max_rows: Number of leading rows to turn into documents, passed
                to ``DataFrame.head``. Defaults to 10; ``None`` emits a
                document for every row.

        Returns:
            One ``{"content", "metadata"}`` dictionary per sampled row
            (``content`` is ``row.to_string()``), followed by a final schema
            document whose content lists the column names.
        """
        # Parse the file once and share the frame with the metadata step
        # instead of reading the CSV from disk a second time.
        df = pd.read_csv(file_path)
        csv_metadata = self._extract_metadata(file_path, df)

        sampled = df if max_rows is None else df.head(max_rows)

        # deepcopy: the metadata holds nested lists/dicts, so a shallow
        # ``dict.copy()`` would leave every document sharing (and able to
        # corrupt) the same "columns", "dtypes" and "samples" objects.
        documents = [
            {"content": row.to_string(), "metadata": deepcopy(csv_metadata)}
            for _, row in sampled.iterrows()
        ]

        # Schema document; labels are stringified so that non-string column
        # labels cannot make ``str.join`` raise TypeError.
        documents.append(
            {
                "content": f"CSV Schema: {', '.join(map(str, df.columns))}",
                "metadata": deepcopy(csv_metadata),
            }
        )

        return documents

    def _extract_metadata(
        self, file_path: str, df: Optional[pd.DataFrame] = None
    ) -> Dict:
        """Extract descriptive metadata from a CSV file.

        Args:
            file_path: Path of the CSV file; its final component is stored
                as ``"filename"``.
            df: Already-loaded contents of ``file_path``. When omitted the
                file is read with ``pd.read_csv``.

        Returns:
            A dictionary with the keys ``"filename"``, ``"columns"`` (list of
            column labels), ``"dtypes"`` (column -> dtype name),
            ``"samples"`` (column -> up to three non-null values as strings)
            and ``"row_count"``.
        """
        if df is None:
            df = pd.read_csv(file_path)

        columns = df.columns.tolist()
        dtypes = {col: str(df[col].dtype) for col in columns}

        # First 3 non-null values per column, stringified.
        samples = {
            col: [str(val) for val in df[col].dropna().head(3).tolist()]
            for col in columns
        }

        return {
            "filename": Path(file_path).name,
            "columns": columns,
            "dtypes": dtypes,
            "samples": samples,
            "row_count": len(df),
        }