# File size: 3,662 Bytes
# c158aae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from typing import Dict, List, Optional
from pathlib import Path
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from indexes.csv_index_builder import EnhancedCSVReader

class CSVIndexManager:
    """Manages creation and retrieval of indexes for CSV files.

    Every indexed CSV is identified by its file stem (the file name without
    its extension). For each file the manager keeps the documents returned
    by the CSV reader, a TF-IDF matrix built from each document's
    ``"content"`` entry, and a small record holding the first document's
    ``"metadata"`` entry together with the file path.

    NOTE: the TF-IDF vocabulary is fitted on the first file that gets
    vectorized and is only reused (``transform``) for files indexed
    afterwards, so terms that first appear in later files are not part of
    the vocabulary. ``index_directory`` walks files in sorted order so that
    the file seeding the vocabulary is at least deterministic.
    """

    def __init__(self):
        # Reader used to turn a CSV path into a list of document dicts.
        self.csv_reader = EnhancedCSVReader()
        # file_id -> {"metadata": ..., "path": ...}
        self.indexes = {}
        # TF-IDF model shared by every indexed file and by queries.
        self.vectorizer = TfidfVectorizer()
        # file_id -> TF-IDF matrix (one row per document of that file).
        self.document_vectors = {}
        # file_id -> list of documents as returned by the reader.
        self.all_documents = {}

    def create_index(self, file_path: str) -> bool:
        """Create (or refresh) the index for a single CSV file.

        Args:
            file_path: Path of the CSV file; its stem becomes the file id.

        Returns:
            True when the file was loaded and indexed, False when any step
            failed. On failure the error is printed and the manager's state
            is left exactly as it was before the call.
        """
        # Extract filename as identifier
        file_id = Path(file_path).stem

        # Everything that can fail runs first; the manager's dictionaries are
        # only touched afterwards so a failure never leaves a half-written
        # entry behind.
        try:
            documents = self.csv_reader.load_data(file_path)
            doc_contents = [doc["content"] for doc in documents]
            metadata = documents[0]["metadata"] if documents else {}

            vectors = None
            if doc_contents:
                # The vocabulary belongs to whichever file was vectorized
                # first. Reuse it only when some *other* file already relies
                # on it; otherwise (first file, or re-indexing the only
                # vectorized file) fit it on this file's content.
                vocabulary_in_use = any(
                    other_id != file_id for other_id in self.document_vectors
                )
                if vocabulary_in_use:
                    vectors = self.vectorizer.transform(doc_contents)
                else:
                    vectors = self.vectorizer.fit_transform(doc_contents)
        except Exception as e:
            # Deliberately broad: this method reports failure through its
            # boolean result instead of propagating reader/vectorizer errors.
            print(f"Error creating index for {file_path}: {e}")
            return False

        self.all_documents[file_id] = documents
        if vectors is None:
            # A file without documents has no vectors; drop any left over
            # from an earlier, non-empty version of the same file.
            self.document_vectors.pop(file_id, None)
        else:
            self.document_vectors[file_id] = vectors
        self.indexes[file_id] = {
            "metadata": metadata,
            "path": file_path,
        }
        return True

    def index_directory(self, directory_path: str) -> Dict[str, bool]:
        """Index all CSV files found directly inside a directory.

        Entries are matched case-insensitively on the ``.csv`` suffix and
        processed in sorted name order; entries that are not regular files
        are skipped.

        Args:
            directory_path: Directory to scan (not recursive).

        Returns:
            Mapping of file id (file stem) to whether indexing succeeded.
        """
        indexed_files = {}

        # os.listdir() order is arbitrary; sorting keeps the vocabulary seed
        # (the first vectorized file) stable between runs.
        for entry in sorted(os.listdir(directory_path)):
            if not entry.lower().endswith('.csv'):
                continue
            file_path = os.path.join(directory_path, entry)
            if not os.path.isfile(file_path):
                continue
            indexed_files[Path(file_path).stem] = self.create_index(file_path)

        return indexed_files

    def find_relevant_csvs(self, query: str, top_k: int = 3) -> List[str]:
        """Find the most relevant CSV files for a given query.

        A file's score is the highest cosine similarity between the query's
        TF-IDF vector and any of that file's document vectors.

        Args:
            query: Free-text query.
            top_k: Maximum number of file ids to return.

        Returns:
            Up to ``top_k`` file ids, best match first. Empty when nothing
            has been indexed, when no indexed file has vectors, or when
            ``top_k`` is not positive.
        """
        # Without vectors the vectorizer has never been fitted, so there is
        # nothing to compare against (and transform() would fail).
        if top_k <= 0 or not self.indexes or not self.document_vectors:
            return []

        # Vectorize the query
        query_vector = self.vectorizer.transform([query])

        # Best-matching document decides each file's score.
        similarities = {
            file_id: float(np.max(cosine_similarity(query_vector, vectors)))
            for file_id, vectors in self.document_vectors.items()
        }

        # Sort by similarity and return top_k
        ranked = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
        return [file_id for file_id, _ in ranked[:top_k]]

    def get_documents(self, file_id: str) -> List[Dict]:
        """Get all documents stored for a specific CSV file.

        Args:
            file_id: File stem used when the file was indexed.

        Returns:
            The stored list of documents (the same list object, not a copy),
            or an empty list when the file id is unknown.
        """
        return self.all_documents.get(file_id, [])