# ChatCSV/indexes/index_manager.py
# Author: Chamin09 (commit c158aae, verified)
from typing import Dict, List, Optional
from pathlib import Path
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from indexes.csv_index_builder import EnhancedCSVReader
class CSVIndexManager:
    """Manages creation and retrieval of TF-IDF indexes for CSV files.

    Each CSV file is loaded into a list of document dicts (via
    ``EnhancedCSVReader.load_data``) of the form
    ``{"content": str, "metadata": dict}``. Document contents are vectorized
    with a shared :class:`TfidfVectorizer` so that free-text queries can be
    matched to the most relevant CSV files by cosine similarity.
    """

    def __init__(self):
        self.csv_reader = EnhancedCSVReader()
        # file_id -> {"metadata": dict, "path": str}
        self.indexes: Dict[str, Dict] = {}
        self.vectorizer = TfidfVectorizer()
        # file_id -> sparse TF-IDF matrix (one row per document)
        self.document_vectors: Dict = {}
        # file_id -> list of document dicts as returned by the reader
        self.all_documents: Dict[str, List[Dict]] = {}

    def _rebuild_vectors(self) -> None:
        """Refit the vectorizer over every stored document.

        The original implementation fit the vocabulary on the first file
        only and merely transformed later files, so terms unique to later
        files were silently dropped. Refitting over the full corpus keeps
        the vocabulary consistent across all indexed CSVs.
        """
        corpus: List[str] = []
        spans: List = []  # (file_id, start_row, end_row) into the matrix
        for fid, docs in self.all_documents.items():
            contents = [doc["content"] for doc in docs]
            spans.append((fid, len(corpus), len(corpus) + len(contents)))
            corpus.extend(contents)

        self.document_vectors = {}
        if not corpus:
            return  # nothing to fit; leave vectors empty

        matrix = self.vectorizer.fit_transform(corpus)
        for fid, start, end in spans:
            if end > start:  # skip files that produced no documents
                self.document_vectors[fid] = matrix[start:end]

    def create_index(self, file_path: str) -> bool:
        """Create (or refresh) the index for a single CSV file.

        Args:
            file_path: Path to the CSV file; its stem is used as file_id.

        Returns:
            True on success, False if loading or vectorization failed
            (the error is printed, not raised, to keep bulk indexing going).
        """
        file_id = Path(file_path).stem
        try:
            documents = self.csv_reader.load_data(file_path)
            self.all_documents[file_id] = documents
            # Refit over the whole corpus so every file's terms are represented.
            self._rebuild_vectors()
            self.indexes[file_id] = {
                "metadata": documents[0]["metadata"] if documents else {},
                "path": file_path,
            }
            return True
        except Exception as e:
            print(f"Error creating index for {file_path}: {e}")
            return False

    def index_directory(self, directory_path: str) -> Dict[str, bool]:
        """Index all CSV files in a directory.

        Returns:
            Mapping of file_id (filename stem) -> success flag.
        """
        indexed_files: Dict[str, bool] = {}
        csv_files = [f for f in os.listdir(directory_path)
                     if f.lower().endswith('.csv')]
        for csv_file in csv_files:
            file_path = os.path.join(directory_path, csv_file)
            file_id = Path(file_path).stem
            indexed_files[file_id] = self.create_index(file_path)
        return indexed_files

    def find_relevant_csvs(self, query: str, top_k: int = 3) -> List[str]:
        """Return the file_ids of the top_k CSVs most relevant to *query*.

        Relevance of a file is the maximum cosine similarity between the
        query vector and any document vector belonging to that file.
        Returns an empty list if nothing has been indexed yet.
        """
        # Guard both dicts: a file may be registered in self.indexes while
        # vectorization failed or produced no documents.
        if not self.indexes or not self.document_vectors:
            return []
        query_vector = self.vectorizer.transform([query])
        similarities: Dict[str, float] = {}
        for file_id, vectors in self.document_vectors.items():
            file_similarities = cosine_similarity(query_vector, vectors).flatten()
            similarities[file_id] = np.max(file_similarities)
        sorted_files = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
        return [file_id for file_id, _ in sorted_files[:top_k]]

    def get_documents(self, file_id: str) -> List[Dict]:
        """Return all document dicts for *file_id*, or [] if unknown."""
        return self.all_documents.get(file_id, [])