|
from typing import Dict, List, Optional |
|
from pathlib import Path |
|
import os |
|
import pandas as pd |
|
import numpy as np |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
from indexes.csv_index_builder import EnhancedCSVReader |
|
|
|
class CSVIndexManager:
    """Manages creation and retrieval of TF-IDF indexes for CSV files.

    Each indexed file is identified by its file stem (``Path(path).stem``),
    so two files with the same stem in different directories share one
    entry and the most recently indexed one wins.

    Attributes:
        csv_reader: Reader whose ``load_data(path)`` yields the documents of
            a CSV file. Each document is accessed as a mapping with
            ``"content"`` and ``"metadata"`` keys.
        indexes: ``file_id -> {"metadata": ..., "path": ...}`` for every
            successfully indexed file.
        vectorizer: TF-IDF vectorizer, fitted on the content of *all*
            indexed files.
        document_vectors: ``file_id -> TF-IDF matrix`` (one row per
            document) for every indexed file that has at least one document.
        all_documents: ``file_id -> list of documents``.
    """

    def __init__(self):
        self.csv_reader = EnhancedCSVReader()
        self.indexes = {}
        self.vectorizer = TfidfVectorizer()
        self.document_vectors = {}
        self.all_documents = {}

    def _rebuild_vectors(self, corpus: Dict[str, List[Dict]]) -> Dict:
        """Refit the vectorizer on ``corpus`` and return per-file matrices.

        The vocabulary and IDF weights must come from every indexed file;
        fitting on the first file only would silently drop all terms that
        appear solely in later files.

        Args:
            corpus: ``file_id -> documents`` for every file to be indexed.

        Returns:
            ``file_id -> TF-IDF matrix`` for each file with at least one
            document. Empty when the corpus holds no documents at all (the
            vectorizer is left untouched in that case).
        """
        texts: List[str] = []
        row_ranges: Dict[str, slice] = {}
        for file_id, documents in corpus.items():
            contents = [doc["content"] for doc in documents]
            if contents:
                start = len(texts)
                texts.extend(contents)
                row_ranges[file_id] = slice(start, len(texts))

        if not texts:
            return {}

        # One fit over the whole corpus, then hand each file its own rows.
        matrix = self.vectorizer.fit_transform(texts)
        return {file_id: matrix[rows] for file_id, rows in row_ranges.items()}

    def create_index(self, file_path: str) -> bool:
        """Create (or replace) the index for a CSV file.

        The vectorizer is refitted on the full corpus on every call so that
        all per-file vectors share one vocabulary. The cost of a call
        therefore grows with the total number of indexed documents.

        Args:
            file_path: Path of the CSV file to index.

        Returns:
            ``True`` on success. ``False`` if loading or vectorizing failed;
            in that case the error is printed and the manager's state is
            left as it was before the call.
        """
        file_id = Path(file_path).stem

        # Best-effort boundary: any failure is reported and turned into
        # ``False`` so that one bad file does not abort a directory scan.
        try:
            documents = self.csv_reader.load_data(file_path)
            entry = {
                "metadata": documents[0]["metadata"] if documents else {},
                "path": file_path,
            }
            corpus = {**self.all_documents, file_id: documents}
            vectors = self._rebuild_vectors(corpus)
        except Exception as e:
            print(f"Error creating index for {file_path}: {e}")
            return False

        # Commit only after everything succeeded, so a failure never leaves
        # a half-registered file or stale vectors behind.
        self.all_documents[file_id] = documents
        self.document_vectors = vectors
        self.indexes[file_id] = entry
        return True

    def index_directory(self, directory_path: str) -> Dict[str, bool]:
        """Index all CSV files directly inside a directory.

        Files are processed in sorted name order so runs are reproducible.

        Args:
            directory_path: Directory to scan (not recursive). Entries whose
                name ends with ``.csv`` (case-insensitive) are indexed.

        Returns:
            ``file_id -> success flag`` for every CSV file found.

        Raises:
            OSError: If the directory cannot be listed.
        """
        csv_names = sorted(
            name
            for name in os.listdir(directory_path)
            if name.lower().endswith('.csv')
        )

        indexed_files = {}
        for name in csv_names:
            file_path = os.path.join(directory_path, name)
            indexed_files[Path(file_path).stem] = self.create_index(file_path)
        return indexed_files

    def find_relevant_csvs(self, query: str, top_k: int = 3) -> List[str]:
        """Find the most relevant CSV files for a given query.

        A file's score is the highest cosine similarity between the query
        and any single document of that file.

        Args:
            query: Free-text query.
            top_k: Maximum number of file ids to return.

        Returns:
            Up to ``top_k`` file ids, best match first. Empty when nothing
            has been indexed, when no indexed file has any document (the
            vectorizer is unfitted then), or when ``top_k`` is not positive.
        """
        if not self.indexes or not self.document_vectors or top_k <= 0:
            return []

        query_vector = self.vectorizer.transform([query])

        similarities = {
            file_id: np.max(cosine_similarity(query_vector, vectors))
            for file_id, vectors in self.document_vectors.items()
        }

        ranked = sorted(
            similarities.items(), key=lambda item: item[1], reverse=True
        )
        return [file_id for file_id, _ in ranked[:top_k]]

    def get_documents(self, file_id: str) -> List[Dict]:
        """Get all documents for a specific CSV file.

        Args:
            file_id: File stem used when the file was indexed.

        Returns:
            The stored documents, or an empty list for an unknown id.
        """
        return self.all_documents.get(file_id, [])
|
|