Spaces:
Running
Running
File size: 1,508 Bytes
f861dee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
class DocumentRetriever:
    """Retrieve text documents by TF-IDF similarity using a flat FAISS L2 index.

    Documents are read from ``*.txt`` files, vectorized with a
    ``TfidfVectorizer`` and added to a ``faiss.IndexFlatL2``. Queries are
    vectorized with the same fitted vectorizer and answered by a
    nearest-neighbour search over that index.
    """

    def __init__(self):
        # Raw text of each loaded document; position i corresponds to the
        # i-th vector added to the index.
        self.documents = []
        self.vectorizer = TfidfVectorizer()
        # Remains None until at least one document has been indexed.
        self.index = None

    def load_documents(self, source_dir):
        """Read every ``*.txt`` file in *source_dir* and rebuild the index.

        Newly read documents are appended to any that were loaded earlier,
        and the index is rebuilt over the full collection.

        Args:
            source_dir: Path of the directory holding the ``.txt`` files.

        Returns:
            None. Prints a message and leaves the retriever unchanged when
            *source_dir* is not an existing directory.
        """
        from pathlib import Path

        data_dir = Path(source_dir)
        if not data_dir.is_dir():
            print(f"Source directory not found: {source_dir}")
            return

        # Sort the paths so document order (and therefore index ids) does
        # not depend on filesystem enumeration order.
        for file in sorted(data_dir.glob("*.txt")):
            self.documents.append(file.read_text(encoding="utf-8"))
        print(f"Loaded {len(self.documents)} documents.")

        # Create the FAISS index
        self._build_index()

    def _build_index(self):
        """Fit the vectorizer on all documents and build a fresh FAISS index.

        When there are no documents the index is left as ``None`` so that
        ``retrieve`` reports the retriever as uninitialized instead of the
        vectorizer failing on an empty corpus.
        """
        if not self.documents:
            self.index = None
            return

        # Generate dense TF-IDF vectors; the index is fed float32 arrays.
        doc_vectors = self.vectorizer.fit_transform(self.documents).toarray()

        # Create FAISS index sized to the TF-IDF vocabulary.
        self.index = faiss.IndexFlatL2(doc_vectors.shape[1])
        self.index.add(doc_vectors.astype(np.float32))

    def retrieve(self, query, top_k=5):
        """Return up to *top_k* documents closest to *query*.

        Args:
            query: Query text to vectorize and search for.
            top_k: Maximum number of documents to return.

        Returns:
            A list of document texts in the order returned by the index
            search. If no index has been built, a single-element list with
            an explanatory message. An empty list when *top_k* is not
            positive.
        """
        if self.index is None:
            return ["Document retrieval is not initialized."]

        doc_count = len(self.documents)
        # Never ask for more neighbours than there are documents: FAISS pads
        # missing results with the label -1.
        k = min(top_k, doc_count)
        if k <= 0:
            return []

        # Vectorize the query
        query_vector = self.vectorizer.transform([query]).toarray().astype(np.float32)

        # Perform FAISS search
        _, indices = self.index.search(query_vector, k)

        # Reject negative labels explicitly; a -1 would otherwise index the
        # list from the end and return the last document as a bogus match.
        return [self.documents[i] for i in indices[0] if 0 <= i < doc_count]
|