File size: 1,508 Bytes
f861dee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

class DocumentRetriever:
    """Retrieve text documents by TF-IDF similarity using a FAISS flat L2 index.

    Typical use: call ``load_documents(directory)`` once, then call
    ``retrieve(query)`` as often as needed. Row ``i`` of the FAISS index
    always corresponds to ``self.documents[i]``.
    """

    def __init__(self):
        # Raw text of every loaded document, in index-row order.
        self.documents = []
        # Fitted (or re-fitted) on the full corpus each time the index is built.
        self.vectorizer = TfidfVectorizer()
        # Stays None until _build_index() succeeds on a non-empty corpus.
        self.index = None

    def load_documents(self, source_dir):
        """Read every ``*.txt`` file directly inside *source_dir* and index it.

        Newly read documents are appended to ``self.documents`` and the index
        is rebuilt over the whole corpus. If *source_dir* is not a directory a
        message is printed and nothing else happens. If no documents are
        available after loading, the index is left untouched rather than
        attempting to fit a vectorizer on an empty corpus.

        Args:
            source_dir: Path (``str`` or path-like) of the directory to scan.

        Returns:
            None.
        """
        from pathlib import Path

        data_dir = Path(source_dir)
        if not data_dir.is_dir():
            print(f"Source directory not found: {source_dir}")
            return

        # Sorted so the document order (and therefore index row numbering)
        # does not depend on filesystem enumeration order.
        for file in sorted(data_dir.glob("*.txt")):
            self.documents.append(file.read_text(encoding="utf-8"))

        print(f"Loaded {len(self.documents)} documents.")

        # Fitting TF-IDF on an empty corpus raises, so only index real content.
        if self.documents:
            self._build_index()

    def _build_index(self):
        """(Re)build the FAISS index from ``self.documents``.

        Re-fits ``self.vectorizer`` on the full corpus. With an empty corpus
        the index is reset to ``None`` so ``retrieve`` reports the
        uninitialised state. Errors raised by the vectorizer or FAISS
        propagate to the caller.
        """
        if not self.documents:
            self.index = None
            return

        # Down-cast while still sparse so the dense matrix is allocated once,
        # directly in the float32 layout FAISS requires.
        tfidf = self.vectorizer.fit_transform(self.documents)
        doc_vectors = tfidf.astype(np.float32).toarray()

        index = faiss.IndexFlatL2(doc_vectors.shape[1])
        index.add(doc_vectors)
        # Publish only after the index is fully populated.
        self.index = index

    def retrieve(self, query, top_k=5):
        """Return up to *top_k* documents closest to *query*, nearest first.

        Args:
            query: Query text; vectorized with the fitted TF-IDF vectorizer.
            top_k: Maximum number of documents to return.

        Returns:
            A list of document strings. If no index has been built, a
            single-element list containing an explanatory message is returned
            (kept for backward compatibility). A non-positive *top_k* yields
            an empty list.
        """
        if self.index is None:
            return ["Document retrieval is not initialized."]

        n_docs = len(self.documents)
        # Never ask for more neighbours than there are documents.
        k = min(top_k, n_docs)
        if k <= 0:
            return []

        # Vectorize the query
        query_vector = self.vectorizer.transform([query]).toarray().astype(np.float32)

        # Perform FAISS search
        _distances, indices = self.index.search(query_vector, k)

        # FAISS marks missing hits with -1; a bare `i < n_docs` test would let
        # -1 through and Python would resolve it to the *last* document.
        return [self.documents[int(i)] for i in indices[0] if 0 <= i < n_docs]