import os
import glob
import json
import numpy as np
import faiss

from sentence_transformers import SentenceTransformer
from nltk import sent_tokenize
import nltk

# Download the NLTK sentence tokenizer models if they are not already present
# (needed only once per environment; newer NLTK versions use "punkt_tab").
for resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{resource}")
    except LookupError:
        nltk.download(resource)

# Configuration
RAG_DIR = "/home/user/app/RAG"
DATA_DIR = os.path.join(RAG_DIR, "data")
FAISS_INDEX_DIR = os.path.join(RAG_DIR, "FAISS")
CONTEXT_FAISS_INDEX_PATH = os.path.join(FAISS_INDEX_DIR, "context_index.faiss")
CONTEXT_JSON_TEXT_PATH = os.path.join(FAISS_INDEX_DIR, "context_texts.json")
EMBEDDING_MODEL_NAME = "nomic-ai/nomic-embed-text-v2-moe"


def _load_embedding_model() -> SentenceTransformer:
    """
    Initializes and loads the specified SentenceTransformer embedding model.

    This model is used to convert text into numerical vectors (embeddings),
    which are essential for similarity search in the FAISS index.

    Returns:
        SentenceTransformer: An instance of the loaded SentenceTransformer model.
    """
    print(f"Carregando modelo de embeddings {EMBEDDING_MODEL_NAME}...")
    return SentenceTransformer(EMBEDDING_MODEL_NAME, trust_remote_code=True)


def _load_existing_index_and_documents() -> tuple[list | None, faiss.Index | None]:
    """
    Attempts to load an existing FAISS index and its associated text documents
    if the index and JSON files already exist in the FAISS_INDEX_DIR.

    This function checks for persisted data to avoid costly recreation
    of the index with each initialization if the underlying data has not changed.

    Returns:
        tuple[list | None, faiss.Index | None]: A tuple containing the list of documents
                                                 and the FAISS index object if both are
                                                 successfully loaded. Otherwise,
                                                 returns (None, None).
    """
    if os.path.exists(CONTEXT_FAISS_INDEX_PATH) and os.path.exists(CONTEXT_JSON_TEXT_PATH):
        print("Carregando índice e documentos existentes...")
        try:
            faiss_index = faiss.read_index(CONTEXT_FAISS_INDEX_PATH)
            with open(CONTEXT_JSON_TEXT_PATH, "r", encoding="utf-8") as f:
                loaded_documents = json.load(f)
            print(f"Carregados {len(loaded_documents)} documentos do índice existente.")
            return loaded_documents, faiss_index
        except Exception as e:
            print(f"Erro ao carregar índice ou documentos existentes: {e}. Reconstruindo.")
            return None, None
    return None, None

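# A hedged, self-contained sketch (never called by the pipeline) of the same
# save/load cycle used for persistence below: write a tiny IndexFlatL2 to disk
# and read it back. The dimension (4) and data are made up for demonstration.
def _example_index_round_trip() -> None:
    import tempfile

    vectors = np.ones((3, 4), dtype=np.float32)
    index = faiss.IndexFlatL2(4)
    index.add(vectors)
    with tempfile.NamedTemporaryFile(suffix=".faiss") as tmp:
        faiss.write_index(index, tmp.name)
        restored = faiss.read_index(tmp.name)
    assert restored.ntotal == 3  # all three vectors survive the round trip
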

def _load_source_documents() -> list[str]:
    """
    Loads and preprocesses text documents from the data folder (DATA_DIR).

    This function searches for all '.txt' files in the specified directory,
    reads their contents, and splits them into context units (paragraphs or blocks
    separated by blank lines, i.e. double newlines). Empty chunks are filtered out.

    Returns:
        list[str]: A list of strings, where each string is a context unit
                   extracted from the documents.

    Raises:
        ValueError: If no '.txt' files are found in the data directory
                    or if no valid documents are loaded after processing.
    """
    file_paths = glob.glob(os.path.join(DATA_DIR, "*.txt"))
    if not file_paths:
        raise ValueError(f"Nenhum arquivo .txt encontrado em {DATA_DIR}. Por favor, adicione documentos.")

    context_chunks = []
    for file_path in file_paths:
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                # Splits by double newline, strips whitespace, and filters out empty strings
                context_chunks.extend(list(filter(None, map(str.strip, f.read().split("\n\n")))))
        except Exception as e:
            print(f"Erro ao ler o arquivo {file_path}: {e}")
            continue

    if not context_chunks:
        raise ValueError("Nenhum documento válido foi carregado após o processamento dos arquivos.")

    print(f"Carregados {len(context_chunks)} documentos.")
    return context_chunks

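# A minimal, self-contained sketch (never called by the pipeline) of the
# double-newline chunking rule used above, applied to an in-memory string
# instead of a file. The sample text is made up for demonstration.
def _example_paragraph_chunking() -> None:
    raw = "First paragraph.\n\nSecond paragraph.\n\n\n\nThird."
    # Same split/strip/filter pipeline as _load_source_documents
    chunks = list(filter(None, map(str.strip, raw.split("\n\n"))))
    assert chunks == ["First paragraph.", "Second paragraph.", "Third."]
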

def _generate_text_embeddings(embedder_model: SentenceTransformer, text_documents: list[str]) -> np.ndarray:
    """
    Generates numerical embeddings for a list of text documents using the provided embedder.

    Embeddings are vector representations of text that capture its semantic meaning,
    allowing for similarity comparison.

    Args:
        embedder_model (SentenceTransformer): The pre-loaded embedding model.
        text_documents (list[str]): The list of text strings for which to generate embeddings.

    Returns:
        np.ndarray: A NumPy array of type float32 containing the generated embeddings.
                    Each row in the array corresponds to the embedding of a document.

    Raises:
        ValueError: If no embeddings can be generated (e.g., empty document list).
    """
    print("Gerando embeddings para os documentos...")
    batch_size = 32
    generated_embeddings_list = []
    for i in range(0, len(text_documents), batch_size):
        batch = text_documents[i : i + batch_size]
        try:
            if batch:  # Ensure the batch is not empty
                generated_embeddings_list.extend(embedder_model.encode(batch, show_progress_bar=False))
        except Exception as e:
            print(f"Erro ao gerar embeddings para lote {i//batch_size if batch_size > 0 else i}: {e}")
            # In case of error, fill with zero vectors of the correct dimension
            embedding_dim = embedder_model.get_sentence_embedding_dimension()
            generated_embeddings_list.extend([np.zeros(embedding_dim) for _ in batch])

    if not generated_embeddings_list:
        raise ValueError("Nenhum embedding foi gerado.")

    return np.array(generated_embeddings_list, dtype=np.float32)

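# A hedged sanity sketch (not called anywhere) of the shape/dtype contract of
# _generate_text_embeddings, on two made-up sentences. Note this loads the
# real embedding model, so it may download weights on first use.
def _example_embedding_shape_check() -> None:
    embedder = _load_embedding_model()
    vectors = _generate_text_embeddings(embedder, ["first sentence.", "second sentence."])
    assert vectors.shape == (2, embedder.get_sentence_embedding_dimension())
    assert vectors.dtype == np.float32
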

def _create_faiss_index(document_embeddings: np.ndarray) -> faiss.Index:
    """
    Creates and populates a FAISS (Facebook AI Similarity Search) index from a set of embeddings.

    The FAISS index is a data structure optimized for performing efficient similarity searches
    in large collections of vectors.

    Args:
        document_embeddings (np.ndarray): A NumPy array containing the document embeddings.

    Returns:
        faiss.Index: The populated FAISS index object, ready for searches.
    """
    print("Criando índice FAISS...")
    dimension = document_embeddings.shape[1]
    # IndexFlatL2 uses Euclidean distance (L2) for similarity.
    # Smaller distances indicate greater similarity.
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(document_embeddings)
    return faiss_index

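# A minimal sanity sketch (illustrative only, never invoked): builds an
# IndexFlatL2 over random vectors with a made-up dimension of 8 and checks
# that a vector retrieves itself as its own nearest neighbor.
def _demo_index_sanity_check() -> None:
    rng = np.random.default_rng(0)
    demo_vectors = rng.random((10, 8), dtype=np.float32)
    demo_index = _create_faiss_index(demo_vectors)
    _, neighbor_ids = demo_index.search(demo_vectors[:1], 3)
    assert neighbor_ids[0][0] == 0  # an exact match has L2 distance 0
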

def initialize_rag_system() -> tuple[list[str], faiss.Index, SentenceTransformer]:
    """
    Initializes the complete RAG (Retrieval Augmented Generation) system.

    This process involves:
    1. Creating necessary directories for persistence.
    2. Loading the embedding model.
    3. Attempting to load an existing FAISS index and documents.
    4. If they don't exist or are corrupted, load documents from disk,
       generate their embeddings, create a new FAISS index, and save them for future use.

    Returns:
        tuple[list[str], faiss.Index, SentenceTransformer]: A tuple containing:
                                                             - The list of text documents.
                                                             - The FAISS index object.
                                                             - The loaded SentenceTransformer model.
    """
    print("Inicializando RAG...")
    os.makedirs(FAISS_INDEX_DIR, exist_ok=True)
    text_embedder = _load_embedding_model()
    context_documents, faiss_index = _load_existing_index_and_documents()

    if faiss_index is None:  # If the index doesn't exist or an error occurred loading it, rebuild
        print("Índice FAISS não encontrado ou corrompido. Reconstruindo...")
        context_documents = _load_source_documents()
        document_embeddings = _generate_text_embeddings(text_embedder, context_documents)
        faiss_index = _create_faiss_index(document_embeddings)

        faiss.write_index(faiss_index, CONTEXT_FAISS_INDEX_PATH)
        with open(CONTEXT_JSON_TEXT_PATH, "w", encoding="utf-8") as f:
            json.dump(context_documents, f, ensure_ascii=False, indent=4)
        print("New index and documents saved successfully.")

    return context_documents, faiss_index, text_embedder


def search_with_full_query(
    full_question_text: str,
    context_documents: list[str],
    faiss_index: faiss.Index,
    embedder_model: SentenceTransformer,
    k_results: int = 3,
) -> list[tuple[int, str, float]]:
    """
    Searches for the 'k_results' most relevant documents for the **entire question**,
    treating it as a single search unit. This function does not segment the question into sentences.

    Args:
        full_question_text (str): The complete user question.
        context_documents (list[str]): List of strings, where each string is a context document.
        faiss_index (faiss.Index): The loaded FAISS index containing document embeddings.
        embedder_model (SentenceTransformer): The embedding model used to encode the question.
        k_results (int, optional): The number of most relevant documents to return. Defaults to 3.

    Returns:
        list[tuple[int, str, float]]: A list of tuples, where each tuple contains:
                                        - The original index of the document in `context_documents`.
                                        - The text of the document.
                                        - The similarity distance (lower means more similar).
                                      Returns an empty list if the question is empty or an error occurs.
    """
    if not full_question_text or not full_question_text.strip():
        print("Pergunta vazia. Não é possível buscar contexto completo.")
        return []

    print(f"Buscando contexto completo para: '{full_question_text}'")
    try:
        query_embedding = np.array(embedder_model.encode([full_question_text]), dtype=np.float32)
        # D: distances, I: indices of neighbors
        distances, indices = faiss_index.search(query_embedding, k_results)

        results_list = []
        for j in range(len(indices[0])):
            document_index = indices[0][j]
            distance_score = distances[0][j]
            # Ensure the index is valid before adding
            if 0 <= document_index < len(context_documents):
                results_list.append((document_index, context_documents[document_index], distance_score))

        # FAISS results are already sorted by increasing distance (most similar first).
        return results_list

    except Exception as e:
        print(f"Erro ao buscar contexto completo: {e}")
        return []

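# A hedged end-to-end usage sketch (defined but never invoked automatically):
# builds or loads the index, then runs a full-query search. The sample
# question is made up; real use requires .txt files under DATA_DIR.
def _example_full_query_usage() -> None:
    docs, index, embedder = initialize_rag_system()
    hits = search_with_full_query("What are the opening hours?", docs, index, embedder, k_results=3)
    for doc_idx, text, dist in hits:
        print(f"doc {doc_idx} (distance {dist:.4f}): {text[:80]}")
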

def search_with_multiple_sentences(
    question_text: str,
    context_documents: list[str],
    faiss_index: faiss.Index,
    embedder_model: SentenceTransformer,
    k_per_sentence: int = 2,
) -> list[tuple[int, str, float]]:
    """
    Segments the question into sentences and searches for the 'k_per_sentence' most relevant
    documents for **EACH sentence**, then consolidates and returns only unique contexts.
    If a document is relevant to multiple sentences, the lowest distance (best relevance) is kept.

    Args:
        question_text (str): The user question, which may contain multiple sentences.
        context_documents (list[str]): List of strings, where each string is a context document.
        faiss_index (faiss.Index): The loaded FAISS index containing document embeddings.
        embedder_model (SentenceTransformer): The embedding model used to encode sentences.
        k_per_sentence (int, optional): The number of documents to search for each sentence
                                     of the question. Defaults to 2.

    Returns:
        list[tuple[int, str, float]]: A list of tuples (document_index, document_text, distance)
                                      with the most relevant unique contexts, sorted by distance
                                      (most relevant to least relevant). Returns an empty list
                                      if the question is empty or no sentences are detected.
    """
    if not question_text or not question_text.strip():
        print("Pergunta vazia. Não é possível buscar múltiplos contextos.")
        return []

    print(f"Buscando múltiplos contextos para: '{question_text}'")

    sentences = sent_tokenize(question_text, language="portuguese")
    if not sentences:
        print("Nenhuma frase detectada na pergunta para busca de múltiplos contextos.")
        return []

    # Dictionary to store the best result for each unique document:
    # {document_index: (document_index, text, distance)}
    # This ensures uniqueness and that the lowest distance (best relevance) is maintained.
    consolidated_contexts_map = {}

    try:
        for sentence in sentences:
            print(f"Processando frase para múltiplos contextos: '{sentence}'")
            if not sentence.strip():  # Skip empty sentences that might be produced by sent_tokenize
                continue
            query_embedding = np.array(embedder_model.encode([sentence]), dtype=np.float32)
            distances, indices = faiss_index.search(query_embedding, k_per_sentence)

            for j in range(len(indices[0])):
                document_index = indices[0][j]
                distance_score = distances[0][j]

                if 0 <= document_index < len(context_documents):
                    # If the document has already been found, update if the new distance is smaller (more relevant)
                    if (
                        document_index not in consolidated_contexts_map
                        or distance_score < consolidated_contexts_map[document_index][2]
                    ):
                        consolidated_contexts_map[document_index] = (
                            document_index,
                            context_documents[document_index],
                            distance_score,
                        )

        # Convert the dictionary of consolidated contexts back to a list
        results_list = list(consolidated_contexts_map.values())
        # Sort the final results by distance (from most relevant to least)
        results_list.sort(key=lambda x: x[2])

        return results_list

    except Exception as e:
        print(f"Erro ao buscar múltiplos contextos: {e}")
        return []

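# A minimal sketch (illustrative only, never invoked) of the
# keep-lowest-distance deduplication rule used above, applied to hand-made
# (index, text, distance) hits instead of real FAISS output.
def _example_consolidation_rule() -> None:
    hits = [(0, "doc A", 0.9), (1, "doc B", 0.4), (0, "doc A", 0.3)]
    best: dict[int, tuple[int, str, float]] = {}
    for idx, text, dist in hits:
        # Keep only the best (smallest) distance seen for each document
        if idx not in best or dist < best[idx][2]:
            best[idx] = (idx, text, dist)
    assert sorted(best.values(), key=lambda t: t[2]) == [(0, "doc A", 0.3), (1, "doc B", 0.4)]
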

# --- Test Functions ---
def test_context_search_interactive():
    """
    Interactive test function to demonstrate context search capabilities.

    Allows the user to type questions and choose between the full-query and
    multi-sentence search strategies, displaying the most relevant contexts
    found for each.
    """
    try:
        context_documents, faiss_index, text_embedder = initialize_rag_system()
    except Exception as e:
        print(f"Erro fatal ao inicializar RAG: {e}")
        return

    while True:
        user_question = input("\nEnter a question (or 'quit' to exit): ")
        if user_question.lower() == "quit":
            break

        print("\nChoose the search type:")
        print("1. Full-query search (entire question)")
        print("2. Multi-sentence search (segmenting the question into sentences)")
        search_choice = input("Option (1 or 2): ")

        retrieved_contexts = []
        if search_choice == "1":
            print(f"\nRealizando busca de contexto completo para: '{user_question}'")
            retrieved_contexts = search_with_full_query(
                user_question, context_documents, faiss_index, text_embedder, k_results=5
            )
        elif search_choice == "2":
            print(f"\nRealizando busca de múltiplos contextos para: '{user_question}'")
            retrieved_contexts = search_with_multiple_sentences(
                user_question, context_documents, faiss_index, text_embedder, k_per_sentence=3
            )
        else:
            print("Opção inválida. Tente novamente.")
            continue

        if not retrieved_contexts:
            print("Nenhum contexto encontrado.")
            continue

        print("\nContextos mais relevantes:")
        for doc_idx, text_content, distance_score in retrieved_contexts:
            print(f"\nÍndice Original do Documento: {doc_idx}, Distância: {distance_score:.4f}")
            print(f"Texto: {text_content[:500]}...")  # Limita o texto para melhor visualização
            print("-" * 50)


if __name__ == "__main__":
    test_context_search_interactive()