"""Build a FAISS index over an annual-report PDF and answer a query via RAG.

Pipeline: extract text -> chunk -> embed -> index with FAISS -> retrieve
(plain query, then HyDE-style query + hypothetical answer) -> ask the LLM
with the retrieved context.
"""

import logging
import os
import pickle

import faiss
import numpy as np

from helper import (
    chunk_text,
    embedding_function,
    embedding_model,
    extract_text_from_pdf,
    generate_hypothetical_answer,
    query_llm_with_context,
)

logging.basicConfig(level=logging.INFO)

# Paths for storing the FAISS index and the pickled document chunks.
index_path = "./faiss_index"
chunks_path = "./document_chunks.pkl"
# Raw string so the Windows backslashes are not treated as escape sequences.
pdf_path = r'C:\Git Projects\AnnualReport_rag\IBM.pdf'

print('Extracting text from pdf...')
pdf_text = extract_text_from_pdf(pdf_path)

print('Chunking pdf...')
chunks = chunk_text(pdf_text, chunk_size=1000, chunk_overlap=100)

print('Embedding chunks...')
embeddings = embedding_function(chunks)
print(f"Embeddings type: {type(embeddings)}")
print(f"First embedding type: {type(embeddings[0])}")
print(f"First embedding shape or length: {len(embeddings[0]) if hasattr(embeddings[0], '__len__') else 'unknown'}")

# FAISS requires a float32 numpy matrix; convert if the embedder returned a list.
if not isinstance(embeddings, np.ndarray):
    print("Converting embeddings to numpy array...")
    embeddings = np.array(embeddings).astype('float32')

# Get the dimension of the embeddings
dimension = embeddings.shape[1]
print(f"Embedding dimension: {dimension}")

# Initialize FAISS index (exact L2 / Euclidean distance search).
print('Initializing FAISS index...')
index = faiss.IndexFlatL2(dimension)

# Add vectors to the index
print('Adding vectors to FAISS index...')
index.add(embeddings)

# Save the index
print('Saving FAISS index...')
faiss.write_index(index, index_path)

# Save the document chunks for retrieval
print('Saving document chunks...')
with open(chunks_path, 'wb') as f:
    pickle.dump(chunks, f)

print(f"Total vectors in index: {index.ntotal}")


def retrieve_documents(query, n_results=5):
    """Return (documents, similarity_scores) for the top ``n_results`` chunks.

    Args:
        query: Natural-language query string.
        n_results: Number of nearest chunks to return.

    Returns:
        Tuple of (list of chunk strings, list of similarity scores in [0, 1]
        where 1 is most similar, normalized against the farthest hit in this
        result set).
    """
    # Generate embedding for the query.
    query_embedding = embedding_model.encode([query], convert_to_numpy=True).astype('float32')

    # Search the index.
    distances, indices = index.search(query_embedding, n_results)

    # Get the documents.
    documents = [chunks[i] for i in indices[0]]

    # Convert L2 distances (lower is better) to [0, 1] similarity scores.
    # Guard against division by zero when every hit is an exact match.
    max_distance = np.max(distances)
    if max_distance == 0:
        similarity_scores = [1.0] * len(documents)
    else:
        similarity_scores = [1 - (dist / max_distance) for dist in distances[0]]

    return documents, similarity_scores


# Test the retrieval with a plain query.
query = "how has the profitability of the company been in last five years"
print('Retrieving documents...')
general_docs, general_scores = retrieve_documents(query, n_results=15)
print(f"Number of docs returned for general query: {len(general_docs)}")

# Print the results
# for i, (doc, score) in enumerate(zip(general_docs, general_scores)):
#     print(f"\nResult {i+1} (Score: {score:.4f}):")
#     print(f"{doc[:200]}...")

# HyDE-style retrieval: augment the query with a hypothetical answer.
new_query = query + generate_hypothetical_answer(query)
# Bug fix: retrieve_documents returns a (documents, scores) tuple; pass only
# the documents as context, not the whole tuple.
combined_docs, _combined_scores = retrieve_documents(new_query, n_results=15)

answer = query_llm_with_context(query, combined_docs, top_n=3)
# Bug fix: f-string so the answer is actually interpolated into the output.
print(f'final_response:{answer}')