from helper import (
    extract_text_from_pdf,
    chunk_text,
    embedding_function,
    embedding_model,
    generate_hypothetical_answer,
    query_llm_with_context,
)
import numpy as np
import faiss
import pickle
import os
import logging

logging.basicConfig(level=logging.INFO)
# Paths for persisting the FAISS index and document chunks
index_path = "./faiss_index"
chunks_path = "./document_chunks.pkl"
pdf_path = r'C:\Git Projects\AnnualReport_rag\IBM.pdf'  # raw string so backslashes are not treated as escapes
print('Extracting text from PDF...')
pdf_text = extract_text_from_pdf(pdf_path)
print('Chunking PDF text...')
chunks = chunk_text(pdf_text, chunk_size=1000, chunk_overlap=100)
print('Embedding chunks...')
embeddings = embedding_function(chunks)
print(f"Embeddings type: {type(embeddings)}")
print(f"First embedding type: {type(embeddings[0])}")
print(f"First embedding shape or length: {len(embeddings[0]) if hasattr(embeddings[0], '__len__') else 'unknown'}")
# Ensure embeddings are a float32 numpy array (FAISS requires float32;
# the original only converted non-ndarray inputs, which left float64 arrays unconverted)
embeddings = np.asarray(embeddings, dtype='float32')
# Get the dimension of the embeddings
dimension = embeddings.shape[1]
print(f"Embedding dimension: {dimension}")
# Initialize FAISS index
print('Initializing FAISS index...')
index = faiss.IndexFlatL2(dimension)  # exact L2 (Euclidean) distance search
# Add vectors to the index
print('Adding vectors to FAISS index...')
index.add(embeddings)
# Save the index
print('Saving FAISS index...')
faiss.write_index(index, index_path)
# Save the document chunks so retrieved indices can be mapped back to text
print('Saving document chunks...')
with open(chunks_path, 'wb') as f:
    pickle.dump(chunks, f)
print(f"Total vectors in index: {index.ntotal}")
def retrieve_documents(query, n_results=5):
    # Generate an embedding for the query
    query_embedding = embedding_model.encode([query], convert_to_numpy=True).astype('float32')
    # Search the index
    distances, indices = index.search(query_embedding, n_results)
    # Map the returned indices back to their document chunks
    documents = [chunks[i] for i in indices[0]]
    # Convert L2 distances (lower is better) to similarity scores in [0, 1],
    # where 1 is most similar; fall back to 1.0 to avoid division by zero
    max_distance = float(np.max(distances)) or 1.0
    similarity_scores = [1 - (dist / max_distance) for dist in distances[0]]
    return documents, similarity_scores
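
# Design note (a hedged alternative, not part of the original pipeline): if the
# embedding model is trained for cosine similarity, an inner-product index over
# L2-normalized vectors yields cosine scores directly, avoiding the ad-hoc
# distance rescaling above. Minimal sketch; `ip_index` is illustrative only:
normalized = embeddings.copy()
faiss.normalize_L2(normalized)           # in-place L2 normalization (float32)
ip_index = faiss.IndexFlatIP(dimension)  # inner product == cosine on unit vectors
ip_index.add(normalized)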
# Test the retrieval
query = "how has the profitability of the company been in the last five years"
print('Retrieving documents...')
general_docs, general_scores = retrieve_documents(query, n_results=15)
print(f"Number of docs returned for general query: {len(general_docs)}")
# Print the results
# for i, (doc, score) in enumerate(zip(general_docs, general_scores)):
#     print(f"\nResult {i+1} (Score: {score:.4f}):")
#     print(f"{doc[:200]}...")
# HyDE-style expansion: append a hypothetical answer to the query before retrieving again
new_query = query + " " + generate_hypothetical_answer(query)
combined_docs, combined_scores = retrieve_documents(new_query, n_results=15)
answer = query_llm_with_context(query, combined_docs, top_n=3)
print(f'final_response: {answer}')