# --- Scrape artifacts from the hosting page (not part of the script) ---
# Spaces: Sleeping
# File size: 3,086 Bytes | commit a7aaec4
import logging
import os
import pickle

import faiss
import numpy as np

from helper import (
    chunk_text,
    embedding_function,
    embedding_model,
    extract_text_from_pdf,
    generate_hypothetical_answer,
    query_llm_with_context,
)
logging.basicConfig(level=logging.INFO)

# Paths for persisting the FAISS index and the chunk texts it maps back to.
index_path = "./faiss_index"
chunks_path = "./document_chunks.pkl"
# Raw string: the backslashes in a Windows path would otherwise be parsed as
# escape sequences ('\A', '\G' are invalid escapes and warn/error).
pdf_path = r'C:\Git Projects\AnnualReport_rag\IBM.pdf'

print('Extracting text from pdf...')
pdf_text = extract_text_from_pdf(pdf_path)

print('Chunking pdf...')
chunks = chunk_text(pdf_text, chunk_size=1000, chunk_overlap=100)

print('Embedding chunks...')
embeddings = embedding_function(chunks)
print(f"Embeddings type: {type(embeddings)}")
print(f"First embedding type: {type(embeddings[0])}")
print(f"First embedding shape or length: {len(embeddings[0]) if hasattr(embeddings[0], '__len__') else 'unknown'}")

# FAISS requires a float32 matrix. Convert unconditionally: the original only
# converted when the input was not an ndarray, so a float64 ndarray would have
# reached index.add() and failed. np.asarray + explicit dtype copies only when
# a conversion is actually needed.
embeddings = np.asarray(embeddings, dtype='float32')

# Dimensionality of the embedding vectors (columns of the matrix).
dimension = embeddings.shape[1]
print(f"Embedding dimension: {dimension}")

# Exact L2-distance index: brute-force search, no training step required.
print('Initializing FAISS index...')
index = faiss.IndexFlatL2(dimension)

print('Adding vectors to FAISS index...')
index.add(embeddings)

print('Saving FAISS index...')
faiss.write_index(index, index_path)

# Persist the chunk texts so a later process can map search hits (row ids)
# back to the original text.
print('Saving document chunks...')
with open(chunks_path, 'wb') as f:
    pickle.dump(chunks, f)

print(f"Total vectors in index: {index.ntotal}")
def retrieve_documents(query, n_results=5):
    """Return the chunks most similar to `query` from the module-level index.

    Args:
        query: Natural-language query string.
        n_results: Maximum number of chunks to return.

    Returns:
        Tuple of (documents, similarity_scores). Scores are L2 distances
        normalized to [0, 1], where 1 is the most similar result.
    """
    # Embed the query with the same model used for the document chunks.
    query_embedding = embedding_model.encode([query], convert_to_numpy=True).astype('float32')

    distances, indices = index.search(query_embedding, n_results)

    # FAISS pads the result with index -1 when the index holds fewer than
    # n_results vectors; chunks[-1] would silently return the last chunk,
    # so drop those slots explicitly.
    hits = [(dist, idx) for dist, idx in zip(distances[0], indices[0]) if idx != -1]
    documents = [chunks[idx] for _, idx in hits]

    # Convert L2 distances (lower is better) to similarity scores in [0, 1]
    # where 1 is most similar. Guard the division: if every distance is 0
    # (exact-duplicate matches), the original code divided by zero.
    max_distance = max((dist for dist, _ in hits), default=0.0)
    if max_distance > 0:
        similarity_scores = [1 - (dist / max_distance) for dist, _ in hits]
    else:
        similarity_scores = [1.0 for _ in hits]
    return documents, similarity_scores
# --- Exercise the retrieval pipeline with a sample query. ---
query = "how has the profitability of the company been in last five years"
print('Retrieving documents...')
general_docs, general_scores = retrieve_documents(query, n_results=15)
print(f"Number of docs returned for general query: {len(general_docs)}")

# HyDE-style expansion: append a hypothetical answer to the query so the
# combined text embeds closer to answer-bearing chunks.
new_query = query + generate_hypothetical_answer(query)
# Bug fix: retrieve_documents returns a (documents, scores) tuple; the
# original passed the whole tuple as context. Only the documents are context.
combined_context, _combined_scores = retrieve_documents(new_query, n_results=15)
answer = query_llm_with_context(query, combined_context, top_n=3)
# Bug fix: the missing f prefix printed the literal text '{answer}'.
print(f'final_response:{answer}')