import logging
import pickle

import faiss
import numpy as np

from helper import (
    extract_text_from_pdf,
    chunk_text,
    embedding_function,
    embedding_model,
    generate_hypothetical_answer,
    query_llm_with_context,
)

logging.basicConfig(level=logging.INFO)

# Path for storing the FAISS index and document chunks
index_path = "./faiss_index"
chunks_path = "./document_chunks.pkl"

pdf_path = r'C:\Git Projects\AnnualReport_rag\IBM.pdf'  # raw string so backslashes are not treated as escape sequences

print('Extracting text from pdf...')
pdf_text = extract_text_from_pdf(pdf_path)

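# Overlapping chunks (chunk_size=1000 with chunk_overlap=100) keep text that
# straddles a chunk boundary retrievable from at least one chunk.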
print('Chunking pdf...')
chunks = chunk_text(pdf_text, chunk_size=1000, chunk_overlap=100)

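# embedding_function maps each chunk to a dense vector; all vectors must share
# one dimensionality, which FAISS uses to build the index below.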
print('Embedding chunks...')
embeddings = embedding_function(chunks)

print(f"Embeddings type: {type(embeddings)}")
print(f"First embedding type: {type(embeddings[0])}")
print(f"First embedding shape or length: {len(embeddings[0]) if hasattr(embeddings[0], '__len__') else 'unknown'}")

# Ensure embeddings are a numpy array in float32 (FAISS requires float32)
if not isinstance(embeddings, np.ndarray):
    print("Converting embeddings to numpy array...")
    embeddings = np.array(embeddings)
embeddings = embeddings.astype('float32')

# Get the dimension of the embeddings
dimension = embeddings.shape[1]
print(f"Embedding dimension: {dimension}")

# Initialize FAISS index
print('Initializing FAISS index...')
index = faiss.IndexFlatL2(dimension)  # L2 distance for similarity search
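# IndexFlatL2 performs exact brute-force search over all vectors, which is
# fine for one report's chunks; a large corpus would warrant an approximate
# index (e.g. IVF or HNSW).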

# Add vectors to the index
print('Adding vectors to FAISS index...')
index.add(embeddings)

# Save the index
print('Saving FAISS index...')
faiss.write_index(index, index_path)

# Save the document chunks for retrieval
print('Saving document chunks...')
with open(chunks_path, 'wb') as f:
    pickle.dump(chunks, f)

print(f"Total vectors in index: {index.ntotal}")


def retrieve_documents(query, n_results=5):
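    """Return the n_results most similar chunks and their similarity scores in [0, 1]."""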
    # Generate embedding for the query
    query_embedding = embedding_model.encode([query], convert_to_numpy=True).astype('float32')
    
    # Search the index
    distances, indices = index.search(query_embedding, n_results)
    
    # Get the documents
    documents = [chunks[i] for i in indices[0]]
    
    # Convert L2 distances to similarity scores in [0, 1] (lower distance = more similar)
    max_distance = np.max(distances)
    if max_distance == 0:
        similarity_scores = [1.0] * len(distances[0])  # all results are exact matches
    else:
        similarity_scores = [1 - (dist / max_distance) for dist in distances[0]]
    
    return documents, similarity_scores
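
# Note: an inner-product index over L2-normalized vectors gives cosine
# similarity directly (higher is better), avoiding the distance rescaling
# above. Sketch of the alternative, not used here:
#     faiss.normalize_L2(embeddings)
#     index = faiss.IndexFlatIP(dimension)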


# Test the retrieval
query="how has the profitability of the company been in last five years"
print('Retrieving documents...')
general_docs, general_scores = retrieve_documents(query, n_results=15)
print(f"Number of docs returned for general query: {len(general_docs)}")

# Print the results
# for i, (doc, score) in enumerate(zip(general_docs, general_scores)):
#     print(f"\nResult {i+1} (Score: {score:.4f}):")
#     print(f"{doc[:200]}...")

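# HyDE-style retrieval: append an LLM-generated hypothetical answer to the
# query so its embedding lands closer to answer-bearing chunks.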
new_query = query + " " + generate_hypothetical_answer(query)
combined_docs, combined_scores = retrieve_documents(new_query, n_results=15)  # unpack: retrieve_documents returns (documents, scores)

answer = query_llm_with_context(query, combined_docs, top_n=3)

print(f'final_response: {answer}')