achdaisy committed on
Commit
2c48e89
1 Parent(s): e0e1c8f

Upload final_book_retriever.py

Browse files
Files changed (1) hide show
  1. final_book_retriever.py +63 -0
final_book_retriever.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
5
+
6
# Embedding backend: a BGE model loaded through LangChain's Hugging Face
# wrapper and pinned to the CPU.
model_name = "BAAI/bge-small-en-v1.5"

# Normalised embeddings are requested because the vectors are compared with
# cosine similarity further down.
encode_kwargs = {"normalize_embeddings": True}

embeddings_model = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={"device": "cpu"},
    encode_kwargs=encode_kwargs,
)

# Book catalogue, read once at import time from the working directory.
# The functions below read the 'Book Title', 'Author', 'Edition' and
# 'File_name' columns from it.
data = pd.read_csv("books.csv", encoding="latin1")
17
+
18
def retrieve_documents(query):
    """Return every book title in the catalogue as a list.

    Args:
        query: The user's search text. It is accepted for interface
            symmetry with ``answer`` but is not used to filter anything
            here; the full 'Book Title' column is always returned.

    Returns:
        The values of ``data['Book Title']`` in row order.
    """
    return data["Book Title"].tolist()
21
+
22
# Most recently embedded catalogue: the titles that were embedded and the
# resulting matrix. Lets repeated queries against an unchanged catalogue skip
# the expensive step of re-embedding every title.
_DOCUMENT_EMBEDDING_CACHE = {"titles": None, "matrix": None}


def _embed_documents_cached(documents):
    """Embed ``documents``, reusing the previous matrix if the titles match."""
    titles = tuple(documents)
    # Compare rather than hash: any change to the catalogue (or its order)
    # invalidates the cached matrix and triggers a fresh embedding pass.
    if _DOCUMENT_EMBEDDING_CACHE["titles"] != titles:
        _DOCUMENT_EMBEDDING_CACHE["matrix"] = np.asarray(
            embeddings_model.embed_documents(list(titles))
        )
        _DOCUMENT_EMBEDDING_CACHE["titles"] = titles
    return _DOCUMENT_EMBEDDING_CACHE["matrix"]


def answer(query, min_similarity=0.7):
    """Find books whose titles are semantically similar to ``query``.

    The query and all catalogue titles are embedded with ``embeddings_model``
    and compared by cosine similarity. Title embeddings are cached between
    calls, so only the query is embedded once the catalogue has been seen.

    Args:
        query: Free-text search string.
        min_similarity: Exclusive lower bound on the cosine similarity a
            title must reach to be included in the result.

    Returns:
        A list of dicts, ordered from most to least similar, each with the
        keys "Book", "Author", "Edition", "File Name" and
        "Similarity Score". The list is empty (and "No similar books found"
        is printed) when no title exceeds ``min_similarity``.
    """
    retrieved_documents = retrieve_documents(query)

    # With no titles there is nothing to compare against, and
    # cosine_similarity would reject the empty input with a ValueError.
    if not retrieved_documents:
        print("No similar books found")
        return []

    embedded_query = embeddings_model.embed_query(query)
    embedded_documents = _embed_documents_cached(retrieved_documents)

    # One query row against every title -> take the single row of scores.
    scores = cosine_similarity([embedded_query], embedded_documents)[0]

    ranked_documents = []
    # Visit titles from the highest score downwards.
    for index in np.argsort(scores)[::-1]:
        similarity_score = scores[index]
        # Scores are descending, so the first one that fails the threshold
        # ends the search. Written as "not >" so that a NaN score stops the
        # loop exactly as the comparison in the original code did.
        if not (similarity_score > min_similarity):
            break
        ranked_documents.append(
            {
                "Book": data['Book Title'].iloc[index],
                "Author": data['Author'].iloc[index],
                "Edition": data['Edition'].iloc[index],
                "File Name": data['File_name'].iloc[index],
                "Similarity Score": similarity_score,
            }
        )

    if not ranked_documents:
        print("No similar books found")

    return ranked_documents
59
+
60
# Example usage. Guarded so that importing this module (e.g. to call
# ``answer`` from an application) does not run a demo query, embed the whole
# catalogue and print to stdout as a side effect; running the file directly
# behaves exactly as before.
if __name__ == "__main__":
    query = "machine learning"
    result = answer(query)
    print(result)