import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Initialize the BGE embeddings model
model_name = "BAAI/bge-small-en-v1.5"
encode_kwargs = {'normalize_embeddings': True}  # normalize so dot products equal cosine similarity
embeddings_model = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cpu'},
    encode_kwargs=encode_kwargs
)

# Read the CSV file of books
data = pd.read_csv(r'books.csv', encoding='latin1')


def retrieve_documents(query):
    # Return every book title as a candidate document; the query is not
    # used for filtering here, so ranking happens entirely in answer()
    documents = data['Book Title'].tolist()
    return documents


def answer(query, min_similarity=0.7):
    # Retrieve candidate documents
    retrieved_documents = retrieve_documents(query)

    # Embed the query and the documents
    embedded_query = embeddings_model.embed_query(query)
    embedded_documents = embeddings_model.embed_documents(retrieved_documents)

    # Calculate cosine similarity between the query and each document
    similarities = cosine_similarity([embedded_query], embedded_documents)

    # Rank documents by descending similarity score
    ranked_indices = np.argsort(similarities[0])[::-1]

    # Collect document details for documents scoring above min_similarity
    ranked_documents = []
    for index in ranked_indices:
        similarity_score = round(similarities[0][index], 2)  # round to two decimal places
        if similarity_score > min_similarity:
            document_details = {
                "Book": data['Book Title'].iloc[index],
                "Author": data['Author'].iloc[index],
                "Edition": data['Edition'].iloc[index],
                "File Name": data['File_name'].iloc[index],
                "Similarity Score": similarity_score
            }
            ranked_documents.append(document_details)
        else:
            # Documents are ranked in descending order of similarity, so
            # stop once a score falls below min_similarity
            break

    if not ranked_documents:
        print("No similar books found")
    return ranked_documents


# Example usage
# query = "machine learning"
# result = answer(query)
# print(result)
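

# Note that answer() calls embed_documents() on the full title list for every
# query. A minimal sketch of precomputing the document embeddings once at
# startup and reusing them follows; it reuses embeddings_model and data from
# above and assumes the same books.csv column names. answer_cached is a
# hypothetical name, not part of the original script.

# Embed all titles a single time at startup
document_embeddings = embeddings_model.embed_documents(data['Book Title'].tolist())


def answer_cached(query, min_similarity=0.7):
    # Only the query is embedded per call; document embeddings come from the cache
    embedded_query = embeddings_model.embed_query(query)
    similarities = cosine_similarity([embedded_query], document_embeddings)[0]

    # Rank by descending similarity, same as answer()
    ranked_indices = np.argsort(similarities)[::-1]

    results = []
    for index in ranked_indices:
        score = round(float(similarities[index]), 2)
        if score <= min_similarity:
            break  # remaining scores are lower; stop early
        results.append({
            "Book": data['Book Title'].iloc[index],
            "Author": data['Author'].iloc[index],
            "Edition": data['Edition'].iloc[index],
            "File Name": data['File_name'].iloc[index],
            "Similarity Score": score
        })

    if not results:
        print("No similar books found")
    return results

# Example usage of the cached variant
# print(answer_cached("machine learning"))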