# Requires the third-party BM25 package: pip install rank-bm25
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from rank_bm25 import BM25Okapi

# Read CSV file
data = pd.read_csv('books.csv', encoding='latin1')


class TFIDFDoc2Vec:
    """TF-IDF based document retriever with an optional BM25 ranking helper.

    Documents are vectorized once with ``TfidfVectorizer``; queries are
    projected into the same space and compared by cosine similarity.
    """

    def __init__(self):
        self.tfidf_vectorizer = TfidfVectorizer()
        # Dense (n_documents, n_terms) array; populated by initialize_vectors().
        self.doc_vectors = None

    def initialize_vectors(self, documents):
        """Fit the TF-IDF vectorizer on *documents* and cache their vectors.

        Args:
            documents: Iterable of document strings.
        """
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(documents)
        # NOTE: the matrix is densified; for very large corpora this can be
        # memory hungry. Kept dense so ``doc_vectors`` stays an ndarray.
        self.doc_vectors = tfidf_matrix.toarray()

    def find_similar_documents(self, query, top_n=10):
        """Return the positions of the documents most similar to *query*.

        Args:
            query: Query string.
            top_n: Maximum number of document positions to return.

        Returns:
            A list with one entry per query (a single entry here), each an
            array of document positions ordered from most to least similar.
        """
        query_vector = self.tfidf_vectorizer.transform([query]).toarray()
        similarities = cosine_similarity(query_vector, self.doc_vectors)
        # Ascending argsort, reversed for descending similarity, then truncated.
        similar_indices = similarities.argsort(axis=1)[:, ::-1][:, :top_n]
        return list(similar_indices)

    def rank_bm25(self, query, bm25_model, documents, top_n=10):
        """Return the positions of the *top_n* documents by BM25 score.

        Args:
            query: Query string, or an already tokenized list of terms.
            bm25_model: Fitted BM25 model exposing ``get_scores(tokens)``.
            documents: Unused; kept for backward compatibility with callers.
            top_n: Maximum number of document positions to return.

        Returns:
            Array of document positions ordered from highest to lowest score.
        """
        # BM25 scores a list of tokens. Passing the raw string would make it
        # iterate character by character, so split it the same way the corpus
        # was tokenized (plain whitespace split).
        tokens = query.split() if isinstance(query, str) else query
        scores = bm25_model.get_scores(tokens)
        return np.argsort(scores)[::-1][:top_n]


# Select the column containing book titles
documents = data['Book Title'].astype(str)

# Initialize TF-IDF vectors and model
tfidf_doc2vec_model = TFIDFDoc2Vec()
tfidf_doc2vec_model.initialize_vectors(documents)

# Initialize BM25 model (corpus tokenized by whitespace)
bm25_model = BM25Okapi(documents.str.split())


def answer(query):
    """Return details of the books whose titles best match *query*.

    Matching uses TF-IDF cosine similarity over the 'Book Title' column.
    A BM25 ranking is available separately through
    ``tfidf_doc2vec_model.rank_bm25``; it was previously computed here on
    every call but never used, so that dead computation has been dropped.

    Args:
        query: Query string.

    Returns:
        A list of dicts with the keys "Book", "Author", "Edition" and
        "File Name", ordered from most to least similar.
    """
    similar_documents_indices = tfidf_doc2vec_model.find_similar_documents(query)

    # The indices are positional, hence .iloc rather than label-based lookup.
    return [
        {
            "Book": data['Book Title'].iloc[index],
            "Author": data['Author'].iloc[index],
            "Edition": data['Edition'].iloc[index],
            "File Name": data['File_name'].iloc[index],
        }
        for indices in similar_documents_indices
        for index in indices
    ]


# Example usage (receive a query from the user):
# query = input("Enter your query: ")
# result = answer(query)
# print(result)