# pip install rank-bm25
"""Retrieve book metadata by title similarity.

Importing this module reads the books CSV and fits two models on its
"Book Title" column: a TF-IDF vectorizer queried by cosine similarity,
and a BM25Okapi model built on whitespace-split titles.
"""

import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Read CSV file
data = pd.read_csv(r'C:\book_metadata_retriever\books.csv', encoding='latin1')


class TFIDFDoc2Vec:
    """TF-IDF document index with cosine-similarity lookup."""

    def __init__(self):
        self.tfidf_vectorizer = TfidfVectorizer()
        # Dense TF-IDF matrix; stays None until initialize_vectors() runs.
        self.doc_vectors = None

    def initialize_vectors(self, documents):
        """Fit the vectorizer on *documents* and store their TF-IDF vectors.

        Args:
            documents: Iterable of document strings.
        """
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(documents)
        self.doc_vectors = tfidf_matrix.toarray()

    def find_similar_documents(self, query, top_n=10):
        """Return the positions of the documents most similar to *query*.

        Args:
            query: Query string, vectorized with the fitted vectorizer.
            top_n: Maximum number of document positions to return.

        Returns:
            A list holding one array of document positions (a single query
            is vectorized, so the list has one entry), ordered from the
            highest cosine similarity to the lowest.
        """
        query_vector = self.tfidf_vectorizer.transform([query]).toarray()
        similarities = cosine_similarity(query_vector, self.doc_vectors)
        # argsort is ascending, so reverse each row before truncating.
        similar_indices = similarities.argsort(axis=1)[:, ::-1][:, :top_n]
        return list(similar_indices)

    @staticmethod
    def rank_bm25(query, bm25_model, documents, top_n=5):
        """Return the positions of the *top_n* best BM25 matches for *query*.

        Args:
            query: Query string, or an already tokenized list of terms.
            bm25_model: Fitted model exposing ``get_scores(tokens)``.
            documents: Unused; kept so existing callers keep working.
            top_n: Maximum number of document positions to return.

        Returns:
            Array of document positions, best score first.
        """
        # The corpus is tokenized on whitespace, so the query has to be
        # tokenized the same way; iterating a raw string would score its
        # individual characters instead of its words.
        tokens = query.split() if isinstance(query, str) else query
        scores = bm25_model.get_scores(tokens)
        return np.argsort(scores)[::-1][:top_n]


# Select the column containing book titles
documents = data['Book Title'].astype(str)

# Initialize TF-IDF vectors and model
tfidf_doc2vec_model = TFIDFDoc2Vec()
tfidf_doc2vec_model.initialize_vectors(documents)

# Initialize BM25 model
bm25_model = BM25Okapi(documents.str.split())


def answer(query):
    """Print the metadata of the books whose titles best match *query*.

    Args:
        query: Free-text query string.
    """
    # Find similar documents
    similar_documents_indices = tfidf_doc2vec_model.find_similar_documents(query)

    # Rank documents using BM25
    # NOTE(review): this ranking is computed but not yet used in the
    # printed output below - confirm whether it should re-rank the results.
    similar_documents_indices_bm25 = TFIDFDoc2Vec.rank_bm25(query, bm25_model, documents)

    # Print list of similar documents
    print("Similar documents:")
    for idx, indices in enumerate(similar_documents_indices):
        print(f"{idx+1}.")
        for index in indices:
            # The indices are row positions, so look them up positionally.
            print(f" Book: {data['Book Title'].iloc[index]}")
            print(f" Author: {data['Author'].iloc[index]}")
            print(f" Edition: {data['Edition'].iloc[index]}")
            print(f" File Name: {data['File_name'].iloc[index]}")
            print()


# Receive query from the user
# query = input("Enter your query: ")
# answer(query)