"""Look up book metadata for a free-text query using TF-IDF and BM25.

The module loads a CSV of books, indexes the 'Book Title' column with both a
TF-IDF model and a BM25 model, and exposes ``answer`` to query them.
"""

import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer

# Read CSV file
data = pd.read_csv(r'C:\book_metadata_retriever\books.csv', encoding='latin1')


class TFIDFDoc2Vec:
    """TF-IDF index over a document collection with dot-product search."""

    def __init__(self):
        self.tfidf_vectorizer = TfidfVectorizer()
        # Dense (n_documents, n_terms) array; stays None until
        # initialize_vectors() has been called.
        self.doc_vectors = None

    def initialize_vectors(self, documents):
        """Fit the vectorizer on *documents* and cache their dense vectors.

        Args:
            documents: Iterable of strings, one per document.
        """
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(documents)
        self.doc_vectors = tfidf_matrix.toarray()

    def find_similar_documents(self, query, threshold=0.5):
        """Return the documents whose similarity to *query* meets *threshold*.

        Similarity is the dot product of the TF-IDF vectors; with the
        vectorizer's default L2 normalisation this is the cosine similarity.

        Args:
            query: Query string.
            threshold: Minimum similarity (inclusive) for a document to match.

        Returns:
            Tuple ``(similar_indices, similarities)``: a 1-D array of matching
            document positions in ascending order, and the full
            ``(1, n_documents)`` similarity array.
        """
        query_vector = self.tfidf_vectorizer.transform([query]).toarray()
        similarities = np.dot(query_vector, self.doc_vectors.T)
        # Axis 0 is the single query row, so only the column indices matter.
        similar_indices = np.where(similarities >= threshold)[1]
        return similar_indices, similarities


def _tokenize(text):
    """Split *text* into lower-cased whitespace tokens for BM25.

    TfidfVectorizer lower-cases its input by default; BM25 must do the same
    for both corpus and query, otherwise "mathematics" never matches
    "Mathematics".
    """
    return text.lower().split()


def answer(query, threshold=0.5, top_n=10):
    """Return metadata for the books that best match *query*.

    TF-IDF matches at or above *threshold* come first, ordered by descending
    similarity. The remaining slots are filled from the BM25 ranking of all
    documents, skipping books that are already listed.

    Args:
        query: Free-text query string.
        threshold: Minimum TF-IDF similarity for a book to count as a match.
        top_n: Maximum number of books to return.

    Returns:
        The string "No books found for the query." when no book reaches the
        TF-IDF threshold; otherwise a list of at most *top_n* dicts with the
        keys "Book", "Author", "Edition" and "File Name".
    """
    # Find similar documents using TF-IDF
    tfidf_hits, similarities = tfidf_doc2vec_model.find_similar_documents(
        query, threshold=threshold
    )

    # Check if no similar documents are found
    if tfidf_hits.size == 0:
        return "No books found for the query."

    # Put the strongest TF-IDF matches first so that truncating to top_n keeps
    # the best hits rather than the ones that happen to come first in the CSV.
    # The stable sort keeps CSV order among equally similar books.
    hit_scores = similarities[0, tfidf_hits]
    tfidf_hits = tfidf_hits[np.argsort(-hit_scores, kind="stable")]

    # Rank all documents using BM25, tokenising the query like the corpus
    scores = bm25_model.get_scores(_tokenize(query))
    bm25_ranked_indices = np.argsort(scores)[::-1]

    # Combine both rankings; dict.fromkeys drops repeats while preserving the
    # position of each index's first occurrence.
    combined_indices = list(
        dict.fromkeys([*tfidf_hits.tolist(), *bm25_ranked_indices.tolist()])
    )

    # Retrieve document details
    return [
        {
            "Book": data['Book Title'].iloc[index],
            "Author": data['Author'].iloc[index],
            "Edition": data['Edition'].iloc[index],
            "File Name": data['File_name'].iloc[index],
        }
        for index in combined_indices[:top_n]
    ]


# Initialize TF-IDF model
tfidf_doc2vec_model = TFIDFDoc2Vec()
documents = data['Book Title'].astype(str)
tfidf_doc2vec_model.initialize_vectors(documents)

# Initialize BM25 model
bm25_model = BM25Okapi([_tokenize(doc) for doc in documents])


if __name__ == "__main__":
    # Example usage; guarded so importing the module does not run a query.
    query = "mathematics"
    result = answer(query)
    print(result)