Spaces:
Sleeping
Sleeping
import numpy as np | |
import pandas as pd | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from rank_bm25 import BM25Okapi | |
# Load the book catalogue. The rest of this module reads the columns
# 'Book Title', 'Author', 'Edition' and 'File_name' from this frame.
data = pd.read_csv(
    filepath_or_buffer=r'C:\book_metadata_retriever\books.csv',
    encoding='latin1',
)
class TFIDFDoc2Vec:
    """TF-IDF document index searched by dot-product similarity.

    Attributes are populated in two steps: the constructor creates the
    vectorizer, and ``initialize_vectors`` fits it and stores one dense
    vector per document.
    """

    def __init__(self):
        """Create an unfitted vectorizer; no document vectors exist yet."""
        # Dense (n_documents, n_terms) matrix, filled by initialize_vectors().
        self.doc_vectors = None
        self.tfidf_vectorizer = TfidfVectorizer()

    def initialize_vectors(self, documents):
        """Fit the vectorizer on ``documents`` and keep their dense vectors.

        Args:
            documents: Iterable of document strings to index.
        """
        self.doc_vectors = (
            self.tfidf_vectorizer.fit_transform(documents).toarray()
        )

    def find_similar_documents(self, query, threshold=0.5):
        """Find indexed documents whose similarity to ``query`` is high enough.

        Args:
            query: Query string, vectorized with the fitted vectorizer.
            threshold: Minimum dot-product similarity (inclusive) for a
                document to be reported.

        Returns:
            A tuple ``(similar_indices, similarities)``. ``similarities`` is
            the ``(1, n_documents)`` array of dot products between the query
            vector and every document vector; ``similar_indices`` holds the
            column indices (in ascending document order) whose similarity is
            at least ``threshold``.
        """
        encoded_query = self.tfidf_vectorizer.transform([query])
        similarities = np.dot(encoded_query.toarray(), self.doc_vectors.T)
        # Row 0 is the single query; only the column (document) positions
        # of the qualifying entries are of interest.
        _, similar_indices = np.nonzero(similarities >= threshold)
        return similar_indices, similarities
def answer(query, threshold=0.5, top_n=10):
    """Return metadata for the books whose titles best match ``query``.

    TF-IDF similarity acts as a gate: when no title reaches ``threshold``
    the function reports that nothing was found. Otherwise the TF-IDF hits
    come first, ordered by descending similarity, followed by any further
    titles that BM25 scores above zero, ordered by descending BM25 score.

    Relies on the module-level ``tfidf_doc2vec_model``, ``bm25_model`` and
    ``data`` objects.

    Args:
        query: Free-text search string. It is split on whitespace for BM25,
            so BM25 matching is only as case-sensitive as the corpus tokens
            given to ``bm25_model``.
        threshold: Minimum TF-IDF similarity for a title to count as a hit.
        top_n: Maximum number of books to return.

    Returns:
        The string ``"No books found for the query."`` when there is no
        TF-IDF hit; otherwise a list of dicts with the keys ``"Book"``,
        ``"Author"``, ``"Edition"`` and ``"File Name"``, best match first.
    """
    hit_indices, similarities = tfidf_doc2vec_model.find_similar_documents(
        query, threshold=threshold
    )
    hit_indices = np.asarray(hit_indices)
    if hit_indices.size == 0:
        return "No books found for the query."

    # The hits arrive in document order; rank them by similarity so the
    # top_n cut keeps the best matches instead of the earliest rows.
    # A stable sort keeps document order among equally similar titles.
    hit_scores = np.asarray(similarities)[0, hit_indices]
    ranked_hits = hit_indices[np.argsort(-hit_scores, kind="stable")]

    # BM25 fills the remaining slots, but only with documents that actually
    # share a term with the query (score > 0); zero-score documents are
    # unrelated and must not pad the result list.
    bm25_scores = np.asarray(bm25_model.get_scores(query.split()))
    bm25_order = np.argsort(bm25_scores)[::-1]
    bm25_matches = bm25_order[bm25_scores[bm25_order] > 0]

    # dict.fromkeys de-duplicates while preserving first-seen order, so a
    # document found by both rankers keeps its TF-IDF position.
    combined_indices = list(
        dict.fromkeys(ranked_hits.tolist() + bm25_matches.tolist())
    )

    return [
        {
            "Book": data['Book Title'].iloc[index],
            "Author": data['Author'].iloc[index],
            "Edition": data['Edition'].iloc[index],
            "File Name": data['File_name'].iloc[index],
        }
        for index in combined_indices[:top_n]
    ]
# Both retrieval models are built over the book titles, cast to str.
documents = data['Book Title'].astype(str)

# TF-IDF index: provides the similarity threshold used by answer().
tfidf_doc2vec_model = TFIDFDoc2Vec()
tfidf_doc2vec_model.initialize_vectors(documents)

# BM25 index over whitespace-tokenized titles.
bm25_model = BM25Okapi(list(map(str.split, documents)))

# Example usage: run one query and show what comes back.
query = "mathematics"
result = answer(query)
print(result)