# book_metadata_retriever / book_metadata_retriever.py
# achdaisy's picture
# Update book_metadata_retriever.py
# 1c5425b verified
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
# Load the book catalogue once, at import time. The path is relative, so the
# file is looked up in the current working directory, and it is decoded as
# Latin-1. The resulting DataFrame is the source of the titles that get
# indexed below and of the metadata columns returned by answer().
data = pd.read_csv(r'books.csv', encoding='latin1')
class TFIDFDoc2Vec:
    """TF-IDF index over a document collection with thresholded lookup.

    Documents are vectorised once with ``initialize_vectors``; afterwards
    ``find_similar_documents`` scores a query against every document with a
    dot product of the TF-IDF vectors.  ``TfidfVectorizer`` L2-normalises
    its rows by default, in which case that dot product is the cosine
    similarity.
    """

    def __init__(self):
        # Learns the vocabulary / IDF weights in initialize_vectors() and is
        # reused to encode queries in find_similar_documents().
        self.tfidf_vectorizer = TfidfVectorizer()
        # Dense TF-IDF matrix with one row per document; None until
        # initialize_vectors() has been called.
        self.doc_vectors = None

    def initialize_vectors(self, documents):
        """Fit the vectoriser on ``documents`` and cache their dense vectors.

        Args:
            documents: Iterable of strings, one per document.
        """
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(documents)
        self.doc_vectors = tfidf_matrix.toarray()

    def find_similar_documents(self, query, threshold=0.5):
        """Return the documents whose similarity to ``query`` is >= ``threshold``.

        Args:
            query: Query string; it is encoded with the fitted vectoriser.
            threshold: Minimum dot-product similarity for a document to be
                reported.

        Returns:
            A ``(indices, similarities)`` tuple.  ``indices`` is a 1-D array
            of row positions of the matching documents, ordered from most to
            least similar (ties keep their original row order) so that a
            caller taking the first *k* entries gets the best *k* matches.
            ``similarities`` is the full score matrix with one row (the
            query) and one column per indexed document.
        """
        query_vector = self.tfidf_vectorizer.transform([query]).toarray()
        similarities = np.dot(query_vector, self.doc_vectors.T)
        # Only one query row exists, so the column indices are the documents.
        matches = np.where(similarities >= threshold)[1]
        # Negate so that a stable ascending sort yields best-first order.
        best_first = np.argsort(-similarities[0, matches], kind="stable")
        return matches[best_first], similarities
def answer(query, threshold=0.5, top_n=10):
    """Return metadata for the books whose titles best match ``query``.

    Two module-level models are consulted: ``tfidf_doc2vec_model`` yields
    every title whose TF-IDF similarity to the query is at least
    ``threshold``, and ``bm25_model`` scores every title against the
    whitespace-split query.  TF-IDF matches come first, ranked by BM25 score
    with TF-IDF similarity as the tie-breaker; titles matched by BM25 only
    follow, best score first.  A title that is neither a TF-IDF match nor has
    a positive BM25 score is never returned, so the list may be shorter than
    ``top_n``.

    Args:
        query: Free-text search string.
        threshold: Minimum TF-IDF similarity for a title to count as a match.
        top_n: Maximum number of books to return.

    Returns:
        The string ``"No books found for the query."`` when no title reaches
        ``threshold``; otherwise a list of at most ``top_n`` dicts with the
        keys ``"Book"``, ``"Author"``, ``"Edition"``, ``"Copyright Year"``
        and ``"File Name"``, read from the module-level ``data`` frame.
    """
    tfidf_hits, similarities = tfidf_doc2vec_model.find_similar_documents(
        query, threshold=threshold
    )
    # len() rather than truthiness: the hits arrive as a NumPy array.
    if len(tfidf_hits) == 0:
        return "No books found for the query."

    # The query is tokenised the same way the BM25 corpus is built at module
    # level: plain whitespace split with no case folding, so BM25 matching is
    # case-sensitive.
    bm25_scores = np.asarray(bm25_model.get_scores(query.split()), dtype=float)
    tfidf_scores = np.asarray(similarities, dtype=float).reshape(-1)

    # Rank the TF-IDF matches instead of leaving them in corpus order.
    # lexsort treats its LAST key as the primary one: BM25 descending first,
    # then TF-IDF similarity descending.
    tfidf_hits = np.unique(tfidf_hits)
    best_first = np.lexsort(
        (-tfidf_scores[tfidf_hits], -bm25_scores[tfidf_hits])
    )
    ranked = [int(i) for i in tfidf_hits[best_first]]

    # Append titles that only BM25 matched.  Scores are visited in descending
    # order, so the first non-positive score ends the scan; this keeps
    # non-matching books from padding the result up to top_n.
    seen = set(ranked)
    for index in np.argsort(-bm25_scores, kind="stable"):
        if bm25_scores[index] <= 0:
            break
        if int(index) not in seen:
            ranked.append(int(index))
            seen.add(int(index))

    # Output key -> column of the books frame it is read from.
    columns = {
        "Book": "Book Title",
        "Author": "Author",
        "Edition": "Edition",
        "Copyright Year": "Copyright Year",
        "File Name": "File_name",
    }
    return [
        {label: data[column].iloc[index] for label, column in columns.items()}
        for index in ranked[:top_n]
    ]
# Build both retrieval models once, at import time, from the book titles.
# astype(str) guarantees every title is a string before it is vectorised
# and tokenised.
documents = data['Book Title'].astype(str)

# TF-IDF index: supplies the thresholded similarity matches used by answer().
tfidf_doc2vec_model = TFIDFDoc2Vec()
tfidf_doc2vec_model.initialize_vectors(documents)

# BM25 index over the whitespace-tokenised titles.
bm25_model = BM25Okapi(list(map(str.split, documents)))

# Example usage:
# query = "mathematics"
# result = answer(query)
# print(result)