|
import numpy as np |
|
import pandas as pd |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from rank_bm25 import BM25Okapi |
|
|
|
|
|
# Book catalogue, loaded once at import time and read by answer() and the
# model set-up further down. The path is relative to the current working
# directory; latin1 is requested explicitly rather than the UTF-8 default.
data = pd.read_csv(r'books.csv', encoding='latin1')
|
|
|
class TFIDFDoc2Vec:
    """Match documents against a free-text query using TF-IDF vectors.

    Documents are embedded with ``TfidfVectorizer`` and compared to the
    query by dot product (with the vectorizer's default L2 normalisation
    this should equal cosine similarity).
    """

    def __init__(self):
        self.tfidf_vectorizer = TfidfVectorizer()
        # Dense array with one row per document; stays None until
        # initialize_vectors() has been called.
        self.doc_vectors = None

    def initialize_vectors(self, documents):
        """Fit the vectorizer on *documents* and cache their TF-IDF vectors.

        Args:
            documents: Iterable of strings, one per document.
        """
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(documents)
        self.doc_vectors = tfidf_matrix.toarray()

    def find_similar_documents(self, query, threshold=0.5):
        """Return the documents whose similarity to *query* meets *threshold*.

        Must be called after initialize_vectors(), which populates
        ``doc_vectors``.

        Args:
            query: Query text.
            threshold: Minimum similarity (inclusive) for a document to be
                reported.

        Returns:
            A ``(similar_indices, similarities)`` tuple. ``similar_indices``
            is a 1-D array of matching document positions ordered from most
            to least similar (ties keep document order); ``similarities`` is
            the full ``(1, n_documents)`` score matrix.
        """
        query_vector = self.tfidf_vectorizer.transform([query]).toarray()
        similarities = np.dot(query_vector, self.doc_vectors.T)
        # similarities has a single row, so the column indices are the
        # document positions.
        similar_indices = np.where(similarities >= threshold)[1]
        # Best match first, so callers that truncate the result keep the most
        # relevant documents instead of whichever come first in the corpus.
        ranking = np.argsort(-similarities[0, similar_indices], kind="stable")
        return similar_indices[ranking], similarities
|
|
|
def answer(query, threshold=0.5, top_n=10):
    """Return details of up to *top_n* books relevant to *query*.

    Books whose TF-IDF similarity reaches *threshold* come first, in the
    order reported by ``find_similar_documents``. Remaining slots are filled
    with books that BM25 scores above zero, best first. Books with a zero
    BM25 score share no token with the query and are left out rather than
    used as padding.

    Args:
        query: Free-text query. For BM25 it is tokenised with a plain
            whitespace split; no case folding is applied here.
        threshold: Minimum TF-IDF similarity for a primary match.
        top_n: Maximum number of books to return.

    Returns:
        A list of dicts with the keys ``Book``, ``Author``, ``Edition``,
        ``Copyright Year`` and ``File Name``, or the string
        ``"No books found for the query."`` when no title reaches the
        TF-IDF threshold.
    """
    similar_documents_indices, _ = tfidf_doc2vec_model.find_similar_documents(
        query, threshold=threshold
    )

    # An explicit length check: truthiness of a multi-element NumPy array
    # raises instead of answering "is it empty?".
    if len(similar_documents_indices) == 0:
        return "No books found for the query."

    scores = np.asarray(bm25_model.get_scores(query.split()))
    bm25_ranked_indices = np.argsort(scores)[::-1]
    # Keep only documents BM25 actually matched; a zero score carries no
    # evidence of relevance.
    bm25_hits = bm25_ranked_indices[scores[bm25_ranked_indices] > 0]

    # dict.fromkeys de-duplicates while preserving first-seen order, so the
    # TF-IDF matches stay ahead of the BM25 additions.
    combined_indices = list(
        dict.fromkeys([*similar_documents_indices, *bm25_hits])
    )

    # Output key -> column name in the catalogue.
    columns = {
        "Book": "Book Title",
        "Author": "Author",
        "Edition": "Edition",
        "Copyright Year": "Copyright Year",
        "File Name": "File_name",
    }
    return [
        {label: data[column].iloc[index] for label, column in columns.items()}
        for index in combined_indices[:top_n]
    ]
|
|
|
|
|
# Build both retrieval models over the book titles at import time; answer()
# reads them as module-level globals.
tfidf_doc2vec_model = TFIDFDoc2Vec()
# Missing titles are read as NaN; blank them first so they are not indexed
# as the literal word "nan".
documents = data['Book Title'].fillna('').astype(str)
tfidf_doc2vec_model.initialize_vectors(documents)

# BM25Okapi takes pre-tokenised documents; a whitespace split mirrors the
# tokenisation answer() applies to the query.
bm25_model = BM25Okapi([doc.split() for doc in documents])

# Example query.
query = "mathematics"
result = answer(query)
print(result)
|
|