|
|
|
|
|
|
|
import numpy as np |
|
import pandas as pd |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
from sklearn.model_selection import train_test_split |
|
from rank_bm25 import BM25Okapi |
|
|
|
|
|
|
|
# Load the book catalogue from the working directory, decoding it as Latin-1.
data = pd.read_csv("books.csv", encoding="latin1")
|
|
|
class TFIDFDoc2Vec:
    """TF-IDF document index with cosine-similarity lookup.

    Documents are embedded with scikit-learn's ``TfidfVectorizer`` and
    compared to a query by cosine similarity.  The class also hosts
    :meth:`rank_bm25`, a stateless helper that ranks documents using an
    externally built BM25 model.
    """

    def __init__(self):
        # Fitted later by ``initialize_vectors``.
        self.tfidf_vectorizer = TfidfVectorizer()
        # Dense TF-IDF matrix of the indexed documents; ``None`` until
        # ``initialize_vectors`` has been called.
        self.doc_vectors = None

    def initialize_vectors(self, documents):
        """Fit the vectorizer on *documents* and cache their TF-IDF vectors.

        Args:
            documents: Iterable of raw text strings, one per document.
        """
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(documents)
        # Kept dense so ``doc_vectors`` stays a plain ndarray for callers.
        self.doc_vectors = tfidf_matrix.toarray()

    def find_similar_documents(self, query, top_n=10):
        """Return the indices of the documents most similar to *query*.

        Args:
            query: Raw query string.
            top_n: Maximum number of document indices to return.

        Returns:
            A list holding one array (there is a single query) of at most
            ``top_n`` positional document indices, most similar first.
        """
        query_vector = self.tfidf_vectorizer.transform([query]).toarray()
        similarities = cosine_similarity(query_vector, self.doc_vectors)
        # argsort is ascending, so reverse each row before truncating.
        ranked = similarities.argsort(axis=1)[:, ::-1][:, :top_n]
        return list(ranked)

    @staticmethod
    def rank_bm25(query, bm25_model, documents, top_n=5):
        """Return the indices of the ``top_n`` best BM25 matches for *query*.

        Args:
            query: Either a raw query string or an already tokenised
                sequence of terms.  A string is split on whitespace,
                because ``get_scores`` iterates over query terms and would
                otherwise score the string one character at a time.
            bm25_model: Object exposing ``get_scores(tokens)`` that returns
                one score per indexed document.
            documents: Unused; kept so existing call sites keep working.
            top_n: Maximum number of document indices to return.

        Returns:
            Array of positional document indices, highest score first.
        """
        tokens = query.split() if isinstance(query, str) else query
        scores = bm25_model.get_scores(tokens)
        return np.argsort(scores)[::-1][:top_n]
|
|
|
# Notebook-style echo of the loaded table; evaluating the bare name has no
# effect when the file runs as a plain script.
data

# One document per row: the 'Book Title' column cast to str.
documents = data["Book Title"].astype(str)

# Fit the TF-IDF index over every title.
tfidf_doc2vec_model = TFIDFDoc2Vec()
tfidf_doc2vec_model.initialize_vectors(documents)

# Build the BM25 index over the whitespace-split titles.
bm25_model = BM25Okapi(documents.str.split())
|
|
|
|
|
def _print_book(index):
    """Print the catalogue fields of the book at positional row *index*."""
    fields = (
        ("Book", "Book Title"),
        ("Author", "Author"),
        ("Edition", "Edition"),
        ("File Name", "File_name"),
    )
    for label, column in fields:
        # ``iloc`` because the rankers return row positions, not index labels.
        print(f" {label}: {data[column].iloc[index]}")
    print()


def answer(query):
    """Print the books whose titles best match *query*.

    Two rankings are shown: TF-IDF cosine similarity (via the module-level
    ``tfidf_doc2vec_model``) followed by BM25 (via ``bm25_model``).

    Args:
        query: Raw query string.

    Returns:
        None; the results are written to standard output.
    """
    similar_documents_indices = tfidf_doc2vec_model.find_similar_documents(query)

    # BM25 scores are computed per query term, so pass whitespace tokens
    # rather than the raw string (which would be scored per character).
    similar_documents_indices_bm25 = TFIDFDoc2Vec.rank_bm25(
        query.split(), bm25_model, documents
    )

    print("Similar documents:")
    for idx, indices in enumerate(similar_documents_indices):
        print(f"{idx+1}.")
        for index in indices:
            _print_book(index)

    # Previously computed but discarded; shown in its own section.
    print("Similar documents (BM25):")
    for index in similar_documents_indices_bm25:
        _print_book(index)
|
|
|
|
|
|
|
|