Spaces:
Sleeping
Sleeping
File size: 2,353 Bytes
7a8017f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
#pip install rank-bm25
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from rank_bm25 import BM25Okapi
# Location of the book-metadata CSV (an absolute Windows path).
BOOKS_CSV_PATH = r'C:\book_metadata_retriever\books.csv'
# Load the metadata table, decoding the file as Latin-1.
data = pd.read_csv(BOOKS_CSV_PATH, encoding='latin1')
class TFIDFDoc2Vec:
    """Retrieve documents by TF-IDF cosine similarity, plus a BM25 ranking helper.

    Call ``initialize_vectors`` once with the corpus before calling
    ``find_similar_documents``. ``rank_bm25`` is independent of instance state
    and is called on the class itself.
    """

    def __init__(self):
        """Create an unfitted vectorizer with no cached document vectors."""
        self.tfidf_vectorizer = TfidfVectorizer()
        # Dense TF-IDF matrix of the corpus; stays None until
        # initialize_vectors() has been called.
        self.doc_vectors = None

    def initialize_vectors(self, documents):
        """Fit the TF-IDF vectorizer on ``documents`` and cache their vectors.

        Args:
            documents: Iterable of raw text strings, one per document.
        """
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(documents)
        self.doc_vectors = tfidf_matrix.toarray()

    def find_similar_documents(self, query, top_n=10):
        """Return the positions of the documents most similar to ``query``.

        Args:
            query: Raw query string.
            top_n: Maximum number of document positions to return.

        Returns:
            A list holding one array (there is a single query) of document
            positions, ordered from most to least cosine-similar.
        """
        query_vector = self.tfidf_vectorizer.transform([query]).toarray()
        similarities = cosine_similarity(query_vector, self.doc_vectors)
        # argsort is ascending, so reverse each row before taking the top_n.
        similar_indices = similarities.argsort(axis=1)[:, ::-1][:, :top_n]
        return list(similar_indices)

    @staticmethod
    def rank_bm25(query, bm25_model, documents, top_n=5):
        """Return the positions of the ``top_n`` highest-scoring documents.

        Args:
            query: Query as a raw string (split on whitespace here) or as an
                already tokenized sequence of terms.
            bm25_model: Object exposing ``get_scores(tokens)`` that returns one
                score per document.
            documents: Unused; kept so existing callers keep working.
            top_n: Maximum number of document positions to return.

        Returns:
            Array of document positions ordered from highest to lowest score.
        """
        # get_scores() iterates over query terms; handing it a raw string would
        # score every single character as if it were a term.
        tokens = query.split() if isinstance(query, str) else list(query)
        scores = bm25_model.get_scores(tokens)
        return np.argsort(scores)[::-1][:top_n]
# Select the column containing book titles. Missing titles are replaced with
# empty strings first so they are not indexed as the literal word "nan".
documents = data['Book Title'].fillna('').astype(str)
# Initialize TF-IDF vectors and model
tfidf_doc2vec_model = TFIDFDoc2Vec()
tfidf_doc2vec_model.initialize_vectors(documents)
# Initialize BM25 model on the whitespace-tokenized titles
bm25_model = BM25Okapi(documents.str.split())
def answer(query):
    """Print the metadata of the books whose titles best match ``query``.

    Looks up the most similar titles with the module-level TF-IDF model and
    prints the title, author, edition and file name of each hit.

    Args:
        query: Raw query string.
    """
    # Find similar documents
    similar_documents_indices = tfidf_doc2vec_model.find_similar_documents(query)
    # Rank similar documents using BM25
    # NOTE(review): this ranking is computed but not displayed below; confirm
    # whether it was meant to reorder or replace the TF-IDF results.
    similar_documents_indices_bm25 = TFIDFDoc2Vec.rank_bm25(query, bm25_model, documents)
    # Print list of similar documents
    print("Similar documents:")
    for idx, indices in enumerate(similar_documents_indices):
        print(f"{idx+1}.")
        for index in indices:
            # The indices are row positions produced by argsort, so use .iloc
            # rather than label-based lookup.
            print(f" Book: {data['Book Title'].iloc[index]}")
            print(f" Author: {data['Author'].iloc[index]}")
            print(f" Edition: {data['Edition'].iloc[index]}")
            print(f" File Name: {data['File_name'].iloc[index]}")
            print()
# Receive query from the user
# query = input("Enter your query: ")
# answer(query)