File size: 2,353 Bytes
7a8017f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73

# Dependency: install the BM25 implementation with `pip install rank-bm25`.

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from rank_bm25 import BM25Okapi


# Load the book metadata table from disk. The file is decoded as latin1 and
# the path is a fixed local Windows location.
data = pd.read_csv(
    r'C:\book_metadata_retriever\books.csv',
    encoding='latin1',
)

class TFIDFDoc2Vec:
    """TF-IDF document index with cosine-similarity lookup and a BM25 helper.

    Call ``initialize_vectors`` once with the corpus before calling
    ``find_similar_documents``; until then ``doc_vectors`` is ``None``.
    """

    def __init__(self):
        self.tfidf_vectorizer = TfidfVectorizer()
        # Dense TF-IDF matrix of the corpus; filled in by initialize_vectors.
        self.doc_vectors = None

    def initialize_vectors(self, documents):
        """Fit the TF-IDF vocabulary on *documents* and store their vectors.

        Args:
            documents: Iterable of raw text strings, one per document.
        """
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(documents)
        self.doc_vectors = tfidf_matrix.toarray()

    def find_similar_documents(self, query, top_n=10):
        """Return the positions of the documents most similar to *query*.

        Args:
            query: Raw query string.
            top_n: Maximum number of document positions to return.

        Returns:
            A list with one entry (there is exactly one query): an array of
            up to ``top_n`` row positions into the fitted corpus, ordered
            from highest to lowest cosine similarity.
        """
        query_vector = self.tfidf_vectorizer.transform([query]).toarray()
        similarities = cosine_similarity(query_vector, self.doc_vectors)
        # argsort is ascending, so reverse each row before truncating.
        ranked = similarities.argsort(axis=1)[:, ::-1][:, :top_n]
        return list(ranked)

    @staticmethod
    def rank_bm25(query, bm25_model, documents, top_n=5):
        """Return the positions of the ``top_n`` best BM25 matches for *query*.

        Declared as a static method so it works both as
        ``TFIDFDoc2Vec.rank_bm25(...)`` and on an instance.

        Args:
            query: Either a raw string or an already tokenised sequence of
                terms. Strings are split on whitespace, because handing a
                bare string to ``get_scores`` would score it one character
                at a time.
            bm25_model: Object exposing ``get_scores(tokens)`` that returns
                one score per corpus document.
            documents: Unused; kept so existing callers keep working.
            top_n: Maximum number of document positions to return.

        Returns:
            Array of document positions ordered from highest to lowest score.
        """
        tokens = query.split() if isinstance(query, str) else list(query)
        scores = bm25_model.get_scores(tokens)
        return np.argsort(scores)[::-1][:top_n]

# Select the column containing book titles; cast to str so every entry
# (including missing values) can be vectorised and split.
documents = data['Book Title'].astype(str)

# Build the TF-IDF index over the titles.
tfidf_doc2vec_model = TFIDFDoc2Vec()
tfidf_doc2vec_model.initialize_vectors(documents)

# Build the BM25 index. The corpus is tokenised with a plain whitespace
# split, so queries scored against it should be tokenised the same way.
bm25_model = BM25Okapi(documents.str.split())


def answer(query):
    """Print the catalogue entries whose titles best match *query*.

    The module-level TF-IDF model ranks the titles by cosine similarity and
    the title, author, edition and file name of each hit are written to
    stdout.

    Args:
        query: Raw query string typed by the user.

    Returns:
        None. The results are printed, not returned.
    """
    similar_documents_indices = tfidf_doc2vec_model.find_similar_documents(query)

    # NOTE(review): a BM25 ranking used to be computed here, stored in a
    # misspelled local and never read. The dead call was removed; BM25
    # re-ranking of the TF-IDF hits is not wired into the output yet.

    # Print list of similar documents
    print("Similar documents:")
    for group_number, indices in enumerate(similar_documents_indices, start=1):
        print(f"{group_number}.")
        for index in indices:
            # The indices are row positions produced by argsort, so use
            # positional .iloc rather than label-based lookup.
            print(f"   Book: {data['Book Title'].iloc[index]}")
            print(f"   Author: {data['Author'].iloc[index]}")
            print(f"   Edition: {data['Edition'].iloc[index]}")
            print(f"   File Name: {data['File_name'].iloc[index]}")

            print()

# Receive query from the user
#query = input("Enter your query: ")
#answer(query)