achdaisy committed on
Commit
7a8017f
1 Parent(s): c7d3204

Upload book_metadata_retriever.py

Browse files
Files changed (1) hide show
  1. book_metadata_retriever.py +73 -0
book_metadata_retriever.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ #pip install rank-bm25
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+ from sklearn.model_selection import train_test_split
9
+ from rank_bm25 import BM25Okapi
10
+
11
+
12
# Location of the book catalogue CSV; the file is decoded as Latin-1.
BOOKS_CSV_PATH = r'C:\book_metadata_retriever\books.csv'

# Load the catalogue into a DataFrame used by the rest of the module.
data = pd.read_csv(BOOKS_CSV_PATH, encoding='latin1')
14
+
15
class TFIDFDoc2Vec:
    """TF-IDF document index with cosine-similarity lookup and a BM25 helper.

    Call :meth:`initialize_vectors` once with the corpus, then
    :meth:`find_similar_documents` for each query.
    """

    def __init__(self):
        self.tfidf_vectorizer = TfidfVectorizer()
        # Dense TF-IDF matrix of the corpus; stays None until
        # initialize_vectors() has been called.
        self.doc_vectors = None

    def initialize_vectors(self, documents):
        """Fit the vectorizer on *documents* and cache their TF-IDF vectors.

        Args:
            documents: Iterable of document strings.
        """
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(documents)
        self.doc_vectors = tfidf_matrix.toarray()

    def find_similar_documents(self, query, top_n=10):
        """Return the positions of the documents most similar to *query*.

        Args:
            query: Query string, vectorized with the fitted TF-IDF model.
            top_n: Maximum number of document positions to return.

        Returns:
            A list with one entry (there is a single query row): an array of
            up to ``top_n`` document positions ordered by decreasing cosine
            similarity.
        """
        query_vector = self.tfidf_vectorizer.transform([query]).toarray()
        similarities = cosine_similarity(query_vector, self.doc_vectors)
        # argsort is ascending, so reverse each row before truncating.
        ranked = similarities.argsort(axis=1)[:, ::-1][:, :top_n]
        return list(ranked)

    @staticmethod
    def rank_bm25(query, bm25_model, documents, top_n=5):
        """Return the positions of the ``top_n`` best BM25 matches for *query*.

        Declared as a static method so it can be called on the class or on
        an instance; it reads no instance state.

        Args:
            query: Query string or an already tokenized sequence of terms.
                A string is split on whitespace, because passing it as-is
                would make the model score it one character at a time.
            bm25_model: Object exposing ``get_scores(tokens)``.
            documents: Unused; kept for backward compatibility with callers.
            top_n: Maximum number of positions to return.

        Returns:
            Array of document positions ordered by decreasing BM25 score.
        """
        tokens = query.split() if isinstance(query, str) else query
        scores = bm25_model.get_scores(tokens)
        return np.argsort(scores)[::-1][:top_n]
38
+
39
# Book titles are the searchable text; astype(str) turns every cell,
# including missing values, into a string.
documents = data['Book Title'].astype(str)

# Build the TF-IDF index over all titles.
tfidf_doc2vec_model = TFIDFDoc2Vec()
tfidf_doc2vec_model.initialize_vectors(documents)

# Build the BM25 index over whitespace-tokenized titles.
bm25_model = BM25Okapi(documents.str.split())
50
+
51
+
52
def answer(query):
    """Print the catalogue entries whose titles best match *query*.

    Matches come from the module-level TF-IDF model; for every match the
    book title, author, edition and file name are printed.

    Args:
        query: Free-text query string.
    """
    similar_documents_indices = tfidf_doc2vec_model.find_similar_documents(query)

    # NOTE: a BM25 ranking used to be computed here, but it was stored in a
    # misspelled variable that nothing read and was fed the raw query
    # string, so it had no effect on the output. It was removed rather than
    # guessed at; TFIDFDoc2Vec.rank_bm25 is the hook if BM25 results should
    # be shown or used for re-ranking.

    print("Similar documents:")
    for idx, indices in enumerate(similar_documents_indices, start=1):
        print(f"{idx}.")
        for index in indices:
            # The indices are row positions (they come from argsort), so
            # use .iloc instead of label-based lookup.
            print(f" Book: {data['Book Title'].iloc[index]}")
            print(f" Author: {data['Author'].iloc[index]}")
            print(f" Edition: {data['Edition'].iloc[index]}")
            print(f" File Name: {data['File_name'].iloc[index]}")

            # Blank line between entries.
            # NOTE(review): the original nesting of this print() was lost
            # with the file's indentation — confirm it belongs per book.
            print()
70
+
71
+ # Receive query from the user
72
+ #query = input("Enter your query: ")
73
+ #answer(query)