achdaisy committed on
Commit
f1263cb
1 Parent(s): 3d1d864

Upload book_metadata_retriever.py

Browse files
Files changed (1) hide show
  1. book_metadata_retriever.py +48 -53
book_metadata_retriever.py CHANGED
@@ -1,16 +1,10 @@
1
-
2
- #pip install rank-bm25
3
-
4
  import numpy as np
5
  import pandas as pd
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
- from sklearn.metrics.pairwise import cosine_similarity
8
- from sklearn.model_selection import train_test_split
9
  from rank_bm25 import BM25Okapi
10
 
11
-
12
  # Read CSV file
13
- data = pd.read_csv('books.csv', encoding='latin1')
14
 
15
  class TFIDFDoc2Vec:
16
  def __init__(self):
@@ -21,59 +15,60 @@ class TFIDFDoc2Vec:
21
  tfidf_matrix = self.tfidf_vectorizer.fit_transform(documents)
22
  self.doc_vectors = tfidf_matrix.toarray()
23
 
24
- def find_similar_documents(self, query, top_n=10):
25
  query_vector = self.tfidf_vectorizer.transform([query]).toarray()
26
- similarities = cosine_similarity(query_vector, self.doc_vectors)
27
- similar_indices = similarities.argsort(axis=1)[:, ::-1][:, :top_n]
 
28
 
29
- similar_documents = []
30
- for indices in similar_indices:
31
- similar_documents.append(indices)
32
- return similar_documents
33
 
34
- def rank_bm25(self, query, bm25_model, documents, top_n=10):
35
- scores = bm25_model.get_scores(query)
36
- top_indices = np.argsort(scores)[::-1][:top_n]
37
- return top_indices
38
- data
39
 
40
- # Select the column containing book titles
41
- documents = data['Book Title'].astype(str)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
- # Initialize TF-IDF vectors and model
44
  tfidf_doc2vec_model = TFIDFDoc2Vec()
 
45
  tfidf_doc2vec_model.initialize_vectors(documents)
46
 
47
  # Initialize BM25 model
48
- bm25_model = BM25Okapi(documents.str.split())
49
-
50
-
51
- def answer(query):
52
- # Find similar documents
53
- similar_documents_indices = tfidf_doc2vec_model.find_similar_documents(query)
54
-
55
- # Rank similar documents using BM25
56
- similar_documents_indices_bm25 = tfidf_doc2vec_model.rank_bm25(query, bm25_model, documents)
57
-
58
- # Initialize a list to store ranked documents
59
- ranked_documents = []
60
-
61
- # Add details of each document to the list
62
- for indices in similar_documents_indices:
63
- for index in indices:
64
- document_details = {
65
- "Book": data['Book Title'].iloc[index],
66
- "Author": data['Author'].iloc[index],
67
- "Copyright Year": data['Copyright Year'].iloc[index],
68
- "Edition": data['Edition'].iloc[index],
69
- "File Name": data['File_name'].iloc[index]
70
-
71
- }
72
- ranked_documents.append(document_details)
73
-
74
- return ranked_documents
75
 
76
- # Receive query from the user
77
- #query = input("Enter your query: ")
78
- #result = answer(query)
79
- #print(result)
 
 
 
 
1
  import numpy as np
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
 
 
4
  from rank_bm25 import BM25Okapi
5
 
 
6
import os

# Read CSV file.
# The location defaults to the original hard-coded Windows path, but it can be
# overridden with the BOOKS_CSV_PATH environment variable so the module also
# loads on machines where that absolute path does not exist.
BOOKS_CSV_PATH = os.environ.get('BOOKS_CSV_PATH', r'C:\book_metadata_retriever\books.csv')
data = pd.read_csv(BOOKS_CSV_PATH, encoding='latin1')
8
 
9
  class TFIDFDoc2Vec:
10
  def __init__(self):
 
15
  tfidf_matrix = self.tfidf_vectorizer.fit_transform(documents)
16
  self.doc_vectors = tfidf_matrix.toarray()
17
 
18
def find_similar_documents(self, query, threshold=0.5):
    """Find the indexed documents whose TF-IDF similarity to ``query`` is high enough.

    The query is vectorised with ``self.tfidf_vectorizer`` and compared to
    every row of ``self.doc_vectors`` with a dot product. (That equals the
    cosine similarity only if the vectorizer L2-normalises its output --
    the vectorizer's construction is not visible here, so confirm.)

    Args:
        query: Free-text search string.
        threshold: Minimum similarity score for a document to be returned.

    Returns:
        A ``(similar_indices, similarities)`` tuple. ``similar_indices`` is a
        1-D array of document positions with score >= ``threshold``, ordered
        from most to least similar (ties keep corpus order).
        ``similarities`` is the full ``(1, n_documents)`` score matrix.
    """
    query_vector = self.tfidf_vectorizer.transform([query]).toarray()
    similarities = np.dot(query_vector, self.doc_vectors.T)

    # Only one query is vectorised, so row 0 holds every score.
    scores = similarities[0]
    matches = np.flatnonzero(scores >= threshold)

    # Return the matches best-first instead of in corpus order, so callers
    # that truncate the result keep the most relevant documents.
    best_first = np.argsort(-scores[matches], kind="stable")
    return matches[best_first], similarities
23
 
24
def answer(query, threshold=0.5, top_n=10):
    """Return metadata for the books whose titles best match ``query``.

    Titles whose TF-IDF similarity reaches ``threshold`` come first, most
    similar first. Any remaining slots are filled with the other titles in
    descending BM25 score order.

    Args:
        query: Free-text search string.
        threshold: Minimum TF-IDF similarity for a title to count as a match.
        top_n: Maximum number of books to return.

    Returns:
        A list of dicts with the keys "Book", "Author", "Edition" and
        "File Name", or the string "No books found for the query." when no
        title reaches ``threshold``.
    """
    # Find similar documents using TF-IDF
    similar_documents_indices, similarities = tfidf_doc2vec_model.find_similar_documents(
        query, threshold=threshold
    )

    # The string sentinel (rather than an empty list) is kept as-is because
    # existing callers may depend on it.
    if len(similar_documents_indices) == 0:
        return "No books found for the query."

    # The matches may arrive in corpus order; rank them by similarity so the
    # top_n cut-off keeps the most relevant titles. sorted() is stable, so
    # equal scores keep their incoming order.
    tfidf_scores = np.asarray(similarities).ravel()
    tfidf_ranked = sorted(similar_documents_indices, key=lambda idx: -tfidf_scores[idx])

    # Rank every document with BM25 on the whitespace-tokenised query
    bm25_scores = bm25_model.get_scores(query.split())
    bm25_ranked_indices = np.argsort(bm25_scores)[::-1]

    # TF-IDF matches first, then the BM25 ranking; dict.fromkeys drops
    # duplicates while preserving first-seen order.
    # NOTE(review): the BM25 tail covers the whole corpus regardless of score,
    # so short TF-IDF result sets are padded up to top_n -- confirm intended.
    combined_indices = list(dict.fromkeys([*tfidf_ranked, *bm25_ranked_indices]))

    # Retrieve document details
    ranked_documents = [
        {
            "Book": data['Book Title'].iloc[index],
            "Author": data['Author'].iloc[index],
            "Edition": data['Edition'].iloc[index],
            "File Name": data['File_name'].iloc[index],
        }
        for index in combined_indices[:top_n]
    ]

    return ranked_documents
62
 
63
# Titles are the only text that gets indexed; force them to str first
documents = data['Book Title'].astype(str)

# Build the TF-IDF model over the titles
tfidf_doc2vec_model = TFIDFDoc2Vec()
tfidf_doc2vec_model.initialize_vectors(documents)

# Build the BM25 model over whitespace-tokenised titles
tokenized_titles = list(map(str.split, documents))
bm25_model = BM25Okapi(tokenized_titles)

# Example usage
# NOTE(review): this runs (and prints) every time the module is imported.
query = "mathematics"
result = answer(query)
print(result)