Rushi2903 committed on
Commit ad35e35
1 Parent(s): c0f4686

Upload operations.py

Files changed (1)
  1. operations.py +177 -0
operations.py ADDED
@@ -0,0 +1,177 @@
+ from numpy.linalg import norm
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ import PyPDF2
+ from nltk.tokenize import sent_tokenize
+
+
+ def read_pdf(fname):
+     """
+     This function reads the PDF file and extracts the text from it.
+
+     Parameters:
+     fname (str): Name of the PDF file
+
+     Returns:
+     text_ext (list): List of text extracted from each page of the PDF file
+     """
+     reader = PyPDF2.PdfReader(fname)
+     text_ext = []
+     for page in reader.pages:
+         # extracting text from each page
+         text_ext.append(page.extract_text())
+
+     return text_ext
+
+
+ def tokenize_sentences(text_ext):
+     """
+     This function applies NLTK's sent_tokenize to the text of each page and stores the result in a list.
+
+     Parameters:
+     text_ext (list): List of text extracted from the PDF file
+
+     Returns:
+     sent_toks (list): List of tokenized sentences, one list per page
+     """
+     sent_toks = []
+
+     for i in text_ext:
+         # split each page's text into individual sentences
+         sent_toks.append(sent_tokenize(i))
+     print("len(sent_toks) ", len(sent_toks))
+
+     return sent_toks
+
+
+ def create_content_embeddings(concat_list):
+     """
+     This function creates embeddings for the document sentences.
+
+     Parameters:
+     concat_list (list): List of tokenized sentences
+
+     Returns:
+     embeddings (list): List of embeddings of the sentences
+     """
+     model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+     embeddings = model.encode(concat_list)
+
+     return embeddings
+
+
+ def create_query_embeddings(query_text):
+     """
+     This function creates embeddings for the query.
+
+     Parameters:
+     query_text (str): Query entered by the user
+
+     Returns:
+     query_embedding (list): List of embeddings of the query
+     """
+     model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+     query_embedding = model.encode(query_text)
+     return query_embedding
+
+
+ def calculate_cosine(query_embedding, embeddings, concat_list):
+     """
+     This function calculates the cosine similarity between the query and the sentences.
+
+     Parameters:
+     query_embedding (list): List of embeddings of the query
+     embeddings (list): List of embeddings of the sentences
+     concat_list (list): List of tokenized sentences
+
+     Returns:
+     cosine_lis (list): List of cosine similarity values
+     """
+     cosine_lis = []
+
+     for i in range(len(concat_list)):
+         # cosine similarity: dot(a, b) / (||a|| * ||b||)
+         cosine = np.dot(query_embedding,
+                         embeddings[i]) / (norm(query_embedding) * norm(embeddings[i]))
+         cosine_lis.append(cosine)
+
+     return cosine_lis
+
+
+ def fetch_top_rank_ans(cosine_lis, N):
+     """
+     This function fetches the indices of the top N ranked sentences.
+
+     Parameters:
+     cosine_lis (list): List of cosine similarity values
+     N (int): Number of sentences to rank
+
+     Returns:
+     indexes_final (list): Indices of the top N ranked sentences
+     """
+     # sort the sentence indices by similarity score, highest first, and keep the top N
+     indexes_final = sorted(
+         range(len(cosine_lis)), key=lambda i: cosine_lis[i], reverse=True)[:N]
+
+     print("indexes_final ", indexes_final)
+     return indexes_final
+
+
+ def fetch_most_relevant(indexes_final, concat_list, list1, query):
+     """
+     This function fetches the most relevant sentences and passes them as context to a GPT-3 prompt along with the user's query.
+
+     Parameters:
+     indexes_final (list): Indices of the top N ranked sentences
+     concat_list (list): List of tokenized sentences
+     list1 (list): List of cosine similarity values
+     query (str): Query entered by the user
+
+     Returns:
+     prompt (str): GPT-3 prompt
+     """
+     chosen_sections = []
+     chosen_sections_len = 0
+     chosen_sections_indexes = []
+
+     # collect the top-ranked sections until the context budget is exhausted
+     for section_index in indexes_final:
+         if chosen_sections_len > 500:
+             break
+         section = concat_list[section_index].replace("\n", " ")
+         chosen_sections.append(section)
+         chosen_sections_len += len(section)
+         chosen_sections_indexes.append(str(section_index))
+
+     # Useful diagnostic information
+     print(f"Selected {len(chosen_sections)} document sections:")
+
+     header = """Answer the question as a human in natural language conversation using the provided context, and if the answer is not contained within the text below, say "I don't have that information"\n\nContext:\n"""
+
+     prompt = header + "".join(chosen_sections) + "\n\n Q: " + query + "\n A:"
+     return prompt