ravithejads committed on
Commit
6844495
1 Parent(s): 4557c88

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +258 -0
app.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from gpt_index import Document, GPTListIndex
2
+ import gradio as gr
3
+ import openai
4
+ import os
5
+ import PyPDF2
6
+ import docx
7
+ import pytesseract
8
+ from PIL import Image
9
+
10
def pdftotext(file_name):
    """
    Extract the text of every page of a .pdf file.

    Args:
        file_name: Path of the PDF file to read.

    Returns:
        The text of all pages, in page order, joined with newlines.
    """
    # Open the PDF file in read-binary mode; the reader pulls data from the
    # handle while extracting, so extraction stays inside the ``with``.
    with open(file_name, 'rb') as file:
        pdf = PyPDF2.PdfReader(file)

        # Walk the pages directly instead of indexing by page number.
        # ``or ""`` keeps the join below safe if a page yields no string.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    return "\n".join(page_texts)
33
+
34
def docxtotext(file_name):
    """
    Read a .docx file and return the text of its paragraphs.

    Args:
        file_name: Path of the Word document to read.

    Returns:
        The text of each paragraph, in document order, joined with newlines.
    """
    word_file = docx.Document(file_name)

    # One entry per paragraph, newline separated
    paragraph_texts = (para.text for para in word_file.paragraphs)
    return '\n'.join(paragraph_texts)
45
+
46
def readtextfile(file_name):
    """
    Read a .txt file and return its contents.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding. Bytes that are not valid UTF-8 are replaced with U+FFFD rather
    than raising, so a single stray byte does not abort the whole upload.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The decoded contents of the file as a single string.
    """
    with open(file_name, 'r', encoding='utf-8', errors='replace') as file:
        return file.read()
56
+
57
def imagetotext(file_name):
    """
    Extract text from an image using OCR.

    Args:
        file_name: Path of the image file to read.

    Returns:
        The string returned by ``pytesseract.image_to_string`` for the image.
    """
    # Open the image with PIL inside a context manager so the underlying
    # file handle is released once OCR has finished.
    with Image.open(file_name) as image:
        return pytesseract.image_to_string(image)
68
+
69
def preprocesstext(text):
    """
    Drop blank lines from a block of text.

    Lines that are empty or contain only whitespace are removed; the
    remaining lines are kept unchanged (they are not stripped) and are
    re-joined with a single newline.

    Args:
        text: The text to clean.

    Returns:
        The text without blank lines.
    """
    # str.strip returns "" (falsy) for whitespace-only lines, so it doubles
    # as the keep/drop predicate.
    kept_lines = filter(str.strip, text.splitlines())
    return '\n'.join(kept_lines)
81
+
82
def processfiles(files):
    """
    Extract and clean the text of every provided file.

    Args:
        files: Iterable of objects whose ``name`` attribute is the path of
            a file on disk.

    Returns:
        A list with one text string per input file, in input order. Files
        whose extension is not supported contribute an empty string.
    """
    textlist = []

    # Iterate over provided files
    for file in files:
        file_name = file.name

        # Take the extension from the final path component only. A plain
        # split(".") would treat an extension-less file called "txt" as a
        # text file and would pick up dots in directory names.
        ext = os.path.splitext(file_name)[1][1:].lower()

        # Pick the reader based on the extension
        if ext == "pdf":
            text = pdftotext(file_name)
        elif ext == "docx":
            text = docxtotext(file_name)
        elif ext == "txt":
            text = readtextfile(file_name)
        elif ext in ("png", "jpg", "jpeg"):
            text = imagetotext(file_name)
        else:
            # Unsupported type: there is nothing to extract or clean
            textlist.append("")
            continue

        # Remove blank lines before storing the result
        textlist.append(preprocesstext(text))

    return textlist
114
+
115
def createdocuments(textlist):
    """
    Wrap each text string in a ``Document`` for indexing.

    Args:
        textlist: Iterable of text strings.

    Returns:
        A list of ``Document`` objects, one per input string, in order.
    """
    return [Document(doc_text) for doc_text in textlist]
125
+
126
def fileformatvaliditycheck(files):
    """
    Check that every provided file has a supported extension.

    Supported extensions (case-insensitive) are pdf, txt, docx, png, jpg
    and jpeg.

    Args:
        files: Iterable of objects whose ``name`` attribute is a file path.

    Returns:
        True when all files have a supported extension (also for an empty
        iterable), False as soon as one does not.
    """
    allowed = {"pdf", "txt", "docx", "png", "jpg", "jpeg"}

    # splitext looks only at the final path component, so a file with no
    # extension that happens to be named "pdf" is rejected rather than
    # being mistaken for a PDF.
    return all(
        os.path.splitext(file1.name)[1][1:].lower() in allowed
        for file1 in files
    )
139
+
140
def openaiapikeyvaliditycheck(openaikey):
    """
    Check an OpenAI API key by making a request with it.

    The key is assigned to ``openai.api_key`` and a model-list request is
    issued; any ``openai.OpenAIError`` is reported as an incorrect key.

    Args:
        openaikey: The API key to check.

    Returns:
        "Valid OpenAI API key" when the request succeeds, otherwise a
        message that includes the key and a link to the API-keys page.
    """
    openai.api_key = openaikey

    try:
        # Probe request; only success or failure matters, not the result
        openai.Model.list()
    except openai.OpenAIError:
        apikeylink = "https://beta.openai.com/account/api-keys"
        return f"Incorrect OpenAI API key provided: {openaikey}. You can find your OpenAI API key here - {apikeylink}"
    else:
        return "Valid OpenAI API key"
153
+
154
def createindex(files, openaikey):
    """
    Validate the inputs, build an index from the files and save it.

    Checks run in this order: files present, file formats supported, key
    present, key accepted by OpenAI. The first failing check returns its
    message. On success the key is stored in the ``OPENAI_API_KEY``
    environment variable and the index is written to ``index.json``.

    Args:
        files: Files to index (objects with a ``name`` path attribute).
        openaikey: OpenAI API key entered by the user.

    Returns:
        A status message describing the first failed check or the success.
    """
    # Guard clauses: bail out with a message on the first failed check
    if not files:
        return "Upload file before proceeding further."

    if not fileformatvaliditycheck(files):
        return "Please upload documents in pdf/txt/docx/png/jpg/jpeg format only."

    if not openaikey:
        return "Please enter your openai key."

    key_status = openaiapikeyvaliditycheck(openaikey)
    if key_status != "Valid OpenAI API key":
        return key_status

    # Store openai key in environment
    os.environ['OPENAI_API_KEY'] = openaikey

    # Extract text from the files and wrap it for indexing
    documents = createdocuments(processfiles(files))

    # Build the index and persist it
    GPTListIndex(documents, chunk_size_limit=3500).save_to_disk('index.json')

    return "Uploading documents successfully. OpenAI API Key provided is Valid."
189
+
190
def docques(query, openaikey):
    """
    Answer a question using the index saved in ``index.json``.

    Args:
        query: The question to ask.
        openaikey: OpenAI API key; stored in the ``OPENAI_API_KEY``
            environment variable before the index is queried.

    Returns:
        The response of ``index.query`` in "tree_summarize" mode, or a
        plain message when the key or the question is missing, or when no
        index has been saved yet.
    """
    # Return a readable message instead of raising when an input is missing
    if not openaikey:
        return "Please enter your openai key."

    if not query or not query.strip():
        return "Please enter your question."

    # The index only exists after a successful upload
    if not os.path.exists('index.json'):
        return "Please upload your documents and wait for the upload to succeed before asking a question."

    # Store openai key in environment
    os.environ['OPENAI_API_KEY'] = openaikey

    # Load index
    index = GPTListIndex.load_from_disk('index.json')

    # Query based on index
    return index.query(query, response_mode="tree_summarize")
205
+
206
def cleartext(query, output):
    """
    Return empty values for the query and answer boxes.

    Args:
        query: Current query text (ignored).
        output: Current answer text (ignored).

    Returns:
        A list of two empty strings.
    """
    blank = ""
    return [blank, blank]
211
+
212
# Gradio user interface: an upload/key/question column on the left, the
# status and answer boxes on the right, and three buttons wired to the
# functions above. The Markdown is assembled from explicit string pieces so
# that source indentation never becomes part of the rendered text.
with gr.Blocks() as demo:
    # Page title
    gr.Markdown("<h1><center><b>DocQues</b></center></h1>")

    # Short description with links to GPT-Index and the OpenAI key page
    gr.Markdown(
        "This app answers your queries on longer and multiple documents (pdf/docx/txt/png/jpeg/jpg) you upload. "
        'It uses <a href = "https://github.com/jerryjliu/gpt_index">GPT-Index</a> and OpenAI GPT3 in the backend, get your\n'
        '<a href = "https://beta.openai.com/account/api-keys">Openai key here</a> before proceeding further.\n'
    )

    # Usage instructions
    gr.Markdown(
        "<br>**Use this space effectively by following below 2 step process.**</br>\n"
        "*Step-1*\n"
        "<br>- Upload pdf/docx/txt/png/jpeg/jpg format documents.\n"
        "<br>- Enter your openai key.\n"
        "<br>- Click upload and wait to see if upload is successful or not. </br>\n"
        "*Step-2*\n"
        "<br>- Enter your query.\n"
        "<br>- Click submit.\n"
        "<br>- Check Answer </br>\n"
        "Please refer to the GitHub repo this Space is based on, here - "
        '<a href = "https://github.com/ravi03071991/DocQues">DocQues</a> .\n'
    )

    with gr.Row():
        with gr.Column():
            # The label lists every format the upload check accepts
            files = gr.File(
                label="Upload pdf/docx/txt/png/jpg/jpeg format documents.",
                file_count="multiple",
            )
            # Mask the API key so it is not displayed in clear text
            openaikey = gr.Textbox(
                lines=1, type="password", label="Enter your OpenAI Key."
            )
            upload_button = gr.Button("Upload")
            query = gr.Textbox(lines=2, label="Enter Your Question.")
            submit_button = gr.Button("Submit")
        with gr.Column():
            upload_output = gr.Textbox(label="Upload/ Error.")
            ans_output = gr.Textbox(label="Answer.")
            clear_button = gr.Button("Clear")

    # Upload button for uploading files and openai key.
    upload_button.click(createindex, inputs=[files, openaikey], outputs=[upload_output])

    # Submit button for submitting query.
    submit_button.click(docques, inputs=[query, openaikey], outputs=[ans_output])

    # Clear button for clearing query and answer.
    clear_button.click(cleartext, inputs=[query, ans_output], outputs=[query, ans_output])

demo.launch()