awacke1 committed on
Commit
bf4227b
1 Parent(s): fc73efd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -21
app.py CHANGED
@@ -10,6 +10,7 @@ import math
10
  import requests
11
  import time
12
  import re
 
13
 
14
  from datetime import datetime
15
  from openai import ChatCompletion
@@ -202,11 +203,10 @@ def extract_mime_type(file):
202
  else:
203
  raise TypeError("Input should be a string or a streamlit.UploadedFile object")
204
 
 
205
 
206
- import textract
207
- import os
208
  def extract_file_extension(file):
209
- # Assume file is an UploadedFile object and get the name directly
210
  file_name = file.name
211
  pattern = r".*?\.(.*?)$"
212
  match = re.search(pattern, file_name)
@@ -215,28 +215,36 @@ def extract_file_extension(file):
215
  else:
216
  raise ValueError(f"Unable to extract file extension from {file_name}")
217
 
218
- def pdf2txt(pdf_docs):
219
  text = ""
220
- for file in pdf_docs:
221
  file_extension = extract_file_extension(file)
222
  # print the file extension
223
- print(f"File type extension: {file_extension}")
224
-
225
- # Simulate file reading
226
- # You need to replace the following lines with actual file reading
227
- # based on the file_extension
228
- if file_extension in ['txt', 'html', 'htm', 'py', 'xml', 'json']:
229
- text += textract.process(str(file.name))
230
- text += f"\nExtracted text from {file_extension} file..."
231
- elif file_extension == 'pdf':
232
- pdf_reader = PdfReader(file.name)
233
- for page in pdf_reader.pages:
234
- text += page.extract_text()
235
- text += f"\nExtracted text from PDF file..."
 
 
 
 
 
 
 
 
236
 
237
  return text
238
 
239
-
240
  def pdf2txt_old(pdf_docs):
241
  st.write(pdf_docs)
242
  for file in pdf_docs:
@@ -399,7 +407,7 @@ if user_question:
399
 
400
  with st.sidebar:
401
  st.subheader("Your documents")
402
- docs = st.file_uploader("Upload your documents", accept_multiple_files=True)
403
  with st.spinner("Processing"):
404
  raw = pdf2txt(docs)
405
  if len(raw) > 0:
@@ -407,6 +415,6 @@ with st.sidebar:
407
  text_chunks = txt2chunks(raw)
408
  vectorstore = vector_store(text_chunks)
409
  st.session_state.conversation = get_chain(vectorstore)
410
- st.markdown('# AI Search Index of Length:' + length + ' Created.')
411
  filename = generate_filename(raw, 'txt')
412
  create_file(filename, raw, '')
 
10
  import requests
11
  import time
12
  import re
13
+ import textract
14
 
15
  from datetime import datetime
16
  from openai import ChatCompletion
 
203
  else:
204
  raise TypeError("Input should be a string or a streamlit.UploadedFile object")
205
 
206
+ from io import BytesIO
207
 
 
 
208
  def extract_file_extension(file):
209
+ # get the file name directly from the UploadedFile object
210
  file_name = file.name
211
  pattern = r".*?\.(.*?)$"
212
  match = re.search(pattern, file_name)
 
215
  else:
216
  raise ValueError(f"Unable to extract file extension from {file_name}")
217
 
218
+ def pdf2txt(docs):
219
  text = ""
220
+ for file in docs:
221
  file_extension = extract_file_extension(file)
222
  # print the file extension
223
+ st.write(f"File type extension: {file_extension}")
224
+
225
+ # save the uploaded file temporarily
226
+ temp_file_name = file.name
227
+ with open(temp_file_name, "wb") as f:
228
+ f.write(file.getvalue())
229
+
230
+ # read the file according to its extension
231
+ try:
232
+ if file_extension.lower() in ['txt', 'html', 'htm', 'py', 'xml', 'json', 'docx']:
233
+ text += textract.process(temp_file_name).decode("utf-8")
234
+ elif file_extension.lower() == 'pdf':
235
+ with open(temp_file_name, "rb") as f:
236
+ pdf = PdfFileReader(f)
237
+ for page in range(pdf.getNumPages()):
238
+ text += pdf.getPage(page).extractText()
239
+ except Exception as e:
240
+ st.write(f"Error processing file {file.name}: {e}")
241
+
242
+ # remove the temporary file
243
+ os.remove(temp_file_name)
244
 
245
  return text
246
 
247
+
248
  def pdf2txt_old(pdf_docs):
249
  st.write(pdf_docs)
250
  for file in pdf_docs:
 
407
 
408
  with st.sidebar:
409
  st.subheader("Your documents")
410
+ docs = st.file_uploader("import documents", accept_multiple_files=True)
411
  with st.spinner("Processing"):
412
  raw = pdf2txt(docs)
413
  if len(raw) > 0:
 
415
  text_chunks = txt2chunks(raw)
416
  vectorstore = vector_store(text_chunks)
417
  st.session_state.conversation = get_chain(vectorstore)
418
+ st.markdown('# AI Search Index of Length:' + length + ' Created.') # add timing
419
  filename = generate_filename(raw, 'txt')
420
  create_file(filename, raw, '')