Spaces:

awacke1
/

VoiceGPT15

Sleeping

App Files Community

awacke1 committed on Jul 7, 2023

Commit

bf4227b

1 Parent(s): fc73efd

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -21

app.py CHANGED Viewed

@@ -10,6 +10,7 @@ import math
 import requests
 import time
 import re
 from datetime import datetime
 from openai import ChatCompletion
@@ -202,11 +203,10 @@ def extract_mime_type(file):
     else:
         raise TypeError("Input should be a string or a streamlit.UploadedFile object")
-import textract
-import os
 def extract_file_extension(file):
-    # Assume file is an UploadedFile object and get the name directly
     file_name = file.name
     pattern = r".*?\.(.*?)$"
     match = re.search(pattern, file_name)
@@ -215,28 +215,36 @@ def extract_file_extension(file):
     else:
         raise ValueError(f"Unable to extract file extension from {file_name}")
-def pdf2txt(pdf_docs):
     text = ""
-    for file in pdf_docs:
         file_extension = extract_file_extension(file)
         # print the file extension
-        print(f"File type extension: {file_extension}")
-        # Simulate file reading
-        # You need to replace the following lines with actual file reading
-        # based on the file_extension
-        if file_extension in ['txt', 'html', 'htm', 'py', 'xml', 'json']:
-            text += textract.process(str(file.name))
-            text += f"\nExtracted text from {file_extension} file..."
-        elif file_extension == 'pdf':
-            pdf_reader = PdfReader(file.name)
-            for page in pdf_reader.pages:
-                text += page.extract_text()
-            text += f"\nExtracted text from PDF file..."
     return text
 def pdf2txt_old(pdf_docs):
     st.write(pdf_docs)
     for file in pdf_docs:
@@ -399,7 +407,7 @@ if user_question:
 with st.sidebar:
     st.subheader("Your documents")
-    docs = st.file_uploader("Upload your documents", accept_multiple_files=True)
     with st.spinner("Processing"):
         raw = pdf2txt(docs)
         if len(raw) > 0:
@@ -407,6 +415,6 @@ with st.sidebar:
             text_chunks = txt2chunks(raw)
             vectorstore = vector_store(text_chunks)
             st.session_state.conversation = get_chain(vectorstore)
-            st.markdown('# AI Search Index of Length:' + length + ' Created.')
             filename = generate_filename(raw, 'txt')
             create_file(filename, raw, '')

 import requests
 import time
 import re
+import textract
 from datetime import datetime
 from openai import ChatCompletion
     else:
         raise TypeError("Input should be a string or a streamlit.UploadedFile object")
+from io import BytesIO
 def extract_file_extension(file):
+    # get the file name directly from the UploadedFile object
     file_name = file.name
     pattern = r".*?\.(.*?)$"
     match = re.search(pattern, file_name)
     else:
         raise ValueError(f"Unable to extract file extension from {file_name}")
+def pdf2txt(docs):
     # Concatenate the text extracted from every uploaded document in `docs`
     # and return it as a single string (empty when `docs` yields nothing).
     # Each upload is written to disk under its own name, parsed according to
     # its extension, and then deleted. Extensions outside the two handled
     # groups contribute no text and produce no message.
     text = ""
+    for file in docs:
     # extract_file_extension is called outside the try below, so the
     # ValueError it raises for an unparseable name propagates to the caller.
         file_extension = extract_file_extension(file)
         # display the detected extension via st.write
+        st.write(f"File type extension: {file_extension}")
+        # save the uploaded file temporarily
     # NOTE(review): the path is the upload's own name (presumably a bare
     # filename, so it resolves against the current working directory); an
     # existing file with that name would be overwritten here and deleted
     # below. Consider the tempfile module - TODO confirm.
+        temp_file_name = file.name
+        with open(temp_file_name, "wb") as f:
+            f.write(file.getvalue())
+        # read the file according to its extension
+        try:
+            if file_extension.lower() in ['txt', 'html', 'htm', 'py', 'xml', 'json', 'docx']:
     # textract's output is decoded as UTF-8 before being appended
+                text += textract.process(temp_file_name).decode("utf-8")
+            elif file_extension.lower() == 'pdf':
+                with open(temp_file_name, "rb") as f:
     # NOTE(review): `PdfFileReader` is not imported in any hunk shown here,
     # and the version this commit replaces used `PdfReader` - confirm which
     # name the installed PDF library actually provides.
+                    pdf = PdfFileReader(f)
+                    for page in range(pdf.getNumPages()):
+                        text += pdf.getPage(page).extractText()
+        except Exception as e:
     # best-effort: report the failure and move on to the next file; any text
     # appended before the error is kept
+            st.write(f"Error processing file {file.name}: {e}")
+        # remove the temporary file
     # Placed after the try/except, so it is reached on both the success path
     # and the handled-error path.
     # NOTE(review): `os` is not imported in any hunk shown here, and this
     # commit drops an `import os` line - confirm it is imported elsewhere.
+        os.remove(temp_file_name)
     return text
 def pdf2txt_old(pdf_docs):
     st.write(pdf_docs)
     for file in pdf_docs:
 with st.sidebar:
     st.subheader("Your documents")
+    docs = st.file_uploader("import documents", accept_multiple_files=True)
     with st.spinner("Processing"):
         raw = pdf2txt(docs)
         if len(raw) > 0:
             text_chunks = txt2chunks(raw)
             vectorstore = vector_store(text_chunks)
             st.session_state.conversation = get_chain(vectorstore)
+            st.markdown('# AI Search Index of Length:' + length + ' Created.')  # add timing
             filename = generate_filename(raw, 'txt')
             create_file(filename, raw, '')