VoiceGPT15

Sleeping

App Files Files Community

awacke1 committed on Jul 7, 2023

Commit

e6741ed

•

1 Parent(s): 531e73c

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -1

app.py CHANGED Viewed

@@ -201,8 +201,47 @@ def extract_mime_type(file):
         return file.type
     else:
         raise TypeError("Input should be a string or a streamlit.UploadedFile object")
 def pdf2txt(pdf_docs):
     st.write(pdf_docs)
     for file in pdf_docs:
         mime_type = extract_mime_type(file)

         return file.type
     else:
         raise TypeError("Input should be a string or a streamlit.UploadedFile object")
+import textract
+import os
def extract_mime_type(file):
    """Return the MIME type reported by an uploaded file object.

    The check is duck-typed on the ``type`` attribute rather than an
    ``isinstance`` test against ``streamlit.UploadedFile``: the visible code
    refers to Streamlit through the ``st`` alias, so the bare ``streamlit``
    name is not guaranteed to be bound here.

    Args:
        file: An uploaded file object exposing a string ``type`` attribute
            (e.g. the objects returned by ``st.file_uploader``).

    Returns:
        The value of ``file.type``.

    Raises:
        TypeError: If ``file`` has no string ``type`` attribute (for
            example, when a plain path string is passed).
    """
    mime_type = getattr(file, "type", None)
    if not isinstance(mime_type, str):
        raise TypeError("Input should be a streamlit.UploadedFile object")
    return mime_type
 def pdf2txt(pdf_docs):
+    st.write(pdf_docs)
+    file_types = {'application/pdf': '.pdf', 'text/plain': '.txt',
+                  'text/html': '.html', 'application/json': '.json',
+                  'application/py': '.py', 'text/xml': '.xml', 'text/htm': '.htm'}
+    text = ""
+    for file in pdf_docs:
+        mime_type = extract_mime_type(file)
+        file_extension = file_types.get(mime_type, '')
+        st.write(f"File type extension: {file_extension}")
+        # You might want to save the file and use textract to extract text from it.
+        # Assuming file is a streamlit.UploadedFile object
+        with open(file.name, "wb") as f:
+            f.write(file.getvalue())
+        if file_extension in ['.txt', '.html', '.htm', '.py', '.xml', '.json']:
+            text += textract.process(file.name).decode("utf-8")
+        elif file_extension == '.pdf':
+            pdf_reader = PdfReader(file.name)
+            for page in pdf_reader.pages:
+                text += page.extract_text()
+        # Delete the file after processing
+        os.remove(file.name)
+    return text
+def pdf2txt_old(pdf_docs):
     st.write(pdf_docs)
     for file in pdf_docs:
         mime_type = extract_mime_type(file)