VoiceGPT15

Sleeping

awacke1 committed on Jul 7, 2023

Commit

89a7198

•

1 Parent(s): ecc7e6b

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -205,40 +205,34 @@ def extract_mime_type(file):
 import textract
 import os
-def extract_mime_type(file):  # returns file.type for a streamlit.UploadedFile input
-    # Only streamlit.UploadedFile inputs are accepted; anything else falls through to the TypeError below
-    if isinstance(file, streamlit.UploadedFile):  # NOTE(review): the rest of this hunk uses the `st` alias - confirm `streamlit` is bound
-        return file.type
     else:
-        raise TypeError("Input should be a streamlit.UploadedFile object")
 def pdf2txt(pdf_docs):  # concatenates the text extracted from each item of pdf_docs and returns it
-    st.write(pdf_docs)
-    file_types = {'application/pdf': '.pdf', 'text/plain': '.txt',
-                  'text/html': '.html', 'application/json': '.json',
-                  'application/py': '.py', 'text/xml': '.xml', 'text/htm': '.htm'}
     text = ""
-    for file in pdf_docs:
-        mime_type = extract_mime_type(file)
-        file_extension = file_types.get(mime_type, '')  # unmapped MIME types give '' and match neither branch below
-        st.write(f"File type extension: {file_extension}")
-        # The upload is first saved to disk under its own name so that textract
-        # and PdfReader can be handed a path (file is expected to be a streamlit.UploadedFile).
-        with open(file.name, "wb") as f:  # NOTE(review): path comes from the uploaded name - confirm it is trusted
-            f.write(file.getvalue())
-        if file_extension in ['.txt', '.html', '.htm', '.py', '.xml', '.json']:
-            text += textract.process(file.name).decode("utf-8")
-        elif file_extension == '.pdf':
-            pdf_reader = PdfReader(file.name)
-            for page in pdf_reader.pages:
-                text += page.extract_text()
-        # Remove the on-disk copy; NOTE(review): not reached if extraction above raises (no try/finally)
-        os.remove(file.name)
     return text
 def pdf2txt_old(pdf_docs):

 import textract
 import os
+def extract_file_extension(file_str):  # returns the text between the first '.' after name=' and the closing quote
+    # The lazy `.*?` stops at the FIRST dot after name=', so name='a.tar.gz' yields "tar.gz", not "gz"
+    pattern = r"name='.*?\.(.*?)'"
+    match = re.search(pattern, file_str)  # NOTE(review): needs a str argument; the `re` import is not visible in this hunk - confirm
+    if match:
+        return match.group(1)
     else:
+        raise ValueError(f"Unable to extract file extension from {file_str}")
 def pdf2txt(pdf_docs):  # stub: appends a placeholder string per recognized extension; no file content is read
     text = ""
+    for file_str in pdf_docs:
+        file_extension = extract_file_extension(file_str)  # raises ValueError when no name='<base>.<ext>' is found
+        # Report the detected extension on stdout
+        print(f"File type extension: {file_extension}")
+        # Placeholder logic: the real extraction calls below are commented out,
+        # so each recognized extension only contributes a fixed marker string;
+        # any other extension contributes nothing to the result.
+        if file_extension in ['txt', 'html', 'htm', 'py', 'xml', 'json']:
+            # text += textract.process(file_str).decode("utf-8")
+            text += f"\nExtracted text from {file_extension} file..."
+        elif file_extension == 'pdf':
+            # pdf_reader = PdfReader(file_str)
+            # for page in pdf_reader.pages:
+            #     text += page.extract_text()
+            text += f"\nExtracted text from PDF file..."
     return text
 def pdf2txt_old(pdf_docs):