Spaces:

KingNish
/

Doc-Reader-and-Chat

Running

App Files Community

KingNish committed on Sep 20, 2024

Commit

dbc91d5

verified ·

1 Parent(s): 459ea62

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -11

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ from huggingface_hub import InferenceClient
 import re
 import zipfile
 import xml.etree.ElementTree as ET
 # Constants
 CHUNK_SIZE = 32000
@@ -95,12 +96,20 @@ def extract_text_from_pptx(pptx_data, clean=True):
 def read_document(file, clean=True):
     """Reads content from various document formats."""
     file_path = file.name
-    file_extension = file_path.split('.')[-1].lower()
     with open(file_path, "rb") as f:
         file_content = f.read()
-    if file_extension == 'pdf':
         try:
             pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
             content = ''
@@ -111,8 +120,8 @@ def read_document(file, clean=True):
             return content, len(content)
         except Exception as e:
             return f"Error reading PDF: {e}", 0
-    elif file_extension == 'xlsx':
         try:
             wb = load_workbook(io.BytesIO(file_content))
             content = ''
@@ -126,18 +135,32 @@ def read_document(file, clean=True):
             return content, len(content)
         except Exception as e:
             return f"Error reading XLSX: {e}", 0
-    elif file_extension == 'pptx':
         try:
-            return extract_text_from_pptx(file_content, clean)
         except Exception as e:
-            return f"Error reading PPTX: {e}", 0
-    elif file_extension == 'doc' or file_extension == 'docx':
         try:
             return extract_text_from_docx(file_content, clean)
         except Exception as e:
-            return f"Error reading DOC/DOCX: {e}", 0
     else:
         try:

 import re
 import zipfile
 import xml.etree.ElementTree as ET
+import filetype
 # Constants
 CHUNK_SIZE = 32000
 def read_document(file, clean=True):
     """Reads content from various document formats."""
     file_path = file.name
+    # No file extension used
     with open(file_path, "rb") as f:
         file_content = f.read()
+    kind = filetype.guess(file_content)
+    if kind is None:
+        return "Cannot guess file type", 0  # Handle unknown file types
+    mime = kind.mime
+    if mime == "application/pdf":
+        # PDF Handling (unchanged)
         try:
             pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
             content = ''
             return content, len(content)
         except Exception as e:
             return f"Error reading PDF: {e}", 0
+    elif mime == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
+        # XLSX Handling (unchanged)
         try:
             wb = load_workbook(io.BytesIO(file_content))
             content = ''
             return content, len(content)
         except Exception as e:
             return f"Error reading XLSX: {e}", 0
+    elif mime == "text/plain":
         try:
+            content = file_content.decode('utf-8')
+            if clean:
+                content = clean_text(content)
+            return content, len(content)
         except Exception as e:
+            return f"Error reading TXT file: {e}", 0
+    elif mime == "text/csv":
+        try:
+            content = file_content.decode('utf-8')
+            if clean:
+                content = clean_text(content)
+            return content, len(content)
+        except Exception as e:
+            return f"Error reading CSV file: {e}", 0
+    elif mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
         try:
             return extract_text_from_docx(file_content, clean)
         except Exception as e:
+            return f"Error reading DOCX: {e}", 0
+    elif mime == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
+        try:
+            return extract_text_from_pptx(file_content, clean)
+        except Exception as e:
+            return f"Error reading PPTX: {e}", 0
     else:
         try: