Spaces:

hellorahulk
/

docling_free

Running

App Files Community

hellorahulk committed on Jan 23

Commit

fdbfd73

1 Parent(s): 8c92c5f

Improve error handling and file processing

Browse files

Files changed (2) hide show

app.py +42 -12
dockling_parser/parser.py +93 -51

app.py CHANGED Viewed

@@ -2,9 +2,10 @@ import os
 import gradio as gr
 import pandas as pd
 from dockling_parser import DocumentParser
-from dockling_parser.exceptions import ParserError
 import tempfile
 import mimetypes
 TITLE = "📄 Smart Document Parser"
 DESCRIPTION = """
@@ -23,19 +24,37 @@ ARTICLE = """
 Made with ❤️ using Docling and Gradio
 """
 # Initialize the document parser
 parser = DocumentParser()
 def process_document(file_path):
     """Process uploaded document and return structured information"""
     if file_path is None:
-        return (
-            "Error: No file uploaded",
-            pd.DataFrame(),
-            "No sections available",
-            "No entities available",
-            "Confidence Score: 0.0"
-        )
     try:
         # Parse the document directly using the file path
@@ -64,18 +83,29 @@ def process_document(file_path):
             f"Confidence Score: {result.confidence_score:.2f}"  # Confidence score
         )
     except ParserError as e:
         return (
-            f"Error parsing document: {str(e)}",
-            pd.DataFrame(),
             "No sections available",
             "No entities available",
             "Confidence Score: 0.0"
         )
     except Exception as e:
         return (
-            f"Unexpected error: {str(e)}",
-            pd.DataFrame(),
             "No sections available",
             "No entities available",
             "Confidence Score: 0.0"

 import gradio as gr
 import pandas as pd
 from dockling_parser import DocumentParser
+from dockling_parser.exceptions import ParserError, UnsupportedFormatError
 import tempfile
 import mimetypes
+import traceback
 TITLE = "📄 Smart Document Parser"
 DESCRIPTION = """
 Made with ❤️ using Docling and Gradio
 """
+ERROR_MESSAGES = {
+    "no_file": (
+        "⚠️ No file uploaded",
+        "Please upload a document to process.",
+        "No sections available",
+        "No entities available",
+        "Confidence Score: 0.0"
+    ),
+    "unsupported_format": (
+        "⚠️ Unsupported file format",
+        "Please upload a file in one of the supported formats: PDF, DOCX, TXT, HTML, or MD.",
+        "No sections available",
+        "No entities available",
+        "Confidence Score: 0.0"
+    ),
+    "processing_error": (
+        "⚠️ Error processing document",
+        "An error occurred while processing the document. Please try again with a different file.",
+        "No sections available",
+        "No entities available",
+        "Confidence Score: 0.0"
+    )
+}
 # Initialize the document parser
 parser = DocumentParser()
 def process_document(file_path):
     """Process uploaded document and return structured information"""
     if file_path is None:
+        return ERROR_MESSAGES["no_file"]
     try:
         # Parse the document directly using the file path
             f"Confidence Score: {result.confidence_score:.2f}"  # Confidence score
         )
+    except UnsupportedFormatError as e:
+        error_msg = f"⚠️ {str(e)}"
+        return (
+            error_msg,
+            pd.DataFrame([{"Property": "Error", "Value": error_msg}]),
+            "No sections available",
+            "No entities available",
+            "Confidence Score: 0.0"
+        )
     except ParserError as e:
+        error_msg = f"⚠️ {str(e)}"
         return (
+            error_msg,
+            pd.DataFrame([{"Property": "Error", "Value": error_msg}]),
             "No sections available",
             "No entities available",
             "Confidence Score: 0.0"
         )
     except Exception as e:
+        error_msg = f"⚠️ Unexpected error: {str(e)}\n{traceback.format_exc()}"
         return (
+            error_msg,
+            pd.DataFrame([{"Property": "Error", "Value": error_msg}]),
             "No sections available",
             "No entities available",
             "Confidence Score: 0.0"

dockling_parser/parser.py CHANGED Viewed

@@ -4,6 +4,8 @@ from typing import Optional, Dict, Any, Union
 import magic
 from docling.document_converter import DocumentConverter
 from datetime import datetime
 from .types import ParsedDocument, DocumentMetadata
 from .exceptions import UnsupportedFormatError, ParseError
@@ -40,20 +42,17 @@ class DocumentParser:
     def __init__(self, config: Optional[Dict[str, Any]] = None):
         self.config = config or {}
         self.converter = DocumentConverter()
-    def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
         """
-        Parse a document file and return structured content
-        Args:
-            file_path: Path to the document file
-        Returns:
-            ParsedDocument object containing parsed content and metadata
-        Raises:
-            UnsupportedFormatError: If the file format is not supported
-            ParseError: If parsing fails
         """
         file_path = Path(file_path)
         if not file_path.exists():
@@ -66,15 +65,43 @@ class DocumentParser:
         # If extension not recognized, use magic
         if not mime_type:
             mime_type = magic.from_file(str(file_path), mime=True)
-        if mime_type not in self.SUPPORTED_FORMATS:
-            raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")
         try:
             # Get file metadata
-            stats = file_path.stat()
             metadata = DocumentMetadata(
-                filename=file_path.name,
                 file_type=self.SUPPORTED_FORMATS[mime_type],
                 size_bytes=stats.st_size,
                 created_at=datetime.fromtimestamp(stats.st_ctime),
@@ -82,44 +109,59 @@ class DocumentParser:
                 mime_type=mime_type
             )
-            # Parse document using Docling
-            result = self.converter.convert(str(file_path))
-            doc = result.document
-            # Extract content using proper methods
-            content = doc.export_to_text()
-            # Extract structured content
-            structured_content = {
-                'sections': doc.sections if hasattr(doc, 'sections') else [],
-                'paragraphs': doc.paragraphs if hasattr(doc, 'paragraphs') else [],
-                'entities': doc.entities if hasattr(doc, 'entities') else {},
-                'metadata': doc.metadata if hasattr(doc, 'metadata') else {}
-            }
-            # Get raw text if available
             try:
-                raw_text = doc.export_to_text(include_layout=True)
-            except:
-                raw_text = content
-            # Update metadata with document-specific information
-            if hasattr(doc, 'metadata') and doc.metadata:
-                metadata.title = doc.metadata.get('title')
-                metadata.author = doc.metadata.get('author')
-                metadata.pages = doc.metadata.get('pages')
-                metadata.extra.update(doc.metadata)
-            return ParsedDocument(
-                content=content,
-                metadata=metadata,
-                raw_text=raw_text,
-                structured_content=structured_content,
-                confidence_score=getattr(doc, 'confidence', 1.0)
-            )
         except Exception as e:
-            raise ParseError(f"Failed to parse document: {str(e)}") from e
     def supports_format(self, mime_type: str) -> bool:
         """Check if a given MIME type is supported"""

 import magic
 from docling.document_converter import DocumentConverter
 from datetime import datetime
+import shutil
+import tempfile
 from .types import ParsedDocument, DocumentMetadata
 from .exceptions import UnsupportedFormatError, ParseError
     def __init__(self, config: Optional[Dict[str, Any]] = None):
         self.config = config or {}
         self.converter = DocumentConverter()
+        # Create a temporary directory for processing files
+        self.temp_dir = Path(tempfile.mkdtemp(prefix="dockling_"))
+    def __del__(self):
+        """Cleanup temporary directory on object destruction"""
+        if hasattr(self, 'temp_dir') and self.temp_dir.exists():
+            shutil.rmtree(self.temp_dir, ignore_errors=True)
+    def _validate_and_copy_file(self, file_path: Union[str, Path]) -> Path:
         """
+        Validate file and copy to temporary location with correct extension
         """
         file_path = Path(file_path)
         if not file_path.exists():
         # If extension not recognized, use magic
         if not mime_type:
             mime_type = magic.from_file(str(file_path), mime=True)
+            if mime_type in self.SUPPORTED_FORMATS:
+                extension = f".{self.SUPPORTED_FORMATS[mime_type]}"
+            else:
+                raise UnsupportedFormatError(
+                    f"Unsupported file format: {mime_type}. "
+                    f"Supported formats are: {', '.join(set(self.SUPPORTED_FORMATS.values()))}"
+                )
+        # Copy file to temp directory with correct extension
+        temp_file = self.temp_dir / f"doc{extension}"
+        shutil.copy2(file_path, temp_file)
+        return temp_file
+    def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
+        """
+        Parse a document file and return structured content
+        Args:
+            file_path: Path to the document file
+        Returns:
+            ParsedDocument object containing parsed content and metadata
+        Raises:
+            UnsupportedFormatError: If the file format is not supported
+            ParseError: If parsing fails
+        """
         try:
+            # Validate and prepare file
+            temp_file = self._validate_and_copy_file(file_path)
             # Get file metadata
+            stats = temp_file.stat()
+            mime_type = magic.from_file(str(temp_file), mime=True)
             metadata = DocumentMetadata(
+                filename=Path(file_path).name,  # Use original filename
                 file_type=self.SUPPORTED_FORMATS[mime_type],
                 size_bytes=stats.st_size,
                 created_at=datetime.fromtimestamp(stats.st_ctime),
                 mime_type=mime_type
             )
             try:
+                # Parse document using Docling
+                result = self.converter.convert(str(temp_file))
+                doc = result.document
+                # Extract content using proper methods
+                try:
+                    content = doc.export_to_text()
+                except Exception as e:
+                    raise ParseError(f"Failed to extract text content: {str(e)}")
+                # Extract structured content
+                structured_content = {
+                    'sections': doc.sections if hasattr(doc, 'sections') else [],
+                    'paragraphs': doc.paragraphs if hasattr(doc, 'paragraphs') else [],
+                    'entities': doc.entities if hasattr(doc, 'entities') else {},
+                    'metadata': doc.metadata if hasattr(doc, 'metadata') else {}
+                }
+                # Get raw text if available
+                try:
+                    raw_text = doc.export_to_text(include_layout=True)
+                except:
+                    raw_text = content
+                # Update metadata with document-specific information
+                if hasattr(doc, 'metadata') and doc.metadata:
+                    metadata.title = doc.metadata.get('title')
+                    metadata.author = doc.metadata.get('author')
+                    metadata.pages = doc.metadata.get('pages')
+                    metadata.extra.update(doc.metadata)
+                return ParsedDocument(
+                    content=content,
+                    metadata=metadata,
+                    raw_text=raw_text,
+                    structured_content=structured_content,
+                    confidence_score=getattr(doc, 'confidence', 1.0)
+                )
+            except Exception as e:
+                raise ParseError(f"Failed to parse document: {str(e)}")
         except Exception as e:
+            raise ParseError(str(e))
+        finally:
+            # Cleanup temporary files
+            if 'temp_file' in locals() and temp_file.exists():
+                try:
+                    temp_file.unlink()
+                except:
+                    pass
     def supports_format(self, mime_type: str) -> bool:
         """Check if a given MIME type is supported"""