Spaces:
Running
Running
| import os | |
| from pathlib import Path | |
| from typing import Optional, Dict, Any, Union | |
| import magic | |
| from docling.document_converter import DocumentConverter | |
| from datetime import datetime | |
| from .types import ParsedDocument, DocumentMetadata | |
| from .exceptions import UnsupportedFormatError, ParseError | |
class DocumentParser:
    """
    A multiformat document parser using Docling.

    The file's MIME type is detected with ``magic`` and checked against
    ``SUPPORTED_FORMATS`` before the file is handed to a Docling
    ``DocumentConverter``.
    """

    # Maps each accepted MIME type to the short file-type label that is
    # stored in ``DocumentMetadata.file_type``.
    SUPPORTED_FORMATS = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'text/plain': 'txt',
        'text/html': 'html',
        'text/markdown': 'md',
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Create a parser with its own Docling converter.

        Args:
            config: Optional settings dictionary. It is stored on the
                instance as ``self.config`` (an empty dict when omitted);
                no method of this class reads it.
        """
        self.config = config or {}
        self.converter = DocumentConverter()

    def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
        """
        Parse a document file and return structured content

        Args:
            file_path: Path to the document file

        Returns:
            ParsedDocument object containing parsed content and metadata

        Raises:
            FileNotFoundError: If the path does not exist
            UnsupportedFormatError: If the file format is not supported
            ParseError: If parsing fails
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # The format is decided from the detected MIME type, not from the
        # file extension.
        mime_type = magic.from_file(str(file_path), mime=True)
        if mime_type not in self.SUPPORTED_FORMATS:
            raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")

        try:
            # Get file metadata
            # NOTE: st_ctime is platform dependent (creation time on
            # Windows, last metadata change on Unix), and fromtimestamp()
            # yields naive local-time datetimes.
            stats = file_path.stat()
            metadata = DocumentMetadata(
                filename=file_path.name,
                file_type=self.SUPPORTED_FORMATS[mime_type],
                size_bytes=stats.st_size,
                created_at=datetime.fromtimestamp(stats.st_ctime),
                modified_at=datetime.fromtimestamp(stats.st_mtime),
                mime_type=mime_type,
            )

            # Parse document using Docling
            result = self.converter.convert(str(file_path))
            doc = result.document

            # Extract content and structure. Every optional attribute is
            # probed with a default because the document object is not
            # guaranteed to expose it.
            content = self._extract_text(doc)
            doc_metadata = getattr(doc, 'metadata', {})
            structured_content = {
                'sections': getattr(doc, 'sections', []),
                'paragraphs': getattr(doc, 'paragraphs', []),
                'entities': getattr(doc, 'entities', {}),
                'metadata': doc_metadata,
            }

            # Update metadata with document-specific information
            if doc_metadata:
                metadata.title = doc_metadata.get('title')
                metadata.author = doc_metadata.get('author')
                metadata.pages = doc_metadata.get('pages')
                metadata.extra.update(doc_metadata)

            return ParsedDocument(
                content=content,
                metadata=metadata,
                raw_text=getattr(doc, 'raw_text', None),
                structured_content=structured_content,
                confidence_score=getattr(doc, 'confidence', 1.0),
            )
        except Exception as e:
            # Any failure while reading stats, converting or assembling the
            # result is reported as ParseError, with the cause chained.
            raise ParseError(f"Failed to parse document: {str(e)}") from e

    @staticmethod
    def _extract_text(doc: Any) -> Any:
        """
        Return the textual content of a converted document.

        A ``text`` attribute is returned unchanged when the document has
        one. Otherwise the first available export method is called,
        preferring ``export_to_text`` over ``export_to_markdown``, so that
        documents exposing their text only through export methods do not
        fail on a missing attribute.

        Args:
            doc: The document object produced by the converter

        Returns:
            The value of ``doc.text``, or the result of the export method

        Raises:
            AttributeError: If the document offers neither a ``text``
                attribute nor one of the export methods
        """
        if hasattr(doc, 'text'):
            return doc.text
        for exporter_name in ('export_to_text', 'export_to_markdown'):
            exporter = getattr(doc, exporter_name, None)
            if callable(exporter):
                return exporter()
        raise AttributeError(
            "document has no 'text' attribute and no text export method"
        )

    def supports_format(self, mime_type: str) -> bool:
        """Check if a given MIME type is supported"""
        return mime_type in self.SUPPORTED_FORMATS