Spaces:

hellorahulk
/

docling_free

Running

App Files Files Community

docling_free / dockling_parser /parser.py

hellorahulk

Fix Docling import and usage with DocumentConverter

5c197b6 10 months ago

raw

history blame

3.57 kB

	import os
	from pathlib import Path
	from typing import Optional, Dict, Any, Union
	import magic
	from docling.document_converter import DocumentConverter
	from datetime import datetime

	from .types import ParsedDocument, DocumentMetadata
	from .exceptions import UnsupportedFormatError, ParseError

	class DocumentParser:
	"""
	A multiformat document parser using Docling
	"""

	SUPPORTED_FORMATS = {
	'application/pdf': 'pdf',
	'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
	'text/plain': 'txt',
	'text/html': 'html',
	'text/markdown': 'md'
	}

	def __init__(self, config: Optional[Dict[str, Any]] = None):
	self.config = config or {}
	self.converter = DocumentConverter()

	def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
	"""
	Parse a document file and return structured content

	Args:
	file_path: Path to the document file

	Returns:
	ParsedDocument object containing parsed content and metadata

	Raises:
	UnsupportedFormatError: If the file format is not supported
	ParseError: If parsing fails
	"""
	file_path = Path(file_path)
	if not file_path.exists():
	raise FileNotFoundError(f"File not found: {file_path}")

	mime_type = magic.from_file(str(file_path), mime=True)
	if mime_type not in self.SUPPORTED_FORMATS:
	raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")

	try:
	# Get file metadata
	stats = file_path.stat()
	metadata = DocumentMetadata(
	filename=file_path.name,
	file_type=self.SUPPORTED_FORMATS[mime_type],
	size_bytes=stats.st_size,
	created_at=datetime.fromtimestamp(stats.st_ctime),
	modified_at=datetime.fromtimestamp(stats.st_mtime),
	mime_type=mime_type
	)

	# Parse document using Docling
	result = self.converter.convert(str(file_path))
	doc = result.document

	# Extract content and structure
	content = doc.text
	structured_content = {
	'sections': doc.sections if hasattr(doc, 'sections') else [],
	'paragraphs': doc.paragraphs if hasattr(doc, 'paragraphs') else [],
	'entities': doc.entities if hasattr(doc, 'entities') else {},
	'metadata': doc.metadata if hasattr(doc, 'metadata') else {}
	}

	# Update metadata with document-specific information
	if hasattr(doc, 'metadata') and doc.metadata:
	metadata.title = doc.metadata.get('title')
	metadata.author = doc.metadata.get('author')
	metadata.pages = doc.metadata.get('pages')
	metadata.extra.update(doc.metadata)

	return ParsedDocument(
	content=content,
	metadata=metadata,
	raw_text=doc.raw_text if hasattr(doc, 'raw_text') else None,
	structured_content=structured_content,
	confidence_score=getattr(doc, 'confidence', 1.0)
	)

	except Exception as e:
	raise ParseError(f"Failed to parse document: {str(e)}") from e

	def supports_format(self, mime_type: str) -> bool:
	"""Check if a given MIME type is supported"""
	return mime_type in self.SUPPORTED_FORMATS