Spaces:

Mqleet
/

AutoPage

Running

App Files Files Community

AutoPage / docling /datamodel /base_models.py

Mqleet

upd code

fcaa164 about 1 month ago

raw

history blame

7.03 kB

	from enum import Enum
	from typing import TYPE_CHECKING, Dict, List, Optional, Union

	from docling_core.types.doc import (
	BoundingBox,
	DocItemLabel,
	NodeItem,
	PictureDataType,
	Size,
	TableCell,
	)
	from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location
	DocumentStream,
	)
	from PIL.Image import Image
	from pydantic import BaseModel, ConfigDict

	if TYPE_CHECKING:
	from docling.backend.pdf_backend import PdfPageBackend


	class ConversionStatus(str, Enum):
	    # Status values for a conversion. The `str` mixin makes every member
	    # compare equal to its plain string value.
	    PENDING = "pending"
	    STARTED = "started"
	    FAILURE = "failure"
	    SUCCESS = "success"
	    PARTIAL_SUCCESS = "partial_success"
	    SKIPPED = "skipped"


	class InputFormat(str, Enum):
	    """A document format supported by document backend parsers."""

	    # Members are strings (via the `str` mixin); declaration order is the
	    # iteration order of the enum.
	    DOCX = "docx"
	    PPTX = "pptx"
	    HTML = "html"
	    XML_PUBMED = "xml_pubmed"
	    IMAGE = "image"
	    PDF = "pdf"
	    ASCIIDOC = "asciidoc"
	    MD = "md"
	    XLSX = "xlsx"
	    XML_USPTO = "xml_uspto"
	    JSON_DOCLING = "json_docling"


	class OutputFormat(str, Enum):
	    # Output format identifiers; each member is also a plain string.
	    MARKDOWN = "md"
	    JSON = "json"
	    HTML = "html"
	    TEXT = "text"
	    DOCTAGS = "doctags"


	# File extensions (written without a leading dot) listed for each input format.
	# An extension may appear under more than one format: "xml" is listed for
	# both XML_PUBMED and XML_USPTO.
	FormatToExtensions: Dict[InputFormat, List[str]] = {
	    InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
	    InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
	    InputFormat.PDF: ["pdf"],
	    InputFormat.MD: ["md"],
	    InputFormat.HTML: ["html", "htm", "xhtml"],
	    InputFormat.XML_PUBMED: ["xml", "nxml"],
	    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
	    InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
	    InputFormat.XLSX: ["xlsx"],
	    InputFormat.XML_USPTO: ["xml", "txt"],
	    InputFormat.JSON_DOCLING: ["json"],
	}

	# MIME types listed for each input format. A MIME type may appear under more
	# than one format: "application/xml" is listed for both XML_PUBMED and
	# XML_USPTO.
	FormatToMimeType: Dict[InputFormat, List[str]] = {
	    InputFormat.DOCX: [
	        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
	    ],
	    InputFormat.PPTX: [
	        "application/vnd.openxmlformats-officedocument.presentationml.template",
	        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
	        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
	    ],
	    InputFormat.HTML: ["text/html", "application/xhtml+xml"],
	    InputFormat.XML_PUBMED: ["application/xml"],
	    InputFormat.IMAGE: [
	        "image/png",
	        "image/jpeg",
	        "image/tiff",
	        "image/gif",
	        "image/bmp",
	    ],
	    InputFormat.PDF: ["application/pdf"],
	    InputFormat.ASCIIDOC: ["text/asciidoc"],
	    InputFormat.MD: ["text/markdown", "text/x-markdown"],
	    InputFormat.XLSX: [
	        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
	    ],
	    InputFormat.XML_USPTO: ["application/xml", "text/plain"],
	    InputFormat.JSON_DOCLING: ["application/json"],
	}

	# Reverse lookup derived from FormatToMimeType: each MIME type maps to every
	# format that lists it, in FormatToMimeType's iteration order. Built as a
	# comprehension so no loop variables are left in the module namespace.
	MimeTypeToFormat: dict[str, list[InputFormat]] = {
	    mime: [
	        candidate
	        for candidate, accepted in FormatToMimeType.items()
	        if mime in accepted
	    ]
	    for mime_list in FormatToMimeType.values()
	    for mime in mime_list
	}


	class DocInputType(str, Enum):
	    # Two input kinds, "path" and "stream"; members are also plain strings.
	    PATH = "path"
	    STREAM = "stream"


	class DoclingComponentType(str, Enum):
	    # Component categories; used as the type of ErrorItem.component_type.
	    DOCUMENT_BACKEND = "document_backend"
	    MODEL = "model"
	    DOC_ASSEMBLER = "doc_assembler"
	    USER_INPUT = "user_input"


	class ErrorItem(BaseModel):
	    # Error record: a component type, a module name and an error message.
	    # All three fields are required (none has a default).
	    component_type: DoclingComponentType
	    module_name: str
	    error_message: str


	class Cell(BaseModel):
	    # A piece of text with an integer id and a bounding box.
	    # All fields are required.
	    id: int
	    text: str
	    bbox: BoundingBox


	class OcrCell(Cell):
	    # A Cell extended with a required confidence value.
	    confidence: float


	class Cluster(BaseModel):
	    # A labelled bounding box with a confidence, the cells attached to it,
	    # and optional nested clusters.
	    id: int
	    label: DocItemLabel
	    bbox: BoundingBox
	    confidence: float = 1.0
	    cells: List[Cell] = []
	    # Child clusters; the forward reference makes the model self-referential.
	    children: List["Cluster"] = []


	class BasePageElement(BaseModel):
	    # Common fields of the page element models that subclass this one:
	    # a label, an id, a page number, the backing cluster, and optional text.
	    label: DocItemLabel
	    id: int
	    page_no: int
	    cluster: Cluster
	    text: Optional[str] = None


	class LayoutPrediction(BaseModel):
	    # Holds a list of clusters; empty by default.
	    clusters: List[Cluster] = []


	class ContainerElement(BasePageElement):
	    # Used for Form and Key-Value-Regions, only for typing.
	    # Adds no fields of its own.
	    pass


	class Table(BasePageElement):
	    # Page element for a table: a sequence of OTSL strings, row and column
	    # counts (both default to 0), and the table's cells.
	    otsl_seq: List[str]
	    num_rows: int = 0
	    num_cols: int = 0
	    table_cells: List[TableCell]


	class TableStructurePrediction(BaseModel):
	    # Maps an integer key to a Table; empty by default.
	    # NOTE(review): the key is presumably an element/cluster id - confirm with the producer.
	    table_map: Dict[int, Table] = {}


	class TextElement(BasePageElement):
	    # Narrows the inherited optional `text` field to a required string.
	    text: str


	class FigureElement(BasePageElement):
	    # Page element for a figure. Every extra field is optional: a list of
	    # picture annotations plus provenance, predicted class and confidence.
	    annotations: List[PictureDataType] = []
	    provenance: Optional[str] = None
	    predicted_class: Optional[str] = None
	    confidence: Optional[float] = None


	class FigureClassificationPrediction(BaseModel):
	    # A figure count (default 0) and a map from integer key to FigureElement
	    # (default empty).
	    figure_count: int = 0
	    figure_map: Dict[int, FigureElement] = {}


	class EquationPrediction(BaseModel):
	    # An equation count (default 0) and a map from integer key to TextElement
	    # (default empty).
	    equation_count: int = 0
	    equation_map: Dict[int, TextElement] = {}


	class PagePredictions(BaseModel):
	    # Groups the prediction models for one page. Each slot defaults to None.
	    layout: Optional[LayoutPrediction] = None
	    tablestructure: Optional[TableStructurePrediction] = None
	    figures_classification: Optional[FigureClassificationPrediction] = None
	    equations_prediction: Optional[EquationPrediction] = None


	# Union of the concrete page element models; member order is kept as-is.
	PageElement = Union[
	    TextElement,
	    Table,
	    FigureElement,
	    ContainerElement,
	]


	class AssembledUnit(BaseModel):
	    # Three lists of page elements - elements, body and headers - each
	    # empty by default.
	    elements: List[PageElement] = []
	    body: List[PageElement] = []
	    headers: List[PageElement] = []


	class ItemAndImageEnrichmentElement(BaseModel):
	    # Pairs a document node item with an image.
	    # arbitrary_types_allowed lets pydantic accept the PIL Image field type.
	    model_config = ConfigDict(arbitrary_types_allowed=True)

	    item: NodeItem
	    image: Image


	class Page(BaseModel):
	    # One page: its number, optional size, cells, predictions and assembled
	    # output, plus private state (backend and image cache) used to serve
	    # page images.
	    # arbitrary_types_allowed lets pydantic accept the PIL Image type used
	    # by the private image cache.
	    model_config = ConfigDict(arbitrary_types_allowed=True)

	    page_no: int
	    # page_hash: Optional[str] = None
	    size: Optional[Size] = None
	    cells: List[Cell] = []
	    predictions: PagePredictions = PagePredictions()
	    assembled: Optional[AssembledUnit] = None

	    # Internal PDF backend. By default it is cleared during assembling.
	    _backend: Optional["PdfPageBackend"] = None
	    # Default image scale for external usage.
	    _default_image_scale: float = 1.0
	    # Cache of images in different scales, keyed by scale.
	    # By default it is cleared during assembling.
	    _image_cache: Dict[float, Image] = {}

	    def get_image(
	        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
	    ) -> Optional[Image]:
	        """Return the page image at ``scale``, optionally cropped to ``cropbox``.

	        Args:
	            scale: Scale passed to the backend; also the key of the image cache.
	            cropbox: Region to return instead of the whole page, or ``None``
	                for the whole page.

	        Returns:
	            The requested image, or ``None`` when no backend is attached and
	            nothing is cached for ``scale``.
	        """
	        if self._backend is None:
	            # NOTE(review): `cropbox` is not applied on this path; whatever is
	            # cached for `scale` (the full image, or None) is returned as-is.
	            return self._image_cache.get(scale, None)

	        if scale not in self._image_cache:
	            if cropbox is not None:
	                # Uncached crop request: the backend renders the crop
	                # directly and the result is not stored in the cache.
	                return self._backend.get_page_image(scale=scale, cropbox=cropbox)
	            self._image_cache[scale] = self._backend.get_page_image(scale=scale)

	        page_im = self._image_cache[scale]
	        if cropbox is None:
	            return page_im

	        # The page height is needed to convert the crop box to a top-left origin.
	        assert self.size is not None
	        return page_im.crop(
	            cropbox.to_top_left_origin(page_height=self.size.height)
	            .scaled(scale=scale)
	            .as_tuple()
	        )

	    @property
	    def image(self) -> Optional[Image]:
	        """Return the uncropped page image at the default image scale."""
	        return self.get_image(scale=self._default_image_scale)