feat: Initial commit of docling-layout-egret-large with safetensor checkpoint and demo code.

Files changed (4) hide show

README.md +114 -3
config.json +224 -0
model.safetensors +3 -0
preprocessor_config.json +26 -0

README.md CHANGED Viewed

@@ -1,3 +1,114 @@
----
-license: apache-2.0
----

+---
+license: apache-2.0
+---
+THIS IS WORK IN PROGRESS
+# Docling Layout Model egret-large
+`docling-layout-egret-large` is a Document Layout Model based on [D-FINE-L](https://github.com/Peterande/D-FINE).
+The model has been trained from scratch on a mix of document datasets.
+It is part of the [Docling project](https://github.com/docling-project/docling).
+# Inference code example
+Prerequisites:
+```bash
+pip install transformers Pillow torch requests
+```
+Prediction:
+```python
+import requests
+from transformers import (
+    DFineForObjectDetection,
+    RTDetrImageProcessor,
+)
+import torch
+from PIL import Image
+classes_map = {
+    0: "Caption",
+    1: "Footnote",
+    2: "Formula",
+    3: "List-item",
+    4: "Page-footer",
+    5: "Page-header",
+    6: "Picture",
+    7: "Section-header",
+    8: "Table",
+    9: "Text",
+    10: "Title",
+    11: "Document Index",
+    12: "Code",
+    13: "Checkbox-Selected",
+    14: "Checkbox-Unselected",
+    15: "Form",
+    16: "Key-Value Region",
+}
+image_url = "https://huggingface.co/spaces/ds4sd/SmolDocling-256M-Demo/resolve/main/example_images/annual_rep_14.png"
+model_name = "ds4sd/docling-layout-egret-large"
+threshold = 0.6
+# Download the image
+image = Image.open(requests.get(image_url, stream=True).raw)
+image = image.convert("RGB")
+# Initialize the model
+image_processor = RTDetrImageProcessor.from_pretrained(model_name)
+model = DFineForObjectDetection.from_pretrained(model_name)
+# Run the prediction pipeline
+inputs = image_processor(images=[image], return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+results = image_processor.post_process_object_detection(
+    outputs,
+    target_sizes=torch.tensor([image.size[::-1]]),
+    threshold=threshold,
+)
+# Get the results
+for result in results:
+    for score, label_id, box in zip(
+        result["scores"], result["labels"], result["boxes"]
+    ):
+        score = round(score.item(), 2)
+        label = classes_map[label_id.item()]
+        box = [round(i, 2) for i in box.tolist()]
+        print(f"{label}:{score} {box}")
+```
+# References
+```
+@techreport{Docling,
+  author = {Deep Search Team},
+  month = {8},
+  title = {Docling Technical Report},
+  url = {https://arxiv.org/abs/2408.09869v4},
+  eprint = {2408.09869},
+  doi = {10.48550/arXiv.2408.09869},
+  version = {1.0.0},
+  year = {2024}
+}
+@misc{peng2024dfine,
+      title={D-FINE: Redefine Regression Task in DETRs as Fine-grained Distribution Refinement},
+      author={Yansong Peng and Hebei Li and Peixi Wu and Yueyi Zhang and Xiaoyan Sun and Feng Wu},
+      year={2024},
+      eprint={2410.13842},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,224 @@

+{
+  "activation_dropout": 0.0,
+  "activation_function": "silu",
+  "anchor_image_size": null,
+  "architectures": [
+    "DFineForObjectDetection"
+  ],
+  "attention_dropout": 0.0,
+  "auxiliary_loss": true,
+  "backbone": null,
+  "backbone_config": {
+    "depths": [
+      3,
+      4,
+      6,
+      3
+    ],
+    "downsample_in_bottleneck": false,
+    "downsample_in_first_stage": false,
+    "embedding_size": 32,
+    "hidden_act": "relu",
+    "hidden_sizes": [
+      256,
+      512,
+      1024,
+      2048
+    ],
+    "initializer_range": 0.02,
+    "layer_type": "basic",
+    "model_type": "hgnet_v2",
+    "num_channels": 3,
+    "out_features": [
+      "stage2",
+      "stage3",
+      "stage4"
+    ],
+    "out_indices": [
+      2,
+      3,
+      4
+    ],
+    "stage_downsample": [
+      false,
+      true,
+      true,
+      true
+    ],
+    "stage_in_channels": [
+      48,
+      128,
+      512,
+      1024
+    ],
+    "stage_kernel_size": [
+      3,
+      3,
+      5,
+      5
+    ],
+    "stage_light_block": [
+      false,
+      false,
+      true,
+      true
+    ],
+    "stage_mid_channels": [
+      48,
+      96,
+      192,
+      384
+    ],
+    "stage_names": [
+      "stem",
+      "stage1",
+      "stage2",
+      "stage3",
+      "stage4"
+    ],
+    "stage_num_blocks": [
+      1,
+      1,
+      3,
+      1
+    ],
+    "stage_numb_of_layers": [
+      6,
+      6,
+      6,
+      6
+    ],
+    "stage_out_channels": [
+      128,
+      512,
+      1024,
+      2048
+    ],
+    "stem_channels": [
+      3,
+      32,
+      48
+    ],
+    "use_learnable_affine_block": false
+  },
+  "backbone_kwargs": null,
+  "batch_norm_eps": 1e-05,
+  "box_noise_scale": 1.0,
+  "d_model": 256,
+  "decoder_activation_function": "relu",
+  "decoder_attention_heads": 8,
+  "decoder_ffn_dim": 1024,
+  "decoder_in_channels": [
+    256,
+    256,
+    256
+  ],
+  "decoder_layers": 6,
+  "decoder_method": "default",
+  "decoder_n_points": [
+    3,
+    6,
+    3
+  ],
+  "decoder_offset_scale": 0.5,
+  "depth_mult": 1.0,
+  "dropout": 0.0,
+  "encode_proj_layers": [
+    2
+  ],
+  "encoder_activation_function": "gelu",
+  "encoder_attention_heads": 8,
+  "encoder_ffn_dim": 1024,
+  "encoder_hidden_dim": 256,
+  "encoder_in_channels": [
+    512,
+    1024,
+    2048
+  ],
+  "encoder_layers": 1,
+  "eos_coefficient": 0.0001,
+  "eval_idx": -1,
+  "eval_size": null,
+  "feat_strides": [
+    8,
+    16,
+    32
+  ],
+  "focal_loss_alpha": 0.75,
+  "focal_loss_gamma": 2.0,
+  "freeze_backbone_batch_norms": true,
+  "hidden_expansion": 1.0,
+  "id2label": {
+    "0": "Caption",
+    "1": "Footnote",
+    "2": "Formula",
+    "3": "List-item",
+    "4": "Page-footer",
+    "5": "Page-header",
+    "6": "Picture",
+    "7": "Section-header",
+    "8": "Table",
+    "9": "Text",
+    "10": "Title",
+    "11": "Document Index",
+    "12": "Code",
+    "13": "Checkbox-Selected",
+    "14": "Checkbox-Unselected",
+    "15": "Form",
+    "16": "Key-Value Region"
+  },
+  "initializer_bias_prior_prob": null,
+  "initializer_range": 0.01,
+  "is_encoder_decoder": true,
+  "label2id": {
+    "Caption": 0,
+    "Checkbox-Selected": 13,
+    "Checkbox-Unselected": 14,
+    "Code": 12,
+    "Document Index": 11,
+    "Footnote": 1,
+    "Form": 15,
+    "Formula": 2,
+    "Key-Value Region": 16,
+    "List-item": 3,
+    "Page-footer": 4,
+    "Page-header": 5,
+    "Picture": 6,
+    "Section-header": 7,
+    "Table": 8,
+    "Text": 9,
+    "Title": 10
+  },
+  "label_noise_ratio": 0.5,
+  "layer_norm_eps": 1e-05,
+  "layer_scale": 1,
+  "learn_initial_query": false,
+  "lqe_hidden_dim": 64,
+  "lqe_layers": 2,
+  "matcher_alpha": 0.25,
+  "matcher_bbox_cost": 5.0,
+  "matcher_class_cost": 2.0,
+  "matcher_gamma": 2.0,
+  "matcher_giou_cost": 2.0,
+  "max_num_bins": 32,
+  "model_type": "d_fine",
+  "normalize_before": false,
+  "num_denoising": 100,
+  "num_feature_levels": 3,
+  "num_queries": 300,
+  "positional_encoding_temperature": 10000,
+  "reg_scale": 4.0,
+  "top_prob_values": 4,
+  "torch_dtype": "float32",
+  "transformers_version": "4.53.0.dev0",
+  "up": 0.5,
+  "use_focal_loss": true,
+  "use_pretrained_backbone": false,
+  "use_timm_backbone": false,
+  "weight_loss_bbox": 5.0,
+  "weight_loss_ddf": 1.5,
+  "weight_loss_fgl": 0.15,
+  "weight_loss_giou": 2.0,
+  "weight_loss_vfl": 1.0,
+  "with_box_refine": true
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f79def9d4a0d4e6e62cab25ec7846d1579ef1ef657c39554363813f7d1a14f1b
+size 125100636

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "do_convert_annotations": true,
+  "do_normalize": false,
+  "do_pad": false,
+  "do_rescale": true,
+  "do_resize": true,
+  "format": "coco_detection",
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_processor_type": "RTDetrImageProcessor",
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "pad_size": null,
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 640,
+    "width": 640
+  }
+}