Safetensors
d_fine
Nikos Livathinos committed on
Commit 95c2a5e · 1 Parent(s): 3b4e5e7

feat: Initial commit of docling-layout-egret-large with safetensor checkpoint and demo code.

Files changed (4)
  1. README.md +114 -3
  2. config.json +224 -0
  3. model.safetensors +3 -0
  4. preprocessor_config.json +26 -0
README.md CHANGED
@@ -1,3 +1,114 @@
- ---
- license: apache-2.0
- ---
 
+ ---
+ license: apache-2.0
+ ---
+
+ THIS IS WORK IN PROGRESS
+
+ # Docling Layout Model egret-large
+
+ `docling-layout-egret-large` is a document layout model based on [D-FINE-m](https://github.com/Peterande/D-FINE).
+
+ The model has been trained from scratch on a mix of document datasets.
+
+ It is part of the [Docling project](https://github.com/docling-project/docling).
+
+ # Inference code example
+
+ Prerequisites:
+
+ ```bash
+ pip install transformers Pillow torch requests
+ ```
+
+ Prediction:
+
+ ```python
+ import requests
+ import torch
+ from PIL import Image
+ from transformers import DFineForObjectDetection, RTDetrImageProcessor
+
+ # Mapping from class id to layout label, matching id2label in config.json
+ classes_map = {
+     0: "Caption",
+     1: "Footnote",
+     2: "Formula",
+     3: "List-item",
+     4: "Page-footer",
+     5: "Page-header",
+     6: "Picture",
+     7: "Section-header",
+     8: "Table",
+     9: "Text",
+     10: "Title",
+     11: "Document Index",
+     12: "Code",
+     13: "Checkbox-Selected",
+     14: "Checkbox-Unselected",
+     15: "Form",
+     16: "Key-Value Region",
+ }
+
+ image_url = "https://huggingface.co/spaces/ds4sd/SmolDocling-256M-Demo/resolve/main/example_images/annual_rep_14.png"
+ model_name = "ds4sd/docling-layout-egret-large"
+ threshold = 0.6  # minimum confidence for a detection to be kept
+
+ # Download the example page image
+ image = Image.open(requests.get(image_url, stream=True).raw)
+ image = image.convert("RGB")
+
+ # Initialize the image processor and the model
+ image_processor = RTDetrImageProcessor.from_pretrained(model_name)
+ model = DFineForObjectDetection.from_pretrained(model_name)
+
+ # Run the prediction pipeline
+ inputs = image_processor(images=[image], return_tensors="pt")
+ with torch.no_grad():
+     outputs = model(**inputs)
+ results = image_processor.post_process_object_detection(
+     outputs,
+     target_sizes=torch.tensor([image.size[::-1]]),
+     threshold=threshold,
+ )
+
+ # Print the detected layout elements
+ for result in results:
+     for score, label_id, box in zip(
+         result["scores"], result["labels"], result["boxes"]
+     ):
+         score = round(score.item(), 2)
+         label = classes_map[label_id.item()]
+         box = [round(i, 2) for i in box.tolist()]
+         print(f"{label}: {score} {box}")
+ ```
+
+ # References
+
+ ```
+ @techreport{Docling,
+   author = {Deep Search Team},
+   month = {8},
+   title = {Docling Technical Report},
+   url = {https://arxiv.org/abs/2408.09869v4},
+   eprint = {2408.09869},
+   doi = {10.48550/arXiv.2408.09869},
+   version = {1.0.0},
+   year = {2024}
+ }
+
+ @misc{peng2024dfine,
+   title = {D-FINE: Redefine Regression Task in DETRs as Fine-grained Distribution Refinement},
+   author = {Yansong Peng and Hebei Li and Peixi Wu and Yueyi Zhang and Xiaoyan Sun and Feng Wu},
+   year = {2024},
+   eprint = {2410.13842},
+   archivePrefix = {arXiv},
+   primaryClass = {cs.CV}
+ }
+ ```
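As a quick sanity check of the prediction snippet in the README above, the detected boxes can be drawn back onto the page image. A minimal sketch, assuming the `image`, `results`, and `classes_map` variables from that snippet are still in scope:

```python
from PIL import ImageDraw

# Draw each detected layout box with its label on a copy of the page image.
# Assumes `image`, `results`, and `classes_map` from the README example.
annotated = image.copy()
draw = ImageDraw.Draw(annotated)
for result in results:
    for score, label_id, box in zip(
        result["scores"], result["labels"], result["boxes"]
    ):
        x0, y0, x1, y1 = box.tolist()
        label = classes_map[label_id.item()]
        draw.rectangle((x0, y0, x1, y1), outline="red", width=2)
        draw.text((x0, max(0, y0 - 12)), f"{label} {score.item():.2f}", fill="red")
annotated.save("layout_preview.png")
```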
config.json ADDED
@@ -0,0 +1,224 @@
+ {
+   "activation_dropout": 0.0,
+   "activation_function": "silu",
+   "anchor_image_size": null,
+   "architectures": [
+     "DFineForObjectDetection"
+   ],
+   "attention_dropout": 0.0,
+   "auxiliary_loss": true,
+   "backbone": null,
+   "backbone_config": {
+     "depths": [
+       3,
+       4,
+       6,
+       3
+     ],
+     "downsample_in_bottleneck": false,
+     "downsample_in_first_stage": false,
+     "embedding_size": 32,
+     "hidden_act": "relu",
+     "hidden_sizes": [
+       256,
+       512,
+       1024,
+       2048
+     ],
+     "initializer_range": 0.02,
+     "layer_type": "basic",
+     "model_type": "hgnet_v2",
+     "num_channels": 3,
+     "out_features": [
+       "stage2",
+       "stage3",
+       "stage4"
+     ],
+     "out_indices": [
+       2,
+       3,
+       4
+     ],
+     "stage_downsample": [
+       false,
+       true,
+       true,
+       true
+     ],
+     "stage_in_channels": [
+       48,
+       128,
+       512,
+       1024
+     ],
+     "stage_kernel_size": [
+       3,
+       3,
+       5,
+       5
+     ],
+     "stage_light_block": [
+       false,
+       false,
+       true,
+       true
+     ],
+     "stage_mid_channels": [
+       48,
+       96,
+       192,
+       384
+     ],
+     "stage_names": [
+       "stem",
+       "stage1",
+       "stage2",
+       "stage3",
+       "stage4"
+     ],
+     "stage_num_blocks": [
+       1,
+       1,
+       3,
+       1
+     ],
+     "stage_numb_of_layers": [
+       6,
+       6,
+       6,
+       6
+     ],
+     "stage_out_channels": [
+       128,
+       512,
+       1024,
+       2048
+     ],
+     "stem_channels": [
+       3,
+       32,
+       48
+     ],
+     "use_learnable_affine_block": false
+   },
+   "backbone_kwargs": null,
+   "batch_norm_eps": 1e-05,
+   "box_noise_scale": 1.0,
+   "d_model": 256,
+   "decoder_activation_function": "relu",
+   "decoder_attention_heads": 8,
+   "decoder_ffn_dim": 1024,
+   "decoder_in_channels": [
+     256,
+     256,
+     256
+   ],
+   "decoder_layers": 6,
+   "decoder_method": "default",
+   "decoder_n_points": [
+     3,
+     6,
+     3
+   ],
+   "decoder_offset_scale": 0.5,
+   "depth_mult": 1.0,
+   "dropout": 0.0,
+   "encode_proj_layers": [
+     2
+   ],
+   "encoder_activation_function": "gelu",
+   "encoder_attention_heads": 8,
+   "encoder_ffn_dim": 1024,
+   "encoder_hidden_dim": 256,
+   "encoder_in_channels": [
+     512,
+     1024,
+     2048
+   ],
+   "encoder_layers": 1,
+   "eos_coefficient": 0.0001,
+   "eval_idx": -1,
+   "eval_size": null,
+   "feat_strides": [
+     8,
+     16,
+     32
+   ],
+   "focal_loss_alpha": 0.75,
+   "focal_loss_gamma": 2.0,
+   "freeze_backbone_batch_norms": true,
+   "hidden_expansion": 1.0,
+   "id2label": {
+     "0": "Caption",
+     "1": "Footnote",
+     "2": "Formula",
+     "3": "List-item",
+     "4": "Page-footer",
+     "5": "Page-header",
+     "6": "Picture",
+     "7": "Section-header",
+     "8": "Table",
+     "9": "Text",
+     "10": "Title",
+     "11": "Document Index",
+     "12": "Code",
+     "13": "Checkbox-Selected",
+     "14": "Checkbox-Unselected",
+     "15": "Form",
+     "16": "Key-Value Region"
+   },
+   "initializer_bias_prior_prob": null,
+   "initializer_range": 0.01,
+   "is_encoder_decoder": true,
+   "label2id": {
+     "Caption": 0,
+     "Checkbox-Selected": 13,
+     "Checkbox-Unselected": 14,
+     "Code": 12,
+     "Document Index": 11,
+     "Footnote": 1,
+     "Form": 15,
+     "Formula": 2,
+     "Key-Value Region": 16,
+     "List-item": 3,
+     "Page-footer": 4,
+     "Page-header": 5,
+     "Picture": 6,
+     "Section-header": 7,
+     "Table": 8,
+     "Text": 9,
+     "Title": 10
+   },
+   "label_noise_ratio": 0.5,
+   "layer_norm_eps": 1e-05,
+   "layer_scale": 1,
+   "learn_initial_query": false,
+   "lqe_hidden_dim": 64,
+   "lqe_layers": 2,
+   "matcher_alpha": 0.25,
+   "matcher_bbox_cost": 5.0,
+   "matcher_class_cost": 2.0,
+   "matcher_gamma": 2.0,
+   "matcher_giou_cost": 2.0,
+   "max_num_bins": 32,
+   "model_type": "d_fine",
+   "normalize_before": false,
+   "num_denoising": 100,
+   "num_feature_levels": 3,
+   "num_queries": 300,
+   "positional_encoding_temperature": 10000,
+   "reg_scale": 4.0,
+   "top_prob_values": 4,
+   "torch_dtype": "float32",
+   "transformers_version": "4.53.0.dev0",
+   "up": 0.5,
+   "use_focal_loss": true,
+   "use_pretrained_backbone": false,
+   "use_timm_backbone": false,
+   "weight_loss_bbox": 5.0,
+   "weight_loss_ddf": 1.5,
+   "weight_loss_fgl": 0.15,
+   "weight_loss_giou": 2.0,
+   "weight_loss_vfl": 1.0,
+   "with_box_refine": true
+ }
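The config above fully specifies the D-FINE architecture (HGNetV2 backbone, hybrid encoder, 6 decoder layers, 300 queries) and the 17-class label space. A minimal sketch for reading it programmatically, assuming a transformers build recent enough to include the `d_fine` model type (the file was written with 4.53.0.dev0):

```python
from transformers import AutoConfig

# Load this config.json straight from the Hub
config = AutoConfig.from_pretrained("ds4sd/docling-layout-egret-large")

print(config.model_type)     # d_fine
print(config.num_queries)    # 300 object queries per image
print(len(config.id2label))  # 17 layout classes
print(config.id2label[8])    # Table (keys become ints after loading)
```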
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f79def9d4a0d4e6e62cab25ec7846d1579ef1ef657c39554363813f7d1a14f1b
+ size 125100636
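The entry above is a Git LFS pointer; the actual checkpoint is a ~125 MB safetensors file of float32 weights. A minimal sketch for listing its tensors without instantiating the model, assuming the `safetensors` and `huggingface_hub` packages are installed:

```python
from huggingface_hub import hf_hub_download
from safetensors import safe_open

# Download (or reuse the cached copy of) the checkpoint
path = hf_hub_download(
    repo_id="ds4sd/docling-layout-egret-large",
    filename="model.safetensors",
)

# Read shapes from the safetensors header without loading tensor data
with safe_open(path, framework="pt") as f:
    for name in list(f.keys())[:5]:
        print(name, f.get_slice(name).get_shape())
```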
preprocessor_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "do_convert_annotations": true,
+   "do_normalize": false,
+   "do_pad": false,
+   "do_rescale": true,
+   "do_resize": true,
+   "format": "coco_detection",
+   "image_mean": [
+     0.485,
+     0.456,
+     0.406
+   ],
+   "image_processor_type": "RTDetrImageProcessor",
+   "image_std": [
+     0.229,
+     0.224,
+     0.225
+   ],
+   "pad_size": null,
+   "resample": 2,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "height": 640,
+     "width": 640
+   }
+ }
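Per this config, the processor resizes every input to 640×640 (`resample: 2` is bilinear) and rescales pixel values by 1/255 (`rescale_factor`), but applies no mean/std normalization (`do_normalize` is false, so `image_mean` and `image_std` are unused). A minimal sketch verifying that behavior on a dummy page:

```python
from PIL import Image
from transformers import RTDetrImageProcessor

processor = RTDetrImageProcessor.from_pretrained("ds4sd/docling-layout-egret-large")

# A blank white "page" with a non-square aspect ratio
page = Image.new("RGB", (1240, 1754), "white")
inputs = processor(images=[page], return_tensors="pt")

pixel_values = inputs["pixel_values"]
print(pixel_values.shape)         # torch.Size([1, 3, 640, 640])
print(pixel_values.max().item())  # 1.0 -- white pixels rescaled by 1/255
```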