CT2534 Bvicii committed on
Commit
32d3ea3
·
0 Parent(s):

Duplicate from PaddlePaddle/PP-DocLayout_plus-L_safetensors

Browse files

Co-authored-by: Yizhan Huang <Bvicii@users.noreply.huggingface.co>

Files changed (6) hide show
  1. .gitattributes +35 -0
  2. README.md +48 -0
  3. config.json +232 -0
  4. inference.yml +123 -0
  5. model.safetensors +3 -0
  6. preprocessor_config.json +36 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: PaddleOCR
4
+ language:
5
+ - en
6
+ - zh
7
+ pipeline_tag: image-to-text
8
+ tags:
9
+ - OCR
10
+ - PaddlePaddle
11
+ - PaddleOCR
12
+ - layout_detection
13
+ ---
14
+
15
+ # PP-DocLayout_plus-L
16
+
17
+ ## Introduction
18
+
19
+ A higher-precision layout area localization model trained on a self-built dataset containing Chinese and English papers, PPTs, multi-layout magazines, contracts, books, exams, ancient books, and research reports, using RT-DETR-L. The layout detection model covers 20 common categories: document title, paragraph title, text, page number, abstract, table of contents, references, footnotes, header, footer, algorithm, formula, formula number, image, table, seal, figure/table title, chart, sidebar text, and reference content. The key metrics are as follows:
20
+
21
+ | Model| mAP(0.5) (%) |
22
+ | --- | --- |
23
+ |PP-DocLayout_plus-L | 83.2 |
24
+
25
+ **Note**: the evaluation set for the above precision metric is a self-built layout region detection dataset containing 1,000 document images, including Chinese and English papers, magazines, newspapers, research reports, PPTs, exam papers, and textbooks.
26
+
27
+ ## Model Usage
28
+
29
+ ```python
30
+ import requests
31
+ from PIL import Image
32
+ from transformers import AutoImageProcessor, AutoModelForObjectDetection
33
+
34
+ model_path = "PaddlePaddle/PP-DocLayout_plus-L_safetensors"
35
+ model = AutoModelForObjectDetection.from_pretrained(model_path)
36
+ image_processor = AutoImageProcessor.from_pretrained(model_path)
37
+
38
+ image = Image.open(requests.get("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg", stream=True).raw)
39
+ inputs = image_processor(images=image, return_tensors="pt")
40
+
41
+ outputs = model(**inputs)
42
+ results = image_processor.post_process_object_detection(outputs, target_sizes=[image.size[::-1]])
43
+ for result in results:
44
+ for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
45
+ score, label = score.item(), label_id.item()
46
+ box = [round(i, 2) for i in box.tolist()]
47
+ print(f"{model.config.id2label[label]}: {score:.2f} {box}")
48
+ ```
config.json ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "silu",
4
+ "anchor_image_size": null,
5
+ "attention_dropout": 0.0,
6
+ "auxiliary_loss": true,
7
+ "backbone_config": {
8
+ "arch": "L",
9
+ "depths": [
10
+ 3,
11
+ 4,
12
+ 6,
13
+ 3
14
+ ],
15
+ "embedding_size": 64,
16
+ "hidden_act": "relu",
17
+ "hidden_sizes": [
18
+ 256,
19
+ 512,
20
+ 1024,
21
+ 2048
22
+ ],
23
+ "initializer_range": 0.02,
24
+ "model_type": "hgnet_v2",
25
+ "num_channels": 3,
26
+ "out_features": [
27
+ "stage2",
28
+ "stage3",
29
+ "stage4"
30
+ ],
31
+ "out_indices": [
32
+ 2,
33
+ 3,
34
+ 4
35
+ ],
36
+ "stage_downsample": [
37
+ false,
38
+ true,
39
+ true,
40
+ true
41
+ ],
42
+ "stage_downsample_strides": [
43
+ 2,
44
+ 2,
45
+ 2,
46
+ 2
47
+ ],
48
+ "stage_in_channels": [
49
+ 48,
50
+ 128,
51
+ 512,
52
+ 1024
53
+ ],
54
+ "stage_kernel_size": [
55
+ 3,
56
+ 3,
57
+ 5,
58
+ 5
59
+ ],
60
+ "stage_light_block": [
61
+ false,
62
+ false,
63
+ true,
64
+ true
65
+ ],
66
+ "stage_mid_channels": [
67
+ 48,
68
+ 96,
69
+ 192,
70
+ 384
71
+ ],
72
+ "stage_names": [
73
+ "stem",
74
+ "stage1",
75
+ "stage2",
76
+ "stage3",
77
+ "stage4"
78
+ ],
79
+ "stage_num_blocks": [
80
+ 1,
81
+ 1,
82
+ 3,
83
+ 1
84
+ ],
85
+ "stage_numb_of_layers": [
86
+ 6,
87
+ 6,
88
+ 6,
89
+ 6
90
+ ],
91
+ "stage_out_channels": [
92
+ 128,
93
+ 512,
94
+ 1024,
95
+ 2048
96
+ ],
97
+ "stem_channels": [
98
+ 3,
99
+ 32,
100
+ 48
101
+ ],
102
+ "stem_strides": [
103
+ 2,
104
+ 1,
105
+ 1,
106
+ 2,
107
+ 1
108
+ ],
109
+ "use_learnable_affine_block": false,
110
+ "return_idx": [
111
+ 1,
112
+ 2,
113
+ 3
114
+ ],
115
+ "freeze_stem_only": true,
116
+ "freeze_at": 0,
117
+ "freeze_norm": false,
118
+ "lr_mult_list": [
119
+ 0.05,
120
+ 0.05,
121
+ 0.1,
122
+ 0.15,
123
+ 0.2
124
+ ]
125
+ },
126
+ "batch_norm_eps": 1e-05,
127
+ "box_noise_scale": 1.0,
128
+ "d_model": 256,
129
+ "decoder_activation_function": "relu",
130
+ "decoder_attention_heads": 8,
131
+ "decoder_ffn_dim": 1024,
132
+ "decoder_in_channels": [
133
+ 256,
134
+ 256,
135
+ 256
136
+ ],
137
+ "decoder_layers": 6,
138
+ "decoder_n_points": 4,
139
+ "disable_custom_kernels": true,
140
+ "dropout": 0.0,
141
+ "encode_proj_layers": [
142
+ 2
143
+ ],
144
+ "encoder_activation_function": "gelu",
145
+ "encoder_attention_heads": 8,
146
+ "encoder_ffn_dim": 1024,
147
+ "encoder_hidden_dim": 256,
148
+ "encoder_in_channels": [
149
+ 512,
150
+ 1024,
151
+ 2048
152
+ ],
153
+ "encoder_layers": 1,
154
+ "eos_coefficient": 0.0001,
155
+ "eval_size": null,
156
+ "feat_strides": [
157
+ 8,
158
+ 16,
159
+ 32
160
+ ],
161
+ "focal_loss_alpha": 0.75,
162
+ "focal_loss_gamma": 2.0,
163
+ "freeze_backbone_batch_norms": true,
164
+ "hidden_expansion": 1.0,
165
+ "id2label": {
166
+ "0": "paragraph_title",
167
+ "1": "image",
168
+ "10": "doc_title",
169
+ "11": "footnote",
170
+ "12": "header",
171
+ "13": "algorithm",
172
+ "14": "footer",
173
+ "15": "seal",
174
+ "16": "chart",
175
+ "17": "formula_number",
176
+ "18": "aside_text",
177
+ "19": "reference_content",
178
+ "2": "text",
179
+ "3": "number",
180
+ "4": "abstract",
181
+ "5": "content",
182
+ "6": "figure_title",
183
+ "7": "formula",
184
+ "8": "table",
185
+ "9": "reference"
186
+ },
187
+ "initializer_bias_prior_prob": null,
188
+ "initializer_range": 0.01,
189
+ "is_encoder_decoder": true,
190
+ "label2id": {
191
+ "abstract": 4,
192
+ "algorithm": 13,
193
+ "aside_text": 18,
194
+ "chart": 16,
195
+ "content": 5,
196
+ "doc_title": 10,
197
+ "figure_title": 6,
198
+ "footer": 14,
199
+ "footnote": 11,
200
+ "formula": 7,
201
+ "formula_number": 17,
202
+ "header": 12,
203
+ "image": 1,
204
+ "number": 3,
205
+ "paragraph_title": 0,
206
+ "reference": 9,
207
+ "reference_content": 19,
208
+ "seal": 15,
209
+ "table": 8,
210
+ "text": 2
211
+ },
212
+ "label_noise_ratio": 0.5,
213
+ "layer_norm_eps": 1e-05,
214
+ "learn_initial_query": false,
215
+ "matcher_alpha": 0.25,
216
+ "matcher_bbox_cost": 5.0,
217
+ "matcher_class_cost": 2.0,
218
+ "matcher_gamma": 2.0,
219
+ "matcher_giou_cost": 2.0,
220
+ "model_type": "rt_detr",
221
+ "normalize_before": false,
222
+ "num_denoising": 100,
223
+ "num_feature_levels": 3,
224
+ "num_queries": 300,
225
+ "positional_encoding_temperature": 10000,
226
+ "transformers_version": "5.3.0.dev0",
227
+ "use_focal_loss": true,
228
+ "weight_loss_bbox": 5.0,
229
+ "weight_loss_giou": 2.0,
230
+ "weight_loss_vfl": 1.0,
231
+ "with_box_refine": true
232
+ }
inference.yml ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ mode: paddle
2
+ draw_threshold: 0.5
3
+ metric: COCO
4
+ use_dynamic_shape: false
5
+ Global:
6
+ model_name: PP-DocLayout_plus-L
7
+ arch: DETR
8
+ min_subgraph_size: 3
9
+ Preprocess:
10
+ - interp: 2
11
+ keep_ratio: false
12
+ target_size:
13
+ - 800
14
+ - 800
15
+ type: Resize
16
+ - mean:
17
+ - 0.0
18
+ - 0.0
19
+ - 0.0
20
+ norm_type: none
21
+ std:
22
+ - 1.0
23
+ - 1.0
24
+ - 1.0
25
+ type: NormalizeImage
26
+ - type: Permute
27
+ label_list:
28
+ - paragraph_title
29
+ - image
30
+ - text
31
+ - number
32
+ - abstract
33
+ - content
34
+ - figure_title
35
+ - formula
36
+ - table
37
+ - reference
38
+ - doc_title
39
+ - footnote
40
+ - header
41
+ - algorithm
42
+ - footer
43
+ - seal
44
+ - chart
45
+ - formula_number
46
+ - aside_text
47
+ - reference_content
48
+ Hpi:
49
+ backend_configs:
50
+ paddle_infer:
51
+ trt_dynamic_shapes: &id001
52
+ im_shape:
53
+ - - 1
54
+ - 2
55
+ - - 1
56
+ - 2
57
+ - - 8
58
+ - 2
59
+ image:
60
+ - - 1
61
+ - 3
62
+ - 800
63
+ - 800
64
+ - - 1
65
+ - 3
66
+ - 800
67
+ - 800
68
+ - - 8
69
+ - 3
70
+ - 800
71
+ - 800
72
+ scale_factor:
73
+ - - 1
74
+ - 2
75
+ - - 1
76
+ - 2
77
+ - - 8
78
+ - 2
79
+ trt_dynamic_shape_input_data:
80
+ im_shape:
81
+ - - 800
82
+ - 800
83
+ - - 800
84
+ - 800
85
+ - - 800
86
+ - 800
87
+ - 800
88
+ - 800
89
+ - 800
90
+ - 800
91
+ - 800
92
+ - 800
93
+ - 800
94
+ - 800
95
+ - 800
96
+ - 800
97
+ - 800
98
+ - 800
99
+ - 800
100
+ - 800
101
+ scale_factor:
102
+ - - 2
103
+ - 2
104
+ - - 1
105
+ - 1
106
+ - - 0.67
107
+ - 0.67
108
+ - 0.67
109
+ - 0.67
110
+ - 0.67
111
+ - 0.67
112
+ - 0.67
113
+ - 0.67
114
+ - 0.67
115
+ - 0.67
116
+ - 0.67
117
+ - 0.67
118
+ - 0.67
119
+ - 0.67
120
+ - 0.67
121
+ - 0.67
122
+ tensorrt:
123
+ dynamic_shapes: *id001
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa00c2cf5285eeec58c47d5c91d0950f34d61af8a2a484137790278cca899b98
3
+ size 131776256
preprocessor_config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_valid_processor_keys": [
3
+ "images",
4
+ "do_resize",
5
+ "size",
6
+ "resample",
7
+ "do_rescale",
8
+ "rescale_factor",
9
+ "do_normalize",
10
+ "image_mean",
11
+ "image_std",
12
+ "return_tensors",
13
+ "data_format",
14
+ "input_data_format"
15
+ ],
16
+ "do_normalize": true,
17
+ "do_rescale": true,
18
+ "do_resize": true,
19
+ "image_processor_type": "RTDetrImageProcessor",
20
+ "image_mean": [
21
+ 0,
22
+ 0,
23
+ 0
24
+ ],
25
+ "image_std": [
26
+ 1,
27
+ 1,
28
+ 1
29
+ ],
30
+ "rescale_factor": 0.00392156862745098,
31
+ "resample": 3,
32
+ "size": {
33
+ "height": 800,
34
+ "width": 800
35
+ }
36
+ }