topdu committed on
Commit
29f689c
·
1 Parent(s): 2d03ea4

openocr demo

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +127 -0
  2. configs/det/dbnet/repvit_db.yml +173 -0
  3. configs/rec/abinet/resnet45_trans_abinet_lang.yml +94 -0
  4. configs/rec/abinet/resnet45_trans_abinet_wo_lang.yml +93 -0
  5. configs/rec/abinet/svtrv2_abinet_lang.yml +130 -0
  6. configs/rec/abinet/svtrv2_abinet_wo_lang.yml +128 -0
  7. configs/rec/aster/resnet31_lstm_aster_tps_on.yml +93 -0
  8. configs/rec/aster/svtrv2_aster.yml +127 -0
  9. configs/rec/aster/svtrv2_aster_tps_on.yml +102 -0
  10. configs/rec/autostr/autostr_lstm_aster_tps_on.yml +95 -0
  11. configs/rec/busnet/svtrv2_busnet.yml +135 -0
  12. configs/rec/busnet/svtrv2_busnet_pretraining.yml +134 -0
  13. configs/rec/busnet/vit_busnet.yml +104 -0
  14. configs/rec/busnet/vit_busnet_pretraining.yml +104 -0
  15. configs/rec/cam/convnextv2_cam_tps_on.yml +118 -0
  16. configs/rec/cam/convnextv2_tiny_cam_tps_on.yml +118 -0
  17. configs/rec/cam/svtrv2_cam_tps_on.yml +123 -0
  18. configs/rec/cdistnet/resnet45_trans_cdistnet.yml +93 -0
  19. configs/rec/cdistnet/svtrv2_cdistnet.yml +139 -0
  20. configs/rec/cppd/svtr_base_cppd.yml +123 -0
  21. configs/rec/cppd/svtr_base_cppd_ch.yml +126 -0
  22. configs/rec/cppd/svtr_base_cppd_h8.yml +123 -0
  23. configs/rec/cppd/svtr_base_cppd_syn.yml +124 -0
  24. configs/rec/cppd/svtrv2_cppd.yml +150 -0
  25. configs/rec/dan/resnet45_fpn_dan.yml +98 -0
  26. configs/rec/dan/svtrv2_dan.yml +130 -0
  27. configs/rec/focalsvtr/focalsvtr_ctc.yml +137 -0
  28. configs/rec/gtc/svtrv2_lnconv_nrtr_gtc.yml +168 -0
  29. configs/rec/gtc/svtrv2_lnconv_smtr_gtc_long_infer.yml +151 -0
  30. configs/rec/gtc/svtrv2_lnconv_smtr_gtc_smtr_long.yml +150 -0
  31. configs/rec/gtc/svtrv2_lnconv_smtr_gtc_stream.yml +152 -0
  32. configs/rec/igtr/readme.md +189 -0
  33. configs/rec/igtr/svtr_base_ds_igtr.yml +157 -0
  34. configs/rec/lister/focalsvtr_lister_wo_fem_maxratio12.yml +133 -0
  35. configs/rec/lister/svtrv2_lister_wo_fem_maxratio12.yml +138 -0
  36. configs/rec/lpv/svtr_base_lpv.yml +124 -0
  37. configs/rec/lpv/svtr_base_lpv_wo_glrm.yml +123 -0
  38. configs/rec/lpv/svtrv2_lpv.yml +147 -0
  39. configs/rec/lpv/svtrv2_lpv_wo_glrm.yml +146 -0
  40. configs/rec/maerec/vit_nrtr.yml +116 -0
  41. configs/rec/matrn/resnet45_trans_matrn.yml +95 -0
  42. configs/rec/matrn/svtrv2_matrn.yml +130 -0
  43. configs/rec/mgpstr/svtrv2_mgpstr_only_char.yml +140 -0
  44. configs/rec/mgpstr/vit_base_mgpstr_only_char.yml +111 -0
  45. configs/rec/mgpstr/vit_large_mgpstr_only_char.yml +110 -0
  46. configs/rec/mgpstr/vit_mgpstr.yml +110 -0
  47. configs/rec/mgpstr/vit_mgpstr_only_char.yml +110 -0
  48. configs/rec/moran/resnet31_lstm_moran.yml +92 -0
  49. configs/rec/nrtr/focalsvtr_nrtr_maxraio12.yml +145 -0
  50. configs/rec/nrtr/nrtr.yml +107 -0
app.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr # gradio==4.20.0
3
+
4
+ os.environ['FLAGS_allocator_strategy'] = 'auto_growth'
5
+ import cv2
6
+ import numpy as np
7
+ import json
8
+ import time
9
+ from PIL import Image
10
+ from tools.infer_e2e import OpenOCR, check_and_download_font, draw_ocr_box_txt
11
+
12
# Score threshold handed to both the OCR pipeline and the result renderer.
drop_score = 0.01
text_sys = OpenOCR(drop_score=drop_score)

# Warm up: run the pipeline 5 times on a random 640x640x3 uint8 image so the
# first real request does not pay the one-off start-up cost.
img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8)
for _ in range(5):
    res = text_sys(img_numpy=img)

# Font used when drawing recognized text onto the output image.
# NOTE(review): presumably check_and_download_font fetches the file when it
# is missing -- confirm against tools.infer_e2e.
font_path = './simfang.ttf'
check_and_download_font(font_path)
21
+
22
+
23
def main(input_image):
    """Run the OCR pipeline on one image and build the demo outputs.

    Args:
        input_image: Image array from the Gradio image input, indexed as
            ``[row, col, channel]``. Assumed to be RGB (the channel axis is
            reversed to get BGR before inference) -- TODO confirm if the
            component's image mode is ever changed.

    Returns:
        A 4-tuple ``(save_pred, elapse, draw_img, mask_img)``:
        ``save_pred`` is ``results[0]`` serialized as a JSON string,
        ``elapse`` is the inference time in seconds, ``draw_img`` is the
        value returned by ``draw_ocr_box_txt``, and ``mask_img`` is a uint8
        array holding 0 or 255.

    Raises:
        gr.Error: If no image was supplied.
    """
    # Gradio passes None when "Run" is clicked with an empty image input;
    # report that to the user instead of failing on the slice below.
    if input_image is None:
        raise gr.Error('Please upload or select an image first.')

    # Reverse the channel axis (RGB -> BGR) for the OCR pipeline.
    img = input_image[:, :, ::-1]

    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # system clock adjustments.
    starttime = time.perf_counter()
    results, _time_dict, mask = text_sys(img_numpy=img, return_mask=True)
    elapse = time.perf_counter() - starttime

    page = results[0]
    save_pred = json.dumps(page, ensure_ascii=False)

    # Back to RGB for PIL before drawing boxes and text.
    image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    boxes = [res['points'] for res in page]
    txts = [res['transcription'] for res in page]
    scores = [res['score'] for res in page]
    draw_img = draw_ocr_box_txt(
        image,
        boxes,
        txts,
        scores,
        drop_score=drop_score,
        font_path=font_path,
    )

    # Binarize the first map of the first batch item at 0.3 and scale the
    # boolean result to 0/255 so it displays as an image.
    mask = mask[0, 0, :, :] > 0.3
    return save_pred, elapse, draw_img, mask.astype('uint8') * 255
43
+
44
+
45
def get_all_file_names_including_subdirs(dir_path):
    """Return the base names of all files under ``dir_path``, recursively.

    Args:
        dir_path: Directory to walk.

    Returns:
        A list of file names (no directory components) in ``os.walk`` order.
        Names that occur in several sub-directories appear once per
        occurrence. The list is empty when nothing is found.
    """
    # os.walk already yields bare file names, so there is no need to join
    # them to ``root`` only to strip the directory part again.
    return [
        file_name
        for _root, _dirs, files in os.walk(dir_path)
        for file_name in files
    ]
54
+
55
+
56
def list_image_paths(directory):
    """Return the sorted paths of all image files under ``directory``.

    A file counts as an image when its lower-cased name ends with one of
    ``.png``, ``.jpg``, ``.jpeg``, ``.gif``, ``.bmp`` or ``.tiff``.

    Args:
        directory: Directory to walk recursively.

    Returns:
        A sorted list of paths, each prefixed with ``directory`` as given.
        The list is empty when nothing matches.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')

    # ``root`` from os.walk already starts with ``directory``, so joining it
    # with the file name gives the same path as taking the relative path and
    # re-joining it onto ``directory``.
    return sorted(
        os.path.join(root, file_name)
        for root, _dirs, files in os.walk(directory)
        for file_name in files
        if file_name.lower().endswith(image_extensions)
    )
69
+
70
+
71
def find_file_in_current_dir_and_subdirs(file_name):
    """Locate ``file_name`` in the current working directory tree.

    Args:
        file_name: Bare file name to look for (no directory components).

    Returns:
        The path, relative to the current directory (starting with ``.``),
        of the first match found by ``os.walk('.')``, or ``None`` when no
        file with that name exists.
    """
    for root, _dirs, files in os.walk('.'):
        if file_name in files:
            return os.path.join(root, file_name)
    # Make the "not found" result explicit instead of falling off the end.
    return None
76
+
77
+
78
def predict1(input_image, Model_type, OCR_type):
    """Return a fixed code, an OCR-type tag and the unchanged input image.

    Args:
        input_image: Value passed straight through as the third result.
        Model_type: Accepted but not used by this function.
        OCR_type: ``'E2E'`` or ``'STR'`` select the matching tag; any other
            value yields ``'STD'``.

    Returns:
        The tuple ``(11111, tag, input_image)``.
    """
    if OCR_type == 'E2E':
        tag = 'E2E'
    elif OCR_type == 'STR':
        tag = 'STR'
    else:
        tag = 'STD'
    return 11111, tag, input_image
85
+
86
+
87
# Example images offered in the demo's "Examples" gallery.
e2e_img_example = list_image_paths('./OCR_e2e_img')

if __name__ == '__main__':
    # Constrain preview images so tall inputs do not stretch the page.
    css = '.image-container img { width: 100%; max-height: 320px;}'

    with gr.Blocks(css=css) as demo:
        gr.HTML("""
<h1 style='text-align: center;'>OpenOCR</h1>""")
        with gr.Row():
            # Left column: input image, example gallery and the run button.
            with gr.Column(scale=1):
                input_image = gr.Image(
                    label='Input image',
                    elem_classes=['image-container'],
                )

                examples = gr.Examples(
                    examples=e2e_img_example,
                    inputs=input_image,
                    label='Examples',
                )
                downstream = gr.Button('Run')

            # Right column: mask, annotated image, JSON text and latency.
            with gr.Column(scale=1):
                img_mask = gr.Image(
                    label='mask',
                    interactive=False,
                    elem_classes=['image-container'],
                )
                img_output = gr.Image(
                    label=' ',
                    interactive=False,
                    elem_classes=['image-container'],
                )

                output = gr.Textbox(label='Result')
                # Receives the second value returned by main (elapsed time).
                confidence = gr.Textbox(label='Latency')

        # Output order must match the tuple returned by main.
        downstream.click(
            fn=main,
            inputs=[input_image],
            outputs=[output, confidence, img_output, img_mask],
        )

    demo.launch(share=True)
configs/det/dbnet/repvit_db.yml ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: &epoch_num 500
4
+ log_smooth_window: 20
5
+ print_batch_step: 100
6
+ save_model_dir: ./output/det_repsvtr_db
7
+ save_epoch_step: 10
8
+ eval_batch_step:
9
+ - 0
10
+ - 1000
11
+ cal_metric_during_train: false
12
+ checkpoints:
13
+ pretrained_model: openocr_det_repvit_ch.pth
14
+ save_inference_dir: null
15
+ use_visualdl: false
16
+ infer_img: ./testA
17
+ save_res_path: ./checkpoints/det_db/predicts_db.txt
18
+ distributed: true
19
+ model_type: det
20
+
21
+ Architecture:
22
+ algorithm: DB
23
+ Backbone:
24
+ name: RepSVTR_det
25
+ Neck:
26
+ name: RSEFPN
27
+ out_channels: 96
28
+ shortcut: True
29
+ Head:
30
+ name: DBHead
31
+ k: 50
32
+
33
+ # Loss:
34
+ # name: DBLoss
35
+ # balance_loss: true
36
+ # main_loss_type: DiceLoss
37
+ # alpha: 5
38
+ # beta: 10
39
+ # ohem_ratio: 3
40
+
41
+ # Optimizer:
42
+ # name: Adam
43
+ # beta1: 0.9
44
+ # beta2: 0.999
45
+ # lr:
46
+ # name: Cosine
47
+ # learning_rate: 0.001 #(8*8c)
48
+ # warmup_epoch: 2
49
+ # regularizer:
50
+ # name: L2
51
+ # factor: 5.0e-05
52
+
53
+ PostProcess:
54
+ name: DBPostProcess
55
+ thresh: 0.3
56
+ box_thresh: 0.4
57
+ max_candidates: 1000
58
+ unclip_ratio: 1.5
59
+ score_mode: 'slow'
60
+
61
+ # Metric:
62
+ # name: DetMetric
63
+ # main_indicator: hmean
64
+
65
+ # Train:
66
+ # dataset:
67
+ # name: SimpleDataSet
68
+ # data_dir: ./train_data/icdar2015/text_localization/
69
+ # label_file_list:
70
+ # - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
71
+ # ratio_list: [1.0]
72
+ # transforms:
73
+ # - DecodeImage:
74
+ # img_mode: BGR
75
+ # channel_first: false
76
+ # - DetLabelEncode: null
77
+ # - CopyPaste: null
78
+ # - IaaAugment:
79
+ # augmenter_args:
80
+ # - type: Fliplr
81
+ # args:
82
+ # p: 0.5
83
+ # - type: Affine
84
+ # args:
85
+ # rotate:
86
+ # - -10
87
+ # - 10
88
+ # - type: Resize
89
+ # args:
90
+ # size:
91
+ # - 0.5
92
+ # - 3
93
+ # - EastRandomCropData:
94
+ # size:
95
+ # - 640
96
+ # - 640
97
+ # max_tries: 50
98
+ # keep_ratio: true
99
+ # - MakeBorderMap:
100
+ # shrink_ratio: 0.4
101
+ # thresh_min: 0.3
102
+ # thresh_max: 0.7
103
+ # total_epoch: *epoch_num
104
+ # - MakeShrinkMap:
105
+ # shrink_ratio: 0.4
106
+ # min_text_size: 8
107
+ # total_epoch: *epoch_num
108
+ # - NormalizeImage:
109
+ # scale: 1./255.
110
+ # mean:
111
+ # - 0.485
112
+ # - 0.456
113
+ # - 0.406
114
+ # std:
115
+ # - 0.229
116
+ # - 0.224
117
+ # - 0.225
118
+ # order: hwc
119
+ # - ToCHWImage: null
120
+ # - KeepKeys:
121
+ # keep_keys:
122
+ # - image
123
+ # - threshold_map
124
+ # - threshold_mask
125
+ # - shrink_map
126
+ # - shrink_mask
127
+ # loader:
128
+ # shuffle: true
129
+ # drop_last: false
130
+ # batch_size_per_card: 8
131
+ # num_workers: 8
132
+
133
+ Eval:
134
+ dataset:
135
+ name: SimpleDataSet
136
+ data_dir: ./train_data/icdar2015/text_localization/
137
+ label_file_list:
138
+ - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
139
+ transforms:
140
+ - DecodeImage:
141
+ img_mode: BGR
142
+ channel_first: false
143
+ - DetLabelEncode: null
144
+ - DetResizeForTest:
145
+ # image_shape: [1280, 1280]
146
+ # keep_ratio: True
147
+ # padding: True
148
+ limit_side_len: 960
149
+ limit_type: max
150
+ - NormalizeImage:
151
+ scale: 1./255.
152
+ mean:
153
+ - 0.485
154
+ - 0.456
155
+ - 0.406
156
+ std:
157
+ - 0.229
158
+ - 0.224
159
+ - 0.225
160
+ order: hwc
161
+ - ToCHWImage: null
162
+ - KeepKeys:
163
+ keep_keys:
164
+ - image
165
+ - shape
166
+ - polys
167
+ - ignore_tags
168
+ loader:
169
+ shuffle: false
170
+ drop_last: false
171
+ batch_size_per_card: 1
172
+ num_workers: 2
173
+ profiler_options: null
configs/rec/abinet/resnet45_trans_abinet_lang.yml ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet45_trans_abinet_lang/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ # ./openocr_nolang_abinet_lang.pth
12
+ checkpoints:
13
+ use_tensorboard: false
14
+ infer_img:
15
+ # for data or label process
16
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
17
+ max_text_length: 25
18
+ use_space_char: False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_abinet_lang.txt
20
+ grad_clip_val: 20
21
+ use_amp: True
22
+
23
+ Optimizer:
24
+ name: Adam
25
+ lr: 0.000267
26
+ weight_decay: 0.0
27
+ filter_bias_and_bn: False
28
+
29
+ LRScheduler:
30
+ name: MultiStepLR
31
+ milestones: [12]
32
+ gamma: 0.1
33
+
34
+ Architecture:
35
+ model_type: rec
36
+ algorithm: ABINet
37
+ Transform:
38
+ Encoder:
39
+ name: ResNet45
40
+ in_channels: 3
41
+ strides: [2, 1, 2, 1, 1]
42
+ Decoder:
43
+ name: ABINetDecoder
44
+ iter_size: 3
45
+
46
+ Loss:
47
+ name: ABINetLoss
48
+
49
+ PostProcess:
50
+ name: ABINetLabelDecode
51
+
52
+ Metric:
53
+ name: RecMetric
54
+ main_indicator: acc
55
+ is_filter: True
56
+
57
+ Train:
58
+ dataset:
59
+ name: LMDBDataSet
60
+ data_dir: ../Union14M-L-LMDB-Filtered
61
+ transforms:
62
+ - DecodeImagePIL: # load image
63
+ img_mode: RGB
64
+ - PARSeqAugPIL:
65
+ - ABINetLabelEncode:
66
+ - RecTVResize:
67
+ image_shape: [32, 128]
68
+ padding: False
69
+ - KeepKeys:
70
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
71
+ loader:
72
+ shuffle: True
73
+ batch_size_per_card: 256
74
+ drop_last: True
75
+ num_workers: 4
76
+
77
+ Eval:
78
+ dataset:
79
+ name: LMDBDataSet
80
+ data_dir: ../evaluation
81
+ transforms:
82
+ - DecodeImagePIL: # load image
83
+ img_mode: RGB
84
+ - ABINetLabelEncode:
85
+ - RecTVResize:
86
+ image_shape: [32, 128]
87
+ padding: False
88
+ - KeepKeys:
89
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
90
+ loader:
91
+ shuffle: False
92
+ drop_last: False
93
+ batch_size_per_card: 256
94
+ num_workers: 2
configs/rec/abinet/resnet45_trans_abinet_wo_lang.yml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet45_trans_abinet_wo_lang/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_abinet_wo_lang.txt
19
+ grad_clip_val: 20
20
+ use_amp: True
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.000267
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: MultiStepLR
30
+ milestones: [12]
31
+ gamma: 0.1
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: ABINet
36
+ Transform:
37
+ Encoder:
38
+ name: ResNet45
39
+ in_channels: 3
40
+ strides: [2, 1, 2, 1, 1]
41
+ Decoder:
42
+ name: ABINetDecoder
43
+ iter_size: 0
44
+
45
+ Loss:
46
+ name: ABINetLoss
47
+
48
+ PostProcess:
49
+ name: ABINetLabelDecode
50
+
51
+ Metric:
52
+ name: RecMetric
53
+ main_indicator: acc
54
+ is_filter: True
55
+
56
+ Train:
57
+ dataset:
58
+ name: LMDBDataSet
59
+ data_dir: ../Union14M-L-LMDB-Filtered
60
+ transforms:
61
+ - DecodeImagePIL: # load image
62
+ img_mode: RGB
63
+ - PARSeqAugPIL:
64
+ - ABINetLabelEncode:
65
+ - RecTVResize:
66
+ image_shape: [32, 128]
67
+ padding: False
68
+ - KeepKeys:
69
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
70
+ loader:
71
+ shuffle: True
72
+ batch_size_per_card: 256
73
+ drop_last: True
74
+ num_workers: 4
75
+
76
+ Eval:
77
+ dataset:
78
+ name: LMDBDataSet
79
+ data_dir: ../evaluation
80
+ transforms:
81
+ - DecodeImagePIL: # load image
82
+ img_mode: RGB
83
+ - ABINetLabelEncode:
84
+ - RecTVResize:
85
+ image_shape: [32, 128]
86
+ padding: False
87
+ - KeepKeys:
88
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
89
+ loader:
90
+ shuffle: False
91
+ drop_last: False
92
+ batch_size_per_card: 256
93
+ num_workers: 2
configs/rec/abinet/svtrv2_abinet_lang.yml ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_abinet_lang/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ # ./openocr_svtrv2_nolang_abinet_lang.pth
12
+ checkpoints:
13
+ use_tensorboard: false
14
+ infer_img:
15
+ # for data or label process
16
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
17
+ max_text_length: 25
18
+ use_space_char: False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_abinet_lang.txt
20
+ use_amp: True
21
+ grad_clip_val: 20
22
+
23
+ Optimizer:
24
+ name: AdamW
25
+ lr: 0.00065 # for 4gpus bs256/gpu
26
+ weight_decay: 0.05
27
+ filter_bias_and_bn: True
28
+
29
+ LRScheduler:
30
+ name: OneCycleLR
31
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
32
+ cycle_momentum: False
33
+
34
+ Architecture:
35
+ model_type: rec
36
+ algorithm: ABINet
37
+ Transform:
38
+ Encoder:
39
+ name: SVTRv2LNConvTwo33
40
+ use_pos_embed: False
41
+ dims: [128, 256, 384]
42
+ depths: [6, 6, 6]
43
+ num_heads: [4, 8, 12]
44
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
45
+ local_k: [[5, 5], [5, 5], [-1, -1]]
46
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
47
+ last_stage: false
48
+ feat2d: True
49
+ Decoder:
50
+ name: ABINetDecoder
51
+ iter_size: 3
52
+ num_layers: 0
53
+
54
+ Loss:
55
+ name: ABINetLoss
56
+
57
+ PostProcess:
58
+ name: ABINetLabelDecode
59
+
60
+ Metric:
61
+ name: RecMetric
62
+ main_indicator: acc
63
+ is_filter: True
64
+
65
+ Train:
66
+ dataset:
67
+ name: RatioDataSetTVResize
68
+ ds_width: True
69
+ padding: false
70
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
71
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
72
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
73
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
75
+ ]
76
+ transforms:
77
+ - DecodeImagePIL: # load image
78
+ img_mode: RGB
79
+ - PARSeqAugPIL:
80
+ - ABINetLabelEncode:
81
+ - KeepKeys:
82
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
83
+ sampler:
84
+ name: RatioSampler
85
+ scales: [[128, 32]] # w, h
86
+ # divided_factor: ensures the width and height are divisible by the downsampling multiple
87
+ first_bs: &bs 256
88
+ fix_bs: false
89
+ divided_factor: [4, 16] # w, h
90
+ is_training: True
91
+ loader:
92
+ shuffle: True
93
+ batch_size_per_card: *bs
94
+ drop_last: True
95
+ max_ratio: &max_ratio 4
96
+ num_workers: 4
97
+
98
+ Eval:
99
+ dataset:
100
+ name: RatioDataSetTVResize
101
+ ds_width: True
102
+ padding: False
103
+ data_dir_list: [
104
+ '../evaluation/CUTE80',
105
+ '../evaluation/IC13_857',
106
+ '../evaluation/IC15_1811',
107
+ '../evaluation/IIIT5k',
108
+ '../evaluation/SVT',
109
+ '../evaluation/SVTP',
110
+ ]
111
+ transforms:
112
+ - DecodeImagePIL: # load image
113
+ img_mode: RGB
114
+ - ABINetLabelEncode:
115
+ - KeepKeys:
116
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
117
+ sampler:
118
+ name: RatioSampler
119
+ scales: [[128, 32]] # w, h
120
+ # divided_factor: ensures the width and height are divisible by the downsampling multiple
121
+ first_bs: *bs
122
+ fix_bs: false
123
+ divided_factor: [4, 16] # w, h
124
+ is_training: False
125
+ loader:
126
+ shuffle: False
127
+ drop_last: False
128
+ batch_size_per_card: *bs
129
+ max_ratio: *max_ratio
130
+ num_workers: 4
configs/rec/abinet/svtrv2_abinet_wo_lang.yml ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_abinet_wo_lang/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_abinet_wo_lang.txt
19
+ use_amp: True
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065 # for 4gpus bs256/gpu
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: ABINet
36
+ Transform:
37
+ Encoder:
38
+ name: SVTRv2LNConvTwo33
39
+ use_pos_embed: False
40
+ dims: [128, 256, 384]
41
+ depths: [6, 6, 6]
42
+ num_heads: [4, 8, 12]
43
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
44
+ local_k: [[5, 5], [5, 5], [-1, -1]]
45
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
46
+ last_stage: false
47
+ feat2d: True
48
+ Decoder:
49
+ name: ABINetDecoder
50
+ iter_size: 0
51
+ num_layers: 0
52
+ Loss:
53
+ name: ABINetLoss
54
+
55
+ PostProcess:
56
+ name: ABINetLabelDecode
57
+
58
+ Metric:
59
+ name: RecMetric
60
+ main_indicator: acc
61
+ is_filter: True
62
+
63
+ Train:
64
+ dataset:
65
+ name: RatioDataSetTVResize
66
+ ds_width: True
67
+ padding: false
68
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
69
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
70
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
71
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
72
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
73
+ ]
74
+ transforms:
75
+ - DecodeImagePIL: # load image
76
+ img_mode: RGB
77
+ - PARSeqAugPIL:
78
+ - ABINetLabelEncode:
79
+ - KeepKeys:
80
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
81
+ sampler:
82
+ name: RatioSampler
83
+ scales: [[128, 32]] # w, h
84
+ # divided_factor: ensures the width and height are divisible by the downsampling multiple
85
+ first_bs: &bs 256
86
+ fix_bs: false
87
+ divided_factor: [4, 16] # w, h
88
+ is_training: True
89
+ loader:
90
+ shuffle: True
91
+ batch_size_per_card: *bs
92
+ drop_last: True
93
+ max_ratio: &max_ratio 4
94
+ num_workers: 4
95
+
96
+ Eval:
97
+ dataset:
98
+ name: RatioDataSetTVResize
99
+ ds_width: True
100
+ padding: False
101
+ data_dir_list: [
102
+ '../evaluation/CUTE80',
103
+ '../evaluation/IC13_857',
104
+ '../evaluation/IC15_1811',
105
+ '../evaluation/IIIT5k',
106
+ '../evaluation/SVT',
107
+ '../evaluation/SVTP',
108
+ ]
109
+ transforms:
110
+ - DecodeImagePIL: # load image
111
+ img_mode: RGB
112
+ - ABINetLabelEncode:
113
+ - KeepKeys:
114
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
115
+ sampler:
116
+ name: RatioSampler
117
+ scales: [[128, 32]] # w, h
118
+ # divided_factor: ensures the width and height are divisible by the downsampling multiple
119
+ first_bs: *bs
120
+ fix_bs: false
121
+ divided_factor: [4, 16] # w, h
122
+ is_training: False
123
+ loader:
124
+ shuffle: False
125
+ drop_last: False
126
+ batch_size_per_card: *bs
127
+ max_ratio: *max_ratio
128
+ num_workers: 4
configs/rec/aster/resnet31_lstm_aster_tps_on.yml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet31_lstm_aster_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/predicts_aster_tps.txt
19
+ use_amp: True
20
+ grad_clip_val: 1.0
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.002 # for 1gpus bs1024/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: aster
36
+ Transform:
37
+ name: Aster_TPS
38
+ tps_inputsize: [32, 64]
39
+ tps_outputsize: [32, 128]
40
+ Encoder:
41
+ name: ResNet_ASTER
42
+ Decoder:
43
+ name: ASTERDecoder
44
+
45
+ Loss:
46
+ name: ARLoss
47
+
48
+ Metric:
49
+ name: RecMetric
50
+ main_indicator: acc
51
+ is_filter: True
52
+
53
+ PostProcess:
54
+ name: ARLabelDecode
55
+
56
+ Train:
57
+ dataset:
58
+ name: LMDBDataSet
59
+ data_dir: ../Union14M-L-LMDB-Filtered
60
+ transforms:
61
+ - DecodeImagePIL: # load image
62
+ img_mode: RGB
63
+ - PARSeqAugPIL:
64
+ - ARLabelEncode: # Class handling label
65
+ - RecTVResize:
66
+ image_shape: [64, 256]
67
+ padding: False
68
+ - KeepKeys:
69
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
70
+ loader:
71
+ shuffle: True
72
+ batch_size_per_card: 1024
73
+ drop_last: True
74
+ num_workers: 4
75
+
76
+ Eval:
77
+ dataset:
78
+ name: LMDBDataSet
79
+ data_dir: ../evaluation
80
+ transforms:
81
+ - DecodeImagePIL: # load image
82
+ img_mode: RGB
83
+ - ARLabelEncode: # Class handling label
84
+ - RecTVResize:
85
+ image_shape: [64, 256]
86
+ padding: False
87
+ - KeepKeys:
88
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
89
+ loader:
90
+ shuffle: False
91
+ drop_last: False
92
+ batch_size_per_card: 256
93
+ num_workers: 2
configs/rec/aster/svtrv2_aster.yml ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_aster
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_aster.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.00065 # for 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+
27
+ LRScheduler:
28
+ name: OneCycleLR
29
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
30
+ cycle_momentum: False
31
+
32
+ Architecture:
33
+ model_type: rec
34
+ algorithm: aster
35
+ Transform:
36
+ Encoder:
37
+ name: SVTRv2LNConvTwo33
38
+ use_pos_embed: False
39
+ out_channels: 256
40
+ dims: [128, 256, 384]
41
+ depths: [6, 6, 6]
42
+ num_heads: [4, 8, 12]
43
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
44
+ local_k: [[5, 5], [5, 5], [-1, -1]]
45
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
46
+ last_stage: false
47
+ feat2d: False
48
+ Decoder:
49
+ name: ASTERDecoder
50
+
51
+ Loss:
52
+ name: ARLoss
53
+
54
+ Metric:
55
+ name: RecMetric
56
+ main_indicator: acc
57
+ is_filter: True
58
+
59
+ PostProcess:
60
+ name: ARLabelDecode
61
+
62
+ Train:
63
+ dataset:
64
+ name: RatioDataSetTVResize
65
+ ds_width: True
66
+ padding: false
67
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
68
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
69
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
70
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
71
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
72
+ ]
73
+ transforms:
74
+ - DecodeImagePIL: # load image
75
+ img_mode: RGB
76
+ - PARSeqAugPIL:
77
+ - ARLabelEncode: # Class handling label
78
+ - KeepKeys:
79
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
80
+ sampler:
81
+ name: RatioSampler
82
+ scales: [[128, 32]] # w, h
83
+ # divided_factor: ensures the width and height are divisible by the downsampling multiple
84
+ first_bs: &bs 256
85
+ fix_bs: false
86
+ divided_factor: [4, 16] # w, h
87
+ is_training: True
88
+ loader:
89
+ shuffle: True
90
+ batch_size_per_card: *bs
91
+ drop_last: True
92
+ max_ratio: &max_ratio 4
93
+ num_workers: 4
94
+
95
+ Eval:
96
+ dataset:
97
+ name: RatioDataSetTVResize
98
+ ds_width: True
99
+ padding: False
100
+ data_dir_list: [
101
+ '../evaluation/CUTE80',
102
+ '../evaluation/IC13_857',
103
+ '../evaluation/IC15_1811',
104
+ '../evaluation/IIIT5k',
105
+ '../evaluation/SVT',
106
+ '../evaluation/SVTP',
107
+ ]
108
+ transforms:
109
+ - DecodeImagePIL: # load image
110
+ img_mode: RGB
111
+ - ARLabelEncode: # Class handling label
112
+ - KeepKeys:
113
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
114
+ sampler:
115
+ name: RatioSampler
116
+ scales: [[128, 32]] # w, h
117
+ # divided_factor: ensures the width and height are divisible by the downsampling multiple
118
+ first_bs: *bs
119
+ fix_bs: false
120
+ divided_factor: [4, 16] # w, h
121
+ is_training: False
122
+ loader:
123
+ shuffle: False
124
+ drop_last: False
125
+ batch_size_per_card: *bs
126
+ max_ratio: *max_ratio
127
+ num_workers: 4
configs/rec/aster/svtrv2_aster_tps_on.yml ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_aster_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_aster_tps_on.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.00065 # for 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+
27
+ LRScheduler:
28
+ name: OneCycleLR
29
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
30
+ cycle_momentum: False
31
+
32
+ Architecture:
33
+ model_type: rec
34
+ algorithm: aster
35
+ Transform:
36
+ name: Aster_TPS
37
+ tps_inputsize: [32, 64]
38
+ tps_outputsize: [32, 128]
39
+ Encoder:
40
+ name: SVTRv2LNConvTwo33
41
+ use_pos_embed: False
42
+ out_channels: 256
43
+ dims: [128, 256, 384]
44
+ depths: [6, 6, 6]
45
+ num_heads: [4, 8, 12]
46
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
47
+ local_k: [[5, 5], [5, 5], [-1, -1]]
48
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
49
+ last_stage: false
50
+ feat2d: False
51
+ Decoder:
52
+ name: ASTERDecoder
53
+
54
+ Loss:
55
+ name: ARLoss
56
+
57
+ Metric:
58
+ name: RecMetric
59
+ main_indicator: acc
60
+ is_filter: True
61
+
62
+ PostProcess:
63
+ name: ARLabelDecode
64
+
65
+ Train:
66
+ dataset:
67
+ name: LMDBDataSet
68
+ data_dir: ../Union14M-L-LMDB-Filtered
69
+ transforms:
70
+ - DecodeImagePIL: # load image
71
+ img_mode: RGB
72
+ - PARSeqAugPIL:
73
+ - ARLabelEncode: # Class handling label
74
+ - RecTVResize:
75
+ image_shape: [64, 256]
76
+ padding: False
77
+ - KeepKeys:
78
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
79
+ loader:
80
+ shuffle: True
81
+ batch_size_per_card: 256
82
+ drop_last: True
83
+ num_workers: 4
84
+
85
+ Eval:
86
+ dataset:
87
+ name: LMDBDataSet
88
+ data_dir: ../evaluation
89
+ transforms:
90
+ - DecodeImagePIL: # load image
91
+ img_mode: RGB
92
+ - ARLabelEncode: # Class handling label
93
+ - RecTVResize:
94
+ image_shape: [64, 256]
95
+ padding: False
96
+ - KeepKeys:
97
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
98
+ loader:
99
+ shuffle: False
100
+ drop_last: False
101
+ batch_size_per_card: 256
102
+ num_workers: 2
configs/rec/autostr/autostr_lstm_aster_tps_on.yml ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/autostr_lstm_aster_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_autostr_lstm_aster_tps_on.txt
19
+ use_amp: True
20
+ grad_clip_val: 1.0
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.002 # for 4gpus bs256/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: autostr
36
+ Transform:
37
+ name: Aster_TPS
38
+ tps_inputsize: [32, 64]
39
+ tps_outputsize: [32, 128]
40
+ Encoder:
41
+ name: AutoSTREncoder
42
+ stride_stages: '[(2, 2), (2, 1), (2, 2), (2, 1), (2, 1)]'
43
+ conv_op_ids: [2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 4, 1, 1, 6, 6]
44
+ Decoder:
45
+ name: ASTERDecoder
46
+
47
+ Loss:
48
+ name: ARLoss
49
+
50
+ Metric:
51
+ name: RecMetric
52
+ main_indicator: acc
53
+ is_filter: True
54
+
55
+ PostProcess:
56
+ name: ARLabelDecode
57
+
58
+ Train:
59
+ dataset:
60
+ name: LMDBDataSet
61
+ data_dir: ../Union14M-L-LMDB-Filtered
62
+ transforms:
63
+ - DecodeImagePIL: # load image
64
+ img_mode: RGB
65
+ - PARSeqAugPIL:
66
+ - ARLabelEncode: # Class handling label
67
+ - RecTVResize:
68
+ image_shape: [64, 256]
69
+ padding: False
70
+ - KeepKeys:
71
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
72
+ loader:
73
+ shuffle: True
74
+ batch_size_per_card: 256
75
+ drop_last: True
76
+ num_workers: 4
77
+
78
+ Eval:
79
+ dataset:
80
+ name: LMDBDataSet
81
+ data_dir: ../evaluation
82
+ transforms:
83
+ - DecodeImagePIL: # load image
84
+ img_mode: RGB
85
+ - ARLabelEncode: # Class handling label
86
+ - RecTVResize:
87
+ image_shape: [64, 256]
88
+ padding: False
89
+ - KeepKeys:
90
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
91
+ loader:
92
+ shuffle: False
93
+ drop_last: False
94
+ batch_size_per_card: 256
95
+ num_workers: 2
configs/rec/busnet/svtrv2_busnet.yml ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 10
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_busnet/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ # ./output/rec/u14m_filter/svtrv2_busnet_pretraining/best.pth
12
+ checkpoints:
13
+ use_tensorboard: false
14
+ infer_img:
15
+ # for data or label process
16
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
17
+ max_text_length: 25
18
+ use_space_char: False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_busnet.txt
20
+ use_amp: True
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065 # 4gpus bs256/gpu
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1 # pct_start 0.1*10 = 1ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: BUSBet
36
+ Transform:
37
+ Encoder:
38
+ name: SVTRv2LNConvTwo33
39
+ use_pos_embed: False
40
+ dims: [128, 256, 384]
41
+ depths: [6, 6, 6]
42
+ num_heads: [4, 8, 12]
43
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
44
+ local_k: [[5, 5], [5, 5], [-1, -1]]
45
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
46
+ last_stage: false
47
+ feat2d: False
48
+ Decoder:
49
+ name: BUSDecoder
50
+ nhead: 6
51
+ num_layers: 6
52
+ dim_feedforward: 1536
53
+ ignore_index: &ignore_index 100
54
+ pretraining: False
55
+ # return_id: 2
56
+ Loss:
57
+ name: ABINetLoss
58
+ ignore_index: *ignore_index
59
+
60
+ PostProcess:
61
+ name: ABINetLabelDecode
62
+
63
+ Metric:
64
+ name: RecMetric
65
+ main_indicator: acc
66
+ is_filter: True
67
+
68
+ Train:
69
+ dataset:
70
+ name: RatioDataSetTVResize
71
+ ds_width: True
72
+ padding: false
73
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
75
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
76
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
77
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
78
+ ]
79
+ transforms:
80
+ - DecodeImagePIL: # load image
81
+ img_mode: RGB
82
+ - PARSeqAugPIL:
83
+ - ABINetLabelEncode:
84
+ ignore_index: *ignore_index
85
+ - KeepKeys:
86
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
87
+ sampler:
88
+ name: RatioSampler
89
+ scales: [[128, 32]] # w, h
90
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
91
+ first_bs: &bs 256
92
+ fix_bs: false
93
+ divided_factor: [4, 16] # w, h
94
+ is_training: True
95
+ loader:
96
+ shuffle: True
97
+ batch_size_per_card: *bs
98
+ drop_last: True
99
+ max_ratio: &max_ratio 4
100
+ num_workers: 4
101
+
102
+ Eval:
103
+ dataset:
104
+ name: RatioDataSetTVResize
105
+ ds_width: True
106
+ padding: False
107
+ data_dir_list: [
108
+ '../evaluation/CUTE80',
109
+ '../evaluation/IC13_857',
110
+ '../evaluation/IC15_1811',
111
+ '../evaluation/IIIT5k',
112
+ '../evaluation/SVT',
113
+ '../evaluation/SVTP',
114
+ ]
115
+ transforms:
116
+ - DecodeImagePIL: # load image
117
+ img_mode: RGB
118
+ - ABINetLabelEncode:
119
+ ignore_index: *ignore_index
120
+ - KeepKeys:
121
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
122
+ sampler:
123
+ name: RatioSampler
124
+ scales: [[128, 32]] # w, h
125
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
126
+ first_bs: *bs
127
+ fix_bs: false
128
+ divided_factor: [4, 16] # w, h
129
+ is_training: False
130
+ loader:
131
+ shuffle: False
132
+ drop_last: False
133
+ batch_size_per_card: *bs
134
+ max_ratio: *max_ratio
135
+ num_workers: 4
configs/rec/busnet/svtrv2_busnet_pretraining.yml ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 10
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_busnet_pretraining/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_busnet_pretraining.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.00065 # 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+
27
+ LRScheduler:
28
+ name: OneCycleLR
29
+ warmup_epoch: 1 # pct_start 0.1*10 = 1ep
30
+ cycle_momentum: False
31
+
32
+ Architecture:
33
+ model_type: rec
34
+ algorithm: BUSBet
35
+ Transform:
36
+ Encoder:
37
+ name: SVTRv2LNConvTwo33
38
+ use_pos_embed: False
39
+ dims: [128, 256, 384]
40
+ depths: [6, 6, 6]
41
+ num_heads: [4, 8, 12]
42
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
43
+ local_k: [[5, 5], [5, 5], [-1, -1]]
44
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
45
+ last_stage: false
46
+ feat2d: False
47
+ Decoder:
48
+ name: BUSDecoder
49
+ nhead: 6
50
+ num_layers: 6
51
+ dim_feedforward: 1536
52
+ ignore_index: &ignore_index 100
53
+ pretraining: True
54
+ # return_id: 0
55
+ Loss:
56
+ name: ABINetLoss
57
+ ignore_index: *ignore_index
58
+
59
+ PostProcess:
60
+ name: ABINetLabelDecode
61
+
62
+ Metric:
63
+ name: RecMetric
64
+ main_indicator: acc
65
+ is_filter: True
66
+
67
+ Train:
68
+ dataset:
69
+ name: RatioDataSetTVResize
70
+ ds_width: True
71
+ padding: false
72
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
73
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
75
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
76
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
77
+ ]
78
+ transforms:
79
+ - DecodeImagePIL: # load image
80
+ img_mode: RGB
81
+ - PARSeqAugPIL:
82
+ - ABINetLabelEncode:
83
+ ignore_index: *ignore_index
84
+ - KeepKeys:
85
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
86
+ sampler:
87
+ name: RatioSampler
88
+ scales: [[128, 32]] # w, h
89
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
90
+ first_bs: &bs 256
91
+ fix_bs: false
92
+ divided_factor: [4, 16] # w, h
93
+ is_training: True
94
+ loader:
95
+ shuffle: True
96
+ batch_size_per_card: *bs
97
+ drop_last: True
98
+ max_ratio: &max_ratio 4
99
+ num_workers: 4
100
+
101
+ Eval:
102
+ dataset:
103
+ name: RatioDataSetTVResize
104
+ ds_width: True
105
+ padding: False
106
+ data_dir_list: [
107
+ '../evaluation/CUTE80',
108
+ '../evaluation/IC13_857',
109
+ '../evaluation/IC15_1811',
110
+ '../evaluation/IIIT5k',
111
+ '../evaluation/SVT',
112
+ '../evaluation/SVTP',
113
+ ]
114
+ transforms:
115
+ - DecodeImagePIL: # load image
116
+ img_mode: RGB
117
+ - ABINetLabelEncode:
118
+ ignore_index: *ignore_index
119
+ - KeepKeys:
120
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
121
+ sampler:
122
+ name: RatioSampler
123
+ scales: [[128, 32]] # w, h
124
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
125
+ first_bs: *bs
126
+ fix_bs: false
127
+ divided_factor: [4, 16] # w, h
128
+ is_training: False
129
+ loader:
130
+ shuffle: False
131
+ drop_last: False
132
+ batch_size_per_card: *bs
133
+ max_ratio: *max_ratio
134
+ num_workers: 4
configs/rec/busnet/vit_busnet.yml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 10
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/vit_busnet/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_vit_busnet.txt
19
+ grad_clip_val: 20
20
+ use_amp: True
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.00053 # 4gpus bs256/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: MultiStepLR
30
+ milestones: [6]
31
+ gamma: 0.1
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: BUSBet
36
+ Transform:
37
+ Encoder:
38
+ name: ViT
39
+ img_size: [32,128]
40
+ patch_size: [4, 8]
41
+ embed_dim: 384
42
+ depth: 12
43
+ num_heads: 6
44
+ mlp_ratio: 4
45
+ qkv_bias: True
46
+ Decoder:
47
+ name: BUSDecoder
48
+ nhead: 6
49
+ num_layers: 6
50
+ dim_feedforward: 1536
51
+ ignore_index: &ignore_index 100
52
+ pretraining: False
53
+ Loss:
54
+ name: ABINetLoss
55
+ ignore_index: *ignore_index
56
+
57
+ PostProcess:
58
+ name: ABINetLabelDecode
59
+
60
+ Metric:
61
+ name: RecMetric
62
+ main_indicator: acc
63
+ is_filter: True
64
+
65
+ Train:
66
+ dataset:
67
+ name: LMDBDataSet
68
+ data_dir: ../Union14M-L-LMDB-Filtered
69
+ transforms:
70
+ - DecodeImagePIL: # load image
71
+ img_mode: RGB
72
+ - PARSeqAugPIL:
73
+ - ABINetLabelEncode:
74
+ ignore_index: *ignore_index
75
+ - RecTVResize:
76
+ image_shape: [32, 128]
77
+ padding: False
78
+ - KeepKeys:
79
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
80
+ loader:
81
+ shuffle: True
82
+ batch_size_per_card: 256
83
+ drop_last: True
84
+ num_workers: 4
85
+
86
+ Eval:
87
+ dataset:
88
+ name: LMDBDataSet
89
+ data_dir: ../evaluation
90
+ transforms:
91
+ - DecodeImagePIL: # load image
92
+ img_mode: RGB
93
+ - ABINetLabelEncode:
94
+ ignore_index: *ignore_index
95
+ - RecTVResize:
96
+ image_shape: [32, 128]
97
+ padding: False
98
+ - KeepKeys:
99
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
100
+ loader:
101
+ shuffle: False
102
+ drop_last: False
103
+ batch_size_per_card: 256
104
+ num_workers: 2
configs/rec/busnet/vit_busnet_pretraining.yml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 10
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/vit_busnet_pretraining/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_vit_busnet_pretraining.txt
19
+ grad_clip_val: 20
20
+ use_amp: True
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.00053 # 4gpus bs256/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: MultiStepLR
30
+ milestones: [6]
31
+ gamma: 0.1
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: BUSBet
36
+ Transform:
37
+ Encoder:
38
+ name: ViT
39
+ img_size: [32,128]
40
+ patch_size: [4, 8]
41
+ embed_dim: 384
42
+ depth: 12
43
+ num_heads: 6
44
+ mlp_ratio: 4
45
+ qkv_bias: True
46
+ Decoder:
47
+ name: BUSDecoder
48
+ nhead: 6
49
+ num_layers: 6
50
+ dim_feedforward: 1536
51
+ ignore_index: &ignore_index 100
52
+ pretraining: True
53
+ Loss:
54
+ name: ABINetLoss
55
+ ignore_index: *ignore_index
56
+
57
+ PostProcess:
58
+ name: ABINetLabelDecode
59
+
60
+ Metric:
61
+ name: RecMetric
62
+ main_indicator: acc
63
+ is_filter: True
64
+
65
+ Train:
66
+ dataset:
67
+ name: LMDBDataSet
68
+ data_dir: ../Union14M-L-LMDB-Filtered
69
+ transforms:
70
+ - DecodeImagePIL: # load image
71
+ img_mode: RGB
72
+ - PARSeqAugPIL:
73
+ - ABINetLabelEncode:
74
+ ignore_index: *ignore_index
75
+ - RecTVResize:
76
+ image_shape: [32, 128]
77
+ padding: False
78
+ - KeepKeys:
79
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
80
+ loader:
81
+ shuffle: True
82
+ batch_size_per_card: 256
83
+ drop_last: True
84
+ num_workers: 4
85
+
86
+ Eval:
87
+ dataset:
88
+ name: LMDBDataSet
89
+ data_dir: ../evaluation
90
+ transforms:
91
+ - DecodeImagePIL: # load image
92
+ img_mode: RGB
93
+ - ABINetLabelEncode:
94
+ ignore_index: *ignore_index
95
+ - RecTVResize:
96
+ image_shape: [32, 128]
97
+ padding: False
98
+ - KeepKeys:
99
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
100
+ loader:
101
+ shuffle: False
102
+ drop_last: False
103
+ batch_size_per_card: 256
104
+ num_workers: 2
configs/rec/cam/convnextv2_cam_tps_on.yml ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/convnextv2_cam_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: False
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_convnextv2_cam_tps_on.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.0008 # for 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+ eps: 1.e-8
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: CAM
36
+ Transform:
37
+ name: Aster_TPS
38
+ tps_inputsize: [32, 64]
39
+ tps_outputsize: &img_shape [32, 128]
40
+ Encoder:
41
+ name: CAMEncoder
42
+ encoder_config:
43
+ name: ConvNeXtV2
44
+ depths: [2, 2, 8, 2]
45
+ dims: [80, 160, 320, 640]
46
+ strides: [[4,4], [2,1], [2,1], [1,1]]
47
+ drop_path_rate: 0.2
48
+ feat2d: True
49
+ nb_classes: 97
50
+ strides: [[4,4], [2,1], [2,1], [1,1]]
51
+ deform_stride: 2
52
+ stage_idx: 2
53
+ use_depthwise_unet: True
54
+ use_more_unet: False
55
+ binary_loss_type: BanlanceMultiClassCrossEntropyLoss
56
+ mid_size: True
57
+ d_embedding: 384
58
+ Decoder:
59
+ name: CAMDecoder
60
+ num_encoder_layers: -1
61
+ beam_size: 0
62
+ num_decoder_layers: 2
63
+ nhead: 8
64
+ max_len: *max_text_length
65
+
66
+ Loss:
67
+ name: CAMLoss
68
+ loss_weight_binary: 1.5
69
+ label_smoothing: 0.
70
+
71
+ Metric:
72
+ name: RecMetric
73
+ main_indicator: acc
74
+ is_filter: True
75
+
76
+ PostProcess:
77
+ name: ARLabelDecode
78
+
79
+ Train:
80
+ dataset:
81
+ name: LMDBDataSet
82
+ data_dir: ../Union14M-L-LMDB-Filtered
83
+ transforms:
84
+ - DecodeImagePIL: # load image
85
+ img_mode: RGB
86
+ - PARSeqAugPIL:
87
+ - CAMLabelEncode: # Class handling label
88
+ font_path: ./arial.ttf
89
+ image_shape: *img_shape
90
+ - RecTVResize:
91
+ image_shape: [64, 256]
92
+ padding: False
93
+ - KeepKeys:
94
+ keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
95
+ loader:
96
+ shuffle: True
97
+ batch_size_per_card: 256
98
+ drop_last: True
99
+ num_workers: 4
100
+
101
+ Eval:
102
+ dataset:
103
+ name: LMDBDataSet
104
+ data_dir: ../evaluation
105
+ transforms:
106
+ - DecodeImagePIL: # load image
107
+ img_mode: RGB
108
+ - ARLabelEncode: # Class handling label
109
+ - RecTVResize:
110
+ image_shape: [64, 256]
111
+ padding: False
112
+ - KeepKeys:
113
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
114
+ loader:
115
+ shuffle: False
116
+ drop_last: False
117
+ batch_size_per_card: 256
118
+ num_workers: 2
configs/rec/cam/convnextv2_tiny_cam_tps_on.yml ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/convnextv2_tiny_cam_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: False
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_convnextv2_cam_tps_on.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.0008 # for 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+ eps: 1.e-8
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: CAM
36
+ Transform:
37
+ name: Aster_TPS
38
+ tps_inputsize: [32, 64]
39
+ tps_outputsize: &img_shape [32, 128]
40
+ Encoder:
41
+ name: CAMEncoder
42
+ encoder_config:
43
+ name: ConvNeXtV2
44
+ depths: [3, 3, 9, 3]
45
+ dims: [96, 192, 384, 768]
46
+ strides: [[4,4], [2,1], [2,1], [1,1]]
47
+ drop_path_rate: 0.2
48
+ feat2d: True
49
+ nb_classes: 97
50
+ strides: [[4,4], [2,1], [2,1], [1,1]]
51
+ deform_stride: 2
52
+ stage_idx: 2
53
+ use_depthwise_unet: True
54
+ use_more_unet: False
55
+ binary_loss_type: BanlanceMultiClassCrossEntropyLoss
56
+ mid_size: False
57
+ d_embedding: 512
58
+ Decoder:
59
+ name: CAMDecoder
60
+ num_encoder_layers: -1
61
+ beam_size: 0
62
+ num_decoder_layers: 2
63
+ nhead: 8
64
+ max_len: *max_text_length
65
+
66
+ Loss:
67
+ name: CAMLoss
68
+ loss_weight_binary: 1.5
69
+ label_smoothing: 0.
70
+
71
+ Metric:
72
+ name: RecMetric
73
+ main_indicator: acc
74
+ is_filter: True
75
+
76
+ PostProcess:
77
+ name: ARLabelDecode
78
+
79
+ Train:
80
+ dataset:
81
+ name: LMDBDataSet
82
+ data_dir: ../Union14M-L-LMDB-Filtered
83
+ transforms:
84
+ - DecodeImagePIL: # load image
85
+ img_mode: RGB
86
+ - PARSeqAugPIL:
87
+ - CAMLabelEncode: # Class handling label
88
+ font_path: ./arial.ttf
89
+ image_shape: *img_shape
90
+ - RecTVResize:
91
+ image_shape: [64, 256]
92
+ padding: False
93
+ - KeepKeys:
94
+ keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
95
+ loader:
96
+ shuffle: True
97
+ batch_size_per_card: 256
98
+ drop_last: True
99
+ num_workers: 4
100
+
101
+ Eval:
102
+ dataset:
103
+ name: LMDBDataSet
104
+ data_dir: ../evaluation
105
+ transforms:
106
+ - DecodeImagePIL: # load image
107
+ img_mode: RGB
108
+ - ARLabelEncode: # Class handling label
109
+ - RecTVResize:
110
+ image_shape: [64, 256]
111
+ padding: False
112
+ - KeepKeys:
113
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
114
+ loader:
115
+ shuffle: False
116
+ drop_last: False
117
+ batch_size_per_card: 256
118
+ num_workers: 2
configs/rec/cam/svtrv2_cam_tps_on.yml ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_cam_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: False
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cam_tps_on.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.00065 # for 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+
27
+ LRScheduler:
28
+ name: OneCycleLR
29
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
30
+ cycle_momentum: False
31
+
32
+ Architecture:
33
+ model_type: rec
34
+ algorithm: CAM
35
+ Transform:
36
+ name: Aster_TPS
37
+ tps_inputsize: [32, 64]
38
+ tps_outputsize: &img_shape [32, 128]
39
+ Encoder:
40
+ name: CAMEncoder
41
+ encoder_config:
42
+ name: SVTRv2LNConvTwo33
43
+ use_pos_embed: False
44
+ dims: [128, 256, 384]
45
+ depths: [6, 6, 6]
46
+ num_heads: [4, 8, 12]
47
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
48
+ local_k: [[5, 5], [5, 5], [-1, -1]]
49
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
50
+ last_stage: false
51
+ feat2d: True
52
+ nb_classes: 97
53
+ strides: [[4, 4], [1, 1], [2, 1], [1, 1]]
54
+ k_size: [[2, 2], [1, 1], [2, 1], [1, 1]]
55
+ q_size: [4, 32]
56
+ deform_stride: 2
57
+ stage_idx: 2
58
+ use_depthwise_unet: True
59
+ use_more_unet: False
60
+ binary_loss_type: BanlanceMultiClassCrossEntropyLoss
61
+ mid_size: True
62
+ d_embedding: 384
63
+ Decoder:
64
+ name: CAMDecoder
65
+ num_encoder_layers: -1
66
+ beam_size: 0
67
+ num_decoder_layers: 2
68
+ nhead: 8
69
+ max_len: *max_text_length
70
+
71
+ Loss:
72
+ name: CAMLoss
73
+ loss_weight_binary: 1.5
74
+ label_smoothing: 0.
75
+
76
+ Metric:
77
+ name: RecMetric
78
+ main_indicator: acc
79
+ is_filter: True
80
+
81
+ PostProcess:
82
+ name: ARLabelDecode
83
+
84
+ Train:
85
+ dataset:
86
+ name: LMDBDataSet
87
+ data_dir: ../Union14M-L-LMDB-Filtered
88
+ transforms:
89
+ - DecodeImagePIL: # load image
90
+ img_mode: RGB
91
+ - PARSeqAugPIL:
92
+ - CAMLabelEncode: # Class handling label
93
+ font_path: ./arial.ttf
94
+ image_shape: *img_shape
95
+ - RecTVResize:
96
+ image_shape: [64, 256]
97
+ padding: False
98
+ - KeepKeys:
99
+ keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
100
+ loader:
101
+ shuffle: True
102
+ batch_size_per_card: 256
103
+ drop_last: True
104
+ num_workers: 4
105
+
106
+ Eval:
107
+ dataset:
108
+ name: LMDBDataSet
109
+ data_dir: ../evaluation
110
+ transforms:
111
+ - DecodeImagePIL: # load image
112
+ img_mode: RGB
113
+ - ARLabelEncode: # Class handling label
114
+ - RecTVResize:
115
+ image_shape: [64, 256]
116
+ padding: False
117
+ - KeepKeys:
118
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
119
+ loader:
120
+ shuffle: False
121
+ drop_last: False
122
+ batch_size_per_card: 256
123
+ num_workers: 2
configs/rec/cdistnet/resnet45_trans_cdistnet.yml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet45_trans_cdistnet
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_cdistnet.txt
19
+ use_amp: True
20
+ grad_clip_val: 5
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.002 # for 4gpus bs256/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: CDistNet
36
+ Transform:
37
+ Encoder:
38
+ name: ResNet45
39
+ in_channels: 3
40
+ strides: [2, 1, 2, 1, 1]
41
+ Decoder:
42
+ name: CDistNetDecoder
43
+ add_conv: True
44
+
45
+ Loss:
46
+ name: ARLoss
47
+
48
+ PostProcess:
49
+ name: ARLabelDecode
50
+
51
+ Metric:
52
+ name: RecMetric
53
+ main_indicator: acc
54
+ is_filter: True
55
+
56
+ Train:
57
+ dataset:
58
+ name: LMDBDataSet
59
+ data_dir: ../Union14M-L-LMDB-Filtered
60
+ transforms:
61
+ - DecodeImagePIL: # load image
62
+ img_mode: RGB
63
+ - PARSeqAugPIL:
64
+ - ARLabelEncode: # Class handling label
65
+ - RecTVResize:
66
+ image_shape: [32, 128]
67
+ padding: False
68
+ - KeepKeys:
69
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
70
+ loader:
71
+ shuffle: True
72
+ batch_size_per_card: 256
73
+ drop_last: True
74
+ num_workers: 4
75
+
76
+ Eval:
77
+ dataset:
78
+ name: LMDBDataSet
79
+ data_dir: ../evaluation
80
+ transforms:
81
+ - DecodeImagePIL: # load image
82
+ img_mode: RGB
83
+ - ARLabelEncode: # Class handling label
84
+ - RecTVResize:
85
+ image_shape: [32, 128]
86
+ padding: False
87
+ - KeepKeys:
88
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
89
+ loader:
90
+ shuffle: False
91
+ drop_last: False
92
+ batch_size_per_card: 256
93
+ num_workers: 2
configs/rec/cdistnet/svtrv2_cdistnet.yml ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_cdistnet/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
16
+ # ./tools/utils/ppocr_keys_v1.txt # ch
17
+ max_text_length: &max_text_length 25
18
+ use_space_char: &use_space_char False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cdistnet.txt
20
+ use_amp: True
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065 #4gpus bs256/gpu
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: CDistNet
36
+ in_channels: 3
37
+ Transform:
38
+ Encoder:
39
+ name: SVTRv2LNConvTwo33
40
+ use_pos_embed: False
41
+ out_channels: 256
42
+ dims: [128, 256, 384]
43
+ depths: [6, 6, 6]
44
+ num_heads: [4, 8, 12]
45
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
46
+ local_k: [[5, 5], [5, 5], [-1, -1]]
47
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
48
+ last_stage: false
49
+ feat2d: True
50
+ Decoder:
51
+ name: CDistNetDecoder
52
+ add_conv: False
53
+ num_encoder_blocks: 0
54
+
55
+ Loss:
56
+ name: ARLoss
57
+
58
+ PostProcess:
59
+ name: ARLabelDecode
60
+ character_dict_path: *character_dict_path
61
+ use_space_char: *use_space_char
62
+
63
+ Metric:
64
+ name: RecMetric
65
+ main_indicator: acc
66
+ is_filter: True
67
+
68
+ Train:
69
+ dataset:
70
+ name: RatioDataSetTVResize
71
+ ds_width: True
72
+ padding: false
73
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
75
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
76
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
77
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
78
+ ]
79
+ transforms:
80
+ - DecodeImagePIL: # load image
81
+ img_mode: RGB
82
+ - PARSeqAugPIL:
83
+ - ARLabelEncode: # Class handling label
84
+ character_dict_path: *character_dict_path
85
+ use_space_char: *use_space_char
86
+ max_text_length: *max_text_length
87
+ - KeepKeys:
88
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
89
+ sampler:
90
+ name: RatioSampler
91
+ scales: [[128, 32]] # w, h
92
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
93
+ first_bs: &bs 256
94
+ fix_bs: false
95
+ divided_factor: [4, 16] # w, h
96
+ is_training: True
97
+ loader:
98
+ shuffle: True
99
+ batch_size_per_card: *bs
100
+ drop_last: True
101
+ max_ratio: &max_ratio 4
102
+ num_workers: 4
103
+
104
+ Eval:
105
+ dataset:
106
+ name: RatioDataSetTVResize
107
+ ds_width: True
108
+ padding: False
109
+ data_dir_list: [
110
+ '../evaluation/CUTE80',
111
+ '../evaluation/IC13_857',
112
+ '../evaluation/IC15_1811',
113
+ '../evaluation/IIIT5k',
114
+ '../evaluation/SVT',
115
+ '../evaluation/SVTP',
116
+ ]
117
+ transforms:
118
+ - DecodeImagePIL: # load image
119
+ img_mode: RGB
120
+ - ARLabelEncode: # Class handling label
121
+ character_dict_path: *character_dict_path
122
+ use_space_char: *use_space_char
123
+ max_text_length: *max_text_length
124
+ - KeepKeys:
125
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
126
+ sampler:
127
+ name: RatioSampler
128
+ scales: [[128, 32]] # w, h
129
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
130
+ first_bs: *bs
131
+ fix_bs: false
132
+ divided_factor: [4, 16] # w, h
133
+ is_training: False
134
+ loader:
135
+ shuffle: False
136
+ drop_last: False
137
+ batch_size_per_card: *bs
138
+ max_ratio: *max_ratio
139
+ num_workers: 4
configs/rec/cppd/svtr_base_cppd.yml ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtr_base_cppd/
7
+ save_epoch_step: 1
8
+ # evaluation is run every 500 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path
18
+ # ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_cppd.txt
23
+ use_amp: True
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.00065 # for 4gpus bs256/gpu
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: OneCycleLR
33
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
34
+ cycle_momentum: False
35
+
36
+ Architecture:
37
+ model_type: rec
38
+ algorithm: CPPD
39
+ in_channels: 3
40
+ Transform:
41
+ Encoder:
42
+ name: SVTRNet
43
+ img_size: [32, 128]
44
+ out_char_num: 25
45
+ out_channels: 256
46
+ patch_merging: 'Conv'
47
+ embed_dim: [128, 256, 384]
48
+ depth: [6, 6, 6]
49
+ num_heads: [4, 8, 12]
50
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
51
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
52
+ last_stage: False
53
+ prenorm: True
54
+ Decoder:
55
+ name: CPPDDecoder
56
+ vis_seq: 64
57
+ num_layer: 2
58
+ pos_len: False
59
+ rec_layer: 1
60
+
61
+
62
+ Loss:
63
+ name: CPPDLoss
64
+ ignore_index: 100
65
+ smoothing: True
66
+ pos_len: False
67
+ sideloss_weight: 1.0
68
+
69
+ PostProcess:
70
+ name: CPPDLabelDecode
71
+ character_dict_path: *character_dict_path
72
+ use_space_char: *use_space_char
73
+
74
+ Metric:
75
+ name: RecMetric
76
+ main_indicator: acc
77
+
78
+ Train:
79
+ dataset:
80
+ name: LMDBDataSet
81
+ data_dir: ../Union14M-L-LMDB-Filtered
82
+ transforms:
83
+ - DecodeImagePIL: # load image
84
+ img_mode: RGB
85
+ - PARSeqAugPIL:
86
+ - CPPDLabelEncode: # Class handling label
87
+ pos_len: False
88
+ character_dict_path: *character_dict_path
89
+ use_space_char: *use_space_char
90
+ max_text_length: *max_text_length
91
+ - RecTVResize:
92
+ image_shape: [32, 128]
93
+ padding: False
94
+ - KeepKeys:
95
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
96
+ loader:
97
+ shuffle: True
98
+ batch_size_per_card: 256
99
+ drop_last: True
100
+ num_workers: 4
101
+
102
+ Eval:
103
+ dataset:
104
+ name: LMDBDataSet
105
+ data_dir: ../evaluation/
106
+ transforms:
107
+ - DecodeImagePIL: # load image
108
+ img_mode: RGB
109
+ - CPPDLabelEncode: # Class handling label
110
+ pos_len: False
111
+ character_dict_path: *character_dict_path
112
+ use_space_char: *use_space_char
113
+ max_text_length: *max_text_length
114
+ - RecTVResize:
115
+ image_shape: [32, 128]
116
+ padding: False
117
+ - KeepKeys:
118
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
119
+ loader:
120
+ shuffle: False
121
+ drop_last: False
122
+ batch_size_per_card: 128
123
+ num_workers: 4
configs/rec/cppd/svtr_base_cppd_ch.yml ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 100
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/ch/svtr_base_cppd/
7
+ save_epoch_step: 1
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 2000]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: False
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/ppocr_keys_v1.txt
18
+ # ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/ch/predicts_svtr_base_cppd.txt
23
+ use_amp: True
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.0005 # for 4gpus bs128/gpu
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: CosineAnnealingLR
33
+ warmup_epoch: 5
34
+
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: CPPD
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: SVTRNet
42
+ img_size: [32, 256]
43
+ patch_merging: 'Conv'
44
+ embed_dim: [128, 256, 384]
45
+ depth: [6, 6, 4]
46
+ num_heads: [4, 8, 12]
47
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
48
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
49
+ last_stage: False
50
+ prenorm: True
51
+ Decoder:
52
+ name: CPPDDecoder
53
+ vis_seq: 128
54
+ num_layer: 3
55
+ pos_len: False
56
+ rec_layer: 1
57
+ ch: True
58
+
59
+
60
+ Loss:
61
+ name: CPPDLoss
62
+ ignore_index: 7000
63
+ smoothing: True
64
+ pos_len: False
65
+ sideloss_weight: 1.0
66
+
67
+ PostProcess:
68
+ name: CPPDLabelDecode
69
+ character_dict_path: *character_dict_path
70
+ use_space_char: *use_space_char
71
+
72
+ Metric:
73
+ name: RecMetric
74
+ main_indicator: acc
75
+
76
+ Train:
77
+ dataset:
78
+ name: LMDBDataSet
79
+ data_dir: ../benchmark_bctr/benchmark_bctr_train
80
+ transforms:
81
+ - DecodeImage: # load image
82
+ img_mode: BGR
83
+ channel_first: False
84
+ - CPPDLabelEncode: # Class handling label
85
+ pos_len: False
86
+ ch: True
87
+ ignore_index: 7000
88
+ character_dict_path: *character_dict_path
89
+ use_space_char: *use_space_char
90
+ max_text_length: *max_text_length
91
+ - SVTRResize:
92
+ image_shape: [3, 32, 256]
93
+ padding: True
94
+ - KeepKeys:
95
+ keep_keys: ['image', 'label', 'label_node', 'label_index', 'length'] # dataloader will return list in this order
96
+ loader:
97
+ shuffle: True
98
+ batch_size_per_card: 128
99
+ drop_last: True
100
+ num_workers: 8
101
+
102
+ Eval:
103
+ dataset:
104
+ name: LMDBDataSet
105
+ data_dir: ../benchmark_bctr/benchmark_bctr_test/scene_test
106
+ transforms:
107
+ - DecodeImage: # load image
108
+ img_mode: BGR
109
+ channel_first: False
110
+ - CPPDLabelEncode: # Class handling label
111
+ pos_len: False
112
+ ch: True
113
+ ignore_index: 7000
114
+ character_dict_path: *character_dict_path
115
+ use_space_char: *use_space_char
116
+ max_text_length: *max_text_length
117
+ - SVTRResize:
118
+ image_shape: [3, 32, 256]
119
+ padding: True
120
+ - KeepKeys:
121
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
122
+ loader:
123
+ shuffle: False
124
+ drop_last: False
125
+ batch_size_per_card: 256
126
+ num_workers: 4
configs/rec/cppd/svtr_base_cppd_h8.yml ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtr_base_h8_cppd/
7
+ save_epoch_step: 1
8
+ # evaluation is run every 500 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_cppd.txt
22
+ use_amp: True
23
+
24
+ Optimizer:
25
+ name: AdamW
26
+ lr: 0.00065 # for 4gpus bs256/gpu
27
+ weight_decay: 0.05
28
+ filter_bias_and_bn: True
29
+
30
+ LRScheduler:
31
+ name: OneCycleLR
32
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
33
+ cycle_momentum: False
34
+
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: CPPD
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: SVTRNet
42
+ img_size: [32, 128]
43
+ out_char_num: 25
44
+ out_channels: 256
45
+ patch_merging: 'Conv'
46
+ embed_dim: [128, 256, 384]
47
+ depth: [6, 6, 6]
48
+ num_heads: [4, 8, 12]
49
+ sub_k: [[1, 1], [2, 1]]
50
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
51
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
52
+ last_stage: False
53
+ prenorm: True
54
+ Decoder:
55
+ name: CPPDDecoder
56
+ vis_seq: 128
57
+ num_layer: 2
58
+ pos_len: False
59
+ rec_layer: 1
60
+
61
+ Loss:
62
+ name: CPPDLoss
63
+ ignore_index: 100
64
+ smoothing: True
65
+ pos_len: False
66
+ sideloss_weight: 1.0
67
+
68
+ PostProcess:
69
+ name: CPPDLabelDecode
70
+ character_dict_path: *character_dict_path
71
+ use_space_char: *use_space_char
72
+
73
+ Metric:
74
+ name: RecMetric
75
+ main_indicator: acc
76
+ is_filter: True
77
+
78
+ Train:
79
+ dataset:
80
+ name: LMDBDataSet
81
+ data_dir: ../Union14M-L-LMDB-Filtered
82
+ transforms:
83
+ - DecodeImagePIL: # load image
84
+ img_mode: RGB
85
+ - PARSeqAugPIL:
86
+ - CPPDLabelEncode: # Class handling label
87
+ pos_len: False
88
+ character_dict_path: *character_dict_path
89
+ use_space_char: *use_space_char
90
+ max_text_length: *max_text_length
91
+ - RecTVResize:
92
+ image_shape: [32, 128]
93
+ padding: False
94
+ - KeepKeys:
95
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
96
+ loader:
97
+ shuffle: True
98
+ batch_size_per_card: 256
99
+ drop_last: True
100
+ num_workers: 4
101
+
102
+ Eval:
103
+ dataset:
104
+ name: LMDBDataSet
105
+ data_dir: ../evaluation/
106
+ transforms:
107
+ - DecodeImagePIL: # load image
108
+ img_mode: RGB
109
+ - CPPDLabelEncode: # Class handling label
110
+ pos_len: False
111
+ character_dict_path: *character_dict_path
112
+ use_space_char: *use_space_char
113
+ max_text_length: *max_text_length
114
+ - RecTVResize:
115
+ image_shape: [32, 128]
116
+ padding: False
117
+ - KeepKeys:
118
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
119
+ loader:
120
+ shuffle: False
121
+ drop_last: False
122
+ batch_size_per_card: 128
123
+ num_workers: 4
configs/rec/cppd/svtr_base_cppd_syn.yml ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 60
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/syn/svtr_base_cppd/
7
+ save_epoch_step: 1
8
+ # evaluation is run every 500 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path
18
+ # ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/syn/predicts_svtr_base_cppd.txt
23
+ use_amp: True
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.0005 # for 4gpus bs256/gpu
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: CosineAnnealingLR
33
+ warmup_epoch: 6
34
+
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: CPPD
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: SVTRNet
42
+ img_size: [32, 100]
43
+ out_char_num: 25
44
+ out_channels: 256
45
+ patch_merging: 'Conv'
46
+ embed_dim: [128, 256, 384]
47
+ depth: [6, 6, 4]
48
+ num_heads: [4, 8, 12]
49
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
50
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
51
+ last_stage: False
52
+ prenorm: True
53
+ Decoder:
54
+ name: CPPDDecoder
55
+ vis_seq: 50
56
+ num_layer: 3
57
+ pos_len: False
58
+ rec_layer: 1
59
+
60
+
61
+ Loss:
62
+ name: CPPDLoss
63
+ ignore_index: 100
64
+ smoothing: True
65
+ pos_len: False
66
+ sideloss_weight: 1.0
67
+
68
+ PostProcess:
69
+ name: CPPDLabelDecode
70
+ character_dict_path: *character_dict_path
71
+ use_space_char: *use_space_char
72
+
73
+ Metric:
74
+ name: RecMetric
75
+ main_indicator: acc
76
+
77
+ Train:
78
+ dataset:
79
+ name: STRLMDBDataSet
80
+ data_dir: ./
81
+ transforms:
82
+ - DecodeImage: # load image
83
+ img_mode: BGR
84
+ channel_first: False
85
+ # - SVTRRAug:
86
+ - CPPDLabelEncode: # Class handling label
87
+ pos_len: False
88
+ character_dict_path: *character_dict_path
89
+ use_space_char: *use_space_char
90
+ max_text_length: *max_text_length
91
+ - SVTRResize:
92
+ image_shape: [3, 32, 100]
93
+ padding: False
94
+ - KeepKeys:
95
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
96
+ loader:
97
+ shuffle: True
98
+ batch_size_per_card: 256
99
+ drop_last: True
100
+ num_workers: 8
101
+
102
+ Eval:
103
+ dataset:
104
+ name: LMDBDataSet
105
+ data_dir: ../evaluation/
106
+ transforms:
107
+ - DecodeImage: # load image
108
+ img_mode: BGR
109
+ channel_first: False
110
+ - CPPDLabelEncode: # Class handling label
111
+ pos_len: False
112
+ character_dict_path: *character_dict_path
113
+ use_space_char: *use_space_char
114
+ max_text_length: *max_text_length
115
+ - SVTRResize:
116
+ image_shape: [3, 32, 100]
117
+ padding: False
118
+ - KeepKeys:
119
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
120
+ loader:
121
+ shuffle: False
122
+ drop_last: False
123
+ batch_size_per_card: 256
124
+ num_workers: 4
configs/rec/cppd/svtrv2_cppd.yml ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_cppd/
7
+ save_epoch_step: 1
8
+ # evaluation is run every 500 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cppd.txt
22
+ use_amp: True
23
+
24
+ Optimizer:
25
+ name: AdamW
26
+ lr: 0.00065 # for 4gpus bs256/gpu
27
+ weight_decay: 0.05
28
+ filter_bias_and_bn: True
29
+
30
+ LRScheduler:
31
+ name: OneCycleLR
32
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
33
+ cycle_momentum: False
34
+
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: CPPD
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: SVTRv2LNConvTwo33
42
+ use_pos_embed: False
43
+ out_channels: 256
44
+ dims: [128, 256, 384]
45
+ depths: [6, 6, 6]
46
+ num_heads: [4, 8, 12]
47
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
48
+ local_k: [[5, 5], [5, 5], [-1, -1]]
49
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
50
+ last_stage: false
51
+ feat2d: False
52
+ Decoder:
53
+ name: CPPDDecoder
54
+ ds: True
55
+ num_layer: 2
56
+ pos_len: False
57
+ rec_layer: 1
58
+
59
+
60
+ Loss:
61
+ name: CPPDLoss
62
+ ignore_index: 100
63
+ smoothing: True
64
+ pos_len: False
65
+ sideloss_weight: 1.0
66
+
67
+ PostProcess:
68
+ name: CPPDLabelDecode
69
+ character_dict_path: *character_dict_path
70
+ use_space_char: *use_space_char
71
+
72
+ Metric:
73
+ name: RecMetric
74
+ main_indicator: acc
75
+ is_filter: True
76
+
77
+ Train:
78
+ dataset:
79
+ name: RatioDataSetTVResize
80
+ ds_width: True
81
+ padding: false
82
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
83
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
84
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
85
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
86
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
87
+ ]
88
+ transforms:
89
+ - DecodeImagePIL: # load image
90
+ img_mode: RGB
91
+ - PARSeqAugPIL:
92
+ - CPPDLabelEncode: # Class handling label
93
+ pos_len: False
94
+ character_dict_path: *character_dict_path
95
+ use_space_char: *use_space_char
96
+ max_text_length: *max_text_length
97
+ - KeepKeys:
98
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
99
+ sampler:
100
+ name: RatioSampler
101
+ scales: [[128, 32]] # w, h
102
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
103
+ first_bs: &bs 256
104
+ fix_bs: false
105
+ divided_factor: [4, 16] # w, h
106
+ is_training: True
107
+ loader:
108
+ shuffle: True
109
+ batch_size_per_card: *bs
110
+ drop_last: True
111
+ max_ratio: &max_ratio 4
112
+ num_workers: 4
113
+
114
+ Eval:
115
+ dataset:
116
+ name: RatioDataSetTVResize
117
+ ds_width: True
118
+ padding: False
119
+ data_dir_list: [
120
+ '../evaluation/CUTE80',
121
+ '../evaluation/IC13_857',
122
+ '../evaluation/IC15_1811',
123
+ '../evaluation/IIIT5k',
124
+ '../evaluation/SVT',
125
+ '../evaluation/SVTP',
126
+ ]
127
+ transforms:
128
+ - DecodeImagePIL: # load image
129
+ img_mode: RGB
130
+ - CPPDLabelEncode: # Class handling label
131
+ pos_len: False
132
+ character_dict_path: *character_dict_path
133
+ use_space_char: *use_space_char
134
+ max_text_length: *max_text_length
135
+ - KeepKeys:
136
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
137
+ sampler:
138
+ name: RatioSampler
139
+ scales: [[128, 32]] # w, h
140
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
141
+ first_bs: *bs
142
+ fix_bs: false
143
+ divided_factor: [4, 16] # w, h
144
+ is_training: False
145
+ loader:
146
+ shuffle: False
147
+ drop_last: False
148
+ batch_size_per_card: *bs
149
+ max_ratio: *max_ratio
150
+ num_workers: 4
configs/rec/dan/resnet45_fpn_dan.yml ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet45_fpn_dan/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_resnet45_fpn_dan.txt
19
+ use_amp: True
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.00065 # for 4gpus bs256/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: DAN
36
+ Transform:
37
+ Encoder:
38
+ name: ResNet45
39
+ in_channels: 3
40
+ strides: [2, 1, 2, 1, 1]
41
+ return_list: True
42
+ Decoder:
43
+ name: DANDecoder
44
+ max_len: 25
45
+ channels_list: [64, 128, 256, 512]
46
+ strides_list: [[2, 2], [1, 1], [1, 1]]
47
+ in_shape: [8, 32]
48
+ depth: 4
49
+
50
+ Loss:
51
+ name: ARLoss
52
+
53
+ PostProcess:
54
+ name: ARLabelDecode
55
+
56
+ Metric:
57
+ name: RecMetric
58
+ main_indicator: acc
59
+ is_filter: True
60
+
61
+ Train:
62
+ dataset:
63
+ name: LMDBDataSet
64
+ data_dir: ../Union14M-L-LMDB-Filtered
65
+ transforms:
66
+ - DecodeImagePIL: # load image
67
+ img_mode: RGB
68
+ - PARSeqAugPIL:
69
+ - ARLabelEncode:
70
+ - RecTVResize:
71
+ image_shape: [32, 128]
72
+ padding: False
73
+ - KeepKeys:
74
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
75
+ loader:
76
+ shuffle: True
77
+ batch_size_per_card: 256
78
+ drop_last: True
79
+ num_workers: 4
80
+
81
+ Eval:
82
+ dataset:
83
+ name: LMDBDataSet
84
+ data_dir: ../evaluation
85
+ transforms:
86
+ - DecodeImagePIL: # load image
87
+ img_mode: RGB
88
+ - ARLabelEncode:
89
+ - RecTVResize:
90
+ image_shape: [32, 128]
91
+ padding: False
92
+ - KeepKeys:
93
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
94
+ loader:
95
+ shuffle: False
96
+ drop_last: False
97
+ batch_size_per_card: 256
98
+ num_workers: 2
configs/rec/dan/svtrv2_dan.yml ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_dan
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_dan.txt
19
+ use_amp: True
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065 # 4gpus 256bs/gpu
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: DAN
36
+ Transform:
37
+ Encoder:
38
+ name: SVTRv2LNConvTwo33
39
+ use_pos_embed: False
40
+ out_channels: 256
41
+ dims: [128, 256, 384]
42
+ depths: [6, 6, 6]
43
+ num_heads: [4, 8, 12]
44
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
45
+ local_k: [[5, 5], [5, 5], [-1, -1]]
46
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
47
+ last_stage: false
48
+ feat2d: True
49
+ Decoder:
50
+ name: DANDecoder
51
+ use_cam: False
52
+ max_len: 25
53
+
54
+ Loss:
55
+ name: ARLoss
56
+
57
+ PostProcess:
58
+ name: ARLabelDecode
59
+
60
+ Metric:
61
+ name: RecMetric
62
+ main_indicator: acc
63
+ is_filter: True
64
+
65
+ Train:
66
+ dataset:
67
+ name: RatioDataSetTVResize
68
+ ds_width: True
69
+ padding: false
70
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_filter_train_challenging',
71
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_hard',
72
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_medium',
73
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_normal',
74
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_easy',
75
+ ]
76
+ transforms:
77
+ - DecodeImagePIL: # load image
78
+ img_mode: RGB
79
+ - PARSeqAugPIL:
80
+ - ARLabelEncode:
81
+ - KeepKeys:
82
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
83
+ sampler:
84
+ name: RatioSampler
85
+ scales: [[128, 32]] # w, h
86
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
87
+ first_bs: &bs 256
88
+ fix_bs: false
89
+ divided_factor: [4, 16] # w, h
90
+ is_training: True
91
+ loader:
92
+ shuffle: True
93
+ batch_size_per_card: *bs
94
+ drop_last: True
95
+ max_ratio: &max_ratio 4
96
+ num_workers: 4
97
+
98
+ Eval:
99
+ dataset:
100
+ name: RatioDataSetTVResize
101
+ ds_width: True
102
+ padding: False
103
+ data_dir_list: [
104
+ '../evaluation/CUTE80',
105
+ '../evaluation/IC13_857',
106
+ '../evaluation/IC15_1811',
107
+ '../evaluation/IIIT5k',
108
+ '../evaluation/SVT',
109
+ '../evaluation/SVTP',
110
+ ]
111
+ transforms:
112
+ - DecodeImagePIL: # load image
113
+ img_mode: RGB
114
+ - ARLabelEncode:
115
+ - KeepKeys:
116
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
117
+ sampler:
118
+ name: RatioSampler
119
+ scales: [[128, 32]] # w, h
120
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
121
+ first_bs: *bs
122
+ fix_bs: false
123
+ divided_factor: [4, 16] # w, h
124
+ is_training: False
125
+ loader:
126
+ shuffle: False
127
+ drop_last: False
128
+ batch_size_per_card: *bs
129
+ max_ratio: *max_ratio
130
+ num_workers: 4
configs/rec/focalsvtr/focalsvtr_ctc.yml ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/focalsvtr_ctc/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path
16
+ # ./tools/utils/EN_symbol_dict.txt
17
+ max_text_length: &max_text_length 25
18
+ use_space_char: &use_space_char False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_focalsvtr_ctc.txt
20
+
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065 # for 4gpus bs256/gpu
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+
31
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
32
+ cycle_momentum: False
33
+
34
+ Architecture:
35
+ model_type: rec
36
+ algorithm: SVTR
37
+ Transform:
38
+ Encoder:
39
+ name: FocalSVTR
40
+ img_size: [32, 128]
41
+ depths: [6, 6, 6]
42
+ embed_dim: 96
43
+ sub_k: [[1, 1], [2, 1], [1, 1]]
44
+ focal_levels: [3, 3, 3]
45
+ out_channels: 256
46
+ last_stage: True
47
+ Decoder:
48
+ name: CTCDecoder
49
+
50
+ Loss:
51
+ name: CTCLoss
52
+ zero_infinity: True
53
+
54
+ PostProcess:
55
+ name: CTCLabelDecode
56
+ character_dict_path: *character_dict_path
57
+ use_space_char: *use_space_char
58
+
59
+ Metric:
60
+ name: RecMetric
61
+ main_indicator: acc
62
+ is_filter: True
63
+
64
+
65
+ Train:
66
+ dataset:
67
+ name: RatioDataSet
68
+ ds_width: True
69
+ padding: &padding False
70
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
71
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
72
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
73
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
75
+ ]
76
+ transforms:
77
+ - DecodeImage: # load image
78
+ img_mode: BGR
79
+ channel_first: False
80
+ - PARSeqAug:
81
+ - CTCLabelEncode: # Class handling label
82
+ character_dict_path: *character_dict_path
83
+ use_space_char: *use_space_char
84
+ max_text_length: *max_text_length
85
+ - KeepKeys:
86
+ keep_keys: ['image', 'label', 'length']
87
+ sampler:
88
+ name: RatioSampler
89
+ scales: [[128, 32]] # w, h
90
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
91
+ first_bs: &bs 256
92
+ fix_bs: false
93
+ divided_factor: [4, 16] # w, h
94
+ is_training: True
95
+ loader:
96
+ shuffle: True
97
+ batch_size_per_card: *bs
98
+ drop_last: True
99
+ max_ratio: 12
100
+ num_workers: 4
101
+
102
+ Eval:
103
+ dataset:
104
+ name: RatioDataSet
105
+ ds_width: True
106
+ padding: True
107
+ data_dir_list: ['../evaluation/CUTE80',
108
+ '../evaluation/IC13_857',
109
+ '../evaluation/IC15_1811',
110
+ '../evaluation/IIIT5k',
111
+ '../evaluation/SVT',
112
+ '../evaluation/SVTP',
113
+ ]
114
+ transforms:
115
+ - DecodeImage: # load image
116
+ img_mode: BGR
117
+ channel_first: False
118
+ - CTCLabelEncode: # Class handling label
119
+ character_dict_path: *character_dict_path
120
+ use_space_char: *use_space_char
121
+ max_text_length: *max_text_length
122
+ - KeepKeys:
123
+ keep_keys: ['image', 'label', 'length']
124
+ sampler:
125
+ name: RatioSampler
126
+ scales: [[128, 32]] # w, h
127
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
128
+ first_bs: 128
129
+ fix_bs: false
130
+ divided_factor: [4, 16] # w, h
131
+ is_training: False
132
+ loader:
133
+ shuffle: False
134
+ drop_last: False
135
+ batch_size_per_card: 128
136
+ max_ratio: 12
137
+ num_workers: 4
configs/rec/gtc/svtrv2_lnconv_nrtr_gtc.yml ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/svtrv2_lnconv_nrtr_gtc
7
+ save_epoch_step: 1
8
+ # evaluation is run every 500 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img: ../ltb/img
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/predicts_smtr.txt
22
+ use_amp: True
23
+ distributed: true
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.00065
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: OneCycleLR
33
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
34
+ cycle_momentum: False
35
+
36
+ Architecture:
37
+ model_type: rec
38
+ algorithm: BGPD
39
+ in_channels: 3
40
+ Transform:
41
+ Encoder:
42
+ name: SVTRv2LNConvTwo33
43
+ use_pos_embed: False
44
+ out_channels: 256
45
+ dims: [128, 256, 384]
46
+ depths: [6, 6, 6]
47
+ num_heads: [4, 8, 12]
48
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
49
+ local_k: [[5, 5], [5, 5], [-1, -1]]
50
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
51
+ last_stage: false
52
+ feat2d: True
53
+ Decoder:
54
+ name: GTCDecoder
55
+ infer_gtc: True
56
+ detach: False
57
+ gtc_decoder:
58
+ name: NRTRDecoder
59
+ num_encoder_layers: -1
60
+ beam_size: 0
61
+ num_decoder_layers: 2
62
+ nhead: 12
63
+ max_len: *max_text_length
64
+ ctc_decoder:
65
+ name: RCTCDecoder
66
+
67
+ Loss:
68
+ name: GTCLoss
69
+ gtc_loss:
70
+ name: ARLoss
71
+
72
+ PostProcess:
73
+ name: GTCLabelDecode
74
+ gtc_label_decode:
75
+ name: ARLabelDecode
76
+ character_dict_path: *character_dict_path
77
+ use_space_char: *use_space_char
78
+
79
+ Metric:
80
+ name: RecGTCMetric
81
+ main_indicator: acc
82
+ is_filter: True
83
+
84
+ Train:
85
+ dataset:
86
+ name: RatioDataSet
87
+ ds_width: True
88
+ # max_ratio: &max_ratio 4
89
+ # min_ratio: 1
90
+ # base_shape: &base_shape [[64, 64], [96, 48], [112, 40], [128, 32]]
91
+ # base_h: &base_h 32
92
+ # padding: &padding False
93
+ padding: false
94
+ # padding_rand: true
95
+ # padding_doub: true
96
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
97
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
98
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
99
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
100
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
101
+ ]
102
+ transforms:
103
+ - DecodeImage: # load image
104
+ img_mode: BGR
105
+ channel_first: False
106
+ - PARSeqAug:
107
+ - GTCLabelEncode: # Class handling label
108
+ gtc_label_encode:
109
+ name: ARLabelEncode
110
+ character_dict_path: *character_dict_path
111
+ use_space_char: *use_space_char
112
+ max_text_length: *max_text_length
113
+ - KeepKeys:
114
+ keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
115
+ sampler:
116
+ name: RatioSampler
117
+ scales: [[128, 32]] # w, h
118
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
119
+ first_bs: &bs 256
120
+ fix_bs: false
121
+ divided_factor: [4, 16] # w, h
122
+ is_training: True
123
+ loader:
124
+ shuffle: True
125
+ batch_size_per_card: *bs
126
+ drop_last: True
127
+ max_ratio: &max_ratio 4
128
+ num_workers: 4
129
+
130
+ Eval:
131
+ dataset:
132
+ name: RatioDataSet
133
+ ds_width: True
134
+ padding: False
135
+ data_dir_list: [
136
+ '../evaluation/CUTE80',
137
+ '../evaluation/IC13_857',
138
+ '../evaluation/IC15_1811',
139
+ '../evaluation/IIIT5k',
140
+ '../evaluation/SVT',
141
+ '../evaluation/SVTP',
142
+ ]
143
+ transforms:
144
+ - DecodeImage: # load image
145
+ img_mode: BGR
146
+ channel_first: False
147
+ - GTCLabelEncode: # Class handling label
148
+ gtc_label_encode:
149
+ name: ARLabelEncode
150
+ character_dict_path: *character_dict_path
151
+ use_space_char: *use_space_char
152
+ max_text_length: *max_text_length
153
+ - KeepKeys:
154
+ keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
155
+ sampler:
156
+ name: RatioSampler
157
+ scales: [[128, 32]] # w, h
158
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
159
+ first_bs: *bs
160
+ fix_bs: false
161
+ divided_factor: [4, 16] # w, h
162
+ is_training: False
163
+ loader:
164
+ shuffle: False
165
+ drop_last: False
166
+ batch_size_per_card: *bs
167
+ max_ratio: *max_ratio
168
+ num_workers: 4
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_long_infer.yml ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_long_infer
7
+ save_epoch_step: 1
8
+ # evaluation is run every 1000 iterations
9
+ eval_batch_step: [0, 1000]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img: ../ltb/img
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/predicts_smtr.txt
22
+ use_amp: True
23
+ distributed: true
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.000325
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: OneCycleLR
33
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
34
+ cycle_momentum: False
35
+
36
+ Architecture:
37
+ model_type: rec
38
+ algorithm: BGPD
39
+ in_channels: 3
40
+ Transform:
41
+ Encoder:
42
+ name: SVTRv2LNConvTwo33
43
+ use_pos_embed: False
44
+ out_channels: 256
45
+ dims: [128, 256, 384]
46
+ depths: [6, 6, 6]
47
+ num_heads: [4, 8, 12]
48
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
49
+ local_k: [[5, 5], [5, 5], [-1, -1]]
50
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
51
+ last_stage: false
52
+ feat2d: True
53
+ Decoder:
54
+ name: GTCDecoder
55
+ infer_gtc: False
56
+ detach: False
57
+ gtc_decoder:
58
+ name: SMTRDecoder
59
+ num_layer: 1
60
+ ds: True
61
+ max_len: *max_text_length
62
+ next_mode: &next True
63
+ sub_str_len: &subsl 5
64
+ ctc_decoder:
65
+ name: RCTCDecoder
66
+
67
+ Loss:
68
+ name: CTCLoss
69
+
70
+ PostProcess:
71
+ name: CTCLabelDecode
72
+ character_dict_path: *character_dict_path
73
+ use_space_char: *use_space_char
74
+
75
+ Metric:
76
+ name: RecMetric
77
+ main_indicator: acc
78
+ is_filter: True
79
+
80
+ Train:
81
+ dataset:
82
+ name: RatioDataSetTVResize
83
+ ds_width: True
84
+ padding: false
85
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
86
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
87
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
88
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
89
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
90
+ ]
91
+ transforms:
92
+ - DecodeImagePIL: # load image
93
+ img_mode: RGB
94
+ - PARSeqAugPIL:
95
+ - CTCLabelEncode: # Class handling label
96
+ character_dict_path: *character_dict_path
97
+ use_space_char: *use_space_char
98
+ max_text_length: *max_text_length
99
+ - KeepKeys:
100
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
101
+ sampler:
102
+ name: RatioSampler
103
+ scales: [[128, 32]] # w, h
104
+ # divided_factor: ensures the width and height can be divided by the downsampling multiple
105
+ first_bs: &bs 128
106
+ fix_bs: false
107
+ divided_factor: [4, 16] # w, h
108
+ is_training: True
109
+ loader:
110
+ shuffle: True
111
+ batch_size_per_card: *bs
112
+ drop_last: True
113
+ max_ratio: &max_ratio 12
114
+ num_workers: 4
115
+
116
+ Eval:
117
+ dataset:
118
+ name: RatioDataSetTVResize
119
+ ds_width: True
120
+ padding: False
121
+ data_dir_list: [
122
+ '../evaluation/CUTE80',
123
+ '../evaluation/IC13_857',
124
+ '../evaluation/IC15_1811',
125
+ '../evaluation/IIIT5k',
126
+ '../evaluation/SVT',
127
+ '../evaluation/SVTP',
128
+ ]
129
+ transforms:
130
+ - DecodeImagePIL: # load image
131
+ img_mode: RGB
132
+ - CTCLabelEncode: # Class handling label
133
+ character_dict_path: *character_dict_path
134
+ use_space_char: *use_space_char
135
+ max_text_length: *max_text_length
136
+ - KeepKeys:
137
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
138
+ sampler:
139
+ name: RatioSampler
140
+ scales: [[128, 32]] # w, h
141
+ # divided_factor: ensures the width and height can be divided by the downsampling multiple
142
+ first_bs: *bs
143
+ fix_bs: false
144
+ divided_factor: [4, 16] # w, h
145
+ is_training: False
146
+ loader:
147
+ shuffle: False
148
+ drop_last: False
149
+ batch_size_per_card: *bs
150
+ max_ratio: *max_ratio
151
+ num_workers: 4
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_smtr_long.yml ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_nodetach_smtr_long_infer
7
+ save_epoch_step: 1
8
+ # evaluation is run every 1000 iterations
9
+ eval_batch_step: [0, 1000]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/predicts_smtr.txt
22
+ use_amp: True
23
+ distributed: true
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.000325
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: OneCycleLR
33
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
34
+ cycle_momentum: False
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: BGPD
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: SVTRv2LNConvTwo33
42
+ use_pos_embed: False
43
+ out_channels: 256
44
+ dims: [128, 256, 384]
45
+ depths: [6, 6, 6]
46
+ num_heads: [4, 8, 12]
47
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
48
+ local_k: [[5, 5], [5, 5], [-1, -1]]
49
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
50
+ last_stage: false
51
+ feat2d: True
52
+ Decoder:
53
+ name: GTCDecoder
54
+ infer_gtc: True
55
+ detach: False
56
+ gtc_decoder:
57
+ name: SMTRDecoder
58
+ num_layer: 1
59
+ ds: True
60
+ max_len: *max_text_length
61
+ next_mode: &next True
62
+ sub_str_len: &subsl 5
63
+ infer_aug: True
64
+ ctc_decoder:
65
+ name: RCTCDecoder
66
+
67
+ Loss:
68
+ name: GTCLoss
69
+ ctc_weight: 0.1
70
+ gtc_loss:
71
+ name: SMTRLoss
72
+
73
+ PostProcess:
74
+ name: GTCLabelDecode
75
+ gtc_label_decode:
76
+ name: SMTRLabelDecode
77
+ next_mode: *next
78
+ character_dict_path: *character_dict_path
79
+ use_space_char: *use_space_char
80
+ only_gtc: True
81
+
82
+ Metric:
83
+ name: RecGTCMetric
84
+ main_indicator: acc
85
+ is_filter: True
86
+
87
+ Train:
88
+ dataset:
89
+ name: RatioDataSetTVResize
90
+ ds_width: True
91
+ padding: false
92
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
93
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
94
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
95
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
96
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
97
+ ]
98
+ transforms:
99
+ - DecodeImagePIL: # load image
100
+ img_mode: RGB
101
+ - PARSeqAugPIL:
102
+ - SMTRLabelEncode: # Class handling label
103
+ sub_str_len: *subsl
104
+ character_dict_path: *character_dict_path
105
+ use_space_char: *use_space_char
106
+ max_text_length: *max_text_length
107
+ - KeepKeys:
108
+ keep_keys: ['image', 'label', 'label_subs', 'label_next', 'length_subs',
109
+ 'label_subs_pre', 'label_next_pre', 'length_subs_pre', 'length'] # dataloader will return list in this order
110
+ sampler:
111
+ name: RatioSampler
112
+ scales: [[128, 32]] # w, h
113
+ # divided_factor: ensures the width and height can be divided by the downsampling multiple
114
+ first_bs: &bs 256
115
+ fix_bs: false
116
+ divided_factor: [4, 16] # w, h
117
+ is_training: True
118
+ loader:
119
+ shuffle: True
120
+ batch_size_per_card: *bs
121
+ drop_last: True
122
+ max_ratio: &max_ratio 12
123
+ num_workers: 4
124
+
125
+ Eval:
126
+ dataset:
127
+ name: SimpleDataSet
128
+ data_dir: ../ltb/
129
+ label_file_list: ['../ltb/ultra_long_70_list.txt']
130
+ transforms:
131
+ - DecodeImage: # load image
132
+ img_mode: BGR
133
+ channel_first: False
134
+ - GTCLabelEncode: # Class handling label
135
+ gtc_label_encode:
136
+ name: ARLabelEncode
137
+ character_dict_path: *character_dict_path
138
+ use_space_char: *use_space_char
139
+ max_text_length: 200
140
+ - SliceResize:
141
+ image_shape: [3, 32, 128]
142
+ padding: False
143
+ max_ratio: 12
144
+ - KeepKeys:
145
+ keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
146
+ loader:
147
+ shuffle: False
148
+ drop_last: False
149
+ batch_size_per_card: 1
150
+ num_workers: 2
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_stream.yml ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 60
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_stream
7
+ save_epoch_step: 1
8
+ # evaluation is run every 500 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/predicts_smtr.txt
22
+ use_amp: True
23
+ distributed: true
24
+ grad_clip_val: 20
25
+
26
+ Optimizer:
27
+ name: AdamW
28
+ lr: 0.00065
29
+ weight_decay: 0.05
30
+ filter_bias_and_bn: True
31
+
32
+ LRScheduler:
33
+ name: OneCycleLR
34
+ warmup_epoch: 5 # 5 warmup epochs out of 60 (pct_start = 5/60 ≈ 0.083)
35
+ cycle_momentum: False
36
+
37
+ Architecture:
38
+ model_type: rec
39
+ algorithm: BGPD
40
+ in_channels: 3
41
+ Transform:
42
+ Encoder:
43
+ name: SVTRv2LNConvTwo33
44
+ use_pos_embed: False
45
+ out_channels: 256
46
+ dims: [128, 256, 384]
47
+ depths: [6, 6, 6]
48
+ num_heads: [4, 8, 12]
49
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
50
+ local_k: [[5, 5], [5, 5], [-1, -1]]
51
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
52
+ last_stage: false
53
+ feat2d: True
54
+ Decoder:
55
+ name: GTCDecoder
56
+ infer_gtc: True
57
+ detach: False
58
+ gtc_decoder:
59
+ name: SMTRDecoder
60
+ num_layer: 1
61
+ ds: True
62
+ max_len: *max_text_length
63
+ next_mode: &next True
64
+ sub_str_len: &subsl 5
65
+ infer_aug: False
66
+ ctc_decoder:
67
+ name: RCTCDecoder
68
+
69
+ Loss:
70
+ name: GTCLoss
71
+ ctc_weight: 0.25
72
+ gtc_loss:
73
+ name: SMTRLoss
74
+
75
+ PostProcess:
76
+ name: GTCLabelDecode
77
+ gtc_label_decode:
78
+ name: SMTRLabelDecode
79
+ next_mode: *next
80
+ character_dict_path: *character_dict_path
81
+ use_space_char: *use_space_char
82
+ only_gtc: True
83
+
84
+ Metric:
85
+ name: RecMetric
86
+ main_indicator: acc
87
+ is_filter: True
88
+ stream: True
89
+
90
+ Train:
91
+ dataset:
92
+ name: RatioDataSetTVResize
93
+ ds_width: True
94
+ padding: false
95
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
96
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
97
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
98
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
99
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
100
+ ]
101
+ transforms:
102
+ - DecodeImagePIL: # load image
103
+ img_mode: RGB
104
+ - PARSeqAugPIL:
105
+ - SMTRLabelEncode: # Class handling label
106
+ sub_str_len: *subsl
107
+ character_dict_path: *character_dict_path
108
+ use_space_char: *use_space_char
109
+ max_text_length: *max_text_length
110
+ - KeepKeys:
111
+ keep_keys: ['image', 'label', 'label_subs', 'label_next', 'length_subs',
112
+ 'label_subs_pre', 'label_next_pre', 'length_subs_pre', 'length'] # dataloader will return list in this order
113
+ sampler:
114
+ name: RatioSampler
115
+ scales: [[128, 32]] # w, h
116
+ # divided_factor: ensures the width and height can be divided by the downsampling multiple
117
+ first_bs: &bs 256
118
+ fix_bs: false
119
+ divided_factor: [4, 16] # w, h
120
+ is_training: True
121
+ loader:
122
+ shuffle: True
123
+ batch_size_per_card: *bs
124
+ drop_last: True
125
+ max_ratio: &max_ratio 12
126
+ num_workers: 4
127
+
128
+ Eval:
129
+ dataset:
130
+ name: SimpleDataSet
131
+ data_dir: ../ltb/
132
+ label_file_list: ['../ltb/ultra_long_70_list.txt']
133
+ transforms:
134
+ - DecodeImagePIL: # load image
135
+ img_mode: RGB
136
+ - GTCLabelEncode: # Class handling label
137
+ gtc_label_encode:
138
+ name: ARLabelEncode
139
+ character_dict_path: *character_dict_path
140
+ use_space_char: *use_space_char
141
+ max_text_length: *max_text_length
142
+ - SliceTVResize:
143
+ image_shape: [32, 128]
144
+ padding: False
145
+ max_ratio: 4
146
+ - KeepKeys:
147
+ keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
148
+ loader:
149
+ shuffle: False
150
+ drop_last: False
151
+ batch_size_per_card: 1
152
+ num_workers: 2
configs/rec/igtr/readme.md ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # IGTR
2
+
3
+ - [IGTR](#igtr)
4
+ - [1. Introduction](#1-introduction)
5
+ - [2. Environment](#2-environment)
6
+ - [Dataset Preparation](#dataset-preparation)
7
+ - [3. Model Training / Evaluation](#3-model-training--evaluation)
8
+ - [Citation](#citation)
9
+
10
+ <a name="1"></a>
11
+
12
+ ## 1. Introduction
13
+
14
+ Paper:
15
+
16
+ > [Instruction-Guided Scene Text Recognition](https://arxiv.org/abs/2401.17851)
17
+ > Yongkun Du, Zhineng Chen, Yuchen Su, Caiyan Jia, Yu-Gang Jiang
18
+
19
+ <a name="model"></a>
20
+ Multi-modal models show appealing performance in visual recognition tasks recently, as free-form text-guided training evokes the ability to understand fine-grained visual content. However, current models are either inefficient or cannot be trivially upgraded to scene text recognition (STR) due to the composition difference between natural and text images. We propose a novel instruction-guided scene text recognition (IGTR) paradigm that formulates STR as an instruction learning problem and understands text images by predicting character attributes, e.g., character frequency, position, etc. IGTR first devises $\left\langle condition, question, answer \right\rangle$ instruction triplets, providing rich and diverse descriptions of character attributes. To effectively learn these attributes through question-answering, IGTR develops lightweight instruction encoder, cross-modal feature fusion module and multi-task answer head, which guides nuanced text image understanding. Furthermore, IGTR realizes different recognition pipelines simply by using different instructions, enabling a character-understanding-based text reasoning paradigm that considerably differs from current methods. Experiments on English and Chinese benchmarks show that IGTR outperforms existing models by significant margins, while maintaining a small model size and efficient inference speed. Moreover, by adjusting the sampling of instructions, IGTR offers an elegant way to tackle the recognition of both rarely appearing and morphologically similar characters, which were previous challenges.
21
+
22
+ <a name="model"></a>
23
+ The accuracy (%) and model files of IGTR on the public dataset of scene text recognition are as follows:
24
+
25
+ - Trained on Synth dataset (MJ+ST), tested on Common Benchmarks, training and test datasets both from [PARSeq](https://github.com/baudm/parseq).
26
+
27
+ | Model | IC13<br/>857 | SVT | IIIT5k<br/>3000 | IC15<br/>1811 | SVTP | CUTE80 | Avg | Config&Model&Log |
28
+ | :-----: | :----------: | :--: | :-------------: | :-----------: | :--: | :----: | :---: | :---------------------------------------------------------------------------------------------: |
29
+ | IGTR-PD | 97.6 | 95.2 | 97.6 | 88.4 | 91.6 | 95.5 | 94.30 | [link](https://drive.google.com/drive/folders/1Pv0CW2hiWC_dIyaB74W1fsXqiX3z5yXA?usp=drive_link) |
30
+ | IGTR-AR | 98.6 | 95.7 | 98.2 | 88.4 | 92.4 | 95.5 | 94.78 | as above |
31
+
32
+ - Test on Union14M-L benchmark, from [Union14M](https://github.com/Mountchicken/Union14M/).
33
+
34
+ | Model | Curve | Multi-<br/>Oriented | Artistic | Contextless | Salient | Multi-<br/>word | General | Avg | Config&Model&Log |
35
+ | :-----: | :---: | :-----------------: | :------: | :---------: | :-----: | :-------------: | :-----: | :---: | :---------------------: |
36
+ | IGTR-PD | 76.9 | 30.6 | 59.1 | 63.3 | 77.8 | 62.5 | 66.7 | 62.40 | Same as the above table |
37
+ | IGTR-AR | 78.4 | 31.9 | 61.3 | 66.5 | 80.2 | 69.3 | 67.9 | 65.07 | as above |
38
+
39
+ - Trained on Union14M-L training dataset.
40
+
41
+ | Model | IC13<br/>857 | SVT | IIIT5k<br/>3000 | IC15<br/>1811 | SVTP | CUTE80 | Avg | Config&Model&Log |
42
+ | :----------: | :----------: | :--: | :-------------: | :-----------: | :--: | :----: | :---: | :---------------------------------------------------------------------------------------------: |
43
+ | IGTR-PD | 97.7 | 97.7 | 98.3 | 89.8 | 93.7 | 97.9 | 95.86 | [link](https://drive.google.com/drive/folders/1ZGlzDqEzjrBg8qG2wBkbOm3bLRzFbTzo?usp=drive_link) |
44
+ | IGTR-AR | 98.1 | 98.4 | 98.7 | 90.5 | 94.9 | 98.3 | 96.48 | as above |
45
+ | IGTR-PD-60ep | 97.9 | 98.3 | 99.2 | 90.8 | 93.7 | 97.6 | 96.24 | [link](https://drive.google.com/drive/folders/1ik4hxZDRsjU1RbCA19nwE45Kg1bCnMoa?usp=drive_link) |
46
+ | IGTR-AR-60ep | 98.4 | 98.1 | 99.3 | 91.5 | 94.3 | 97.6 | 96.54 | as above |
47
+ | IGTR-PD-PT | 98.6 | 98.0 | 99.1 | 91.7 | 96.8 | 99.0 | 97.20 | [link](https://drive.google.com/drive/folders/1QM0EWV66IfYI1G0Xm066V2zJA62hH6-1?usp=drive_link) |
48
+ | IGTR-AR-PT | 98.8 | 98.3 | 99.2 | 92.0 | 96.8 | 99.0 | 97.34 | as above |
49
+
50
+ | Model | Curve | Multi-<br/>Oriented | Artistic | Contextless | Salient | Multi-<br/>word | General | Avg | Config&Model&Log |
51
+ | :----------: | :---: | :-----------------: | :------: | :---------: | :-----: | :-------------: | :-----: | :---: | :---------------------: |
52
+ | IGTR-PD | 88.1 | 89.9 | 74.2 | 80.3 | 82.8 | 79.2 | 83.0 | 82.51 | Same as the above table |
53
+ | IGTR-AR | 90.4 | 91.2 | 77.0 | 82.4 | 84.7 | 84.0 | 84.4 | 84.86 | as above |
54
+ | IGTR-PD-60ep | 90.0 | 92.1 | 77.5 | 82.8 | 86.0 | 83.0 | 84.8 | 85.18 | Same as the above table |
55
+ | IGTR-AR-60ep | 91.0 | 93.0 | 78.7 | 84.6 | 87.3 | 84.8 | 85.6 | 86.43 | as above |
56
+ | IGTR-PD-PT | 92.4 | 92.1 | 80.7 | 83.6 | 87.7 | 86.9 | 85.0 | 86.92 | Same as the above table |
57
+ | IGTR-AR-PT | 93.0 | 92.9 | 81.3 | 83.4 | 88.6 | 88.7 | 85.6 | 87.65 | as above |
58
+
59
+ - Trained and tested on Chinese dataset, from [Chinese Benchmark](https://github.com/FudanVI/benchmarking-chinese-text-recognition).
60
+
61
+ | Model | Scene | Web | Document | Handwriting | Avg | Config&Model&Log |
62
+ | :---------: | :---: | :--: | :------: | :---------: | :---: | :---------------------------------------------------------------------------------------------: |
63
+ | IGTR-PD | 73.1 | 74.8 | 98.6 | 52.5 | 74.75 | |
64
+ | IGTR-AR | 75.1 | 76.4 | 98.7 | 55.3 | 76.37 | |
65
+ | IGTR-PD-TS | 73.5 | 75.9 | 98.7 | 54.5 | 75.65 | [link](https://drive.google.com/drive/folders/1H3VRdGHjhawd6fkSC-qlBzVzvYYTpHRg?usp=drive_link) |
66
+ | IGTR-AR-TS | 75.6 | 77.0 | 98.8 | 57.3 | 77.17 | as above |
67
+ | IGTR-PD-Aug | 79.5 | 80.0 | 99.4 | 58.9 | 79.45 | [link](https://drive.google.com/drive/folders/1XFQkCILwcFwA7iYyQY9crnrouaI5sqcZ?usp=drive_link) |
68
+ | IGTR-AR-Aug | 82.0 | 81.7 | 99.5 | 63.8 | 81.74 | as above |
69
+
70
+ Download all Configs, Models, and Logs from [Google Drive](https://drive.google.com/drive/folders/1mSRDg9Mj5R6PspAdFGXZHDHTCQmjkd8d?usp=drive_link).
71
+
72
+ <a name="2"></a>
73
+
74
+ ## 2. Environment
75
+
76
+ - [PyTorch](http://pytorch.org/) version >= 1.13.0
77
+ - Python version >= 3.7
78
+
79
+ ```shell
80
+ git clone -b develop https://github.com/Topdu/OpenOCR.git
81
+ cd OpenOCR
82
+ # A100 Ubuntu 20.04 Cuda 11.8
83
+ conda create -n openocr python==3.8
84
+ conda activate openocr
85
+ conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 pytorch-cuda=11.8 -c pytorch -c nvidia
86
+ pip install -r requirements.txt
87
+ ```
88
+
89
+ #### Dataset Preparation
90
+
91
+ [English dataset download](https://github.com/baudm/parseq)
92
+
93
+ [Union14M-L download](https://github.com/Mountchicken/Union14M)
94
+
95
+ [Chinese dataset download](https://github.com/fudanvi/benchmarking-chinese-text-recognition#download)
96
+
97
+ The expected filesystem structure is as follows:
98
+
99
+ ```
100
+ benchmark_bctr
101
+ ├── benchmark_bctr_test
102
+ │ ├── document_test
103
+ │ ├── handwriting_test
104
+ │ ├── scene_test
105
+ │ └── web_test
106
+ └── benchmark_bctr_train
107
+ ├── document_train
108
+ ├── handwriting_train
109
+ ├── scene_train
110
+ └── web_train
111
+ evaluation
112
+ ├── CUTE80
113
+ ├── IC13_857
114
+ ├── IC15_1811
115
+ ├── IIIT5k
116
+ ├── SVT
117
+ └── SVTP
118
+ OpenOCR
119
+ synth
120
+ ├── MJ
121
+ │ ├── test
122
+ │ ├── train
123
+ │ └── val
124
+ └── ST
125
+ test # from PARSeq
126
+ ├── ArT
127
+ ├── COCOv1.4
128
+ ├── CUTE80
129
+ ├── IC13_1015
130
+ ├── IC13_1095
131
+ ├── IC13_857
132
+ ├── IC15_1811
133
+ ├── IC15_2077
134
+ ├── IIIT5k
135
+ ├── SVT
136
+ ├── SVTP
137
+ └── Uber
138
+ u14m # lmdb format
139
+ ├── artistic
140
+ ├── contextless
141
+ ├── curve
142
+ ├── general
143
+ ├── multi_oriented
144
+ ├── multi_words
145
+ └── salient
146
+ Union14M-LMDB-L # lmdb format
147
+ ├── train_challenging
148
+ ├── train_easy
149
+ ├── train_hard
150
+ ├── train_medium
151
+ └── train_normal
152
+ ```
153
+
154
+ <a name="3"></a>
155
+
156
+ ## 3. Model Training / Evaluation
157
+
158
+ Training:
159
+
160
+ ```shell
161
+ # The configuration file is available from the link provided in the table above.
162
+ # Multi GPU training
163
+ CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 tools/train_rec.py --c PATH/svtr_base_igtr_XXX.yml
164
+ ```
165
+
166
+ Evaluation:
167
+
168
+ ```shell
169
+ # The configuration file is available from the link provided in the table above.
170
+ # en
171
+ python tools/eval_rec_all_ratio.py --c PATH/svtr_base_igtr_syn.yml
172
+ # ch
173
+ python tools/eval_rec_all_ch.py --c PATH/svtr_base_igtr_ch_aug.yml
174
+ ```
175
+
176
+ ## Citation
177
+
178
+ ```bibtex
179
+ @article{Du2024IGTR,
180
+ title = {Instruction-Guided Scene Text Recognition},
181
+ author = {Du, Yongkun and Chen, Zhineng and Su, Yuchen and Jia, Caiyan and Jiang, Yu-Gang},
182
+ journal = {CoRR},
183
+ eprinttype = {arXiv},
184
+ primaryClass={cs.CV},
185
+ volume = {abs/2401.17851},
186
+ year = {2024},
187
+ url = {https://arxiv.org/abs/2401.17851}
188
+ }
189
+ ```
configs/rec/igtr/svtr_base_ds_igtr.yml ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtr_base_igtr
7
+ save_epoch_step: 1
8
+ # evaluation is run every 500 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path
18
+ # ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_igtr.txt
23
+ use_amp: True
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.0005 # 2gpus 384bs/gpu
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: OneCycleLR
33
+ warmup_epoch: 1.5
34
+ cycle_momentum: False
35
+
36
+ Architecture:
37
+ model_type: rec
38
+ algorithm: IGTR
39
+ in_channels: 3
40
+ Transform:
41
+ Encoder:
42
+ name: SVTRNet2DPos
43
+ img_size: [32, -1]
44
+ out_char_num: 25
45
+ out_channels: 256
46
+ patch_merging: 'Conv'
47
+ embed_dim: [128, 256, 384]
48
+ depth: [6, 6, 6]
49
+ num_heads: [4, 8, 12]
50
+ mixer: ['ConvB','ConvB','ConvB','ConvB','ConvB','ConvB', 'ConvB','ConvB', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
51
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
52
+ last_stage: False
53
+ prenorm: True
54
+ use_first_sub: False
55
+ Decoder:
56
+ name: IGTRDecoder
57
+ dim: 384
58
+ num_layer: 1
59
+ ar: False
60
+ refine_iter: 0
61
+ # next_pred: True
62
+ next_pred: False
63
+ pos2d: True
64
+ ds: True
65
+ # pos_len: False
66
+ # rec_layer: 1
67
+
68
+
69
+ Loss:
70
+ name: IGTRLoss
71
+
72
+ PostProcess:
73
+ name: IGTRLabelDecode
74
+ character_dict_path: *character_dict_path
75
+ use_space_char: *use_space_char
76
+
77
+ Metric:
78
+ name: RecMetric
79
+ main_indicator: acc
80
+
81
+ Train:
82
+ dataset:
83
+ name: RatioDataSet
84
+ ds_width: True
85
+ padding: &padding False
86
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
87
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
88
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
89
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
90
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
91
+ ]
92
+ transforms:
93
+ - DecodeImage: # load image
94
+ img_mode: BGR
95
+ channel_first: False
96
+ - PARSeqAug:
97
+ - IGTRLabelEncode: # Class handling label
98
+ k: 8
99
+ prompt_error: False
100
+ character_dict_path: *character_dict_path
101
+ use_space_char: *use_space_char
102
+ max_text_length: *max_text_length
103
+ - KeepKeys:
104
+ keep_keys: ['image', 'label', 'prompt_pos_idx_list',
105
+ 'prompt_char_idx_list', 'ques_pos_idx_list', 'ques1_answer_list',
106
+ 'ques2_char_idx_list', 'ques2_answer_list', 'ques3_answer', 'ques4_char_num_list',
107
+ 'ques_len_list', 'ques2_len_list', 'prompt_len_list', 'length'] # dataloader will return list in this order
108
+ sampler:
109
+ name: RatioSampler
110
+ scales: [[128, 32]] # w, h
111
+ # divided_factor: ensures the width and height can be divided by the downsampling multiple
112
+ first_bs: &bs 384
113
+ fix_bs: false
114
+ divided_factor: [4, 16] # w, h
115
+ is_training: True
116
+ loader:
117
+ shuffle: True
118
+ batch_size_per_card: *bs
119
+ drop_last: True
120
+ max_ratio: &max_ratio 4
121
+ num_workers: 4
122
+
123
+ Eval:
124
+ dataset:
125
+ name: RatioDataSet
126
+ ds_width: True
127
+ padding: *padding
128
+ data_dir_list: ['../evaluation/CUTE80',
129
+ '../evaluation/IC13_857',
130
+ '../evaluation/IC15_1811',
131
+ '../evaluation/IIIT5k',
132
+ '../evaluation/SVT',
133
+ '../evaluation/SVTP']
134
+ transforms:
135
+ - DecodeImage: # load image
136
+ img_mode: BGR
137
+ channel_first: False
138
+ - ARLabelEncode: # Class handling label
139
+ character_dict_path: *character_dict_path
140
+ use_space_char: *use_space_char
141
+ max_text_length: *max_text_length
142
+ - KeepKeys:
143
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
144
+ sampler:
145
+ name: RatioSampler
146
+ scales: [[128, 32]] # w, h
147
+ # divided_factor: ensures the width and height can be divided by the downsampling multiple
148
+ first_bs: 256
149
+ fix_bs: false
150
+ divided_factor: [4, 16] # w, h
151
+ is_training: False
152
+ loader:
153
+ shuffle: False
154
+ drop_last: False
155
+ batch_size_per_card: 256
156
+ max_ratio: *max_ratio
157
+ num_workers: 4
configs/rec/lister/focalsvtr_lister_wo_fem_maxratio12.yml ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/focalsvtr_lister_wo_fem_maxratio12/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: &use_space_char False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_focalsvtr_lister_wo_fem_maxratio12.txt
19
+ use_amp: True
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: LISTER
36
+ Transform:
37
+ Encoder:
38
+ name: FocalSVTR
39
+ img_size: [32, 128]
40
+ depths: [6, 6, 9]
41
+ embed_dim: 96
42
+ sub_k: [[1, 1], [2, 1], [1, 1]]
43
+ focal_levels: [3, 3, 3]
44
+ last_stage: False
45
+ feat2d: True
46
+ Decoder:
47
+ name: LISTERDecoder
48
+ detach_grad: False
49
+ attn_scaling: True
50
+ use_fem: False
51
+
52
+ Loss:
53
+ name: LISTERLoss
54
+
55
+ PostProcess:
56
+ name: LISTERLabelDecode
57
+
58
+ Metric:
59
+ name: RecMetric
60
+ main_indicator: acc
61
+ is_filter: True
62
+
63
+ Train:
64
+ dataset:
65
+ name: RatioDataSetTVResize
66
+ ds_width: True
67
+ padding: False
68
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_filter_train_challenging',
69
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_hard',
70
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_medium',
71
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_normal',
72
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_easy',
73
+ ]
74
+ transforms:
75
+ - DecodeImagePIL: # load image
76
+ img_mode: RGB
77
+ - PARSeqAugPIL:
78
+ - EPLabelEncode: # Class handling label
79
+ character_dict_path: *character_dict_path
80
+ use_space_char: *use_space_char
81
+ max_text_length: *max_text_length
82
+ - KeepKeys:
83
+ keep_keys: ['image', 'label', 'length']
84
+ sampler:
85
+ name: RatioSampler
86
+ scales: [[128, 32]] # w, h
87
+ # divided_factor: ensures the width and height can be divided by the downsampling multiple
88
+ first_bs: &bs 256
89
+ fix_bs: false
90
+ divided_factor: [4, 16] # w, h
91
+ is_training: True
92
+ loader:
93
+ shuffle: True
94
+ batch_size_per_card: *bs
95
+ drop_last: True
96
+ max_ratio: 12
97
+ num_workers: 4
98
+
99
+ Eval:
100
+ dataset:
101
+ name: RatioDataSetTVResize
102
+ ds_width: True
103
+ padding: False
104
+ data_dir_list: ['../evaluation/CUTE80',
105
+ '../evaluation/IC13_857',
106
+ '../evaluation/IC15_1811',
107
+ '../evaluation/IIIT5k',
108
+ '../evaluation/SVT',
109
+ '../evaluation/SVTP',
110
+ ]
111
+ transforms:
112
+ - DecodeImagePIL: # load image
113
+ img_mode: RGB
114
+ - EPLabelEncode: # Class handling label
115
+ character_dict_path: *character_dict_path
116
+ use_space_char: *use_space_char
117
+ max_text_length: *max_text_length
118
+ - KeepKeys:
119
+ keep_keys: ['image', 'label', 'length']
120
+ sampler:
121
+ name: RatioSampler
122
+ scales: [[128, 32]] # w, h
123
+ # divided_factor: ensures the width and height can be divided by the downsampling multiple
124
+ first_bs: 256
125
+ fix_bs: false
126
+ divided_factor: [4, 16] # w, h
127
+ is_training: False
128
+ loader:
129
+ shuffle: False
130
+ drop_last: False
131
+ batch_size_per_card: *bs
132
+ max_ratio: 12
133
+ num_workers: 4
configs/rec/lister/svtrv2_lister_wo_fem_maxratio12.yml ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_lister_wo_fem_maxratio12/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: &use_space_char False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_lister_wo_fem_maxratio12.txt
19
+ use_amp: True
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.000325
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+
31
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
32
+ cycle_momentum: False
33
+
34
+ Architecture:
35
+ model_type: rec
36
+ algorithm: LISTER
37
+ Transform:
38
+ Encoder:
39
+ name: SVTRv2LNConvTwo33
40
+ use_pos_embed: False
41
+ out_channels: 256
42
+ dims: [128, 256, 384]
43
+ depths: [6, 6, 6]
44
+ num_heads: [4, 8, 12]
45
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
46
+ local_k: [[5, 5], [5, 5], [-1, -1]]
47
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
48
+ last_stage: false
49
+ feat2d: True
50
+ Decoder:
51
+ name: LISTERDecoder
52
+ detach_grad: False
53
+ attn_scaling: True
54
+ use_fem: False
55
+
56
+ Loss:
57
+ name: LISTERLoss
58
+
59
+ PostProcess:
60
+ name: LISTERLabelDecode
61
+
62
+ Metric:
63
+ name: RecMetric
64
+ main_indicator: acc
65
+ is_filter: True
66
+
67
+ Train:
68
+ dataset:
69
+ name: RatioDataSetTVResize
70
+ ds_width: True
71
+ padding: False
72
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
73
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
75
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
76
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
77
+ ]
78
+ transforms:
79
+ - DecodeImagePIL: # load image
80
+ img_mode: RGB
81
+ - PARSeqAugPIL:
82
+ - EPLabelEncode: # Class handling label
83
+ character_dict_path: *character_dict_path
84
+ use_space_char: *use_space_char
85
+ max_text_length: *max_text_length
86
+ - KeepKeys:
87
+ keep_keys: ['image', 'label', 'length']
88
+ sampler:
89
+ name: RatioSampler
90
+ scales: [[128, 32]] # w, h
91
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
92
+ first_bs: &bs 128
93
+ fix_bs: false
94
+ divided_factor: [4, 16] # w, h
95
+ is_training: True
96
+ loader:
97
+ shuffle: True
98
+ batch_size_per_card: *bs
99
+ drop_last: True
100
+ max_ratio: 12
101
+ num_workers: 4
102
+
103
+ Eval:
104
+ dataset:
105
+ name: RatioDataSetTVResize
106
+ ds_width: True
107
+ padding: False
108
+ data_dir_list: ['../evaluation/CUTE80',
109
+ '../evaluation/IC13_857',
110
+ '../evaluation/IC15_1811',
111
+ '../evaluation/IIIT5k',
112
+ '../evaluation/SVT',
113
+ '../evaluation/SVTP',
114
+ ]
115
+ transforms:
116
+ - DecodeImagePIL: # load image
117
+ img_mode: RGB
118
+ - EPLabelEncode: # Class handling label
119
+ character_dict_path: *character_dict_path
120
+ use_space_char: *use_space_char
121
+ max_text_length: *max_text_length
122
+
123
+ - KeepKeys:
124
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
125
+ sampler:
126
+ name: RatioSampler
127
+ scales: [[128, 32]] # w, h
128
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
129
+ first_bs: 256
130
+ fix_bs: false
131
+ divided_factor: [4, 16] # w, h
132
+ is_training: False
133
+ loader:
134
+ shuffle: False
135
+ drop_last: False
136
+ batch_size_per_card: *bs
137
+ max_ratio: 12
138
+ num_workers: 4
configs/rec/lpv/svtr_base_lpv.yml ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtr_base_lpv/
7
+ save_epoch_step: 1
8
+ # evaluation is run every 500 iterations (see eval_batch_step below)
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ # ./output/rec/u14m_filter/svtr_base_lpv_wo_glrm/best.pth
14
+ checkpoints:
15
+ use_tensorboard: false
16
+ infer_img:
17
+ # for data or label process
18
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/u14m_filter/predicts_svtr_lpv.txt
23
+ use_amp: True
24
+ grad_clip_val: 20
25
+
26
+ Optimizer:
27
+ name: Adam
28
+ lr: 0.0001 # for 4gpus bs128/gpu
29
+ weight_decay: 0.0
30
+ filter_bias_and_bn: False
31
+ betas: [0.9, 0.99]
32
+
33
+ LRScheduler:
34
+ name: MultiStepLR
35
+ milestones: [12]
36
+ gamma: 0.1
37
+
38
+ Architecture:
39
+ model_type: rec
40
+ algorithm: LPV
41
+ in_channels: 3
42
+ Transform:
43
+ Encoder:
44
+ name: SVTRNet
45
+ img_size: [32, 128]
46
+ out_char_num: 25
47
+ out_channels: 256
48
+ patch_merging: 'Conv'
49
+ embed_dim: [128, 256, 384]
50
+ depth: [6, 6, 6]
51
+ num_heads: [4, 8, 12]
52
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
53
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
54
+ sub_k: [[1, 1], [1, 1]]
55
+ feature2d: True
56
+ last_stage: False
57
+ prenorm: True
58
+ Decoder:
59
+ name: LPVDecoder
60
+ num_layer: 3
61
+ max_len: *max_text_length
62
+ use_mask: True
63
+ dim_feedforward: 1536
64
+ nhead: 12
65
+ dropout: 0.1
66
+ trans_layer: 3
67
+
68
+ Loss:
69
+ name: LPVLoss
70
+
71
+ PostProcess:
72
+ name: ARLabelDecode
73
+ character_dict_path: *character_dict_path
74
+ use_space_char: *use_space_char
75
+
76
+ Metric:
77
+ name: RecMetric
78
+ main_indicator: acc
79
+ is_filter: True
80
+
81
+ Train:
82
+ dataset:
83
+ name: LMDBDataSet
84
+ data_dir: ../Union14M-L-LMDB-Filtered
85
+ transforms:
86
+ - DecodeImagePIL: # load image
87
+ img_mode: RGB
88
+ - PARSeqAugPIL:
89
+ - ARLabelEncode: # Class handling label
90
+ character_dict_path: *character_dict_path
91
+ use_space_char: *use_space_char
92
+ max_text_length: *max_text_length
93
+ - RecTVResize:
94
+ image_shape: [32, 128]
95
+ padding: False
96
+ - KeepKeys:
97
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
98
+ loader:
99
+ shuffle: True
100
+ batch_size_per_card: 128
101
+ drop_last: True
102
+ num_workers: 4
103
+
104
+ Eval:
105
+ dataset:
106
+ name: LMDBDataSet
107
+ data_dir: ../evaluation/
108
+ transforms:
109
+ - DecodeImagePIL: # load image
110
+ img_mode: RGB
111
+ - ARLabelEncode: # Class handling label
112
+ character_dict_path: *character_dict_path
113
+ use_space_char: *use_space_char
114
+ max_text_length: *max_text_length
115
+ - RecTVResize:
116
+ image_shape: [32, 128]
117
+ padding: False
118
+ - KeepKeys:
119
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
120
+ loader:
121
+ shuffle: False
122
+ drop_last: False
123
+ batch_size_per_card: 128
124
+ num_workers: 4
configs/rec/lpv/svtr_base_lpv_wo_glrm.yml ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtr_base_lpv_wo_glrm/
7
+ save_epoch_step: 1
8
+ # evaluation is run every 500 iterations (see eval_batch_step below)
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_lpv_wo_glrm.txt
22
+ use_amp: True
23
+ grad_clip_val: 20
24
+
25
+ Optimizer:
26
+ name: Adam
27
+ lr: 0.0001 # for 4gpus bs128/gpu
28
+ weight_decay: 0.0
29
+ filter_bias_and_bn: False
30
+ betas: [0.9, 0.99]
31
+
32
+ LRScheduler:
33
+ name: MultiStepLR
34
+ milestones: [12]
35
+ gamma: 0.1
36
+
37
+ Architecture:
38
+ model_type: rec
39
+ algorithm: LPV
40
+ in_channels: 3
41
+ Transform:
42
+ Encoder:
43
+ name: SVTRNet
44
+ img_size: [32, 128]
45
+ out_char_num: 25
46
+ out_channels: 256
47
+ patch_merging: 'Conv'
48
+ embed_dim: [128, 256, 384]
49
+ depth: [6, 6, 6]
50
+ num_heads: [4, 8, 12]
51
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
52
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
53
+ sub_k: [[1, 1], [1, 1]]
54
+ feature2d: True
55
+ last_stage: False
56
+ prenorm: True
57
+ Decoder:
58
+ name: LPVDecoder
59
+ num_layer: 3
60
+ max_len: *max_text_length
61
+ use_mask: False
62
+ dim_feedforward: 1536
63
+ nhead: 12
64
+ dropout: 0.1
65
+ trans_layer: 3
66
+
67
+ Loss:
68
+ name: LPVLoss
69
+
70
+ PostProcess:
71
+ name: ARLabelDecode
72
+ character_dict_path: *character_dict_path
73
+ use_space_char: *use_space_char
74
+
75
+ Metric:
76
+ name: RecMetric
77
+ main_indicator: acc
78
+ is_filter: True
79
+
80
+ Train:
81
+ dataset:
82
+ name: LMDBDataSet
83
+ data_dir: ../Union14M-L-LMDB-Filtered
84
+ transforms:
85
+ - DecodeImagePIL: # load image
86
+ img_mode: RGB
87
+ - PARSeqAugPIL:
88
+ - ARLabelEncode: # Class handling label
89
+ character_dict_path: *character_dict_path
90
+ use_space_char: *use_space_char
91
+ max_text_length: *max_text_length
92
+ - RecTVResize:
93
+ image_shape: [32, 128]
94
+ padding: False
95
+ - KeepKeys:
96
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
97
+ loader:
98
+ shuffle: True
99
+ batch_size_per_card: 128
100
+ drop_last: True
101
+ num_workers: 4
102
+
103
+ Eval:
104
+ dataset:
105
+ name: LMDBDataSet
106
+ data_dir: ../evaluation/
107
+ transforms:
108
+ - DecodeImagePIL: # load image
109
+ img_mode: RGB
110
+ - ARLabelEncode: # Class handling label
111
+ character_dict_path: *character_dict_path
112
+ use_space_char: *use_space_char
113
+ max_text_length: *max_text_length
114
+ - RecTVResize:
115
+ image_shape: [32, 128]
116
+ padding: False
117
+ - KeepKeys:
118
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
119
+ loader:
120
+ shuffle: False
121
+ drop_last: False
122
+ batch_size_per_card: 128
123
+ num_workers: 4
configs/rec/lpv/svtrv2_lpv.yml ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_lpv/
7
+ save_epoch_step: 1
8
+ # evaluation is run every 500 iterations (see eval_batch_step below)
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ # ./output/rec/u14m_filter/svtrv2_lpv_wo_glrm/best.pth
14
+ checkpoints:
15
+ use_tensorboard: false
16
+ infer_img:
17
+ # for data or label process
18
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_lpv.txt
23
+ use_amp: True
24
+ grad_clip_val: 20
25
+
26
+ Optimizer:
27
+ name: AdamW
28
+ lr: 0.000325 # for 4gpus bs128/gpu
29
+ weight_decay: 0.05
30
+ filter_bias_and_bn: True
31
+
32
+ LRScheduler:
33
+ name: OneCycleLR
34
+ warmup_epoch: 1 # pct_start 0.075*20 = 1.5ep
35
+ cycle_momentum: False
36
+
37
+ Architecture:
38
+ model_type: rec
39
+ algorithm: LPV
40
+ in_channels: 3
41
+ Transform:
42
+ Encoder:
43
+ name: SVTRv2LNConvTwo33
44
+ use_pos_embed: False
45
+ dims: [128, 256, 384]
46
+ depths: [6, 6, 6]
47
+ num_heads: [4, 8, 12]
48
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
49
+ local_k: [[5, 5], [5, 5], [-1, -1]]
50
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
51
+ last_stage: false
52
+ feat2d: True
53
+ Decoder:
54
+ name: LPVDecoder
55
+ num_layer: 3
56
+ max_len: *max_text_length
57
+ use_mask: True
58
+ dim_feedforward: 1536
59
+ nhead: 12
60
+ dropout: 0.1
61
+ trans_layer: 3
62
+
63
+ Loss:
64
+ name: LPVLoss
65
+
66
+ PostProcess:
67
+ name: ARLabelDecode
68
+ character_dict_path: *character_dict_path
69
+ use_space_char: *use_space_char
70
+
71
+ Metric:
72
+ name: RecMetric
73
+ main_indicator: acc
74
+ is_filter: True
75
+
76
+ Train:
77
+ dataset:
78
+ name: RatioDataSetTVResize
79
+ ds_width: True
80
+ padding: false
81
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
82
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
83
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
84
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
85
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
86
+ ]
87
+ transforms:
88
+ - DecodeImagePIL: # load image
89
+ img_mode: RGB
90
+ - PARSeqAugPIL:
91
+ - ARLabelEncode: # Class handling label
92
+ character_dict_path: *character_dict_path
93
+ use_space_char: *use_space_char
94
+ max_text_length: *max_text_length
95
+ - KeepKeys:
96
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
97
+ sampler:
98
+ name: RatioSampler
99
+ scales: [[128, 32]] # w, h
100
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
101
+ first_bs: &bs 128
102
+ fix_bs: false
103
+ divided_factor: [4, 16] # w, h
104
+ is_training: True
105
+ loader:
106
+ shuffle: True
107
+ batch_size_per_card: *bs
108
+ drop_last: True
109
+ max_ratio: &max_ratio 4
110
+ num_workers: 4
111
+
112
+ Eval:
113
+ dataset:
114
+ name: RatioDataSetTVResize
115
+ ds_width: True
116
+ padding: False
117
+ data_dir_list: [
118
+ '../evaluation/CUTE80',
119
+ '../evaluation/IC13_857',
120
+ '../evaluation/IC15_1811',
121
+ '../evaluation/IIIT5k',
122
+ '../evaluation/SVT',
123
+ '../evaluation/SVTP',
124
+ ]
125
+ transforms:
126
+ - DecodeImagePIL: # load image
127
+ img_mode: RGB
128
+ - ARLabelEncode: # Class handling label
129
+ character_dict_path: *character_dict_path
130
+ use_space_char: *use_space_char
131
+ max_text_length: *max_text_length
132
+ - KeepKeys:
133
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
134
+ sampler:
135
+ name: RatioSampler
136
+ scales: [[128, 32]] # w, h
137
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
138
+ first_bs: *bs
139
+ fix_bs: false
140
+ divided_factor: [4, 16] # w, h
141
+ is_training: False
142
+ loader:
143
+ shuffle: False
144
+ drop_last: False
145
+ batch_size_per_card: *bs
146
+ max_ratio: *max_ratio
147
+ num_workers: 4
configs/rec/lpv/svtrv2_lpv_wo_glrm.yml ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_lpv_wo_glrm/
7
+ save_epoch_step: 1
8
+ # evaluation is run every 500 iterations (see eval_batch_step below)
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_lpv_wo_glrm.txt
22
+ use_amp: True
23
+ grad_clip_val: 20
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.000325 # for 4gpus bs128/gpu
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: OneCycleLR
33
+ warmup_epoch: 1 # pct_start 0.075*20 = 1.5ep
34
+ cycle_momentum: False
35
+
36
+ Architecture:
37
+ model_type: rec
38
+ algorithm: LPV
39
+ in_channels: 3
40
+ Transform:
41
+ Encoder:
42
+ name: SVTRv2LNConvTwo33
43
+ use_pos_embed: False
44
+ dims: [128, 256, 384]
45
+ depths: [6, 6, 6]
46
+ num_heads: [4, 8, 12]
47
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
48
+ local_k: [[5, 5], [5, 5], [-1, -1]]
49
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
50
+ last_stage: false
51
+ feat2d: True
52
+ Decoder:
53
+ name: LPVDecoder
54
+ num_layer: 3
55
+ max_len: *max_text_length
56
+ use_mask: False
57
+ dim_feedforward: 1536
58
+ nhead: 12
59
+ dropout: 0.1
60
+ trans_layer: 3
61
+
62
+ Loss:
63
+ name: LPVLoss
64
+
65
+ PostProcess:
66
+ name: ARLabelDecode
67
+ character_dict_path: *character_dict_path
68
+ use_space_char: *use_space_char
69
+
70
+ Metric:
71
+ name: RecMetric
72
+ main_indicator: acc
73
+ is_filter: True
74
+
75
+ Train:
76
+ dataset:
77
+ name: RatioDataSetTVResize
78
+ ds_width: True
79
+ padding: false
80
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
81
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
82
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
83
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
84
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
85
+ ]
86
+ transforms:
87
+ - DecodeImagePIL: # load image
88
+ img_mode: RGB
89
+ - PARSeqAugPIL:
90
+ - ARLabelEncode: # Class handling label
91
+ character_dict_path: *character_dict_path
92
+ use_space_char: *use_space_char
93
+ max_text_length: *max_text_length
94
+ - KeepKeys:
95
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
96
+ sampler:
97
+ name: RatioSampler
98
+ scales: [[128, 32]] # w, h
99
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
100
+ first_bs: &bs 128
101
+ fix_bs: false
102
+ divided_factor: [4, 16] # w, h
103
+ is_training: True
104
+ loader:
105
+ shuffle: True
106
+ batch_size_per_card: *bs
107
+ drop_last: True
108
+ max_ratio: &max_ratio 4
109
+ num_workers: 4
110
+
111
+ Eval:
112
+ dataset:
113
+ name: RatioDataSetTVResize
114
+ ds_width: True
115
+ padding: False
116
+ data_dir_list: [
117
+ '../evaluation/CUTE80',
118
+ '../evaluation/IC13_857',
119
+ '../evaluation/IC15_1811',
120
+ '../evaluation/IIIT5k',
121
+ '../evaluation/SVT',
122
+ '../evaluation/SVTP',
123
+ ]
124
+ transforms:
125
+ - DecodeImagePIL: # load image
126
+ img_mode: RGB
127
+ - ARLabelEncode: # Class handling label
128
+ character_dict_path: *character_dict_path
129
+ use_space_char: *use_space_char
130
+ max_text_length: *max_text_length
131
+ - KeepKeys:
132
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
133
+ sampler:
134
+ name: RatioSampler
135
+ scales: [[128, 32]] # w, h
136
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
137
+ first_bs: *bs
138
+ fix_bs: false
139
+ divided_factor: [4, 16] # w, h
140
+ is_training: False
141
+ loader:
142
+ shuffle: False
143
+ drop_last: False
144
+ batch_size_per_card: *bs
145
+ max_ratio: *max_ratio
146
+ num_workers: 4
configs/rec/maerec/vit_nrtr.yml ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 10
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/vit_nrtr_ft_mae/
7
+ save_epoch_step: 1
8
+ # evaluation is run every 500 iterations (see eval_batch_step below)
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ # ./open_ocr_vit_small_params.pth
14
+ checkpoints:
15
+ use_tensorboard: false
16
+ infer_img:
17
+ # for data or label process
18
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/u14m_filter/predicts_vit_nrtr_ft_mae.txt
23
+ use_amp: True
24
+ project_name: maerec
25
+
26
+ Optimizer:
27
+ name: AdamW
28
+ lr: 0.00065 # for 4gpus bs256/gpu
29
+ weight_decay: 0.05
30
+ filter_bias_and_bn: True
31
+
32
+ LRScheduler:
33
+ name: OneCycleLR
34
+ warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
35
+ cycle_momentum: False
36
+
37
+ Architecture:
38
+ model_type: rec
39
+ algorithm: BGPD
40
+ in_channels: 3
41
+ Transform:
42
+ Encoder:
43
+ name: ViT
44
+ img_size: [32, 128]
45
+ patch_size: [4, 4]
46
+ embed_dim: 384
47
+ depth: 12
48
+ num_heads: 6
49
+ mlp_ratio: 4
50
+ qkv_bias: True
51
+ use_cls_token: True
52
+ Decoder:
53
+ name: NRTRDecoder
54
+ num_encoder_layers: -1
55
+ beam_size: 0
56
+ num_decoder_layers: 6
57
+ nhead: 8
58
+ max_len: *max_text_length
59
+
60
+ Loss:
61
+ name: ARLoss
62
+
63
+ PostProcess:
64
+ name: ARLabelDecode
65
+ character_dict_path: *character_dict_path
66
+ use_space_char: *use_space_char
67
+
68
+ Metric:
69
+ name: RecMetric
70
+ main_indicator: acc
71
+ is_filter: True
72
+
73
+ Train:
74
+ dataset:
75
+ name: LMDBDataSet
76
+ data_dir: ../Union14M-L-LMDB-Filtered
77
+ transforms:
78
+ - DecodeImagePIL: # load image
79
+ img_mode: RGB
80
+ - PARSeqAugPIL:
81
+ - ARLabelEncode: # Class handling label
82
+ character_dict_path: *character_dict_path
83
+ use_space_char: *use_space_char
84
+ max_text_length: *max_text_length
85
+ - RecTVResize:
86
+ image_shape: [32, 128]
87
+ padding: False
88
+ - KeepKeys:
89
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
90
+ loader:
91
+ shuffle: True
92
+ batch_size_per_card: 256
93
+ drop_last: True
94
+ num_workers: 4
95
+
96
+ Eval:
97
+ dataset:
98
+ name: LMDBDataSet
99
+ data_dir: ../evaluation/
100
+ transforms:
101
+ - DecodeImagePIL: # load image
102
+ img_mode: RGB
103
+ - ARLabelEncode: # Class handling label
104
+ character_dict_path: *character_dict_path
105
+ use_space_char: *use_space_char
106
+ max_text_length: *max_text_length
107
+ - RecTVResize:
108
+ image_shape: [32, 128]
109
+ padding: False
110
+ - KeepKeys:
111
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
112
+ loader:
113
+ shuffle: False
114
+ drop_last: False
115
+ batch_size_per_card: 256
116
+ num_workers: 4
configs/rec/matrn/resnet45_trans_matrn.yml ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet45_trans_matrn/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ # ./openocr_nolang_abinet_lang.pth
12
+ checkpoints:
13
+ use_tensorboard: false
14
+ infer_img:
15
+ # for data or label process
16
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
17
+ max_text_length: 25
18
+ use_space_char: False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_matrn.txt
20
+ grad_clip_val: 20
21
+ use_amp: True
22
+
23
+ Optimizer:
24
+ name: Adam
25
+ lr: 0.000133 # 4gpus 128bs/gpu
26
+ weight_decay: 0.0
27
+ filter_bias_and_bn: False
28
+
29
+ LRScheduler:
30
+ name: MultiStepLR
31
+ milestones: [12, 18]
32
+ gamma: 0.1
33
+
34
+ Architecture:
35
+ model_type: rec
36
+ algorithm: MATRN
37
+ Transform:
38
+ Encoder:
39
+ name: ResNet45
40
+ in_channels: 3
41
+ strides: [2, 1, 2, 1, 1]
42
+ Decoder:
43
+ name: MATRNDecoder
44
+ iter_size: 3
45
+
46
+ Loss:
47
+ name: ABINetLoss
48
+ align_weight: 3.0
49
+
50
+ PostProcess:
51
+ name: ABINetLabelDecode
52
+
53
+ Metric:
54
+ name: RecMetric
55
+ main_indicator: acc
56
+ is_filter: True
57
+
58
+ Train:
59
+ dataset:
60
+ name: LMDBDataSet
61
+ data_dir: ../Union14M-L-LMDB-Filtered
62
+ transforms:
63
+ - DecodeImagePIL: # load image
64
+ img_mode: RGB
65
+ - PARSeqAugPIL:
66
+ - ABINetLabelEncode:
67
+ - RecTVResize:
68
+ image_shape: [32, 128]
69
+ padding: False
70
+ - KeepKeys:
71
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
72
+ loader:
73
+ shuffle: True
74
+ batch_size_per_card: 128
75
+ drop_last: True
76
+ num_workers: 4
77
+
78
+ Eval:
79
+ dataset:
80
+ name: LMDBDataSet
81
+ data_dir: ../evaluation
82
+ transforms:
83
+ - DecodeImagePIL: # load image
84
+ img_mode: RGB
85
+ - ABINetLabelEncode:
86
+ - RecTVResize:
87
+ image_shape: [32, 128]
88
+ padding: False
89
+ - KeepKeys:
90
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
91
+ loader:
92
+ shuffle: False
93
+ drop_last: False
94
+ batch_size_per_card: 256
95
+ num_workers: 2
configs/rec/matrn/svtrv2_matrn.yml ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_matrn/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ # ./openocr_svtrv2_nolang_abinet_lang.pth
12
+ checkpoints:
13
+ use_tensorboard: false
14
+ infer_img:
15
+ # for data or label process
16
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
17
+ max_text_length: 25
18
+ use_space_char: False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_matrn.txt
20
+ use_amp: True
21
+ grad_clip_val: 20
22
+
23
+ Optimizer:
24
+ name: AdamW
25
+ lr: 0.000325 # for 4gpus bs128/gpu
26
+ weight_decay: 0.05
27
+ filter_bias_and_bn: True
28
+
29
+ LRScheduler:
30
+ name: OneCycleLR
31
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
32
+ cycle_momentum: False
33
+
34
+ Architecture:
35
+ model_type: rec
36
+ algorithm: MATRN
37
+ Transform:
38
+ Encoder:
39
+ name: SVTRv2LNConvTwo33
40
+ use_pos_embed: False
41
+ dims: [128, 256, 384]
42
+ depths: [6, 6, 6]
43
+ num_heads: [4, 8, 12]
44
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
45
+ local_k: [[5, 5], [5, 5], [-1, -1]]
46
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
47
+ last_stage: false
48
+ feat2d: True
49
+ Decoder:
50
+ name: MATRNDecoder
51
+ iter_size: 3
52
+ num_layers: 0
53
+
54
+ Loss:
55
+ name: ABINetLoss
56
+
57
+ PostProcess:
58
+ name: ABINetLabelDecode
59
+
60
+ Metric:
61
+ name: RecMetric
62
+ main_indicator: acc
63
+ is_filter: True
64
+
65
+ Train:
66
+ dataset:
67
+ name: RatioDataSetTVResize
68
+ ds_width: True
69
+ padding: false
70
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
71
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
72
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
73
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
75
+ ]
76
+ transforms:
77
+ - DecodeImagePIL: # load image
78
+ img_mode: RGB
79
+ - PARSeqAugPIL:
80
+ - ABINetLabelEncode:
81
+ - KeepKeys:
82
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
83
+ sampler:
84
+ name: RatioSampler
85
+ scales: [[128, 32]] # w, h
86
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
87
+ first_bs: &bs 128
88
+ fix_bs: false
89
+ divided_factor: [4, 16] # w, h
90
+ is_training: True
91
+ loader:
92
+ shuffle: True
93
+ batch_size_per_card: *bs
94
+ drop_last: True
95
+ max_ratio: &max_ratio 4
96
+ num_workers: 4
97
+
98
+ Eval:
99
+ dataset:
100
+ name: RatioDataSetTVResize
101
+ ds_width: True
102
+ padding: False
103
+ data_dir_list: [
104
+ '../evaluation/CUTE80',
105
+ '../evaluation/IC13_857',
106
+ '../evaluation/IC15_1811',
107
+ '../evaluation/IIIT5k',
108
+ '../evaluation/SVT',
109
+ '../evaluation/SVTP',
110
+ ]
111
+ transforms:
112
+ - DecodeImagePIL: # load image
113
+ img_mode: RGB
114
+ - ABINetLabelEncode:
115
+ - KeepKeys:
116
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
117
+ sampler:
118
+ name: RatioSampler
119
+ scales: [[128, 32]] # w, h
120
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
121
+ first_bs: *bs
122
+ fix_bs: false
123
+ divided_factor: [4, 16] # w, h
124
+ is_training: False
125
+ loader:
126
+ shuffle: False
127
+ drop_last: False
128
+ batch_size_per_card: *bs
129
+ max_ratio: *max_ratio
130
+ num_workers: 4
configs/rec/mgpstr/svtrv2_mgpstr_only_char.yml ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_mgpstr_only_char/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: &use_space_char False
18
+ use_amp: True
19
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_mgpstr_only_char.txt
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.00065 # 4gpus 256bs/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+
27
+ LRScheduler:
28
+ name: OneCycleLR
29
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
30
+ cycle_momentum: False
31
+
32
+ Architecture:
33
+ model_type: rec
34
+ algorithm: MGPSTR
35
+ Transform:
36
+ Encoder:
37
+ name: SVTRv2LNConvTwo33
38
+ use_pos_embed: False
39
+ out_channels: 256
40
+ dims: [128, 256, 384]
41
+ depths: [6, 6, 6]
42
+ num_heads: [4, 8, 12]
43
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
44
+ local_k: [[5, 5], [5, 5], [-1, -1]]
45
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
46
+ last_stage: false
47
+ feat2d: false
48
+ Decoder:
49
+ name: MGPDecoder
50
+ only_char: &only_char True
51
+
52
+ Loss:
53
+ name: MGPLoss
54
+ only_char: *only_char
55
+
56
+ PostProcess:
57
+ name: MPGLabelDecode
58
+ character_dict_path: *character_dict_path
59
+ use_space_char: *use_space_char
60
+ only_char: *only_char
61
+
62
+ Metric:
63
+ name: RecMetric
64
+ main_indicator: acc
65
+ is_filter: True
66
+
67
+ Train:
68
+ dataset:
69
+ name: RatioDataSetTVResize
70
+ ds_width: True
71
+ padding: false
72
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_filter_train_challenging',
73
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_hard',
74
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_medium',
75
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_normal',
76
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_easy',
77
+ ]
78
+ transforms:
79
+ - DecodeImagePIL: # load image
80
+ img_mode: RGB
81
+ - PARSeqAugPIL:
82
+ - MGPLabelEncode: # Class handling label
83
+ character_dict_path: *character_dict_path
84
+ use_space_char: *use_space_char
85
+ max_text_length: *max_text_length
86
+ only_char: *only_char
87
+ - KeepKeys:
88
+ keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
89
+ sampler:
90
+ name: RatioSampler
91
+ scales: [[128, 32]] # w, h
92
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
93
+ first_bs: &bs 256
94
+ fix_bs: false
95
+ divided_factor: [4, 16] # w, h
96
+ is_training: True
97
+ loader:
98
+ shuffle: True
99
+ batch_size_per_card: *bs
100
+ drop_last: True
101
+ max_ratio: &max_ratio 4
102
+ num_workers: 4
103
+
104
+ Eval:
105
+ dataset:
106
+ name: RatioDataSetTVResize
107
+ ds_width: True
108
+ padding: False
109
+ data_dir_list: [
110
+ '../evaluation/CUTE80',
111
+ '../evaluation/IC13_857',
112
+ '../evaluation/IC15_1811',
113
+ '../evaluation/IIIT5k',
114
+ '../evaluation/SVT',
115
+ '../evaluation/SVTP',
116
+ ]
117
+ transforms:
118
+ - DecodeImagePIL: # load image
119
+ img_mode: RGB
120
+ - MGPLabelEncode: # Class handling label
121
+ character_dict_path: *character_dict_path
122
+ use_space_char: *use_space_char
123
+ max_text_length: *max_text_length
124
+ only_char: *only_char
125
+ - KeepKeys:
126
+ keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
127
+ sampler:
128
+ name: RatioSampler
129
+ scales: [[128, 32]] # w, h
130
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
131
+ first_bs: *bs
132
+ fix_bs: false
133
+ divided_factor: [4, 16] # w, h
134
+ is_training: False
135
+ loader:
136
+ shuffle: False
137
+ drop_last: False
138
+ batch_size_per_card: *bs
139
+ max_ratio: *max_ratio
140
+ num_workers: 4
configs/rec/mgpstr/vit_base_mgpstr_only_char.yml ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/vit_base_mgpstr/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: False
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: &use_space_char False
18
+ use_amp: True
19
+ save_res_path: ./output/rec/u14m_filter/predicts_vit_mgpstr_only_char.txt
20
+ grad_clip_val: 5
21
+ project_name: mgpstr_base
22
+
23
+ Optimizer:
24
+ name: Adam
25
+ lr: 0.000325 # 4gpus 128bs/gpu
26
+ weight_decay: 0.
27
+ filter_bias_and_bn: False
28
+
29
+ LRScheduler:
30
+ name: OneCycleLR
31
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
32
+ cycle_momentum: False
33
+
34
+ Architecture:
35
+ model_type: rec
36
+ algorithm: MGPSTR
37
+ Transform:
38
+ Encoder:
39
+ name: ViT
40
+ img_size: [32,128]
41
+ patch_size: [4, 4]
42
+ embed_dim: 768
43
+ depth: 12
44
+ num_heads: 12
45
+ mlp_ratio: 4
46
+ qkv_bias: True
47
+ Decoder:
48
+ name: MGPDecoder
49
+ only_char: &only_char True
50
+
51
+ Loss:
52
+ name: MGPLoss
53
+ only_char: *only_char
54
+
55
+ PostProcess:
56
+ name: MPGLabelDecode
57
+ character_dict_path: *character_dict_path
58
+ use_space_char: *use_space_char
59
+ only_char: *only_char
60
+
61
+ Metric:
62
+ name: RecMetric
63
+ main_indicator: acc
64
+ is_filter: True
65
+
66
+ Train:
67
+ dataset:
68
+ name: LMDBDataSet
69
+ data_dir: ../Union14M-L-LMDB-Filtered
70
+ transforms:
71
+ - DecodeImagePIL: # load image
72
+ img_mode: RGB
73
+ - PARSeqAugPIL:
74
+ - MGPLabelEncode: # Class handling label
75
+ character_dict_path: *character_dict_path
76
+ use_space_char: *use_space_char
77
+ max_text_length: *max_text_length
78
+ only_char: *only_char
79
+ - RecTVResize:
80
+ image_shape: [32, 128]
81
+ padding: False
82
+ - KeepKeys:
83
+ keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
84
+ loader:
85
+ shuffle: True
86
+ batch_size_per_card: 128
87
+ drop_last: True
88
+ num_workers: 4
89
+
90
+ Eval:
91
+ dataset:
92
+ name: LMDBDataSet
93
+ data_dir: ../evaluation/
94
+ transforms:
95
+ - DecodeImagePIL: # load image
96
+ img_mode: RGB
97
+ - MGPLabelEncode: # Class handling label
98
+ character_dict_path: *character_dict_path
99
+ use_space_char: *use_space_char
100
+ max_text_length: *max_text_length
101
+ only_char: *only_char
102
+ - RecTVResize:
103
+ image_shape: [32, 128]
104
+ padding: False
105
+ - KeepKeys:
106
+ keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
107
+ loader:
108
+ shuffle: False
109
+ drop_last: False
110
+ batch_size_per_card: 256
111
+ num_workers: 2
configs/rec/mgpstr/vit_large_mgpstr_only_char.yml ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/vit_base_mgpstr_only_char/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: False
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: &use_space_char False
18
+ use_amp: True
19
+ save_res_path: ./output/rec/u14m_filter/predicts_vit_mgpstr_only_char.txt
20
+ grad_clip_val: 5
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.000325 # 4gpus 128bs/gpu
25
+ weight_decay: 0.
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: MGPSTR
36
+ Transform:
37
+ Encoder:
38
+ name: ViT
39
+ img_size: [32,128]
40
+ patch_size: [4, 4]
41
+ embed_dim: 1024
42
+ depth: 24
43
+ num_heads: 16
44
+ mlp_ratio: 4
45
+ qkv_bias: True
46
+ Decoder:
47
+ name: MGPDecoder
48
+ only_char: &only_char True
49
+
50
+ Loss:
51
+ name: MGPLoss
52
+ only_char: *only_char
53
+
54
+ PostProcess:
55
+ name: MPGLabelDecode
56
+ character_dict_path: *character_dict_path
57
+ use_space_char: *use_space_char
58
+ only_char: *only_char
59
+
60
+ Metric:
61
+ name: RecMetric
62
+ main_indicator: acc
63
+ is_filter: True
64
+
65
+ Train:
66
+ dataset:
67
+ name: LMDBDataSet
68
+ data_dir: ../Union14M-L-LMDB-Filtered
69
+ transforms:
70
+ - DecodeImagePIL: # load image
71
+ img_mode: RGB
72
+ - PARSeqAugPIL:
73
+ - MGPLabelEncode: # Class handling label
74
+ character_dict_path: *character_dict_path
75
+ use_space_char: *use_space_char
76
+ max_text_length: *max_text_length
77
+ only_char: *only_char
78
+ - RecTVResize:
79
+ image_shape: [32, 128]
80
+ padding: False
81
+ - KeepKeys:
82
+ keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
83
+ loader:
84
+ shuffle: True
85
+ batch_size_per_card: 128
86
+ drop_last: True
87
+ num_workers: 4
88
+
89
+ Eval:
90
+ dataset:
91
+ name: LMDBDataSet
92
+ data_dir: ../evaluation/
93
+ transforms:
94
+ - DecodeImagePIL: # load image
95
+ img_mode: RGB
96
+ - MGPLabelEncode: # Class handling label
97
+ character_dict_path: *character_dict_path
98
+ use_space_char: *use_space_char
99
+ max_text_length: *max_text_length
100
+ only_char: *only_char
101
+ - RecTVResize:
102
+ image_shape: [32, 128]
103
+ padding: False
104
+ - KeepKeys:
105
+ keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
106
+ loader:
107
+ shuffle: False
108
+ drop_last: False
109
+ batch_size_per_card: 256
110
+ num_workers: 2
configs/rec/mgpstr/vit_mgpstr.yml ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/vit_mgpstr/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [100000, 2000]
9
+ cal_metric_during_train: False
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: &use_space_char False
18
+ use_amp: True
19
+ save_res_path: ./output/rec/u14m_filter/predicts_vit_mgpstr.txt
20
+ grad_clip_val: 5
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.000325 # 4gpus 128bs/gpu
25
+ weight_decay: 0.
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: MGPSTR
36
+ Transform:
37
+ Encoder:
38
+ name: ViT
39
+ img_size: [32,128]
40
+ patch_size: [4, 4]
41
+ embed_dim: 384
42
+ depth: 12
43
+ num_heads: 6
44
+ mlp_ratio: 4
45
+ qkv_bias: True
46
+ Decoder:
47
+ name: MGPDecoder
48
+ only_char: &only_char False
49
+
50
+ Loss:
51
+ name: MGPLoss
52
+ only_char: *only_char
53
+
54
+ PostProcess:
55
+ name: MPGLabelDecode
56
+ character_dict_path: *character_dict_path
57
+ use_space_char: *use_space_char
58
+ only_char: *only_char
59
+
60
+ Metric:
61
+ name: RecMPGMetric
62
+ main_indicator: acc
63
+ is_filter: True
64
+
65
+ Train:
66
+ dataset:
67
+ name: LMDBDataSet
68
+ data_dir: ../Union14M-L-LMDB-Filtered
69
+ transforms:
70
+ - DecodeImagePIL: # load image
71
+ img_mode: RGB
72
+ - PARSeqAugPIL:
73
+ - MGPLabelEncode: # Class handling label
74
+ character_dict_path: *character_dict_path
75
+ use_space_char: *use_space_char
76
+ max_text_length: *max_text_length
77
+ only_char: *only_char
78
+ - RecTVResize:
79
+ image_shape: [32, 128]
80
+ padding: False
81
+ - KeepKeys:
82
+ keep_keys: ['image', 'char_label', 'bpe_label', 'wp_label', 'length'] # dataloader will return list in this order
83
+ loader:
84
+ shuffle: True
85
+ batch_size_per_card: 128
86
+ drop_last: True
87
+ num_workers: 4
88
+
89
+ Eval:
90
+ dataset:
91
+ name: LMDBDataSet
92
+ data_dir: ../evaluation/
93
+ transforms:
94
+ - DecodeImagePIL: # load image
95
+ img_mode: RGB
96
+ - MGPLabelEncode: # Class handling label
97
+ character_dict_path: *character_dict_path
98
+ use_space_char: *use_space_char
99
+ max_text_length: *max_text_length
100
+ only_char: *only_char
101
+ - RecTVResize:
102
+ image_shape: [32, 128]
103
+ padding: False
104
+ - KeepKeys:
105
+ keep_keys: ['image', 'char_label', 'bpe_label', 'wp_label', 'length'] # dataloader will return list in this order
106
+ loader:
107
+ shuffle: False
108
+ drop_last: False
109
+ batch_size_per_card: 256
110
+ num_workers: 2
configs/rec/mgpstr/vit_mgpstr_only_char.yml ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/vit_mgpstr_only_char/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: False
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: &use_space_char False
18
+ use_amp: True
19
+ save_res_path: ./output/rec/u14m_filter/predicts_vit_mgpstr_only_char.txt
20
+ grad_clip_val: 5
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.000325 # 4gpus 128bs/gpu
25
+ weight_decay: 0.
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: MGPSTR
36
+ Transform:
37
+ Encoder:
38
+ name: ViT
39
+ img_size: [32,128]
40
+ patch_size: [4, 4]
41
+ embed_dim: 384
42
+ depth: 12
43
+ num_heads: 6
44
+ mlp_ratio: 4
45
+ qkv_bias: True
46
+ Decoder:
47
+ name: MGPDecoder
48
+ only_char: &only_char True
49
+
50
+ Loss:
51
+ name: MGPLoss
52
+ only_char: *only_char
53
+
54
+ PostProcess:
55
+ name: MPGLabelDecode
56
+ character_dict_path: *character_dict_path
57
+ use_space_char: *use_space_char
58
+ only_char: *only_char
59
+
60
+ Metric:
61
+ name: RecMetric
62
+ main_indicator: acc
63
+ is_filter: True
64
+
65
+ Train:
66
+ dataset:
67
+ name: LMDBDataSet
68
+ data_dir: ../Union14M-L-LMDB-Filtered
69
+ transforms:
70
+ - DecodeImagePIL: # load image
71
+ img_mode: RGB
72
+ - PARSeqAugPIL:
73
+ - MGPLabelEncode: # Class handling label
74
+ character_dict_path: *character_dict_path
75
+ use_space_char: *use_space_char
76
+ max_text_length: *max_text_length
77
+ only_char: *only_char
78
+ - RecTVResize:
79
+ image_shape: [32, 128]
80
+ padding: False
81
+ - KeepKeys:
82
+ keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
83
+ loader:
84
+ shuffle: True
85
+ batch_size_per_card: 128
86
+ drop_last: True
87
+ num_workers: 4
88
+
89
+ Eval:
90
+ dataset:
91
+ name: LMDBDataSet
92
+ data_dir: ../evaluation/
93
+ transforms:
94
+ - DecodeImagePIL: # load image
95
+ img_mode: RGB
96
+ - MGPLabelEncode: # Class handling label
97
+ character_dict_path: *character_dict_path
98
+ use_space_char: *use_space_char
99
+ max_text_length: *max_text_length
100
+ only_char: *only_char
101
+ - RecTVResize:
102
+ image_shape: [32, 128]
103
+ padding: False
104
+ - KeepKeys:
105
+ keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
106
+ loader:
107
+ shuffle: False
108
+ drop_last: False
109
+ batch_size_per_card: 256
110
+ num_workers: 2
configs/rec/moran/resnet31_lstm_moran.yml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet31_lstm_moran
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/predicts_moran.txt
19
+ use_amp: True
20
+ grad_clip_val: 1.0
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.002 # for 1gpus bs1024/gpu
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: MORAN
36
+ Transform:
37
+ name: MORN
38
+ target_shape: [32, 128]
39
+ Encoder:
40
+ name: ResNet_ASTER
41
+ Decoder:
42
+ name: ASTERDecoder
43
+
44
+ Loss:
45
+ name: ARLoss
46
+
47
+ Metric:
48
+ name: RecMetric
49
+ main_indicator: acc
50
+ is_filter: True
51
+
52
+ PostProcess:
53
+ name: ARLabelDecode
54
+
55
+ Train:
56
+ dataset:
57
+ name: LMDBDataSet
58
+ data_dir: ../Union14M-L-LMDB-Filtered
59
+ transforms:
60
+ - DecodeImagePIL: # load image
61
+ img_mode: RGB
62
+ - PARSeqAugPIL:
63
+ - ARLabelEncode: # Class handling label
64
+ - RecTVResize:
65
+ image_shape: [64, 256]
66
+ padding: False
67
+ - KeepKeys:
68
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
69
+ loader:
70
+ shuffle: True
71
+ batch_size_per_card: 1024
72
+ drop_last: True
73
+ num_workers: 4
74
+
75
+ Eval:
76
+ dataset:
77
+ name: LMDBDataSet
78
+ data_dir: ../evaluation
79
+ transforms:
80
+ - DecodeImagePIL: # load image
81
+ img_mode: RGB
82
+ - ARLabelEncode: # Class handling label
83
+ - RecTVResize:
84
+ image_shape: [64, 256]
85
+ padding: False
86
+ - KeepKeys:
87
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
88
+ loader:
89
+ shuffle: False
90
+ drop_last: False
91
+ batch_size_per_card: 256
92
+ num_workers: 2
configs/rec/nrtr/focalsvtr_nrtr_maxraio12.yml ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/focalsvtr_nrtr_maxrtio12
7
+ save_epoch_step: 1
8
+ # evaluation is run every 500 iterations (see eval_batch_step below)
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img: ../ltb/img
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/u14m_filter/predicts_focalsvtr_nrtr_maxrtio12.txt
22
+ use_amp: True
23
+
24
+ Optimizer:
25
+ name: AdamW
26
+ lr: 0.00065 # for 4gpus bs256/gpu
27
+ weight_decay: 0.05
28
+ filter_bias_and_bn: True
29
+
30
+ LRScheduler:
31
+ name: OneCycleLR
32
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
33
+ cycle_momentum: False
34
+
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: NRTR
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: FocalSVTR
42
+ img_size: [32, 128]
43
+ depths: [6, 6, 6]
44
+ embed_dim: 96
45
+ sub_k: [[1, 1], [2, 1], [1, 1]]
46
+ focal_levels: [3, 3, 3]
47
+ last_stage: False
48
+ Decoder:
49
+ name: NRTRDecoder
50
+ num_encoder_layers: -1
51
+ beam_size: 0
52
+ num_decoder_layers: 2
53
+ nhead: 12
54
+ max_len: *max_text_length
55
+
56
+ Loss:
57
+ name: ARLoss
58
+
59
+ PostProcess:
60
+ name: ARLabelDecode
61
+ character_dict_path: *character_dict_path
62
+ use_space_char: *use_space_char
63
+
64
+ Metric:
65
+ name: RecMetric
66
+ main_indicator: acc
67
+ is_filter: True
68
+
69
+ Train:
70
+ dataset:
71
+ name: RatioDataSet
72
+ ds_width: True
73
+ padding: &padding True
74
+ padding_rand: True
75
+ padding_doub: True
76
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
77
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
78
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
79
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
80
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
81
+ ]
82
+ transforms:
83
+ - DecodeImage: # load image
84
+ img_mode: BGR
85
+ channel_first: False
86
+ - PARSeqAug:
87
+ - ARLabelEncode: # Class handling label
88
+ character_dict_path: *character_dict_path
89
+ use_space_char: *use_space_char
90
+ max_text_length: *max_text_length
91
+ - KeepKeys:
92
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
93
+ sampler:
94
+ name: RatioSampler
95
+ scales: [[128, 32]] # w, h
96
+ # divided_factor: ensures the width and height are divisible by the downsampling factor
97
+ first_bs: &bs 256
98
+ fix_bs: false
99
+ divided_factor: [4, 16] # w, h
100
+ is_training: True
101
+ loader:
102
+ shuffle: True
103
+ batch_size_per_card: *bs
104
+ drop_last: True
105
+ max_ratio: &max_ratio 12
106
+ num_workers: 4
107
+
108
+ Eval:
109
+ dataset:
110
+ name: RatioDataSet
111
+ ds_width: True
112
+ padding: False
113
+ padding_rand: False
114
+ data_dir_list: [
115
+ '../evaluation/CUTE80',
116
+ '../evaluation/IC13_857',
117
+ '../evaluation/IC15_1811',
118
+ '../evaluation/IIIT5k',
119
+ '../evaluation/SVT',
120
+ '../evaluation/SVTP',
121
+ ]
122
+ transforms:
123
+ - DecodeImage: # load image
124
+ img_mode: BGR
125
+ channel_first: False
126
+ - ARLabelEncode: # Class handling label
127
+ character_dict_path: *character_dict_path
128
+ use_space_char: *use_space_char
129
+ max_text_length: *max_text_length
130
+ - KeepKeys:
131
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
132
+ sampler:
133
+ name: RatioSampler
134
+ scales: [[128, 32]] # w, h
135
+ # divided_factor: ensures the width and height are divisible by the downsampling factor
136
+ first_bs: 128
137
+ fix_bs: false
138
+ divided_factor: [4, 16] # w, h
139
+ is_training: False
140
+ loader:
141
+ shuffle: False
142
+ drop_last: False
143
+ max_ratio: *max_ratio
144
+ batch_size_per_card: 128
145
+ num_workers: 4
configs/rec/nrtr/nrtr.yml ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/nrtr/
7
+ save_epoch_step: 1
8
+ # evaluation is run every 500 iterations (see eval_batch_step below)
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/u14m_filter/predicts_nrtr.txt
22
+ use_amp: True
23
+
24
+ Optimizer:
25
+ name: AdamW
26
+ lr: 0.00065 # for 4gpus bs256/gpu
27
+ weight_decay: 0.05
28
+ filter_bias_and_bn: True
29
+
30
+ LRScheduler:
31
+ name: OneCycleLR
32
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
33
+ cycle_momentum: False
34
+
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: BGPD
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: NRTREncoder
42
+ Decoder:
43
+ name: NRTRDecoder
44
+ num_encoder_layers: 6
45
+ beam_size: 0
46
+ num_decoder_layers: 6
47
+ nhead: 8
48
+ max_len: *max_text_length
49
+
50
+
51
+ Loss:
52
+ name: ARLoss
53
+
54
+ PostProcess:
55
+ name: ARLabelDecode
56
+ character_dict_path: *character_dict_path
57
+ use_space_char: *use_space_char
58
+
59
+ Metric:
60
+ name: RecMetric
61
+ main_indicator: acc
62
+ is_filter: True
63
+
64
+ Train:
65
+ dataset:
66
+ name: LMDBDataSet
67
+ data_dir: ../Union14M-L-LMDB-Filtered
68
+ transforms:
69
+ - DecodeImagePIL: # load image
70
+ img_mode: RGB
71
+ - PARSeqAugPIL:
72
+ - ARLabelEncode: # Class handling label
73
+ character_dict_path: *character_dict_path
74
+ use_space_char: *use_space_char
75
+ max_text_length: *max_text_length
76
+ - RecTVResize:
77
+ image_shape: [32, 128]
78
+ padding: False
79
+ - KeepKeys:
80
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
81
+ loader:
82
+ shuffle: True
83
+ batch_size_per_card: 256
84
+ drop_last: True
85
+ num_workers: 4
86
+
87
+ Eval:
88
+ dataset:
89
+ name: LMDBDataSet
90
+ data_dir: ../evaluation/
91
+ transforms:
92
+ - DecodeImagePIL: # load image
93
+ img_mode: RGB
94
+ - ARLabelEncode: # Class handling label
95
+ character_dict_path: *character_dict_path
96
+ use_space_char: *use_space_char
97
+ max_text_length: *max_text_length
98
+ - RecTVResize:
99
+ image_shape: [32, 128]
100
+ padding: False
101
+ - KeepKeys:
102
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
103
+ loader:
104
+ shuffle: False
105
+ drop_last: False
106
+ batch_size_per_card: 256
107
+ num_workers: 2