# RTMDet instance-segmentation config for single-class text-line detection,
# written for the MMDetection 3.x / MMEngine config system.
#
# Pipelines, datasets and schedule numbers that are used in several places
# are defined once and referenced by name so the copies cannot drift apart.

default_scope = 'mmdet'

# ---------------------------------------------------------------------------
# Schedule and batch-size constants
# ---------------------------------------------------------------------------
max_epochs = 12
stage2_num_epochs = 2  # last epochs run with `train_pipeline_stage2`
base_lr = 0.00025
interval = 12  # validation interval (in epochs) used by `train_cfg`

train_batch_size_per_gpu = 2
val_batch_size_per_gpu = 1
train_num_workers = 1
# NOTE(review): the head below is built with num_classes=80 although only one
# class is declared here. Presumably kept to match the weights at `load_from`;
# confirm before changing either value.
num_classes = 1

# ---------------------------------------------------------------------------
# Runtime
# ---------------------------------------------------------------------------
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=100),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(
        type='CheckpointHook', interval=1, max_keep_ckpts=5,
        save_best='auto'),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='DetVisualizationHook'))

env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'))

vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='DetLocalVisualizer',
    vis_backends=vis_backends,
    name='visualizer',
    save_dir='./')

log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
log_level = 'INFO'
load_from = './model.pth'
resume = True
launcher = 'pytorch'
work_dir = '/home/erik/Riksarkivet/Projects/HTR_Pipeline/models/checkpoints/rtmdet_lines_pr_2'

# ---------------------------------------------------------------------------
# Dataset locations and meta information
# ---------------------------------------------------------------------------
dataset_type = 'CocoDataset'
data_root = 'data/coco/'  # not referenced by any other setting in this file
file_client_args = dict(backend='disk')

# `classes` is a tuple of names (note the trailing comma): a bare string
# would be indexed character by character when a label is mapped to a name.
metainfo = dict(classes=('text_line', ), palette=[(220, 20, 60)])

police_records_root = '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/'
# Single source of truth for the police-records annotations: the training
# set, the val/test dataloaders and the evaluators all read this file.
police_records_ann_file = police_records_root + 'gt_files/coco_lines2.json'
icdar_2019_root = '/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/clean/'

# ---------------------------------------------------------------------------
# Data pipelines
# ---------------------------------------------------------------------------
# Stage-1 training pipeline (mosaic + mixup augmentation).
train_pipeline = [
    dict(type='LoadImageFromFile', file_client_args=file_client_args),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,
        poly2mask=False),
    dict(type='CachedMosaic', img_scale=(640, 640), pad_val=114.0),
    dict(
        type='RandomResize',
        scale=(1280, 1280),
        ratio_range=(0.1, 2.0),
        keep_ratio=True),
    dict(
        type='RandomCrop',
        crop_size=(640, 640),
        recompute_bbox=True,
        allow_negative_crop=True),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip', prob=0.5),
    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
    dict(
        type='CachedMixUp',
        img_scale=(640, 640),
        ratio_range=(1.0, 1.0),
        max_cached_images=20,
        pad_val=(114, 114, 114)),
    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)),
    dict(type='PackDetInputs')
]

# Stage-2 training pipeline: no CachedMosaic / CachedMixUp, and RandomResize
# works from a 640x640 base scale. Swapped in by PipelineSwitchHook below.
train_pipeline_stage2 = [
    dict(type='LoadImageFromFile', file_client_args=file_client_args),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,
        poly2mask=False),
    dict(
        type='RandomResize',
        scale=(640, 640),
        ratio_range=(0.1, 2.0),
        keep_ratio=True),
    dict(
        type='RandomCrop',
        crop_size=(640, 640),
        recompute_bbox=True,
        allow_negative_crop=True),
    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip', prob=0.5),
    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
    dict(type='PackDetInputs')
]

# Deterministic pipeline shared by validation, testing and inference.
test_pipeline = [
    dict(type='LoadImageFromFile', file_client_args=file_client_args),
    dict(type='Resize', scale=(640, 640), keep_ratio=True),
    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor'))
]
pipeline = test_pipeline

# Test-time augmentation: every scale in `img_scales`, with and without flip.
tta_model = dict(
    type='DetTTAModel',
    tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.6), max_per_img=100))
img_scales = [(640, 640), (320, 320), (960, 960)]
tta_pipeline = [
    dict(type='LoadImageFromFile', file_client_args=file_client_args),
    dict(
        type='TestTimeAug',
        transforms=[
            [
                dict(type='Resize', scale=scale, keep_ratio=True)
                for scale in img_scales
            ],
            [
                dict(type='RandomFlip', prob=1.0),
                dict(type='RandomFlip', prob=0.0)
            ],
            [
                dict(
                    type='Pad',
                    size=(960, 960),
                    pad_val=dict(img=(114, 114, 114)))
            ],
            [
                dict(
                    type='PackDetInputs',
                    meta_keys=('img_id', 'img_path', 'ori_shape',
                               'img_shape', 'scale_factor', 'flip',
                               'flip_direction'))
            ]
        ])
]

# ---------------------------------------------------------------------------
# Loops, optimizer and learning-rate schedule
# ---------------------------------------------------------------------------
train_cfg = dict(
    type='EpochBasedTrainLoop',
    max_epochs=max_epochs,
    val_interval=interval,
    # Validate every epoch once the stage-2 pipeline is active.
    dynamic_intervals=[(max_epochs - stage2_num_epochs, 1)])
val_cfg = dict(type='ValLoop')
# NOTE(review): `pipeline` is kept here exactly as found. Confirm that the
# loop (or some other consumer) actually accepts this key before relying on
# it; the dataloaders below carry their own pipeline.
test_cfg = dict(type='TestLoop', pipeline=test_pipeline)

param_scheduler = [
    # Iteration-based linear warm-up over the first 1000 iterations.
    dict(
        type='LinearLR',
        start_factor=1e-05,
        by_epoch=False,
        begin=0,
        end=1000),
    # Cosine decay over the second half of training.
    dict(
        type='CosineAnnealingLR',
        eta_min=1.25e-05,
        begin=max_epochs // 2,
        end=max_epochs,
        T_max=max_epochs // 2,
        by_epoch=True,
        convert_to_iter_based=True)
]

optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
    paramwise_cfg=dict(
        norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
auto_scale_lr = dict(enable=False, base_batch_size=16)

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
model = dict(
    type='RTMDet',
    data_preprocessor=dict(
        type='DetDataPreprocessor',
        mean=[103.53, 116.28, 123.675],
        std=[57.375, 57.12, 58.395],
        bgr_to_rgb=False,
        batch_augments=None),
    backbone=dict(
        type='CSPNeXt',
        arch='P5',
        expand_ratio=0.5,
        deepen_factor=0.67,
        widen_factor=0.75,
        channel_attention=True,
        norm_cfg=dict(type='SyncBN'),
        act_cfg=dict(type='SiLU', inplace=True)),
    neck=dict(
        type='CSPNeXtPAFPN',
        in_channels=[192, 384, 768],
        out_channels=192,
        num_csp_blocks=2,
        expand_ratio=0.5,
        norm_cfg=dict(type='SyncBN'),
        act_cfg=dict(type='SiLU', inplace=True)),
    bbox_head=dict(
        type='RTMDetInsSepBNHead',
        # See the NOTE(review) next to `num_classes` at the top of the file.
        num_classes=80,
        in_channels=192,
        stacked_convs=2,
        share_conv=True,
        pred_kernel_size=1,
        feat_channels=192,
        act_cfg=dict(type='SiLU', inplace=True),
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        anchor_generator=dict(
            type='MlvlPointGenerator', offset=0, strides=[8, 16, 32]),
        bbox_coder=dict(type='DistancePointBBoxCoder'),
        loss_cls=dict(
            type='QualityFocalLoss',
            use_sigmoid=True,
            beta=2.0,
            loss_weight=1.0),
        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
        loss_mask=dict(
            type='DiceLoss', loss_weight=2.0, eps=5e-06, reduction='mean')),
    train_cfg=dict(
        assigner=dict(type='DynamicSoftLabelAssigner', topk=13),
        allowed_border=-1,
        pos_weight=-1,
        debug=False),
    test_cfg=dict(
        nms_pre=400,
        min_bbox_size=0,
        score_thr=0.4,
        nms=dict(type='nms', iou_threshold=0.6),
        max_per_img=50,
        mask_thr_binary=0.5))

# ---------------------------------------------------------------------------
# Datasets
# ---------------------------------------------------------------------------
icdar_2019 = dict(
    type=dataset_type,
    metainfo=metainfo,
    data_prefix=dict(img=icdar_2019_root),
    ann_file=icdar_2019_root + 'gt_files/coco_regions2.json',
    pipeline=train_pipeline)

icdar_2019_test = dict(
    type=dataset_type,
    metainfo=metainfo,
    data_prefix=dict(img=icdar_2019_root),
    ann_file=icdar_2019_root + 'gt_files/coco_lines.json',
    test_mode=True,
    pipeline=test_pipeline)

police_records = dict(
    type=dataset_type,
    metainfo=metainfo,
    data_prefix=dict(img=police_records_root),
    ann_file=police_records_ann_file,
    pipeline=train_pipeline)

# Datasets concatenated for training / listed for testing.
train_list = [police_records]
test_list = [icdar_2019_test]

# ---------------------------------------------------------------------------
# Dataloaders and evaluators
# ---------------------------------------------------------------------------
train_dataloader = dict(
    batch_size=train_batch_size_per_gpu,
    num_workers=train_num_workers,
    batch_sampler=None,
    pin_memory=True,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(type='ConcatDataset', datasets=train_list))

val_dataloader = dict(
    batch_size=val_batch_size_per_gpu,
    num_workers=10,
    dataset=dict(
        type=dataset_type,
        metainfo=metainfo,
        data_prefix=dict(img=police_records_root),
        # Must be the file the evaluator scores against, so that predicted
        # image ids line up with the ground-truth image ids.
        ann_file=police_records_ann_file,
        pipeline=test_pipeline,
        test_mode=True),
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False))
test_dataloader = val_dataloader

val_evaluator = dict(
    type='CocoMetric',
    ann_file=police_records_ann_file,
    metric=['bbox', 'segm'],
    proposal_nums=(100, 1, 10))
test_evaluator = val_evaluator

# ---------------------------------------------------------------------------
# Custom hooks
# ---------------------------------------------------------------------------
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0002,
        update_buffers=True,
        priority=49),
    # Switch to the stage-2 pipeline for the final `stage2_num_epochs` epochs.
    dict(
        type='PipelineSwitchHook',
        switch_epoch=max_epochs - stage2_num_epochs,
        switch_pipeline=train_pipeline_stage2)
]