guanxiongsun committed
Commit
0ebd6fe
1 Parent(s): de7583c
work_dirs/stpn_swint_adam_9x/20240204_030125.log ADDED
The diff for this file is too large to render. See raw diff
 
work_dirs/stpn_swint_adam_9x/20240204_030125.log.json ADDED
The diff for this file is too large to render. See raw diff
 
work_dirs/stpn_swint_adam_9x/epoch_9_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4bcb39c4070df69ae917cd237f36cc9c77eec63a53ce94ae9b7a931aefdd27b7
+ size 180353653
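
Note: epoch_9_model.pth is stored as a Git LFS pointer, not the raw weights. Below is a minimal sketch in plain Python (assuming the file has already been fetched locally, e.g. with git lfs pull) for checking a downloaded copy against the size and sha256 recorded in the pointer above:

import hashlib
from pathlib import Path

# Values copied from the LFS pointer above.
EXPECTED_SHA256 = "4bcb39c4070df69ae917cd237f36cc9c77eec63a53ce94ae9b7a931aefdd27b7"
EXPECTED_SIZE = 180353653

path = Path("work_dirs/stpn_swint_adam_9x/epoch_9_model.pth")
assert path.stat().st_size == EXPECTED_SIZE, "size mismatch: LFS file not pulled?"

h = hashlib.sha256()
with path.open("rb") as f:
    # Hash in 1 MiB chunks to avoid loading the ~180 MB checkpoint into memory.
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == EXPECTED_SHA256, "sha256 mismatch"
print("checkpoint matches the LFS pointer")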
work_dirs/stpn_swint_adam_9x/eval.txt ADDED
@@ -0,0 +1,3 @@
+ {'all': 0.8515168953546883, 'fast': 0.6405709450111929, 'medium': 0.8412701278128932, 'slow': 0.9141449563100874, 'airplane': 0.9586996519276003, 'antelope': 0.8799440834409841, 'bear': 0.8989029949927739, 'bicycle': 0.8851157502725769, 'bird': 0.7930013993678566, 'bus': 0.841979109569196, 'car': 0.7758164365133777, 'cattle': 0.802800559124309, 'dog': 0.8453745668140737, 'domestic_cat': 0.9140264245981315, 'elephant': 0.8546385510194372, 'fox':
+ 0.947818798999815, 'giant_panda': 0.8667739758302728, 'hamster': 0.9850564156153161, 'horse': 0.8874280101304849, 'lion': 0.7234216680206619, 'lizard': 0.8713093258061801, 'monkey': 0.6710894126913831, 'motorcycle': 0.9198253019671686, 'rabbit': 0.7994001086999526, 'red_panda': 0.8903292259476213, 'sheep': 0.7809233256814476, 'snake': 0.8029446576625736, 'squirrel': 0.6965247167195919, 'tiger': 0.9354936714466412, 'train': 0.8845634667175416, 'turtle': 0.81486794558347,
+ 'watercraft': 0.8363646138283387, 'whale': 0.8189654860172317, 'zebra': 0.9621072056346469}
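
eval.txt holds the ImageNet VID bbox AP summary as a single Python dict literal wrapped across three lines: an overall score of 0.8515, motion-speed breakdowns (fast/medium/slow), and per-class scores. A minimal sketch, assuming the file layout shown above, for reading it back and listing the weakest classes:

import ast

# Join the wrapped lines back into one dict literal and parse it safely.
with open("work_dirs/stpn_swint_adam_9x/eval.txt") as f:
    results = ast.literal_eval(" ".join(line.strip() for line in f))

print(f"AP (all): {results['all']:.4f}")
for name in ("fast", "medium", "slow"):
    print(f"  {name}: {results[name]:.4f}")

# Per-class entries are everything except the four summary keys.
per_class = {k: v for k, v in results.items()
             if k not in ("all", "fast", "medium", "slow")}
for name, ap in sorted(per_class.items(), key=lambda kv: kv[1])[:5]:
    print(f"lowest AP  {name}: {ap:.4f}")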
work_dirs/stpn_swint_adam_9x/stpn_swint_adam_9x.py ADDED
@@ -0,0 +1,438 @@
+ checkpoint_config = dict(interval=9)
+ log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
+ custom_hooks = [dict(type='NumClassCheckHook')]
+ dist_params = dict(backend='nccl')
+ log_level = 'INFO'
+ load_from = None
+ resume_from = None
+ workflow = [('train', 1)]
+ optimizer = dict(
+ type='AdamW',
+ lr=2.5e-05,
+ betas=(0.9, 0.999),
+ weight_decay=0.05,
+ paramwise_cfg=dict(
+ custom_keys=dict(
+ absolute_pos_embed=dict(decay_mult=0.0),
+ relative_position_bias_table=dict(decay_mult=0.0),
+ norm=dict(decay_mult=0.0))))
+ optimizer_config = dict(grad_clip=None)
+ lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.3333333333333333,
+ step=[6])
+ runner = dict(type='EpochBasedRunner', max_epochs=9)
+ pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.2/mask_rcnn_swin_tiny_patch4_window7.pth'
+ is_video_model = True
+ model = dict(
+ type='STPN',
+ detector=dict(
+ type='FasterRCNN',
+ backbone=dict(
+ type='STPNSwinTransformer',
+ embed_dims=96,
+ depths=[2, 2, 6, 2],
+ num_heads=[3, 6, 12, 24],
+ window_size=7,
+ mlp_ratio=4,
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.0,
+ attn_drop_rate=0.0,
+ drop_path_rate=0.2,
+ patch_norm=True,
+ with_cp=False,
+ convert_weights=True,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint=
+ 'https://github.com/SwinTransformer/storage/releases/download/v1.0.2/mask_rcnn_swin_tiny_patch4_window7.pth'
+ ),
+ prompt_cfg=dict(
+ num_tokens=5,
+ location='prepend',
+ deep=False,
+ dropout=0.0,
+ initiation='random')),
+ neck=dict(
+ type='FPN',
+ in_channels=[96, 192, 384, 768],
+ out_channels=256,
+ num_outs=5),
+ rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0.0, 0.0, 0.0, 0.0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(
+ type='SmoothL1Loss', beta=0.1111111111111111,
+ loss_weight=1.0)),
+ roi_head=dict(
+ type='StandardRoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(
+ type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=30,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0.0, 0.0, 0.0, 0.0],
+ target_stds=[0.2, 0.2, 0.2, 0.2]),
+ reg_class_agnostic=False,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(
+ type='SmoothL1Loss',
+ beta=0.1111111111111111,
+ loss_weight=1.0))),
+ train_cfg=dict(
+ rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=-1,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_pre=1000,
+ max_per_img=300,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ rpn=dict(
+ nms_pre=1000,
+ max_per_img=300,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ score_thr=0.0001,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100,
+ mask_thr_binary=0.5))))
+ dataset_type = 'ImagenetVIDDataset'
+ data_root = 'data/ILSVRC/'
+ img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+ train_pipeline = [
+ dict(type='LoadMultiImagesFromFile'),
+ dict(type='SeqLoadAnnotations', with_bbox=True, with_mask=False),
+ dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
+ dict(
+ type='AutoAugment',
+ policies=[[{
+ 'type':
+ 'SeqResize',
+ 'img_scale': [(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+ (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+ (736, 1333), (768, 1333), (800, 1333)],
+ 'multiscale_mode':
+ 'value',
+ 'keep_ratio':
+ True
+ }],
+ [{
+ 'type': 'SeqResize',
+ 'img_scale': [(400, 1333), (500, 1333), (600, 1333)],
+ 'multiscale_mode': 'value',
+ 'keep_ratio': True
+ }, {
+ 'type': 'SeqRandomCrop',
+ 'crop_type': 'absolute_range',
+ 'crop_size': (384, 600),
+ 'allow_negative_crop': True
+ }, {
+ 'type': 'SeqMaxSizePad'
+ }, {
+ 'type':
+ 'SeqResize2',
+ 'img_scale': [(480, 1333), (512, 1333), (544, 1333),
+ (576, 1333), (608, 1333), (640, 1333),
+ (672, 1333), (704, 1333), (736, 1333),
+ (768, 1333), (800, 1333)],
+ 'multiscale_mode':
+ 'value',
+ 'keep_ratio':
+ True
+ }]]),
+ dict(
+ type='SeqNormalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='SeqPad', size_divisor=16),
+ dict(type='VideoCollect', keys=['img', 'gt_bboxes', 'gt_labels']),
+ dict(type='ConcatVideoReferences'),
+ dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
+ ]
+ test_pipeline = [
+ dict(type='LoadMultiImagesFromFile'),
+ dict(type='SeqResize', img_scale=(1000, 600), keep_ratio=True),
+ dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.0),
+ dict(
+ type='SeqNormalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='SeqPad', size_divisor=16),
+ dict(
+ type='VideoCollect',
+ keys=['img'],
+ meta_keys=('num_left_ref_imgs', 'frame_stride')),
+ dict(type='ConcatVideoReferences'),
+ dict(type='MultiImagesToTensor', ref_prefix='ref'),
+ dict(type='ToList')
+ ]
+ data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=4,
+ train=[
+ dict(
+ type='ImagenetVIDDataset',
+ ann_file='data/ILSVRC/annotations/imagenet_vid_train.json',
+ img_prefix='data/ILSVRC/Data/VID',
+ ref_img_sampler=dict(
+ num_ref_imgs=2,
+ frame_range=9,
+ filter_key_img=True,
+ method='bilateral_uniform'),
+ pipeline=[
+ dict(type='LoadMultiImagesFromFile'),
+ dict(
+ type='SeqLoadAnnotations', with_bbox=True,
+ with_mask=False),
+ dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
+ dict(
+ type='AutoAugment',
+ policies=[[{
+ 'type':
+ 'SeqResize',
+ 'img_scale': [(480, 1333), (512, 1333), (544, 1333),
+ (576, 1333), (608, 1333), (640, 1333),
+ (672, 1333), (704, 1333), (736, 1333),
+ (768, 1333), (800, 1333)],
+ 'multiscale_mode':
+ 'value',
+ 'keep_ratio':
+ True
+ }],
+ [{
+ 'type':
+ 'SeqResize',
+ 'img_scale': [(400, 1333), (500, 1333),
+ (600, 1333)],
+ 'multiscale_mode':
+ 'value',
+ 'keep_ratio':
+ True
+ }, {
+ 'type': 'SeqRandomCrop',
+ 'crop_type': 'absolute_range',
+ 'crop_size': (384, 600),
+ 'allow_negative_crop': True
+ }, {
+ 'type': 'SeqMaxSizePad'
+ }, {
+ 'type':
+ 'SeqResize2',
+ 'img_scale': [(480, 1333), (512, 1333),
+ (544, 1333), (576, 1333),
+ (608, 1333), (640, 1333),
+ (672, 1333), (704, 1333),
+ (736, 1333), (768, 1333),
+ (800, 1333)],
+ 'multiscale_mode':
+ 'value',
+ 'keep_ratio':
+ True
+ }]]),
+ dict(
+ type='SeqNormalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='SeqPad', size_divisor=16),
+ dict(
+ type='VideoCollect',
+ keys=['img', 'gt_bboxes', 'gt_labels']),
+ dict(type='ConcatVideoReferences'),
+ dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
+ ]),
+ dict(
+ type='ImagenetVIDDataset',
+ load_as_video=False,
+ ann_file='data/ILSVRC/annotations/imagenet_det_30plus1cls.json',
+ img_prefix='data/ILSVRC/Data/DET',
+ ref_img_sampler=dict(
+ num_ref_imgs=2,
+ frame_range=0,
+ filter_key_img=False,
+ method='bilateral_uniform'),
+ pipeline=[
+ dict(type='LoadMultiImagesFromFile'),
+ dict(
+ type='SeqLoadAnnotations', with_bbox=True,
+ with_mask=False),
+ dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
+ dict(
+ type='AutoAugment',
+ policies=[[{
+ 'type':
+ 'SeqResize',
+ 'img_scale': [(480, 1333), (512, 1333), (544, 1333),
+ (576, 1333), (608, 1333), (640, 1333),
+ (672, 1333), (704, 1333), (736, 1333),
+ (768, 1333), (800, 1333)],
+ 'multiscale_mode':
+ 'value',
+ 'keep_ratio':
+ True
+ }],
+ [{
+ 'type':
+ 'SeqResize',
+ 'img_scale': [(400, 1333), (500, 1333),
+ (600, 1333)],
+ 'multiscale_mode':
+ 'value',
+ 'keep_ratio':
+ True
+ }, {
+ 'type': 'SeqRandomCrop',
+ 'crop_type': 'absolute_range',
+ 'crop_size': (384, 600),
+ 'allow_negative_crop': True
+ }, {
+ 'type': 'SeqMaxSizePad'
+ }, {
+ 'type':
+ 'SeqResize2',
+ 'img_scale': [(480, 1333), (512, 1333),
+ (544, 1333), (576, 1333),
+ (608, 1333), (640, 1333),
+ (672, 1333), (704, 1333),
+ (736, 1333), (768, 1333),
+ (800, 1333)],
+ 'multiscale_mode':
+ 'value',
+ 'keep_ratio':
+ True
+ }]]),
+ dict(
+ type='SeqNormalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='SeqPad', size_divisor=16),
+ dict(
+ type='VideoCollect',
+ keys=['img', 'gt_bboxes', 'gt_labels']),
+ dict(type='ConcatVideoReferences'),
+ dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
+ ])
+ ],
+ val=dict(
+ type='ImagenetVIDDataset',
+ ann_file='data/ILSVRC/annotations/imagenet_vid_val.json',
+ img_prefix='data/ILSVRC/Data/VID',
+ ref_img_sampler=dict(
+ num_ref_imgs=14,
+ frame_range=[-7, 7],
+ method='test_with_adaptive_stride'),
+ pipeline=[
+ dict(type='LoadMultiImagesFromFile'),
+ dict(type='SeqResize', img_scale=(1000, 600), keep_ratio=True),
+ dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.0),
+ dict(
+ type='SeqNormalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='SeqPad', size_divisor=16),
+ dict(
+ type='VideoCollect',
+ keys=['img'],
+ meta_keys=('num_left_ref_imgs', 'frame_stride')),
+ dict(type='ConcatVideoReferences'),
+ dict(type='MultiImagesToTensor', ref_prefix='ref'),
+ dict(type='ToList')
+ ],
+ test_mode=True),
+ test=dict(
+ type='ImagenetVIDDataset',
+ ann_file='data/ILSVRC/annotations/imagenet_vid_val.json',
+ img_prefix='data/ILSVRC/Data/VID',
+ ref_img_sampler=dict(
+ num_ref_imgs=14,
+ frame_range=[-7, 7],
+ method='test_with_adaptive_stride'),
+ pipeline=[
+ dict(type='LoadMultiImagesFromFile'),
+ dict(type='SeqResize', img_scale=(1000, 600), keep_ratio=True),
+ dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.0),
+ dict(
+ type='SeqNormalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='SeqPad', size_divisor=16),
+ dict(
+ type='VideoCollect',
+ keys=['img'],
+ meta_keys=('num_left_ref_imgs', 'frame_stride')),
+ dict(type='ConcatVideoReferences'),
+ dict(type='MultiImagesToTensor', ref_prefix='ref'),
+ dict(type='ToList')
+ ],
+ test_mode=True))
+ total_epochs = 9
+ evaluation = dict(metric=['bbox'], vid_style=True, interval=9)
+ work_dir = './work_dirs/stpn_swint_adam_9x'
+ gpu_ids = range(0, 8)
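
For reference, a minimal sketch of loading this dumped config and inspecting a few of the settings above. It assumes an mmcv 1.x style environment (the dump follows the mmcv Config format); the tools/train.py and tools/test.py entry points mentioned in the comment are the usual MMDetection/MMTracking conventions and are not confirmed from this commit:

from mmcv import Config

# Load the dumped training config from the work dir.
cfg = Config.fromfile("work_dirs/stpn_swint_adam_9x/stpn_swint_adam_9x.py")

print(cfg.model.type)                         # 'STPN'
print(cfg.model.detector.backbone.type)       # 'STPNSwinTransformer'
print(cfg.optimizer.type, cfg.optimizer.lr)   # 'AdamW' 2.5e-05
print(cfg.runner.max_epochs, cfg.lr_config.step)  # 9 [6]

# The same file can typically be passed to the repository's tools/train.py or
# tools/test.py entry points (script paths assumed; check the repo for the
# exact launch commands).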