adonaivera committed on
Commit
e9b779d
Parent: d0c89a1

Upload 7 files

default_runtime.py ADDED
@@ -0,0 +1,43 @@
default_scope = 'mmyolo'

default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', interval=1),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='mmdet.DetVisualizationHook'))

env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'),
)

vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='mmdet.DetLocalVisualizer',
    vis_backends=vis_backends,
    name='visualizer')
log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)

log_level = 'INFO'
load_from = None
resume = False

# Examples of using a different file client
# Method 1: simply set the data root and let the file I/O module
# infer the backend automatically from the prefix
# (LMDB and Memcached are not supported yet)

# data_root = 's3://openmmlab/datasets/detection/coco/'

# Method 2: use `backend_args` (`file_client_args` in versions
# before MMDet 3.0.0rc6)
# backend_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/': 's3://openmmlab/datasets/detection/',
#         'data/': 's3://openmmlab/datasets/detection/'
#     }))

backend_args = None
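These runtime settings are plain Python consumed by mmengine's config system; a minimal loading sketch, assuming mmengine is installed and this file is on the current path:

from mmengine.config import Config

cfg = Config.fromfile('default_runtime.py')
print(cfg.default_scope)         # 'mmyolo'
print(cfg.default_hooks.logger)  # {'type': 'LoggerHook', 'interval': 50}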
det_p5_tta.py ADDED
@@ -0,0 +1,58 @@
# TODO: Need to solve the problem of multiple backend_args parameters
# _backend_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/': 's3://openmmlab/datasets/detection/',
#         'data/': 's3://openmmlab/datasets/detection/'
#     }))

_backend_args = None

tta_model = dict(
    type='mmdet.DetTTAModel',
    tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.65), max_per_img=300))

img_scales = [(640, 640), (320, 320), (960, 960)]

#                              LoadImageFromFile
#                   /                  |                       \
# (RatioResize,LetterResize) (RatioResize,LetterResize) (RatioResize,LetterResize) # noqa
#      /      \                   /      \                    /      \
# RandomFlip RandomFlip    RandomFlip RandomFlip       RandomFlip RandomFlip # noqa
#     |          |             |          |                |          |
#  LoadAnn    LoadAnn       LoadAnn    LoadAnn          LoadAnn    LoadAnn
#     |          |             |          |                |          |
# PackDetIn  PackDetIn     PackDetIn  PackDetIn        PackDetIn  PackDetIn # noqa

_multiscale_resize_transforms = [
    dict(
        type='Compose',
        transforms=[
            dict(type='YOLOv5KeepRatioResize', scale=s),
            dict(
                type='LetterResize',
                scale=s,
                allow_scale_up=False,
                pad_val=dict(img=114))
        ]) for s in img_scales
]

tta_pipeline = [
    dict(type='LoadImageFromFile', backend_args=_backend_args),
    dict(
        type='TestTimeAug',
        transforms=[
            _multiscale_resize_transforms,
            [
                dict(type='mmdet.RandomFlip', prob=1.),
                dict(type='mmdet.RandomFlip', prob=0.)
            ], [dict(type='mmdet.LoadAnnotations', with_bbox=True)],
            [
                dict(
                    type='mmdet.PackDetInputs',
                    meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                               'scale_factor', 'pad_param', 'flip',
                               'flip_direction'))
            ]
        ])
]
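This file only declares the TTA wrapper and pipeline; they take effect when a test script swaps them in. A hedged sketch of the usual MMDet/MMYOLO wiring (in MMYOLO, this is roughly what passing --tta to tools/test.py does):

from mmengine.config import Config, ConfigDict

cfg = Config.fromfile('yolov8_s_syncbn_fast_8xb16-500e_coco.py')
# Wrap the detector in DetTTAModel and replace the test-time pipeline.
cfg.model = ConfigDict(**cfg.tta_model, module=cfg.model)
cfg.test_dataloader.dataset.pipeline = cfg.tta_pipeline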
yolov8_l_syncbn_fast_8xb16-500e_coco.py ADDED
@@ -0,0 +1,39 @@
_base_ = './yolov8_m_syncbn_fast_8xb16-500e_coco.py'

# ========================modified parameters======================
deepen_factor = 1.00
widen_factor = 1.00
last_stage_out_channels = 512

mixup_prob = 0.15

# =======================Unmodified in most cases==================
pre_transform = _base_.pre_transform
mosaic_affine_transform = _base_.mosaic_affine_transform
last_transform = _base_.last_transform

model = dict(
    backbone=dict(
        last_stage_out_channels=last_stage_out_channels,
        deepen_factor=deepen_factor,
        widen_factor=widen_factor),
    neck=dict(
        deepen_factor=deepen_factor,
        widen_factor=widen_factor,
        in_channels=[256, 512, last_stage_out_channels],
        out_channels=[256, 512, last_stage_out_channels]),
    bbox_head=dict(
        head_module=dict(
            widen_factor=widen_factor,
            in_channels=[256, 512, last_stage_out_channels])))

train_pipeline = [
    *pre_transform, *mosaic_affine_transform,
    dict(
        type='YOLOv5MixUp',
        prob=mixup_prob,
        pre_transform=[*pre_transform, *mosaic_affine_transform]),
    *last_transform
]

train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
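For reference, deepen_factor and widen_factor scale nominal block counts and channel widths. A small illustration that mirrors the semantics of mmyolo's make_divisible/make_round helpers (names taken from mmyolo.models.utils; treat the exact rounding as an assumption):

import math

def make_divisible(x: float, widen_factor: float = 1.0, divisor: int = 8) -> int:
    # Scale a channel count, rounding up to a multiple of `divisor`.
    return math.ceil(x * widen_factor / divisor) * divisor

def make_round(x: float, deepen_factor: float = 1.0) -> int:
    # Scale a block-repeat count, keeping at least one block.
    return max(round(x * deepen_factor), 1) if x > 1 else int(x)

print(make_divisible(512, 1.00))   # 512: the l model keeps full width
print(make_divisible(1024, 0.50))  # 512: the s model halves its widths
print(make_round(6, 0.33))         # 2:   ... and keeps a third of the depth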
yolov8_m_syncbn_fast_8xb16-500e_coco.py ADDED
@@ -0,0 +1,76 @@
_base_ = './yolov8_s_syncbn_fast_8xb16-500e_coco.py'

# ========================modified parameters======================
deepen_factor = 0.67
widen_factor = 0.75
last_stage_out_channels = 768

affine_scale = 0.9
mixup_prob = 0.1

# =======================Unmodified in most cases==================
img_scale = _base_.img_scale
pre_transform = _base_.pre_transform
last_transform = _base_.last_transform

model = dict(
    backbone=dict(
        last_stage_out_channels=last_stage_out_channels,
        deepen_factor=deepen_factor,
        widen_factor=widen_factor),
    neck=dict(
        deepen_factor=deepen_factor,
        widen_factor=widen_factor,
        in_channels=[256, 512, last_stage_out_channels],
        out_channels=[256, 512, last_stage_out_channels]),
    bbox_head=dict(
        head_module=dict(
            widen_factor=widen_factor,
            in_channels=[256, 512, last_stage_out_channels])))

mosaic_affine_transform = [
    dict(
        type='Mosaic',
        img_scale=img_scale,
        pad_val=114.0,
        pre_transform=pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        max_aspect_ratio=100,
        scaling_ratio_range=(1 - affine_scale, 1 + affine_scale),
        # img_scale is (width, height)
        border=(-img_scale[0] // 2, -img_scale[1] // 2),
        border_val=(114, 114, 114))
]

# enable mixup
train_pipeline = [
    *pre_transform, *mosaic_affine_transform,
    dict(
        type='YOLOv5MixUp',
        prob=mixup_prob,
        pre_transform=[*pre_transform, *mosaic_affine_transform]),
    *last_transform
]

train_pipeline_stage2 = [
    *pre_transform,
    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
    dict(
        type='LetterResize',
        scale=img_scale,
        allow_scale_up=True,
        pad_val=dict(img=114.0)),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - affine_scale, 1 + affine_scale),
        max_aspect_ratio=100,
        border_val=(114, 114, 114)), *last_transform
]

train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
_base_.custom_hooks[1].switch_pipeline = train_pipeline_stage2
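Note the last line mutates the inherited custom_hooks list at parse time. A quick hypothetical check that the switch took effect (assuming mmengine can read this file):

from mmengine.config import Config

cfg = Config.fromfile('yolov8_m_syncbn_fast_8xb16-500e_coco.py')
hook = cfg.custom_hooks[1]
print(hook['type'])  # 'mmdet.PipelineSwitchHook'
# Its switch_pipeline now points at the mosaic-free stage-2 pipeline.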
yolov8_s_syncbn_fast_8xb16-500e_coco.py ADDED
@@ -0,0 +1,334 @@
_base_ = ['./default_runtime.py', './det_p5_tta.py']

# ========================Frequently modified parameters======================
# -----data related-----
data_root = 'data/coco/'  # Root path of data
# Path of train annotation file
train_ann_file = 'annotations/instances_train2017.json'
train_data_prefix = 'train2017/'  # Prefix of train image path
# Path of val annotation file
val_ann_file = 'annotations/instances_val2017.json'
val_data_prefix = 'val2017/'  # Prefix of val image path

num_classes = 80  # Number of classes for classification
# Batch size of a single GPU during training
train_batch_size_per_gpu = 16
# Workers to pre-fetch data for each single GPU during training
train_num_workers = 8
# persistent_workers must be False if num_workers is 0
persistent_workers = True

# -----train val related-----
# Base learning rate for optim_wrapper. Corresponding to 8xb16=128 bs
base_lr = 0.01
max_epochs = 500  # Maximum training epochs
# Disable mosaic augmentation for the final 10 epochs (stage 2)
close_mosaic_epochs = 10

model_test_cfg = dict(
    # The config of multi-label for multi-class prediction
    multi_label=True,
    # The number of boxes before NMS
    nms_pre=30000,
    score_thr=0.001,  # Threshold to filter out boxes
    nms=dict(type='nms', iou_threshold=0.7),  # NMS type and threshold
    max_per_img=300)  # Max number of detections per image

# ========================Possible modified parameters========================
# -----data related-----
img_scale = (640, 640)  # width, height
# Dataset type, this will be used to define the dataset
dataset_type = 'YOLOv5CocoDataset'
# Batch size of a single GPU during validation
val_batch_size_per_gpu = 1
# Workers to pre-fetch data for each single GPU during validation
val_num_workers = 2

# Config of batch shapes. Only used during val.
# In our tests, YOLOv8-m gets 0.02 higher mAP with it enabled.
batch_shapes_cfg = None
# You can turn on `batch_shapes_cfg` by uncommenting the following lines.
# batch_shapes_cfg = dict(
#     type='BatchShapePolicy',
#     batch_size=val_batch_size_per_gpu,
#     img_size=img_scale[0],
#     # The image scale of padding should be divisible by pad_size_divisor
#     size_divisor=32,
#     # Additional paddings for pixel scale
#     extra_pad_ratio=0.5)

# -----model related-----
# The scaling factor that controls the depth of the network structure
deepen_factor = 0.33
# The scaling factor that controls the width of the network structure
widen_factor = 0.5
# Strides of multi-scale prior box
strides = [8, 16, 32]
# The output channels of the last stage
last_stage_out_channels = 1024
num_det_layers = 3  # The number of model output scales
norm_cfg = dict(type='BN', momentum=0.03, eps=0.001)  # Normalization config

# -----train val related-----
affine_scale = 0.5  # YOLOv5RandomAffine scaling ratio
# YOLOv5RandomAffine width/height aspect-ratio threshold to filter bboxes
max_aspect_ratio = 100
tal_topk = 10  # Number of bboxes selected in each level
tal_alpha = 0.5  # Hyper-parameter of the alignment metric
tal_beta = 6.0  # Hyper-parameter of the alignment metric
# TODO: Automatically scale loss_weight based on number of detection layers
loss_cls_weight = 0.5
loss_bbox_weight = 7.5
# Since the DFL loss is implemented differently in the official repo
# and in mmdet, the loss_weight is divided by 4.
loss_dfl_weight = 1.5 / 4
lr_factor = 0.01  # Learning rate scaling factor
weight_decay = 0.0005
# Checkpoint-saving and validation interval in stage 1
save_epoch_intervals = 10
# Validation interval in stage 2
val_interval_stage2 = 1
# The maximum number of checkpoints to keep
max_keep_ckpts = 2
# Single-scale training is recommended; with it, cudnn_benchmark
# can be turned on to speed up training.
env_cfg = dict(cudnn_benchmark=True)

# ===============================Unmodified in most cases====================
model = dict(
    type='YOLODetector',
    data_preprocessor=dict(
        type='YOLOv5DetDataPreprocessor',
        mean=[0., 0., 0.],
        std=[255., 255., 255.],
        bgr_to_rgb=True),
    backbone=dict(
        type='YOLOv8CSPDarknet',
        arch='P5',
        last_stage_out_channels=last_stage_out_channels,
        deepen_factor=deepen_factor,
        widen_factor=widen_factor,
        norm_cfg=norm_cfg,
        act_cfg=dict(type='SiLU', inplace=True)),
    neck=dict(
        type='YOLOv8PAFPN',
        deepen_factor=deepen_factor,
        widen_factor=widen_factor,
        in_channels=[256, 512, last_stage_out_channels],
        out_channels=[256, 512, last_stage_out_channels],
        num_csp_blocks=3,
        norm_cfg=norm_cfg,
        act_cfg=dict(type='SiLU', inplace=True)),
    bbox_head=dict(
        type='YOLOv8Head',
        head_module=dict(
            type='YOLOv8HeadModule',
            num_classes=num_classes,
            in_channels=[256, 512, last_stage_out_channels],
            widen_factor=widen_factor,
            reg_max=16,
            norm_cfg=norm_cfg,
            act_cfg=dict(type='SiLU', inplace=True),
            featmap_strides=strides),
        prior_generator=dict(
            type='mmdet.MlvlPointGenerator', offset=0.5, strides=strides),
        bbox_coder=dict(type='DistancePointBBoxCoder'),
        # scaled based on number of detection layers
        loss_cls=dict(
            type='mmdet.CrossEntropyLoss',
            use_sigmoid=True,
            reduction='none',
            loss_weight=loss_cls_weight),
        loss_bbox=dict(
            type='IoULoss',
            iou_mode='ciou',
            bbox_format='xyxy',
            reduction='sum',
            loss_weight=loss_bbox_weight,
            return_iou=False),
        loss_dfl=dict(
            type='mmdet.DistributionFocalLoss',
            reduction='mean',
            loss_weight=loss_dfl_weight)),
    train_cfg=dict(
        assigner=dict(
            type='BatchTaskAlignedAssigner',
            num_classes=num_classes,
            use_ciou=True,
            topk=tal_topk,
            alpha=tal_alpha,
            beta=tal_beta,
            eps=1e-9)),
    test_cfg=model_test_cfg)
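The BatchTaskAlignedAssigner scores each candidate with a task-alignment metric; as commonly formulated (TOOD-style; the exact implementation is an assumption here), alignment = cls_score**alpha * iou**beta, so with alpha=0.5 and beta=6.0 the assignment is dominated by localization quality:

cls_score, iou = 0.8, 0.9
alignment = cls_score**0.5 * iou**6.0
print(round(alignment, 4))  # 0.4753: the 0.9-IoU term outweighs the score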
albu_train_transforms = [
    dict(type='Blur', p=0.01),
    dict(type='MedianBlur', p=0.01),
    dict(type='ToGray', p=0.01),
    dict(type='CLAHE', p=0.01)
]

pre_transform = [
    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
    dict(type='LoadAnnotations', with_bbox=True)
]

last_transform = [
    dict(
        type='mmdet.Albu',
        transforms=albu_train_transforms,
        bbox_params=dict(
            type='BboxParams',
            format='pascal_voc',
            label_fields=['gt_bboxes_labels', 'gt_ignore_flags']),
        keymap={
            'img': 'image',
            'gt_bboxes': 'bboxes'
        }),
    dict(type='YOLOv5HSVRandomAug'),
    dict(type='mmdet.RandomFlip', prob=0.5),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                   'flip_direction'))
]

train_pipeline = [
    *pre_transform,
    dict(
        type='Mosaic',
        img_scale=img_scale,
        pad_val=114.0,
        pre_transform=pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - affine_scale, 1 + affine_scale),
        max_aspect_ratio=max_aspect_ratio,
        # img_scale is (width, height)
        border=(-img_scale[0] // 2, -img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *last_transform
]

train_pipeline_stage2 = [
    *pre_transform,
    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
    dict(
        type='LetterResize',
        scale=img_scale,
        allow_scale_up=True,
        pad_val=dict(img=114.0)),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - affine_scale, 1 + affine_scale),
        max_aspect_ratio=max_aspect_ratio,
        border_val=(114, 114, 114)), *last_transform
]

train_dataloader = dict(
    batch_size=train_batch_size_per_gpu,
    num_workers=train_num_workers,
    persistent_workers=persistent_workers,
    pin_memory=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    collate_fn=dict(type='yolov5_collate'),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=train_ann_file,
        data_prefix=dict(img=train_data_prefix),
        filter_cfg=dict(filter_empty_gt=False, min_size=32),
        pipeline=train_pipeline))

test_pipeline = [
    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
    dict(
        type='LetterResize',
        scale=img_scale,
        allow_scale_up=False,
        pad_val=dict(img=114)),
    dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param'))
]

val_dataloader = dict(
    batch_size=val_batch_size_per_gpu,
    num_workers=val_num_workers,
    persistent_workers=persistent_workers,
    pin_memory=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        test_mode=True,
        data_prefix=dict(img=val_data_prefix),
        ann_file=val_ann_file,
        pipeline=test_pipeline,
        batch_shapes_cfg=batch_shapes_cfg))

test_dataloader = val_dataloader

param_scheduler = None
optim_wrapper = dict(
    type='OptimWrapper',
    clip_grad=dict(max_norm=10.0),
    optimizer=dict(
        type='SGD',
        lr=base_lr,
        momentum=0.937,
        weight_decay=weight_decay,
        nesterov=True,
        batch_size_per_gpu=train_batch_size_per_gpu),
    constructor='YOLOv5OptimizerConstructor')

default_hooks = dict(
    param_scheduler=dict(
        type='YOLOv5ParamSchedulerHook',
        scheduler_type='linear',
        lr_factor=lr_factor,
        max_epochs=max_epochs),
    checkpoint=dict(
        type='CheckpointHook',
        interval=save_epoch_intervals,
        save_best='auto',
        max_keep_ckpts=max_keep_ckpts))

custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0001,
        update_buffers=True,
        strict_load=False,
        priority=49),
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - close_mosaic_epochs,
        switch_pipeline=train_pipeline_stage2)
]

val_evaluator = dict(
    type='mmdet.CocoMetric',
    proposal_nums=(100, 1, 10),
    ann_file=data_root + val_ann_file,
    metric='bbox')
test_evaluator = val_evaluator

train_cfg = dict(
    type='EpochBasedTrainLoop',
    max_epochs=max_epochs,
    val_interval=save_epoch_intervals,
    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                        val_interval_stage2)])

val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
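YOLOv5OptimizerConstructor receives batch_size_per_gpu so it can scale weight decay toward a base total batch size (64 by default, per the mmyolo docs). A rough sketch of that scaling, not the library code:

def yolov5_style_scaling(weight_decay, batch_size_per_gpu, num_gpus,
                         base_total_batch_size=64):
    total_batch_size = batch_size_per_gpu * num_gpus
    # Accumulate gradients when the total batch is below the base size.
    accumulate = max(round(base_total_batch_size / total_batch_size), 1)
    scale = total_batch_size * accumulate / base_total_batch_size
    return weight_decay * scale, accumulate

print(yolov5_style_scaling(0.0005, 16, 8))  # (0.001, 1) for 8x16=128 bs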
yolov8l-world.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0e56623553f30137149da28097b882b3413fa2a00cce88d19e426475b70da5dc
size 444388398
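This .pth entry is a Git LFS pointer, not the weights themselves; after fetching the real file (e.g. with git lfs pull), the download can be checked against the digest recorded above:

import hashlib

with open('yolov8l-world.pth', 'rb') as f:
    digest = hashlib.sha256(f.read()).hexdigest()
print(digest == '0e56623553f30137149da28097b882b3413fa2a00cce88d19e426475b70da5dc')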
yolov8l-world.py ADDED
@@ -0,0 +1,181 @@
_base_ = 'yolov8_l_syncbn_fast_8xb16-500e_coco.py'
custom_imports = dict(imports=['yolo_world'],
                      allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name='openai/clip-vit-base-patch32',
            frozen_modules=['all'])),
    neck=dict(
        type='YOLOWorldPAFPN',
        guide_channels=text_channels,
        embed_channels=neck_embed_channels,
        num_heads=neck_num_heads,
        block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'),
        num_csp_blocks=2),
    bbox_head=dict(
        type='YOLOWorldHead',
        head_module=dict(
            type='YOLOWorldHeadModule',
            embed_dims=text_channels,
            use_bn_head=True,
            num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(type='MultiModalMosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/captions/obj365v1_class_captions.json',
    pipeline=train_pipeline)

mg_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/mixed_grounding/',
    ann_file='annotations/final_mixed_train_no_coco.json',
    data_prefix=dict(img='gqa/images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)

flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)

train_dataloader = dict(
    batch_size=train_batch_size_per_gpu,
    collate_fn=dict(type='yolow_collate'),
    dataset=dict(
        _delete_=True,
        type='ConcatDataset',
        datasets=[
            obj365v1_train_dataset,
            flickr_train_dataset,
            mg_train_dataset
        ],
        ignore_keys=['classes', 'palette']))

test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5LVISV1Dataset',
        data_root='data/lvis/',
        test_mode=True,
        ann_file='annotations/'
        'lvis_v1_minival_inserted_image_name.json',
        data_prefix=dict(img=''),
        batch_shapes_cfg=None),
    class_text_path='data/captions/lvis_v1_class_captions.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

val_evaluator = dict(
    type='mmdet.LVISMetric',
    ann_file='data/lvis/annotations/'
    'lvis_v1_minival_inserted_image_name.json',
    metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(
    param_scheduler=dict(max_epochs=max_epochs),
    checkpoint=dict(interval=save_epoch_intervals,
                    rule='greater'))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=10,
    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                        _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(
        bias_decay_mult=0.0,
        norm_decay_mult=0.0,
        custom_keys={
            'backbone.text_model': dict(lr_mult=0.01),
            'logit_scale': dict(weight_decay=0.0)
        }),
    constructor='YOLOWv5OptimizerConstructor')
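A minimal inference sketch, assuming the yolo_world package is importable (as custom_imports above requires) and mmdet 3.x is installed; the config and checkpoint names are the files committed here:

from mmdet.apis import init_detector

model = init_detector('yolov8l-world.py',
                      'yolov8l-world.pth',
                      device='cuda:0')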