adonaivera committed
Commit 201693a
1 Parent(s): e9b779d

Upload 7 files

yolov8l-world.py CHANGED
@@ -30,16 +30,17 @@ model = dict(
             type='HuggingCLIPLanguageBackbone',
             model_name='openai/clip-vit-base-patch32',
             frozen_modules=['all'])),
-    neck=dict(type='YOLOWorldPAFPN',
+    neck=dict(type='YOLOWolrdDualPAFPN',
               guide_channels=text_channels,
               embed_channels=neck_embed_channels,
               num_heads=neck_num_heads,
               block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'),
-              num_csp_blocks=2),
+              text_enhancder=dict(type='ImagePoolingAttentionModule',
+                                  embed_channels=256,
+                                  num_heads=8)),
     bbox_head=dict(type='YOLOWorldHead',
                    head_module=dict(type='YOLOWorldHeadModule',
                                     embed_dims=text_channels,
-                                    use_bn_head=True,
                                     num_classes=num_training_classes)),
     train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

@@ -80,37 +81,33 @@ obj365v1_train_dataset = dict(
         ann_file='annotations/objects365_train.json',
         data_prefix=dict(img='train/'),
         filter_cfg=dict(filter_empty_gt=False, min_size=32)),
-    class_text_path='data/captions/obj365v1_class_captions.json',
+    class_text_path='data/texts/obj365v1_class_texts.json',
     pipeline=train_pipeline)

-mg_train_dataset = dict(
-    type='YOLOv5MixedGroundingDataset',
-    data_root='data/mixed_grounding/',
-    ann_file='annotations/final_mixed_train_no_coco.json',
-    data_prefix=dict(img='gqa/images/'),
-    filter_cfg=dict(filter_empty_gt=False, min_size=32),
-    pipeline=train_pipeline)
+mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset',
+                        data_root='data/mixed_grounding/',
+                        ann_file='annotations/final_mixed_train_no_coco.json',
+                        data_prefix=dict(img='gqa/images/'),
+                        filter_cfg=dict(filter_empty_gt=False, min_size=32),
+                        pipeline=train_pipeline)

 flickr_train_dataset = dict(
     type='YOLOv5MixedGroundingDataset',
     data_root='data/flickr/',
     ann_file='annotations/final_flickr_separateGT_train.json',
-    data_prefix=dict(img='images/'),
+    data_prefix=dict(img='full_images/'),
     filter_cfg=dict(filter_empty_gt=True, min_size=32),
     pipeline=train_pipeline)

-train_dataloader = dict(
-    batch_size=train_batch_size_per_gpu,
-    collate_fn=dict(type='yolow_collate'),
-    dataset=dict(
-        _delete_=True,
-        type='ConcatDataset',
-        datasets=[
-            obj365v1_train_dataset,
-            flickr_train_dataset,
-            mg_train_dataset
-        ],
-        ignore_keys=['classes', 'palette']))
+train_dataloader = dict(batch_size=train_batch_size_per_gpu,
+                        collate_fn=dict(type='yolow_collate'),
+                        dataset=dict(_delete_=True,
+                                     type='ConcatDataset',
+                                     datasets=[
+                                         obj365v1_train_dataset,
+                                         flickr_train_dataset, mg_train_dataset
+                                     ],
+                                     ignore_keys=['classes', 'palette']))

 test_pipeline = [
     *_base_.test_pipeline[:-1],
@@ -122,31 +119,26 @@ test_pipeline = [
 coco_val_dataset = dict(
     _delete_=True,
     type='MultiModalDataset',
-    dataset=dict(
-        type='YOLOv5LVISV1Dataset',
-        data_root='data/lvis/',
-        test_mode=True,
-        ann_file='annotations/'
-        'lvis_v1_minival_inserted_image_name.json',
-        data_prefix=dict(img=''),
-        batch_shapes_cfg=None),
-    class_text_path='data/captions/lvis_v1_class_captions.json',
+    dataset=dict(type='YOLOv5LVISV1Dataset',
+                 data_root='data/coco/',
+                 test_mode=True,
+                 ann_file='lvis/lvis_v1_val.json',
+                 data_prefix=dict(img=''),
+                 batch_shapes_cfg=None),
+    class_text_path='data/texts/lvis_v1_class_texts.json',
     pipeline=test_pipeline)
 val_dataloader = dict(dataset=coco_val_dataset)
 test_dataloader = val_dataloader

-val_evaluator = dict(
-    type='mmdet.LVISMetric',
-    ann_file='data/lvis/annotations/'
-    'lvis_v1_minival_inserted_image_name.json',
-    metric='bbox')
+val_evaluator = dict(type='mmdet.LVISMetric',
+                     ann_file='data/coco/lvis/lvis_v1_val.json',
+                     metric='bbox')
 test_evaluator = val_evaluator

 # training settings
-default_hooks = dict(
-    param_scheduler=dict(max_epochs=max_epochs),
-    checkpoint=dict(interval=save_epoch_intervals,
-                    rule='greater'))
+default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
+                     checkpoint=dict(interval=save_epoch_intervals,
+                                     rule='greater'))
 custom_hooks = [
     dict(type='EMAHook',
          ema_type='ExpMomentumEMA',
@@ -158,24 +150,22 @@ custom_hooks = [
          switch_epoch=max_epochs - close_mosaic_epochs,
          switch_pipeline=train_pipeline_stage2)
 ]
-train_cfg = dict(
-    max_epochs=max_epochs,
-    val_interval=10,
-    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
-                        _base_.val_interval_stage2)])
+train_cfg = dict(max_epochs=max_epochs,
+                 val_interval=10,
+                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
+                                     _base_.val_interval_stage2)])
 optim_wrapper = dict(optimizer=dict(
     _delete_=True,
     type='AdamW',
     lr=base_lr,
     weight_decay=weight_decay,
     batch_size_per_gpu=train_batch_size_per_gpu),
-    paramwise_cfg=dict(
-        bias_decay_mult=0.0,
-        norm_decay_mult=0.0,
-        custom_keys={
-            'backbone.text_model':
-            dict(lr_mult=0.01),
-            'logit_scale':
-            dict(weight_decay=0.0)
-        }),
-    constructor='YOLOWv5OptimizerConstructor')
+                     paramwise_cfg=dict(bias_decay_mult=0.0,
+                                        norm_decay_mult=0.0,
+                                        custom_keys={
+                                            'backbone.text_model':
+                                            dict(lr_mult=0.01),
+                                            'logit_scale':
+                                            dict(weight_decay=0.0)
+                                        }),
+                     constructor='YOLOWv5OptimizerConstructor')
 
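The diff above swaps the neck from YOLOWorldPAFPN to the dual-path YOLOWolrdDualPAFPN with an ImagePoolingAttentionModule text enhancer, drops use_bn_head, and repoints the class-text files from data/captions/ to data/texts/. A minimal sketch of checking that the updated config resolves as expected, assuming mmengine, mmyolo and the yolo_world package are installed and the _base_ file 'yolov8_l_syncbn_fast_8xb16-500e_coco.py' sits next to yolov8l-world.py (custom_imports is resolved when the file is loaded):

from mmengine.config import Config

# Loading the config also imports `yolo_world` via custom_imports.
cfg = Config.fromfile('yolov8l-world.py')
print(cfg.model.neck.type)                 # expected: 'YOLOWolrdDualPAFPN'
print(cfg.model.neck.text_enhancder.type)  # expected: 'ImagePoolingAttentionModule'
print(cfg.train_dataloader.batch_size)     # train_batch_size_per_gpu = 16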
yolov8lx-world.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7a5eea3b42020210ea31f0cbd5e1c1687ca00069267b55088beb934fbb7cf48a
+ size 444389737
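The .pth entries in this commit are Git LFS pointer files; only the hash and size are stored in the repository itself. Once the real weights have been pulled, a quick sanity check is to open the checkpoint and inspect its keys. This is an illustrative sketch only; the exact dictionary layout depends on how the checkpoint was saved:

import torch

# On recent PyTorch you may need weights_only=False for mmengine-style checkpoints.
ckpt = torch.load('yolov8lx-world.pth', map_location='cpu')
state = ckpt.get('state_dict', ckpt)   # mmengine checkpoints usually nest weights here
print(len(state), 'parameter tensors')
print(next(iter(state)))               # first parameter name, e.g. a backbone weight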
yolov8lx-world.py ADDED
@@ -0,0 +1,171 @@
_base_ = ('yolov8_l_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'],
                      allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name='openai/clip-vit-base-patch32',
            frozen_modules=['all'])),
    neck=dict(type='YOLOWolrdDualPAFPN',
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'),
              text_enhancder=dict(type='ImagePoolingAttentionModule',
                                  embed_channels=256,
                                  num_heads=8)),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(type='MultiModalMosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)

mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset',
                        data_root='data/mixed_grounding/',
                        ann_file='annotations/final_mixed_train_no_coco.json',
                        data_prefix=dict(img='gqa/images/'),
                        filter_cfg=dict(filter_empty_gt=False, min_size=32),
                        pipeline=train_pipeline)

flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='full_images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)

train_dataloader = dict(batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=dict(_delete_=True,
                                     type='ConcatDataset',
                                     datasets=[
                                         obj365v1_train_dataset,
                                         flickr_train_dataset, mg_train_dataset
                                     ],
                                     ignore_keys=['classes', 'palette']))

test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(type='YOLOv5LVISV1Dataset',
                 data_root='data/coco/',
                 test_mode=True,
                 ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
                 data_prefix=dict(img=''),
                 batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

val_evaluator = dict(type='mmdet.LVISMetric',
                     ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
                     metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
                     checkpoint=dict(interval=save_epoch_intervals,
                                     rule='greater'))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=10,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(optimizer=dict(
    _delete_=True,
    type='AdamW',
    lr=base_lr,
    weight_decay=weight_decay,
    batch_size_per_gpu=train_batch_size_per_gpu),
                     paramwise_cfg=dict(bias_decay_mult=0.0,
                                        norm_decay_mult=0.0,
                                        custom_keys={
                                            'backbone.text_model':
                                            dict(lr_mult=0.01),
                                            'logit_scale':
                                            dict(weight_decay=0.0)
                                        }),
                     constructor='YOLOWv5OptimizerConstructor')
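A hedged sketch of pairing this config with the matching checkpoint to build the detector through MMDetection's high-level API. It assumes mmdet 3.x, mmyolo and the yolo_world package are installed, the _base_ config is on disk next to this file, and the data/texts/*.json class-text files referenced above exist if you go on to run prompted evaluation; this only builds the model, it does not run detection:

from mmdet.apis import init_detector

model = init_detector('yolov8lx-world.py', 'yolov8lx-world.pth', device='cpu')
print(type(model).__name__)  # expected: 'YOLOWorldDetector'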
yolov8m-world.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b7bd1bed30eba666db8c9aeb17076270955c91ec15e4220262091b6471ec89e
+ size 370349069
yolov8m-world.py ADDED
@@ -0,0 +1,171 @@
_base_ = ('yolov8_m_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'],
                      allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name='openai/clip-vit-base-patch32',
            frozen_modules=['all'])),
    neck=dict(type='YOLOWolrdDualPAFPN',
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'),
              text_enhancder=dict(type='ImagePoolingAttentionModule',
                                  embed_channels=256,
                                  num_heads=8)),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(type='MultiModalMosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)

mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset',
                        data_root='data/mixed_grounding/',
                        ann_file='annotations/final_mixed_train_no_coco.json',
                        data_prefix=dict(img='gqa/images/'),
                        filter_cfg=dict(filter_empty_gt=False, min_size=32),
                        pipeline=train_pipeline)

flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='full_images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)

train_dataloader = dict(batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=dict(_delete_=True,
                                     type='ConcatDataset',
                                     datasets=[
                                         obj365v1_train_dataset,
                                         flickr_train_dataset, mg_train_dataset
                                     ],
                                     ignore_keys=['classes', 'palette']))

test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(type='YOLOv5LVISV1Dataset',
                 data_root='data/coco/',
                 test_mode=True,
                 ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
                 data_prefix=dict(img=''),
                 batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

val_evaluator = dict(type='mmdet.LVISMetric',
                     ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
                     metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
                     checkpoint=dict(interval=save_epoch_intervals,
                                     rule='greater'))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=10,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(optimizer=dict(
    _delete_=True,
    type='AdamW',
    lr=base_lr,
    weight_decay=weight_decay,
    batch_size_per_gpu=train_batch_size_per_gpu),
                     paramwise_cfg=dict(bias_decay_mult=0.0,
                                        norm_decay_mult=0.0,
                                        custom_keys={
                                            'backbone.text_model':
                                            dict(lr_mult=0.01),
                                            'logit_scale':
                                            dict(weight_decay=0.0)
                                        }),
                     constructor='YOLOWv5OptimizerConstructor')
yolov8s-world.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:18bea4d2f2b0af1d4a2d7353bd113184b84313ff285acd5690a4aa4e71da08a6
+ size 307536897
yolov8s-world.py ADDED
@@ -0,0 +1,171 @@
_base_ = ('yolov8_s_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'],
                      allow_failed_imports=False)

# hyper-parameters
num_classes = 1203
num_training_classes = 80
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 2
save_epoch_intervals = 2
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name='openai/clip-vit-base-patch32',
            frozen_modules=['all'])),
    neck=dict(type='YOLOWolrdDualPAFPN',
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'),
              text_enhancder=dict(type='ImagePoolingAttentionModule',
                                  embed_channels=256,
                                  num_heads=8)),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
train_pipeline = [
    *_base_.pre_transform,
    dict(type='MultiModalMosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        max_aspect_ratio=_base_.max_aspect_ratio,
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114)),
    *_base_.last_transform[:-1],
    *text_transform,
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
obj365v1_train_dataset = dict(
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5Objects365V1Dataset',
        data_root='data/objects365v1/',
        ann_file='annotations/objects365_train.json',
        data_prefix=dict(img='train/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/obj365v1_class_texts.json',
    pipeline=train_pipeline)

mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset',
                        data_root='data/mixed_grounding/',
                        ann_file='annotations/final_mixed_train_no_coco.json',
                        data_prefix=dict(img='gqa/images/'),
                        filter_cfg=dict(filter_empty_gt=False, min_size=32),
                        pipeline=train_pipeline)

flickr_train_dataset = dict(
    type='YOLOv5MixedGroundingDataset',
    data_root='data/flickr/',
    ann_file='annotations/final_flickr_separateGT_train.json',
    data_prefix=dict(img='full_images/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=train_pipeline)

train_dataloader = dict(batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=dict(_delete_=True,
                                     type='ConcatDataset',
                                     datasets=[
                                         obj365v1_train_dataset,
                                         flickr_train_dataset, mg_train_dataset
                                     ],
                                     ignore_keys=['classes', 'palette']))

test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(type='YOLOv5LVISV1Dataset',
                 data_root='data/coco/',
                 test_mode=True,
                 ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
                 data_prefix=dict(img=''),
                 batch_shapes_cfg=None),
    class_text_path='data/texts/lvis_v1_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

val_evaluator = dict(type='mmdet.LVISMetric',
                     ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
                     metric='bbox')
test_evaluator = val_evaluator

# training settings
default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
                     checkpoint=dict(interval=save_epoch_intervals,
                                     rule='greater'))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=10,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(optimizer=dict(
    _delete_=True,
    type='AdamW',
    lr=base_lr,
    weight_decay=weight_decay,
    batch_size_per_gpu=train_batch_size_per_gpu),
                     paramwise_cfg=dict(bias_decay_mult=0.0,
                                        norm_decay_mult=0.0,
                                        custom_keys={
                                            'backbone.text_model':
                                            dict(lr_mult=0.01),
                                            'logit_scale':
                                            dict(weight_decay=0.0)
                                        }),
                     constructor='YOLOWv5OptimizerConstructor')