andriizadaianchuk committed
Commit e2fdd40
1 Parent(s): 6b27857

Upload config.yaml with huggingface_hub

Files changed (1)
  1. config.yaml +468 -0
config.yaml ADDED
@@ -0,0 +1,468 @@
dataset:
  use_epochs: false
  num_workers: 4
  batch_size: ${experiment.batch_size_per_gpu}
  _target_: ocl.datasets.WebdatasetDataModule
  train_shards: ${oc.env:DATASET_PREFIX}/vg_disjoint_coco/train/shard-{000000..001217}.tar
  train_size: 118287
  val_shards: ${oc.env:DATASET_PREFIX}/vg/val/shard-{000000..000037}.tar
  val_size: 5000
  test_shards: ${oc.env:DATASET_PREFIX}/vg/test/shard-{000000..000037}.tar
  test_size: 40670
  use_autopadding: true
  eval_transforms:
    03a_preprocessing:
      _target_: ocl.transforms.Map
      transform:
        _target_: torchvision.transforms.Compose
        transforms:
        - _target_: ocl.preprocessing.SelectConditioningInfoVG
          num_max_binds: ${experiment.num_slots}
          num_slots: ${experiment.num_slots}
        - _target_: ocl.preprocessing.CopyFields
          mapping:
            instance_mask: instance_mask_v2
      fields:
      - image
      - instance_mask
      - instance_bbox
      - name
      - bbox_centroids
      - name_embedding
      - selected_indices
      - contrastive_loss_mask
      - all_bbox_centroids
      batch_transform: false
    03c_preprocessing:
      _target_: ocl.transforms.SimpleTransform
      transforms:
        image:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda image: image.copy()''}'
          - _target_: torchvision.transforms.v2.ToImage
          - _target_: torchvision.transforms.v2.ToDtype
            dtype: ${torch_dtype:float32}
            scale: true
          - _target_: torchvision.transforms.v2.Normalize
            mean:
            - 0.485
            - 0.456
            - 0.406
            std:
            - 0.229
            - 0.224
            - 0.225
        instance_mask:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.IntegerToOneHotMask
            output_axis: -3
          - _target_: ocl.preprocessing.AddEmptyMasksVG
          - _target_: ocl.preprocessing.DenseMaskToTensor
        instance_mask_v2:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.IntegerToOneHotMask
            output_axis: -3
          - _target_: ocl.preprocessing.AddEmptyMasksVG
          - _target_: ocl.preprocessing.DenseMaskToTensor
      batch_transform: false
  train_transforms:
    03a_preprocessing:
      _target_: ocl.transforms.Map
      transform:
        _target_: torchvision.transforms.Compose
        transforms:
        - _target_: ocl.preprocessing.SelectConditioningInfoVG
          num_max_binds: ${experiment.num_slots}
          num_slots: ${experiment.num_slots}
        - _target_: ocl.preprocessing.CopyFields
          mapping:
            instance_mask: instance_mask_v2
      fields:
      - image
      - instance_mask
      - instance_bbox
      - name
      - bbox_centroids
      - name_embedding
      - selected_indices
      - contrastive_loss_mask
      batch_transform: false
    03b_preprocessing:
      _target_: ocl.transforms.SimpleTransform
      transforms:
        image:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda image: image.copy()''}'
          - _target_: torchvision.transforms.v2.ToImage
          - _target_: torchvision.transforms.v2.ToDtype
            dtype: ${torch_dtype:float32}
            scale: true
          - _target_: torchvision.transforms.v2.Normalize
            mean:
            - 0.485
            - 0.456
            - 0.406
            std:
            - 0.229
            - 0.224
            - 0.225
        name_embedding:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda name_embedding: name_embedding.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        bbox_centroids:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda bbox_centroids: bbox_centroids.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        all_bbox_centroids:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda all_bbox_centroids: all_bbox_centroids.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        selected_indices:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda selected_indices: selected_indices.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        contrastive_loss_mask:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda contrastive_loss_mask: contrastive_loss_mask.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        instance_mask:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.IntegerToOneHotMask
            output_axis: -3
          - _target_: ocl.preprocessing.AddEmptyMasksVG
          - _target_: ocl.preprocessing.DenseMaskToTensor
        instance_mask_v2:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.IntegerToOneHotMask
            output_axis: -3
          - _target_: ocl.preprocessing.AddEmptyMasksVG
          - _target_: ocl.preprocessing.DenseMaskToTensor
      batch_transform: false
models:
  feature_extractor:
    _target_: routed.ocl.feature_extractors.TimmFeatureExtractor
    model_name: ${experiment.timm_model}
    pretrained: ${when_testing:false,true}
    freeze: true
    feature_level: 12
    video_path: input.image
    dynamic_img_size: true
  mapping:
    _target_: routed.ocl.mapping.MLPMapping
    dim: ${experiment.feature_dim}
    x_path: feature_extractor
  conditioning:
    _target_: routed.ocl.conditioning.LangConditioning
    n_slots: ${experiment.num_slots}
    object_dim: ${experiment.slot_dim}
    dual_conditioning: false
    name_embedding_path: input.name_embedding
    batch_size_path: input.batch_size
    mask_path: input.contrastive_loss_mask
  perceptual_grouping:
    _target_: routed.ocl.perceptual_grouping.SlotAttentionGrouping
    feature_dim: ${.object_dim}
    object_dim: ${experiment.slot_dim}
    use_projection_bias: false
    positional_embedding:
      _target_: ocl.neural_networks.wrappers.Sequential
      _args_:
      - _target_: ocl.neural_networks.positional_embedding.DummyPositionEmbed
      - _target_: ocl.neural_networks.build_two_layer_mlp
        input_dim: ${experiment.feature_dim}
        output_dim: ${....feature_dim}
        hidden_dim: '${mul: ${experiment.feature_dim}, 2}'
        initial_layer_norm: true
    ff_mlp:
      _target_: ocl.neural_networks.build_two_layer_mlp
      input_dim: ${..object_dim}
      output_dim: ${..object_dim}
      hidden_dim: '${mul: ${..object_dim}, 4}'
      initial_layer_norm: true
      residual: true
    feature_path: mapping
    conditioning_path: conditioning
  attn_aggregation:
    _target_: routed.ocl.heads.PatchClipAttentionAggregationHead
    dim: ${experiment.feature_dim}
    attn_path: perceptual_grouping.feature_attributions
    x_path: input.image
  projector_slots:
    _target_: routed.ocl.heads.SlotProjectorHead
    dim: 512
    embedding_dim: 512
    slots_path: attn_aggregation
  dual_embedding:
    _target_: routed.ocl.heads.CLIPLangEmbeddingHead
    embedding_dim: 512
    names_batch_path: input.name
  dec_conditioning:
    _target_: routed.ocl.decoder_conditioning.EncodeLangConditioning
    dim: ${experiment.slot_dim}
    language_path: input.name_embedding
    mask_path: input.contrastive_loss_mask
  object_decoder:
    _target_: routed.ocl.decoding.PatchDecoder
    decoder:
      _target_: ocl.neural_networks.build_mlp
      _partial_: true
      features:
      - 2048
      - 2048
      - 2048
    object_dim: ${experiment.slot_dim}
    output_dim: ${experiment.feature_dim}
    num_patches: ${experiment.num_patches}
    object_features_path: perceptual_grouping.objects
    image_path: input.image
    conditioned: true
    condition_info_path: dec_conditioning
optimizers:
  opt0:
    _target_: ocl.optimization.OptimizationWrapper
    optimizer:
      _target_: torch.optim.AdamW
      _partial_: true
      lr: ${experiment.total_lr}
    lr_scheduler:
      _target_: ocl.scheduling.exponential_decay_after_optional_warmup
      _partial_: true
      decay_rate: 0.5
      decay_steps: 100000
      warmup_steps: 10000
    parameter_groups:
      _target_: ocl.optimization.ParameterGroupCreator
      param_groups:
        grouping:
          params:
          - models.perceptual_grouping
          - models.conditioning
          - models.object_decoder
          - models.dec_conditioning
          lr: ${experiment.total_lr}
          weight_decay: 0.0
        encoder:
          params:
          - models.mapping
          - models.projector_slots
          lr: ${experiment.mapping_lr}
          weight_decay: 0.0
losses:
  mse:
    _target_: routed.ocl.losses.ReconstructionLoss
    loss_type: mse
    input_path: object_decoder.reconstruction
    target_path: feature_extractor.features
  contrastive_loss:
    _target_: routed.ocl.losses.DiagonalContrastiveLoss
    x1_path: projector_slots
    x2_path: dual_embedding
    contrastive_loss_mask_path: input.contrastive_loss_mask
    temp: 0.1
    batch_contrastive: true
    weight: 0.2
visualizations:
  input:
    _target_: routed.ocl.visualizations.Image
    n_instances: 32
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
      - 0.485
      - 0.456
      - 0.406
      std:
      - 0.229
      - 0.224
      - 0.225
    image_path: input.image
  masks:
    _target_: routed.ocl.visualizations.Mask
    mask_path: object_decoder.masks_as_image
  pred_segmentation:
    _target_: routed.ocl.visualizations.Segmentation
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
      - 0.485
      - 0.456
      - 0.406
      std:
      - 0.229
      - 0.224
      - 0.225
    image_path: input.image
    mask_path: object_decoder.masks_as_image
  pred_segmentation_with_text:
    _target_: routed.ocl.visualizations.SegmentationWithText
    n_instances: 32
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
      - 0.485
      - 0.456
      - 0.406
      std:
      - 0.229
      - 0.224
      - 0.225
    image_path: input.image
    mask_path: object_decoder.masks_as_image
    gt_masks_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    text_path: input.name
    bbox_centroids_path: input.all_bbox_centroids
trainer:
  _target_: pytorch_lightning.trainer.trainer.Trainer
  accelerator: auto
  strategy: auto
  devices: 1
  num_nodes: 1
  precision: null
  logger:
  - _target_: pytorch_lightning.loggers.TensorBoardLogger
    save_dir: .
    name: tb
    version: ''
  - _target_: pytorch_lightning.loggers.WandbLogger
    project: ${slice:${hydra:runtime.choices.experiment},"/", 1}_${slice:${hydra:runtime.choices.experiment},"/", 2}
    name: ${slice:${hydra:runtime.choices.experiment},"/","3:"}
    log_model: false
  callbacks: ${oc.dict.values:experiment.callbacks}
  fast_dev_run: false
  max_epochs: -1
  min_epochs: null
  max_steps: 100000
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: 5000
  check_val_every_n_epoch: null
  num_sanity_val_steps: null
  log_every_n_steps: 100
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: .
training_vis_frequency: 10000
training_metrics:
  acc_sc:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: sc
    slot_emb_path: projector_slots
    ctrl_emb_path: dual_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_cs:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: cs
    slot_emb_path: projector_slots
    ctrl_emb_path: dual_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_avg:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: average
    slot_emb_path: projector_slots
    ctrl_emb_path: dual_embedding
    mask_idx_path: input.contrastive_loss_mask
evaluation_metrics:
  binding_hits:
    _target_: routed.ocl.metrics.BindingHits
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: false
  instance_ari:
    _target_: routed.ocl.metrics.ARIMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    foreground: false
    convert_target_one_hot: true
    ignore_overlaps: true
  instance_mbo:
    _target_: routed.ocl.metrics.UnsupervisedMaskIoUMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: true
  gt_matched_instance_mbo:
    _target_: routed.ocl.metrics.UnsupervisedMaskIoUMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: true
  acc_sc:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: sc
    slot_emb_path: projector_slots
    ctrl_emb_path: dual_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_cs:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: cs
    slot_emb_path: projector_slots
    ctrl_emb_path: dual_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_avg:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: average
    slot_emb_path: projector_slots
    ctrl_emb_path: dual_embedding
    mask_idx_path: input.contrastive_loss_mask
load_checkpoint: null
load_checkpoint_partial: null
modules_to_load: null
trainable_models: null
seed: null
experiment:
  callbacks: {}
  checkpoint_every_n_steps: 1000
  image_size: 224
  mask_size: ${.image_size}
  batch_size_per_gpu: 64
  base_learning_rate: 0.0004
  max_num_binds: 7
  slot_dim: 256
  num_slots: 7
  timm_model: vit_small_patch14_dinov2.lvd142m
  feature_dim: '${timm_model_dim: ${.timm_model}}'
  num_patches: '${timm_model_num_patches: ${.timm_model}, ${.image_size}}'
  num_patches_per_side: '${isqrt: ${.num_patches}}'
  patch_size: '${timm_model_patch_size: ${.timm_model}}'
  total_batch_size: '${mul: ${trainer.devices}, ${.batch_size_per_gpu}}'
  total_lr: 0.0004
  mapping_lr: '${mul: 0.05, ${.total_lr}}'
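
Note on the interpolations: every derived value in `experiment` is computed at resolution time by OmegaConf resolvers, so a single command-line override such as `experiment.total_lr=2e-4` propagates consistently to the optimizer and both parameter groups. With the values above, `total_batch_size` resolves to 1 × 64 = 64 and `mapping_lr` to 0.05 × 0.0004 = 2e-5. Below is a minimal sketch of that resolution, assuming the `mul` and `isqrt` resolvers registered by the ocl codebase are plain multiplication and integer square root; their actual definitions, like the other custom resolvers referenced in this file (`lambda_fn`, `torch_dtype`, `when_testing`, `slice`, `timm_model_dim`, ...), live in the training entry point and are not part of this commit.

import math

from omegaconf import OmegaConf

# Hypothetical stand-ins for the project's custom resolvers; only the
# arithmetic ones needed for this snippet are registered here.
OmegaConf.register_new_resolver("mul", lambda a, b: a * b)
OmegaConf.register_new_resolver("isqrt", lambda x: math.isqrt(int(x)))

# A reduced config containing just the keys the two interpolations touch.
cfg = OmegaConf.create(
    {
        "trainer": {"devices": 1},
        "experiment": {
            "batch_size_per_gpu": 64,
            "total_lr": 0.0004,
            "total_batch_size": "${mul: ${trainer.devices}, ${.batch_size_per_gpu}}",
            "mapping_lr": "${mul: 0.05, ${.total_lr}}",
        },
    }
)

print(cfg.experiment.total_batch_size)  # 64 (1 device x 64 per GPU)
print(cfg.experiment.mapping_lr)        # 2e-05 (0.05 * 0.0004)

Loading the full `config.yaml` standalone works the same way (`OmegaConf.load("config.yaml")`), provided all of the custom resolvers above are registered first; the `${hydra:...}` interpolations additionally require resolution inside a Hydra run.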