Add files using upload-large-folder tool
Browse files- ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/config.txt +228 -0
- ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/eval_results.txt +110 -0
- ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/logs/EVAL_GPU_0_all_losses_cls_loss/events.out.tfevents.1729874043.autodl-container-b3ec4da47b-bc5fbea1.663163.4 +3 -0
- ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/logs/EVAL_GPU_0_all_losses_reg_loss/events.out.tfevents.1729874043.autodl-container-b3ec4da47b-bc5fbea1.663163.5 +3 -0
- ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/logs/EVAL_GPU_0_all_losses_vtm_loss/events.out.tfevents.1729874043.autodl-container-b3ec4da47b-bc5fbea1.663163.6 +3 -0
- ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/logs/events.out.tfevents.1729871205.autodl-container-b3ec4da47b-bc5fbea1.663163.0 +3 -0
- ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/logs/events.out.tfevents.1729871205.autodl-container-b3ec4da47b-bc5fbea1.663164.0 +3 -0
- ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/logs/train_GPU_0_all_losses_cls_loss/events.out.tfevents.1729871427.autodl-container-b3ec4da47b-bc5fbea1.663163.1 +3 -0
- ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/logs/train_GPU_0_all_losses_reg_loss/events.out.tfevents.1729871427.autodl-container-b3ec4da47b-bc5fbea1.663163.2 +3 -0
- ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/logs/train_GPU_0_all_losses_vtm_loss/events.out.tfevents.1729871427.autodl-container-b3ec4da47b-bc5fbea1.663163.3 +3 -0
- ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/model_3_21.966959215281364.pth.tar +3 -0
- ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/nlq_predictions_epoch_val_top10_3.json +0 -0
- ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/nlq_predictions_epoch_val_top10_3_noscore.json +0 -0
- ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/run.sh +1 -0
- ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/config.txt +226 -0
- ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/eval_results.txt +44 -0
- ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/EVAL_GPU_0_all_losses_cls_loss/events.out.tfevents.1728637028.autodl-container-b3ec4da47b-bc5fbea1.33573.4 +3 -0
- ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/EVAL_GPU_0_all_losses_reg_loss/events.out.tfevents.1728637028.autodl-container-b3ec4da47b-bc5fbea1.33573.5 +3 -0
- ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/EVAL_GPU_0_all_losses_vtm_loss/events.out.tfevents.1728637028.autodl-container-b3ec4da47b-bc5fbea1.33573.6 +3 -0
- ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/events.out.tfevents.1728632235.autodl-container-b3ec4da47b-bc5fbea1.31554.0 +3 -0
- ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/events.out.tfevents.1728632235.autodl-container-b3ec4da47b-bc5fbea1.31555.0 +3 -0
- ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/events.out.tfevents.1728632793.autodl-container-b3ec4da47b-bc5fbea1.33573.0 +3 -0
- ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/events.out.tfevents.1728632793.autodl-container-b3ec4da47b-bc5fbea1.33574.0 +3 -0
- ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/train_GPU_0_all_losses_cls_loss/events.out.tfevents.1728633033.autodl-container-b3ec4da47b-bc5fbea1.33573.1 +3 -0
- ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/train_GPU_0_all_losses_reg_loss/events.out.tfevents.1728633033.autodl-container-b3ec4da47b-bc5fbea1.33573.2 +3 -0
- ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/train_GPU_0_all_losses_vtm_loss/events.out.tfevents.1728633033.autodl-container-b3ec4da47b-bc5fbea1.33573.3 +3 -0
- ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/model_2_26.834358523725836.pth.tar +3 -0
- ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/nlq_predictions_epoch_val_top10_2.json +0 -0
- ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/nlq_predictions_epoch_val_top10_2_noscore.json +0 -0
- ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/run.sh +1 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/config.txt +223 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/eval_results.txt +56 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/events.out.tfevents.1726904924.autodl-container-b3ec4da47b-bc5fbea1.458057.0 +3 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/events.out.tfevents.1726904924.autodl-container-b3ec4da47b-bc5fbea1.458059.0 +3 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/events.out.tfevents.1726904924.autodl-container-b3ec4da47b-bc5fbea1.458060.0 +3 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/events.out.tfevents.1726906438.autodl-container-b3ec4da47b-bc5fbea1.460310.0 +3 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/events.out.tfevents.1726906439.autodl-container-b3ec4da47b-bc5fbea1.460307.0 +3 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/events.out.tfevents.1726906439.autodl-container-b3ec4da47b-bc5fbea1.460308.0 +3 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/events.out.tfevents.1726906439.autodl-container-b3ec4da47b-bc5fbea1.460309.0 +3 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/train_GPU_0_all_losses_cls_loss/events.out.tfevents.1726906838.autodl-container-b3ec4da47b-bc5fbea1.460307.1 +3 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/train_GPU_0_all_losses_reg_loss/events.out.tfevents.1726906838.autodl-container-b3ec4da47b-bc5fbea1.460307.2 +3 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/train_GPU_0_all_losses_vtm_loss/events.out.tfevents.1726906838.autodl-container-b3ec4da47b-bc5fbea1.460307.3 +3 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/nlq_predictions_epoch_val_top10_6.json +0 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/nlq_predictions_epoch_val_top10_6_Bayesian.json +0 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/nlq_predictions_epoch_val_top10_6_noscore.json +0 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/nlq_predictions_epoch_val_top10_6_noscore_Bayesian.json +0 -0
- goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/run.sh +1 -0
- pretrain_naq/egovlp/model_5_pretrain_egovlp.pth.tar +3 -0
- pretrain_naq/internvideo/model_7_pretrain.pth.tar +3 -0
- tacos/c3d/scratch/tacos_c3d_glove_weight1_5e-5_objectmambafinetune150/config.txt +228 -0
ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/config.txt
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{'dataset': {'classname_feat_concat': 'only',
|
| 2 |
+
'classname_feat_dir': '/root/autodl-tmp/data/ego4d/nlq/classname-clip-base/a_photo_of.pt',
|
| 3 |
+
'default_fps': 30,
|
| 4 |
+
'downsample_rate': 1,
|
| 5 |
+
'enable_temporal_jittering': False,
|
| 6 |
+
'feat_stride': 16.0,
|
| 7 |
+
'fix_video_frames': 0,
|
| 8 |
+
'input_txt_dim': 512,
|
| 9 |
+
'input_vid_dim': 256,
|
| 10 |
+
'json_file': 'ego4d_data/nlq_v1/ego4d_nlq_val_v1_lemma.jsonl',
|
| 11 |
+
'lavila_caption_dir': '/root/autodl-tmp/data/ego4d/nlq/lavila/narration',
|
| 12 |
+
'max_seq_len': 2560,
|
| 13 |
+
'num_classes': 1,
|
| 14 |
+
'num_frames': 16.0,
|
| 15 |
+
'object_feat_dir': '/root/autodl-tmp/data/ego4d/nlq/co-detr/class-score0.6-minnum10-lmdb',
|
| 16 |
+
'object_feat_type': 'class-score',
|
| 17 |
+
'text_feat_dir': '/root/autodl-tmp/data/ego4d/nlq/SnAG/nlq_v1_clip_token_features',
|
| 18 |
+
'train_jsonl_file': 'ego4d_data/nlq_v1/ego4d_nlq_train_v1_lemma_clean.jsonl',
|
| 19 |
+
'val_jsonl_file': 'ego4d_data/nlq_v1/ego4d_nlq_val_v1_lemma.jsonl',
|
| 20 |
+
'val_text_feat_dir': '/root/autodl-tmp/data/ego4d/nlq/SnAG/nlq_v1_clip_token_features',
|
| 21 |
+
'video_feat_dir': '/root/autodl-tmp/data/ego4d/nlq/egovlp_lmdb'},
|
| 22 |
+
'dataset_name': 'ego4d_multitask',
|
| 23 |
+
'devices': 'cuda:0',
|
| 24 |
+
'init_rand_seed': 12345678,
|
| 25 |
+
'loader': {'batch_size': 4, 'num_workers': 4},
|
| 26 |
+
'model': {'backbone_arch': [2, 4, 4, 0, 6],
|
| 27 |
+
'backbone_type': 'ObjectMambaTransformer',
|
| 28 |
+
'embd_dim': 512,
|
| 29 |
+
'embd_kernel_size': 3,
|
| 30 |
+
'embd_with_ln': True,
|
| 31 |
+
'fpn_dim': 512,
|
| 32 |
+
'fpn_start_level': 0,
|
| 33 |
+
'fpn_type': 'identity',
|
| 34 |
+
'fpn_with_ln': True,
|
| 35 |
+
'generator': {'generator_type': 'point'},
|
| 36 |
+
'head_dim': 512,
|
| 37 |
+
'head_kernel_size': 3,
|
| 38 |
+
'head_num_layers': 3,
|
| 39 |
+
'head_with_ln': True,
|
| 40 |
+
'input_txt_dim': 512,
|
| 41 |
+
'input_vid_dim': 256,
|
| 42 |
+
'max_buffer_len_factor': 4.0,
|
| 43 |
+
'max_query': 230,
|
| 44 |
+
'max_seq_len': 2560,
|
| 45 |
+
'max_shot_num': 1800,
|
| 46 |
+
'multiscale_encoder_cfg': [{'layer_cfg': {'mha_win_size': 9,
|
| 47 |
+
'n_ds_strides': [2, 2],
|
| 48 |
+
'n_embd': 256,
|
| 49 |
+
'n_head': 4,
|
| 50 |
+
'path_pdrop': 0.1},
|
| 51 |
+
'layer_num': 6,
|
| 52 |
+
'layer_type': 'TransformerBlock'},
|
| 53 |
+
{'layer_cfg': {'in_channels': [256,
|
| 54 |
+
256,
|
| 55 |
+
256,
|
| 56 |
+
256,
|
| 57 |
+
256,
|
| 58 |
+
256,
|
| 59 |
+
256],
|
| 60 |
+
'out_channel': 256},
|
| 61 |
+
'layer_num': 1,
|
| 62 |
+
'layer_type': 'FPNLayernorm'}],
|
| 63 |
+
'n_head': 4,
|
| 64 |
+
'n_mha_win_size': 9,
|
| 65 |
+
'nlq_heads_cfg': {'center_sample_radius': 1.5,
|
| 66 |
+
'cls_head_cfg': {'empty_cls': [],
|
| 67 |
+
'feat_dim': 256,
|
| 68 |
+
'input_dim': 256,
|
| 69 |
+
'kernel_size': 3,
|
| 70 |
+
'num_classes': 1,
|
| 71 |
+
'num_layers': 3,
|
| 72 |
+
'prior_prob': 0.01,
|
| 73 |
+
'with_ln': True},
|
| 74 |
+
'duration_thresh': 0.001,
|
| 75 |
+
'iou_threshold': 0.1,
|
| 76 |
+
'loss_normalizer': 200,
|
| 77 |
+
'loss_normalizer_momentum': 0.9,
|
| 78 |
+
'max_seg_num': 5,
|
| 79 |
+
'min_score': 0.001,
|
| 80 |
+
'pre_nms_thresh': 0.001,
|
| 81 |
+
'pre_nms_topk': 2000,
|
| 82 |
+
'reg_head_cfg': {'feat_dim': 256,
|
| 83 |
+
'fpn_levels': 7,
|
| 84 |
+
'input_dim': 256,
|
| 85 |
+
'kernel_size': 3,
|
| 86 |
+
'num_layers': 3,
|
| 87 |
+
'with_ln': True},
|
| 88 |
+
'reg_loss_weight': 1.0,
|
| 89 |
+
'train_label_smoothing': 0.1},
|
| 90 |
+
'num_classes': 1,
|
| 91 |
+
'obj_encoder_cfg': [{'layer_cfg': {'act': 'relu',
|
| 92 |
+
'n_in': 512,
|
| 93 |
+
'n_out': 256,
|
| 94 |
+
'num_layer': 2},
|
| 95 |
+
'layer_num': 1,
|
| 96 |
+
'layer_type': 'MaskedConv1DLayer'},
|
| 97 |
+
{'layer_cfg': {'n_embd': 256, 'path_pdrop': 0.1},
|
| 98 |
+
'layer_num': 4,
|
| 99 |
+
'layer_type': 'ObjectEncoderBlock'}],
|
| 100 |
+
'object_dim': 512,
|
| 101 |
+
'object_use_cross_model': True,
|
| 102 |
+
'object_win_size': 1,
|
| 103 |
+
'regression_range': [[0, 4],
|
| 104 |
+
[2, 8],
|
| 105 |
+
[4, 16],
|
| 106 |
+
[8, 32],
|
| 107 |
+
[16, 64],
|
| 108 |
+
[32, 128],
|
| 109 |
+
[64, 10000]],
|
| 110 |
+
'scale_factor': 2,
|
| 111 |
+
'tasks': ['NLQ', 'VTM'],
|
| 112 |
+
'test_cfg': {'duration_thresh': 0.001,
|
| 113 |
+
'ext_score_file': None,
|
| 114 |
+
'iou_threshold': 0.1,
|
| 115 |
+
'max_seg_num': 5,
|
| 116 |
+
'min_score': 0.001,
|
| 117 |
+
'multiclass_nms': True,
|
| 118 |
+
'nms_method': 'soft',
|
| 119 |
+
'nms_sigma': 0.75,
|
| 120 |
+
'pre_nms_thresh': 0.001,
|
| 121 |
+
'pre_nms_topk': 2000,
|
| 122 |
+
'test_num': 1,
|
| 123 |
+
'test_start_epoch': 2,
|
| 124 |
+
'voting_thresh': 0.9},
|
| 125 |
+
'text_encoder_cfg': [{'layer_cfg': {'act': 'relu',
|
| 126 |
+
'n_in': 512,
|
| 127 |
+
'n_out': 256,
|
| 128 |
+
'num_layer': 2},
|
| 129 |
+
'layer_num': 1,
|
| 130 |
+
'layer_type': 'MaskedConv1DLayer'},
|
| 131 |
+
{'layer_cfg': {'n_embd': 256,
|
| 132 |
+
'n_head': 4,
|
| 133 |
+
'path_pdrop': 0.1},
|
| 134 |
+
'layer_num': 4,
|
| 135 |
+
'layer_type': 'TransformerBlock'}],
|
| 136 |
+
'train_cfg': {'box_loss_weight': 5.0,
|
| 137 |
+
'center_sample': 'radius',
|
| 138 |
+
'center_sample_radius': 1.5,
|
| 139 |
+
'clip_grad_l2norm': 1.0,
|
| 140 |
+
'cls_prior_prob': 0.01,
|
| 141 |
+
'dropout': 0.0,
|
| 142 |
+
'droppath': 0.1,
|
| 143 |
+
'head_empty_cls': [],
|
| 144 |
+
'init_loss_norm': 200,
|
| 145 |
+
'iou_loss_weight': 1.0,
|
| 146 |
+
'label_smoothing': 0.1,
|
| 147 |
+
'loss_weight': 1.0,
|
| 148 |
+
'mamba_arch': ['bimamba1', 'mlp', 'obj'],
|
| 149 |
+
'num_decoder_layer': 6},
|
| 150 |
+
'use_abs_pe': True,
|
| 151 |
+
'use_lmha_in_fpn': True,
|
| 152 |
+
'use_rel_pe': False,
|
| 153 |
+
'video_encoder_cfg': [{'layer_cfg': {'act': 'relu',
|
| 154 |
+
'kernel_size': 3,
|
| 155 |
+
'n_hidden': 256,
|
| 156 |
+
'n_in': 256,
|
| 157 |
+
'n_out': 256,
|
| 158 |
+
'num_layer': 2},
|
| 159 |
+
'layer_num': 1,
|
| 160 |
+
'layer_type': 'MaskedConv1DLayer'},
|
| 161 |
+
{'layer_cfg': {'mamba_arch': ['bimamba1',
|
| 162 |
+
'mlp',
|
| 163 |
+
'obj'],
|
| 164 |
+
'n_embd': 256,
|
| 165 |
+
'n_head': 4,
|
| 166 |
+
'path_pdrop': 0.1},
|
| 167 |
+
'layer_num': 4,
|
| 168 |
+
'layer_type': 'ObjectMambaBlock'}],
|
| 169 |
+
'vtm_heads_cfg': {'loss_weight': 1.5,
|
| 170 |
+
'multiscale': False,
|
| 171 |
+
'shot_aggregator_cfg': {'layer_cfg': {'cross_mixer_cfg': {'block_cfg': {'n_embd': 256,
|
| 172 |
+
'n_head': 4},
|
| 173 |
+
'block_type': 'MaskedMHA'},
|
| 174 |
+
'num_layer': 1,
|
| 175 |
+
'path_pdrop': 0.1,
|
| 176 |
+
'query_num': 5,
|
| 177 |
+
'self_mixer_cfg': {'block_cfg': {'n_embd': 256,
|
| 178 |
+
'n_head': 4},
|
| 179 |
+
'block_type': 'MaskedMHCA'}},
|
| 180 |
+
'layer_type': 'QFormerLayer'},
|
| 181 |
+
'similarity_head_cfg': {'layer_cfg': {'con_dim': 256,
|
| 182 |
+
'x_dim': 256,
|
| 183 |
+
'y_dim': 256},
|
| 184 |
+
'layer_type': 'Cosine'},
|
| 185 |
+
'soft_label': False}},
|
| 186 |
+
'model_name': 'MultiTaskArch',
|
| 187 |
+
'opt': {'backbone_lr_weight': 1,
|
| 188 |
+
'epochs': 6,
|
| 189 |
+
'learning_rate': 0.0004,
|
| 190 |
+
'momentum': 0.9,
|
| 191 |
+
'schedule_gamma': 0.1,
|
| 192 |
+
'schedule_steps': [],
|
| 193 |
+
'schedule_type': 'cosine',
|
| 194 |
+
'type': 'AdamW',
|
| 195 |
+
'warmup': True,
|
| 196 |
+
'warmup_epochs': 4,
|
| 197 |
+
'weight_decay': 0.05},
|
| 198 |
+
'output_folder': '/root/autodl-tmp/model/GroundNLQ/ckpt/',
|
| 199 |
+
'test_cfg': {'duration_thresh': 0.001,
|
| 200 |
+
'ext_score_file': None,
|
| 201 |
+
'iou_threshold': 0.1,
|
| 202 |
+
'max_seg_num': 5,
|
| 203 |
+
'min_score': 0.001,
|
| 204 |
+
'multiclass_nms': True,
|
| 205 |
+
'nms_method': 'soft',
|
| 206 |
+
'nms_sigma': 0.75,
|
| 207 |
+
'pre_nms_thresh': 0.001,
|
| 208 |
+
'pre_nms_topk': 2000,
|
| 209 |
+
'test_num': 1,
|
| 210 |
+
'test_start_epoch': 2,
|
| 211 |
+
'voting_thresh': 0.9},
|
| 212 |
+
'track': 'goal_step',
|
| 213 |
+
'train_cfg': {'box_loss_weight': 5.0,
|
| 214 |
+
'center_sample': 'radius',
|
| 215 |
+
'center_sample_radius': 1.5,
|
| 216 |
+
'clip_grad_l2norm': 1.0,
|
| 217 |
+
'cls_prior_prob': 0.01,
|
| 218 |
+
'dropout': 0.0,
|
| 219 |
+
'droppath': 0.1,
|
| 220 |
+
'head_empty_cls': [],
|
| 221 |
+
'init_loss_norm': 200,
|
| 222 |
+
'iou_loss_weight': 1.0,
|
| 223 |
+
'label_smoothing': 0.1,
|
| 224 |
+
'loss_weight': 1.0,
|
| 225 |
+
'mamba_arch': ['bimamba1', 'mlp', 'obj'],
|
| 226 |
+
'num_decoder_layer': 6},
|
| 227 |
+
'train_split': ['training'],
|
| 228 |
+
'val_split': ['validation']}
|
ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/eval_results.txt
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
+----------+----------+----------+----------+----------+----------+
|
| 2 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 3 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 4 |
+
+----------+----------+----------+----------+----------+----------+
|
| 5 |
+
| 8.00 | 4.31 | 23.26 | 12.13 | 23.26 | 12.13 |
|
| 6 |
+
+----------+----------+----------+----------+----------+----------+avgiou=8.002065
|
| 7 |
+
epoch0
|
| 8 |
+
final_loss 0.67 (1.66)
|
| 9 |
+
cls_loss 0.24 (0.91)
|
| 10 |
+
reg_loss 0.17 (0.39)
|
| 11 |
+
vtm_loss 0.18 (0.24)
|
| 12 |
+
+----------+----------+----------+----------+----------+----------+
|
| 13 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 14 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 15 |
+
+----------+----------+----------+----------+----------+----------+
|
| 16 |
+
| 20.03 | 13.29 | 41.35 | 29.04 | 41.35 | 29.04 |
|
| 17 |
+
+----------+----------+----------+----------+----------+----------+avgiou=20.030976
|
| 18 |
+
epoch1
|
| 19 |
+
final_loss 0.71 (1.61)
|
| 20 |
+
cls_loss 0.29 (0.90)
|
| 21 |
+
reg_loss 0.15 (0.37)
|
| 22 |
+
vtm_loss 0.18 (0.23)
|
| 23 |
+
+----------+----------+----------+----------+----------+----------+
|
| 24 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 25 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 26 |
+
+----------+----------+----------+----------+----------+----------+
|
| 27 |
+
| 21.35 | 14.61 | 43.78 | 32.11 | 43.78 | 32.11 |
|
| 28 |
+
+----------+----------+----------+----------+----------+----------+avgiou=21.347445
|
| 29 |
+
epoch2
|
| 30 |
+
final_loss 0.63 (1.59)
|
| 31 |
+
cls_loss 0.22 (0.89)
|
| 32 |
+
reg_loss 0.15 (0.36)
|
| 33 |
+
vtm_loss 0.18 (0.23)
|
| 34 |
+
+----------+----------+----------+----------+----------+----------+
|
| 35 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 36 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 37 |
+
+----------+----------+----------+----------+----------+----------+
|
| 38 |
+
| 21.97 | 15.20 | 44.61 | 32.96 | 44.61 | 32.96 |
|
| 39 |
+
+----------+----------+----------+----------+----------+----------+avgiou=21.966959
|
| 40 |
+
epoch3
|
| 41 |
+
final_loss 0.54 (1.60)
|
| 42 |
+
cls_loss 0.16 (0.90)
|
| 43 |
+
reg_loss 0.11 (0.36)
|
| 44 |
+
vtm_loss 0.18 (0.23)
|
| 45 |
+
+----------+----------+----------+----------+----------+----------+
|
| 46 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 47 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 48 |
+
+----------+----------+----------+----------+----------+----------+
|
| 49 |
+
| 21.55 | 14.79 | 42.82 | 31.62 | 42.82 | 31.62 |
|
| 50 |
+
+----------+----------+----------+----------+----------+----------+avgiou=21.553949
|
| 51 |
+
epoch4
|
| 52 |
+
final_loss 0.68 (1.64)
|
| 53 |
+
cls_loss 0.26 (0.93)
|
| 54 |
+
reg_loss 0.16 (0.36)
|
| 55 |
+
vtm_loss 0.17 (0.23)
|
| 56 |
+
+----------+----------+----------+----------+----------+----------+
|
| 57 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 58 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 59 |
+
+----------+----------+----------+----------+----------+----------+
|
| 60 |
+
| 20.55 | 14.09 | 41.40 | 30.23 | 41.40 | 30.23 |
|
| 61 |
+
+----------+----------+----------+----------+----------+----------+avgiou=20.547238
|
| 62 |
+
epoch5
|
| 63 |
+
final_loss 0.61 (1.69)
|
| 64 |
+
cls_loss 0.20 (0.98)
|
| 65 |
+
reg_loss 0.13 (0.36)
|
| 66 |
+
vtm_loss 0.19 (0.23)
|
| 67 |
+
+----------+----------+----------+----------+----------+----------+
|
| 68 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 69 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 70 |
+
+----------+----------+----------+----------+----------+----------+
|
| 71 |
+
| 19.90 | 13.42 | 39.03 | 28.32 | 39.03 | 28.32 |
|
| 72 |
+
+----------+----------+----------+----------+----------+----------+avgiou=19.901910
|
| 73 |
+
epoch6
|
| 74 |
+
final_loss 0.72 (1.78)
|
| 75 |
+
cls_loss 0.23 (1.04)
|
| 76 |
+
reg_loss 0.13 (0.37)
|
| 77 |
+
vtm_loss 0.24 (0.25)
|
| 78 |
+
+----------+----------+----------+----------+----------+----------+
|
| 79 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 80 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 81 |
+
+----------+----------+----------+----------+----------+----------+
|
| 82 |
+
| 18.61 | 12.39 | 36.91 | 27.00 | 36.91 | 27.00 |
|
| 83 |
+
+----------+----------+----------+----------+----------+----------+avgiou=18.611255
|
| 84 |
+
epoch7
|
| 85 |
+
final_loss 0.64 (1.85)
|
| 86 |
+
cls_loss 0.20 (1.08)
|
| 87 |
+
reg_loss 0.10 (0.38)
|
| 88 |
+
vtm_loss 0.23 (0.26)
|
| 89 |
+
+----------+----------+----------+----------+----------+----------+
|
| 90 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 91 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 92 |
+
+----------+----------+----------+----------+----------+----------+
|
| 93 |
+
| 17.73 | 11.51 | 35.26 | 25.27 | 35.26 | 25.27 |
|
| 94 |
+
+----------+----------+----------+----------+----------+----------+avgiou=17.733609
|
| 95 |
+
epoch8
|
| 96 |
+
final_loss 0.74 (1.92)
|
| 97 |
+
cls_loss 0.25 (1.12)
|
| 98 |
+
reg_loss 0.13 (0.38)
|
| 99 |
+
vtm_loss 0.23 (0.28)
|
| 100 |
+
+----------+----------+----------+----------+----------+----------+
|
| 101 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 102 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 103 |
+
+----------+----------+----------+----------+----------+----------+
|
| 104 |
+
| 17.42 | 11.51 | 33.66 | 24.26 | 33.66 | 24.26 |
|
| 105 |
+
+----------+----------+----------+----------+----------+----------+avgiou=17.423851
|
| 106 |
+
epoch9
|
| 107 |
+
final_loss 0.70 (1.95)
|
| 108 |
+
cls_loss 0.25 (1.14)
|
| 109 |
+
reg_loss 0.11 (0.38)
|
| 110 |
+
vtm_loss 0.23 (0.29)
|
ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/logs/EVAL_GPU_0_all_losses_cls_loss/events.out.tfevents.1729874043.autodl-container-b3ec4da47b-bc5fbea1.663163.4
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:65f92dbc196d204a5bbf192b730190cda2c485c7914f7e4e2dcd701038c89a6d
|
| 3 |
+
size 5486
|
ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/logs/EVAL_GPU_0_all_losses_reg_loss/events.out.tfevents.1729874043.autodl-container-b3ec4da47b-bc5fbea1.663163.5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0ee9efa1cd173238395f72430ea8fd85b71d712a988b1191c069ddec5230161c
|
| 3 |
+
size 5486
|
ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/logs/EVAL_GPU_0_all_losses_vtm_loss/events.out.tfevents.1729874043.autodl-container-b3ec4da47b-bc5fbea1.663163.6
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d610e01810c519a7e41dc31e8ecf599103f3e00022a6fc343e597d52875fddad
|
| 3 |
+
size 5486
|
ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/logs/events.out.tfevents.1729871205.autodl-container-b3ec4da47b-bc5fbea1.663163.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c4045b4b0d93a6cdee08d58dce600856cb25a359a55874c68b4e03b29e86e87d
|
| 3 |
+
size 20484
|
ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/logs/events.out.tfevents.1729871205.autodl-container-b3ec4da47b-bc5fbea1.663164.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:44b680fd20310c4773deaddbb8c93562931ca0921fef7900c2a67c03ce628aa6
|
| 3 |
+
size 88
|
ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/logs/train_GPU_0_all_losses_cls_loss/events.out.tfevents.1729871427.autodl-container-b3ec4da47b-bc5fbea1.663163.1
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:53de0e531f68e19336dfb0fe941113e31478e4918067f9435f9efb3b0069cee7
|
| 3 |
+
size 7407
|
ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/logs/train_GPU_0_all_losses_reg_loss/events.out.tfevents.1729871427.autodl-container-b3ec4da47b-bc5fbea1.663163.2
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b50081741ae2873022666decf314f8cb1a5c2ab4af4d7fdf103d21877b1b481d
|
| 3 |
+
size 7407
|
ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/logs/train_GPU_0_all_losses_vtm_loss/events.out.tfevents.1729871427.autodl-container-b3ec4da47b-bc5fbea1.663163.3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:995c1012741d37f3f379da79a4ef7399703746711bb50f4160d4b5e41ed92a7f
|
| 3 |
+
size 7407
|
ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/model_3_21.966959215281364.pth.tar
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:941a926ec2879bd0cc2a245a2eaa5ac4c03cbbed2deebc26c86ae15ed59c340b
|
| 3 |
+
size 122083183
|
ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/nlq_predictions_epoch_val_top10_3.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/nlq_predictions_epoch_val_top10_3_noscore.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ego4d_nlq_v1/egovlp/finetuned/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4_objectmambafinetune173/run.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
bash tools/train_ego4d_finetune_head_twogpu.sh configs/ego4d_nlq_v1_multitask_egovlp_256_finetune_2e-4.yaml /root/autodl-tmp/model/GroundNLQ/ckpt/save/model_5_pretrain_egovlp.pth.tar objectmambafinetune173 0,1
|
ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/config.txt
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{'dataset': {'classname_feat_concat': 'only',
|
| 2 |
+
'classname_feat_dir': '/root/autodl-tmp/data/ego4d/nlq/classname-clip-base/a_photo_of.pt',
|
| 3 |
+
'default_fps': 30,
|
| 4 |
+
'downsample_rate': 1,
|
| 5 |
+
'enable_temporal_jittering': False,
|
| 6 |
+
'feat_stride': 16.0,
|
| 7 |
+
'fix_video_frames': 0,
|
| 8 |
+
'input_txt_dim': 512,
|
| 9 |
+
'input_vid_dim': 2304,
|
| 10 |
+
'json_file': './ego4d_data/ego4d_nlq_v2_ori_data/nlq_val.json',
|
| 11 |
+
'lavila_caption_dir': '/root/autodl-tmp/data/ego4d/nlq/lavila/narration',
|
| 12 |
+
'max_seq_len': 2560,
|
| 13 |
+
'num_classes': 1,
|
| 14 |
+
'num_frames': 16.0,
|
| 15 |
+
'object_feat_dir': '/root/autodl-tmp/data/ego4d/nlq/co-detr/class-score0.6-minnum10-lmdb',
|
| 16 |
+
'object_feat_type': 'class-score',
|
| 17 |
+
'text_feat_dir': '/root/autodl-tmp/data/ego4d/nlq/nlq_v2_clip_token_features',
|
| 18 |
+
'train_jsonl_file': './ego4d_data/ego4d_nlq_train_v2_label_lemma.jsonl',
|
| 19 |
+
'val_jsonl_file': './ego4d_data/ego4d_nlq_val_v2_label_lemma.jsonl',
|
| 20 |
+
'val_text_feat_dir': '/root/autodl-tmp/data/ego4d/nlq/nlq_v2_clip_token_features',
|
| 21 |
+
'video_feat_dir': '/root/autodl-tmp/data/ego4d/nlq/em_egovlp+internvideo_visual_features_1.87fps'},
|
| 22 |
+
'dataset_name': 'ego4d_multitask',
|
| 23 |
+
'devices': 'cuda:0',
|
| 24 |
+
'init_rand_seed': 12345678,
|
| 25 |
+
'loader': {'batch_size': 4, 'num_workers': 4},
|
| 26 |
+
'model': {'backbone_arch': [2, 4, 4, 0, 6],
|
| 27 |
+
'backbone_type': 'ObjectMambaTransformer',
|
| 28 |
+
'embd_dim': 512,
|
| 29 |
+
'embd_kernel_size': 3,
|
| 30 |
+
'embd_with_ln': True,
|
| 31 |
+
'fpn_dim': 512,
|
| 32 |
+
'fpn_start_level': 0,
|
| 33 |
+
'fpn_type': 'identity',
|
| 34 |
+
'fpn_with_ln': True,
|
| 35 |
+
'generator': {'generator_type': 'point'},
|
| 36 |
+
'head_dim': 512,
|
| 37 |
+
'head_kernel_size': 3,
|
| 38 |
+
'head_num_layers': 3,
|
| 39 |
+
'head_with_ln': True,
|
| 40 |
+
'input_txt_dim': 512,
|
| 41 |
+
'input_vid_dim': 2304,
|
| 42 |
+
'max_buffer_len_factor': 4.0,
|
| 43 |
+
'max_query': 230,
|
| 44 |
+
'max_seq_len': 2560,
|
| 45 |
+
'max_shot_num': 1800,
|
| 46 |
+
'multiscale_encoder_cfg': [{'layer_cfg': {'mha_win_size': 9,
|
| 47 |
+
'n_ds_strides': [2, 2],
|
| 48 |
+
'n_embd': 512,
|
| 49 |
+
'n_head': 4,
|
| 50 |
+
'path_pdrop': 0.1},
|
| 51 |
+
'layer_num': 6,
|
| 52 |
+
'layer_type': 'TransformerBlock'},
|
| 53 |
+
{'layer_cfg': {'in_channels': [512,
|
| 54 |
+
512,
|
| 55 |
+
512,
|
| 56 |
+
512,
|
| 57 |
+
512,
|
| 58 |
+
512,
|
| 59 |
+
512],
|
| 60 |
+
'out_channel': 512},
|
| 61 |
+
'layer_num': 1,
|
| 62 |
+
'layer_type': 'FPNLayernorm'}],
|
| 63 |
+
'n_head': 4,
|
| 64 |
+
'n_mha_win_size': 9,
|
| 65 |
+
'nlq_heads_cfg': {'center_sample_radius': 1.5,
|
| 66 |
+
'cls_head_cfg': {'empty_cls': [],
|
| 67 |
+
'feat_dim': 512,
|
| 68 |
+
'input_dim': 512,
|
| 69 |
+
'kernel_size': 3,
|
| 70 |
+
'num_classes': 1,
|
| 71 |
+
'num_layers': 3,
|
| 72 |
+
'prior_prob': 0.01,
|
| 73 |
+
'with_ln': True},
|
| 74 |
+
'duration_thresh': 0.001,
|
| 75 |
+
'iou_threshold': 0.1,
|
| 76 |
+
'loss_normalizer': 200,
|
| 77 |
+
'loss_normalizer_momentum': 0.9,
|
| 78 |
+
'max_seg_num': 5,
|
| 79 |
+
'min_score': 0.001,
|
| 80 |
+
'pre_nms_thresh': 0.001,
|
| 81 |
+
'pre_nms_topk': 2000,
|
| 82 |
+
'reg_head_cfg': {'feat_dim': 512,
|
| 83 |
+
'fpn_levels': 7,
|
| 84 |
+
'input_dim': 512,
|
| 85 |
+
'kernel_size': 3,
|
| 86 |
+
'num_layers': 3,
|
| 87 |
+
'with_ln': True},
|
| 88 |
+
'reg_loss_weight': 1.0,
|
| 89 |
+
'train_label_smoothing': 0.1},
|
| 90 |
+
'num_classes': 1,
|
| 91 |
+
'obj_encoder_cfg': [{'layer_cfg': {'act': 'relu',
|
| 92 |
+
'n_in': 512,
|
| 93 |
+
'num_layer': 2},
|
| 94 |
+
'layer_num': 1,
|
| 95 |
+
'layer_type': 'MaskedConv1DLayer'},
|
| 96 |
+
{'layer_cfg': {'n_embd': 512, 'path_pdrop': 0.1},
|
| 97 |
+
'layer_num': 4,
|
| 98 |
+
'layer_type': 'ObjectEncoderBlock'}],
|
| 99 |
+
'object_dim': 512,
|
| 100 |
+
'object_use_cross_model': True,
|
| 101 |
+
'object_win_size': 1,
|
| 102 |
+
'regression_range': [[0, 4],
|
| 103 |
+
[2, 8],
|
| 104 |
+
[4, 16],
|
| 105 |
+
[8, 32],
|
| 106 |
+
[16, 64],
|
| 107 |
+
[32, 128],
|
| 108 |
+
[64, 10000]],
|
| 109 |
+
'scale_factor': 2,
|
| 110 |
+
'tasks': ['NLQ', 'VTM'],
|
| 111 |
+
'test_cfg': {'duration_thresh': 0.001,
|
| 112 |
+
'ext_score_file': None,
|
| 113 |
+
'iou_threshold': 0.1,
|
| 114 |
+
'max_seg_num': 5,
|
| 115 |
+
'min_score': 0.001,
|
| 116 |
+
'multiclass_nms': True,
|
| 117 |
+
'nms_method': 'soft',
|
| 118 |
+
'nms_sigma': 0.75,
|
| 119 |
+
'pre_nms_thresh': 0.001,
|
| 120 |
+
'pre_nms_topk': 2000,
|
| 121 |
+
'test_num': 1,
|
| 122 |
+
'test_start_epoch': 2,
|
| 123 |
+
'voting_thresh': 0.9},
|
| 124 |
+
'text_encoder_cfg': [{'layer_cfg': {'act': 'relu',
|
| 125 |
+
'n_in': 512,
|
| 126 |
+
'num_layer': 2},
|
| 127 |
+
'layer_num': 1,
|
| 128 |
+
'layer_type': 'MaskedConv1DLayer'},
|
| 129 |
+
{'layer_cfg': {'n_embd': 512,
|
| 130 |
+
'n_head': 4,
|
| 131 |
+
'path_pdrop': 0.1},
|
| 132 |
+
'layer_num': 4,
|
| 133 |
+
'layer_type': 'TransformerBlock'}],
|
| 134 |
+
'train_cfg': {'box_loss_weight': 5.0,
|
| 135 |
+
'center_sample': 'radius',
|
| 136 |
+
'center_sample_radius': 1.5,
|
| 137 |
+
'clip_grad_l2norm': 1.0,
|
| 138 |
+
'cls_prior_prob': 0.01,
|
| 139 |
+
'dropout': 0.0,
|
| 140 |
+
'droppath': 0.1,
|
| 141 |
+
'head_empty_cls': [],
|
| 142 |
+
'init_loss_norm': 200,
|
| 143 |
+
'iou_loss_weight': 1.0,
|
| 144 |
+
'label_smoothing': 0.1,
|
| 145 |
+
'loss_weight': 1.0,
|
| 146 |
+
'mamba_arch': ['bimamba1', 'mlp', 'obj'],
|
| 147 |
+
'num_decoder_layer': 6},
|
| 148 |
+
'use_abs_pe': True,
|
| 149 |
+
'use_lmha_in_fpn': True,
|
| 150 |
+
'use_rel_pe': False,
|
| 151 |
+
'video_encoder_cfg': [{'layer_cfg': {'act': 'relu',
|
| 152 |
+
'kernel_size': 3,
|
| 153 |
+
'n_hidden': 512,
|
| 154 |
+
'n_in': 2304,
|
| 155 |
+
'n_out': 512,
|
| 156 |
+
'num_layer': 2},
|
| 157 |
+
'layer_num': 1,
|
| 158 |
+
'layer_type': 'MaskedConv1DLayer'},
|
| 159 |
+
{'layer_cfg': {'mamba_arch': ['bimamba1',
|
| 160 |
+
'mlp',
|
| 161 |
+
'obj'],
|
| 162 |
+
'n_embd': 512,
|
| 163 |
+
'n_head': 4,
|
| 164 |
+
'path_pdrop': 0.1},
|
| 165 |
+
'layer_num': 4,
|
| 166 |
+
'layer_type': 'ObjectMambaBlock'}],
|
| 167 |
+
'vtm_heads_cfg': {'loss_weight': 1.5,
|
| 168 |
+
'multiscale': False,
|
| 169 |
+
'shot_aggregator_cfg': {'layer_cfg': {'cross_mixer_cfg': {'block_cfg': {'n_embd': 512,
|
| 170 |
+
'n_head': 4},
|
| 171 |
+
'block_type': 'MaskedMHA'},
|
| 172 |
+
'num_layer': 1,
|
| 173 |
+
'path_pdrop': 0.1,
|
| 174 |
+
'query_num': 5,
|
| 175 |
+
'self_mixer_cfg': {'block_cfg': {'n_embd': 512,
|
| 176 |
+
'n_head': 4},
|
| 177 |
+
'block_type': 'MaskedMHCA'}},
|
| 178 |
+
'layer_type': 'QFormerLayer'},
|
| 179 |
+
'similarity_head_cfg': {'layer_cfg': {'con_dim': 512,
|
| 180 |
+
'x_dim': 512,
|
| 181 |
+
'y_dim': 512},
|
| 182 |
+
'layer_type': 'Cosine'},
|
| 183 |
+
'soft_label': False}},
|
| 184 |
+
'model_name': 'MultiTaskArch',
|
| 185 |
+
'opt': {'backbone_lr_weight': 1,
|
| 186 |
+
'epochs': 6,
|
| 187 |
+
'learning_rate': 0.0004,
|
| 188 |
+
'momentum': 0.9,
|
| 189 |
+
'schedule_gamma': 0.1,
|
| 190 |
+
'schedule_steps': [],
|
| 191 |
+
'schedule_type': 'cosine',
|
| 192 |
+
'type': 'AdamW',
|
| 193 |
+
'warmup': True,
|
| 194 |
+
'warmup_epochs': 4,
|
| 195 |
+
'weight_decay': 0.05},
|
| 196 |
+
'output_folder': '/root/autodl-tmp/model/GroundNLQ/ckpt/',
|
| 197 |
+
'test_cfg': {'duration_thresh': 0.001,
|
| 198 |
+
'ext_score_file': None,
|
| 199 |
+
'iou_threshold': 0.1,
|
| 200 |
+
'max_seg_num': 5,
|
| 201 |
+
'min_score': 0.001,
|
| 202 |
+
'multiclass_nms': True,
|
| 203 |
+
'nms_method': 'soft',
|
| 204 |
+
'nms_sigma': 0.75,
|
| 205 |
+
'pre_nms_thresh': 0.001,
|
| 206 |
+
'pre_nms_topk': 2000,
|
| 207 |
+
'test_num': 1,
|
| 208 |
+
'test_start_epoch': 2,
|
| 209 |
+
'voting_thresh': 0.9},
|
| 210 |
+
'track': 'nlq',
|
| 211 |
+
'train_cfg': {'box_loss_weight': 5.0,
|
| 212 |
+
'center_sample': 'radius',
|
| 213 |
+
'center_sample_radius': 1.5,
|
| 214 |
+
'clip_grad_l2norm': 1.0,
|
| 215 |
+
'cls_prior_prob': 0.01,
|
| 216 |
+
'dropout': 0.0,
|
| 217 |
+
'droppath': 0.1,
|
| 218 |
+
'head_empty_cls': [],
|
| 219 |
+
'init_loss_norm': 200,
|
| 220 |
+
'iou_loss_weight': 1.0,
|
| 221 |
+
'label_smoothing': 0.1,
|
| 222 |
+
'loss_weight': 1.0,
|
| 223 |
+
'mamba_arch': ['bimamba1', 'mlp', 'obj'],
|
| 224 |
+
'num_decoder_layer': 6},
|
| 225 |
+
'train_split': ['training'],
|
| 226 |
+
'val_split': ['validation']}
|
ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/eval_results.txt
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
+----------+----------+----------+----------+----------+----------+
|
| 2 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 3 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 4 |
+
+----------+----------+----------+----------+----------+----------+
|
| 5 |
+
| 22.28 | 13.51 | 47.47 | 30.82 | 47.47 | 30.82 |
|
| 6 |
+
+----------+----------+----------+----------+----------+----------+avgiou=17.893234
|
| 7 |
+
epoch0
|
| 8 |
+
final_loss 1.49 (1.50)
|
| 9 |
+
cls_loss 0.80 (0.82)
|
| 10 |
+
reg_loss 0.41 (0.37)
|
| 11 |
+
vtm_loss 0.18 (0.21)
|
| 12 |
+
+----------+----------+----------+----------+----------+----------+
|
| 13 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 14 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 15 |
+
+----------+----------+----------+----------+----------+----------+
|
| 16 |
+
| 30.80 | 21.22 | 57.93 | 43.91 | 57.93 | 43.91 |
|
| 17 |
+
+----------+----------+----------+----------+----------+----------+avgiou=26.010545
|
| 18 |
+
epoch1
|
| 19 |
+
final_loss 1.32 (1.47)
|
| 20 |
+
cls_loss 0.76 (0.81)
|
| 21 |
+
reg_loss 0.29 (0.34)
|
| 22 |
+
vtm_loss 0.18 (0.21)
|
| 23 |
+
+----------+----------+----------+----------+----------+----------+
|
| 24 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 25 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 26 |
+
+----------+----------+----------+----------+----------+----------+
|
| 27 |
+
| 31.63 | 22.03 | 57.91 | 45.19 | 57.91 | 45.19 |
|
| 28 |
+
+----------+----------+----------+----------+----------+----------+avgiou=26.834359
|
| 29 |
+
epoch2
|
| 30 |
+
final_loss 1.39 (1.48)
|
| 31 |
+
cls_loss 0.82 (0.84)
|
| 32 |
+
reg_loss 0.31 (0.34)
|
| 33 |
+
vtm_loss 0.17 (0.20)
|
| 34 |
+
+----------+----------+----------+----------+----------+----------+
|
| 35 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 36 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 37 |
+
+----------+----------+----------+----------+----------+----------+
|
| 38 |
+
| 30.82 | 20.98 | 56.81 | 43.83 | 56.81 | 43.83 |
|
| 39 |
+
+----------+----------+----------+----------+----------+----------+avgiou=25.900703
|
| 40 |
+
epoch3
|
| 41 |
+
final_loss 1.45 (1.50)
|
| 42 |
+
cls_loss 0.80 (0.86)
|
| 43 |
+
reg_loss 0.40 (0.34)
|
| 44 |
+
vtm_loss 0.17 (0.20)
|
ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/EVAL_GPU_0_all_losses_cls_loss/events.out.tfevents.1728637028.autodl-container-b3ec4da47b-bc5fbea1.33573.4
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:204898598341fc7043ef1565613b7d8f4f72c38b4f00cc21786f8cbaff7fd1cb
|
| 3 |
+
size 2726
|
ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/EVAL_GPU_0_all_losses_reg_loss/events.out.tfevents.1728637028.autodl-container-b3ec4da47b-bc5fbea1.33573.5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:902bc45d32111db78cc6e77989359e1a7a6a84cd731e3cbd7ebd78db976394a7
|
| 3 |
+
size 2726
|
ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/EVAL_GPU_0_all_losses_vtm_loss/events.out.tfevents.1728637028.autodl-container-b3ec4da47b-bc5fbea1.33573.6
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f2fb79a698b30e5224fea25ec682384c511beaf6adedc29c26457aa1e286dd94
|
| 3 |
+
size 2726
|
ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/events.out.tfevents.1728632235.autodl-container-b3ec4da47b-bc5fbea1.31554.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9a939c2d48a6aea6f129586ee4d0fa6ba50f075927233c253ca3544769e15d79
|
| 3 |
+
size 88
|
ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/events.out.tfevents.1728632235.autodl-container-b3ec4da47b-bc5fbea1.31555.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5435840bf4ee16117de08cd1862d324adee13f0f8474bf803994ae794ad6b182
|
| 3 |
+
size 88
|
ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/events.out.tfevents.1728632793.autodl-container-b3ec4da47b-bc5fbea1.33573.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2835cf0277a91e459c72e8eed594345f5b97e35298e453e192332afa8cd47de3
|
| 3 |
+
size 12974
|
ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/events.out.tfevents.1728632793.autodl-container-b3ec4da47b-bc5fbea1.33574.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:46d617f8b7b1e0cf8f671677c63e908bd78be53699e32afa09e17e4536bfbcfc
|
| 3 |
+
size 88
|
ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/train_GPU_0_all_losses_cls_loss/events.out.tfevents.1728633033.autodl-container-b3ec4da47b-bc5fbea1.33573.1
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:418ff6da494fef248a78f29ec7bd762321dd4fcdd73bca21d8a00876b14fcb8a
|
| 3 |
+
size 5089
|
ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/train_GPU_0_all_losses_reg_loss/events.out.tfevents.1728633033.autodl-container-b3ec4da47b-bc5fbea1.33573.2
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:30a8ce87fb2738bb17843b94845afcfb532659e2426c7ff0cb7871f6dcc89bcf
|
| 3 |
+
size 5089
|
ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/logs/train_GPU_0_all_losses_vtm_loss/events.out.tfevents.1728633033.autodl-container-b3ec4da47b-bc5fbea1.33573.3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:44a0fc1c4df109b6a12877a31941824addd99bfdf6ec269d0ed190ea7a024a6b
|
| 3 |
+
size 5089
|
ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/model_2_26.834358523725836.pth.tar
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:382435ff1efe5a1167474e66d3bb31c1f33d80414bdd9c532ddeb08d39ccd139
|
| 3 |
+
size 486572975
|
ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/nlq_predictions_epoch_val_top10_2.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/nlq_predictions_epoch_val_top10_2_noscore.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ego4d_nlq_v2/internvideo/finetune/ego4d_nlq_v2_multitask_finetune_2e-4_objectmambafinetune144/run.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
bash tools/train_ego4d_finetune_head_twogpu.sh configs/ego4d_nlq_v2_multitask_finetune_2e-4.yaml /root/autodl-tmp/model/GroundNLQ/ckpt/save/model_7_pretrain.pth.tar objectmambafinetune144 0,1
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/config.txt
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{'dataset': {'classname_feat_concat': 'only',
|
| 2 |
+
'classname_feat_dir': '/root/autodl-tmp/data/ego4d/nlq/classname-clip-base/a_photo_of.pt',
|
| 3 |
+
'default_fps': 30,
|
| 4 |
+
'downsample_rate': 1,
|
| 5 |
+
'enable_temporal_jittering': False,
|
| 6 |
+
'feat_stride': 16.0,
|
| 7 |
+
'fix_video_frames': 0,
|
| 8 |
+
'input_txt_dim': 512,
|
| 9 |
+
'input_vid_dim': 2304,
|
| 10 |
+
'json_file': 'ego4d_data/goalstep_data/ego4d_goal_step_val_v2_lemma.jsonl',
|
| 11 |
+
'lavila_caption_dir': '/root/autodl-tmp/data/ego4d/goalstep/lavila-64/',
|
| 12 |
+
'max_seq_len': 9216,
|
| 13 |
+
'num_classes': 1,
|
| 14 |
+
'num_frames': 16.0,
|
| 15 |
+
'object_feat_dir': '/root/autodl-tmp/data/ego4d/goalstep/co-detr/clip-class-lmdb',
|
| 16 |
+
'object_feat_type': 'class-score',
|
| 17 |
+
'text_feat_dir': '/root/autodl-tmp/data/ego4d/goalstep/clip_query_lmdb',
|
| 18 |
+
'train_jsonl_file': 'ego4d_data/goalstep_data/clip/ego4d_goal_step_train_v2.jsonl',
|
| 19 |
+
'val_jsonl_file': 'ego4d_data/goalstep_data/ego4d_goal_step_val_v2_lemma.jsonl',
|
| 20 |
+
'val_text_feat_dir': '/root/autodl-tmp/data/ego4d/goalstep/clip_query_lmdb',
|
| 21 |
+
'video_feat_dir': '/root/autodl-tmp/data/ego4d/goalstep/video_feature/internvideo_clip_lmdb'},
|
| 22 |
+
'dataset_name': 'ego4d_multitask',
|
| 23 |
+
'devices': 'cuda:0',
|
| 24 |
+
'init_rand_seed': 12345678,
|
| 25 |
+
'loader': {'batch_size': 1, 'num_workers': 1},
|
| 26 |
+
'model': {'backbone_arch': (2, 2, 2, 0, 6),
|
| 27 |
+
'backbone_type': 'convTransformer',
|
| 28 |
+
'embd_dim': 512,
|
| 29 |
+
'embd_kernel_size': 3,
|
| 30 |
+
'embd_with_ln': True,
|
| 31 |
+
'fpn_dim': 512,
|
| 32 |
+
'fpn_start_level': 0,
|
| 33 |
+
'fpn_type': 'identity',
|
| 34 |
+
'fpn_with_ln': True,
|
| 35 |
+
'generator': {'generator_type': 'point'},
|
| 36 |
+
'head_dim': 512,
|
| 37 |
+
'head_kernel_size': 3,
|
| 38 |
+
'head_num_layers': 3,
|
| 39 |
+
'head_with_ln': True,
|
| 40 |
+
'input_txt_dim': 512,
|
| 41 |
+
'input_vid_dim': 2304,
|
| 42 |
+
'max_buffer_len_factor': 4.0,
|
| 43 |
+
'max_query': 560,
|
| 44 |
+
'max_seq_len': 9216,
|
| 45 |
+
'max_shot_num': 3400,
|
| 46 |
+
'multiscale_encoder_cfg': [{'layer_cfg': {'mha_win_size': 9,
|
| 47 |
+
'n_ds_strides': [2, 2],
|
| 48 |
+
'n_embd': 512,
|
| 49 |
+
'n_head': 4,
|
| 50 |
+
'path_pdrop': 0.1},
|
| 51 |
+
'layer_num': 6,
|
| 52 |
+
'layer_type': 'TransformerBlock'},
|
| 53 |
+
{'layer_cfg': {'in_channels': [512,
|
| 54 |
+
512,
|
| 55 |
+
512,
|
| 56 |
+
512,
|
| 57 |
+
512,
|
| 58 |
+
512,
|
| 59 |
+
512],
|
| 60 |
+
'out_channel': 512},
|
| 61 |
+
'layer_num': 1,
|
| 62 |
+
'layer_type': 'FPNLayernorm'}],
|
| 63 |
+
'n_head': 4,
|
| 64 |
+
'n_mha_win_size': -1,
|
| 65 |
+
'nlq_heads_cfg': {'center_sample_radius': 1.5,
|
| 66 |
+
'cls_head_cfg': {'empty_cls': [],
|
| 67 |
+
'feat_dim': 512,
|
| 68 |
+
'input_dim': 512,
|
| 69 |
+
'kernel_size': 3,
|
| 70 |
+
'num_classes': 1,
|
| 71 |
+
'num_layers': 3,
|
| 72 |
+
'prior_prob': 0.01,
|
| 73 |
+
'with_ln': True},
|
| 74 |
+
'duration_thresh': 0.001,
|
| 75 |
+
'iou_threshold': 0.1,
|
| 76 |
+
'loss_normalizer': 200,
|
| 77 |
+
'loss_normalizer_momentum': 0.9,
|
| 78 |
+
'max_seg_num': 5,
|
| 79 |
+
'min_score': 0.001,
|
| 80 |
+
'pre_nms_thresh': 0.001,
|
| 81 |
+
'pre_nms_topk': 2000,
|
| 82 |
+
'reg_head_cfg': {'feat_dim': 512,
|
| 83 |
+
'fpn_levels': 7,
|
| 84 |
+
'input_dim': 512,
|
| 85 |
+
'kernel_size': 3,
|
| 86 |
+
'num_layers': 3,
|
| 87 |
+
'with_ln': True},
|
| 88 |
+
'reg_loss_weight': 1.0,
|
| 89 |
+
'train_label_smoothing': 0.1},
|
| 90 |
+
'num_classes': 1,
|
| 91 |
+
'obj_encoder_cfg': [{'layer_cfg': {'act': 'relu',
|
| 92 |
+
'n_in': 512,
|
| 93 |
+
'num_layer': 2},
|
| 94 |
+
'layer_num': 1,
|
| 95 |
+
'layer_type': 'MaskedConv1DLayer'},
|
| 96 |
+
{'layer_cfg': {'n_embd': 512, 'path_pdrop': 0.1},
|
| 97 |
+
'layer_num': 4,
|
| 98 |
+
'layer_type': 'ObjectEncoderBlock'}],
|
| 99 |
+
'regression_range': [[0, 4],
|
| 100 |
+
[2, 8],
|
| 101 |
+
[4, 16],
|
| 102 |
+
[8, 32],
|
| 103 |
+
[16, 64],
|
| 104 |
+
[32, 128],
|
| 105 |
+
[64, 10000]],
|
| 106 |
+
'scale_factor': 2,
|
| 107 |
+
'tasks': ['NLQ', 'VTM'],
|
| 108 |
+
'test_cfg': {'duration_thresh': 0.001,
|
| 109 |
+
'ext_score_file': None,
|
| 110 |
+
'iou_threshold': 0.1,
|
| 111 |
+
'max_seg_num': 5,
|
| 112 |
+
'min_score': 0.001,
|
| 113 |
+
'multiclass_nms': True,
|
| 114 |
+
'nms_method': 'soft',
|
| 115 |
+
'nms_sigma': 0.75,
|
| 116 |
+
'pre_nms_thresh': 0.001,
|
| 117 |
+
'pre_nms_topk': 2000,
|
| 118 |
+
'test_num': 1,
|
| 119 |
+
'test_start_epoch': 2,
|
| 120 |
+
'voting_thresh': 0.9},
|
| 121 |
+
'text_encoder_cfg': [{'layer_cfg': {'act': 'relu',
|
| 122 |
+
'n_in': 512,
|
| 123 |
+
'num_layer': 2},
|
| 124 |
+
'layer_num': 1,
|
| 125 |
+
'layer_type': 'MaskedConv1DLayer'},
|
| 126 |
+
{'layer_cfg': {'n_embd': 512,
|
| 127 |
+
'n_head': 4,
|
| 128 |
+
'path_pdrop': 0.1},
|
| 129 |
+
'layer_num': 4,
|
| 130 |
+
'layer_type': 'TransformerBlock'}],
|
| 131 |
+
'train_cfg': {'box_loss_weight': 5.0,
|
| 132 |
+
'center_sample': 'radius',
|
| 133 |
+
'center_sample_radius': 1.5,
|
| 134 |
+
'clip_grad_l2norm': 1.0,
|
| 135 |
+
'cls_prior_prob': 0.01,
|
| 136 |
+
'dropout': 0.0,
|
| 137 |
+
'droppath': 0.1,
|
| 138 |
+
'head_empty_cls': [],
|
| 139 |
+
'init_loss_norm': 200,
|
| 140 |
+
'iou_loss_weight': 1.0,
|
| 141 |
+
'label_smoothing': 0.1,
|
| 142 |
+
'loss_weight': 1.0,
|
| 143 |
+
'mamba_arch': ['bimamba1', 'mlp', 'obj'],
|
| 144 |
+
'num_decoder_layer': 6},
|
| 145 |
+
'use_abs_pe': True,
|
| 146 |
+
'use_lmha_in_fpn': True,
|
| 147 |
+
'use_rel_pe': False,
|
| 148 |
+
'video_encoder_cfg': [{'layer_cfg': {'act': 'relu',
|
| 149 |
+
'kernel_size': 3,
|
| 150 |
+
'n_hidden': 512,
|
| 151 |
+
'n_in': 2304,
|
| 152 |
+
'n_out': 512,
|
| 153 |
+
'num_layer': 2},
|
| 154 |
+
'layer_num': 1,
|
| 155 |
+
'layer_type': 'MaskedConv1DLayer'},
|
| 156 |
+
{'layer_cfg': {'mamba_arch': ['bimamba1',
|
| 157 |
+
'mlp',
|
| 158 |
+
'obj'],
|
| 159 |
+
'n_embd': 512,
|
| 160 |
+
'n_head': 4,
|
| 161 |
+
'path_pdrop': 0.1},
|
| 162 |
+
'layer_num': 4,
|
| 163 |
+
'layer_type': 'ObjectMambaBlock'}],
|
| 164 |
+
'vtm_heads_cfg': {'loss_weight': 1.5,
|
| 165 |
+
'multiscale': False,
|
| 166 |
+
'shot_aggregator_cfg': {'layer_cfg': {'cross_mixer_cfg': {'block_cfg': {'n_embd': 512,
|
| 167 |
+
'n_head': 4},
|
| 168 |
+
'block_type': 'MaskedMHA'},
|
| 169 |
+
'num_layer': 1,
|
| 170 |
+
'path_pdrop': 0.1,
|
| 171 |
+
'query_num': 5,
|
| 172 |
+
'self_mixer_cfg': {'block_cfg': {'n_embd': 512,
|
| 173 |
+
'n_head': 4},
|
| 174 |
+
'block_type': 'MaskedMHCA'}},
|
| 175 |
+
'layer_type': 'QFormerLayer'},
|
| 176 |
+
'similarity_head_cfg': {'layer_cfg': {'con_dim': 512,
|
| 177 |
+
'x_dim': 512,
|
| 178 |
+
'y_dim': 512},
|
| 179 |
+
'layer_type': 'Cosine'},
|
| 180 |
+
'soft_label': False}},
|
| 181 |
+
'model_name': 'MultiTaskArch',
|
| 182 |
+
'opt': {'backbone_lr_weight': 1,
|
| 183 |
+
'epochs': 6,
|
| 184 |
+
'learning_rate': 0.0008,
|
| 185 |
+
'momentum': 0.9,
|
| 186 |
+
'schedule_gamma': 0.1,
|
| 187 |
+
'schedule_steps': [],
|
| 188 |
+
'schedule_type': 'cosine',
|
| 189 |
+
'type': 'AdamW',
|
| 190 |
+
'warmup': True,
|
| 191 |
+
'warmup_epochs': 4,
|
| 192 |
+
'weight_decay': 0.05},
|
| 193 |
+
'output_folder': '/root/autodl-tmp/model/GroundNLQ/goalstep/',
|
| 194 |
+
'test_cfg': {'duration_thresh': 0.001,
|
| 195 |
+
'ext_score_file': None,
|
| 196 |
+
'iou_threshold': 0.1,
|
| 197 |
+
'max_seg_num': 5,
|
| 198 |
+
'min_score': 0.001,
|
| 199 |
+
'multiclass_nms': True,
|
| 200 |
+
'nms_method': 'soft',
|
| 201 |
+
'nms_sigma': 0.75,
|
| 202 |
+
'pre_nms_thresh': 0.001,
|
| 203 |
+
'pre_nms_topk': 2000,
|
| 204 |
+
'test_num': 1,
|
| 205 |
+
'test_start_epoch': 2,
|
| 206 |
+
'voting_thresh': 0.9},
|
| 207 |
+
'track': 'goal_step',
|
| 208 |
+
'train_cfg': {'box_loss_weight': 5.0,
|
| 209 |
+
'center_sample': 'radius',
|
| 210 |
+
'center_sample_radius': 1.5,
|
| 211 |
+
'clip_grad_l2norm': 1.0,
|
| 212 |
+
'cls_prior_prob': 0.01,
|
| 213 |
+
'dropout': 0.0,
|
| 214 |
+
'droppath': 0.1,
|
| 215 |
+
'head_empty_cls': [],
|
| 216 |
+
'init_loss_norm': 200,
|
| 217 |
+
'iou_loss_weight': 1.0,
|
| 218 |
+
'label_smoothing': 0.1,
|
| 219 |
+
'loss_weight': 1.0,
|
| 220 |
+
'mamba_arch': ['bimamba1', 'mlp', 'obj'],
|
| 221 |
+
'num_decoder_layer': 6},
|
| 222 |
+
'train_split': ['training'],
|
| 223 |
+
'val_split': ['validation']}
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/eval_results.txt
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
+----------+----------+----------+----------+----------+----------+
|
| 2 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 3 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 4 |
+
+----------+----------+----------+----------+----------+----------+
|
| 5 |
+
| 26.13 | 21.61 | 56.73 | 49.30 | 56.73 | 49.30 |
|
| 6 |
+
+----------+----------+----------+----------+----------+----------+avgiou=26.130457
|
| 7 |
+
epoch0
|
| 8 |
+
+----------+----------+----------+----------+----------+----------+
|
| 9 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 10 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 11 |
+
+----------+----------+----------+----------+----------+----------+
|
| 12 |
+
| 27.99 | 23.83 | 58.72 | 51.57 | 58.72 | 51.57 |
|
| 13 |
+
+----------+----------+----------+----------+----------+----------+avgiou=27.988565
|
| 14 |
+
epoch1
|
| 15 |
+
+----------+----------+----------+----------+----------+----------+
|
| 16 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 17 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 18 |
+
+----------+----------+----------+----------+----------+----------+
|
| 19 |
+
| 28.42 | 23.79 | 59.50 | 52.39 | 59.50 | 52.39 |
|
| 20 |
+
+----------+----------+----------+----------+----------+----------+avgiou=28.417360
|
| 21 |
+
epoch2
|
| 22 |
+
+----------+----------+----------+----------+----------+----------+
|
| 23 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 24 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 25 |
+
+----------+----------+----------+----------+----------+----------+
|
| 26 |
+
| 28.17 | 24.06 | 58.74 | 51.74 | 58.74 | 51.74 |
|
| 27 |
+
+----------+----------+----------+----------+----------+----------+avgiou=28.170478
|
| 28 |
+
epoch3
|
| 29 |
+
+----------+----------+----------+----------+----------+----------+
|
| 30 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 31 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 32 |
+
+----------+----------+----------+----------+----------+----------+
|
| 33 |
+
| 28.74 | 24.27 | 59.16 | 52.27 | 59.16 | 52.27 |
|
| 34 |
+
+----------+----------+----------+----------+----------+----------+avgiou=28.742204
|
| 35 |
+
epoch4
|
| 36 |
+
+----------+----------+----------+----------+----------+----------+
|
| 37 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 38 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 39 |
+
+----------+----------+----------+----------+----------+----------+
|
| 40 |
+
| 28.60 | 24.44 | 59.65 | 53.13 | 59.65 | 53.13 |
|
| 41 |
+
+----------+----------+----------+----------+----------+----------+avgiou=28.599272
|
| 42 |
+
epoch5
|
| 43 |
+
+----------+----------+----------+----------+----------+----------+
|
| 44 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 45 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 46 |
+
+----------+----------+----------+----------+----------+----------+
|
| 47 |
+
| 29.61 | 24.94 | 59.51 | 52.48 | 59.51 | 52.48 |
|
| 48 |
+
+----------+----------+----------+----------+----------+----------+avgiou=29.612786
|
| 49 |
+
epoch6
|
| 50 |
+
+----------+----------+----------+----------+----------+----------+
|
| 51 |
+
| Rank@1 | Rank@1 | Rank@5 | Rank@5 | Rank@10 | Rank@10 |
|
| 52 |
+
| mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 | mIoU@0.3 | mIoU@0.5 |
|
| 53 |
+
+----------+----------+----------+----------+----------+----------+
|
| 54 |
+
| 28.92 | 24.57 | 59.17 | 52.43 | 59.17 | 52.43 |
|
| 55 |
+
+----------+----------+----------+----------+----------+----------+avgiou=28.924116
|
| 56 |
+
epoch7
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/events.out.tfevents.1726904924.autodl-container-b3ec4da47b-bc5fbea1.458057.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ecca71d48639775ce1f6c217bfe928294fc08b01b5568b66109cb367ea1b9f63
|
| 3 |
+
size 88
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/events.out.tfevents.1726904924.autodl-container-b3ec4da47b-bc5fbea1.458059.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:00aaeab73622930454886f391bb37812060334fc4e955468ec0796ca6f992f96
|
| 3 |
+
size 88
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/events.out.tfevents.1726904924.autodl-container-b3ec4da47b-bc5fbea1.458060.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64ac5b1566c4058ef555d97b101936cee9aeb90cfcb2885b648f348e1b6fd8d9
|
| 3 |
+
size 88
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/events.out.tfevents.1726906438.autodl-container-b3ec4da47b-bc5fbea1.460310.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d096f2b7006f2deb0a4c10eaa54f0a69f89c4ff41eaa521de174b436e93169b
|
| 3 |
+
size 88
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/events.out.tfevents.1726906439.autodl-container-b3ec4da47b-bc5fbea1.460307.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c4e28bd158929f7db87bfc69169443fa99dcb61a8e47c4fd235ea86cfebe403a
|
| 3 |
+
size 79899
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/events.out.tfevents.1726906439.autodl-container-b3ec4da47b-bc5fbea1.460308.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:91a45dc73a5396e958bb5b8b0ed845bf85c3d7625c807f712686c3b41b9af2cd
|
| 3 |
+
size 88
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/events.out.tfevents.1726906439.autodl-container-b3ec4da47b-bc5fbea1.460309.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5b33462cec9f03325622b13d84818afb653ab1aea89bfe5f3449bc670b0ea288
|
| 3 |
+
size 88
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/train_GPU_0_all_losses_cls_loss/events.out.tfevents.1726906838.autodl-container-b3ec4da47b-bc5fbea1.460307.1
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:695ef39d29320f4820ae3af21b36d103b3b88a547c56db757caf66deae948f36
|
| 3 |
+
size 39047
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/train_GPU_0_all_losses_reg_loss/events.out.tfevents.1726906838.autodl-container-b3ec4da47b-bc5fbea1.460307.2
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74c8ee73938de8a9ebbf72aa4ed3401902ef7ad497b49ca30a5042dccf8a001b
|
| 3 |
+
size 39047
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/logs/train_GPU_0_all_losses_vtm_loss/events.out.tfevents.1726906838.autodl-container-b3ec4da47b-bc5fbea1.460307.3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:692dfc9b278fc5fa5ebcb849f351b4c6c14636f88c16f4c3b6de74ba5ac0d545
|
| 3 |
+
size 39047
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/nlq_predictions_epoch_val_top10_6.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/nlq_predictions_epoch_val_top10_6_Bayesian.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/nlq_predictions_epoch_val_top10_6_noscore.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/nlq_predictions_epoch_val_top10_6_noscore_Bayesian.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
goalstep/internvideo/finetuned/ego4d_goalstep_v2_baseline_2e-4_objectmambafinetune135/run.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
bash tools/train_ego4d_finetune_head_4gpu_noeval.sh configs/ego4d_goalstep_v2_baseline_2e-4.yaml /root/autodl-tmp/model/GroundNLQ/ckpt/save/model_7_pretrain.pth.tar objectmambafinetune135 0,1,2,3
|
pretrain_naq/egovlp/model_5_pretrain_egovlp.pth.tar
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0582a3fc28fd33b0c1a183014ca70eaa39faffad9c5955f97ce460f01324b04d
|
| 3 |
+
size 122080780
|
pretrain_naq/internvideo/model_7_pretrain.pth.tar
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:55ea828749455f900e0d7bad3556d6d4035d31a7fc247a1e3d6b697a86ec5ed1
|
| 3 |
+
size 425493745
|
tacos/c3d/scratch/tacos_c3d_glove_weight1_5e-5_objectmambafinetune150/config.txt
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{'dataset': {'classname_feat_concat': 'only',
|
| 2 |
+
'classname_feat_dir': '/root/autodl-tmp/data/ego4d/nlq/classname-clip-base/a_photo_of.pt',
|
| 3 |
+
'default_fps': 29.4,
|
| 4 |
+
'downsample_rate': 1,
|
| 5 |
+
'enable_temporal_jittering': False,
|
| 6 |
+
'feat_stride': 16.0,
|
| 7 |
+
'fix_video_frames': 0,
|
| 8 |
+
'input_txt_dim': 512,
|
| 9 |
+
'input_vid_dim': 4096,
|
| 10 |
+
'json_file': 'ego4d_data/tacos/test_lemma.jsonl',
|
| 11 |
+
'lavila_caption_dir': '/root/autodl-tmp/data/tacos/lavila',
|
| 12 |
+
'max_seq_len': 3072,
|
| 13 |
+
'num_classes': 1,
|
| 14 |
+
'num_frames': 16.0,
|
| 15 |
+
'object_feat_dir': '/root/autodl-tmp/data/tacos/class-score0.6-minnum10-lmdb',
|
| 16 |
+
'object_feat_type': 'class-score',
|
| 17 |
+
'text_feat_dir': '/root/autodl-tmp/data/tacos/glove_clip_token_features',
|
| 18 |
+
'train_jsonl_file': 'ego4d_data/tacos/train_lemma.jsonl',
|
| 19 |
+
'val_jsonl_file': 'ego4d_data/tacos/test_lemma.jsonl',
|
| 20 |
+
'val_text_feat_dir': '/root/autodl-tmp/data/tacos/glove_clip_token_features',
|
| 21 |
+
'video_feat_dir': '/root/autodl-tmp/data/tacos/c3d_lmdb'},
|
| 22 |
+
'dataset_name': 'ego4d_multitask',
|
| 23 |
+
'devices': 'cuda:0',
|
| 24 |
+
'init_rand_seed': 12345678,
|
| 25 |
+
'loader': {'batch_size': 2, 'num_workers': 2},
|
| 26 |
+
'model': {'backbone_arch': [2, 4, 4, 0, 6],
|
| 27 |
+
'backbone_type': 'ObjectMambaTransformer',
|
| 28 |
+
'embd_dim': 512,
|
| 29 |
+
'embd_kernel_size': 3,
|
| 30 |
+
'embd_with_ln': True,
|
| 31 |
+
'fpn_dim': 512,
|
| 32 |
+
'fpn_start_level': 0,
|
| 33 |
+
'fpn_type': 'identity',
|
| 34 |
+
'fpn_with_ln': True,
|
| 35 |
+
'generator': {'generator_type': 'point'},
|
| 36 |
+
'head_dim': 512,
|
| 37 |
+
'head_kernel_size': 3,
|
| 38 |
+
'head_num_layers': 3,
|
| 39 |
+
'head_with_ln': True,
|
| 40 |
+
'input_txt_dim': 512,
|
| 41 |
+
'input_vid_dim': 4096,
|
| 42 |
+
'max_buffer_len_factor': 4.0,
|
| 43 |
+
'max_query': 500,
|
| 44 |
+
'max_seq_len': 3072,
|
| 45 |
+
'max_shot_num': 100,
|
| 46 |
+
'multiscale_encoder_cfg': [{'layer_cfg': {'mha_win_size': 9,
|
| 47 |
+
'n_ds_strides': [2, 2],
|
| 48 |
+
'n_embd': 512,
|
| 49 |
+
'n_head': 4,
|
| 50 |
+
'path_pdrop': 0.1},
|
| 51 |
+
'layer_num': 6,
|
| 52 |
+
'layer_type': 'TransformerBlock'},
|
| 53 |
+
{'layer_cfg': {'in_channels': [512,
|
| 54 |
+
512,
|
| 55 |
+
512,
|
| 56 |
+
512,
|
| 57 |
+
512,
|
| 58 |
+
512,
|
| 59 |
+
512],
|
| 60 |
+
'out_channel': 512},
|
| 61 |
+
'layer_num': 1,
|
| 62 |
+
'layer_type': 'FPNLayernorm'}],
|
| 63 |
+
'n_head': 4,
|
| 64 |
+
'n_mha_win_size': 9,
|
| 65 |
+
'nlq_heads_cfg': {'center_sample_radius': 1.5,
|
| 66 |
+
'cls_head_cfg': {'empty_cls': [],
|
| 67 |
+
'feat_dim': 512,
|
| 68 |
+
'input_dim': 512,
|
| 69 |
+
'kernel_size': 3,
|
| 70 |
+
'num_classes': 1,
|
| 71 |
+
'num_layers': 3,
|
| 72 |
+
'prior_prob': 0.01,
|
| 73 |
+
'with_ln': True},
|
| 74 |
+
'duration_thresh': 0.001,
|
| 75 |
+
'iou_threshold': 0.1,
|
| 76 |
+
'loss_normalizer': 200,
|
| 77 |
+
'loss_normalizer_momentum': 0.9,
|
| 78 |
+
'max_seg_num': 5,
|
| 79 |
+
'min_score': 0.001,
|
| 80 |
+
'pre_nms_thresh': 0.001,
|
| 81 |
+
'pre_nms_topk': 2000,
|
| 82 |
+
'reg_head_cfg': {'feat_dim': 512,
|
| 83 |
+
'fpn_levels': 7,
|
| 84 |
+
'input_dim': 512,
|
| 85 |
+
'kernel_size': 3,
|
| 86 |
+
'num_layers': 3,
|
| 87 |
+
'with_ln': True},
|
| 88 |
+
'reg_loss_weight': 1.0,
|
| 89 |
+
'train_label_smoothing': 0.1},
|
| 90 |
+
'num_classes': 1,
|
| 91 |
+
'obj_encoder_cfg': [{'layer_cfg': {'act': 'relu',
|
| 92 |
+
'n_in': 512,
|
| 93 |
+
'num_layer': 2},
|
| 94 |
+
'layer_num': 1,
|
| 95 |
+
'layer_type': 'MaskedConv1DLayer'},
|
| 96 |
+
{'layer_cfg': {'n_embd': 512, 'path_pdrop': 0.1},
|
| 97 |
+
'layer_num': 4,
|
| 98 |
+
'layer_type': 'ObjectEncoderBlock'}],
|
| 99 |
+
'object_dim': 512,
|
| 100 |
+
'object_use_cross_model': True,
|
| 101 |
+
'object_win_size': 1,
|
| 102 |
+
'regression_range': [[0, 4],
|
| 103 |
+
[2, 8],
|
| 104 |
+
[4, 16],
|
| 105 |
+
[8, 32],
|
| 106 |
+
[16, 64],
|
| 107 |
+
[32, 128],
|
| 108 |
+
[64, 10000]],
|
| 109 |
+
'scale_factor': 2,
|
| 110 |
+
'tasks': ['NLQ', 'VTM'],
|
| 111 |
+
'test_cfg': {'duration_thresh': 0.001,
|
| 112 |
+
'ext_score_file': None,
|
| 113 |
+
'iou_threshold': 0.1,
|
| 114 |
+
'max_seg_num': 5,
|
| 115 |
+
'min_score': 0.001,
|
| 116 |
+
'multiclass_nms': True,
|
| 117 |
+
'nms_method': 'soft',
|
| 118 |
+
'nms_sigma': 0.75,
|
| 119 |
+
'pre_nms_thresh': 0.001,
|
| 120 |
+
'pre_nms_topk': 2000,
|
| 121 |
+
'test_num': 1,
|
| 122 |
+
'test_start_epoch': 2,
|
| 123 |
+
'voting_thresh': 0.9},
|
| 124 |
+
'text_encoder_cfg': [{'layer_cfg': {'act': 'relu',
|
| 125 |
+
'n_in': 300,
|
| 126 |
+
'n_out': 512,
|
| 127 |
+
'num_layer': 2},
|
| 128 |
+
'layer_num': 1,
|
| 129 |
+
'layer_type': 'MaskedConv1DLayer'},
|
| 130 |
+
{'layer_cfg': {'n_embd': 512,
|
| 131 |
+
'n_head': 4,
|
| 132 |
+
'path_pdrop': 0.1},
|
| 133 |
+
'layer_num': 4,
|
| 134 |
+
'layer_type': 'TransformerBlock',
|
| 135 |
+
'use_abs_pe': True}],
|
| 136 |
+
'train_cfg': {'box_loss_weight': 5.0,
|
| 137 |
+
'center_sample': 'radius',
|
| 138 |
+
'center_sample_radius': 1.5,
|
| 139 |
+
'clip_grad_l2norm': 1.0,
|
| 140 |
+
'cls_prior_prob': 0.01,
|
| 141 |
+
'dropout': 0.0,
|
| 142 |
+
'droppath': 0.1,
|
| 143 |
+
'head_empty_cls': [],
|
| 144 |
+
'init_loss_norm': 200,
|
| 145 |
+
'iou_loss_weight': 1.0,
|
| 146 |
+
'label_smoothing': 0.1,
|
| 147 |
+
'loss_weight': 1.0,
|
| 148 |
+
'mamba_arch': ['bimamba1', 'mlp', 'obj'],
|
| 149 |
+
'num_decoder_layer': 6},
|
| 150 |
+
'use_abs_pe': True,
|
| 151 |
+
'use_lmha_in_fpn': True,
|
| 152 |
+
'use_rel_pe': False,
|
| 153 |
+
'video_encoder_cfg': [{'layer_cfg': {'act': 'relu',
|
| 154 |
+
'kernel_size': 3,
|
| 155 |
+
'n_hidden': 512,
|
| 156 |
+
'n_in': 4096,
|
| 157 |
+
'n_out': 512,
|
| 158 |
+
'num_layer': 2},
|
| 159 |
+
'layer_num': 1,
|
| 160 |
+
'layer_type': 'MaskedConv1DLayer'},
|
| 161 |
+
{'layer_cfg': {'mamba_arch': ['bimamba1',
|
| 162 |
+
'mlp',
|
| 163 |
+
'obj'],
|
| 164 |
+
'n_embd': 512,
|
| 165 |
+
'n_head': 4,
|
| 166 |
+
'path_pdrop': 0.1},
|
| 167 |
+
'layer_num': 4,
|
| 168 |
+
'layer_type': 'ObjectMambaBlock'}],
|
| 169 |
+
'vtm_heads_cfg': {'loss_weight': 1.0,
|
| 170 |
+
'multiscale': False,
|
| 171 |
+
'shot_aggregator_cfg': {'layer_cfg': {'cross_mixer_cfg': {'block_cfg': {'n_embd': 512,
|
| 172 |
+
'n_head': 4},
|
| 173 |
+
'block_type': 'MaskedMHA'},
|
| 174 |
+
'num_layer': 1,
|
| 175 |
+
'path_pdrop': 0.1,
|
| 176 |
+
'query_num': 5,
|
| 177 |
+
'self_mixer_cfg': {'block_cfg': {'n_embd': 512,
|
| 178 |
+
'n_head': 4},
|
| 179 |
+
'block_type': 'MaskedMHCA'}},
|
| 180 |
+
'layer_type': 'QFormerLayer'},
|
| 181 |
+
'similarity_head_cfg': {'layer_cfg': {'con_dim': 512,
|
| 182 |
+
'x_dim': 512,
|
| 183 |
+
'y_dim': 512},
|
| 184 |
+
'layer_type': 'Cosine'},
|
| 185 |
+
'soft_label': False}},
|
| 186 |
+
'model_name': 'MultiTaskArch',
|
| 187 |
+
'opt': {'backbone_lr_weight': 1,
|
| 188 |
+
'epochs': 6,
|
| 189 |
+
'learning_rate': 0.0002,
|
| 190 |
+
'momentum': 0.9,
|
| 191 |
+
'schedule_gamma': 0.1,
|
| 192 |
+
'schedule_steps': [],
|
| 193 |
+
'schedule_type': 'cosine',
|
| 194 |
+
'type': 'AdamW',
|
| 195 |
+
'warmup': True,
|
| 196 |
+
'warmup_epochs': 4,
|
| 197 |
+
'weight_decay': 0.05},
|
| 198 |
+
'output_folder': '/root/autodl-tmp/model/GroundNLQ/tacos/',
|
| 199 |
+
'test_cfg': {'duration_thresh': 0.001,
|
| 200 |
+
'ext_score_file': None,
|
| 201 |
+
'iou_threshold': 0.1,
|
| 202 |
+
'max_seg_num': 5,
|
| 203 |
+
'min_score': 0.001,
|
| 204 |
+
'multiclass_nms': True,
|
| 205 |
+
'nms_method': 'soft',
|
| 206 |
+
'nms_sigma': 0.75,
|
| 207 |
+
'pre_nms_thresh': 0.001,
|
| 208 |
+
'pre_nms_topk': 2000,
|
| 209 |
+
'test_num': 1,
|
| 210 |
+
'test_start_epoch': 2,
|
| 211 |
+
'voting_thresh': 0.9},
|
| 212 |
+
'track': 'goal_step',
|
| 213 |
+
'train_cfg': {'box_loss_weight': 5.0,
|
| 214 |
+
'center_sample': 'radius',
|
| 215 |
+
'center_sample_radius': 1.5,
|
| 216 |
+
'clip_grad_l2norm': 1.0,
|
| 217 |
+
'cls_prior_prob': 0.01,
|
| 218 |
+
'dropout': 0.0,
|
| 219 |
+
'droppath': 0.1,
|
| 220 |
+
'head_empty_cls': [],
|
| 221 |
+
'init_loss_norm': 200,
|
| 222 |
+
'iou_loss_weight': 1.0,
|
| 223 |
+
'label_smoothing': 0.1,
|
| 224 |
+
'loss_weight': 1.0,
|
| 225 |
+
'mamba_arch': ['bimamba1', 'mlp', 'obj'],
|
| 226 |
+
'num_decoder_layer': 6},
|
| 227 |
+
'train_split': ['training'],
|
| 228 |
+
'val_split': ['validation']}
|