DenseAV-language / config.json
nielsr's picture
nielsr HF staff
Push model using huggingface_hub.
fed630b verified
{
"adaptive_clipping": true,
"audio_aligner_type": "audio_sa_3_3_pool_2",
"audio_lora": false,
"audio_lora_rank": 8,
"audio_model_type": "hubert",
"audio_pool_width": 1,
"cal_balance_weight": 0.1,
"cal_init": 1.0,
"channel_dropout": 0.0,
"code_dim": 384,
"disentangle_weight": 0.0,
"finetune_audio_model": true,
"finetune_image_model": false,
"gather_tensors": true,
"gradient_clipping": 10.0,
"head_agg": "max_elementwise",
"image_aligner_type": "image_linear",
"image_lora": true,
"image_lora_rank": 8,
"image_model_token_type": "token",
"image_model_type": "dino8",
"image_pool_width": 2,
"learn_audio_cls": true,
"loss_leak": 0.0,
"loss_margin": 0.0,
"loss_type": "nce",
"lr": 5e-05,
"lr_cycle_length": 50000,
"lr_schedule": null,
"lr_warmup": 1000,
"mask_silence": true,
"memory_buffer_size": 0,
"mixup_weight": 0.0,
"neg_audio": true,
"neg_audio_weight": 0.01,
"nonneg_pressure": 0.01,
"nonneg_sim": false,
"norm_vectors": false,
"optimizer": "adam",
"output_root": "/mnt/azureml/cr/j/7e4ff299c6de48ee8bd4222906d5eca7/cap/data-capability/wd/INPUT_video_analysis_store_3_video_analysis_store_3",
"pretrain_lr": 5e-05,
"silence_l1": 0.01,
"silence_l2": 0.0,
"sim_agg_heads": 1,
"sim_agg_type": "misa",
"sim_use_cls": false,
"spatial_dropout": 0.0,
"specialization_weight": 0.05,
"tv_weight": 0.01,
"use_cached_embs": false
}