---
# Single-stream S2G (sign-to-gloss recognition) training/evaluation config
# for the `vsl` dataset (rgb stream only).
task: S2G

data:
  name: vsl
  subset: rgb_videos
  num_proc: 4
  cache_dir: data/external/huggingface
  input_streams:
    - rgb
  level: word  # word or char
  txt_lowercase: true
  max_sent_length: 400
  transform_cfg:
    # Spatial augmentation / preprocessing parameters.
    rand_crop_size: 224
    rand_crop_threshold: 0.7
    rand_crop_bottom_area: 0.7
    rand_crop_aspect_ratio_min: 0.75
    rand_crop_aspect_ratio_max: 1.3
    cent_crop_size: 224
    scale_size: 224
    color_jitter_threshold: 0.2
    temporal_augmentation:
      # NOTE(review): presumably min/max temporal scaling factors for clip
      # sampling — confirm against the data loader.
      tmin: 0.5
      tmax: 1.5

testing:
  cfg:
    recognition:
      beam_size: 5

training:
  overwrite: false
  model_dir: experiments/outputs/SingleStream/vsl_s2g
  shuffle: true
  batch_size: 4
  total_epoch: 100
  keep_last_ckpts: 5
  validation:
    unit: epoch
    freq: 1
    cfg:
      recognition:
        # Smaller beam than testing (5) to keep validation cheap.
        beam_size: 2
  optimization:
    learning_rate:
      default: 4.0e-3
    optimizer: sgd
    weight_decay: 1.0e-3
    momentum: 0.9
    # NOTE(review): `betas` is an Adam-family parameter while `optimizer` is
    # sgd — likely ignored by the consumer; confirm before removing.
    betas:
      - 0.9
      - 0.998
    scheduler: cosineannealing
    t_max: 50

model:
  RecognitionNetwork:
    GlossTokenizer:
      gloss2id_file: pretrained/mBart_vi/gloss2ids.pkl
    s3d:
      pretrained_ckpt: pretrained/s3ds_actioncls
      use_block: 4
      freeze_block: 1
    visual_head:
      input_size: 832
      hidden_size: 512
      ff_size: 2048
      pe: true  # was `True`; canonical lowercase boolean (yamllint truthy)
      ff_kernelsize:
        - 3
        - 3