---
# Single-stream S2G (sign-to-gloss recognition) training/evaluation config
# for the `vsl` dataset (rgb stream only).
task: S2G

data:
  name: vsl
  subset: rgb_videos
  num_proc: 4
  cache_dir: data/external/huggingface
  input_streams:
    - rgb
  level: word  # word or char
  txt_lowercase: true
  max_sent_length: 400
  transform_cfg:
    # Spatial augmentation / preprocessing parameters.
    rand_crop_size: 224
    rand_crop_threshold: 0.7
    rand_crop_bottom_area: 0.7
    rand_crop_aspect_ratio_min: 0.75
    rand_crop_aspect_ratio_max: 1.3
    cent_crop_size: 224
    scale_size: 224
    color_jitter_threshold: 0.2
    temporal_augmentation:
      # NOTE(review): presumably min/max temporal scaling factors for clip
      # sampling — confirm against the data loader.
      tmin: 0.5
      tmax: 1.5

testing:
  cfg:
    recognition:
      beam_size: 5

training:
  overwrite: false
  model_dir: experiments/outputs/SingleStream/vsl_s2g
  shuffle: true
  batch_size: 4
  total_epoch: 100
  keep_last_ckpts: 5
  validation:
    unit: epoch
    freq: 1
    cfg:
      recognition:
        # Smaller beam than testing (5) to keep validation cheap.
        beam_size: 2
  optimization:
    learning_rate:
      default: 4.0e-3
    optimizer: sgd
    weight_decay: 1.0e-3
    momentum: 0.9
    # NOTE(review): `betas` is an Adam-family parameter while `optimizer` is
    # sgd — likely ignored by the consumer; confirm before removing.
    betas:
      - 0.9
      - 0.998
    scheduler: cosineannealing
    t_max: 50

model:
  RecognitionNetwork:
    GlossTokenizer:
      gloss2id_file: pretrained/mBart_vi/gloss2ids.pkl
    s3d:
      pretrained_ckpt: pretrained/s3ds_actioncls
      use_block: 4
      freeze_block: 1
    visual_head:
      input_size: 832
      hidden_size: 512
      ff_size: 2048
      pe: true  # was `True`; canonical lowercase boolean (yamllint truthy)
      ff_kernelsize:
        - 3
        - 3