# File size: 1,374 Bytes
# Dataset selection and input preprocessing settings.
data:
  cache_dir: data/external/huggingface
  input_streams: [rgb]
  level: word
  max_sent_length: 400
  name: wlasl300
  num_proc: 2
  subset: rgb_videos
  # Augmentation / resizing parameters.
  # NOTE(review): sizes are presumably pixels and the *_threshold values
  # presumably probabilities — confirm against the transform code.
  transform_cfg:
    cent_crop_size: 224
    color_jitter_threshold: 0.2
    rand_crop_aspect_ratio_max: 1.3
    rand_crop_aspect_ratio_min: 0.75
    rand_crop_bottom_area: 0.7
    rand_crop_size: 224
    rand_crop_threshold: 0.7
    scale_size: 224
    # NOTE(review): tmin/tmax look like lower/upper bounds of a temporal
    # rescaling factor — confirm against the consumer.
    temporal_augmentation:
      tmax: 1.5
      tmin: 0.5
  txt_lowercase: true
# Model definition: a recognition network made of a gloss tokenizer,
# an `s3d` section and a visual head.
model:
  RecognitionNetwork:
    GlossTokenizer:
      # Pickle file path; NOTE(review): presumably resolved relative to the
      # working directory — confirm against the loader.
      gloss2id_file: pretrained/mBart_en/gloss2ids.pkl
    s3d:
      freeze_block: 1
      pretrained_ckpt: pretrained/s3ds_actioncls
      use_block: 4
    visual_head:
      ff_kernelsize: [3, 3]
      ff_size: 2048
      hidden_size: 512
      # NOTE(review): input_size presumably has to match the feature width
      # produced by the s3d section at use_block — confirm.
      input_size: 832
      pe: true
# Task identifier read by the consumer.
# NOTE(review): S2G presumably means sign-to-gloss (recognition) — confirm.
task: S2G
# Test-time decoding settings for the recognition output.
# The beam size here (5) is larger than the one under training.validation (2).
testing:
  cfg:
    recognition:
      beam_size: 5
# Training run settings: checkpointing, optimization and validation.
training:
  batch_size: 3
  from_best: true
  from_ckpt: true
  keep_last_ckpts: 5
  model_dir: experiments/outputs/SingleStream/wlasl300_s2g
  optimization:
    # NOTE(review): betas is conventionally an Adam-family parameter, while
    # optimizer is sgd (which uses momentum); betas may be ignored here —
    # confirm against the optimizer builder.
    betas: [0.9, 0.998]
    learning_rate:
      default: 0.002
    momentum: 0.9
    optimizer: sgd
    scheduler: cosineannealing
    # Same value as total_epoch below (50).
    t_max: 50
    weight_decay: 0.001
  overwrite: false
  shuffle: true
  total_epoch: 50
  # Validation settings: freq 1 with unit epoch.
  validation:
    cfg:
      recognition:
        beam_size: 2
    freq: 1
    unit: epoch