# -----------------------------------------------------------------------------
# Base config: VirTex pretraining for our "base" bicaptioning model:
# ResNet-50 + (L = 1, H = 1024) transformer trained for 500K iterations.
# -----------------------------------------------------------------------------
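# Reproducibility and performance flags: fixed random seed, mixed-precision
# (AMP) training, cuDNN autotuning enabled, deterministic kernels not enforced.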
RANDOM_SEED: 0
AMP: true
CUDNN_BENCHMARK: true
CUDNN_DETERMINISTIC: false

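# Dataset paths, tokenizer/vocabulary settings, special token indices, and
# image/caption preprocessing (crop size, caption length, train/val transforms).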
DATA:
  ROOT: "datasets/coco"
  TOKENIZER_MODEL: "datasets/vocab/coco_10k.model"
  VOCAB_SIZE: 10000
  UNK_INDEX: 0
  SOS_INDEX: 1
  EOS_INDEX: 2
  MASK_INDEX: 3

  IMAGE_CROP_SIZE: 224
  MAX_CAPTION_LENGTH: 30

  IMAGE_TRANSFORM_TRAIN:
    - "random_resized_crop"
    - "horizontal_flip"
    - "color_jitter"
    - "normalize"

  IMAGE_TRANSFORM_VAL:
    - "smallest_resize"
    - "center_crop"
    - "normalize"

  USE_PERCENTAGE: 100.0
  USE_SINGLE_CAPTION: false

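# Bicaptioning model: ResNet-50 visual backbone trained from scratch (no
# ImageNet weights, not frozen) feeding a single-layer post-norm transformer
# decoder with hidden size 1024 (A16/F4096 presumably denote 16 attention
# heads and a 4096-dim feedforward layer).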
MODEL:
  NAME: "virtex"
  VISUAL:
    NAME: "torchvision::resnet50"
    PRETRAINED: false
    FROZEN: false
  TEXTUAL:
    NAME: "transdec_postnorm::L1_H1024_A16_F4096"
    DROPOUT: 0.1

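# Optimization: SGD with momentum wrapped in Lookahead, separate base LRs for
# the visual backbone (CNN_LR) and the textual head (LR), warmup followed by
# cosine decay, no weight decay on textual norm/bias parameters (NO_DECAY
# regex), and gradient norm clipping.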
OPTIM:
  OPTIMIZER_NAME: "sgd"
  SGD_MOMENTUM: 0.9
  WEIGHT_DECAY: 0.0001

  LOOKAHEAD:
    USE: true
    ALPHA: 0.5
    STEPS: 5

  BATCH_SIZE: 256
  CNN_LR: 0.2
  LR: 0.001
  NUM_ITERATIONS: 500000

  WARMUP_STEPS: 10000
  LR_DECAY_NAME: "cosine"

  NO_DECAY: ".*textual.(embedding|transformer).*(norm.*|bias)"
  CLIP_GRAD_NORM: 10.0