File size: 2,590 Bytes
c4c5809
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# Model configuration: every setting lives under the single top-level `cfg` key.
# NOTE(review): key names (bert_binary_head, masked_lm_prob, megatron_amp_O2)
# suggest a Megatron-style BERT pretraining run — confirm against the consumer.
cfg:
  # Batch sizing: global_batch_size (8000) is 400 x micro_batch_size (20).
  micro_batch_size: 20
  global_batch_size: 8000
  # Both model-parallel sizes are 1.
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  # Both are 512, the same value as data.seq_length further down.
  # NOTE(review): presumably all three must stay equal — confirm.
  encoder_seq_length: 512
  max_position_embeddings: 512
  # Network dimensions: ffn_hidden_size is 4 x hidden_size, and
  # hidden_size / num_attention_heads = 64.
  num_layers: 24
  hidden_size: 1024
  ffn_hidden_size: 4096
  num_attention_heads: 16
  init_method_std: 0.02
  hidden_dropout: 0.1
  # NOTE(review): null presumably lets the consumer derive the per-head size
  # (1024 / 16 = 64) — confirm.
  kv_channels: null
  apply_query_key_layer_scaling: true
  layernorm_epsilon: 1.0e-05
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  bert_binary_head: true
  # Tokenizer selection. `type` is written as an `<owner>/<name>` identifier;
  # model, vocab_file and merge_file are all explicitly null.
  # NOTE(review): presumably `type` is resolved by the `huggingface` library
  # as a hub id, which would explain the null file entries — confirm.
  tokenizer:
    library: huggingface
    type: KBLab/spe-bpe-32k-pretok-small_data-tokenizer
    model: null
    vocab_file: null
    merge_file: null
  # 4294967296 = 2^32.
  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: false
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: false
  seed: 666
  use_cpu_initialization: false
  onnx_safe: false
  gradient_as_bucket_view: true
  # Every activations-checkpoint setting below is null.
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: false
  # Dataset settings.
  data:
    # Flat list alternating an integer (always 1 here) with a dataset path.
    # NOTE(review): presumably each integer is the blend weight of the path
    # that follows it — confirm against the data loader.
    # The paths contain a literal `$PID`; YAML performs no variable expansion,
    # so something downstream has to substitute it — verify.
    data_prefix:
    - 1
    - /project/scratch/$PID/data/spe-bpe-32k-pretok-small_data/wikipedia-spe-bpe-32k-pretok-small_data_text_sentence
    - 1
    - /project/scratch/$PID/data/spe-bpe-32k-pretok-small_data/edepos_html-spe-bpe-32k-pretok-small_data_text_sentence
    - 1
    - /project/scratch/$PID/data/spe-bpe-32k-pretok-small_data/oscar-spe-bpe-32k-pretok-small_data_text_sentence
    - 1
    - /project/scratch/$PID/data/spe-bpe-32k-pretok-small_data/kw3-2017-spe-bpe-32k-pretok-small_data_text_sentence
    - 1
    - /project/scratch/$PID/data/spe-bpe-32k-pretok-small_data/issues-spe-bpe-32k-pretok-small_data_text_sentence
    - 1
    - /project/scratch/$PID/data/spe-bpe-32k-pretok-small_data/mc4-spe-bpe-32k-pretok-small_data_text_sentence
    index_mapping_dir: /project/scratch/$PID/data/spe-bpe-32k-pretok-small_data/npy_files/
    data_impl: mmap
    # Quoted on purpose: this is a comma-separated string, and some YAML
    # loaders treat commas as digit separators and would read the bare value
    # as an integer. The three parts sum to 1000.
    # NOTE(review): presumably train/validation/test proportions — confirm.
    splits_string: '980,10,10'
    seq_length: 512
    skip_warmup: true
    num_workers: 32
    dataloader_type: single
    reset_position_ids: false
    reset_attention_mask: false
    eod_mask_loss: false
    masked_lm_prob: 0.15
    short_seq_prob: 0.1
  # Optimizer and learning-rate schedule.
  optim:
    name: fused_adam
    lr: 0.0006
    weight_decay: 0.01
    betas:
    - 0.9
    - 0.98
    # min_lr (2.0e-05) is lr / 30.
    sched:
      name: CosineAnnealing
      warmup_steps: 500
      constant_steps: 500
      min_lr: 2.0e-05
  # NOTE(review): presumably 16-bit mixed precision, in line with the
  # native_amp_* settings above — confirm.
  precision: 16