cfg:
  # NeMo derives gradient-accumulation steps from global vs. micro batch size
  # and the data-parallel world size.
  micro_batch_size: 20
  global_batch_size: 8000
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  encoder_seq_length: 512
  max_position_embeddings: 512
  num_layers: 24
  hidden_size: 1024
  ffn_hidden_size: 4096
  num_attention_heads: 16
  init_method_std: 0.02
  hidden_dropout: 0.1
  kv_channels: null
  apply_query_key_layer_scaling: true
  layernorm_epsilon: 1.0e-05
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  bert_binary_head: true  # enable the next-sentence-prediction (NSP) head
  tokenizer:
    library: huggingface
    type: KBLab/unigram-32k-pretok-small_data-tokenizer
    model: null
    vocab_file: null
    merge_file: null
  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: false
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: false
  seed: 666
  use_cpu_initialization: false
  onnx_safe: false
  gradient_as_bucket_view: true
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: false
  data:
    # Alternating (weight, path) pairs; all six corpora are sampled with equal weight.
    data_prefix:
      - 1
      - /project/scratch/$PID/data/unigram-32k-pretok-small_data/wikipedia-unigram-32k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/unigram-32k-pretok-small_data/edepos_html-unigram-32k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/unigram-32k-pretok-small_data/oscar-unigram-32k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/unigram-32k-pretok-small_data/kw3-2017-unigram-32k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/unigram-32k-pretok-small_data/issues-unigram-32k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/unigram-32k-pretok-small_data/mc4-unigram-32k-pretok-small_data_text_sentence
    index_mapping_dir: /project/scratch/$PID/data/unigram-32k-pretok-small_data/npy_files/
    data_impl: mmap
    splits_string: 980,10,10  # train/validation/test proportions (98% / 1% / 1%)
    seq_length: 512
    skip_warmup: true
    num_workers: 32
    dataloader_type: single
    reset_position_ids: false
    reset_attention_mask: false
    eod_mask_loss: false
    masked_lm_prob: 0.15  # standard BERT masking rate
    short_seq_prob: 0.1
  optim:
    name: fused_adam
    lr: 0.0006
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 500
      constant_steps: 500
      min_lr: 2.0e-05
  precision: 16  # FP16 mixed precision
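The config leaves gradient accumulation implicit, so the following is a minimal sketch of how the batch geometry works out. The file name bert_pretrain.yaml and the 16-GPU data-parallel size are illustrative assumptions, not values from the source; NeMo builds its configs on OmegaConf, so the file can be inspected directly:

    from omegaconf import OmegaConf

    conf = OmegaConf.load("bert_pretrain.yaml")  # hypothetical file name
    cfg = conf.cfg

    # With tensor_model_parallel_size == pipeline_model_parallel_size == 1,
    # every GPU is a data-parallel rank.
    data_parallel_size = 16  # illustrative world size, not from the config

    per_step = cfg.micro_batch_size * data_parallel_size
    assert cfg.global_batch_size % per_step == 0
    accum = cfg.global_batch_size // per_step
    print(accum)  # 8000 / (20 * 16) = 25 accumulation steps per optimizer step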