cfg:
  micro_batch_size: 4
  global_batch_size: 32
  rampup_batch_size: null
  context_parallel_size: 1
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  virtual_pipeline_model_parallel_size: null
  resume_from_checkpoint: null
  encoder_seq_length: 2048
  max_position_embeddings: 2048
  num_layers: 24
  hidden_size: 4096
  ffn_hidden_size: 16384
  num_attention_heads: 32
  init_method_std: 0.01
  hidden_dropout: 0.1
  attention_dropout: 0.1
  kv_channels: null
  apply_query_key_layer_scaling: true
  layernorm_epsilon: 1.0e-05
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  persist_layer_norm: true
  gradient_as_bucket_view: true
  grad_div_ar_fusion: true
  gradient_accumulation_fusion: true
  bias_activation_fusion: true
  bias_dropout_add_fusion: true
  masked_softmax_fusion: true
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  fsdp: false
  fsdp_sharding_strategy: full
  fsdp_grad_reduce_dtype: 32
  fsdp_sharded_checkpoint: false
  sequence_parallel: false
  overlap_p2p_comm: false
  batch_p2p_comm: true
  num_query_groups: null
  tokenizer:
    library: megatron
    type: GPT2BPETokenizer
    model: null
    delimiter: null
    vocab_file: /gpt3_dataset//bpe/vocab.json
    merge_file: /gpt3_dataset//bpe/merges.txt
  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  hysteresis: 2
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: true
  grad_allreduce_chunk_size_mb: 125
  sharp: false
  mcore_gpt: true
  transformer_engine: false
  fp8: false
  fp8_e4m3: false
  fp8_hybrid: true
  fp8_margin: 0
  fp8_interval: 1
  fp8_amax_history_len: 1024
  fp8_amax_compute_algo: max
  fp8_wgrad: true
  ub_tp_comm_overlap: false
  tp_comm_atomic_ag: false
  tp_comm_atomic_rs: false
  seed: 1234
  sync_batch_comm: false
  use_cpu_initialization: false
  onnx_safe: false
  apex_transformer_log_level: 30
  nsys_profile:
    enabled: false
    trace:
    - nvtx
    - cuda
    start_step: 10
    end_step: 10
    ranks:
    - 0
    gen_shape: false
  optim:
    name: distributed_fused_adam
    bucket_cap_mb: 400
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true
    lr: 0.00016
    weight_decay: 0.1
    betas:
    - 0.9
    - 0.95
    sched:
      name: CosineAnnealing
      warmup_steps: 115
      constant_steps: 12500
      min_lr: 1.6e-05
  data:
    data_impl: mmap
    splits_string: 99990,8,2
    seq_length: 2048
    skip_warmup: true
    num_workers: 2
    dataloader_type: single
    reset_position_ids: false
    reset_attention_mask: false
    eod_mask_loss: false
    index_mapping_dir: null
    data_prefix:
    - 0.0333
    - /gpt3_dataset/wiki_text_document
  precision: bf16-mixed
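As a quick way to work with this dump, the sketch below loads it with OmegaConf (the config library used by NeMo/Hydra) and checks the batch-size arithmetic. The file name gpt3_config.yaml and the world size of 8 GPUs are assumptions for illustration, not values taken from the config above.

# Minimal sketch, assuming the YAML block above is saved as gpt3_config.yaml
# and training runs on an assumed 8-GPU world size.
from omegaconf import OmegaConf

conf = OmegaConf.load("gpt3_config.yaml")   # hypothetical file name
cfg = conf.cfg

# With tensor/pipeline/context parallel sizes all 1, every rank is a pure
# data-parallel replica, so global_batch_size must divide evenly into
# micro_batch_size * data_parallel_size chunks.
world_size = 8                               # assumption: one 8-GPU node
dp_size = world_size // (
    cfg.tensor_model_parallel_size
    * cfg.pipeline_model_parallel_size
    * cfg.context_parallel_size
)
assert cfg.global_batch_size % (cfg.micro_batch_size * dp_size) == 0
accum = cfg.global_batch_size // (cfg.micro_batch_size * dp_size)
print(f"data-parallel size: {dp_size}, micro-batches per step per rank: {accum}")

With these values (global_batch_size 32, micro_batch_size 4, 8 data-parallel ranks), each rank processes a single micro-batch per optimizer step, i.e. no gradient accumulation.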