| cfg: | |
| micro_batch_size: 20 | |
| global_batch_size: 8000 | |
| tensor_model_parallel_size: 1 | |
| pipeline_model_parallel_size: 1 | |
| encoder_seq_length: 512 | |
| max_position_embeddings: 512 | |
| num_layers: 24 | |
| hidden_size: 1024 | |
| ffn_hidden_size: 4096 | |
| num_attention_heads: 16 | |
| init_method_std: 0.02 | |
| hidden_dropout: 0.1 | |
| kv_channels: null | |
| apply_query_key_layer_scaling: true | |
| layernorm_epsilon: 1.0e-05 | |
| make_vocab_size_divisible_by: 128 | |
| pre_process: true | |
| post_process: true | |
| bert_binary_head: true | |
| tokenizer: | |
| library: huggingface | |
| type: KBLab/unigram-64k-pretok-small_data-tokenizer | |
| model: null | |
| vocab_file: null | |
| merge_file: null | |
| native_amp_init_scale: 4294967296 | |
| native_amp_growth_interval: 1000 | |
| fp32_residual_connection: false | |
| fp16_lm_cross_entropy: false | |
| megatron_amp_O2: false | |
| grad_allreduce_chunk_size_mb: 125 | |
| grad_div_ar_fusion: false | |
| seed: 666 | |
| use_cpu_initialization: false | |
| onnx_safe: false | |
| gradient_as_bucket_view: true | |
| activations_checkpoint_granularity: null | |
| activations_checkpoint_method: null | |
| activations_checkpoint_num_layers: null | |
| num_micro_batches_with_partial_activation_checkpoints: null | |
| activations_checkpoint_layers_per_pipeline: null | |
| sequence_parallel: false | |
| data: | |
| data_prefix: | |
| - 1 | |
| - /project/scratch/$PID/data/unigram-64k-pretok-small_data/wikipedia-unigram-64k-pretok-small_data_text_sentence | |
| - 1 | |
| - /project/scratch/$PID/data/unigram-64k-pretok-small_data/edepos_html-unigram-64k-pretok-small_data_text_sentence | |
| - 1 | |
| - /project/scratch/$PID/data/unigram-64k-pretok-small_data/oscar-unigram-64k-pretok-small_data_text_sentence | |
| - 1 | |
| - /project/scratch/$PID/data/unigram-64k-pretok-small_data/kw3-2017-unigram-64k-pretok-small_data_text_sentence | |
| - 1 | |
| - /project/scratch/$PID/data/unigram-64k-pretok-small_data/issues-unigram-64k-pretok-small_data_text_sentence | |
| - 1 | |
| - /project/scratch/$PID/data/unigram-64k-pretok-small_data/mc4-unigram-64k-pretok-small_data_text_sentence | |
| index_mapping_dir: /project/scratch/$PID/data/unigram-64k-pretok-small_data/npy_files/ | |
| data_impl: mmap | |
| splits_string: 980,10,10 | |
| seq_length: 512 | |
| skip_warmup: true | |
| num_workers: 32 | |
| dataloader_type: single | |
| reset_position_ids: false | |
| reset_attention_mask: false | |
| eod_mask_loss: false | |
| masked_lm_prob: 0.15 | |
| short_seq_prob: 0.1 | |
| optim: | |
| name: fused_adam | |
| lr: 0.0006 | |
| weight_decay: 0.01 | |
| betas: | |
| - 0.9 | |
| - 0.98 | |
| sched: | |
| name: CosineAnnealing | |
| warmup_steps: 500 | |
| constant_steps: 500 | |
| min_lr: 2.0e-05 | |
| precision: 16 | |