micro_batch_size: 27
tensor_model_parallel_size: 2
pipeline_model_parallel_size: 1
make_vocab_size_divisible_by: 128
pre_process: true
post_process: true
megatron_amp_O2: false
seq_length: 512
max_position_embeddings: 512
num_layers: 24
hidden_size: 1024
ffn_hidden_size: 16384
num_attention_heads: 32
init_method_std: 0.015
hidden_dropout: 0.1
attention_dropout: 0.1
kv_channels: 128
apply_query_key_layer_scaling: true
layernorm_epsilon: 1.0e-05
persist_layer_norm: true
gradient_as_bucket_view: true
encoder_arch: transformer
decoder_arch: transformer
activation: gelu
tokenizer:
  library: megatron
  type: BertWordPieceCase
  model: null
  vocab_file: bert_vocab.txt
  merge_file: null
  num_sentinel_tokens: 100
native_amp_init_scale: 4294967296
native_amp_growth_interval: 1000
fp32_residual_connection: false
fp16_lm_cross_entropy: false
seed: 1234
use_cpu_initialization: false
onnx_safe: false
activations_checkpoint_method: null
activations_checkpoint_num_layers: 1
data:
  data_prefix:
  - 0.0333
  - /preproc_data/my-t5_00_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_01_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_02_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_03_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_04_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_05_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_06_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_07_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_08_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_09_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_10_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_11_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_12_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_13_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_14_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_15_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_16_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_17_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_18_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_19_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_20_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_21_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_22_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_23_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_24_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_25_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_26_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_27_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_28_bert_tokenizer_text_document
  - 0.0334
  - /preproc_data/my-t5_29_bert_tokenizer_text_document
  data_impl: mmap
  splits_string: 99982,9,9
  seq_length: 512
  seq_length_dec: 128
  skip_warmup: true
  num_workers: 4
  dataloader_type: single
  masked_lm_prob: 0.15
  dataset_type: t5
  short_seq_prob: 0.0
  max_ngram_size: 10
  mean_ngram_size: null
  geometric_dist: true
  permutation: false
  whole_word_masking: true
  favor_longer_ngrams: false
optim:
  name: fused_adam
  lr: 0.0001
  betas:
  - 0.9
  - 0.999
  eps: 1.0e-08
  weight_decay: 0.01
  sched:
    name: WarmupAnnealing
    min_lr: 1.0e-05
    last_epoch: -1
    warmup_ratio: 0.01
precision: bf16
target: nemo.collections.nlp.models.language_modeling.megatron_t5_model.MegatronT5Model
nemo_version: 1.7.1
vocab_file: nemo:6b9a052d82a744389fbe256fea20c06f_vocab.txt
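# Usage note (a minimal sketch, not part of the saved config): NeMo manages
# configs like this one with OmegaConf, so the file can be loaded and
# inspected directly. The filename "megatron_t5_config.yaml" below is an
# assumption; kept as YAML comments so this file remains valid YAML.
#
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.load("megatron_t5_config.yaml")
#   print(cfg.num_layers, cfg.hidden_size, cfg.num_attention_heads)  # 24 1024 32
#   print(cfg.data.seq_length, cfg.data.seq_length_dec)              # 512 128
#
#   # data.data_prefix alternates sampling weight and dataset path:
#   prefix = OmegaConf.to_container(cfg.data.data_prefix)
#   weights, paths = prefix[0::2], prefix[1::2]
#   print(len(paths), sum(weights))  # 30 datasets; weights sum to ~0.9991
#   # (Megatron normalizes blend weights internally, so an exact sum of 1.0
#   # is not required.)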