diff --git "a/.gitattributes" "b/.gitattributes" --- "a/.gitattributes" +++ "b/.gitattributes" @@ -35,3 +35,2260 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text tokenizer.json filter=lfs diff=lfs merge=lfs -text trt/NeMo_bfloat16_tp1_rank0.engine filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/gemma-7b-sql-nemo.nemo filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/0.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/0.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/0.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/0.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/0.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/0.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/0.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/0.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/1.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/1.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/1.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/1.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/1.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/1.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/1.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/1.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/10.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/10.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/10.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/10.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/10.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/10.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/10.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/10.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/11.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/11.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/11.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/11.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/11.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/11.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/11.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/11.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/12.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/12.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/12.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/12.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/12.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/12.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/12.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/12.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/13.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/13.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/13.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/13.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/13.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/13.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/13.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/13.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/14.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/14.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/14.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/14.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/14.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/14.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/14.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/14.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/15.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/15.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/15.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/15.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/15.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/15.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/15.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/15.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/16.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/16.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/16.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/16.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/16.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/16.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/16.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/16.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/17.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/17.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/17.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/17.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/17.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/17.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/17.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/17.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/18.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/18.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/18.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/18.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/18.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/18.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/18.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/18.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/19.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/19.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/19.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/19.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/19.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/19.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/19.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/19.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/2.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/2.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/2.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/2.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/2.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/2.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/2.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/2.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/20.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/20.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/20.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/20.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/20.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/20.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/20.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/20.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/21.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/21.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/21.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/21.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/21.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/21.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/21.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/21.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/22.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/22.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/22.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/22.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/22.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/22.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/22.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/22.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/23.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/23.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/23.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/23.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/23.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/23.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/23.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/23.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/24.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/24.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/24.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/24.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/24.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/24.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/24.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/24.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/25.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/25.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/25.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/25.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/25.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/25.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/25.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/25.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/26.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/26.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/26.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/26.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/26.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/26.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/26.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/26.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/27.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/27.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/27.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/27.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/27.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/27.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/27.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/27.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/3.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/3.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/3.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/3.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/3.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/3.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/3.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/3.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/4.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/4.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/4.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/4.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/4.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/4.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/4.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/4.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/5.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/5.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/5.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/5.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/5.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/5.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/5.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/5.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/6.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/6.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/6.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/6.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/6.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/6.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/6.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/6.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/7.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/7.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/7.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/7.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/7.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/7.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/7.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/7.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/8.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/8.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/8.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/8.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/8.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/8.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/8.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/8.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/9.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/9.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/9.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/9.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/9.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/9.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/9.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc1.weight/9.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/0.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/0.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/0.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/0.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/1.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/1.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/1.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/1.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/10.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/10.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/10.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/10.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/11.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/11.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/11.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/11.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/12.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/12.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/12.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/12.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/13.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/13.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/13.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/13.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/14.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/14.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/14.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/14.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/15.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/15.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/15.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/15.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/16.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/16.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/16.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/16.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/17.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/17.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/17.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/17.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/18.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/18.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/18.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/18.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/19.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/19.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/19.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/19.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/2.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/2.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/2.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/2.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/20.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/20.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/20.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/20.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/21.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/21.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/21.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/21.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/22.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/22.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/22.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/22.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/23.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/23.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/23.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/23.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/24.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/24.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/24.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/24.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/25.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/25.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/25.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/25.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/26.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/26.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/26.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/26.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/27.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/27.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/27.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/27.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/3.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/3.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/3.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/3.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/4.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/4.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/4.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/4.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/5.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/5.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/5.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/5.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/6.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/6.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/6.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/6.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/7.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/7.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/7.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/7.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/8.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/8.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/8.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/8.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/9.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/9.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/9.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.mlp.linear_fc2.weight/9.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/0.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/0.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/0.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/0.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/1.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/1.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/1.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/1.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/10.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/10.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/10.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/10.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/11.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/11.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/11.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/11.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/12.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/12.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/12.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/12.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/13.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/13.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/13.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/13.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/14.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/14.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/14.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/14.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/15.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/15.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/15.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/15.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/16.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/16.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/16.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/16.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/17.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/17.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/17.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/17.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/18.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/18.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/18.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/18.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/19.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/19.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/19.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/19.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/2.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/2.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/2.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/2.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/20.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/20.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/20.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/20.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/21.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/21.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/21.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/21.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/22.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/22.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/22.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/22.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/23.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/23.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/23.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/23.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/24.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/24.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/24.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/24.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/25.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/25.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/25.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/25.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/26.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/26.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/26.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/26.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/27.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/27.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/27.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/27.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/3.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/3.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/3.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/3.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/4.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/4.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/4.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/4.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/5.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/5.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/5.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/5.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/6.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/6.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/6.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/6.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/7.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/7.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/7.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/7.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/8.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/8.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/8.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/8.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/9.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/9.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/9.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_proj.weight/9.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/0.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/0.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/0.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/0.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/1.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/1.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/1.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/1.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/10.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/10.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/10.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/10.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/11.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/11.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/11.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/11.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/12.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/12.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/12.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/12.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/13.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/13.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/13.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/13.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/14.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/14.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/14.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/14.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/15.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/15.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/15.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/15.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/16.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/16.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/16.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/16.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/17.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/17.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/17.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/17.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/18.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/18.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/18.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/18.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/19.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/19.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/19.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/19.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/2.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/2.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/2.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/2.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/20.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/20.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/20.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/20.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/21.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/21.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/21.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/21.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/22.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/22.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/22.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/22.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/23.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/23.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/23.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/23.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/24.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/24.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/24.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/24.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/25.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/25.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/25.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/25.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/26.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/26.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/26.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/26.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/27.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/27.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/27.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/27.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/3.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/3.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/3.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/3.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/4.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/4.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/4.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/4.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/5.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/5.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/5.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/5.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/6.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/6.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/6.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/6.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/7.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/7.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/7.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/7.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/8.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/8.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/8.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/8.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/9.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/9.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/9.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.decoder.layers.self_attention.linear_qkv.weight/9.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.embedding.word_embeddings.weight/0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.embedding.word_embeddings.weight/1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.embedding.word_embeddings.weight/2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/model.embedding.word_embeddings.weight/3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/0.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/0.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/0.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/0.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/0.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/0.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/0.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/0.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/1.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/1.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/1.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/1.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/1.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/1.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/1.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/1.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/10.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/10.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/10.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/10.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/10.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/10.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/10.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/10.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/11.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/11.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/11.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/11.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/11.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/11.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/11.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/11.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/12.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/12.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/12.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/12.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/12.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/12.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/12.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/12.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/13.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/13.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/13.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/13.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/13.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/13.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/13.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/13.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/14.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/14.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/14.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/14.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/14.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/14.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/14.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/14.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/15.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/15.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/15.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/15.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/15.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/15.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/15.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/15.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/16.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/16.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/16.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/16.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/16.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/16.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/16.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/16.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/17.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/17.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/17.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/17.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/17.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/17.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/17.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/17.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/18.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/18.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/18.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/18.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/18.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/18.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/18.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/18.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/19.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/19.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/19.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/19.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/19.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/19.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/19.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/19.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/2.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/2.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/2.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/2.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/2.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/2.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/2.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/2.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/20.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/20.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/20.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/20.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/20.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/20.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/20.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/20.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/21.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/21.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/21.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/21.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/21.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/21.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/21.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/21.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/22.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/22.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/22.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/22.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/22.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/22.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/22.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/22.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/23.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/23.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/23.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/23.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/23.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/23.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/23.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/23.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/24.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/24.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/24.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/24.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/24.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/24.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/24.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/24.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/25.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/25.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/25.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/25.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/25.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/25.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/25.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/25.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/26.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/26.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/26.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/26.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/26.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/26.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/26.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/26.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/27.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/27.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/27.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/27.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/27.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/27.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/27.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/27.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/3.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/3.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/3.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/3.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/3.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/3.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/3.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/3.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/4.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/4.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/4.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/4.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/4.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/4.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/4.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/4.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/5.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/5.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/5.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/5.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/5.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/5.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/5.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/5.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/6.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/6.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/6.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/6.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/6.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/6.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/6.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/6.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/7.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/7.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/7.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/7.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/7.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/7.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/7.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/7.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/8.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/8.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/8.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/8.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/8.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/8.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/8.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/8.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/9.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/9.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/9.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/9.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/9.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/9.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/9.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc1.weight/9.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/0.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/0.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/0.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/0.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/1.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/1.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/1.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/1.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/10.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/10.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/10.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/10.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/11.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/11.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/11.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/11.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/12.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/12.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/12.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/12.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/13.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/13.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/13.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/13.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/14.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/14.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/14.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/14.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/15.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/15.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/15.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/15.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/16.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/16.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/16.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/16.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/17.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/17.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/17.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/17.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/18.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/18.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/18.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/18.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/19.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/19.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/19.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/19.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/2.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/2.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/2.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/2.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/20.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/20.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/20.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/20.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/21.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/21.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/21.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/21.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/22.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/22.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/22.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/22.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/23.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/23.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/23.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/23.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/24.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/24.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/24.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/24.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/25.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/25.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/25.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/25.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/26.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/26.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/26.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/26.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/27.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/27.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/27.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/27.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/3.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/3.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/3.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/3.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/4.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/4.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/4.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/4.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/5.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/5.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/5.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/5.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/6.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/6.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/6.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/6.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/7.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/7.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/7.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/7.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/8.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/8.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/8.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/8.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/9.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/9.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/9.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.mlp.linear_fc2.weight/9.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/0.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/0.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/0.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/0.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/1.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/1.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/1.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/1.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/10.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/10.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/10.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/10.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/11.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/11.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/11.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/11.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/12.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/12.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/12.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/12.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/13.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/13.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/13.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/13.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/14.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/14.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/14.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/14.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/15.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/15.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/15.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/15.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/16.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/16.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/16.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/16.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/17.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/17.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/17.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/17.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/18.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/18.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/18.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/18.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/19.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/19.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/19.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/19.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/2.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/2.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/2.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/2.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/20.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/20.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/20.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/20.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/21.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/21.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/21.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/21.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/22.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/22.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/22.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/22.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/23.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/23.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/23.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/23.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/24.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/24.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/24.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/24.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/25.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/25.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/25.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/25.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/26.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/26.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/26.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/26.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/27.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/27.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/27.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/27.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/3.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/3.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/3.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/3.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/4.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/4.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/4.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/4.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/5.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/5.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/5.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/5.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/6.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/6.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/6.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/6.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/7.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/7.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/7.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/7.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/8.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/8.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/8.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/8.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/9.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/9.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/9.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_proj.weight/9.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/0.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/0.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/0.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/0.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/1.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/1.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/1.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/1.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/10.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/10.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/10.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/10.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/11.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/11.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/11.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/11.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/12.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/12.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/12.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/12.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/13.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/13.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/13.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/13.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/14.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/14.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/14.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/14.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/15.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/15.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/15.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/15.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/16.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/16.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/16.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/16.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/17.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/17.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/17.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/17.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/18.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/18.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/18.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/18.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/19.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/19.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/19.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/19.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/2.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/2.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/2.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/2.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/20.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/20.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/20.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/20.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/21.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/21.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/21.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/21.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/22.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/22.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/22.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/22.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/23.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/23.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/23.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/23.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/24.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/24.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/24.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/24.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/25.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/25.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/25.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/25.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/26.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/26.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/26.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/26.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/27.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/27.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/27.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/27.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/3.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/3.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/3.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/3.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/4.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/4.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/4.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/4.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/5.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/5.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/5.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/5.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/6.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/6.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/6.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/6.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/7.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/7.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/7.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/7.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/8.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/8.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/8.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/8.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/9.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/9.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/9.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.decoder.layers.self_attention.linear_qkv.weight/9.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.embedding.word_embeddings.weight/0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.embedding.word_embeddings.weight/1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.embedding.word_embeddings.weight/2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg.model.embedding.word_embeddings.weight/3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/0.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/0.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/0.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/0.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/0.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/0.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/0.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/0.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/1.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/1.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/1.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/1.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/1.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/1.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/1.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/1.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/10.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/10.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/10.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/10.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/10.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/10.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/10.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/10.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/11.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/11.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/11.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/11.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/11.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/11.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/11.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/11.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/12.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/12.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/12.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/12.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/12.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/12.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/12.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/12.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/13.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/13.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/13.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/13.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/13.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/13.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/13.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/13.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/14.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/14.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/14.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/14.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/14.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/14.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/14.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/14.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/15.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/15.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/15.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/15.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/15.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/15.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/15.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/15.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/16.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/16.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/16.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/16.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/16.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/16.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/16.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/16.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/17.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/17.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/17.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/17.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/17.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/17.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/17.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/17.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/18.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/18.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/18.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/18.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/18.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/18.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/18.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/18.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/19.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/19.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/19.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/19.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/19.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/19.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/19.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/19.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/2.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/2.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/2.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/2.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/2.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/2.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/2.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/2.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/20.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/20.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/20.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/20.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/20.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/20.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/20.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/20.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/21.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/21.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/21.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/21.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/21.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/21.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/21.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/21.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/22.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/22.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/22.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/22.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/22.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/22.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/22.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/22.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/23.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/23.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/23.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/23.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/23.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/23.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/23.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/23.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/24.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/24.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/24.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/24.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/24.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/24.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/24.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/24.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/25.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/25.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/25.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/25.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/25.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/25.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/25.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/25.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/26.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/26.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/26.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/26.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/26.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/26.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/26.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/26.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/27.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/27.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/27.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/27.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/27.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/27.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/27.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/27.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/3.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/3.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/3.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/3.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/3.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/3.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/3.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/3.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/4.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/4.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/4.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/4.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/4.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/4.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/4.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/4.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/5.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/5.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/5.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/5.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/5.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/5.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/5.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/5.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/6.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/6.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/6.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/6.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/6.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/6.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/6.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/6.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/7.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/7.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/7.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/7.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/7.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/7.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/7.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/7.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/8.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/8.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/8.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/8.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/8.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/8.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/8.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/8.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/9.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/9.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/9.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/9.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/9.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/9.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/9.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc1.weight/9.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/0.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/0.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/0.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/0.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/1.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/1.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/1.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/1.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/10.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/10.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/10.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/10.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/11.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/11.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/11.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/11.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/12.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/12.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/12.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/12.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/13.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/13.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/13.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/13.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/14.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/14.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/14.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/14.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/15.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/15.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/15.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/15.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/16.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/16.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/16.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/16.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/17.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/17.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/17.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/17.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/18.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/18.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/18.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/18.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/19.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/19.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/19.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/19.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/2.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/2.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/2.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/2.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/20.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/20.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/20.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/20.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/21.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/21.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/21.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/21.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/22.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/22.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/22.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/22.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/23.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/23.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/23.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/23.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/24.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/24.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/24.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/24.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/25.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/25.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/25.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/25.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/26.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/26.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/26.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/26.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/27.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/27.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/27.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/27.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/3.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/3.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/3.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/3.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/4.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/4.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/4.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/4.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/5.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/5.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/5.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/5.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/6.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/6.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/6.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/6.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/7.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/7.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/7.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/7.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/8.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/8.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/8.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/8.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/9.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/9.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/9.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.mlp.linear_fc2.weight/9.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/0.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/0.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/0.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/0.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/1.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/1.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/1.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/1.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/10.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/10.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/10.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/10.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/11.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/11.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/11.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/11.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/12.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/12.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/12.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/12.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/13.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/13.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/13.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/13.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/14.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/14.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/14.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/14.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/15.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/15.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/15.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/15.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/16.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/16.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/16.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/16.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/17.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/17.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/17.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/17.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/18.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/18.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/18.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/18.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/19.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/19.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/19.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/19.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/2.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/2.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/2.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/2.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/20.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/20.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/20.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/20.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/21.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/21.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/21.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/21.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/22.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/22.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/22.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/22.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/23.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/23.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/23.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/23.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/24.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/24.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/24.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/24.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/25.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/25.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/25.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/25.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/26.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/26.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/26.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/26.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/27.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/27.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/27.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/27.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/3.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/3.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/3.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/3.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/4.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/4.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/4.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/4.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/5.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/5.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/5.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/5.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/6.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/6.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/6.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/6.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/7.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/7.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/7.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/7.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/8.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/8.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/8.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/8.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/9.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/9.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/9.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_proj.weight/9.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/0.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/0.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/0.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/0.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/1.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/1.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/1.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/1.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/10.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/10.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/10.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/10.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/11.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/11.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/11.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/11.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/12.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/12.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/12.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/12.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/13.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/13.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/13.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/13.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/14.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/14.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/14.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/14.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/15.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/15.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/15.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/15.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/16.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/16.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/16.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/16.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/17.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/17.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/17.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/17.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/18.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/18.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/18.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/18.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/19.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/19.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/19.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/19.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/2.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/2.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/2.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/2.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/20.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/20.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/20.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/20.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/21.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/21.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/21.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/21.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/22.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/22.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/22.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/22.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/23.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/23.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/23.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/23.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/24.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/24.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/24.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/24.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/25.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/25.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/25.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/25.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/26.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/26.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/26.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/26.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/27.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/27.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/27.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/27.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/3.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/3.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/3.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/3.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/4.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/4.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/4.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/4.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/5.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/5.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/5.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/5.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/6.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/6.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/6.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/6.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/7.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/7.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/7.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/7.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/8.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/8.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/8.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/8.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/9.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/9.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/9.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.decoder.layers.self_attention.linear_qkv.weight/9.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.embedding.word_embeddings.weight/0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.embedding.word_embeddings.weight/1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.embedding.word_embeddings.weight/2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.exp_avg_sq.model.embedding.word_embeddings.weight/3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/0.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/0.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/0.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/0.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/1.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/1.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/1.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/1.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/10.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/10.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/10.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/10.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/11.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/11.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/11.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/11.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/12.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/12.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/12.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/12.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/13.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/13.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/13.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/13.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/14.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/14.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/14.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/14.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/15.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/15.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/15.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/15.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/16.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/16.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/16.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/16.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/17.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/17.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/17.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/17.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/18.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/18.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/18.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/18.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/19.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/19.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/19.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/19.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/2.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/2.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/2.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/2.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/20.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/20.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/20.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/20.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/21.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/21.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/21.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/21.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/22.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/22.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/22.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/22.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/23.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/23.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/23.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/23.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/24.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/24.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/24.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/24.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/25.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/25.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/25.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/25.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/26.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/26.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/26.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/26.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/27.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/27.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/27.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/27.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/3.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/3.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/3.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/3.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/4.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/4.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/4.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/4.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/5.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/5.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/5.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/5.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/6.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/6.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/6.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/6.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/7.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/7.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/7.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/7.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/8.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/8.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/8.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/8.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/9.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/9.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/9.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.mlp.linear_fc2.weight/9.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/0.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/0.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/0.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/0.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/1.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/1.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/1.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/1.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/10.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/10.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/10.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/10.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/11.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/11.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/11.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/11.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/12.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/12.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/12.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/12.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/13.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/13.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/13.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/13.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/14.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/14.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/14.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/14.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/15.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/15.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/15.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/15.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/16.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/16.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/16.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/16.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/17.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/17.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/17.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/17.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/18.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/18.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/18.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/18.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/19.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/19.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/19.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/19.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/2.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/2.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/2.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/2.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/20.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/20.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/20.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/20.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/21.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/21.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/21.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/21.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/22.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/22.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/22.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/22.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/23.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/23.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/23.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/23.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/24.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/24.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/24.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/24.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/25.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/25.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/25.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/25.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/26.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/26.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/26.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/26.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/27.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/27.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/27.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/27.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/3.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/3.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/3.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/3.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/4.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/4.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/4.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/4.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/5.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/5.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/5.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/5.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/6.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/6.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/6.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/6.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/7.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/7.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/7.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/7.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/8.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/8.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/8.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/8.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/9.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/9.0.1 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/9.0.2 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_proj.weight/9.0.3 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/0.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/0.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/0.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/0.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/1.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/1.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/1.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/1.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/10.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/10.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/10.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/10.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/11.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/11.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/11.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/11.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/12.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/12.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/12.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/12.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/13.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/13.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/13.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/13.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/14.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/14.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/14.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/14.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/15.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/15.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/15.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/15.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/16.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/16.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/16.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/16.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/17.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/17.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/17.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/17.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/18.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/18.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/18.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/18.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/19.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/19.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/19.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/19.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/2.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/2.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/2.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/2.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/20.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/20.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/20.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/20.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/21.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/21.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/21.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/21.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/22.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/22.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/22.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/22.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/23.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/23.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/23.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/23.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/24.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/24.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/24.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/24.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/25.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/25.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/25.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/25.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/26.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/26.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/26.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/26.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/27.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/27.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/27.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/27.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/3.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/3.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/3.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/3.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/4.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/4.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/4.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/4.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/5.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/5.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/5.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/5.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/6.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/6.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/6.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/6.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/7.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/7.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/7.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/7.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/8.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/8.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/8.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/8.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/9.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/9.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/9.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.decoder.layers.self_attention.linear_qkv.weight/9.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.embedding.word_embeddings.weight/0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.embedding.word_embeddings.weight/1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.embedding.word_embeddings.weight/2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.fp32_param.model.embedding.word_embeddings.weight/3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/0.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/0.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/0.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/0.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/0.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/0.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/0.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/0.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/1.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/1.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/1.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/1.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/1.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/1.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/1.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/1.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/10.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/10.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/10.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/10.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/10.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/10.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/10.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/10.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/11.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/11.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/11.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/11.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/11.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/11.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/11.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/11.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/12.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/12.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/12.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/12.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/12.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/12.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/12.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/12.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/13.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/13.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/13.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/13.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/13.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/13.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/13.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/13.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/14.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/14.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/14.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/14.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/14.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/14.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/14.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/14.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/15.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/15.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/15.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/15.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/15.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/15.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/15.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/15.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/16.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/16.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/16.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/16.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/16.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/16.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/16.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/16.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/17.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/17.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/17.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/17.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/17.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/17.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/17.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/17.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/18.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/18.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/18.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/18.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/18.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/18.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/18.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/18.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/19.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/19.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/19.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/19.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/19.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/19.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/19.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/19.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/2.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/2.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/2.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/2.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/2.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/2.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/2.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/2.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/20.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/20.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/20.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/20.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/20.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/20.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/20.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/20.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/21.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/21.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/21.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/21.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/21.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/21.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/21.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/21.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/22.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/22.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/22.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/22.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/22.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/22.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/22.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/22.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/23.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/23.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/23.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/23.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/23.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/23.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/23.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/23.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/24.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/24.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/24.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/24.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/24.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/24.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/24.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/24.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/25.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/25.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/25.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/25.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/25.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/25.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/25.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/25.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/26.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/26.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/26.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/26.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/26.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/26.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/26.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/26.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/27.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/27.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/27.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/27.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/27.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/27.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/27.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/27.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/3.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/3.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/3.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/3.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/3.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/3.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/3.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/3.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/4.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/4.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/4.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/4.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/4.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/4.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/4.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/4.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/5.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/5.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/5.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/5.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/5.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/5.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/5.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/5.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/6.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/6.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/6.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/6.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/6.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/6.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/6.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/6.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/7.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/7.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/7.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/7.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/7.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/7.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/7.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/7.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/8.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/8.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/8.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/8.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/8.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/8.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/8.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/8.7.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/9.0.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/9.1.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/9.2.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/9.3.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/9.4.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/9.5.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/9.6.0 filter=lfs diff=lfs merge=lfs -text +nemo/checkpoints/megatron_gpt_sft--validation_loss=0.000-step=613-consumed_samples=78464-epoch=1-last/optimizer.state.param.model.decoder.layers.mlp.linear_fc1.weight/9.7.0 filter=lfs diff=lfs merge=lfs -text