{
  "_name_or_path": "/localdata/jamesbr/dev/pretrained_checkpoints/pytorch_bert_large_phase1/",
  "architectures": [
    "PoptorchPipelinedBertForPretraining"
  ],
  "async_dataloader": true,
  "attention_probs_dropout_prob": 0.0,
  "auto_loss_scaling": false,
  "batch_size": 2,
  "batches_per_step": 1,
  "checkpoint_output_dir": "/localdata/jamesbr/dev/pretrained_checkpoints/pytorch_bert_large_phase2",
  "checkpoint_steps": null,
  "compile_only": false,
  "config": null,
  "custom_ops": true,
  "dataloader_workers": 64,
  "dataset": "pretraining",
  "disable_progress_bar": true,
  "embedding_serialization_factor": 2,
  "enable_half_first_order_momentum": false,
  "enable_half_partials": true,
  "executable_cache_dir": "",
  "file_buffer_size": 10,
  "global_batch_size": 16384,
  "gradient_accumulation": 2048,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "input_files": [
    "data/wikipedia/384/*.tfrecord"
  ],
  "intermediate_size": 4096,
  "ipus_per_replica": 4,
  "layer_norm_eps": 0.001,
  "layers_per_ipu": [
    3,
    7,
    7,
    7
  ],
  "learning_rate": 0.002828,
  "loss_scaling": 8192.0,
  "lr_schedule": "linear",
  "lr_warmup": 0.128,
  "mask_tokens": 56,
  "matmul_proportion": [
    0.15,
    0.25,
    0.25,
    0.25
  ],
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_epochs": null,
  "num_hidden_layers": 24,
  "optimizer": "LAMB",
  "optimizer_state_offchip": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "pred_head_transform": true,
  "pretrained_checkpoint": "/localdata/jamesbr/dev/pretrained_checkpoints/pytorch_bert_large_phase1/",
  "profile": false,
  "profile_dir": "profile",
  "random_seed": 42,
  "recompute_checkpoint_every_layer": true,
  "replicated_tensor_sharding": true,
  "replication_factor": 4,
  "restore_steps_and_optimizer": false,
  "samples_per_step": 16384,
  "sdk_version": "poplar_sdk-ubuntu_18_04-2.3.0-EA.1+716-757737e247",
  "sequence_length": 384,
  "squad_do_training": true,
  "squad_do_validation": true,
  "synthetic_data": false,
  "training_steps": 2137,
  "transformers_version": "4.7.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "use_popdist": false,
  "vocab_size": 30522,
  "wandb": true,
  "wandb_param_steps": null,
  "weight_decay": 0.01
}