bert-large-uncased / config.json
Commit b02bd0a ("first version of the model", Sylvain Viguier)
{
"_name_or_path": "/localdata/jamesbr/dev/pretrained_checkpoints/pytorch_bert_large_phase1/",
"architectures": [
"PoptorchPipelinedBertForPretraining"
],
"async_dataloader": true,
"attention_probs_dropout_prob": 0.0,
"auto_loss_scaling": false,
"batch_size": 2,
"batches_per_step": 1,
"checkpoint_output_dir": "/localdata/jamesbr/dev/pretrained_checkpoints/pytorch_bert_large_phase2",
"checkpoint_steps": null,
"compile_only": false,
"config": null,
"custom_ops": true,
"dataloader_workers": 64,
"dataset": "pretraining",
"disable_progress_bar": true,
"embedding_serialization_factor": 2,
"enable_half_first_order_momentum": false,
"enable_half_partials": true,
"executable_cache_dir": "",
"file_buffer_size": 10,
"global_batch_size": 16384,
"gradient_accumulation": 2048,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"input_files": [
"data/wikipedia/384/*.tfrecord"
],
"intermediate_size": 4096,
"ipus_per_replica": 4,
"layer_norm_eps": 0.001,
"layers_per_ipu": [
3,
7,
7,
7
],
"learning_rate": 0.002828,
"loss_scaling": 8192.0,
"lr_schedule": "linear",
"lr_warmup": 0.128,
"mask_tokens": 56,
"matmul_proportion": [
0.15,
0.25,
0.25,
0.25
],
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 16,
"num_epochs": null,
"num_hidden_layers": 24,
"optimizer": "LAMB",
"optimizer_state_offchip": true,
"pad_token_id": 0,
"position_embedding_type": "absolute",
"pred_head_transform": true,
"pretrained_checkpoint": "/localdata/jamesbr/dev/pretrained_checkpoints/pytorch_bert_large_phase1/",
"profile": false,
"profile_dir": "profile",
"random_seed": 42,
"recompute_checkpoint_every_layer": true,
"replicated_tensor_sharding": true,
"replication_factor": 4,
"restore_steps_and_optimizer": false,
"samples_per_step": 16384,
"sdk_version": "poplar_sdk-ubuntu_18_04-2.3.0-EA.1+716-757737e247",
"sequence_length": 384,
"squad_do_training": true,
"squad_do_validation": true,
"synthetic_data": false,
"training_steps": 2137,
"transformers_version": "4.7.0",
"type_vocab_size": 2,
"use_cache": true,
"use_popdist": false,
"vocab_size": 30522,
"wandb": true,
"wandb_param_steps": null,
"weight_decay": 0.01
}
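
For reference, the batching fields above are mutually consistent: batch_size (2) x gradient_accumulation (2048) x replication_factor (4) = 16384, which matches both global_batch_size and samples_per_step (batches_per_step is 1). Likewise, layers_per_ipu (3 + 7 + 7 + 7) accounts for all 24 hidden layers across the 4 IPUs of one replica; the lighter first stage presumably leaves room on IPU0 for the serialized embedding table (embedding_serialization_factor is 2). Below is a minimal sketch of consuming this file with the Transformers library; the Hub repo id "Graphcore/bert-large-uncased" is an assumption, and note that the IPU-specific keys are not part of the standard BertConfig schema, so they survive only as plain attributes on the loaded object.

from transformers import BertConfig

# Load the config from the Hub. "Graphcore/bert-large-uncased" is an
# assumed repo id; substitute a local path to this config.json if needed.
config = BertConfig.from_pretrained("Graphcore/bert-large-uncased")

# PretrainedConfig keeps keys it does not recognise (the IPU/PopTorch
# settings such as layers_per_ipu) as plain attributes.
print(config.layers_per_ipu)  # [3, 7, 7, 7]

# Sanity-check the batching arithmetic encoded in the file:
# micro batch * gradient accumulation * replicas = global batch.
assert (config.batch_size
        * config.gradient_accumulation
        * config.replication_factor) == config.global_batch_size  # 2 * 2048 * 4 = 16384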