Transformers
PyTorch
Graphcore
bert
generated_from_trainer
Inference Endpoints
Sylvain Viguier committed on
Commit
b02bd0a
1 Parent(s): 53f0d2a

first version of the model

Browse files
Files changed (3) hide show
  1. config.json +87 -0
  2. pytorch_model.bin +3 -0
  3. training_state.pt +3 -0
config.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/localdata/jamesbr/dev/pretrained_checkpoints/pytorch_bert_large_phase1/",
3
+ "architectures": [
4
+ "PoptorchPipelinedBertForPretraining"
5
+ ],
6
+ "async_dataloader": true,
7
+ "attention_probs_dropout_prob": 0.0,
8
+ "auto_loss_scaling": false,
9
+ "batch_size": 2,
10
+ "batches_per_step": 1,
11
+ "checkpoint_output_dir": "/localdata/jamesbr/dev/pretrained_checkpoints/pytorch_bert_large_phase2",
12
+ "checkpoint_steps": null,
13
+ "compile_only": false,
14
+ "config": null,
15
+ "custom_ops": true,
16
+ "dataloader_workers": 64,
17
+ "dataset": "pretraining",
18
+ "disable_progress_bar": true,
19
+ "embedding_serialization_factor": 2,
20
+ "enable_half_first_order_momentum": false,
21
+ "enable_half_partials": true,
22
+ "executable_cache_dir": "",
23
+ "file_buffer_size": 10,
24
+ "global_batch_size": 16384,
25
+ "gradient_accumulation": 2048,
26
+ "gradient_checkpointing": false,
27
+ "hidden_act": "gelu",
28
+ "hidden_dropout_prob": 0.1,
29
+ "hidden_size": 1024,
30
+ "initializer_range": 0.02,
31
+ "input_files": [
32
+ "data/wikipedia/384/*.tfrecord"
33
+ ],
34
+ "intermediate_size": 4096,
35
+ "ipus_per_replica": 4,
36
+ "layer_norm_eps": 0.001,
37
+ "layers_per_ipu": [
38
+ 3,
39
+ 7,
40
+ 7,
41
+ 7
42
+ ],
43
+ "learning_rate": 0.002828,
44
+ "loss_scaling": 8192.0,
45
+ "lr_schedule": "linear",
46
+ "lr_warmup": 0.128,
47
+ "mask_tokens": 56,
48
+ "matmul_proportion": [
49
+ 0.15,
50
+ 0.25,
51
+ 0.25,
52
+ 0.25
53
+ ],
54
+ "max_position_embeddings": 512,
55
+ "model_type": "bert",
56
+ "num_attention_heads": 16,
57
+ "num_epochs": null,
58
+ "num_hidden_layers": 24,
59
+ "optimizer": "LAMB",
60
+ "optimizer_state_offchip": true,
61
+ "pad_token_id": 0,
62
+ "position_embedding_type": "absolute",
63
+ "pred_head_transform": true,
64
+ "pretrained_checkpoint": "/localdata/jamesbr/dev/pretrained_checkpoints/pytorch_bert_large_phase1/",
65
+ "profile": false,
66
+ "profile_dir": "profile",
67
+ "random_seed": 42,
68
+ "recompute_checkpoint_every_layer": true,
69
+ "replicated_tensor_sharding": true,
70
+ "replication_factor": 4,
71
+ "restore_steps_and_optimizer": false,
72
+ "samples_per_step": 16384,
73
+ "sdk_version": "poplar_sdk-ubuntu_18_04-2.3.0-EA.1+716-757737e247",
74
+ "sequence_length": 384,
75
+ "squad_do_training": true,
76
+ "squad_do_validation": true,
77
+ "synthetic_data": false,
78
+ "training_steps": 2137,
79
+ "transformers_version": "4.7.0",
80
+ "type_vocab_size": 2,
81
+ "use_cache": true,
82
+ "use_popdist": false,
83
+ "vocab_size": 30522,
84
+ "wandb": true,
85
+ "wandb_param_steps": null,
86
+ "weight_decay": 0.01
87
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0aa7af4e8c9c66b940751f91be95590f3dde4aba5160d65d5ce952dff2835a63
3
+ size 672610403
training_state.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e170da99eae1aeb46476dfb799e440557c0a07e40043abe92e1e339048452f2
3
+ size 5167