versae committed on
Commit 39d9314
Parent: c2aa204

Saving weights and logs of step 1000

config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "roberta-base",
+  "_name_or_path": "./",
   "architectures": [
     "RobertaForMaskedLM"
   ],
@@ -19,6 +19,7 @@
   "num_hidden_layers": 12,
   "pad_token_id": 1,
   "position_embedding_type": "absolute",
+  "torch_dtype": "bfloat16",
   "transformers_version": "4.16.0.dev0",
   "type_vocab_size": 1,
   "use_cache": true,
events.out.tfevents.1642353687.t1v-n-00e295a4-w-0.437639.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64a2c72eddd38a2e2c1f27d2fbc653a4aadf40419494842b53dd7f7559a30129
+size 146996
events.out.tfevents.1642508113.t1v-n-00e295a4-w-0.632677.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ff28af8474a1a3bed992c3c440935d0776de435ca3b3eab3d8a8ed8cec34a2f
+size 40
events.out.tfevents.1642508176.t1v-n-00e295a4-w-0.645011.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79ef23d9737ae60fddcddd281a6127f5325f3d0c8080d8b3b17c8105bf147c82
+size 40
events.out.tfevents.1642508417.t1v-n-00e295a4-w-0.662728.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2cd702f0bd422826437b0d591702c3d773f8a3c8a4b25c8a2a91f848f42b2ad
+size 40
events.out.tfevents.1642509967.t1v-n-00e295a4-w-0.712066.0.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0694b7f11a53d724a4552a8be128d5008f0c34de819b6925531da247835c6afe
+size 40
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3ef24399f6318fe5bd62ae854e2d6bfd60c2ce162537aafd46d63d7221cfe3e
+size 498796983
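The files above are git-lfs pointers; the actual bytes live in LFS storage. A hypothetical integrity check (not part of the commit) against the oid recorded in the flax_model.msgpack pointer:

import hashlib

def sha256_of(path: str) -> str:
    # Stream the file in 1 MiB chunks so a ~500 MB checkpoint fits in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

# Should match the oid in the flax_model.msgpack pointer above.
assert sha256_of("flax_model.msgpack") == (
    "b3ef24399f6318fe5bd62ae854e2d6bfd60c2ce162537aafd46d63d7221cfe3e"
)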
run_mlm_flax.py CHANGED
@@ -640,17 +640,17 @@ def main():
         # - mask for weight decay is not implemented but we don't use it anyway
         optimizer = distributed_shampoo(
             linear_decay_lr_schedule_fn,
-            block_size=1024, # recommended default for large LM is 1536
+            block_size=1536, # 1024 by Boris, recommended default for large LM is 1536
             beta1=training_args.adam_beta1, # 0.9,
             beta2=training_args.adam_beta2, # 0.999,
             diagonal_epsilon=training_args.adam_epsilon, # 1e-10,
             matrix_epsilon=1e-8,
             weight_decay=training_args.weight_decay, # 0.0,
             start_preconditioning_step=1001,
-            preconditioning_compute_steps=10,
+            preconditioning_compute_steps=1,
             statistics_compute_steps=1,
             best_effort_shape_interpretation=True,
-            graft_type=GraftingType.RMSPROP_NORMALIZED,
+            graft_type=GraftingType.SGD, # GraftingType.RMSPROP_NORMALIZED,
             nesterov=False,
             exponent_override=0,
             batch_axis_name="batch",
@@ -658,7 +658,7 @@
             moving_average_for_momentum=True,
             skip_preconditioning_dim_size_gt=4096,
             clip_by_scaled_gradient_norm=None,
-            precision=jax.lax.Precision.HIGHEST,
+            precision=jax.lax.Precision.DEFAULT, # DEFAULT is bfloat16
         )
     else:
         optimizer = optax.adamw(
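The precision switch above trades matmul accuracy for TPU speed: Precision.DEFAULT uses single-pass bfloat16 matmuls, while Precision.HIGHEST runs multiple passes for near-float32 accuracy. An illustrative sketch (not from the commit) of the same knob on a plain matmul:

import jax
import jax.numpy as jnp

a = jnp.ones((128, 128), dtype=jnp.bfloat16)
b = jnp.ones((128, 128), dtype=jnp.bfloat16)

# Single-pass bfloat16 accumulation: fastest on TPU matrix units.
fast = jnp.matmul(a, b, precision=jax.lax.Precision.DEFAULT)

# Multi-pass accumulation: closer to float32 accuracy, noticeably slower.
accurate = jnp.matmul(a, b, precision=jax.lax.Precision.HIGHEST)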
train.128.sh CHANGED
@@ -1,18 +1,18 @@
 python run_mlm_flax.py \
     --output_dir="./" \
     --model_type="roberta" \
-    --config_name="roberta-base" \
+    --config_name="./" \
     --tokenizer_name="NbAiLab/nb-roberta-base" \
     --dataset_name="NbAiLab/NCC" \
     --max_seq_length="128" \
     --weight_decay="0.0" \
-    --per_device_train_batch_size="232" \
-    --per_device_eval_batch_size="232" \
+    --per_device_train_batch_size="210" \
+    --per_device_eval_batch_size="210" \
     --pad_to_max_length \
     --learning_rate="6e-4" \
     --warmup_steps="10000" \
     --overwrite_output_dir \
-    --num_train_epochs="3" \
+    --num_train_epochs="5" \
     --distributed_shampoo \
     --adam_beta1="0.9" \
     --adam_beta2="0.99" \
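Back-of-envelope on the new batch size (illustrative only; the TPU core count is not recorded in this commit, so 8 cores is an assumption for a single v3-8 host):

per_device_batch = 210  # new --per_device_train_batch_size
devices = 8             # assumed TPU v3-8; not stated in the commit
seq_len = 128           # --max_seq_length

tokens_per_step = per_device_batch * devices * seq_len
print(tokens_per_step)  # 215040 tokens per optimizer step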