Saving weights and logs of step 1000
Browse files- config.json +2 -1
- events.out.tfevents.1642353687.t1v-n-00e295a4-w-0.437639.0.v2 +3 -0
- events.out.tfevents.1642508113.t1v-n-00e295a4-w-0.632677.0.v2 +3 -0
- events.out.tfevents.1642508176.t1v-n-00e295a4-w-0.645011.0.v2 +3 -0
- events.out.tfevents.1642508417.t1v-n-00e295a4-w-0.662728.0.v2 +3 -0
- events.out.tfevents.1642509967.t1v-n-00e295a4-w-0.712066.0.v2 +3 -0
- flax_model.msgpack +3 -0
- run_mlm_flax.py +4 -4
- train.128.sh +4 -4
config.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"_name_or_path": "
|
3 |
"architectures": [
|
4 |
"RobertaForMaskedLM"
|
5 |
],
|
@@ -19,6 +19,7 @@
|
|
19 |
"num_hidden_layers": 12,
|
20 |
"pad_token_id": 1,
|
21 |
"position_embedding_type": "absolute",
|
|
|
22 |
"transformers_version": "4.16.0.dev0",
|
23 |
"type_vocab_size": 1,
|
24 |
"use_cache": true,
|
1 |
{
|
2 |
+
"_name_or_path": "./",
|
3 |
"architectures": [
|
4 |
"RobertaForMaskedLM"
|
5 |
],
|
19 |
"num_hidden_layers": 12,
|
20 |
"pad_token_id": 1,
|
21 |
"position_embedding_type": "absolute",
|
22 |
+
"torch_dtype": "bfloat16",
|
23 |
"transformers_version": "4.16.0.dev0",
|
24 |
"type_vocab_size": 1,
|
25 |
"use_cache": true,
|
events.out.tfevents.1642353687.t1v-n-00e295a4-w-0.437639.0.v2
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:64a2c72eddd38a2e2c1f27d2fbc653a4aadf40419494842b53dd7f7559a30129
|
3 |
+
size 146996
|
events.out.tfevents.1642508113.t1v-n-00e295a4-w-0.632677.0.v2
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0ff28af8474a1a3bed992c3c440935d0776de435ca3b3eab3d8a8ed8cec34a2f
|
3 |
+
size 40
|
events.out.tfevents.1642508176.t1v-n-00e295a4-w-0.645011.0.v2
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:79ef23d9737ae60fddcddd281a6127f5325f3d0c8080d8b3b17c8105bf147c82
|
3 |
+
size 40
|
events.out.tfevents.1642508417.t1v-n-00e295a4-w-0.662728.0.v2
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2cd702f0bd422826437b0d591702c3d773f8a3c8a4b25c8a2a91f848f42b2ad
|
3 |
+
size 40
|
events.out.tfevents.1642509967.t1v-n-00e295a4-w-0.712066.0.v2
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0694b7f11a53d724a4552a8be128d5008f0c34de819b6925531da247835c6afe
|
3 |
+
size 40
|
flax_model.msgpack
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b3ef24399f6318fe5bd62ae854e2d6bfd60c2ce162537aafd46d63d7221cfe3e
|
3 |
+
size 498796983
|
run_mlm_flax.py
CHANGED
@@ -640,17 +640,17 @@ def main():
|
|
640 |
# - mask for weight decay is not implemented but we don't use it anyway
|
641 |
optimizer = distributed_shampoo(
|
642 |
linear_decay_lr_schedule_fn,
|
643 |
-
block_size=
|
644 |
beta1=training_args.adam_beta1, # 0.9,
|
645 |
beta2=training_args.adam_beta2, # 0.999,
|
646 |
diagonal_epsilon=training_args.adam_epsilon, # 1e-10,
|
647 |
matrix_epsilon=1e-8,
|
648 |
weight_decay=training_args.weight_decay, # 0.0,
|
649 |
start_preconditioning_step=1001,
|
650 |
-
preconditioning_compute_steps=
|
651 |
statistics_compute_steps=1,
|
652 |
best_effort_shape_interpretation=True,
|
653 |
-
graft_type=GraftingType.RMSPROP_NORMALIZED,
|
654 |
nesterov=False,
|
655 |
exponent_override=0,
|
656 |
batch_axis_name="batch",
|
@@ -658,7 +658,7 @@ def main():
|
|
658 |
moving_average_for_momentum=True,
|
659 |
skip_preconditioning_dim_size_gt=4096,
|
660 |
clip_by_scaled_gradient_norm=None,
|
661 |
-
precision=jax.lax.Precision.
|
662 |
)
|
663 |
else:
|
664 |
optimizer = optax.adamw(
|
640 |
# - mask for weight decay is not implemented but we don't use it anyway
|
641 |
optimizer = distributed_shampoo(
|
642 |
linear_decay_lr_schedule_fn,
|
643 |
+
block_size=1536, # 1024 by Boris, recommended default for large LM is 1536
|
644 |
beta1=training_args.adam_beta1, # 0.9,
|
645 |
beta2=training_args.adam_beta2, # 0.999,
|
646 |
diagonal_epsilon=training_args.adam_epsilon, # 1e-10,
|
647 |
matrix_epsilon=1e-8,
|
648 |
weight_decay=training_args.weight_decay, # 0.0,
|
649 |
start_preconditioning_step=1001,
|
650 |
+
preconditioning_compute_steps=1,
|
651 |
statistics_compute_steps=1,
|
652 |
best_effort_shape_interpretation=True,
|
653 |
+
graft_type=GraftingType.SGD, # GraftingType.RMSPROP_NORMALIZED,
|
654 |
nesterov=False,
|
655 |
exponent_override=0,
|
656 |
batch_axis_name="batch",
|
658 |
moving_average_for_momentum=True,
|
659 |
skip_preconditioning_dim_size_gt=4096,
|
660 |
clip_by_scaled_gradient_norm=None,
|
661 |
+
precision=jax.lax.Precision.DEFAULT, # DEFAULT is bfloat16
|
662 |
)
|
663 |
else:
|
664 |
optimizer = optax.adamw(
|
train.128.sh
CHANGED
@@ -1,18 +1,18 @@
|
|
1 |
python run_mlm_flax.py \
|
2 |
--output_dir="./" \
|
3 |
--model_type="roberta" \
|
4 |
-
--config_name="
|
5 |
--tokenizer_name="NbAiLab/nb-roberta-base" \
|
6 |
--dataset_name="NbAiLab/NCC" \
|
7 |
--max_seq_length="128" \
|
8 |
--weight_decay="0.0" \
|
9 |
-
--per_device_train_batch_size="
|
10 |
-
--per_device_eval_batch_size="
|
11 |
--pad_to_max_length \
|
12 |
--learning_rate="6e-4" \
|
13 |
--warmup_steps="10000" \
|
14 |
--overwrite_output_dir \
|
15 |
-
--num_train_epochs="
|
16 |
--distributed_shampoo \
|
17 |
--adam_beta1="0.9" \
|
18 |
--adam_beta2="0.99" \
|
1 |
python run_mlm_flax.py \
|
2 |
--output_dir="./" \
|
3 |
--model_type="roberta" \
|
4 |
+
--config_name="./" \
|
5 |
--tokenizer_name="NbAiLab/nb-roberta-base" \
|
6 |
--dataset_name="NbAiLab/NCC" \
|
7 |
--max_seq_length="128" \
|
8 |
--weight_decay="0.0" \
|
9 |
+
--per_device_train_batch_size="210" \
|
10 |
+
--per_device_eval_batch_size="210" \
|
11 |
--pad_to_max_length \
|
12 |
--learning_rate="6e-4" \
|
13 |
--warmup_steps="10000" \
|
14 |
--overwrite_output_dir \
|
15 |
+
--num_train_epochs="5" \
|
16 |
--distributed_shampoo \
|
17 |
--adam_beta1="0.9" \
|
18 |
--adam_beta2="0.99" \
|