Model save
Browse files- README.md +3 -3
- all_results.json +5 -5
- breeze-listen-w2v2-kn-GF.log +7 -6
- model.safetensors +1 -1
- train-ctc-model.sh +5 -4
- train_results.json +5 -5
- trainer_state.json +12 -12
- training_args.bin +1 -1
README.md
CHANGED
@@ -35,12 +35,12 @@ More information needed
|
|
35 |
|
36 |
The following hyperparameters were used during training:
|
37 |
- learning_rate: 0.001
|
38 |
-
- train_batch_size:
|
39 |
- eval_batch_size: 8
|
40 |
- seed: 42
|
41 |
- distributed_type: multi-GPU
|
42 |
-
- gradient_accumulation_steps:
|
43 |
-
- total_train_batch_size:
|
44 |
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
|
45 |
- lr_scheduler_type: linear
|
46 |
- lr_scheduler_warmup_steps: 100
|
|
|
35 |
|
36 |
The following hyperparameters were used during training:
|
37 |
- learning_rate: 0.001
|
38 |
+
- train_batch_size: 2
|
39 |
- eval_batch_size: 8
|
40 |
- seed: 42
|
41 |
- distributed_type: multi-GPU
|
42 |
+
- gradient_accumulation_steps: 8
|
43 |
+
- total_train_batch_size: 16
|
44 |
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
|
45 |
- lr_scheduler_type: linear
|
46 |
- lr_scheduler_warmup_steps: 100
|
all_results.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
{
|
2 |
-
"epoch": 3.
|
3 |
-
"train_loss": 3.
|
4 |
-
"train_runtime":
|
5 |
"train_samples": 2471,
|
6 |
-
"train_samples_per_second": 0.
|
7 |
-
"train_steps_per_second": 0.
|
8 |
}
|
|
|
1 |
{
|
2 |
+
"epoch": 3.94,
|
3 |
+
"train_loss": 3.1379870364540503,
|
4 |
+
"train_runtime": 12199.309,
|
5 |
"train_samples": 2471,
|
6 |
+
"train_samples_per_second": 0.81,
|
7 |
+
"train_steps_per_second": 0.012
|
8 |
}
|
breeze-listen-w2v2-kn-GF.log
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
-
02/04/2024 18:
|
2 |
-
02/04/2024 18:
|
3 |
_n_gpu=1,
|
4 |
adafactor=False,
|
5 |
adam_beta1=0.9,
|
@@ -39,7 +39,7 @@ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
|
|
39 |
fsdp_min_num_params=0,
|
40 |
fsdp_transformer_layer_cls_to_wrap=None,
|
41 |
full_determinism=False,
|
42 |
-
gradient_accumulation_steps=
|
43 |
gradient_checkpointing=True,
|
44 |
gradient_checkpointing_kwargs=None,
|
45 |
greater_is_better=None,
|
@@ -64,7 +64,7 @@ local_rank=0,
|
|
64 |
log_level=passive,
|
65 |
log_level_replica=warning,
|
66 |
log_on_each_node=True,
|
67 |
-
logging_dir=/cosmos/home/sp-operator/ai/training/models/simpragma/breeze-listen-w2v2-kn-GF/runs/
|
68 |
logging_first_step=False,
|
69 |
logging_nan_inf_filter=True,
|
70 |
logging_steps=500,
|
@@ -84,7 +84,7 @@ output_dir=/cosmos/home/sp-operator/ai/training/models/simpragma/breeze-listen-w
|
|
84 |
overwrite_output_dir=True,
|
85 |
past_index=-1,
|
86 |
per_device_eval_batch_size=8,
|
87 |
-
per_device_train_batch_size=
|
88 |
prediction_loss_only=False,
|
89 |
push_to_hub=True,
|
90 |
push_to_hub_model_id=None,
|
@@ -119,4 +119,5 @@ warmup_ratio=0.0,
|
|
119 |
warmup_steps=100,
|
120 |
weight_decay=0.0,
|
121 |
)
|
122 |
-
{'
|
|
|
|
1 |
+
02/04/2024 23:18:42 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, 16-bits training: True
|
2 |
+
02/04/2024 23:18:42 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
|
3 |
_n_gpu=1,
|
4 |
adafactor=False,
|
5 |
adam_beta1=0.9,
|
|
|
39 |
fsdp_min_num_params=0,
|
40 |
fsdp_transformer_layer_cls_to_wrap=None,
|
41 |
full_determinism=False,
|
42 |
+
gradient_accumulation_steps=8,
|
43 |
gradient_checkpointing=True,
|
44 |
gradient_checkpointing_kwargs=None,
|
45 |
greater_is_better=None,
|
|
|
64 |
log_level=passive,
|
65 |
log_level_replica=warning,
|
66 |
log_on_each_node=True,
|
67 |
+
logging_dir=/cosmos/home/sp-operator/ai/training/models/simpragma/breeze-listen-w2v2-kn-GF/runs/Feb04_23-18-41_knight,
|
68 |
logging_first_step=False,
|
69 |
logging_nan_inf_filter=True,
|
70 |
logging_steps=500,
|
|
|
84 |
overwrite_output_dir=True,
|
85 |
past_index=-1,
|
86 |
per_device_eval_batch_size=8,
|
87 |
+
per_device_train_batch_size=2,
|
88 |
prediction_loss_only=False,
|
89 |
push_to_hub=True,
|
90 |
push_to_hub_model_id=None,
|
|
|
119 |
warmup_steps=100,
|
120 |
weight_decay=0.0,
|
121 |
)
|
122 |
+
{'loss': 1.1689, 'learning_rate': 0.0002306201550387597, 'epoch': 3.24}
|
123 |
+
{'train_runtime': 12565.5546, 'train_samples_per_second': 0.787, 'train_steps_per_second': 0.049, 'train_loss': 0.9830725657475459, 'epoch': 3.99}
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 3859264976
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ba57738ed6370499d692e4056bc3cf252c2f39cd7e074d05d585a03caace6e53
|
3 |
size 3859264976
|
train-ctc-model.sh
CHANGED
@@ -44,7 +44,8 @@ export "WORLD_SIZE"="1"
|
|
44 |
MODEL=w2v2
|
45 |
|
46 |
# Model names and other stuff
|
47 |
-
BASE_MODEL="facebook/mms-1b-all"
|
|
|
48 |
|
49 |
JUST_LANG=${LANG%%_*}
|
50 |
MY_MODEL="breeze-listen-${MODEL}-${JUST_LANG}-GF"
|
@@ -55,8 +56,8 @@ echo "OUTDIR: ${OUTDIR}"
|
|
55 |
# Training parameters you can tweak. Feel free to directly change any of the parameters below.
|
56 |
|
57 |
MAX_EPOCHS=4
|
58 |
-
TRAIN_BATCH_SIZE=
|
59 |
-
EVAL_BATCH_SIZE=
|
60 |
LEARNING_RATE="1e-3"
|
61 |
|
62 |
EVAL_STEPS="1000"
|
@@ -80,7 +81,7 @@ python ${SCRIPT_DIR}/run_speech_recognition_ctc_adapter.py \
|
|
80 |
--output_dir="${OUTDIR}" \
|
81 |
--num_train_epochs="${MAX_EPOCHS}" \
|
82 |
--per_device_train_batch_size="${TRAIN_BATCH_SIZE}" \
|
83 |
-
--gradient_accumulation_steps="
|
84 |
--learning_rate="${LEARNING_RATE}" \
|
85 |
--warmup_steps="100" \
|
86 |
--evaluation_strategy="steps" \
|
|
|
44 |
MODEL=w2v2
|
45 |
|
46 |
# Model names and other stuff
|
47 |
+
#BASE_MODEL="facebook/mms-1b-all"
|
48 |
+
BASE_MODEL="facebook/mms-1b-fl102"
|
49 |
|
50 |
JUST_LANG=${LANG%%_*}
|
51 |
MY_MODEL="breeze-listen-${MODEL}-${JUST_LANG}-GF"
|
|
|
56 |
# Training parameters you can tweak. Feel free to directly change any of the parameters below.
|
57 |
|
58 |
MAX_EPOCHS=4
|
59 |
+
TRAIN_BATCH_SIZE=4
|
60 |
+
EVAL_BATCH_SIZE=4
|
61 |
LEARNING_RATE="1e-3"
|
62 |
|
63 |
EVAL_STEPS="1000"
|
|
|
81 |
--output_dir="${OUTDIR}" \
|
82 |
--num_train_epochs="${MAX_EPOCHS}" \
|
83 |
--per_device_train_batch_size="${TRAIN_BATCH_SIZE}" \
|
84 |
+
--gradient_accumulation_steps="16" \
|
85 |
--learning_rate="${LEARNING_RATE}" \
|
86 |
--warmup_steps="100" \
|
87 |
--evaluation_strategy="steps" \
|
train_results.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
{
|
2 |
-
"epoch": 3.
|
3 |
-
"train_loss": 3.
|
4 |
-
"train_runtime":
|
5 |
"train_samples": 2471,
|
6 |
-
"train_samples_per_second": 0.
|
7 |
-
"train_steps_per_second": 0.
|
8 |
}
|
|
|
1 |
{
|
2 |
+
"epoch": 3.94,
|
3 |
+
"train_loss": 3.1379870364540503,
|
4 |
+
"train_runtime": 12199.309,
|
5 |
"train_samples": 2471,
|
6 |
+
"train_samples_per_second": 0.81,
|
7 |
+
"train_steps_per_second": 0.012
|
8 |
}
|
trainer_state.json
CHANGED
@@ -1,30 +1,30 @@
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
-
"epoch": 3.
|
5 |
"eval_steps": 1000,
|
6 |
-
"global_step":
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
10 |
"log_history": [
|
11 |
{
|
12 |
-
"epoch": 3.
|
13 |
-
"step":
|
14 |
-
"total_flos": 1.
|
15 |
-
"train_loss": 3.
|
16 |
-
"train_runtime":
|
17 |
-
"train_samples_per_second": 0.
|
18 |
-
"train_steps_per_second": 0.
|
19 |
}
|
20 |
],
|
21 |
"logging_steps": 500,
|
22 |
-
"max_steps":
|
23 |
"num_input_tokens_seen": 0,
|
24 |
"num_train_epochs": 4,
|
25 |
"save_steps": 1000,
|
26 |
-
"total_flos": 1.
|
27 |
-
"train_batch_size":
|
28 |
"trial_name": null,
|
29 |
"trial_params": null
|
30 |
}
|
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
+
"epoch": 3.9352750809061487,
|
5 |
"eval_steps": 1000,
|
6 |
+
"global_step": 152,
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
10 |
"log_history": [
|
11 |
{
|
12 |
+
"epoch": 3.94,
|
13 |
+
"step": 152,
|
14 |
+
"total_flos": 1.1080848497912578e+19,
|
15 |
+
"train_loss": 3.1379870364540503,
|
16 |
+
"train_runtime": 12199.309,
|
17 |
+
"train_samples_per_second": 0.81,
|
18 |
+
"train_steps_per_second": 0.012
|
19 |
}
|
20 |
],
|
21 |
"logging_steps": 500,
|
22 |
+
"max_steps": 152,
|
23 |
"num_input_tokens_seen": 0,
|
24 |
"num_train_epochs": 4,
|
25 |
"save_steps": 1000,
|
26 |
+
"total_flos": 1.1080848497912578e+19,
|
27 |
+
"train_batch_size": 4,
|
28 |
"trial_name": null,
|
29 |
"trial_params": null
|
30 |
}
|
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4856
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0a8b420468056bb42f40608b089d2f99c12e63a02141099c5c66c2c659044452
|
3 |
size 4856
|