arampacha committed on
Commit cb469f6
1 Parent(s): 28582fe

trained model 1

.gitattributes CHANGED
@@ -26,3 +26,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zstandard filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  language_model/5gram.bin filter=lfs diff=lfs merge=lfs -text
+ pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,81 @@
+ ---
+ language:
+ - hy-AM
+ license: apache-2.0
+ tags:
+ - automatic-speech-recognition
+ - mozilla-foundation/common_voice_8_0
+ - generated_from_trainer
+ datasets:
+ - common_voice
+ model-index:
+ - name: ''
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ #
+
+ This model is a fine-tuned version of [facebook/wav2vec2-xls-r-1b](https://huggingface.co/facebook/wav2vec2-xls-r-1b) on the MOZILLA-FOUNDATION/COMMON_VOICE_8_0 - HY-AM dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.4521
+ - Wer: 0.5141
+ - Cer: 0.1100
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 8e-05
+ - train_batch_size: 16
+ - eval_batch_size: 64
+ - seed: 42
+ - gradient_accumulation_steps: 8
+ - total_train_batch_size: 128
+ - optimizer: Adam with betas=(0.9,0.98) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - lr_scheduler_warmup_ratio: 0.1
+ - training_steps: 1400
+ - mixed_precision_training: Native AMP
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Wer | Cer |
+ |:-------------:|:------:|:----:|:---------------:|:------:|:------:|
+ | 6.1298 | 19.87 | 100 | 3.1204 | 1.0 | 1.0 |
+ | 2.7269 | 39.87 | 200 | 0.6200 | 0.7592 | 0.1755 |
+ | 1.4643 | 59.87 | 300 | 0.4796 | 0.5921 | 0.1277 |
+ | 1.1242 | 79.87 | 400 | 0.4637 | 0.5359 | 0.1145 |
+ | 0.9592 | 99.87 | 500 | 0.4521 | 0.5141 | 0.1100 |
+ | 0.8704 | 119.87 | 600 | 0.4736 | 0.4914 | 0.1045 |
+ | 0.7908 | 139.87 | 700 | 0.5394 | 0.5250 | 0.1124 |
+ | 0.7049 | 159.87 | 800 | 0.4822 | 0.4754 | 0.0985 |
+ | 0.6299 | 179.87 | 900 | 0.4890 | 0.4809 | 0.1028 |
+ | 0.5832 | 199.87 | 1000 | 0.5233 | 0.4813 | 0.1028 |
+ | 0.5145 | 219.87 | 1100 | 0.5350 | 0.4781 | 0.0994 |
+ | 0.4604 | 239.87 | 1200 | 0.5223 | 0.4715 | 0.0984 |
+ | 0.4226 | 259.87 | 1300 | 0.5167 | 0.4625 | 0.0953 |
+ | 0.3946 | 279.87 | 1400 | 0.5248 | 0.4614 | 0.0950 |
+
+
+ ### Framework versions
+
+ - Transformers 4.17.0.dev0
+ - Pytorch 1.10.2+cu102
+ - Datasets 1.18.2.dev0
+ - Tokenizers 0.11.0
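
For reference (not part of this commit), a minimal inference sketch for the checkpoint added here. The repo id below is a placeholder for wherever this model is published; `Wav2Vec2ProcessorWithLM` also loads the 5-gram LM stored under `language_model/` for beam-search decoding (this requires `pyctcdecode` and `kenlm`).

```python
# Illustrative only, not shipped in this commit. The repo id is a placeholder.
import torch
from datasets import Audio, load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM

model_id = "arampacha/wav2vec2-xls-r-1b-hy"  # placeholder: use the actual repo id of this checkpoint

# The processor also picks up the 5-gram LM under language_model/ (needs pyctcdecode + kenlm).
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id).eval()

# One Armenian Common Voice 8.0 test sample, resampled to the model's 16 kHz rate.
ds = load_dataset("mozilla-foundation/common_voice_8_0", "hy-AM", split="test", use_auth_token=True)
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
audio = ds[0]["audio"]["array"]

inputs = processor(audio, sampling_rate=16_000, return_tensors="pt")
with torch.no_grad():
    logits = model(inputs.input_values).logits

# LM-boosted decoding; batch_decode expects numpy logits.
print(processor.batch_decode(logits.numpy()).text[0])
```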
all_results.json ADDED
@@ -0,0 +1,15 @@
+ {
+ "epoch": 279.87,
+ "eval_cer": 0.1099645928174001,
+ "eval_loss": 0.452116459608078,
+ "eval_runtime": 15.4676,
+ "eval_samples": 335,
+ "eval_samples_per_second": 21.658,
+ "eval_steps_per_second": 0.388,
+ "eval_wer": 0.5140515222482436,
+ "train_loss": 1.2697014454432896,
+ "train_runtime": 17182.2968,
+ "train_samples": 728,
+ "train_samples_per_second": 10.429,
+ "train_steps_per_second": 0.081
+ }
config.json ADDED
@@ -0,0 +1,107 @@
+ {
+ "_name_or_path": "facebook/wav2vec2-xls-r-1b",
+ "activation_dropout": 0.1,
+ "adapter_kernel_size": 3,
+ "adapter_stride": 2,
+ "add_adapter": false,
+ "apply_spec_augment": true,
+ "architectures": [
+ "Wav2Vec2ForCTC"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "classifier_proj_size": 256,
+ "codevector_dim": 1024,
+ "contrastive_logits_temperature": 0.1,
+ "conv_bias": true,
+ "conv_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512
+ ],
+ "conv_kernel": [
+ 10,
+ 3,
+ 3,
+ 3,
+ 3,
+ 2,
+ 2
+ ],
+ "conv_stride": [
+ 5,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2
+ ],
+ "ctc_loss_reduction": "mean",
+ "ctc_zero_infinity": false,
+ "diversity_loss_weight": 0.1,
+ "do_stable_layer_norm": true,
+ "eos_token_id": 2,
+ "feat_extract_activation": "gelu",
+ "feat_extract_dropout": 0.0,
+ "feat_extract_norm": "layer",
+ "feat_proj_dropout": 0.0,
+ "feat_quantizer_dropout": 0.0,
+ "final_dropout": 0.0,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 1280,
+ "initializer_range": 0.02,
+ "intermediate_size": 5120,
+ "layer_norm_eps": 1e-05,
+ "layerdrop": 0.1,
+ "mask_feature_length": 64,
+ "mask_feature_min_masks": 0,
+ "mask_feature_prob": 0.25,
+ "mask_time_length": 10,
+ "mask_time_min_masks": 2,
+ "mask_time_prob": 0.75,
+ "model_type": "wav2vec2",
+ "num_adapter_layers": 3,
+ "num_attention_heads": 16,
+ "num_codevector_groups": 2,
+ "num_codevectors_per_group": 320,
+ "num_conv_pos_embedding_groups": 16,
+ "num_conv_pos_embeddings": 128,
+ "num_feat_extract_layers": 7,
+ "num_hidden_layers": 48,
+ "num_negatives": 100,
+ "output_hidden_size": 1280,
+ "pad_token_id": 41,
+ "proj_codevector_dim": 1024,
+ "tdnn_dilation": [
+ 1,
+ 2,
+ 3,
+ 1,
+ 1
+ ],
+ "tdnn_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 1500
+ ],
+ "tdnn_kernel": [
+ 5,
+ 3,
+ 3,
+ 1,
+ 1
+ ],
+ "torch_dtype": "float32",
+ "transformers_version": "4.17.0.dev0",
+ "use_weighted_layer_sum": false,
+ "vocab_size": 44,
+ "xvector_output_dim": 512
+ }
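
A small sketch (not part of the commit) for inspecting the config above, assuming `config.json` sits in the working directory; it pulls out the fields most relevant to this fine-tune, the CTC vocabulary size and the SpecAugment masking settings.

```python
# Sketch: inspect the committed config.json (assumes it is in the working directory).
from transformers import Wav2Vec2Config

config = Wav2Vec2Config.from_json_file("config.json")

# CTC head size must match the tokenizer's vocabulary (44 symbols here, pad id 41).
print(config.vocab_size, config.pad_token_id)

# SpecAugment settings used for this fine-tune: 75% time masking, 25% feature masking.
print(config.apply_spec_augment, config.mask_time_prob, config.mask_feature_prob)
```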
eval.py ADDED
@@ -0,0 +1,134 @@
+ #!/usr/bin/env python3
+ import argparse
+ import re
+ from typing import Dict
+
+ import torch
+ from datasets import Audio, Dataset, load_dataset, load_metric
+
+ from transformers import AutoFeatureExtractor, pipeline, Wav2Vec2ProcessorWithLM
+
+
+ def log_results(result: Dataset, args: Dict[str, str]):
+     """DO NOT CHANGE. This function computes and logs the result metrics."""
+
+     log_outputs = args.log_outputs
+     dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])
+
+     # load metric
+     wer = load_metric("wer")
+     cer = load_metric("cer")
+
+     # compute metrics
+     wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
+     cer_result = cer.compute(references=result["target"], predictions=result["prediction"])
+
+     # print & log results
+     result_str = f"WER: {wer_result}\n" f"CER: {cer_result}"
+     print(result_str)
+
+     with open(f"{dataset_id}_eval_results.txt", "w") as f:
+         f.write(result_str)
+
+     # log all results in text file. Possibly interesting for analysis
+     if log_outputs is not None:
+         pred_file = f"log_{dataset_id}_predictions.txt"
+         target_file = f"log_{dataset_id}_targets.txt"
+
+         with open(pred_file, "w") as p, open(target_file, "w") as t:
+
+             # mapping function to write output
+             def write_to_file(batch, i):
+                 p.write(f"{i}" + "\n")
+                 p.write(batch["prediction"] + "\n")
+                 t.write(f"{i}" + "\n")
+                 t.write(batch["target"] + "\n")
+
+             result.map(write_to_file, with_indices=True)
+
+
+ def normalize_text(text: str) -> str:
+     """This function normalizes the target text."""
+
+     chars_to_ignore_regex = re.compile("[^\sաբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆև]")
+     text = re.sub(chars_to_ignore_regex, "", text.lower())
+     text = " ".join(text.split())
+
+     return text
+
+
+ def main(args):
+     # load dataset
+     dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
+
+     # for testing: only process the first two examples as a test
+     # dataset = dataset.select(range(10))
+
+     # load processor
+     # feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
+     # sampling_rate = feature_extractor.sampling_rate
+     processor = Wav2Vec2ProcessorWithLM.from_pretrained(args.model_id)
+
+     # resample audio
+     dataset = dataset.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))
+
+     # load eval pipeline
+     if args.device is None:
+         args.device = 0 if torch.cuda.is_available() else -1
+     asr = pipeline(
+         "automatic-speech-recognition", model=args.model_id, device=args.device,
+         feature_extractor=processor.feature_extractor, decoder=processor.decoder
+     )
+
+     # map function to decode audio
+     def map_to_pred(batch):
+         prediction = asr(
+             batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
+         )
+
+         batch["prediction"] = prediction["text"]
+         batch["target"] = normalize_text(batch["sentence"])
+         return batch
+
+     # run inference on all examples
+     result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
+
+     # compute and log_results
+     # do not change function below
+     log_results(result, args)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument(
+         "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
+     )
+     parser.add_argument(
+         "--dataset",
+         type=str,
+         required=True,
+         help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets",
+     )
+     parser.add_argument(
+         "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
+     )
+     parser.add_argument("--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`")
+     parser.add_argument(
+         "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to 5 seconds."
+     )
+     parser.add_argument(
+         "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to 1 second."
+     )
+     parser.add_argument(
+         "--log_outputs", action="store_true", help="If defined, write outputs to log file for analysis."
+     )
+     parser.add_argument(
+         "--device",
+         type=int,
+         default=None,
+         help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
+     )
+     args = parser.parse_args()
+
+     main(args)
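
eval.py scores predictions with WER and CER after lower-casing and stripping everything outside whitespace and the Armenian alphabet. A self-contained sketch (not part of the commit) of that scoring path on made-up strings, with no audio or checkpoint needed:

```python
# Sketch of the scoring path used by eval.py, on dummy strings.
import re
from datasets import load_metric

def normalize_text(text: str) -> str:
    # keep only whitespace and Armenian letters, as in eval.py
    chars_to_ignore_regex = re.compile(r"[^\sաբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆև]")
    text = re.sub(chars_to_ignore_regex, "", text.lower())
    return " ".join(text.split())

predictions = [normalize_text("բարև ձեզ")]
references = [normalize_text("Բարև, Ձեզ!")]  # punctuation and case are removed before scoring

wer = load_metric("wer").compute(predictions=predictions, references=references)
cer = load_metric("cer").compute(predictions=predictions, references=references)
print(f"WER: {wer}  CER: {cer}")
```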
eval_results.json ADDED
@@ -0,0 +1,10 @@
+ {
+ "epoch": 279.87,
+ "eval_cer": 0.1099645928174001,
+ "eval_loss": 0.452116459608078,
+ "eval_runtime": 15.4676,
+ "eval_samples": 335,
+ "eval_samples_per_second": 21.658,
+ "eval_steps_per_second": 0.388,
+ "eval_wer": 0.5140515222482436
+ }
preprocessor_config.json CHANGED
@@ -3,8 +3,8 @@
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
- "padding_value": 0.0,
- "processor_class": "Wav2Vec2ProcessorWithLM",
+ "padding_value": 0,
  "return_attention_mask": true,
- "sampling_rate": 16000
+ "sampling_rate": 16000,
+ "processor_class": "Wav2Vec2ProcessorWithLM"
  }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f8978ee6447aa667cea589211c87f8fb8e06cc6854f68043d8ea9c89baaee13
+ size 3850538161
run.sh CHANGED
@@ -4,6 +4,7 @@ python run_speech_recognition_ctc.py \
  --model_name_or_path="facebook/wav2vec2-xls-r-1b" \
  --tokenizer_name_or_path="./" \
  --output_dir="./" \
+ --overwrite_output_dir \
  --max_steps 1400 \
  --per_device_train_batch_size="16" \
  --per_device_eval_batch_size="64" \
@@ -18,7 +19,6 @@ python run_speech_recognition_ctc.py \
  --save_steps="100" \
  --eval_steps="100" \
  --logging_steps="100" \
- --eval_metrics="wer cer" \
  --save_total_limit="2" \
  --freeze_feature_encoder \
  --layerdrop="0.1" \
@@ -35,6 +35,6 @@ python run_speech_recognition_ctc.py \
  --do_train --do_eval \
  --load_best_model_at_end \
  --report_to all \
- --run_name xlsr-hy-cv-1b-1 \
- --wandb_project xlsr-hy \
+ --run_name="xlsr-hy-cv-1b-1" \
+ --wandb_project="xlsr-hy" \
  --bnb --tristage_sched
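
`--bnb` and `--tristage_sched` are flags of the modified training script rather than the stock HF example, and their implementation is not part of this commit. Assuming `--tristage_sched` enables a wav2vec 2.0-style tri-stage schedule (linear warmup, hold at the peak rate, then decay), which is consistent with the learning rates logged in trainer_state.json below, a rough sketch of such a scheduler:

```python
# Rough sketch of a tri-stage LR schedule (warmup / hold / decay).
# Assumption: this approximates what the custom --tristage_sched flag enables;
# the actual implementation lives in the modified run_speech_recognition_ctc.py.
from torch.optim.lr_scheduler import LambdaLR

def get_tristage_schedule(optimizer, total_steps, warmup_frac=0.1, hold_frac=0.4):
    warmup_steps = int(total_steps * warmup_frac)
    hold_steps = int(total_steps * hold_frac)
    decay_steps = max(1, total_steps - warmup_steps - hold_steps)

    def lr_lambda(step):
        if step < warmup_steps:
            return step / max(1, warmup_steps)  # linear warmup to the peak rate
        if step < warmup_steps + hold_steps:
            return 1.0  # hold at the peak rate
        return max(0.0, 1.0 - (step - warmup_steps - hold_steps) / decay_steps)  # linear decay

    return LambdaLR(optimizer, lr_lambda)
```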
run_speech_recognition_ctc.py CHANGED
@@ -192,7 +192,7 @@ class DataTrainingArguments:
  metadata={"help": "A list of characters to remove from the transcripts."},
  )
  eval_metrics: List[str] = list_field(
- default=["wer"],
+ default=["wer", "cer"],
  metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
  )
  max_duration_in_seconds: float = field(
@@ -521,9 +521,9 @@
 
  vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
 
- with training_args.main_process_first():
- if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
- os.remove(vocab_file)
+ # with training_args.main_process_first():
+ # if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
+ # os.remove(vocab_file)
 
  with training_args.main_process_first(desc="dataset map vocabulary creation"):
  if not os.path.isfile(vocab_file):
@@ -685,8 +685,8 @@
  # Now save everything to be able to create a single processor later
  if is_main_process(training_args.local_rank):
  # save feature extractor, tokenizer and config
- feature_extractor.save_pretrained(training_args.output_dir)
- tokenizer.save_pretrained(training_args.output_dir)
+ # feature_extractor.save_pretrained(training_args.output_dir)
+ # tokenizer.save_pretrained(training_args.output_dir)
  config.save_pretrained(training_args.output_dir)
 
  try:
runs/Jan29_18-11-09_job-b1f4681b-d20d-47f2-af64-0c1734f4ff64/1643479909.8714664/events.out.tfevents.1643479909.job-b1f4681b-d20d-47f2-af64-0c1734f4ff64.6189.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dc84b6b9c7e039a23d8ea0de837aed93d5a56a472944aae8a0a5513d579eb0d1
+ size 4772
runs/Jan29_18-11-09_job-b1f4681b-d20d-47f2-af64-0c1734f4ff64/events.out.tfevents.1643479909.job-b1f4681b-d20d-47f2-af64-0c1734f4ff64.6189.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76b421bd9dff985180762874b1d03e350411b7d1d329716b241946a206fbdddd
+ size 12354
runs/Jan29_18-11-09_job-b1f4681b-d20d-47f2-af64-0c1734f4ff64/events.out.tfevents.1643497112.job-b1f4681b-d20d-47f2-af64-0c1734f4ff64.6189.2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fad62cf8a733fcb9bcef04edf23c64c03b02fe5fb12fd25b4744b660931c8729
+ size 405
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 279.87,
+ "train_loss": 1.2697014454432896,
+ "train_runtime": 17182.2968,
+ "train_samples": 728,
+ "train_samples_per_second": 10.429,
+ "train_steps_per_second": 0.081
+ }
trainer_state.json ADDED
@@ -0,0 +1,249 @@
+ {
+ "best_metric": 0.452116459608078,
+ "best_model_checkpoint": "./checkpoint-500",
+ "epoch": 279.8695652173913,
+ "global_step": 1400,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 19.87,
+ "learning_rate": 5.6240000000000004e-05,
+ "loss": 6.1298,
+ "step": 100
+ },
+ {
+ "epoch": 19.87,
+ "eval_cer": 1.0,
+ "eval_loss": 3.120361804962158,
+ "eval_runtime": 16.2509,
+ "eval_samples_per_second": 20.614,
+ "eval_steps_per_second": 0.369,
+ "eval_wer": 1.0,
+ "step": 100
+ },
+ {
+ "epoch": 39.87,
+ "learning_rate": 8e-05,
+ "loss": 2.7269,
+ "step": 200
+ },
+ {
+ "epoch": 39.87,
+ "eval_cer": 0.17546788062721294,
+ "eval_loss": 0.6199544668197632,
+ "eval_runtime": 17.4782,
+ "eval_samples_per_second": 19.167,
+ "eval_steps_per_second": 0.343,
+ "eval_wer": 0.7591725214676034,
+ "step": 200
+ },
+ {
+ "epoch": 59.87,
+ "learning_rate": 8e-05,
+ "loss": 1.4643,
+ "step": 300
+ },
+ {
+ "epoch": 59.87,
+ "eval_cer": 0.12771876580677793,
+ "eval_loss": 0.4795631468296051,
+ "eval_runtime": 15.5884,
+ "eval_samples_per_second": 21.49,
+ "eval_steps_per_second": 0.385,
+ "eval_wer": 0.5921155347384855,
+ "step": 300
+ },
+ {
+ "epoch": 79.87,
+ "learning_rate": 8e-05,
+ "loss": 1.1242,
+ "step": 400
+ },
+ {
+ "epoch": 79.87,
+ "eval_cer": 0.11451694486595852,
+ "eval_loss": 0.463740736246109,
+ "eval_runtime": 15.4298,
+ "eval_samples_per_second": 21.711,
+ "eval_steps_per_second": 0.389,
+ "eval_wer": 0.5359094457455114,
+ "step": 400
+ },
+ {
+ "epoch": 99.87,
+ "learning_rate": 8e-05,
+ "loss": 0.9592,
+ "step": 500
+ },
+ {
+ "epoch": 99.87,
+ "eval_cer": 0.1099645928174001,
+ "eval_loss": 0.452116459608078,
+ "eval_runtime": 15.3829,
+ "eval_samples_per_second": 21.777,
+ "eval_steps_per_second": 0.39,
+ "eval_wer": 0.5140515222482436,
+ "step": 500
+ },
+ {
+ "epoch": 119.87,
+ "learning_rate": 8e-05,
+ "loss": 0.8704,
+ "step": 600
+ },
+ {
+ "epoch": 119.87,
+ "eval_cer": 0.10450177035913,
+ "eval_loss": 0.4736480712890625,
+ "eval_runtime": 15.4613,
+ "eval_samples_per_second": 21.667,
+ "eval_steps_per_second": 0.388,
+ "eval_wer": 0.49141295862607337,
+ "step": 600
+ },
+ {
+ "epoch": 139.87,
+ "learning_rate": 8e-05,
+ "loss": 0.7908,
+ "step": 700
+ },
+ {
+ "epoch": 139.87,
+ "eval_cer": 0.11244309559939301,
+ "eval_loss": 0.539383053779602,
+ "eval_runtime": 15.4463,
+ "eval_samples_per_second": 21.688,
+ "eval_steps_per_second": 0.388,
+ "eval_wer": 0.5249804839968775,
+ "step": 700
+ },
+ {
+ "epoch": 159.87,
+ "learning_rate": 6.936e-05,
+ "loss": 0.7049,
+ "step": 800
+ },
+ {
+ "epoch": 159.87,
+ "eval_cer": 0.09848254931714719,
+ "eval_loss": 0.48218029737472534,
+ "eval_runtime": 15.323,
+ "eval_samples_per_second": 21.863,
+ "eval_steps_per_second": 0.392,
+ "eval_wer": 0.47540983606557374,
+ "step": 800
+ },
+ {
+ "epoch": 179.87,
+ "learning_rate": 5.850285714285715e-05,
+ "loss": 0.6299,
+ "step": 900
+ },
+ {
+ "epoch": 179.87,
+ "eval_cer": 0.1028325746079919,
+ "eval_loss": 0.48903265595436096,
+ "eval_runtime": 15.3742,
+ "eval_samples_per_second": 21.79,
+ "eval_steps_per_second": 0.39,
+ "eval_wer": 0.4808743169398907,
+ "step": 900
+ },
+ {
+ "epoch": 199.87,
+ "learning_rate": 4.76457142857143e-05,
+ "loss": 0.5832,
+ "step": 1000
+ },
+ {
+ "epoch": 199.87,
+ "eval_cer": 0.10278199291856348,
+ "eval_loss": 0.5233051180839539,
+ "eval_runtime": 15.4506,
+ "eval_samples_per_second": 21.682,
+ "eval_steps_per_second": 0.388,
+ "eval_wer": 0.4812646370023419,
+ "step": 1000
+ },
+ {
+ "epoch": 219.87,
+ "learning_rate": 3.6788571428571434e-05,
+ "loss": 0.5145,
+ "step": 1100
+ },
+ {
+ "epoch": 219.87,
+ "eval_cer": 0.09939301972685888,
+ "eval_loss": 0.5349759459495544,
+ "eval_runtime": 15.4699,
+ "eval_samples_per_second": 21.655,
+ "eval_steps_per_second": 0.388,
+ "eval_wer": 0.4781420765027322,
+ "step": 1100
+ },
+ {
+ "epoch": 239.87,
+ "learning_rate": 2.5931428571428576e-05,
+ "loss": 0.4604,
+ "step": 1200
+ },
+ {
+ "epoch": 239.87,
+ "eval_cer": 0.09838138593829034,
+ "eval_loss": 0.5222976803779602,
+ "eval_runtime": 15.1676,
+ "eval_samples_per_second": 22.087,
+ "eval_steps_per_second": 0.396,
+ "eval_wer": 0.4715066354410617,
+ "step": 1200
+ },
+ {
+ "epoch": 259.87,
+ "learning_rate": 1.5074285714285721e-05,
+ "loss": 0.4226,
+ "step": 1300
+ },
+ {
+ "epoch": 259.87,
+ "eval_cer": 0.0952959028831563,
+ "eval_loss": 0.5167204737663269,
+ "eval_runtime": 15.5392,
+ "eval_samples_per_second": 21.558,
+ "eval_steps_per_second": 0.386,
+ "eval_wer": 0.46252927400468385,
+ "step": 1300
+ },
+ {
+ "epoch": 279.87,
+ "learning_rate": 4.217142857142858e-06,
+ "loss": 0.3946,
+ "step": 1400
+ },
+ {
+ "epoch": 279.87,
+ "eval_cer": 0.09504299443601416,
+ "eval_loss": 0.5248004794120789,
+ "eval_runtime": 15.5024,
+ "eval_samples_per_second": 21.61,
+ "eval_steps_per_second": 0.387,
+ "eval_wer": 0.4613583138173302,
+ "step": 1400
+ },
+ {
+ "epoch": 279.87,
+ "step": 1400,
+ "total_flos": 1.252060110838857e+20,
+ "train_loss": 1.2697014454432896,
+ "train_runtime": 17182.2968,
+ "train_samples_per_second": 10.429,
+ "train_steps_per_second": 0.081
+ }
+ ],
+ "max_steps": 1400,
+ "num_train_epochs": 280,
+ "total_flos": 1.252060110838857e+20,
+ "trial_name": null,
+ "trial_params": null
+ }
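
`log_history` above interleaves training-loss and evaluation entries. A small standard-library sketch (not part of the commit) that extracts the evaluation curve, e.g. to see that the lowest eval loss lands at step 500 (the `best_model_checkpoint`) while WER and CER keep improving through step 1400:

```python
# Sketch: extract the evaluation curve from the committed trainer_state.json.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the evaluation entries (they carry eval_* keys).
evals = [e for e in state["log_history"] if "eval_wer" in e]

for e in evals:
    print(f"step {e['step']:>4}  loss {e['eval_loss']:.4f}  WER {e['eval_wer']:.4f}  CER {e['eval_cer']:.4f}")

best = min(evals, key=lambda e: e["eval_wer"])
print("lowest WER:", best["eval_wer"], "at step", best["step"])  # ~0.4614 at step 1400
```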
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:86c9d49bcd1768ee2043ea2c5e88a280c8d1fe28001131ac6b51202415414f57
+ size 3055