jsnfly committed on
Commit b374f58 (1 parent: 8c5006d)

add files and evaluate

README.md ADDED
@@ -0,0 +1,43 @@
+ ---
+ language:
+ - de
+ license: apache-2.0
+ tags:
+ - automatic-speech-recognition
+ - mozilla-foundation/common_voice_8_0
+ - de
+ - robust-speech-event
+ datasets:
+ - mozilla-foundation/common_voice_8_0
+ model-index:
+ - name: XLS-R-1B - German
+   results:
+   - task:
+       name: Automatic Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: Common Voice 8
+       type: mozilla-foundation/common_voice_8_0
+       args: de
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 11.37
+     - name: Test CER
+       type: cer
+       value: 2.89
+   - task:
+       name: Automatic Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: Robust Speech Event - Dev Data
+       type: speech-recognition-community-v2/dev_data
+       args: de
+     metrics:
+     - name: Dev WER
+       type: wer
+       value: 31.16
+     - name: Dev CER
+       type: cer
+       value: 13.41
+ ---
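The metrics in the model-index above come from the evaluation artifacts added in this commit (the *_eval_results.txt files further down). A minimal setup sketch for reproducing them, assuming the repository id of this model is substituted for <user>/<repo> (it is not part of this commit) and that the listed packages are indeed the ones required:

git lfs install
git clone https://huggingface.co/<user>/<repo>   # <user>/<repo> is a placeholder for this model's actual repo id
cd <repo>
pip install torch transformers datasets jiwer     # assumed dependencies; jiwer backs the "wer"/"cer" metrics
huggingface-cli login                             # Common Voice 8 is gated and needs an authenticated token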
added_tokens.json ADDED
@@ -0,0 +1 @@
+ {"<s>": 191, "</s>": 192}
all_results.json ADDED
@@ -0,0 +1,14 @@
+ {
+     "epoch": 2.0,
+     "eval_loss": 0.11697383970022202,
+     "eval_runtime": 1050.8284,
+     "eval_samples": 16007,
+     "eval_samples_per_second": 15.233,
+     "eval_steps_per_second": 1.904,
+     "eval_wer": 0.1117051786299574,
+     "train_loss": 0.27212924943281636,
+     "train_runtime": 112594.4054,
+     "train_samples": 436168,
+     "train_samples_per_second": 7.748,
+     "train_steps_per_second": 0.242
+ }
eval.py ADDED
@@ -0,0 +1,125 @@
+ #!/usr/bin/env python3
+ from datasets import load_dataset, load_metric, Audio, Dataset
+ from transformers import pipeline, AutoFeatureExtractor
+ import re
+ import argparse
+ import unicodedata
+ from typing import Dict
+
+
+ def log_results(result: Dataset, args: Dict[str, str]):
+     """ DO NOT CHANGE. This function computes and logs the result metrics. """
+
+     log_outputs = args.log_outputs
+     dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])
+
+     # load metric
+     wer = load_metric("wer")
+     cer = load_metric("cer")
+
+     # compute metrics
+     wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
+     cer_result = cer.compute(references=result["target"], predictions=result["prediction"])
+
+     # print & log results
+     result_str = (
+         f"WER: {wer_result}\n"
+         f"CER: {cer_result}"
+     )
+     print(result_str)
+
+     with open(f"{dataset_id}_eval_results.txt", "w") as f:
+         f.write(result_str)
+
+     # log all results in text file. Possibly interesting for analysis
+     if log_outputs is not None:
+         pred_file = f"log_{dataset_id}_predictions.txt"
+         target_file = f"log_{dataset_id}_targets.txt"
+
+         with open(pred_file, "w") as p, open(target_file, "w") as t:
+
+             # mapping function to write output
+             def write_to_file(batch, i):
+                 p.write(f"{i}" + "\n")
+                 p.write(batch["prediction"] + "\n")
+                 t.write(f"{i}" + "\n")
+                 t.write(batch["target"] + "\n")
+
+             result.map(write_to_file, with_indices=True)
+
+
+ def normalize_text(text: str) -> str:
+     """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
+
+     # From https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-german.
+     CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
+                        "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
+                        "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
+                        "、", "﹂", "﹁", "‧", "~", "﹏", ",", "{", "}", "(", ")", "[", "]", "【", "】", "‥", "〽",
+                        "『", "』", "〝", "〟", "⟨", "⟩", "〜", ":", "!", "?", "♪", "؛", "/", "\\", "º", "−", "^", "ʻ", "ˆ"]
+     chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"
+     text = re.sub(chars_to_ignore_regex, "", text.lower())
+
+     return " ".join(text.split())
+
+
+ def main(args):
+     # load dataset
+     dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
+
+     # # for testing: only process the first two examples as a test
+     # dataset = dataset.select(range(10))
+
+     # load processor
+     feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
+     sampling_rate = feature_extractor.sampling_rate
+
+     # resample audio
+     dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
+
+     # load eval pipeline
+     asr = pipeline("automatic-speech-recognition", model=args.model_id, device=0)
+
+     # map function to decode audio
+     def map_to_pred(batch):
+         prediction = asr(batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s)
+
+         batch["prediction"] = prediction["text"]
+         batch["target"] = normalize_text(batch["sentence"])
+         return batch
+
+     # run inference on all examples
+     result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
+
+     # compute and log_results
+     # do not change function below
+     log_results(result, args)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument(
+         "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
+     )
+     parser.add_argument(
+         "--dataset", type=str, required=True, help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets"
+     )
+     parser.add_argument(
+         "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
+     )
+     parser.add_argument(
+         "--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
+     )
+     parser.add_argument(
+         "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds."
+     )
+     parser.add_argument(
+         "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds."
+     )
+     parser.add_argument(
+         "--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."
+     )
+     args = parser.parse_args()
+
+     main(args)
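The two *_eval_results.txt files further down were presumably produced with this script. A sketch of the invocations, assuming the repository id of this model is substituted for <model-id> (it is not shown in this commit) and, for the long dev-data recordings, the 5.0 s / 1.0 s chunking values suggested by the argument help texts above:

# Common Voice 8, German test split -> mozilla-foundation_common_voice_8_0_de_test_eval_results.txt
python eval.py --model_id <model-id> --dataset mozilla-foundation/common_voice_8_0 --config de --split test --log_outputs

# Robust Speech Event dev data -> speech-recognition-community-v2_dev_data_de_validation_eval_results.txt
# chunk/stride values are an assumption taken from the help texts, not recorded in this commit
python eval.py --model_id <model-id> --dataset speech-recognition-community-v2/dev_data --config de --split validation --chunk_length_s 5.0 --stride_length_s 1.0 --log_outputs

Note that log_results writes WER/CER as fractions; the model-index in README.md reports the same numbers multiplied by 100 (0.1137 → 11.37, 0.0289 → 2.89).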
eval_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "epoch": 2.0,
+     "eval_loss": 0.11697383970022202,
+     "eval_runtime": 1050.8284,
+     "eval_samples": 16007,
+     "eval_samples_per_second": 15.233,
+     "eval_steps_per_second": 1.904,
+     "eval_wer": 0.1117051786299574
+ }
log_mozilla-foundation_common_voice_8_0_de_test_predictions.txt ADDED
The diff for this file is too large to render. See raw diff
log_mozilla-foundation_common_voice_8_0_de_test_targets.txt ADDED
The diff for this file is too large to render. See raw diff
log_speech-recognition-community-v2_dev_data_de_validation_predictions.txt ADDED
The diff for this file is too large to render. See raw diff
log_speech-recognition-community-v2_dev_data_de_validation_targets.txt ADDED
The diff for this file is too large to render. See raw diff
mozilla-foundation_common_voice_8_0_de_test_eval_results.txt ADDED
@@ -0,0 +1,2 @@
+ WER: 0.11368385114373507
+ CER: 0.028929306965087716
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0,
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
run.sh ADDED
@@ -0,0 +1,33 @@
+ python run_speech_recognition_ctc.py \
+ --dataset_name="mozilla-foundation/common_voice_8_0" \
+ --model_name_or_path="facebook/wav2vec2-xls-r-1b" \
+ --dataset_config_name="de" \
+ --output_dir="./" \
+ --overwrite_output_dir \
+ --num_train_epochs="2" \
+ --per_device_train_batch_size="8" \
+ --per_device_eval_batch_size="8" \
+ --gradient_accumulation_steps="4" \
+ --learning_rate="7e-5" \
+ --lr_scheduler_type="cosine" \
+ --warmup_steps="10" \
+ --length_column_name="input_length" \
+ --evaluation_strategy="steps" \
+ --text_column_name="sentence" \
+ --chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
+ --save_steps="1500" \
+ --eval_steps="1500" \
+ --logging_steps="100" \
+ --save_total_limit="2" \
+ --freeze_feature_encoder \
+ --attention_dropout="0.15" \
+ --hidden_dropout="0.15" \
+ --mask_time_prob="0.15" \
+ --mask_time_length="10" \
+ --mask_feature_prob="0.35" \
+ --mask_feature_length="64" \
+ --gradient_checkpointing \
+ --use_auth_token \
+ --fp16 \
+ --group_by_length \
+ --do_train --do_eval
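As a rough consistency check (assuming a single GPU, which the logged numbers support): per_device_train_batch_size 8 × gradient_accumulation_steps 4 gives an effective batch of 32 samples per optimizer step, so 436168 training samples × 2 epochs / 32 ≈ 27260 steps. This lines up with the ~27000 global steps at epoch 1.98 in trainer_state.json and with 0.242 train_steps_per_second × 112594 s ≈ 27248 steps in train_results.json.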
run_speech_recognition_ctc.py ADDED
@@ -0,0 +1,737 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition"""
17
+
18
+ import functools
19
+ import json
20
+ import logging
21
+ import os
22
+ import re
23
+ import sys
24
+ import warnings
25
+ from dataclasses import dataclass, field
26
+ from typing import Dict, List, Optional, Union
27
+
28
+ import datasets
29
+ import numpy as np
30
+ import torch
31
+ from datasets import DatasetDict, load_dataset, load_metric
32
+
33
+ import transformers
34
+ from transformers import (
35
+ AutoConfig,
36
+ AutoFeatureExtractor,
37
+ AutoModelForCTC,
38
+ AutoProcessor,
39
+ AutoTokenizer,
40
+ HfArgumentParser,
41
+ Trainer,
42
+ TrainingArguments,
43
+ Wav2Vec2Processor,
44
+ set_seed,
45
+ )
46
+ from transformers.trainer_utils import get_last_checkpoint, is_main_process
47
+ from transformers.utils import check_min_version
48
+ from transformers.utils.versions import require_version
49
+
50
+
51
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
52
+ check_min_version("4.17.0.dev0")
53
+
54
+ require_version("datasets>=1.13.3", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
55
+
56
+
57
+ logger = logging.getLogger(__name__)
58
+
59
+
60
+ def list_field(default=None, metadata=None):
61
+ return field(default_factory=lambda: default, metadata=metadata)
62
+
63
+
64
+ @dataclass
65
+ class ModelArguments:
66
+ """
67
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
68
+ """
69
+
70
+ model_name_or_path: str = field(
71
+ metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
72
+ )
73
+ tokenizer_name_or_path: Optional[str] = field(
74
+ default=None,
75
+ metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
76
+ )
77
+ cache_dir: Optional[str] = field(
78
+ default=None,
79
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
80
+ )
81
+ freeze_feature_encoder: bool = field(
82
+ default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
83
+ )
84
+ attention_dropout: float = field(
85
+ default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
86
+ )
87
+ activation_dropout: float = field(
88
+ default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
89
+ )
90
+ feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
91
+ hidden_dropout: float = field(
92
+ default=0.0,
93
+ metadata={
94
+ "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
95
+ },
96
+ )
97
+ final_dropout: float = field(
98
+ default=0.0,
99
+ metadata={"help": "The dropout probability for the final projection layer."},
100
+ )
101
+ mask_time_prob: float = field(
102
+ default=0.05,
103
+ metadata={
104
+ "help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
105
+ "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
106
+ "vectors will be masked along the time axis."
107
+ },
108
+ )
109
+ mask_time_length: int = field(
110
+ default=10,
111
+ metadata={"help": "Length of vector span to mask along the time axis."},
112
+ )
113
+ mask_feature_prob: float = field(
114
+ default=0.0,
115
+ metadata={
116
+ "help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
117
+ "span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
118
+ },
119
+ )
120
+ mask_feature_length: int = field(
121
+ default=10,
122
+ metadata={"help": "Length of vector span to mask along the feature axis."},
123
+ )
124
+ layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
125
+ ctc_loss_reduction: Optional[str] = field(
126
+ default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
127
+ )
128
+
129
+
130
+ @dataclass
131
+ class DataTrainingArguments:
132
+ """
133
+ Arguments pertaining to what data we are going to input our model for training and eval.
134
+
135
+ Using `HfArgumentParser` we can turn this class
136
+ into argparse arguments to be able to specify them on
137
+ the command line.
138
+ """
139
+
140
+ dataset_name: str = field(
141
+ metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
142
+ )
143
+ dataset_config_name: str = field(
144
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
145
+ )
146
+ train_split_name: str = field(
147
+ default="train+validation",
148
+ metadata={
149
+ "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train+validation'"
150
+ },
151
+ )
152
+ eval_split_name: str = field(
153
+ default="test",
154
+ metadata={
155
+ "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
156
+ },
157
+ )
158
+ audio_column_name: str = field(
159
+ default="audio",
160
+ metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
161
+ )
162
+ text_column_name: str = field(
163
+ default="text",
164
+ metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
165
+ )
166
+ overwrite_cache: bool = field(
167
+ default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
168
+ )
169
+ preprocessing_num_workers: Optional[int] = field(
170
+ default=None,
171
+ metadata={"help": "The number of processes to use for the preprocessing."},
172
+ )
173
+ max_train_samples: Optional[int] = field(
174
+ default=None,
175
+ metadata={
176
+ "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
177
+ "value if set."
178
+ },
179
+ )
180
+ max_eval_samples: Optional[int] = field(
181
+ default=None,
182
+ metadata={
183
+ "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
184
+ "value if set."
185
+ },
186
+ )
187
+ chars_to_ignore: Optional[List[str]] = list_field(
188
+ default=None,
189
+ metadata={"help": "A list of characters to remove from the transcripts."},
190
+ )
191
+ eval_metrics: List[str] = list_field(
192
+ default=["wer"],
193
+ metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
194
+ )
195
+ max_duration_in_seconds: float = field(
196
+ default=20.0,
197
+ metadata={
198
+ "help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
199
+ },
200
+ )
201
+ min_duration_in_seconds: float = field(
202
+ default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
203
+ )
204
+ preprocessing_only: bool = field(
205
+ default=False,
206
+ metadata={
207
+ "help": "Whether to only do data preprocessing and skip training. "
208
+ "This is especially useful when data preprocessing errors out in distributed training due to timeout. "
209
+ "In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
210
+ "so that the cached datasets can consequently be loaded in distributed training"
211
+ },
212
+ )
213
+ use_auth_token: bool = field(
214
+ default=False,
215
+ metadata={
216
+ "help": "If :obj:`True`, will use the token generated when running"
217
+ ":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
218
+ },
219
+ )
220
+ unk_token: str = field(
221
+ default="[UNK]",
222
+ metadata={"help": "The unk token for the tokenizer"},
223
+ )
224
+ pad_token: str = field(
225
+ default="[PAD]",
226
+ metadata={"help": "The padding token for the tokenizer"},
227
+ )
228
+ word_delimiter_token: str = field(
229
+ default="|",
230
+ metadata={"help": "The word delimiter token for the tokenizer"},
231
+ )
232
+ phoneme_language: Optional[str] = field(
233
+ default=None,
234
+ metadata={
235
+ "help": "The target language that should be used be"
236
+ " passed to the tokenizer for tokenization. Note that"
237
+ " this is only relevant if the model classifies the"
238
+ " input audio to a sequence of phoneme sequences."
239
+ },
240
+ )
241
+
242
+
243
+ @dataclass
244
+ class DataCollatorCTCWithPadding:
245
+ """
246
+ Data collator that will dynamically pad the inputs received.
247
+ Args:
248
+ processor (:class:`~transformers.AutoProcessor`)
249
+ The processor used for processing the data.
250
+ padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
251
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
252
+ among:
253
+ * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
254
+ sequence is provided).
255
+ * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
256
+ maximum acceptable input length for the model if that argument is not provided.
257
+ * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
258
+ different lengths).
259
+ max_length (:obj:`int`, `optional`):
260
+ Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
261
+ max_length_labels (:obj:`int`, `optional`):
262
+ Maximum length of the ``labels`` returned list and optionally padding length (see above).
263
+ pad_to_multiple_of (:obj:`int`, `optional`):
264
+ If set will pad the sequence to a multiple of the provided value.
265
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
266
+ 7.5 (Volta).
267
+ """
268
+
269
+ processor: AutoProcessor
270
+ padding: Union[bool, str] = "longest"
271
+ pad_to_multiple_of: Optional[int] = None
272
+ pad_to_multiple_of_labels: Optional[int] = None
273
+
274
+ def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
275
+ # split inputs and labels since they have to be of different lengths and need
276
+ # different padding methods
277
+ input_features = [{"input_values": feature["input_values"]} for feature in features]
278
+ label_features = [{"input_ids": feature["labels"]} for feature in features]
279
+
280
+ batch = self.processor.pad(
281
+ input_features,
282
+ padding=self.padding,
283
+ pad_to_multiple_of=self.pad_to_multiple_of,
284
+ return_tensors="pt",
285
+ )
286
+
287
+ with self.processor.as_target_processor():
288
+ labels_batch = self.processor.pad(
289
+ label_features,
290
+ padding=self.padding,
291
+ pad_to_multiple_of=self.pad_to_multiple_of_labels,
292
+ return_tensors="pt",
293
+ )
294
+
295
+ # replace padding with -100 to ignore loss correctly
296
+ labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
297
+
298
+ batch["labels"] = labels
299
+
300
+ return batch
301
+
302
+
303
+ def create_vocabulary_from_data(
304
+ datasets: DatasetDict,
305
+ word_delimiter_token: Optional[str] = None,
306
+ unk_token: Optional[str] = None,
307
+ pad_token: Optional[str] = None,
308
+ ):
309
+ # Given training and test labels create vocabulary
310
+ def extract_all_chars(batch):
311
+ all_text = " ".join(batch["target_text"])
312
+ vocab = list(set(all_text))
313
+ return {"vocab": [vocab], "all_text": [all_text]}
314
+
315
+ vocabs = datasets.map(
316
+ extract_all_chars,
317
+ batched=True,
318
+ batch_size=-1,
319
+ keep_in_memory=True,
320
+ remove_columns=datasets["train"].column_names,
321
+ )
322
+
323
+ # take union of all unique characters in each dataset
324
+ vocab_set = functools.reduce(
325
+ lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
326
+ )
327
+
328
+ vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}
329
+
330
+ # replace white space with delimiter token
331
+ if word_delimiter_token is not None:
332
+ vocab_dict[word_delimiter_token] = vocab_dict[" "]
333
+ del vocab_dict[" "]
334
+
335
+ # add unk and pad token
336
+ if unk_token is not None:
337
+ vocab_dict[unk_token] = len(vocab_dict)
338
+
339
+ if pad_token is not None:
340
+ vocab_dict[pad_token] = len(vocab_dict)
341
+
342
+ return vocab_dict
343
+
344
+
345
+ def main():
346
+ # See all possible arguments in src/transformers/training_args.py
347
+ # or by passing the --help flag to this script.
348
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
349
+
350
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
351
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
352
+ # If we pass only one argument to the script and it's the path to a json file,
353
+ # let's parse it to get our arguments.
354
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
355
+ else:
356
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
357
+
358
+ # Detecting last checkpoint.
359
+ last_checkpoint = None
360
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
361
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
362
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
363
+ raise ValueError(
364
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
365
+ "Use --overwrite_output_dir to overcome."
366
+ )
367
+ elif last_checkpoint is not None:
368
+ logger.info(
369
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
370
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
371
+ )
372
+
373
+ # Setup logging
374
+ logging.basicConfig(
375
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
376
+ datefmt="%m/%d/%Y %H:%M:%S",
377
+ handlers=[logging.StreamHandler(sys.stdout)],
378
+ )
379
+ logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
380
+
381
+ # Log on each process the small summary:
382
+ logger.warning(
383
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
384
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
385
+ )
386
+ # Set the verbosity to info of the Transformers logger (on main process only):
387
+ if is_main_process(training_args.local_rank):
388
+ transformers.utils.logging.set_verbosity_info()
389
+ logger.info("Training/evaluation parameters %s", training_args)
390
+
391
+ # Set seed before initializing model.
392
+ set_seed(training_args.seed)
393
+
394
+ # 1. First, let's load the dataset
395
+ raw_datasets = DatasetDict()
396
+
397
+ if training_args.do_train:
398
+ raw_datasets["train"] = load_dataset(
399
+ data_args.dataset_name,
400
+ data_args.dataset_config_name,
401
+ split=data_args.train_split_name,
402
+ use_auth_token=data_args.use_auth_token,
403
+ )
404
+
405
+ if data_args.audio_column_name not in raw_datasets["train"].column_names:
406
+ raise ValueError(
407
+ f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
408
+ "Make sure to set `--audio_column_name` to the correct audio column - one of "
409
+ f"{', '.join(raw_datasets['train'].column_names)}."
410
+ )
411
+
412
+ if data_args.text_column_name not in raw_datasets["train"].column_names:
413
+ raise ValueError(
414
+ f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
415
+ "Make sure to set `--text_column_name` to the correct text column - one of "
416
+ f"{', '.join(raw_datasets['train'].column_names)}."
417
+ )
418
+
419
+ if data_args.max_train_samples is not None:
420
+ raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
421
+
422
+ if training_args.do_eval:
423
+ raw_datasets["eval"] = load_dataset(
424
+ data_args.dataset_name,
425
+ data_args.dataset_config_name,
426
+ split=data_args.eval_split_name,
427
+ use_auth_token=data_args.use_auth_token,
428
+ )
429
+
430
+ if data_args.max_eval_samples is not None:
431
+ raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
432
+
433
+ # 2. We remove some special characters from the datasets
434
+ # that make training complicated and do not help in transcribing the speech
435
+ # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
436
+ # that could be easily picked up by the model
437
+ chars_to_ignore_regex = (
438
+ f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
439
+ )
440
+ text_column_name = data_args.text_column_name
441
+
442
+ def remove_special_characters(batch):
443
+ if chars_to_ignore_regex is not None:
444
+ batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
445
+ else:
446
+ batch["target_text"] = batch[text_column_name].lower() + " "
447
+ return batch
448
+
449
+ with training_args.main_process_first(desc="dataset map special characters removal"):
450
+ raw_datasets = raw_datasets.map(
451
+ remove_special_characters,
452
+ remove_columns=[text_column_name],
453
+ desc="remove special characters from datasets",
454
+ )
455
+
456
+ # save special tokens for tokenizer
457
+ word_delimiter_token = data_args.word_delimiter_token
458
+ unk_token = data_args.unk_token
459
+ pad_token = data_args.pad_token
460
+
461
+ # 3. Next, let's load the config as we might need it to create
462
+ # the tokenizer
463
+ # load config
464
+ config = AutoConfig.from_pretrained(
465
+ model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
466
+ )
467
+
468
+ # 4. Next, if no tokenizer file is defined,
469
+ # we create the vocabulary of the model by extracting all unique characters from
470
+ # the training and evaluation datasets
471
+ # We need to make sure that only first rank saves vocabulary
472
+ # make sure all processes wait until vocab is created
473
+ tokenizer_name_or_path = model_args.tokenizer_name_or_path
474
+ tokenizer_kwargs = {}
475
+ if tokenizer_name_or_path is None:
476
+ # save vocab in training output dir
477
+ tokenizer_name_or_path = training_args.output_dir
478
+
479
+ vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
480
+
481
+ with training_args.main_process_first():
482
+ if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
483
+ os.remove(vocab_file)
484
+
485
+ with training_args.main_process_first(desc="dataset map vocabulary creation"):
486
+ if not os.path.isfile(vocab_file):
487
+ os.makedirs(tokenizer_name_or_path, exist_ok=True)
488
+ vocab_dict = create_vocabulary_from_data(
489
+ raw_datasets,
490
+ word_delimiter_token=word_delimiter_token,
491
+ unk_token=unk_token,
492
+ pad_token=pad_token,
493
+ )
494
+
495
+ # save vocab dict to be loaded into tokenizer
496
+ with open(vocab_file, "w") as file:
497
+ json.dump(vocab_dict, file)
498
+
499
+ # if tokenizer has just been created
500
+ # it is defined by `tokenizer_class` if present in config else by `model_type`
501
+ tokenizer_kwargs = {
502
+ "config": config if config.tokenizer_class is not None else None,
503
+ "tokenizer_type": config.model_type if config.tokenizer_class is None else None,
504
+ "unk_token": unk_token,
505
+ "pad_token": pad_token,
506
+ "word_delimiter_token": word_delimiter_token,
507
+ }
508
+
509
+ # 5. Now we can instantiate the feature extractor, tokenizer and model
510
+ # Note for distributed training, the .from_pretrained methods guarantee that only
511
+ # one local process can concurrently download model & vocab.
512
+
513
+ # load feature_extractor and tokenizer
514
+ tokenizer = AutoTokenizer.from_pretrained(
515
+ tokenizer_name_or_path,
516
+ use_auth_token=data_args.use_auth_token,
517
+ **tokenizer_kwargs,
518
+ )
519
+ feature_extractor = AutoFeatureExtractor.from_pretrained(
520
+ model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
521
+ )
522
+
523
+ # adapt config
524
+ config.update(
525
+ {
526
+ "feat_proj_dropout": model_args.feat_proj_dropout,
527
+ "attention_dropout": model_args.attention_dropout,
528
+ "hidden_dropout": model_args.hidden_dropout,
529
+ "final_dropout": model_args.final_dropout,
530
+ "mask_time_prob": model_args.mask_time_prob,
531
+ "mask_time_length": model_args.mask_time_length,
532
+ "mask_feature_prob": model_args.mask_feature_prob,
533
+ "mask_feature_length": model_args.mask_feature_length,
534
+ "gradient_checkpointing": training_args.gradient_checkpointing,
535
+ "layerdrop": model_args.layerdrop,
536
+ "ctc_loss_reduction": model_args.ctc_loss_reduction,
537
+ "pad_token_id": tokenizer.pad_token_id,
538
+ "vocab_size": len(tokenizer),
539
+ "activation_dropout": model_args.activation_dropout,
540
+ }
541
+ )
542
+
543
+ # create model
544
+ model = AutoModelForCTC.from_pretrained(
545
+ model_args.model_name_or_path,
546
+ cache_dir=model_args.cache_dir,
547
+ config=config,
548
+ use_auth_token=data_args.use_auth_token,
549
+ )
550
+
551
+ # freeze encoder
552
+ if model_args.freeze_feature_encoder:
553
+ model.freeze_feature_encoder()
554
+
555
+ # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
556
+ # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
557
+ # so that we just need to set the correct target sampling rate and normalize the input
558
+ # via the `feature_extractor`
559
+
560
+ # make sure that dataset decodes audio with correct sampling rate
561
+ dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
562
+ if dataset_sampling_rate != feature_extractor.sampling_rate:
563
+ raw_datasets = raw_datasets.cast_column(
564
+ data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
565
+ )
566
+
567
+ # derive max & min input length for sample rate & max duration
568
+ max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
569
+ min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
570
+ audio_column_name = data_args.audio_column_name
571
+ num_workers = data_args.preprocessing_num_workers
572
+
573
+ # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
574
+ phoneme_language = data_args.phoneme_language
575
+
576
+ # Preprocessing the datasets.
577
+ # We need to read the audio files as arrays and tokenize the targets.
578
+ def prepare_dataset(batch):
579
+ # load audio
580
+ sample = batch[audio_column_name]
581
+
582
+ inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
583
+ batch["input_values"] = inputs.input_values[0]
584
+ batch["input_length"] = len(batch["input_values"])
585
+
586
+ # encode targets
587
+ additional_kwargs = {}
588
+ if phoneme_language is not None:
589
+ additional_kwargs["phonemizer_lang"] = phoneme_language
590
+
591
+ batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
592
+ return batch
593
+
594
+ with training_args.main_process_first(desc="dataset map preprocessing"):
595
+ vectorized_datasets = raw_datasets.map(
596
+ prepare_dataset,
597
+ remove_columns=next(iter(raw_datasets.values())).column_names,
598
+ num_proc=num_workers,
599
+ desc="preprocess datasets",
600
+ )
601
+
602
+ def is_audio_in_length_range(length):
603
+ return length > min_input_length and length < max_input_length
604
+
605
+ # filter data that is shorter than min_input_length
606
+ vectorized_datasets = vectorized_datasets.filter(
607
+ is_audio_in_length_range,
608
+ num_proc=num_workers,
609
+ input_columns=["input_length"],
610
+ )
611
+
612
+ # 7. Next, we can prepare the training.
613
+ # Let's use word error rate (WER) as our evaluation metric,
614
+ # instantiate a data collator and the trainer
615
+
616
+ # Define evaluation metrics during training, *i.e.* word error rate, character error rate
617
+ eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}
618
+
619
+ # for large datasets it is advised to run the preprocessing on a
620
+ # single machine first with ``args.preprocessing_only`` since there will most likely
621
+ # be a timeout when running the script in distributed mode.
622
+ # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
623
+ # cached dataset
624
+ if data_args.preprocessing_only:
625
+ logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
626
+ return
627
+
628
+ def compute_metrics(pred):
629
+ pred_logits = pred.predictions
630
+ pred_ids = np.argmax(pred_logits, axis=-1)
631
+
632
+ pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
633
+
634
+ pred_str = tokenizer.batch_decode(pred_ids)
635
+ # we do not want to group tokens when computing the metrics
636
+ label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
637
+
638
+ metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
639
+
640
+ return metrics
641
+
642
+ # Now save everything to be able to create a single processor later
643
+ if is_main_process(training_args.local_rank):
644
+ # save feature extractor, tokenizer and config
645
+ feature_extractor.save_pretrained(training_args.output_dir)
646
+ tokenizer.save_pretrained(training_args.output_dir)
647
+ config.save_pretrained(training_args.output_dir)
648
+
649
+ try:
650
+ processor = AutoProcessor.from_pretrained(training_args.output_dir)
651
+ except (OSError, KeyError):
652
+ warnings.warn(
653
+ "Loading a processor from a feature extractor config that does not"
654
+ " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
655
+ " attribute to your `preprocessor_config.json` file to suppress this warning: "
656
+ " `'processor_class': 'Wav2Vec2Processor'`",
657
+ FutureWarning,
658
+ )
659
+ processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)
660
+
661
+ # Instantiate custom data collator
662
+ data_collator = DataCollatorCTCWithPadding(processor=processor)
663
+
664
+ # Initialize Trainer
665
+ trainer = Trainer(
666
+ model=model,
667
+ data_collator=data_collator,
668
+ args=training_args,
669
+ compute_metrics=compute_metrics,
670
+ train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
671
+ eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
672
+ tokenizer=feature_extractor,
673
+ )
674
+
675
+ # 8. Finally, we can start training
676
+
677
+ # Training
678
+ if training_args.do_train:
679
+
680
+ # use last checkpoint if it exists
681
+ if last_checkpoint is not None:
682
+ checkpoint = last_checkpoint
683
+ elif os.path.isdir(model_args.model_name_or_path):
684
+ checkpoint = model_args.model_name_or_path
685
+ else:
686
+ checkpoint = None
687
+
688
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
689
+ trainer.save_model()
690
+
691
+ metrics = train_result.metrics
692
+ max_train_samples = (
693
+ data_args.max_train_samples
694
+ if data_args.max_train_samples is not None
695
+ else len(vectorized_datasets["train"])
696
+ )
697
+ metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))
698
+
699
+ trainer.log_metrics("train", metrics)
700
+ trainer.save_metrics("train", metrics)
701
+ trainer.save_state()
702
+
703
+ # Evaluation
704
+ results = {}
705
+ if training_args.do_eval:
706
+ logger.info("*** Evaluate ***")
707
+ metrics = trainer.evaluate()
708
+ max_eval_samples = (
709
+ data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
710
+ )
711
+ metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
712
+
713
+ trainer.log_metrics("eval", metrics)
714
+ trainer.save_metrics("eval", metrics)
715
+
716
+ # Write model card and (optionally) push to hub
717
+ config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
718
+ kwargs = {
719
+ "finetuned_from": model_args.model_name_or_path,
720
+ "tasks": "speech-recognition",
721
+ "tags": ["automatic-speech-recognition", data_args.dataset_name],
722
+ "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
723
+ "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
724
+ }
725
+ if "common_voice" in data_args.dataset_name:
726
+ kwargs["language"] = config_name
727
+
728
+ if training_args.push_to_hub:
729
+ trainer.push_to_hub(**kwargs)
730
+ else:
731
+ trainer.create_model_card(**kwargs)
732
+
733
+ return results
734
+
735
+
736
+ if __name__ == "__main__":
737
+ main()
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
speech-recognition-community-v2_dev_data_de_validation_eval_results.txt ADDED
@@ -0,0 +1,2 @@
+ WER: 0.31160744887467157
+ CER: 0.1341142698398
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "epoch": 2.0,
+     "train_loss": 0.27212924943281636,
+     "train_runtime": 112594.4054,
+     "train_samples": 436168,
+     "train_samples_per_second": 7.748,
+     "train_steps_per_second": 0.242
+ }
trainer_state.json ADDED
@@ -0,0 +1,1798 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.9809064397204748,
5
+ "global_step": 27000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "learning_rate": 6.999815761247666e-05,
13
+ "loss": 4.5119,
14
+ "step": 100
15
+ },
16
+ {
17
+ "epoch": 0.01,
18
+ "learning_rate": 6.999169171843241e-05,
19
+ "loss": 2.6638,
20
+ "step": 200
21
+ },
22
+ {
23
+ "epoch": 0.02,
24
+ "learning_rate": 6.998057503298084e-05,
25
+ "loss": 1.183,
26
+ "step": 300
27
+ },
28
+ {
29
+ "epoch": 0.03,
30
+ "learning_rate": 6.996480903365516e-05,
31
+ "loss": 0.7325,
32
+ "step": 400
33
+ },
34
+ {
35
+ "epoch": 0.04,
36
+ "learning_rate": 6.99443958159349e-05,
37
+ "loss": 0.5966,
38
+ "step": 500
39
+ },
40
+ {
41
+ "epoch": 0.04,
42
+ "learning_rate": 6.991933809296747e-05,
43
+ "loss": 0.5263,
44
+ "step": 600
45
+ },
46
+ {
47
+ "epoch": 0.05,
48
+ "learning_rate": 6.988963919520753e-05,
49
+ "loss": 0.5068,
50
+ "step": 700
51
+ },
52
+ {
53
+ "epoch": 0.06,
54
+ "learning_rate": 6.985530306997431e-05,
55
+ "loss": 0.4704,
56
+ "step": 800
57
+ },
58
+ {
59
+ "epoch": 0.07,
60
+ "learning_rate": 6.981633428092705e-05,
61
+ "loss": 0.4521,
62
+ "step": 900
63
+ },
64
+ {
65
+ "epoch": 0.07,
66
+ "learning_rate": 6.977273800745834e-05,
67
+ "loss": 0.4439,
68
+ "step": 1000
69
+ },
70
+ {
71
+ "epoch": 0.08,
72
+ "learning_rate": 6.972452004400577e-05,
73
+ "loss": 0.421,
74
+ "step": 1100
75
+ },
76
+ {
77
+ "epoch": 0.09,
78
+ "learning_rate": 6.96716867992818e-05,
79
+ "loss": 0.4105,
80
+ "step": 1200
81
+ },
82
+ {
83
+ "epoch": 0.1,
84
+ "learning_rate": 6.961424529542192e-05,
85
+ "loss": 0.3998,
86
+ "step": 1300
87
+ },
88
+ {
89
+ "epoch": 0.1,
90
+ "learning_rate": 6.955220316705135e-05,
91
+ "loss": 0.3971,
92
+ "step": 1400
93
+ },
94
+ {
95
+ "epoch": 0.11,
96
+ "learning_rate": 6.948556866027035e-05,
97
+ "loss": 0.3958,
98
+ "step": 1500
99
+ },
100
+ {
101
+ "epoch": 0.11,
102
+ "eval_loss": 0.26585060358047485,
103
+ "eval_runtime": 1119.616,
104
+ "eval_samples_per_second": 14.297,
105
+ "eval_steps_per_second": 1.787,
106
+ "eval_wer": 0.2537555992570742,
107
+ "step": 1500
108
+ },
109
+ {
110
+ "epoch": 0.12,
111
+ "learning_rate": 6.941435063155818e-05,
112
+ "loss": 0.3828,
113
+ "step": 1600
114
+ },
115
+ {
116
+ "epoch": 0.12,
117
+ "learning_rate": 6.933855854659593e-05,
118
+ "loss": 0.3852,
119
+ "step": 1700
120
+ },
121
+ {
122
+ "epoch": 0.13,
123
+ "learning_rate": 6.925820247900854e-05,
124
+ "loss": 0.3749,
125
+ "step": 1800
126
+ },
127
+ {
128
+ "epoch": 0.14,
129
+ "learning_rate": 6.917329310902582e-05,
130
+ "loss": 0.3853,
131
+ "step": 1900
132
+ },
133
+ {
134
+ "epoch": 0.15,
135
+ "learning_rate": 6.90838417220629e-05,
136
+ "loss": 0.3692,
137
+ "step": 2000
138
+ },
139
+ {
140
+ "epoch": 0.15,
141
+ "learning_rate": 6.898986020722038e-05,
142
+ "loss": 0.3676,
143
+ "step": 2100
144
+ },
145
+ {
146
+ "epoch": 0.16,
147
+ "learning_rate": 6.889136105570403e-05,
148
+ "loss": 0.3697,
149
+ "step": 2200
150
+ },
151
+ {
152
+ "epoch": 0.17,
153
+ "learning_rate": 6.878835735916458e-05,
154
+ "loss": 0.3608,
155
+ "step": 2300
156
+ },
157
+ {
158
+ "epoch": 0.18,
159
+ "learning_rate": 6.868086280795778e-05,
160
+ "loss": 0.3515,
161
+ "step": 2400
162
+ },
163
+ {
164
+ "epoch": 0.18,
165
+ "learning_rate": 6.85688916893247e-05,
166
+ "loss": 0.3383,
167
+ "step": 2500
168
+ },
169
+ {
170
+ "epoch": 0.19,
171
+ "learning_rate": 6.845245888549281e-05,
172
+ "loss": 0.3421,
173
+ "step": 2600
174
+ },
175
+ {
176
+ "epoch": 0.2,
177
+ "learning_rate": 6.833157987169802e-05,
178
+ "loss": 0.3454,
179
+ "step": 2700
180
+ },
181
+ {
182
+ "epoch": 0.21,
183
+ "learning_rate": 6.820627071412778e-05,
184
+ "loss": 0.3499,
185
+ "step": 2800
186
+ },
187
+ {
188
+ "epoch": 0.21,
189
+ "learning_rate": 6.807654806778575e-05,
190
+ "loss": 0.3411,
191
+ "step": 2900
192
+ },
193
+ {
194
+ "epoch": 0.22,
195
+ "learning_rate": 6.794242917427811e-05,
196
+ "loss": 0.3422,
197
+ "step": 3000
198
+ },
199
+ {
200
+ "epoch": 0.22,
201
+ "eval_loss": 0.21751175820827484,
202
+ "eval_runtime": 1050.6432,
203
+ "eval_samples_per_second": 15.235,
204
+ "eval_steps_per_second": 1.905,
205
+ "eval_wer": 0.23247159401289194,
206
+ "step": 3000
207
+ },
208
+ {
209
+ "epoch": 0.23,
210
+ "learning_rate": 6.780393185952203e-05,
211
+ "loss": 0.3378,
212
+ "step": 3100
213
+ },
214
+ {
215
+ "epoch": 0.23,
216
+ "learning_rate": 6.766107453137634e-05,
217
+ "loss": 0.3354,
218
+ "step": 3200
219
+ },
220
+ {
221
+ "epoch": 0.24,
222
+ "learning_rate": 6.751387617719493e-05,
223
+ "loss": 0.3332,
224
+ "step": 3300
225
+ },
226
+ {
227
+ "epoch": 0.25,
228
+ "learning_rate": 6.736235636130315e-05,
229
+ "loss": 0.3313,
230
+ "step": 3400
231
+ },
232
+ {
233
+ "epoch": 0.26,
234
+ "learning_rate": 6.720653522239741e-05,
235
+ "loss": 0.3291,
236
+ "step": 3500
237
+ },
238
+ {
239
+ "epoch": 0.26,
240
+ "learning_rate": 6.704643347086866e-05,
241
+ "loss": 0.3225,
242
+ "step": 3600
243
+ },
244
+ {
245
+ "epoch": 0.27,
246
+ "learning_rate": 6.688207238604962e-05,
247
+ "loss": 0.3314,
248
+ "step": 3700
249
+ },
250
+ {
251
+ "epoch": 0.28,
252
+ "learning_rate": 6.671347381338648e-05,
253
+ "loss": 0.3279,
254
+ "step": 3800
255
+ },
256
+ {
257
+ "epoch": 0.29,
258
+ "learning_rate": 6.654066016153562e-05,
259
+ "loss": 0.3248,
260
+ "step": 3900
261
+ },
262
+ {
263
+ "epoch": 0.29,
264
+ "learning_rate": 6.636365439938497e-05,
265
+ "loss": 0.3184,
266
+ "step": 4000
267
+ },
268
+ {
269
+ "epoch": 0.3,
270
+ "learning_rate": 6.618248005300135e-05,
271
+ "loss": 0.3312,
272
+ "step": 4100
273
+ },
274
+ {
275
+ "epoch": 0.31,
276
+ "learning_rate": 6.599716120250359e-05,
277
+ "loss": 0.3156,
278
+ "step": 4200
279
+ },
280
+ {
281
+ "epoch": 0.32,
282
+ "learning_rate": 6.58077224788619e-05,
283
+ "loss": 0.32,
284
+ "step": 4300
285
+ },
286
+ {
287
+ "epoch": 0.32,
288
+ "learning_rate": 6.561418906062424e-05,
289
+ "loss": 0.3143,
290
+ "step": 4400
291
+ },
292
+ {
293
+ "epoch": 0.33,
294
+ "learning_rate": 6.541658667056979e-05,
295
+ "loss": 0.3151,
296
+ "step": 4500
297
+ },
298
+ {
299
+ "epoch": 0.33,
300
+ "eval_loss": 0.2139398157596588,
301
+ "eval_runtime": 1046.9527,
302
+ "eval_samples_per_second": 15.289,
303
+ "eval_steps_per_second": 1.911,
304
+ "eval_wer": 0.19961351469463565,
305
+ "step": 4500
306
+ },
307
+ {
308
+ "epoch": 0.34,
309
+ "learning_rate": 6.521494157229007e-05,
310
+ "loss": 0.3228,
311
+ "step": 4600
312
+ },
313
+ {
314
+ "epoch": 0.34,
315
+ "learning_rate": 6.50092805666982e-05,
316
+ "loss": 0.3223,
317
+ "step": 4700
318
+ },
319
+ {
320
+ "epoch": 0.35,
321
+ "learning_rate": 6.47996309884668e-05,
322
+ "loss": 0.3186,
323
+ "step": 4800
324
+ },
325
+ {
326
+ "epoch": 0.36,
327
+ "learning_rate": 6.45860207023949e-05,
328
+ "loss": 0.3069,
329
+ "step": 4900
330
+ },
331
+ {
332
+ "epoch": 0.37,
333
+ "learning_rate": 6.436847809970438e-05,
334
+ "loss": 0.3149,
335
+ "step": 5000
336
+ },
337
+ {
338
+ "epoch": 0.37,
339
+ "learning_rate": 6.41470320942664e-05,
340
+ "loss": 0.3152,
341
+ "step": 5100
342
+ },
343
+ {
344
+ "epoch": 0.38,
345
+ "learning_rate": 6.392171211875852e-05,
346
+ "loss": 0.3054,
347
+ "step": 5200
348
+ },
349
+ {
350
+ "epoch": 0.39,
351
+ "learning_rate": 6.369485868905532e-05,
352
+ "loss": 0.3196,
353
+ "step": 5300
354
+ },
355
+ {
356
+ "epoch": 0.4,
357
+ "learning_rate": 6.346191911024053e-05,
358
+ "loss": 0.3011,
359
+ "step": 5400
360
+ },
361
+ {
362
+ "epoch": 0.4,
363
+ "learning_rate": 6.322519662061658e-05,
364
+ "loss": 0.3103,
365
+ "step": 5500
366
+ },
367
+ {
368
+ "epoch": 0.41,
369
+ "learning_rate": 6.298472268327846e-05,
370
+ "loss": 0.305,
371
+ "step": 5600
372
+ },
373
+ {
374
+ "epoch": 0.42,
375
+ "learning_rate": 6.274052925993097e-05,
376
+ "loss": 0.3043,
377
+ "step": 5700
378
+ },
379
+ {
380
+ "epoch": 0.43,
381
+ "learning_rate": 6.249264880664065e-05,
382
+ "loss": 0.3031,
383
+ "step": 5800
384
+ },
385
+ {
386
+ "epoch": 0.43,
387
+ "learning_rate": 6.224111426952202e-05,
388
+ "loss": 0.2988,
389
+ "step": 5900
390
+ },
391
+ {
392
+ "epoch": 0.44,
393
+ "learning_rate": 6.198595908035864e-05,
394
+ "loss": 0.3,
395
+ "step": 6000
396
+ },
397
+ {
398
+ "epoch": 0.44,
399
+ "eval_loss": 0.20627757906913757,
400
+ "eval_runtime": 1055.8743,
401
+ "eval_samples_per_second": 15.16,
402
+ "eval_steps_per_second": 1.895,
403
+ "eval_wer": 0.20541079427510106,
404
+ "step": 6000
405
+ },
406
+ {
407
+ "epoch": 0.45,
408
+ "learning_rate": 6.172721715215964e-05,
409
+ "loss": 0.2983,
410
+ "step": 6100
411
+ },
412
+ {
413
+ "epoch": 0.45,
414
+ "learning_rate": 6.146492287465236e-05,
415
+ "loss": 0.3012,
416
+ "step": 6200
417
+ },
418
+ {
419
+ "epoch": 0.46,
420
+ "learning_rate": 6.119911110971146e-05,
421
+ "loss": 0.2989,
422
+ "step": 6300
423
+ },
424
+ {
425
+ "epoch": 0.47,
426
+ "learning_rate": 6.092981718672549e-05,
427
+ "loss": 0.2909,
428
+ "step": 6400
429
+ },
430
+ {
431
+ "epoch": 0.48,
432
+ "learning_rate": 6.065707689790118e-05,
433
+ "loss": 0.3063,
434
+ "step": 6500
435
+ },
436
+ {
437
+ "epoch": 0.48,
438
+ "learning_rate": 6.03809264935062e-05,
439
+ "loss": 0.2908,
440
+ "step": 6600
441
+ },
442
+ {
443
+ "epoch": 0.49,
444
+ "learning_rate": 6.0101402677051154e-05,
445
+ "loss": 0.309,
446
+ "step": 6700
447
+ },
448
+ {
449
+ "epoch": 0.5,
450
+ "learning_rate": 5.981854260041124e-05,
451
+ "loss": 0.2891,
452
+ "step": 6800
453
+ },
454
+ {
455
+ "epoch": 0.51,
456
+ "learning_rate": 5.9532383858888345e-05,
457
+ "loss": 0.2902,
458
+ "step": 6900
459
+ },
460
+ {
461
+ "epoch": 0.51,
462
+ "learning_rate": 5.924296448621422e-05,
463
+ "loss": 0.289,
464
+ "step": 7000
465
+ },
466
+ {
467
+ "epoch": 0.52,
468
+ "learning_rate": 5.8950322949495356e-05,
469
+ "loss": 0.2951,
470
+ "step": 7100
471
+ },
472
+ {
473
+ "epoch": 0.53,
474
+ "learning_rate": 5.8654498144100274e-05,
475
+ "loss": 0.2923,
476
+ "step": 7200
477
+ },
478
+ {
479
+ "epoch": 0.54,
480
+ "learning_rate": 5.835552938848987e-05,
481
+ "loss": 0.2967,
482
+ "step": 7300
483
+ },
484
+ {
485
+ "epoch": 0.54,
486
+ "learning_rate": 5.805345641899159e-05,
487
+ "loss": 0.2863,
488
+ "step": 7400
489
+ },
490
+ {
491
+ "epoch": 0.55,
492
+ "learning_rate": 5.774831938451798e-05,
493
+ "loss": 0.2824,
494
+ "step": 7500
495
+ },
496
+ {
497
+ "epoch": 0.55,
498
+ "eval_loss": 0.18351121246814728,
499
+ "eval_runtime": 1075.697,
500
+ "eval_samples_per_second": 14.881,
501
+ "eval_steps_per_second": 1.86,
502
+ "eval_wer": 0.18133398885611274,
503
+ "step": 7500
504
+ },
505
+ {
506
+ "epoch": 0.56,
507
+ "learning_rate": 5.744325527887681e-05,
508
+ "loss": 0.2833,
509
+ "step": 7600
510
+ },
511
+ {
512
+ "epoch": 0.56,
513
+ "learning_rate": 5.713214180625491e-05,
514
+ "loss": 0.2909,
515
+ "step": 7700
516
+ },
517
+ {
518
+ "epoch": 0.57,
519
+ "learning_rate": 5.68180867217851e-05,
520
+ "loss": 0.286,
521
+ "step": 7800
522
+ },
523
+ {
524
+ "epoch": 0.58,
525
+ "learning_rate": 5.650113176693846e-05,
526
+ "loss": 0.2778,
527
+ "step": 7900
528
+ },
529
+ {
530
+ "epoch": 0.59,
531
+ "learning_rate": 5.618131906861165e-05,
532
+ "loss": 0.2777,
533
+ "step": 8000
534
+ },
535
+ {
536
+ "epoch": 0.59,
537
+ "learning_rate": 5.5858691133527713e-05,
538
+ "loss": 0.2829,
539
+ "step": 8100
540
+ },
541
+ {
542
+ "epoch": 0.6,
543
+ "learning_rate": 5.553329084258652e-05,
544
+ "loss": 0.2748,
545
+ "step": 8200
546
+ },
547
+ {
548
+ "epoch": 0.61,
549
+ "learning_rate": 5.5205161445165346e-05,
550
+ "loss": 0.2827,
551
+ "step": 8300
552
+ },
553
+ {
554
+ "epoch": 0.62,
555
+ "learning_rate": 5.4874346553370585e-05,
556
+ "loss": 0.2712,
557
+ "step": 8400
558
+ },
559
+ {
560
+ "epoch": 0.62,
561
+ "learning_rate": 5.4540890136241195e-05,
562
+ "loss": 0.2792,
563
+ "step": 8500
564
+ },
565
+ {
566
+ "epoch": 0.63,
567
+ "learning_rate": 5.420483651390469e-05,
568
+ "loss": 0.2815,
569
+ "step": 8600
570
+ },
571
+ {
572
+ "epoch": 0.64,
573
+ "learning_rate": 5.386623035168656e-05,
574
+ "loss": 0.2766,
575
+ "step": 8700
576
+ },
577
+ {
578
+ "epoch": 0.65,
579
+ "learning_rate": 5.3525116654173646e-05,
580
+ "loss": 0.2704,
581
+ "step": 8800
582
+ },
583
+ {
584
+ "epoch": 0.65,
585
+ "learning_rate": 5.318154075923263e-05,
586
+ "loss": 0.2695,
587
+ "step": 8900
588
+ },
589
+ {
590
+ "epoch": 0.66,
591
+ "learning_rate": 5.283554833198404e-05,
592
+ "loss": 0.2771,
593
+ "step": 9000
594
+ },
595
+ {
596
+ "epoch": 0.66,
597
+ "eval_loss": 0.18808312714099884,
598
+ "eval_runtime": 1061.3581,
599
+ "eval_samples_per_second": 15.082,
600
+ "eval_steps_per_second": 1.885,
601
+ "eval_wer": 0.17624685895334863,
602
+ "step": 9000
603
+ },
604
+ {
605
+ "epoch": 0.67,
606
+ "learning_rate": 5.2487185358732866e-05,
607
+ "loss": 0.276,
608
+ "step": 9100
609
+ },
610
+ {
611
+ "epoch": 0.67,
612
+ "learning_rate": 5.213649814085646e-05,
613
+ "loss": 0.2647,
614
+ "step": 9200
615
+ },
616
+ {
617
+ "epoch": 0.68,
618
+ "learning_rate": 5.178353328865057e-05,
619
+ "loss": 0.2901,
620
+ "step": 9300
621
+ },
622
+ {
623
+ "epoch": 0.69,
624
+ "learning_rate": 5.142833771513431e-05,
625
+ "loss": 0.2771,
626
+ "step": 9400
627
+ },
628
+ {
629
+ "epoch": 0.7,
630
+ "learning_rate": 5.107095862981481e-05,
631
+ "loss": 0.2706,
632
+ "step": 9500
633
+ },
634
+ {
635
+ "epoch": 0.7,
636
+ "learning_rate": 5.071144353241269e-05,
637
+ "loss": 0.2753,
638
+ "step": 9600
639
+ },
640
+ {
641
+ "epoch": 0.71,
642
+ "learning_rate": 5.0353466418956284e-05,
643
+ "loss": 0.2777,
644
+ "step": 9700
645
+ },
646
+ {
647
+ "epoch": 0.72,
648
+ "learning_rate": 4.9989843088674705e-05,
649
+ "loss": 0.2621,
650
+ "step": 9800
651
+ },
652
+ {
653
+ "epoch": 0.73,
654
+ "learning_rate": 4.962422743878782e-05,
655
+ "loss": 0.2628,
656
+ "step": 9900
657
+ },
658
+ {
659
+ "epoch": 0.73,
660
+ "learning_rate": 4.9256668063748734e-05,
661
+ "loss": 0.272,
662
+ "step": 10000
663
+ },
664
+ {
665
+ "epoch": 0.74,
666
+ "learning_rate": 4.8887213816353655e-05,
667
+ "loss": 0.2683,
668
+ "step": 10100
669
+ },
670
+ {
671
+ "epoch": 0.75,
672
+ "learning_rate": 4.851591380124868e-05,
673
+ "loss": 0.2627,
674
+ "step": 10200
675
+ },
676
+ {
677
+ "epoch": 0.76,
678
+ "learning_rate": 4.814281736840332e-05,
679
+ "loss": 0.2565,
680
+ "step": 10300
681
+ },
682
+ {
683
+ "epoch": 0.76,
684
+ "learning_rate": 4.776797410655135e-05,
685
+ "loss": 0.2619,
686
+ "step": 10400
687
+ },
688
+ {
689
+ "epoch": 0.77,
690
+ "learning_rate": 4.739143383659982e-05,
691
+ "loss": 0.2616,
692
+ "step": 10500
693
+ },
694
+ {
695
+ "epoch": 0.77,
696
+ "eval_loss": 0.18062810599803925,
697
+ "eval_runtime": 1044.5923,
698
+ "eval_samples_per_second": 15.324,
699
+ "eval_steps_per_second": 1.916,
700
+ "eval_wer": 0.17676581448705342,
701
+ "step": 10500
702
+ },
703
+ {
704
+ "epoch": 0.78,
705
+ "learning_rate": 4.701324660500736e-05,
706
+ "loss": 0.2575,
707
+ "step": 10600
708
+ },
709
+ {
710
+ "epoch": 0.79,
711
+ "learning_rate": 4.663346267713244e-05,
712
+ "loss": 0.2642,
713
+ "step": 10700
714
+ },
715
+ {
716
+ "epoch": 0.79,
717
+ "learning_rate": 4.625213253055248e-05,
718
+ "loss": 0.2561,
719
+ "step": 10800
720
+ },
721
+ {
722
+ "epoch": 0.8,
723
+ "learning_rate": 4.586930684835486e-05,
724
+ "loss": 0.2553,
725
+ "step": 10900
726
+ },
727
+ {
728
+ "epoch": 0.81,
729
+ "learning_rate": 4.5485036512400575e-05,
730
+ "loss": 0.2533,
731
+ "step": 11000
732
+ },
733
+ {
734
+ "epoch": 0.81,
735
+ "learning_rate": 4.509937259656139e-05,
736
+ "loss": 0.2528,
737
+ "step": 11100
738
+ },
739
+ {
740
+ "epoch": 0.82,
741
+ "learning_rate": 4.471236635993164e-05,
742
+ "loss": 0.2542,
743
+ "step": 11200
744
+ },
745
+ {
746
+ "epoch": 0.83,
747
+ "learning_rate": 4.432406924001522e-05,
748
+ "loss": 0.2506,
749
+ "step": 11300
750
+ },
751
+ {
752
+ "epoch": 0.84,
753
+ "learning_rate": 4.393453284588905e-05,
754
+ "loss": 0.2587,
755
+ "step": 11400
756
+ },
757
+ {
758
+ "epoch": 0.84,
759
+ "learning_rate": 4.3543808951343574e-05,
760
+ "loss": 0.2503,
761
+ "step": 11500
762
+ },
763
+ {
764
+ "epoch": 0.85,
765
+ "learning_rate": 4.3151949488001475e-05,
766
+ "loss": 0.2535,
767
+ "step": 11600
768
+ },
769
+ {
770
+ "epoch": 0.86,
771
+ "learning_rate": 4.275900653841536e-05,
772
+ "loss": 0.2526,
773
+ "step": 11700
774
+ },
775
+ {
776
+ "epoch": 0.87,
777
+ "learning_rate": 4.236503232914543e-05,
778
+ "loss": 0.253,
779
+ "step": 11800
780
+ },
781
+ {
782
+ "epoch": 0.87,
783
+ "learning_rate": 4.197007922381793e-05,
784
+ "loss": 0.2523,
785
+ "step": 11900
786
+ },
787
+ {
788
+ "epoch": 0.88,
789
+ "learning_rate": 4.157419971616547e-05,
790
+ "loss": 0.2446,
791
+ "step": 12000
792
+ },
793
+ {
794
+ "epoch": 0.88,
795
+ "eval_loss": 0.1757470816373825,
796
+ "eval_runtime": 1062.3089,
797
+ "eval_samples_per_second": 15.068,
798
+ "eval_steps_per_second": 1.884,
799
+ "eval_wer": 0.1589711023708074,
800
+ "step": 12000
801
+ },
802
+ {
803
+ "epoch": 0.89,
804
+ "learning_rate": 4.1177446423050005e-05,
805
+ "loss": 0.2473,
806
+ "step": 12100
807
+ },
808
+ {
809
+ "epoch": 0.9,
810
+ "learning_rate": 4.077987207746943e-05,
811
+ "loss": 0.2438,
812
+ "step": 12200
813
+ },
814
+ {
815
+ "epoch": 0.9,
816
+ "learning_rate": 4.0381529521548834e-05,
817
+ "loss": 0.2468,
818
+ "step": 12300
819
+ },
820
+ {
821
+ "epoch": 0.91,
822
+ "learning_rate": 3.998247169951711e-05,
823
+ "loss": 0.2524,
824
+ "step": 12400
825
+ },
826
+ {
827
+ "epoch": 0.92,
828
+ "learning_rate": 3.958275165067014e-05,
829
+ "loss": 0.2372,
830
+ "step": 12500
831
+ },
832
+ {
833
+ "epoch": 0.92,
834
+ "learning_rate": 3.91824225023212e-05,
835
+ "loss": 0.2324,
836
+ "step": 12600
837
+ },
838
+ {
839
+ "epoch": 0.93,
840
+ "learning_rate": 3.8785548889903e-05,
841
+ "loss": 0.2436,
842
+ "step": 12700
843
+ },
844
+ {
845
+ "epoch": 0.94,
846
+ "learning_rate": 3.8384166003361756e-05,
847
+ "loss": 0.2405,
848
+ "step": 12800
849
+ },
850
+ {
851
+ "epoch": 0.95,
852
+ "learning_rate": 3.798635370086602e-05,
853
+ "loss": 0.2349,
854
+ "step": 12900
855
+ },
856
+ {
857
+ "epoch": 0.95,
858
+ "learning_rate": 3.7584128333900755e-05,
859
+ "loss": 0.2424,
860
+ "step": 13000
861
+ },
862
+ {
863
+ "epoch": 0.96,
864
+ "learning_rate": 3.7181559507066575e-05,
865
+ "loss": 0.2366,
866
+ "step": 13100
867
+ },
868
+ {
869
+ "epoch": 0.97,
870
+ "learning_rate": 3.677870072631157e-05,
871
+ "loss": 0.2435,
872
+ "step": 13200
873
+ },
874
+ {
875
+ "epoch": 0.98,
876
+ "learning_rate": 3.637560553612199e-05,
877
+ "loss": 0.2377,
878
+ "step": 13300
879
+ },
880
+ {
881
+ "epoch": 0.98,
882
+ "learning_rate": 3.597232751240556e-05,
883
+ "loss": 0.2302,
884
+ "step": 13400
885
+ },
886
+ {
887
+ "epoch": 0.99,
888
+ "learning_rate": 3.556892025537066e-05,
889
+ "loss": 0.2377,
890
+ "step": 13500
891
+ },
892
+ {
893
+ "epoch": 0.99,
894
+ "eval_loss": 0.1588028222322464,
895
+ "eval_runtime": 1044.908,
896
+ "eval_samples_per_second": 15.319,
897
+ "eval_steps_per_second": 1.915,
898
+ "eval_wer": 0.15275046432863543,
899
+ "step": 13500
900
+ },
901
+ {
902
+ "epoch": 1.0,
903
+ "learning_rate": 3.516543738240223e-05,
904
+ "loss": 0.2313,
905
+ "step": 13600
906
+ },
907
+ {
908
+ "epoch": 1.01,
909
+ "learning_rate": 3.476193252093543e-05,
910
+ "loss": 0.22,
911
+ "step": 13700
912
+ },
913
+ {
914
+ "epoch": 1.01,
915
+ "learning_rate": 3.4358459301327927e-05,
916
+ "loss": 0.214,
917
+ "step": 13800
918
+ },
919
+ {
920
+ "epoch": 1.02,
921
+ "learning_rate": 3.395507134973183e-05,
922
+ "loss": 0.2257,
923
+ "step": 13900
924
+ },
925
+ {
926
+ "epoch": 1.03,
927
+ "learning_rate": 3.355182228096618e-05,
928
+ "loss": 0.2308,
929
+ "step": 14000
930
+ },
931
+ {
932
+ "epoch": 1.03,
933
+ "learning_rate": 3.314876569139091e-05,
934
+ "loss": 0.2244,
935
+ "step": 14100
936
+ },
937
+ {
938
+ "epoch": 1.04,
939
+ "learning_rate": 3.274595515178329e-05,
940
+ "loss": 0.2176,
941
+ "step": 14200
942
+ },
943
+ {
944
+ "epoch": 1.05,
945
+ "learning_rate": 3.234344420021777e-05,
946
+ "loss": 0.2238,
947
+ "step": 14300
948
+ },
949
+ {
950
+ "epoch": 1.06,
951
+ "learning_rate": 3.194128633495017e-05,
952
+ "loss": 0.219,
953
+ "step": 14400
954
+ },
955
+ {
956
+ "epoch": 1.06,
957
+ "learning_rate": 3.153953500730713e-05,
958
+ "loss": 0.2265,
959
+ "step": 14500
960
+ },
961
+ {
962
+ "epoch": 1.07,
963
+ "learning_rate": 3.113824361458186e-05,
964
+ "loss": 0.2218,
965
+ "step": 14600
966
+ },
967
+ {
968
+ "epoch": 1.08,
969
+ "learning_rate": 3.073746549293703e-05,
970
+ "loss": 0.2129,
971
+ "step": 14700
972
+ },
973
+ {
974
+ "epoch": 1.09,
975
+ "learning_rate": 3.0337253910315748e-05,
976
+ "loss": 0.2126,
977
+ "step": 14800
978
+ },
979
+ {
980
+ "epoch": 1.09,
981
+ "learning_rate": 2.993766205936171e-05,
982
+ "loss": 0.2047,
983
+ "step": 14900
984
+ },
985
+ {
986
+ "epoch": 1.1,
987
+ "learning_rate": 2.9538743050349254e-05,
988
+ "loss": 0.2141,
989
+ "step": 15000
990
+ },
991
+ {
992
+ "epoch": 1.1,
993
+ "eval_loss": 0.14499780535697937,
994
+ "eval_runtime": 1046.3411,
995
+ "eval_samples_per_second": 15.298,
996
+ "eval_steps_per_second": 1.912,
997
+ "eval_wer": 0.14962307440183548,
998
+ "step": 15000
999
+ },
1000
+ {
1001
+ "epoch": 1.11,
1002
+ "learning_rate": 2.9140549904124422e-05,
1003
+ "loss": 0.2066,
1004
+ "step": 15100
1005
+ },
1006
+ {
1007
+ "epoch": 1.12,
1008
+ "learning_rate": 2.8743135545057887e-05,
1009
+ "loss": 0.2124,
1010
+ "step": 15200
1011
+ },
1012
+ {
1013
+ "epoch": 1.12,
1014
+ "learning_rate": 2.8346552794010703e-05,
1015
+ "loss": 0.2089,
1016
+ "step": 15300
1017
+ },
1018
+ {
1019
+ "epoch": 1.13,
1020
+ "learning_rate": 2.7950854361313814e-05,
1021
+ "loss": 0.2121,
1022
+ "step": 15400
1023
+ },
1024
+ {
1025
+ "epoch": 1.14,
1026
+ "learning_rate": 2.755609283976226e-05,
1027
+ "loss": 0.209,
1028
+ "step": 15500
1029
+ },
1030
+ {
1031
+ "epoch": 1.14,
1032
+ "learning_rate": 2.7162320697625e-05,
1033
+ "loss": 0.2052,
1034
+ "step": 15600
1035
+ },
1036
+ {
1037
+ "epoch": 1.15,
1038
+ "learning_rate": 2.676959027167128e-05,
1039
+ "loss": 0.209,
1040
+ "step": 15700
1041
+ },
1042
+ {
1043
+ "epoch": 1.16,
1044
+ "learning_rate": 2.6377953760214495e-05,
1045
+ "loss": 0.2089,
1046
+ "step": 15800
1047
+ },
1048
+ {
1049
+ "epoch": 1.17,
1050
+ "learning_rate": 2.598746321617443e-05,
1051
+ "loss": 0.2021,
1052
+ "step": 15900
1053
+ },
1054
+ {
1055
+ "epoch": 1.17,
1056
+ "learning_rate": 2.5598170540158846e-05,
1057
+ "loss": 0.2047,
1058
+ "step": 16000
1059
+ },
1060
+ {
1061
+ "epoch": 1.18,
1062
+ "learning_rate": 2.5210127473565314e-05,
1063
+ "loss": 0.1979,
1064
+ "step": 16100
1065
+ },
1066
+ {
1067
+ "epoch": 1.19,
1068
+ "learning_rate": 2.482338559170417e-05,
1069
+ "loss": 0.2044,
1070
+ "step": 16200
1071
+ },
1072
+ {
1073
+ "epoch": 1.2,
1074
+ "learning_rate": 2.4437996296943596e-05,
1075
+ "loss": 0.2082,
1076
+ "step": 16300
1077
+ },
1078
+ {
1079
+ "epoch": 1.2,
1080
+ "learning_rate": 2.4057843550135512e-05,
1081
+ "loss": 0.2004,
1082
+ "step": 16400
1083
+ },
1084
+ {
1085
+ "epoch": 1.21,
1086
+ "learning_rate": 2.3675298110320073e-05,
1087
+ "loss": 0.1953,
1088
+ "step": 16500
1089
+ },
1090
+ {
1091
+ "epoch": 1.21,
1092
+ "eval_loss": 0.13918258249759674,
1093
+ "eval_runtime": 1060.8068,
1094
+ "eval_samples_per_second": 15.089,
1095
+ "eval_steps_per_second": 1.886,
1096
+ "eval_wer": 0.13435485633125752,
1097
+ "step": 16500
1098
+ },
1099
+ {
1100
+ "epoch": 1.22,
1101
+ "learning_rate": 2.3294257851410495e-05,
1102
+ "loss": 0.1984,
1103
+ "step": 16600
1104
+ },
1105
+ {
1106
+ "epoch": 1.23,
1107
+ "learning_rate": 2.2914773417964826e-05,
1108
+ "loss": 0.1972,
1109
+ "step": 16700
1110
+ },
1111
+ {
1112
+ "epoch": 1.23,
1113
+ "learning_rate": 2.2536895247754305e-05,
1114
+ "loss": 0.194,
1115
+ "step": 16800
1116
+ },
1117
+ {
1118
+ "epoch": 1.24,
1119
+ "learning_rate": 2.2160673565059625e-05,
1120
+ "loss": 0.2016,
1121
+ "step": 16900
1122
+ },
1123
+ {
1124
+ "epoch": 1.25,
1125
+ "learning_rate": 2.1786158373995577e-05,
1126
+ "loss": 0.1973,
1127
+ "step": 17000
1128
+ },
1129
+ {
1130
+ "epoch": 1.25,
1131
+ "learning_rate": 2.1413399451864916e-05,
1132
+ "loss": 0.1872,
1133
+ "step": 17100
1134
+ },
1135
+ {
1136
+ "epoch": 1.26,
1137
+ "learning_rate": 2.1042446342542387e-05,
1138
+ "loss": 0.2035,
1139
+ "step": 17200
1140
+ },
1141
+ {
1142
+ "epoch": 1.27,
1143
+ "learning_rate": 2.0673348349889817e-05,
1144
+ "loss": 0.1937,
1145
+ "step": 17300
1146
+ },
1147
+ {
1148
+ "epoch": 1.28,
1149
+ "learning_rate": 2.0306154531203048e-05,
1150
+ "loss": 0.1938,
1151
+ "step": 17400
1152
+ },
1153
+ {
1154
+ "epoch": 1.28,
1155
+ "learning_rate": 1.994091369069168e-05,
1156
+ "loss": 0.1953,
1157
+ "step": 17500
1158
+ },
1159
+ {
1160
+ "epoch": 1.29,
1161
+ "learning_rate": 1.957767437299243e-05,
1162
+ "loss": 0.1887,
1163
+ "step": 17600
1164
+ },
1165
+ {
1166
+ "epoch": 1.3,
1167
+ "learning_rate": 1.9216484856717008e-05,
1168
+ "loss": 0.1921,
1169
+ "step": 17700
1170
+ },
1171
+ {
1172
+ "epoch": 1.31,
1173
+ "learning_rate": 1.8857393148035336e-05,
1174
+ "loss": 0.1917,
1175
+ "step": 17800
1176
+ },
1177
+ {
1178
+ "epoch": 1.31,
1179
+ "learning_rate": 1.8500446974295e-05,
1180
+ "loss": 0.1836,
1181
+ "step": 17900
1182
+ },
1183
+ {
1184
+ "epoch": 1.32,
1185
+ "learning_rate": 1.8145693777677743e-05,
1186
+ "loss": 0.1923,
1187
+ "step": 18000
1188
+ },
1189
+ {
1190
+ "epoch": 1.32,
1191
+ "eval_loss": 0.1327279508113861,
1192
+ "eval_runtime": 1039.577,
1193
+ "eval_samples_per_second": 15.398,
1194
+ "eval_steps_per_second": 1.925,
1195
+ "eval_wer": 0.13173959357587675,
1196
+ "step": 18000
1197
+ },
1198
+ {
1199
+ "epoch": 1.33,
1200
+ "learning_rate": 1.779669459685722e-05,
1201
+ "loss": 0.1927,
1202
+ "step": 18100
1203
+ },
1204
+ {
1205
+ "epoch": 1.34,
1206
+ "learning_rate": 1.7446445408141307e-05,
1207
+ "loss": 0.1959,
1208
+ "step": 18200
1209
+ },
1210
+ {
1211
+ "epoch": 1.34,
1212
+ "learning_rate": 1.7098529285272e-05,
1213
+ "loss": 0.1872,
1214
+ "step": 18300
1215
+ },
1216
+ {
1217
+ "epoch": 1.35,
1218
+ "learning_rate": 1.6752992470235188e-05,
1219
+ "loss": 0.1848,
1220
+ "step": 18400
1221
+ },
1222
+ {
1223
+ "epoch": 1.36,
1224
+ "learning_rate": 1.640988088877985e-05,
1225
+ "loss": 0.184,
1226
+ "step": 18500
1227
+ },
1228
+ {
1229
+ "epoch": 1.36,
1230
+ "learning_rate": 1.6069240144314012e-05,
1231
+ "loss": 0.1919,
1232
+ "step": 18600
1233
+ },
1234
+ {
1235
+ "epoch": 1.37,
1236
+ "learning_rate": 1.5731115511843525e-05,
1237
+ "loss": 0.176,
1238
+ "step": 18700
1239
+ },
1240
+ {
1241
+ "epoch": 1.38,
1242
+ "learning_rate": 1.5395551931954524e-05,
1243
+ "loss": 0.1789,
1244
+ "step": 18800
1245
+ },
1246
+ {
1247
+ "epoch": 1.39,
1248
+ "learning_rate": 1.5062594004840269e-05,
1249
+ "loss": 0.1937,
1250
+ "step": 18900
1251
+ },
1252
+ {
1253
+ "epoch": 1.39,
1254
+ "learning_rate": 1.4732285984373345e-05,
1255
+ "loss": 0.1861,
1256
+ "step": 19000
1257
+ },
1258
+ {
1259
+ "epoch": 1.4,
1260
+ "learning_rate": 1.440467177222377e-05,
1261
+ "loss": 0.1757,
1262
+ "step": 19100
1263
+ },
1264
+ {
1265
+ "epoch": 1.41,
1266
+ "learning_rate": 1.4079794912023988e-05,
1267
+ "loss": 0.1839,
1268
+ "step": 19200
1269
+ },
1270
+ {
1271
+ "epoch": 1.42,
1272
+ "learning_rate": 1.3757698583581431e-05,
1273
+ "loss": 0.1844,
1274
+ "step": 19300
1275
+ },
1276
+ {
1277
+ "epoch": 1.42,
1278
+ "learning_rate": 1.3438425597139414e-05,
1279
+ "loss": 0.1855,
1280
+ "step": 19400
1281
+ },
1282
+ {
1283
+ "epoch": 1.43,
1284
+ "learning_rate": 1.3122018387687183e-05,
1285
+ "loss": 0.1804,
1286
+ "step": 19500
1287
+ },
1288
+ {
1289
+ "epoch": 1.43,
1290
+ "eval_loss": 0.12711018323898315,
1291
+ "eval_runtime": 1058.0447,
1292
+ "eval_samples_per_second": 15.129,
1293
+ "eval_steps_per_second": 1.891,
1294
+ "eval_wer": 0.12372992461488037,
1295
+ "step": 19500
1296
+ },
1297
+ {
1298
+ "epoch": 1.44,
1299
+ "learning_rate": 1.280851900931984e-05,
1300
+ "loss": 0.1833,
1301
+ "step": 19600
1302
+ },
1303
+ {
1304
+ "epoch": 1.45,
1305
+ "learning_rate": 1.2497969129648841e-05,
1306
+ "loss": 0.181,
1307
+ "step": 19700
1308
+ },
1309
+ {
1310
+ "epoch": 1.45,
1311
+ "learning_rate": 1.2190410024263938e-05,
1312
+ "loss": 0.1719,
1313
+ "step": 19800
1314
+ },
1315
+ {
1316
+ "epoch": 1.46,
1317
+ "learning_rate": 1.1885882571247166e-05,
1318
+ "loss": 0.1758,
1319
+ "step": 19900
1320
+ },
1321
+ {
1322
+ "epoch": 1.47,
1323
+ "learning_rate": 1.1584427245739682e-05,
1324
+ "loss": 0.1792,
1325
+ "step": 20000
1326
+ },
1327
+ {
1328
+ "epoch": 1.47,
1329
+ "learning_rate": 1.1286084114562175e-05,
1330
+ "loss": 0.1774,
1331
+ "step": 20100
1332
+ },
1333
+ {
1334
+ "epoch": 1.48,
1335
+ "learning_rate": 1.0990892830889517e-05,
1336
+ "loss": 0.1796,
1337
+ "step": 20200
1338
+ },
1339
+ {
1340
+ "epoch": 1.49,
1341
+ "learning_rate": 1.0698892628980422e-05,
1342
+ "loss": 0.1816,
1343
+ "step": 20300
1344
+ },
1345
+ {
1346
+ "epoch": 1.5,
1347
+ "learning_rate": 1.041012231896276e-05,
1348
+ "loss": 0.174,
1349
+ "step": 20400
1350
+ },
1351
+ {
1352
+ "epoch": 1.5,
1353
+ "learning_rate": 1.012462028167525e-05,
1354
+ "loss": 0.1717,
1355
+ "step": 20500
1356
+ },
1357
+ {
1358
+ "epoch": 1.51,
1359
+ "learning_rate": 9.842424463566227e-06,
1360
+ "loss": 0.1793,
1361
+ "step": 20600
1362
+ },
1363
+ {
1364
+ "epoch": 1.52,
1365
+ "learning_rate": 9.563572371650113e-06,
1366
+ "loss": 0.1699,
1367
+ "step": 20700
1368
+ },
1369
+ {
1370
+ "epoch": 1.53,
1371
+ "learning_rate": 9.288101068522322e-06,
1372
+ "loss": 0.1726,
1373
+ "step": 20800
1374
+ },
1375
+ {
1376
+ "epoch": 1.53,
1377
+ "learning_rate": 9.016047167433221e-06,
1378
+ "loss": 0.1734,
1379
+ "step": 20900
1380
+ },
1381
+ {
1382
+ "epoch": 1.54,
1383
+ "learning_rate": 8.747446827421805e-06,
1384
+ "loss": 0.1776,
1385
+ "step": 21000
1386
+ },
1387
+ {
1388
+ "epoch": 1.54,
1389
+ "eval_loss": 0.12307832390069962,
1390
+ "eval_runtime": 1040.1826,
1391
+ "eval_samples_per_second": 15.389,
1392
+ "eval_steps_per_second": 1.924,
1393
+ "eval_wer": 0.1186018245384027,
1394
+ "step": 21000
1395
+ },
1396
+ {
1397
+ "epoch": 1.55,
1398
+ "learning_rate": 8.482335748509769e-06,
1399
+ "loss": 0.1755,
1400
+ "step": 21100
1401
+ },
1402
+ {
1403
+ "epoch": 1.56,
1404
+ "learning_rate": 8.220749166956552e-06,
1405
+ "loss": 0.1717,
1406
+ "step": 21200
1407
+ },
1408
+ {
1409
+ "epoch": 1.56,
1410
+ "learning_rate": 7.962721850576054e-06,
1411
+ "loss": 0.167,
1412
+ "step": 21300
1413
+ },
1414
+ {
1415
+ "epoch": 1.57,
1416
+ "learning_rate": 7.708288094115607e-06,
1417
+ "loss": 0.1698,
1418
+ "step": 21400
1419
+ },
1420
+ {
1421
+ "epoch": 1.58,
1422
+ "learning_rate": 7.457481714697784e-06,
1423
+ "loss": 0.1709,
1424
+ "step": 21500
1425
+ },
1426
+ {
1427
+ "epoch": 1.58,
1428
+ "learning_rate": 7.210336047325761e-06,
1429
+ "loss": 0.1748,
1430
+ "step": 21600
1431
+ },
1432
+ {
1433
+ "epoch": 1.59,
1434
+ "learning_rate": 6.9668839404526865e-06,
1435
+ "loss": 0.1776,
1436
+ "step": 21700
1437
+ },
1438
+ {
1439
+ "epoch": 1.6,
1440
+ "learning_rate": 6.727157751615771e-06,
1441
+ "loss": 0.1664,
1442
+ "step": 21800
1443
+ },
1444
+ {
1445
+ "epoch": 1.61,
1446
+ "learning_rate": 6.491189343135589e-06,
1447
+ "loss": 0.1754,
1448
+ "step": 21900
1449
+ },
1450
+ {
1451
+ "epoch": 1.61,
1452
+ "learning_rate": 6.2590100778812376e-06,
1453
+ "loss": 0.1766,
1454
+ "step": 22000
1455
+ },
1456
+ {
1457
+ "epoch": 1.62,
1458
+ "learning_rate": 6.030650815101828e-06,
1459
+ "loss": 0.171,
1460
+ "step": 22100
1461
+ },
1462
+ {
1463
+ "epoch": 1.63,
1464
+ "learning_rate": 5.808367837755271e-06,
1465
+ "loss": 0.1703,
1466
+ "step": 22200
1467
+ },
1468
+ {
1469
+ "epoch": 1.64,
1470
+ "learning_rate": 5.5877001747984834e-06,
1471
+ "loss": 0.164,
1472
+ "step": 22300
1473
+ },
1474
+ {
1475
+ "epoch": 1.64,
1476
+ "learning_rate": 5.3709417389918604e-06,
1477
+ "loss": 0.1664,
1478
+ "step": 22400
1479
+ },
1480
+ {
1481
+ "epoch": 1.65,
1482
+ "learning_rate": 5.158121339981953e-06,
1483
+ "loss": 0.1671,
1484
+ "step": 22500
1485
+ },
1486
+ {
1487
+ "epoch": 1.65,
1488
+ "eval_loss": 0.11993325501680374,
1489
+ "eval_runtime": 1045.6599,
1490
+ "eval_samples_per_second": 15.308,
1491
+ "eval_steps_per_second": 1.914,
1492
+ "eval_wer": 0.11479159838304381,
1493
+ "step": 22500
1494
+ },
1495
+ {
1496
+ "epoch": 1.66,
1497
+ "learning_rate": 4.949267264005701e-06,
1498
+ "loss": 0.1576,
1499
+ "step": 22600
1500
+ },
1501
+ {
1502
+ "epoch": 1.67,
1503
+ "learning_rate": 4.7444072701308795e-06,
1504
+ "loss": 0.1583,
1505
+ "step": 22700
1506
+ },
1507
+ {
1508
+ "epoch": 1.67,
1509
+ "learning_rate": 4.543568586566601e-06,
1510
+ "loss": 0.1678,
1511
+ "step": 22800
1512
+ },
1513
+ {
1514
+ "epoch": 1.68,
1515
+ "learning_rate": 4.346777907044375e-06,
1516
+ "loss": 0.1687,
1517
+ "step": 22900
1518
+ },
1519
+ {
1520
+ "epoch": 1.69,
1521
+ "learning_rate": 4.154061387270205e-06,
1522
+ "loss": 0.1671,
1523
+ "step": 23000
1524
+ },
1525
+ {
1526
+ "epoch": 1.69,
1527
+ "learning_rate": 3.965444641448219e-06,
1528
+ "loss": 0.1656,
1529
+ "step": 23100
1530
+ },
1531
+ {
1532
+ "epoch": 1.7,
1533
+ "learning_rate": 3.780952738876231e-06,
1534
+ "loss": 0.169,
1535
+ "step": 23200
1536
+ },
1537
+ {
1538
+ "epoch": 1.71,
1539
+ "learning_rate": 3.600610200613753e-06,
1540
+ "loss": 0.1619,
1541
+ "step": 23300
1542
+ },
1543
+ {
1544
+ "epoch": 1.72,
1545
+ "learning_rate": 3.4244409962228724e-06,
1546
+ "loss": 0.1702,
1547
+ "step": 23400
1548
+ },
1549
+ {
1550
+ "epoch": 1.72,
1551
+ "learning_rate": 3.252468540582438e-06,
1552
+ "loss": 0.1654,
1553
+ "step": 23500
1554
+ },
1555
+ {
1556
+ "epoch": 1.73,
1557
+ "learning_rate": 3.0847156907759337e-06,
1558
+ "loss": 0.1593,
1559
+ "step": 23600
1560
+ },
1561
+ {
1562
+ "epoch": 1.74,
1563
+ "learning_rate": 2.92120474305353e-06,
1564
+ "loss": 0.1622,
1565
+ "step": 23700
1566
+ },
1567
+ {
1568
+ "epoch": 1.75,
1569
+ "learning_rate": 2.7619574298686577e-06,
1570
+ "loss": 0.1653,
1571
+ "step": 23800
1572
+ },
1573
+ {
1574
+ "epoch": 1.75,
1575
+ "learning_rate": 2.6069949169895127e-06,
1576
+ "loss": 0.1637,
1577
+ "step": 23900
1578
+ },
1579
+ {
1580
+ "epoch": 1.76,
1581
+ "learning_rate": 2.4578229939112028e-06,
1582
+ "loss": 0.1597,
1583
+ "step": 24000
1584
+ },
1585
+ {
1586
+ "epoch": 1.76,
1587
+ "eval_loss": 0.11753135174512863,
1588
+ "eval_runtime": 1040.817,
1589
+ "eval_samples_per_second": 15.379,
1590
+ "eval_steps_per_second": 1.923,
1591
+ "eval_wer": 0.11268846279908226,
1592
+ "step": 24000
1593
+ },
1594
+ {
1595
+ "epoch": 1.77,
1596
+ "learning_rate": 2.311447946777479e-06,
1597
+ "loss": 0.1599,
1598
+ "step": 24100
1599
+ },
1600
+ {
1601
+ "epoch": 1.78,
1602
+ "learning_rate": 2.1694175777527574e-06,
1603
+ "loss": 0.1628,
1604
+ "step": 24200
1605
+ },
1606
+ {
1607
+ "epoch": 1.78,
1608
+ "learning_rate": 2.0317507642787156e-06,
1609
+ "loss": 0.1638,
1610
+ "step": 24300
1611
+ },
1612
+ {
1613
+ "epoch": 1.79,
1614
+ "learning_rate": 1.898465803831184e-06,
1615
+ "loss": 0.1651,
1616
+ "step": 24400
1617
+ },
1618
+ {
1619
+ "epoch": 1.8,
1620
+ "learning_rate": 1.7695804114881745e-06,
1621
+ "loss": 0.1629,
1622
+ "step": 24500
1623
+ },
1624
+ {
1625
+ "epoch": 1.8,
1626
+ "learning_rate": 1.6451117175753708e-06,
1627
+ "loss": 0.1699,
1628
+ "step": 24600
1629
+ },
1630
+ {
1631
+ "epoch": 1.81,
1632
+ "learning_rate": 1.5250762653892972e-06,
1633
+ "loss": 0.1578,
1634
+ "step": 24700
1635
+ },
1636
+ {
1637
+ "epoch": 1.82,
1638
+ "learning_rate": 1.4094900089985423e-06,
1639
+ "loss": 0.1648,
1640
+ "step": 24800
1641
+ },
1642
+ {
1643
+ "epoch": 1.83,
1644
+ "learning_rate": 1.2983683111232683e-06,
1645
+ "loss": 0.1607,
1646
+ "step": 24900
1647
+ },
1648
+ {
1649
+ "epoch": 1.83,
1650
+ "learning_rate": 1.1917259410933516e-06,
1651
+ "loss": 0.1593,
1652
+ "step": 25000
1653
+ },
1654
+ {
1655
+ "epoch": 1.84,
1656
+ "learning_rate": 1.0895770728853425e-06,
1657
+ "loss": 0.159,
1658
+ "step": 25100
1659
+ },
1660
+ {
1661
+ "epoch": 1.85,
1662
+ "learning_rate": 9.919352832386174e-07,
1663
+ "loss": 0.1608,
1664
+ "step": 25200
1665
+ },
1666
+ {
1667
+ "epoch": 1.86,
1668
+ "learning_rate": 8.988135498508481e-07,
1669
+ "loss": 0.1619,
1670
+ "step": 25300
1671
+ },
1672
+ {
1673
+ "epoch": 1.86,
1674
+ "learning_rate": 8.102242496531358e-07,
1675
+ "loss": 0.163,
1676
+ "step": 25400
1677
+ },
1678
+ {
1679
+ "epoch": 1.87,
1680
+ "learning_rate": 7.261791571649655e-07,
1681
+ "loss": 0.1619,
1682
+ "step": 25500
1683
+ },
1684
+ {
1685
+ "epoch": 1.87,
1686
+ "eval_loss": 0.11700794845819473,
1687
+ "eval_runtime": 1036.7542,
1688
+ "eval_samples_per_second": 15.44,
1689
+ "eval_steps_per_second": 1.93,
1690
+ "eval_wer": 0.11198514148366656,
1691
+ "step": 25500
1692
+ },
1693
+ {
1694
+ "epoch": 1.88,
1695
+ "learning_rate": 6.466894429292585e-07,
1696
+ "loss": 0.1627,
1697
+ "step": 25600
1698
+ },
1699
+ {
1700
+ "epoch": 1.89,
1701
+ "learning_rate": 5.717656720276581e-07,
1702
+ "loss": 0.165,
1703
+ "step": 25700
1704
+ },
1705
+ {
1706
+ "epoch": 1.89,
1707
+ "learning_rate": 5.014178026763216e-07,
1708
+ "loss": 0.1685,
1709
+ "step": 25800
1710
+ },
1711
+ {
1712
+ "epoch": 1.9,
1713
+ "learning_rate": 4.356551849023648e-07,
1714
+ "loss": 0.1632,
1715
+ "step": 25900
1716
+ },
1717
+ {
1718
+ "epoch": 1.91,
1719
+ "learning_rate": 3.7448655930113146e-07,
1720
+ "loss": 0.164,
1721
+ "step": 26000
1722
+ },
1723
+ {
1724
+ "epoch": 1.91,
1725
+ "learning_rate": 3.179200558744649e-07,
1726
+ "loss": 0.1649,
1727
+ "step": 26100
1728
+ },
1729
+ {
1730
+ "epoch": 1.92,
1731
+ "learning_rate": 2.6596319295015436e-07,
1732
+ "loss": 0.1583,
1733
+ "step": 26200
1734
+ },
1735
+ {
1736
+ "epoch": 1.93,
1737
+ "learning_rate": 2.1862287618264806e-07,
1738
+ "loss": 0.1629,
1739
+ "step": 26300
1740
+ },
1741
+ {
1742
+ "epoch": 1.94,
1743
+ "learning_rate": 1.7630967021918575e-07,
1744
+ "loss": 0.1608,
1745
+ "step": 26400
1746
+ },
1747
+ {
1748
+ "epoch": 1.94,
1749
+ "learning_rate": 1.3817439628416527e-07,
1750
+ "loss": 0.1629,
1751
+ "step": 26500
1752
+ },
1753
+ {
1754
+ "epoch": 1.95,
1755
+ "learning_rate": 1.0467265308166828e-07,
1756
+ "loss": 0.1656,
1757
+ "step": 26600
1758
+ },
1759
+ {
1760
+ "epoch": 1.96,
1761
+ "learning_rate": 7.58088933720985e-08,
1762
+ "loss": 0.1638,
1763
+ "step": 26700
1764
+ },
1765
+ {
1766
+ "epoch": 1.97,
1767
+ "learning_rate": 5.158695347542152e-08,
1768
+ "loss": 0.164,
1769
+ "step": 26800
1770
+ },
1771
+ {
1772
+ "epoch": 1.97,
1773
+ "learning_rate": 3.2010052761280434e-08,
1774
+ "loss": 0.1632,
1775
+ "step": 26900
1776
+ },
1777
+ {
1778
+ "epoch": 1.98,
1779
+ "learning_rate": 1.708079322109368e-08,
1780
+ "loss": 0.1664,
1781
+ "step": 27000
1782
+ },
1783
+ {
1784
+ "epoch": 1.98,
1785
+ "eval_loss": 0.11697087436914444,
1786
+ "eval_runtime": 1037.0691,
1787
+ "eval_samples_per_second": 15.435,
1788
+ "eval_steps_per_second": 1.929,
1789
+ "eval_wer": 0.11169152190538621,
1790
+ "step": 27000
1791
+ }
1792
+ ],
1793
+ "max_steps": 27260,
1794
+ "num_train_epochs": 2,
1795
+ "total_flos": 4.0022174178965815e+20,
1796
+ "trial_name": null,
1797
+ "trial_params": null
1798
+ }
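The block above is the tail of the trainer state added in this commit: step-wise training entries (epoch, learning_rate, loss, step) with an evaluation entry (eval_loss, eval_wer, eval_runtime, ...) every 1,500 steps, followed by summary fields (max_steps 27260, num_train_epochs 2, total_flos). As a minimal sketch, assuming the file is saved under the usual Hugging Face Trainer name trainer_state.json with the standard "log_history" key, the WER curve can be pulled out like this:

```python
import json

# Minimal sketch: read the trainer state shown above and list the evaluation WER per step.
# Assumes the JSON is stored as "trainer_state.json" (the usual Trainer file name) and
# follows the standard layout with a top-level "log_history" list.
with open("trainer_state.json") as f:
    state = json.load(f)

eval_points = [
    (entry["step"], entry["eval_wer"])
    for entry in state["log_history"]
    if "eval_wer" in entry
]

for step, wer in eval_points:
    print(f"step {step:>6}: WER {wer:.4f}")

best_step, best_wer = min(eval_points, key=lambda p: p[1])
print(f"lowest eval WER: {best_wer:.4f} at step {best_step}")
```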
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33687b9bde339fb2a61ab28ad3b77a61a2b72c0f63bd474a7eea4c3152b86aa6
3
+ size 2991
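training_args.bin is committed here as a Git LFS pointer; in the usual Trainer layout the 2,991-byte payload it points at is the pickled TrainingArguments object. A rough sketch of inspecting it, assuming the LFS object has been fetched and a compatible transformers version is importable:

```python
import torch

# Sketch: the Trainer saves its TrainingArguments with torch.save into training_args.bin.
# weights_only=False is needed on recent PyTorch releases to unpickle a full Python object;
# on older releases the argument can simply be dropped.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)
```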
vocab.json ADDED
@@ -0,0 +1 @@
1
+ {"=": 1, "@": 2, "[": 3, "]": 4, "_": 5, "`": 6, "a": 7, "b": 8, "c": 9, "d": 10, "e": 11, "f": 12, "g": 13, "h": 14, "i": 15, "j": 16, "k": 17, "l": 18, "m": 19, "n": 20, "o": 21, "p": 22, "q": 23, "r": 24, "s": 25, "t": 26, "u": 27, "v": 28, "w": 29, "x": 30, "y": 31, "z": 32, "¡": 33, "§": 34, "«": 35, "°": 36, "´": 37, "µ": 38, "·": 39, "»": 40, "×": 41, "ß": 42, "à": 43, "á": 44, "â": 45, "ã": 46, "ä": 47, "å": 48, "æ": 49, "ç": 50, "è": 51, "é": 52, "ê": 53, "ë": 54, "ì": 55, "í": 56, "î": 57, "ï": 58, "ð": 59, "ñ": 60, "ò": 61, "ó": 62, "ô": 63, "õ": 64, "ö": 65, "ø": 66, "ù": 67, "ú": 68, "û": 69, "ü": 70, "ý": 71, "þ": 72, "ā": 73, "ă": 74, "ą": 75, "ć": 76, "č": 77, "ď": 78, "đ": 79, "ē": 80, "ė": 81, "ę": 82, "ě": 83, "ğ": 84, "ġ": 85, "ħ": 86, "ī": 87, "ı": 88, "ł": 89, "ń": 90, "ņ": 91, "ň": 92, "ō": 93, "ŏ": 94, "ő": 95, "œ": 96, "ř": 97, "ś": 98, "ş": 99, "š": 100, "ť": 101, "ū": 102, "ů": 103, "ź": 104, "ż": 105, "ž": 106, "ơ": 107, "ǐ": 108, "ǔ": 109, "ș": 110, "ț": 111, "ə": 112, "ʻ": 113, "ʾ": 114, "ʿ": 115, "̆": 116, "̇": 117, "̥": 118, "а": 119, "в": 120, "е": 121, "и": 122, "к": 123, "м": 124, "о": 125, "р": 126, "с": 127, "ф": 128, "ч": 129, "ш": 130, "ѹ": 131, "א": 132, "ב": 133, "נ": 134, "ע": 135, "ש": 136, "་": 137, "ན": 138, "ḫ": 139, "ṟ": 140, "ṣ": 141, "ṭ": 142, "ạ": 143, "ả": 144, "ắ": 145, "ằ": 146, "ế": 147, "ễ": 148, "ệ": 149, "ọ": 150, "ồ": 151, "ộ": 152, "ụ": 153, "ứ": 154, "‑": 155, "‚": 156, "„": 157, "‟": 158, "′": 159, "″": 160, "‹": 161, "›": 162, "→": 163, "−": 164, "≡": 165, "⟨": 166, "⟩": 167, "カ": 168, "东": 169, "临": 170, "乡": 171, "关": 172, "合": 173, "城": 174, "孙": 175, "尣": 176, "幺": 177, "支": 178, "比": 179, "毛": 180, "泽": 181, "無": 182, "生": 183, "臣": 184, "辶": 185, "道": 186, "镇": 187, "黃": 188, "|": 0, "[UNK]": 189, "[PAD]": 190}