AndrewMcDowell committed on
Commit
802dee6
1 Parent(s): 5262dde

Training in progress, step 1000

.ipynb_checkpoints/run_speech_recognition_ctc-checkpoint.py ADDED
@@ -0,0 +1,754 @@
+ #!/usr/bin/env python
+ # coding=utf-8
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """ Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition"""
+
+ import functools
+ import json
+ import logging
+ import os
+ import re
+ import sys
+ import warnings
+ from dataclasses import dataclass, field
+ from typing import Dict, List, Optional, Union
+
+ import datasets
+ import numpy as np
+ import torch
+ from datasets import DatasetDict, load_dataset, load_metric
+
+ import transformers
+ from transformers import (
+     AutoConfig,
+     AutoFeatureExtractor,
+     AutoModelForCTC,
+     AutoProcessor,
+     AutoTokenizer,
+     HfArgumentParser,
+     Trainer,
+     TrainingArguments,
+     Wav2Vec2Processor,
+     set_seed,
+ )
+ from transformers.trainer_utils import get_last_checkpoint, is_main_process
+ from transformers.utils import check_min_version
+ from transformers.utils.versions import require_version
+
+
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risk.
+ check_min_version("4.16.0.dev0")
+
+ require_version("datasets>=1.13.3", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def list_field(default=None, metadata=None):
+     return field(default_factory=lambda: default, metadata=metadata)
+
+
+ @dataclass
+ class ModelArguments:
+     """
+     Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+     """
+
+     model_name_or_path: str = field(
+         metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+     )
+     tokenizer_name_or_path: Optional[str] = field(
+         default=None,
+         metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
+     )
+     cache_dir: Optional[str] = field(
+         default=None,
+         metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+     )
+     freeze_feature_encoder: bool = field(
+         default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
+     )
+     attention_dropout: float = field(
+         default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
+     )
+     activation_dropout: float = field(
+         default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
+     )
+     feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
+     hidden_dropout: float = field(
+         default=0.0,
+         metadata={
+             "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
+         },
+     )
+     final_dropout: float = field(
+         default=0.0,
+         metadata={"help": "The dropout probability for the final projection layer."},
+     )
+     mask_time_prob: float = field(
+         default=0.05,
+         metadata={
+             "help": "Probability of each feature vector along the time axis to be chosen as the start of the vector "
+             "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature "
+             "vectors will be masked along the time axis."
+         },
+     )
+     mask_time_length: int = field(
+         default=10,
+         metadata={"help": "Length of vector span to mask along the time axis."},
+     )
+     mask_feature_prob: float = field(
+         default=0.0,
+         metadata={
+             "help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector "
+             "span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the feature axis."
+         },
+     )
+     mask_feature_length: int = field(
+         default=10,
+         metadata={"help": "Length of vector span to mask along the feature axis."},
+     )
+     layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
+     ctc_loss_reduction: Optional[str] = field(
+         default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
+     )
+
+
+ @dataclass
+ class DataTrainingArguments:
+     """
+     Arguments pertaining to what data we are going to input our model for training and eval.
+
+     Using `HfArgumentParser` we can turn this class
+     into argparse arguments to be able to specify them on
+     the command line.
+     """
+
+     dataset_name: str = field(
+         metadata={"help": "The name of the dataset to use (via the datasets library)."}
+     )
+     dataset_config_name: str = field(
+         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+     )
+     train_split_name: str = field(
+         default="train+validation",
+         metadata={
+             "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train+validation'"
+         },
+     )
+     eval_split_name: str = field(
+         default="test",
+         metadata={
+             "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
+         },
+     )
+     audio_column_name: str = field(
+         default="audio",
+         metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
+     )
+     text_column_name: str = field(
+         default="text",
+         metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
+     )
+     overwrite_cache: bool = field(
+         default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
+     )
+     preprocessing_num_workers: Optional[int] = field(
+         default=None,
+         metadata={"help": "The number of processes to use for the preprocessing."},
+     )
+     max_train_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+             "value if set."
+         },
+     )
+     max_eval_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+             "value if set."
+         },
+     )
+     chars_to_ignore: Optional[List[str]] = list_field(
+         default=None,
+         metadata={"help": "A list of characters to remove from the transcripts."},
+     )
+     eval_metrics: List[str] = list_field(
+         default=["wer"],
+         metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
+     )
+     max_duration_in_seconds: float = field(
+         default=20.0,
+         metadata={
+             "help": "Filter audio files that are longer than `max_duration_in_seconds` seconds"
+         },
+     )
+     min_duration_in_seconds: float = field(
+         default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
+     )
+     preprocessing_only: bool = field(
+         default=False,
+         metadata={
+             "help": "Whether to only do data preprocessing and skip training. "
+             "This is especially useful when data preprocessing errors out in distributed training due to timeout. "
+             "In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
+             "so that the cached datasets can consequently be loaded in distributed training"
+         },
+     )
+     use_auth_token: bool = field(
+         default=False,
+         metadata={
+             "help": "If :obj:`True`, will use the token generated when running "
+             ":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
+         },
+     )
+     unk_token: str = field(
+         default="[UNK]",
+         metadata={"help": "The unk token for the tokenizer"},
+     )
+     pad_token: str = field(
+         default="[PAD]",
+         metadata={"help": "The padding token for the tokenizer"},
+     )
+     word_delimiter_token: str = field(
+         default="|",
+         metadata={"help": "The word delimiter token for the tokenizer"},
+     )
+     phoneme_language: Optional[str] = field(
+         default=None,
+         metadata={
+             "help": "The target language that should be passed to the tokenizer for tokenization. Note that"
+             " this is only relevant if the model classifies the"
+             " input audio to a sequence of phonemes."
+         },
+     )
+
+
+ @dataclass
+ class DataCollatorCTCWithPadding:
+     """
+     Data collator that will dynamically pad the inputs received.
+     Args:
+         processor (:class:`~transformers.AutoProcessor`)
+             The processor used for processing the data.
+         padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+             Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+             among:
+             * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+               sequence is provided).
+             * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+               maximum acceptable input length for the model if that argument is not provided.
+             * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
+               different lengths).
+         max_length (:obj:`int`, `optional`):
+             Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
+         max_length_labels (:obj:`int`, `optional`):
+             Maximum length of the ``labels`` returned list and optionally padding length (see above).
+         pad_to_multiple_of (:obj:`int`, `optional`):
+             If set will pad the sequence to a multiple of the provided value.
+             This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+             7.5 (Volta).
+     """
+
+     processor: AutoProcessor
+     padding: Union[bool, str] = "longest"
+     pad_to_multiple_of: Optional[int] = None
+     pad_to_multiple_of_labels: Optional[int] = None
+
+     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+         # split inputs and labels since they have to be of different lengths and need
+         # different padding methods
+         input_features = [{"input_values": feature["input_values"]} for feature in features]
+         label_features = [{"input_ids": feature["labels"]} for feature in features]
+
+         batch = self.processor.pad(
+             input_features,
+             padding=self.padding,
+             pad_to_multiple_of=self.pad_to_multiple_of,
+             return_tensors="pt",
+         )
+
+         with self.processor.as_target_processor():
+             labels_batch = self.processor.pad(
+                 label_features,
+                 padding=self.padding,
+                 pad_to_multiple_of=self.pad_to_multiple_of_labels,
+                 return_tensors="pt",
+             )
+
+         # replace padding with -100 to ignore loss correctly
+         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
+
+         batch["labels"] = labels
+
+         return batch
+
+
+ def create_vocabulary_from_data(
+     datasets: DatasetDict,
+     word_delimiter_token: Optional[str] = None,
+     unk_token: Optional[str] = None,
+     pad_token: Optional[str] = None,
+ ):
+     # Given training and test labels create vocabulary
+     def extract_all_chars(batch):
+         all_text = " ".join(batch["target_text"])
+         vocab = list(set(all_text))
+         return {"vocab": [vocab], "all_text": [all_text]}
+
+     vocabs = datasets.map(
+         extract_all_chars,
+         batched=True,
+         batch_size=-1,
+         keep_in_memory=True,
+         remove_columns=datasets["train"].column_names,
+         load_from_cache_file=False
+     )
+
+     # take union of all unique characters in each dataset
+     vocab_set = functools.reduce(
+         lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
+     )
+
+     vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}
+
+     # replace white space with delimiter token
+     if word_delimiter_token is not None:
+         vocab_dict[word_delimiter_token] = vocab_dict[" "]
+         del vocab_dict[" "]
+
+     # add unk and pad token
+     if unk_token is not None:
+         vocab_dict[unk_token] = len(vocab_dict)
+
+     if pad_token is not None:
+         vocab_dict[pad_token] = len(vocab_dict)
+
+     return vocab_dict
+
+
+ def main():
+     # See all possible arguments in src/transformers/training_args.py
+     # or by passing the --help flag to this script.
+     # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+         # If we pass only one argument to the script and it's the path to a json file,
+         # let's parse it to get our arguments.
+         model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+     else:
+         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+     # Detecting last checkpoint.
+     last_checkpoint = None
+     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+         last_checkpoint = get_last_checkpoint(training_args.output_dir)
+         if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+             raise ValueError(
+                 f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                 "Use --overwrite_output_dir to overcome."
+             )
+         elif last_checkpoint is not None:
+             logger.info(
+                 f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                 "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+             )
+
+     # Setup logging
+     logging.basicConfig(
+         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+         datefmt="%m/%d/%Y %H:%M:%S",
+         handlers=[logging.StreamHandler(sys.stdout)],
+     )
+     logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
+
+     # Log on each process the small summary:
+     logger.warning(
+         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+         f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+     )
+     # Set the verbosity to info of the Transformers logger (on main process only):
+     if is_main_process(training_args.local_rank):
+         transformers.utils.logging.set_verbosity_info()
+     logger.info("Training/evaluation parameters %s", training_args)
+
+     # Set seed before initializing model.
+     set_seed(training_args.seed)
+
+     # 1. First, let's load the dataset
+     raw_datasets = DatasetDict()
+
+     if training_args.do_train:
+         raw_datasets["train"] = load_dataset(
+             data_args.dataset_name,
+             data_args.dataset_config_name,
+             split=data_args.train_split_name,
+             use_auth_token=data_args.use_auth_token,
+         )
+
+         if data_args.audio_column_name not in raw_datasets["train"].column_names:
+             raise ValueError(
+                 f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
+                 "Make sure to set `--audio_column_name` to the correct audio column - one of "
+                 f"{', '.join(raw_datasets['train'].column_names)}."
+             )
+
+         if data_args.text_column_name not in raw_datasets["train"].column_names:
+             raise ValueError(
+                 f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
+                 "Make sure to set `--text_column_name` to the correct text column - one of "
+                 f"{', '.join(raw_datasets['train'].column_names)}."
+             )
+
+         if data_args.max_train_samples is not None:
+             raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
+
+     if training_args.do_eval:
+         raw_datasets["eval"] = load_dataset(
+             data_args.dataset_name,
+             data_args.dataset_config_name,
+             split=data_args.eval_split_name,
+             use_auth_token=data_args.use_auth_token,
+         )
+
+         if data_args.max_eval_samples is not None:
+             raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
+
+     # 2. We remove some special characters from the datasets
+     # that make training complicated and do not help in transcribing the speech
+     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
+     # that could be easily picked up by the model
+     odd_chars_regex_string = '$&()*+.\/=@\[\]_`¡§«°´µ·»×àáâãåæçèéêëìíîïðñòóôõøùúûýþāăąćčďđēėęěğġħīıłńņňōŏőœřśşšťūůźżžơǐǔșțəʻʾʿ̥̆̇авеикморсфчшѹאבנעש་ནḫṟṣṭạảắằếễệọồộụứ‑‚„‟′″‹›→−≡⟨⟩カ东临乡关合城孙尣幺支比毛泽無生臣辶道镇黃'
+
+     chars_to_ignore_regex = (
+         f'[{"".join(data_args.chars_to_ignore) + odd_chars_regex_string}]' if data_args.chars_to_ignore is not None else None
+     )
+     text_column_name = data_args.text_column_name
+
+     def remove_special_characters(batch):
+         if chars_to_ignore_regex is not None:
+             batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
+         else:
+             batch["target_text"] = batch[text_column_name].lower() + " "
+         return batch
+
+     with training_args.main_process_first(desc="dataset map special characters removal"):
+         raw_datasets = raw_datasets.map(
+             remove_special_characters,
+             remove_columns=[text_column_name],
+             desc="remove special characters from datasets",
+             load_from_cache_file=False
+         )
+
+     # save special tokens for tokenizer
+     word_delimiter_token = data_args.word_delimiter_token
+     unk_token = data_args.unk_token
+     pad_token = data_args.pad_token
+
+     # 3. Next, let's load the config as we might need it to create
+     # the tokenizer
+     # load config
+     config = AutoConfig.from_pretrained(
+         model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
+     )
+
+     # 4. Next, if no tokenizer file is defined,
+     # we create the vocabulary of the model by extracting all unique characters from
+     # the training and evaluation datasets
+     # We need to make sure that only first rank saves vocabulary
+     # make sure all processes wait until vocab is created
+     tokenizer_name_or_path = model_args.tokenizer_name_or_path
+     tokenizer_kwargs = {}
+     if tokenizer_name_or_path is None:
+         # save vocab in training output dir
+         tokenizer_name_or_path = training_args.output_dir
+
+         vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
+
+         with training_args.main_process_first():
+             if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
+                 os.remove(vocab_file)
+
+         with training_args.main_process_first(desc="dataset map vocabulary creation"):
+             if not os.path.isfile(vocab_file):
+                 os.makedirs(tokenizer_name_or_path, exist_ok=True)
+                 vocab_dict = create_vocabulary_from_data(
+                     raw_datasets,
+                     word_delimiter_token=word_delimiter_token,
+                     unk_token=unk_token,
+                     pad_token=pad_token,
+                 )
+
+                 # save vocab dict to be loaded into tokenizer
+                 with open(vocab_file, "w") as file:
+                     json.dump(vocab_dict, file)
+
+         # if tokenizer has just been created
+         # it is defined by `tokenizer_class` if present in config else by `model_type`
+         tokenizer_kwargs = {
+             "config": config if config.tokenizer_class is not None else None,
+             "tokenizer_type": config.model_type if config.tokenizer_class is None else None,
+             "unk_token": unk_token,
+             "pad_token": pad_token,
+             "word_delimiter_token": word_delimiter_token,
+         }
+
+     # 5. Now we can instantiate the feature extractor, tokenizer and model
+     # Note for distributed training, the .from_pretrained methods guarantee that only
+     # one local process can concurrently download model & vocab.
+
+     # load feature_extractor and tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(
+         tokenizer_name_or_path,
+         use_auth_token=data_args.use_auth_token,
+         **tokenizer_kwargs,
+     )
+     feature_extractor = AutoFeatureExtractor.from_pretrained(
+         model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
+     )
+
+     # adapt config
+     config.update(
+         {
+             "feat_proj_dropout": model_args.feat_proj_dropout,
+             "attention_dropout": model_args.attention_dropout,
+             "hidden_dropout": model_args.hidden_dropout,
+             "final_dropout": model_args.final_dropout,
+             "mask_time_prob": model_args.mask_time_prob,
+             "mask_time_length": model_args.mask_time_length,
+             "mask_feature_prob": model_args.mask_feature_prob,
+             "mask_feature_length": model_args.mask_feature_length,
+             "gradient_checkpointing": training_args.gradient_checkpointing,
+             "layerdrop": model_args.layerdrop,
+             "ctc_loss_reduction": model_args.ctc_loss_reduction,
+             "pad_token_id": tokenizer.pad_token_id,
+             "vocab_size": len(tokenizer),
+             "activation_dropout": model_args.activation_dropout,
+         }
+     )
+
+     # create model
+     model = AutoModelForCTC.from_pretrained(
+         model_args.model_name_or_path,
+         cache_dir=model_args.cache_dir,
+         config=config,
+         use_auth_token=data_args.use_auth_token,
+     )
+
+     # freeze encoder
+     if model_args.freeze_feature_encoder:
+         model.freeze_feature_encoder()
+
+     # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
+     # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
+     # so that we just need to set the correct target sampling rate and normalize the input
+     # via the `feature_extractor`
+
+     # make sure that dataset decodes audio with correct sampling rate
+     dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
+     if dataset_sampling_rate != feature_extractor.sampling_rate:
+         raw_datasets = raw_datasets.cast_column(
+             data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
+         )
+
+     # derive max & min input length for sample rate & max duration
+     max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
+     min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
+     audio_column_name = data_args.audio_column_name
+     num_workers = data_args.preprocessing_num_workers
+
+     # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
+     phoneme_language = data_args.phoneme_language
+
+     # Preprocessing the datasets.
+     # We need to read the audio files as arrays and tokenize the targets.
+     def prepare_dataset(batch):
+         # load audio
+         sample = batch[audio_column_name]
+
+         inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
+         batch["input_values"] = inputs.input_values[0]
+         batch["input_length"] = len(batch["input_values"])
+
+         # encode targets
+         additional_kwargs = {}
+         if phoneme_language is not None:
+             additional_kwargs["phonemizer_lang"] = phoneme_language
+
+         batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
+         return batch
+
+     with training_args.main_process_first(desc="dataset map preprocessing"):
+
+         def is_text_still_present(string):
+             return len(string) > 5
+
+         # drop rows whose target text has 5 or fewer characters after cleaning.
+         raw_datasets = raw_datasets.filter(
+             is_text_still_present,
+             num_proc=num_workers,
+             input_columns=["target_text"],
+         )
+
+         vectorized_datasets = raw_datasets.map(
+             prepare_dataset,
+             remove_columns=next(iter(raw_datasets.values())).column_names,
+             num_proc=num_workers,
+             desc="preprocess datasets",
+         )
+
+         def is_audio_in_length_range(length):
+             return length > min_input_length and length < max_input_length
+
+         # filter data that is shorter than min_input_length or longer than max_input_length
+         vectorized_datasets = vectorized_datasets.filter(
+             is_audio_in_length_range,
+             num_proc=num_workers,
+             input_columns=["input_length"],
+         )
+
+     # 7. Next, we can prepare the training.
+     # Let's use word error rate (WER) as our evaluation metric,
+     # instantiate a data collator and the trainer
+
+     # Define evaluation metrics during training, *i.e.* word error rate, character error rate
+     eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}
+
+     # for large datasets it is advised to run the preprocessing on a
+     # single machine first with ``args.preprocessing_only`` since there will most likely
+     # be a timeout when running the script in distributed mode.
+     # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
+     # cached dataset
+     if data_args.preprocessing_only:
+         logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
+         return
+
+     def compute_metrics(pred):
+         pred_logits = pred.predictions
+         pred_ids = np.argmax(pred_logits, axis=-1)
+
+         pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
+
+         pred_str = tokenizer.batch_decode(pred_ids)
+         # we do not want to group tokens when computing the metrics
+         label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
+
+         metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
+
+         return metrics
+
+     # Now save everything to be able to create a single processor later
+     if is_main_process(training_args.local_rank):
+         # save feature extractor, tokenizer and config
+         feature_extractor.save_pretrained(training_args.output_dir)
+         tokenizer.save_pretrained(training_args.output_dir)
+         config.save_pretrained(training_args.output_dir)
+
+     try:
+         processor = AutoProcessor.from_pretrained(training_args.output_dir)
+     except (OSError, KeyError):
+         warnings.warn(
+             "Loading a processor from a feature extractor config that does not"
+             " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
+             " attribute to your `preprocessor_config.json` file to suppress this warning: "
+             " `'processor_class': 'Wav2Vec2Processor'`",
+             FutureWarning,
+         )
+         processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)
+
+     # Instantiate custom data collator
+     data_collator = DataCollatorCTCWithPadding(processor=processor)
+
+     # Initialize Trainer
+     trainer = Trainer(
+         model=model,
+         data_collator=data_collator,
+         args=training_args,
+         compute_metrics=compute_metrics,
+         train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
+         eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
+         tokenizer=feature_extractor,
+     )
+
+     # 8. Finally, we can start training
+
+     # Training
+     if training_args.do_train:
+
+         # use last checkpoint if it exists
+         if last_checkpoint is not None:
+             checkpoint = last_checkpoint
+         elif os.path.isdir(model_args.model_name_or_path):
+             checkpoint = model_args.model_name_or_path
+         else:
+             checkpoint = None
+
+         train_result = trainer.train(resume_from_checkpoint=checkpoint)
+         trainer.save_model()
+
+         metrics = train_result.metrics
+         max_train_samples = (
+             data_args.max_train_samples
+             if data_args.max_train_samples is not None
+             else len(vectorized_datasets["train"])
+         )
+         metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))
+
+         trainer.log_metrics("train", metrics)
+         trainer.save_metrics("train", metrics)
+         trainer.save_state()
+
+     # Evaluation
+     results = {}
+     if training_args.do_eval:
+         logger.info("*** Evaluate ***")
+         metrics = trainer.evaluate()
+         max_eval_samples = (
+             data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
+         )
+         metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
+
+         trainer.log_metrics("eval", metrics)
+         trainer.save_metrics("eval", metrics)
+
+     # Write model card and (optionally) push to hub
+     config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
+     kwargs = {
+         "finetuned_from": model_args.model_name_or_path,
+         "tasks": "speech-recognition",
+         "tags": ["automatic-speech-recognition", data_args.dataset_name],
+         "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
+         "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
+     }
+     if "common_voice" in data_args.dataset_name:
+         kwargs["language"] = config_name
+
+     if training_args.push_to_hub:
+         trainer.push_to_hub(**kwargs)
+     else:
+         trainer.create_model_card(**kwargs)
+
+     return results
+
+
+ if __name__ == "__main__":
+     main()
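
Note on the collator in the file above: the key step is `masked_fill(labels_batch.attention_mask.ne(1), -100)`, which overwrites padded label positions with -100, the index PyTorch loss functions ignore. A minimal standalone sketch of just that masking step (hypothetical tensors, not part of the commit):

import torch

# hypothetical batch: 0 is the pad id; the second sequence needs no padding
labels = torch.tensor([[5, 9, 2, 0, 0], [7, 1, 3, 4, 6]])
attention_mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])

# same call as in DataCollatorCTCWithPadding.__call__
masked = labels.masked_fill(attention_mask.ne(1), -100)
print(masked)
# tensor([[   5,    9,    2, -100, -100],
#         [   7,    1,    3,    4,    6]])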
.ipynb_checkpoints/run_training-checkpoint.sh CHANGED
@@ -3,7 +3,8 @@ python run_speech_recognition_ctc.py \
  --model_name_or_path="facebook/wav2vec2-xls-r-1b" \
  --dataset_config_name="de" \
  --output_dir="./" \
- --num_train_epochs="2" \
+ --overwrite_output_dir \
+ --num_train_epochs="2.5" \
  --per_device_train_batch_size="8" \
  --per_device_eval_batch_size="8" \
  --gradient_accumulation_steps="4" \
.ipynb_checkpoints/special_tokens_map-checkpoint.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
.ipynb_checkpoints/vocab-checkpoint.json ADDED
@@ -0,0 +1 @@
+ {"!": 1, "\"": 2, "$": 3, "%": 4, "&": 5, "'": 6, "(": 7, ")": 8, "*": 9, "+": 10, ",": 11, "-": 12, ".": 13, "/": 14, ":": 15, ";": 16, "=": 17, "?": 18, "@": 19, "[": 20, "]": 21, "_": 22, "`": 23, "a": 24, "b": 25, "c": 26, "d": 27, "e": 28, "f": 29, "g": 30, "h": 31, "i": 32, "j": 33, "k": 34, "l": 35, "m": 36, "n": 37, "o": 38, "p": 39, "q": 40, "r": 41, "s": 42, "t": 43, "u": 44, "v": 45, "w": 46, "x": 47, "y": 48, "z": 49, "\u00a1": 50, "\u00a7": 51, "\u00ab": 52, "\u00b0": 53, "\u00b4": 54, "\u00b5": 55, "\u00b7": 56, "\u00bb": 57, "\u00d7": 58, "\u00df": 59, "\u00e0": 60, "\u00e1": 61, "\u00e2": 62, "\u00e3": 63, "\u00e4": 64, "\u00e5": 65, "\u00e6": 66, "\u00e7": 67, "\u00e8": 68, "\u00e9": 69, "\u00ea": 70, "\u00eb": 71, "\u00ec": 72, "\u00ed": 73, "\u00ee": 74, "\u00ef": 75, "\u00f0": 76, "\u00f1": 77, "\u00f2": 78, "\u00f3": 79, "\u00f4": 80, "\u00f5": 81, "\u00f6": 82, "\u00f8": 83, "\u00f9": 84, "\u00fa": 85, "\u00fb": 86, "\u00fc": 87, "\u00fd": 88, "\u00fe": 89, "\u0101": 90, "\u0103": 91, "\u0105": 92, "\u0107": 93, "\u010d": 94, "\u010f": 95, "\u0111": 96, "\u0113": 97, "\u0117": 98, "\u0119": 99, "\u011b": 100, "\u011f": 101, "\u0121": 102, "\u0127": 103, "\u012b": 104, "\u0131": 105, "\u0142": 106, "\u0144": 107, "\u0146": 108, "\u0148": 109, "\u014d": 110, "\u014f": 111, "\u0151": 112, "\u0153": 113, "\u0159": 114, "\u015b": 115, "\u015f": 116, "\u0161": 117, "\u0165": 118, "\u016b": 119, "\u016f": 120, "\u017a": 121, "\u017c": 122, "\u017e": 123, "\u01a1": 124, "\u01d0": 125, "\u01d4": 126, "\u0219": 127, "\u021b": 128, "\u0259": 129, "\u02bb": 130, "\u02be": 131, "\u02bf": 132, "\u0306": 133, "\u0307": 134, "\u0325": 135, "\u0430": 136, "\u0432": 137, "\u0435": 138, "\u0438": 139, "\u043a": 140, "\u043c": 141, "\u043e": 142, "\u0440": 143, "\u0441": 144, "\u0444": 145, "\u0447": 146, "\u0448": 147, "\u0479": 148, "\u05d0": 149, "\u05d1": 150, "\u05e0": 151, "\u05e2": 152, "\u05e9": 153, "\u0f0b": 154, "\u0f53": 155, "\u1e2b": 156, "\u1e5f": 157, "\u1e63": 158, "\u1e6d": 159, "\u1ea1": 160, "\u1ea3": 161, "\u1eaf": 162, "\u1eb1": 163, "\u1ebf": 164, "\u1ec5": 165, "\u1ec7": 166, "\u1ecd": 167, "\u1ed3": 168, "\u1ed9": 169, "\u1ee5": 170, "\u1ee9": 171, "\u2011": 172, "\u2013": 173, "\u2014": 174, "\u2018": 175, "\u2019": 176, "\u201a": 177, "\u201c": 178, "\u201d": 179, "\u201e": 180, "\u201f": 181, "\u2026": 182, "\u2032": 183, "\u2033": 184, "\u2039": 185, "\u203a": 186, "\u2192": 187, "\u2212": 188, "\u2261": 189, "\u27e8": 190, "\u27e9": 191, "\u30ab": 192, "\u4e1c": 193, "\u4e34": 194, "\u4e61": 195, "\u5173": 196, "\u5408": 197, "\u57ce": 198, "\u5b59": 199, "\u5c23": 200, "\u5e7a": 201, "\u652f": 202, "\u6bd4": 203, "\u6bdb": 204, "\u6cfd": 205, "\u7121": 206, "\u751f": 207, "\u81e3": 208, "\u8fb6": 209, "\u9053": 210, "\u9547": 211, "\u9ec3": 212, "|": 0, "[UNK]": 213, "[PAD]": 214}
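
The vocabulary above maps each character to an id, with the word delimiter "|" at id 0 and [UNK]/[PAD] appended last, exactly as `create_vocabulary_from_data` builds it. A minimal sketch of loading such a file into a CTC tokenizer (assumes the JSON is saved locally as vocab.json):

from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",  # the character-to-id mapping shown above
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
# spaces are mapped to the word delimiter token "|"
print(tokenizer("hallo welt").input_ids)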
added_tokens.json CHANGED
@@ -1 +1 @@
- {"<s>": 191, "</s>": 192}
+ {"<s>": 70, "</s>": 71}
config.json CHANGED
@@ -76,7 +76,7 @@
   "num_hidden_layers": 48,
   "num_negatives": 100,
   "output_hidden_size": 1280,
-  "pad_token_id": 190,
+  "pad_token_id": 69,
   "proj_codevector_dim": 1024,
   "tdnn_dilation": [
     1,
@@ -102,6 +102,6 @@
   "torch_dtype": "float32",
   "transformers_version": "4.17.0.dev0",
   "use_weighted_layer_sum": false,
-  "vocab_size": 193,
+  "vocab_size": 72,
   "xvector_output_dim": 512
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c1ed4c2cebf9ed5304c0b72a8fc20e81595486f988de3f6e750ce22ad251c158
- size 3851301681
+ oid sha256:64a8d46236324825c647641375f6b303a64c6787dc4b05b5d6eb4f95910b8b10
+ size 3850681649
run_speech_recognition_ctc.py CHANGED
@@ -318,6 +318,7 @@ def create_vocabulary_from_data(
         batch_size=-1,
         keep_in_memory=True,
         remove_columns=datasets["train"].column_names,
+        load_from_cache_file=False
     )
 
     # take union of all unique characters in each dataset
@@ -434,8 +435,10 @@ def main():
     # that make training complicated and do not help in transcribing the speech
     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
     # that could be easily picked up by the model
+    odd_chars_regex_string = '$&()*+.\/=@\[\]_`¡§«°´µ·»×àáâãåæçèéêëìíîïðñòóôõøùúûýþāăąćčďđēėęěğġħīıłńņňōŏőœřśşšťūůźżžơǐǔșțəʻʾʿ̥̆̇авеикморсфчшѹאבנעש་ནḫṟṣṭạảắằếễệọồộụứ‑‚„‟′″‹›→−≡⟨⟩カ东临乡关合城孙尣幺支比毛泽無生臣辶道镇黃'
+
     chars_to_ignore_regex = (
-        f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
+        f'[{"".join(data_args.chars_to_ignore) + odd_chars_regex_string}]' if data_args.chars_to_ignore is not None else None
     )
     text_column_name = data_args.text_column_name
 
@@ -451,6 +454,7 @@ def main():
             remove_special_characters,
             remove_columns=[text_column_name],
             desc="remove special characters from datasets",
+            load_from_cache_file=False
         )
 
     # save special tokens for tokenizer
@@ -592,6 +596,17 @@ def main():
         return batch
 
     with training_args.main_process_first(desc="dataset map preprocessing"):
+
+        def is_text_still_present(string):
+            return len(string) > 5
+
+        # drop rows whose target text has 5 or fewer characters after cleaning.
+        raw_datasets = raw_datasets.filter(
+            is_text_still_present,
+            num_proc=num_workers,
+            input_columns=["target_text"],
+        )
+
         vectorized_datasets = raw_datasets.map(
             prepare_dataset,
             remove_columns=next(iter(raw_datasets.values())).column_names,
@@ -608,6 +623,8 @@ def main():
             num_proc=num_workers,
             input_columns=["input_length"],
         )
+
+
 
     # 7. Next, we can prepare the training.
     # Let's use word error rate (WER) as our evaluation metric,
run_training.sh CHANGED
@@ -3,7 +3,8 @@ python run_speech_recognition_ctc.py \
  --model_name_or_path="facebook/wav2vec2-xls-r-1b" \
  --dataset_config_name="de" \
  --output_dir="./" \
- --num_train_epochs="2" \
+ --overwrite_output_dir \
+ --num_train_epochs="2.5" \
  --per_device_train_batch_size="8" \
  --per_device_eval_batch_size="8" \
  --gradient_accumulation_steps="4" \
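
On the flag change above: `num_train_epochs` is a float field on `TrainingArguments`, so "2.5" is valid and training stops halfway through the third epoch. A quick sketch of how `HfArgumentParser` turns such flags into typed values (hypothetical argument list, not part of the commit):

from transformers import HfArgumentParser, TrainingArguments

parser = HfArgumentParser(TrainingArguments)
(training_args,) = parser.parse_args_into_dataclasses(
    ["--output_dir", "./", "--overwrite_output_dir", "--num_train_epochs", "2.5"]
)
print(training_args.num_train_epochs)      # 2.5
print(training_args.overwrite_output_dir)  # True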
special_tokens_map.json CHANGED
@@ -1 +1 @@
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
trainer_state.json DELETED
@@ -1,1900 +0,0 @@
- {
-   "best_metric": null,
-   "best_model_checkpoint": null,
-   "epoch": 1.9999816584435355,
-   "global_step": 27260,
-   "is_hyper_param_search": false,
-   "is_local_process_zero": true,
-   "is_world_process_zero": true,
-   "log_history": [
-     {
-       "epoch": 0.01,
-       "learning_rate": 3.675e-06,
-       "loss": 11.4989,
-       "step": 100
-     },
-     {
-       "epoch": 0.01,
-       "learning_rate": 7.425e-06,
-       "loss": 3.2394,
-       "step": 200
-     },
-     {
-       "epoch": 0.02,
-       "learning_rate": 1.1174999999999999e-05,
-       "loss": 3.0303,
-       "step": 300
-     },
-     {
-       "epoch": 0.03,
-       "learning_rate": 1.4925e-05,
-       "loss": 2.9052,
-       "step": 400
-     },
-     {
-       "epoch": 0.04,
-       "learning_rate": 1.8675e-05,
-       "loss": 2.1033,
-       "step": 500
-     },
-     {
-       "epoch": 0.04,
-       "learning_rate": 2.2424999999999996e-05,
-       "loss": 1.674,
-       "step": 600
-     },
-     {
-       "epoch": 0.05,
-       "learning_rate": 2.6174999999999996e-05,
-       "loss": 1.5568,
-       "step": 700
-     },
-     {
-       "epoch": 0.06,
-       "learning_rate": 2.9925e-05,
-       "loss": 1.4654,
-       "step": 800
-     },
-     {
-       "epoch": 0.07,
-       "learning_rate": 3.3675e-05,
-       "loss": 1.3031,
-       "step": 900
-     },
-     {
-       "epoch": 0.07,
-       "learning_rate": 3.7424999999999995e-05,
-       "loss": 1.1842,
-       "step": 1000
-     },
-     {
-       "epoch": 0.07,
-       "eval_loss": 0.44609957933425903,
-       "eval_runtime": 1053.3237,
-       "eval_samples_per_second": 15.197,
-       "eval_steps_per_second": 1.9,
-       "eval_wer": 0.49177182344586473,
-       "step": 1000
-     },
-     {
-       "epoch": 0.08,
-       "learning_rate": 4.1175e-05,
-       "loss": 1.1329,
-       "step": 1100
-     },
-     {
-       "epoch": 0.09,
-       "learning_rate": 4.4924999999999994e-05,
-       "loss": 1.1316,
-       "step": 1200
-     },
-     {
-       "epoch": 0.1,
-       "learning_rate": 4.8675e-05,
-       "loss": 1.1092,
-       "step": 1300
-     },
-     {
-       "epoch": 0.1,
-       "learning_rate": 5.2424999999999994e-05,
-       "loss": 1.1215,
-       "step": 1400
-     },
-     {
-       "epoch": 0.11,
-       "learning_rate": 5.6175e-05,
-       "loss": 1.1165,
-       "step": 1500
-     },
-     {
-       "epoch": 0.12,
-       "learning_rate": 5.9925e-05,
-       "loss": 1.0946,
-       "step": 1600
-     },
-     {
-       "epoch": 0.12,
-       "learning_rate": 6.367499999999999e-05,
-       "loss": 1.1189,
-       "step": 1700
-     },
-     {
-       "epoch": 0.13,
-       "learning_rate": 6.7425e-05,
-       "loss": 1.1175,
-       "step": 1800
-     },
-     {
-       "epoch": 0.14,
-       "learning_rate": 7.1175e-05,
-       "loss": 1.1254,
-       "step": 1900
-     },
-     {
-       "epoch": 0.15,
-       "learning_rate": 7.492499999999999e-05,
-       "loss": 1.1317,
-       "step": 2000
-     },
-     {
-       "epoch": 0.15,
-       "eval_loss": 0.2668535113334656,
-       "eval_runtime": 988.5751,
-       "eval_samples_per_second": 16.192,
-       "eval_steps_per_second": 2.024,
-       "eval_wer": 0.2748006118212608,
-       "step": 2000
-     },
-     {
-       "epoch": 0.15,
-       "learning_rate": 7.470902612826603e-05,
-       "loss": 1.1296,
-       "step": 2100
-     },
-     {
-       "epoch": 0.16,
-       "learning_rate": 7.441211401425178e-05,
-       "loss": 1.1406,
-       "step": 2200
-     },
-     {
-       "epoch": 0.17,
-       "learning_rate": 7.411520190023751e-05,
-       "loss": 1.1362,
-       "step": 2300
-     },
-     {
-       "epoch": 0.18,
-       "learning_rate": 7.381828978622327e-05,
-       "loss": 1.1292,
-       "step": 2400
-     },
-     {
-       "epoch": 0.18,
-       "learning_rate": 7.352137767220902e-05,
-       "loss": 1.105,
-       "step": 2500
-     },
-     {
-       "epoch": 0.19,
-       "learning_rate": 7.322446555819477e-05,
-       "loss": 1.1231,
-       "step": 2600
-     },
-     {
-       "epoch": 0.2,
-       "learning_rate": 7.292755344418051e-05,
-       "loss": 1.1187,
-       "step": 2700
-     },
-     {
-       "epoch": 0.21,
-       "learning_rate": 7.263064133016626e-05,
-       "loss": 1.1339,
-       "step": 2800
-     },
-     {
-       "epoch": 0.21,
-       "learning_rate": 7.233372921615201e-05,
-       "loss": 1.1241,
-       "step": 2900
-     },
-     {
-       "epoch": 0.22,
-       "learning_rate": 7.203681710213777e-05,
-       "loss": 1.1029,
-       "step": 3000
-     },
-     {
-       "epoch": 0.22,
-       "eval_loss": 0.2638496458530426,
-       "eval_runtime": 987.5568,
-       "eval_samples_per_second": 16.209,
-       "eval_steps_per_second": 2.026,
-       "eval_wer": 0.2705875122910521,
-       "step": 3000
-     },
-     {
-       "epoch": 0.23,
-       "learning_rate": 7.173990498812351e-05,
-       "loss": 1.1215,
-       "step": 3100
-     },
-     {
-       "epoch": 0.23,
-       "learning_rate": 7.144299287410925e-05,
-       "loss": 1.1067,
-       "step": 3200
-     },
-     {
-       "epoch": 0.24,
-       "learning_rate": 7.114608076009501e-05,
-       "loss": 1.1126,
-       "step": 3300
-     },
-     {
-       "epoch": 0.25,
-       "learning_rate": 7.084916864608076e-05,
-       "loss": 1.109,
-       "step": 3400
-     },
-     {
-       "epoch": 0.26,
-       "learning_rate": 7.05522565320665e-05,
-       "loss": 1.1077,
-       "step": 3500
-     },
-     {
-       "epoch": 0.26,
-       "learning_rate": 7.025534441805225e-05,
-       "loss": 1.1,
-       "step": 3600
-     },
-     {
-       "epoch": 0.27,
-       "learning_rate": 6.9958432304038e-05,
-       "loss": 1.1061,
-       "step": 3700
-     },
-     {
-       "epoch": 0.28,
-       "learning_rate": 6.966152019002374e-05,
-       "loss": 1.103,
-       "step": 3800
-     },
-     {
-       "epoch": 0.29,
-       "learning_rate": 6.936460807600949e-05,
-       "loss": 1.0947,
-       "step": 3900
-     },
-     {
-       "epoch": 0.29,
-       "learning_rate": 6.906769596199525e-05,
-       "loss": 1.0949,
-       "step": 4000
-     },
-     {
-       "epoch": 0.29,
-       "eval_loss": 0.25188884139060974,
-       "eval_runtime": 996.0428,
-       "eval_samples_per_second": 16.071,
-       "eval_steps_per_second": 2.009,
-       "eval_wer": 0.26274172402490986,
-       "step": 4000
-     },
-     {
-       "epoch": 0.3,
-       "learning_rate": 6.8770783847981e-05,
-       "loss": 1.1076,
-       "step": 4100
-     },
-     {
-       "epoch": 0.31,
-       "learning_rate": 6.847387173396674e-05,
-       "loss": 1.1012,
-       "step": 4200
-     },
-     {
-       "epoch": 0.32,
-       "learning_rate": 6.817695961995249e-05,
-       "loss": 1.081,
-       "step": 4300
-     },
-     {
-       "epoch": 0.32,
-       "learning_rate": 6.788004750593824e-05,
-       "loss": 1.0868,
-       "step": 4400
-     },
-     {
-       "epoch": 0.33,
-       "learning_rate": 6.758313539192398e-05,
-       "loss": 1.0956,
-       "step": 4500
-     },
-     {
-       "epoch": 0.34,
-       "learning_rate": 6.728622327790973e-05,
-       "loss": 1.0953,
-       "step": 4600
-     },
-     {
-       "epoch": 0.34,
-       "learning_rate": 6.698931116389548e-05,
-       "loss": 1.0952,
-       "step": 4700
-     },
-     {
-       "epoch": 0.35,
-       "learning_rate": 6.669239904988122e-05,
-       "loss": 1.0968,
-       "step": 4800
-     },
-     {
-       "epoch": 0.36,
-       "learning_rate": 6.639548693586698e-05,
-       "loss": 1.0827,
-       "step": 4900
-     },
-     {
-       "epoch": 0.37,
-       "learning_rate": 6.609857482185273e-05,
-       "loss": 1.0923,
-       "step": 5000
-     },
-     {
-       "epoch": 0.37,
-       "eval_loss": 0.24751192331314087,
-       "eval_runtime": 984.8205,
-       "eval_samples_per_second": 16.254,
-       "eval_steps_per_second": 2.032,
-       "eval_wer": 0.25769556429585927,
-       "step": 5000
-     },
-     {
-       "epoch": 0.37,
-       "learning_rate": 6.580166270783846e-05,
-       "loss": 1.0895,
-       "step": 5100
-     },
-     {
-       "epoch": 0.38,
-       "learning_rate": 6.550771971496436e-05,
-       "loss": 1.0851,
-       "step": 5200
-     },
-     {
-       "epoch": 0.39,
-       "learning_rate": 6.521080760095011e-05,
-       "loss": 1.1124,
-       "step": 5300
-     },
-     {
-       "epoch": 0.4,
-       "learning_rate": 6.491686460807601e-05,
-       "loss": 1.0809,
-       "step": 5400
-     },
-     {
-       "epoch": 0.4,
-       "learning_rate": 6.461995249406176e-05,
-       "loss": 1.0985,
-       "step": 5500
-     },
-     {
-       "epoch": 0.41,
-       "learning_rate": 6.432304038004749e-05,
-       "loss": 1.086,
-       "step": 5600
-     },
-     {
-       "epoch": 0.42,
-       "learning_rate": 6.402612826603325e-05,
-       "loss": 1.0823,
-       "step": 5700
-     },
-     {
-       "epoch": 0.43,
-       "learning_rate": 6.3729216152019e-05,
-       "loss": 1.0732,
-       "step": 5800
-     },
-     {
-       "epoch": 0.43,
-       "learning_rate": 6.343230403800475e-05,
-       "loss": 1.076,
-       "step": 5900
-     },
-     {
-       "epoch": 0.44,
-       "learning_rate": 6.313539192399049e-05,
-       "loss": 1.0847,
-       "step": 6000
-     },
-     {
-       "epoch": 0.44,
-       "eval_loss": 0.24355509877204895,
-       "eval_runtime": 984.3756,
-       "eval_samples_per_second": 16.261,
-       "eval_steps_per_second": 2.033,
-       "eval_wer": 0.26121217087293785,
-       "step": 6000
-     },
-     {
-       "epoch": 0.45,
-       "learning_rate": 6.283847980997624e-05,
-       "loss": 1.0748,
-       "step": 6100
-     },
-     {
-       "epoch": 0.45,
-       "learning_rate": 6.254156769596199e-05,
-       "loss": 1.0836,
-       "step": 6200
-     },
-     {
-       "epoch": 0.46,
-       "learning_rate": 6.224465558194773e-05,
-       "loss": 1.084,
-       "step": 6300
-     },
-     {
-       "epoch": 0.47,
-       "learning_rate": 6.194774346793349e-05,
-       "loss": 1.0649,
-       "step": 6400
-     },
-     {
-       "epoch": 0.48,
-       "learning_rate": 6.165083135391923e-05,
-       "loss": 1.0751,
-       "step": 6500
-     },
-     {
-       "epoch": 0.48,
-       "learning_rate": 6.135391923990499e-05,
-       "loss": 1.0773,
-       "step": 6600
-     },
-     {
-       "epoch": 0.49,
-       "learning_rate": 6.105700712589073e-05,
-       "loss": 1.095,
-       "step": 6700
-     },
-     {
-       "epoch": 0.5,
-       "learning_rate": 6.076009501187648e-05,
-       "loss": 1.0629,
-       "step": 6800
-     },
-     {
-       "epoch": 0.51,
-       "learning_rate": 6.0463182897862234e-05,
-       "loss": 1.0904,
-       "step": 6900
-     },
-     {
-       "epoch": 0.51,
-       "learning_rate": 6.0166270783847974e-05,
-       "loss": 1.0667,
-       "step": 7000
-     },
-     {
-       "epoch": 0.51,
-       "eval_loss": 0.24724909663200378,
-       "eval_runtime": 983.1677,
-       "eval_samples_per_second": 16.281,
-       "eval_steps_per_second": 2.035,
-       "eval_wer": 0.26608762154484866,
-       "step": 7000
-     },
-     {
-       "epoch": 0.52,
-       "learning_rate": 5.986935866983372e-05,
-       "loss": 1.0825,
-       "step": 7100
-     },
-     {
-       "epoch": 0.53,
-       "learning_rate": 5.9572446555819474e-05,
-       "loss": 1.0811,
-       "step": 7200
-     },
-     {
-       "epoch": 0.54,
-       "learning_rate": 5.927553444180522e-05,
-       "loss": 1.0906,
-       "step": 7300
-     },
-     {
-       "epoch": 0.54,
-       "learning_rate": 5.8978622327790975e-05,
-       "loss": 1.0784,
-       "step": 7400
-     },
-     {
-       "epoch": 0.55,
-       "learning_rate": 5.8681710213776715e-05,
-       "loss": 1.0822,
-       "step": 7500
-     },
-     {
-       "epoch": 0.56,
-       "learning_rate": 5.838479809976246e-05,
-       "loss": 1.0802,
-       "step": 7600
-     },
-     {
-       "epoch": 0.56,
-       "learning_rate": 5.8087885985748215e-05,
-       "loss": 1.0805,
-       "step": 7700
-     },
-     {
-       "epoch": 0.57,
-       "learning_rate": 5.779097387173396e-05,
-       "loss": 1.093,
-       "step": 7800
-     },
-     {
-       "epoch": 0.58,
-       "learning_rate": 5.749406175771971e-05,
-       "loss": 1.0456,
-       "step": 7900
-     },
-     {
-       "epoch": 0.59,
-       "learning_rate": 5.7197149643705455e-05,
-       "loss": 1.0709,
-       "step": 8000
-     },
-     {
-       "epoch": 0.59,
-       "eval_loss": 0.24887976050376892,
-       "eval_runtime": 982.4054,
-       "eval_samples_per_second": 16.294,
-       "eval_steps_per_second": 2.037,
-       "eval_wer": 0.26095269310608543,
-       "step": 8000
-     },
-     {
-       "epoch": 0.59,
-       "learning_rate": 5.690023752969121e-05,
-       "loss": 1.0677,
-       "step": 8100
-     },
-     {
-       "epoch": 0.6,
-       "learning_rate": 5.6603325415676956e-05,
-       "loss": 1.0659,
-       "step": 8200
-     },
-     {
-       "epoch": 0.61,
-       "learning_rate": 5.630641330166271e-05,
-       "loss": 1.0788,
-       "step": 8300
-     },
-     {
-       "epoch": 0.62,
-       "learning_rate": 5.600950118764845e-05,
-       "loss": 1.071,
-       "step": 8400
-     },
-     {
-       "epoch": 0.62,
-       "learning_rate": 5.5712589073634196e-05,
-       "loss": 1.0669,
-       "step": 8500
-     },
-     {
-       "epoch": 0.63,
-       "learning_rate": 5.541567695961995e-05,
-       "loss": 1.0728,
-       "step": 8600
-     },
-     {
-       "epoch": 0.64,
-       "learning_rate": 5.512173396674584e-05,
-       "loss": 1.0652,
-       "step": 8700
-     },
-     {
-       "epoch": 0.65,
-       "learning_rate": 5.482482185273159e-05,
-       "loss": 1.0632,
-       "step": 8800
-     },
-     {
-       "epoch": 0.65,
-       "learning_rate": 5.452790973871733e-05,
-       "loss": 1.048,
-       "step": 8900
-     },
-     {
-       "epoch": 0.66,
-       "learning_rate": 5.4230997624703083e-05,
-       "loss": 1.0472,
-       "step": 9000
-     },
-     {
-       "epoch": 0.66,
-       "eval_loss": 0.23543120920658112,
-       "eval_runtime": 985.8901,
-       "eval_samples_per_second": 16.236,
-       "eval_steps_per_second": 2.03,
-       "eval_wer": 0.24997268655085764,
-       "step": 9000
-     },
-     {
-       "epoch": 0.67,
-       "learning_rate": 5.393408551068883e-05,
-       "loss": 1.0552,
-       "step": 9100
-     },
-     {
-       "epoch": 0.67,
-       "learning_rate": 5.3637173396674584e-05,
-       "loss": 1.0581,
-       "step": 9200
-     },
-     {
-       "epoch": 0.68,
-       "learning_rate": 5.3340261282660324e-05,
-       "loss": 1.0658,
-       "step": 9300
-     },
-     {
-       "epoch": 0.69,
-       "learning_rate": 5.304334916864607e-05,
-       "loss": 1.0603,
-       "step": 9400
-     },
-     {
-       "epoch": 0.7,
-       "learning_rate": 5.2746437054631824e-05,
-       "loss": 1.0661,
-       "step": 9500
-     },
-     {
-       "epoch": 0.7,
-       "learning_rate": 5.244952494061757e-05,
-       "loss": 1.0554,
-       "step": 9600
-     },
-     {
-       "epoch": 0.71,
-       "learning_rate": 5.2152612826603325e-05,
-       "loss": 1.0728,
-       "step": 9700
-     },
-     {
-       "epoch": 0.72,
-       "learning_rate": 5.1855700712589065e-05,
-       "loss": 1.0513,
-       "step": 9800
-     },
-     {
-       "epoch": 0.73,
-       "learning_rate": 5.155878859857482e-05,
-       "loss": 1.0379,
-       "step": 9900
-     },
-     {
-       "epoch": 0.73,
-       "learning_rate": 5.1261876484560565e-05,
-       "loss": 1.0604,
-       "step": 10000
-     },
-     {
-       "epoch": 0.73,
-       "eval_loss": 0.23458585143089294,
-       "eval_runtime": 986.1525,
-       "eval_samples_per_second": 16.232,
-       "eval_steps_per_second": 2.029,
-       "eval_wer": 0.2485182453840271,
-       "step": 10000
-     },
-     {
-       "epoch": 0.74,
-       "learning_rate": 5.096496437054632e-05,
-       "loss": 1.0632,
-       "step": 10100
-     },
-     {
-       "epoch": 0.75,
-       "learning_rate": 5.0668052256532065e-05,
-       "loss": 1.0526,
-       "step": 10200
-     },
-     {
-       "epoch": 0.76,
-       "learning_rate": 5.0371140142517805e-05,
-       "loss": 1.0314,
-       "step": 10300
-     },
-     {
-       "epoch": 0.76,
-       "learning_rate": 5.007422802850356e-05,
-       "loss": 1.0508,
-       "step": 10400
-     },
-     {
-       "epoch": 0.77,
-       "learning_rate": 4.9777315914489306e-05,
-       "loss": 1.0446,
-       "step": 10500
-     },
-     {
-       "epoch": 0.78,
-       "learning_rate": 4.948040380047506e-05,
-       "loss": 1.0361,
-       "step": 10600
-     },
-     {
-       "epoch": 0.79,
-       "learning_rate": 4.91834916864608e-05,
-       "loss": 1.0319,
-       "step": 10700
-     },
-     {
-       "epoch": 0.79,
-       "learning_rate": 4.8886579572446546e-05,
-       "loss": 1.0178,
-       "step": 10800
-     },
-     {
-       "epoch": 0.8,
-       "learning_rate": 4.85896674584323e-05,
-       "loss": 1.0301,
-       "step": 10900
-     },
-     {
-       "epoch": 0.81,
-       "learning_rate": 4.8292755344418046e-05,
-       "loss": 1.0375,
-       "step": 11000
-     },
-     {
-       "epoch": 0.81,
-       "eval_loss": 0.2285824865102768,
-       "eval_runtime": 979.8277,
-       "eval_samples_per_second": 16.337,
-       "eval_steps_per_second": 2.042,
-       "eval_wer": 0.23898585163334427,
-       "step": 11000
-     },
-     {
-       "epoch": 0.81,
-       "learning_rate": 4.79958432304038e-05,
-       "loss": 1.0398,
-       "step": 11100
-     },
-     {
-       "epoch": 0.82,
-       "learning_rate": 4.769893111638954e-05,
-       "loss": 1.0308,
-       "step": 11200
-     },
-     {
-       "epoch": 0.83,
-       "learning_rate": 4.7402019002375294e-05,
-       "loss": 1.0309,
-       "step": 11300
-     },
-     {
-       "epoch": 0.84,
-       "learning_rate": 4.710510688836104e-05,
-       "loss": 1.0287,
-       "step": 11400
-     },
-     {
-       "epoch": 0.84,
-       "learning_rate": 4.6808194774346794e-05,
-       "loss": 1.0195,
-       "step": 11500
-     },
-     {
-       "epoch": 0.85,
801
- "learning_rate": 4.651128266033254e-05,
802
- "loss": 1.0292,
803
- "step": 11600
804
- },
805
- {
806
- "epoch": 0.86,
807
- "learning_rate": 4.621437054631828e-05,
808
- "loss": 1.0147,
809
- "step": 11700
810
- },
811
- {
812
- "epoch": 0.87,
813
- "learning_rate": 4.5917458432304034e-05,
814
- "loss": 1.0242,
815
- "step": 11800
816
- },
817
- {
818
- "epoch": 0.87,
819
- "learning_rate": 4.562054631828978e-05,
820
- "loss": 1.029,
821
- "step": 11900
822
- },
823
- {
824
- "epoch": 0.88,
825
- "learning_rate": 4.5326603325415675e-05,
826
- "loss": 1.0193,
827
- "step": 12000
828
- },
829
- {
830
- "epoch": 0.88,
831
- "eval_loss": 0.22122837603092194,
832
- "eval_runtime": 981.4673,
833
- "eval_samples_per_second": 16.309,
834
- "eval_steps_per_second": 2.039,
835
- "eval_wer": 0.23376215448486834,
836
- "step": 12000
837
- },
838
- {
839
- "epoch": 0.89,
840
- "learning_rate": 4.502969121140143e-05,
841
- "loss": 1.0249,
842
- "step": 12100
843
- },
844
- {
845
- "epoch": 0.9,
846
- "learning_rate": 4.473277909738717e-05,
847
- "loss": 1.0165,
848
- "step": 12200
849
- },
850
- {
851
- "epoch": 0.9,
852
- "learning_rate": 4.4435866983372915e-05,
853
- "loss": 1.0303,
854
- "step": 12300
855
- },
856
- {
857
- "epoch": 0.91,
858
- "learning_rate": 4.413895486935867e-05,
859
- "loss": 1.0295,
860
- "step": 12400
861
- },
862
- {
863
- "epoch": 0.92,
864
- "learning_rate": 4.3842042755344415e-05,
865
- "loss": 1.0112,
866
- "step": 12500
867
- },
868
- {
869
- "epoch": 0.92,
870
- "learning_rate": 4.35480997624703e-05,
871
- "loss": 1.0056,
872
- "step": 12600
873
- },
874
- {
875
- "epoch": 0.93,
876
- "learning_rate": 4.325118764845605e-05,
877
- "loss": 1.0108,
878
- "step": 12700
879
- },
880
- {
881
- "epoch": 0.94,
882
- "learning_rate": 4.29542755344418e-05,
883
- "loss": 1.0133,
884
- "step": 12800
885
- },
886
- {
887
- "epoch": 0.95,
888
- "learning_rate": 4.265736342042755e-05,
889
- "loss": 1.0063,
890
- "step": 12900
891
- },
892
- {
893
- "epoch": 0.95,
894
- "learning_rate": 4.23604513064133e-05,
895
- "loss": 1.0077,
896
- "step": 13000
897
- },
898
- {
899
- "epoch": 0.95,
900
- "eval_loss": 0.21520280838012695,
901
- "eval_runtime": 983.9086,
902
- "eval_samples_per_second": 16.269,
903
- "eval_steps_per_second": 2.034,
904
- "eval_wer": 0.22689282202556538,
905
- "step": 13000
906
- },
907
- {
908
- "epoch": 0.96,
909
- "learning_rate": 4.206353919239904e-05,
910
- "loss": 1.0085,
911
- "step": 13100
912
- },
913
- {
914
- "epoch": 0.97,
915
- "learning_rate": 4.176662707838479e-05,
916
- "loss": 1.011,
917
- "step": 13200
918
- },
919
- {
920
- "epoch": 0.98,
921
- "learning_rate": 4.146971496437054e-05,
922
- "loss": 1.0131,
923
- "step": 13300
924
- },
925
- {
926
- "epoch": 0.98,
927
- "learning_rate": 4.117280285035629e-05,
928
- "loss": 0.998,
929
- "step": 13400
930
- },
931
- {
932
- "epoch": 0.99,
933
- "learning_rate": 4.0875890736342043e-05,
934
- "loss": 1.0002,
935
- "step": 13500
936
- },
937
- {
938
- "epoch": 1.0,
939
- "learning_rate": 4.0578978622327783e-05,
940
- "loss": 0.9916,
941
- "step": 13600
942
- },
943
- {
944
- "epoch": 1.01,
945
- "learning_rate": 4.028206650831354e-05,
946
- "loss": 0.9662,
947
- "step": 13700
948
- },
949
- {
950
- "epoch": 1.01,
951
- "learning_rate": 3.9985154394299284e-05,
952
- "loss": 0.9758,
953
- "step": 13800
954
- },
955
- {
956
- "epoch": 1.02,
957
- "learning_rate": 3.968824228028504e-05,
958
- "loss": 1.013,
959
- "step": 13900
960
- },
961
- {
962
- "epoch": 1.03,
963
- "learning_rate": 3.939133016627078e-05,
964
- "loss": 1.0004,
965
- "step": 14000
966
- },
967
- {
968
- "epoch": 1.03,
969
- "eval_loss": 0.2093251347541809,
970
- "eval_runtime": 986.9604,
971
- "eval_samples_per_second": 16.218,
972
- "eval_steps_per_second": 2.027,
973
- "eval_wer": 0.22069949743253578,
974
- "step": 14000
975
- },
976
- {
977
- "epoch": 1.03,
978
- "learning_rate": 3.9094418052256524e-05,
979
- "loss": 0.9852,
980
- "step": 14100
981
- },
982
- {
983
- "epoch": 1.04,
984
- "learning_rate": 3.879750593824228e-05,
985
- "loss": 0.9765,
986
- "step": 14200
987
- },
988
- {
989
- "epoch": 1.05,
990
- "learning_rate": 3.8500593824228025e-05,
991
- "loss": 0.9978,
992
- "step": 14300
993
- },
994
- {
995
- "epoch": 1.06,
996
- "learning_rate": 3.820368171021378e-05,
997
- "loss": 0.9807,
998
- "step": 14400
999
- },
1000
- {
1001
- "epoch": 1.06,
1002
- "learning_rate": 3.790676959619952e-05,
1003
- "loss": 0.9988,
1004
- "step": 14500
1005
- },
1006
- {
1007
- "epoch": 1.07,
1008
- "learning_rate": 3.7609857482185265e-05,
1009
- "loss": 0.977,
1010
- "step": 14600
1011
- },
1012
- {
1013
- "epoch": 1.08,
1014
- "learning_rate": 3.731294536817102e-05,
1015
- "loss": 0.9735,
1016
- "step": 14700
1017
- },
1018
- {
1019
- "epoch": 1.09,
1020
- "learning_rate": 3.7016033254156765e-05,
1021
- "loss": 0.9767,
1022
- "step": 14800
1023
- },
1024
- {
1025
- "epoch": 1.09,
1026
- "learning_rate": 3.671912114014251e-05,
1027
- "loss": 0.9555,
1028
- "step": 14900
1029
- },
1030
- {
1031
- "epoch": 1.1,
1032
- "learning_rate": 3.6422209026128266e-05,
1033
- "loss": 0.9649,
1034
- "step": 15000
1035
- },
1036
- {
1037
- "epoch": 1.1,
1038
- "eval_loss": 0.19932541251182556,
1039
- "eval_runtime": 986.5773,
1040
- "eval_samples_per_second": 16.225,
1041
- "eval_steps_per_second": 2.028,
1042
- "eval_wer": 0.21130367092756475,
1043
- "step": 15000
1044
- },
1045
- {
1046
- "epoch": 1.11,
1047
- "learning_rate": 3.612529691211401e-05,
1048
- "loss": 0.9608,
1049
- "step": 15100
1050
- },
1051
- {
1052
- "epoch": 1.12,
1053
- "learning_rate": 3.582838479809976e-05,
1054
- "loss": 0.9549,
1055
- "step": 15200
1056
- },
1057
- {
1058
- "epoch": 1.12,
1059
- "learning_rate": 3.5531472684085506e-05,
1060
- "loss": 0.9636,
1061
- "step": 15300
1062
- },
1063
- {
1064
- "epoch": 1.13,
1065
- "learning_rate": 3.523456057007125e-05,
1066
- "loss": 0.9605,
1067
- "step": 15400
1068
- },
1069
- {
1070
- "epoch": 1.14,
1071
- "learning_rate": 3.4937648456057006e-05,
1072
- "loss": 0.962,
1073
- "step": 15500
1074
- },
1075
- {
1076
- "epoch": 1.14,
1077
- "learning_rate": 3.464073634204275e-05,
1078
- "loss": 0.9565,
1079
- "step": 15600
1080
- },
1081
- {
1082
- "epoch": 1.15,
1083
- "learning_rate": 3.43438242280285e-05,
1084
- "loss": 0.9609,
1085
- "step": 15700
1086
- },
1087
- {
1088
- "epoch": 1.16,
1089
- "learning_rate": 3.404691211401425e-05,
1090
- "loss": 0.9552,
1091
- "step": 15800
1092
- },
1093
- {
1094
- "epoch": 1.17,
1095
- "learning_rate": 3.375e-05,
1096
- "loss": 0.9503,
1097
- "step": 15900
1098
- },
1099
- {
1100
- "epoch": 1.17,
1101
- "learning_rate": 3.345308788598574e-05,
1102
- "loss": 0.9509,
1103
- "step": 16000
1104
- },
1105
- {
1106
- "epoch": 1.17,
1107
- "eval_loss": 0.19342663884162903,
1108
- "eval_runtime": 984.1094,
1109
- "eval_samples_per_second": 16.265,
1110
- "eval_steps_per_second": 2.033,
1111
- "eval_wer": 0.20888643067846607,
1112
- "step": 16000
1113
- },
1114
- {
1115
- "epoch": 1.18,
1116
- "learning_rate": 3.3156175771971494e-05,
1117
- "loss": 0.9369,
1118
- "step": 16100
1119
- },
1120
- {
1121
- "epoch": 1.19,
1122
- "learning_rate": 3.285926365795724e-05,
1123
- "loss": 0.9549,
1124
- "step": 16200
1125
- },
1126
- {
1127
- "epoch": 1.2,
1128
- "learning_rate": 3.256235154394299e-05,
1129
- "loss": 0.9503,
1130
- "step": 16300
1131
- },
1132
- {
1133
- "epoch": 1.2,
1134
- "learning_rate": 3.226543942992874e-05,
1135
- "loss": 0.9553,
1136
- "step": 16400
1137
- },
1138
- {
1139
- "epoch": 1.21,
1140
- "learning_rate": 3.196852731591449e-05,
1141
- "loss": 0.9508,
1142
- "step": 16500
1143
- },
1144
- {
1145
- "epoch": 1.22,
1146
- "learning_rate": 3.1671615201900235e-05,
1147
- "loss": 0.9411,
1148
- "step": 16600
1149
- },
1150
- {
1151
- "epoch": 1.23,
1152
- "learning_rate": 3.137470308788598e-05,
1153
- "loss": 0.9435,
1154
- "step": 16700
1155
- },
1156
- {
1157
- "epoch": 1.23,
1158
- "learning_rate": 3.107779097387173e-05,
1159
- "loss": 0.9439,
1160
- "step": 16800
1161
- },
1162
- {
1163
- "epoch": 1.24,
1164
- "learning_rate": 3.078087885985748e-05,
1165
- "loss": 0.946,
1166
- "step": 16900
1167
- },
1168
- {
1169
- "epoch": 1.25,
1170
- "learning_rate": 3.048396674584323e-05,
1171
- "loss": 0.9533,
1172
- "step": 17000
1173
- },
1174
- {
1175
- "epoch": 1.25,
1176
- "eval_loss": 0.18736572563648224,
1177
- "eval_runtime": 984.7341,
1178
- "eval_samples_per_second": 16.255,
1179
- "eval_steps_per_second": 2.032,
1180
- "eval_wer": 0.20231071779744347,
1181
- "step": 17000
1182
- },
1183
- {
1184
- "epoch": 1.25,
1185
- "learning_rate": 3.018705463182898e-05,
1186
- "loss": 0.9322,
1187
- "step": 17100
1188
- },
1189
- {
1190
- "epoch": 1.26,
1191
- "learning_rate": 2.9890142517814722e-05,
1192
- "loss": 0.94,
1193
- "step": 17200
1194
- },
1195
- {
1196
- "epoch": 1.27,
1197
- "learning_rate": 2.9593230403800473e-05,
1198
- "loss": 0.9373,
1199
- "step": 17300
1200
- },
1201
- {
1202
- "epoch": 1.28,
1203
- "learning_rate": 2.9299287410926363e-05,
1204
- "loss": 0.924,
1205
- "step": 17400
1206
- },
1207
- {
1208
- "epoch": 1.28,
1209
- "learning_rate": 2.9005344418052253e-05,
1210
- "loss": 0.9357,
1211
- "step": 17500
1212
- },
1213
- {
1214
- "epoch": 1.29,
1215
- "learning_rate": 2.8708432304038003e-05,
1216
- "loss": 0.9351,
1217
- "step": 17600
1218
- },
1219
- {
1220
- "epoch": 1.3,
1221
- "learning_rate": 2.841152019002375e-05,
1222
- "loss": 0.9371,
1223
- "step": 17700
1224
- },
1225
- {
1226
- "epoch": 1.31,
1227
- "learning_rate": 2.81146080760095e-05,
1228
- "loss": 0.9253,
1229
- "step": 17800
1230
- },
1231
- {
1232
- "epoch": 1.31,
1233
- "learning_rate": 2.7817695961995246e-05,
1234
- "loss": 0.9264,
1235
- "step": 17900
1236
- },
1237
- {
1238
- "epoch": 1.32,
1239
- "learning_rate": 2.7520783847980997e-05,
1240
- "loss": 0.9248,
1241
- "step": 18000
1242
- },
1243
- {
1244
- "epoch": 1.32,
1245
- "eval_loss": 0.1818237155675888,
1246
- "eval_runtime": 1114.2718,
1247
- "eval_samples_per_second": 14.365,
1248
- "eval_steps_per_second": 1.796,
1249
- "eval_wer": 0.19742843876324703,
1250
- "step": 18000
1251
- },
1252
- {
1253
- "epoch": 1.33,
1254
- "learning_rate": 2.722387173396674e-05,
1255
- "loss": 0.9448,
1256
- "step": 18100
1257
- },
1258
- {
1259
- "epoch": 1.34,
1260
- "learning_rate": 2.692695961995249e-05,
1261
- "loss": 0.9284,
1262
- "step": 18200
1263
- },
1264
- {
1265
- "epoch": 1.34,
1266
- "learning_rate": 2.663004750593824e-05,
1267
- "loss": 0.9141,
1268
- "step": 18300
1269
- },
1270
- {
1271
- "epoch": 1.35,
1272
- "learning_rate": 2.6333135391923987e-05,
1273
- "loss": 0.9117,
1274
- "step": 18400
1275
- },
1276
- {
1277
- "epoch": 1.36,
1278
- "learning_rate": 2.6036223277909737e-05,
1279
- "loss": 0.917,
1280
- "step": 18500
1281
- },
1282
- {
1283
- "epoch": 1.36,
1284
- "learning_rate": 2.5739311163895484e-05,
1285
- "loss": 0.9165,
1286
- "step": 18600
1287
- },
1288
- {
1289
- "epoch": 1.37,
1290
- "learning_rate": 2.5442399049881234e-05,
1291
- "loss": 0.9099,
1292
- "step": 18700
1293
- },
1294
- {
1295
- "epoch": 1.38,
1296
- "learning_rate": 2.5145486935866978e-05,
1297
- "loss": 0.9022,
1298
- "step": 18800
1299
- },
1300
- {
1301
- "epoch": 1.39,
1302
- "learning_rate": 2.4848574821852728e-05,
1303
- "loss": 0.9246,
1304
- "step": 18900
1305
- },
1306
- {
1307
- "epoch": 1.39,
1308
- "learning_rate": 2.4551662707838478e-05,
1309
- "loss": 0.9216,
1310
- "step": 19000
1311
- },
1312
- {
1313
- "epoch": 1.39,
1314
- "eval_loss": 0.17756715416908264,
1315
- "eval_runtime": 1032.2412,
1316
- "eval_samples_per_second": 15.507,
1317
- "eval_steps_per_second": 1.939,
1318
- "eval_wer": 0.19256664481590735,
1319
- "step": 19000
1320
- },
1321
- {
1322
- "epoch": 1.4,
1323
- "learning_rate": 2.4254750593824225e-05,
1324
- "loss": 0.9142,
1325
- "step": 19100
1326
- },
1327
- {
1328
- "epoch": 1.41,
1329
- "learning_rate": 2.3957838479809975e-05,
1330
- "loss": 0.9275,
1331
- "step": 19200
1332
- },
1333
- {
1334
- "epoch": 1.42,
1335
- "learning_rate": 2.3660926365795722e-05,
1336
- "loss": 0.9132,
1337
- "step": 19300
1338
- },
1339
- {
1340
- "epoch": 1.42,
1341
- "learning_rate": 2.3364014251781472e-05,
1342
- "loss": 0.9111,
1343
- "step": 19400
1344
- },
1345
- {
1346
- "epoch": 1.43,
1347
- "learning_rate": 2.3067102137767216e-05,
1348
- "loss": 0.8974,
1349
- "step": 19500
1350
- },
1351
- {
1352
- "epoch": 1.44,
1353
- "learning_rate": 2.2770190023752966e-05,
1354
- "loss": 0.9013,
1355
- "step": 19600
1356
- },
1357
- {
1358
- "epoch": 1.45,
1359
- "learning_rate": 2.2473277909738716e-05,
1360
- "loss": 0.9093,
1361
- "step": 19700
1362
- },
1363
- {
1364
- "epoch": 1.45,
1365
- "learning_rate": 2.2176365795724463e-05,
1366
- "loss": 0.8926,
1367
- "step": 19800
1368
- },
1369
- {
1370
- "epoch": 1.46,
1371
- "learning_rate": 2.1879453681710213e-05,
1372
- "loss": 0.9026,
1373
- "step": 19900
1374
- },
1375
- {
1376
- "epoch": 1.47,
1377
- "learning_rate": 2.158254156769596e-05,
1378
- "loss": 0.8964,
1379
- "step": 20000
1380
- },
1381
- {
1382
- "epoch": 1.47,
1383
- "eval_loss": 0.1722368746995926,
1384
- "eval_runtime": 1019.2936,
1385
- "eval_samples_per_second": 15.704,
1386
- "eval_steps_per_second": 1.963,
1387
- "eval_wer": 0.19043619578280346,
1388
- "step": 20000
1389
- },
1390
- {
1391
- "epoch": 1.47,
1392
- "learning_rate": 2.128859857482185e-05,
1393
- "loss": 0.8906,
1394
- "step": 20100
1395
- },
1396
- {
1397
- "epoch": 1.48,
1398
- "learning_rate": 2.09916864608076e-05,
1399
- "loss": 0.8878,
1400
- "step": 20200
1401
- },
1402
- {
1403
- "epoch": 1.49,
1404
- "learning_rate": 2.0694774346793347e-05,
1405
- "loss": 0.9024,
1406
- "step": 20300
1407
- },
1408
- {
1409
- "epoch": 1.5,
1410
- "learning_rate": 2.0397862232779097e-05,
1411
- "loss": 0.8903,
1412
- "step": 20400
1413
- },
1414
- {
1415
- "epoch": 1.5,
1416
- "learning_rate": 2.0100950118764844e-05,
1417
- "loss": 0.8843,
1418
- "step": 20500
1419
- },
1420
- {
1421
- "epoch": 1.51,
1422
- "learning_rate": 1.9804038004750594e-05,
1423
- "loss": 0.8911,
1424
- "step": 20600
1425
- },
1426
- {
1427
- "epoch": 1.52,
1428
- "learning_rate": 1.9507125890736337e-05,
1429
- "loss": 0.8795,
1430
- "step": 20700
1431
- },
1432
- {
1433
- "epoch": 1.53,
1434
- "learning_rate": 1.9210213776722087e-05,
1435
- "loss": 0.8777,
1436
- "step": 20800
1437
- },
1438
- {
1439
- "epoch": 1.53,
1440
- "learning_rate": 1.8913301662707838e-05,
1441
- "loss": 0.889,
1442
- "step": 20900
1443
- },
1444
- {
1445
- "epoch": 1.54,
1446
- "learning_rate": 1.8616389548693584e-05,
1447
- "loss": 0.8941,
1448
- "step": 21000
1449
- },
1450
- {
1451
- "epoch": 1.54,
1452
- "eval_loss": 0.16895848512649536,
1453
- "eval_runtime": 1022.9987,
1454
- "eval_samples_per_second": 15.647,
1455
- "eval_steps_per_second": 1.956,
1456
- "eval_wer": 0.18521932699661314,
1457
- "step": 21000
1458
- },
1459
- {
1460
- "epoch": 1.55,
1461
- "learning_rate": 1.831947743467933e-05,
1462
- "loss": 0.882,
1463
- "step": 21100
1464
- },
1465
- {
1466
- "epoch": 1.56,
1467
- "learning_rate": 1.802256532066508e-05,
1468
- "loss": 0.8801,
1469
- "step": 21200
1470
- },
1471
- {
1472
- "epoch": 1.56,
1473
- "learning_rate": 1.772565320665083e-05,
1474
- "loss": 0.8718,
1475
- "step": 21300
1476
- },
1477
- {
1478
- "epoch": 1.57,
1479
- "learning_rate": 1.742874109263658e-05,
1480
- "loss": 0.8904,
1481
- "step": 21400
1482
- },
1483
- {
1484
- "epoch": 1.58,
1485
- "learning_rate": 1.7131828978622325e-05,
1486
- "loss": 0.8729,
1487
- "step": 21500
1488
- },
1489
- {
1490
- "epoch": 1.58,
1491
- "learning_rate": 1.6834916864608075e-05,
1492
- "loss": 0.8722,
1493
- "step": 21600
1494
- },
1495
- {
1496
- "epoch": 1.59,
1497
- "learning_rate": 1.6538004750593822e-05,
1498
- "loss": 0.8739,
1499
- "step": 21700
1500
- },
1501
- {
1502
- "epoch": 1.6,
1503
- "learning_rate": 1.624109263657957e-05,
1504
- "loss": 0.8635,
1505
- "step": 21800
1506
- },
1507
- {
1508
- "epoch": 1.61,
1509
- "learning_rate": 1.594418052256532e-05,
1510
- "loss": 0.8767,
1511
- "step": 21900
1512
- },
1513
- {
1514
- "epoch": 1.61,
1515
- "learning_rate": 1.564726840855107e-05,
1516
- "loss": 0.871,
1517
- "step": 22000
1518
- },
1519
- {
1520
- "epoch": 1.61,
1521
- "eval_loss": 0.16269078850746155,
1522
- "eval_runtime": 1042.6643,
1523
- "eval_samples_per_second": 15.352,
1524
- "eval_steps_per_second": 1.919,
1525
- "eval_wer": 0.17805637495902982,
1526
- "step": 22000
1527
- },
1528
- {
1529
- "epoch": 1.62,
1530
- "learning_rate": 1.5350356294536816e-05,
1531
- "loss": 0.8663,
1532
- "step": 22100
1533
- },
1534
- {
1535
- "epoch": 1.63,
1536
- "learning_rate": 1.5056413301662706e-05,
1537
- "loss": 0.8732,
1538
- "step": 22200
1539
- },
1540
- {
1541
- "epoch": 1.64,
1542
- "learning_rate": 1.4759501187648455e-05,
1543
- "loss": 0.8625,
1544
- "step": 22300
1545
- },
1546
- {
1547
- "epoch": 1.64,
1548
- "learning_rate": 1.4462589073634203e-05,
1549
- "loss": 0.854,
1550
- "step": 22400
1551
- },
1552
- {
1553
- "epoch": 1.65,
1554
- "learning_rate": 1.416567695961995e-05,
1555
- "loss": 0.8692,
1556
- "step": 22500
1557
- },
1558
- {
1559
- "epoch": 1.66,
1560
- "learning_rate": 1.38687648456057e-05,
1561
- "loss": 0.8477,
1562
- "step": 22600
1563
- },
1564
- {
1565
- "epoch": 1.67,
1566
- "learning_rate": 1.3571852731591449e-05,
1567
- "loss": 0.8494,
1568
- "step": 22700
1569
- },
1570
- {
1571
- "epoch": 1.67,
1572
- "learning_rate": 1.3277909738717339e-05,
1573
- "loss": 0.8599,
1574
- "step": 22800
1575
- },
1576
- {
1577
- "epoch": 1.68,
1578
- "learning_rate": 1.2980997624703087e-05,
1579
- "loss": 0.863,
1580
- "step": 22900
1581
- },
1582
- {
1583
- "epoch": 1.69,
1584
- "learning_rate": 1.2684085510688834e-05,
1585
- "loss": 0.847,
1586
- "step": 23000
1587
- },
1588
- {
1589
- "epoch": 1.69,
1590
- "eval_loss": 0.15907420217990875,
1591
- "eval_runtime": 1036.4519,
1592
- "eval_samples_per_second": 15.444,
1593
- "eval_steps_per_second": 1.931,
1594
- "eval_wer": 0.17514066426308314,
1595
- "step": 23000
1596
- },
1597
- {
1598
- "epoch": 1.69,
1599
- "learning_rate": 1.2387173396674582e-05,
1600
- "loss": 0.8487,
1601
- "step": 23100
1602
- },
1603
- {
1604
- "epoch": 1.7,
1605
- "learning_rate": 1.2090261282660333e-05,
1606
- "loss": 0.8637,
1607
- "step": 23200
1608
- },
1609
- {
1610
- "epoch": 1.71,
1611
- "learning_rate": 1.1793349168646081e-05,
1612
- "loss": 0.8456,
1613
- "step": 23300
1614
- },
1615
- {
1616
- "epoch": 1.72,
1617
- "learning_rate": 1.1496437054631828e-05,
1618
- "loss": 0.8518,
1619
- "step": 23400
1620
- },
1621
- {
1622
- "epoch": 1.72,
1623
- "learning_rate": 1.1199524940617576e-05,
1624
- "loss": 0.8456,
1625
- "step": 23500
1626
- },
1627
- {
1628
- "epoch": 1.73,
1629
- "learning_rate": 1.0902612826603325e-05,
1630
- "loss": 0.8349,
1631
- "step": 23600
1632
- },
1633
- {
1634
- "epoch": 1.74,
1635
- "learning_rate": 1.0605700712589072e-05,
1636
- "loss": 0.8426,
1637
- "step": 23700
1638
- },
1639
- {
1640
- "epoch": 1.75,
1641
- "learning_rate": 1.030878859857482e-05,
1642
- "loss": 0.8503,
1643
- "step": 23800
1644
- },
1645
- {
1646
- "epoch": 1.75,
1647
- "learning_rate": 1.001187648456057e-05,
1648
- "loss": 0.844,
1649
- "step": 23900
1650
- },
1651
- {
1652
- "epoch": 1.76,
1653
- "learning_rate": 9.714964370546319e-06,
1654
- "loss": 0.822,
1655
- "step": 24000
1656
- },
1657
- {
1658
- "epoch": 1.76,
1659
- "eval_loss": 0.1550702005624771,
1660
- "eval_runtime": 1027.8442,
1661
- "eval_samples_per_second": 15.573,
1662
- "eval_steps_per_second": 1.947,
1663
- "eval_wer": 0.17010133289631815,
1664
- "step": 24000
1665
- },
1666
- {
1667
- "epoch": 1.77,
1668
- "learning_rate": 9.418052256532066e-06,
1669
- "loss": 0.8452,
1670
- "step": 24100
1671
- },
1672
- {
1673
- "epoch": 1.78,
1674
- "learning_rate": 9.121140142517814e-06,
1675
- "loss": 0.843,
1676
- "step": 24200
1677
- },
1678
- {
1679
- "epoch": 1.78,
1680
- "learning_rate": 8.824228028503563e-06,
1681
- "loss": 0.8429,
1682
- "step": 24300
1683
- },
1684
- {
1685
- "epoch": 1.79,
1686
- "learning_rate": 8.527315914489311e-06,
1687
- "loss": 0.8513,
1688
- "step": 24400
1689
- },
1690
- {
1691
- "epoch": 1.8,
1692
- "learning_rate": 8.23040380047506e-06,
1693
- "loss": 0.834,
1694
- "step": 24500
1695
- },
1696
- {
1697
- "epoch": 1.8,
1698
- "learning_rate": 7.933491686460806e-06,
1699
- "loss": 0.8383,
1700
- "step": 24600
1701
- },
1702
- {
1703
- "epoch": 1.81,
1704
- "learning_rate": 7.636579572446555e-06,
1705
- "loss": 0.8294,
1706
- "step": 24700
1707
- },
1708
- {
1709
- "epoch": 1.82,
1710
- "learning_rate": 7.339667458432303e-06,
1711
- "loss": 0.8335,
1712
- "step": 24800
1713
- },
1714
- {
1715
- "epoch": 1.83,
1716
- "learning_rate": 7.042755344418052e-06,
1717
- "loss": 0.8207,
1718
- "step": 24900
1719
- },
1720
- {
1721
- "epoch": 1.83,
1722
- "learning_rate": 6.745843230403799e-06,
1723
- "loss": 0.8188,
1724
- "step": 25000
1725
- },
1726
- {
1727
- "epoch": 1.83,
1728
- "eval_loss": 0.1527515947818756,
1729
- "eval_runtime": 1034.5359,
1730
- "eval_samples_per_second": 15.473,
1731
- "eval_steps_per_second": 1.934,
1732
- "eval_wer": 0.16672812192723696,
1733
- "step": 25000
1734
- },
1735
- {
1736
- "epoch": 1.84,
1737
- "learning_rate": 6.448931116389549e-06,
1738
- "loss": 0.8289,
1739
- "step": 25100
1740
- },
1741
- {
1742
- "epoch": 1.85,
1743
- "learning_rate": 6.152019002375296e-06,
1744
- "loss": 0.8306,
1745
- "step": 25200
1746
- },
1747
- {
1748
- "epoch": 1.86,
1749
- "learning_rate": 5.855106888361045e-06,
1750
- "loss": 0.8335,
1751
- "step": 25300
1752
- },
1753
- {
1754
- "epoch": 1.86,
1755
- "learning_rate": 5.5581947743467925e-06,
1756
- "loss": 0.8291,
1757
- "step": 25400
1758
- },
1759
- {
1760
- "epoch": 1.87,
1761
- "learning_rate": 5.261282660332541e-06,
1762
- "loss": 0.8206,
1763
- "step": 25500
1764
- },
1765
- {
1766
- "epoch": 1.88,
1767
- "learning_rate": 4.9643705463182895e-06,
1768
- "loss": 0.8242,
1769
- "step": 25600
1770
- },
1771
- {
1772
- "epoch": 1.89,
1773
- "learning_rate": 4.667458432304038e-06,
1774
- "loss": 0.8189,
1775
- "step": 25700
1776
- },
1777
- {
1778
- "epoch": 1.89,
1779
- "learning_rate": 4.370546318289786e-06,
1780
- "loss": 0.8275,
1781
- "step": 25800
1782
- },
1783
- {
1784
- "epoch": 1.9,
1785
- "learning_rate": 4.073634204275534e-06,
1786
- "loss": 0.8142,
1787
- "step": 25900
1788
- },
1789
- {
1790
- "epoch": 1.91,
1791
- "learning_rate": 3.776722090261282e-06,
1792
- "loss": 0.8305,
1793
- "step": 26000
1794
- },
1795
- {
1796
- "epoch": 1.91,
1797
- "eval_loss": 0.14921718835830688,
1798
- "eval_runtime": 1026.6478,
1799
- "eval_samples_per_second": 15.592,
1800
- "eval_steps_per_second": 1.949,
1801
- "eval_wer": 0.16312957500273134,
1802
- "step": 26000
1803
- },
1804
- {
1805
- "epoch": 1.91,
1806
- "learning_rate": 3.4798099762470307e-06,
1807
- "loss": 0.833,
1808
- "step": 26100
1809
- },
1810
- {
1811
- "epoch": 1.92,
1812
- "learning_rate": 3.1828978622327788e-06,
1813
- "loss": 0.8175,
1814
- "step": 26200
1815
- },
1816
- {
1817
- "epoch": 1.93,
1818
- "learning_rate": 2.888954869358669e-06,
1819
- "loss": 0.8259,
1820
- "step": 26300
1821
- },
1822
- {
1823
- "epoch": 1.94,
1824
- "learning_rate": 2.5920427553444177e-06,
1825
- "loss": 0.8262,
1826
- "step": 26400
1827
- },
1828
- {
1829
- "epoch": 1.94,
1830
- "learning_rate": 2.295130641330166e-06,
1831
- "loss": 0.8223,
1832
- "step": 26500
1833
- },
1834
- {
1835
- "epoch": 1.95,
1836
- "learning_rate": 1.9982185273159142e-06,
1837
- "loss": 0.8285,
1838
- "step": 26600
1839
- },
1840
- {
1841
- "epoch": 1.96,
1842
- "learning_rate": 1.7013064133016625e-06,
1843
- "loss": 0.8226,
1844
- "step": 26700
1845
- },
1846
- {
1847
- "epoch": 1.97,
1848
- "learning_rate": 1.404394299287411e-06,
1849
- "loss": 0.8154,
1850
- "step": 26800
1851
- },
1852
- {
1853
- "epoch": 1.97,
1854
- "learning_rate": 1.107482185273159e-06,
1855
- "loss": 0.8176,
1856
- "step": 26900
1857
- },
1858
- {
1859
- "epoch": 1.98,
1860
- "learning_rate": 8.105700712589074e-07,
1861
- "loss": 0.8122,
1862
- "step": 27000
1863
- },
1864
- {
1865
- "epoch": 1.98,
1866
- "eval_loss": 0.14789555966854095,
1867
- "eval_runtime": 1030.7995,
1868
- "eval_samples_per_second": 15.529,
1869
- "eval_steps_per_second": 1.941,
1870
- "eval_wer": 0.16106740959248333,
1871
- "step": 27000
1872
- },
1873
- {
1874
- "epoch": 1.99,
1875
- "learning_rate": 5.136579572446555e-07,
1876
- "loss": 0.818,
1877
- "step": 27100
1878
- },
1879
- {
1880
- "epoch": 2.0,
1881
- "learning_rate": 2.167458432304038e-07,
1882
- "loss": 0.8284,
1883
- "step": 27200
1884
- },
1885
- {
1886
- "epoch": 2.0,
1887
- "step": 27260,
1888
- "total_flos": 4.0396309180498005e+20,
1889
- "train_loss": 0.32739020716330625,
1890
- "train_runtime": 49115.8494,
1891
- "train_samples_per_second": 17.761,
1892
- "train_steps_per_second": 0.555
1893
- }
1894
- ],
1895
- "max_steps": 27260,
1896
- "num_train_epochs": 2,
1897
- "total_flos": 4.0396309180498005e+20,
1898
- "trial_name": null,
1899
- "trial_params": null
1900
- }
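The removed entries above are evidently the flattened `log_history` of a Trainer state file: over steps 6900–27260 the training loss falls from about 1.09 to 0.82 and the eval WER from 0.266 to 0.161, with an evaluation block every 1000 steps. A minimal sketch for inspecting such a history, assuming a local copy saved as `trainer_state.json` with the standard schema shown above (the file name is an assumption, not part of this commit):

```python
# Minimal sketch: load a Trainer state file and print the eval trajectory.
# Assumes "trainer_state.json" exists locally with a top-level "log_history"
# list like the entries removed above.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_*" fields.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_wer" in e]

for e in eval_logs:
    print(f"step {e['step']:>6}: eval_loss={e['eval_loss']:.4f}, eval_wer={e['eval_wer']:.4f}")
```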
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5702656574d36c90da8eb2dc371da036eaa568fdda0350f1ed96dfd11f67b798
+ oid sha256:3d671fb0f181e146452d1d68a46c3b54df59aa573465bc6cf0a59cb0e02b849a
  size 2991
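training_args.bin is stored through Git LFS, so the diff only swaps the pointer file: a pointer records the blob's SHA-256 ("oid") and byte size, not its contents. A minimal sketch (illustrative, not part of this repo) of how a downloaded file can be checked against such a pointer:

```python
# Minimal sketch: verify a file against a Git LFS pointer by recomputing
# the two values the pointer stores, byte size and SHA-256 digest.
import hashlib
import os

def matches_lfs_pointer(path: str, oid_sha256: str, size: int) -> bool:
    """True if the file at `path` has the size and SHA-256 the pointer claims."""
    if os.path.getsize(path) != size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == oid_sha256

# e.g., for the new pointer above:
# matches_lfs_pointer("training_args.bin",
#                     "3d671fb0f181e146452d1d68a46c3b54df59aa573465bc6cf0a59cb0e02b849a",
#                     2991)
```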
vocab.json CHANGED
@@ -1 +1 @@
- {"=": 1, "@": 2, "[": 3, "]": 4, "_": 5, "`": 6, "a": 7, "b": 8, "c": 9, "d": 10, "e": 11, "f": 12, "g": 13, "h": 14, "i": 15, "j": 16, "k": 17, "l": 18, "m": 19, "n": 20, "o": 21, "p": 22, "q": 23, "r": 24, "s": 25, "t": 26, "u": 27, "v": 28, "w": 29, "x": 30, "y": 31, "z": 32, "¡": 33, "§": 34, "«": 35, "°": 36, "´": 37, "µ": 38, "·": 39, "»": 40, "×": 41, "ß": 42, "à": 43, "á": 44, "â": 45, "ã": 46, "ä": 47, "å": 48, "æ": 49, "ç": 50, "è": 51, "é": 52, "ê": 53, "ë": 54, "ì": 55, "í": 56, "î": 57, "ï": 58, "ð": 59, "ñ": 60, "ò": 61, "ó": 62, "ô": 63, "õ": 64, "ö": 65, "ø": 66, "ù": 67, "ú": 68, "û": 69, "ü": 70, "ý": 71, "þ": 72, "ā": 73, "ă": 74, "ą": 75, "ć": 76, "č": 77, "ď": 78, "đ": 79, "ē": 80, "ė": 81, "ę": 82, "ě": 83, "ğ": 84, "ġ": 85, "ħ": 86, "ī": 87, "ı": 88, "ł": 89, "ń": 90, "ņ": 91, "ň": 92, "ō": 93, "ŏ": 94, "ő": 95, "œ": 96, "ř": 97, "ś": 98, "ş": 99, "š": 100, "ť": 101, "ū": 102, "ů": 103, "ź": 104, "ż": 105, "ž": 106, "ơ": 107, "ǐ": 108, "ǔ": 109, "ș": 110, "ț": 111, "ə": 112, "ʻ": 113, "ʾ": 114, "ʿ": 115, "̆": 116, "̇": 117, "̥": 118, "а": 119, "в": 120, "е": 121, "и": 122, "к": 123, "м": 124, "о": 125, "р": 126, "с": 127, "ф": 128, "ч": 129, "ш": 130, "ѹ": 131, "א": 132, "ב": 133, "נ": 134, "ע": 135, "ש": 136, "་": 137, "ན": 138, "ḫ": 139, "ṟ": 140, "ṣ": 141, "ṭ": 142, "ạ": 143, "ả": 144, "ắ": 145, "ằ": 146, "ế": 147, "ễ": 148, "ệ": 149, "ọ": 150, "ồ": 151, "ộ": 152, "ụ": 153, "ứ": 154, "‑": 155, "‚": 156, "„": 157, "‟": 158, "′": 159, "″": 160, "‹": 161, "›": 162, "→": 163, "−": 164, "≡": 165, "⟨": 166, "⟩": 167, "カ": 168, "东": 169, "临": 170, "乡": 171, "关": 172, "合": 173, "城": 174, "孙": 175, "尣": 176, "幺": 177, "支": 178, "比": 179, "毛": 180, "泽": 181, "無": 182, "生": 183, "臣": 184, "辶": 185, "道": 186, "镇": 187, "黃": 188, "|": 0, "[UNK]": 189, "[PAD]": 190}
+ {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8, "i": 9, "j": 10, "k": 11, "l": 12, "m": 13, "n": 14, "o": 15, "p": 16, "q": 17, "r": 18, "s": 19, "t": 20, "u": 21, "v": 22, "w": 23, "x": 24, "y": 25, "z": 26, "ß": 27, "à": 28, "á": 29, "â": 30, "ä": 31, "æ": 32, "ç": 33, "é": 34, "í": 35, "î": 36, "ó": 37, "ô": 38, "ö": 39, "ø": 40, "ú": 41, "ü": 42, "þ": 43, "ā": 44, "č": 45, "đ": 46, "ħ": 47, "ī": 48, "ł": 49, "ō": 50, "ő": 51, "œ": 52, "ř": 53, "ś": 54, "ş": 55, "š": 56, "ż": 57, "ž": 58, "ș": 59, "ț": 60, "ə": 61, "̇": 62, "о": 63, "с": 64, "ш": 65, "ѹ": 66, "": 67, "|": 0, "[UNK]": 68, "[PAD]": 69}