pere committed on
Commit
8b460b7
1 Parent(s): 66d5aa3
run_speech_recognition_whisper.py → old/run_speech_recognition_whisper.py RENAMED
File without changes
run_whisper.py → old/run_whisper.py RENAMED
File without changes
old/run_whisper_finetuning.py ADDED
@@ -0,0 +1,813 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ Fine-tuning a 🤗 Transformers Whisper model for automatic speech recognition"""
17
+
18
+ import functools
19
+ import json
20
+ import logging
21
+ import os
22
+ import re
23
+ import sys
24
+ import warnings
25
+ from dataclasses import dataclass, field
26
+ from typing import Dict, List, Optional, Union
27
+
28
+ import datasets
29
+ import numpy as np
30
+ import torch
31
+ from datasets import DatasetDict, load_dataset, load_metric, Audio
32
+
33
+ import transformers
34
+ from transformers import (
35
+ AutoConfig,
36
+ AutoFeatureExtractor,
37
+ AutoModelForCTC,
38
+ AutoProcessor,
39
+ AutoTokenizer,
40
+ HfArgumentParser,
41
+ Trainer,
42
+ TrainingArguments,
43
+ Wav2Vec2Processor,
44
+ set_seed,
45
+ )
46
+ from transformers.trainer_utils import get_last_checkpoint, is_main_process
47
+ from transformers.utils import check_min_version
48
+ from transformers.utils.versions import require_version
49
+ from transformers import WhisperFeatureExtractor
50
+ from transformers import WhisperTokenizer
51
+ from transformers import WhisperProcessor
52
+ from transformers import WhisperForConditionalGeneration
53
+
54
+ from transformers import Seq2SeqTrainingArguments
55
+ from transformers import Seq2SeqTrainer
56
+
57
+ import evaluate
58
+
59
+ logger = logging.getLogger(__name__)
60
+
61
+
62
+ # ** Check if this is needed...
63
+
64
+ def list_field(default=None, metadata=None):
65
+ return field(default_factory=lambda: default, metadata=metadata)
66
+
67
+
68
+ @dataclass
69
+ class ModelArguments:
70
+ """
71
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
72
+ """
73
+
74
+ model_name_or_path: str = field(
75
+ metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
76
+ )
77
+ tokenizer_name_or_path: Optional[str] = field(
78
+ default=None,
79
+ metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
80
+ )
81
+ cache_dir: Optional[str] = field(
82
+ default=None,
83
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
84
+ )
85
+ freeze_feature_encoder: bool = field(
86
+ default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
87
+ )
88
+ attention_dropout: float = field(
89
+ default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
90
+ )
91
+ activation_dropout: float = field(
92
+ default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
93
+ )
94
+ feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
95
+ hidden_dropout: float = field(
96
+ default=0.0,
97
+ metadata={
98
+ "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
99
+ },
100
+ )
101
+ final_dropout: float = field(
102
+ default=0.0,
103
+ metadata={"help": "The dropout probability for the final projection layer."},
104
+ )
105
+ mask_time_prob: float = field(
106
+ default=0.05,
107
+ metadata={
108
+ "help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
109
+ "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
110
+ "vectors will be masked along the time axis."
111
+ },
112
+ )
113
+ mask_time_length: int = field(
114
+ default=10,
115
+ metadata={"help": "Length of vector span to mask along the time axis."},
116
+ )
117
+ mask_feature_prob: float = field(
118
+ default=0.0,
119
+ metadata={
120
+ "help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
121
+ "span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
122
+ },
123
+ )
124
+ mask_feature_length: int = field(
125
+ default=10,
126
+ metadata={"help": "Length of vector span to mask along the feature axis."},
127
+ )
128
+ layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
129
+ ctc_loss_reduction: Optional[str] = field(
130
+ default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
131
+ )
132
+ ctc_zero_infinity: Optional[bool] = field(
133
+ default=False, metadata={"help": "If True, will try to avoid the CTC loss going to infinity."}
134
+ )
135
+
136
+
137
+ @dataclass
138
+ class DataTrainingArguments:
139
+ """
140
+ Arguments pertaining to what data we are going to input our model for training and eval.
141
+
142
+ Using `HfArgumentParser` we can turn this class
143
+ into argparse arguments to be able to specify them on
144
+ the command line.
145
+ """
146
+
147
+ # dataset_name: str = field(
148
+ # metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
149
+ # )
150
+ # dataset_config_name: str = field(
151
+ # default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
152
+ # )
153
+ train_split_name: str = field(
154
+ default="train",
155
+ metadata={
156
+ "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
157
+ },
158
+ )
159
+ eval_split_name: str = field(
160
+ default="test",
161
+ metadata={
162
+ "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
163
+ },
164
+ )
165
+ audio_column_name: str = field(
166
+ default="audio",
167
+ metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
168
+ )
169
+ text_column_name: str = field(
170
+ default="text",
171
+ metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
172
+ )
173
+ overwrite_cache: bool = field(
174
+ default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
175
+ )
176
+ preprocessing_num_workers: Optional[int] = field(
177
+ default=None,
178
+ metadata={"help": "The number of processes to use for the preprocessing."},
179
+ )
180
+ max_train_samples: Optional[int] = field(
181
+ default=None,
182
+ metadata={
183
+ "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
184
+ "value if set."
185
+ },
186
+ )
187
+ max_eval_samples: Optional[int] = field(
188
+ default=None,
189
+ metadata={
190
+ "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
191
+ "value if set."
192
+ },
193
+ )
194
+ chars_to_ignore: Optional[List[str]] = list_field(
195
+ default=None,
196
+ metadata={"help": "A list of characters to remove from the transcripts."},
197
+ )
198
+ eval_metrics: List[str] = list_field(
199
+ default=["wer"],
200
+ metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
201
+ )
202
+ max_duration_in_seconds: float = field(
203
+ default=20.0,
204
+ metadata={
205
+ "help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
206
+ },
207
+ )
208
+ min_duration_in_seconds: float = field(
209
+ default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
210
+ )
211
+ preprocessing_only: bool = field(
212
+ default=False,
213
+ metadata={
214
+ "help": "Whether to only do data preprocessing and skip training. "
215
+ "This is especially useful when data preprocessing errors out in distributed training due to timeout. "
216
+ "In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
217
+ "so that the cached datasets can consequently be loaded in distributed training"
218
+ },
219
+ )
220
+ use_auth_token: bool = field(
221
+ default=False,
222
+ metadata={
223
+ "help": "If :obj:`True`, will use the token generated when running"
224
+ ":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
225
+ },
226
+ )
227
+ unk_token: str = field(
228
+ default="[UNK]",
229
+ metadata={"help": "The unk token for the tokenizer"},
230
+ )
231
+ pad_token: str = field(
232
+ default="[PAD]",
233
+ metadata={"help": "The padding token for the tokenizer"},
234
+ )
235
+ word_delimiter_token: str = field(
236
+ default="|",
237
+ metadata={"help": "The word delimiter token for the tokenizer"},
238
+ )
239
+ phoneme_language: Optional[str] = field(
240
+ default=None,
241
+ metadata={
242
+ "help": "The target language that should be used be"
243
+ " passed to the tokenizer for tokenization. Note that"
244
+ " this is only relevant if the model classifies the"
245
+ " input audio to a sequence of phoneme sequences."
246
+ },
247
+ )
248
+
249
+
250
+ @dataclass
251
+ class DataCollatorCTCWithPadding:
252
+ """
253
+ Data collator that will dynamically pad the inputs received.
254
+ Args:
255
+ processor (:class:`~transformers.AutoProcessor`)
256
+ The processor used for processing the data.
257
+ padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
258
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
259
+ among:
260
+ * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
261
+ sequence is provided).
262
+ * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
263
+ maximum acceptable input length for the model if that argument is not provided.
264
+ * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
265
+ different lengths).
266
+ max_length (:obj:`int`, `optional`):
267
+ Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
268
+ max_length_labels (:obj:`int`, `optional`):
269
+ Maximum length of the ``labels`` returned list and optionally padding length (see above).
270
+ pad_to_multiple_of (:obj:`int`, `optional`):
271
+ If set will pad the sequence to a multiple of the provided value.
272
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
273
+ 7.5 (Volta).
274
+ """
275
+
276
+ processor: AutoProcessor
277
+ padding: Union[bool, str] = "longest"
278
+ pad_to_multiple_of: Optional[int] = None
279
+ pad_to_multiple_of_labels: Optional[int] = None
280
+
281
+ def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
282
+ # split inputs and labels since they have to be of different lengths and need
283
+ # different padding methods
284
+ input_features = [{"input_values": feature["input_values"]} for feature in features]
285
+ label_features = [{"input_ids": feature["labels"]} for feature in features]
286
+
287
+ batch = self.processor.pad(
288
+ input_features,
289
+ padding=self.padding,
290
+ pad_to_multiple_of=self.pad_to_multiple_of,
291
+ return_tensors="pt",
292
+ )
293
+
294
+ with self.processor.as_target_processor():
295
+ labels_batch = self.processor.pad(
296
+ label_features,
297
+ padding=self.padding,
298
+ pad_to_multiple_of=self.pad_to_multiple_of_labels,
299
+ return_tensors="pt",
300
+ )
301
+
302
+ # replace padding with -100 to ignore loss correctly
303
+ labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
304
+
305
+ batch["labels"] = labels
306
+
307
+ return batch
308
+
309
+
310
+ def create_vocabulary_from_data(
311
+ datasets: DatasetDict,
312
+ word_delimiter_token: Optional[str] = None,
313
+ unk_token: Optional[str] = None,
314
+ pad_token: Optional[str] = None,
315
+ ):
316
+ # Given training and test labels create vocabulary
317
+ alphabet = set()
318
+
319
+ def extract_all_chars(batch):
320
+ all_text = " ".join(batch["target_text"])
321
+ alphabet.update(all_text)
322
+
323
+ datasets.map(
324
+ extract_all_chars,
325
+ batched=True,
326
+ batch_size=-1,
327
+ keep_in_memory=True,
328
+ remove_columns=datasets["train"].column_names,
329
+ )
330
+
331
+ # # take union of all unique characters in each dataset
332
+ # vocab_set = functools.reduce(
333
+ # lambda vocab_1, vocab_2: {"vocab": list(set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]))}, vocabs.values()
334
+ # )["vocab"][0]
335
+
336
+ vocab_dict = {v: k for k, v in enumerate(sorted(list(alphabet)))}
337
+
338
+ # replace white space with delimiter token
339
+ if word_delimiter_token is not None:
340
+ vocab_dict[word_delimiter_token] = vocab_dict[" "]
341
+ del vocab_dict[" "]
342
+
343
+ # add unk and pad token
344
+ if unk_token is not None:
345
+ vocab_dict[unk_token] = len(vocab_dict)
346
+
347
+ if pad_token is not None:
348
+ vocab_dict[pad_token] = len(vocab_dict)
349
+
350
+ return vocab_dict
351
+
352
+
353
+ def make_dataset(seed=42):
354
+ # Pre-processing dataset
355
+ import re
356
+
357
+ def map_nst(entry):
358
+ text = entry["text"].lower()
359
+ text = text.replace("(...Vær stille under dette opptaket...)", "")
360
+ text = re.sub('[áàâ]', 'a', text)
361
+ text = re.sub('[ä]', 'æ', text)
362
+ text = re.sub('[éèëê]', 'e', text)
363
+ text = re.sub('[íìïî]', 'i', text)
364
+ text = re.sub('[óòöô]', 'o', text)
365
+ text = re.sub('[ö]', 'ø', text)
366
+ text = re.sub('[ç]', 'c', text)
367
+ text = re.sub('[úùüû]', 'u', text)
368
+ # text = re.sub('\\(?=(Punktum|Komma|Utropstegn|Spørsmålstegn))', ' ', text)
369
+ text = re.sub('\s+', ' ', text)
370
+ return {"text": text}
371
+
372
+ def filter_nst(entry):
373
+ if not ((len(entry["text"]) <= len(entry["audio"]["array"]) // 320) and (len(entry["text"].strip()) >= 3)):
374
+ return False # Too short
375
+ if re.match("pIW|CA", entry["type"]):
376
+ return False # Spelling out words
377
+ return True
378
+
379
+ def filter_npsc(entry):
380
+ # False if there are digits in the text
381
+ if not ((len(entry["text"]) <= len(entry["audio"]["array"]) // 320) and (len(entry["text"].strip()) >= 3)):
382
+ return False # Too short
383
+ if re.search("\d", entry["text"]):
384
+ return False
385
+ return True
386
+
387
+ def map_npsc(entry):
388
+ batch = {"text": entry["text"].lower()}
389
+ batch["text"] = re.sub('[áàâ]', 'a', batch["text"])
390
+ batch["text"] = re.sub('[ä]', 'æ', batch["text"])
391
+ batch["text"] = re.sub('[éèëê]', 'e', batch["text"])
392
+ batch["text"] = re.sub('[íìïî]', 'i', batch["text"])
393
+ batch["text"] = re.sub('[óòöô]', 'o', batch["text"])
394
+ batch["text"] = re.sub('[ö]', 'ø', batch["text"])
395
+ batch["text"] = re.sub('[ç]', 'c', batch["text"])
396
+ batch["text"] = re.sub('[úùüû]', 'u', batch["text"])
397
+ batch["text"] = re.sub('\s', ' ', batch["text"])
398
+ batch["text"] = re.sub('<ee>', 'eee', batch["text"])
399
+ batch["text"] = re.sub('<qq>', 'qqq', batch["text"])
400
+ batch["text"] = re.sub('<mm>', 'mmm', batch["text"])
401
+ batch["text"] = re.sub('<inaudible>', 'xxx', batch["text"])
402
+ # batch["text"] = re.sub('<inaudible>', '?', batch["text"])
403
+ if "<" in batch["text"]:
404
+ raise ValueError(batch["text"])
405
+ return batch
406
+
407
+ nst = datasets.load_dataset("NbAiLab/NST", "no-close")
408
+ npsc = datasets.load_dataset("NbAiLab/NPSC", "16K_mp3")
409
+ # TODO NST_hesitate
410
+
411
+ split = len(npsc["train"]) / (len(npsc["train"]) + len(npsc["validation"])) # Use same train/val ratio as NPSC
412
+ nst_train = nst["train"].train_test_split(train_size=split, seed=seed)
413
+ nst["train"] = nst_train["train"]
414
+ nst["validation"] = nst_train["test"]
415
+
416
+ nst = nst.filter(filter_nst).map(map_nst).shuffle(seed=seed)
417
+ npsc = npsc.filter(filter_npsc).map(map_npsc).shuffle(seed=seed)
418
+
419
+ npsc_base = npsc.remove_columns([col for col in npsc["train"].column_names if col not in ["text", "audio"]])
420
+ nst_base = nst.remove_columns([col for col in nst["train"].column_names if col not in ["text", "audio"]])
421
+
422
+ combined = {}
423
+ for split in "train", "validation", "test":
424
+ probs = np.array([len(nst_base[split]), len(npsc_base[split])]) # Weight by number of examples
425
+ probs = (probs / probs.sum()).tolist()
426
+ comb = datasets.interleave_datasets([nst_base[split], npsc_base[split]], probabilities=probs, seed=seed)
427
+ combined[split] = comb
428
+
429
+ return datasets.DatasetDict(**combined)
430
+
431
+
432
+ def main():
433
+ # See all possible arguments in src/transformers/training_args.py
434
+ # or by passing the --help flag to this script.
435
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
436
+
437
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
438
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
439
+ # If we pass only one argument to the script and it's the path to a json file,
440
+ # let's parse it to get our arguments.
441
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
442
+ else:
443
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
444
+
445
+ # Detecting last checkpoint.
446
+ last_checkpoint = None
447
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
448
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
449
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
450
+ raise ValueError(
451
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
452
+ "Use --overwrite_output_dir to overcome."
453
+ )
454
+ elif last_checkpoint is not None:
455
+ logger.info(
456
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
457
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
458
+ )
459
+
460
+ # Setup logging
461
+ logging.basicConfig(
462
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
463
+ datefmt="%m/%d/%Y %H:%M:%S",
464
+ handlers=[logging.StreamHandler(sys.stdout)],
465
+ )
466
+ logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
467
+
468
+ # Log on each process the small summary:
469
+ logger.warning(
470
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
471
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
472
+ )
473
+ # Set the verbosity to info of the Transformers logger (on main process only):
474
+ if is_main_process(training_args.local_rank):
475
+ transformers.utils.logging.set_verbosity_info()
476
+ logger.info("Training/evaluation parameters %s", training_args)
477
+
478
+ # Set seed before initializing model.
479
+ set_seed(training_args.seed)
480
+
481
+ # 1. First, let's load the dataset
482
+ raw_datasets = make_dataset(seed=training_args.seed)
483
+
484
+ if training_args.do_train:
485
+ if data_args.audio_column_name not in raw_datasets["train"].column_names:
486
+ raise ValueError(
487
+ f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
488
+ "Make sure to set `--audio_column_name` to the correct audio column - one of "
489
+ f"{', '.join(raw_datasets['train'].column_names)}."
490
+ )
491
+
492
+ if data_args.text_column_name not in raw_datasets["train"].column_names:
493
+ raise ValueError(
494
+ f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
495
+ "Make sure to set `--text_column_name` to the correct text column - one of "
496
+ f"{', '.join(raw_datasets['train'].column_names)}."
497
+ )
498
+
499
+ if data_args.max_train_samples is not None:
500
+ raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
501
+
502
+ if training_args.do_eval:
503
+ if data_args.max_eval_samples is not None:
504
+ raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
505
+
506
+ # 2. We remove some special characters from the datasets
507
+ # that make training complicated and do not help in transcribing the speech
508
+ # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
509
+ # that could be easily picked up by the model
510
+ # chars_to_ignore_regex = (
511
+ # f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
512
+ # )
513
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/]'
514
+
515
+ text_column_name = data_args.text_column_name
516
+
517
+ def remove_special_characters(batch):
518
+ if chars_to_ignore_regex is not None:
519
+ batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
520
+ else:
521
+ batch["target_text"] = batch[text_column_name].lower() + " "
522
+ return batch
523
+
524
+ with training_args.main_process_first(desc="dataset map special characters removal"):
525
+ raw_datasets = raw_datasets.map(
526
+ remove_special_characters,
527
+ remove_columns=[text_column_name],
528
+ desc="remove special characters from datasets",
529
+ )
530
+
531
+ # save special tokens for tokenizer
532
+ word_delimiter_token = data_args.word_delimiter_token
533
+ unk_token = data_args.unk_token
534
+ pad_token = data_args.pad_token
535
+
536
+ # 3. Next, let's load the config as we might need it to create
537
+ # the tokenizer
538
+ # load config
539
+ config = AutoConfig.from_pretrained(
540
+ model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
541
+ )
542
+
543
+ # 4. Next, if no tokenizer file is defined,
544
+ # we create the vocabulary of the model by extracting all unique characters from
545
+ # the training and evaluation datasets
546
+ # We need to make sure that only first rank saves vocabulary
547
+ # make sure all processes wait until vocab is created
548
+ tokenizer_name_or_path = model_args.tokenizer_name_or_path
549
+ tokenizer_kwargs = {}
550
+ if tokenizer_name_or_path is None:
551
+ # save vocab in training output dir
552
+ tokenizer_name_or_path = training_args.output_dir
553
+
554
+ vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
555
+
556
+ with training_args.main_process_first():
557
+ if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
558
+ os.remove(vocab_file)
559
+
560
+ with training_args.main_process_first(desc="dataset map vocabulary creation"):
561
+ if not os.path.isfile(vocab_file):
562
+ os.makedirs(tokenizer_name_or_path, exist_ok=True)
563
+ vocab_dict = create_vocabulary_from_data(
564
+ raw_datasets,
565
+ word_delimiter_token=word_delimiter_token,
566
+ unk_token=unk_token,
567
+ pad_token=pad_token,
568
+ )
569
+
570
+ # save vocab dict to be loaded into tokenizer
571
+ with open(vocab_file, "w") as file:
572
+ json.dump(vocab_dict, file)
573
+
574
+ # if tokenizer has just been created
575
+ # it is defined by `tokenizer_class` if present in config else by `model_type`
576
+ tokenizer_kwargs = {
577
+ "config": config if config.tokenizer_class is not None else None,
578
+ "tokenizer_type": config.model_type if config.tokenizer_class is None else None,
579
+ "unk_token": unk_token,
580
+ "pad_token": pad_token,
581
+ "word_delimiter_token": word_delimiter_token,
582
+ }
583
+
584
+ # 5. Now we can instantiate the feature extractor, tokenizer and model
585
+ # Note for distributed training, the .from_pretrained methods guarantee that only
586
+ # one local process can concurrently download model & vocab.
587
+
588
+ # load feature_extractor and tokenizer
589
+ tokenizer = AutoTokenizer.from_pretrained(
590
+ tokenizer_name_or_path,
591
+ use_auth_token=data_args.use_auth_token,
592
+ **tokenizer_kwargs,
593
+ )
594
+ feature_extractor = AutoFeatureExtractor.from_pretrained(
595
+ model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
596
+ )
597
+
598
+ # adapt config
599
+ config.update(
600
+ {
601
+ "feat_proj_dropout": model_args.feat_proj_dropout,
602
+ "attention_dropout": model_args.attention_dropout,
603
+ "hidden_dropout": model_args.hidden_dropout,
604
+ "final_dropout": model_args.final_dropout,
605
+ "mask_time_prob": model_args.mask_time_prob,
606
+ "mask_time_length": model_args.mask_time_length,
607
+ "mask_feature_prob": model_args.mask_feature_prob,
608
+ "mask_feature_length": model_args.mask_feature_length,
609
+ "gradient_checkpointing": training_args.gradient_checkpointing,
610
+ "layerdrop": model_args.layerdrop,
611
+ "ctc_loss_reduction": model_args.ctc_loss_reduction,
612
+ "ctc_zero_infinity": model_args.ctc_zero_infinity,
613
+ "pad_token_id": tokenizer.pad_token_id,
614
+ "vocab_size": len(tokenizer),
615
+ "activation_dropout": model_args.activation_dropout,
616
+ }
617
+ )
618
+
619
+ # create model
620
+ model = AutoModelForCTC.from_pretrained(
621
+ model_args.model_name_or_path,
622
+ cache_dir=model_args.cache_dir,
623
+ config=config,
624
+ use_auth_token=data_args.use_auth_token,
625
+ )
626
+
627
+ # freeze encoder
628
+ if model_args.freeze_feature_encoder:
629
+ model.freeze_feature_encoder()
630
+
631
+ # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
632
+ # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
633
+ # so that we just need to set the correct target sampling rate and normalize the input
634
+ # via the `feature_extractor`
635
+
636
+ # make sure that dataset decodes audio with correct sampling rate
637
+ dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
638
+ if dataset_sampling_rate != feature_extractor.sampling_rate:
639
+ raw_datasets = raw_datasets.cast_column(
640
+ data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
641
+ )
642
+
643
+ # derive max & min input length for sample rate & max duration
644
+ max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
645
+ min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
646
+ audio_column_name = data_args.audio_column_name
647
+ num_workers = data_args.preprocessing_num_workers
648
+
649
+ # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
650
+ phoneme_language = data_args.phoneme_language
651
+
652
+ # Preprocessing the datasets.
653
+ # We need to read the audio files as arrays and tokenize the targets.
654
+ def prepare_dataset(batch):
655
+ # load audio
656
+ sample = batch[audio_column_name]
657
+
658
+ inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
659
+ batch["input_values"] = inputs.input_values[0]
660
+ batch["input_length"] = len(batch["input_values"])
661
+
662
+ # encode targets
663
+ additional_kwargs = {}
664
+ if phoneme_language is not None:
665
+ additional_kwargs["phonemizer_lang"] = phoneme_language
666
+
667
+ batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
668
+ return batch
669
+
670
+ with training_args.main_process_first(desc="dataset map preprocessing"):
671
+ vectorized_datasets = raw_datasets.map(
672
+ prepare_dataset,
673
+ remove_columns=next(iter(raw_datasets.values())).column_names,
674
+ num_proc=num_workers,
675
+ desc="preprocess datasets",
676
+ )
677
+
678
+ def is_audio_in_length_range(length):
679
+ return length > min_input_length and length < max_input_length
680
+
681
+ # filter data that is shorter than min_input_length
682
+ vectorized_datasets = vectorized_datasets.filter(
683
+ is_audio_in_length_range,
684
+ num_proc=num_workers,
685
+ input_columns=["input_length"],
686
+ )
687
+
688
+ # 7. Next, we can prepare the training.
689
+ # Let's use word error rate (WER) as our evaluation metric,
690
+ # instantiate a data collator and the trainer
691
+
692
+ # Define evaluation metrics during training, *i.e.* word error rate, character error rate
693
+ eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}
694
+
695
+ # for large datasets it is advised to run the preprocessing on a
696
+ # single machine first with ``args.preprocessing_only`` since there will most likely
697
+ # be a timeout when running the script in distributed mode.
698
+ # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
699
+ # cached dataset
700
+ if data_args.preprocessing_only:
701
+ logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
702
+ return
703
+
704
+ def compute_metrics(pred):
705
+ pred_logits = pred.predictions
706
+ pred_ids = np.argmax(pred_logits, axis=-1)
707
+
708
+ pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
709
+
710
+ pred_str = tokenizer.batch_decode(pred_ids)
711
+ # we do not want to group tokens when computing the metrics
712
+ label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
713
+
714
+ metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
715
+
716
+ return metrics
717
+
718
+ # Now save everything to be able to create a single processor later
719
+ if is_main_process(training_args.local_rank):
720
+ # save feature extractor, tokenizer and config
721
+ feature_extractor.save_pretrained(training_args.output_dir)
722
+ tokenizer.save_pretrained(training_args.output_dir)
723
+ config.save_pretrained(training_args.output_dir)
724
+
725
+ try:
726
+ processor = AutoProcessor.from_pretrained(training_args.output_dir)
727
+ except (OSError, KeyError):
728
+ warnings.warn(
729
+ "Loading a processor from a feature extractor config that does not"
730
+ " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
731
+ " attribute to your `preprocessor_config.json` file to suppress this warning: "
732
+ " `'processor_class': 'Wav2Vec2Processor'`",
733
+ FutureWarning,
734
+ )
735
+ processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)
736
+
737
+ # Instantiate custom data collator
738
+ data_collator = DataCollatorCTCWithPadding(processor=processor)
739
+
740
+ # Initialize Trainer
741
+ trainer = Trainer(
742
+ model=model,
743
+ data_collator=data_collator,
744
+ args=training_args,
745
+ compute_metrics=compute_metrics,
746
+ train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
747
+ eval_dataset=vectorized_datasets["validation"] if training_args.do_eval else None,
748
+ tokenizer=feature_extractor,
749
+ )
750
+
751
+ # 8. Finally, we can start training
752
+
753
+ # Training
754
+ if training_args.do_train:
755
+
756
+ # use last checkpoint if exist
757
+ if last_checkpoint is not None:
758
+ checkpoint = last_checkpoint
759
+ elif os.path.isdir(model_args.model_name_or_path):
760
+ checkpoint = model_args.model_name_or_path
761
+ else:
762
+ checkpoint = None
763
+
764
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
765
+ trainer.save_model()
766
+
767
+ metrics = train_result.metrics
768
+ max_train_samples = (
769
+ data_args.max_train_samples
770
+ if data_args.max_train_samples is not None
771
+ else len(vectorized_datasets["train"])
772
+ )
773
+ metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))
774
+
775
+ trainer.log_metrics("train", metrics)
776
+ trainer.save_metrics("train", metrics)
777
+ trainer.save_state()
778
+
779
+ # Evaluation
780
+ results = {}
781
+ if training_args.do_eval:
782
+ logger.info("*** Evaluate ***")
783
+ metrics = trainer.evaluate()
784
+ max_eval_samples = (
785
+ data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["validation"])
786
+ )
787
+ metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
788
+
789
+ trainer.log_metrics("eval", metrics)
790
+ trainer.save_metrics("eval", metrics)
791
+
792
+ # Write model card and (optionally) push to hub
793
+ config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
794
+ kwargs = {
795
+ "finetuned_from": model_args.model_name_or_path,
796
+ "tasks": "speech-recognition",
797
+ "tags": ["automatic-speech-recognition", data_args.dataset_name],
798
+ "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
799
+ "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
800
+ }
801
+ if "common_voice" in data_args.dataset_name:
802
+ kwargs["language"] = config_name
803
+
804
+ if training_args.push_to_hub:
805
+ trainer.push_to_hub(**kwargs)
806
+ else:
807
+ trainer.create_model_card(**kwargs)
808
+
809
+ return results
810
+
811
+
812
+ if __name__ == "__main__":
813
+ main()
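
Note: the character-vocabulary construction in `create_vocabulary_from_data` above boils down to a few lines of plain Python. The following minimal sketch (toy transcripts and token names chosen purely for illustration) walks through the same steps: collect the unique characters, substitute the word-delimiter token for the space, then append the UNK and PAD tokens.

alphabet = set()
for text in ["hallo verden", "norsk tale"]:        # toy stand-ins for batch["target_text"]
    alphabet.update(text)

vocab_dict = {char: idx for idx, char in enumerate(sorted(alphabet))}
vocab_dict["|"] = vocab_dict[" "]                  # word delimiter token replaces the plain space
del vocab_dict[" "]
vocab_dict["[UNK]"] = len(vocab_dict)              # unknown token
vocab_dict["[PAD]"] = len(vocab_dict)              # padding token
print(vocab_dict)

The resulting dict is what the script dumps to vocab.json before building the tokenizer.
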
run_speech_recognition_whisper_pere.py → old/run_whisper_finetuning_old.py RENAMED
@@ -22,7 +22,7 @@ import re
22
  import sys
23
  import warnings
24
  from dataclasses import dataclass, field
25
- from typing import Any, Dict, List, Optional,Union
26
  import evaluate
27
 
28
  import numpy as np
@@ -47,9 +47,11 @@ from transformers.trainer_utils import get_last_checkpoint, is_main_process
47
  from transformers.utils import check_min_version
48
  from transformers.utils.versions import require_version
49
 
 
50
  def list_field(default=None, metadata=None):
51
  return field(default_factory=lambda: default, metadata=metadata)
52
 
 
53
  @dataclass
54
  class ModelArguments:
55
  """
@@ -57,21 +59,25 @@ class ModelArguments:
57
  """
58
 
59
  model_name_or_path: str = field(
60
- metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
 
61
  )
62
  language: str = field(
63
  metadata={"help": "Whisper specific language"}
64
  )
65
  task: str = field(
66
- metadata={"help": "Whisper specific task, i.e., 'transcribe' or 'translate'"}
 
67
  )
68
  tokenizer_name_or_path: Optional[str] = field(
69
  default=None,
70
- metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
 
71
  )
72
  cache_dir: Optional[str] = field(
73
  default=None,
74
- metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
 
75
  )
76
  freeze_feature_encoder: bool = field(
77
  default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
@@ -82,7 +88,8 @@ class ModelArguments:
82
  activation_dropout: float = field(
83
  default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
84
  )
85
- feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
 
86
  hidden_dropout: float = field(
87
  default=0.0,
88
  metadata={
@@ -91,7 +98,8 @@ class ModelArguments:
91
  )
92
  final_dropout: float = field(
93
  default=0.0,
94
- metadata={"help": "The dropout probability for the final projection layer."},
 
95
  )
96
  mask_time_prob: float = field(
97
  default=0.05,
@@ -116,7 +124,8 @@ class ModelArguments:
116
  default=10,
117
  metadata={"help": "Length of vector span to mask along the feature axis."},
118
  )
119
- layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
 
120
  ctc_loss_reduction: Optional[str] = field(
121
  default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
122
  )
@@ -136,7 +145,8 @@ class DataTrainingArguments:
136
  """
137
 
138
  dataset_name: str = field(
139
- metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
 
140
  )
141
  dataset_config_name: str = field(
142
  default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
@@ -155,11 +165,13 @@ class DataTrainingArguments:
155
  )
156
  audio_column_name: str = field(
157
  default="audio",
158
- metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
 
159
  )
160
  text_column_name: str = field(
161
  default="text",
162
- metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
 
163
  )
164
  overwrite_cache: bool = field(
165
  default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
@@ -188,7 +200,8 @@ class DataTrainingArguments:
188
  )
189
  eval_metrics: List[str] = list_field(
190
  default=["wer"],
191
- metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
 
192
  )
193
  max_duration_in_seconds: float = field(
194
  default=20.0,
@@ -229,7 +242,8 @@ class DataTrainingArguments:
229
  )
230
  predict_with_generate: bool = field(
231
  default=True,
232
- metadata={"help": "Output tokens in addition to loss and digits for calculating metrics"},
 
233
  )
234
  generation_max_length: int = field(
235
  default=225,
@@ -259,16 +273,21 @@ class DataCollatorSpeechSeq2SeqWithPadding:
259
  def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
260
  # split inputs and labels since they have to be of different lengths and need different padding methods
261
  # first treat the audio inputs by simply returning torch tensors
262
- input_features = [{"input_features": feature["input_features"]} for feature in features]
263
- batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
 
 
264
 
265
  # get the tokenized label sequences
266
- label_features = [{"input_ids": feature["labels"]} for feature in features]
 
267
  # pad the labels to max length
268
- labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
 
269
 
270
  # replace padding with -100 to ignore loss correctly
271
- labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
 
272
 
273
  # if bos token is appended in previous tokenization step,
274
  # cut bos token here as it's append later anyways
@@ -279,15 +298,16 @@ class DataCollatorSpeechSeq2SeqWithPadding:
279
  return batch
280
 
281
 
282
- def main():
283
  # See all possible arguments in src/transformers/training_args.py
284
  # or by passing the --help flag to this script.
285
  # We now keep distinct sets of args, for a cleaner separation of concerns.
286
- parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
 
287
  model_args, data_args, training_args = parser.parse_args_into_dataclasses()
288
-
289
 
290
  # Metrics
 
291
  def compute_metrics(pred):
292
  pred_ids = pred.predictions
293
  label_ids = pred.label_ids
@@ -315,16 +335,16 @@ def main():
315
  # encode target text to label ids
316
  batch["labels"] = tokenizer(batch["sentence"]).input_ids
317
  return batch
318
-
319
  def print_training_arguments(model_args, data_args, training_args):
320
  print("Starting with the following parameters:")
321
  print("\n* Model arguments:")
322
- pprint(vars(model_args),indent=2)
323
  print("\n* Data arguments")
324
- pprint(vars(data_args),indent=2)
325
  print("\n* Training arguments")
326
- pprint(vars(training_args),indent=2)
327
-
328
  # TODO - Might use this function later
329
  # def make_dataset(training_args, data_args):
330
  # seed = training_args.seed or 42
@@ -338,28 +358,26 @@ def main():
338
  # Load dataset
339
  speech_data = DatasetDict()
340
  speech_data["train"] = load_dataset(
341
- data_args.dataset_name, data_args.dataset_config_name, split="train", use_auth_token=True)
342
  speech_data["test"] = load_dataset(
343
- data_args.dataset_name, data_args.dataset_config_name, split="test", use_auth_token=True)
344
 
345
  # TODO - Implement streaming and include this
346
  # speech_data = make_dataset(training_args, data_args)
347
 
348
-
349
- # Adapt dataset - Change column names and delete extra data
350
- # Map columns
351
  if "audio" not in speech_data.column_names["train"]:
352
- speech_data = speech_data.rename_column(source, "audio")
 
353
 
354
  if "sentence" not in speech_data.column_names["train"]:
355
- speech_data = speech_data.rename_column(target, "sentence")
356
-
357
- # Remove not needed columns
358
- remove_list = [i for i in speech_data.column_names["train"]
359
- if i not in ["audio", "sentence"]]
360
-
361
- speech_data = speech_data.remove_columns(remove_list)
362
 
 
 
 
363
 
364
  # Initialise
365
  feature_extractor = WhisperFeatureExtractor.from_pretrained(
@@ -375,12 +393,10 @@ def main():
375
  speech_data = speech_data.map(
376
  prepare_dataset, remove_columns=speech_data.column_names["train"], num_proc=1)
377
 
378
-
379
-
380
  # Metrics
381
  metric = evaluate.load("wer")
382
 
383
- #Detecting last checkpoint.
384
  last_checkpoint = None
385
  if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
386
  last_checkpoint = get_last_checkpoint(training_args.output_dir)
@@ -393,8 +409,8 @@ def main():
393
  logger.info(
394
  f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
395
  "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
396
-
397
- )
398
 
399
  # Training
400
  if training_args.do_train:
@@ -403,22 +419,21 @@ def main():
403
  if last_checkpoint is not None:
404
  checkpoint = last_checkpoint
405
  elif os.path.isdir(model_args.model_name_or_path):
406
- checkpoint = model_args.model_name_or_path
407
  else:
408
  checkpoint = None
409
-
410
  # We need to set use_cache=False here if we want to use gradient accumulation
411
  model = WhisperForConditionalGeneration.from_pretrained(
412
- "openai/whisper-small", use_cache=False)
413
 
414
  # Overriding generation arguments - no tokens are forced as decoder outputs (see [`forced_decoder_ids`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.forced_decoder_ids)), no tokens are suppressed during generation (see [`suppress_tokens`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.suppress_tokens)):
415
  model.config.forced_decoder_ids = None
416
  model.config.suppress_tokens = []
417
-
418
  # Set seed before initializing model.
419
  set_seed(training_args.seed)
420
 
421
-
422
  trainer = Seq2SeqTrainer(
423
  args=training_args,
424
  model=model,
@@ -428,7 +443,7 @@ def main():
428
  compute_metrics=compute_metrics,
429
  tokenizer=processor.feature_extractor,
430
  )
431
-
432
  train_result = trainer.train(resume_from_checkpoint=checkpoint)
433
  trainer.save_model()
434
 
@@ -436,23 +451,25 @@ def main():
436
  trainer.log_metrics("train", metrics)
437
  trainer.save_metrics("train", metrics)
438
  trainer.save_state()
439
-
440
  if training_args.push_to_hub:
441
  trainer.push_to_hub(**kwargs)
442
  else:
443
  trainer.create_model_card(**kwargs)
444
-
445
  # TODO - Look closer into the evaluation and the model card writing.
446
-
447
  # Evaluation
448
  results = {}
449
  if training_args.do_eval:
450
  logger.info("*** Evaluate ***")
451
  metrics = trainer.evaluate()
452
  max_eval_samples = (
453
- data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
 
454
  )
455
- metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
 
456
 
457
  trainer.log_metrics("eval", metrics)
458
  trainer.save_metrics("eval", metrics)
@@ -467,15 +484,16 @@ def main():
467
  "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
468
  "language": model_args.language,
469
  }
470
-
471
  return results
472
 
473
 
474
- #XLA hook
475
  def _mp_fn(index):
476
  # For xla_spawn (TPUs)
477
  print("The XLA is initiated")
478
  main()
479
 
 
480
  if __name__ == "__main__":
481
  main()
 
22
  import sys
23
  import warnings
24
  from dataclasses import dataclass, field
25
+ from typing import Any, Dict, List, Optional, Union
26
  import evaluate
27
 
28
  import numpy as np
 
47
  from transformers.utils import check_min_version
48
  from transformers.utils.versions import require_version
49
 
50
+
51
  def list_field(default=None, metadata=None):
52
  return field(default_factory=lambda: default, metadata=metadata)
53
 
54
+
55
  @dataclass
56
  class ModelArguments:
57
  """
 
59
  """
60
 
61
  model_name_or_path: str = field(
62
+ metadata={
63
+ "help": "Path to pretrained model or model identifier from huggingface.co/models"}
64
  )
65
  language: str = field(
66
  metadata={"help": "Whisper specific language"}
67
  )
68
  task: str = field(
69
+ metadata={
70
+ "help": "Whisper specific task, i.e., 'transcribe' or 'translate'"}
71
  )
72
  tokenizer_name_or_path: Optional[str] = field(
73
  default=None,
74
+ metadata={
75
+ "help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
76
  )
77
  cache_dir: Optional[str] = field(
78
  default=None,
79
+ metadata={
80
+ "help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
81
  )
82
  freeze_feature_encoder: bool = field(
83
  default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
 
88
  activation_dropout: float = field(
89
  default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
90
  )
91
+ feat_proj_dropout: float = field(default=0.0, metadata={
92
+ "help": "The dropout ratio for the projected features."})
93
  hidden_dropout: float = field(
94
  default=0.0,
95
  metadata={
 
98
  )
99
  final_dropout: float = field(
100
  default=0.0,
101
+ metadata={
102
+ "help": "The dropout probability for the final projection layer."},
103
  )
104
  mask_time_prob: float = field(
105
  default=0.05,
 
124
  default=10,
125
  metadata={"help": "Length of vector span to mask along the feature axis."},
126
  )
127
+ layerdrop: float = field(default=0.0, metadata={
128
+ "help": "The LayerDrop probability."})
129
  ctc_loss_reduction: Optional[str] = field(
130
  default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
131
  )
 
145
  """
146
 
147
  dataset_name: str = field(
148
+ metadata={
149
+ "help": "The configuration name of the dataset to use (via the datasets library)."}
150
  )
151
  dataset_config_name: str = field(
152
  default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
 
165
  )
166
  audio_column_name: str = field(
167
  default="audio",
168
+ metadata={
169
+ "help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
170
  )
171
  text_column_name: str = field(
172
  default="text",
173
+ metadata={
174
+ "help": "The name of the dataset column containing the text data. Defaults to 'text'"},
175
  )
176
  overwrite_cache: bool = field(
177
  default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
 
200
  )
201
  eval_metrics: List[str] = list_field(
202
  default=["wer"],
203
+ metadata={
204
+ "help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
205
  )
206
  max_duration_in_seconds: float = field(
207
  default=20.0,
 
242
  )
243
  predict_with_generate: bool = field(
244
  default=True,
245
+ metadata={
246
+ "help": "Output tokens in addition to loss and digits for calculating metrics"},
247
  )
248
  generation_max_length: int = field(
249
  default=225,
 
273
  def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
274
  # split inputs and labels since they have to be of different lengths and need different padding methods
275
  # first treat the audio inputs by simply returning torch tensors
276
+ input_features = [{"input_features": feature["input_features"]}
277
+ for feature in features]
278
+ batch = self.processor.feature_extractor.pad(
279
+ input_features, return_tensors="pt")
280
 
281
  # get the tokenized label sequences
282
+ label_features = [{"input_ids": feature["labels"]}
283
+ for feature in features]
284
  # pad the labels to max length
285
+ labels_batch = self.processor.tokenizer.pad(
286
+ label_features, return_tensors="pt")
287
 
288
  # replace padding with -100 to ignore loss correctly
289
+ labels = labels_batch["input_ids"].masked_fill(
290
+ labels_batch.attention_mask.ne(1), -100)
291
 
292
  # if bos token is appended in previous tokenization step,
293
  # cut bos token here as it's appended later anyways
 
298
  return batch
299
 
300
 
301
+ def main():
302
  # See all possible arguments in src/transformers/training_args.py
303
  # or by passing the --help flag to this script.
304
  # We now keep distinct sets of args, for a cleaner separation of concerns.
305
+ parser = HfArgumentParser(
306
+ (ModelArguments, DataTrainingArguments, TrainingArguments))
307
  model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 
308
 
309
  # Metrics
310
+
311
  def compute_metrics(pred):
312
  pred_ids = pred.predictions
313
  label_ids = pred.label_ids
 
335
  # encode target text to label ids
336
  batch["labels"] = tokenizer(batch["sentence"]).input_ids
337
  return batch
338
+
339
  def print_training_arguments(model_args, data_args, training_args):
340
  print("Starting with the following parameters:")
341
  print("\n* Model arguments:")
342
+ pprint(vars(model_args), indent=2)
343
  print("\n* Data arguments")
344
+ pprint(vars(data_args), indent=2)
345
  print("\n* Training arguments")
346
+ pprint(vars(training_args), indent=2)
347
+
348
  # TODO - Might use this function later
349
  # def make_dataset(training_args, data_args):
350
  # seed = training_args.seed or 42
 
358
  # Load dataset
359
  speech_data = DatasetDict()
360
  speech_data["train"] = load_dataset(
361
+ data_args.dataset_name, data_args.dataset_config_name, split="train", streaming=True, use_auth_token=True)
362
  speech_data["test"] = load_dataset(
363
+ data_args.dataset_name, data_args.dataset_config_name, split="test", streaming=True, use_auth_token=True)
364
 
365
  # TODO - Implement streaming and include this
366
  # speech_data = make_dataset(training_args, data_args)
367
 
368
+ breakpoint()
369
+
370
+ # Rename columns
371
  if "audio" not in speech_data.column_names["train"]:
372
+ speech_data = speech_data.rename_column(
373
+ data_args.audio_column_name, "audio")
374
 
375
  if "sentence" not in speech_data.column_names["train"]:
376
+ speech_data = speech_data.rename_column(data_args.text_column_name, "sentence")
 
 
 
 
 
 
377
 
378
+ # Remove not needed columns
379
+ speech_data = speech_data.remove_columns(
380
+ [i for i in speech_data.column_names["train"] if i not in ["audio", "sentence"]])
381
 
382
  # Initialise
383
  feature_extractor = WhisperFeatureExtractor.from_pretrained(
 
393
  speech_data = speech_data.map(
394
  prepare_dataset, remove_columns=speech_data.column_names["train"], num_proc=1)
395
 
 
 
396
  # Metrics
397
  metric = evaluate.load("wer")
398
 
399
+ # Detecting last checkpoint.
400
  last_checkpoint = None
401
  if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
402
  last_checkpoint = get_last_checkpoint(training_args.output_dir)
 
409
  logger.info(
410
  f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
411
  "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
412
+
413
+ )
414
 
415
  # Training
416
  if training_args.do_train:
 
419
  if last_checkpoint is not None:
420
  checkpoint = last_checkpoint
421
  elif os.path.isdir(model_args.model_name_or_path):
422
+ checkpoint = model_args.model_name_or_path
423
  else:
424
  checkpoint = None
425
+
426
  # We need to set use_cache=False here if we want to use gradient accumulation
427
  model = WhisperForConditionalGeneration.from_pretrained(
428
+ "openai/whisper-small", use_cache=False)
429
 
430
  # Overriding generation arguments - no tokens are forced as decoder outputs (see [`forced_decoder_ids`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.forced_decoder_ids)), no tokens are suppressed during generation (see [`suppress_tokens`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.suppress_tokens)):
431
  model.config.forced_decoder_ids = None
432
  model.config.suppress_tokens = []
433
+
434
  # Set seed before initializing model.
435
  set_seed(training_args.seed)
436
 
 
437
  trainer = Seq2SeqTrainer(
438
  args=training_args,
439
  model=model,
 
443
  compute_metrics=compute_metrics,
444
  tokenizer=processor.feature_extractor,
445
  )
446
+
447
  train_result = trainer.train(resume_from_checkpoint=checkpoint)
448
  trainer.save_model()
449
 
 
451
  trainer.log_metrics("train", metrics)
452
  trainer.save_metrics("train", metrics)
453
  trainer.save_state()
454
+
455
  if training_args.push_to_hub:
456
  trainer.push_to_hub(**kwargs)
457
  else:
458
  trainer.create_model_card(**kwargs)
459
+
460
  # TODO - Look closer into the evaluation and the model card writing.
461
+
462
  # Evaluation
463
  results = {}
464
  if training_args.do_eval:
465
  logger.info("*** Evaluate ***")
466
  metrics = trainer.evaluate()
467
  max_eval_samples = (
468
+ data_args.max_eval_samples if data_args.max_eval_samples is not None else len(
469
+ vectorized_datasets["eval"])
470
  )
471
+ metrics["eval_samples"] = min(
472
+ max_eval_samples, len(vectorized_datasets["eval"]))
473
 
474
  trainer.log_metrics("eval", metrics)
475
  trainer.save_metrics("eval", metrics)
 
484
  "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
485
  "language": model_args.language,
486
  }
487
+
488
  return results
489
 
490
 
491
+ # XLA hook
492
  def _mp_fn(index):
493
  # For xla_spawn (TPUs)
494
  print("The XLA is initiated")
495
  main()
496
 
497
+
498
  if __name__ == "__main__":
499
  main()
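
For context on the `metric = evaluate.load("wer")` call above: `evaluate` returns the word error rate as a fraction, which is why `compute_metrics` scales it by 100. A minimal, self-contained sketch (assuming the `evaluate` and `jiwer` packages are installed; the example strings are invented):

import evaluate

wer_metric = evaluate.load("wer")  # word error rate, backed by jiwer
predictions = ["hallo verden", "dette er en test"]
references = ["hallo verden", "dette var en test"]
# one substitution over six reference words -> about 0.167, i.e. ~16.7 after scaling by 100
print(100 * wer_metric.compute(predictions=predictions, references=references))
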
run.sh CHANGED
@@ -1,31 +1,30 @@
1
 
2
- python run_speech_recognition_whisper_pere.py \
3
  --model_name_or_path="openai/whisper-small" \
4
  --output_dir="../whisper-testrun1" \
 
5
  --overwrite_output_dir=True \
6
  --language="Norwegian" \
7
  --task="transcribe" \
8
- --dataset_name="mozilla-foundation/common_voice_11_0" \
9
- --dataset_config="nn-NO" \
10
- --output_dir="./whisper-small-hi" \
11
  --do_train=True \
12
  --do_eval=True \
13
  --audio_column_name="audio" \
14
- --text_column_name="sentence" \
15
  --per_device_train_batch_size=16 \
16
  --per_device_train_batch_size=16 \
17
  --learning_rate=2e-5 \
18
  --warmup_steps=500 \
19
- --max_steps=5000 \
20
  --gradient_checkpointing=True \
21
  --gradient_accumulation_steps=1 \
22
- --group_by_length=True \
23
  --evaluation_strategy="steps" \
24
  --save_steps=1000 \
25
  --eval_steps=1000 \
26
- --logging_steps=25 \
27
  --fp16=True \
28
- --save_steps=1000 \
29
  --load_best_model_at_end=True \
30
  --metric_for_best_model="wer" \
31
  --greater_is_better=False \
 
1
 
2
+ python run_whisper_finetuning.py \
3
  --model_name_or_path="openai/whisper-small" \
4
  --output_dir="../whisper-testrun1" \
5
+ --repo_id="NbAiLab/whisper-testrun1" \
6
  --overwrite_output_dir=True \
7
  --language="Norwegian" \
8
  --task="transcribe" \
9
+ --dataset_name="NbAiLab/NPSC" \
10
+ --dataset_config_name="16K_mp3" \
 
11
  --do_train=True \
12
  --do_eval=True \
13
  --audio_column_name="audio" \
14
+ --text_column_name="normsentence_text" \
15
  --per_device_train_batch_size=16 \
16
  --per_device_train_batch_size=16 \
17
  --learning_rate=2e-5 \
18
  --warmup_steps=500 \
19
+ --max_steps=10000 \
20
  --gradient_checkpointing=True \
21
  --gradient_accumulation_steps=1 \
22
+ --group_by_length=False \
23
  --evaluation_strategy="steps" \
24
  --save_steps=1000 \
25
  --eval_steps=1000 \
26
+ --logging_steps=250 \
27
  --fp16=True \
 
28
  --load_best_model_at_end=True \
29
  --metric_for_best_model="wer" \
30
  --greater_is_better=False \
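
The flags above are consumed by `HfArgumentParser` in the training script, which splits them across the argument dataclasses. A rough, self-contained sketch of that mapping; `ToyArgs` is a stand-in for the real `ModelArguments`/`DataTrainingArguments` and is not part of the script:

from dataclasses import dataclass, field
from transformers import HfArgumentParser, TrainingArguments

@dataclass
class ToyArgs:
    # stand-in for the script's ModelArguments / DataTrainingArguments
    model_name_or_path: str = field(default="openai/whisper-small")
    language: str = field(default="Norwegian")

parser = HfArgumentParser((ToyArgs, TrainingArguments))
toy_args, training_args = parser.parse_args_into_dataclasses([
    "--model_name_or_path=openai/whisper-small",
    "--output_dir=../whisper-testrun1",
    "--language=Norwegian",
    "--per_device_train_batch_size=16",
])
print(toy_args.language, training_args.per_device_train_batch_size)
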
run_whisper_finetuning.py CHANGED
@@ -1,6 +1,5 @@
1
  #!/usr/bin/env python
2
  # coding=utf-8
3
- # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
4
  #
5
  # Licensed under the Apache License, Version 2.0 (the "License");
6
  # you may not use this file except in compliance with the License.
@@ -23,43 +22,31 @@ import re
23
  import sys
24
  import warnings
25
  from dataclasses import dataclass, field
26
- from typing import Dict, List, Optional, Union
 
27
 
28
- import datasets
29
  import numpy as np
30
  import torch
31
- from datasets import DatasetDict, load_dataset, load_metric, AUdio
 
 
 
32
 
33
- import transformers
34
  from transformers import (
35
- AutoConfig,
36
- AutoFeatureExtractor,
37
- AutoModelForCTC,
38
- AutoProcessor,
39
- AutoTokenizer,
40
  HfArgumentParser,
41
- Trainer,
42
  TrainingArguments,
43
- Wav2Vec2Processor,
44
  set_seed,
 
 
 
 
 
 
45
  )
46
  from transformers.trainer_utils import get_last_checkpoint, is_main_process
47
  from transformers.utils import check_min_version
48
  from transformers.utils.versions import require_version
49
- from transformers import WhisperFeatureExtractor
50
- from transformers import WhisperTokenizer
51
- from transformers import WhisperProcessor
52
- from transformers import WhisperForConditionalGeneration
53
-
54
- from transformers import Seq2SeqTrainingArguments
55
- from transformers import Seq2SeqTrainer
56
-
57
- import evaluate
58
-
59
- logger = logging.getLogger(__name__)
60
-
61
 
62
- # ** Check if this is needed...
63
 
64
  def list_field(default=None, metadata=None):
65
  return field(default_factory=lambda: default, metadata=metadata)
@@ -72,15 +59,25 @@ class ModelArguments:
72
  """
73
 
74
  model_name_or_path: str = field(
75
- metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
 
 
 
 
 
 
 
 
76
  )
77
  tokenizer_name_or_path: Optional[str] = field(
78
  default=None,
79
- metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
 
80
  )
81
  cache_dir: Optional[str] = field(
82
  default=None,
83
- metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
 
84
  )
85
  freeze_feature_encoder: bool = field(
86
  default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
@@ -91,7 +88,8 @@ class ModelArguments:
91
  activation_dropout: float = field(
92
  default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
93
  )
94
- feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
 
95
  hidden_dropout: float = field(
96
  default=0.0,
97
  metadata={
@@ -100,7 +98,8 @@ class ModelArguments:
100
  )
101
  final_dropout: float = field(
102
  default=0.0,
103
- metadata={"help": "The dropout probability for the final projection layer."},
 
104
  )
105
  mask_time_prob: float = field(
106
  default=0.05,
@@ -125,7 +124,8 @@ class ModelArguments:
125
  default=10,
126
  metadata={"help": "Length of vector span to mask along the feature axis."},
127
  )
128
- layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
 
129
  ctc_loss_reduction: Optional[str] = field(
130
  default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
131
  )
@@ -144,12 +144,13 @@ class DataTrainingArguments:
144
  the command line.
145
  """
146
 
147
- # dataset_name: str = field(
148
- # metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
149
- # )
150
- # dataset_config_name: str = field(
151
- # default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
152
- # )
 
153
  train_split_name: str = field(
154
  default="train",
155
  metadata={
@@ -164,11 +165,13 @@ class DataTrainingArguments:
164
  )
165
  audio_column_name: str = field(
166
  default="audio",
167
- metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
 
168
  )
169
  text_column_name: str = field(
170
  default="text",
171
- metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
 
172
  )
173
  overwrite_cache: bool = field(
174
  default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
@@ -197,7 +200,8 @@ class DataTrainingArguments:
197
  )
198
  eval_metrics: List[str] = list_field(
199
  default=["wer"],
200
- metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
 
201
  )
202
  max_duration_in_seconds: float = field(
203
  default=20.0,
@@ -236,6 +240,15 @@ class DataTrainingArguments:
236
  default="|",
237
  metadata={"help": "The word delimiter token for the tokenizer"},
238
  )
 
 
 
 
 
 
 
 
 
239
  phoneme_language: Optional[str] = field(
240
  default=None,
241
  metadata={
@@ -245,202 +258,133 @@ class DataTrainingArguments:
245
  " input audio to a sequence of phoneme sequences."
246
  },
247
  )
 
 
 
 
 
 
248
 
249
 
250
  @dataclass
251
- class DataCollatorCTCWithPadding:
252
- """
253
- Data collator that will dynamically pad the inputs received.
254
- Args:
255
- processor (:class:`~transformers.AutoProcessor`)
256
- The processor used for proccessing the data.
257
- padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
258
- Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
259
- among:
260
- * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
261
- sequence if provided).
262
- * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
263
- maximum acceptable input length for the model if that argument is not provided.
264
- * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
265
- different lengths).
266
- max_length (:obj:`int`, `optional`):
267
- Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
268
- max_length_labels (:obj:`int`, `optional`):
269
- Maximum length of the ``labels`` returned list and optionally padding length (see above).
270
- pad_to_multiple_of (:obj:`int`, `optional`):
271
- If set will pad the sequence to a multiple of the provided value.
272
- This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
273
- 7.5 (Volta).
274
- """
275
-
276
- processor: AutoProcessor
277
- padding: Union[bool, str] = "longest"
278
- pad_to_multiple_of: Optional[int] = None
279
- pad_to_multiple_of_labels: Optional[int] = None
280
 
281
  def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
282
- # split inputs and labels since they have to be of different lenghts and need
283
- # different padding methods
284
- input_features = [{"input_values": feature["input_values"]} for feature in features]
285
- label_features = [{"input_ids": feature["labels"]} for feature in features]
286
-
287
- batch = self.processor.pad(
288
- input_features,
289
- padding=self.padding,
290
- pad_to_multiple_of=self.pad_to_multiple_of,
291
- return_tensors="pt",
292
- )
293
-
294
- with self.processor.as_target_processor():
295
- labels_batch = self.processor.pad(
296
- label_features,
297
- padding=self.padding,
298
- pad_to_multiple_of=self.pad_to_multiple_of_labels,
299
- return_tensors="pt",
300
- )
301
 
302
  # replace padding with -100 to ignore loss correctly
303
- labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
 
304
 
305
- batch["labels"] = labels
 
 
 
306
 
 
307
  return batch
308
 
309
 
310
- def create_vocabulary_from_data(
311
- datasets: DatasetDict,
312
- word_delimiter_token: Optional[str] = None,
313
- unk_token: Optional[str] = None,
314
- pad_token: Optional[str] = None,
315
- ):
316
- # Given training and test labels create vocabulary
317
- alphabet = set()
318
-
319
- def extract_all_chars(batch):
320
- all_text = " ".join(batch["target_text"])
321
- alphabet.update(all_text)
322
-
323
- datasets.map(
324
- extract_all_chars,
325
- batched=True,
326
- batch_size=-1,
327
- keep_in_memory=True,
328
- remove_columns=datasets["train"].column_names,
329
- )
330
-
331
- # # take union of all unique characters in each dataset
332
- # vocab_set = functools.reduce(
333
- # lambda vocab_1, vocab_2: {"vocab": list(set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]))}, vocabs.values()
334
- # )["vocab"][0]
335
-
336
- vocab_dict = {v: k for k, v in enumerate(sorted(list(alphabet)))}
337
-
338
- # replace white space with delimiter token
339
- if word_delimiter_token is not None:
340
- vocab_dict[word_delimiter_token] = vocab_dict[" "]
341
- del vocab_dict[" "]
342
-
343
- # add unk and pad token
344
- if unk_token is not None:
345
- vocab_dict[unk_token] = len(vocab_dict)
346
-
347
- if pad_token is not None:
348
- vocab_dict[pad_token] = len(vocab_dict)
349
-
350
- return vocab_dict
351
-
352
-
353
- def make_dataset(seed=42):
354
- # Pre-processing dataset
355
- import re
356
-
357
- def map_nst(entry):
358
- text = entry["text"].lower()
359
- text = text.replace("(...Vær stille under dette opptaket...)", "")
360
- text = re.sub('[áàâ]', 'a', text)
361
- text = re.sub('[ä]', 'æ', text)
362
- text = re.sub('[éèëê]', 'e', text)
363
- text = re.sub('[íìïî]', 'i', text)
364
- text = re.sub('[óòöô]', 'o', text)
365
- text = re.sub('[ö]', 'ø', text)
366
- text = re.sub('[ç]', 'c', text)
367
- text = re.sub('[úùüû]', 'u', text)
368
- # text = re.sub('\\(?=(Punktum|Komma|Utropstegn|Spørsmålstegn))', ' ', text)
369
- text = re.sub('\s+', ' ', text)
370
- return {"text": text}
371
-
372
- def filter_nst(entry):
373
- if not ((len(entry["text"]) <= len(entry["audio"]["array"]) // 320) and (len(entry["text"].strip()) >= 3)):
374
- return False # Too short
375
- if re.match(entry["type"], "pIW|CA"):
376
- return False # Spelling out words
377
- return True
378
-
379
- def filter_npsc(entry):
380
- # False if there are digits in the text
381
- if not ((len(entry["text"]) <= len(entry["audio"]["array"]) // 320) and (len(entry["text"].strip()) >= 3)):
382
- return False # Too short
383
- if re.search("\d", entry["text"]):
384
- return False
385
- return True
386
-
387
- def map_npsc(entry):
388
- batch = {"text": entry["text"].lower()}
389
- batch["text"] = re.sub('[áàâ]', 'a', batch["text"])
390
- batch["text"] = re.sub('[ä]', 'æ', batch["text"])
391
- batch["text"] = re.sub('[éèëê]', 'e', batch["text"])
392
- batch["text"] = re.sub('[íìïî]', 'i', batch["text"])
393
- batch["text"] = re.sub('[óòöô]', 'o', batch["text"])
394
- batch["text"] = re.sub('[ö]', 'ø', batch["text"])
395
- batch["text"] = re.sub('[ç]', 'c', batch["text"])
396
- batch["text"] = re.sub('[úùüû]', 'u', batch["text"])
397
- batch["text"] = re.sub('\s', ' ', batch["text"])
398
- batch["text"] = re.sub('<ee>', 'eee', batch["text"])
399
- batch["text"] = re.sub('<qq>', 'qqq', batch["text"])
400
- batch["text"] = re.sub('<mm>', 'mmm', batch["text"])
401
- batch["text"] = re.sub('<inaudible>', 'xxx', batch["text"])
402
- # batch["text"] = re.sub('<inaudible>', '?', batch["text"])
403
- if "<" in batch["text"]:
404
- raise ValueError(batch["text"])
405
- return batch
406
 
407
- nst = datasets.load_dataset("NbAiLab/NST", "no-close")
408
- npsc = datasets.load_dataset("NbAiLab/NPSC", "16K_mp3")
409
- # TODO NST_hesitate
410
 
411
- split = len(npsc["train"]) / (len(npsc["train"]) + len(npsc["validation"])) # Use same train/val ratio as NPSC
412
- nst_train = nst["train"].train_test_split(train_size=split, seed=seed)
413
- nst["train"] = nst_train["train"]
414
- nst["validation"] = nst_train["test"]
415
 
416
- nst = nst.filter(filter_nst).map(map_nst).shuffle(seed=seed)
417
- npsc = npsc.filter(filter_npsc).map(map_npsc).shuffle(seed=seed)
 
418
 
419
- npsc_base = npsc.remove_columns([col for col in npsc["train"].column_names if col not in ["text", "audio"]])
420
- nst_base = nst.remove_columns([col for col in nst["train"].column_names if col not in ["text", "audio"]])
421
 
422
- combined = {}
423
- for split in "train", "validation", "test":
424
- probs = np.array([len(nst_base[split]), len(npsc_base[split])]) # Weight by number of examples
425
- probs = (probs / probs.sum()).tolist()
426
- comb = datasets.interleave_datasets([nst_base[split], npsc_base[split]], probabilities=probs, seed=seed)
427
- combined[split] = comb
428
 
429
- return datasets.DatasetDict(**combined)
 
 
 
430
 
 
 
 
431
 
432
- def main():
433
- # See all possible arguments in src/transformers/training_args.py
434
- # or by passing the --help flag to this script.
435
- # We now keep distinct sets of args, for a cleaner separation of concerns.
436
 
437
- parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
438
- if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
439
- # If we pass only one argument to the script and it's the path to a json file,
440
- # let's parse it to get our arguments.
441
- model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
442
- else:
443
- model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
 
445
  # Detecting last checkpoint.
446
  last_checkpoint = None
@@ -455,302 +399,10 @@ def main():
455
  logger.info(
456
  f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
457
  "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
458
- )
459
-
460
- # Setup logging
461
- logging.basicConfig(
462
- format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
463
- datefmt="%m/%d/%Y %H:%M:%S",
464
- handlers=[logging.StreamHandler(sys.stdout)],
465
- )
466
- logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
467
-
468
- # Log on each process the small summary:
469
- logger.warning(
470
- f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
471
- f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
472
- )
473
- # Set the verbosity to info of the Transformers logger (on main process only):
474
- if is_main_process(training_args.local_rank):
475
- transformers.utils.logging.set_verbosity_info()
476
- logger.info("Training/evaluation parameters %s", training_args)
477
 
478
- # Set seed before initializing model.
479
- set_seed(training_args.seed)
480
-
481
- # 1. First, let's load the dataset
482
- raw_datasets = make_dataset(seed=training_args.seed)
483
-
484
- if training_args.do_train:
485
- if data_args.audio_column_name not in raw_datasets["train"].column_names:
486
- raise ValueError(
487
- f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
488
- "Make sure to set `--audio_column_name` to the correct audio column - one of "
489
- f"{', '.join(raw_datasets['train'].column_names)}."
490
- )
491
-
492
- if data_args.text_column_name not in raw_datasets["train"].column_names:
493
- raise ValueError(
494
- f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
495
- "Make sure to set `--text_column_name` to the correct text column - one of "
496
- f"{', '.join(raw_datasets['train'].column_names)}."
497
  )
498
 
499
- if data_args.max_train_samples is not None:
500
- raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
501
-
502
- if training_args.do_eval:
503
- if data_args.max_eval_samples is not None:
504
- raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
505
-
506
- # 2. We remove some special characters from the datasets
507
- # that make training complicated and do not help in transcribing the speech
508
- # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
509
- # that could be easily picked up by the model
510
- # chars_to_ignore_regex = (
511
- # f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
512
- # )
513
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/]'
514
-
515
- text_column_name = data_args.text_column_name
516
-
517
- def remove_special_characters(batch):
518
- if chars_to_ignore_regex is not None:
519
- batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
520
- else:
521
- batch["target_text"] = batch[text_column_name].lower() + " "
522
- return batch
523
-
524
- with training_args.main_process_first(desc="dataset map special characters removal"):
525
- raw_datasets = raw_datasets.map(
526
- remove_special_characters,
527
- remove_columns=[text_column_name],
528
- desc="remove special characters from datasets",
529
- )
530
-
531
- # save special tokens for tokenizer
532
- word_delimiter_token = data_args.word_delimiter_token
533
- unk_token = data_args.unk_token
534
- pad_token = data_args.pad_token
535
-
536
- # 3. Next, let's load the config as we might need it to create
537
- # the tokenizer
538
- # load config
539
- config = AutoConfig.from_pretrained(
540
- model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
541
- )
542
-
543
- # 4. Next, if no tokenizer file is defined,
544
- # we create the vocabulary of the model by extracting all unique characters from
545
- # the training and evaluation datasets
546
- # We need to make sure that only first rank saves vocabulary
547
- # make sure all processes wait until vocab is created
548
- tokenizer_name_or_path = model_args.tokenizer_name_or_path
549
- tokenizer_kwargs = {}
550
- if tokenizer_name_or_path is None:
551
- # save vocab in training output dir
552
- tokenizer_name_or_path = training_args.output_dir
553
-
554
- vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
555
-
556
- with training_args.main_process_first():
557
- if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
558
- os.remove(vocab_file)
559
-
560
- with training_args.main_process_first(desc="dataset map vocabulary creation"):
561
- if not os.path.isfile(vocab_file):
562
- os.makedirs(tokenizer_name_or_path, exist_ok=True)
563
- vocab_dict = create_vocabulary_from_data(
564
- raw_datasets,
565
- word_delimiter_token=word_delimiter_token,
566
- unk_token=unk_token,
567
- pad_token=pad_token,
568
- )
569
-
570
- # save vocab dict to be loaded into tokenizer
571
- with open(vocab_file, "w") as file:
572
- json.dump(vocab_dict, file)
573
-
574
- # if tokenizer has just been created
575
- # it is defined by `tokenizer_class` if present in config else by `model_type`
576
- tokenizer_kwargs = {
577
- "config": config if config.tokenizer_class is not None else None,
578
- "tokenizer_type": config.model_type if config.tokenizer_class is None else None,
579
- "unk_token": unk_token,
580
- "pad_token": pad_token,
581
- "word_delimiter_token": word_delimiter_token,
582
- }
583
-
584
- # 5. Now we can instantiate the feature extractor, tokenizer and model
585
- # Note for distributed training, the .from_pretrained methods guarantee that only
586
- # one local process can concurrently download model & vocab.
587
-
588
- # load feature_extractor and tokenizer
589
- tokenizer = AutoTokenizer.from_pretrained(
590
- tokenizer_name_or_path,
591
- use_auth_token=data_args.use_auth_token,
592
- **tokenizer_kwargs,
593
- )
594
- feature_extractor = AutoFeatureExtractor.from_pretrained(
595
- model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
596
- )
597
-
598
- # adapt config
599
- config.update(
600
- {
601
- "feat_proj_dropout": model_args.feat_proj_dropout,
602
- "attention_dropout": model_args.attention_dropout,
603
- "hidden_dropout": model_args.hidden_dropout,
604
- "final_dropout": model_args.final_dropout,
605
- "mask_time_prob": model_args.mask_time_prob,
606
- "mask_time_length": model_args.mask_time_length,
607
- "mask_feature_prob": model_args.mask_feature_prob,
608
- "mask_feature_length": model_args.mask_feature_length,
609
- "gradient_checkpointing": training_args.gradient_checkpointing,
610
- "layerdrop": model_args.layerdrop,
611
- "ctc_loss_reduction": model_args.ctc_loss_reduction,
612
- "ctc_zero_infinity": model_args.ctc_zero_infinity,
613
- "pad_token_id": tokenizer.pad_token_id,
614
- "vocab_size": len(tokenizer),
615
- "activation_dropout": model_args.activation_dropout,
616
- }
617
- )
618
-
619
- # create model
620
- model = AutoModelForCTC.from_pretrained(
621
- model_args.model_name_or_path,
622
- cache_dir=model_args.cache_dir,
623
- config=config,
624
- use_auth_token=data_args.use_auth_token,
625
- )
626
-
627
- # freeze encoder
628
- if model_args.freeze_feature_encoder:
629
- model.freeze_feature_encoder()
630
-
631
- # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
632
- # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
633
- # so that we just need to set the correct target sampling rate and normalize the input
634
- # via the `feature_extractor`
635
-
636
- # make sure that dataset decodes audio with correct sampling rate
637
- dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
638
- if dataset_sampling_rate != feature_extractor.sampling_rate:
639
- raw_datasets = raw_datasets.cast_column(
640
- data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
641
- )
642
-
643
- # derive max & min input length for sample rate & max duration
644
- max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
645
- min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
646
- audio_column_name = data_args.audio_column_name
647
- num_workers = data_args.preprocessing_num_workers
648
-
649
- # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
650
- phoneme_language = data_args.phoneme_language
651
-
652
- # Preprocessing the datasets.
653
- # We need to read the audio files as arrays and tokenize the targets.
654
- def prepare_dataset(batch):
655
- # load audio
656
- sample = batch[audio_column_name]
657
-
658
- inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
659
- batch["input_values"] = inputs.input_values[0]
660
- batch["input_length"] = len(batch["input_values"])
661
-
662
- # encode targets
663
- additional_kwargs = {}
664
- if phoneme_language is not None:
665
- additional_kwargs["phonemizer_lang"] = phoneme_language
666
-
667
- batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
668
- return batch
669
-
670
- with training_args.main_process_first(desc="dataset map preprocessing"):
671
- vectorized_datasets = raw_datasets.map(
672
- prepare_dataset,
673
- remove_columns=next(iter(raw_datasets.values())).column_names,
674
- num_proc=num_workers,
675
- desc="preprocess datasets",
676
- )
677
-
678
- def is_audio_in_length_range(length):
679
- return length > min_input_length and length < max_input_length
680
-
681
- # filter data that is shorter than min_input_length
682
- vectorized_datasets = vectorized_datasets.filter(
683
- is_audio_in_length_range,
684
- num_proc=num_workers,
685
- input_columns=["input_length"],
686
- )
687
-
688
- # 7. Next, we can prepare the training.
689
- # Let's use word error rate (WER) as our evaluation metric,
690
- # instantiate a data collator and the trainer
691
-
692
- # Define evaluation metrics during training, *i.e.* word error rate, character error rate
693
- eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}
694
-
695
- # for large datasets it is advised to run the preprocessing on a
696
- # single machine first with ``args.preprocessing_only`` since there will mostly likely
697
- # be a timeout when running the script in distributed mode.
698
- # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
699
- # cached dataset
700
- if data_args.preprocessing_only:
701
- logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
702
- return
703
-
704
- def compute_metrics(pred):
705
- pred_logits = pred.predictions
706
- pred_ids = np.argmax(pred_logits, axis=-1)
707
-
708
- pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
709
-
710
- pred_str = tokenizer.batch_decode(pred_ids)
711
- # we do not want to group tokens when computing the metrics
712
- label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
713
-
714
- metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
715
-
716
- return metrics
717
-
718
- # Now save everything to be able to create a single processor later
719
- if is_main_process(training_args.local_rank):
720
- # save feature extractor, tokenizer and config
721
- feature_extractor.save_pretrained(training_args.output_dir)
722
- tokenizer.save_pretrained(training_args.output_dir)
723
- config.save_pretrained(training_args.output_dir)
724
-
725
- try:
726
- processor = AutoProcessor.from_pretrained(training_args.output_dir)
727
- except (OSError, KeyError):
728
- warnings.warn(
729
- "Loading a processor from a feature extractor config that does not"
730
- " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
731
- " attribute to your `preprocessor_config.json` file to suppress this warning: "
732
- " `'processor_class': 'Wav2Vec2Processor'`",
733
- FutureWarning,
734
- )
735
- processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)
736
-
737
- # Instantiate custom data collator
738
- data_collator = DataCollatorCTCWithPadding(processor=processor)
739
-
740
- # Initialize Trainer
741
- trainer = Trainer(
742
- model=model,
743
- data_collator=data_collator,
744
- args=training_args,
745
- compute_metrics=compute_metrics,
746
- train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
747
- eval_dataset=vectorized_datasets["validation"] if training_args.do_eval else None,
748
- tokenizer=feature_extractor,
749
- )
750
-
751
- # 8. Finally, we can start training
752
-
753
- # Training
754
  if training_args.do_train:
755
 
756
  # use last checkpoint if exist
@@ -761,30 +413,53 @@ def main():
761
  else:
762
  checkpoint = None
763
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
764
  train_result = trainer.train(resume_from_checkpoint=checkpoint)
765
  trainer.save_model()
766
 
767
  metrics = train_result.metrics
768
- max_train_samples = (
769
- data_args.max_train_samples
770
- if data_args.max_train_samples is not None
771
- else len(vectorized_datasets["train"])
772
- )
773
- metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))
774
-
775
  trainer.log_metrics("train", metrics)
776
  trainer.save_metrics("train", metrics)
777
  trainer.save_state()
778
 
 
 
 
 
 
 
 
779
  # Evaluation
780
  results = {}
781
  if training_args.do_eval:
782
  logger.info("*** Evaluate ***")
783
  metrics = trainer.evaluate()
784
  max_eval_samples = (
785
- data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
 
786
  )
787
- metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
 
788
 
789
  trainer.log_metrics("eval", metrics)
790
  trainer.save_metrics("eval", metrics)
@@ -793,21 +468,22 @@ def main():
793
  config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
794
  kwargs = {
795
  "finetuned_from": model_args.model_name_or_path,
796
- "tasks": "speech-recognition",
797
- "tags": ["automatic-speech-recognition", data_args.dataset_name],
798
  "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
799
  "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
 
800
  }
801
- if "common_voice" in data_args.dataset_name:
802
- kwargs["language"] = config_name
803
-
804
- if training_args.push_to_hub:
805
- trainer.push_to_hub(**kwargs)
806
- else:
807
- trainer.create_model_card(**kwargs)
808
 
809
  return results
810
 
811
 
 
 
 
 
 
 
 
812
  if __name__ == "__main__":
813
- main()
 
1
  #!/usr/bin/env python
2
  # coding=utf-8
 
3
  #
4
  # Licensed under the Apache License, Version 2.0 (the "License");
5
  # you may not use this file except in compliance with the License.
 
22
  import sys
23
  import warnings
24
  from dataclasses import dataclass, field
25
+ from typing import Any, Dict, List, Optional, Union
26
+ import evaluate
27
 
 
28
  import numpy as np
29
  import torch
30
+ from pprint import pprint
32
+ from datasets import DatasetDict, load_dataset
33
+ from datasets import Audio
34
 
 
35
  from transformers import (
 
 
 
 
 
36
  HfArgumentParser,
 
37
  TrainingArguments,
 
38
  set_seed,
39
+ WhisperFeatureExtractor,
40
+ WhisperTokenizer,
41
+ WhisperForConditionalGeneration,
42
+ WhisperProcessor,
43
+ Seq2SeqTrainer,
44
+ Seq2SeqTrainingArguments,
45
  )
46
  from transformers.trainer_utils import get_last_checkpoint, is_main_process
47
  from transformers.utils import check_min_version
48
  from transformers.utils.versions import require_version
 
 
 
 
 
 
 
 
 
 
 
 
49
 
 
50
 
51
  def list_field(default=None, metadata=None):
52
  return field(default_factory=lambda: default, metadata=metadata)
 
59
  """
60
 
61
  model_name_or_path: str = field(
62
+ metadata={
63
+ "help": "Path to pretrained model or model identifier from huggingface.co/models"}
64
+ )
65
+ language: str = field(
66
+ metadata={"help": "Whisper specific language"}
67
+ )
68
+ task: str = field(
69
+ metadata={
70
+ "help": "Whisper specific task, i.e., 'transcribe' or 'translate'"}
71
  )
72
  tokenizer_name_or_path: Optional[str] = field(
73
  default=None,
74
+ metadata={
75
+ "help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
76
  )
77
  cache_dir: Optional[str] = field(
78
  default=None,
79
+ metadata={
80
+ "help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
81
  )
82
  freeze_feature_encoder: bool = field(
83
  default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
 
88
  activation_dropout: float = field(
89
  default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
90
  )
91
+ feat_proj_dropout: float = field(default=0.0, metadata={
92
+ "help": "The dropout ratio for the projected features."})
93
  hidden_dropout: float = field(
94
  default=0.0,
95
  metadata={
 
98
  )
99
  final_dropout: float = field(
100
  default=0.0,
101
+ metadata={
102
+ "help": "The dropout probability for the final projection layer."},
103
  )
104
  mask_time_prob: float = field(
105
  default=0.05,
 
124
  default=10,
125
  metadata={"help": "Length of vector span to mask along the feature axis."},
126
  )
127
+ layerdrop: float = field(default=0.0, metadata={
128
+ "help": "The LayerDrop probability."})
129
  ctc_loss_reduction: Optional[str] = field(
130
  default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
131
  )
 
144
  the command line.
145
  """
146
 
147
+ dataset_name: str = field(
148
+ metadata={
149
+ "help": "The configuration name of the dataset to use (via the datasets library)."}
150
+ )
151
+ dataset_config_name: str = field(
152
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
153
+ )
154
  train_split_name: str = field(
155
  default="train",
156
  metadata={
 
165
  )
166
  audio_column_name: str = field(
167
  default="audio",
168
+ metadata={
169
+ "help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
170
  )
171
  text_column_name: str = field(
172
  default="text",
173
+ metadata={
174
+ "help": "The name of the dataset column containing the text data. Defaults to 'text'"},
175
  )
176
  overwrite_cache: bool = field(
177
  default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
 
200
  )
201
  eval_metrics: List[str] = list_field(
202
  default=["wer"],
203
+ metadata={
204
+ "help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
205
  )
206
  max_duration_in_seconds: float = field(
207
  default=20.0,
 
240
  default="|",
241
  metadata={"help": "The word delimiter token for the tokenizer"},
242
  )
243
+ predict_with_generate: bool = field(
244
+ default=True,
245
+ metadata={
246
+ "help": "Output tokens in addition to loss and digits for calculating metrics"},
247
+ )
248
+ generation_max_length: int = field(
249
+ default=225,
250
+ metadata={"help": "Maximum number of tokens generated"},
251
+ )
252
  phoneme_language: Optional[str] = field(
253
  default=None,
254
  metadata={
 
258
  " input audio to a sequence of phoneme sequences."
259
  },
260
  )
261
+ print_training_arguments: bool = field(
262
+ default=True,
263
+ metadata={
264
+ "help": "Prints the training arguments. For debugging"
265
+ },
266
+ )
267
 
268
 
269
  @dataclass
270
+ class DataCollatorSpeechSeq2SeqWithPadding:
271
+ processor: Any
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
  def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
274
+ # split inputs and labels since they have to be of different lengths and need different padding methods
275
+ # first treat the audio inputs by simply returning torch tensors
276
+ input_features = [{"input_features": feature["input_features"]}
277
+ for feature in features]
278
+ batch = self.processor.feature_extractor.pad(
279
+ input_features, return_tensors="pt")
280
+
281
+ # get the tokenized label sequences
282
+ label_features = [{"input_ids": feature["labels"]}
283
+ for feature in features]
284
+ # pad the labels to max length
285
+ labels_batch = self.processor.tokenizer.pad(
286
+ label_features, return_tensors="pt")
 
 
 
 
 
 
287
 
288
  # replace padding with -100 to ignore loss correctly
289
+ labels = labels_batch["input_ids"].masked_fill(
290
+ labels_batch.attention_mask.ne(1), -100)
291
 
292
+ # if bos token is appended in previous tokenization step,
293
+ # cut bos token here as it's appended later anyway
294
+ if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
295
+ labels = labels[:, 1:]
296
 
297
+ batch["labels"] = labels
298
  return batch
299
 
300
 
301
+ def main():
302
+ # See all possible arguments in src/transformers/training_args.py
303
+ # or by passing the --help flag to this script.
304
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
305
+ parser = HfArgumentParser(
306
+ (ModelArguments, DataTrainingArguments, TrainingArguments))
307
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
308
+
309
+ # Metrics
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
 
311
+ def compute_metrics(pred):
312
+ pred_ids = pred.predictions
313
+ label_ids = pred.label_ids
314
 
315
+ # replace -100 with the pad_token_id
316
+ label_ids[label_ids == -100] = tokenizer.pad_token_id
 
 
317
 
318
+ # we do not want to group tokens when computing the metrics
319
+ pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
320
+ label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
321
 
322
+ wer = 100 * metric.compute(predictions=pred_str, references=label_str)
 
323
 
324
+ return {"wer": wer}
 
 
 
 
 
325
 
326
+ # Prepare dataset
327
+ def prepare_dataset(batch):
328
+ # load and resample audio data from 48 to 16kHz
329
+ audio = batch["audio"]
330
 
331
+ # compute log-Mel input features from input audio array
332
+ batch["input_features"] = feature_extractor(
333
+ audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
334
 
335
+ # encode target text to label ids
336
+ batch["labels"] = tokenizer(batch["sentence"]).input_ids
337
+ return batch
 
338
 
339
+ def print_training_arguments(model_args, data_args, training_args):
340
+ print("Starting with the following parameters:")
341
+ print("\n* Model arguments:")
342
+ pprint(vars(model_args), indent=2)
343
+ print("\n* Data arguments")
344
+ pprint(vars(data_args), indent=2)
345
+ print("\n* Training arguments")
346
+ pprint(vars(training_args), indent=2)
347
+
348
+ # Print training arguments
349
+ if data_args.print_training_arguments:
350
+ print_training_arguments(model_args, data_args, training_args)
351
+
352
+ # Load dataset
353
+ train_dataset = load_dataset(data_args.dataset_name, data_args.dataset_config_name, split="train", streaming=True, use_auth_token=True)
354
+ eval_dataset = load_dataset(data_args.dataset_name, data_args.dataset_config_name, split="validation", streaming=True, use_auth_token=True)
355
+
356
+
357
+ # Rename columns
358
+ if data_args.audio_column_name != "audio":
359
+ train_dataset = train_dataset.rename_column(data_args.audio_column_name, "audio")
360
+ eval_dataset = eval_dataset.rename_column(data_args.audio_column_name, "audio")
361
+
362
+ if data_args.text_column_name != "sentence":
363
+ train_dataset = train_dataset.rename_column(data_args.text_column_name, "sentence")
364
+ eval_dataset = eval_dataset.rename_column(data_args.text_column_name, "sentence")
365
+
366
+
367
+ # Initialise
368
+ feature_extractor = WhisperFeatureExtractor.from_pretrained(
369
+ model_args.model_name_or_path)
370
+ tokenizer = WhisperTokenizer.from_pretrained(
371
+ model_args.model_name_or_path, language=model_args.language, task=model_args.task)
372
+ processor = WhisperProcessor.from_pretrained(
373
+ model_args.model_name_or_path, language=model_args.language, task=model_args.task)
374
+ data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
375
+
376
+
377
+ # Prepare data
378
+ train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
379
+ eval_dataset = eval_dataset.cast_column("audio", Audio(sampling_rate=16000))
380
+
381
+ # TODO: Not able to implement this in streaming mode. Cannot find a way to list the columns. But is it necessary?
382
+ # train_data = train_data.map(prepare_dataset, remove_columns=train_data.column_names, num_proc=1)
383
+
384
+ train_dataset = train_dataset.map(prepare_dataset)
385
+
386
+ # Metrics
387
+ metric = evaluate.load("wer")
388
 
389
  # Detecting last checkpoint.
390
  last_checkpoint = None
 
399
  logger.info(
400
  f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
401
  "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  )
404
 
405
+ # Training
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  if training_args.do_train:
407
 
408
  # use last checkpoint if exist
 
413
  else:
414
  checkpoint = None
415
 
416
+ # We need to set use_cache=False here if we want to use gradient checkpointing
417
+ model = WhisperForConditionalGeneration.from_pretrained(
418
+ "openai/whisper-small", use_cache=False)
419
+
420
+ # Overriding generation arguments - no tokens are forced as decoder outputs (see [`forced_decoder_ids`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.forced_decoder_ids)), no tokens are suppressed during generation (see [`suppress_tokens`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.suppress_tokens)):
421
+ model.config.forced_decoder_ids = None
422
+ model.config.suppress_tokens = []
423
+
424
+ # Set seed before initializing model.
425
+ set_seed(training_args.seed)
426
+
427
+ trainer = Seq2SeqTrainer(
428
+ args=training_args,
429
+ model=model,
430
+ train_dataset=train_dataset.with_format("torch"),
431
+ eval_dataset=eval_dataset.with_format("torch"),
432
+ data_collator=data_collator,
433
+ compute_metrics=compute_metrics,
434
+ tokenizer=processor.feature_extractor,
435
+ )
436
+
437
  train_result = trainer.train(resume_from_checkpoint=checkpoint)
438
  trainer.save_model()
439
 
440
  metrics = train_result.metrics
 
 
 
 
 
 
 
441
  trainer.log_metrics("train", metrics)
442
  trainer.save_metrics("train", metrics)
443
  trainer.save_state()
444
 
445
+ if training_args.push_to_hub:
446
+ trainer.push_to_hub(**kwargs)
447
+ else:
448
+ trainer.create_model_card(**kwargs)
449
+
450
+ # TODO - Look closer into the evaluation and the model card writing.
451
+
452
  # Evaluation
453
  results = {}
454
  if training_args.do_eval:
455
  logger.info("*** Evaluate ***")
456
  metrics = trainer.evaluate()
457
  max_eval_samples = (
458
+ data_args.max_eval_samples if data_args.max_eval_samples is not None else len(
459
+ vectorized_datasets["eval"])
460
  )
461
+ metrics["eval_samples"] = min(
462
+ max_eval_samples, len(vectorized_datasets["eval"]))
463
 
464
  trainer.log_metrics("eval", metrics)
465
  trainer.save_metrics("eval", metrics)
 
468
  config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
469
  kwargs = {
470
  "finetuned_from": model_args.model_name_or_path,
471
+ "tasks": "automatic-speech-recognition",
472
+ "tags": ["hf-asr-leaderboard", "automatic-speech-recognition", data_args.dataset_name],
473
  "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
474
  "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
475
+ "language": model_args.language,
476
  }
 
 
 
 
 
 
 
477
 
478
  return results
479
 
480
 
481
+ # XLA hook
482
+ def _mp_fn(index):
483
+ # For xla_spawn (TPUs)
484
+ print("The XLA is initiated")
485
+ main()
486
+
487
+
488
  if __name__ == "__main__":
489
+ main()
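
To make the `prepare_dataset` step above concrete: it maps each example to log-Mel input features plus tokenized label ids. A minimal sketch under stated assumptions (Hub access for `openai/whisper-small`; the one-second silent array stands in for real audio):

import numpy as np
from transformers import WhisperFeatureExtractor, WhisperTokenizer

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small", language="Norwegian", task="transcribe")

audio_array = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
input_features = feature_extractor(audio_array, sampling_rate=16000).input_features[0]
labels = tokenizer("hallo verden").input_ids

print(np.asarray(input_features).shape)  # (80, 3000): 80 log-Mel bins over a padded 30 s window
print(labels[:5])                        # special prefix tokens (language/task) come first
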
run_xla.sh CHANGED
@@ -1,4 +1,4 @@
1
 
2
- python xla_spawn.py --num_cores=4 run_whisper.py
3
 
4
 
 
1
 
2
+ python xla_spawn.py --num_cores=4 run_whisper_finetuning.py
3
 
4
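
run_xla.sh goes through the xla_spawn.py helper, which spawns one Python process per TPU core and calls the `_mp_fn(index)` hook defined at the bottom of run_whisper_finetuning.py. A rough sketch of the equivalent direct launch with torch_xla (assuming torch_xla is installed; the stand-in `_mp_fn` here only prints):

import torch_xla.distributed.xla_multiprocessing as xmp

def _mp_fn(index):
    # stand-in for the _mp_fn hook in run_whisper_finetuning.py, which calls main()
    print(f"XLA process {index} initiated")

if __name__ == "__main__":
    xmp.spawn(_mp_fn, args=(), nprocs=4)
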