bakrianoo committed on
Commit
e3ce0fb
1 Parent(s): c9f41c7

update experiments scripts

Files changed (1)
  1. train-experiments.py +835 -0
train-experiments.py ADDED
@@ -0,0 +1,835 @@
1
+ import pandas as pd
2
+ from tqdm.auto import tqdm
3
+ import random
4
+ from p_tqdm import p_map
5
+ from datasets import load_dataset, load_metric, Audio
6
+ from datasets import load_from_disk, concatenate_datasets
7
+ import torchaudio
8
+
9
+ import functools
10
+ import json
11
+ import logging
12
+ import os
13
+ import re
14
+ import sys
15
+ import warnings
16
+ from dataclasses import dataclass, field
17
+ from typing import Dict, List, Optional, Union
18
+ from datasets import concatenate_datasets, load_dataset
19
+
20
+ import datasets
21
+ import numpy as np
22
+ import torch
23
+ from datasets import DatasetDict, load_dataset, load_metric, Dataset
24
+
25
+ import bitsandbytes as bnb
26
+ import transformers
27
+ from transformers import (
28
+ AutoConfig,
29
+ AutoFeatureExtractor,
30
+ AutoModelForCTC,
31
+ AutoProcessor,
32
+ AutoTokenizer,
33
+ HfArgumentParser,
34
+ Trainer,
35
+ TrainingArguments,
36
+ Wav2Vec2Processor,
37
+ set_seed,
38
+ )
39
+ from transformers.trainer_pt_utils import get_parameter_names
40
+ from transformers.trainer_utils import get_last_checkpoint, is_main_process
41
+ from transformers.utils import check_min_version
42
+ from transformers.utils.versions import require_version
43
+
44
+ logger = logging.getLogger(__name__)
45
+
46
+ def list_field(default=None, metadata=None):
47
+ return field(default_factory=lambda: default, metadata=metadata)
48
+
49
+ @dataclass
50
+ class ModelArguments:
51
+ """
52
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
53
+ """
54
+
55
+ model_name_or_path: str = field(
56
+ metadata={"help": ""}, default="hf-test/xls-r-dummy"
57
+ )
58
+ tokenizer_name_or_path: Optional[str] = field(
59
+ default=None,
60
+ metadata={"help": "hf-test/xls-r-dummy"},
61
+ )
62
+ cache_dir: Optional[str] = field(
63
+ default=None,
64
+ metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
65
+ )
66
+ freeze_feature_encoder: bool = field(
67
+ default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
68
+ )
69
+ attention_dropout: float = field(
70
+ default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
71
+ )
72
+ activation_dropout: float = field(
73
+ default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
74
+ )
75
+ feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
76
+ hidden_dropout: float = field(
77
+ default=0.0,
78
+ metadata={
79
+ "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
80
+ },
81
+ )
82
+ final_dropout: float = field(
83
+ default=0.0,
84
+ metadata={"help": "The dropout probability for the final projection layer."},
85
+ )
86
+ mask_time_prob: float = field(
87
+ default=0.05,
88
+ metadata={
89
+ "help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
90
+ "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
91
+ "vectors will be masked along the time axis."
92
+ },
93
+ )
94
+ mask_time_length: int = field(
95
+ default=10,
96
+ metadata={"help": "Length of vector span to mask along the time axis."},
97
+ )
98
+ mask_feature_prob: float = field(
99
+ default=0.0,
100
+ metadata={
101
+ "help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
102
+ "span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
103
+ },
104
+ )
105
+ mask_feature_length: int = field(
106
+ default=10,
107
+ metadata={"help": "Length of vector span to mask along the feature axis."},
108
+ )
109
+ layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
110
+ ctc_loss_reduction: Optional[str] = field(
111
+ default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
112
+ )
113
+
114
+
115
116
+
117
+
118
+ @dataclass
119
+ class DataTrainingArguments:
120
+ """
121
+ Arguments pertaining to what data we are going to input our model for training and eval.
122
+
123
+ Using `HfArgumentParser` we can turn this class
124
+ into argparse arguments to be able to specify them on
125
+ the command line.
126
+ """
127
+
128
+ dataset_name: str = field(
129
+ metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
130
+ )
131
+ dataset_config_name: str = field(
132
+ default="ab", metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
133
+ )
134
+ train_split_name: str = field(
135
+ default="train+validation",
136
+ metadata={
137
+ "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
138
+ },
139
+ )
140
+ eval_split_name: str = field(
141
+ default="test",
142
+ metadata={
143
+ "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
144
+ },
145
+ )
146
+ audio_column_name: str = field(
147
+ default="audio",
148
+ metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
149
+ )
150
+ text_column_name: str = field(
151
+ default="text",
152
+ metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
153
+ )
154
+ overwrite_cache: bool = field(
155
+ default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
156
+ )
157
+ preprocessing_num_workers: Optional[int] = field(
158
+ default=None,
159
+ metadata={"help": "The number of processes to use for the preprocessing."},
160
+ )
161
+ max_train_samples: Optional[int] = field(
162
+ default=None,
163
+ metadata={
164
+ "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
165
+ "value if set."
166
+ },
167
+ )
168
+ max_eval_samples: Optional[int] = field(
169
+ default=None,
170
+ metadata={
171
+ "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
172
+ "value if set."
173
+ },
174
+ )
175
+ chars_to_ignore: Optional[List[str]] = list_field(
176
+ default=None,
177
+ metadata={"help": "A list of characters to remove from the transcripts."},
178
+ )
179
+ eval_metrics: List[str] = list_field(
180
+ default=["wer"],
181
+ metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
182
+ )
183
+ max_duration_in_seconds: float = field(
184
+ default=20.0,
185
+ metadata={
186
+ "help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
187
+ },
188
+ )
189
+ min_duration_in_seconds: float = field(
190
+ default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
191
+ )
192
+ preprocessing_only: bool = field(
193
+ default=False,
194
+ metadata={
195
+ "help": "Whether to only do data preprocessing and skip training. "
196
+ "This is especially useful when data preprocessing errors out in distributed training due to timeout. "
197
+ "In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
198
+ "so that the cached datasets can consequently be loaded in distributed training"
199
+ },
200
+ )
201
+ use_auth_token: bool = field(
202
+ default=False,
203
+ metadata={
204
+ "help": "If :obj:`True`, will use the token generated when running"
205
+ ":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
206
+ },
207
+ )
208
+ unk_token: str = field(
209
+ default="[UNK]",
210
+ metadata={"help": "The unk token for the tokenizer"},
211
+ )
212
+ pad_token: str = field(
213
+ default="[PAD]",
214
+ metadata={"help": "The padding token for the tokenizer"},
215
+ )
216
+ word_delimiter_token: str = field(
217
+ default="|",
218
+ metadata={"help": "The word delimiter token for the tokenizer"},
219
+ )
220
+ phoneme_language: Optional[str] = field(
221
+ default=None,
222
+ metadata={
223
+ "help": "The target language that should be used be"
224
+ " passed to the tokenizer for tokenization. Note that"
225
+ " this is only relevant if the model classifies the"
226
+ " input audio to a sequence of phoneme sequences."
227
+ },
228
+ )
229
+
230
+
231
232
+
233
+
234
+ @dataclass
235
+ class DataCollatorCTCWithPadding:
236
+ """
237
+ Data collator that will dynamically pad the inputs received.
238
+ Args:
239
+ processor (:class:`~transformers.AutoProcessor`)
240
+ The processor used for processing the data.
241
+ padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
242
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
243
+ among:
244
+ * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
245
+ sequence is provided).
246
+ * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
247
+ maximum acceptable input length for the model if that argument is not provided.
248
+ * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
249
+ different lengths).
250
+ max_length (:obj:`int`, `optional`):
251
+ Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
252
+ max_length_labels (:obj:`int`, `optional`):
253
+ Maximum length of the ``labels`` returned list and optionally padding length (see above).
254
+ pad_to_multiple_of (:obj:`int`, `optional`):
255
+ If set will pad the sequence to a multiple of the provided value.
256
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
257
+ 7.5 (Volta).
258
+ """
259
+
260
+ processor: AutoProcessor
261
+ padding: Union[bool, str] = "longest"
262
+ pad_to_multiple_of: Optional[int] = None
263
+ pad_to_multiple_of_labels: Optional[int] = None
264
+
265
+ def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
266
+ # split inputs and labels since they have to be of different lengths and need
267
+ # different padding methods
268
+ input_features = [{"input_values": feature["input_values"]} for feature in features]
269
+ label_features = [{"input_ids": feature["labels"]} for feature in features]
270
+
271
+ batch = self.processor.pad(
272
+ input_features,
273
+ padding=self.padding,
274
+ pad_to_multiple_of=self.pad_to_multiple_of,
275
+ return_tensors="pt",
276
+ )
277
+
278
+ with self.processor.as_target_processor():
279
+ labels_batch = self.processor.pad(
280
+ label_features,
281
+ padding=self.padding,
282
+ pad_to_multiple_of=self.pad_to_multiple_of_labels,
283
+ return_tensors="pt",
284
+ )
285
+
286
+ # replace padding with -100 to ignore loss correctly
287
+ labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
288
+
289
+ batch["labels"] = labels
290
+
291
+ return batch
292
+
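+ # Note: the collator pads `input_values` with the feature extractor and `labels`
+ # with the tokenizer, then replaces label padding with -100 so it is ignored by
+ # the CTC loss. A minimal (hypothetical) call looks like:
+ #   collator = DataCollatorCTCWithPadding(processor=processor)
+ #   batch = collator([{"input_values": [...], "labels": [...]}, ...])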
293
+ # download the augmented Dataset from
294
+ # https://huggingface.co/datasets/bakrianoo/arabic-cv8-augmented
295
+
296
+ base_path = "/workspace/cv-corpus-8.0-2022-01-19"
297
+
298
+ # load augmented datasets
299
+ train_ar_df = pd.read_csv(f"{base_path}/train.tsv", sep="\t")
300
+ train_ar_df["audio"] = train_ar_df["path"]
301
+
302
+ test_ar_df = pd.read_csv(f"{base_path}/test.tsv", sep="\t")
303
+ test_ar_df["audio"] = test_ar_df["path"]
304
+
305
+ train_ar_df = train_ar_df.sample(frac=1, random_state=101, ignore_index=True)
306
+
307
+ raw_datasets = DatasetDict()
308
+
309
+ # select Dataset range
310
+ from_rows = 0
311
+ to_rows = 500_000
312
+
313
+ saved_vecs_path = f"{base_path}/saved_vec_dataset-{from_rows}-{to_rows}.ds"
314
+
315
+ raw_datasets["train"] = Dataset.from_pandas(train_ar_df.iloc[from_rows:to_rows])
316
+ raw_datasets["eval"] = Dataset.from_pandas(test_ar_df)
317
+
318
+ # Audio casting
319
+ raw_datasets["train"] = raw_datasets["train"].cast_column("audio", datasets.features.Audio(sampling_rate=16000))
320
+ raw_datasets["eval"] = raw_datasets["eval"].cast_column("audio", datasets.features.Audio(sampling_rate=16000))
321
+
322
+
323
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
324
+
325
+ model_args, data_args, training_args = parser.parse_dict({
326
+ "dataset_name": "mozilla-foundation/common_voice_8_0",
327
+ "model_name_or_path": "facebook/wav2vec2-xls-r-300m",
328
+ "dataset_config_name": "ar",
329
+ "overwrite_output_dir": False,
330
+
331
+ # "preprocessing_only": True,
332
+
333
+ "output_dir": f"{base_path}/output",
334
+ "text_column_name": "sentence",
335
+
336
+ "freeze_feature_encoder": True,
337
+ "gradient_checkpointing": True,
338
+ "group_by_length": False,
339
+ "push_to_hub": False,
340
+ "use_auth_token": True,
341
+ "do_train": True,
342
+ "do_eval": True,
343
+
344
+ "per_device_train_batch_size":32,
345
+ "gradient_accumulation_steps":1,
346
+ "per_device_eval_batch_size":10,
347
+
348
+ "metric_for_best_model":'wer',
349
+ "evaluation_strategy":"steps",
350
+ "eval_steps":1000,
351
+ "logging_strategy":"steps",
352
+ "logging_steps":500,
353
+ "save_strategy":"steps",
354
+ "save_steps":1000,
355
+ "num_train_epochs":10,
356
+ "fp16":True,
357
+ "learning_rate":2e-4,
358
+ "warmup_steps":1000,
359
+ "save_total_limit":8,
360
+ "chars_to_ignore": [':', 'T', '؟', 'ۖ', '…', 'x', 'چ', '?', '.', 'ْ', 'g', '☭', 'w', ';', ',', 'a', 'ۙ', 'e', '`', '“', '!', 'n', 's', '؛', 'ﺃ', 'r', 'ٓ', 'c', '-', 't', 'u', 'l', 'o', '»', 'ٰ', 'ۗ', 'h', 'ڨ', 'ۚ', 'S', '—', 'ٌ', 'm', '”', 'd', 'ۛ', 'H', 'ُ', 'ﻻ', 'y', 'M', 'ھ', 'ک', 'ٍ', 'A', 'ۘ', 'ِ', '–', 'i', 'f', "'", 'ً', '«', 'َ'] + ['\\', '(',')','-','b','c','d','e','g','i','k','p','q','r','u','v','x'],
361
+
362
+ })
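+ # The same arguments could be supplied on the command line via HfArgumentParser;
+ # here they are fixed in code. With 32 samples per device and no gradient
+ # accumulation, the effective batch size is 32 * number_of_gpus.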
363
+
364
+
365
+ # See all possible arguments in src/transformers/training_args.py
366
+ # or by passing the --help flag to this script.
367
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
368
+
369
+ # Detecting last checkpoint.
370
+ last_checkpoint = None
371
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
372
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
373
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
374
+ raise ValueError(
375
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
376
+ "Use --overwrite_output_dir to overcome."
377
+ )
378
+ elif last_checkpoint is not None:
379
+ logger.info(
380
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
381
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
382
+ )
383
+
384
+
385
+ # Setup logging
386
+ logging.basicConfig(
387
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
388
+ datefmt="%m/%d/%Y %H:%M:%S",
389
+ handlers=[logging.StreamHandler(sys.stdout)],
390
+ )
391
+ logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
392
+
393
+ # Log on each process the small summary:
394
+ logger.warning(
395
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
396
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
397
+ )
398
+
399
+ # Set the verbosity to info of the Transformers logger (on main process only):
400
+ if is_main_process(training_args.local_rank):
401
+ transformers.utils.logging.set_verbosity_info()
402
+ logger.info("Training/evaluation parameters %s", training_args)
403
+
404
+
405
+ # Set seed before initializing model.
406
+ set_seed(training_args.seed)
407
+
408
+
409
+ ### Load Dataset
410
+
411
+
412
+ chars_to_ignore_regex = (
413
+ f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
414
+ )
415
+ text_column_name = data_args.text_column_name
416
+
417
+
418
+ def remove_special_characters(batch):
419
+ if chars_to_ignore_regex is not None:
420
+ batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
421
+ else:
422
+ batch["target_text"] = batch[text_column_name].lower() + " "
423
+ return batch
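+ # Example (hypothetical): with the chars_to_ignore list above, a transcript such as
+ # "ذَهَبَ!" becomes "ذهب " -- ignored characters (diacritics, punctuation, Latin
+ # letters) are stripped, the text is lower-cased and a trailing space is appended.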
424
+
425
+ with training_args.main_process_first(desc="dataset map special characters removal"):
426
+
427
+ raw_datasets = raw_datasets.map(
428
+ remove_special_characters,
429
+ remove_columns=[text_column_name],
430
+ desc="remove special characters from datasets",
431
+ )
432
+
433
+
434
435
+
436
+
437
+ # save special tokens for tokenizer
438
+ word_delimiter_token = data_args.word_delimiter_token
439
+ unk_token = data_args.unk_token
440
+ pad_token = data_args.pad_token
441
+
442
+ # 3. Next, let's load the config as we might need it to create
443
+ # the tokenizer
444
+ # load config
445
+ config = AutoConfig.from_pretrained(
446
+ model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
447
+ )
448
+
449
+ def create_vocabulary_from_data(
450
+ datasets: DatasetDict,
451
+ word_delimiter_token: Optional[str] = None,
452
+ unk_token: Optional[str] = None,
453
+ pad_token: Optional[str] = None,
454
+ ):
455
+ # Given training and test labels create vocabulary
456
+ def extract_all_chars(batch):
457
+ all_text = " ".join(batch["target_text"])
458
+ vocab = list(set(all_text))
459
+ return {"vocab": [vocab], "all_text": [all_text]}
460
+
461
+ vocabs = datasets.map(
462
+ extract_all_chars,
463
+ batched=True,
464
+ batch_size=-1,
465
+ keep_in_memory=True,
466
+ remove_columns=datasets["train"].column_names,
467
+ )
468
+
469
+ # take union of all unique characters in each dataset
470
+ vocab_set = functools.reduce(
471
+ lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
472
+ )
473
+
474
+
475
+ vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}
476
+
477
+ # replace white space with delimiter token
478
+ if word_delimiter_token is not None:
479
+ vocab_dict[word_delimiter_token] = vocab_dict[" "]
480
+ del vocab_dict[" "]
481
+
482
+ # add unk and pad token
483
+ if unk_token is not None:
484
+ vocab_dict[unk_token] = len(vocab_dict)
485
+
486
+ if pad_token is not None:
487
+ vocab_dict[pad_token] = len(vocab_dict)
488
+
489
+ return vocab_dict
490
+
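+ # The returned mapping is character -> id, e.g. (hypothetical)
+ # {"ء": 0, "ا": 1, ..., "|": 35, "[UNK]": 36, "[PAD]": 37}, where the space
+ # character has been replaced by the word delimiter token "|".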
491
+
492
+ raw_datasets["train"] = raw_datasets["train"].remove_columns("file_id")
493
+
494
+
495
+ # 4. Next, if no tokenizer file is defined,
496
+ # we create the vocabulary of the model by extracting all unique characters from
497
+ # the training and evaluation datasets
498
+ # We need to make sure that only first rank saves vocabulary
499
+ # make sure all processes wait until vocab is created
500
+ tokenizer_name_or_path = model_args.tokenizer_name_or_path
501
+ tokenizer_kwargs = {}
502
+ if tokenizer_name_or_path is None:
503
+ # save vocab in training output dir
504
+ tokenizer_name_or_path = training_args.output_dir
505
+
506
+ vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
507
+
508
+ with training_args.main_process_first():
509
+ if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
510
+ os.remove(vocab_file)
511
+
512
+ with training_args.main_process_first(desc="dataset map vocabulary creation"):
513
+ if not os.path.isfile(vocab_file):
514
+ os.makedirs(tokenizer_name_or_path, exist_ok=True)
515
+ vocab_dict = create_vocabulary_from_data(
516
+ raw_datasets,
517
+ word_delimiter_token=word_delimiter_token,
518
+ unk_token=unk_token,
519
+ pad_token=pad_token,
520
+ )
521
+
522
+ # save vocab dict to be loaded into tokenizer
523
+ with open(vocab_file, "w") as file:
524
+ json.dump(vocab_dict, file)
525
+
526
+ # if tokenizer has just been created
527
+ # it is defined by `tokenizer_class` if present in config else by `model_type`
528
+ tokenizer_kwargs = {
529
+ "config": config if config.tokenizer_class is not None else None,
530
+ "tokenizer_type": config.model_type if config.tokenizer_class is None else None,
531
+ "unk_token": unk_token,
532
+ "pad_token": pad_token,
533
+ "word_delimiter_token": word_delimiter_token,
534
+ }
535
+
536
+
537
+ # 5. Now we can instantiate the feature extractor, tokenizer and model
538
+ # Note for distributed training, the .from_pretrained methods guarantee that only
539
+ # one local process can concurrently download model & vocab.
540
+
541
+ # load feature_extractor and tokenizer
542
+ tokenizer = AutoTokenizer.from_pretrained(
543
+ tokenizer_name_or_path,
544
+ use_auth_token=data_args.use_auth_token,
545
+ **tokenizer_kwargs,
546
+ )
547
+ feature_extractor = AutoFeatureExtractor.from_pretrained(
548
+ model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
549
+ )
550
+
551
+
552
+ # adapt config
553
+ config.update(
554
+ {
555
+ "feat_proj_dropout": model_args.feat_proj_dropout,
556
+ "attention_dropout": model_args.attention_dropout,
557
+ "hidden_dropout": model_args.hidden_dropout,
558
+ "final_dropout": model_args.final_dropout,
559
+ "mask_time_prob": model_args.mask_time_prob,
560
+ "mask_time_length": model_args.mask_time_length,
561
+ "mask_feature_prob": model_args.mask_feature_prob,
562
+ "mask_feature_length": model_args.mask_feature_length,
563
+ "gradient_checkpointing": training_args.gradient_checkpointing,
564
+ "layerdrop": model_args.layerdrop,
565
+ "ctc_loss_reduction": model_args.ctc_loss_reduction,
566
+ "pad_token_id": tokenizer.pad_token_id,
567
+ "vocab_size": len(tokenizer),
568
+ "activation_dropout": model_args.activation_dropout,
569
+ }
570
+ )
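+ # `pad_token_id` and `vocab_size` must come from the tokenizer that was just
+ # built, so that the newly initialised CTC head matches the new vocabulary.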
571
+
572
+
573
+ # create model
574
+ model = AutoModelForCTC.from_pretrained(
575
+ model_args.model_name_or_path,
576
+ cache_dir=model_args.cache_dir,
577
+ config=config,
578
+ use_auth_token=data_args.use_auth_token,
579
+ )
580
+
581
+ # freeze encoder
582
+ if model_args.freeze_feature_encoder:
583
+ model.freeze_feature_encoder()
584
+
585
+
586
+ # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
587
+ # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
588
+ # so that we just need to set the correct target sampling rate and normalize the input
589
+ # via the `feature_extractor`
590
+
591
+ # make sure that dataset decodes audio with correct sampling rate
592
+ dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
593
+ if dataset_sampling_rate != feature_extractor.sampling_rate:
594
+ raw_datasets = raw_datasets.cast_column(
595
+ data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
596
+ )
597
+
598
+ # derive max & min input length for sample rate & max duration
599
+ max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
600
+ min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
601
+
602
+ audio_column_name = data_args.audio_column_name
603
+ num_workers = data_args.preprocessing_num_workers
604
+
605
+ # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
606
+ phoneme_language = data_args.phoneme_language
607
+
608
+
609
+ # Preprocessing the datasets.
610
+ # We need to read the audio files as arrays and tokenize the targets.
611
+ def prepare_dataset(batch):
612
+ # load audio
613
+ sample = batch[audio_column_name]
614
+
615
+ inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
616
+ batch["input_values"] = inputs.input_values[0]
617
+ batch["input_length"] = len(batch["input_values"])
618
+
619
+ # encode targets
620
+ additional_kwargs = {}
621
+ if phoneme_language is not None:
622
+ additional_kwargs["phonemizer_lang"] = phoneme_language
623
+
624
+ batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
625
+ return batch
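+ # Each mapped example ends up with `input_values` (the normalised 16 kHz waveform),
+ # `input_length` (number of samples, used below for length filtering) and
+ # `labels` (character ids produced by the tokenizer).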
626
+
627
+ def vectorizing_record(audio_path, target_text):
628
+ batch = {}
629
+
630
+ array, sampling_rate = torchaudio.load(audio_path, format="mp3")
631
+
632
+ batch["input_values"] = array.mean(axis=0)
633
+ batch["input_length"] = len(array)
634
+
635
+ # encode targets
636
+ additional_kwargs = {}
637
+ if phoneme_language is not None:
638
+ additional_kwargs["phonemizer_lang"] = phoneme_language
639
+
640
+ batch["labels"] = tokenizer(target_text, **additional_kwargs).input_ids
641
+ return batch
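+ # `vectorizing_record` is an alternative, manual loader: it reads the mp3 with
+ # torchaudio and mixes the channels down by averaging them. It appears unused by
+ # the main pipeline below, which relies on `prepare_dataset` via `datasets.map`.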
642
+
643
+
644
645
+
646
+ print(f"========\n\n{num_workers}\n\n========")
647
+ with training_args.main_process_first(desc="dataset map preprocessing"):
648
+ saved_vecs_path = f"{base_path}/saved_vec_dataset-{from_rows}-{to_rows}.ds"
649
+ if not os.path.exists(saved_vecs_path):
650
+
651
+ vectorized_datasets = raw_datasets.map(
652
+ prepare_dataset,
653
+ remove_columns=next(iter(raw_datasets.values())).column_names,
654
+ num_proc=num_workers,
655
+ desc="preprocess datasets",
656
+ )
657
+
658
+
659
+ def is_audio_in_length_range(length):
660
+ return length > min_input_length and length < max_input_length
661
+
662
+ # filter data that is shorter than min_input_length
663
+ vectorized_datasets = vectorized_datasets.filter(
664
+ is_audio_in_length_range,
665
+ num_proc=num_workers,
666
+ input_columns=["input_length"],
667
+ )
668
+
669
+ # save to local disk
670
+ vectorized_datasets.save_to_disk(saved_vecs_path)
671
+ else:
672
+ # read from disk
673
+ vectorized_datasets = load_from_disk(saved_vecs_path)
674
+
675
+ print(vectorized_datasets)
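+ # The vectorised dataset is cached on disk per row range (`saved_vecs_path`), so
+ # re-running the script with the same from_rows/to_rows skips the expensive
+ # feature extraction and simply loads the arrays with `load_from_disk`.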
676
+
677
+ # 7. Next, we can prepare the training.
678
+ # Let's use word error rate (WER) as our evaluation metric,
679
+ # instantiate a data collator and the trainer
680
+
681
+ # Define evaluation metrics during training, *i.e.* word error rate, character error rate
682
+ eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}
683
+
684
+ vectorized_datasets["train"] = vectorized_datasets["train"].remove_columns("input_length")
685
+ vectorized_datasets["eval"] = vectorized_datasets["eval"].remove_columns("input_length")
686
+
687
+ # for large datasets it is advised to run the preprocessing on a
688
+ # single machine first with ``args.preprocessing_only`` since there will most likely
689
+ # be a timeout when running the script in distributed mode.
690
+ # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
691
+ # cached dataset
692
+ if data_args.preprocessing_only:
693
+ logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
694
+
695
+
696
+
697
+ def compute_metrics(pred):
698
+ pred_logits = pred.predictions
699
+ pred_ids = np.argmax(pred_logits, axis=-1)
700
+
701
+ pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
702
+
703
+ pred_str = tokenizer.batch_decode(pred_ids)
704
+
705
+ # we do not want to group tokens when computing the metrics
706
+ label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
707
+
708
+ metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
709
+ return metrics
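+ # Predictions are decoded greedily (argmax over the CTC logits); -100 entries in
+ # the labels are restored to the pad token before decoding, and label decoding is
+ # done with group_tokens=False so repeated characters are not collapsed.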
710
+
711
+ # Now save everything to be able to create a single processor later
712
+ if is_main_process(training_args.local_rank):
713
+ # save feature extractor, tokenizer and config
714
+ feature_extractor.save_pretrained(training_args.output_dir)
715
+ tokenizer.save_pretrained(training_args.output_dir)
716
+ config.save_pretrained(training_args.output_dir)
717
+
718
+ try:
719
+ processor = AutoProcessor.from_pretrained(training_args.output_dir)
720
+ except (OSError, KeyError):
721
+ warnings.warn(
722
+ "Loading a processor from a feature extractor config that does not"
723
+ " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
724
+ " attribute to your `preprocessor_config.json` file to suppress this warning: "
725
+ " `'processor_class': 'Wav2Vec2Processor'`",
726
+ FutureWarning,
727
+ )
728
+ processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)
729
+
730
+ # Instantiate custom data collator
731
+ data_collator = DataCollatorCTCWithPadding(processor=processor)
732
+
733
+
734
+ decay_parameters = get_parameter_names(model, [torch.nn.LayerNorm])
735
+ decay_parameters = [name for name in decay_parameters if "bias" not in name]
736
+
737
+ optimizer_grouped_parameters = [
738
+ {
739
+ "params": [p for n, p in model.named_parameters() if n in decay_parameters],
740
+ "weight_decay": training_args.weight_decay,
741
+ },
742
+ {
743
+ "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
744
+ "weight_decay": 0.0,
745
+ },
746
+ ]
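+ # Weight decay is applied only to parameters that are neither biases nor
+ # LayerNorm weights, mirroring the Trainer's default parameter grouping.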
747
+
748
+ optimizer = bnb.optim.Adam8bit(
749
+ params=optimizer_grouped_parameters,
750
+ lr=training_args.learning_rate,
751
+ betas=(training_args.adam_beta1, training_args.adam_beta2),
752
+ eps=training_args.adam_epsilon,
753
+ )
754
+
755
+ optimizers = (optimizer, None)
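+ # bitsandbytes' Adam8bit keeps the optimizer state in 8-bit to save GPU memory.
+ # Passing (optimizer, None) lets the Trainer create its default learning-rate
+ # scheduler (a linear schedule using the configured `warmup_steps`) on top of it.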
756
+
757
+
758
+ # Initialize Trainer
759
+ trainer = Trainer(
760
+ model=model,
761
+ data_collator=data_collator,
762
+ args=training_args,
763
+ compute_metrics=compute_metrics,
764
+ train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
765
+ eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
766
+ tokenizer=feature_extractor,
767
+ optimizers=optimizers,
768
+ )
769
+
770
+
771
+
772
+ # 8. Finally, we can start training
773
+
774
+ # Training
775
+ if training_args.do_train and not data_args.preprocessing_only:
776
+
777
+ # use last checkpoint if exist
778
+ if last_checkpoint is not None:
779
+ checkpoint = last_checkpoint
780
+ elif os.path.isdir(model_args.model_name_or_path):
781
+ checkpoint = model_args.model_name_or_path
782
+ else:
783
+ checkpoint = None
784
+
785
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
786
+ trainer.save_model()
787
+
788
+ metrics = train_result.metrics
789
+ max_train_samples = (
790
+ data_args.max_train_samples
791
+ if data_args.max_train_samples is not None
792
+ else len(vectorized_datasets["train"])
793
+ )
794
+ metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))
795
+
796
+ trainer.log_metrics("train", metrics)
797
+ trainer.save_metrics("train", metrics)
798
+ trainer.save_state()
799
+
800
+
801
+ # Evaluation
802
+ results = {}
803
+ if training_args.do_eval and not data_args.preprocessing_only:
804
+ logger.info("*** Evaluate ***")
805
+ metrics = trainer.evaluate()
806
+ max_eval_samples = (
807
+ data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
808
+ )
809
+ metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
810
+
811
+ trainer.log_metrics("eval", metrics)
812
+ trainer.save_metrics("eval", metrics)
813
+
814
+ # Write model card and (optionally) push to hub
815
+ config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
816
+ kwargs = {
817
+ "finetuned_from": model_args.model_name_or_path,
818
+ "tasks": "speech-recognition",
819
+ "tags": ["automatic-speech-recognition", data_args.dataset_name],
820
+ "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
821
+ "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
822
+ }
823
+
824
+ if not data_args.preprocessing_only:
825
+ if "common_voice" in data_args.dataset_name:
826
+ kwargs["language"] = config_name
827
+
828
+
829
+ if training_args.push_to_hub:
830
+ trainer.push_to_hub(**kwargs)
831
+ else:
832
+ trainer.create_model_card(**kwargs)
833
+
834
+ print(results)
835
+