marinone94
/

xls-r-300m-sv-robust

@@ -356,19 +356,8 @@ def create_vocabulary_from_data(
     return vocab_dict
-def main():
-    # See all possible arguments in src/transformers/training_args.py
-    # or by passing the --help flag to this script.
-    # We now keep distinct sets of args, for a cleaner separation of concerns.
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
-    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
-        # If we pass only one argument to the script and it's the path to a json file,
-        # let's parse it to get our arguments.
-        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
-    else:
-        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
     # TODO: Replace with check of wandb env vars
     try:
         repo_name = os.getcwd().split("/")[-1]
@@ -381,7 +370,10 @@ def main():
     except:
         pass
-    # Detecting last checkpoint.
     last_checkpoint = None
     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
         last_checkpoint = get_last_checkpoint(training_args.output_dir)
@@ -395,8 +387,12 @@ def main():
                 f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                 "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
             )
-    # Setup logging
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
         datefmt="%m/%d/%Y %H:%M:%S",
@@ -414,10 +410,9 @@ def main():
         transformers.utils.logging.set_verbosity_info()
     logger.info("Training/evaluation parameters %s", training_args)
-    # Set seed before initializing model.
-    set_seed(training_args.seed)
-    # 1. First, let's load the dataset
     raw_datasets = DatasetDict()
     def common_cols(columns_a, columns_b):
@@ -435,7 +430,6 @@ def main():
             data_args.train_split_name.split(","),
         ):
             if train_split_name != "None":
                 if "train" not in raw_datasets:
                     raw_datasets["train"] = load_dataset(
@@ -544,17 +538,20 @@ def main():
         other_columns_eval = [col for col in raw_datasets["eval"].column_names if col not in min_columns_eval]
         raw_datasets["eval"].remove_columns(other_columns_eval)
-    # pd_eval_head = raw_datasets["eval"].select(range(10)).to_pandas()
-    # pd_eval_tail = raw_datasets["eval"].select(range(raw_datasets["eval"].num_rows-10, raw_datasets["eval"].num_rows)).to_pandas()
-    # pd_eval = pd.concat([pd_eval_head, pd_eval_tail])
-    # print(pd_eval["audio"])
-    # 2. We remove some special characters from the datasets
-    # that make training complicated and do not help in transcribing the speech
-    # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
-    # that could be easily picked up by the model
     chars_to_ignore_regex = (
-        f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
     )
     text_column_name = data_args.text_column_name
@@ -616,18 +613,11 @@ def main():
     unk_token = data_args.unk_token
     pad_token = data_args.pad_token
-    # 3. Next, let's load the config as we might need it to create
-    # the tokenizer
-    # load config
-    config = AutoConfig.from_pretrained(
-        model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
-    )
-    # 4. Next, if no tokenizer file is defined,
-    # we create the vocabulary of the model by extracting all unique characters from
-    # the training and evaluation datasets
-    # We need to make sure that only first rank saves vocabulary
-    # make sure all processes wait until vocab is created
     tokenizer_name_or_path = model_args.tokenizer_name_or_path
     tokenizer_kwargs = {}
     if tokenizer_name_or_path is None:
@@ -663,11 +653,17 @@ def main():
             "pad_token": pad_token,
             "word_delimiter_token": word_delimiter_token,
         }
-    # 5. Now we can instantiate the feature extractor, tokenizer and model
-    # Note for distributed training, the .from_pretrained methods guarantee that only
-    # one local process can concurrently download model & vocab.
     # load feature_extractor and tokenizer
     tokenizer = AutoTokenizer.from_pretrained(
         tokenizer_name_or_path,
@@ -709,11 +705,13 @@ def main():
     # freeze encoder
     if model_args.freeze_feature_encoder:
         model.freeze_feature_encoder()
-    # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
-    # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
-    # so that we just need to set the correct target sampling rate and normalize the input
-    # via the `feature_extractor`
     # make sure that dataset decodes audio with correct sampling rate
     dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
@@ -773,13 +771,6 @@ def main():
             input_columns=["input_length"],
         )
-    # 7. Next, we can prepare the training.
-    # Let's use word error rate (WER) as our evaluation metric,
-    # instantiate a data collator and the trainer
-    # Define evaluation metrics during training, *i.e.* word error rate, character error rate
-    eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}
     # If dataset_seed is set, shuffle train
     if data_args.dataset_seed is not None:
         vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(seed=data_args.dataset_seed)
@@ -789,10 +780,101 @@ def main():
     pd_eval = vectorized_datasets["eval"].select(range(10)).to_pandas()
     # wandb.log({"train_sample": pd_train})
     # wandb.log({"eval_sample": pd_eval})
-    print(pd_train)
-    print(pd_eval)
     # for large datasets it is advised to run the preprocessing on a
     # single machine first with ``args.preprocessing_only`` since there will most likely
@@ -803,23 +885,23 @@ def main():
         logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
         return
     def compute_metrics(pred):
         pred_logits = pred.predictions
         pred_ids = np.argmax(pred_logits, axis=-1)
         pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
         pred_str = tokenizer.batch_decode(pred_ids)
         # we do not want to group tokens when computing the metrics
         label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
-        print(pred_str)
-        print(label_str)
         metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
         return metrics
     # Now save everything to be able to create a single processor later
     if is_main_process(training_args.local_rank):
         # save feature extractor, tokenizer and config
@@ -854,8 +936,6 @@ def main():
     )
     # 8. Finally, we can start training
-    # Training
     if training_args.do_train:
         # use last checkpoint if exist

     return vocab_dict
+def init_wandb(training_args):
+    # Adds report to wandb in training args if login succeeds
     # TODO: Replace with check of wandb env vars
     try:
         repo_name = os.getcwd().split("/")[-1]
     except:
         pass
+    return training_args
+def detect_last_checkpoint(training_args):
+    # Get last checkpoint if training mode and no overwrite flag is set
     last_checkpoint = None
     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
         last_checkpoint = get_last_checkpoint(training_args.output_dir)
                 f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                 "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
             )
+    return last_checkpoint
+def set_logging(training_args):
+    # Set logging level
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
         datefmt="%m/%d/%Y %H:%M:%S",
         transformers.utils.logging.set_verbosity_info()
     logger.info("Training/evaluation parameters %s", training_args)
+def load_raw_datasets(training_args, data_args):
     raw_datasets = DatasetDict()
     def common_cols(columns_a, columns_b):
             data_args.train_split_name.split(","),
         ):
             if train_split_name != "None":
                 if "train" not in raw_datasets:
                     raw_datasets["train"] = load_dataset(
         other_columns_eval = [col for col in raw_datasets["eval"].column_names if col not in min_columns_eval]
         raw_datasets["eval"].remove_columns(other_columns_eval)
+        # pd_eval_head = raw_datasets["eval"].select(range(10)).to_pandas()
+        # pd_eval_tail = raw_datasets["eval"].select(range(raw_datasets["eval"].num_rows-10, raw_datasets["eval"].num_rows)).to_pandas()
+        # pd_eval = pd.concat([pd_eval_head, pd_eval_tail])
+        # print(pd_eval["audio"])
+    return raw_datasets
+def preprocess_text_datasets(raw_datasets, training_args, data_args):
+    chars_to_ignore = [",", "?", ".", "!", "-", ";", ":", "\"", "“", "%", "‘", "”", "�", "—", "’", "…", "–"]
     chars_to_ignore_regex = (
+        f'[{"".join(chars_to_ignore)}]'
     )
     text_column_name = data_args.text_column_name
     unk_token = data_args.unk_token
     pad_token = data_args.pad_token
+    return raw_datasets, word_delimiter_token, unk_token, pad_token
+def create_vocab(raw_datasets, config, training_args, model_args, word_delimiter_token, unk_token, pad_token):
     tokenizer_name_or_path = model_args.tokenizer_name_or_path
     tokenizer_kwargs = {}
     if tokenizer_name_or_path is None:
             "pad_token": pad_token,
             "word_delimiter_token": word_delimiter_token,
         }
+        return tokenizer_name_or_path, tokenizer_kwargs
+def inst_model_tokenizer_feature_extractor(
+    training_args,
+    model_args,
+    data_args,
+    tokenizer_name_or_path,
+    tokenizer_kwargs,
+    config
+):
     # load feature_extractor and tokenizer
     tokenizer = AutoTokenizer.from_pretrained(
         tokenizer_name_or_path,
     # freeze encoder
     if model_args.freeze_feature_encoder:
         model.freeze_feature_encoder()
+    return model, tokenizer, feature_extractor, config
+def preprocess_audio_datasets(raw_datasets, tokenizer, feature_extractor, training_args, data_args):
+    num_workers = data_args.preprocessing_num_workers
     # make sure that dataset decodes audio with correct sampling rate
     dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
             input_columns=["input_length"],
         )
     # If dataset_seed is set, shuffle train
     if data_args.dataset_seed is not None:
         vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(seed=data_args.dataset_seed)
     pd_eval = vectorized_datasets["eval"].select(range(10)).to_pandas()
     # wandb.log({"train_sample": pd_train})
     # wandb.log({"eval_sample": pd_eval})
+    return vectorized_datasets
+def main():
+    # 0. Initialize script
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    # Adds report to wandb in training args if login succeeds
+    training_args = init_wandb(training_args=training_args)
+    last_checkpoint = detect_last_checkpoint(training_args=training_args)
+    set_logging(training_args=training_args)
+    set_seed(training_args.seed)
+    # 1. Load and compose the datasets
+    raw_datasets = load_raw_datasets(
+        training_args=training_args,
+        data_args=data_args
+    )
+    # 2. Preprocess the datasets
+    #
+    # We remove some special characters from the datasets
+    # that make training complicated and do not help in transcribing the speech
+    # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
+    # that could be easily picked up by the model
+    raw_datasets, word_delimiter_token, unk_token, pad_token = preprocess_text_datasets(
+        raw_datasets= raw_datasets,
+        training_args=training_args,
+        data_args=data_args
+    )
+    # 3. Load the config to create the tokenizer
+    config = AutoConfig.from_pretrained(
+        model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
+    )
+    # 4. Next, if no tokenizer file is defined,
+    # we create the vocabulary of the model by extracting all unique characters from
+    # the training and evaluation datasets
+    # We need to make sure that only first rank saves vocabulary
+    # make sure all processes wait until vocab is created
+    tokenizer_name_or_path, tokenizer_kwargs = create_vocab(
+        raw_datasets= raw_datasets,
+        config=config,
+        training_args=training_args,
+        model_args=model_args,
+        word_delimiter_token=word_delimiter_token,
+        unk_token=unk_token,
+        pad_token=pad_token,
+    )
+    # 5. Instantiate the feature extractor, tokenizer and model
+    # Note for distributed training, the .from_pretrained methods guarantee that only
+    # one local process can concurrently download model & vocab.
+    model, tokenizer, feature_extractor, config = inst_model_tokenizer_feature_extractor(
+        training_args=training_args,
+        model_args=model_args,
+        data_args=data_args,
+        tokenizer_name_or_path=tokenizer_name_or_path,
+        tokenizer_kwargs=tokenizer_kwargs,
+        config=config
+    )
+    # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
+    # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
+    # so that we just need to set the correct target sampling rate and normalize the input
+    # via the `feature_extractor`
+    vectorized_datasets = preprocess_audio_datasets(
+        raw_datasets=raw_datasets,
+        tokenizer=tokenizer,
+        feature_extractor=feature_extractor,
+        training_args=training_args,
+        data_args=data_args
+    )
+    # 7. Next, we can prepare the training.
+    # Let's use word error rate (WER) as our evaluation metric,
+    # instantiate a data collator and the trainer
     # for large datasets it is advised to run the preprocessing on a
     # single machine first with ``args.preprocessing_only`` since there will most likely
         logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
         return
+    # Define evaluation metrics during training, *i.e.* word error rate, character error rate
+    eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}
     def compute_metrics(pred):
         """Decode predictions and labels and score them with every metric in ``eval_metrics``.

         Reads ``pred.predictions`` and ``pred.label_ids``; note that
         ``pred.label_ids`` is modified in place. ``tokenizer`` and
         ``eval_metrics`` are taken from the enclosing scope.

         Returns a dict with one entry per key of ``eval_metrics``, holding the
         value returned by that metric's ``compute``.
         """
         pred_logits = pred.predictions
         # pick the highest-scoring index along the last axis of the predictions
         pred_ids = np.argmax(pred_logits, axis=-1)
         # replace every -100 label with the pad token id before decoding
         # NOTE(review): -100 is presumably the ignore index used for padded label positions — confirm
         pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
         pred_str = tokenizer.batch_decode(pred_ids)
         # we do not want to group tokens when computing the metrics
         label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
         # every metric receives the same decoded predictions and references
         metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
         return metrics
     # Now save everything to be able to create a single processor later
     if is_main_process(training_args.local_rank):
         # save feature extractor, tokenizer and config
     )
     # 8. Finally, we can start training
     if training_args.do_train:
         # use last checkpoint if exist

join_datasets_asr_ctc_run.sh CHANGED Viewed

@@ -20,7 +20,6 @@ python old_run_asr_ctc.py \
 	--eval_steps="50" \
 	--save_steps="50" \
 	--text_column_name="sentence" \
-	--chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
 	--logging_steps="20" \
 	--layerdrop="0.0" \
 	--activation_dropout="0.1" \

 	--eval_steps="50" \
 	--save_steps="50" \
 	--text_column_name="sentence" \
 	--logging_steps="20" \
 	--layerdrop="0.0" \
 	--activation_dropout="0.1" \