marinone94 committed
Commit a9cc788
1 Parent(s): a7f7ee8

clean script

join_datasets_asr_ctc.py CHANGED
@@ -356,19 +356,8 @@ def create_vocabulary_from_data(
     return vocab_dict
 
 
-def main():
-    # See all possible arguments in src/transformers/training_args.py
-    # or by passing the --help flag to this script.
-    # We now keep distinct sets of args, for a cleaner separation of concerns.
-
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
-    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
-        # If we pass only one argument to the script and it's the path to a json file,
-        # let's parse it to get our arguments.
-        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
-    else:
-        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
-
+def init_wandb(training_args):
+    # Adds report to wandb in training args if login succeeds
     # TODO: Replace with check of wandb env vars
     try:
         repo_name = os.getcwd().split("/")[-1]
@@ -381,7 +370,10 @@ def main():
     except:
         pass
 
-    # Detecting last checkpoint.
+    return training_args
+
+def detect_last_checkpoint(training_args):
+    # Get last checkpoint if training mode and no overwrite flag is set
     last_checkpoint = None
     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
         last_checkpoint = get_last_checkpoint(training_args.output_dir)
@@ -395,8 +387,12 @@ def main():
             f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
            "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
         )
+
+    return last_checkpoint
 
-    # Setup logging
+
+def set_logging(training_args):
+    # Set logging level
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
         datefmt="%m/%d/%Y %H:%M:%S",
@@ -414,10 +410,9 @@ def main():
         transformers.utils.logging.set_verbosity_info()
     logger.info("Training/evaluation parameters %s", training_args)
 
-    # Set seed before initializing model.
-    set_seed(training_args.seed)
 
-    # 1. First, let's load the dataset
+def load_raw_datasets(training_args, data_args):
+
     raw_datasets = DatasetDict()
 
     def common_cols(columns_a, columns_b):
@@ -435,7 +430,6 @@ def main():
         data_args.train_split_name.split(","),
     ):
 
-
         if train_split_name != "None":
             if "train" not in raw_datasets:
                 raw_datasets["train"] = load_dataset(
@@ -544,17 +538,20 @@ def main():
     other_columns_eval = [col for col in raw_datasets["eval"].column_names if col not in min_columns_eval]
     raw_datasets["eval"].remove_columns(other_columns_eval)
 
-    # pd_eval_head = raw_datasets["eval"].select(range(10)).to_pandas()
-    # pd_eval_tail = raw_datasets["eval"].select(range(raw_datasets["eval"].num_rows-10, raw_datasets["eval"].num_rows)).to_pandas()
-    # pd_eval = pd.concat([pd_eval_head, pd_eval_tail])
-    # print(pd_eval["audio"])
-
-    # 2. We remove some special characters from the datasets
-    # that make training complicated and do not help in transcribing the speech
-    # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
-    # that could be easily picked up by the model
+    # pd_eval_head = raw_datasets["eval"].select(range(10)).to_pandas()
+    # pd_eval_tail = raw_datasets["eval"].select(range(raw_datasets["eval"].num_rows-10, raw_datasets["eval"].num_rows)).to_pandas()
+    # pd_eval = pd.concat([pd_eval_head, pd_eval_tail])
+    # print(pd_eval["audio"])
+
+    return raw_datasets
+
+
+def preprocess_text_datasets(raw_datasets, training_args, data_args):
+
+    chars_to_ignore = [",", "?", ".", "!", "-", ";", ":", "\"", "“", "%", "‘", "”", "�", "—", "’", "…", "–"]
+
     chars_to_ignore_regex = (
-        f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
+        f'[{"".join(chars_to_ignore)}]'
     )
     text_column_name = data_args.text_column_name
 
@@ -616,18 +613,11 @@ def main():
     unk_token = data_args.unk_token
     pad_token = data_args.pad_token
 
-    # 3. Next, let's load the config as we might need it to create
-    # the tokenizer
-    # load config
-    config = AutoConfig.from_pretrained(
-        model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
-    )
+    return raw_datasets, word_delimiter_token, unk_token, pad_token
 
-    # 4. Next, if no tokenizer file is defined,
-    # we create the vocabulary of the model by extracting all unique characters from
-    # the training and evaluation datasets
-    # We need to make sure that only first rank saves vocabulary
-    # make sure all processes wait until vocab is created
+
+def create_vocab(raw_datasets, config, training_args, model_args, word_delimiter_token, unk_token, pad_token):
+
     tokenizer_name_or_path = model_args.tokenizer_name_or_path
     tokenizer_kwargs = {}
     if tokenizer_name_or_path is None:
@@ -663,11 +653,17 @@ def main():
         "pad_token": pad_token,
         "word_delimiter_token": word_delimiter_token,
     }
+    return tokenizer_name_or_path, tokenizer_kwargs
 
 
-    # 5. Now we can instantiate the feature extractor, tokenizer and model
-    # Note for distributed training, the .from_pretrained methods guarantee that only
-    # one local process can concurrently download model & vocab.
+def inst_model_tokenizer_feature_extractor(
+    training_args,
+    model_args,
+    data_args,
+    tokenizer_name_or_path,
+    tokenizer_kwargs,
+    config
+):
     # load feature_extractor and tokenizer
     tokenizer = AutoTokenizer.from_pretrained(
         tokenizer_name_or_path,
@@ -709,11 +705,13 @@ def main():
     # freeze encoder
     if model_args.freeze_feature_encoder:
         model.freeze_feature_encoder()
+
+    return model, tokenizer, feature_extractor, config
 
-    # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
-    # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
-    # so that we just need to set the correct target sampling rate and normalize the input
-    # via the `feature_extractor`
+
+def preprocess_audio_datasets(raw_datasets, tokenizer, feature_extractor, training_args, data_args):
+
+    num_workers = data_args.preprocessing_num_workers
 
     # make sure that dataset decodes audio with correct sampling rate
     dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
@@ -773,13 +771,6 @@ def main():
         input_columns=["input_length"],
     )
 
-    # 7. Next, we can prepare the training.
-    # Let's use word error rate (WER) as our evaluation metric,
-    # instantiate a data collator and the trainer
-
-    # Define evaluation metrics during training, *i.e.* word error rate, character error rate
-    eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}
-
     # If dataset_seed is set, shuffle train
     if data_args.dataset_seed is not None:
         vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(seed=data_args.dataset_seed)
@@ -789,10 +780,101 @@ def main():
         pd_eval = vectorized_datasets["eval"].select(range(10)).to_pandas()
         # wandb.log({"train_sample": pd_train})
         # wandb.log({"eval_sample": pd_eval})
 
-    print(pd_train)
-    print(pd_eval)
+
+    return vectorized_datasets
+
+
+def main():
+    # 0. Initialize script
+
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Adds report to wandb in training args if login succeeds
+    training_args = init_wandb(training_args=training_args)
+
+    last_checkpoint = detect_last_checkpoint(training_args=training_args)
+
+    set_logging(training_args=training_args)
+
+    set_seed(training_args.seed)
+
+    # 1. Load and compose the datasets
+    raw_datasets = load_raw_datasets(
+        training_args=training_args,
+        data_args=data_args
+    )
+
+    # 2. Preprocess the datasets
+    #
+    # We remove some special characters from the datasets
+    # that make training complicated and do not help in transcribing the speech
+    # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
+    # that could be easily picked up by the model
+    raw_datasets, word_delimiter_token, unk_token, pad_token = preprocess_text_datasets(
+        raw_datasets=raw_datasets,
+        training_args=training_args,
+        data_args=data_args
+    )
+
+    # 3. Load the config to create the tokenizer
+    config = AutoConfig.from_pretrained(
+        model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
+    )
+
+    # 4. Next, if no tokenizer file is defined,
+    # we create the vocabulary of the model by extracting all unique characters from
+    # the training and evaluation datasets
+    # We need to make sure that only first rank saves vocabulary
+    # make sure all processes wait until vocab is created
+    tokenizer_name_or_path, tokenizer_kwargs = create_vocab(
+        raw_datasets=raw_datasets,
+        config=config,
+        training_args=training_args,
+        model_args=model_args,
+        word_delimiter_token=word_delimiter_token,
+        unk_token=unk_token,
+        pad_token=pad_token,
+    )
+
+    # 5. Instantiate the feature extractor, tokenizer and model
+    # Note for distributed training, the .from_pretrained methods guarantee that only
+    # one local process can concurrently download model & vocab.
+    model, tokenizer, feature_extractor, config = inst_model_tokenizer_feature_extractor(
+        training_args=training_args,
+        model_args=model_args,
+        data_args=data_args,
+        tokenizer_name_or_path=tokenizer_name_or_path,
+        tokenizer_kwargs=tokenizer_kwargs,
+        config=config
+    )
+
+    # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
+    # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
+    # so that we just need to set the correct target sampling rate and normalize the input
+    # via the `feature_extractor`
+    vectorized_datasets = preprocess_audio_datasets(
+        raw_datasets=raw_datasets,
+        tokenizer=tokenizer,
+        feature_extractor=feature_extractor,
+        training_args=training_args,
+        data_args=data_args
+    )
+
+    # 7. Next, we can prepare the training.
+    # Let's use word error rate (WER) as our evaluation metric,
+    # instantiate a data collator and the trainer
 
     # for large datasets it is advised to run the preprocessing on a
     # single machine first with ``args.preprocessing_only`` since there will mostly likely
@@ -803,23 +885,23 @@ def main():
         logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
         return
 
+    # Define evaluation metrics during training, *i.e.* word error rate, character error rate
+    eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}
     def compute_metrics(pred):
         pred_logits = pred.predictions
         pred_ids = np.argmax(pred_logits, axis=-1)
 
         pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
-
         pred_str = tokenizer.batch_decode(pred_ids)
 
         # we do not want to group tokens when computing the metrics
         label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
 
-        print(pred_str)
-        print(label_str)
         metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
 
         return metrics
 
+
     # Now save everything to be able to create a single processor later
     if is_main_process(training_args.local_rank):
         # save feature extractor, tokenizer and config
@@ -854,8 +936,6 @@ def main():
     )
 
     # 8. Finally, we can start training
-
-    # Training
     if training_args.do_train:
 
         # use last checkpoint if exist
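The new `init_wandb` helper still probes for a login inside a bare `try`/`except`, and its TODO asks to replace that with a check of the wandb environment variables. A minimal sketch of what that check could look like, assuming the standard `WANDB_API_KEY` variable and the `report_to` field of `TrainingArguments`; the body below is hypothetical, not part of this commit:

```python
import os


def init_wandb(training_args):
    # Hypothetical version of the TODO above: instead of attempting a
    # wandb login and silently swallowing failures, check the environment
    # variable that wandb itself reads for authentication.
    if os.environ.get("WANDB_API_KEY"):
        # "wandb" is a valid entry for TrainingArguments.report_to
        training_args.report_to = ["wandb"]
    return training_args
```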
 
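The commit also hardcodes the punctuation list inside `preprocess_text_datasets` instead of reading it from `data_args.chars_to_ignore`, which is why the `--chars_to_ignore` flag disappears from the launcher script below. A minimal sketch of how such a regex is typically applied in the Hugging Face CTC examples; the mapping function itself sits outside the hunks shown, so the helper below is an assumption. Note that joining the raw list leaves an unescaped `-` in the middle of the character class, where `!-;` parses as a range that also matches digits, so the sketch uses `re.escape`:

```python
import re

# Punctuation list hardcoded by this commit in preprocess_text_datasets.
chars_to_ignore = [",", "?", ".", "!", "-", ";", ":", "\"", "“", "%", "‘", "”", "�", "—", "’", "…", "–"]

# re.escape avoids the accidental `!-;` character range produced by
# joining the raw list as in the committed f-string.
chars_to_ignore_regex = f'[{re.escape("".join(chars_to_ignore))}]'


def remove_special_characters(batch, text_column_name="sentence"):
    # Typical mapping step in the HF ASR examples: strip the ignored
    # characters and lowercase before building the character vocabulary.
    batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
    return batch


print(remove_special_characters({"sentence": "Hello, world!"})["target_text"])  # -> "hello world "
```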
join_datasets_asr_ctc_run.sh CHANGED
@@ -20,7 +20,6 @@ python old_run_asr_ctc.py \
     --eval_steps="50" \
     --save_steps="50" \
     --text_column_name="sentence" \
-    --chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
     --logging_steps="20" \
     --layerdrop="0.0" \
     --activation_dropout="0.1" \
 