pere committed on
Commit f5c74a6
1 Parent(s): f583313
Files changed (2)
  1. run_test.nst +41 -0
  2. run_whisper_finetuning.py +21 -18
run_test.nst ADDED
@@ -0,0 +1,41 @@
+ # Whisper Finetuning script for the NST dataset
+ # Currently for training on a 48GB GPU
+ # Reduce batch size and learning rate if training on a smaller GPU
+
+ python run_whisper_finetuning.py \
+ --model_name_or_path="openai/whisper-small" \
+ --output_dir="../whisper-test-delete" \
+ --overwrite_output_dir=True \
+ --language="Norwegian" \
+ --task="transcribe" \
+ --dataset_name="NbAiLab/NST" \
+ --dataset_config="no-close" \
+ --do_train=True \
+ --do_eval=True \
+ --audio_column_name="audio" \
+ --text_column_name="text" \
+ --per_device_train_batch_size=48 \
+ --per_device_eval_batch_size=48 \
+ --learning_rate=4e-5 \
+ --warmup_steps=5 \
+ --max_steps=50 \
+ --gradient_checkpointing=True \
+ --gradient_accumulation_steps=1 \
+ --group_by_length=False \
+ --evaluation_strategy="steps" \
+ --save_steps=10 \
+ --eval_steps=10 \
+ --max_eval_samples=10 \
+ --logging_steps=10 \
+ --fp16=True \
+ --load_best_model_at_end=True \
+ --metric_for_best_model="wer" \
+ --greater_is_better=False \
+ --report_to="tensorboard" \
+ --predict_with_generate=True \
+ --generation_max_length=225 \
+ --print_training_arguments=True \
+ --push_to_hub=True
+
+
+
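The comments at the top of run_test.nst say to reduce the batch size and learning rate when training on a smaller GPU. Below is a minimal sketch, not part of this commit, of what such a reduced-memory run could look like if the same flags are expressed as transformers Seq2SeqTrainingArguments in Python; all concrete values are hypothetical, chosen so that per_device_train_batch_size * gradient_accumulation_steps still gives 48 samples per optimizer step.

# Hypothetical reduced-memory variant of the settings in run_test.nst above:
# halve the per-device batch size and double gradient accumulation so the
# effective batch size (per_device * accumulation) stays at 48.
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="../whisper-test-delete",
    per_device_train_batch_size=24,      # assumption: 48 -> 24 for a ~24GB card
    per_device_eval_batch_size=24,
    gradient_accumulation_steps=2,       # 24 * 2 = 48 samples per optimizer step
    learning_rate=2e-5,                  # assumption: scaled down from 4e-5
    warmup_steps=5,
    max_steps=50,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    eval_steps=10,
    save_steps=10,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    predict_with_generate=True,
    generation_max_length=225,
    report_to="tensorboard",
)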
run_whisper_finetuning.py CHANGED
@@ -408,6 +408,9 @@ def main():
         model_args.model_name_or_path, language=model_args.language, task=model_args.task)
     data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
 
+    # Saving the processor and the tokenizer
+    processor.save_pretrained(training_args.output_dir)
+    tokenizer.save_pretrained(training_args.output_dir)
 
     # Prepare data
     # TODO The casting of the audio column is not working on the NPSC in 48K. It seems to be working for Common Voice
@@ -416,6 +419,7 @@
     # train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
     # eval_dataset = eval_dataset.cast_column("audio", Audio(sampling_rate=16000))
 
+
 
     # TODO I would really like to remove the non-needed columns here. At least this cleans up the output.
     # I am unable to figure out how to do this in streaming mode. Can not find a way to list columns.
@@ -425,7 +429,7 @@
     eval_dataset = eval_dataset.map(prepare_dataset)
 
     # Metrics
-    metric = evaluate.load("wer","cer")
+    metric = evaluate.load("wer")
 
     # Detecting last checkpoint.
     last_checkpoint = None
@@ -476,11 +480,8 @@
     # Num Epochs = 9223372036854775807
    # Instantaneous batch size per device = 48
 
-    # Saving the processor since we need it later
-    processor.save_pretrained(training_args.output_dir)
-
+
 
-    # TODO - I can not get the max_eval_steps to run directly. I am therefore including it here. Not very elegant, but it works.
     trainer = Seq2SeqTrainer(
         args=training_args,
         model=model,
@@ -501,24 +502,26 @@
     # TODO What does this do? Does this also mean we can load the state? Can this be done per checkpoint?
     trainer.save_state()
 
+    # TODO - Look closer into the model card writing.
+    # Write model card and (optionally) push to hub
+    config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
+    kwargs = {
+        "finetuned_from": model_args.model_name_or_path,
+        "tasks": "automatic-speech-recognition",
+        "tags": ["hf-asr-leaderboard", "automatic-speech-recognition", data_args.dataset_name],
+        "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
+        "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
+        "language": model_args.language,
+    }
+
     if training_args.push_to_hub:
         trainer.push_to_hub(**kwargs)
     else:
         trainer.create_model_card(**kwargs)
 
-    # TODO - Look closer into the model card writing.
-    # Write model card and (optionally) push to hub
-    config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
-    kwargs = {
-        "finetuned_from": model_args.model_name_or_path,
-        "tasks": "automatic-speech-recognition",
-        "tags": ["hf-asr-leaderboard", "automatic-speech-recognition", data_args.dataset_name],
-        "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
-        "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
-        "language": model_args.language,
-    }
-
-    return results
+
+
+    return train_result
 
 # XLA hook
 def _mp_fn(index):
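Two of the changes above interact: the processor and tokenizer are now saved into output_dir as soon as they are created, and the metric is loaded with evaluate.load("wer") only. The sketch below shows how these pieces are typically wired into a Seq2SeqTrainer metrics hook for Whisper; the reload from output_dir and the compute_metrics function are illustrative assumptions, not code from this commit.

import evaluate
from transformers import WhisperProcessor

# Illustrative sketch: reload what the commit now saves into output_dir
# and compute WER the way a Seq2SeqTrainer metrics hook usually does.
processor = WhisperProcessor.from_pretrained("../whisper-test-delete")
metric = evaluate.load("wer")

def compute_metrics(pred):
    # pred.predictions holds generated token ids (predict_with_generate=True);
    # label positions padded with -100 must be restored to the pad token id
    # before decoding.
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Word error rate as a percentage (lower is better, matching
    # greater_is_better=False in the launch script).
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}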