update

- run_test.nst +41 -0
- run_whisper_finetuning.py +21 -18
run_test.nst
ADDED
@@ -0,0 +1,41 @@
+# Whisper Finetuning script for the NST dataset
+# Currently for training on a 48GB GPU
+# Reduce batch size and learning rate if training on a smaller GPU
+
+python run_whisper_finetuning.py \
+    --model_name_or_path="openai/whisper-small" \
+    --output_dir="../whisper-test-delete" \
+    --overwrite_output_dir=True \
+    --language="Norwegian" \
+    --task="transcribe" \
+    --dataset_name="NbAiLab/NST" \
+    --dataset_config="no-close" \
+    --do_train=True \
+    --do_eval=True \
+    --audio_column_name="audio" \
+    --text_column_name="text" \
+    --per_device_train_batch_size=48 \
+    --per_device_eval_batch_size=48 \
+    --learning_rate=4e-5 \
+    --warmup_steps=5 \
+    --max_steps=50 \
+    --gradient_checkpointing=True \
+    --gradient_accumulation_steps=1 \
+    --group_by_length=False \
+    --evaluation_strategy="steps" \
+    --save_steps=10 \
+    --eval_steps=10 \
+    --max_eval_samples=10 \
+    --logging_steps=10 \
+    --fp16=True \
+    --load_best_model_at_end=True \
+    --metric_for_best_model="wer" \
+    --greater_is_better=False \
+    --report_to="tensorboard" \
+    --predict_with_generate=True \
+    --generation_max_length=225 \
+    --print_training_arguments=True \
+    --push_to_hub=True
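
Note: the smaller-GPU advice in the header can be made concrete through the effective batch size. A minimal sketch of the relationship, using the values above and assuming a single GPU:

# Effective batch size = per-device batch size * gradient accumulation * number of GPUs.
per_device_train_batch_size = 48  # value from run_test.nst
gradient_accumulation_steps = 1   # value from run_test.nst
num_gpus = 1                      # assumption: one 48GB card

effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_gpus
print(effective_batch_size)  # 48

# On a smaller card, e.g. 12 per device with 4 accumulation steps would keep
# the same effective batch size of 48 (illustrative numbers, not from the script).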
run_whisper_finetuning.py
CHANGED
@@ -408,6 +408,9 @@ def main():
         model_args.model_name_or_path, language=model_args.language, task=model_args.task)
     data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
 
+    # Saving the processor and the tokenizer
+    processor.save_pretrained(training_args.output_dir)
+    tokenizer.save_pretrained(training_args.output_dir)
 
     # Prepare data
     # TODO The casting of the audio is not working on the NPSC in 48K. It seems to be working for Common Voice
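
Saving the processor and tokenizer next to the checkpoints means downstream code can reload everything from the output directory. A minimal sketch, assuming the --output_dir from run_test.nst:

from transformers import WhisperProcessor, WhisperTokenizer

# Reload what was saved above; the directory is the script's --output_dir.
processor = WhisperProcessor.from_pretrained("../whisper-test-delete")
tokenizer = WhisperTokenizer.from_pretrained("../whisper-test-delete")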
@@ -416,6 +419,7 @@ def main():
     # train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
     # eval_dataset = eval_dataset.cast_column("audio", Audio(sampling_rate=16000))
 
 
+
     # TODO I would really like to remove the columns that are not needed here. At least this cleans up the output.
     # I am unable to figure out how to do this in streaming mode. Can not find a way to list columns.
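
Both TODOs above have plausible workarounds in the datasets library. A hedged sketch, reusing train_dataset and prepare_dataset from this script; the peek-at-first-example trick is an assumption about streaming mode, not something this commit does:

from datasets import Audio

# Resampling: cast_column also works on streaming (iterable) datasets;
# decoding then happens per sample at 16 kHz.
train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))

# Column listing: in streaming mode column_names may be None, but peeking
# at the first example reveals the keys so they can be removed in .map().
column_names = list(next(iter(train_dataset)).keys())
train_dataset = train_dataset.map(prepare_dataset, remove_columns=column_names)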
@@ -425,7 +429,7 @@ def main():
     eval_dataset = eval_dataset.map(prepare_dataset)
 
     # Metrics
-    metric = evaluate.load("wer"
+    metric = evaluate.load("wer")
 
     # Detecting last checkpoint.
     last_checkpoint = None
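
For context, the metric loaded here is used roughly as follows; a minimal sketch with invented example strings:

import evaluate

metric = evaluate.load("wer")

# WER = word-level edit operations / number of words in the reference.
wer = metric.compute(
    predictions=["det var en gang"],          # invented hypothesis
    references=["det var en gang en mann"],   # invented reference
)
print(wer)  # two deletions over six reference words -> ~0.33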
@@ -476,11 +480,8 @@ def main():
     # Num Epochs = 9223372036854775807
     # Instantaneous batch size per device = 48
 
-
-    processor.save_pretrained(training_args.output_dir)
-
+
 
-    # TODO - I can not get the max_eval_steps to run directly. I am therefore including it here. Not very elegant, but it works.
     trainer = Seq2SeqTrainer(
         args=training_args,
         model=model,
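
The Seq2SeqTrainer call continues past this hunk's context window. In the upstream speech-recognition examples the remaining arguments are typically the datasets, data collator and metric function; a sketch under that assumption, not the exact contents of this file:

# Sketch only: argument list assumed from the upstream HF examples.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)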
@@ -501,24 +502,26 @@ def main():
     # TODO What does this do? Does this also mean we can load the state? Can this be done per checkpoint?
     trainer.save_state()
 
+    # TODO - Look closer into the model card writing.
+    # Write model card and (optionally) push to hub
+    config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
+    kwargs = {
+        "finetuned_from": model_args.model_name_or_path,
+        "tasks": "automatic-speech-recognition",
+        "tags": ["hf-asr-leaderboard", "automatic-speech-recognition", data_args.dataset_name],
+        "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
+        "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
+        "language": model_args.language,
+    }
+
     if training_args.push_to_hub:
         trainer.push_to_hub(**kwargs)
     else:
         trainer.create_model_card(**kwargs)
 
-
-
-
-    kwargs = {
-        "finetuned_from": model_args.model_name_or_path,
-        "tasks": "automatic-speech-recognition",
-        "tags": ["hf-asr-leaderboard", "automatic-speech-recognition", data_args.dataset_name],
-        "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
-        "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
-        "language": model_args.language,
-    }
-
-    return results
+
+
+    return train_result
 
 # XLA hook
 def _mp_fn(index):
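
The relocation matters because the old code built kwargs only after trainer.push_to_hub(**kwargs) had already referenced it. With the values from run_test.nst, the dict would evaluate to roughly the following; the split names are assumptions, and the "na" fallback does not trigger because a config is given:

kwargs = {
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
    "tags": ["hf-asr-leaderboard", "automatic-speech-recognition", "NbAiLab/NST"],
    # split names below are assumptions, not read from the script
    "dataset_args": "Config: no-close, Training split: train, Eval split: validation",
    "dataset": "NBAILAB/NST - NO-CLOSE",
    "language": "Norwegian",
}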