fixed bug
Browse files- run_test.sh +1 -1
- run_whisper_finetuning.py +30 -30
run_test.sh
CHANGED
@@ -4,7 +4,7 @@
|
|
4 |
|
5 |
python run_whisper_finetuning.py \
|
6 |
--model_name_or_path="openai/whisper-small" \
|
7 |
-
--output_dir="../whisper-test-delete3" \
|
8 |
--overwrite_output_dir=True \
|
9 |
--language="Norwegian" \
|
10 |
--task="transcribe" \
|
|
|
4 |
|
5 |
python run_whisper_finetuning.py \
|
6 |
--model_name_or_path="openai/whisper-small" \
|
7 |
+
--output_dir="../whisper-test-delete3" \
|
8 |
--overwrite_output_dir=True \
|
9 |
--language="Norwegian" \
|
10 |
--task="transcribe" \
|
run_whisper_finetuning.py
CHANGED
@@ -92,7 +92,7 @@ class Seq2SeqTrainingArguments(TrainingArguments):
|
|
92 |
)
|
93 |
},
|
94 |
)
|
95 |
-
|
96 |
|
97 |
@dataclass
|
98 |
class ModelArguments:
|
@@ -340,10 +340,6 @@ def main():
|
|
340 |
parser = HfArgumentParser(
|
341 |
(ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
|
342 |
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
|
343 |
-
|
344 |
-
#Debug
|
345 |
-
import torch_xla.debug.metrics as met
|
346 |
-
print(met.metrics_report())
|
347 |
|
348 |
# Metrics
|
349 |
|
@@ -390,14 +386,14 @@ def main():
|
|
390 |
feats[new_name] = feats.pop(old_name)
|
391 |
ds.info.features = feats
|
392 |
return ds
|
393 |
-
|
394 |
def remove_columns(ds, column_name):
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
# Print training arguments
|
402 |
if data_args.print_training_arguments:
|
403 |
print_training_arguments(model_args, data_args, training_args)
|
@@ -409,12 +405,12 @@ def main():
|
|
409 |
|
410 |
# Rename columns
|
411 |
if data_args.audio_column_name != "audio":
|
412 |
-
train_dataset = rename_column(
|
413 |
-
eval_dataset = rename_column(
|
414 |
|
415 |
if data_args.text_column_name != "sentence":
|
416 |
-
train_dataset = rename_column(
|
417 |
-
eval_dataset = rename_column(
|
418 |
|
419 |
|
420 |
# Initialise
|
@@ -429,23 +425,27 @@ def main():
|
|
429 |
# Saving the processor and the tokenizer
|
430 |
processor.save_pretrained(training_args.output_dir)
|
431 |
tokenizer.save_pretrained(training_args.output_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
432 |
|
433 |
|
434 |
-
#
|
435 |
-
|
436 |
-
|
|
|
|
|
|
|
|
|
437 |
|
438 |
-
|
439 |
-
#
|
440 |
-
|
441 |
|
442 |
-
for c in column_names:
|
443 |
-
if c not in ["audio", "sentence"]:
|
444 |
-
print(f"removing {c}")
|
445 |
-
train_dataset = remove_columns(train_dataset, c)
|
446 |
-
eval_dataset = remove_columns(eval_dataset, c)
|
447 |
-
|
448 |
-
# Prepare dataset
|
449 |
train_dataset = train_dataset.map(prepare_dataset)
|
450 |
eval_dataset = eval_dataset.map(prepare_dataset)
|
451 |
|
@@ -502,7 +502,7 @@ def main():
|
|
502 |
# Instantaneous batch size per device = 48
|
503 |
|
504 |
|
505 |
-
|
506 |
trainer = Seq2SeqTrainer(
|
507 |
args=training_args,
|
508 |
model=model,
|
|
|
92 |
)
|
93 |
},
|
94 |
)
|
95 |
+
|
96 |
|
97 |
@dataclass
|
98 |
class ModelArguments:
|
|
|
340 |
parser = HfArgumentParser(
|
341 |
(ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
|
342 |
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
|
|
|
|
|
|
|
|
|
343 |
|
344 |
# Metrics
|
345 |
|
|
|
386 |
feats[new_name] = feats.pop(old_name)
|
387 |
ds.info.features = feats
|
388 |
return ds
|
389 |
+
|
390 |
def remove_columns(ds, column_name):
    """Remove *column_name* from a dataset and keep ``info.features`` in sync.

    ``IterableDataset.remove_columns`` drops the column's data but can leave
    the ``info.features`` schema stale — or the schema may be ``None``
    altogether in streaming mode (see the TODO comments around the call
    sites) — so the schema is patched by hand here.

    Args:
        ds: dataset-like object exposing ``.info.features`` and
            ``.remove_columns(name)`` (e.g. a ``datasets`` dataset).
        column_name: name of the column to drop.

    Returns:
        The dataset returned by ``ds.remove_columns`` with its
        ``info.features`` mapping updated to no longer list *column_name*.
    """
    feats = ds.info.features
    ds = ds.remove_columns(column_name)
    if feats is not None:
        # Copy before mutating: the previous implementation popped the key
        # from the ORIGINAL dataset's features dict in place, silently
        # corrupting the source dataset's schema as a side effect.
        feats = feats.copy()
        # Tolerate a column that is present in the data but missing from the
        # schema (streaming schemas are unreliable per the commit comments).
        feats.pop(column_name, None)
        ds.info.features = feats
    return ds
|
396 |
+
|
397 |
# Print training arguments
|
398 |
if data_args.print_training_arguments:
|
399 |
print_training_arguments(model_args, data_args, training_args)
|
|
|
405 |
|
406 |
# Rename columns
|
407 |
if data_args.audio_column_name != "audio":
|
408 |
+
train_dataset = train_dataset.rename_column(data_args.audio_column_name, "audio")
|
409 |
+
eval_dataset = eval_dataset.rename_column(data_args.audio_column_name, "audio")
|
410 |
|
411 |
if data_args.text_column_name != "sentence":
|
412 |
+
train_dataset = train_dataset.rename_column(data_args.text_column_name, "sentence")
|
413 |
+
eval_dataset = eval_dataset.rename_column(data_args.text_column_name, "sentence")
|
414 |
|
415 |
|
416 |
# Initialise
|
|
|
425 |
# Saving the processor and the tokenizer
|
426 |
processor.save_pretrained(training_args.output_dir)
|
427 |
tokenizer.save_pretrained(training_args.output_dir)
|
428 |
+
|
429 |
+
# Prepare data
|
430 |
+
# TODO The casting of the audio column is not working on the NPSC in 48K. It seems to be working for Common Voice
|
431 |
+
# The issue is that the dataset features returns None. But to me they seem to have been set correctly
|
432 |
+
# In our case this is not needed, since the dataset is already available at 16K. But it would be great to solve this bug
|
433 |
+
# train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
434 |
+
# eval_dataset = eval_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
435 |
|
436 |
|
437 |
+
# Remove non needed columns
|
438 |
+
#column_names=[x for x in train_dataset.info.features]
|
439 |
+
|
440 |
+
#for c in column_names:
|
441 |
+
# if c not in ["audio", "sentence"]:
|
442 |
+
# train_dataset = remove_columns(train_dataset, c)
|
443 |
+
# eval_dataset = remove_columns(eval_dataset, c)
|
444 |
|
445 |
+
# TODO I would really like to remove the non needed columns here. At least this cleans up the output.
|
446 |
+
# I am unable to figure out how to do this in streaming mode. Cannot find a way to list the columns.
|
447 |
+
# train_data = train_data.map(prepare_dataset, remove_columns=train_data.column_names, num_proc=1)
|
448 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
449 |
train_dataset = train_dataset.map(prepare_dataset)
|
450 |
eval_dataset = eval_dataset.map(prepare_dataset)
|
451 |
|
|
|
502 |
# Instantaneous batch size per device = 48
|
503 |
|
504 |
|
505 |
+
|
506 |
trainer = Seq2SeqTrainer(
|
507 |
args=training_args,
|
508 |
model=model,
|