Update run_speech_recognition_ctc.py
run_speech_recognition_ctc.py
CHANGED
@@ -393,6 +393,20 @@ def main():
 
     # Pre-processing dataset
     import re
+    def filter_numeric(entry):
+        return (
+            "0" not in entry["text"]
+            and "1" not in entry["text"]
+            and "2" not in entry["text"]
+            and "3" not in entry["text"]
+            and "4" not in entry["text"]
+            and "5" not in entry["text"]
+            and "6" not in entry["text"]
+            and "7" not in entry["text"]
+            and "8" not in entry["text"]
+            and "9" not in entry["text"]
+        )
+
     def filter_inaudible(entry):
         return not re.search(r"\d|<inaudible>", entry["text"], flags=re.IGNORECASE)
 
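Note: filter_numeric rejects any sample whose transcript contains a digit; spelled-out numbers have no single canonical spelling, so digits make noisy targets for a character-level CTC vocabulary. Since filter_inaudible already matches \d in its pattern, the digit check is duplicated, and chaining both is harmless but redundant. A regex-based equivalent would be more compact (a sketch, not part of the commit):

    import re

    def filter_numeric(entry):
        # Keep only entries whose transcript contains no digits 0-9.
        return re.search(r"[0-9]", entry["text"]) is None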
@@ -400,7 +414,7 @@ def main():
         return re.search("nb-no", entry["sentence_language_code"], flags=re.IGNORECASE)
 
     def filter_tooshort(entry):
-        print(f"The audio sample ({entry['audio']['path']}) is too small, and has been omitted. ")
+        #print(f"The audio sample ({entry['audio']['path']}) is too small, and has been omitted. ")
         return len(entry["text"]) <= len(entry["audio"]["array"]) // 320 and len(entry["text"].strip()) >= 3
 
     def map_dataset(entry):
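Note: the // 320 divisor reflects wav2vec2's convolutional feature extractor, which emits roughly one output frame per 320 input samples at 16 kHz. CTC loss is infinite whenever the label sequence is longer than the frame sequence, so the filter keeps a sample only if the audio yields at least as many frames as the transcript has characters, and the stripped transcript has at least 3 characters. The same check with the intent spelled out (an illustrative sketch; the constant name is an assumption):

    SAMPLES_PER_FRAME = 320  # approximate wav2vec2 downsampling factor at 16 kHz

    def filter_tooshort(entry):
        n_frames = len(entry["audio"]["array"]) // SAMPLES_PER_FRAME
        enough_frames = n_frames >= len(entry["text"])  # CTC requires frames >= labels
        enough_text = len(entry["text"].strip()) >= 3
        return enough_frames and enough_text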
@@ -432,7 +446,7 @@ def main():
             split=data_args.train_split_name,
             use_auth_token=data_args.use_auth_token,
         )
-        raw_datasets["train"] = raw_datasets["train"].filter(filter_inaudible).filter(filter_nynorsk).filter(filter_tooshort)
+        raw_datasets["train"] = raw_datasets["train"].filter(filter_numeric).filter(filter_inaudible).filter(filter_nynorsk).filter(filter_tooshort)
         raw_datasets["train"] = raw_datasets["train"].map(map_dataset)
 
         if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -459,7 +473,7 @@ def main():
             split=data_args.eval_split_name,
             use_auth_token=data_args.use_auth_token,
         )
-        raw_datasets["eval"] = raw_datasets["eval"].filter(filter_inaudible).filter(filter_nynorsk).filter(filter_tooshort)
+        raw_datasets["eval"] = raw_datasets["eval"].filter(filter_numeric).filter(filter_inaudible).filter(filter_nynorsk).filter(filter_tooshort)
         raw_datasets["eval"] = raw_datasets["eval"].map(map_dataset)
 
         if data_args.max_eval_samples is not None:
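Note: Dataset.filter returns a new datasets.Dataset, so the four calls chain left to right, and the same chain is applied to the train and eval splits so both see identical preprocessing. Each .filter() is a separate pass over the data; if preprocessing time matters, the predicates can be folded into one pass (a sketch, not part of the commit):

    def keep_entry(entry):
        # Single-pass conjunction of the four filters above.
        return (
            filter_numeric(entry)
            and filter_inaudible(entry)
            and filter_nynorsk(entry)
            and filter_tooshort(entry)
        )

    raw_datasets["train"] = raw_datasets["train"].filter(keep_entry)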
@@ -470,9 +484,11 @@ def main():
     # that make training complicated and do not help in transcribing the speech
     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
     # that could be easily picked up by the model
-    chars_to_ignore_regex = (
-        f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
-    )
+    #chars_to_ignore_regex = (
+    #    f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
+    #)
+    chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/]'
+
     text_column_name = data_args.text_column_name
 
     def remove_special_characters(batch):
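Note: the commit swaps the CLI-driven --chars_to_ignore character class for a hard-coded one that also covers curly quotes, the en dash, and the Unicode replacement character (�). In the stock Transformers example this regex is applied inside remove_special_characters with re.sub, roughly like this (a sketch of the usual pattern, not lines from this diff):

    def remove_special_characters(batch):
        if chars_to_ignore_regex is not None:
            batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
        else:
            batch["target_text"] = batch[text_column_name].lower() + " "
        return batch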
@@ -570,6 +586,7 @@ def main():
         "gradient_checkpointing": training_args.gradient_checkpointing,
         "layerdrop": model_args.layerdrop,
         "ctc_loss_reduction": model_args.ctc_loss_reduction,
+        "ctc_zero_infinity": training_args.ctc_zero_infinity,
         "pad_token_id": tokenizer.pad_token_id,
         "vocab_size": len(tokenizer),
         "activation_dropout": model_args.activation_dropout,
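Note: ctc_zero_infinity is a Wav2Vec2Config flag that zeroes out infinite CTC losses instead of letting them produce NaN gradients; infinities arise exactly in the labels-longer-than-frames case that filter_tooshort guards against, so this is a belt-and-braces setting. It is read from training_args here, which only works if the script's argument dataclass defines that field. In the stock script the kwargs dict is applied via config.update(...) before the model is built, roughly like this (a sketch with a hypothetical checkpoint name):

    from transformers import AutoConfig, AutoModelForCTC

    config = AutoConfig.from_pretrained("facebook/wav2vec2-xls-r-300m")  # hypothetical checkpoint
    config.update({"ctc_loss_reduction": "mean", "ctc_zero_infinity": True})
    model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-xls-r-300m", config=config)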