versae committed on
Commit
4aa9eee
1 Parent(s): fc1bc61

Update run_speech_recognition_ctc.py

Browse files
Files changed (1) hide show
  1. run_speech_recognition_ctc.py +23 -6
run_speech_recognition_ctc.py CHANGED
@@ -393,6 +393,20 @@ def main():
393
 
394
  # Pre-processing dataset
395
  import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396
  def filter_inaudible(entry):
397
  return not re.search("\d|<inaudible>", entry["text"], flags=re.IGNORECASE)
398
 
@@ -400,7 +414,7 @@ def main():
400
  return re.search("nb-no", entry["sentence_language_code"], flags=re.IGNORECASE)
401
 
402
  def filter_tooshort(entry):
403
- print(f"The audio sample ({entry["audio"]["path"]}) is too small, and has been omitted. "
404
  return len(entry["text"]) >= len(entry["audio"]["array"] // 320 and len(entry["text"].strip()) >= 3
405
 
406
  def map_dataset(entry):
@@ -432,7 +446,7 @@ def main():
432
  split=data_args.train_split_name,
433
  use_auth_token=data_args.use_auth_token,
434
  )
435
- raw_datasets["train"] = raw_datasets["train"].filter(filter_inaudible).filter(filter_nynorsk).filter(filter_tooshort)
436
  raw_datasets["train"] = raw_datasets["train"].map(map_dataset)
437
 
438
  if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -459,7 +473,7 @@ def main():
459
  split=data_args.eval_split_name,
460
  use_auth_token=data_args.use_auth_token,
461
  )
462
- raw_datasets["eval"] = raw_datasets["eval"].filter(filter_inaudible).filter(filter_nynorsk).filter(filter_tooshort)
463
  raw_datasets["eval"] = raw_datasets["eval"].map(map_dataset)
464
 
465
  if data_args.max_eval_samples is not None:
@@ -470,9 +484,11 @@ def main():
470
  # that make training complicated and do not help in transcribing the speech
471
  # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
472
  # that could be easily picked up by the model
473
- chars_to_ignore_regex = (
474
- f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
475
- )
 
 
476
  text_column_name = data_args.text_column_name
477
 
478
  def remove_special_characters(batch):
@@ -570,6 +586,7 @@ def main():
570
  "gradient_checkpointing": training_args.gradient_checkpointing,
571
  "layerdrop": model_args.layerdrop,
572
  "ctc_loss_reduction": model_args.ctc_loss_reduction,
 
573
  "pad_token_id": tokenizer.pad_token_id,
574
  "vocab_size": len(tokenizer),
575
  "activation_dropout": model_args.activation_dropout,
 
393
 
394
  # Pre-processing dataset
395
  import re
396
def filter_numeric(entry):
    """Keep only dataset entries whose transcript contains no ASCII digits.

    Args:
        entry: a dataset row with a ``"text"`` transcript field.

    Returns:
        True when ``entry["text"]`` contains none of the characters 0-9.

    Bug fix: the original body tested ``batch["text"]`` while the
    parameter is named ``entry``, so every call raised NameError.
    The ten chained ``and`` clauses are collapsed into one check.
    """
    return not any(digit in entry["text"] for digit in "0123456789")
410
def filter_inaudible(entry):
    """Drop entries whose transcript has a digit or an ``<inaudible>`` tag.

    Args:
        entry: a dataset row with a ``"text"`` transcript field.

    Returns:
        True when the transcript contains neither a digit nor the tag
        (match is case-insensitive).

    Fix: the pattern is now a raw string — in a plain literal the digit
    class is an invalid escape sequence (SyntaxWarning since Python 3.12).
    NOTE(review): the digit test here makes a separate numeric filter
    redundant — confirm whether both are intended.
    """
    return not re.search(r"\d|<inaudible>", entry["text"], flags=re.IGNORECASE)
412
 
 
414
  return re.search("nb-no", entry["sentence_language_code"], flags=re.IGNORECASE)
415
 
416
def filter_tooshort(entry):
    """Require the transcript to be plausibly long enough for its audio.

    At a 320-sample hop the audio yields ``len(array) // 320`` frames;
    the transcript must supply at least one character per frame and be
    at least 3 characters long after stripping whitespace.

    Args:
        entry: a dataset row with ``"text"`` and ``"audio"]["array"]``.

    Returns:
        True when the entry passes both length checks.

    Bug fix: the original line was missing the closing parenthesis after
    ``entry["audio"]["array"]`` (a SyntaxError) and thereby applied
    ``// 320`` to the array instead of to its length; the broken
    commented-out print (unbalanced nested quotes) is removed.
    """
    min_chars = len(entry["audio"]["array"]) // 320
    return len(entry["text"]) >= min_chars and len(entry["text"].strip()) >= 3
419
 
420
  def map_dataset(entry):
 
446
  split=data_args.train_split_name,
447
  use_auth_token=data_args.use_auth_token,
448
  )
449
+ raw_datasets["train"] = raw_datasets["train"].filter(filter_numeric).filter(filter_inaudible).filter(filter_nynorsk).filter(filter_tooshort)
450
  raw_datasets["train"] = raw_datasets["train"].map(map_dataset)
451
 
452
  if data_args.audio_column_name not in raw_datasets["train"].column_names:
 
473
  split=data_args.eval_split_name,
474
  use_auth_token=data_args.use_auth_token,
475
  )
476
+ raw_datasets["eval"] = raw_datasets["eval"].filter(filter_numeric).filter(filter_inaudible).filter(filter_nynorsk).filter(filter_tooshort)
477
  raw_datasets["eval"] = raw_datasets["eval"].map(map_dataset)
478
 
479
  if data_args.max_eval_samples is not None:
 
484
  # that make training complicated and do not help in transcribing the speech
485
  # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
486
  # that could be easily picked up by the model
487
+ #chars_to_ignore_regex = (
488
+ # f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
489
+ #)
490
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/]'
491
+
492
  text_column_name = data_args.text_column_name
493
 
494
  def remove_special_characters(batch):
 
586
  "gradient_checkpointing": training_args.gradient_checkpointing,
587
  "layerdrop": model_args.layerdrop,
588
  "ctc_loss_reduction": model_args.ctc_loss_reduction,
589
+ "ctc_zero_infinity": training_args.ctc_zero_infinity,
590
  "pad_token_id": tokenizer.pad_token_id,
591
  "vocab_size": len(tokenizer),
592
  "activation_dropout": model_args.activation_dropout,