Update run_speech_recognition_ctc.py
run_speech_recognition_ctc.py
CHANGED
@@ -393,6 +393,20 @@ def main():
 
     # Pre-processing dataset
     import re
+    def filter_numeric(entry):
+        return (
+            "0" not in entry["text"]
+            and "1" not in entry["text"]
+            and "2" not in entry["text"]
+            and "3" not in entry["text"]
+            and "4" not in entry["text"]
+            and "5" not in entry["text"]
+            and "6" not in entry["text"]
+            and "7" not in entry["text"]
+            and "8" not in entry["text"]
+            and "9" not in entry["text"]
+        )
+
     def filter_inaudible(entry):
         return not re.search(r"\d|<inaudible>", entry["text"], flags=re.IGNORECASE)
 
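Note: filter_numeric rejects any sample whose transcript contains a digit; spelled-out numbers have no single canonical spelling, so digits make noisy targets for a character-level CTC vocabulary. Since filter_inaudible already matches \d in its pattern, the digit check is duplicated, and chaining both is harmless but redundant. A regex-based equivalent would be more compact (a sketch, not part of the commit):

    import re

    def filter_numeric(entry):
        # Keep only entries whose transcript contains no digits 0-9.
        return re.search(r"[0-9]", entry["text"]) is None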
@@ -400,7 +414,7 @@ def main():
         return re.search("nb-no", entry["sentence_language_code"], flags=re.IGNORECASE)
 
     def filter_tooshort(entry):
-        print(f"The audio sample ({entry['audio']['path']}) is too small, and has been omitted. ")
+        #print(f"The audio sample ({entry['audio']['path']}) is too small, and has been omitted. ")
         return len(entry["text"]) <= len(entry["audio"]["array"]) // 320 and len(entry["text"].strip()) >= 3
 
     def map_dataset(entry):
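Note: the // 320 divisor reflects wav2vec2's convolutional feature extractor, which emits roughly one output frame per 320 input samples at 16 kHz. CTC loss is infinite whenever the label sequence is longer than the frame sequence, so the filter keeps a sample only if the audio yields at least as many frames as the transcript has characters, and the stripped transcript has at least 3 characters. The same check with the intent spelled out (an illustrative sketch; the constant name is an assumption):

    SAMPLES_PER_FRAME = 320  # approximate wav2vec2 downsampling factor at 16 kHz

    def filter_tooshort(entry):
        n_frames = len(entry["audio"]["array"]) // SAMPLES_PER_FRAME
        enough_frames = n_frames >= len(entry["text"])  # CTC requires frames >= labels
        enough_text = len(entry["text"].strip()) >= 3
        return enough_frames and enough_text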
@@ -432,7 +446,7 @@ def main():
             split=data_args.train_split_name,
             use_auth_token=data_args.use_auth_token,
         )
-        raw_datasets["train"] = raw_datasets["train"].filter(filter_inaudible).filter(filter_nynorsk).filter(filter_tooshort)
+        raw_datasets["train"] = raw_datasets["train"].filter(filter_numeric).filter(filter_inaudible).filter(filter_nynorsk).filter(filter_tooshort)
         raw_datasets["train"] = raw_datasets["train"].map(map_dataset)
 
         if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -459,7 +473,7 @@ def main():
             split=data_args.eval_split_name,
             use_auth_token=data_args.use_auth_token,
         )
-        raw_datasets["eval"] = raw_datasets["eval"].filter(filter_inaudible).filter(filter_nynorsk).filter(filter_tooshort)
+        raw_datasets["eval"] = raw_datasets["eval"].filter(filter_numeric).filter(filter_inaudible).filter(filter_nynorsk).filter(filter_tooshort)
         raw_datasets["eval"] = raw_datasets["eval"].map(map_dataset)
 
         if data_args.max_eval_samples is not None:
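Note: Dataset.filter returns a new datasets.Dataset, so the four calls chain left to right, and the same chain is applied to the train and eval splits so both see identical preprocessing. Each .filter() is a separate pass over the data; if preprocessing time matters, the predicates can be folded into one pass (a sketch, not part of the commit):

    def keep_entry(entry):
        # Single-pass conjunction of the four filters above.
        return (
            filter_numeric(entry)
            and filter_inaudible(entry)
            and filter_nynorsk(entry)
            and filter_tooshort(entry)
        )

    raw_datasets["train"] = raw_datasets["train"].filter(keep_entry)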
@@ -470,9 +484,11 @@ def main():
     # that make training complicated and do not help in transcribing the speech
     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
     # that could be easily picked up by the model
-    chars_to_ignore_regex = (
-        f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
-    )
+    #chars_to_ignore_regex = (
+    #    f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
+    #)
+    chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/]'
+
     text_column_name = data_args.text_column_name
 
     def remove_special_characters(batch):
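Note: the commit swaps the CLI-driven --chars_to_ignore character class for a hard-coded one that also covers curly quotes, the en dash, and the Unicode replacement character (�). In the stock Transformers example this regex is applied inside remove_special_characters with re.sub, roughly like this (a sketch of the usual pattern, not lines from this diff):

    def remove_special_characters(batch):
        if chars_to_ignore_regex is not None:
            batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
        else:
            batch["target_text"] = batch[text_column_name].lower() + " "
        return batch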
@@ -570,6 +586,7 @@ def main():
         "gradient_checkpointing": training_args.gradient_checkpointing,
         "layerdrop": model_args.layerdrop,
         "ctc_loss_reduction": model_args.ctc_loss_reduction,
+        "ctc_zero_infinity": training_args.ctc_zero_infinity,
         "pad_token_id": tokenizer.pad_token_id,
         "vocab_size": len(tokenizer),
         "activation_dropout": model_args.activation_dropout,
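Note: ctc_zero_infinity is a Wav2Vec2Config flag that zeroes out infinite CTC losses instead of letting them produce NaN gradients; infinities arise exactly in the labels-longer-than-frames case that filter_tooshort guards against, so this is a belt-and-braces setting. It is read from training_args here, which only works if the script's argument dataclass defines that field. In the stock script the kwargs dict is applied via config.update(...) before the model is built, roughly like this (a sketch with a hypothetical checkpoint name):

    from transformers import AutoConfig, AutoModelForCTC

    config = AutoConfig.from_pretrained("facebook/wav2vec2-xls-r-300m")  # hypothetical checkpoint
    config.update({"ctc_loss_reduction": "mean", "ctc_zero_infinity": True})
    model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-xls-r-300m", config=config)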