Update run_speech_recognition_ctc.py
Browse files- run_speech_recognition_ctc.py +15 -13
run_speech_recognition_ctc.py
CHANGED
@@ -125,7 +125,9 @@ class ModelArguments:
|
|
125 |
ctc_loss_reduction: Optional[str] = field(
|
126 |
default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
|
127 |
)
|
128 |
-
|
|
|
|
|
129 |
|
130 |
@dataclass
|
131 |
class DataTrainingArguments:
|
@@ -395,16 +397,16 @@ def main():
|
|
395 |
import re
|
396 |
def filter_numeric(entry):
|
397 |
return (
|
398 |
-
"0" not in
|
399 |
-
and "1" not in
|
400 |
-
and "2" not in
|
401 |
-
and "3" not in
|
402 |
-
and "4" not in
|
403 |
-
and "5" not in
|
404 |
-
and "6" not in
|
405 |
-
and "7" not in
|
406 |
-
and "8" not in
|
407 |
-
and "9" not in
|
408 |
)
|
409 |
|
410 |
def filter_inaudible(entry):
|
@@ -415,7 +417,7 @@ def main():
|
|
415 |
|
416 |
def filter_tooshort(entry):
|
417 |
#print(f"The audio sample ({entry["audio"]["path"]}) is too small, and has been omitted. "
|
418 |
-
return len(entry["text"])
|
419 |
|
420 |
def map_dataset(entry):
|
421 |
batch = {"text": entry["text"].lower()}
|
@@ -586,7 +588,7 @@ def main():
|
|
586 |
"gradient_checkpointing": training_args.gradient_checkpointing,
|
587 |
"layerdrop": model_args.layerdrop,
|
588 |
"ctc_loss_reduction": model_args.ctc_loss_reduction,
|
589 |
-
"ctc_zero_infinity":
|
590 |
"pad_token_id": tokenizer.pad_token_id,
|
591 |
"vocab_size": len(tokenizer),
|
592 |
"activation_dropout": model_args.activation_dropout,
|
|
|
125 |
ctc_loss_reduction: Optional[str] = field(
|
126 |
default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
|
127 |
)
|
128 |
+
ctc_zero_infinity: Optional[bool] = field(
|
129 |
+
default=False, metadata={"help": "If True, will try to avoid the CTC loss going to infinity."}
|
130 |
+
)
|
131 |
|
132 |
@dataclass
|
133 |
class DataTrainingArguments:
|
|
|
397 |
import re
|
398 |
def filter_numeric(entry):
|
399 |
return (
|
400 |
+
"0" not in entry["text"]
|
401 |
+
and "1" not in entry["text"]
|
402 |
+
and "2" not in entry["text"]
|
403 |
+
and "3" not in entry["text"]
|
404 |
+
and "4" not in entry["text"]
|
405 |
+
and "5" not in entry["text"]
|
406 |
+
and "6" not in entry["text"]
|
407 |
+
and "7" not in entry["text"]
|
408 |
+
and "8" not in entry["text"]
|
409 |
+
and "9" not in entry["text"]
|
410 |
)
|
411 |
|
412 |
def filter_inaudible(entry):
|
|
|
417 |
|
418 |
def filter_tooshort(entry):
|
419 |
#print(f"The audio sample ({entry["audio"]["path"]}) is too small, and has been omitted. "
|
420 |
+
return (len(entry["text"]) <= len(entry["audio"]["array"]) // 320) and (len(entry["text"].strip()) >= 3)
|
421 |
|
422 |
def map_dataset(entry):
|
423 |
batch = {"text": entry["text"].lower()}
|
|
|
588 |
"gradient_checkpointing": training_args.gradient_checkpointing,
|
589 |
"layerdrop": model_args.layerdrop,
|
590 |
"ctc_loss_reduction": model_args.ctc_loss_reduction,
|
591 |
+
"ctc_zero_infinity": model_args.ctc_zero_infinity,
|
592 |
"pad_token_id": tokenizer.pad_token_id,
|
593 |
"vocab_size": len(tokenizer),
|
594 |
"activation_dropout": model_args.activation_dropout,
|