lucio
/

xls-r-uzbek-cv8

@@ -2,6 +2,8 @@
 import argparse
 import functools
 import re
 from typing import Dict
 from datasets import Audio, Dataset, DatasetDict, load_dataset, load_metric
@@ -50,9 +52,17 @@ def log_results(result: Dataset, args: Dict[str, str]):
 def normalize_text(text: str) -> str:
     """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
-    chars_to_ignore_regex = '[!"%,.:;?\\_|©«¬»،؛؟‒–—’“”„…‹›−☺♂�\\\\-]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
-    text = re.sub(chars_to_ignore_regex, "", text.lower())
     # In addition, we can normalize the target text, e.g. removing newline characters etc...
     # note that order is important here!
@@ -107,7 +117,7 @@ def main(args):
     dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
     # for testing: only process the first ten examples as a test
-    dataset = dataset.select(range(10))
     # load processor
     feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)

 import argparse
 import functools
 import re
+import string
+import unidecode
 from typing import Dict
 from datasets import Audio, Dataset, DatasetDict, load_dataset, load_metric
 def normalize_text(text: str) -> str:
     """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
+    chars_to_ignore_regex = f'[{re.escape(string.punctuation)}]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
+    text = re.sub(
+        chars_to_ignore_regex,
+        "",
+        re.sub("['`´]", "’",   # elsewhere probably meant as glottal stop
+               re.sub("([og])['`´]", "\g<1>‘",  # after o/g indicate modified char
+                      unidecode.unidecode(text).lower()
+                     )
+              )
+    ) + " "
     # In addition, we can normalize the target text, e.g. removing newline characters etc...
     # note that order is important here!
     dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
     # for testing: only process the first ten examples as a test
+    # dataset = dataset.select(range(10))
     # load processor
     feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)

mozilla-foundation_common_voice_8_0_uz_test_eval_results.txt ADDED Viewed