ivanlau commited on
Commit
eaab83b
1 Parent(s): a82947a

Training in progress, step 60

Browse files
.ipynb_checkpoints/eval-checkpoint.py CHANGED
@@ -1,6 +1,7 @@
1
  #!/usr/bin/env python3
2
  import argparse
3
  import re
 
4
  from typing import Dict
5
 
6
  import torch
@@ -50,7 +51,15 @@ def log_results(result: Dataset, args: Dict[str, str]):
50
  def normalize_text(text: str) -> str:
51
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
52
 
53
- chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
 
 
 
 
 
 
 
 
54
 
55
  text = re.sub(chars_to_ignore_regex, "", text.lower())
56
 
@@ -59,7 +68,16 @@ def normalize_text(text: str) -> str:
59
  token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
60
 
61
  for t in token_sequences_to_ignore:
62
- text = " ".join(text.split(t))
 
 
 
 
 
 
 
 
 
63
 
64
  return text
65
 
1
  #!/usr/bin/env python3
2
  import argparse
3
  import re
4
+ import string
5
  from typing import Dict
6
 
7
  import torch
51
  def normalize_text(text: str) -> str:
52
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
53
 
54
+ chars_to_ignore = [
55
+ ",", "?", "¿", ".", "!", "¡", ";", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
56
+ "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
57
+ "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
58
+ "、", "﹂", "﹁", "‧", "~", "﹏", ",", "{", "}", "(", ")", "[", "]", "【", "】", "‥", "〽",
59
+ "『", "』", "〝", "〟", "⟨", "⟩", "〜", ":", "!", "?", "♪", "؛", "/", "\\", "º", "−", "^", "ʻ", "ˆ"
60
+ ] # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
61
+
62
+ chars_to_ignore_regex = f'[{"".join(chars_to_ignore)}]'
63
 
64
  text = re.sub(chars_to_ignore_regex, "", text.lower())
65
 
68
  token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
69
 
70
  for t in token_sequences_to_ignore:
71
+ text = "".join(text.split(t))
72
+
73
+ # convert 'D' and 'd' to '啲' if there is a 'D' in the sentence
74
+ # hacky: won't work when 'D'/'d' co-occurs with normal English words
75
+ # won't work on multiple 'D's
76
+ if "d" in text:
77
+ if len([c for c in text if c in string.ascii_lowercase]) == 1:
78
+ text = text.replace("d", "啲")
79
+
80
+ text += ' '
81
 
82
  return text
83
 
eval.py CHANGED
@@ -1,6 +1,7 @@
1
  #!/usr/bin/env python3
2
  import argparse
3
  import re
 
4
  from typing import Dict
5
 
6
  import torch
@@ -50,7 +51,15 @@ def log_results(result: Dataset, args: Dict[str, str]):
50
  def normalize_text(text: str) -> str:
51
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
52
 
53
- chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
 
 
 
 
 
 
 
 
54
 
55
  text = re.sub(chars_to_ignore_regex, "", text.lower())
56
 
@@ -59,7 +68,16 @@ def normalize_text(text: str) -> str:
59
  token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
60
 
61
  for t in token_sequences_to_ignore:
62
- text = " ".join(text.split(t))
 
 
 
 
 
 
 
 
 
63
 
64
  return text
65
 
1
  #!/usr/bin/env python3
2
  import argparse
3
  import re
4
+ import string
5
  from typing import Dict
6
 
7
  import torch
51
  def normalize_text(text: str) -> str:
52
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
53
 
54
+ chars_to_ignore = [
55
+ ",", "?", "¿", ".", "!", "¡", ";", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
56
+ "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
57
+ "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
58
+ "、", "﹂", "﹁", "‧", "~", "﹏", ",", "{", "}", "(", ")", "[", "]", "【", "】", "‥", "〽",
59
+ "『", "』", "〝", "〟", "⟨", "⟩", "〜", ":", "!", "?", "♪", "؛", "/", "\\", "º", "−", "^", "ʻ", "ˆ"
60
+ ] # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
61
+
62
+ chars_to_ignore_regex = f'[{"".join(chars_to_ignore)}]'
63
 
64
  text = re.sub(chars_to_ignore_regex, "", text.lower())
65
 
68
  token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
69
 
70
  for t in token_sequences_to_ignore:
71
+ text = "".join(text.split(t))
72
+
73
+ # convert 'D' and 'd' to '啲' if there is a 'D' in the sentence
74
+ # hacky: won't work when 'D'/'d' co-occurs with normal English words
75
+ # won't work on multiple 'D's
76
+ if "d" in text:
77
+ if len([c for c in text if c in string.ascii_lowercase]) == 1:
78
+ text = text.replace("d", "啲")
79
+
80
+ text += ' '
81
 
82
  return text
83
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58eaf79862b306678e5a5aa8d46dba49938a3302217e925245c6f674565ea0c5
3
  size 1278024433
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cf9654583b75dea424b875769f5c205aadeff4ef6f019a7717d32a2d023c8d6
3
  size 1278024433