ivanlau commited on
Commit
eaab83b
1 Parent(s): a82947a

Training in progress, step 60

Browse files
.ipynb_checkpoints/eval-checkpoint.py CHANGED
@@ -1,6 +1,7 @@
1
  #!/usr/bin/env python3
2
  import argparse
3
  import re
 
4
  from typing import Dict
5
 
6
  import torch
@@ -50,7 +51,15 @@ def log_results(result: Dataset, args: Dict[str, str]):
50
  def normalize_text(text: str) -> str:
51
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
52
 
53
- chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
 
 
 
 
 
 
 
 
54
 
55
  text = re.sub(chars_to_ignore_regex, "", text.lower())
56
 
@@ -59,7 +68,16 @@ def normalize_text(text: str) -> str:
59
  token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
60
 
61
  for t in token_sequences_to_ignore:
62
- text = " ".join(text.split(t))
 
 
 
 
 
 
 
 
 
63
 
64
  return text
65
 
1
  #!/usr/bin/env python3
2
  import argparse
3
  import re
4
+ import string
5
  from typing import Dict
6
 
7
  import torch
51
  def normalize_text(text: str) -> str:
52
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
53
 
54
+ chars_to_ignore = [
55
+ ",", "?", "¿", ".", "!", "¡", ";", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
56
+ "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
57
+ "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
58
+ "、", "﹂", "﹁", "‧", "~", "﹏", ",", "{", "}", "(", ")", "[", "]", "【", "】", "‥", "〽",
59
+ "『", "』", "〝", "〟", "⟨", "⟩", "〜", ":", "!", "?", "♪", "؛", "/", "\\", "º", "−", "^", "ʻ", "ˆ"
60
+ ] # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
61
+
62
+ chars_to_ignore_regex = f'[{"".join(chars_to_ignore)}]'
63
 
64
  text = re.sub(chars_to_ignore_regex, "", text.lower())
65
 
68
  token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
69
 
70
  for t in token_sequences_to_ignore:
71
+ text = "".join(text.split(t))
72
+
73
+ # convert 'D' and 'd' to '啲' if there is a 'D' in the sentence
74
+ # hacky: won't work when 'D'/'d' co-occurs with normal English words
75
+ # won't work on multiple 'D's
76
+ if "d" in text:
77
+ if len([c for c in text if c in string.ascii_lowercase]) == 1:
78
+ text = text.replace("d", "啲")
79
+
80
+ text += ' '
81
 
82
  return text
83
 
eval.py CHANGED
@@ -1,6 +1,7 @@
1
  #!/usr/bin/env python3
2
  import argparse
3
  import re
 
4
  from typing import Dict
5
 
6
  import torch
@@ -50,7 +51,15 @@ def log_results(result: Dataset, args: Dict[str, str]):
50
  def normalize_text(text: str) -> str:
51
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
52
 
53
- chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
 
 
 
 
 
 
 
 
54
 
55
  text = re.sub(chars_to_ignore_regex, "", text.lower())
56
 
@@ -59,7 +68,16 @@ def normalize_text(text: str) -> str:
59
  token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
60
 
61
  for t in token_sequences_to_ignore:
62
- text = " ".join(text.split(t))
 
 
 
 
 
 
 
 
 
63
 
64
  return text
65
 
1
  #!/usr/bin/env python3
2
  import argparse
3
  import re
4
+ import string
5
  from typing import Dict
6
 
7
  import torch
51
  def normalize_text(text: str) -> str:
52
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
53
 
54
+ chars_to_ignore = [
55
+ ",", "?", "¿", ".", "!", "¡", ";", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
56
+ "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
57
+ "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
58
+ "、", "﹂", "﹁", "‧", "~", "﹏", ",", "{", "}", "(", ")", "[", "]", "【", "】", "‥", "〽",
59
+ "『", "』", "〝", "〟", "⟨", "⟩", "〜", ":", "!", "?", "♪", "؛", "/", "\\", "º", "−", "^", "ʻ", "ˆ"
60
+ ] # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
61
+
62
+ chars_to_ignore_regex = f'[{"".join(chars_to_ignore)}]'
63
 
64
  text = re.sub(chars_to_ignore_regex, "", text.lower())
65
 
68
  token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
69
 
70
  for t in token_sequences_to_ignore:
71
+ text = "".join(text.split(t))
72
+
73
+ # convert 'D' and 'd' to '啲' if there is a 'D' in the sentence
74
+ # hacky: won't work when 'D'/'d' co-occurs with normal English words
75
+ # won't work on multiple 'D's
76
+ if "d" in text:
77
+ if len([c for c in text if c in string.ascii_lowercase]) == 1:
78
+ text = text.replace("d", "啲")
79
+
80
+ text += ' '
81
 
82
  return text
83
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58eaf79862b306678e5a5aa8d46dba49938a3302217e925245c6f674565ea0c5
3
  size 1278024433
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cf9654583b75dea424b875769f5c205aadeff4ef6f019a7717d32a2d023c8d6
3
  size 1278024433