NikiPshg committed on
Commit b3ef6db
1 Parent(s): 9ba7d3b

Upload 21 files

G2P_lexicon/G2P.py CHANGED
@@ -74,8 +74,9 @@ class GraphemeToPhoneme:
         return pred
 
 
-dict_path = os.path.join(dirname, "my_tokenizer/bpe_512_lex.json")
-model_path = os.path.join(dirname, "models/model0.07.pt")
+dict_path = os.path.join(dirname, "my_tokenizer/bpe_256_cmu.json")
+model_path = os.path.join(dirname, "models/model_g2p.pt")
+
 
 tokenizer_g2p = Tokenizer.from_file(dict_path)
 g2p_model = TransformerBlock(config=config_g2p, tokenizer=tokenizer_g2p)
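Note: dirname is defined outside this hunk. A minimal sketch of how these module-relative paths are typically resolved (the __file__-based line is an assumption, not part of the commit; Tokenizer.from_file is the huggingface tokenizers call used above):

import os
from tokenizers import Tokenizer

dirname = os.path.dirname(os.path.abspath(__file__))   # assumed definition of dirname
dict_path = os.path.join(dirname, "my_tokenizer/bpe_256_cmu.json")
model_path = os.path.join(dirname, "models/model_g2p.pt")
tokenizer_g2p = Tokenizer.from_file(dict_path)          # loads the BPE tokenizer shipped in this commit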
G2P_lexicon/SP.py CHANGED
@@ -65,8 +65,8 @@ class Stress_Pred:
         return pred
 
 
-dict_path = os.path.join(dirname, "my_tokenizer\my_dict_256.json")
-model_path = os.path.join(dirname, "models\model_0.159.pt")
+dict_path = os.path.join(dirname, "my_tokenizer\sp_dict.json")
+model_path = os.path.join(dirname, "models\model_sp.pt")
 
 tokenizer_sp = Tokenizer_sp(dict_path=dict_path)
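Note: both path literals embed Windows-style backslash separators, so they resolve correctly only on Windows. A hedged, portable variant (not the committed code; assumes the same dirname as in the file) passes each component to os.path.join:

import os

dict_path = os.path.join(dirname, "my_tokenizer", "sp_dict.json")   # portability sketch, not the commit
model_path = os.path.join(dirname, "models", "model_sp.pt")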
 
G2P_lexicon/__pycache__/G2P.cpython-311.pyc CHANGED
Binary files a/G2P_lexicon/__pycache__/G2P.cpython-311.pyc and b/G2P_lexicon/__pycache__/G2P.cpython-311.pyc differ
 
G2P_lexicon/__pycache__/SP.cpython-311.pyc CHANGED
Binary files a/G2P_lexicon/__pycache__/SP.cpython-311.pyc and b/G2P_lexicon/__pycache__/SP.cpython-311.pyc differ
 
G2P_lexicon/__pycache__/__init__.cpython-311.pyc CHANGED
Binary files a/G2P_lexicon/__pycache__/__init__.cpython-311.pyc and b/G2P_lexicon/__pycache__/__init__.cpython-311.pyc differ
 
G2P_lexicon/__pycache__/config_models.cpython-311.pyc CHANGED
Binary files a/G2P_lexicon/__pycache__/config_models.cpython-311.pyc and b/G2P_lexicon/__pycache__/config_models.cpython-311.pyc differ
 
G2P_lexicon/__pycache__/data_preparation.cpython-311.pyc CHANGED
Binary files a/G2P_lexicon/__pycache__/data_preparation.cpython-311.pyc and b/G2P_lexicon/__pycache__/data_preparation.cpython-311.pyc differ
 
G2P_lexicon/__pycache__/sp_tokenizer.cpython-311.pyc CHANGED
Binary files a/G2P_lexicon/__pycache__/sp_tokenizer.cpython-311.pyc and b/G2P_lexicon/__pycache__/sp_tokenizer.cpython-311.pyc differ
 
G2P_lexicon/__pycache__/transformer.cpython-311.pyc CHANGED
Binary files a/G2P_lexicon/__pycache__/transformer.cpython-311.pyc and b/G2P_lexicon/__pycache__/transformer.cpython-311.pyc differ
 
G2P_lexicon/config_models.py CHANGED
@@ -4,12 +4,14 @@ config_sp = {
     "NUM": 3,
     "NUM_HEADS": 4,
     "MAX_LEN": 32,
+    "BIAS": True
 }
 
 config_g2p = {
-    "D_MODEL": 512,
-    "D_FF": 2048,
-    "NUM": 6,
-    "NUM_HEADS": 8,
+    "D_MODEL": 256,
+    "D_FF": 1024,
+    "NUM": 3,
+    "NUM_HEADS": 4,
     "MAX_LEN": 32,
+    "BIAS": False,
 }
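For orientation, a minimal sketch of how these dictionaries are consumed; it mirrors the config.get(...) pattern in transformer.py, and the fallback values shown are assumptions taken from that file:

config_g2p = {
    "D_MODEL": 256,
    "D_FF": 1024,
    "NUM": 3,
    "NUM_HEADS": 4,
    "MAX_LEN": 32,
    "BIAS": False,
}

d_model = config_g2p.get("D_MODEL", 512)   # 256 after this commit
use_bias = config_g2p.get("BIAS", False)   # new flag: build attention projections without bias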
G2P_lexicon/data_preparation.py CHANGED
@@ -1,43 +1,59 @@
 import re
 
+one = ["", "one ", "two ", "three ", "four ",
+       "five ", "six ", "seven ", "eight ",
+       "nine ", "ten ", "eleven ", "twelve ",
+       "thirteen ", "fourteen ", "fifteen ",
+       "sixteen ", "seventeen ", "eighteen ",
+       "nineteen "]
 
-def intToWord(number):
-    ones = ("", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine")
-    tens = ("", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety")
-    teens = (
-        "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen")
-    levels = (
-        "", "thousand", "million", "billion", "trillion", "quadrillion", "quintillion", "sextillion", "septillion",
-        "octillion", "nonillion")
-
-    word = ""
-    num = reversed(str(number))
-    number = ""
-    for x in num:
-        number += x
-    del num
-    if len(number) % 3 == 1: number += "0"
-    x = 0
-    for digit in number:
-        if x % 3 == 0:
-            word = levels[x // 3] + " " + word
-            n = int(digit)
-        elif x % 3 == 1:
-            if digit == "1":
-                num = teens[n]
-            else:
-                num = tens[int(digit)]
-                if n:
-                    if num:
-                        num += ones[n]
-                    else:
-                        num = ones[n]
-            word = num + " " + word
-        elif x % 3 == 2:
-            if digit != "0":
-                word = ones[int(digit)] + " hundred " + word
-        x += 1
-    return word.strip(" ")
+# strings at index 0 and 1 are not used,
+# they are to make array indexing simple
+ten = ["", "", "twenty ", "thirty ", "forty ",
+       "fifty ", "sixty ", "seventy ", "eighty ",
+       "ninety "]
+
+
+def numToWords(n, s):
+    str = ""
+
+    if n <= 19:
+        str += one[n]
+    # if n is more than 19, divide it
+    else:
+        str += ten[n // 10] + one[n % 10]
+
+    # if n is non-zero
+    if (n):
+        str += s
+
+    return str
+
+
+def intToWord(n):
+    n=int(n)
+    out = ""
+
+    out += numToWords((n // 10000000),
+                      "crore ")
+
+    out += numToWords(((n // 100000) % 100),
+                      "lakh ")
+
+    out += numToWords(((n // 1000) % 100),
+                      "thousand ")
+
+    out += numToWords(((n // 100) % 10),
+                      "hundred ")
+
+    if n > 100 and n % 100:
+        out += "and "
+
+    # handles digits at ones and tens
+    # places (if any)
+    out += numToWords((n % 100), "")
+
+    return out.strip()
 
 
 def preprocess_text(text):
@@ -48,7 +64,7 @@ def preprocess_text(text):
     return:
     ['HELLO', ',', 'WORLD', 'THIS', 'IS', 'A', 'SAMPLE', 'TEXT', 'WITH', 'NUMBERS', 'AND', 'SYMBOLS', '.']
     """
-    if not(text.isspace()) and text and text:
+    if not (text.isspace()) and text and text:
 
         text = text.upper()
         text = re.sub(r'([.,])', r' \1 ', text)
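The replacement helpers follow the Indian numbering system (lakh, crore) rather than the Western thousands scale used by the removed intToWord. A quick illustrative check, assuming the functions are importable from G2P_lexicon.data_preparation:

from G2P_lexicon.data_preparation import intToWord

print(intToWord(1234))      # one thousand two hundred and thirty four
print(intToWord(2500000))   # twenty five lakh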
G2P_lexicon/models/model_g2p.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07c75f15750171f0c1be7be681b433031fe9beaa1d223054cb06fd5ebfcc0fcf
3
+ size 22952698
G2P_lexicon/models/model_sp.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce2f8269e96abaf00086f4c61043046656deb8cf397ce7f1501d2f354dd6bea7
3
+ size 22471914
G2P_lexicon/my_tokenizer/bpe_256_cmu.json ADDED
@@ -0,0 +1,530 @@
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<unk>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 256,
17
+ "content": "<pad>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 257,
26
+ "content": "<bos>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 258,
35
+ "content": "<eos>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ }
42
+ ],
43
+ "normalizer": {
44
+ "type": "BertNormalizer",
45
+ "clean_text": true,
46
+ "handle_chinese_chars": true,
47
+ "strip_accents": null,
48
+ "lowercase": false
49
+ },
50
+ "pre_tokenizer": {
51
+ "type": "BertPreTokenizer"
52
+ },
53
+ "post_processor": null,
54
+ "decoder": {
55
+ "type": "BPEDecoder",
56
+ "suffix": "</w>"
57
+ },
58
+ "model": {
59
+ "type": "BPE",
60
+ "dropout": null,
61
+ "unk_token": "<unk>",
62
+ "continuing_subword_prefix": null,
63
+ "end_of_word_suffix": "</w>",
64
+ "fuse_unk": false,
65
+ "byte_fallback": false,
66
+ "ignore_merges": false,
67
+ "vocab": {
68
+ "<unk>": 0,
69
+ "A": 1,
70
+ "B": 2,
71
+ "C": 3,
72
+ "D": 4,
73
+ "E": 5,
74
+ "F": 6,
75
+ "G": 7,
76
+ "H": 8,
77
+ "I": 9,
78
+ "J": 10,
79
+ "K": 11,
80
+ "L": 12,
81
+ "M": 13,
82
+ "N": 14,
83
+ "O": 15,
84
+ "P": 16,
85
+ "Q": 17,
86
+ "R": 18,
87
+ "S": 19,
88
+ "T": 20,
89
+ "U": 21,
90
+ "V": 22,
91
+ "W": 23,
92
+ "X": 24,
93
+ "Y": 25,
94
+ "Z": 26,
95
+ "Ġ": 27,
96
+ "Z</w>": 28,
97
+ "R</w>": 29,
98
+ "G</w>": 30,
99
+ "N</w>": 31,
100
+ "S</w>": 32,
101
+ "H</w>": 33,
102
+ "D</w>": 34,
103
+ "M</w>": 35,
104
+ "W</w>": 36,
105
+ "L</w>": 37,
106
+ "Y</w>": 38,
107
+ "E</w>": 39,
108
+ "T</w>": 40,
109
+ "K</w>": 41,
110
+ "V</w>": 42,
111
+ "A</w>": 43,
112
+ "F</w>": 44,
113
+ "Q</w>": 45,
114
+ "B</w>": 46,
115
+ "O</w>": 47,
116
+ "P</w>": 48,
117
+ "I</w>": 49,
118
+ "C</w>": 50,
119
+ "U</w>": 51,
120
+ "X</w>": 52,
121
+ "J</w>": 53,
122
+ "HĠ": 54,
123
+ "ĠA": 55,
124
+ "ĠI": 56,
125
+ "ĠAHĠ": 57,
126
+ "ĠE": 58,
127
+ "ĠIHĠ": 59,
128
+ "YĠ": 60,
129
+ "RĠ": 61,
130
+ "NĠ": 62,
131
+ "AĠ": 63,
132
+ "WĠ": 64,
133
+ "EĠ": 65,
134
+ "ĠAAĠ": 66,
135
+ "SĠ": 67,
136
+ "ĠEHĠ": 68,
137
+ "ĠAEĠ": 69,
138
+ "ĠR": 70,
139
+ "ĠIYĠ": 71,
140
+ "LĠ": 72,
141
+ "ĠERĠ": 73,
142
+ "HĠA": 74,
143
+ "KĠ": 75,
144
+ "OWĠ": 76,
145
+ "ĠIY</w>": 77,
146
+ "ĠEYĠ": 78,
147
+ "TĠ": 79,
148
+ "ĠAO": 80,
149
+ "GĠ": 81,
150
+ "UWĠ": 82,
151
+ "ĠAHĠNĠ": 83,
152
+ "ĠAOĠ": 84,
153
+ "ĠIHĠN": 85,
154
+ "IHĠ": 86,
155
+ "MĠ": 87,
156
+ "ĠAH</w>": 88,
157
+ "ĠAYĠ": 89,
158
+ "DĠ": 90,
159
+ "SĠT": 91,
160
+ "HĠE": 92,
161
+ "HĠAHĠ": 93,
162
+ "ĠAHĠN</w>": 94,
163
+ "ĠIY": 95,
164
+ "ĠER</w>": 96,
165
+ "PĠ": 97,
166
+ "BĠ": 98,
167
+ "AHĠ": 99,
168
+ "ĠIHĠNG</w>": 100,
169
+ "LĠAHĠ": 101,
170
+ "NĠAHĠ": 102,
171
+ "ĠER": 103,
172
+ "OW</w>": 104,
173
+ "KĠAHĠ": 105,
174
+ "ĠAAĠRĠ": 106,
175
+ "HHĠA": 107,
176
+ "LĠIY</w>": 108,
177
+ "LĠIHĠ": 109,
178
+ "TĠS</w>": 110,
179
+ "HĠIHĠ": 111,
180
+ "SĠIHĠ": 112,
181
+ "DĠIHĠ": 113,
182
+ "TĠIHĠ": 114,
183
+ "ĠAOĠRĠ": 115,
184
+ "ĠERĠZ</w>": 116,
185
+ "SĠAHĠ": 117,
186
+ "ĠIYĠZ</w>": 118,
187
+ "FĠ": 119,
188
+ "IN": 120,
189
+ "SHĠAHĠ": 121,
190
+ "TĠAHĠ": 122,
191
+ "NĠZ</w>": 123,
192
+ "ER": 124,
193
+ "AEĠ": 125,
194
+ "MĠAHĠ": 126,
195
+ "ĠAEĠNĠ": 127,
196
+ "HĠEHĠ": 128,
197
+ "EHĠ": 129,
198
+ "UHĠ": 130,
199
+ "ĠRĠAHĠ": 131,
200
+ "ĠAHĠNĠZ</w>": 132,
201
+ "BĠAHĠ": 133,
202
+ "ĠEHĠR": 134,
203
+ "ĠEHĠNĠ": 135,
204
+ "DĠAHĠ": 136,
205
+ "ĠRĠIHĠ": 137,
206
+ "HĠI": 138,
207
+ "KĠAAĠ": 139,
208
+ "LĠZ</w>": 140,
209
+ "ĠIHĠNGĠ": 141,
210
+ "NGĠ": 142,
211
+ "NĠIHĠ": 143,
212
+ "MĠIHĠ": 144,
213
+ "AN": 145,
214
+ "WĠIHĠ": 146,
215
+ "ĠAWĠ": 147,
216
+ "AR": 148,
217
+ "ZĠ": 149,
218
+ "AAĠ": 150,
219
+ "SĠT</w>": 151,
220
+ "YĠUWĠ": 152,
221
+ "DĠZ</w>": 153,
222
+ "RĠOWĠ": 154,
223
+ "AHĠNĠ": 155,
224
+ "SĠK": 156,
225
+ "EN": 157,
226
+ "OĠ": 158,
227
+ "SĠP": 159,
228
+ "BĠERĠ": 160,
229
+ "LĠAEĠ": 161,
230
+ "KĠS</w>": 162,
231
+ "RĠIHĠ": 163,
232
+ "IHĠNĠ": 164,
233
+ "TĠR": 165,
234
+ "ĠIYĠAHĠ": 166,
235
+ "ĠAAĠNĠ": 167,
236
+ "ON": 168,
237
+ "YĠAHĠ": 169,
238
+ "PĠAHĠ": 170,
239
+ "VĠ": 171,
240
+ "RĠAHĠ": 172,
241
+ "VĠIHĠ": 173,
242
+ "LĠEHĠ": 174,
243
+ "KĠAEĠ": 175,
244
+ "HHĠ": 176,
245
+ "LĠIYĠ": 177,
246
+ "OR": 178,
247
+ "HĠERĠ": 179,
248
+ "GĠAHĠ": 180,
249
+ "MĠAEĠ": 181,
250
+ "GĠR": 182,
251
+ "ST": 183,
252
+ "AT": 184,
253
+ "ES</w>": 185,
254
+ "BĠR": 186,
255
+ "RĠIYĠ": 187,
256
+ "BĠIHĠ": 188,
257
+ "SHĠ": 189,
258
+ "LĠEYĠ": 190,
259
+ "PĠR": 191,
260
+ "LĠAAĠ": 192,
261
+ "AL": 193,
262
+ "TĠIY</w>": 194,
263
+ "HHĠAEĠ": 195,
264
+ "SĠEHĠ": 196,
265
+ "NĠAHĠS</w>": 197,
266
+ "TH</w>": 198,
267
+ "EL": 199,
268
+ "HĠIYĠ": 200,
269
+ "FĠAHĠ": 201,
270
+ "LĠAYĠ": 202,
271
+ "LĠD</w>": 203,
272
+ "KĠW": 204,
273
+ "MĠEHĠ": 205,
274
+ "RE": 206,
275
+ "PĠIHĠ": 207,
276
+ "FĠIHĠ": 208,
277
+ "SHĠAHĠN</w>": 209,
278
+ "NĠIY</w>": 210,
279
+ "MĠAAĠ": 211,
280
+ "KĠR": 212,
281
+ "VĠAHĠ": 213,
282
+ "THĠ": 214,
283
+ "UW</w>": 215,
284
+ "OWĠZ</w>": 216,
285
+ "HHĠAAĠ": 217,
286
+ "CH": 218,
287
+ "RĠUWĠ": 219,
288
+ "OYĠ": 220,
289
+ "ĠAOĠR": 221,
290
+ "KĠIHĠ": 222,
291
+ "HĠAEĠ": 223,
292
+ "ED</w>": 224,
293
+ "ZĠAHĠ": 225,
294
+ "HHĠEHĠ": 226,
295
+ "SĠIHĠZ</w>": 227,
296
+ "DĠEHĠ": 228,
297
+ "JHĠAHĠ": 229,
298
+ "JHĠIHĠ": 230,
299
+ "BĠAEĠ": 231,
300
+ "TĠERĠ": 232,
301
+ "JHĠ": 233,
302
+ "OW": 234,
303
+ "BĠEHĠ": 235,
304
+ "SĠIYĠ": 236,
305
+ "OWĠLĠ": 237,
306
+ "VĠERĠ": 238,
307
+ "ĠEY</w>": 239,
308
+ "TĠIHĠD</w>": 240,
309
+ "KĠAHĠNĠ": 241,
310
+ "LE": 242,
311
+ "MĠAHĠN</w>": 243,
312
+ "ĠAHĠNĠT</w>": 244,
313
+ "RĠEHĠ": 245,
314
+ "NĠAH</w>": 246,
315
+ "CHĠ": 247,
316
+ "IS": 248,
317
+ "UW": 249,
318
+ "PĠERĠ": 250,
319
+ "SĠTĠ": 251,
320
+ "PĠAAĠ": 252,
321
+ "TĠAHĠN</w>": 253,
322
+ "LĠUWĠ": 254,
323
+ "HĠAAĠ": 255
324
+ },
325
+ "merges": [
326
+ "H Ġ",
327
+ "Ġ A",
328
+ "Ġ I",
329
+ "ĠA HĠ",
330
+ "Ġ E",
331
+ "ĠI HĠ",
332
+ "Y Ġ",
333
+ "R Ġ",
334
+ "N Ġ",
335
+ "A Ġ",
336
+ "W Ġ",
337
+ "E Ġ",
338
+ "ĠA AĠ",
339
+ "S Ġ",
340
+ "ĠE HĠ",
341
+ "ĠA EĠ",
342
+ "Ġ R",
343
+ "ĠI YĠ",
344
+ "L Ġ",
345
+ "ĠE RĠ",
346
+ "HĠ A",
347
+ "K Ġ",
348
+ "O WĠ",
349
+ "ĠI Y</w>",
350
+ "ĠE YĠ",
351
+ "T Ġ",
352
+ "ĠA O",
353
+ "G Ġ",
354
+ "U WĠ",
355
+ "ĠAHĠ NĠ",
356
+ "ĠAO Ġ",
357
+ "ĠIHĠ N",
358
+ "I HĠ",
359
+ "M Ġ",
360
+ "ĠA H</w>",
361
+ "ĠA YĠ",
362
+ "D Ġ",
363
+ "SĠ T",
364
+ "HĠ E",
365
+ "HĠA HĠ",
366
+ "ĠAHĠ N</w>",
367
+ "ĠI Y",
368
+ "ĠE R</w>",
369
+ "P Ġ",
370
+ "B Ġ",
371
+ "A HĠ",
372
+ "ĠIHĠN G</w>",
373
+ "L ĠAHĠ",
374
+ "N ĠAHĠ",
375
+ "ĠE R",
376
+ "O W</w>",
377
+ "K ĠAHĠ",
378
+ "ĠAAĠ RĠ",
379
+ "H HĠA",
380
+ "L ĠIY</w>",
381
+ "L ĠIHĠ",
382
+ "TĠ S</w>",
383
+ "HĠ IHĠ",
384
+ "S ĠIHĠ",
385
+ "D ĠIHĠ",
386
+ "T ĠIHĠ",
387
+ "ĠAOĠ RĠ",
388
+ "ĠERĠ Z</w>",
389
+ "S ĠAHĠ",
390
+ "ĠIYĠ Z</w>",
391
+ "F Ġ",
392
+ "I N",
393
+ "S HĠAHĠ",
394
+ "T ĠAHĠ",
395
+ "NĠ Z</w>",
396
+ "E R",
397
+ "A EĠ",
398
+ "M ĠAHĠ",
399
+ "ĠAEĠ NĠ",
400
+ "HĠE HĠ",
401
+ "E HĠ",
402
+ "U HĠ",
403
+ "ĠR ĠAHĠ",
404
+ "ĠAHĠNĠ Z</w>",
405
+ "B ĠAHĠ",
406
+ "ĠEHĠ R",
407
+ "ĠEHĠ NĠ",
408
+ "D ĠAHĠ",
409
+ "ĠR ĠIHĠ",
410
+ "HĠ I",
411
+ "K ĠAAĠ",
412
+ "LĠ Z</w>",
413
+ "ĠIHĠN GĠ",
414
+ "N GĠ",
415
+ "N ĠIHĠ",
416
+ "M ĠIHĠ",
417
+ "A N",
418
+ "W ĠIHĠ",
419
+ "ĠA WĠ",
420
+ "A R",
421
+ "Z Ġ",
422
+ "A AĠ",
423
+ "SĠ T</w>",
424
+ "YĠ UWĠ",
425
+ "DĠ Z</w>",
426
+ "RĠ OWĠ",
427
+ "AHĠ NĠ",
428
+ "SĠ K",
429
+ "E N",
430
+ "O Ġ",
431
+ "SĠ P",
432
+ "B ĠERĠ",
433
+ "L ĠAEĠ",
434
+ "KĠ S</w>",
435
+ "R ĠIHĠ",
436
+ "IHĠ NĠ",
437
+ "T ĠR",
438
+ "ĠIY ĠAHĠ",
439
+ "ĠAAĠ NĠ",
440
+ "O N",
441
+ "Y ĠAHĠ",
442
+ "P ĠAHĠ",
443
+ "V Ġ",
444
+ "R ĠAHĠ",
445
+ "V ĠIHĠ",
446
+ "L ĠEHĠ",
447
+ "K ĠAEĠ",
448
+ "H HĠ",
449
+ "L ĠIYĠ",
450
+ "O R",
451
+ "HĠE RĠ",
452
+ "G ĠAHĠ",
453
+ "M ĠAEĠ",
454
+ "G ĠR",
455
+ "S T",
456
+ "A T",
457
+ "E S</w>",
458
+ "B ĠR",
459
+ "R ĠIYĠ",
460
+ "B ĠIHĠ",
461
+ "S HĠ",
462
+ "L ĠEYĠ",
463
+ "P ĠR",
464
+ "L ĠAAĠ",
465
+ "A L",
466
+ "T ĠIY</w>",
467
+ "HHĠA EĠ",
468
+ "S ĠEHĠ",
469
+ "NĠAHĠ S</w>",
470
+ "T H</w>",
471
+ "E L",
472
+ "HĠI YĠ",
473
+ "F ĠAHĠ",
474
+ "L ĠAYĠ",
475
+ "LĠ D</w>",
476
+ "KĠ W",
477
+ "M ĠEHĠ",
478
+ "R E",
479
+ "P ĠIHĠ",
480
+ "F ĠIHĠ",
481
+ "SHĠAHĠ N</w>",
482
+ "N ĠIY</w>",
483
+ "M ĠAAĠ",
484
+ "K ĠR",
485
+ "V ĠAHĠ",
486
+ "T HĠ",
487
+ "U W</w>",
488
+ "OWĠ Z</w>",
489
+ "HHĠA AĠ",
490
+ "C H",
491
+ "RĠ UWĠ",
492
+ "O YĠ",
493
+ "ĠAO ĠR",
494
+ "K ĠIHĠ",
495
+ "HĠA EĠ",
496
+ "E D</w>",
497
+ "Z ĠAHĠ",
498
+ "H HĠEHĠ",
499
+ "SĠIHĠ Z</w>",
500
+ "D ĠEHĠ",
501
+ "J HĠAHĠ",
502
+ "J HĠIHĠ",
503
+ "B ĠAEĠ",
504
+ "T ĠERĠ",
505
+ "J HĠ",
506
+ "O W",
507
+ "B ĠEHĠ",
508
+ "S ĠIYĠ",
509
+ "OWĠ LĠ",
510
+ "V ĠERĠ",
511
+ "ĠE Y</w>",
512
+ "TĠIHĠ D</w>",
513
+ "K ĠAHĠNĠ",
514
+ "L E",
515
+ "M ĠAHĠN</w>",
516
+ "ĠAHĠNĠ T</w>",
517
+ "R ĠEHĠ",
518
+ "N ĠAH</w>",
519
+ "C HĠ",
520
+ "I S",
521
+ "U W",
522
+ "P ĠERĠ",
523
+ "SĠ TĠ",
524
+ "P ĠAAĠ",
525
+ "T ĠAHĠN</w>",
526
+ "LĠ UWĠ",
527
+ "HĠA AĠ"
528
+ ]
529
+ }
530
+ }
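A minimal usage sketch for this tokenizer file, relying on the same Tokenizer.from_file call that G2P.py uses; the path below assumes the repository root as the working directory:

from tokenizers import Tokenizer

tok = Tokenizer.from_file("G2P_lexicon/my_tokenizer/bpe_256_cmu.json")
print(tok.token_to_id("<pad>"), tok.token_to_id("<bos>"), tok.token_to_id("<eos>"))   # 256 257 258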
G2P_lexicon/my_tokenizer/sp_dict.json ADDED
@@ -0,0 +1,90 @@
1
+ {
2
+ "0": "<sos>",
3
+ "1": "<eos>",
4
+ "2": "<unk>",
5
+ "3": "<pad>",
6
+ "4": "AA1",
7
+ "5": "UW",
8
+ "6": "ER0",
9
+ "7": "F",
10
+ "8": "CH",
11
+ "9": "S",
12
+ "10": "AO1",
13
+ "11": "DH",
14
+ "12": "TH",
15
+ "13": "IY",
16
+ "14": "OW",
17
+ "15": "AH2",
18
+ "16": "W",
19
+ "17": "AH1",
20
+ "18": "AO",
21
+ "19": "D",
22
+ "20": "AW1",
23
+ "21": "OY2",
24
+ "22": "AO0",
25
+ "23": "EY0",
26
+ "24": "AH",
27
+ "25": "AE",
28
+ "26": "UH2",
29
+ "27": "OW2",
30
+ "28": "UW0",
31
+ "29": "UW1",
32
+ "30": "UH1",
33
+ "31": "ER",
34
+ "32": "EH2",
35
+ "33": "UW2",
36
+ "34": "ER2",
37
+ "35": "OY",
38
+ "36": "AE0",
39
+ "37": "AY",
40
+ "38": "K",
41
+ "39": "AA0",
42
+ "40": "T",
43
+ "41": "EH0",
44
+ "42": "SH",
45
+ "43": "ER1",
46
+ "44": "G",
47
+ "45": "EY",
48
+ "46": "AH0",
49
+ "47": "IH0",
50
+ "48": "L",
51
+ "49": "AE2",
52
+ "50": "B",
53
+ "51": "OY0",
54
+ "52": "EH",
55
+ "53": "AA2",
56
+ "54": "IH",
57
+ "55": "M",
58
+ "56": "AY0",
59
+ "57": "UH",
60
+ "58": "EY2",
61
+ "59": "IY2",
62
+ "60": "EY1",
63
+ "61": "HH",
64
+ "62": "P",
65
+ "63": "AE1",
66
+ "64": "OW1",
67
+ "65": "R",
68
+ "66": "IH1",
69
+ "67": "Z",
70
+ "68": "IH2",
71
+ "69": "IY0",
72
+ "70": "V",
73
+ "71": "JH",
74
+ "72": "OY1",
75
+ "73": "Y",
76
+ "74": "N",
77
+ "75": "AO2",
78
+ "76": "AW",
79
+ "77": "UH0",
80
+ "78": "IY1",
81
+ "79": "AW0",
82
+ "80": "AA",
83
+ "81": "NG",
84
+ "82": "AY1",
85
+ "83": "EH1",
86
+ "84": "AY2",
87
+ "85": "OW0",
88
+ "86": "AW2",
89
+ "87": "ZH"
90
+ }
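A sketch of how this id-to-phoneme table can be loaded and inverted; Tokenizer_sp in sp_tokenizer.py presumably builds something equivalent, and the path assumes the repository root:

import json

with open("G2P_lexicon/my_tokenizer/sp_dict.json", encoding="utf-8") as f:
    idx2token = {int(k): v for k, v in json.load(f).items()}

token2idx = {token: idx for idx, token in idx2token.items()}
print(idx2token[0], token2idx["AA1"])   # <sos> 4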
G2P_lexicon/sp_tokenizer.py CHANGED
@@ -83,5 +83,5 @@ class Tokenizer_sp:
 
 
 if __name__ == "__main__":
-    tokenizer_sp = Tokenizer_sp(dict_path='./my_tokenizer/my_dict_256.json')
+    tokenizer_sp = Tokenizer_sp(dict_path='my_tokenizer/sp_dict.json')
     print(tokenizer_sp.idx2token)
G2P_lexicon/transformer.py CHANGED
@@ -22,7 +22,7 @@ class PositionalEncoding(nn.Module):
 
 
 class MultiHeadSelfAttention(nn.Module):
-    def __init__(self, d_model, num_heads):
+    def __init__(self, d_model, num_heads, bias=False):
         super(MultiHeadSelfAttention, self).__init__()
         assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
 
@@ -30,9 +30,9 @@ class MultiHeadSelfAttention(nn.Module):
         self.num_heads = num_heads
         self.depth = d_model // num_heads
 
-        self.wq = nn.Linear(d_model, d_model)
-        self.wk = nn.Linear(d_model, d_model)
-        self.wv = nn.Linear(d_model, d_model)
+        self.wq = nn.Linear(d_model, d_model, bias)
+        self.wk = nn.Linear(d_model, d_model, bias)
+        self.wv = nn.Linear(d_model, d_model, bias)
 
         self.fc = nn.Linear(d_model, d_model)
 
@@ -76,9 +76,9 @@ class FeedForwardNetwork(nn.Module):
 
 
 class EncoderLayer(nn.Module):
-    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
+    def __init__(self, d_model, num_heads, d_ff, dropout=0.1, bias=False):
         super(EncoderLayer, self).__init__()
-        self.self_attn = MultiHeadSelfAttention(d_model, num_heads)
+        self.self_attn = MultiHeadSelfAttention(d_model, num_heads, bias)
         self.ffn = FeedForwardNetwork(d_model, d_ff, dropout)
 
         self.layernorm1 = nn.LayerNorm(d_model)
@@ -95,10 +95,10 @@ class EncoderLayer(nn.Module):
 
 
 class DecoderLayer(nn.Module):
-    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
+    def __init__(self, d_model, num_heads, d_ff, dropout=0.1, bias=False):
         super(DecoderLayer, self).__init__()
-        self.self_attn = MultiHeadSelfAttention(d_model, num_heads)
-        self.cross_attn = MultiHeadSelfAttention(d_model, num_heads)
+        self.self_attn = MultiHeadSelfAttention(d_model, num_heads, bias)
+        self.cross_attn = MultiHeadSelfAttention(d_model, num_heads, bias)
         self.ffn = FeedForwardNetwork(d_model, d_ff, dropout)
 
         self.layernorm1 = nn.LayerNorm(d_model)
@@ -132,6 +132,7 @@ class TransformerBlock(nn.Module):
         self.num_decoder_layers = config.get('NUM', 6)
         self.d_ff = config.get('D_FF', 2048)
         self.dropout = config.get('DROPOUT', 0.1)
+        self.bias = config.get('BIAS', False)
         self.stress = stress
 
         self.encoder_embedding = nn.Embedding(self.input_vocab_size, self.d_model)
@@ -140,10 +141,10 @@
         self.pos_embedding = PositionalEncoding(self.d_model, config.get('MAX_LEN', 32))
 
         self.encoder_layers = nn.ModuleList(
-            [EncoderLayer(self.d_model, self.num_heads, self.d_ff, self.dropout) for _ in
+            [EncoderLayer(self.d_model, self.num_heads, self.d_ff, self.dropout, self.bias) for _ in
             range(self.num_encoder_layers)])
         self.decoder_layers = nn.ModuleList(
-            [DecoderLayer(self.d_model, self.num_heads, self.d_ff, self.dropout) for _ in
+            [DecoderLayer(self.d_model, self.num_heads, self.d_ff, self.dropout, self.bias) for _ in
             range(self.num_decoder_layers)])
 
         self.fc_out = nn.Linear(self.d_model, self.target_vocab_size)
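The new bias argument is passed as the third positional parameter of nn.Linear, which is its bias switch, so with BIAS set to False the Q/K/V projections are created without bias terms while self.fc keeps its default bias. A standalone illustration (plain PyTorch, not repository code):

import torch.nn as nn

proj_no_bias = nn.Linear(256, 256, False)   # same pattern as self.wq / self.wk / self.wv with bias=False
proj_default = nn.Linear(256, 256)          # same pattern as self.fc
print(proj_no_bias.bias is None, proj_default.bias is not None)   # True True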