Vineel Pratap committed on
Commit
78e8beb
·
1 Parent(s): d15da79
app.py CHANGED
@@ -84,7 +84,7 @@ with gr.Blocks(css="style.css") as demo:
84
  with gr.Accordion("Logs", open=False):
85
  logs = gr.Textbox(show_label=False)
86
 
87
- # hack
88
  reference = gr.Textbox(label="Reference Transcript", visible=False)
89
 
90
  btn.click(
@@ -97,7 +97,7 @@ with gr.Blocks(css="style.css") as demo:
97
  lmscore,
98
  wscore_usedefault,
99
  lmscore_usedefault,
100
- reference
101
  ],
102
  outputs=[text, logs],
103
  )
@@ -106,9 +106,21 @@ with gr.Blocks(css="style.css") as demo:
106
  gr.Examples(
107
  examples=[
108
  # ["upload/english/english.mp3", "upload/english/c4_25k_sentences.txt"],
109
- ["upload/english/english.mp3", "upload/english/c4_10k_sentences.txt", " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import"],
110
- ["upload/english/english.mp3", "upload/english/c4_5k_sentences.txt", " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import"],
111
- ["upload/english/english.mp3", "upload/english/cv8_top10k_words.txt", " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import"],
 
 
 
 
 
 
 
 
 
 
 
 
112
  ],
113
  inputs=[audio, words_file, reference],
114
  label="English",
@@ -116,9 +128,21 @@ with gr.Blocks(css="style.css") as demo:
116
  gr.Examples(
117
  examples=[
118
  # ["upload/english/english.mp3", "upload/english/c4_25k_sentences.txt"],
119
- ["upload/ligurian/ligurian_1.mp3", "upload/ligurian/zenamt_10k_sentences.txt", "I mæ colleghi m’an domandou d’aggiuttâli à fâ unna preuva co-o zeneise pe vedde s’o fonçioña."],
120
- ["upload/ligurian/ligurian_2.mp3", "upload/ligurian/zenamt_10k_sentences.txt", "Staseia vaggo à çenâ con mæ moggê e doî amixi che de chì à quarche settemaña faian stramuo feua stato."],
121
- ["upload/ligurian/ligurian_3.mp3", "upload/ligurian/zenamt_5k_sentences.txt", "Pe inandiâ o pesto ghe veu o baxaicò, i pigneu, l’euio, o formaggio, l’aggio e a sâ."],
 
 
 
 
 
 
 
 
 
 
 
 
122
  ],
123
  inputs=[audio, words_file, reference],
124
  label="Ligurian",
 
84
  with gr.Accordion("Logs", open=False):
85
  logs = gr.Textbox(show_label=False)
86
 
87
+ # hack
88
  reference = gr.Textbox(label="Reference Transcript", visible=False)
89
 
90
  btn.click(
 
97
  lmscore,
98
  wscore_usedefault,
99
  lmscore_usedefault,
100
+ reference,
101
  ],
102
  outputs=[text, logs],
103
  )
 
106
  gr.Examples(
107
  examples=[
108
  # ["upload/english/english.mp3", "upload/english/c4_25k_sentences.txt"],
109
+ [
110
+ "upload/english/english.mp3",
111
+ "upload/english/c4_10k_sentences.txt",
112
+ " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import",
113
+ ],
114
+ [
115
+ "upload/english/english.mp3",
116
+ "upload/english/c4_5k_sentences.txt",
117
+ " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import",
118
+ ],
119
+ [
120
+ "upload/english/english.mp3",
121
+ "upload/english/cv8_top10k_words.txt",
122
+ " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import",
123
+ ],
124
  ],
125
  inputs=[audio, words_file, reference],
126
  label="English",
 
128
  gr.Examples(
129
  examples=[
130
  # ["upload/english/english.mp3", "upload/english/c4_25k_sentences.txt"],
131
+ [
132
+ "upload/ligurian/ligurian_1.mp3",
133
+ "upload/ligurian/zenamt_10k_sentences.txt",
134
+ "I mæ colleghi m’an domandou d’aggiuttâli à fâ unna preuva co-o zeneise pe vedde s’o fonçioña.",
135
+ ],
136
+ [
137
+ "upload/ligurian/ligurian_2.mp3",
138
+ "upload/ligurian/zenamt_10k_sentences.txt",
139
+ "Staseia vaggo à çenâ con mæ moggê e doî amixi che de chì à quarche settemaña faian stramuo feua stato.",
140
+ ],
141
+ [
142
+ "upload/ligurian/ligurian_3.mp3",
143
+ "upload/ligurian/zenamt_5k_sentences.txt",
144
+ "Pe inandiâ o pesto ghe veu o baxaicò, i pigneu, l’euio, o formaggio, l’aggio e a sâ.",
145
+ ],
146
  ],
147
  inputs=[audio, words_file, reference],
148
  label="Ligurian",
normalization/README.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a6aa5ef11df920fccc933f0d0ff4dd982a2872e0e544ab7409507ad6f130b81
3
+ size 118
normalization/__init__.py ADDED
File without changes
normalization/norm_config.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+
4
+
5
# ASCII punctuation. "." and "?" are regex metacharacters, so they are stored
# already escaped with re.escape; the other characters are kept verbatim.
period, question_mark = re.escape("."), re.escape("?")
colon, comma, semicolon = ":", ",", ";"
exclamation_mark = "!"

left_curly_bracket, right_curly_bracket = "{", "}"
quotation_mark = '"'

# Core punctuation set. Note that semicolon and quotation_mark are defined
# above but are deliberately not part of this concatenation.
basic_punc = "".join(
    [
        period,
        question_mark,
        comma,
        colon,
        exclamation_mark,
        left_curly_bracket,
        right_curly_bracket,
    ]
)
25
+
26
# Invisible formatting characters from the General Punctuation block
# (U+2000-U+206F). The values are raw backslash-u escapes, i.e. six-character
# strings that the `re` module resolves when they are placed in a pattern.
zero_width_space = r"\u200B"
zero_width_nonjoiner = r"\u200C"
left_to_right_mark = r"\u200E"
right_to_left_mark = r"\u200F"
left_to_right_embedding = r"\u202A"
pop_directional_formatting = r"\u202C"

# Characters that are commonly typed in place of an apostrophe
right_single_quotation_mark = r"\u2019"
left_single_quotation_mark = r"\u2018"

# ---- Language specific definitions ----

# Spanish
inverted_exclamation_mark = r"\u00A1"
inverted_question_mark = r"\u00BF"

# Hindi
# Unlike its neighbours this is the literal character (a one-character
# string), not a raw regex escape.
hindi_danda = "\u0964"

# Egyptian Arabic
# arabic_percent = r"\u066A"
arabic_comma = r"\u060C"
arabic_question_mark = r"\u061F"
arabic_semicolon = r"\u061B"
# The two entries below are code-point ranges ("first-last").
arabic_diacritics = r"\u064B-\u0652"
arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"
56
+
57
+
58
# Chinese
# Raw regex escapes for CJK / fullwidth punctuation. Entries written as
# "first-last" are code-point ranges.
full_stop = r"\u3002"
full_comma = r"\uFF0C"
full_exclamation_mark = r"\uFF01"
full_question_mark = r"\uFF1F"
full_semicolon = r"\uFF1B"
full_colon = r"\uFF1A"
full_parentheses = r"\uFF08\uFF09"
quotation_mark_horizontal = r"\u300C-\u300F"
# Vertical presentation forms of the corner brackets are U+FE41-U+FE44.
# The previous value, U+FF41-U+FF44, is FULLWIDTH LATIN SMALL LETTER A-D,
# i.e. letters rather than punctuation.
quotation_mark_vertical = r"\uFE41-\uFE44"
title_marks = r"\u3008-\u300B"
wavy_low_line = r"\uFE4F"
ellipsis = r"\u22EF"
enumeration_comma = r"\u3001"
hyphenation_point = r"\u2027"
forward_slash = r"\uFF0F"
wavy_dash = r"\uFF5E"
box_drawings_light_horizontal = r"\u2500"
fullwidth_low_line = r"\uFF3F"

# All of the above concatenated, in declaration order.
chinese_punc = "".join(
    [
        full_stop,
        full_comma,
        full_exclamation_mark,
        full_question_mark,
        full_semicolon,
        full_colon,
        full_parentheses,
        quotation_mark_horizontal,
        quotation_mark_vertical,
        title_marks,
        wavy_low_line,
        ellipsis,
        enumeration_comma,
        hyphenation_point,
        forward_slash,
        wavy_dash,
        box_drawings_light_horizontal,
        fullwidth_low_line,
    ]
)
97
+
98
# Armenian
# Raw regex escapes for Armenian punctuation marks.
armenian_apostrophe = r"\u055A"
emphasis_mark = r"\u055B"
# Carries the armenian_ prefix like its siblings. Binding this value to the
# bare name `exclamation_mark` would overwrite the ASCII "!" constant that is
# defined at the top of this module.
armenian_exclamation_mark = r"\u055C"
armenian_comma = r"\u055D"
armenian_question_mark = r"\u055E"
abbreviation_mark = r"\u055F"
armenian_full_stop = r"\u0589"

# All Armenian marks concatenated, in code-point order.
armenian_punc = "".join(
    [
        armenian_apostrophe,
        emphasis_mark,
        armenian_exclamation_mark,
        armenian_comma,
        armenian_question_mark,
        abbreviation_mark,
        armenian_full_stop,
    ]
)
115
+
116
# "<" and ">" as literal characters ...
lesser_than_symbol, greater_than_symbol = "<", ">"

# ... and the same two characters written as raw regex escapes.
lesser_than_sign, greater_than_sign = r"\u003c", r"\u003e"

# Textual form of a non-breaking space entity (without the trailing ";").
nbsp_written_form = "&nbsp"

# Quotation marks
left_double_quotes, right_double_quotes = r"\u201c", r"\u201d"
left_double_angle, right_double_angle = r"\u00ab", r"\u00bb"
left_single_angle, right_single_angle = r"\u2039", r"\u203a"
low_double_quotes, low_single_quotes = r"\u201e", r"\u201a"
high_double_quotes, high_single_quotes = r"\u201f", r"\u201b"

# Every quote-like character, including the two apostrophe look-alikes.
all_punct_quotes = "".join(
    [
        left_double_quotes,
        right_double_quotes,
        left_double_angle,
        right_double_angle,
        left_single_angle,
        right_single_angle,
        low_double_quotes,
        low_single_quotes,
        high_double_quotes,
        high_single_quotes,
        right_single_quotation_mark,
        left_single_quotation_mark,
    ]
)

# Character class matching the single quotes that get mapped to "'".
mapping_quotes = (
    f"[{high_single_quotes}{right_single_quotation_mark}{left_single_quotation_mark}]"
)
157
+
158
+
159
# Digits
# Each value is a "first-last" code-point range written as raw regex escapes,
# except nominal_digit_shapes, which is a single code point.
english_digits = r"\u0030-\u0039"
bengali_digits = r"\u09e6-\u09ef"
khmer_digits = r"\u17e0-\u17e9"
devanagari_digits = r"\u0966-\u096f"
oriya_digits = r"\u0b66-\u0b6f"
extended_arabic_indic_digits = r"\u06f0-\u06f9"
kayah_li_digits = r"\ua900-\ua909"
fullwidth_digits = r"\uff10-\uff19"
# NOTE(review): the name is spelled "malayam"; kept as-is because it is a
# module-level name that other code may import.
malayam_digits = r"\u0d66-\u0d6f"
myanmar_digits = r"\u1040-\u1049"
roman_numeral = r"\u2170-\u2179"
nominal_digit_shapes = r"\u206f"
173
+
174
# Load punctuations from MMS-lab data
# The list contains non-ASCII characters, so decode it explicitly instead of
# relying on the platform's default encoding (assumes the file is stored as
# UTF-8 - TODO confirm). os.path.join also keeps the path relative when
# dirname(__file__) is empty, where plain concatenation would yield
# "/punctuations.lst".
with open(
    os.path.join(os.path.dirname(__file__), "punctuations.lst"),
    "r",
    encoding="utf-8",
) as punc_f:
    punc_list = punc_f.readlines()

# The first tab-separated field of each line is the punctuation character to
# be removed; escape it so it is safe inside a regex character class.
punct_pattern = "".join(re.escape(punc.split("\t")[0]) for punc in punc_list)
182
+
183
# Concatenation of every digit range defined above.
shared_digits = "".join(
    [
        english_digits,
        bengali_digits,
        khmer_digits,
        devanagari_digits,
        oriya_digits,
        extended_arabic_indic_digits,
        kayah_li_digits,
        fullwidth_digits,
        malayam_digits,
        myanmar_digits,
        roman_numeral,
        nominal_digit_shapes,
    ]
)

# Concatenation of every punctuation fragment, ending with the characters
# loaded from punctuations.lst.
shared_punc_list = "".join(
    [
        basic_punc,
        all_punct_quotes,
        greater_than_sign,
        lesser_than_sign,
        inverted_question_mark,
        full_stop,
        semicolon,
        armenian_punc,
        inverted_exclamation_mark,
        arabic_comma,
        enumeration_comma,
        hindi_danda,
        quotation_mark,
        arabic_semicolon,
        arabic_question_mark,
        chinese_punc,
        punct_pattern,
    ]
)

# Pattern -> replacement pairs. The last entry rewrites a quote-like character
# found between two runs of non-whitespace into a plain apostrophe.
shared_mappping = {
    lesser_than_symbol: "",
    greater_than_symbol: "",
    nbsp_written_form: "",
    rf"(\S+){mapping_quotes}(\S+)": r"\1'\2",
}

# Characters that are deleted outright instead of being replaced by a space.
shared_deletion_list = "".join(
    [
        left_to_right_mark,
        zero_width_nonjoiner,
        arabic_subscript_alef_and_inverted_damma,
        zero_width_space,
        arabic_diacritics,
        pop_directional_formatting,
        right_to_left_mark,
        left_to_right_embedding,
    ]
)
236
+
237
# Per-language normalization settings, keyed by ISO code. "*" is the default
# entry; the language entries below start as copies of it and override
# individual fields.
norm_config = {
    "*": {
        "lower_case": True,
        "punc_set": shared_punc_list,
        "del_set": shared_deletion_list,
        "mapping": shared_mappping,
        "digit_set": shared_digits,
        "unicode_norm": "NFKC",
        "rm_diacritics": False,
    }
}

# NOTE: dict.copy() is shallow. Rebinding a string field with "+=" only
# affects the copy, but the "mapping" dict object is shared between copies and
# must be replaced, never mutated in place.

# =============== Mongolian ===============#

norm_config["mon"] = norm_config["*"].copy()
# add soft hyphen to the deletion set to match with fleurs
norm_config["mon"]["del_set"] += r"\u00AD"

norm_config["khk"] = norm_config["mon"].copy()

# =============== Hebrew ===============#

norm_config["heb"] = norm_config["*"].copy()
# add "HEBREW POINT" symbols to match with fleurs
norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"

# =============== Thai ===============#

norm_config["tha"] = norm_config["*"].copy()
# add "Zero width joiner" symbols to match with fleurs
norm_config["tha"]["punc_set"] += r"\u200D"

# =============== Arabic ===============#
norm_config["ara"] = norm_config["*"].copy()
# Build a new mapping dict for Arabic. Assigning into the inherited dict would
# also modify shared_mappping and therefore the mapping of every other
# language, including "*".
norm_config["ara"]["mapping"] = {**norm_config["*"]["mapping"], "ٱ": "ا"}
norm_config["arb"] = norm_config["ara"].copy()

# =============== Javanese ===============#
norm_config["jav"] = norm_config["*"].copy()
norm_config["jav"]["rm_diacritics"] = True
normalization/punctuations.lst ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+  7355 INVALID UNICODE 0x81
2
+  5265 INVALID UNICODE 0x90
3
+  75 INVALID UNICODE 0x8
4
+  31 INVALID UNICODE 0x8d
5
+ ” 3 INVALID UNICODE 0x94
6
+  2 INVALID UNICODE 0x8f
7
+  2 INVALID UNICODE 0x1a
8
+  1 INVALID UNICODE 0x9d
9
+ “ 1 INVALID UNICODE 0x93
10
+ ’ 1 INVALID UNICODE 0x92
11
+  8647 INVALID UNICODE 0xe295
12
+  6650 INVALID UNICODE 0xf21d
13
+  6234 INVALID UNICODE 0xf62d
14
+  4815 INVALID UNICODE 0xf173
15
+  4789 INVALID UNICODE 0xe514
16
+  4409 INVALID UNICODE 0xe293
17
+  3881 INVALID UNICODE 0xf523
18
+  3788 INVALID UNICODE 0xe233
19
+  2448 INVALID UNICODE 0xf50f
20
+  2177 INVALID UNICODE 0xe232
21
+  1955 INVALID UNICODE 0xea7b
22
+  1926 INVALID UNICODE 0xf172
23
+  973 INVALID UNICODE 0xe290
24
+  972 INVALID UNICODE 0xf519
25
+  661 INVALID UNICODE 0xe292
26
+  591 INVALID UNICODE 0xe328
27
+  509 INVALID UNICODE 0xe2fa
28
+  458 INVALID UNICODE 0xe234
29
+  446 INVALID UNICODE 0xe043
30
+  419 INVALID UNICODE 0xe040
31
+  399 INVALID UNICODE 0xe2fb
32
+  387 INVALID UNICODE 0xe32b
33
+  381 INVALID UNICODE 0xe236
34
+  374 INVALID UNICODE 0xf511
35
+  314 INVALID UNICODE 0xe517
36
+  296 INVALID UNICODE 0xe2fe
37
+  293 INVALID UNICODE 0xe492
38
+  291 INVALID UNICODE 0xf52d
39
+  289 INVALID UNICODE 0xe2fc
40
+  195 INVALID UNICODE 0xf521
41
+  190 INVALID UNICODE 0xe516
42
+  182 INVALID UNICODE 0xe041
43
+  178 INVALID UNICODE 0xf529
44
+  113 INVALID UNICODE 0xe2f9
45
+  87 INVALID UNICODE 0xe2d9
46
+  78 INVALID UNICODE 0xe32a
47
+  76 INVALID UNICODE 0xe291
48
+  74 INVALID UNICODE 0xe296
49
+  66 INVALID UNICODE 0xe518
50
+  52 INVALID UNICODE 0xe32c
51
+  46 INVALID UNICODE 0xe2db
52
+  41 INVALID UNICODE 0xe231
53
+  34 INVALID UNICODE 0xf522
54
+  33 INVALID UNICODE 0xf518
55
+  32 INVALID UNICODE 0xf513
56
+  27 INVALID UNICODE 0xe32d
57
+  25 INVALID UNICODE 0xe32e
58
+  23 INVALID UNICODE 0xe06b
59
+  15 INVALID UNICODE 0xea01
60
+  12 INVALID UNICODE 0xe294
61
+  11 INVALID UNICODE 0xe203
62
+  8 INVALID UNICODE 0xf218
63
+  7 INVALID UNICODE 0xe070
64
+  7 INVALID UNICODE 0xe013
65
+  5 INVALID UNICODE 0xe2de
66
+  4 INVALID UNICODE 0xe493
67
+  3 INVALID UNICODE 0xf7e8
68
+  3 INVALID UNICODE 0xf7d0
69
+  3 INVALID UNICODE 0xe313
70
+  2 INVALID UNICODE 0xe329
71
+  2 INVALID UNICODE 0xe06d
72
+  2 INVALID UNICODE 0xe003
73
+  1 INVALID UNICODE 0xf50e
74
+  1 INVALID UNICODE 0xf171
75
+  1 INVALID UNICODE 0xe01d
76
+  71 NOMINAL DIGIT SHAPES 0x206f
77
+ ⁠ 3 WORD JOINER 0x2060
78
+ ― 126545 HORIZONTAL BAR 0x2015
79
+ ־ 1028 HEBREW PUNCTUATION MAQAF 0x5be
80
+ ) 98429 RIGHT PARENTHESIS 0x29
81
+ ] 27108 RIGHT SQUARE BRACKET 0x5d
82
+ ⌋ 1567 RIGHT FLOOR 0x230b
83
+ 〕 97 RIGHT TORTOISE SHELL BRACKET 0x3015
84
+ 】 36 RIGHT BLACK LENTICULAR BRACKET 0x3011
85
+ ﴾ 14 ORNATE LEFT PARENTHESIS 0xfd3e
86
+ & 170517 AMPERSAND 0x26
87
+ ། 106330 TIBETAN MARK SHAD 0xf0d
88
+ ። 90203 ETHIOPIC FULL STOP 0x1362
89
+ ፥ 60484 ETHIOPIC COLON 0x1365
90
+ ༌ 60464 TIBETAN MARK DELIMITER TSHEG BSTAR 0xf0c
91
+ ။ 51567 MYANMAR SIGN SECTION 0x104b
92
+ / 46929 SOLIDUS 0x2f
93
+ ၊ 38042 MYANMAR SIGN LITTLE SECTION 0x104a
94
+ · 37985 MIDDLE DOT 0xb7
95
+ ‸ 36310 CARET 0x2038
96
+ * 34793 ASTERISK 0x2a
97
+ ۔ 32432 ARABIC FULL STOP 0x6d4
98
+ ፤ 31906 ETHIOPIC SEMICOLON 0x1364
99
+ ၏ 21519 MYANMAR SYMBOL GENITIVE 0x104f
100
+ ។ 20834 KHMER SIGN KHAN 0x17d4
101
+ ꓾ 15773 LISU PUNCTUATION COMMA 0xa4fe
102
+ ᙮ 13473 CANADIAN SYLLABICS FULL STOP 0x166e
103
+ ꤯ 12892 KAYAH LI SIGN SHYA 0xa92f
104
+ ⵰ 11478 TIFINAGH SEPARATOR MARK 0x2d70
105
+ ꓿ 11118 LISU PUNCTUATION FULL STOP 0xa4ff
106
+ ॥ 10763 DEVANAGARI DOUBLE DANDA 0x965
107
+ ؞ 10403 ARABIC TRIPLE DOT PUNCTUATION MARK 0x61e
108
+ ၍ 8936 MYANMAR SYMBOL COMPLETED 0x104d
109
+ · 8431 GREEK ANO TELEIA 0x387
110
+ † 7477 DAGGER 0x2020
111
+ ၌ 6632 MYANMAR SYMBOL LOCATIVE 0x104c
112
+ ፣ 5719 ETHIOPIC COMMA 0x1363
113
+ ៖ 5528 KHMER SIGN CAMNUC PII KUUH 0x17d6
114
+ ꤮ 4791 KAYAH LI SIGN CWI 0xa92e
115
+ ※ 3439 REFERENCE MARK 0x203b
116
+ ፦ 2727 ETHIOPIC PREFACE COLON 0x1366
117
+ • 1749 BULLET 0x2022
118
+ ¶ 1507 PILCROW SIGN 0xb6
119
+ ၎ 1386 MYANMAR SYMBOL AFOREMENTIONED 0x104e
120
+ ﹖ 1224 SMALL QUESTION MARK 0xfe56
121
+ ; 975 GREEK QUESTION MARK 0x37e
122
+ … 827 HORIZONTAL ELLIPSIS 0x2026
123
+ % 617 PERCENT SIGN 0x25
124
+ ・ 468 KATAKANA MIDDLE DOT 0x30fb
125
+ ༎ 306 TIBETAN MARK NYIS SHAD 0xf0e
126
+ ‡ 140 DOUBLE DAGGER 0x2021
127
+ # 137 NUMBER SIGN 0x23
128
+ @ 125 COMMERCIAL AT 0x40
129
+ ፡ 121 ETHIOPIC WORDSPACE 0x1361
130
+ ៚ 55 KHMER SIGN KOOMUUT 0x17da
131
+ ៕ 49 KHMER SIGN BARIYOOSAN 0x17d5
132
+ ﹐ 10 SMALL COMMA 0xfe50
133
+ ༅ 6 TIBETAN MARK CLOSING YIG MGO SGAB MA 0xf05
134
+ ༄ 6 TIBETAN MARK INITIAL YIG MGO MDUN MA 0xf04
135
+ . 2 FULLWIDTH FULL STOP 0xff0e
136
+ ﹗ 2 SMALL EXCLAMATION MARK 0xfe57
137
+ ﹕ 2 SMALL COLON 0xfe55
138
+ ‰ 2 PER MILLE SIGN 0x2030
139
+ ・ 1 HALFWIDTH KATAKANA MIDDLE DOT 0xff65
140
+ ( 98504 LEFT PARENTHESIS 0x28
141
+ [ 27245 LEFT SQUARE BRACKET 0x5b
142
+ ⌊ 1567 LEFT FLOOR 0x230a
143
+ 〔 95 LEFT TORTOISE SHELL BRACKET 0x3014
144
+ 【 36 LEFT BLACK LENTICULAR BRACKET 0x3010
145
+ ﴿ 14 ORNATE RIGHT PARENTHESIS 0xfd3f
146
+ _ 4851 LOW LINE 0x5f
147
+ $ 72 DOLLAR SIGN 0x24
148
+ € 14 EURO SIGN 0x20ac
149
+ £ 2 POUND SIGN 0xa3
150
+ ~ 27462 TILDE 0x7e
151
+ = 11450 EQUALS SIGN 0x3d
152
+ | 8430 VERTICAL LINE 0x7c
153
+ − 3971 MINUS SIGN 0x2212
154
+ ≫ 1904 MUCH GREATER-THAN 0x226b
155
+ ≪ 1903 MUCH LESS-THAN 0x226a
156
+ + 1450 PLUS SIGN 0x2b
157
+ < 345 FULLWIDTH LESS-THAN SIGN 0xff1c
158
+ > 344 FULLWIDTH GREATER-THAN SIGN 0xff1e
159
+ ¬ 5 NOT SIGN 0xac
160
+ × 4 MULTIPLICATION SIGN 0xd7
161
+ → 2 RIGHTWARDS ARROW 0x2192
162
+ ᙭ 537 CANADIAN SYLLABICS CHI SIGN 0x166d
163
+ ° 499 DEGREE SIGN 0xb0
164
+ ႟ 421 MYANMAR SYMBOL SHAN EXCLAMATION 0x109f
165
+ � 192 REPLACEMENT CHARACTER 0xfffd
166
+ ⌟ 54 BOTTOM RIGHT CORNER 0x231f
167
+ ⌞ 54 BOTTOM LEFT CORNER 0x231e
168
+ © 2 COPYRIGHT SIGN 0xa9
169
+   40 NARROW NO-BREAK SPACE 0x202f
170
+   1 SIX-PER-EM SPACE 0x2006
171
+ ˜ 40261 SMALL TILDE 0x2dc
172
+ ^ 6469 CIRCUMFLEX ACCENT 0x5e
173
+ ¯ 20 MACRON 0xaf
174
+ ˇ 191442 CARON 0x2c7
175
+ ⁿ 38144 SUPERSCRIPT LATIN SMALL LETTER N 0x207f
176
+ ـ 9440 ARABIC TATWEEL 0x640
177
+ ๆ 6766 THAI CHARACTER MAIYAMOK 0xe46
178
+ ៗ 3310 KHMER SIGN LEK TOO 0x17d7
179
+ 々 678 IDEOGRAPHIC ITERATION MARK 0x3005
180
+ ໆ 430 LAO KO LA 0xec6
181
+ ー 319 KATAKANA-HIRAGANA PROLONGED SOUND MARK 0x30fc
182
+ ⁱ 137 SUPERSCRIPT LATIN SMALL LETTER I 0x2071
183
+ ৷ 11056 BENGALI CURRENCY NUMERATOR FOUR 0x9f7
184
+ ⅓ 26 VULGAR FRACTION ONE THIRD 0x2153
185
+ ½ 26 VULGAR FRACTION ONE HALF 0xbd
186
+ ¼ 4 VULGAR FRACTION ONE QUARTER 0xbc
187
+ ⅟ 1 FRACTION NUMERATOR ONE 0x215f
188
+ ⁄ 57 FRACTION SLASH 0x2044
normalization/text_norm.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ import unicodedata
4
+
5
+ from normalization.norm_config import norm_config
6
+
7
+
8
def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False):
    """Normalize a text string according to the per-language ``norm_config``.

    The steps run in this order: Unicode normalization, optional lower-casing,
    removal of parenthesised spans, regex mappings, punctuation replaced by a
    space, deletion of the characters in the delete set, optional removal of
    digit-only words, optional diacritic stripping, whitespace collapsing.

    Args:
        text : The string to be normalized
        iso_code : Key into ``norm_config``; a code without an entry uses the "*" entry
        lower_case : Lower-case the text; only applied when the config's "lower_case" is also truthy
        remove_numbers : Boolean flag to specify if words containing only digits should be removed
        remove_brackets : If True, remove every "(...)" span; spans containing a digit are removed regardless

    Returns:
        normalized_text : the string after all normalization

    """

    default_config = norm_config["*"]
    # Merge into a fresh dict: fields missing from a language entry (including
    # "rm_diacritics") fall back to "*" without anything being written back
    # into the shared norm_config.
    config = {**default_config, **norm_config.get(iso_code, default_config)}

    text = unicodedata.normalize(config["unicode_norm"], text)

    # Convert to lower case
    if config["lower_case"] and lower_case:
        text = text.lower()

    # Brackets: always remove parenthesised text with a number in it.
    # This usually corresponds to a reference such as "(Sam 23:17)".
    text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
    if remove_brackets:
        text = re.sub(r"\([^\)]*\)", " ", text)

    # Apply mappings (keys are regex patterns, values are replacement templates)
    for old, new in config["mapping"].items():
        text = re.sub(old, new, text)

    # Replace punctuation with space
    punct_pattern = "[" + config["punc_set"] + "]"
    normalized_text = re.sub(punct_pattern, " ", text)

    # Remove characters in the delete list
    delete_pattern = "[" + config["del_set"] + "]"
    normalized_text = re.sub(delete_pattern, "", normalized_text)

    # Remove words containing only digits.
    # Three cases are covered: a) the text starts with a number, b) a number
    # sits somewhere in the middle, c) the text ends with a number. Lookarounds
    # require whitespace next to the digits without consuming it, so adjacent
    # numbers that share a space are all replaced.
    # NOTE: a text consisting of nothing but one number has no neighbouring
    # whitespace and is therefore left unchanged.
    if remove_numbers:
        digits_pattern = "[" + config["digit_set"] + "]+"
        complete_digit_pattern = (
            rf"^{digits_pattern}(?=\s)"
            rf"|(?<=\s){digits_pattern}(?=\s)"
            rf"|(?<=\s){digits_pattern}$"
        )
        normalized_text = re.sub(complete_digit_pattern, " ", normalized_text)

    if config.get("rm_diacritics"):
        # Imported lazily so unidecode is only needed for configs that ask for it.
        from unidecode import unidecode

        normalized_text = unidecode(normalized_text)

    # Remove extra spaces
    normalized_text = re.sub(r"\s+", " ", normalized_text).strip()

    return normalized_text
zeroshot.py CHANGED
@@ -9,6 +9,7 @@ import numpy as np
9
  from transformers import Wav2Vec2ForCTC, AutoProcessor
10
  from huggingface_hub import hf_hub_download
11
  from torchaudio.models.decoder import ctc_decoder
 
12
 
13
  uroman_dir = "uroman"
14
  assert os.path.exists(uroman_dir)
@@ -94,6 +95,7 @@ def load_words(filepath):
94
  with open(filepath) as f:
95
  for line in f:
96
  line = line.strip().lower()
 
97
  # ignore invalid words.
98
  for w in line.split():
99
  words.setdefault(w, 0)
@@ -109,7 +111,7 @@ def process(
109
  lmscore=None,
110
  wscore_usedefault=True,
111
  lmscore_usedefault=True,
112
- reference=None
113
  ):
114
  transcription, logs = "", MY_LOG()
115
  if not audio_data or not words_file:
@@ -169,7 +171,6 @@ def process(
169
 
170
  yield transcription, logs.add(f"Leixcon size: {len(lexicon)}")
171
 
172
-
173
  if lm_path is None:
174
  yield transcription, logs.add(f"Filtering lexicon....")
175
  lexicon = filter_lexicon(lexicon, word_counts)
@@ -219,8 +220,8 @@ def process(
219
  yield transcription, logs.add(f"[DONE]")
220
 
221
 
222
- # for i in process("upload/english/english.mp3", "upload/english/c4_5k_sentences.txt"):
223
- # print(i)
224
 
225
 
226
  # for i in process("upload/ligurian/ligurian_1.mp3", "upload/ligurian/zenamt_5k_sentences.txt"):
 
9
  from transformers import Wav2Vec2ForCTC, AutoProcessor
10
  from huggingface_hub import hf_hub_download
11
  from torchaudio.models.decoder import ctc_decoder
12
+ from normalization.text_norm import text_normalize
13
 
14
  uroman_dir = "uroman"
15
  assert os.path.exists(uroman_dir)
 
95
  with open(filepath) as f:
96
  for line in f:
97
  line = line.strip().lower()
98
+ line = text_normalize(line, iso_code="xxx")
99
  # ignore invalid words.
100
  for w in line.split():
101
  words.setdefault(w, 0)
 
111
  lmscore=None,
112
  wscore_usedefault=True,
113
  lmscore_usedefault=True,
114
+ reference=None,
115
  ):
116
  transcription, logs = "", MY_LOG()
117
  if not audio_data or not words_file:
 
171
 
172
  yield transcription, logs.add(f"Leixcon size: {len(lexicon)}")
173
 
 
174
  if lm_path is None:
175
  yield transcription, logs.add(f"Filtering lexicon....")
176
  lexicon = filter_lexicon(lexicon, word_counts)
 
220
  yield transcription, logs.add(f"[DONE]")
221
 
222
 
223
if __name__ == "__main__":
    # Manual smoke run of process() on the bundled English example. Guarded so
    # that merely importing this module does not start the run.
    for i in process("upload/english/english.mp3", "upload/english/c4_5k_sentences.txt"):
        print(i)
225
 
226
 
227
  # for i in process("upload/ligurian/ligurian_1.mp3", "upload/ligurian/zenamt_5k_sentences.txt"):