piyazon committed on
Commit
a41e7db
·
1 Parent(s): 80b9339

Added number tokenizer

Browse files
Files changed (2) hide show
  1. app.py +363 -61
  2. requirements.txt +3 -1
app.py CHANGED
@@ -7,11 +7,309 @@ import io
7
  import os
8
  import string
9
  import unicodedata
 
 
 
 
 
 
 
 
10
  from huggingface_hub import login
11
 
12
  if os.environ.get("HF_TOKEN"):
13
  login(token=os.environ["HF_TOKEN"])
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  # Dictionary of available TTS models
16
  MODEL_OPTIONS = {
17
  "Uyghur (Arabic script, CV_Unique)": "piyazon/TTS-CV-Unique-Ug",
@@ -32,73 +330,75 @@ def load_model_and_tokenizer(model_name):
32
  tokenizer_cache[model_name] = AutoTokenizer.from_pretrained(MODEL_OPTIONS[model_name])
33
  return model_cache[model_name], tokenizer_cache[model_name]
34
 
35
- def fix_string(batch):
36
- batch = batch.lower()
37
- batch = unicodedata.normalize('NFKC', batch)
38
- extra_punctuation = "–؛;،؟?«»‹›−—¬”“•…" # Add your additional custom punctuation from the training set here
39
- all_punctuation = string.punctuation + extra_punctuation
40
- for char in all_punctuation:
41
- batch = batch.replace(char, ' ')
42
- ## replace ug chars
43
- # Replace 'ژ' with 'ج'
44
- batch = batch.replace('ژ', 'ج')
45
- batch = batch.replace('ک', 'ك')
46
- batch = batch.replace('ی', 'ى')
47
- batch = batch.replace('ه', 'ە')
48
- ## replace nums
49
- numbers_to_uyghur_map = {
50
- '0': ' نۆل ',
51
- '1': ' بىر ',
52
- '2': ' ئىككى ',
53
- '3': ' ئۈچ ',
54
- '4': ' تۆت ',
55
- '5': ' بەش ',
56
- '6': ' ئالتە ',
57
- '7': ' يەتتە ',
58
- '8': ' سەككىز ',
59
- '9': ' توققۇز '
60
- }
61
- for num_char, uyghur_char in numbers_to_uyghur_map.items():
62
- batch = batch.replace(num_char, uyghur_char)
63
- ## replace en chars
64
- english_to_uyghur_map = {
65
- 'a': ' ئېي ',
66
- 'b': ' بى ',
67
- 'c': ' سى ',
68
- 'd': ' دى ',
69
- 'e': ' ئى ',
70
- 'f': ' ئەف ',
71
- 'g': ' جى ',
72
- 'h': ' ئېچ ',
73
- 'i': ' ئاي ',
74
- 'j': ' جېي ',
75
- 'k': ' کېي ',
76
- 'l': ' ئەل ',
77
- 'm': ' ئەم ',
78
- 'n': ' ئېن ',
79
- 'o': ' ئو ',
80
- 'p': ' پى ',
81
- 'q': ' كىيۇ ',
82
- 'r': ' ئار ',
83
- 's': ' ئەس ',
84
- 't': ' تى ',
85
- 'u': ' يۇ ',
86
- 'v': ' ۋى ',
87
- 'w': ' دابىلىيۇ ',
88
- 'x': ' ئېكىس ',
89
- 'y': ' ۋاي ',
90
- 'z': ' زى ',
91
- }
92
- for eng_char, uyghur_char in english_to_uyghur_map.items():
93
- batch = batch.replace(eng_char, uyghur_char)
94
- return batch
95
 
96
  def text_to_speech(text, model_name):
97
  # Load the selected model and tokenizer
98
  model, tokenizer = load_model_and_tokenizer(model_name)
99
 
 
 
100
  # Tokenize input text
101
- inputs = tokenizer(fix_string(text), return_tensors="pt")
102
 
103
  # Generate speech waveform
104
  with torch.no_grad():
@@ -123,6 +423,8 @@ def text_to_speech(text, model_name):
123
 
124
  # Define examples for Gradio Examples component
125
  examples = [
 
 
126
  ["يەنىمۇ ئىلگىرىلىگەن ھالدا تەجرىبە قىلىپ دەلىللەش ۋە تەتقىق قىلىشقا تېگىشلىك بەزى نەزەرىيەلەرنى ھېسابقا ئالمىغاندا، كۆپ قىسىم پىلانلارنىڭ ھەممىسى تاماملانغان، شۇڭا مۇمكىنچىلىك قاتلىمىدىن ئېيتقاندا مانخاتتان پىلانىدا ھېچقانداق مەسىلە يوق.", "Uyghur (Arabic script, CV_Unique)"],
127
  ["ھەممە ئادەم ئەركىن بولۇپ تۇغۇلىدۇ، ھەمدە ئىززەت-ھۆرمەت ۋە ھوقۇقتا باب-باراۋەر بولىدۇ.", "Uyghur (Arabic script, AliKurban)"],
128
  ["بىز ئىنسانلارنىڭ ھەممىسى بىرلىكتە ياشايمىز. ھەر بىر ئادەم ئۆزىنىڭ يولىنى تاللىيالايدۇ.", "Uyghur (Arabic script, QutadguBilik)"],
 
7
  import os
8
  import string
9
  import unicodedata
10
+ from pypinyin import pinyin, Style
11
+ import re
12
+ from umsc import UgMultiScriptConverter
13
+
14
+ # Initialize uyghur script converter
15
+ ug_arab_to_latn = UgMultiScriptConverter('UAS', 'ULS')
16
+ ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')
17
+
18
  from huggingface_hub import login
19
 
20
  if os.environ.get("HF_TOKEN"):
21
  login(token=os.environ["HF_TOKEN"])
22
 
23
+
24
# Lookup tables for spelling numbers out in Uyghur (Arabic script).
_UG_DIGITS = {
    0: 'نۆل', 1: 'بىر', 2: 'ئىككى', 3: 'ئۈچ', 4: 'تۆت', 5: 'بەش',
    6: 'ئالتە', 7: 'يەتتە', 8: 'سەككىز', 9: 'توققۇز',
}
# NOTE(review): the suffix is spelled with 'ج' throughout this file; confirm
# this is intentional for the TTS vocabulary rather than 'چ'.
_UG_ORDINAL_SUFFIX = 'ىنجى'
_UG_ORDINALS = {
    1: 'بىرىنجى', 2: 'ئىككىنجى', 3: 'ئۈچىنجى', 4: 'تۆتىنجى', 5: 'بەشىنجى',
    6: 'ئالتىنجى', 7: 'يەتتىنجى', 8: 'سەككىزىنجى', 9: 'توققۇزىنجى',
}
_UG_TENS = {
    10: 'ئون', 20: 'يىگىرمە', 30: 'ئوتتۇز', 40: 'قىرىق', 50: 'ئەللىك',
    60: 'ئاتمىش', 70: 'يەتمىش', 80: 'سەكسەن', 90: 'توقسان',
}
# Ordered from largest to smallest so the integer is peeled greedily.
_UG_UNITS = (
    (1000000000, 'مىليارد'),  # billion
    (1000000, 'مىليون'),  # million
    (1000, 'مىڭ'),  # thousand
    (100, 'يۈز'),  # hundred
)
# Fraction term keyed by the number of significant decimal places.
_UG_FRACTIONS = {
    1: 'ئوندا',  # tenths
    2: 'يۈزدە',  # hundredths
    3: 'مىڭدە',  # thousandths
    4: 'ئون مىڭدە',  # ten-thousandths
    5: 'يۈز مىڭدە',  # hundred-thousandths
    6: 'مىليوندا',  # millionths
    7: 'ئون مىليوندا',  # ten-millionths
    8: 'يۈز مىليوندا',  # hundred-millionths
    9: 'مىليارددا',  # billionths
}
_UG_MAX_DIGITS = 9
_UG_MAX_ORDINAL = 999999999


def _ug_integer_to_words(num):
    """Spell a non-negative integer as space-separated Uyghur words."""
    num = int(num)
    if num == 0:
        return _UG_DIGITS[0]

    words = []
    for value, unit_name in _UG_UNITS:
        if num >= value:
            count, num = divmod(num, value)
            if count == 1:
                # A single unit is spoken bare, e.g. 100 -> "يۈز".
                words.append(unit_name)
            else:
                words.append(_ug_integer_to_words(count) + ' ' + unit_name)

    # At this point num < 100.
    if num >= 10:
        ones = num % 10
        words.append(_UG_TENS[num - ones])
        if ones:
            words.append(_UG_DIGITS[ones])
    elif num > 0:
        words.append(_UG_DIGITS[num])

    return ' '.join(words)


def number_to_uyghur_arabic_script(number_str):
    """
    Convert a number written with ASCII digits to its Uyghur pronunciation.

    Supported forms: integers ('123'), decimals ('0.001'), single-digit
    fractions ('1/4'), percentages ('25%') and ordinals marked with a
    trailing '_' or '-' ('1968_', '16-'). Commas and spaces are ignored.
    The decimal part is read as a whole number preceded by its fraction
    term; trailing zeros in the decimal part are not pronounced.

    Args:
        number_str (str): The number to convert.

    Returns:
        str: The pronunciation in Arabic script. Ordinal forms end with a
        single trailing space, because the marker that separated the
        number from the following word has been consumed. Inputs outside
        the supported range (more than 9 integer or decimal digits,
        multi-digit fractions) are returned as the cleaned input string.

    Raises:
        ValueError: If a numeric part of the input cannot be parsed by
            ``int`` (e.g. an empty numerator).
    """
    # Clean the input (remove commas or spaces)
    number_str = number_str.replace(',', '').replace(' ', '')

    # Ordinal: trailing '_' or '-'
    if number_str.endswith(('_', '-')):
        number_str = number_str[:-1]
        num = int(number_str)
        if num > _UG_MAX_ORDINAL:
            return number_str
        if num in _UG_ORDINALS:
            return _UG_ORDINALS[num] + ' '

        words = _ug_integer_to_words(num).split()
        last_digit = num % 10
        if last_digit:
            # The final word is a digit word: swap in its ordinal form.
            words[-1] = _UG_ORDINALS[last_digit]
        else:
            # Round tens, hundreds, ... (and zero) just take the suffix.
            words[-1] += _UG_ORDINAL_SUFFIX
        return ' '.join(words) + ' '

    # Percentage: strip the sign now, append the term at the end.
    is_percentage = number_str.endswith('%')
    if is_percentage:
        number_str = number_str[:-1]

    # Fraction: only single-digit numerators and denominators are spoken.
    if '/' in number_str:
        numerator, denominator = map(int, number_str.split('/'))
        if numerator in _UG_DIGITS and denominator in _UG_DIGITS:
            return f"{_UG_DIGITS[denominator]}دە {_UG_DIGITS[numerator]}"
        return number_str

    # Split into integer and decimal parts
    parts = number_str.split('.')
    integer_part = parts[0]
    decimal_part = parts[1] if len(parts) > 1 else None

    # Both parts are limited to 9 digits; longer input is passed through.
    if len(integer_part) > _UG_MAX_DIGITS:
        return number_str
    if decimal_part and len(decimal_part) > _UG_MAX_DIGITS:
        return number_str

    pronunciation = _ug_integer_to_words(int(integer_part))

    if decimal_part:
        pronunciation += ' پۈتۈن'
        # Trailing zeros carry no value; an all-zero decimal part
        # ('0', '00', ...) is therefore not pronounced at all.
        significant = decimal_part.rstrip('0')
        if significant:
            fraction_term = _UG_FRACTIONS.get(len(significant), 'مىليارددا')
            pronunciation += (
                ' ' + fraction_term + ' ' + _ug_integer_to_words(int(significant))
            )

    if is_percentage:
        pronunciation += ' پىرسەنت'

    return pronunciation.strip()
176
+
177
+
178
def _is_ug_number_token(token):
    """Return True if *token* is a fraction, percentage, ordinal, decimal or integer."""
    if token.count('/') == 1:
        # Fraction: e.g. "1/4"
        numerator, denominator = token.split('/')
        return numerator.isdigit() and denominator.isdigit()
    if token.endswith(('%', '_', '-')):
        # Percentage ("25%") or ordinal ("1_", "16-")
        return token[:-1].isdigit()
    if token.count('.') == 1:
        # Decimal: e.g. "3.14"
        whole, frac = token.split('.')
        return whole.isdigit() and frac.isdigit()
    # Integer: e.g. "123"
    return token.isdigit()


def process_uyghur_text_with_numbers(text):
    """
    Replace the numbers in a string with their Uyghur pronunciation.

    The text is scanned for maximal runs of ASCII digits and the symbols
    '/', '.', '%', '_' and '-'. A run that forms a fraction, percentage,
    ordinal, decimal or integer is passed to
    ``number_to_uyghur_arabic_script``; every other character is copied
    through unchanged. '%' signs are spelled out before scanning.

    A run that is only invalid because of trailing '.' characters (a number
    at the end of a sentence, e.g. "1945.") is converted and the dots are
    kept after it. Converted ordinals are always followed by a space, since
    the ordinal marker that separated the number from the next word is
    consumed by the conversion.

    Args:
        text (str): Input string with Uyghur text and numbers
            (e.g., '1/4 كىلو 25% تەملىك').

    Returns:
        str: The text with numbers spelled out; runs that cannot be
        converted are left as they were.
    """
    text = text.replace('%', ' پىرسەنت ')
    number_chars = frozenset('0123456789' + '/.%_-')

    pieces = []
    pos = 0
    end = len(text)
    while pos < end:
        if text[pos] not in number_chars:
            # Whitespace and ordinary text are copied through as-is.
            pieces.append(text[pos])
            pos += 1
            continue

        # Collect the whole run of potential number characters.
        start = pos
        while pos < end and text[pos] in number_chars:
            pos += 1
        token = text[start:pos]

        core = token
        if not _is_ug_number_token(core):
            # Retry without sentence-final dots: "1945." -> "1945" + "."
            core = token.rstrip('.')
        tail = token[len(core):]

        if not _is_ug_number_token(core):
            # Not a recognisable number format: keep the raw text.
            pieces.append(token)
            continue

        try:
            spoken = number_to_uyghur_arabic_script(core)
        except ValueError:
            # If conversion fails, keep the original run.
            pieces.append(token)
            continue

        if core.endswith(('_', '-')) and not spoken.endswith(' '):
            # Keep the ordinal from being glued to the following word.
            spoken += ' '
        pieces.append(spoken + tail)

    return ''.join(pieces)
258
+
259
# Characters the TTS models accept; everything else is turned into spacing.
_UG_TTS_VOCAB = frozenset([
    " ", "ئ", "ا", "ب", "ت", "ج", "خ", "د", "ر", "ز", "س", "ش", "غ", "ف",
    "ق", "ك", "ل", "م", "ن", "و", "ى", "ي", "پ", "چ", "ڭ", "گ", "ھ", "ۆ",
    "ۇ", "ۈ", "ۋ", "ې", "ە",
])
# Look-alike letters mapped onto the letters present in the vocabulary.
_UG_LETTER_FIXES = str.maketrans({
    'ژ': 'ج',
    'ک': 'ك',
    'ی': 'ى',
    'ه': 'ە',
})
_UG_SENTENCE_ENDINGS = frozenset({'.', '?', '؟'})


def fix_pauctuations(batch):
    """
    Normalise text so that it only contains characters from the TTS vocabulary.

    The text is lower-cased, NFKC-normalised, and look-alike letters are
    mapped onto their vocabulary counterparts. Sentence-ending punctuation
    ('.', '?', '؟') becomes two spaces and every other out-of-vocabulary
    character becomes one space, so the string never shrinks.

    Args:
        batch (str): Text to normalise.

    Returns:
        str: The text restricted to vocabulary characters and spaces.
    """
    batch = unicodedata.normalize('NFKC', batch.lower())
    batch = batch.translate(_UG_LETTER_FIXES)

    result = []
    for char in batch:
        if char in _UG_TTS_VOCAB:
            result.append(char)
        elif char in _UG_SENTENCE_ENDINGS:
            # Two spaces, so a sentence break stays distinguishable from
            # ordinary stripped punctuation.
            result.append('  ')
        else:
            result.append(' ')

    return ''.join(result)
287
+
288
def chinese_to_pinyin(mixed_text):
    """
    Romanise the Chinese characters of a mixed-language string.

    Every run of characters in the CJK Unified Ideographs range
    (U+4E00-U+9FFF) is replaced by its Pinyin syllables without tone
    marks, separated by single spaces. All other text is returned
    untouched.

    Args:
        mixed_text (str): Text that may contain Chinese characters next to
            other languages (e.g., English, Uyghur).

    Returns:
        str: The text with each Chinese run replaced by toneless Pinyin.
    """
    def _romanise(found):
        # pinyin() yields one list of candidate readings per character;
        # the first candidate of each is used.
        readings = pinyin(found.group(0), style=Style.NORMAL)
        return ' '.join(candidates[0] for candidates in readings)

    return re.sub(r'[\u4e00-\u9fff]+', _romanise, mixed_text)
311
+
312
+
313
  # Dictionary of available TTS models
314
  MODEL_OPTIONS = {
315
  "Uyghur (Arabic script, CV_Unique)": "piyazon/TTS-CV-Unique-Ug",
 
330
  tokenizer_cache[model_name] = AutoTokenizer.from_pretrained(MODEL_OPTIONS[model_name])
331
  return model_cache[model_name], tokenizer_cache[model_name]
332
 
333
+ # def fix_string(batch):
334
+ # batch = batch.lower()
335
+ # batch = unicodedata.normalize('NFKC', batch)
336
+ # extra_punctuation = "–؛;،؟?«»‹›−—¬”“•…" # Add your additional custom punctuation from the training set here
337
+ # all_punctuation = string.punctuation + extra_punctuation
338
+ # for char in all_punctuation:
339
+ # batch = batch.replace(char, ' ')
340
+ # ## replace ug chars
341
+ # # Replace 'ژ' with 'ج'
342
+ # batch = batch.replace('ژ', 'ج')
343
+ # batch = batch.replace('ک', 'ك')
344
+ # batch = batch.replace('ی', 'ى')
345
+ # batch = batch.replace('ه', 'ە')
346
+ # ## replace nums
347
+ # numbers_to_uyghur_map = {
348
+ # '0': ' نۆل ',
349
+ # '1': ' بىر ',
350
+ # '2': ' ئىككى ',
351
+ # '3': ' ئۈچ ',
352
+ # '4': ' تۆت ',
353
+ # '5': ' بەش ',
354
+ # '6': ' ئالتە ',
355
+ # '7': ' يەتتە ',
356
+ # '8': ' سەككىز ',
357
+ # '9': ' توققۇز '
358
+ # }
359
+ # for num_char, uyghur_char in numbers_to_uyghur_map.items():
360
+ # batch = batch.replace(num_char, uyghur_char)
361
+ # ## replace en chars
362
+ # english_to_uyghur_map = {
363
+ # 'a': ' ئېي ',
364
+ # 'b': ' بى ',
365
+ # 'c': ' سى ',
366
+ # 'd': ' دى ',
367
+ # 'e': ' ئى ',
368
+ # 'f': ' ئەف ',
369
+ # 'g': ' جى ',
370
+ # 'h': ' ئېچ ',
371
+ # 'i': ' ئاي ',
372
+ # 'j': ' جېي ',
373
+ # 'k': ' کېي ',
374
+ # 'l': ' ئەل ',
375
+ # 'm': ' ئەم ',
376
+ # 'n': ' ئېن ',
377
+ # 'o': ' ئو ',
378
+ # 'p': ' پى ',
379
+ # 'q': ' كىيۇ ',
380
+ # 'r': ' ئار ',
381
+ # 's': ' ئەس ',
382
+ # 't': ' تى ',
383
+ # 'u': ' يۇ ',
384
+ # 'v': ' ۋى ',
385
+ # 'w': ' دابىلىيۇ ',
386
+ # 'x': ' ئېكىس ',
387
+ # 'y': ' ۋاي ',
388
+ # 'z': ' زى ',
389
+ # }
390
+ # for eng_char, uyghur_char in english_to_uyghur_map.items():
391
+ # batch = batch.replace(eng_char, uyghur_char)
392
+ # return batch
393
 
394
  def text_to_speech(text, model_name):
395
  # Load the selected model and tokenizer
396
  model, tokenizer = load_model_and_tokenizer(model_name)
397
 
398
+ fixted_text = fix_pauctuations(process_uyghur_text_with_numbers(ug_latn_to_arab(chinese_to_pinyin(text))))
399
+ print(fixted_text)
400
  # Tokenize input text
401
+ inputs = tokenizer(fix_string(fixted_text), return_tensors="pt")
402
 
403
  # Generate speech waveform
404
  with torch.no_grad():
 
423
 
424
  # Define examples for Gradio Examples component
425
  examples = [
426
+ ["« ئوكسفورد ئىنگلىز تىلى لۇغىتى» گە ئاساسلانغاندا، « دەرىجىدىن تاشقىرى چوڭ دۆلەت (superpow) » دېگەن بۇ ئاتالغۇ ئەڭ بۇرۇن 1930-يىلى تىلغا ئېلىنغان. ئىنگلىز تىلىدىكى بۇ ئاتالغۇ بىرقەدەر بۇرۇنقى« powers» (يەنى« كۈچلۈك دۆلەتلەر» ) ۋە« great power» (يەنى« چوڭ دۆلەت» ) دىن كەلگەن. ", "Uyghur (Arabic script, Radio-Ug)"],
427
+ ["ئامېرىكا ئارمىيەسى 1945-يىلى 7-ئاينىڭ 16-كۈنى دۇنيا بويىچە تۇنجى قېتىم« ئۈچنى بىر گەۋدىلەشتۈرۈش» يادرو سىنىقىنى ئېلىپ باردى", "Uyghur (Arabic script, Radio-Ug)"],
428
  ["يەنىمۇ ئىلگىرىلىگەن ھالدا تەجرىبە قىلىپ دەلىللەش ۋە تەتقىق قىلىشقا تېگىشلىك بەزى نەزەرىيەلەرنى ھېسابقا ئالمىغاندا، كۆپ قىسىم پىلانلارنىڭ ھەممىسى تاماملانغان، شۇڭا مۇمكىنچىلىك قاتلىمىدىن ئېيتقاندا مانخاتتان پىلانىدا ھېچقانداق مەسىلە يوق.", "Uyghur (Arabic script, CV_Unique)"],
429
  ["ھەممە ئادەم ئەركىن بولۇپ تۇغۇلىدۇ، ھەمدە ئىززەت-ھۆرمەت ۋە ھوقۇقتا باب-باراۋەر بولىدۇ.", "Uyghur (Arabic script, AliKurban)"],
430
  ["بىز ئىنسانلارنىڭ ھەممىسى بىرلىكتە ياشايمىز. ھەر بىر ئادەم ئۆزىنىڭ يولىنى تاللىيالايدۇ.", "Uyghur (Arabic script, QutadguBilik)"],
requirements.txt CHANGED
@@ -2,4 +2,6 @@ transformers
2
  torch
3
  soundfile
4
  numpy
5
- huggingface_hub
 
 
 
2
  torch
3
  soundfile
4
  numpy
5
+ huggingface_hub
6
+ umsc
7
+ pypinyin