Artrajz committed on
Commit
bd5f307
1 Parent(s): ff648e9
text/__init__.py CHANGED
@@ -2,31 +2,31 @@
2
  from text import cleaners
3
 
4
 
5
- def text_to_sequence(text, symbols, cleaner_names):
6
- '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
7
- Args:
8
- text: string to convert to a sequence
9
- cleaner_names: names of the cleaner functions to run the text through
10
- Returns:
11
- List of integers corresponding to the symbols in the text
12
- '''
13
- _symbol_to_id = {s: i for i, s in enumerate(symbols)}
14
 
15
- sequence = []
16
 
17
- clean_text = _clean_text(text, cleaner_names)
18
- for symbol in clean_text:
19
- if symbol not in _symbol_to_id.keys():
20
- continue
21
- symbol_id = _symbol_to_id[symbol]
22
- sequence += [symbol_id]
23
- return sequence
 
24
 
25
 
26
  def _clean_text(text, cleaner_names):
27
- for name in cleaner_names:
28
- cleaner = getattr(cleaners, name)
29
- if not cleaner:
30
- raise Exception('Unknown cleaner: %s' % name)
31
- text = cleaner(text)
32
- return text
 
2
  from text import cleaners
3
 
4
 
5
def text_to_sequence(text, symbols, cleaner_names, bert_embedding=False):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
      Args:
        text: string to convert to a sequence
        symbols: ordered collection of symbols; a symbol's ID is its index
        cleaner_names: names of the cleaner functions to run the text through
        bert_embedding: when true, the cleaners must return a
          (cleaned_text, char_embeds) pair and cleaned_text is split on
          whitespace into symbols instead of being read character by character
      Returns:
        List of integers corresponding to the symbols in the text. When
        bert_embedding is true, a (sequence, char_embeds) tuple instead.
      Raises:
        KeyError: with bert_embedding, if a whitespace-separated token of the
          cleaned text is not present in symbols
    '''
    _symbol_to_id = {s: i for i, s in enumerate(symbols)}

    if bert_embedding:
        cleaned_text, char_embeds = _clean_text(text, cleaner_names)
        # NOTE(review): unlike the plain path below, unknown symbols are not
        # skipped here and raise KeyError. Presumably dropping them would
        # desynchronise the sequence from char_embeds - confirm.
        sequence = [_symbol_to_id[symbol] for symbol in cleaned_text.split()]
        return sequence, char_embeds

    cleaned_text = _clean_text(text, cleaner_names)
    # Characters without an ID are silently dropped.
    sequence = [_symbol_to_id[symbol] for symbol in cleaned_text if symbol in _symbol_to_id]
    return sequence
24
 
25
 
26
def _clean_text(text, cleaner_names):
    '''Runs text through each named cleaner, in order, and returns the result.
      Args:
        text: value handed to the first cleaner
        cleaner_names: names of attributes of the cleaners module to call;
          each cleaner receives the previous cleaner's return value
      Returns:
        Whatever the last cleaner returned (text unchanged if cleaner_names
        is empty)
      Raises:
        AttributeError: if a name does not resolve to a cleaner
    '''
    for name in cleaner_names:
        # Without a default, getattr raises a bare AttributeError before the
        # check below can run, so the "Unknown cleaner" message was unreachable.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            # AttributeError keeps the exception type callers already get for
            # a missing cleaner; it is still an Exception, as the raise intended.
            raise AttributeError('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
text/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (1.21 kB)
 
text/__pycache__/cantonese.cpython-310.pyc DELETED
Binary file (2.34 kB)
 
text/__pycache__/cleaners.cpython-310.pyc DELETED
Binary file (11 kB)
 
text/__pycache__/english.cpython-310.pyc DELETED
Binary file (4.69 kB)
 
text/__pycache__/japanese.cpython-310.pyc DELETED
Binary file (4.13 kB)
 
text/__pycache__/korean.cpython-310.pyc DELETED
Binary file (5.58 kB)
 
text/__pycache__/mandarin.cpython-310.pyc DELETED
Binary file (6.53 kB)
 
text/__pycache__/ngu_dialect.cpython-310.pyc DELETED
Binary file (1.17 kB)
 
text/__pycache__/shanghainese.cpython-310.pyc DELETED
Binary file (2.51 kB)
 
text/cantonese.py CHANGED
@@ -35,25 +35,6 @@ _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
35
  ('Z', 'iː˨sɛːt̚˥')
36
  ]]
37
 
38
- _symbols_to_chinese = [(re.compile(f'{x[0]}'), x[1]) for x in [
39
- ('([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
40
- ('([0-9]+)/([0-9]+)', r'\2分之\1'),
41
- ('\+', r'加'),
42
- ('([0-9]+)-([0-9]+)', r'\1减\2'),
43
- ('×', r'乘以'),
44
- ('([0-9]+)x([0-9]+)', r'\1乘以\2'),
45
- ('([0-9]+)\*([0-9]+)', r'\1乘以\2'),
46
- ('÷', r'除以'),
47
- ('=', r'等于'),
48
- ('≠', r'不等于'),
49
- ]]
50
-
51
-
52
- def symbols_to_chinese(text):
53
- for regex, replacement in _symbols_to_chinese:
54
- text = re.sub(regex, replacement, text)
55
- return text
56
-
57
 
58
  def number_to_cantonese(text):
59
  return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text)
@@ -66,6 +47,7 @@ def latin_to_ipa(text):
66
 
67
 
68
  def cantonese_to_ipa(text):
 
69
  text = symbols_to_chinese(text)
70
  text = number_to_cantonese(text.upper())
71
  text = converter.convert(text).replace('-', '').replace('$', ' ')
 
35
  ('Z', 'iː˨sɛːt̚˥')
36
  ]]
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  def number_to_cantonese(text):
40
  return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text)
 
47
 
48
 
49
  def cantonese_to_ipa(text):
50
+ from text.mandarin import symbols_to_chinese
51
  text = symbols_to_chinese(text)
52
  text = number_to_cantonese(text.upper())
53
  text = converter.convert(text).replace('-', '').replace('$', ' ')
text/cleaners.py CHANGED
@@ -247,3 +247,17 @@ def chinese_dialect_cleaners(text):
247
  text = re.sub(r'\s+$', '', text)
248
  text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
249
  return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  text = re.sub(r'\s+$', '', text)
248
  text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
249
  return text
250
+
251
+
252
def bert_chinese_cleaners(text):
    '''Cleans [ZH]-tagged text and converts it to phonemes plus character embeddings.

    The contents of every [ZH]...[ZH] span are concatenated, a "." is appended
    unless the result already ends with one of the accepted terminators, and
    the text is passed through mandarin.symbols_to_chinese and
    mandarin.number_transform_to_chinese. The front end returned by
    mandarin.VITS_PinYin_model() is created on the first call and cached as an
    attribute of this function.

      Args:
        text: input string containing [ZH]...[ZH] spans
      Returns:
        The (cleaned_text, char_embeds) pair returned by the front end's
        chinese_to_phonemes method
    '''
    from text import mandarin
    matches = re.findall(r"\[ZH\](.*?)\[ZH\]", text)
    text = "".join(matches)
    # endswith() is safe on an empty string, whereas text[-1] raised
    # IndexError when no [ZH] span matched; empty input now becomes ".".
    # NOTE(review): the list here held "," twice; the second entry is assumed
    # to have been a full-width comma lost to character normalisation - confirm.
    if not text.endswith((".", "。", ",", "，")):
        text += "."
    text = mandarin.symbols_to_chinese(text)
    text = mandarin.number_transform_to_chinese(text)
    if not hasattr(bert_chinese_cleaners, "tts_front"):
        bert_chinese_cleaners.tts_front = mandarin.VITS_PinYin_model()
    tts_front = bert_chinese_cleaners.tts_front
    cleaned_text, char_embeds = tts_front.chinese_to_phonemes(text)
    return cleaned_text, char_embeds
text/mandarin.py CHANGED
@@ -262,6 +262,11 @@ def number_to_chinese(text):
262
  return text
263
 
264
 
 
 
 
 
 
265
  def chinese_to_bopomofo(text):
266
  text = text.replace('、', ',').replace(';', ',').replace(':', ',')
267
  words = jieba.lcut(text, cut_all=False)
@@ -305,7 +310,7 @@ def bopomofo_to_ipa2(text):
305
 
306
  def chinese_to_romaji(text):
307
  text = symbols_to_chinese(text)
308
- text = number_to_chinese(text)
309
  text = chinese_to_bopomofo(text)
310
  text = latin_to_bopomofo(text)
311
  text = bopomofo_to_romaji(text)
@@ -326,7 +331,7 @@ def chinese_to_lazy_ipa(text):
326
 
327
  def chinese_to_ipa(text):
328
  text = symbols_to_chinese(text)
329
- text = number_to_chinese(text)
330
  text = chinese_to_bopomofo(text)
331
  text = latin_to_bopomofo(text)
332
  text = bopomofo_to_ipa(text)
@@ -340,7 +345,7 @@ def chinese_to_ipa(text):
340
 
341
  def chinese_to_ipa2(text):
342
  text = symbols_to_chinese(text)
343
- text = number_to_chinese(text)
344
  text = chinese_to_bopomofo(text)
345
  text = latin_to_bopomofo(text)
346
  text = bopomofo_to_ipa2(text)
@@ -349,3 +354,13 @@ def chinese_to_ipa2(text):
349
  text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text)
350
  text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text)
351
  return text
 
 
 
 
 
 
 
 
 
 
 
262
  return text
263
 
264
 
265
def number_transform_to_chinese(text):
    '''Returns text after passing it through cn2an.transform in "an2cn" mode.'''
    return cn2an.transform(text, "an2cn")
268
+
269
+
270
  def chinese_to_bopomofo(text):
271
  text = text.replace('、', ',').replace(';', ',').replace(':', ',')
272
  words = jieba.lcut(text, cut_all=False)
 
310
 
311
  def chinese_to_romaji(text):
312
  text = symbols_to_chinese(text)
313
+ text = number_transform_to_chinese(text)
314
  text = chinese_to_bopomofo(text)
315
  text = latin_to_bopomofo(text)
316
  text = bopomofo_to_romaji(text)
 
331
 
332
  def chinese_to_ipa(text):
333
  text = symbols_to_chinese(text)
334
+ text = number_transform_to_chinese(text)
335
  text = chinese_to_bopomofo(text)
336
  text = latin_to_bopomofo(text)
337
  text = bopomofo_to_ipa(text)
 
345
 
346
  def chinese_to_ipa2(text):
347
  text = symbols_to_chinese(text)
348
+ text = number_transform_to_chinese(text)
349
  text = chinese_to_bopomofo(text)
350
  text = latin_to_bopomofo(text)
351
  text = bopomofo_to_ipa2(text)
 
354
  text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text)
355
  text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text)
356
  return text
357
+
358
+
359
def VITS_PinYin_model():
    '''Builds the VITS_PinYin front end from the "bert" directory under config.ABS_PATH.

    torch, config and vits_pinyin are imported inside the function, so they
    are loaded only when it is called.

      Returns:
        A VITS_PinYin instance constructed on the "cuda" device when
        torch.cuda.is_available() is true, otherwise on "cpu"
    '''
    import torch
    import config
    from vits_pinyin import VITS_PinYin

    if torch.cuda.is_available():
        device_name = "cuda"
    else:
        device_name = "cpu"
    target_device = torch.device(device_name)
    return VITS_PinYin(f"{config.ABS_PATH}/bert", target_device)
text/shanghainese.py CHANGED
@@ -35,25 +35,6 @@ _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
35
  ('Z', 'zᴇ')
36
  ]]
37
 
38
- _symbols_to_chinese = [(re.compile(f'{x[0]}'), x[1]) for x in [
39
- ('([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
40
- ('([0-9]+)/([0-9]+)', r'\2分之\1'),
41
- ('\+', r'加'),
42
- ('([0-9]+)-([0-9]+)', r'\1减\2'),
43
- ('×', r'乘以'),
44
- ('([0-9]+)x([0-9]+)', r'\1乘以\2'),
45
- ('([0-9]+)\*([0-9]+)', r'\1乘以\2'),
46
- ('÷', r'除以'),
47
- ('=', r'等于'),
48
- ('≠', r'不等于'),
49
- ]]
50
-
51
-
52
- def symbols_to_chinese(text):
53
- for regex, replacement in _symbols_to_chinese:
54
- text = re.sub(regex, replacement, text)
55
- return text
56
-
57
 
58
  def _number_to_shanghainese(num):
59
  num = cn2an.an2cn(num).replace('一十', '十').replace('二十', '廿').replace('二', '两')
@@ -71,6 +52,7 @@ def latin_to_ipa(text):
71
 
72
 
73
  def shanghainese_to_ipa(text):
 
74
  text = symbols_to_chinese(text)
75
  text = number_to_shanghainese(text.upper())
76
  text = converter.convert(text).replace('-', '').replace('$', ' ')
 
35
  ('Z', 'zᴇ')
36
  ]]
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  def _number_to_shanghainese(num):
40
  num = cn2an.an2cn(num).replace('一十', '十').replace('二十', '廿').replace('二', '两')
 
52
 
53
 
54
  def shanghainese_to_ipa(text):
55
+ from text.mandarin import symbols_to_chinese
56
  text = symbols_to_chinese(text)
57
  text = number_to_shanghainese(text.upper())
58
  text = converter.convert(text).replace('-', '').replace('$', ' ')