SohomToom committed
Commit be83d8c · verified · 1 Parent(s): c4a8041

Update MeloTTS/melo/text/japanese.py

Files changed (1)
  1. MeloTTS/melo/text/japanese.py +647 -647
MeloTTS/melo/text/japanese.py CHANGED
@@ -1,647 +1,647 @@
 # Convert Japanese text to phonemes which is
 # compatible with Julius https://github.com/julius-speech/segmentation-kit
 import re
 import unicodedata

 from transformers import AutoTokenizer

 from . import symbols
 punctuation = ["!", "?", "…", ",", ".", "'", "-"]

 try:
     import MeCab
 except ImportError as e:
     raise ImportError("Japanese requires mecab-python3 and unidic-lite.") from e
 from num2words import num2words

 _CONVRULES = [
     # Conversion of 2 letters
     "アァ/ a a",
     "イィ/ i i",
     "イェ/ i e",
     "イャ/ y a",
     "ウゥ/ u:",
     "エェ/ e e",
     "オォ/ o:",
     "カァ/ k a:",
     "キィ/ k i:",
     "クゥ/ k u:",
     "クャ/ ky a",
     "クュ/ ky u",
     "クョ/ ky o",
     "ケェ/ k e:",
     "コォ/ k o:",
     "ガァ/ g a:",
     "ギィ/ g i:",
     "グゥ/ g u:",
     "グャ/ gy a",
     "グュ/ gy u",
     "グョ/ gy o",
     "ゲェ/ g e:",
     "ゴォ/ g o:",
     "サァ/ s a:",
     "シィ/ sh i:",
     "スゥ/ s u:",
     "スャ/ sh a",
     "スュ/ sh u",
     "スョ/ sh o",
     "セェ/ s e:",
     "ソォ/ s o:",
     "ザァ/ z a:",
     "ジィ/ j i:",
     "ズゥ/ z u:",
     "ズャ/ zy a",
     "ズュ/ zy u",
     "ズョ/ zy o",
     "ゼェ/ z e:",
     "ゾォ/ z o:",
     "タァ/ t a:",
     "チィ/ ch i:",
     "ツァ/ ts a",
     "ツィ/ ts i",
     "ツゥ/ ts u:",
     "ツャ/ ch a",
     "ツュ/ ch u",
     "ツョ/ ch o",
     "ツェ/ ts e",
     "ツォ/ ts o",
     "テェ/ t e:",
     "トォ/ t o:",
     "ダァ/ d a:",
     "ヂィ/ j i:",
     "ヅゥ/ d u:",
     "ヅャ/ zy a",
     "ヅュ/ zy u",
     "ヅョ/ zy o",
     "デェ/ d e:",
     "ドォ/ d o:",
     "ナァ/ n a:",
     "ニィ/ n i:",
     "ヌゥ/ n u:",
     "ヌャ/ ny a",
     "ヌュ/ ny u",
     "ヌョ/ ny o",
     "ネェ/ n e:",
     "ノォ/ n o:",
     "ハァ/ h a:",
     "ヒィ/ h i:",
     "フゥ/ f u:",
     "フャ/ hy a",
     "フュ/ hy u",
     "フョ/ hy o",
     "ヘェ/ h e:",
     "ホォ/ h o:",
     "バァ/ b a:",
     "ビィ/ b i:",
     "ブゥ/ b u:",
     "フャ/ hy a",
     "ブュ/ by u",
     "フョ/ hy o",
     "ベェ/ b e:",
     "ボォ/ b o:",
     "パァ/ p a:",
     "ピィ/ p i:",
     "プゥ/ p u:",
     "プャ/ py a",
     "プュ/ py u",
     "プョ/ py o",
     "ペェ/ p e:",
     "ポォ/ p o:",
     "マァ/ m a:",
     "ミィ/ m i:",
     "ムゥ/ m u:",
     "ムャ/ my a",
     "ムュ/ my u",
     "ムョ/ my o",
     "メェ/ m e:",
     "モォ/ m o:",
     "ヤァ/ y a:",
     "ユゥ/ y u:",
     "ユャ/ y a:",
     "ユュ/ y u:",
     "ユョ/ y o:",
     "ヨォ/ y o:",
     "ラァ/ r a:",
     "リィ/ r i:",
     "ルゥ/ r u:",
     "ルャ/ ry a",
     "ルュ/ ry u",
     "ルョ/ ry o",
     "レェ/ r e:",
     "ロォ/ r o:",
     "ワァ/ w a:",
     "ヲォ/ o:",
     "ディ/ d i",
     "デェ/ d e:",
     "デャ/ dy a",
     "デュ/ dy u",
     "デョ/ dy o",
     "ティ/ t i",
     "テェ/ t e:",
     "テャ/ ty a",
     "テュ/ ty u",
     "テョ/ ty o",
     "スィ/ s i",
     "ズァ/ z u a",
     "ズィ/ z i",
     "ズゥ/ z u",
     "ズャ/ zy a",
     "ズュ/ zy u",
     "ズョ/ zy o",
     "ズェ/ z e",
     "ズォ/ z o",
     "キャ/ ky a",
     "キュ/ ky u",
     "キョ/ ky o",
     "シャ/ sh a",
     "シュ/ sh u",
     "シェ/ sh e",
     "ショ/ sh o",
     "チャ/ ch a",
     "チュ/ ch u",
     "チェ/ ch e",
     "チョ/ ch o",
     "トゥ/ t u",
     "トャ/ ty a",
     "トュ/ ty u",
     "トョ/ ty o",
     "ドァ/ d o a",
     "ドゥ/ d u",
     "ドャ/ dy a",
     "ドュ/ dy u",
     "ドョ/ dy o",
     "ドォ/ d o:",
     "ニャ/ ny a",
     "ニュ/ ny u",
     "ニョ/ ny o",
     "ヒャ/ hy a",
     "ヒュ/ hy u",
     "ヒョ/ hy o",
     "ミャ/ my a",
     "ミュ/ my u",
     "ミョ/ my o",
     "リャ/ ry a",
     "リュ/ ry u",
     "リョ/ ry o",
     "ギャ/ gy a",
     "ギュ/ gy u",
     "ギョ/ gy o",
     "ヂェ/ j e",
     "ヂャ/ j a",
     "ヂュ/ j u",
     "ヂョ/ j o",
     "ジェ/ j e",
     "ジャ/ j a",
     "ジュ/ j u",
     "ジョ/ j o",
     "ビャ/ by a",
     "ビュ/ by u",
     "ビョ/ by o",
     "ピャ/ py a",
     "ピュ/ py u",
     "ピョ/ py o",
     "ウァ/ u a",
     "ウィ/ w i",
     "ウェ/ w e",
     "ウォ/ w o",
     "ファ/ f a",
     "フィ/ f i",
     "フゥ/ f u",
     "フャ/ hy a",
     "フュ/ hy u",
     "フョ/ hy o",
     "フェ/ f e",
     "フォ/ f o",
     "ヴァ/ b a",
     "ヴィ/ b i",
     "ヴェ/ b e",
     "ヴォ/ b o",
     "ヴュ/ by u",
     # Conversion of 1 letter
     "ア/ a",
     "イ/ i",
     "ウ/ u",
     "エ/ e",
     "オ/ o",
     "カ/ k a",
     "キ/ k i",
     "ク/ k u",
     "ケ/ k e",
     "コ/ k o",
     "サ/ s a",
     "シ/ sh i",
     "ス/ s u",
     "セ/ s e",
     "ソ/ s o",
     "タ/ t a",
     "チ/ ch i",
     "ツ/ ts u",
     "テ/ t e",
     "ト/ t o",
     "ナ/ n a",
     "ニ/ n i",
     "ヌ/ n u",
     "ネ/ n e",
     "ノ/ n o",
     "ハ/ h a",
     "ヒ/ h i",
     "フ/ f u",
     "ヘ/ h e",
     "ホ/ h o",
     "マ/ m a",
     "ミ/ m i",
     "ム/ m u",
     "メ/ m e",
     "モ/ m o",
     "ラ/ r a",
     "リ/ r i",
     "ル/ r u",
     "レ/ r e",
     "ロ/ r o",
     "ガ/ g a",
     "ギ/ g i",
     "グ/ g u",
     "ゲ/ g e",
     "ゴ/ g o",
     "ザ/ z a",
     "ジ/ j i",
     "ズ/ z u",
     "ゼ/ z e",
     "ゾ/ z o",
     "ダ/ d a",
     "ヂ/ j i",
     "ヅ/ z u",
     "デ/ d e",
     "ド/ d o",
     "バ/ b a",
     "ビ/ b i",
     "ブ/ b u",
     "ベ/ b e",
     "ボ/ b o",
     "パ/ p a",
     "ピ/ p i",
     "プ/ p u",
     "ペ/ p e",
     "ポ/ p o",
     "ヤ/ y a",
     "ユ/ y u",
     "ヨ/ y o",
     "ワ/ w a",
     "ヰ/ i",
     "ヱ/ e",
     "ヲ/ o",
     "ン/ N",
     "ッ/ q",
     "ヴ/ b u",
     "ー/:",
     # Try converting broken text
     "ァ/ a",
     "ィ/ i",
     "ゥ/ u",
     "ェ/ e",
     "ォ/ o",
     "ヮ/ w a",
     "ォ/ o",
     # Try converting broken text
     "ャ/ y a",
     "ョ/ y o",
     "ュ/ y u",
     "琦/ ch i",
     "ヶ/ k e",
     "髙/ t a k a",
     "煞/ sh y a",
     # Symbols
     "、/ ,",
     "。/ .",
     "!/ !",
     "?/ ?",
     "・/ ,",
 ]

 _COLON_RX = re.compile(":+")
 _REJECT_RX = re.compile("[^ a-zA-Z:,.?]")


 def _makerulemap():
     l = [tuple(x.split("/")) for x in _CONVRULES]
     return tuple({k: v for k, v in l if len(k) == i} for i in (1, 2))


 _RULEMAP1, _RULEMAP2 = _makerulemap()


 def kata2phoneme(text: str) -> str:
     """Convert katakana text to phonemes."""
     text = text.strip()
     res = []
     while text:
         if len(text) >= 2:
             x = _RULEMAP2.get(text[:2])
             if x is not None:
                 text = text[2:]
                 res += x.split(" ")[1:]
                 continue
         x = _RULEMAP1.get(text[0])
         if x is not None:
             text = text[1:]
             res += x.split(" ")[1:]
             continue
         res.append(text[0])
         text = text[1:]
     # res = _COLON_RX.sub(":", res)
     return res


 _KATAKANA = "".join(chr(ch) for ch in range(ord("ァ"), ord("ン") + 1))
 _HIRAGANA = "".join(chr(ch) for ch in range(ord("ぁ"), ord("ん") + 1))
 _HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA)


 def hira2kata(text: str) -> str:
     text = text.translate(_HIRA2KATATRANS)
     return text.replace("う゛", "ヴ")


 _SYMBOL_TOKENS = set(list("・、。?!"))
 _NO_YOMI_TOKENS = set(list("「」『』―()[][]"))
-_TAGGER = MeCab.Tagger()
+_TAGGER = None #MeCab.Tagger()


 def text2kata(text: str) -> str:
     parsed = _TAGGER.parse(text)
     res = []
     for line in parsed.split("\n"):
         if line == "EOS":
             break
         parts = line.split("\t")

         word, yomi = parts[0], parts[1]
         if yomi:
             try:
                 res.append(yomi.split(',')[6])
             except:
                 import pdb; pdb.set_trace()
         else:
             if word in _SYMBOL_TOKENS:
                 res.append(word)
             elif word in ("っ", "ッ"):
                 res.append("ッ")
             elif word in _NO_YOMI_TOKENS:
                 pass
             else:
                 res.append(word)
     return hira2kata("".join(res))


 _ALPHASYMBOL_YOMI = {
     "#": "シャープ",
     "%": "パーセント",
     "&": "アンド",
     "+": "プラス",
     "-": "マイナス",
     ":": "コロン",
     ";": "セミコロン",
     "<": "小なり",
     "=": "イコール",
     ">": "大なり",
     "@": "アット",
     "a": "エー",
     "b": "ビー",
     "c": "シー",
     "d": "ディー",
     "e": "イー",
     "f": "エフ",
     "g": "ジー",
     "h": "エイチ",
     "i": "アイ",
     "j": "ジェー",
     "k": "ケー",
     "l": "エル",
     "m": "エム",
     "n": "エヌ",
     "o": "オー",
     "p": "ピー",
     "q": "キュー",
     "r": "アール",
     "s": "エス",
     "t": "ティー",
     "u": "ユー",
     "v": "ブイ",
     "w": "ダブリュー",
     "x": "エックス",
     "y": "ワイ",
     "z": "ゼット",
     "α": "アルファ",
     "β": "ベータ",
     "γ": "ガンマ",
     "δ": "デルタ",
     "ε": "イプシロン",
     "ζ": "ゼータ",
     "η": "イータ",
     "θ": "シータ",
     "ι": "イオタ",
     "κ": "カッパ",
     "λ": "ラムダ",
     "μ": "ミュー",
     "ν": "ニュー",
     "ξ": "クサイ",
     "ο": "オミクロン",
     "π": "パイ",
     "ρ": "ロー",
     "σ": "シグマ",
     "τ": "タウ",
     "υ": "ウプシロン",
     "φ": "ファイ",
     "χ": "カイ",
     "ψ": "プサイ",
     "ω": "オメガ",
 }


 _NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
 _CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
 _CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
 _NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")


 def japanese_convert_numbers_to_words(text: str) -> str:
     res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text)
     res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res)
     res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res)
     return res


 def japanese_convert_alpha_symbols_to_words(text: str) -> str:
     return "".join([_ALPHASYMBOL_YOMI.get(ch, ch) for ch in text.lower()])


 def japanese_text_to_phonemes(text: str) -> str:
     """Convert Japanese text to phonemes."""
     res = unicodedata.normalize("NFKC", text)
     res = japanese_convert_numbers_to_words(res)
     res = japanese_convert_alpha_symbols_to_words(res)
     res = text2kata(res)
     res = kata2phoneme(res)
     return res


 def is_japanese_character(char):
     # Define the Unicode ranges of the Japanese writing system
     japanese_ranges = [
         (0x3040, 0x309F),  # Hiragana
         (0x30A0, 0x30FF),  # Katakana
         (0x4E00, 0x9FFF),  # Kanji (CJK Unified Ideographs)
         (0x3400, 0x4DBF),  # Kanji (CJK Extension A)
         (0x20000, 0x2A6DF),  # Kanji (CJK Extension B)
         # Other kanji extension ranges can be added as needed
     ]

     # Convert the character's Unicode code point to an integer
     char_code = ord(char)

     # Check whether the character falls within any of the Japanese ranges
     for start, end in japanese_ranges:
         if start <= char_code <= end:
             return True

     return False


 rep_map = {
     ":": ",",
     ";": ",",
     ",": ",",
     "。": ".",
     "!": "!",
     "?": "?",
     "\n": ".",
     "·": ",",
     "、": ",",
     "...": "…",
 }


 def replace_punctuation(text):
     pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))

     replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

     replaced_text = re.sub(
         r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF"
         + "".join(punctuation)
         + r"]+",
         "",
         replaced_text,
     )

     return replaced_text

 from pykakasi import kakasi
 # Initialize kakasi object
 kakasi = kakasi()
 # Set options for converting Chinese characters to Katakana
 kakasi.setMode("J", "K") # Chinese to Katakana
 kakasi.setMode("H", "K") # Hiragana to Katakana
 # Convert Chinese characters to Katakana
 conv = kakasi.getConverter()

 def text_normalize(text):
     res = unicodedata.normalize("NFKC", text)
     res = japanese_convert_numbers_to_words(res)
     res = "".join([i for i in res if is_japanese_character(i)])
     res = replace_punctuation(res)
     res = conv.do(res)
     return res


 def distribute_phone(n_phone, n_word):
     phones_per_word = [0] * n_word
     for task in range(n_phone):
         min_tasks = min(phones_per_word)
         min_index = phones_per_word.index(min_tasks)
         phones_per_word[min_index] += 1
     return phones_per_word



 # tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3')

 model_id = 'tohoku-nlp/bert-base-japanese-v3'
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 def g2p(norm_text):

     tokenized = tokenizer.tokenize(norm_text)
     phs = []
     ph_groups = []
     for t in tokenized:
         if not t.startswith("#"):
             ph_groups.append([t])
         else:
             ph_groups[-1].append(t.replace("#", ""))
     word2ph = []
     for group in ph_groups:
         text = ""
         for ch in group:
             text += ch
         if text == '[UNK]':
             phs += ['_']
             word2ph += [1]
             continue
         elif text in punctuation:
             phs += [text]
             word2ph += [1]
             continue
         # import pdb; pdb.set_trace()
         # phonemes = japanese_text_to_phonemes(text)
         phonemes = kata2phoneme(text)
         # phonemes = [i for i in phonemes if i in symbols]
         for i in phonemes:
             assert i in symbols, (group, norm_text, tokenized, i)
         phone_len = len(phonemes)
         word_len = len(group)

         aaa = distribute_phone(phone_len, word_len)
         assert len(aaa) == word_len
         word2ph += aaa

         phs += phonemes
     phones = ["_"] + phs + ["_"]
     tones = [0 for i in phones]
     word2ph = [1] + word2ph + [1]
     assert len(word2ph) == len(tokenized) + 2
     return phones, tones, word2ph

 def get_bert_feature(text, word2ph, device):
     from text import japanese_bert

     return japanese_bert.get_bert_feature(text, word2ph, device=device)


 if __name__ == "__main__":
     # tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
     text = "こんにちは、世界!..."
     text = 'ええ、僕はおきなと申します。こちらの小さいわらべは杏子。ご挨拶が遅れてしまいすみません。あなたの名は?'
     text = 'あの、お前以外のみんなは、全員生きてること?'
     from text.japanese_bert import get_bert_feature

     text = text_normalize(text)
     print(text)
     phones, tones, word2ph = g2p(text)
     bert = get_bert_feature(text, word2ph)

     print(phones, tones, word2ph, bert.shape)

 # if __name__ == '__main__':
 #     from pykakasi import kakasi
 #     # Initialize kakasi object
 #     kakasi = kakasi()

 #     # Set options for converting Chinese characters to Katakana
 #     kakasi.setMode("J", "H") # Chinese to Katakana
 #     kakasi.setMode("K", "H") # Hiragana to Katakana

 #     # Convert Chinese characters to Katakana
 #     conv = kakasi.getConverter()
 #     katakana_text = conv.do('ええ、僕はおきなと申します。こちらの小さいわらべは杏子。ご挨拶が遅れてしまいすみません。あなたの名は?') # Replace with your Chinese text

 #     print(katakana_text) # Output: ニーハオセカイ
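Note on the change itself: this commit replaces the module-level _TAGGER = MeCab.Tagger() with _TAGGER = None, so MeCab is no longer constructed at import time; the text_normalize and g2p path used here goes through pykakasi and kata2phoneme instead. However, text2kata still dereferences _TAGGER, so it will fail until a tagger is assigned. A minimal sketch of one way a caller could keep text2kata usable, assuming lazy initialization is acceptable (_get_tagger is a hypothetical helper, not part of this commit):

    import MeCab

    _TAGGER = None  # no longer built at import time, as in this commit

    def _get_tagger():
        # Hypothetical helper: build the MeCab tagger on first use and cache it.
        global _TAGGER
        if _TAGGER is None:
            _TAGGER = MeCab.Tagger()
        return _TAGGER

    # text2kata would then call _get_tagger().parse(text) instead of _TAGGER.parse(text).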
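For reference, kata2phoneme consumes its input greedily, checking the two-character rule map before the one-character map, so digraphs such as キャ are converted as a unit rather than letter by letter. A small usage sketch; the module path is assumed from the file location, and importing the module downloads the tohoku-nlp/bert-base-japanese-v3 tokenizer as a side effect:

    from melo.text.japanese import kata2phoneme  # assumed module path

    print(kata2phoneme("キャ"))        # ['ky', 'a'], via the two-character rule "キャ/ ky a"
    print(kata2phoneme("コンニチハ"))  # ['k', 'o', 'N', 'n', 'i', 'ch', 'i', 'h', 'a']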
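Similarly, distribute_phone spreads n_phone phonemes across n_word characters as evenly as possible, always giving the next phoneme to the character with the fewest so far; g2p uses it to build the word2ph alignment. The expected outputs below follow directly from the definition above:

    from melo.text.japanese import distribute_phone  # assumed module path

    print(distribute_phone(5, 2))  # [3, 2]
    print(distribute_phone(3, 3))  # [1, 1, 1]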