Spaces:

SohomToom
/

TextToVoiceUsingOpenVoice

Running

App Files Community

SohomToom committed on about 20 hours ago

Commit

caf3576

verified ·

1 Parent(s): 6711545

Update MeloTTS/melo/text/korean.py

Browse files

Files changed (1) hide show

MeloTTS/melo/text/korean.py +191 -191

MeloTTS/melo/text/korean.py CHANGED Viewed

@@ -1,192 +1,192 @@
-# Convert Japanese text to phonemes which is
-# compatible with Julius https://github.com/julius-speech/segmentation-kit
-import re
-import unicodedata
-from transformers import AutoTokenizer
-from . import punctuation, symbols
-from num2words import num2words
-from melo.text.ko_dictionary import english_dictionary, etc_dictionary
-from anyascii import anyascii
-from jamo import hangul_to_jamo
-def normalize(text):
-    text = text.strip()
-    text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text)
-    text = normalize_with_dictionary(text, etc_dictionary)
-    text = normalize_english(text)
-    text = text.lower()
-    return text
-def normalize_with_dictionary(text, dic):
-    if any(key in text for key in dic.keys()):
-        pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
-        return pattern.sub(lambda x: dic[x.group()], text)
-    return text
-def normalize_english(text):
-    def fn(m):
-        word = m.group()
-        if word in english_dictionary:
-            return english_dictionary.get(word)
-        return word
-    text = re.sub("([A-Za-z]+)", fn, text)
-    return text
-g2p_kr = None
-def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
-    """
-    The input and output values look the same, but they are different in Unicode.
-    example :
-        input = '하늘' (Unicode : \ud558\ub298), (하 + 늘)
-        output = '하늘' (Unicode :\u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)
-    """
-    global g2p_kr  # pylint: disable=global-statement
-    if g2p_kr is None:
-        from g2pkk import G2p
-        g2p_kr = G2p()
-    if character == "english":
-        from anyascii import anyascii
-        text = normalize(text)
-        text = g2p_kr(text)
-        text = anyascii(text)
-        return text
-    text = normalize(text)
-    text = g2p_kr(text)
-    text = list(hangul_to_jamo(text))  # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
-    return "".join(text)
-def text_normalize(text):
-    # res = unicodedata.normalize("NFKC", text)
-    # res = japanese_convert_numbers_to_words(res)
-    # # res = "".join([i for i in res if is_japanese_character(i)])
-    # res = replace_punctuation(res)
-    text = normalize(text)
-    return text
-def distribute_phone(n_phone, n_word):
-    phones_per_word = [0] * n_word
-    for task in range(n_phone):
-        min_tasks = min(phones_per_word)
-        min_index = phones_per_word.index(min_tasks)
-        phones_per_word[min_index] += 1
-    return phones_per_word
-# tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3')
-model_id = 'kykim/bert-kor-base'
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-def g2p(norm_text):
-    tokenized = tokenizer.tokenize(norm_text)
-    phs = []
-    ph_groups = []
-    for t in tokenized:
-        if not t.startswith("#"):
-            ph_groups.append([t])
-        else:
-            ph_groups[-1].append(t.replace("#", ""))
-    word2ph = []
-    for group in ph_groups:
-        text = ""
-        for ch in group:
-            text += ch
-        if text == '[UNK]':
-            phs += ['_']
-            word2ph += [1]
-            continue
-        elif text in punctuation:
-            phs += [text]
-            word2ph += [1]
-            continue
-        # import pdb; pdb.set_trace()
-        # phonemes = japanese_text_to_phonemes(text)
-        # text = g2p_kr(text)
-        phonemes = korean_text_to_phonemes(text)
-        # import pdb; pdb.set_trace()
-        # # phonemes = [i for i in phonemes if i in symbols]
-        # for i in phonemes:
-        #     assert i in symbols, (group, norm_text, tokenized, i)
-        phone_len = len(phonemes)
-        word_len = len(group)
-        aaa = distribute_phone(phone_len, word_len)
-        assert len(aaa) == word_len
-        word2ph += aaa
-        phs += phonemes
-    phones = ["_"] + phs + ["_"]
-    tones = [0 for i in phones]
-    word2ph =  [1] + word2ph + [1]
-    assert len(word2ph) == len(tokenized) + 2
-    return phones, tones, word2ph
-def get_bert_feature(text, word2ph, device='cuda'):
-    from . import japanese_bert
-    return japanese_bert.get_bert_feature(text, word2ph, device=device, model_id=model_id)
-if __name__ == "__main__":
-    # tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
-    from text.symbols import symbols
-    text = "전 제 일의 가치와 폰타인 대중들이 한 일의 의미를 잘 압니다. 앞으로도 전 제 일에 자부심을 갖고 살아갈 겁니다"
-    import json
-    # genshin_data = json.load(open('/data/zwl/workspace/StarRail_Datasets/Index & Scripts/Index/1.3/Korean.json'))
-    genshin_data = json.load(open('/data/zwl/workspace/Genshin_Datasets/Index & Script/AI Hobbyist Version/Index/4.1/KR_output.json'))
-    from tqdm import tqdm
-    new_symbols = []
-    for key, item in tqdm(genshin_data.items()):
-        texts = item.get('voiceContent', '')
-        if isinstance(texts, list):
-            texts = ','.join(texts)
-        if texts is None:
-            continue
-        if len(texts) == 0:
-            continue
-        text = text_normalize(text)
-        phones, tones, word2ph = g2p(text)
-        bert = get_bert_feature(text, word2ph)
-        import  pdb; pdb.set_trace()
-        for ph in phones:
-            if ph not in symbols and ph not in new_symbols:
-                new_symbols.append(ph)
-                print('update!, now symbols:')
-                print(new_symbols)
-                with open('korean_symbol.txt', 'w') as f:
-                    f.write(f'{new_symbols}')
-# if __name__ == '__main__':
-#     from pykakasi import kakasi
-#     # Initialize kakasi object
-#     kakasi = kakasi()
-#     # Set options for converting Chinese characters to Katakana
-#     kakasi.setMode("J", "H")  # Chinese to Katakana
-#     kakasi.setMode("K", "H")  # Hiragana to Katakana
-#     # Convert Chinese characters to Katakana
-#     conv = kakasi.getConverter()
-#     katakana_text = conv.do('ええ、僕はおきなと申します。こちらの小さいわらべは杏子。ご挨拶が遅れてしまいすみません。あなたの名は?')  # Replace with your Chinese text
 #     print(katakana_text)  # Output: ニーハオセカイ

+# Convert Japanese text to phonemes which is
+# compatible with Julius https://github.com/julius-speech/segmentation-kit
+import re
+import unicodedata
+from transformers import AutoTokenizer
+from . import punctuation, symbols
+from num2words import num2words
+from MeloTTS.melo.text.ko_dictionary import english_dictionary, etc_dictionary
+from anyascii import anyascii
+from jamo import hangul_to_jamo
+def normalize(text):
+    text = text.strip()
+    text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text)
+    text = normalize_with_dictionary(text, etc_dictionary)
+    text = normalize_english(text)
+    text = text.lower()
+    return text
+def normalize_with_dictionary(text, dic):
+    if any(key in text for key in dic.keys()):
+        pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
+        return pattern.sub(lambda x: dic[x.group()], text)
+    return text
+def normalize_english(text):
+    def fn(m):
+        word = m.group()
+        if word in english_dictionary:
+            return english_dictionary.get(word)
+        return word
+    text = re.sub("([A-Za-z]+)", fn, text)
+    return text
+g2p_kr = None
+def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
+    """
+    The input and output values look the same, but they are different in Unicode.
+    example :
+        input = '하늘' (Unicode : \ud558\ub298), (하 + 늘)
+        output = '하늘' (Unicode :\u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)
+    """
+    global g2p_kr  # pylint: disable=global-statement
+    if g2p_kr is None:
+        from g2pkk import G2p
+        g2p_kr = G2p()
+    if character == "english":
+        from anyascii import anyascii
+        text = normalize(text)
+        text = g2p_kr(text)
+        text = anyascii(text)
+        return text
+    text = normalize(text)
+    text = g2p_kr(text)
+    text = list(hangul_to_jamo(text))  # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
+    return "".join(text)
+def text_normalize(text):
+    # res = unicodedata.normalize("NFKC", text)
+    # res = japanese_convert_numbers_to_words(res)
+    # # res = "".join([i for i in res if is_japanese_character(i)])
+    # res = replace_punctuation(res)
+    text = normalize(text)
+    return text
+def distribute_phone(n_phone, n_word):
+    phones_per_word = [0] * n_word
+    for task in range(n_phone):
+        min_tasks = min(phones_per_word)
+        min_index = phones_per_word.index(min_tasks)
+        phones_per_word[min_index] += 1
+    return phones_per_word
+# tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3')
+model_id = 'kykim/bert-kor-base'
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+def g2p(norm_text):
+    tokenized = tokenizer.tokenize(norm_text)
+    phs = []
+    ph_groups = []
+    for t in tokenized:
+        if not t.startswith("#"):
+            ph_groups.append([t])
+        else:
+            ph_groups[-1].append(t.replace("#", ""))
+    word2ph = []
+    for group in ph_groups:
+        text = ""
+        for ch in group:
+            text += ch
+        if text == '[UNK]':
+            phs += ['_']
+            word2ph += [1]
+            continue
+        elif text in punctuation:
+            phs += [text]
+            word2ph += [1]
+            continue
+        # import pdb; pdb.set_trace()
+        # phonemes = japanese_text_to_phonemes(text)
+        # text = g2p_kr(text)
+        phonemes = korean_text_to_phonemes(text)
+        # import pdb; pdb.set_trace()
+        # # phonemes = [i for i in phonemes if i in symbols]
+        # for i in phonemes:
+        #     assert i in symbols, (group, norm_text, tokenized, i)
+        phone_len = len(phonemes)
+        word_len = len(group)
+        aaa = distribute_phone(phone_len, word_len)
+        assert len(aaa) == word_len
+        word2ph += aaa
+        phs += phonemes
+    phones = ["_"] + phs + ["_"]
+    tones = [0 for i in phones]
+    word2ph =  [1] + word2ph + [1]
+    assert len(word2ph) == len(tokenized) + 2
+    return phones, tones, word2ph
+def get_bert_feature(text, word2ph, device='cuda'):
+    from . import japanese_bert
+    return japanese_bert.get_bert_feature(text, word2ph, device=device, model_id=model_id)
+if __name__ == "__main__":
+    # tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
+    from text.symbols import symbols
+    text = "전 제 일의 가치와 폰타인 대중들이 한 일의 의미를 잘 압니다. 앞으로도 전 제 일에 자부심을 갖고 살아갈 겁니다"
+    import json
+    # genshin_data = json.load(open('/data/zwl/workspace/StarRail_Datasets/Index & Scripts/Index/1.3/Korean.json'))
+    genshin_data = json.load(open('/data/zwl/workspace/Genshin_Datasets/Index & Script/AI Hobbyist Version/Index/4.1/KR_output.json'))
+    from tqdm import tqdm
+    new_symbols = []
+    for key, item in tqdm(genshin_data.items()):
+        texts = item.get('voiceContent', '')
+        if isinstance(texts, list):
+            texts = ','.join(texts)
+        if texts is None:
+            continue
+        if len(texts) == 0:
+            continue
+        text = text_normalize(text)
+        phones, tones, word2ph = g2p(text)
+        bert = get_bert_feature(text, word2ph)
+        import  pdb; pdb.set_trace()
+        for ph in phones:
+            if ph not in symbols and ph not in new_symbols:
+                new_symbols.append(ph)
+                print('update!, now symbols:')
+                print(new_symbols)
+                with open('korean_symbol.txt', 'w') as f:
+                    f.write(f'{new_symbols}')
+# if __name__ == '__main__':
+#     from pykakasi import kakasi
+#     # Initialize kakasi object
+#     kakasi = kakasi()
+#     # Set options for converting Chinese characters to Katakana
+#     kakasi.setMode("J", "H")  # Chinese to Katakana
+#     kakasi.setMode("K", "H")  # Hiragana to Katakana
+#     # Convert Chinese characters to Katakana
+#     conv = kakasi.getConverter()
+#     katakana_text = conv.do('ええ、僕はおきなと申します。こちらの小さいわらべは杏子。ご挨拶が遅れてしまいすみません。あなたの名は?')  # Replace with your Chinese text
 #     print(katakana_text)  # Output: ニーハオセカイ