# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
from typing import List, Tuple

from diffrhythm.g2p.g2p import PhonemeBpeTokenizer
from diffrhythm.g2p.utils.g2p import phonemizer_g2p


def ph_g2p(text, language):
    return phonemizer_g2p(text=text, language=language)


def g2p(text, sentence, language):
    # Uses the module-level PhonemeBpeTokenizer instantiated below.
    return text_tokenizer.tokenize(text=text, sentence=sentence, language=language)


def is_chinese(char):
    return "\u4e00" <= char <= "\u9fa5"


def is_alphabet(char):
    return "\u0041" <= char <= "\u005a" or "\u0061" <= char <= "\u007a"


def is_other(char):
    return not (is_chinese(char) or is_alphabet(char))
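

# Character-class examples: is_chinese("你") and is_alphabet("A") are True,
# while digits, punctuation, and Hangul such as "테" fall to is_other. Note
# that CJK extension characters beyond U+9FA5 are likewise classified as
# "other" by these checks.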


def get_segment(text: str) -> List[Tuple[str, str]]:
    # Split a sentence into runs of the same script:
    # [(zh_part, "zh"), (en_part, "en"), ...].
    segments = []
    types = []
    flag = 0
    temp_seg = ""
    temp_lang = ""

    # Classify each character as Chinese ("zh"), ASCII letter ("en"),
    # or anything else ("other": digits, punctuation, other scripts).
    for i, ch in enumerate(text):
        if is_chinese(ch):
            types.append("zh")
        elif is_alphabet(ch):
            types.append("en")
        else:
            types.append("other")

    assert len(types) == len(text)

    for i in range(len(types)):
        # The first character opens the first segment.
        if flag == 0:
            temp_seg += text[i]
            temp_lang = types[i]
            flag = 1
        else:
            if temp_lang == "other":
                # A segment holding only "other" characters adopts the
                # language of the first zh/en character that follows.
                temp_seg += text[i]
                if types[i] != temp_lang:
                    temp_lang = types[i]
            else:
                if types[i] == temp_lang or types[i] == "other":
                    # Same script, or punctuation: extend the current segment.
                    temp_seg += text[i]
                else:
                    # Script switch: close the segment and start a new one.
                    segments.append((temp_seg, temp_lang))
                    temp_seg = text[i]
                    temp_lang = types[i]

    segments.append((temp_seg, temp_lang))
    return segments
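

# Worked example: get_segment("你好,hello world") returns
# [("你好,", "zh"), ("hello world", "en")]; punctuation and spaces ride along
# with whichever segment is open when they appear.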


def chn_eng_g2p(text: str):
    # Only Chinese and English are handled for now; everything else is
    # folded into the surrounding segment as "other" characters.
    segments = get_segment(text)
    all_phoneme = ""
    all_tokens = []

    for index in range(len(segments)):
        seg = segments[index]
        phoneme, token = g2p(seg[0], text, seg[1])
        all_phoneme += phoneme + "|"
        all_tokens += token

        # Trim a trailing "_|" after the final English segment so the phoneme
        # string does not end with a dangling boundary marker.
        if seg[1] == "en" and index == len(segments) - 1 and all_phoneme[-2] == "_":
            all_phoneme = all_phoneme[:-2]
            all_tokens = all_tokens[:-1]
    return all_phoneme, all_tokens
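

# Expected call pattern (shapes are illustrative; actual phoneme strings and
# token ids depend on the shipped vocab):
#   phone, token = chn_eng_g2p("你好,hello world")
#   # phone: "|"-separated phoneme runs, one per segment
#   # token: flat list of BPE token ids covering all segments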


text_tokenizer = PhonemeBpeTokenizer()

with open("./diffrhythm/g2p/g2p/vocab.json", "r") as f:
    json_data = f.read()
data = json.loads(json_data)
vocab = data["vocab"]


if __name__ == "__main__":
    # The first result is overwritten by the next call; it serves only as a
    # quick smoke test.
    phone, token = chn_eng_g2p("你好,hello world")
    phone, token = chn_eng_g2p("你好,hello world, Bonjour, 테스트 해 보겠습니다, 五月雨緑")
    print(phone)
    print(token)

    # phone, token = text_tokenizer.tokenize("你好,hello world, Bonjour, 테스트 해 보겠습니다, 五月雨緑", "", "auto")
    phone, token = text_tokenizer.tokenize("緑", "", "auto")
    # phone, token = text_tokenizer.tokenize("आइए इसका परीक्षण करें", "", "auto")
    # phone, token = text_tokenizer.tokenize("आइए इसका परीक्षण करें", "", "other")
    print(phone)
    print(token)