import re
from typing import Iterable, List, Tuple

import cn2an

from english_utils.abbreviations import expand_abbreviations
from english_utils.time_norm import expand_time_english
from english_utils.number_norm import normalize_numbers as replace_numbers_en


def merge_short_sentences_zh(sens):
    """Avoid short sentences by merging them with the following sentence.

    Args:
        sens (List[str]): list of input sentences.

    Returns:
        List[str]: list of output sentences.
    """
    sens_out = []
    for s in sens:
        # If the previous sentence is too short, merge it with the current one.
        if len(sens_out) > 0 and len(sens_out[-1]) <= 2:
            sens_out[-1] = sens_out[-1] + " " + s
        else:
            sens_out.append(s)
    # If the final sentence is still too short, fold it into its predecessor.
    try:
        if len(sens_out[-1]) <= 2:
            sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
            sens_out.pop(-1)
    except IndexError:
        pass
    return sens_out
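
# Usage sketch: the 2-character fragment "嗯." is merged with the sentence
# that follows it.
#   merge_short_sentences_zh(["你好.", "嗯.", "今天天气不错."])
#   # -> ["你好.", "嗯. 今天天气不错."]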


def split_sentences_zh(text, min_len=10):
    # Normalize Chinese sentence-ending punctuation to '.' and the Chinese
    # comma to ','.
    text = re.sub('[。!?;]', '.', text)
    text = re.sub('[,]', ',', text)
    # Collapse newlines, tabs, and runs of spaces into single spaces.
    text = re.sub('[\n\t ]+', ' ', text)
    # Mark every punctuation boundary with a sentinel, then split on it.
    text = re.sub('([,.!?;])', r'\1 $#!', text)
    sentences = [s.strip() for s in text.split('$#!')]
    if len(sentences[-1]) == 0:
        del sentences[-1]

    # Greedily accumulate fragments until a chunk exceeds min_len characters.
    new_sentences = []
    new_sent = []
    count_len = 0
    for ind, sent in enumerate(sentences):
        new_sent.append(sent)
        count_len += len(sent)
        if count_len > min_len or ind == len(sentences) - 1:
            count_len = 0
            new_sentences.append(' '.join(new_sent))
            new_sent = []
    return merge_short_sentences_zh(new_sentences)
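
# Usage sketch: with min_len=5, fragments are grouped until a chunk passes
# 5 characters.
#   split_sentences_zh("你好!今天天气不错,我们出去玩吧。", min_len=5)
#   # -> ["你好. 今天天气不错,", "我们出去玩吧."]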


def intersperse(lst, item):
    # Put `item` between consecutive elements of `lst` and at both ends.
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result
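
# Usage sketch: a blank token (here 0) surrounds every element.
#   intersperse([1, 2, 3], 0)  # -> [0, 1, 0, 2, 0, 3, 0]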


def replace_numbers_zh(text):
    # Rewrite Arabic numerals (including decimals) as Chinese numerals.
    numbers = re.findall(r"\d+(?:\.\d+)?", text)
    for number in numbers:
        text = text.replace(number, cn2an.an2cn(number), 1)
    return text
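
# Usage sketch (exact output depends on the installed cn2an version):
#   replace_numbers_zh("我有2只猫")  # -> "我有二只猫"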


def replace_punctuation(text):
    # Map Chinese/full-width punctuation, quotes, and brackets onto the small
    # punctuation set that the lexicon knows about.
    rep_map = {
        ":": ",",
        ";": ",",
        ",": ",",
        "。": ".",
        "!": "!",
        "?": "?",
        "\n": ".",
        "·": ",",
        "、": ",",
        "...": "…",
        "$": ".",
        "“": "'",
        "”": "'",
        "‘": "'",
        "’": "'",
        "(": "'",
        ")": "'",
        "(": "'",
        ")": "'",
        "《": "'",
        "》": "'",
        "【": "'",
        "】": "'",
        "[": "'",
        "]": "'",
        "—": "-",
        "~": "-",
        "~": "-",
        "「": "'",
        "」": "'",
    }
    for k, v in rep_map.items():
        text = text.replace(k, v)
    return text
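
# Usage sketch:
#   replace_punctuation("你好,世界。")  # -> "你好,世界."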


class Lexicon:
    def __init__(self, lexicon_filename: str, tokens_filename: str):
        # Map each phone symbol to its integer token ID.
        tokens = dict()
        with open(tokens_filename, encoding="utf-8") as f:
            for line in f:
                s, i = line.split()
                tokens[s] = int(i)

        # Each lexicon line holds a word/phrase, its phones, and one tone per
        # phone, so the tail of the line must have an even length.
        lexicon = dict()
        with open(lexicon_filename, encoding="utf-8") as f:
            for line in f:
                splits = line.split()
                word_or_phrase = splits[0]
                phone_tone_list = splits[1:]
                assert len(phone_tone_list) % 2 == 0, len(phone_tone_list)
                phone_str = phone_tone_list[: len(phone_tone_list) // 2]
                phones = [tokens[p] for p in phone_str]

                tones = phone_tone_list[len(phone_tone_list) // 2 :]
                tones = [int(t) for t in tones]

                lexicon[word_or_phrase] = (phone_str, phones, tones)
        # Approximate pronunciations for characters missing from the lexicon.
        lexicon["呣"] = lexicon["母"]
        lexicon["嗯"] = lexicon["恩"]
        self.lexicon = lexicon

        # Punctuation maps to itself with a neutral tone; space maps to the
        # padding token "_".
        punctuation = ["!", "?", "…", ",", ".", "'", "-"]
        for p in punctuation:
            i = tokens[p]
            tone = 0
            self.lexicon[p] = ([p], [i], [tone])
        self.lexicon[" "] = ([" "], [tokens["_"]], [0])

    def g2p_zh_mix_en(self, text: str) -> Tuple[List[str], List[int], List[int]]:
        phone_str = []
        phones = []
        tones = []

        if text not in self.lexicon:
            # OOV word/phrase: fall back to per-character conversion.
            if len(text) > 1:
                for w in text:
                    s, _, p, t = self.convert(w)
                    if p:
                        phone_str += s
                        phones += p
                        tones += t
            # An OOV single character is skipped (empty lists returned)
            # instead of falling through to a KeyError below.
            return phone_str, phones, tones

        phone_str, phones, tones = self.lexicon[text]
        return phone_str, phones, tones

    def split_zh_en(self, text):
        # Wrap every run of Latin letters in a sentinel, then split on it so
        # the result alternates between Chinese and English segments.
        if re.search(r'[a-zA-Z]+', text):
            splitter = '#$&^!@'
            text = re.sub(r'[a-zA-Z]+', lambda x: f'{splitter}{x.group()}{splitter}', text)
            texts = text.split(splitter)
            texts = [t for t in texts if len(t) > 0]
            return texts
        else:
            return [text]
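
    # Usage sketch (on a constructed Lexicon instance):
    #   lexicon.split_zh_en("我喜欢apple")  # -> ["我喜欢", "apple"]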

    def normalize_english(self, text):
        # Lower-case, then spell out times, numbers, and abbreviations.
        text = text.lower()
        text = expand_time_english(text)
        text = replace_numbers_en(text)
        text = expand_abbreviations(text)
        return text

    def normalize_chinese(self, text):
        # Rewrite Arabic numerals as Chinese numerals.
        text = replace_numbers_zh(text)
        return text

    def is_english(self, text):
        # Use fullmatch so a segment counts as English only when it consists
        # entirely of Latin letters and whitespace; re.match would also accept
        # a Chinese segment that merely starts with a space.
        return 1 if re.fullmatch(r'[a-zA-Z\s]+', text) else 0

    def convert(self, text: Iterable[str]) -> Tuple[List[str], List[int], List[int], List[int]]:
        phone_str = []
        yinjie_num = []  # phones per syllable/word ("音节" means syllable)
        phones = []
        tones = []

        text = replace_punctuation(text)
        texts_zh_en = self.split_zh_en(text)
        # If at least half of the segments are English, normalize the whole
        # input as English text; otherwise normalize it as Chinese.
        en_num = sum([self.is_english(i) for i in texts_zh_en])
        if en_num * 2 >= len(texts_zh_en):
            texts_zh_en = self.split_zh_en(self.normalize_english(text))
        else:
            texts_zh_en = self.split_zh_en(self.normalize_chinese(text))
        for text_one_lang in texts_zh_en:
            if self.is_english(text_one_lang):
                # English: look up the whole word at once.
                s, p, t = self.g2p_zh_mix_en(text_one_lang)
                phone_str += s
                yinjie_num.append(len(s))
                phones += p
                tones += t
            else:
                # Chinese: look up character by character.
                for tl in text_one_lang:
                    s, p, t = self.g2p_zh_mix_en(tl)
                    phone_str += s
                    yinjie_num.append(len(s))
                    phones += p
                    tones += t

        return phone_str, yinjie_num, phones, tones
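

# A minimal end-to-end sketch. "lexicon.txt" and "tokens.txt" are hypothetical
# file names; point them at the lexicon and token table your model ships with.
if __name__ == "__main__":
    lexicon = Lexicon("lexicon.txt", "tokens.txt")
    phone_str, yinjie_num, phones, tones = lexicon.convert("你好, world!")
    print(phone_str, yinjie_num, phones, tones)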