|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import List |
|
from typing import Tuple |
|
|
|
import jieba |
|
from pypinyin import lazy_pinyin |
|
from pypinyin import Style |
|
|
|
|
|
class ToneSandhi: |
|
def __init__(self): |
|
self.must_neural_tone_words = { |
|
"麻烦", |
|
"麻利", |
|
"鸳鸯", |
|
"高粱", |
|
"骨头", |
|
"骆驼", |
|
"马虎", |
|
"首饰", |
|
"馒头", |
|
"馄饨", |
|
"风筝", |
|
"难为", |
|
"队伍", |
|
"阔气", |
|
"闺女", |
|
"门道", |
|
"锄头", |
|
"铺盖", |
|
"铃铛", |
|
"铁匠", |
|
"钥匙", |
|
"里脊", |
|
"里头", |
|
"部分", |
|
"那么", |
|
"道士", |
|
"造化", |
|
"迷糊", |
|
"连累", |
|
"这么", |
|
"这个", |
|
"运气", |
|
"过去", |
|
"软和", |
|
"转悠", |
|
"踏实", |
|
"跳蚤", |
|
"跟头", |
|
"趔趄", |
|
"财主", |
|
"豆腐", |
|
"讲究", |
|
"记性", |
|
"记号", |
|
"认识", |
|
"规矩", |
|
"见识", |
|
"裁缝", |
|
"补丁", |
|
"衣裳", |
|
"衣服", |
|
"衙门", |
|
"街坊", |
|
"行李", |
|
"行当", |
|
"蛤蟆", |
|
"蘑菇", |
|
"薄荷", |
|
"葫芦", |
|
"葡萄", |
|
"萝卜", |
|
"荸荠", |
|
"苗条", |
|
"苗头", |
|
"苍蝇", |
|
"芝麻", |
|
"舒服", |
|
"舒坦", |
|
"舌头", |
|
"自在", |
|
"膏药", |
|
"脾气", |
|
"脑袋", |
|
"脊梁", |
|
"能耐", |
|
"胳膊", |
|
"胭脂", |
|
"胡萝", |
|
"胡琴", |
|
"胡同", |
|
"聪明", |
|
"耽误", |
|
"耽搁", |
|
"耷拉", |
|
"耳朵", |
|
"老爷", |
|
"老实", |
|
"老婆", |
|
"老头", |
|
"老太", |
|
"翻腾", |
|
"罗嗦", |
|
"罐头", |
|
"编辑", |
|
"结实", |
|
"红火", |
|
"累赘", |
|
"糨糊", |
|
"糊涂", |
|
"精神", |
|
"粮食", |
|
"簸箕", |
|
"篱笆", |
|
"算计", |
|
"算盘", |
|
"答应", |
|
"笤帚", |
|
"笑语", |
|
"笑话", |
|
"窟窿", |
|
"窝囊", |
|
"窗户", |
|
"稳当", |
|
"稀罕", |
|
"称呼", |
|
"秧歌", |
|
"秀气", |
|
"秀才", |
|
"福气", |
|
"祖宗", |
|
"砚台", |
|
"码头", |
|
"石榴", |
|
"石头", |
|
"石匠", |
|
"知识", |
|
"眼睛", |
|
"眯缝", |
|
"眨巴", |
|
"眉毛", |
|
"相声", |
|
"盘算", |
|
"白净", |
|
"痢疾", |
|
"痛快", |
|
"疟疾", |
|
"疙瘩", |
|
"疏忽", |
|
"畜生", |
|
"生意", |
|
"甘蔗", |
|
"琵琶", |
|
"琢磨", |
|
"琉璃", |
|
"玻璃", |
|
"玫瑰", |
|
"玄乎", |
|
"狐狸", |
|
"状元", |
|
"特务", |
|
"牲口", |
|
"牙碜", |
|
"牌楼", |
|
"爽快", |
|
"爱人", |
|
"热闹", |
|
"烧饼", |
|
"烟筒", |
|
"烂糊", |
|
"点心", |
|
"炊帚", |
|
"灯笼", |
|
"火候", |
|
"漂亮", |
|
"滑溜", |
|
"溜达", |
|
"温和", |
|
"清楚", |
|
"消息", |
|
"浪头", |
|
"活泼", |
|
"比方", |
|
"正经", |
|
"欺负", |
|
"模糊", |
|
"槟榔", |
|
"棺材", |
|
"棒槌", |
|
"棉花", |
|
"核桃", |
|
"栅栏", |
|
"柴火", |
|
"架势", |
|
"枕头", |
|
"枇杷", |
|
"机灵", |
|
"本事", |
|
"木头", |
|
"木匠", |
|
"朋友", |
|
"月饼", |
|
"月亮", |
|
"暖和", |
|
"明白", |
|
"时候", |
|
"新鲜", |
|
"故事", |
|
"收拾", |
|
"收成", |
|
"提防", |
|
"挖苦", |
|
"挑剔", |
|
"指甲", |
|
"指头", |
|
"拾掇", |
|
"拳头", |
|
"拨弄", |
|
"招牌", |
|
"招呼", |
|
"抬举", |
|
"护士", |
|
"折腾", |
|
"扫帚", |
|
"打量", |
|
"打算", |
|
"打点", |
|
"打扮", |
|
"打听", |
|
"打发", |
|
"扎实", |
|
"扁担", |
|
"戒指", |
|
"懒得", |
|
"意识", |
|
"意思", |
|
"情形", |
|
"悟性", |
|
"怪物", |
|
"思量", |
|
"怎么", |
|
"念头", |
|
"念叨", |
|
"快活", |
|
"忙活", |
|
"志气", |
|
"心思", |
|
"得罪", |
|
"张罗", |
|
"弟兄", |
|
"开通", |
|
"应酬", |
|
"庄稼", |
|
"干事", |
|
"帮手", |
|
"帐篷", |
|
"希罕", |
|
"师父", |
|
"师傅", |
|
"巴结", |
|
"巴掌", |
|
"差事", |
|
"工夫", |
|
"岁数", |
|
"屁股", |
|
"尾巴", |
|
"少爷", |
|
"小气", |
|
"小伙", |
|
"将就", |
|
"对头", |
|
"对付", |
|
"寡妇", |
|
"家伙", |
|
"客气", |
|
"实在", |
|
"官司", |
|
"学问", |
|
"学生", |
|
"字号", |
|
"嫁妆", |
|
"媳妇", |
|
"媒人", |
|
"婆家", |
|
"娘家", |
|
"委屈", |
|
"姑娘", |
|
"姐夫", |
|
"妯娌", |
|
"妥当", |
|
"妖精", |
|
"奴才", |
|
"女婿", |
|
"头发", |
|
"太阳", |
|
"大爷", |
|
"大方", |
|
"大意", |
|
"大夫", |
|
"多少", |
|
"多么", |
|
"外甥", |
|
"壮实", |
|
"地道", |
|
"地方", |
|
"在乎", |
|
"困难", |
|
"嘴巴", |
|
"嘱咐", |
|
"嘟囔", |
|
"嘀咕", |
|
"喜欢", |
|
"喇嘛", |
|
"喇叭", |
|
"商量", |
|
"唾沫", |
|
"哑巴", |
|
"哈欠", |
|
"哆嗦", |
|
"咳嗽", |
|
"和尚", |
|
"告诉", |
|
"告示", |
|
"含糊", |
|
"吓唬", |
|
"后头", |
|
"名字", |
|
"名堂", |
|
"合同", |
|
"吆喝", |
|
"叫唤", |
|
"口袋", |
|
"厚道", |
|
"厉害", |
|
"千斤", |
|
"包袱", |
|
"包涵", |
|
"匀称", |
|
"勤快", |
|
"动静", |
|
"动弹", |
|
"功夫", |
|
"力气", |
|
"前头", |
|
"刺猬", |
|
"刺激", |
|
"别扭", |
|
"利落", |
|
"利索", |
|
"利害", |
|
"分析", |
|
"出息", |
|
"凑合", |
|
"凉快", |
|
"冷战", |
|
"冤枉", |
|
"冒失", |
|
"养活", |
|
"关系", |
|
"先生", |
|
"兄弟", |
|
"便宜", |
|
"使唤", |
|
"佩服", |
|
"作坊", |
|
"体面", |
|
"位置", |
|
"似的", |
|
"伙计", |
|
"休息", |
|
"什么", |
|
"人家", |
|
"亲戚", |
|
"亲家", |
|
"交情", |
|
"云彩", |
|
"事情", |
|
"买卖", |
|
"主意", |
|
"丫头", |
|
"丧气", |
|
"两口", |
|
"东西", |
|
"东家", |
|
"世故", |
|
"不由", |
|
"不在", |
|
"下水", |
|
"下巴", |
|
"上头", |
|
"上司", |
|
"丈夫", |
|
"丈人", |
|
"一辈", |
|
"那个", |
|
"菩萨", |
|
"父亲", |
|
"母亲", |
|
"咕噜", |
|
"邋遢", |
|
"费用", |
|
"冤家", |
|
"甜头", |
|
"介绍", |
|
"荒唐", |
|
"大人", |
|
"泥鳅", |
|
"幸福", |
|
"熟悉", |
|
"计划", |
|
"扑腾", |
|
"蜡烛", |
|
"姥爷", |
|
"照顾", |
|
"喉咙", |
|
"吉他", |
|
"弄堂", |
|
"蚂蚱", |
|
"凤凰", |
|
"拖沓", |
|
"寒碜", |
|
"糟蹋", |
|
"倒腾", |
|
"报复", |
|
"逻辑", |
|
"盘缠", |
|
"喽啰", |
|
"牢骚", |
|
"咖喱", |
|
"扫把", |
|
"惦记", |
|
} |
|
self.must_not_neural_tone_words = { |
|
"男子", |
|
"女子", |
|
"分子", |
|
"原子", |
|
"量子", |
|
"莲子", |
|
"石子", |
|
"瓜子", |
|
"电子", |
|
"人人", |
|
"虎虎", |
|
} |
|
self.punc = ":,;。?!“”‘’':,;.?!" |
|
|
|
|
|
|
|
|
|
|
|
|
|
def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]: |
|
|
|
for j, item in enumerate(word): |
|
if ( |
|
j - 1 >= 0 |
|
and item == word[j - 1] |
|
and pos[0] in {"n", "v", "a"} |
|
and word not in self.must_not_neural_tone_words |
|
): |
|
finals[j] = finals[j][:-1] + "5" |
|
ge_idx = word.find("个") |
|
if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": |
|
finals[-1] = finals[-1][:-1] + "5" |
|
elif len(word) >= 1 and word[-1] in "的地得": |
|
finals[-1] = finals[-1][:-1] + "5" |
|
|
|
|
|
|
|
elif ( |
|
len(word) > 1 |
|
and word[-1] in "们子" |
|
and pos in {"r", "n"} |
|
and word not in self.must_not_neural_tone_words |
|
): |
|
finals[-1] = finals[-1][:-1] + "5" |
|
|
|
elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}: |
|
finals[-1] = finals[-1][:-1] + "5" |
|
|
|
elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开": |
|
finals[-1] = finals[-1][:-1] + "5" |
|
|
|
elif ( |
|
ge_idx >= 1 |
|
and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是") |
|
) or word == "个": |
|
finals[ge_idx] = finals[ge_idx][:-1] + "5" |
|
else: |
|
if ( |
|
word in self.must_neural_tone_words |
|
or word[-2:] in self.must_neural_tone_words |
|
): |
|
finals[-1] = finals[-1][:-1] + "5" |
|
|
|
word_list = self._split_word(word) |
|
finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] |
|
for i, word in enumerate(word_list): |
|
|
|
if ( |
|
word in self.must_neural_tone_words |
|
or word[-2:] in self.must_neural_tone_words |
|
): |
|
finals_list[i][-1] = finals_list[i][-1][:-1] + "5" |
|
finals = sum(finals_list, []) |
|
return finals |
|
|
|
def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]: |
|
|
|
if len(word) == 3 and word[1] == "不": |
|
finals[1] = finals[1][:-1] + "5" |
|
else: |
|
for i, char in enumerate(word): |
|
|
|
if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4": |
|
finals[i] = finals[i][:-1] + "2" |
|
return finals |
|
|
|
def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]: |
|
|
|
if word.find("一") != -1 and all( |
|
[item.isnumeric() for item in word if item != "一"] |
|
): |
|
return finals |
|
|
|
elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]: |
|
finals[1] = finals[1][:-1] + "5" |
|
|
|
elif word.startswith("第一"): |
|
finals[1] = finals[1][:-1] + "1" |
|
else: |
|
for i, char in enumerate(word): |
|
if char == "一" and i + 1 < len(word): |
|
|
|
if finals[i + 1][-1] == "4": |
|
finals[i] = finals[i][:-1] + "2" |
|
|
|
else: |
|
|
|
if word[i + 1] not in self.punc: |
|
finals[i] = finals[i][:-1] + "4" |
|
return finals |
|
|
|
def _split_word(self, word: str) -> List[str]: |
|
word_list = jieba.cut_for_search(word) |
|
word_list = sorted(word_list, key=lambda i: len(i), reverse=False) |
|
first_subword = word_list[0] |
|
first_begin_idx = word.find(first_subword) |
|
if first_begin_idx == 0: |
|
second_subword = word[len(first_subword) :] |
|
new_word_list = [first_subword, second_subword] |
|
else: |
|
second_subword = word[: -len(first_subword)] |
|
new_word_list = [second_subword, first_subword] |
|
return new_word_list |
|
|
|
def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: |
|
if len(word) == 2 and self._all_tone_three(finals): |
|
finals[0] = finals[0][:-1] + "2" |
|
elif len(word) == 3: |
|
word_list = self._split_word(word) |
|
if self._all_tone_three(finals): |
|
|
|
if len(word_list[0]) == 2: |
|
finals[0] = finals[0][:-1] + "2" |
|
finals[1] = finals[1][:-1] + "2" |
|
|
|
elif len(word_list[0]) == 1: |
|
finals[1] = finals[1][:-1] + "2" |
|
else: |
|
finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] |
|
if len(finals_list) == 2: |
|
for i, sub in enumerate(finals_list): |
|
|
|
if self._all_tone_three(sub) and len(sub) == 2: |
|
finals_list[i][0] = finals_list[i][0][:-1] + "2" |
|
|
|
elif ( |
|
i == 1 |
|
and not self._all_tone_three(sub) |
|
and finals_list[i][0][-1] == "3" |
|
and finals_list[0][-1][-1] == "3" |
|
): |
|
finals_list[0][-1] = finals_list[0][-1][:-1] + "2" |
|
finals = sum(finals_list, []) |
|
|
|
elif len(word) == 4: |
|
finals_list = [finals[:2], finals[2:]] |
|
finals = [] |
|
for sub in finals_list: |
|
if self._all_tone_three(sub): |
|
sub[0] = sub[0][:-1] + "2" |
|
finals += sub |
|
|
|
return finals |
|
|
|
def _all_tone_three(self, finals: List[str]) -> bool: |
|
return all(x[-1] == "3" for x in finals) |
|
|
|
|
|
|
|
def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: |
|
new_seg = [] |
|
last_word = "" |
|
for word, pos in seg: |
|
if last_word == "不": |
|
word = last_word + word |
|
if word != "不": |
|
new_seg.append((word, pos)) |
|
last_word = word[:] |
|
if last_word == "不": |
|
new_seg.append((last_word, "d")) |
|
last_word = "" |
|
return new_seg |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: |
|
new_seg = [] * len(seg) |
|
|
|
i = 0 |
|
while i < len(seg): |
|
word, pos = seg[i] |
|
if ( |
|
i - 1 >= 0 |
|
and word == "一" |
|
and i + 1 < len(seg) |
|
and seg[i - 1][0] == seg[i + 1][0] |
|
and seg[i - 1][1] == "v" |
|
): |
|
new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0] |
|
i += 2 |
|
else: |
|
if ( |
|
i - 2 >= 0 |
|
and seg[i - 1][0] == "一" |
|
and seg[i - 2][0] == word |
|
and pos == "v" |
|
): |
|
continue |
|
else: |
|
new_seg.append([word, pos]) |
|
i += 1 |
|
seg = [i for i in new_seg if len(i) > 0] |
|
new_seg = [] |
|
|
|
for i, (word, pos) in enumerate(seg): |
|
if new_seg and new_seg[-1][0] == "一": |
|
new_seg[-1][0] = new_seg[-1][0] + word |
|
else: |
|
new_seg.append([word, pos]) |
|
return new_seg |
|
|
|
|
|
def _merge_continuous_three_tones( |
|
self, seg: List[Tuple[str, str]] |
|
) -> List[Tuple[str, str]]: |
|
new_seg = [] |
|
sub_finals_list = [ |
|
lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) |
|
for (word, pos) in seg |
|
] |
|
assert len(sub_finals_list) == len(seg) |
|
merge_last = [False] * len(seg) |
|
for i, (word, pos) in enumerate(seg): |
|
if ( |
|
i - 1 >= 0 |
|
and self._all_tone_three(sub_finals_list[i - 1]) |
|
and self._all_tone_three(sub_finals_list[i]) |
|
and not merge_last[i - 1] |
|
): |
|
|
|
if ( |
|
not self._is_reduplication(seg[i - 1][0]) |
|
and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 |
|
): |
|
new_seg[-1][0] = new_seg[-1][0] + seg[i][0] |
|
merge_last[i] = True |
|
else: |
|
new_seg.append([word, pos]) |
|
else: |
|
new_seg.append([word, pos]) |
|
|
|
return new_seg |
|
|
|
def _is_reduplication(self, word: str) -> bool: |
|
return len(word) == 2 and word[0] == word[1] |
|
|
|
|
|
def _merge_continuous_three_tones_2( |
|
self, seg: List[Tuple[str, str]] |
|
) -> List[Tuple[str, str]]: |
|
new_seg = [] |
|
sub_finals_list = [ |
|
lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) |
|
for (word, pos) in seg |
|
] |
|
assert len(sub_finals_list) == len(seg) |
|
merge_last = [False] * len(seg) |
|
for i, (word, pos) in enumerate(seg): |
|
if ( |
|
i - 1 >= 0 |
|
and sub_finals_list[i - 1][-1][-1] == "3" |
|
and sub_finals_list[i][0][-1] == "3" |
|
and not merge_last[i - 1] |
|
): |
|
|
|
if ( |
|
not self._is_reduplication(seg[i - 1][0]) |
|
and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 |
|
): |
|
new_seg[-1][0] = new_seg[-1][0] + seg[i][0] |
|
merge_last[i] = True |
|
else: |
|
new_seg.append([word, pos]) |
|
else: |
|
new_seg.append([word, pos]) |
|
return new_seg |
|
|
|
def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: |
|
new_seg = [] |
|
for i, (word, pos) in enumerate(seg): |
|
if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#": |
|
new_seg[-1][0] = new_seg[-1][0] + seg[i][0] |
|
else: |
|
new_seg.append([word, pos]) |
|
return new_seg |
|
|
|
def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: |
|
new_seg = [] |
|
for i, (word, pos) in enumerate(seg): |
|
if new_seg and word == new_seg[-1][0]: |
|
new_seg[-1][0] = new_seg[-1][0] + seg[i][0] |
|
else: |
|
new_seg.append([word, pos]) |
|
return new_seg |
|
|
|
def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: |
|
seg = self._merge_bu(seg) |
|
try: |
|
seg = self._merge_yi(seg) |
|
except: |
|
print("_merge_yi failed") |
|
seg = self._merge_reduplication(seg) |
|
seg = self._merge_continuous_three_tones(seg) |
|
seg = self._merge_continuous_three_tones_2(seg) |
|
seg = self._merge_er(seg) |
|
return seg |
|
|
|
def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]: |
|
finals = self._bu_sandhi(word, finals) |
|
finals = self._yi_sandhi(word, finals) |
|
finals = self._neural_sandhi(word, pos, finals) |
|
finals = self._three_sandhi(word, finals) |
|
return finals |
|
|