# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from typing import Tuple

import jieba
from pypinyin import lazy_pinyin
from pypinyin import Style
class ToneSandhi: | |
def __init__(self): | |
self.must_neural_tone_words = { | |
"麻烦", | |
"麻利", | |
"鸳鸯", | |
"高粱", | |
"骨头", | |
"骆驼", | |
"马虎", | |
"首饰", | |
"馒头", | |
"馄饨", | |
"风筝", | |
"难为", | |
"队伍", | |
"阔气", | |
"闺女", | |
"门道", | |
"锄头", | |
"铺盖", | |
"铃铛", | |
"铁匠", | |
"钥匙", | |
"里脊", | |
"里头", | |
"部分", | |
"那么", | |
"道士", | |
"造化", | |
"迷糊", | |
"连累", | |
"这么", | |
"这个", | |
"运气", | |
"过去", | |
"软和", | |
"转悠", | |
"踏实", | |
"跳蚤", | |
"跟头", | |
"趔趄", | |
"财主", | |
"豆腐", | |
"讲究", | |
"记性", | |
"记号", | |
"认识", | |
"规矩", | |
"见识", | |
"裁缝", | |
"补丁", | |
"衣裳", | |
"衣服", | |
"衙门", | |
"街坊", | |
"行李", | |
"行当", | |
"蛤蟆", | |
"蘑菇", | |
"薄荷", | |
"葫芦", | |
"葡萄", | |
"萝卜", | |
"荸荠", | |
"苗条", | |
"苗头", | |
"苍蝇", | |
"芝麻", | |
"舒服", | |
"舒坦", | |
"舌头", | |
"自在", | |
"膏药", | |
"脾气", | |
"脑袋", | |
"脊梁", | |
"能耐", | |
"胳膊", | |
"胭脂", | |
"胡萝", | |
"胡琴", | |
"胡同", | |
"聪明", | |
"耽误", | |
"耽搁", | |
"耷拉", | |
"耳朵", | |
"老爷", | |
"老实", | |
"老婆", | |
"老头", | |
"老太", | |
"翻腾", | |
"罗嗦", | |
"罐头", | |
"编辑", | |
"结实", | |
"红火", | |
"累赘", | |
"糨糊", | |
"糊涂", | |
"精神", | |
"粮食", | |
"簸箕", | |
"篱笆", | |
"算计", | |
"算盘", | |
"答应", | |
"笤帚", | |
"笑语", | |
"笑话", | |
"窟窿", | |
"窝囊", | |
"窗户", | |
"稳当", | |
"稀罕", | |
"称呼", | |
"秧歌", | |
"秀气", | |
"秀才", | |
"福气", | |
"祖宗", | |
"砚台", | |
"码头", | |
"石榴", | |
"石头", | |
"石匠", | |
"知识", | |
"眼睛", | |
"眯缝", | |
"眨巴", | |
"眉毛", | |
"相声", | |
"盘算", | |
"白净", | |
"痢疾", | |
"痛快", | |
"疟疾", | |
"疙瘩", | |
"疏忽", | |
"畜生", | |
"生意", | |
"甘蔗", | |
"琵琶", | |
"琢磨", | |
"琉璃", | |
"玻璃", | |
"玫瑰", | |
"玄乎", | |
"狐狸", | |
"状元", | |
"特务", | |
"牲口", | |
"牙碜", | |
"牌楼", | |
"爽快", | |
"爱人", | |
"热闹", | |
"烧饼", | |
"烟筒", | |
"烂糊", | |
"点心", | |
"炊帚", | |
"灯笼", | |
"火候", | |
"漂亮", | |
"滑溜", | |
"溜达", | |
"温和", | |
"清楚", | |
"消息", | |
"浪头", | |
"活泼", | |
"比方", | |
"正经", | |
"欺负", | |
"模糊", | |
"槟榔", | |
"棺材", | |
"棒槌", | |
"棉花", | |
"核桃", | |
"栅栏", | |
"柴火", | |
"架势", | |
"枕头", | |
"枇杷", | |
"机灵", | |
"本事", | |
"木头", | |
"木匠", | |
"朋友", | |
"月饼", | |
"月亮", | |
"暖和", | |
"明白", | |
"时候", | |
"新鲜", | |
"故事", | |
"收拾", | |
"收成", | |
"提防", | |
"挖苦", | |
"挑剔", | |
"指甲", | |
"指头", | |
"拾掇", | |
"拳头", | |
"拨弄", | |
"招牌", | |
"招呼", | |
"抬举", | |
"护士", | |
"折腾", | |
"扫帚", | |
"打量", | |
"打算", | |
"打点", | |
"打扮", | |
"打听", | |
"打发", | |
"扎实", | |
"扁担", | |
"戒指", | |
"懒得", | |
"意识", | |
"意思", | |
"情形", | |
"悟性", | |
"怪物", | |
"思量", | |
"怎么", | |
"念头", | |
"念叨", | |
"快活", | |
"忙活", | |
"志气", | |
"心思", | |
"得罪", | |
"张罗", | |
"弟兄", | |
"开通", | |
"应酬", | |
"庄稼", | |
"干事", | |
"帮手", | |
"帐篷", | |
"希罕", | |
"师父", | |
"师傅", | |
"巴结", | |
"巴掌", | |
"差事", | |
"工夫", | |
"岁数", | |
"屁股", | |
"尾巴", | |
"少爷", | |
"小气", | |
"小伙", | |
"将就", | |
"对头", | |
"对付", | |
"寡妇", | |
"家伙", | |
"客气", | |
"实在", | |
"官司", | |
"学问", | |
"学生", | |
"字号", | |
"嫁妆", | |
"媳妇", | |
"媒人", | |
"婆家", | |
"娘家", | |
"委屈", | |
"姑娘", | |
"姐夫", | |
"妯娌", | |
"妥当", | |
"妖精", | |
"奴才", | |
"女婿", | |
"头发", | |
"太阳", | |
"大爷", | |
"大方", | |
"大意", | |
"大夫", | |
"多少", | |
"多么", | |
"外甥", | |
"壮实", | |
"地道", | |
"地方", | |
"在乎", | |
"困难", | |
"嘴巴", | |
"嘱咐", | |
"嘟囔", | |
"嘀咕", | |
"喜欢", | |
"喇嘛", | |
"喇叭", | |
"商量", | |
"唾沫", | |
"哑巴", | |
"哈欠", | |
"哆嗦", | |
"咳嗽", | |
"和尚", | |
"告诉", | |
"告示", | |
"含糊", | |
"吓唬", | |
"后头", | |
"名字", | |
"名堂", | |
"合同", | |
"吆喝", | |
"叫唤", | |
"口袋", | |
"厚道", | |
"厉害", | |
"千斤", | |
"包袱", | |
"包涵", | |
"匀称", | |
"勤快", | |
"动静", | |
"动弹", | |
"功夫", | |
"力气", | |
"前头", | |
"刺猬", | |
"刺激", | |
"别扭", | |
"利落", | |
"利索", | |
"利害", | |
"分析", | |
"出息", | |
"凑合", | |
"凉快", | |
"冷战", | |
"冤枉", | |
"冒失", | |
"养活", | |
"关系", | |
"先生", | |
"兄弟", | |
"便宜", | |
"使唤", | |
"佩服", | |
"作坊", | |
"体面", | |
"位置", | |
"似的", | |
"伙计", | |
"休息", | |
"什么", | |
"人家", | |
"亲戚", | |
"亲家", | |
"交情", | |
"云彩", | |
"事情", | |
"买卖", | |
"主意", | |
"丫头", | |
"丧气", | |
"两口", | |
"东西", | |
"东家", | |
"世故", | |
"不由", | |
"不在", | |
"下水", | |
"下巴", | |
"上头", | |
"上司", | |
"丈夫", | |
"丈人", | |
"一辈", | |
"那个", | |
"菩萨", | |
"父亲", | |
"母亲", | |
"咕噜", | |
"邋遢", | |
"费用", | |
"冤家", | |
"甜头", | |
"介绍", | |
"荒唐", | |
"大人", | |
"泥鳅", | |
"幸福", | |
"熟悉", | |
"计划", | |
"扑腾", | |
"蜡烛", | |
"姥爷", | |
"照顾", | |
"喉咙", | |
"吉他", | |
"弄堂", | |
"蚂蚱", | |
"凤凰", | |
"拖沓", | |
"寒碜", | |
"糟蹋", | |
"倒腾", | |
"报复", | |
"逻辑", | |
"盘缠", | |
"喽啰", | |
"牢骚", | |
"咖喱", | |
"扫把", | |
"惦记", | |
} | |
self.must_not_neural_tone_words = { | |
"男子", | |
"女子", | |
"分子", | |
"原子", | |
"量子", | |
"莲子", | |
"石子", | |
"瓜子", | |
"电子", | |
"人人", | |
"虎虎", | |
} | |
self.punc = ":,;。?!“”‘’':,;.?!" | |
# the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041 | |
# e.g. | |
# word: "家里" | |
# pos: "s" | |
# finals: ['ia1', 'i3'] | |
def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]: | |
# reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺 | |
for j, item in enumerate(word): | |
if ( | |
j - 1 >= 0 | |
and item == word[j - 1] | |
and pos[0] in {"n", "v", "a"} | |
and word not in self.must_not_neural_tone_words | |
): | |
finals[j] = finals[j][:-1] + "5" | |
ge_idx = word.find("个") | |
if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": | |
finals[-1] = finals[-1][:-1] + "5" | |
elif len(word) >= 1 and word[-1] in "的地得": | |
finals[-1] = finals[-1][:-1] + "5" | |
# e.g. 走了, 看着, 去过 | |
# elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}: | |
# finals[-1] = finals[-1][:-1] + "5" | |
elif ( | |
len(word) > 1 | |
and word[-1] in "们子" | |
and pos in {"r", "n"} | |
and word not in self.must_not_neural_tone_words | |
): | |
finals[-1] = finals[-1][:-1] + "5" | |
# e.g. 桌上, 地下, 家里 | |
elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}: | |
finals[-1] = finals[-1][:-1] + "5" | |
# e.g. 上来, 下去 | |
elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开": | |
finals[-1] = finals[-1][:-1] + "5" | |
# 个做量词 | |
elif ( | |
ge_idx >= 1 | |
and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是") | |
) or word == "个": | |
finals[ge_idx] = finals[ge_idx][:-1] + "5" | |
else: | |
if ( | |
word in self.must_neural_tone_words | |
or word[-2:] in self.must_neural_tone_words | |
): | |
finals[-1] = finals[-1][:-1] + "5" | |
word_list = self._split_word(word) | |
finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] | |
for i, word in enumerate(word_list): | |
# conventional neural in Chinese | |
if ( | |
word in self.must_neural_tone_words | |
or word[-2:] in self.must_neural_tone_words | |
): | |
finals_list[i][-1] = finals_list[i][-1][:-1] + "5" | |
finals = sum(finals_list, []) | |
return finals | |
def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]: | |
# e.g. 看不懂 | |
if len(word) == 3 and word[1] == "不": | |
finals[1] = finals[1][:-1] + "5" | |
else: | |
for i, char in enumerate(word): | |
# "不" before tone4 should be bu2, e.g. 不怕 | |
if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4": | |
finals[i] = finals[i][:-1] + "2" | |
return finals | |
def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]: | |
# "一" in number sequences, e.g. 一零零, 二一零 | |
if word.find("一") != -1 and all( | |
[item.isnumeric() for item in word if item != "一"] | |
): | |
return finals | |
# "一" between reduplication words should be yi5, e.g. 看一看 | |
elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]: | |
finals[1] = finals[1][:-1] + "5" | |
# when "一" is ordinal word, it should be yi1 | |
elif word.startswith("第一"): | |
finals[1] = finals[1][:-1] + "1" | |
else: | |
for i, char in enumerate(word): | |
if char == "一" and i + 1 < len(word): | |
# "一" before tone4 should be yi2, e.g. 一段 | |
if finals[i + 1][-1] == "4": | |
finals[i] = finals[i][:-1] + "2" | |
# "一" before non-tone4 should be yi4, e.g. 一天 | |
else: | |
# "一" 后面如果是标点,还读一声 | |
if word[i + 1] not in self.punc: | |
finals[i] = finals[i][:-1] + "4" | |
return finals | |
def _split_word(self, word: str) -> List[str]: | |
word_list = jieba.cut_for_search(word) | |
word_list = sorted(word_list, key=lambda i: len(i), reverse=False) | |
first_subword = word_list[0] | |
first_begin_idx = word.find(first_subword) | |
if first_begin_idx == 0: | |
second_subword = word[len(first_subword) :] | |
new_word_list = [first_subword, second_subword] | |
else: | |
second_subword = word[: -len(first_subword)] | |
new_word_list = [second_subword, first_subword] | |
return new_word_list | |
def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: | |
if len(word) == 2 and self._all_tone_three(finals): | |
finals[0] = finals[0][:-1] + "2" | |
elif len(word) == 3: | |
word_list = self._split_word(word) | |
if self._all_tone_three(finals): | |
# disyllabic + monosyllabic, e.g. 蒙古/包 | |
if len(word_list[0]) == 2: | |
finals[0] = finals[0][:-1] + "2" | |
finals[1] = finals[1][:-1] + "2" | |
# monosyllabic + disyllabic, e.g. 纸/老虎 | |
elif len(word_list[0]) == 1: | |
finals[1] = finals[1][:-1] + "2" | |
else: | |
finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] | |
if len(finals_list) == 2: | |
for i, sub in enumerate(finals_list): | |
# e.g. 所有/人 | |
if self._all_tone_three(sub) and len(sub) == 2: | |
finals_list[i][0] = finals_list[i][0][:-1] + "2" | |
# e.g. 好/喜欢 | |
elif ( | |
i == 1 | |
and not self._all_tone_three(sub) | |
and finals_list[i][0][-1] == "3" | |
and finals_list[0][-1][-1] == "3" | |
): | |
finals_list[0][-1] = finals_list[0][-1][:-1] + "2" | |
finals = sum(finals_list, []) | |
# split idiom into two words who's length is 2 | |
elif len(word) == 4: | |
finals_list = [finals[:2], finals[2:]] | |
finals = [] | |
for sub in finals_list: | |
if self._all_tone_three(sub): | |
sub[0] = sub[0][:-1] + "2" | |
finals += sub | |
return finals | |
def _all_tone_three(self, finals: List[str]) -> bool: | |
return all(x[-1] == "3" for x in finals) | |
# merge "不" and the word behind it | |
# if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error | |
def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: | |
new_seg = [] | |
last_word = "" | |
for word, pos in seg: | |
if last_word == "不": | |
word = last_word + word | |
if word != "不": | |
new_seg.append((word, pos)) | |
last_word = word[:] | |
if last_word == "不": | |
new_seg.append((last_word, "d")) | |
last_word = "" | |
return new_seg | |
# function 1: merge "一" and reduplication words in it's left and right, e.g. "听","一","听" ->"听一听" | |
# function 2: merge single "一" and the word behind it | |
# if don't merge, "一" sometimes appears alone according to jieba, which may occur sandhi error | |
# e.g. | |
# input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')] | |
# output seg: [['听一听', 'v']] | |
def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: | |
new_seg = [] | |
# function 1 | |
for i, (word, pos) in enumerate(seg): | |
if ( | |
i - 1 >= 0 | |
and word == "一" | |
and i + 1 < len(seg) | |
and seg[i - 1][0] == seg[i + 1][0] | |
and seg[i - 1][1] == "v" | |
): | |
new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0] | |
else: | |
if ( | |
i - 2 >= 0 | |
and seg[i - 1][0] == "一" | |
and seg[i - 2][0] == word | |
and pos == "v" | |
): | |
continue | |
else: | |
new_seg.append([word, pos]) | |
seg = new_seg | |
new_seg = [] | |
# function 2 | |
for i, (word, pos) in enumerate(seg): | |
if new_seg and new_seg[-1][0] == "一": | |
new_seg[-1][0] = new_seg[-1][0] + word | |
else: | |
new_seg.append([word, pos]) | |
return new_seg | |
# the first and the second words are all_tone_three | |
def _merge_continuous_three_tones( | |
self, seg: List[Tuple[str, str]] | |
) -> List[Tuple[str, str]]: | |
new_seg = [] | |
sub_finals_list = [ | |
lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) | |
for (word, pos) in seg | |
] | |
assert len(sub_finals_list) == len(seg) | |
merge_last = [False] * len(seg) | |
for i, (word, pos) in enumerate(seg): | |
if ( | |
i - 1 >= 0 | |
and self._all_tone_three(sub_finals_list[i - 1]) | |
and self._all_tone_three(sub_finals_list[i]) | |
and not merge_last[i - 1] | |
): | |
# if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi | |
if ( | |
not self._is_reduplication(seg[i - 1][0]) | |
and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 | |
): | |
new_seg[-1][0] = new_seg[-1][0] + seg[i][0] | |
merge_last[i] = True | |
else: | |
new_seg.append([word, pos]) | |
else: | |
new_seg.append([word, pos]) | |
return new_seg | |
def _is_reduplication(self, word: str) -> bool: | |
return len(word) == 2 and word[0] == word[1] | |
# the last char of first word and the first char of second word is tone_three | |
def _merge_continuous_three_tones_2( | |
self, seg: List[Tuple[str, str]] | |
) -> List[Tuple[str, str]]: | |
new_seg = [] | |
sub_finals_list = [ | |
lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) | |
for (word, pos) in seg | |
] | |
assert len(sub_finals_list) == len(seg) | |
merge_last = [False] * len(seg) | |
for i, (word, pos) in enumerate(seg): | |
if ( | |
i - 1 >= 0 | |
and sub_finals_list[i - 1][-1][-1] == "3" | |
and sub_finals_list[i][0][-1] == "3" | |
and not merge_last[i - 1] | |
): | |
# if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi | |
if ( | |
not self._is_reduplication(seg[i - 1][0]) | |
and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 | |
): | |
new_seg[-1][0] = new_seg[-1][0] + seg[i][0] | |
merge_last[i] = True | |
else: | |
new_seg.append([word, pos]) | |
else: | |
new_seg.append([word, pos]) | |
return new_seg | |
def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: | |
new_seg = [] | |
for i, (word, pos) in enumerate(seg): | |
if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#": | |
new_seg[-1][0] = new_seg[-1][0] + seg[i][0] | |
else: | |
new_seg.append([word, pos]) | |
return new_seg | |
def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: | |
new_seg = [] | |
for i, (word, pos) in enumerate(seg): | |
if new_seg and word == new_seg[-1][0]: | |
new_seg[-1][0] = new_seg[-1][0] + seg[i][0] | |
else: | |
new_seg.append([word, pos]) | |
return new_seg | |
def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: | |
seg = self._merge_bu(seg) | |
try: | |
seg = self._merge_yi(seg) | |
except: | |
print("_merge_yi failed") | |
seg = self._merge_reduplication(seg) | |
seg = self._merge_continuous_three_tones(seg) | |
seg = self._merge_continuous_three_tones_2(seg) | |
seg = self._merge_er(seg) | |
return seg | |
def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]: | |
finals = self._bu_sandhi(word, finals) | |
finals = self._yi_sandhi(word, finals) | |
finals = self._neural_sandhi(word, pos, finals) | |
finals = self._three_sandhi(word, finals) | |
return finals | |