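# File: tokenizer-arena/vocab/kplug/test_langconv.py
# Hugging Face file-viewer residue (not part of the program):
#   author avatar: eson | commit: "update" (428b731) | size: 1.63 kB | scan: no virus
"""
Compare Chinese script conversion between two libraries, opencc and langconv.

The checks below feed the same text to both converters and either print the
results side by side or report the tokens on which the two disagree.
"""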
import langconv
import opencc
# Shared converters used by every test below; built once at import time.
# opencc converter loaded with the 't2s.json' config
# (presumably Traditional -> Simplified Chinese -- confirm against the OpenCC docs).
converter1 = opencc.OpenCC('t2s.json')
# langconv converter targeting 'zh-hans'
# (presumably Simplified Chinese -- confirm against the langconv module).
converter2 = langconv.Converter('zh-hans')
def test_special():
    """Placeholder that records a look-alike character pitfall.

    ``'拋' == '抛'`` evaluates to ``False``: the two glyphs look nearly
    identical but are distinct characters. Per the original note, the
    second one ('抛') is the correct form.

    The function intentionally performs no checks.
    """
    return None
def test_str():
    """Print a mixed sample sentence as converted by each converter.

    The sentence is converted first with ``converter1`` and then with
    ``converter2``; each result is printed on its own line so the two
    outputs can be compared by eye. Nothing is asserted.
    """
    sentence = (
        '高雄氣爆事故台灣人壽成立,余数,覆盖,颠覆,翻云覆雨,叱咤风云,哪吒,著作,合著,'
        '油腻,元気森林,凃涂层,拋抛弃,內塚沒'
        '锃旯莜蜉氚'
        'PC乾隆 乾 清朝乾隆年间 安儿乐有乾坤 胡诌 鲇鳐羰砵呡 箬叶 鬼塚虎'
    )

    # Convert and print one converter at a time, in the same order as before.
    opencc_result = converter1.convert(sentence)
    print(opencc_result)
    langconv_result = converter2.convert(sentence)
    print(langconv_result)
def test_oov_google():
    """Check the newly added OOV characters against both converters.

    Every character of the OOV string is converted with ``converter1`` and
    ``converter2``; the two results must be equal (``AssertionError``
    otherwise). Any character that ``converter1`` changes is printed next
    to its converted form.
    """
    # Goal: make sure the newly added OOV characters are not Traditional Chinese.
    oov_chars = (
        "“”—–…’‘绗楦硌袢钕蕞鳙癀皲貉唛笕椴―胗旯鳜鲅鳊鲳鲽鲣枞炝醅馊捯抻绉馐饧莜嘬"
        "腘肫鳟镊犽洌蝰铱髌锃镲锗甑戗裥弎粝霂猄轱苎偲兿铷栢帏黢洇沄誊忸怩蚬籺"
    )
    for char in oov_chars:
        via_opencc = converter1.convert(char)
        via_langconv = converter2.convert(char)
        assert via_opencc == via_langconv
        if via_opencc == char:
            # Conversion left the character untouched; nothing to report.
            continue
        print(char, via_opencc)
def test_vocab_google():
    """Report where opencc and langconv disagree on the Google vocab.

    Reads ``vocab.google.txt`` (one token per line, resolved relative to
    the current working directory) and converts every token with both
    ``converter1`` and ``converter2``. Each mismatch is printed as
    ``index token converter1_result converter2_result``. Nothing is
    asserted, so this is a report rather than a pass/fail check.

    Raises:
        OSError: if ``vocab.google.txt`` cannot be opened.
    """
    # The vocab holds CJK text: read it as UTF-8 instead of the platform
    # default, and close the handle deterministically.
    with open('vocab.google.txt', encoding='utf-8') as vocab_file:
        vocab = [line.rstrip('\n') for line in vocab_file]

    for i, token in enumerate(vocab):
        t1 = converter1.convert(token)
        # Must come from converter2: comparing converter1 with itself can
        # never differ, which would make the mismatch report vacuous.
        t2 = converter2.convert(token)
        if token == '灣':
            # Debug aid: show the index of this particular character.
            print(i, token)
        if t1 != t2:
            print(i, token, t1, t2)
if __name__ == "__main__":
    # Manual entry point: only the sample-sentence printout runs by default.
    # The vocab comparison is left disabled; uncomment to run it.
    # test_vocab_google()
    test_str()