tokenizer-arena / utils /lang_util_2.py
eson's picture
update
f331792
raw history blame
No virus
3.05 kB
"""
日语、韩语 等
https://www.cnblogs.com/luoganttcc/p/16605150.html
https://zhuanlan.zhihu.com/p/618684374
- https://zhuanlan.zhihu.com/p/84625185 赞
## 相关包
import opencc
import langid
import langdetect
https://github.com/pemistahl/lingua-py
- 原理:
"""
from zhon.hanzi import punctuation as zh_punc
def is_zh_char(uchar):
    """Return True if *uchar* lies between U+4E00 and U+9FA5 inclusive.

    The check is a plain string comparison against the two bounds, so it is
    meant to be called with a single character.

    For reference, jieba uses a slightly wider range (U+4E00..U+9FD5):
    https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48
    """
    lower_bound, upper_bound = "\u4e00", "\u9fa5"
    return lower_bound <= uchar <= upper_bound
def has_zh_punc(text):
    """Return True if *text* contains at least one Chinese punctuation mark.

    Each character is looked up in ``zh_punc`` (``zhon.hanzi.punctuation``).
    """
    for ch in text:
        if ch in zh_punc:
            return True
    return False
def has_zh(text):
    """Return True if any character of *text* passes ``is_zh_char``."""
    for ch in text:
        if is_zh_char(ch):
            return True
    return False
def get_zh_count(text):
    """Return how many characters of *text* pass ``is_zh_char``."""
    return sum(1 for ch in text if is_zh_char(ch))
def is_all_zh(text):
    """Return True if every character of *text* passes ``is_zh_char``.

    An empty *text* yields True, since there is no character that fails.
    """
    for ch in text:
        if not is_zh_char(ch):
            return False
    return True
def is_all_en(text):
    """Return True if *text* is non-empty and made only of ASCII letters.

    The text is encoded to UTF-8 first; ``bytes.isalpha`` only accepts the
    ASCII letters, so any multi-byte (non-ASCII) character makes it False.
    """
    utf8_bytes = text.encode('utf-8')
    return utf8_bytes.isalpha()
# Inclusive code point intervals treated as CJK by is_cjk().
# Each entry is a dict with integer "from" and "to" keys.
ranges = [
    {"from": low, "to": high}
    for low, high in (
        (0x3300, 0x33FF),    # CJK Compatibility
        (0xFE30, 0xFE4F),    # CJK Compatibility Forms
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
        (0x3040, 0x309F),    # Japanese Hiragana (96 code points)
        (0x30A0, 0x30FF),    # Japanese Katakana (96 code points)
        (0x2E80, 0x2EFF),    # CJK Radicals Supplement
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs (Chinese; is_zh_char stops at U+9FA5)
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
        (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
        (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
        (0x2B820, 0x2CEAF),  # CJK Unified Ideographs Extension E (Unicode 8.0)
    )
]
# Korean (Hangul) would be [\uac00-\ud7ff]; it is not part of the table above.
def is_cjk(char):
    """Return True if *char* falls inside one of the intervals in ``ranges``.

    CJK stands for Chinese, Japanese and Korean. Japanese uses a large
    number of Chinese characters (more than 20,000 kanji). Korean has the
    Hangul alphabet (more than 50 letters) as well as more than 20,000
    hanja. Note that ``ranges`` holds ideograph, radical and kana intervals
    only; it has no Hangul interval, so Hangul characters return False.

    Args:
        char: A single character; it is passed to ``ord``.

    Returns:
        bool: Whether the code point of *char* is covered by ``ranges``.

    Raises:
        TypeError: Raised by ``ord`` when *char* is not a one-character
            string.
    """
    # Compute the code point once instead of once per interval.
    code_point = ord(char)
    # A generator lets any() stop at the first matching interval; the loop
    # variable is named `block` so the builtin `range` is not shadowed.
    return any(block["from"] <= code_point <= block["to"] for block in ranges)
def cjk_substrings(string):
    """Yield each maximal run of consecutive CJK characters in *string*.

    A character belongs to a run when ``is_cjk`` returns True for it. Runs
    are yielded in order of appearance as slices of *string*.

    Args:
        string: The text to scan.

    Yields:
        str: One substring per uninterrupted run of CJK characters.
    """
    length = len(string)
    i = 0
    while i < length:
        if not is_cjk(string[i]):
            i += 1
            continue
        start = i
        # The bounds check matters: without it, a string that ends with a
        # CJK character indexes one past the end and raises IndexError.
        while i < length and is_cjk(string[i]):
            i += 1
        yield string[start:i]
def aa():
    """Print the first ten characters of every interval in ``ranges``.

    Debug helper. One line is printed per interval, in the form
    ``<index>: c0, c1, ..., c9, `` where ``c0`` is the character at the
    interval's "from" code point.
    """
    # A previous experiment, now disabled, took a mixed ASCII/CJK sample
    # string, wrapped every run returned by cjk_substrings() in parentheses
    # and printed the result.
    for position, span in enumerate(ranges):
        first = span["from"]
        sample = "".join(chr(first + offset) + ", " for offset in range(10))
        print(str(position) + ": " + sample)
def is_traditional_chinese(text):
    """Return True if *text* changes when converted Traditional -> Simplified.

    The text is run through OpenCC with the ``t2s`` configuration; if the
    converted result differs from the input, the input is reported as
    Traditional Chinese.

    Args:
        text: The string to inspect.

    Returns:
        bool: True when the ``t2s`` conversion alters *text*, else False.
    """
    # Imported here because this module does not import opencc at the top
    # level; without this the call below fails with NameError.
    import opencc

    converter = opencc.OpenCC('t2s')
    return converter.convert(text) != text
# aa()