|
""" |
|
日语、韩语 等 |
|
https://www.cnblogs.com/luoganttcc/p/16605150.html |
|
https://zhuanlan.zhihu.com/p/618684374 |
|
- https://zhuanlan.zhihu.com/p/84625185 赞 |
|
|
|
|
|
## 相关包 |
|
|
|
import opencc |
|
import langid |
|
import langdetect |
|
https://github.com/pemistahl/lingua-py |
|
- 原理: |
|
|
|
|
|
""" |
|
|
|
|
|
|
|
from zhon.hanzi import punctuation as zh_punc |
|
|
|
def is_zh_char(uchar):
    """Return True if ``uchar`` lies in the range U+4E00..U+9FA5 (inclusive).

    The check is a plain string comparison against the two boundary
    characters, so it is meant to be called with a single character.

    For reference, jieba matches a slightly wider range (U+4E00..U+9FD5):
    https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48
    """
    first_han = u'\u4e00'
    last_han = u'\u9fa5'
    # Guard clause: anything sorting before the lower bound is rejected early.
    if uchar < first_han:
        return False
    return uchar <= last_han
|
|
|
def has_zh_punc(text):
    """Return True if ``text`` contains at least one Chinese punctuation mark.

    A character counts as Chinese punctuation when it occurs in
    ``zhon.hanzi.punctuation`` (imported in this module as ``zh_punc``).
    """
    for symbol in text:
        if symbol in zh_punc:
            return True
    return False
|
|
|
|
|
def has_zh(text):
    """Return True if at least one character of ``text`` passes ``is_zh_char``."""
    for symbol in text:
        if is_zh_char(symbol):
            return True
    return False
|
|
|
|
|
def get_zh_count(text):
    """Return how many characters of ``text`` pass ``is_zh_char``."""
    total = 0
    for symbol in text:
        if is_zh_char(symbol):
            total += 1
    return total
|
|
|
|
|
def is_all_zh(text):
    """Return True if ``text`` is non-empty and every character passes ``is_zh_char``.

    An empty (or otherwise falsy) ``text`` yields False. ``all()`` over zero
    characters is vacuously True, which would report ``""`` as "all Chinese"
    and disagree with ``is_all_en("")``, which is False.
    """
    if not text:
        return False
    return all(is_zh_char(char) for char in text)
|
|
|
|
|
def is_all_en(text):
    """Return True if ``text`` is non-empty and consists only of ASCII letters.

    The text is encoded to UTF-8 and checked with ``bytes.isalpha``, which
    accepts ASCII alphabetic bytes only. Any digit, whitespace, punctuation
    or non-ASCII character therefore makes the result False.
    """
    utf8_bytes = text.encode('utf-8')
    return utf8_bytes.isalpha()
|
|
|
|
|
|
|
|
|
# Inclusive code-point ranges treated as CJK by ``is_cjk``.
# Each entry is a dict with integer "from" and "to" keys.
ranges = [
    {"from": first, "to": last}
    for first, last in (
        (0x3300, 0x33FF),    # CJK Compatibility
        (0xFE30, 0xFE4F),    # CJK Compatibility Forms
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
        (0x3040, 0x309F),    # Hiragana
        (0x30A0, 0x30FF),    # Katakana
        (0x2E80, 0x2EFF),    # CJK Radicals Supplement
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
        (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
        (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
        (0x2B820, 0x2CEAF),  # CJK Unified Ideographs Extension E
    )
]
|
|
|
|
|
|
|
|
|
def is_cjk(char):
    """Return True if ``char`` falls inside one of the code-point ranges in ``ranges``.

    CJK stands for Chinese, Japanese and Korean.
    Japanese uses many Chinese characters (more than 20,000 kanji).
    Korean has the Hangul script (more than 50 letters) as well as more than
    20,000 Chinese characters (hanja).

    NOTE(review): the module-level ``ranges`` table lists ideograph, radical,
    compatibility and kana ranges but no Hangul range, so Hangul characters
    are not matched here. Confirm whether that is intended.

    Args:
        char: A single character. ``ord`` raises ``TypeError`` for anything
            that is not a string of length one.

    Returns:
        bool: True when the character's code point lies within any range.
    """
    # Compute the code point once instead of once per range.
    code_point = ord(char)
    # The loop variable is not named ``range`` so the builtin is not shadowed,
    # and a generator lets ``any`` stop at the first matching range.
    return any(span["from"] <= code_point <= span["to"] for span in ranges)
|
|
|
|
|
def cjk_substrings(string):
    """Yield each maximal run of consecutive CJK characters found in ``string``.

    Characters are classified with ``is_cjk``. Runs are yielded in order of
    appearance as slices of ``string``; non-CJK characters are skipped.

    Args:
        string: The text to scan.

    Yields:
        Slices of ``string`` made up entirely of characters accepted by
        ``is_cjk``.
    """
    length = len(string)
    i = 0
    while i < length:
        if not is_cjk(string[i]):
            i += 1
            continue
        start = i
        # Check the bound before indexing: without it, a run that reaches the
        # end of the string indexes one past the end and raises IndexError.
        while i < length and is_cjk(string[i]):
            i += 1
        yield string[start:i]
|
|
|
|
|
def aa():
    """Debug helper: print a sample of characters from every entry in ``ranges``.

    For each entry, prints its index followed by the ten characters starting
    at the entry's "from" code point, all on one line.
    """
    for index, span in enumerate(ranges):
        first_code_point = span["from"]
        print(index, end=": ")
        for offset in range(10):
            sample = chr(first_code_point + offset)
            print(sample, end=", ")
        print("")
|
|
|
|
|
|
|
|
|
|
|
def is_traditional_chinese(text):
    """Return True if converting ``text`` with OpenCC's ``t2s`` config changes it.

    The text is run through the Traditional-to-Simplified conversion. Any
    difference between input and output is taken to mean the text contains
    Traditional Chinese characters. Text the conversion leaves untouched
    returns False.

    Args:
        text: The string to examine.

    Returns:
        bool: True when the converted text differs from the input.
    """
    # Imported locally: no file-level import of ``opencc`` is visible in this
    # module, so the bare name would raise NameError on the first call.
    import opencc

    converter = opencc.OpenCC('t2s')
    return converter.convert(text) != text
|
|
|
|
|
|
|
|
|
|