""" 日语、韩语 等 https://www.cnblogs.com/luoganttcc/p/16605150.html https://zhuanlan.zhihu.com/p/618684374 - https://zhuanlan.zhihu.com/p/84625185 赞 ## 相关包 import opencc import langid imort langdetect https://github.com/pemistahl/lingua-py - 原理: """ from zhon.hanzi import punctuation as zh_punc def is_zh_char(uchar): """ https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48 re.compile("([\u4E00-\u9FD5]+)", re.U) """ return u'\u4e00' <= uchar <= u'\u9fa5' def has_zh_punc(text): """ 是否包含中文标点 """ return any(ch in zh_punc for ch in text) def has_zh(text): """ contains Chinese characters """ return any(is_zh_char(ch) for ch in text) def get_zh_count(text): return sum([is_zh_char(uchar) for uchar in text]) def is_all_zh(text): return all(is_zh_char(char) for char in text) def is_all_en(text): return text.encode('utf-8').isalpha() ranges = [ {"from": ord(u"\u3300"), "to": ord(u"\u33ff")}, # compatibility ideographs {"from": ord(u"\ufe30"), "to": ord(u"\ufe4f")}, # compatibility ideographs {"from": ord(u"\uf900"), "to": ord(u"\ufaff")}, # compatibility ideographs {"from": ord(u"\U0002F800"), "to": ord(u"\U0002fa1f")}, # compatibility ideographs {'from': ord(u'\u3040'), 'to': ord(u'\u309f')}, # Japanese Hiragana 日本平假名 96个 {"from": ord(u"\u30a0"), "to": ord(u"\u30ff")}, # Japanese Katakana 日语片假名 96个 {"from": ord(u"\u2e80"), "to": ord(u"\u2eff")}, # cjk radicals supplement {"from": ord(u"\u4e00"), "to": ord(u"\u9fff")}, # 中文 u"\u4e00"-'\u9fa5', {"from": ord(u"\u3400"), "to": ord(u"\u4dbf")}, # {"from": ord(u"\U00020000"), "to": ord(u"\U0002a6df")}, {"from": ord(u"\U0002a700"), "to": ord(u"\U0002b73f")}, {"from": ord(u"\U0002b740"), "to": ord(u"\U0002b81f")}, {"from": ord(u"\U0002b820"), "to": ord(u"\U0002ceaf")} # included as of Unicode 8.0 ] # 韩语 [\uac00-\ud7ff] def is_cjk(char): """ CJK(Chinese、Japanese、Korean) 日语中有很多汉字,日本汉字超过2万。 韩语有谚文,超过50个,有朝鲜汉字超过2万。 """ return any([range["from"] <= ord(char) <= range["to"] for range in ranges]) def cjk_substrings(string): i = 0 while i < len(string): if is_cjk(string[i]): start = i while is_cjk(string[i]): i += 1 yield string[start:i] i += 1 def aa(): # string = "sdf344asfasf天地方益3権sdfsdf".decode("utf-8") for idx, item in enumerate(ranges): print(idx, end=": ") for j in range(10): print(chr(item["from"] + j), end=", ") print("") # for sub in cjk_substrings(string): # string = string.replace(sub, "(" + sub + ")") # print(string) def is_traditional_chinese(text): cc = opencc.OpenCC('t2s') converted_text = cc.convert(text) if converted_text != text: return True return False # aa()