Spaces:
Running
Running
File size: 3,310 Bytes
9558ae0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
"""
日语、韩语 等
https://www.cnblogs.com/luoganttcc/p/16605150.html
https://zhuanlan.zhihu.com/p/618684374
- https://zhuanlan.zhihu.com/p/84625185 赞
## 相关包
import opencc
import langid
import langdetect
https://github.com/pemistahl/lingua-py
- 原理:
"""
from zhon.hanzi import punctuation as zh_punc
def is_zh_char(uchar):
    """Return True when *uchar* lies in the basic Han range U+4E00..U+9FA5.

    Same range jieba uses for Han tokenization:
    https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48
    re.compile("([\u4E00-\u9FD5]+)", re.U)
    """
    lower, upper = '\u4e00', '\u9fa5'
    return lower <= uchar <= upper
def has_zh_punc(text):
    """Return True if *text* contains at least one Chinese punctuation mark.

    Membership is tested against zhon's ``punctuation`` set (imported as
    ``zh_punc`` at module level).
    """
    for ch in text:
        if ch in zh_punc:
            return True
    return False
def has_zh(text):
    """Return True if *text* contains at least one Chinese (Han) character."""
    for ch in text:
        if is_zh_char(ch):
            return True
    return False
def get_zh_count(text):
    """Return the number of Chinese (Han) characters in *text*.

    Booleans sum as 0/1; a generator expression avoids materializing an
    intermediate list (the original used ``sum([...])``).
    """
    return sum(is_zh_char(uchar) for uchar in text)
def is_all_zh(text):
    """Return True when every character of *text* is a Han character.

    NOTE: vacuously True for the empty string (``all`` over no items).
    """
    for char in text:
        if not is_zh_char(char):
            return False
    return True
def is_all_en(text):
    """Return True when *text* is non-empty and consists only of ASCII letters.

    ``bytes.isalpha`` only accepts a-z/A-Z, so any non-ASCII character
    (which UTF-8 encodes to bytes > 0x7F) makes this False.
    """
    encoded = text.encode('utf-8')
    return encoded.isalpha()
# import opencc
def is_russian():
    """Russian detection -- placeholder, not implemented yet."""
    return None
def is_french():
    """French detection -- placeholder, not implemented yet."""
    return None
def aa():
    """
    zh-Hans: Chinese (Simplified) -- placeholder, not implemented.
    :return: None

    NOTE(review): a second ``def aa()`` later in this file shadows this one.
    """
    return None
def bb():
    """
    zh-Hant: Chinese (Traditional) -- placeholder, not implemented.
    :return: None
    """
    return None
# Unicode code-point ranges covering CJK scripts; each entry's "from"/"to"
# bounds are inclusive and are consumed by is_cjk() below.
ranges = [
    {"from": 0x3300, "to": 0x33FF},    # compatibility ideographs
    {"from": 0xFE30, "to": 0xFE4F},    # compatibility ideographs
    {"from": 0xF900, "to": 0xFAFF},    # compatibility ideographs
    {"from": 0x2F800, "to": 0x2FA1F},  # compatibility ideographs
    {"from": 0x3040, "to": 0x309F},    # Japanese Hiragana (96 code points)
    {"from": 0x30A0, "to": 0x30FF},    # Japanese Katakana (96 code points)
    {"from": 0x2E80, "to": 0x2EFF},    # CJK radicals supplement
    {"from": 0x4E00, "to": 0x9FFF},    # Chinese (cf. basic Han \u4e00-\u9fa5)
    {"from": 0x3400, "to": 0x4DBF},    # CJK Extension A
    {"from": 0x20000, "to": 0x2A6DF},  # CJK Extension B
    {"from": 0x2A700, "to": 0x2B73F},  # CJK Extension C
    {"from": 0x2B740, "to": 0x2B81F},  # CJK Extension D
    {"from": 0x2B820, "to": 0x2CEAF},  # included as of Unicode 8.0
]
# Korean Hangul syllables live in [\uac00-\ud7ff] and are NOT in the list above.
def is_cjk(char):
    """Return True if *char* falls in any CJK range of the module-level ``ranges``.

    CJK = Chinese, Japanese, Korean. Japanese uses 20k+ Han characters;
    Korean mixes Hangul with 20k+ Hanja. (Hangul syllables themselves are
    not covered by ``ranges`` -- see the comment above it.)

    Fixes: the loop variable no longer shadows the builtin ``range``, and
    ``any`` now consumes a generator (short-circuits) instead of first
    building a full list.
    """
    cp = ord(char)
    return any(rng["from"] <= cp <= rng["to"] for rng in ranges)
def cjk_substrings(string):
    """Yield each maximal run of consecutive CJK characters in *string*.

    Bug fix: the inner scan previously evaluated ``is_cjk(string[i])``
    without a bounds check, raising IndexError whenever the string ended
    with a CJK character. The scan now stops at the end of the string.
    """
    i = 0
    n = len(string)
    while i < n:
        if is_cjk(string[i]):
            start = i
            while i < n and is_cjk(string[i]):
                i += 1
            yield string[start:i]
        # position i is either past the end or a non-CJK char; skip it
        i += 1
def aa():
    """Debug helper: print the first 10 characters of every entry in ``ranges``.

    NOTE(review): this redefinition shadows the earlier ``def aa()`` above.
    """
    # Example usage once explored here (kept for reference):
    # string = "sdf344asfasf天地方益3権sdfsdf".decode("utf-8")
    # for sub in cjk_substrings(string):
    #     string = string.replace(sub, "(" + sub + ")")
    # print(string)
    for idx, item in enumerate(ranges):
        print(idx, end=": ")
        base = item["from"]
        for offset in range(10):
            print(chr(base + offset), end=", ")
        print("")
def is_traditional_chinese(text):
    """Return True if *text* contains Traditional-Chinese-only characters.

    Heuristic: convert Traditional -> Simplified with OpenCC's ``t2s``
    profile; if the conversion changes the text, the input held at least
    one Traditional form.

    Bug fix: ``opencc`` was never imported (the module-level import above
    is commented out), so this function raised NameError; it is now
    imported locally, keeping the third-party dependency lazy.
    """
    import opencc  # local import: module-level `import opencc` is commented out
    cc = opencc.OpenCC('t2s')
    return cc.convert(text) != text
# aa()
|