File size: 3,310 Bytes
9558ae0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""
日语、韩语 等
https://www.cnblogs.com/luoganttcc/p/16605150.html
https://zhuanlan.zhihu.com/p/618684374
- https://zhuanlan.zhihu.com/p/84625185 赞


## 相关包

import opencc
import langid
import langdetect
https://github.com/pemistahl/lingua-py
  - 原理:


"""



from zhon.hanzi import punctuation as zh_punc

def is_zh_char(uchar):
    """Return True if *uchar* lies in the CJK Unified Ideographs range
    U+4E00..U+9FA5 — the same range jieba uses:
    https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48
    re.compile("([\u4E00-\u9FD5]+)", re.U)
    """
    lo, hi = '\u4e00', '\u9fa5'
    return lo <= uchar <= hi

def has_zh_punc(text):
    """Return True if *text* contains at least one Chinese punctuation mark
    (membership test against ``zhon.hanzi.punctuation``)."""
    for ch in text:
        if ch in zh_punc:
            return True
    return False


def has_zh(text):
    """Return True if *text* contains at least one Chinese character
    (U+4E00..U+9FA5)."""
    for ch in text:
        if '\u4e00' <= ch <= '\u9fa5':
            return True
    return False


def get_zh_count(text):
    """Return the number of Chinese characters (U+4E00..U+9FA5) in *text*.

    Uses a generator expression instead of materializing an intermediate
    list inside ``sum`` (ruff C419); booleans sum as 0/1, so the result
    is an int count.
    """
    return sum(is_zh_char(ch) for ch in text)


def is_all_zh(text):
    """Return True if every character of *text* is a Chinese character
    (U+4E00..U+9FA5). An empty string yields True, matching ``all()``."""
    for ch in text:
        if not ('\u4e00' <= ch <= '\u9fa5'):
            return False
    return True


def is_all_en(text):
    """Return True if *text* is non-empty and consists solely of ASCII
    letters.

    Encoding to UTF-8 first means ``bytes.isalpha`` is applied, which
    accepts only a-z / A-Z — non-ASCII letters such as 'é' encode to
    multi-byte sequences and therefore fail the check.
    """
    encoded = text.encode('utf-8')
    return encoded.isalpha()




# import opencc

def is_russian():
    """ Russian — stub, not implemented; returns None. """
    pass

def is_french():
    """ French — stub, not implemented; returns None. """

# NOTE(review): a second function named ``aa`` is defined later in this file;
# it shadows this stub at import time, so this definition is dead code.
def aa():
    """
    zh-Hans: Chinese (Simplified) — stub, not implemented.
    :return: None
    """
    pass


def bb():
    """
    zh-Hant: Chinese (Traditional) — stub, not implemented; returns None.
    :return: None
    """


# Unicode code-point ranges that together cover CJK scripts; consumed by
# is_cjk() below.  Each entry is an inclusive {"from": lo, "to": hi} pair.
ranges = [
    {"from": ord(u"\u3300"), "to": ord(u"\u33ff")},  # CJK compatibility ideographs
    {"from": ord(u"\ufe30"), "to": ord(u"\ufe4f")},  # CJK compatibility forms
    {"from": ord(u"\uf900"), "to": ord(u"\ufaff")},  # CJK compatibility ideographs
    {"from": ord(u"\U0002F800"), "to": ord(u"\U0002fa1f")},  # compatibility ideographs supplement
    {'from': ord(u'\u3040'), 'to': ord(u'\u309f')},  # Japanese Hiragana (96 code points)
    {"from": ord(u"\u30a0"), "to": ord(u"\u30ff")},  # Japanese Katakana (96 code points)
    {"from": ord(u"\u2e80"), "to": ord(u"\u2eff")},  # CJK radicals supplement
    {"from": ord(u"\u4e00"), "to": ord(u"\u9fff")},  # CJK Unified Ideographs (Chinese); note is_zh_char above stops at \u9fa5
    {"from": ord(u"\u3400"), "to": ord(u"\u4dbf")},  # CJK Extension A
    {"from": ord(u"\U00020000"), "to": ord(u"\U0002a6df")},  # CJK Extension B
    {"from": ord(u"\U0002a700"), "to": ord(u"\U0002b73f")},  # CJK Extension C
    {"from": ord(u"\U0002b740"), "to": ord(u"\U0002b81f")},  # CJK Extension D
    {"from": ord(u"\U0002b820"), "to": ord(u"\U0002ceaf")}  # included as of Unicode 8.0
]

# Korean Hangul syllables: [\uac00-\ud7ff] (not covered by the ranges above)


def is_cjk(char):
    """Return True if *char* falls in any of the module-level ``ranges``
    (CJK = Chinese, Japanese, Korean).

    Notes from the original author: Japanese uses 20k+ Han characters;
    Korean mixes Hangul with 20k+ Hanja.

    Fixes vs. original: the comprehension variable no longer shadows the
    builtin ``range``, and a generator expression lets ``any`` short-circuit
    instead of building a full list first.
    """
    cp = ord(char)
    return any(r["from"] <= cp <= r["to"] for r in ranges)


def cjk_substrings(string):
    """Yield each maximal run of consecutive CJK characters in *string*.

    Bug fix: the original inner loop (``while is_cjk(string[i]): i += 1``)
    had no bounds check, so a string ENDING in a CJK character raised
    IndexError. The run-scanning loop now stops at ``len(string)`` first.
    """
    i = 0
    n = len(string)
    while i < n:
        if is_cjk(string[i]):
            start = i
            while i < n and is_cjk(string[i]):
                i += 1
            yield string[start:i]
        else:
            i += 1


def aa():
    """Print, for each entry in ``ranges``, its index and the first ten
    characters of that range (debug/visualisation helper).

    NOTE: this redefines the ``aa`` stub declared earlier in the file.
    """
    for idx, block in enumerate(ranges):
        print(idx, end=": ")
        for offset in range(10):
            print(chr(block["from"] + offset), end=", ")
        print("")


def is_traditional_chinese(text):
    """Return True if *text* contains Traditional Chinese characters.

    Detection: convert Traditional -> Simplified with OpenCC ('t2s');
    if the conversion changes the text, it contained Traditional forms.

    NOTE(review): ``import opencc`` is commented out near the top of this
    file, so calling this currently raises NameError — restore the import
    before use.
    """
    cc = opencc.OpenCC('t2s')
    # Direct comparison replaces the original if/return-True/return-False.
    return cc.convert(text) != text



    # aa()