"""
日语、韩语 等
https://www.cnblogs.com/luoganttcc/p/16605150.html
https://zhuanlan.zhihu.com/p/618684374
- https://zhuanlan.zhihu.com/p/84625185 赞


## 相关包

import opencc
import langid
imort langdetect
https://github.com/pemistahl/lingua-py
  - 原理:


"""



import opencc  # used by is_traditional_chinese below
from zhon.hanzi import punctuation as zh_punc
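
# Hedged sketch of the detection packages listed in the module docstring
# (langid, langdetect). Both are optional third-party dependencies
# (`pip install langid langdetect`); they are imported lazily so this module
# still loads without them. The helper name `detect_lang` is illustrative,
# not part of the original module.
def detect_lang(text):
    """Return (langid_label, langdetect_label) for `text` -- an illustrative helper."""
    import langid
    import langdetect
    lang, _score = langid.classify(text)  # e.g. ('zh', -277.1); the score scale is langid-specific
    lang2 = langdetect.detect(text)       # e.g. 'zh-cn'
    return lang, lang2
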

def is_zh_char(uchar):
    """
    True if `uchar` is a common Chinese character (CJK Unified Ideographs).
    Range follows jieba:
    https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48
    re.compile("([\u4E00-\u9FD5]+)", re.U)
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'

def has_zh_punc(text):
    """
    Whether the text contains any Chinese punctuation.
    """
    return any(ch in zh_punc for ch in text)


def has_zh(text):
    """ contains Chinese characters """
    return any(is_zh_char(ch) for ch in text)


def get_zh_count(text):
    """Count the Chinese characters in `text`."""
    return sum(is_zh_char(uchar) for uchar in text)


def is_all_zh(text):
    return all(is_zh_char(char) for char in text)


def is_all_en(text):
    """
    True if every character is an ASCII letter: any non-ASCII character encodes
    to non-alphabetic UTF-8 bytes, so bytes.isalpha() returns False.
    """
    return text.encode('utf-8').isalpha()




ranges = [
    {"from": ord(u"\u3300"), "to": ord(u"\u33ff")},  # CJK Compatibility
    {"from": ord(u"\ufe30"), "to": ord(u"\ufe4f")},  # CJK Compatibility Forms
    {"from": ord(u"\uf900"), "to": ord(u"\ufaff")},  # CJK Compatibility Ideographs
    {"from": ord(u"\U0002F800"), "to": ord(u"\U0002fa1f")},  # CJK Compatibility Ideographs Supplement
    {"from": ord(u"\u3040"), "to": ord(u"\u309f")},  # Japanese Hiragana (96 code points)
    {"from": ord(u"\u30a0"), "to": ord(u"\u30ff")},  # Japanese Katakana (96 code points)
    {"from": ord(u"\u2e80"), "to": ord(u"\u2eff")},  # CJK Radicals Supplement
    {"from": ord(u"\u4e00"), "to": ord(u"\u9fff")},  # CJK Unified Ideographs (Chinese); common range u"\u4e00"-u"\u9fa5"
    {"from": ord(u"\u3400"), "to": ord(u"\u4dbf")},  # CJK Unified Ideographs Extension A
    {"from": ord(u"\U00020000"), "to": ord(u"\U0002a6df")},  # CJK Unified Ideographs Extension B
    {"from": ord(u"\U0002a700"), "to": ord(u"\U0002b73f")},  # CJK Unified Ideographs Extension C
    {"from": ord(u"\U0002b740"), "to": ord(u"\U0002b81f")},  # CJK Unified Ideographs Extension D
    {"from": ord(u"\U0002b820"), "to": ord(u"\U0002ceaf")}  # CJK Unified Ideographs Extension E, included as of Unicode 8.0
]

# Korean Hangul: [\uac00-\ud7ff] (not covered by `ranges` above)
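
# Hedged sketch following the Korean range noted in the comment above; the helper
# name `is_ko_char` is illustrative, not part of the original module.
def is_ko_char(uchar):
    """True if `uchar` is in U+AC00..U+D7FF (Hangul Syllables plus Hangul Jamo Extended-B)."""
    return u'\uac00' <= uchar <= u'\ud7ff'
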


def is_cjk(char):
    """
    CJK (Chinese, Japanese, Korean).
    Japanese uses many Han characters: there are more than 20,000 Japanese kanji.
    Korean is written with Hangul (50+ letters) and also has more than 20,000 Korean hanja.
    """
    return any(r["from"] <= ord(char) <= r["to"] for r in ranges)


def cjk_substrings(string):
    """Yield the maximal runs of consecutive CJK characters in `string`."""
    i = 0
    while i < len(string):
        if is_cjk(string[i]):
            start = i
            # guard against running past the end when the string ends with a CJK character
            while i < len(string) and is_cjk(string[i]):
                i += 1
            yield string[start:i]
        i += 1
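
# Quick sanity check for cjk_substrings (expected output, given `ranges` above):
#   >>> list(cjk_substrings("sdf344asfasf天地方益3権sdfsdf"))
#   ['天地方益', '権']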


def aa():
    """Print the first 10 code points of each entry in `ranges` for inspection."""
    # string = "sdf344asfasf天地方益3権sdfsdf"
    for idx, item in enumerate(ranges):
        print(idx, end=": ")
        for j in range(10):
            print(chr(item["from"] + j), end=", ")
        print("")
    # for sub in cjk_substrings(string):
    #     string = string.replace(sub, "(" + sub + ")")
    # print(string)


def is_traditional_chinese(text):
    """True if converting Traditional -> Simplified (opencc 't2s') changes the text."""
    cc = opencc.OpenCC('t2s')
    converted_text = cc.convert(text)
    return converted_text != text



# aa()
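
# Minimal usage demo of is_traditional_chinese (an assumption: the third-party
# `opencc` package must be installed, e.g. `pip install opencc-python-reimplemented`).
if __name__ == "__main__":
    print(is_traditional_chinese("漢語"))  # True  -- t2s conversion changes the text
    print(is_traditional_chinese("汉语"))  # False -- already Simplified, unchanged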