eson commited on
Commit
9558ae0
1 Parent(s): 1b7fc74
Files changed (1) hide show
  1. utils/lang_util_2.py +139 -0
utils/lang_util_2.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 日语、韩语 等
3
+ https://www.cnblogs.com/luoganttcc/p/16605150.html
4
+ https://zhuanlan.zhihu.com/p/618684374
5
+ - https://zhuanlan.zhihu.com/p/84625185 赞
6
+
7
+
8
+ ## 相关包
9
+
10
+ import opencc
11
+ import langid
12
+ import langdetect
13
+ https://github.com/pemistahl/lingua-py
14
+ - 原理:
15
+
16
+
17
+ """
18
+
19
+
20
+
21
+ from zhon.hanzi import punctuation as zh_punc
22
+
23
def is_zh_char(uchar):
    """
    Return True when ``uchar`` is a CJK Unified Ideograph (U+4E00..U+9FFF).

    The upper bound is U+9FFF, the same bound the ``ranges`` table in this
    module uses for Chinese. The previous bound (U+9FA5) rejected the
    ideographs that sit at the end of the block.

    For reference, jieba matches Han text with ``[U+4E00-U+9FD5]``:
    https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48

    :param uchar: a single-character string. Longer strings are compared
        lexicographically, so effectively only the first character decides.
    :return: bool
    """
    return u'\u4e00' <= uchar <= u'\u9fff'
29
+
30
def has_zh_punc(text):
    """
    Tell whether ``text`` contains Chinese punctuation.

    :param text: iterable of characters (normally a ``str``)
    :return: True as soon as one element is found in ``zh_punc``
        (``zhon.hanzi.punctuation``), otherwise False
    """
    for symbol in text:
        if symbol in zh_punc:
            return True
    return False
35
+
36
+
37
def has_zh(text):
    """
    Tell whether ``text`` contains at least one Chinese character.

    :param text: iterable of characters (normally a ``str``)
    :return: True when ``is_zh_char`` accepts any element, otherwise False
    """
    for symbol in text:
        if is_zh_char(symbol):
            return True
    return False
40
+
41
+
42
def get_zh_count(text):
    """
    Count the elements of ``text`` that ``is_zh_char`` accepts.

    :param text: iterable of characters (normally a ``str``)
    :return: int, 0 for empty input
    """
    return sum(1 for symbol in text if is_zh_char(symbol))
44
+
45
+
46
def is_all_zh(text):
    """
    Tell whether every element of ``text`` is a Chinese character.

    :param text: iterable of characters (normally a ``str``)
    :return: False at the first element ``is_zh_char`` rejects, otherwise
        True (which includes empty input)
    """
    for symbol in text:
        if not is_zh_char(symbol):
            return False
    return True
48
+
49
+
50
def is_all_en(text):
    """
    Tell whether ``text`` consists solely of ASCII letters.

    The string is encoded to UTF-8 first because ``bytes.isalpha`` only
    accepts ASCII alphabetic bytes, whereas ``str.isalpha`` would also accept
    letters from other scripts.

    :param text: ``str`` to inspect
    :return: True for a non-empty string made only of a-z / A-Z, else False
    """
    raw_bytes = text.encode('utf-8')
    return raw_bytes.isalpha()
52
+
53
+
54
+
55
+
56
+ # import opencc
57
+
58
def is_russian():
    """
    Placeholder for Russian detection.

    Not implemented: takes no arguments and always returns None.
    """
    return None
61
+
62
def is_french():
    """
    Placeholder for French detection.

    Not implemented: takes no arguments and always returns None.
    """
    return None
64
+
65
def aa():
    """
    Placeholder for zh-Hans, i.e. Chinese (Simplified).

    Not implemented. A later ``def aa`` in this module rebinds the name, so
    this version is not the one callers get after import.

    :return: None
    """
    return None
71
+
72
+
73
def bb():
    """
    Placeholder for zh-Hant, i.e. Chinese (Traditional).

    Not implemented.

    :return: None
    """
    return None
78
+
79
+
80
# Inclusive code-point intervals treated as CJK by ``is_cjk``. Each entry is a
# dict with integer "from" and "to" bounds; the order is kept because ``aa``
# prints the table by index.
ranges = [
    {"from": ord(first), "to": ord(last)}
    for first, last in (
        ("\u3300", "\u33ff"),  # CJK Compatibility
        ("\ufe30", "\ufe4f"),  # CJK Compatibility Forms
        ("\uf900", "\ufaff"),  # CJK Compatibility Ideographs
        ("\U0002F800", "\U0002fa1f"),  # CJK Compatibility Ideographs Supplement
        ("\u3040", "\u309f"),  # Japanese Hiragana (96 code points)
        ("\u30a0", "\u30ff"),  # Japanese Katakana (96 code points)
        ("\u2e80", "\u2eff"),  # CJK Radicals Supplement
        ("\u4e00", "\u9fff"),  # CJK Unified Ideographs (Chinese)
        ("\u3400", "\u4dbf"),  # CJK Unified Ideographs Extension A
        ("\U00020000", "\U0002a6df"),  # Extension B
        ("\U0002a700", "\U0002b73f"),  # Extension C
        ("\U0002b740", "\U0002b81f"),  # Extension D
        ("\U0002b820", "\U0002ceaf"),  # Extension E, included as of Unicode 8.0
    )
]
95
+
96
+ # Korean (Hangul) [\uac00-\ud7ff] -- note: this interval is not part of `ranges` above
97
+
98
+
99
def is_cjk(char):
    """
    Tell whether ``char`` falls inside one of the intervals in ``ranges``.

    CJK stands for Chinese, Japanese, Korean. Author's notes: Japanese uses
    many Chinese characters (more than 20,000 kanji); Korean has Hangul (more
    than 50 letters) as well as more than 20,000 Hanja.

    :param char: a single-character string; ``ord`` raises ``TypeError`` for
        anything else
    :return: True when the code point lies within any ``ranges`` entry
    """
    # Compute the code point once instead of once per interval, and avoid
    # naming the loop variable ``range`` (which hides the builtin).
    code_point = ord(char)
    return any(span["from"] <= code_point <= span["to"] for span in ranges)
106
+
107
+
108
def cjk_substrings(string):
    """
    Yield each maximal run of consecutive CJK characters found in ``string``.

    :param string: sequence of characters (normally a ``str``)
    :return: generator of slices of ``string``, in order of appearance
    """
    length = len(string)
    i = 0
    while i < length:
        if is_cjk(string[i]):
            start = i
            # Bounds check is required: without it a run that reaches the end
            # of the input indexes past the last element and raises IndexError.
            while i < length and is_cjk(string[i]):
                i += 1
            yield string[start:i]
        # Either string[i] is not CJK, or it is the element that ended the run
        # (already known to be non-CJK), so it is safe to step over it.
        i += 1
116
+
117
+
118
def aa():
    """
    Debug helper: print the first ten characters of every entry in ``ranges``.

    One output line per entry, formatted as ``<index>: c0, c1, ..., c9, ``.

    :return: None
    """
    for idx, item in enumerate(ranges):
        first_code = item["from"]
        print(idx, end=": ")
        sample = "".join(chr(first_code + offset) + ", " for offset in range(10))
        print(sample, end="")
        print("")
    # Disabled demo of cjk_substrings (kept from the original):
    #   string = "sdf344asfasf天地方益3権sdfsdf"
    #   for sub in cjk_substrings(string):
    #       string = string.replace(sub, "(" + sub + ")")
    #   print(string)
128
+
129
+
130
def is_traditional_chinese(text):
    """
    Tell whether ``text`` contains Traditional Chinese characters.

    The text is converted with OpenCC's ``t2s`` (Traditional to Simplified)
    configuration; if the conversion changes anything, the input is reported
    as Traditional.

    :param text: ``str`` to inspect
    :return: True when the converted text differs from ``text``, else False
    :raises ImportError: when the ``opencc`` package is not installed
    """
    # The module-level ``import opencc`` is commented out in this file, so the
    # name must be imported here; otherwise every call fails with NameError.
    import opencc

    converter = opencc.OpenCC('t2s')
    return converter.convert(text) != text
136
+
137
+
138
+
139
+ # aa()