Johnathan committed on
Commit
143dcd3
1 Parent(s): c7186c4

add other code

Browse files
Files changed (3) hide show
  1. file_setting.py +46 -0
  2. seg_file.py +65 -0
  3. segmentation.py +310 -0
file_setting.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+ import sys
5
+
6
+ module_dir = os.path.dirname(__file__)
7
+
8
+ data_dir = os.path.join(module_dir, "data")
9
+
10
+ leaf_idf_path = os.path.join(data_dir, "leaf_idf_2022-05-26.txt")
11
+ leaf_IDF_path = os.path.join(data_dir, "leaf_IDF_20220609.txt")
12
+ leaf_conversion_path = os.path.join(data_dir, "leaf_conversion_20220520.xlsx")
13
+
14
+ # leaf_idf
15
+ leaf_idf_dict = dict()
16
+ with open(leaf_idf_path, mode = "r", encoding = "utf-8") as r:
17
+ for line in r:
18
+ if line[0] == ",":
19
+ leaf_idf_dict[","] = 0.9
20
+ continue
21
+ try:
22
+ tmp = line.split(",")
23
+ leaf_idf_dict[str(tmp[0].strip())] = float(tmp[1].strip())
24
+ except:
25
+ print(line)
26
+
27
+ # leaf_IDF
28
+ leaf_IDF_dict = dict()
29
+ with open(leaf_IDF_path, mode = "r", encoding = "utf-8") as r:
30
+ for line in r:
31
+ line_lst = line.split(" ")
32
+ leaf_IDF_dict[str(line_lst[0])] = float(line_lst[1])
33
+
34
+ # leaf_conversion
35
+ leaf_conversion_df = pd.read_excel(leaf_conversion_path)
36
+
37
+ leafconv_before_lst = []
38
+ for ele in leaf_conversion_df["before leaf"]:
39
+ tmp = ele.split(" ")
40
+ leafconv_before_lst.append(tmp)
41
+
42
+ leafconv_before_lst = sorted(leafconv_before_lst, key = len, reverse = True)
43
+
44
+ leafconv_before = [str(ele) for ele in leaf_conversion_df["before leaf"]]
45
+ leafconv_after = [str(ele) for ele in leaf_conversion_df["after leaf"]]
46
+ leafConv_dict = dict(zip(leafconv_before, leafconv_after))
seg_file.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import json
4
+ import unicodedata as uni
5
+ import sys
6
+
7
+ import collections
8
+
9
+ module_path = os.path.dirname(__file__)
10
+
11
+
12
+ dataFolder_path = os.path.join(module_path, "seg_data")
13
+
14
+ """
15
+ 斷詞用檔案
16
+ """
17
+ chars_2gram_path = os.path.join(dataFolder_path, "chars_2gram.txt")
18
+ chars_path = os.path.join(dataFolder_path, "chars_0317.txt")
19
+ numberenglish_path = os.path.join(dataFolder_path, "numberenglish.txt")
20
+ # userdict_path = os.path.join(dataFolder_path, "userdict_20220126.txt")
21
+ # userdict_path = os.path.join(dataFolder_path, "userdict_2022-04-29_remove_apostrophe.txt") # 0505 更新
22
+ userdict_path = os.path.join(dataFolder_path, "userdict_2022-06-16.txt") # 0527 更新
23
+
24
+
25
+
26
+
27
+ """
28
+ 以下為開啟檔案
29
+ """
30
+ #
31
+ #讀取dictionary
32
+ userdict = dict()
33
+ with open(userdict_path, encoding='utf-8') as f:
34
+ for word in f:
35
+ tmp_word = (uni.normalize("NFKC", word.strip())).split(" ")
36
+
37
+
38
+ userdict[str(tmp_word[0])] = str(tmp_word[1])
39
+
40
+
41
+ #這個字典為所有英數字單字的集合
42
+ numberEnglish_dic_t = []
43
+ with open(numberenglish_path, 'r', encoding='utf-8') as f:
44
+ for line in f:
45
+ line = line.strip('\n')
46
+ numberEnglish_dic_t.append(line)
47
+
48
+ #單一國字的出現頻率
49
+ chars_dic_t = {}
50
+ with open(chars_path, 'r', encoding='utf-8') as f:
51
+ for i, line in enumerate(f):
52
+ line = line.strip('\n')
53
+ chars_dic_t[line] = i
54
+
55
+ #兩個中文字的出現頻率
56
+ #chars_dic_two
57
+ chars_dic_two_t = {}
58
+ with open(chars_2gram_path, 'r', encoding='utf-8') as f:
59
+ for i, line in enumerate(f):
60
+ line = line.strip('\n')
61
+ chars_dic_two_t[line] = i
62
+
63
+ # print("read files for dictionary done")
64
+
65
+
segmentation.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ import json
5
+ import unicodedata
6
+ import pandas as pd
7
+
8
+ from seg_file import userdict, numberEnglish_dic_t, chars_dic_t, chars_dic_two_t
9
+
10
+
11
class segmentation():
    """Dictionary-based Chinese word segmenter (MMSEG-like).

    Candidate chunkings (up to three words matched forward from the current
    position) are generated by ``getChunk_max`` and then narrowed down by a
    sequence of tie-breaking rules:

    * step2_2 - keep chunkings covering the most characters
    * step3   - keep chunkings with the largest average word length
    * step4_5 - prefer more frequent single-character words
    * step6   - keep chunkings with the smallest word-length variance
    * step7   - prefer more frequent two-character words

    Dictionaries come from the ``seg_file`` module at import time.
    """

    def __init__(self):
        # word -> canonical replacement; its keys double as the lexicon.
        self.word_dic = userdict
        # All single alphanumeric tokens.
        self.numberEnglish_dic = numberEnglish_dic_t
        # Frequency rank of single characters (lower rank = more frequent).
        self.chars_dic = chars_dic_t
        # Frequency rank of character bigrams.
        # BUG FIX: this was `chars_dic_t`, so step7 compared bigrams against
        # the single-character table; the imported `chars_dic_two_t` was
        # never used at all.
        self.chars_dic_two = chars_dic_two_t

    def step1(self, contents):
        """Peel a leading alphanumeric run off *contents* (generator).

        If ``contents`` starts with alphanumeric characters and the whole
        run is not itself a dictionary word, yields the remainder first and
        then the run; otherwise yields ``contents`` unchanged.
        """
        if contents[0] in self.numberEnglish_dic:
            flag = 0
            i = 0
            while flag == 0:
                if i != len(contents) and contents[i] in self.numberEnglish_dic:
                    i += 1
                    continue
                else:
                    flag = 1
                    if contents[0:i] not in self.word_dic.keys():
                        yield contents[i:]
                        yield contents[0:i]
                    else:
                        yield contents
        else:
            yield contents

    def getChunk_max(self, nowcomparestr, contents, chunknum, nowchunk, comparechunklist, max_len):
        """Recursively enumerate candidate chunkings of *contents*.

        At each position every dictionary word of length <= *max_len* is
        tried; a path commits at most three words (``chunknum`` stops at 3).
        Every distinct chunking found is appended to *comparechunklist*,
        which is also returned.

        Args:
            nowcomparestr: prefix currently being extended ('' to start).
            contents: remaining text to segment.
            chunknum: number of words committed on the current path.
            nowchunk: words committed on the current path.
            comparechunklist: accumulator shared across the recursion.
            max_len: longest dictionary word length to try.
        """
        if len(contents) == 0:
            # End of input: flush any pending prefix and record the chunk.
            if len(nowcomparestr) != 0:
                chunknum += 1
                nowchunk.append(nowcomparestr)
            if nowchunk not in comparechunklist:
                comparechunklist.append(nowchunk)
            return comparechunklist
        else:
            if len(nowcomparestr) == 0:
                temp = 0  # set once the candidate window reaches the end of contents
                for i in range(max_len):
                    if i + 1 >= len(contents):
                        temp = 1
                    if contents[0:i + 1] in self.word_dic.keys():
                        # Branch A: keep extending this word on a copied path.
                        new = nowchunk.copy()
                        self.getChunk_max(contents[0:i + 1], contents[i + 1:], chunknum, new,
                                          comparechunklist, max_len)
                        # Branch B: commit the word here.
                        chunknum += 1
                        nowchunk.append(contents[0:i + 1])
                        if chunknum == 3:
                            if nowchunk not in comparechunklist:
                                comparechunklist.append(nowchunk)
                            return comparechunklist
                        else:
                            new = nowchunk.copy()
                            return self.getChunk_max('', contents[i + 1:], chunknum, new, comparechunklist, max_len)
                    else:
                        if temp == 0 and i + 1 != max_len:
                            continue
                        else:
                            # No dictionary word starts here within the window.
                            new = nowchunk.copy()
                            return self.getChunk_max('', '', chunknum, new, comparechunklist, max_len)
                    # NOTE(review): unreachable (both branches above return or
                    # continue); kept from the original code.
                    if temp == 1:
                        break
            else:
                temp = 0
                for i in range(max_len - len(nowcomparestr)):
                    if i + 1 >= len(contents):
                        temp = 1
                    if (nowcomparestr + contents[0:i + 1]) in self.word_dic.keys():
                        new = nowchunk.copy()
                        self.getChunk_max(nowcomparestr + contents[0:i + 1], contents[i + 1:], chunknum, new,
                                          comparechunklist, max_len)
                        chunknum += 1
                        nowchunk.append(nowcomparestr + contents[0:i + 1])
                        if chunknum == 3:
                            if nowchunk not in comparechunklist:
                                comparechunklist.append(nowchunk)
                            return comparechunklist
                        else:
                            new = nowchunk.copy()
                            return self.getChunk_max('', contents[i + 1:], chunknum, new, comparechunklist, max_len)
                    else:
                        if temp == 0 and i + 1 != max_len - len(nowcomparestr):
                            continue
                        else:
                            # Cannot extend further: commit the prefix as-is.
                            chunknum += 1
                            nowchunk.append(nowcomparestr)
                            if chunknum == 3:
                                if nowchunk not in comparechunklist:
                                    comparechunklist.append(nowchunk)
                                return comparechunklist
                            else:
                                new = nowchunk.copy()
                                return self.getChunk_max('', contents, chunknum, new, comparechunklist, max_len)
                    # NOTE(review): unreachable, as above; kept from the original.
                    if temp == 1:
                        break

    def step2_2(self, chunklist):
        """Yield the chunkings that cover the most characters."""
        len_chunklist = []
        for chunk in chunklist:
            len_chunklist.append(sum(len(word) for word in chunk))
        if not len_chunklist:
            return
        longest = max(len_chunklist)  # hoisted: was recomputed per element
        for i, chunk in enumerate(chunklist):
            if len_chunklist[i] == longest:
                yield chunk

    def step3(self, input_list):
        """Return the chunkings with the largest average word length."""
        avg_len = {}
        for idx, sep_list in enumerate(input_list):
            sep_len = sum(len(sep) for sep in sep_list)
            # enumerate() fixes the original `input_list.index(sep_list)`,
            # which collapsed duplicate chunkings onto the first index.
            avg_len[idx] = sep_len / len(sep_list)
        best = max(avg_len.values()) if avg_len else None
        return [input_list[key] for key, value in avg_len.items() if value == best]

    def step4_5(self, step3_list):
        """Prefer chunkings whose single-character words are most frequent.

        Per chunking, take the best (lowest) frequency rank among its
        single-character words: 10000 if a single occurs but is unranked,
        10001 if there is no single-character word at all. Keep the
        chunkings achieving the overall lowest value; if no chunking has a
        ranked single (value >= 10000), keep everything.
        """
        rank = []
        res_list = []
        for chunk in step3_list:
            # Despite the name, this tracks the MINIMUM rank seen.
            max_rank = 10001
            for word in chunk:
                if len(word) == 1:
                    if 10000 < max_rank:
                        max_rank = 10000
                    if word in self.chars_dic.keys():
                        if self.chars_dic[word] < max_rank:
                            max_rank = self.chars_dic[word]
            rank.append(max_rank)

        top_rank = min(rank)
        # Some chunking contains a ranked single-character word.
        if top_rank < 10000:
            res_list = [sentence for i, sentence in enumerate(step3_list) if rank[i] == top_rank]
        # Singles exist but none is ranked (10000), or no chunking has a
        # single-character word at all (10001): keep everything.
        else:
            res_list = step3_list[:]
        return res_list

    def step6(self, step5_list):
        """Keep the chunkings with the smallest variance of word lengths."""

        def Get_Average(l):
            # Mean word length of one chunking.
            total = 0
            for item in l:
                total += len(item)
            return total / len(l)

        total_list = []
        for chunk in step5_list:
            ave = Get_Average(chunk)
            spread = 0
            for word in chunk:
                spread += (len(word) - ave) ** 2
            total_list.append(spread)
        smallest = min(total_list)
        return [sentence for i, sentence in enumerate(step5_list) if total_list[i] == smallest]

    def step7(self, step6_list):
        """Final tie-break: prefer the most frequent two-character word.

        Per chunking, take the best (lowest) bigram rank: 100000 if a
        bigram occurs but is unranked, 100001 if the chunking has no
        two-character word.
        """
        rank = []
        res_list = []
        for chunk in step6_list:
            max_rank = 100001  # again, actually the minimum rank seen
            for word in chunk:
                if len(word) == 2:
                    if 100000 < max_rank:
                        max_rank = 100000
                    if word in self.chars_dic_two.keys():
                        if self.chars_dic_two[word] < max_rank:
                            max_rank = self.chars_dic_two[word]
            rank.append(max_rank)
        best = min(rank)  # hoisted out of the loops below
        if best < 100000:
            for i, r in enumerate(rank):
                if r == best:
                    res_list.append(step6_list[i])
        else:
            for i, r in enumerate(rank):
                if r == 100001:
                    res_list.append(step6_list[i])
                elif r == best:
                    res_list.append(step6_list[i])
        return res_list

    def step1to7(self, contents, max_len):
        """Run the full rule pipeline and return the winning chunking.

        Returns a list of words whose first element is the word chosen for
        the current position ([] inside when nothing matched).
        """
        res = list(self.step1(contents))
        if len(res) != 1:
            # step1 peeled off a leading alphanumeric run: return it directly.
            return [res[1]]

        res = self.getChunk_max('', contents, 0, [], [], max_len)

        res = list(self.step2_2(res))
        if len(res) == 1:
            return res[0]

        res = self.step3(res)
        if len(res) == 1:
            return res[0]

        res = self.step4_5(res)
        if len(res) == 1:
            return res[0]

        res = self.step6(res)
        if len(res) == 1:
            return res[0]

        res = self.step7(res)
        return res[0]

    def all_step1to7(self, contents, max_len):
        """Segment *contents* completely.

        Returns:
            (leaf, log): ``leaf`` is the list of segmented words; ``log``
            collects characters the dictionary could not match at all.
        """
        # Strip spaces and lowercase before matching.
        contents = str(contents).replace(' ', '').lower()
        temp = 0
        leaf = []
        log = []
        # Guard: the loop below would IndexError on empty input.
        if not contents:
            return leaf, log
        while temp == 0:
            if contents[0] == ' ':
                contents = contents[1:]
                # Fixed: was the typo `continuex`, a NameError if this
                # (space) branch were ever reached.
                continue
            res = self.step1to7(contents, max_len)
            # Dictionary has no word starting here: log the character.
            if res == []:
                log.append(contents[0])
                contents = contents[1:]
            else:
                res = res[0]
                leaf.append(res)
                contents = contents[len(res):]
            if len(contents) <= 0:
                temp = 1
        return leaf, log

    def leaf_one(self, contents):
        """Segment one sentence and map each word through the user dictionary.

        Returns the mapped words joined by single spaces, or None on error
        (the error is printed, not raised - deliberate best-effort).
        """
        try:
            # Full-width -> half-width normalization.
            contents = unicodedata.normalize('NFKC', str(contents))
            contents = str(contents).replace(' ', '').replace('\t', '')
            leaf, log = self.all_step1to7(contents, 10)
            for i, word in enumerate(leaf):
                if word in self.word_dic.keys():
                    leaf[i] = self.word_dic[word]

            leaf_line = ' '.join(leaf)

            return leaf_line
        except Exception as e:
            print('==============================')
            print(contents)
            print(e)
            print('==============================')

    def seg_one(self, contents):
        """Segment one sentence; returns the raw words joined by spaces
        (no user-dictionary mapping, unlike ``leaf_one``)."""
        # Full-width -> half-width normalization.
        contents = unicodedata.normalize('NFKC', str(contents))
        contents = str(contents).replace(' ', '').replace('\t', '')
        leaf, log = self.all_step1to7(contents, 10)
        seg_line = ' '.join(leaf)
        return seg_line
305
+ # if __name__ == "__main__":
306
+
307
+ # test = segmentation(["保險法", "刑法"])
308
+
309
+ # test_f = test.seg_one("保險法第一條")
310
+ # print(test_f)