add other code
Files changed:
- file_setting.py +46 -0
- seg_file.py +65 -0
- segmentation.py +310 -0
file_setting.py
ADDED
@@ -0,0 +1,46 @@
```python
import os
import numpy as np
import pandas as pd
import sys

module_dir = os.path.dirname(__file__)

data_dir = os.path.join(module_dir, "data")

leaf_idf_path = os.path.join(data_dir, "leaf_idf_2022-05-26.txt")
leaf_IDF_path = os.path.join(data_dir, "leaf_IDF_20220609.txt")
leaf_conversion_path = os.path.join(data_dir, "leaf_conversion_20220520.xlsx")

# leaf_idf: "term,weight" per line; a line starting with "," is the comma
# token itself and gets a hard-coded weight of 0.9
leaf_idf_dict = dict()
with open(leaf_idf_path, mode="r", encoding="utf-8") as r:
    for line in r:
        if line[0] == ",":
            leaf_idf_dict[","] = 0.9
            continue
        try:
            tmp = line.split(",")
            leaf_idf_dict[str(tmp[0].strip())] = float(tmp[1].strip())
        except (IndexError, ValueError):  # was a bare except; report malformed lines
            print(line)

# leaf_IDF: "term weight" per line, space-separated
leaf_IDF_dict = dict()
with open(leaf_IDF_path, mode="r", encoding="utf-8") as r:
    for line in r:
        line_lst = line.split(" ")
        leaf_IDF_dict[str(line_lst[0])] = float(line_lst[1])

# leaf_conversion: "before leaf" -> "after leaf" mapping table
leaf_conversion_df = pd.read_excel(leaf_conversion_path)

leafconv_before_lst = []
for ele in leaf_conversion_df["before leaf"]:
    tmp = ele.split(" ")
    leafconv_before_lst.append(tmp)

leafconv_before_lst = sorted(leafconv_before_lst, key=len, reverse=True)

leafconv_before = [str(ele) for ele in leaf_conversion_df["before leaf"]]
leafconv_after = [str(ele) for ele in leaf_conversion_df["after leaf"]]
leafConv_dict = dict(zip(leafconv_before, leafconv_after))
```
seg_file.py
ADDED
@@ -0,0 +1,65 @@
```python
import os
import pandas as pd
import json
import unicodedata as uni
import sys

import collections

module_path = os.path.dirname(__file__)


dataFolder_path = os.path.join(module_path, "seg_data")

"""
Data files for word segmentation
"""
chars_2gram_path = os.path.join(dataFolder_path, "chars_2gram.txt")
chars_path = os.path.join(dataFolder_path, "chars_0317.txt")
numberenglish_path = os.path.join(dataFolder_path, "numberenglish.txt")
# userdict_path = os.path.join(dataFolder_path, "userdict_20220126.txt")
# userdict_path = os.path.join(dataFolder_path, "userdict_2022-04-29_remove_apostrophe.txt")  # updated 05/05
userdict_path = os.path.join(dataFolder_path, "userdict_2022-06-16.txt")  # updated 05/27


"""
Load the files
"""
# read the user dictionary: one "word replacement" pair per line
userdict = dict()
with open(userdict_path, encoding='utf-8') as f:
    for word in f:
        tmp_word = (uni.normalize("NFKC", word.strip())).split(" ")
        userdict[str(tmp_word[0])] = str(tmp_word[1])


# the set of all alphanumeric tokens
numberEnglish_dic_t = []
with open(numberenglish_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip('\n')
        numberEnglish_dic_t.append(line)

# frequency rank of single Chinese characters (line index = rank)
chars_dic_t = {}
with open(chars_path, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        line = line.strip('\n')
        chars_dic_t[line] = i

# frequency rank of two-character sequences (line index = rank)
# chars_dic_two
chars_dic_two_t = {}
with open(chars_2gram_path, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        line = line.strip('\n')
        chars_dic_two_t[line] = i

# print("read files for dictionary done")
```
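The loaders above imply the on-disk formats rather than stating them: each userdict_* line carries a token and its replacement separated by a single space, while the chars files are frequency-ordered lists whose line number becomes the rank. A small sketch of lines that would parse cleanly; the entries themselves are invented for illustration:

```python
# Hypothetical file contents matching the parsers above.
userdict_line = "保險法 保險法"   # "<token> <replacement>", space-separated
chars_line = "的"                 # one character per line; earlier line = more frequent
chars_2gram_line = "保險"         # one 2-gram per line; earlier line = more frequent

token, replacement = userdict_line.split(" ")  # mirrors the userdict loader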
segmentation.py
ADDED
@@ -0,0 +1,310 @@
```python
#!/usr/bin/env python
# coding: utf-8

import json
import unicodedata
import pandas as pd

from seg_file import userdict, numberEnglish_dic_t, chars_dic_t, chars_dic_two_t


class segmentation:
    def __init__(self):
        self.word_dic = userdict
        self.numberEnglish_dic = numberEnglish_dic_t
        self.chars_dic = chars_dic_t
        self.chars_dic_two = chars_dic_two_t  # fixed: was mistakenly chars_dic_t

    def step1(self, contents):
        # Peel off a leading run of alphanumeric characters. If that run is
        # not itself a dictionary word, yield the remainder first and the
        # run second; otherwise yield the whole string unchanged.
        if contents[0] in self.numberEnglish_dic:
            flag = 0
            i = 0
            while flag == 0:
                if i != len(contents) and contents[i] in self.numberEnglish_dic:
                    # print(contents[0:i])
                    i += 1
                    continue
                else:
                    flag = 1
                    if contents[0:i] not in self.word_dic.keys():
                        yield contents[i:]
                        yield contents[0:i]
                    else:
                        yield contents
        else:
            yield contents

    def getChunk_max(self, nowcomparestr, contents, chunknum, nowchunk, comparechunklist, max_len):
        # Maximum matching: enumerate every candidate chunk of up to three
        # dictionary words starting at the head of contents.
        if len(contents) == 0:
            if len(nowcomparestr) != 0:
                chunknum += 1
                nowchunk.append(nowcomparestr)
            if nowchunk not in comparechunklist:
                comparechunklist.append(nowchunk)
            return comparechunklist
        else:
            if len(nowcomparestr) == 0:
                temp = 0
                for i in range(max_len):
                    if i + 1 >= len(contents):
                        temp = 1
                    if contents[0:i + 1] in self.word_dic.keys():
                        new = nowchunk.copy()
                        self.getChunk_max(contents[0:i + 1], contents[i + 1:], chunknum, new,
                                          comparechunklist, max_len)
                        chunknum += 1
                        nowchunk.append(contents[0:i + 1])
                        if chunknum == 3:
                            if nowchunk not in comparechunklist:
                                comparechunklist.append(nowchunk)
                            return comparechunklist
                        else:
                            new = nowchunk.copy()
                            return self.getChunk_max('', contents[i + 1:], chunknum, new, comparechunklist, max_len)
                    else:
                        if temp == 0 and i + 1 != max_len:
                            continue
                        else:
                            new = nowchunk.copy()
                            return self.getChunk_max('', '', chunknum, new, comparechunklist, max_len)
                    if temp == 1:
                        break
            else:
                temp = 0
                for i in range(max_len - len(nowcomparestr)):
                    if i + 1 >= len(contents):
                        temp = 1
                    if (nowcomparestr + contents[0:i + 1]) in self.word_dic.keys():
                        new = nowchunk.copy()
                        self.getChunk_max(nowcomparestr + contents[0:i + 1], contents[i + 1:], chunknum, new,
                                          comparechunklist, max_len)
                        chunknum += 1
                        nowchunk.append(nowcomparestr + contents[0:i + 1])
                        if chunknum == 3:
                            if nowchunk not in comparechunklist:
                                comparechunklist.append(nowchunk)
                            return comparechunklist
                        else:
                            new = nowchunk.copy()
                            return self.getChunk_max('', contents[i + 1:], chunknum, new, comparechunklist, max_len)
                    else:
                        if temp == 0 and i + 1 != max_len - len(nowcomparestr):
                            continue
                        else:
                            chunknum += 1
                            nowchunk.append(nowcomparestr)
                            if chunknum == 3:
                                if nowchunk not in comparechunklist:
                                    comparechunklist.append(nowchunk)
                                return comparechunklist
                            else:
                                new = nowchunk.copy()
                                return self.getChunk_max('', contents, chunknum, new, comparechunklist, max_len)
                    if temp == 1:
                        break

    def step2_2(self, chunklist):
        # Keep only the chunks with the greatest total length.
        len_chunklist = []
        for i in chunklist:
            temp = 0
            for j in i:
                temp += len(j)
            len_chunklist.append(temp)
        for i in range(len(chunklist)):
            if len_chunklist[i] == max(len_chunklist):
                yield chunklist[i]

    def step3(self, input_list):
        # Keep only the chunks with the greatest average word length.
        avg_len = {}
        for sep_list in input_list:
            sep_len = 0
            for sep in sep_list:
                sep_len += len(sep)
            avg_len = {**avg_len, **{input_list.index(sep_list): sep_len / len(sep_list)}}
        out_put = []
        for key, value in avg_len.items():
            if value == max(avg_len.values()):
                out_put.append(input_list[key])
        return out_put

    def step4_5(self, step3_list):
        # Prefer chunks whose single-character words are the most frequent
        # (lowest rank in chars_dic).
        rank = []
        res_list = []
        for i in step3_list:
            max_rank = 10001
            for j in i:
                if len(j) == 1:
                    if 10000 < max_rank:
                        max_rank = 10000
                    if j in self.chars_dic.keys():
                        if self.chars_dic[j] < max_rank:
                            max_rank = self.chars_dic[j]
            rank.append(max_rank)

        top_rank = min(rank)
        # the result contains a single-character word found in the dictionary
        if top_rank < 10000:
            res_list = [sentence for i, sentence in enumerate(step3_list) if rank[i] == top_rank]
        # the result contains a single-character word missing from the dictionary: top_rank = 10000
        # the result contains no single-character word: top_rank = 10001
        else:
            res_list = step3_list[:]

        return res_list

    def step6(self, step5_list):
        # Keep the chunks with the smallest variance of word lengths.
        total_list = []

        def Get_Average(l):
            total = 0
            for item in l:
                total += len(item)
            return total / len(l)

        for i in step5_list:
            total = 0
            ave = Get_Average(i)
            for j in i:
                total += (len(j) - ave) ** 2
            total_list.append(total)
        j = min(total_list)
        res = [sentence for i, sentence in enumerate(step5_list) if total_list[i] == j]

        return res

    def step7(self, step6_list):
        # Prefer chunks whose two-character words are the most frequent
        # (lowest rank in chars_dic_two).
        rank = []
        res_list = []
        for i in step6_list:
            max_rank = 100001
            for j in i:
                if len(j) == 2:
                    if 100000 < max_rank:
                        max_rank = 100000
                    if j in self.chars_dic_two.keys():
                        if self.chars_dic_two[j] < max_rank:
                            max_rank = self.chars_dic_two[j]
            rank.append(max_rank)
        if min(rank) < 100000:
            for i, j in enumerate(rank):
                if j == min(rank):
                    res_list.append(step6_list[i])
        else:
            for i, j in enumerate(rank):
                if j == 100001:
                    res_list.append(step6_list[i])
                elif j == min(rank):
                    res_list.append(step6_list[i])
        return res_list

    def step1to7(self, contents, max_len):
        # Run the filters in order, stopping as soon as one candidate remains.
        res = [text for text in self.step1(contents)]
        if len(res) == 1:
            res = res[0]
        else:
            return [res[1]]
        res = self.getChunk_max('', contents, 0, [], [], max_len)

        res = [text for text in self.step2_2(res)]
        if len(res) == 1:
            return res[0]

        res = self.step3(res)
        if len(res) == 1:
            return res[0]

        res = self.step4_5(res)
        if len(res) == 1:
            return res[0]

        res = self.step6(res)
        if len(res) == 1:
            return res[0]

        res = self.step7(res)
        return res[0]

    def all_step1to7(self, contents, max_len):
        # Consume contents from left to right, collecting one word per pass.
        contents = [i.replace(' ', '') for i in str(contents)]
        contents = ''.join(contents)
        contents = contents.lower()
        temp = 0
        leaf = []
        log = []
        while temp == 0:
            if contents[0] == ' ':
                contents = contents[1:]
                continue  # fixed: was the typo "continuex"
            res = self.step1to7(contents, max_len)
            # if the dictionary has no entry for this character, log it
            if res == []:
                log.append(contents[0])
                contents = contents[1:]
            else:
                res = res[0]
                leaf.append(res)
                contents = contents[len(res):]
            if len(contents) <= 0:
                temp = 1
        return leaf, log

    def leaf_one(self, contents):
        try:
            # convert full-width characters to half-width
            contents = unicodedata.normalize('NFKC', str(contents))
            contents = str(contents).replace(' ', '').replace('\t', '')
            leaf, log = self.all_step1to7(contents, 10)
            for i, j in enumerate(leaf):
                if j in self.word_dic.keys():
                    leaf[i] = self.word_dic[j]

            leaf_line = ' '.join(leaf)
            return leaf_line
        except Exception as e:
            print('==============================')
            print(contents)
            print(e)
            print('==============================')

    def seg_one(self, contents):
        # try:
        # convert full-width characters to half-width
        contents = unicodedata.normalize('NFKC', str(contents))
        contents = str(contents).replace(' ', '').replace('\t', '')
        leaf, log = self.all_step1to7(contents, 10)
        seg_line = ' '.join(leaf)
        return seg_line
        # except Exception as e:
        #     print('==============================')
        #     print(contents)
        #     print(e)
        #     print('==============================')


# if __name__ == "__main__":
#     test = segmentation(["保險法", "刑法"])
#     test_f = test.seg_one("保險法第一條")
#     print(test_f)
```