# Spaces:
# Runtime error
# Runtime error
#!/usr/bin/env python | |
# coding: utf-8 | |
import json | |
import unicodedata | |
import pandas as pd | |
from seg_file import userdict, numberEnglish_dic_t, chars_dic_t, chars_dic_two_t | |
class segmentation():
    """Dictionary-driven word segmenter based on three-word chunk matching.

    The text is consumed from left to right.  At the current position every
    sequence of up to three consecutive dictionary words (a "chunk") is
    enumerated by ``getChunk_max``; the candidates are then narrowed by the
    filters ``step2_2``, ``step3``, ``step4_5``, ``step6`` and ``step7`` (in
    that order).  Only the first word of the surviving chunk is emitted, then
    the procedure restarts on the remaining text.

    Attributes:
        word_dic: mapping whose keys are the known words.  ``leaf_one`` also
            replaces every emitted word by the value stored under it.
        numberEnglish_dic: container used for membership tests on single
            characters (presumably digits and Latin letters -- confirm
            against ``seg_file``).  A leading run of such characters that is
            not itself a key of ``word_dic`` is emitted as one token.
        chars_dic: mapping from one-character words to a comparable rank;
            the smallest rank wins in ``step4_5``.
        chars_dic_two: mapping from two-character words to a comparable
            rank; the smallest rank wins in ``step7``.
    """

    def __init__(self, *, word_dic=None, numberEnglish_dic=None,
                 chars_dic=None, chars_dic_two=None):
        """Bind the lookup tables used by the segmenter.

        Every argument is keyword-only and optional; an omitted table falls
        back to the corresponding object imported from ``seg_file``.
        """
        self.word_dic = userdict if word_dic is None else word_dic
        self.numberEnglish_dic = (
            numberEnglish_dic_t if numberEnglish_dic is None
            else numberEnglish_dic
        )
        self.chars_dic = chars_dic_t if chars_dic is None else chars_dic
        # The two-character table must come from chars_dic_two_t; it used to
        # be bound to chars_dic_t, which left chars_dic_two_t unused.
        self.chars_dic_two = (
            chars_dic_two_t if chars_dic_two is None else chars_dic_two
        )

    @staticmethod
    def _record(chunk, comparechunklist):
        """Store *chunk* unless an equal chunk is already present."""
        if chunk not in comparechunklist:
            comparechunklist.append(chunk)
        return comparechunklist

    def _best_ranked(self, chunks, word_len, rank_table, unknown_rank):
        """Keep the chunks whose best-ranked word of *word_len* chars wins.

        Each chunk is scored with the smallest ``rank_table`` value among its
        words of exactly ``word_len`` characters.  A chunk that has such
        words but none of them in ``rank_table`` scores ``unknown_rank``; a
        chunk without any such word scores ``unknown_rank + 1``.  When the
        best score is below ``unknown_rank`` only the chunks reaching it are
        returned, otherwise no chunk can be preferred and all are returned.
        """
        if not chunks:
            return []
        scores = []
        for chunk in chunks:
            score = unknown_rank + 1
            for word in chunk:
                if len(word) == word_len:
                    score = min(score, unknown_rank,
                                rank_table.get(word, unknown_rank))
            scores.append(score)
        best = min(scores)
        if best < unknown_rank:
            return [chunk for chunk, score in zip(chunks, scores)
                    if score == best]
        return list(chunks)

    def step1(self, contents):
        """Split off a leading run of ``numberEnglish_dic`` characters.

        Yields exactly one item (``contents`` unchanged) when the text is
        empty, does not start with such a character, or when the leading run
        is itself a key of ``word_dic``.  Otherwise yields two items: first
        the text after the run, then the run.
        """
        if not contents or contents[0] not in self.numberEnglish_dic:
            yield contents
            return
        run_end = 1
        while (run_end < len(contents)
               and contents[run_end] in self.numberEnglish_dic):
            run_end += 1
        run = contents[:run_end]
        if run in self.word_dic:
            yield contents
        else:
            yield contents[run_end:]
            yield run

    def getChunk_max(self, nowcomparestr, contents, chunknum, nowchunk,
                     comparechunklist, max_len):
        """Collect every chunk of up to three dictionary words.

        Args:
            nowcomparestr: dictionary word already matched at the current
                position that the call tries to extend; ``''`` when a new
                word is being started.
            contents: text that follows ``nowcomparestr``.
            chunknum: number of words already committed to ``nowchunk``.
            nowchunk: words committed so far; appended to in place.
            comparechunklist: accumulator of distinct chunks; mutated.
            max_len: maximum number of characters considered for one word.

        Returns:
            ``comparechunklist``.  A chunk is stored when it reaches three
            words, when the text is exhausted, or when no dictionary word
            can be started at the current position (the stored chunk may
            then be empty).
        """
        if not contents:
            if nowcomparestr:
                nowchunk.append(nowcomparestr)
            return self._record(nowchunk, comparechunklist)

        budget = max_len - len(nowcomparestr)
        for end in range(1, budget + 1):
            candidate = nowcomparestr + contents[:end]
            if candidate in self.word_dic:
                # First explore the alternatives in which this word grows
                # further, on a private copy of the chunk ...
                self.getChunk_max(candidate, contents[end:], chunknum,
                                  nowchunk.copy(), comparechunklist, max_len)
                # ... then commit the word as it stands.
                word, rest = candidate, contents[end:]
            elif end < len(contents) and end != budget:
                continue  # a longer candidate may still be a word
            elif nowcomparestr:
                # No extension matched: the word matched earlier is final.
                word, rest = nowcomparestr, contents
            else:
                # Nothing starting here is a word: close the chunk as is.
                return self._record(nowchunk, comparechunklist)

            nowchunk.append(word)
            chunknum += 1
            if chunknum == 3:
                return self._record(nowchunk, comparechunklist)
            return self.getChunk_max('', rest, chunknum, nowchunk.copy(),
                                     comparechunklist, max_len)
        # No character budget left to extend (previously returned None).
        return comparechunklist

    def step2_2(self, chunklist):
        """Yield the chunks covering the largest number of characters."""
        if not chunklist:
            return
        totals = [sum(len(word) for word in chunk) for chunk in chunklist]
        longest = max(totals)
        for chunk, total in zip(chunklist, totals):
            if total == longest:
                yield chunk

    def step3(self, input_list):
        """Return the chunks with the largest average word length.

        An empty chunk counts as average 0 instead of dividing by zero.
        """
        if not input_list:
            return []
        averages = [
            sum(len(word) for word in chunk) / len(chunk) if chunk else 0
            for chunk in input_list
        ]
        best = max(averages)
        return [chunk for chunk, average in zip(input_list, averages)
                if average == best]

    def step4_5(self, step3_list):
        """Filter chunks by the rank of their one-character words.

        Scores follow ``chars_dic`` (smallest wins): 10000 means the chunk
        has one-character words but none is in ``chars_dic``; 10001 means the
        chunk has no one-character word.  If no chunk scores below 10000 all
        chunks are returned.
        """
        return self._best_ranked(step3_list, 1, self.chars_dic, 10000)

    def step6(self, step5_list):
        """Return the chunks whose word lengths vary the least.

        The score is the sum of squared deviations of the word lengths from
        their mean.  It is computed as ``(n * sum(l**2) - sum(l)**2) / n``
        from integer sums, so chunks with the same lengths in a different
        order always tie exactly instead of depending on float rounding.
        """
        if not step5_list:
            return []
        spreads = []
        for chunk in step5_list:
            count = len(chunk)
            if count == 0:
                spreads.append(0)
                continue
            total = sum(len(word) for word in chunk)
            squares = sum(len(word) ** 2 for word in chunk)
            spreads.append((count * squares - total * total) / count)
        smallest = min(spreads)
        return [chunk for chunk, spread in zip(step5_list, spreads)
                if spread == smallest]

    def step7(self, step6_list):
        """Filter chunks by the rank of their two-character words.

        Same scheme as ``step4_5`` with ``chars_dic_two`` and the scores
        100000 (two-character words present, none ranked) and 100001 (no
        two-character word).
        """
        return self._best_ranked(step6_list, 2, self.chars_dic_two, 100000)

    def step1to7(self, contents, max_len):
        """Pick the best chunk at the start of *contents*.

        Returns:
            ``[run]`` when ``step1`` split off an alphanumeric run, the
            selected chunk (a list of one to three words) otherwise, or
            ``[]`` when no dictionary word starts at this position.
        """
        pieces = list(self.step1(contents))
        if len(pieces) != 1:
            return [pieces[1]]

        candidates = self.getChunk_max('', contents, 0, [], [], max_len)
        filters = (
            lambda chunks: list(self.step2_2(chunks)),
            self.step3,
            self.step4_5,
            self.step6,
            self.step7,
        )
        for narrow in filters:
            candidates = narrow(candidates)
            if not candidates:
                return []
            if len(candidates) == 1:
                return candidates[0]
        # Still ambiguous after every filter: keep the first candidate.
        return candidates[0]

    def all_step1to7(self, contents, max_len):
        """Segment the whole text.

        Spaces are removed and the text is lower-cased first.

        Returns:
            ``(leaf, log)``: ``leaf`` is the list of emitted words, ``log``
            the list of characters that were skipped because no dictionary
            word starts with them.  Empty input yields ``([], [])``.
        """
        contents = str(contents).replace(' ', '').lower()
        leaf = []
        log = []
        while contents:
            chunk = self.step1to7(contents, max_len)
            if not chunk:
                # The dictionary has nothing for this character: log it.
                log.append(contents[0])
                contents = contents[1:]
            else:
                word = chunk[0]
                leaf.append(word)
                contents = contents[len(word):]
        return leaf, log

    def leaf_one(self, contents, max_len=10):
        """Segment *contents* and map every word through ``word_dic``.

        Returns:
            The mapped words joined by single spaces, or ``None`` when an
            error occurred (the error is printed, not raised).
        """
        try:
            # Convert full-width characters to half-width.
            contents = unicodedata.normalize('NFKC', str(contents))
            contents = contents.replace(' ', '').replace('\t', '')
            leaf, _log = self.all_step1to7(contents, max_len)
            return ' '.join(self.word_dic.get(word, word) for word in leaf)
        except Exception as e:
            # Deliberately best-effort: report the failing input and go on.
            print('==============================')
            print(contents)
            print(e)
            print('==============================')
            return None

    def seg_one(self, contents, max_len=10):
        """Segment *contents* and return the words joined by single spaces."""
        # Convert full-width characters to half-width.
        contents = unicodedata.normalize('NFKC', str(contents))
        contents = contents.replace(' ', '').replace('\t', '')
        leaf, _log = self.all_step1to7(contents, max_len)
        return ' '.join(leaf)
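# NOTE(review): stale example -- check the constructor call below against
# segmentation.__init__ before re-enabling it.
# if __name__ == "__main__":
#     test = segmentation(["保險法", "刑法"])
#     test_f = test.seg_one("保險法第一條")
#     print(test_f)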