ChatBot_prototype_streamlit / segmentation.py
Johnathan's picture
add other code
143dcd3
#!/usr/bin/env python
# coding: utf-8
import json
import unicodedata
import pandas as pd
from seg_file import userdict, numberEnglish_dic_t, chars_dic_t, chars_dic_two_t
class segmentation():
    """Dictionary-driven word segmenter using a cascade of tie-break rules.

    Candidate "chunks" (lists of up to three consecutive dictionary words)
    are enumerated from the start of the text and then filtered by a series
    of rules (``step2_2`` .. ``step7``) until a single chunk remains.  Only
    the first word of the winning chunk is consumed; the process repeats on
    the remaining text.

    NOTE(review): the rule order looks like the MMSEG algorithm (maximum
    matching, largest average word length, smallest variance, character
    frequency) -- confirm against the project's design notes.

    Attributes:
        word_dic: mapping of known words; keys are matched against the text
            and values are used by ``leaf_one`` as replacement tokens.
        numberEnglish_dic: container of characters treated as one
            "number/English" run by ``step1``.
        chars_dic: rank of single-character words (lower value wins).
        chars_dic_two: rank of two-character words (lower value wins).
    """

    # A chunk is closed as soon as it holds this many words.
    _CHUNK_WORDS = 3
    # step4_5 sentinels: chunk has a single-char word missing from
    # chars_dic / chunk has no single-char word at all.
    _UNRANKED_SINGLE = 10000
    _NO_SINGLE = 10001
    # step7 sentinels, same meaning for two-character words.
    _UNRANKED_PAIR = 100000
    _NO_PAIR = 100001

    def __init__(self):
        self.word_dic = userdict
        self.numberEnglish_dic = numberEnglish_dic_t
        self.chars_dic = chars_dic_t
        # Fixed: this used to be bound to chars_dic_t, so step7 looked up
        # two-character words in the single-character table.
        self.chars_dic_two = chars_dic_two_t

    def step1(self, contents):
        """Split off a leading run of number/English characters.

        Yields exactly one item (``contents`` unchanged) when the text is
        empty, does not start with a number/English character, or when the
        leading run is itself a dictionary word.  Otherwise yields two
        items: first the remainder after the run, then the run itself.
        """
        if not contents or contents[0] not in self.numberEnglish_dic:
            yield contents
            return
        end = 1
        while end < len(contents) and contents[end] in self.numberEnglish_dic:
            end += 1
        run = contents[:end]
        if run in self.word_dic:
            yield contents
        else:
            yield contents[end:]
            yield run

    def _close_word(self, word, rest, chunknum, nowchunk, comparechunklist, max_len):
        """Append ``word`` to the chunk, then record it or keep matching ``rest``."""
        chunknum += 1
        nowchunk.append(word)
        if chunknum == self._CHUNK_WORDS:
            if nowchunk not in comparechunklist:
                comparechunklist.append(nowchunk)
            return comparechunklist
        return self.getChunk_max('', rest, chunknum, nowchunk.copy(),
                                 comparechunklist, max_len)

    def getChunk_max(self, nowcomparestr, contents, chunknum, nowchunk, comparechunklist, max_len):
        """Collect candidate chunks starting at the head of ``contents``.

        Args:
            nowcomparestr: word already matched at the current position that
                the call tries to extend; ``''`` when starting a new word.
            contents: text not yet consumed.
            chunknum: number of words already in ``nowchunk``.
            nowchunk: words collected so far (mutated in place).
            comparechunklist: accumulator of distinct chunks (mutated in
                place and returned).
            max_len: maximum word length, in characters, to try.

        Returns:
            ``comparechunklist``; ``None`` only when there is no room left
            to extend ``nowcomparestr`` (``max_len - len(nowcomparestr) <= 0``),
            which callers of that exploratory call ignore.
        """
        if len(contents) == 0:
            # Nothing left to read: flush the pending word and record the chunk.
            if nowcomparestr:
                nowchunk.append(nowcomparestr)
            if nowchunk not in comparechunklist:
                comparechunklist.append(nowchunk)
            return comparechunklist

        window = max_len - len(nowcomparestr)
        at_end = False
        for i in range(window):
            if i + 1 >= len(contents):
                at_end = True
            candidate = nowcomparestr + contents[:i + 1]
            rest = contents[i + 1:]
            if candidate in self.word_dic:
                # First explore longer words that start with this candidate
                # (on a copy, so the current chunk is not disturbed) ...
                self.getChunk_max(candidate, rest, chunknum, nowchunk.copy(),
                                  comparechunklist, max_len)
                # ... then accept the candidate itself.
                return self._close_word(candidate, rest, chunknum, nowchunk,
                                        comparechunklist, max_len)
            if at_end or i + 1 == window:
                if nowcomparestr:
                    # No longer word exists: settle for the pending one.
                    return self._close_word(nowcomparestr, contents, chunknum,
                                            nowchunk, comparechunklist, max_len)
                # No dictionary word starts here: record the chunk as it is.
                return self.getChunk_max('', '', chunknum, nowchunk.copy(),
                                         comparechunklist, max_len)
        return None

    def step2_2(self, chunklist):
        """Yield the chunks whose total character length is the largest."""
        if not chunklist:
            return
        totals = [sum(len(word) for word in chunk) for chunk in chunklist]
        longest = max(totals)
        for chunk, total in zip(chunklist, totals):
            if total == longest:
                yield chunk

    def step3(self, input_list):
        """Return the chunks with the largest average word length.

        Equal chunks are collapsed onto their first occurrence.
        """
        averages = {}
        for chunk in input_list:
            # Keyed by first index so that duplicates share one entry.
            averages[input_list.index(chunk)] = sum(len(word) for word in chunk) / len(chunk)
        if not averages:
            return []
        best = max(averages.values())
        return [input_list[idx] for idx, avg in averages.items() if avg == best]

    def step4_5(self, step3_list):
        """Filter chunks by the best rank among their single-character words.

        Each chunk gets the smallest ``chars_dic`` value of its
        single-character words (10000 if such a word is not in
        ``chars_dic``, 10001 if the chunk has no single-character word).
        If any chunk scores below 10000 only the best-scoring chunks are
        kept; otherwise every chunk is kept.
        """
        ranks = []
        for chunk in step3_list:
            best_rank = self._NO_SINGLE
            for word in chunk:
                if len(word) == 1:
                    best_rank = min(best_rank, self._UNRANKED_SINGLE,
                                    self.chars_dic.get(word, self._UNRANKED_SINGLE))
            ranks.append(best_rank)
        top_rank = min(ranks)
        if top_rank < self._UNRANKED_SINGLE:
            # At least one chunk contains a ranked single-character word.
            return [chunk for chunk, rank in zip(step3_list, ranks) if rank == top_rank]
        # Either no single-character words at all (10001) or none of them is
        # in the table (10000): nothing to discriminate on.
        return step3_list[:]

    def step6(self, step5_list):
        """Return the chunks whose word lengths deviate least from their mean."""
        spreads = []
        for chunk in step5_list:
            mean = sum(len(word) for word in chunk) / len(chunk)
            spreads.append(sum((len(word) - mean) ** 2 for word in chunk))
        smallest = min(spreads)
        return [chunk for chunk, spread in zip(step5_list, spreads) if spread == smallest]

    def step7(self, step6_list):
        """Filter chunks by the best rank among their two-character words.

        Mirrors ``step4_5`` using ``chars_dic_two`` and the sentinels
        100000 / 100001.
        """
        ranks = []
        for chunk in step6_list:
            best_rank = self._NO_PAIR
            for word in chunk:
                if len(word) == 2:
                    best_rank = min(best_rank, self._UNRANKED_PAIR,
                                    self.chars_dic_two.get(word, self._UNRANKED_PAIR))
            ranks.append(best_rank)
        top_rank = min(ranks)
        if top_rank < self._UNRANKED_PAIR:
            return [chunk for chunk, rank in zip(step6_list, ranks) if rank == top_rank]
        # Every rank is 100000 or 100001 here, so all chunks are kept.
        return list(step6_list)

    def step1to7(self, contents, max_len):
        """Pick the best chunk for the head of ``contents``.

        Returns a list of words.  It is ``[run]`` when ``step1`` split off a
        number/English run, and ``[]`` when no dictionary word starts at the
        first character.
        """
        pieces = list(self.step1(contents))
        if len(pieces) != 1:
            return [pieces[1]]
        res = self.getChunk_max('', contents, 0, [], [], max_len)
        res = list(self.step2_2(res))
        if len(res) == 1:
            return res[0]
        for rule in (self.step3, self.step4_5, self.step6):
            res = rule(res)
            if len(res) == 1:
                return res[0]
        return self.step7(res)[0]

    def all_step1to7(self, contents, max_len):
        """Segment the whole text.

        Spaces are removed and the text is lower-cased first.

        Returns:
            ``(leaf, log)``: the words found, in order, and the characters
            that were skipped because no dictionary word starts with them.
        """
        contents = str(contents).replace(' ', '').lower()
        leaf = []
        log = []
        # Looping on truthiness also makes empty input return ([], [])
        # instead of failing on contents[0].
        while contents:
            res = self.step1to7(contents, max_len)
            if res == []:
                # The dictionary has no word for this character: log it.
                log.append(contents[0])
                contents = contents[1:]
            else:
                word = res[0]
                leaf.append(word)
                contents = contents[len(word):]
        return leaf, log

    def leaf_one(self, contents):
        """Segment ``contents`` and map each word through ``word_dic``.

        Returns the space-joined tokens.  Any exception is printed and
        swallowed, in which case ``None`` is returned.
        """
        try:
            # Full-width to half-width (and other compatibility forms).
            contents = unicodedata.normalize('NFKC', str(contents))
            contents = str(contents).replace(' ', '').replace('\t', '')
            leaf, log = self.all_step1to7(contents, 10)
            for i, word in enumerate(leaf):
                if word in self.word_dic:
                    leaf[i] = self.word_dic[word]
            leaf_line = ' '.join(leaf)
            return leaf_line
        except Exception as e:
            print('==============================')
            print(contents)
            print(e)
            print('==============================')

    def seg_one(self, contents):
        """Segment ``contents`` and return the words joined by spaces.

        Unlike ``leaf_one`` the words are not mapped through ``word_dic``
        and exceptions propagate to the caller.
        """
        # Full-width to half-width (and other compatibility forms).
        contents = unicodedata.normalize('NFKC', str(contents))
        contents = str(contents).replace(' ', '').replace('\t', '')
        leaf, log = self.all_step1to7(contents, 10)
        return ' '.join(leaf)
# if __name__ == "__main__":
# test = segmentation(["保險法", "刑法"])
# test_f = test.seg_one("保險法第一條")
# print(test_f)