Spaces:

Johnathan
/

ChatBot_prototype_streamlit

Runtime error

App Files Files Community

ChatBot_prototype_streamlit / segmentation.py

Johnathan

add other code

143dcd3 almost 3 years ago

raw

history blame contribute delete

10.9 kB

	#!/usr/bin/env python
	# coding: utf-8

	import json
	import unicodedata
	import pandas as pd

	from seg_file import userdict, numberEnglish_dic_t, chars_dic_t, chars_dic_two_t


	class segmentation():
	    """Dictionary-based word segmenter that picks words from three-word chunks.

	    For the text still to be segmented, every chunk of up to three
	    consecutive dictionary words is enumerated (``getChunk_max``).  The
	    candidates are narrowed by successive tie-breaking rules (``step2_2``
	    through ``step7``) and the first word of the surviving chunk is emitted.
	    NOTE(review): the rule order looks modelled on the MMSEG algorithm --
	    confirm with the original author.

	    Attributes:
	        word_dic: mapping whose keys are the known words; ``leaf_one`` also
	            uses its values as replacement text.
	        numberEnglish_dic: container of the characters that ``step1`` treats
	            as one digit/letter run.
	        chars_dic: mapping of one-character words to a rank (lowest wins).
	        chars_dic_two: mapping of two-character words to a rank (lowest wins).
	    """

	    # A chunk is complete once it holds this many words.
	    _CHUNK_WORDS = 3
	    # Longest word, in characters, tried by leaf_one / seg_one.
	    _MAX_WORD_LEN = 10
	    # Rank given to a one-/two-character word missing from its rank table;
	    # a chunk with no such word at all gets this value plus one.
	    _UNKNOWN_SINGLE_RANK = 10000
	    _UNKNOWN_DOUBLE_RANK = 100000

	    def __init__(self):
	        """Bind the dictionaries imported from ``seg_file``."""
	        self.word_dic = userdict
	        self.numberEnglish_dic = numberEnglish_dic_t
	        self.chars_dic = chars_dic_t
	        # Fixed: this used to be bound to chars_dic_t, which left the imported
	        # two-character table unused and made step7 rank against the
	        # one-character table.
	        self.chars_dic_two = chars_dic_two_t

	    # ------------------------------------------------------------------
	    # private helpers
	    # ------------------------------------------------------------------
	    @staticmethod
	    def _record_chunk(chunk, comparechunklist):
	        """Append ``chunk`` to ``comparechunklist`` unless an equal chunk is there."""
	        if chunk not in comparechunklist:
	            comparechunklist.append(chunk)

	    @staticmethod
	    def _best_rank(chunk, word_len, table, unknown_rank):
	        """Return the lowest rank among the words of ``chunk`` that have ``word_len`` characters.

	        Words missing from ``table`` count as ``unknown_rank``; a chunk without
	        any word of that length returns ``unknown_rank + 1``.
	        """
	        best = unknown_rank + 1
	        for word in chunk:
	            if len(word) != word_len:
	                continue
	            best = min(best, unknown_rank)
	            if word in table:
	                best = min(best, table[word])
	        return best

	    @staticmethod
	    def _length_spread(chunk):
	        """Return the sum of squared deviations of the word lengths from their mean."""
	        if not chunk:
	            return 0
	        mean = sum(len(word) for word in chunk) / len(chunk)
	        return sum((len(word) - mean) ** 2 for word in chunk)

	    @staticmethod
	    def _normalize(contents):
	        """NFKC-normalize (full-width to half-width) and drop spaces and tabs."""
	        text = unicodedata.normalize('NFKC', str(contents))
	        return text.replace(' ', '').replace('\t', '')

	    # ------------------------------------------------------------------
	    # segmentation rules
	    # ------------------------------------------------------------------
	    def step1(self, contents):
	        """Split off a leading run of digit/letter characters.

	        Yields two items -- the remainder, then the run -- when ``contents``
	        starts with characters from ``numberEnglish_dic`` and that run is not
	        a dictionary word.  In every other case ``contents`` is yielded
	        unchanged as the only item.
	        """
	        if not contents or contents[0] not in self.numberEnglish_dic:
	            yield contents
	            return

	        end = 1
	        while end < len(contents) and contents[end] in self.numberEnglish_dic:
	            end += 1

	        run = contents[:end]
	        if run in self.word_dic:
	            yield contents
	        else:
	            yield contents[end:]
	            yield run

	    def getChunk_max(self, nowcomparestr, contents, chunknum, nowchunk, comparechunklist, max_len):
	        """Collect every chunk of up to three dictionary words that starts ``contents``.

	        Args:
	            nowcomparestr: a dictionary word matched already that the search
	                tries to lengthen with the next characters; ``''`` when a new
	                word is being started.
	            contents: the text not consumed yet.
	            chunknum: number of words already in ``nowchunk``.
	            nowchunk: the words chosen so far; appended to in place, so pass
	                a list the caller does not reuse.
	            comparechunklist: accumulator for the finished chunks; mutated in
	                place, equal chunks are stored once.
	            max_len: longest word length, in characters, that is tried.

	        Returns:
	            ``comparechunklist``.
	        """
	        if not contents:
	            # Nothing left to read: close the chunk with the pending word.
	            if nowcomparestr:
	                nowchunk.append(nowcomparestr)
	            self._record_chunk(nowchunk, comparechunklist)
	            return comparechunklist

	        budget = max_len - len(nowcomparestr)
	        for i in range(budget):
	            candidate = nowcomparestr + contents[:i + 1]
	            rest = contents[i + 1:]

	            if candidate in self.word_dic:
	                # Explore the alternative where this word is lengthened
	                # further; results are collected through comparechunklist.
	                self.getChunk_max(candidate, rest, chunknum, nowchunk.copy(),
	                                  comparechunklist, max_len)
	                # Then accept the word as it is and move on to the next one.
	                nowchunk.append(candidate)
	                if chunknum + 1 == self._CHUNK_WORDS:
	                    self._record_chunk(nowchunk, comparechunklist)
	                    return comparechunklist
	                return self.getChunk_max('', rest, chunknum + 1, nowchunk.copy(),
	                                         comparechunklist, max_len)

	            if i + 1 < len(contents) and i + 1 != budget:
	                continue  # a longer candidate can still be tried

	            # No longer candidate exists.
	            if not nowcomparestr:
	                # No dictionary word starts here: the chunk ends as it is.
	                self._record_chunk(nowchunk, comparechunklist)
	                return comparechunklist
	            # The pending word cannot be lengthened: accept it unchanged.
	            nowchunk.append(nowcomparestr)
	            if chunknum + 1 == self._CHUNK_WORDS:
	                self._record_chunk(nowchunk, comparechunklist)
	                return comparechunklist
	            return self.getChunk_max('', contents, chunknum + 1, nowchunk.copy(),
	                                     comparechunklist, max_len)

	        # Only reached when ``budget`` <= 0 (the pending word already has
	        # max_len characters, or max_len < 1); the caller's "accept" branch
	        # records the word, so nothing is added here.
	        return comparechunklist

	    def step2_2(self, chunklist):
	        """Yield, in order, the chunks whose total character count is the largest."""
	        totals = [sum(len(word) for word in chunk) for chunk in chunklist]
	        if not totals:
	            return
	        longest = max(totals)
	        for chunk, total in zip(chunklist, totals):
	            if total == longest:
	                yield chunk

	    def step3(self, input_list):
	        """Return the chunks with the largest average word length.

	        An empty chunk counts as average 0; an empty ``input_list`` gives ``[]``.
	        """
	        # Fixed: the averages used to be merged with ``{avg_len, {...}}``, a
	        # set display holding dicts, which raises TypeError (unhashable type).
	        averages = [
	            sum(len(word) for word in chunk) / len(chunk) if chunk else 0
	            for chunk in input_list
	        ]
	        if not averages:
	            return []
	        best = max(averages)
	        return [chunk for chunk, avg in zip(input_list, averages) if avg == best]

	    def step4_5(self, step3_list):
	        """Filter chunks by the best (lowest) rank of their one-character words.

	        When at least one chunk holds a one-character word ranked in
	        ``chars_dic``, only the chunks reaching the lowest such rank are kept;
	        otherwise a copy of the whole list is returned.
	        """
	        if not step3_list:
	            return []
	        unknown = self._UNKNOWN_SINGLE_RANK
	        ranks = [self._best_rank(chunk, 1, self.chars_dic, unknown)
	                 for chunk in step3_list]
	        top_rank = min(ranks)
	        if top_rank >= unknown:
	            # Either no chunk holds a one-character word, or none of those
	            # words is in the rank table: this rule cannot decide.
	            return step3_list[:]
	        return [chunk for chunk, rank in zip(step3_list, ranks) if rank == top_rank]

	    def step6(self, step5_list):
	        """Return the chunks whose word lengths vary the least.

	        The spread of a chunk is the sum of squared differences between each
	        word length and the chunk's mean word length.
	        """
	        if not step5_list:
	            return []
	        spreads = [self._length_spread(chunk) for chunk in step5_list]
	        smallest = min(spreads)
	        return [chunk for chunk, spread in zip(step5_list, spreads)
	                if spread == smallest]

	    def step7(self, step6_list):
	        """Filter chunks by the best (lowest) rank of their two-character words.

	        Same rule as ``step4_5`` but for two-character words and
	        ``chars_dic_two``; when no chunk holds a ranked two-character word
	        every chunk is kept.
	        """
	        if not step6_list:
	            return []
	        unknown = self._UNKNOWN_DOUBLE_RANK
	        ranks = [self._best_rank(chunk, 2, self.chars_dic_two, unknown)
	                 for chunk in step6_list]
	        top_rank = min(ranks)
	        if top_rank >= unknown:
	            return list(step6_list)
	        return [chunk for chunk, rank in zip(step6_list, ranks) if rank == top_rank]

	    def step1to7(self, contents, max_len):
	        """Return the best chunk (a list of words) for the start of ``contents``.

	        Returns ``[run]`` when ``contents`` starts with a digit/letter run that
	        is not a dictionary word, and ``[]`` when no dictionary word starts at
	        the first character.
	        """
	        parts = list(self.step1(contents))
	        if len(parts) != 1:
	            # step1 split off an unknown digit/letter run: emit it whole.
	            return [parts[1]]

	        candidates = self.getChunk_max('', contents, 0, [], [], max_len)
	        # Apply the rules in order and stop as soon as one chunk is left.
	        for rule in (self.step2_2, self.step3, self.step4_5, self.step6):
	            candidates = list(rule(candidates))
	            if not candidates:
	                return []
	            if len(candidates) == 1:
	                return candidates[0]

	        candidates = self.step7(candidates)
	        # Several chunks may still tie; the first one wins.
	        return candidates[0] if candidates else []

	    def all_step1to7(self, contents, max_len):
	        """Segment ``contents`` completely.

	        Spaces are removed and the text is lower-cased first.

	        Returns:
	            A ``(leaf, log)`` tuple: ``leaf`` lists the words in order and
	            ``log`` lists the characters at which no dictionary word starts
	            (those characters are left out of ``leaf``).  Empty input gives
	            ``([], [])``.
	        """
	        contents = str(contents).replace(' ', '').lower()
	        leaf = []
	        log = []
	        # Fixed: looping on the remaining text (instead of indexing
	        # contents[0] up front) makes empty input safe; the old dead branch
	        # containing the ``continuex`` typo is gone.
	        while contents:
	            chunk = self.step1to7(contents, max_len)
	            if not chunk or not chunk[0]:
	                # The dictionary has no word starting here: log the character.
	                log.append(contents[0])
	                contents = contents[1:]
	            else:
	                word = chunk[0]
	                leaf.append(word)
	                contents = contents[len(word):]
	        return leaf, log

	    # ------------------------------------------------------------------
	    # public entry points
	    # ------------------------------------------------------------------
	    def leaf_one(self, contents):
	        """Segment ``contents`` and return the mapped words joined by spaces.

	        Each word that is a key of ``word_dic`` is replaced by its value.
	        Any exception is printed and ``None`` is returned (best effort).
	        """
	        try:
	            leaf, _log = self.all_step1to7(self._normalize(contents),
	                                           self._MAX_WORD_LEN)
	            mapped = [self.word_dic[word] if word in self.word_dic else word
	                      for word in leaf]
	            return ' '.join(mapped)
	        except Exception as e:
	            print('==============================')
	            print(contents)
	            print(e)
	            print('==============================')
	            return None

	    def seg_one(self, contents):
	        """Segment ``contents`` and return the words joined by single spaces.

	        Unlike ``leaf_one`` the words are not mapped through ``word_dic`` and
	        exceptions propagate to the caller.
	        """
	        leaf, _log = self.all_step1to7(self._normalize(contents),
	                                       self._MAX_WORD_LEN)
	        return ' '.join(leaf)

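	# Example usage (left commented out; it needs the dictionaries from seg_file):
	#
	# if __name__ == "__main__":
	#     test = segmentation()  # the constructor takes no arguments
	#     test_f = test.seg_one("保險法第一條")
	#     print(test_f)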