Johnathan committed on
Commit
143dcd3
1 Parent(s): c7186c4

add other code

Browse files
Files changed (3) hide show
  1. file_setting.py +46 -0
  2. seg_file.py +65 -0
  3. segmentation.py +310 -0
file_setting.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+ import sys
5
+
6
+ module_dir = os.path.dirname(__file__)
7
+
8
+ data_dir = os.path.join(module_dir, "data")
9
+
10
+ leaf_idf_path = os.path.join(data_dir, "leaf_idf_2022-05-26.txt")
11
+ leaf_IDF_path = os.path.join(data_dir, "leaf_IDF_20220609.txt")
12
+ leaf_conversion_path = os.path.join(data_dir, "leaf_conversion_20220520.xlsx")
13
+
14
+ # leaf_idf
15
+ leaf_idf_dict = dict()
16
+ with open(leaf_idf_path, mode = "r", encoding = "utf-8") as r:
17
+ for line in r:
18
+ if line[0] == ",":
19
+ leaf_idf_dict[","] = 0.9
20
+ continue
21
+ try:
22
+ tmp = line.split(",")
23
+ leaf_idf_dict[str(tmp[0].strip())] = float(tmp[1].strip())
24
+ except:
25
+ print(line)
26
+
27
+ # leaf_IDF
28
+ leaf_IDF_dict = dict()
29
+ with open(leaf_IDF_path, mode = "r", encoding = "utf-8") as r:
30
+ for line in r:
31
+ line_lst = line.split(" ")
32
+ leaf_IDF_dict[str(line_lst[0])] = float(line_lst[1])
33
+
34
+ # leaf_conversion
35
+ leaf_conversion_df = pd.read_excel(leaf_conversion_path)
36
+
37
+ leafconv_before_lst = []
38
+ for ele in leaf_conversion_df["before leaf"]:
39
+ tmp = ele.split(" ")
40
+ leafconv_before_lst.append(tmp)
41
+
42
+ leafconv_before_lst = sorted(leafconv_before_lst, key = len, reverse = True)
43
+
44
+ leafconv_before = [str(ele) for ele in leaf_conversion_df["before leaf"]]
45
+ leafconv_after = [str(ele) for ele in leaf_conversion_df["after leaf"]]
46
+ leafConv_dict = dict(zip(leafconv_before, leafconv_after))
seg_file.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import json
4
+ import unicodedata as uni
5
+ import sys
6
+
7
+ import collections
8
+
9
+ module_path = os.path.dirname(__file__)
10
+
11
+
12
+ dataFolder_path = os.path.join(module_path, "seg_data")
13
+
14
+ """
15
+ 斷詞用檔案
16
+ """
17
+ chars_2gram_path = os.path.join(dataFolder_path, "chars_2gram.txt")
18
+ chars_path = os.path.join(dataFolder_path, "chars_0317.txt")
19
+ numberenglish_path = os.path.join(dataFolder_path, "numberenglish.txt")
20
+ # userdict_path = os.path.join(dataFolder_path, "userdict_20220126.txt")
21
+ # userdict_path = os.path.join(dataFolder_path, "userdict_2022-04-29_remove_apostrophe.txt") # 0505 更新
22
+ userdict_path = os.path.join(dataFolder_path, "userdict_2022-06-16.txt") # 0527 更新
23
+
24
+
25
+
26
+
27
+ """
28
+ 以下為開啟檔案
29
+ """
30
+ #
31
+ #讀取dictionary
32
+ userdict = dict()
33
+ with open(userdict_path, encoding='utf-8') as f:
34
+ for word in f:
35
+ tmp_word = (uni.normalize("NFKC", word.strip())).split(" ")
36
+
37
+
38
+ userdict[str(tmp_word[0])] = str(tmp_word[1])
39
+
40
+
41
+ #這個字典為所有英數字單字的集合
42
+ numberEnglish_dic_t = []
43
+ with open(numberenglish_path, 'r', encoding='utf-8') as f:
44
+ for line in f:
45
+ line = line.strip('\n')
46
+ numberEnglish_dic_t.append(line)
47
+
48
+ #單一國字的出現頻率
49
+ chars_dic_t = {}
50
+ with open(chars_path, 'r', encoding='utf-8') as f:
51
+ for i, line in enumerate(f):
52
+ line = line.strip('\n')
53
+ chars_dic_t[line] = i
54
+
55
+ #兩個中文字的出現頻率
56
+ #chars_dic_two
57
+ chars_dic_two_t = {}
58
+ with open(chars_2gram_path, 'r', encoding='utf-8') as f:
59
+ for i, line in enumerate(f):
60
+ line = line.strip('\n')
61
+ chars_dic_two_t[line] = i
62
+
63
+ # print("read files for dictionary done")
64
+
65
+
segmentation.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ import json
5
+ import unicodedata
6
+ import pandas as pd
7
+
8
+ from seg_file import userdict, numberEnglish_dic_t, chars_dic_t, chars_dic_two_t
9
+
10
+
11
class segmentation():
    """Dictionary-based Chinese word segmenter (MMSEG-like).

    Candidate chunkings (up to three words matched forward from the current
    position) are generated by ``getChunk_max`` and then narrowed down by a
    sequence of tie-breaking rules:

    * step2_2 - keep chunkings covering the most characters
    * step3   - keep chunkings with the largest average word length
    * step4_5 - prefer more frequent single-character words
    * step6   - keep chunkings with the smallest word-length variance
    * step7   - prefer more frequent two-character words

    Dictionaries come from the ``seg_file`` module at import time.
    """

    def __init__(self):
        # word -> canonical replacement; its keys double as the lexicon.
        self.word_dic = userdict
        # All single alphanumeric tokens.
        self.numberEnglish_dic = numberEnglish_dic_t
        # Frequency rank of single characters (lower rank = more frequent).
        self.chars_dic = chars_dic_t
        # Frequency rank of character bigrams.
        # BUG FIX: this was `chars_dic_t`, so step7 compared bigrams against
        # the single-character table; the imported `chars_dic_two_t` was
        # never used at all.
        self.chars_dic_two = chars_dic_two_t

    def step1(self, contents):
        """Peel a leading alphanumeric run off *contents* (generator).

        If ``contents`` starts with alphanumeric characters and the whole
        run is not itself a dictionary word, yields the remainder first and
        then the run; otherwise yields ``contents`` unchanged.
        """
        if contents[0] in self.numberEnglish_dic:
            flag = 0
            i = 0
            while flag == 0:
                if i != len(contents) and contents[i] in self.numberEnglish_dic:
                    i += 1
                    continue
                else:
                    flag = 1
                    if contents[0:i] not in self.word_dic.keys():
                        yield contents[i:]
                        yield contents[0:i]
                    else:
                        yield contents
        else:
            yield contents

    def getChunk_max(self, nowcomparestr, contents, chunknum, nowchunk, comparechunklist, max_len):
        """Recursively enumerate candidate chunkings of *contents*.

        At each position every dictionary word of length <= *max_len* is
        tried; a path commits at most three words (``chunknum`` stops at 3).
        Every distinct chunking found is appended to *comparechunklist*,
        which is also returned.

        Args:
            nowcomparestr: prefix currently being extended ('' to start).
            contents: remaining text to segment.
            chunknum: number of words committed on the current path.
            nowchunk: words committed on the current path.
            comparechunklist: accumulator shared across the recursion.
            max_len: longest dictionary word length to try.
        """
        if len(contents) == 0:
            # End of input: flush any pending prefix and record the chunk.
            if len(nowcomparestr) != 0:
                chunknum += 1
                nowchunk.append(nowcomparestr)
            if nowchunk not in comparechunklist:
                comparechunklist.append(nowchunk)
            return comparechunklist
        else:
            if len(nowcomparestr) == 0:
                temp = 0  # set once the candidate window reaches the end of contents
                for i in range(max_len):
                    if i + 1 >= len(contents):
                        temp = 1
                    if contents[0:i + 1] in self.word_dic.keys():
                        # Branch A: keep extending this word on a copied path.
                        new = nowchunk.copy()
                        self.getChunk_max(contents[0:i + 1], contents[i + 1:], chunknum, new,
                                          comparechunklist, max_len)
                        # Branch B: commit the word here.
                        chunknum += 1
                        nowchunk.append(contents[0:i + 1])
                        if chunknum == 3:
                            if nowchunk not in comparechunklist:
                                comparechunklist.append(nowchunk)
                            return comparechunklist
                        else:
                            new = nowchunk.copy()
                            return self.getChunk_max('', contents[i + 1:], chunknum, new, comparechunklist, max_len)
                    else:
                        if temp == 0 and i + 1 != max_len:
                            continue
                        else:
                            # No dictionary word starts here within the window.
                            new = nowchunk.copy()
                            return self.getChunk_max('', '', chunknum, new, comparechunklist, max_len)
                    # NOTE(review): unreachable (both branches above return or
                    # continue); kept from the original code.
                    if temp == 1:
                        break
            else:
                temp = 0
                for i in range(max_len - len(nowcomparestr)):
                    if i + 1 >= len(contents):
                        temp = 1
                    if (nowcomparestr + contents[0:i + 1]) in self.word_dic.keys():
                        new = nowchunk.copy()
                        self.getChunk_max(nowcomparestr + contents[0:i + 1], contents[i + 1:], chunknum, new,
                                          comparechunklist, max_len)
                        chunknum += 1
                        nowchunk.append(nowcomparestr + contents[0:i + 1])
                        if chunknum == 3:
                            if nowchunk not in comparechunklist:
                                comparechunklist.append(nowchunk)
                            return comparechunklist
                        else:
                            new = nowchunk.copy()
                            return self.getChunk_max('', contents[i + 1:], chunknum, new, comparechunklist, max_len)
                    else:
                        if temp == 0 and i + 1 != max_len - len(nowcomparestr):
                            continue
                        else:
                            # Cannot extend further: commit the prefix as-is.
                            chunknum += 1
                            nowchunk.append(nowcomparestr)
                            if chunknum == 3:
                                if nowchunk not in comparechunklist:
                                    comparechunklist.append(nowchunk)
                                return comparechunklist
                            else:
                                new = nowchunk.copy()
                                return self.getChunk_max('', contents, chunknum, new, comparechunklist, max_len)
                    # NOTE(review): unreachable, as above; kept from the original.
                    if temp == 1:
                        break

    def step2_2(self, chunklist):
        """Yield the chunkings that cover the most characters."""
        len_chunklist = []
        for chunk in chunklist:
            len_chunklist.append(sum(len(word) for word in chunk))
        if not len_chunklist:
            return
        longest = max(len_chunklist)  # hoisted: was recomputed per element
        for i, chunk in enumerate(chunklist):
            if len_chunklist[i] == longest:
                yield chunk

    def step3(self, input_list):
        """Return the chunkings with the largest average word length."""
        avg_len = {}
        for idx, sep_list in enumerate(input_list):
            sep_len = sum(len(sep) for sep in sep_list)
            # enumerate() fixes the original `input_list.index(sep_list)`,
            # which collapsed duplicate chunkings onto the first index.
            avg_len[idx] = sep_len / len(sep_list)
        best = max(avg_len.values()) if avg_len else None
        return [input_list[key] for key, value in avg_len.items() if value == best]

    def step4_5(self, step3_list):
        """Prefer chunkings whose single-character words are most frequent.

        Per chunking, take the best (lowest) frequency rank among its
        single-character words: 10000 if a single occurs but is unranked,
        10001 if there is no single-character word at all. Keep the
        chunkings achieving the overall lowest value; if no chunking has a
        ranked single (value >= 10000), keep everything.
        """
        rank = []
        res_list = []
        for chunk in step3_list:
            # Despite the name, this tracks the MINIMUM rank seen.
            max_rank = 10001
            for word in chunk:
                if len(word) == 1:
                    if 10000 < max_rank:
                        max_rank = 10000
                    if word in self.chars_dic.keys():
                        if self.chars_dic[word] < max_rank:
                            max_rank = self.chars_dic[word]
            rank.append(max_rank)

        top_rank = min(rank)
        # Some chunking contains a ranked single-character word.
        if top_rank < 10000:
            res_list = [sentence for i, sentence in enumerate(step3_list) if rank[i] == top_rank]
        # Singles exist but none is ranked (10000), or no chunking has a
        # single-character word at all (10001): keep everything.
        else:
            res_list = step3_list[:]
        return res_list

    def step6(self, step5_list):
        """Keep the chunkings with the smallest variance of word lengths."""

        def Get_Average(l):
            # Mean word length of one chunking.
            total = 0
            for item in l:
                total += len(item)
            return total / len(l)

        total_list = []
        for chunk in step5_list:
            ave = Get_Average(chunk)
            spread = 0
            for word in chunk:
                spread += (len(word) - ave) ** 2
            total_list.append(spread)
        smallest = min(total_list)
        return [sentence for i, sentence in enumerate(step5_list) if total_list[i] == smallest]

    def step7(self, step6_list):
        """Final tie-break: prefer the most frequent two-character word.

        Per chunking, take the best (lowest) bigram rank: 100000 if a
        bigram occurs but is unranked, 100001 if the chunking has no
        two-character word.
        """
        rank = []
        res_list = []
        for chunk in step6_list:
            max_rank = 100001  # again, actually the minimum rank seen
            for word in chunk:
                if len(word) == 2:
                    if 100000 < max_rank:
                        max_rank = 100000
                    if word in self.chars_dic_two.keys():
                        if self.chars_dic_two[word] < max_rank:
                            max_rank = self.chars_dic_two[word]
            rank.append(max_rank)
        best = min(rank)  # hoisted out of the loops below
        if best < 100000:
            for i, r in enumerate(rank):
                if r == best:
                    res_list.append(step6_list[i])
        else:
            for i, r in enumerate(rank):
                if r == 100001:
                    res_list.append(step6_list[i])
                elif r == best:
                    res_list.append(step6_list[i])
        return res_list

    def step1to7(self, contents, max_len):
        """Run the full rule pipeline and return the winning chunking.

        Returns a list of words whose first element is the word chosen for
        the current position ([] inside when nothing matched).
        """
        res = list(self.step1(contents))
        if len(res) != 1:
            # step1 peeled off a leading alphanumeric run: return it directly.
            return [res[1]]

        res = self.getChunk_max('', contents, 0, [], [], max_len)

        res = list(self.step2_2(res))
        if len(res) == 1:
            return res[0]

        res = self.step3(res)
        if len(res) == 1:
            return res[0]

        res = self.step4_5(res)
        if len(res) == 1:
            return res[0]

        res = self.step6(res)
        if len(res) == 1:
            return res[0]

        res = self.step7(res)
        return res[0]

    def all_step1to7(self, contents, max_len):
        """Segment *contents* completely.

        Returns:
            (leaf, log): ``leaf`` is the list of segmented words; ``log``
            collects characters the dictionary could not match at all.
        """
        # Strip spaces and lowercase before matching.
        contents = str(contents).replace(' ', '').lower()
        temp = 0
        leaf = []
        log = []
        # Guard: the loop below would IndexError on empty input.
        if not contents:
            return leaf, log
        while temp == 0:
            if contents[0] == ' ':
                contents = contents[1:]
                # Fixed: was the typo `continuex`, a NameError if this
                # (space) branch were ever reached.
                continue
            res = self.step1to7(contents, max_len)
            # Dictionary has no word starting here: log the character.
            if res == []:
                log.append(contents[0])
                contents = contents[1:]
            else:
                res = res[0]
                leaf.append(res)
                contents = contents[len(res):]
            if len(contents) <= 0:
                temp = 1
        return leaf, log

    def leaf_one(self, contents):
        """Segment one sentence and map each word through the user dictionary.

        Returns the mapped words joined by single spaces, or None on error
        (the error is printed, not raised - deliberate best-effort).
        """
        try:
            # Full-width -> half-width normalization.
            contents = unicodedata.normalize('NFKC', str(contents))
            contents = str(contents).replace(' ', '').replace('\t', '')
            leaf, log = self.all_step1to7(contents, 10)
            for i, word in enumerate(leaf):
                if word in self.word_dic.keys():
                    leaf[i] = self.word_dic[word]

            leaf_line = ' '.join(leaf)

            return leaf_line
        except Exception as e:
            print('==============================')
            print(contents)
            print(e)
            print('==============================')

    def seg_one(self, contents):
        """Segment one sentence; returns the raw words joined by spaces
        (no user-dictionary mapping, unlike ``leaf_one``)."""
        # Full-width -> half-width normalization.
        contents = unicodedata.normalize('NFKC', str(contents))
        contents = str(contents).replace(' ', '').replace('\t', '')
        leaf, log = self.all_step1to7(contents, 10)
        seg_line = ' '.join(leaf)
        return seg_line
305
+ # if __name__ == "__main__":
306
+
307
+ # test = segmentation(["保險法", "刑法"])
308
+
309
+ # test_f = test.seg_one("保險法第一條")
310
+ # print(test_f)