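"""Match user queries to 三级标题 categories from lvchan.xlsx.

Two matchers are provided:
  * AlgoRule -- inverted keyword indexes (宽口径 / 物象关键词 / 限定词) with
    jieba whole-token matching.
  * AlgoAI   -- SentenceTransformer embeddings scored by cosine similarity
    against each item's title plus explanation.
"""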
import re

import jieba
import pandas as pd
from sentence_transformers import SentenceTransformer, util


class AlgoRule:
    """Rule-based matcher: maps a query to 三级标题 items via keyword indexes."""

    def __init__(self) -> None:
        # The first data row of the sheet holds the real column names.
        df_lvchan = pd.read_excel('lvchan.xlsx', sheet_name='Sheet1')
        df_lvchan.columns = df_lvchan.iloc[0]
        df_lvchan = df_lvchan[1:]
        # Keyword cells are separated by either an ASCII or a Chinese comma.
        sep = r'[,、]'
        # Three inverted indexes: keyword -> list of 三级标题 items.
        self.dict_rule_index = {
            'kuan': {},               # 宽口径 keywords
            'wuxiang': {},            # 物象 keywords
            'wuxiang_xianding': {},   # '物象_限定词' keyword pairs
        }
        for _, row in df_lvchan.iterrows():
            item = row['三级标题']
            for word in re.split(sep, row['宽口径(复核)']):
                self.dict_rule_index['kuan'].setdefault(word, []).append(item)
            for word in re.split(sep, row['物象关键词(复核)']):
                self.dict_rule_index['wuxiang'].setdefault(word, []).append(item)
                # Index every (物象, 限定词) pair under the key '物象_限定词'.
                for word2 in re.split(sep, row['限定词(复核)']):
                    key = '_'.join([word, word2])
                    self.dict_rule_index['wuxiang_xianding'].setdefault(key, []).append(item)
        # Deduplicate the item lists.
        for index in self.dict_rule_index.values():
            for key in index:
                index[key] = list(set(index[key]))
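
    # Illustrative shape of the finished index (hypothetical entries, not
    # actual lvchan.xlsx data):
    #   self.dict_rule_index['wuxiang']          -> {'垃圾': ['某三级标题']}
    #   self.dict_rule_index['wuxiang_xianding'] -> {'垃圾_无害': ['某三级标题']}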


    def _tokenize(self, text):
        # Segment Chinese text into tokens with jieba.
        return list(jieba.cut(text))


    def _is_match(self, word, query):
        # A keyword matches only if it appears as a whole token of the query,
        # not merely as a substring.
        return word in self._tokenize(query)
    

    def _match(self, query):
        result = {}
        # Route 1: both the 物象 keyword and its 限定词 appear in the query.
        flag = False
        for key, items in self.dict_rule_index['wuxiang_xianding'].items():
            # Keys were built as '物象_限定词'; split on the first underscore.
            wuxiang, xianding = key.split('_', 1)
            if self._is_match(wuxiang, query) and self._is_match(xianding, query):
                for item in items:
                    r = result.setdefault(item, {})
                    r.setdefault('限定词+物项关键词', []).append('+'.join([xianding, wuxiang]))
                flag = True
        if flag:
            # A pair match takes precedence over the weaker routes below:
            # join each match list into a display string and return early.
            for key1 in result:
                for key2 in result[key1]:
                    result[key1][key2] = ' ; '.join(result[key1][key2])
            return result
        # Route 2: match on the 物象 keyword alone.
        for key, items in self.dict_rule_index['wuxiang'].items():
            if self._is_match(key, query):
                for item in items:
                    r = result.setdefault(item, {})
                    r.setdefault('物项关键词', []).append(key)
        # Route 3: match on the 宽口径 keyword.
        for key, items in self.dict_rule_index['kuan'].items():
            if self._is_match(key, query):
                for item in items:
                    r = result.setdefault(item, {})
                    r.setdefault('宽口径', []).append(key)
        # Join each match list into a display string.
        for key1 in result:
            for key2 in result[key1]:
                result[key1][key2] = ' ; '.join(result[key1][key2])
        return result
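
    # Example _match return value (illustrative, hypothetical data):
    #   {'某三级标题': {'物项关键词': '垃圾', '宽口径': '生活垃圾'}}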
    
    
    def algo(self, query):
        # Return only the matched 三级标题 names, discarding the match details.
        return [item.strip() for item in self._match(query)]


class AlgoAI:
    """Embedding-based matcher: scores the query against each 三级标题 entry."""

    def __init__(self) -> None:
        # Alternative base model: 'DMetaSoul/sbert-chinese-general-v2'
        self.model = SentenceTransformer('TintinMeimei/menglang_yongtulv_aimatch_v1')
        # Same workbook as AlgoRule; the path is unified here so both classes
        # load from the working directory (the original used '../lvchan.xlsx').
        df_lvchan = pd.read_excel('lvchan.xlsx', sheet_name='Sheet1')
        df_lvchan.columns = df_lvchan.iloc[0]
        df_lvchan = df_lvchan[1:]
        # Embed each item's title together with its explanation.
        dict_lvchan = {
            row['三级标题'].strip(): '\n'.join([row['三级标题'].strip(), row['解释说明']])
            for _, row in df_lvchan.iterrows()
        }
        self.dict_lvchan_vectors = {
            key: self.model.encode(text, convert_to_tensor=True)
            for key, text in dict_lvchan.items()
        }
        # Cosine-similarity threshold for accepting a match.
        self.thres = 0.25
    

    def _sim(self, query, item):
        # Cosine similarity between the query embedding and a pre-computed
        # item embedding, returned as a plain float.
        emb = self.model.encode(query, convert_to_tensor=True)
        return util.cos_sim(emb, item).item()
    

    def _match(self, query):
        # Keep every item whose similarity to the query clears the threshold.
        # Note: _sim re-encodes the query on every call, so this loop costs one
        # encoder pass per item; caching the query embedding would be faster.
        result = []
        for key, vector in self.dict_lvchan_vectors.items():
            if self._sim(query, vector) > self.thres:
                result.append(key)
        return result
    

    def algo(self, query):
        result = self._match(query)
        return result


if __name__ == '__main__':
    # Smoke test: run both matchers on the same sample query.
    algo = AlgoRule()
    query = '无害生活垃圾'
    print(algo.algo(query))

    algo2 = AlgoAI()
    print(algo2.algo(query))