# NOTE(review): the lines below replace non-Python residue left by the web
# scrape of this Hugging Face Space (status banner, file size, line-number
# gutter). Original metadata: file size 5,076 bytes, revision d938037.
import re
import jieba
import pandas as pd
from sentence_transformers import SentenceTransformer, util
class AlgoRule:
    """Rule-based category matcher driven by keyword tables in ``lvchan.xlsx``.

    Builds three inverted indexes (keyword -> third-level category titles):
      - ``kuan``:             broad-scope keywords (宽口径)
      - ``wuxiang``:          object keywords (物象关键词)
      - ``wuxiang_xianding``: "object_qualifier" pairs (物象关键词 + 限定词)
    """

    def __init__(self) -> None:
        # The first data row of the sheet carries the real column headers.
        df_lvchan = pd.read_excel('lvchan.xlsx', sheet_name='Sheet1')
        df_lvchan.columns = df_lvchan.iloc[0]
        df_lvchan = df_lvchan[1:]
        # Keywords in a cell are separated by ASCII or fullwidth commas.
        sep = r'[,、]'
        self.dict_rule_index = {
            'kuan': {},
            'wuxiang': {},
            'wuxiang_xianding': {},
        }
        for _, row in df_lvchan.iterrows():
            item = row['三级标题']
            for word in re.split(sep, row['宽口径(复核)']):
                self.dict_rule_index['kuan'].setdefault(word, []).append(item)
            for word in re.split(sep, row['物象关键词(复核)']):
                self.dict_rule_index['wuxiang'].setdefault(word, []).append(item)
                # Pair every object keyword with every qualifier of the row;
                # the key format "wuxiang_xianding" is relied on by _match().
                for word2 in re.split(sep, row['限定词(复核)']):
                    key = '_'.join([word, word2])
                    self.dict_rule_index['wuxiang_xianding'].setdefault(key, []).append(item)
        # Deduplicate the title lists of every index.
        for index in self.dict_rule_index.values():
            for key, items in index.items():
                index[key] = list(set(items))

    def _tokenize(self, text):
        """Segment ``text`` into a list of tokens with jieba."""
        return list(jieba.cut(text))

    def _is_match(self, word, query):
        """True if ``word`` appears as a whole jieba token of ``query``."""
        return word in self._tokenize(query)

    @staticmethod
    def _join_result(result):
        """Collapse each per-category keyword list to one ' ; '-joined string."""
        for match_types in result.values():
            for match_type, words in match_types.items():
                match_types[match_type] = ' ; '.join(words)
        return result

    def _match(self, query):
        """Match ``query`` against the indexes.

        Routes are tried in priority order:
          1. object keyword AND its qualifier both present -> return immediately;
          2. otherwise accumulate plain object-keyword matches;
          3. plus broad-scope keyword matches.

        Returns a dict: category title -> {match type -> ' ; '-joined keywords}.
        """
        result = {}
        # Route 1: both the object keyword and its qualifier must match.
        found_pair = False
        for key, items in self.dict_rule_index['wuxiang_xianding'].items():
            # maxsplit=1: the qualifier part may itself contain underscores.
            wuxiang, xianding = key.split('_', 1)
            if self._is_match(wuxiang, query) and self._is_match(xianding, query):
                for item in items:
                    r = result.setdefault(item, {})
                    r.setdefault('限定词+物项关键词', []).append('+'.join([xianding, wuxiang]))
                found_pair = True
        if found_pair:
            return self._join_result(result)
        # Route 2: object keyword alone.
        for key, items in self.dict_rule_index['wuxiang'].items():
            if self._is_match(key, query):
                for item in items:
                    result.setdefault(item, {}).setdefault('物项关键词', []).append(key)
        # Route 3: broad-scope keyword.
        for key, items in self.dict_rule_index['kuan'].items():
            if self._is_match(key, query):
                for item in items:
                    result.setdefault(item, {}).setdefault('宽口径', []).append(key)
        return self._join_result(result)

    def algo(self, query):
        """Return the list of matched third-level category titles for ``query``."""
        return [item.strip() for item in self._match(query)]
class AlgoAI:
    """Embedding-based category matcher using a sentence-transformers model.

    Encodes every category (title + explanation) once at construction time,
    then matches a query by cosine similarity against those vectors.
    """

    def __init__(self) -> None:
        self.model = SentenceTransformer('TintinMeimei/menglang_yongtulv_aimatch_v1')
        # Same workbook as AlgoRule; path made consistent (was '../lvchan.xlsx',
        # which cannot coexist with AlgoRule's 'lvchan.xlsx' in one run).
        df_lvchan = pd.read_excel('lvchan.xlsx', sheet_name='Sheet1')
        df_lvchan.columns = df_lvchan.iloc[0]
        df_lvchan = df_lvchan[1:]
        # Category title -> "title\nexplanation" text used for embedding.
        dict_lvchan = {
            row['三级标题'].strip(): '\n'.join([row['三级标题'].strip(), row['解释说明']])
            for _, row in df_lvchan.iterrows()
        }
        # Pre-encode every category once so queries only encode themselves.
        self.dict_lvchan_vectors = {
            key: self.model.encode(text, convert_to_tensor=True)
            for key, text in dict_lvchan.items()
        }
        # Minimum cosine similarity for a category to count as a match.
        self.thres = 0.25

    def _sim(self, query, item_embedding):
        """Cosine similarity between ``query`` and a pre-computed embedding."""
        query_embedding = self.model.encode(query, convert_to_tensor=True)
        return util.cos_sim(query_embedding, item_embedding)

    def _match(self, query):
        """Return all category titles whose similarity exceeds the threshold."""
        return [
            key
            for key, vector in self.dict_lvchan_vectors.items()
            if self._sim(query, vector) > self.thres
        ]

    def algo(self, query):
        """Public entry point: list of matched third-level category titles."""
        return self._match(query)
if __name__ == '__main__':
    # Smoke-test both matchers on a sample query.
    query = '无害生活垃圾'
    rule_matcher = AlgoRule()
    print(rule_matcher.algo(query))
    ai_matcher = AlgoAI()
    print(ai_matcher.algo(query))