Upload 7 files
- get_wordlists.py +97 -0
- model.bin +3 -0
- multihead_attention.py +255 -0
- sen2inds.py +151 -0
- textCNN_data.py +53 -0
- train2.py +118 -0
- xlsx2txt.py +41 -0
get_wordlists.py
ADDED
@@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-
'''
Tokenize the training data (the jieba tokenizer is available, but the current code
splits each sentence into individual characters) and drop the words in the stop-word list.
Build the vocabulary file: each line is "word word_index word_frequency".
'''

import json
import jieba
from tqdm import tqdm

trainFile = 'data/output.txt'  # trainFile = 'data/train.txt'
devFile = 'data/output2.txt'  # 'data/dev.txt'
stopwordFile = 'data/stopword.txt'
wordLabelFile = 'wordLabel.txt'
lengthFile = 'length.txt'


def read_stopword(file):
    datas = open(file, 'r', encoding='utf_8').readlines()
    datas = [data.replace('\n', '') for data in datas]
    return datas


def main():
    worddict = {}
    stoplist = read_stopword(stopwordFile)
    len_dic = {}
    data_num = 0

    # trainFile
    datas = open(trainFile, 'r', encoding='utf_8').readlines()
    data_num += len(datas)
    datas = list(filter(None, datas))
    for line in tqdm(datas, desc='traindata word to label'):
        line = line.replace('\n', '').split(':')
        # line = line.replace('\n', '').split('\t')
        title_seg = [i for i in line[0]]
        # title_seg = jieba.cut(line[0], cut_all=False)
        length = 0
        for w in title_seg:
            if w in stoplist:
                continue
            length += 1
            if w in worddict:
                worddict[w] += 1
            else:
                worddict[w] = 1
        if length in len_dic:
            len_dic[length] += 1
        else:
            len_dic[length] = 1

    # devFile
    datas = open(devFile, 'r', encoding='utf_8').readlines()
    datas = list(filter(None, datas))
    data_num += len(datas)
    for line in tqdm(datas, desc='devdata word to label'):
        line = line.replace('\n', '').split(':')
        # line = line.replace('\n', '').split('\t')
        title_seg = [i for i in line[0]]
        # title_seg = jieba.cut(line[0], cut_all=False)
        length = 0
        for w in title_seg:
            if w in stoplist:
                continue
            length += 1
            if w in worddict:
                worddict[w] += 1
            else:
                worddict[w] = 1
        if length in len_dic:
            len_dic[length] += 1
        else:
            len_dic[length] = 1

    wordlist = sorted(worddict.items(), key=lambda item: item[1], reverse=True)
    f = open(wordLabelFile, 'w', encoding='utf_8')
    # ind = 0
    ind = 1
    for t in wordlist:
        d = t[0] + ' ' + str(ind) + ' ' + str(t[1]) + '\n'
        ind += 1
        f.write(d)

    for k, v in len_dic.items():
        len_dic[k] = round(v * 1.0 / data_num, 3)  # fraction of samples with this length, rounded to 3 decimals
    len_list = sorted(len_dic.items(), key=lambda item: item[0], reverse=True)
    f = open(lengthFile, 'w', encoding='utf_8')
    for t in len_list:
        d = str(t[0]) + ' ' + str(t[1]) + '\n'
        f.write(d)


# lengthFile = 'length.txt'
# wordLabelFile = 'wordLabel.txt'
if __name__ == "__main__":
    main()
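For reference, a minimal inspection sketch (illustrative, not part of the upload) of the two files this script writes, assuming main() has already been run: wordLabel.txt holds one "word index count" entry per line, and length.txt holds one "length fraction" entry per line.

with open('wordLabel.txt', encoding='utf_8') as f:
    word, ind, freq = f.readline().split()  # most frequent character, its index (1) and its count
    print(word, ind, freq)
with open('length.txt', encoding='utf_8') as f:
    length, fraction = f.readline().split()  # a sentence length and its share of the corpus
    print(length, fraction)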
model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a35cbd5939ffd2a3bdda50a86d758e45b6d819811ff693b1c8d8c6ada85fc34c
size 6843938
multihead_attention.py
ADDED
@@ -0,0 +1,255 @@
import torch
import torch.nn as nn
from torch.nn import functional as F
from textCNN_data import textCNN_param


class Attention1(nn.Module):
    """
    1. input [N,T,C] -> Linear, Tanh
    2. -> [N,T,1] -> squeeze
    3. -> [N,T] -> Softmax
    4. -> [N,T] -> unsqueeze
    5. -> [N,1,T] -> repeat
    6. -> [N,C,T] -> transpose
    7. -> [N,T,C]
    """

    def __init__(self, hidden_dim):
        super(Attention1, self).__init__()
        self.hidden_dim = hidden_dim
        self.dense = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        batch_size, time_step, hidden_dim = features.size()
        weight = nn.Tanh()(self.dense(features)).squeeze(-1)

        # mask padding positions with a large negative value so their softmax weight is ~0
        mask_idx = torch.sign(torch.abs(features).sum(dim=-1))
        paddings = torch.ones_like(mask_idx) * (-2 ** 32 + 1)
        weight = torch.where(torch.eq(mask_idx, 1), weight, paddings)

        weight = nn.Softmax(dim=1)(weight)
        weight = weight.unsqueeze(1)
        weight = weight.repeat(1, hidden_dim, 1)
        weight = weight.transpose(2, 1)
        features_attention = weight * features

        return features_attention


class Attention2(nn.Module):
    """
    1. input [N,T,C] -> Linear, Tanh
    2. -> [N,T,C] -> transpose
    3. -> [N,C,T] -> Softmax
    4. -> [N,C,T] -> mean
    5. -> [N,T] -> unsqueeze
    6. -> [N,1,T] -> repeat
    7. -> [N,C,T] -> transpose
    8. -> [N,T,C]
    """

    def __init__(self, hidden_dim):
        super(Attention2, self).__init__()
        self.hidden_dim = hidden_dim
        self.dense = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, features, mean=True):
        batch_size, time_step, hidden_dim = features.size()
        weight = nn.Tanh()(self.dense(features))

        # mask padding positions with a large negative value so their softmax weight is ~0
        mask_idx = torch.sign(torch.abs(features).sum(dim=-1))
        mask_idx = mask_idx.unsqueeze(-1).expand(batch_size, time_step, hidden_dim)
        paddings = torch.ones_like(mask_idx) * (-2 ** 32 + 1)
        weight = torch.where(torch.eq(mask_idx, 1), weight, paddings)

        weight = weight.transpose(2, 1)
        weight = nn.Softmax(dim=2)(weight)
        if mean:
            weight = weight.mean(dim=1)
            weight = weight.unsqueeze(1)
            weight = weight.repeat(1, hidden_dim, 1)
        weight = weight.transpose(2, 1)
        features_attention = weight * features

        return features_attention


class LayerNorm(nn.Module):
    """
    Hand-rolled layer normalization; results differ slightly from nn.LayerNorm.
    """

    def __init__(self, features, epsilon=1e-8):
        super(LayerNorm, self).__init__()
        self.beta = nn.Parameter(torch.zeros(features))
        self.gamma = nn.Parameter(torch.ones(features))
        self.epsilon = epsilon

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        normalized = (x - mean) / (std + self.epsilon)
        outputs = self.gamma * normalized + self.beta

        return outputs


class Multihead_Attention(nn.Module):
    """
    multihead_attention
    Adapted from <https://www.github.com/kyubyong/transformer>
    1. split + cat
    2. matmul(q, k)
    3. mask k
    4. softmax
    5. mask q
    6. matmul(attn, v)
    7. split + cat
    8. residual with q
    9. norm
    """

    def __init__(self,
                 hidden_dim,
                 C_q=None,
                 C_k=None,
                 C_v=None,
                 num_heads=1,
                 dropout_rate=0.0):
        super(Multihead_Attention, self).__init__()
        self.hidden_dim = hidden_dim
        C_q = C_q if C_q else hidden_dim
        C_k = C_k if C_k else hidden_dim
        C_v = C_v if C_v else hidden_dim
        self.linear_Q = nn.Linear(C_q, hidden_dim)  # W_Q
        self.linear_K = nn.Linear(C_k, hidden_dim)  # W_K
        self.linear_V = nn.Linear(C_v, hidden_dim)  # W_V
        self.num_heads = num_heads
        self.norm = nn.LayerNorm(hidden_dim)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, Q, K, V):
        """
        :param Q: A 3d tensor with shape of [N, T_q, C_q]
        :param K: A 3d tensor with shape of [N, T_k, C_k]
        :param V: A 3d tensor with shape of [N, T_v, C_v]
        :return:
        """
        num_heads = self.num_heads
        N = Q.size()[0]

        # Linear projections
        Q_l = nn.ReLU()(self.linear_Q(Q))  # W_Q x input_Q
        K_l = nn.ReLU()(self.linear_K(K))  # W_K x input_K
        V_l = nn.ReLU()(self.linear_V(V))  # W_V x input_V

        # Split and concat
        Q_split = Q_l.split(split_size=self.hidden_dim // num_heads, dim=2)
        K_split = K_l.split(split_size=self.hidden_dim // num_heads, dim=2)
        V_split = V_l.split(split_size=self.hidden_dim // num_heads, dim=2)

        Q_ = torch.cat(Q_split, dim=0)  # (h*N, T_q, C/h)
        K_ = torch.cat(K_split, dim=0)  # (h*N, T_k, C/h)
        V_ = torch.cat(V_split, dim=0)  # (h*N, T_v, C/h)

        # Multiplication
        outputs = torch.bmm(Q_, K_.transpose(2, 1))  # Q x K^T attention scores

        # Scale
        outputs = outputs / (K_.size()[-1] ** 0.5)  # divide by the square root of the key dimension

        # Key Masking
        key_masks = torch.sign(torch.abs(K).sum(dim=-1))  # (N, T_k)
        key_masks = key_masks.repeat(num_heads, 1)  # (h*N, T_k)
        key_masks = key_masks.unsqueeze(1).repeat(1, Q.size()[1], 1)  # (h*N, T_q, T_k)

        paddings = torch.ones_like(key_masks) * (-2 ** 32 + 1)
        outputs = torch.where(torch.eq(key_masks, 0), paddings, outputs)  # (h*N, T_q, T_k)

        # Activation
        outputs = nn.Softmax(dim=2)(outputs)  # (h*N, T_q, T_k) turn the scores into probabilities

        # Query Masking
        query_masks = torch.sign(torch.abs(Q).sum(dim=-1))  # (N, T_q)
        query_masks = query_masks.repeat(num_heads, 1)  # (h*N, T_q)
        query_masks = query_masks.unsqueeze(-1).repeat(1, 1, K.size()[1])  # (h*N, T_q, T_k)
        outputs = outputs * query_masks  # broadcasting. (h*N, T_q, T_k)

        # Dropouts
        outputs = self.dropout(outputs)

        # Weighted sum
        outputs = torch.bmm(outputs, V_)  # (h*N, T_q, C/h) weight the values by the attention probabilities

        # Restore shape
        outputs = outputs.split(N, dim=0)  # (N, T_q, C)
        outputs = torch.cat(outputs, dim=2)

        # Residual connection
        outputs = outputs + Q_l

        # Normalize
        outputs = self.norm(outputs)  # (N, T_q, C)

        return outputs


class my_model(nn.Module):
    def __init__(self):
        super(my_model, self).__init__()
        self.my_embed = nn.Embedding(textCNN_param['vocab_size'],
                                     textCNN_param['embed_dim'], padding_idx=1)
        self.my_linear = nn.Linear(256, 5)  # after softmax, each output is the probability of one label
        # self.my_linear = nn.Linear(256, 4)
        self.dropout = nn.Dropout(0.1)
        self.layers = nn.ModuleList(
            [Multihead_Attention(hidden_dim=textCNN_param['embed_dim'],
                                 num_heads=1,
                                 dropout_rate=0.1) for _ in range(6)])

    def forward(self, sentences):
        # sentences = sentences.long()
        # sentences.to('cuda:0')
        sentences = self.my_embed(sentences)
        for layer in self.layers:
            sentences = layer(sentences, sentences, sentences)  # batch x seq_len x embed_dim

        model_output = torch.mean(sentences, dim=1)  # batch x embed_dim
        model_output = self.dropout(model_output)
        model_output = self.my_linear(model_output)  # batch x class_num
        model_output = F.log_softmax(model_output, dim=1)
        # model_output = self.dropout(model_output)
        return model_output


if __name__ == '__main__':
    features = torch.arange(0, 24)
    features = torch.where(features < 20, features,
                           torch.zeros_like(features))
    features = features.view([2, 3, 4]).float()
    print(features)
    print(features.size())
    attention1 = Attention1(hidden_dim=features.size()[-1])
    print(attention1(features))
    print('size is', attention1(features).size()[-1])

    attention2 = Attention2(hidden_dim=features.size()[-1])
    print(attention2(features))

    attention3 = Multihead_Attention(hidden_dim=features.size()[-1],
                                     num_heads=2,
                                     dropout_rate=0.0)
    print(attention3(features, features, features))
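A quick smoke test for my_model (an illustrative sketch, not part of the upload; it assumes wordLabel.txt and data/label2.txt already exist, because textCNN_data.py reads them at import time):

import torch
from textCNN_data import textCNN_param
from multihead_attention import my_model

model = my_model()
dummy = torch.randint(1, textCNN_param['vocab_size'], (4, 20))  # 4 sentences of 20 token indices
out = model(dummy)
print(out.size())  # expected: torch.Size([4, 5]), log-probabilities from the 5-way my_linear head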
sen2inds.py
ADDED
@@ -0,0 +1,151 @@
# -*- coding: utf_8 -*-
from tqdm import tqdm
import jieba
import random

trainFile = 'data/output.txt'  # trainFile = 'data/train.txt'
trainDataVecFile = 'traindata_vec.txt'

devFile = 'data/output2.txt'  # 'data/dev.txt'
devDataVecFile = 'devdata_vec.txt'

labelFile = 'data/label2.txt'  # labelFile = 'data/label.txt'
stopwordFile = 'data/stopword.txt'

wordLabelFile = 'wordLabel.txt'

maxLen = 20

title_ind = [1, 2, 3, 4]
title_ind.extend([0] * 16)


def read_labelFile(file):
    data = open(file, 'r', encoding='utf_8').read().split('\n')
    data.remove('')
    label_w2n = {}
    label_n2w = {}
    for line in tqdm(data, desc='read label'):
        line = line.split(' ')
        name_w = line[0]
        name_n = int(line[1])
        label_w2n[name_w] = name_n
        label_n2w[name_n] = name_w

    return label_w2n, label_n2w


def read_stopword(file):
    data = open(file, 'r', encoding='utf_8').read().split('\n')

    return data


def get_worddict(file):
    datas = open(file, 'r', encoding='utf_8').read().split('\n')
    datas = list(filter(None, datas))
    word2ind = {}
    for line in tqdm(datas, desc="get_worddict"):
        line = line.split(' ')
        word2ind[line[0]] = int(line[1])

    ind2word = {word2ind[w]: w for w in word2ind}
    return word2ind, ind2word


def json2txt():
    label_dict, label_n2w = read_labelFile(labelFile)
    word2ind, ind2word = get_worddict(wordLabelFile)
    stoplist = read_stopword(stopwordFile)
    cla_dict = {}

    # train data to vec
    traindataTxt = open(trainDataVecFile, 'w')
    datas = open(trainFile, 'r', encoding='utf_8').readlines()
    datas = list(filter(None, datas))
    random.shuffle(datas)
    for line in tqdm(datas, desc="traindata to vec"):
        line = line.replace('\n', '').split(':')
        # line = line.replace('\n', '').split('\t')
        cla = line[1]
        # if cla in [21, 13, 9, 24, 23, 19, 14]:
        #     continue
        if cla in cla_dict:
            cla_dict[cla] += 1
        else:
            cla_dict[cla] = 1

        cla_ind = label_dict[cla]
        # title_seg = ['我', '要', '下', '单']  # leftover test value, overwritten below
        title_seg = [i for i in line[0]]
        # title_seg = jieba.cut(line[0], cut_all=False)
        title_ind = [cla_ind]
        for w in title_seg:
            if w in stoplist:
                continue
            title_ind.append(word2ind[w])
        length = len(title_ind)
        if length > maxLen + 1:
            title_ind = title_ind[0:maxLen + 1]
        if length < maxLen + 1:
            title_ind.extend([0] * (maxLen - length + 1))

        for n in title_ind:
            traindataTxt.write(str(n) + ',')
        traindataTxt.write('\n')

    # dev data to vec
    traindataTxt = open(devDataVecFile, 'w')
    datas = open(devFile, 'r', encoding='utf_8').readlines()
    datas = list(filter(None, datas))
    random.shuffle(datas)
    for line in tqdm(datas, desc="dev to vec"):
        line = line.replace('\n', '').split(':')
        # line = line.replace('\n', '').split('\t')
        cla = line[1]
        # if cla in [21, 13, 9, 24, 23, 19, 14]:
        #     continue
        if cla in cla_dict:
            cla_dict[cla] += 1
        else:
            cla_dict[cla] = 1

        cla_ind = label_dict[cla]
        title_seg = [i for i in line[0]]
        # title_seg = jieba.cut(line[0], cut_all=False)
        title_ind = [cla_ind]
        for w in title_seg:
            if w in stoplist:
                continue
            title_ind.append(word2ind[w])
        length = len(title_ind)
        if length > maxLen + 1:
            title_ind = title_ind[0:maxLen + 1]
        if length < maxLen + 1:
            title_ind.extend([0] * (maxLen - length + 1))

        for n in title_ind:
            traindataTxt.write(str(n) + ',')
        traindataTxt.write('\n')

    cla_list = sorted(cla_dict.items(), key=lambda item: item[0], reverse=True)
    f = open('cla_length.txt', 'w', encoding='utf_8')
    total = 0
    for t in cla_list:
        a = str(t[0])
        d = str(t[0]) + ' ' + str(label_dict[a]) + ' ' + str(t[1]) + '\n'
        total += t[1]
        f.write(d)

    f.write('total: ' + str(total))


# traindata_vec.txt
# devdata_vec.txt
def main():
    json2txt()


if __name__ == "__main__":
    main()
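Each line that json2txt() writes is 21 comma-separated integers: the label index followed by 20 word indices, zero-padded or truncated to maxLen. A small check (illustrative sketch, assuming the script has already been run):

line = open('traindata_vec.txt', encoding='utf_8').readline()
nums = [int(x) for x in line.split(',') if x.strip()]
label_ind, token_inds = nums[0], nums[1:]
print(label_ind, len(token_inds))  # a label index and 20 token indices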
textCNN_data.py
ADDED
@@ -0,0 +1,53 @@
from torch.utils.data import Dataset
import random
import numpy as np
from tqdm import tqdm
import torch

import sen2inds


class textCNN_data(Dataset):
    def __init__(self, trainDataFile):
        trainData = open(trainDataFile, 'r').read().split('\n')
        trainData = list(filter(None, trainData))

        res = []
        for data in tqdm(trainData, desc='index to tensor'):
            data = list(filter(None, data.split(',')))
            data = [int(x) for x in data]
            cla = torch.tensor(data[0], dtype=torch.long)
            sentence = torch.tensor(data[1:], dtype=torch.long)
            temp = []
            temp.append(cla)
            temp.append(sentence)
            res.append(temp)

        self.trainData = res

    def __len__(self):
        return len(self.trainData)

    def __getitem__(self, idx):
        data = self.trainData[idx]
        cla = data[0]
        sentence = data[1]

        return cla, sentence


word2ind, ind2word = sen2inds.get_worddict('wordLabel.txt')
label_w2n, label_n2w = sen2inds.read_labelFile('data/label2.txt')  # 'data/label.txt'

textCNN_param = {
    'vocab_size': len(word2ind) + 1,  # plus one for the 0 padding index
    'embed_dim': 256,  # each token index maps to a 256-dimensional vector
    'class_num': len(label_w2n),
    "kernel_num": 16,
    "kernel_size": [3, 4, 5],
    "dropout": 0.5,
}
dataLoader_param = {
    'batch_size': 128,
    'shuffle': True,
}
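A minimal usage sketch for the dataset wrapper (illustrative; train2.py builds the same loaders for training), assuming traindata_vec.txt exists:

from torch.utils.data import DataLoader
from textCNN_data import textCNN_data, dataLoader_param

dataset = textCNN_data('traindata_vec.txt')
loader = DataLoader(dataset, batch_size=dataLoader_param['batch_size'], shuffle=True)
clas, sentences = next(iter(loader))
print(clas.size(), sentences.size())  # e.g. torch.Size([128]) and torch.Size([128, 20])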
train2.py
ADDED
@@ -0,0 +1,118 @@
import torch
import torch.nn as nn

from textCNN_data import textCNN_data, textCNN_param, dataLoader_param
from torch.utils.data import DataLoader
from multihead_attention import my_model
import os
from torch.nn import functional as F

os.environ["CUDA_VISIBLE_DEVICES"] = "0"


def validation(model, val_dataLoader, device):
    model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        for i, (clas, sentences) in enumerate(val_dataLoader):
            try:
                # sentences = sentences.type(torch.LongTensor).to(device)
                # clas = clas.type(torch.LongTensor).to(device)
                out = model(sentences.to(device))  # out: batch_size x class_num log-probabilities
                # out = F.relu(out.squeeze(-3))
                # out = F.max_pool1d(out, out.size(2)).squeeze(2)
                # softmax = nn.Softmax(dim=1)

                pred = torch.argmax(out, dim=1)  # batch_size x class_num -> batch_size

                correct += (pred == clas.to(device)).sum()
                total += clas.size()[0]
            except IndexError as e:
                print(i)
                print('clas', clas)
                print('clas size', clas.size())
                print('sentence', sentences)
                print('sentences size', sentences.size())
                print(e)
                print(e.__traceback__)
                exit()

    acc = correct / total
    return acc


# seed = 66666666
# torch.cuda.manual_seed(seed)
# torch.manual_seed(seed)
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False
if torch.cuda.is_available():
    print(torch.cuda.get_device_name())
    device = 'cuda:0'
else:
    device = 'cpu'

# device = 'cpu'


# init dataset
print('init dataset...')
trainDataFile = 'traindata_vec.txt'
valDataFile = 'devdata_vec.txt'
train_dataset = textCNN_data(trainDataFile)
train_dataLoader = DataLoader(train_dataset,
                              batch_size=dataLoader_param['batch_size'],
                              shuffle=True)

val_dataset = textCNN_data(valDataFile)
val_dataLoader = DataLoader(val_dataset,
                            batch_size=dataLoader_param['batch_size'],
                            shuffle=False)

if __name__ == "__main__":
    # set a random seed here (see the commented block above) if reproducible results are needed

    # init net
    print('init net...')
    model = my_model()
    model.to(device)
    print(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
    criterion = nn.CrossEntropyLoss()

    print("training...")

    best_dev_acc = 0
    # embed.train()
    for epoch in range(100):
        model.train()
        for i, (clas, sentences) in enumerate(train_dataLoader):
            # sentences: batch_size x 20 token indices; the embedding layer turns each
            # index into an embed_dim vector, so inside the model a batch becomes a
            # batch_size x 20 x embed_dim tensor

            out = model(sentences.to(device))  # out: batch_size x class_num log-probabilities
            try:
                loss = criterion(out, clas.to(device))
            except Exception:
                # print the offending batch and stop instead of continuing with an undefined loss
                print(out.size(), out)
                print(clas.size(), clas)
                raise
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (i + 1) % 10 == 0:
                print("epoch:", epoch + 1, "step:", i + 1, "loss:", loss.item())
        model.eval()
        dev_acc = validation(model=model, val_dataLoader=val_dataLoader,
                             device=device)

        if best_dev_acc < dev_acc:
            best_dev_acc = dev_acc
            print("save model...")
            torch.save(model.state_dict(), "model.bin")
        print("epoch:", epoch + 1, "step:", i + 1, "loss:", loss.item())
        print("best dev acc %.4f dev acc %.4f" % (best_dev_acc, dev_acc))
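Because the best checkpoint is saved to model.bin (the LFS pointer in this upload; fetch the real weights with git-lfs first), a hedged reload sketch for inference could look like this, assuming the my_model definition above matches the saved state dict:

import torch
from textCNN_data import textCNN_param
from multihead_attention import my_model

model = my_model()
model.load_state_dict(torch.load('model.bin', map_location='cpu'))
model.eval()
with torch.no_grad():
    sample = torch.randint(1, textCNN_param['vocab_size'], (1, 20))  # placeholder indices; use a real vectorised sentence in practice
    print(torch.argmax(model(sample), dim=1))  # predicted label index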
xlsx2txt.py
ADDED
@@ -0,0 +1,41 @@
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Excel file into a DataFrame and keep the text and manual-label columns
df = pd.read_excel('data/data_excel.xlsx')
n_rows = df.shape[0]
df = df.iloc[:, [2, 4]]
print(df)
# Keep only the rows whose manual label ('人工标注') is one of the five target classes
df = df[df['人工标注'].isin(['查件', '催件', '下单', '拒识', '非需求场景'])]
print(df)
# Split into train (90%) and test (10%) sets
train_df, test_df = train_test_split(df, test_size=0.1, train_size=0.9,
                                     random_state=42)

# Remove the white space from the columns
train_df = train_df.apply(lambda x: x.str.strip())
test_df = test_df.apply(lambda x: x.str.strip())
print(train_df)
print(test_df)
# Concatenate the two columns with a ':' separator (text:label)
train_df = train_df.iloc[:, 0] + train_df.iloc[:, 1].apply(
    lambda x: ':' + str(x))
test_df = test_df.iloc[:, 0] + test_df.iloc[:, 1].apply(lambda x: ':' + str(x))

# Set the display options for left alignment
pd.options.display.max_colwidth = None
pd.options.display.colheader_justify = 'left'

# Print and write the DataFrames to text files
with open('data/output.txt', 'w', encoding='utf-8') as f:
    output = train_df.to_string(index=False, header=False).replace(' ', '')
    # output = output.replace(':', '\t')
    f.write(output)
    f.write('\n')

with open('data/output2.txt', 'w', encoding='utf-8') as f:
    output = test_df.to_string(index=False, header=False).replace(' ', '')
    # output = output.replace(':', '\t')
    f.write(output)
    f.write('\n')
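Each line of data/output.txt and data/output2.txt therefore has the form "text:label" (a purely hypothetical line might read 帮我查一下快递:查件), which is the ':'-separated format that get_wordlists.py and sen2inds.py split on. A small check (illustrative sketch, after the script has been run):

with open('data/output.txt', encoding='utf-8') as f:
    text, label = f.readline().strip().rsplit(':', 1)
    print(repr(text), repr(label))  # label should be one of the five classes kept above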