hoodiexxx committed
Commit 2c1d053
1 Parent(s): e3fc0f2

Upload 7 files

Files changed (7)
  1. get_wordlists.py +97 -0
  2. model.bin +3 -0
  3. multihead_attention.py +255 -0
  4. sen2inds.py +151 -0
  5. textCNN_data.py +53 -0
  6. train2.py +118 -0
  7. xlsx2txt.py +41 -0
get_wordlists.py ADDED
@@ -0,0 +1,97 @@
+ # -*- coding: utf-8 -*-
+ '''
+ Tokenize the training data and drop every token that appears in the stop-word
+ list (the current code splits by character; the jieba call is commented out).
+ Build the word list, one entry per line: word  index  frequency
+ '''
+
+ import json
+ import jieba
+ from tqdm import tqdm
+
+ trainFile = 'data/output.txt'  # trainFile = 'data/train.txt'
+ devFile = 'data/output2.txt'  # "data/dev.txt"
+ stopwordFile = 'data/stopword.txt'
+ wordLabelFile = 'wordLabel.txt'
+ lengthFile = 'length.txt'
+
+
+ def read_stopword(file):
+     datas = open(file, 'r', encoding='utf_8').readlines()
+     datas = [data.replace('\n', '') for data in datas]
+     return datas
+
+
+ def main():
+     worddict = {}
+     stoplist = read_stopword(stopwordFile)
+     len_dic = {}
+     data_num = 0
+
+     # trainFile
+     datas = open(trainFile, 'r', encoding='utf_8').readlines()
+     data_num += len(datas)
+     datas = list(filter(None, datas))
+     for line in tqdm(datas, desc='traindata word to label'):
+         line = line.replace('\n', '').split(':')
+         # line = line.replace('\n', '').split('\t')
+         title_seg = [i for i in line[0]]  # character-level split
+         # title_seg = jieba.cut(line[0], cut_all=False)
+         length = 0
+         for w in title_seg:
+             if w in stoplist:
+                 continue
+             length += 1
+             if w in worddict:
+                 worddict[w] += 1
+             else:
+                 worddict[w] = 1
+         if length in len_dic:
+             len_dic[length] += 1
+         else:
+             len_dic[length] = 1
+
+     # devFile
+     datas = open(devFile, 'r', encoding='utf_8').readlines()
+     datas = list(filter(None, datas))
+     data_num += len(datas)
+     for line in tqdm(datas, desc='devdata word to label'):
+         line = line.replace('\n', '').split(':')
+         # line = line.replace('\n', '').split('\t')
+         title_seg = [i for i in line[0]]  # character-level split
+         # title_seg = jieba.cut(line[0], cut_all=False)
+         length = 0
+         for w in title_seg:
+             if w in stoplist:
+                 continue
+             length += 1
+             if w in worddict:
+                 worddict[w] += 1
+             else:
+                 worddict[w] = 1
+         if length in len_dic:
+             len_dic[length] += 1
+         else:
+             len_dic[length] = 1
+
+     # write the word list sorted by frequency; indices start at 1 (0 is reserved for padding)
+     wordlist = sorted(worddict.items(), key=lambda item: item[1], reverse=True)
+     f = open(wordLabelFile, 'w', encoding='utf_8')
+     # ind = 0
+     ind = 1
+     for t in wordlist:
+         d = t[0] + ' ' + str(ind) + ' ' + str(t[1]) + '\n'
+         ind += 1
+         f.write(d)
+
+     # write the sentence-length distribution
+     for k, v in len_dic.items():
+         len_dic[k] = round(v * 1.0 / data_num, 3)  # probability of each length, rounded to 3 decimals
+     len_list = sorted(len_dic.items(), key=lambda item: item[0], reverse=True)
+     f = open(lengthFile, 'w', encoding='utf_8')
+     for t in len_list:
+         d = str(t[0]) + ' ' + str(t[1]) + '\n'
+         f.write(d)
+
+
+ # lengthFile = 'length.txt'
+ # wordLabelFile = 'wordLabel.txt'
+ if __name__ == "__main__":
+     main()
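
For reference, a minimal sketch of reading the word list back (assuming the "word index frequency" line format produced above; the sample line in the comment is hypothetical):

# rebuild {word: index} from wordLabel.txt, the same mapping sen2inds.get_worddict() builds
word2ind = {}
with open('wordLabel.txt', encoding='utf_8') as f:
    for line in f:
        parts = line.split()          # e.g. "件 1 1083" -> word, index, frequency
        if len(parts) == 3:
            word2ind[parts[0]] = int(parts[1])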
model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a35cbd5939ffd2a3bdda50a86d758e45b6d819811ff693b1c8d8c6ada85fc34c
+ size 6843938
multihead_attention.py ADDED
@@ -0,0 +1,255 @@
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ from textCNN_data import textCNN_param
+
+
+ class Attention1(nn.Module):
+     """
+     1. input [N,T,C] -> Linear, Tanh
+     2. -> [N,T,1]    -> squeeze
+     3. -> [N,T]      -> Softmax
+     4. -> [N,T]      -> unsqueeze
+     5. -> [N,1,T]    -> repeat
+     6. -> [N,C,T]    -> transpose
+     7. -> [N,T,C]
+     """
+
+     def __init__(self, hidden_dim):
+         super(Attention1, self).__init__()
+         self.hidden_dim = hidden_dim
+         self.dense = nn.Linear(hidden_dim, 1)
+
+     def forward(self, features):
+         batch_size, time_step, hidden_dim = features.size()
+         weight = nn.Tanh()(self.dense(features)).squeeze(-1)
+
+         # mask padded positions with -inf so their softmax weight becomes 0
+         mask_idx = torch.sign(torch.abs(features).sum(dim=-1))
+         paddings = torch.ones_like(mask_idx) * (-2 ** 32 + 1)
+         weight = torch.where(torch.eq(mask_idx, 1), weight, paddings)
+
+         weight = nn.Softmax(dim=1)(weight)
+         weight = weight.unsqueeze(1)
+         weight = weight.repeat(1, hidden_dim, 1)
+         weight = weight.transpose(2, 1)
+         features_attention = weight * features
+
+         return features_attention
+
+
+ class Attention2(nn.Module):
+     """
+     1. input [N,T,C] -> Linear, Tanh
+     2. -> [N,T,C]    -> transpose
+     3. -> [N,C,T]    -> Softmax
+     4. -> [N,C,T]    -> mean
+     5. -> [N,T]      -> unsqueeze
+     6. -> [N,1,T]    -> expand
+     7. -> [N,C,T]    -> transpose
+     8. -> [N,T,C]
+     """
+
+     def __init__(self, hidden_dim):
+         super(Attention2, self).__init__()
+         self.hidden_dim = hidden_dim
+         self.dense = nn.Linear(hidden_dim, hidden_dim)
+
+     def forward(self, features, mean=True):
+         batch_size, time_step, hidden_dim = features.size()
+         weight = nn.Tanh()(self.dense(features))
+
+         # mask padded positions with -inf so their softmax weight becomes 0
+         mask_idx = torch.sign(torch.abs(features).sum(dim=-1))
+         mask_idx = mask_idx.unsqueeze(-1).expand(batch_size, time_step, hidden_dim)
+         paddings = torch.ones_like(mask_idx) * (-2 ** 32 + 1)
+         weight = torch.where(torch.eq(mask_idx, 1), weight, paddings)
+
+         weight = weight.transpose(2, 1)
+         weight = nn.Softmax(dim=2)(weight)
+         if mean:
+             weight = weight.mean(dim=1)
+             weight = weight.unsqueeze(1)
+             weight = weight.repeat(1, hidden_dim, 1)
+         weight = weight.transpose(2, 1)
+         features_attention = weight * features
+
+         return features_attention
+
+
+ class LayerNorm(nn.Module):
+     """
+     The result differs slightly from nn.LayerNorm.
+     """
+
+     def __init__(self, features, epsilon=1e-8):
+         super(LayerNorm, self).__init__()
+         self.beta = nn.Parameter(torch.zeros(features))
+         self.gamma = nn.Parameter(torch.ones(features))
+         self.epsilon = epsilon
+
+     def forward(self, x):
+         mean = x.mean(-1, keepdim=True)
+         std = x.std(-1, keepdim=True)
+         normalized = (x - mean) / (std + self.epsilon)
+         outputs = self.gamma * normalized + self.beta
+
+         return outputs
+
+
+ class Multihead_Attention(nn.Module):
+     """
+     multihead_attention
+     Adapted from <https://www.github.com/kyubyong/transformer>
+     1. split + cat
+     2. matmul(q, k)
+     3. mask k
+     4. softmax
+     5. mask q
+     6. matmul(attn, v)
+     7. split + cat
+     8. residual with q
+     9. norm
+     """
+
+     def __init__(self,
+                  hidden_dim,
+                  C_q=None,
+                  C_k=None,
+                  C_v=None,
+                  num_heads=1,
+                  dropout_rate=0.0):
+         super(Multihead_Attention, self).__init__()
+         self.hidden_dim = hidden_dim
+         C_q = C_q if C_q else hidden_dim
+         C_k = C_k if C_k else hidden_dim
+         C_v = C_v if C_v else hidden_dim
+         self.linear_Q = nn.Linear(C_q, hidden_dim)  # W_Q
+         self.linear_K = nn.Linear(C_k, hidden_dim)  # W_K
+         self.linear_V = nn.Linear(C_v, hidden_dim)  # W_V
+         self.num_heads = num_heads
+         self.norm = nn.LayerNorm(hidden_dim)
+         self.dropout = nn.Dropout(p=dropout_rate)
+
+     def forward(self, Q, K, V):
+         """
+         :param Q: A 3d tensor with shape of [N, T_q, C_q]
+         :param K: A 3d tensor with shape of [N, T_k, C_k]
+         :param V: A 3d tensor with shape of [N, T_v, C_v]
+         :return:
+         """
+         num_heads = self.num_heads
+         N = Q.size()[0]
+
+         # Linear projections
+         Q_l = nn.ReLU()(self.linear_Q(Q))  # W_Q x input_Q
+         K_l = nn.ReLU()(self.linear_K(K))  # W_K x input_K
+         V_l = nn.ReLU()(self.linear_V(V))  # W_V x input_V
+
+         # Split and concat
+         Q_split = Q_l.split(split_size=self.hidden_dim // num_heads, dim=2)
+         K_split = K_l.split(split_size=self.hidden_dim // num_heads, dim=2)
+         V_split = V_l.split(split_size=self.hidden_dim // num_heads, dim=2)
+
+         Q_ = torch.cat(Q_split, dim=0)  # (h*N, T_q, C/h)
+         K_ = torch.cat(K_split, dim=0)  # (h*N, T_k, C/h)
+         V_ = torch.cat(V_split, dim=0)  # (h*N, T_v, C/h)
+
+         # Multiplication
+         outputs = torch.bmm(Q_, K_.transpose(2, 1))  # Q x K^T scores
+
+         # Scale: divide by the square root of the key dimension
+         outputs = outputs / (K_.size()[-1] ** 0.5)
+
+         # Key Masking
+         key_masks = torch.sign(torch.abs(K).sum(dim=-1))  # (N, T_k)
+         key_masks = key_masks.repeat(num_heads, 1)  # (h*N, T_k)
+         key_masks = key_masks.unsqueeze(1).repeat(1, Q.size()[1], 1)  # (h*N, T_q, T_k)
+
+         paddings = torch.ones_like(key_masks) * (-2 ** 32 + 1)
+         outputs = torch.where(torch.eq(key_masks, 0), paddings, outputs)  # (h*N, T_q, T_k)
+
+         # Activation: softmax turns the scores into probabilities
+         outputs = nn.Softmax(dim=2)(outputs)  # (h*N, T_q, T_k)
+
+         # Query Masking
+         query_masks = torch.sign(torch.abs(Q).sum(dim=-1))  # (N, T_q)
+         query_masks = query_masks.repeat(num_heads, 1)  # (h*N, T_q)
+         query_masks = query_masks.unsqueeze(-1).repeat(1, 1, K.size()[1])  # (h*N, T_q, T_k)
+         outputs = outputs * query_masks  # broadcasting. (h*N, T_q, T_k)
+
+         # Dropouts
+         outputs = self.dropout(outputs)
+
+         # Weighted sum: multiply the attention weights by V
+         outputs = torch.bmm(outputs, V_)  # (h*N, T_q, C/h)
+
+         # Restore shape
+         outputs = outputs.split(N, dim=0)  # (N, T_q, C)
+         outputs = torch.cat(outputs, dim=2)
+
+         # Residual connection
+         outputs = outputs + Q_l
+
+         # Normalize
+         outputs = self.norm(outputs)  # (N, T_q, C)
+
+         return outputs
+
+
+ class my_model(nn.Module):
+     def __init__(self):
+         super(my_model, self).__init__()
+         self.my_embed = nn.Embedding(textCNN_param['vocab_size'],
+                                      textCNN_param['embed_dim'], padding_idx=1)
+         self.my_linear = nn.Linear(256, 5)  # followed by softmax, one score per label class
+         # self.my_linear = nn.Linear(256, 4)
+         self.dropout = nn.Dropout(0.1)
+         self.layers = nn.ModuleList(
+             [Multihead_Attention(hidden_dim=textCNN_param['embed_dim'],
+                                  num_heads=1,
+                                  dropout_rate=0.1) for _ in range(6)])
+
+     def forward(self, sentences):
+         # sentences = sentences.long()
+         # sentences.to('cuda:0')
+         sentences = self.my_embed(sentences)  # N x T x embed_dim
+         for layer in self.layers:
+             sentences = layer(sentences, sentences, sentences)
+
+         model_output = torch.mean(sentences, dim=1)  # N x embed_dim
+         model_output = self.dropout(model_output)
+         model_output = self.my_linear(model_output)  # N x class_num
+         model_output = F.log_softmax(model_output, dim=1)
+         # model_output = self.dropout(model_output)
+         return model_output
+
+
+ if __name__ == '__main__':
+     features = torch.arange(0, 24)
+     features = torch.where(features < 20, features,
+                            torch.zeros_like(features))
+     features = features.view([2, 3, 4]).float()
+     print(features)
+     print(features.size())
+     attention1 = Attention1(hidden_dim=features.size()[-1])
+     print(attention1(features))
+     print('size is', attention1(features).size()[-1])
+
+     attention2 = Attention2(hidden_dim=features.size()[-1])
+     print(attention2(features))
+
+     attention3 = Multihead_Attention(hidden_dim=features.size()[-1],
+                                      num_heads=2,
+                                      dropout_rate=0.0)
+     print(attention3(features, features, features))
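
A minimal usage sketch of the attention block above (it assumes wordLabel.txt and data/label2.txt exist, because importing multihead_attention pulls textCNN_param from textCNN_data at import time; the tensor sizes are illustrative):

import torch
from multihead_attention import Multihead_Attention

x = torch.randn(4, 20, 128)      # N=4 sentences, T=20 tokens, C=128 features
mha = Multihead_Attention(hidden_dim=128, num_heads=8, dropout_rate=0.1)
out = mha(x, x, x)               # self-attention: Q = K = V
print(out.shape)                 # torch.Size([4, 20, 128]), same shape as the input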
sen2inds.py ADDED
@@ -0,0 +1,151 @@
+ # -*- coding: utf_8 -*-
+ from tqdm import tqdm
+ import jieba
+ import random
+
+ trainFile = 'data/output.txt'  # trainFile = 'data/train.txt'
+ trainDataVecFile = 'traindata_vec.txt'
+
+ devFile = 'data/output2.txt'  # 'data/dev.txt'
+ devDataVecFile = 'devdata_vec.txt'
+
+ labelFile = 'data/label2.txt'  # labelFile = 'data/label.txt'
+ stopwordFile = 'data/stopword.txt'
+
+ wordLabelFile = 'wordLabel.txt'
+
+ maxLen = 20
+
+ title_ind = [1, 2, 3, 4]
+ title_ind.extend([0] * 16)
+
+
+ def read_labelFile(file):
+     data = open(file, 'r', encoding='utf_8').read().split('\n')
+     data.remove('')
+     label_w2n = {}
+     label_n2w = {}
+     for line in tqdm(data, desc='read label'):
+         line = line.split(' ')
+         name_w = line[0]
+         name_n = int(line[1])
+         label_w2n[name_w] = name_n
+         label_n2w[name_n] = name_w
+
+     return label_w2n, label_n2w
+
+
+ def read_stopword(file):
+     data = open(file, 'r', encoding='utf_8').read().split('\n')
+
+     return data
+
+
+ def get_worddict(file):
+     datas = open(file, 'r', encoding='utf_8').read().split('\n')
+     datas = list(filter(None, datas))
+     word2ind = {}
+     for line in tqdm(datas, desc="get_worddict"):
+         line = line.split(' ')
+         word2ind[line[0]] = int(line[1])
+
+     ind2word = {word2ind[w]: w for w in word2ind}
+     return word2ind, ind2word
+
+
+ def json2txt():
+     label_dict, label_n2w = read_labelFile(labelFile)
+     word2ind, ind2word = get_worddict(wordLabelFile)
+     stoplist = read_stopword(stopwordFile)
+     cla_dict = {}
+
+     # train data to vec
+     traindataTxt = open(trainDataVecFile, 'w')
+     datas = open(trainFile, 'r', encoding='utf_8').readlines()
+     datas = list(filter(None, datas))
+     random.shuffle(datas)
+     for line in tqdm(datas, desc="traindata to vec"):
+         line = line.replace('\n', '').split(':')
+         # line = line.replace('\n', '').split('\t')
+         cla = line[1]
+         # if cla in [21, 13, 9, 24, 23, 19, 14]:
+         #     continue
+         if cla in cla_dict:
+             cla_dict[cla] += 1
+         else:
+             cla_dict[cla] = 1
+
+         cla_ind = label_dict[cla]
+         # title_seg = ['我', '要', '下', '单']  # leftover example, superseded by the line below
+         title_seg = [i for i in line[0]]  # character-level split
+         # title_seg = jieba.cut(line[0], cut_all=False)
+         title_ind = [cla_ind]
+         for w in title_seg:
+             if w in stoplist:
+                 continue
+             title_ind.append(word2ind[w])
+         length = len(title_ind)
+         if length > maxLen + 1:
+             title_ind = title_ind[0:maxLen + 1]
+         if length < maxLen + 1:
+             title_ind.extend([0] * (maxLen - length + 1))
+
+         for n in title_ind:
+             traindataTxt.write(str(n) + ',')
+         traindataTxt.write('\n')
+
+     # dev data to vec
+     traindataTxt = open(devDataVecFile, 'w')
+     datas = open(devFile, 'r', encoding='utf_8').readlines()
+     datas = list(filter(None, datas))
+     random.shuffle(datas)
+     for line in tqdm(datas, desc="dev to vec"):
+         line = line.replace('\n', '').split(':')
+         # line = line.replace('\n', '').split('\t')
+         cla = line[1]
+         # if cla in [21, 13, 9, 24, 23, 19, 14]:
+         #     continue
+         if cla in cla_dict:
+             cla_dict[cla] += 1
+         else:
+             cla_dict[cla] = 1
+
+         cla_ind = label_dict[cla]
+         title_seg = [i for i in line[0]]  # character-level split
+         # title_seg = jieba.cut(line[0], cut_all=False)
+         title_ind = [cla_ind]
+         for w in title_seg:
+             if w in stoplist:
+                 continue
+             title_ind.append(word2ind[w])
+         length = len(title_ind)
+         if length > maxLen + 1:
+             title_ind = title_ind[0:maxLen + 1]
+         if length < maxLen + 1:
+             title_ind.extend([0] * (maxLen - length + 1))
+
+         for n in title_ind:
+             traindataTxt.write(str(n) + ',')
+         traindataTxt.write('\n')
+
+     # per-class counts
+     cla_list = sorted(cla_dict.items(), key=lambda item: item[0], reverse=True)
+     f = open('cla_length.txt', 'w', encoding='utf_8')
+     total = 0
+     for t in cla_list:
+         a = str(t[0])
+         d = str(t[0]) + ' ' + str(label_dict[a]) + ' ' + str(t[1]) + '\n'
+         total += t[1]
+         f.write(d)
+
+     f.write('total: ' + str(total))
+
+
+ # traindata_vec.txt
+ # devdata_vec.txt
+ def main():
+     json2txt()
+
+
+ if __name__ == "__main__":
+     main()
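
Each line that json2txt() writes to traindata_vec.txt / devdata_vec.txt holds maxLen + 1 = 21 comma-separated integers: the label index followed by 20 word indices, truncated or 0-padded. A small sketch of that step, with hypothetical indices:

# mirrors the pad/truncate logic inside json2txt()
def pad_title(cla_ind, word_inds, maxLen=20):
    row = [cla_ind] + word_inds
    row = row[:maxLen + 1]                    # truncate long titles
    row += [0] * (maxLen + 1 - len(row))      # 0-pad short titles
    return row

print(pad_title(3, [12, 7, 45]))              # -> [3, 12, 7, 45, 0, 0, ..., 0] (21 ints)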
textCNN_data.py ADDED
@@ -0,0 +1,53 @@
+ from torch.utils.data import Dataset
+ import random
+ import numpy as np
+ from tqdm import tqdm
+ import torch
+
+ import sen2inds
+
+
+ class textCNN_data(Dataset):
+     def __init__(self, trainDataFile):
+         trainData = open(trainDataFile, 'r').read().split('\n')
+         trainData = list(filter(None, trainData))
+
+         res = []
+         for data in tqdm(trainData, desc='index to tensor'):
+             data = list(filter(None, data.split(',')))
+             data = [int(x) for x in data]
+             cla = torch.tensor(data[0], dtype=torch.long)
+             sentence = torch.tensor(data[1:], dtype=torch.long)
+             temp = []
+             temp.append(cla)
+             temp.append(sentence)
+             res.append(temp)
+
+         self.trainData = res
+
+     def __len__(self):
+         return len(self.trainData)
+
+     def __getitem__(self, idx):
+         data = self.trainData[idx]
+         cla = data[0]
+         sentence = data[1]
+
+         return cla, sentence
+
+
+ word2ind, ind2word = sen2inds.get_worddict('wordLabel.txt')
+ label_w2n, label_n2w = sen2inds.read_labelFile('data/label2.txt')
+
+ textCNN_param = {
+     'vocab_size': len(word2ind) + 1,  # plus one for the 0 padding index
+     'embed_dim': 256,  # each token is embedded as a 256-dim vector
+     'class_num': len(label_w2n),
+     "kernel_num": 16,
+     "kernel_size": [3, 4, 5],
+     "dropout": 0.5,
+ }
+ dataLoader_param = {
+     'batch_size': 128,
+     'shuffle': True,
+ }
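
A quick sketch of pulling one batch through this Dataset (it assumes traindata_vec.txt has already been generated by sen2inds.py; the printed shapes follow from batch_size=128 and maxLen=20):

from torch.utils.data import DataLoader
from textCNN_data import textCNN_data, dataLoader_param

dataset = textCNN_data('traindata_vec.txt')
loader = DataLoader(dataset, batch_size=dataLoader_param['batch_size'], shuffle=True)
clas, sentences = next(iter(loader))
print(clas.shape, sentences.shape)   # torch.Size([128]) torch.Size([128, 20])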
train2.py ADDED
@@ -0,0 +1,118 @@
+ import torch
+ import torch.nn as nn
+
+ from textCNN_data import textCNN_data, textCNN_param, dataLoader_param
+ from torch.utils.data import DataLoader
+ from multihead_attention import my_model
+ import os
+ from torch.nn import functional as F
+
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+
+ def validation(model, val_dataLoader, device):
+     model.eval()
+     total = 0
+     correct = 0
+     with torch.no_grad():
+         for i, (clas, sentences) in enumerate(val_dataLoader):
+             try:
+                 # sentences = sentences.type(torch.LongTensor).to(device)
+                 # clas = clas.type(torch.LongTensor).to(device)
+                 out = model(sentences.to(device))  # out: batch_size x class_num log-probabilities
+                 # out = F.relu(out.squeeze(-3))
+                 # out = F.max_pool1d(out, out.size(2)).squeeze(2)
+                 # softmax = nn.Softmax(dim=1)
+
+                 pred = torch.argmax(out, dim=1)  # batch_size x class_num -> batch_size
+
+                 correct += (pred == clas.to(device)).sum()
+                 total += clas.size()[0]
+             except IndexError as e:
+                 print(i)
+                 print('clas', clas)
+                 print('clas size', clas.size())
+                 print('sentence', sentences)
+                 print('sentences size', sentences.size())
+                 print(e)
+                 print(e.__traceback__)
+                 exit()
+
+     acc = correct / total
+     return acc
+
+
+ # seed = 66666666
+ # torch.cuda.manual_seed(seed)
+ # torch.manual_seed(seed)
+ # torch.backends.cudnn.deterministic = True
+ # torch.backends.cudnn.benchmark = False
+ if torch.cuda.is_available():
+     print(torch.cuda.get_device_name())
+     device = 'cuda:0'
+ else:
+     device = 'cpu'
+
+ # device = 'cpu'
+
+
+ # init dataset
+ print('init dataset...')
+ trainDataFile = 'traindata_vec.txt'
+ valDataFile = 'devdata_vec.txt'
+ train_dataset = textCNN_data(trainDataFile)
+ train_dataLoader = DataLoader(train_dataset,
+                               batch_size=dataLoader_param['batch_size'],
+                               shuffle=True)
+
+ val_dataset = textCNN_data(valDataFile)
+ val_dataLoader = DataLoader(val_dataset,
+                             batch_size=dataLoader_param['batch_size'],
+                             shuffle=False)
+
+ if __name__ == "__main__":
+     # set a random seed (see the commented block above) to make the results reproducible
+
+     # init net
+     print('init net...')
+     model = my_model()
+     model.to(device)
+     print(model)
+     optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
+     criterion = nn.CrossEntropyLoss()  # note: my_model already returns log_softmax; nn.NLLLoss would match that output directly
+
+     print("training...")
+
+     best_dev_acc = 0
+     # embed.train()
+     for epoch in range(100):
+         model.train()
+         for i, (clas, sentences) in enumerate(train_dataLoader):
+             # sentences: batch_size x maxLen token indices; after the embedding layer each
+             # token becomes an embed_dim vector, so a batch is batch_size x maxLen x embed_dim
+             out = model(sentences.to(device))  # out: batch_size x class_num log-probabilities
+             try:
+                 loss = criterion(out, clas.to(device))
+             except Exception:
+                 print(out.size(), out)
+                 print(clas.size(), clas)
+                 continue  # skip the batch that failed
+             optimizer.zero_grad()
+             loss.backward()
+             optimizer.step()
+             if (i + 1) % 10 == 0:
+                 print("epoch:", epoch + 1, "step:", i + 1, "loss:", loss.item())
+         model.eval()
+         dev_acc = validation(model=model, val_dataLoader=val_dataLoader,
+                              device=device)
+
+         if best_dev_acc < dev_acc:
+             best_dev_acc = dev_acc
+             print("save model...")
+             torch.save(model.state_dict(), "model.bin")
+             print("epoch:", epoch + 1, "step:", i + 1, "loss:", loss.item())
+         print("best dev acc %.4f dev acc %.4f" % (best_dev_acc, dev_acc))
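
A hedged inference sketch for the checkpoint this script saves (it assumes model.bin, wordLabel.txt and data/label2.txt are present, that the label indices in data/label2.txt run 0..4 to match nn.Linear(256, 5), and that unknown characters can simply be mapped to the padding index 0):

import torch
from multihead_attention import my_model
from textCNN_data import label_n2w
import sen2inds

word2ind, _ = sen2inds.get_worddict('wordLabel.txt')

model = my_model()
model.load_state_dict(torch.load('model.bin', map_location='cpu'))
model.eval()

text = '我要下单'
ids = [word2ind.get(ch, 0) for ch in text][:20]
ids += [0] * (20 - len(ids))                     # 0-pad to maxLen
with torch.no_grad():
    out = model(torch.tensor([ids], dtype=torch.long))
print(label_n2w[out.argmax(dim=1).item()])       # predicted label name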
xlsx2txt.py ADDED
@@ -0,0 +1,41 @@
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+ # Load the Excel file into a DataFrame and keep the text and manual-label columns
+ df = pd.read_excel('data/data_excel.xlsx')
+ n_rows = df.shape[0]
+ df = df.iloc[:, [2, 4]]
+ print(df)
+ # Keep only the rows whose manual label ("人工标注") is one of the five target classes
+ df = df[df['人工标注'].isin(['查件', '催件', '下单', '拒识', '非需求场景'])]
+ print(df)
+ # Split the DataFrame into train (90%) and test (10%) sets
+ train_df, test_df = train_test_split(df, test_size=0.1, train_size=0.9,
+                                      random_state=42)
+
+ # Strip surrounding whitespace from both columns
+ train_df = train_df.apply(lambda x: x.str.strip())
+ test_df = test_df.apply(lambda x: x.str.strip())
+ print(train_df)
+ print(test_df)
+ # Concatenate the two columns into "text:label"
+ train_df = train_df.iloc[:, 0] + train_df.iloc[:, 1].apply(
+     lambda x: ':' + str(x))
+ test_df = test_df.iloc[:, 0] + test_df.iloc[:, 1].apply(lambda x: ':' + str(x))
+
+ # Set the display options for left alignment
+ pd.options.display.max_colwidth = None
+ pd.options.display.colheader_justify = 'left'
+
+ # Print and write the DataFrames to text files
+ with open('data/output.txt', 'w', encoding='utf-8') as f:
+     output = train_df.to_string(index=False, header=False).replace(' ', '')
+     # output = output.replace(':', '\t')
+     f.write(output)
+     f.write('\n')
+
+ with open('data/output2.txt', 'w', encoding='utf-8') as f:
+     output = test_df.to_string(index=False, header=False).replace(' ', '')
+     # output = output.replace(':', '\t')
+     f.write(output)
+     f.write('\n')
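
For reference, a sketch of how one line of the files written above is consumed downstream (the sample line is illustrative; get_wordlists.py and sen2inds.py split each line on ':'):

# one line of data/output.txt: "<text>:<label>", whitespace already stripped out
line = '我要下单:下单'
text, label = line.split(':')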