hoodiexxx committed
Commit ef00e45, Parent(s): 9398cf7

Update README.md

Files changed (1): README.md +126 −2

# Bert Chinese Text Classification Model

This is a BERT model trained for the customer service of logistics companies.

## Word Label (word, index, number of occurrences)
```sh
我 1 18719
个 2 12236
...
就 19 3570
好 20 3524
```
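
The tokenizer script below consumes this mapping through `get_worddict` and filters characters with `read_stopword`. Neither helper is shown in this README, so here is a minimal sketch of what they might look like, assuming each line of the word-label file is `word index count` separated by whitespace (a layout inferred from the table above) and that the stopword file holds one entry per line:

```python
def get_worddict(wordLabelFile):
    # Build word -> index and index -> word maps from lines like "我 1 18719".
    word2ind, ind2word = {}, {}
    with open(wordLabelFile, 'r', encoding='utf_8') as f:
        for line in f:
            parts = line.split()
            if len(parts) < 2:
                continue  # skip blank or malformed lines
            word, ind = parts[0], int(parts[1])  # the count column is not needed here
            word2ind[word] = ind
            ind2word[ind] = word
    return word2ind, ind2word


def read_stopword(stopwordFile):
    # One stopword per line, returned as a set for O(1) membership tests.
    with open(stopwordFile, 'r', encoding='utf_8') as f:
        return set(line.strip() for line in f if line.strip())
```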

## Tokenizer

```python
import random

from tqdm import tqdm

# labelFile, wordLabelFile, stopwordFile, trainFile, trainDataVecFile and maxLen
# come from the repository's configuration.
label_dict, label_n2w = read_labelFile(labelFile)
word2ind, ind2word = get_worddict(wordLabelFile)
stoplist = read_stopword(stopwordFile)
cla_dict = {}

# Encode the training data to index vectors, one comma-separated line per sample.
traindataTxt = open(trainDataVecFile, 'w')
datas = open(trainFile, 'r', encoding='utf_8').readlines()
datas = list(filter(None, datas))
random.shuffle(datas)
for line in tqdm(datas, desc="traindata to vec"):
    line = line.replace('\n', '').split(':')  # each line is "sentence:label"
    cla = line[1]
    # Count samples per class.
    if cla in cla_dict:
        cla_dict[cla] += 1
    else:
        cla_dict[cla] = 1

    cla_ind = label_dict[cla]
    # Character-level segmentation, e.g. "我要下单" -> ['我', '要', '下', '单'].
    title_seg = [i for i in line[0]]
    # title_seg = jieba.cut(line[0], cut_all=False)  # word-level alternative
    title_ind = [cla_ind]  # the first field of each vector is the class index
    for w in title_seg:
        if w in stoplist:
            continue
        title_ind.append(word2ind[w])
    # Truncate or zero-pad so every vector holds exactly maxLen indices plus the label.
    length = len(title_ind)
    if length > maxLen + 1:
        title_ind = title_ind[0:maxLen + 1]
    if length < maxLen + 1:
        title_ind.extend([0] * (maxLen - length + 1))

    for n in title_ind:
        traindataTxt.write(str(n) + ',')
    traindataTxt.write('\n')
```
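
The training loop below references `my_model`, `train_dataLoader`, `val_dataLoader`, and `device` without defining them. A minimal sketch consistent with the shape comments in the loop (batches of 64 sentences, 20 indices each, a 128-dim embedding, 4 output classes) could look like the following; the layer sizes, the default `vocab_size`, and the file name `valDataVecFile` are assumptions, not the repository's actual definitions:

```python
import csv

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class my_model(nn.Module):
    def __init__(self, vocab_size=30000, embed_dim=128, sent_len=20, n_classes=4):
        # vocab_size is an assumption; it must exceed the largest index in word2ind.
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.my_linear = nn.Linear(sent_len * embed_dim, n_classes)

    def forward(self, x):
        # x: (batch, 20) indices -> (batch, 20, 128) embeddings -> (batch, 4) logits
        e = self.embed(x)
        return self.my_linear(e.view(e.size(0), -1))


def load_vec_file(path):
    # Each line of the vec file is "label,idx1,...,idxN," as written by the tokenizer.
    clas, sents = [], []
    with open(path) as f:
        for row in csv.reader(f):
            nums = [int(n) for n in row if n != '']
            clas.append(nums[0])
            sents.append(nums[1:])
    return TensorDataset(torch.tensor(clas), torch.tensor(sents))


train_dataLoader = DataLoader(load_vec_file(trainDataVecFile), batch_size=64, shuffle=True)
val_dataLoader = DataLoader(load_vec_file(valDataVecFile), batch_size=64)  # valDataVecFile is hypothetical
```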

## Trainer

```python
print('init net...')
model = my_model()
model.to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.CrossEntropyLoss()

print("training...")

best_dev_acc = 0
for epoch in range(100):
    model.train()
    for i, (clas, sentences) in enumerate(train_dataLoader):
        # sentences: a 64 x 20 batch of index vectors. After the embedding layer each
        # character is a 128-dim vector, a sentence a 20x128 2D tensor, and the batch
        # a 64x20x128 3D tensor.
        out = model(sentences.to(device))  # out: batch size 64 x 4 class logits (after my_linear)
        try:
            loss = criterion(out, clas.to(device))
        except Exception:
            # Debug aid: dump shapes before failing instead of training on a stale loss.
            print(out.size(), out)
            print(clas.size(), clas)
            raise
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (i + 1) % 10 == 0:
            print("epoch:", epoch + 1, "step:", i + 1, "loss:", loss.item())
    model.eval()
    dev_acc = validation(model=model, val_dataLoader=val_dataLoader, device=device)

    if best_dev_acc < dev_acc:
        best_dev_acc = dev_acc
        print("save model...")
        torch.save(model.state_dict(), "model.bin")
    print("epoch:", epoch + 1, "step:", i + 1, "loss:", loss.item())
    print("best dev acc %.4f dev acc %.4f" % (best_dev_acc, dev_acc))
```

## Testing

```python
def validation(model, val_dataLoader, device):
    model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        for i, (clas, sentences) in enumerate(val_dataLoader):
            try:
                out = model(sentences.to(device))  # out: batch size 64 x 4 class logits (after my_linear)
                pred = torch.argmax(out, dim=1)  # 64x4 logits -> 64 predicted class indices

                correct += (pred == clas.to(device)).sum()
                total += clas.size()[0]
            except IndexError as e:
                # Debug aid: dump the offending batch before exiting.
                print(i)
                print('clas', clas)
                print('clas size', clas.size())
                print('sentence', sentences)
                print('sentences size', sentences.size())
                print(e)
                exit()

    acc = correct.item() / total
    return acc
```
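
For completeness, here is a minimal inference sketch that mirrors the pipeline above: it re-encodes a sentence with the same character-to-index mapping, loads the saved `model.bin`, and takes the argmax over the class logits. The `predict` helper is hypothetical (it is not part of the repository), and `word2ind`, `stoplist`, `maxLen`, and `device` are whatever the training run used:

```python
def predict(sentence, model, word2ind, stoplist, maxLen, device):
    # Same encoding as training, minus the leading label field; unknown
    # characters are skipped here, which training did not need to do.
    inds = [word2ind[w] for w in sentence if w not in stoplist and w in word2ind]
    inds = (inds + [0] * maxLen)[:maxLen]  # zero-pad / truncate to maxLen
    x = torch.tensor([inds], dtype=torch.long).to(device)  # batch of one sentence
    model.eval()
    with torch.no_grad():
        out = model(x)
    return torch.argmax(out, dim=1).item()  # predicted class index


# Example usage, assuming the objects built earlier in this README:
model = my_model()
model.load_state_dict(torch.load("model.bin", map_location=device))
model.to(device)
print(predict("我要下单", model, word2ind, stoplist, maxLen, device))
```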