---
license: mit
language:
- zh
pipeline_tag: text-classification
---

# BERT Chinese Text Classification Model

This is a BERT model trained for the customer service domain of logistics (express delivery) companies.

### Data (noisy, since it comes from ASR transcripts)

- train: 10878 rows
- dev: 2720 rows
- total: 13598 rows

### Parameters

- embed_dim: 128
- batch_size: 64
- context_size: 20
- n_head: 2
- epochs: 100

## Word Labels (word, index, number of occurrences; top 20 shown)

```sh
我 1 18719
个 2 12236
快 3 8152
一 4 8097
递 5 7295
那 6 7118
了 7 6923
的 8 6684
是 9 6632
到 10 6434
你 11 5144
没 12 4989
有 13 4664
下 14 4433
这 15 4219
在 16 4219
么 17 4010
查 18 3964
就 19 3570
好 20 3524
```

## Tokenizer

```python
import random

from tqdm import tqdm

# read_labelFile, get_worddict and read_stopword are project helper functions
label_dict, label_n2w = read_labelFile(labelFile)
word2ind, ind2word = get_worddict(wordLabelFile)
stoplist = read_stopword(stopwordFile)
cla_dict = {}

maxLen = 20  # matches the context_size parameter above

# convert the training data to index vectors
traindataTxt = open(trainDataVecFile, 'w')
datas = open(trainFile, 'r', encoding='utf_8').readlines()
datas = list(filter(None, datas))
random.shuffle(datas)
for line in tqdm(datas, desc="traindata to vec"):
    line = line.replace('\n', '').split(':')
    # line = line.replace('\n','').split('\t')
    cla = line[1]
    # if cla in [21, 13, 9, 24, 23, 19, 14]:
    #     continue
    if cla in cla_dict:
        cla_dict[cla] += 1
    else:
        cla_dict[cla] = 1
    cla_ind = label_dict[cla]
    # character-level segmentation, e.g. '我要下单' -> ['我', '要', '下', '单']
    title_seg = [i for i in line[0]]
    # title_seg = jieba.cut(line[0], cut_all=False)
    title_ind = [cla_ind]  # the first element of each row is the label index
    for w in title_seg:
        if w in stoplist:
            continue
        title_ind.append(word2ind[w])
    length = len(title_ind)
    if length > maxLen + 1:
        # truncate to maxLen characters plus the leading label index
        title_ind = title_ind[0:maxLen + 1]
    if length < maxLen + 1:
        # pad with zeros up to maxLen + 1
        title_ind.extend([0] * (maxLen - length + 1))
    for n in title_ind:
        traindataTxt.write(str(n) + ',')
    traindataTxt.write('\n')
```

## Trainer

```python
import torch
import torch.nn as nn

# set the seed for ensuring reproducibility
seed = 3407
torch.manual_seed(seed)

# init net
print('init net...')
model = my_model()
model.to(device)
print(model)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.CrossEntropyLoss()

print("training...")
best_dev_acc = 0
# embed.train()
for epoch in range(100):
    model.train()
    for i, (clas, sentences) in enumerate(train_dataLoader):
        # sentences: batch size 64 x sentence length 20 x embed dimension 128
        # (each character is a 128-dim vector, a sentence is a 20x128 2D tensor,
        #  and a batch of 64 sentences is a 64x20x128 3D tensor)
        out = model(sentences.to(device))  # out: batch size 64 x num classes 4 (after my_linear)
        try:
            loss = criterion(out, clas.to(device))
        except Exception:
            # dump shapes to help debug malformed batches
            print(out.size(), out)
            print(clas.size(), clas)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (i + 1) % 10 == 0:
            print("epoch:", epoch + 1, "step:", i + 1, "loss:", loss.item())
    model.eval()
    dev_acc = validation(model=model, val_dataLoader=val_dataLoader, device=device)
    if best_dev_acc < dev_acc:
        best_dev_acc = dev_acc
        print("save model...")
        torch.save(model.state_dict(), "model.bin")
    print("epoch:", epoch + 1, "step:", i + 1, "loss:", loss.item())
    print("best dev acc %.4f  dev acc %.4f" % (best_dev_acc, dev_acc))
```
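The `my_model` class used in the trainer is not included in this card. Below is a minimal sketch, assuming a small Transformer-style encoder that consumes the pre-embedded 64x20x128 batches described in the trainer comments, 2 attention heads, and 4 intent classes; the actual architecture in the repository may differ.

```python
import torch
import torch.nn as nn


class my_model(nn.Module):
    """Hypothetical sketch only; the real model definition is not shown in this card."""

    def __init__(self, embed_dim=128, n_head=2, num_classes=4):
        super().__init__()
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=n_head, batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
        self.my_linear = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        # x: (batch=64, seq_len=20, embed_dim=128), already embedded by the data pipeline
        h = self.encoder(x)       # (64, 20, 128)
        h = h.mean(dim=1)         # mean-pool over the sequence -> (64, 128)
        return self.my_linear(h)  # class logits -> (64, 4)
```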
## Testing

```python
def validation(model, val_dataLoader, device):
    model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        for i, (clas, sentences) in enumerate(val_dataLoader):
            try:
                # sentences = sentences.type(torch.LongTensor).to(device)
                # clas = clas.type(torch.LongTensor).to(device)
                out = model(sentences.to(device))  # out: batch size 64 x num classes 4 (after my_linear)
                # out = F.relu(out.squeeze(-3))
                # out = F.max_pool1d(out, out.size(2)).squeeze(2)
                # softmax = nn.Softmax(dim=1)
                pred = torch.argmax(out, dim=1)  # 64x4 logits -> 64 predicted class indices
                correct += (pred == clas.to(device)).sum()
                total += clas.size()[0]
            except IndexError as e:
                # dump the offending batch before aborting
                print(i)
                print('clas', clas)
                print('clas size', clas.size())
                print('sentence', sentences)
                print('sentences size', sentences.size())
                print(e)
                print(e.__traceback__)
                exit()
    acc = correct / total
    return acc
```
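To reuse the best checkpoint afterwards, the weights saved to `model.bin` during training can be loaded back and evaluated with the same `validation` helper. A minimal sketch, assuming `my_model`, `val_dataLoader`, and `validation` are constructed as in the sections above:

```python
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# rebuild the network and load the weights saved by the trainer
model = my_model()
model.load_state_dict(torch.load("model.bin", map_location=device))
model.to(device)

# re-run validation on the dev set with the restored weights
dev_acc = validation(model=model, val_dataLoader=val_dataLoader, device=device)
print("dev acc %.4f" % dev_acc)
```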