hoodiexxx
/

Bert_Chinese_Text_Classification_Model

Text Classification

Chinese

Model card Files Files and versions Community

hoodiexxx committed on Feb 26

Commit

ef00e45

•

1 Parent(s): 9398cf7

Update README.md

Browse files

Files changed (1) hide show

README.md +126 -2

README.md CHANGED Viewed

@@ -7,7 +7,8 @@ pipeline_tag: text-classification
 # Bert Chinese Text Classification Model
 This is a BERT model trained for the customer service of logistics companies.
-## Word Label
 我 1 18719
 个 2 12236
@@ -46,4 +47,127 @@ this a Bert Model that train for customer service of logistics companies
 就 19 3570
-好 20 3524

 # Bert Chinese Text Classification Model
 This is a BERT model trained for the customer service of logistics companies.
+## Word Label (word, index, number of occurrences)
+```sh
 我 1 18719
 个 2 12236
 就 19 3570
+好 20 3524
+```
+## Tokenizer
+```python
+# Convert every line of trainFile into a row of integer indices and write it
+# to trainDataVecFile: the class index first, then one index per kept
+# character, zero-padded up to maxLen + 1 entries.
+# NOTE(review): this is an excerpt from inside a function; the enclosing
+# definition and names such as maxLen, trainFile and labelFile are not shown.
+label_dict, label_n2w = read_labelFile(labelFile)
+    word2ind, ind2word = get_worddict(wordLabelFile)
+    stoplist = read_stopword(stopwordFile)
+    # Number of training lines seen per class label.
+    cla_dict = {}
+    # train data to vec
+    # NOTE(review): opened without an explicit encoding and never closed in
+    # this excerpt.
+    traindataTxt = open(trainDataVecFile, 'w')
+    datas = open(trainFile, 'r', encoding='utf_8').readlines()
+    # Drops falsy entries only; lines from readlines() keep their trailing
+    # '\n', so a blank line is still truthy and survives this filter.
+    datas = list(filter(None, datas))
+    random.shuffle(datas)
+    for line in tqdm(datas, desc="traindata to vec"):
+        # Strip the newline and split on ':'; below, line[0] is used as the
+        # text and line[1] as the class label.
+        line = line.replace('\n', '').split(':')
+        # line = line.replace('\n','').split('\t')
+        cla = line[1]
+        # if cla in [21, 13, 9, 24, 23, 19, 14]:
+        #     continue
+        if cla in cla_dict:
+            cla_dict[cla] += 1
+        else:
+            cla_dict[cla] = 1
+        cla_ind = label_dict[cla]
+        # NOTE(review): this literal is overwritten by the next statement and
+        # has no effect.
+        title_seg = ['我', '要', '下', '单']
+        # Character-level split of the text.
+        title_seg = [i for i in line[0]]
+        # title_seg = jieba.cut(line[0], cut_all=False)
+        # The row starts with the class index, followed by the word indices.
+        title_ind = [cla_ind]
+        for w in title_seg:
+            if w in stoplist:
+                continue
+            # NOTE(review): presumably word2ind is a plain dict, in which case
+            # a character missing from it raises KeyError here.
+            title_ind.append(word2ind[w])
+        length = len(title_ind)
+        # NOTE(review): the condition uses maxLen + 1 but the slice is
+        # hard-coded to 21; the two agree only when maxLen == 20.
+        if length > maxLen + 1:
+            title_ind = title_ind[0:21]
+        # Pad with zeros so the row has exactly maxLen + 1 entries.
+        if length < maxLen + 1:
+            title_ind.extend([0] * (maxLen - length + 1))
+        # Every value is followed by ',', so each row ends with a trailing comma.
+        for n in title_ind:
+            traindataTxt.write(str(n) + ',')
+        traindataTxt.write('\n')
+```
+## Trainer
+```python
+# Train the model for 100 epochs with Adam and cross-entropy loss; after each
+# epoch, run validation and save the weights to "model.bin" whenever the dev
+# accuracy improves.
+# NOTE(review): this is an excerpt from inside a function; my_model, device,
+# train_dataLoader and val_dataLoader are defined outside the shown code.
+print('init net...')
+    model = my_model()
+    model.to(device)
+    print(model)
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
+    criterion = nn.CrossEntropyLoss()
+    print("training...")
+    # Best validation accuracy seen so far; used to decide when to checkpoint.
+    best_dev_acc = 0
+    # embed.train()
+    for epoch in range(100):
+        model.train()
+        for i, (clas, sentences) in enumerate(train_dataLoader):
+            # sentences: batch size 64 x sentence length 20 x embed dimension 128
+            # each character is a 128-dim vector; a sentence is a 20x128 2D tensor; a batch of 64 sentences is a 64x20x128 3D tensor
+            out = model(sentences.to(
+                device))  # out: batch size 64 x word vector 4 (after my_linear)
+            # NOTE(review): the bare except only prints the tensors and then
+            # falls through; loss.backward() below then runs with `loss` from
+            # an earlier iteration, or fails with an unbound name if no loss
+            # has been computed yet in this excerpt.
+            try:
+                loss = criterion(out, clas.to(device))
+            except:
+                print(out.size(), out)
+                print(clas.size(), clas)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            # Log the loss every 10 steps.
+            if (i + 1) % 10 == 0:
+                print("epoch:", epoch + 1, "step:", i + 1, "loss:", loss.item())
+        model.eval()
+        dev_acc = validation(model=model, val_dataLoader=val_dataLoader,
+                             device=device)
+        # Checkpoint only on a strict improvement of the dev accuracy.
+        if best_dev_acc < dev_acc:
+            best_dev_acc = dev_acc
+            print("save model...")
+            torch.save(model.state_dict(), "model.bin")
+            # i and loss still hold the values from the last training batch.
+            print("epoch:", epoch + 1, "step:", i + 1, "loss:", loss.item())
+        print("best dev acc %.4f  dev acc %.4f" % (best_dev_acc, dev_acc))
+```
+## Testing
+```python
+def validation(model, val_dataLoader, device):
+    """Return the fraction of correctly classified samples in val_dataLoader.
+
+    Puts the model in eval mode, runs every (clas, sentences) batch through
+    it without gradient tracking, takes the argmax over dim 1 as the
+    prediction and compares it with the labels.
+
+    Args:
+        model: callable applied to ``sentences.to(device)``.
+        val_dataLoader: iterable yielding ``(clas, sentences)`` batches.
+        device: device the inputs and labels are moved to.
+
+    Returns:
+        ``correct / total``. ``correct`` accumulates ``(pred == clas).sum()``,
+        so the result is a tensor once at least one batch has been processed.
+
+    NOTE(review): with zero batches, ``total`` stays 0 and the final
+    division raises ZeroDivisionError.
+    """
+    model.eval()
+    total = 0
+    correct = 0
+    with torch.no_grad():
+        for i, (clas, sentences) in enumerate(val_dataLoader):
+            try:
+                # sentences = sentences.type(torch.LongTensor).to(device)
+                # clas = clas.type(torch.LongTensor).to(device)
+                out = model(
+                    sentences.to(
+                        device))  # out: batch size 64 x sentences length 20 x word dimension 4(after my_linear)
+                # out = F.relu(out.squeeze(-3))
+                # out = F.max_pool1d(out, out.size(2)).squeeze(2)
+                # softmax = nn.Softmax(dim=1)
+                pred = torch.argmax(out, dim=1)  # 64x4 -> 64x1
+                correct += (pred == clas.to(device)).sum()
+                # Number of samples in this batch.
+                total += clas.size()[0]
+            except IndexError as e:
+                # Dump the offending batch for debugging, then stop the
+                # whole process.
+                # NOTE(review): exit() here terminates the caller as well, and
+                # printing e.__traceback__ shows only the traceback object's
+                # repr, not a formatted stack trace.
+                print(i)
+                print('clas', clas)
+                print('clas size', clas.size())
+                print('sentence', sentences)
+                print('sentences size', sentences.size())
+                print(e)
+                print(e.__traceback__)
+                exit()
+    acc = correct / total
+    return acc
+```