Text CNN Classification Model
Param
textCNN_param = {
    'vocab_size': len(word2ind) + 1,
    'embed_dim': 128,  # each token is mapped to a 128-dim vector
    'class_num': len(label_w2n),
    'kernel_num': 16,
    'kernel_size': [3, 4, 5],
    'dropout': 0.5,
}
dataLoader_param = {
    'batch_size': 128,
    'shuffle': True,
}
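With three kernel heights and 16 output channels each, the pooled features concatenate to a len(kernel_size) * kernel_num = 3 * 16 = 48-dimensional vector, which is exactly the input width of the model's final linear layer. A quick sanity check against the dict above:

feat_dim = len(textCNN_param['kernel_size']) * textCNN_param['kernel_num']
assert feat_dim == 48  # input width of fc1 in the model below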
Model
The trainer below saves the best weights of this model to textcnn.bin.
import torch
import torch.nn as nn
import torch.nn.functional as F


class textCNN(nn.Module):
    def __init__(self, param):
        super(textCNN, self).__init__()
        ci = 1  # input channel size
        kernel_num = param['kernel_num']  # output channel size per convolution
        kernel_size = param['kernel_size']
        vocab_size = param['vocab_size']
        embed_dim = param['embed_dim']  # embedding dimension
        dropout = param['dropout']
        class_num = param['class_num']
        self.param = param
        # randomly initialized token embeddings (index 1 is treated as padding)
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=1)
        # three convolutions with different kernel heights (n-gram widths)
        self.conv11 = nn.Conv2d(ci, kernel_num, (kernel_size[0], embed_dim))
        self.conv12 = nn.Conv2d(ci, kernel_num, (kernel_size[1], embed_dim))
        self.conv13 = nn.Conv2d(ci, kernel_num, (kernel_size[2], embed_dim))
        # dropout for regularization
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(kernel_size) * kernel_num, class_num)

    def init_embed(self, embed_matrix):
        # optionally overwrite the random embeddings with a pre-trained matrix
        self.embed.weight = nn.Parameter(torch.Tensor(embed_matrix))

    @staticmethod
    def conv_and_pool(x, conv):
        # x: (batch, 1, sentence_length, embed_dim)
        x = conv(x)
        # x: (batch, kernel_num, H_out, 1)
        x = F.relu(x.squeeze(3))
        # x: (batch, kernel_num, H_out)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        # x: (batch, kernel_num)
        return x

    def forward(self, x):
        # x: (batch, sentence_length)
        x = self.embed(x)
        # x: (batch, sentence_length, embed_dim)
        x = x.unsqueeze(1)
        # x: (batch, 1, sentence_length, embed_dim)
        x1 = self.conv_and_pool(x, self.conv11)  # (batch, kernel_num)
        x2 = self.conv_and_pool(x, self.conv12)  # (batch, kernel_num)
        x3 = self.conv_and_pool(x, self.conv13)  # (batch, kernel_num)
        x = torch.cat((x1, x2, x3), 1)  # (batch, 3 * kernel_num)
        x = self.dropout(x)
        # return raw logits; the trainer's CrossEntropyLoss applies log_softmax
        # internally, so applying log_softmax here as well would double-apply it
        logit = self.fc1(x)
        return logit
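As a quick smoke test, the model can be run on a dummy batch of word indices. This is a minimal sketch, not part of the original project; the parameter values mirror textCNN_param with an arbitrary vocabulary size:

model = textCNN({'vocab_size': 1000, 'embed_dim': 128, 'class_num': 10,
                 'kernel_num': 16, 'kernel_size': [3, 4, 5], 'dropout': 0.5})
batch = torch.randint(0, 1000, (4, 20))  # 4 sentences, 20 token indices each
out = model(batch)
print(out.shape)  # torch.Size([4, 10]): one raw logit per class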
Trainer
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import sen2inds  # project-local helper for the vocab and label files (not shown)
# textCNN is the model class defined above; import it here if it lives in a separate module

# set the seed to ensure reproducibility
seed = 3407
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

word2ind, ind2word = sen2inds.get_worddict('wordLabel.txt')
label_w2n, label_n2w = sen2inds.read_labelFile('data/label.txt')
textCNN_param = {
    'vocab_size': len(word2ind) + 1,
    'embed_dim': 128,  # each token is mapped to a 128-dim vector
    'class_num': len(label_w2n),
    'kernel_num': 16,
    'kernel_size': [3, 4, 5],
    'dropout': 0.5,
}
dataLoader_param = {
    'batch_size': 128,
    'shuffle': True,
}
# device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = 'cpu'
# init dataset
print('init dataset...')
trainDataFile = 'traindata_vec.txt'
valDataFile = 'devdata_vec.txt'
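# textCNN_data is the project's Dataset wrapper and its source is not shown in
# this section. The sketch below is an assumption for illustration only: it
# presumes each line of the *_vec.txt files is comma-separated as
# "label,idx,idx,...", already padded to a fixed length. Adapt it (or replace
# it with the project's real module) to match the actual file format.
from torch.utils.data import Dataset

class textCNN_data(Dataset):
    def __init__(self, data_file):
        self.samples = []
        with open(data_file, 'r', encoding='utf-8') as f:
            for line in f:
                fields = [int(v) for v in line.strip().split(',')]
                # first field is the class id, the rest are word indices
                self.samples.append((fields[0], torch.LongTensor(fields[1:])))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]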
train_dataset = textCNN_data(trainDataFile)
train_dataLoader = DataLoader(train_dataset,
                              batch_size=dataLoader_param['batch_size'],
                              shuffle=dataLoader_param['shuffle'])
val_dataset = textCNN_data(valDataFile)
val_dataLoader = DataLoader(val_dataset,
                            batch_size=dataLoader_param['batch_size'],
                            shuffle=False)  # keep dev order fixed for evaluation
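# The training loop below calls a validation() helper whose source is not
# shown in this section. A minimal sketch, assuming it returns dev-set
# accuracy as a float:
def validation(model, val_dataLoader, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for clas, sentences in val_dataLoader:
            out = model(sentences.to(device))
            pred = out.argmax(dim=1)
            correct += (pred == clas.to(device)).sum().item()
            total += clas.size(0)
    model.train()
    return correct / total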
# init net
print('init net...')
net = textCNN(textCNN_param)
print(net)
net.to(device)
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
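# note: CrossEntropyLoss applies log_softmax internally, which is why
# the model's forward() returns raw logits instead of log-probabilities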
print("training...")
net.train()
best_dev_acc = 0
for epoch in range(100):
for i, (clas, sentences) in enumerate(train_dataLoader):
out = net(sentences)
loss = criterion(out, clas)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i + 1) % 10 == 0:
print("epoch:", epoch + 1, "step:", i + 1, "loss:", loss.item())
dev_acc = validation(model=net, val_dataLoader=val_dataLoader,
device=device)
if best_dev_acc < dev_acc:
best_dev_acc = dev_acc
print("save model...")
torch.save(net.state_dict(), "textcnn.bin")
print("epoch:", epoch + 1, "step:", i + 1, "loss:", loss.item())
print("best dev acc %.4f dev acc %.4f" % (best_dev_acc, dev_acc))