# NOTE(review): the three lines below were non-Python viewer artifacts
# (file size, commit hash, line-number strip) left in by a copy-paste;
# commented out so the module parses.
# File size: 1,422 Bytes
# 2c1d053
from torch.utils.data import Dataset
import random
import numpy as np
from tqdm import tqdm
import torch
import sen2inds
class textCNN_data(Dataset):
    """Dataset of (label, sentence) pairs loaded from an index file.

    Each non-empty line of *trainDataFile* is a comma-separated list of
    integers: the first value is the class label, the remaining values are
    word indices forming the sentence.

    ``__getitem__`` returns ``(cla, sentence)`` where ``cla`` is a 0-d
    ``torch.long`` tensor and ``sentence`` is a 1-d ``torch.long`` tensor.
    """

    def __init__(self, trainDataFile):
        # Context manager fixes the original's leaked file handle.
        with open(trainDataFile, 'r') as f:
            lines = [line for line in f.read().split('\n') if line]

        samples = []
        for line in tqdm(lines, desc='index to tensor'):
            # Drop empty tokens (e.g. from a trailing comma) before parsing.
            values = [int(tok) for tok in line.split(',') if tok]
            cla = torch.tensor(values[0], dtype=torch.long)
            sentence = torch.tensor(values[1:], dtype=torch.long)
            samples.append((cla, sentence))
        self.trainData = samples

    def __len__(self):
        return len(self.trainData)

    def __getitem__(self, idx):
        cla, sentence = self.trainData[idx]
        return cla, sentence
# Module-level setup: executes at import time and reads two files from disk.
# word2ind / ind2word: vocabulary mappings (word -> index, index -> word)
# built from 'wordLabel.txt' — presumably one word per line; verify in sen2inds.
word2ind, ind2word = sen2inds.get_worddict('wordLabel.txt')
# label_w2n / label_n2w: label-name <-> label-number mappings from the label file.
label_w2n, label_n2w = sen2inds.read_labelFile('data/label2.txt')
# Hyper-parameters consumed by the textCNN model constructor.
textCNN_param = {
    'vocab_size': len(word2ind) + 1,  # +1 reserves index 0 for padding
    'embed_dim': 256,                 # size of each word-embedding vector
    'class_num': len(label_w2n),      # one output unit per label
    'kernel_num': 16,                 # convolution filters per kernel size
    'kernel_size': [3, 4, 5],         # convolution window widths
    'dropout': 0.5,
}
# Keyword arguments forwarded to torch.utils.data.DataLoader.
dataLoader_param = dict(
    batch_size=128,
    shuffle=True,
)
|