import os
import re

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import nltk

from lstm_model_new import LSTM_model, BiLSTMModel
from max_ent_model import MaxEntropyModel
from svm_model import SVM

nltk.download('punkt')


class Trainer(object):
    def __init__(self, vocab_size, sequence_len, batch_size, nn_epochs, model_type):
        self.vocab_size = vocab_size
        # Reserve index 0 for padding / out-of-vocabulary tokens.
        self.vocab_sizeb = self.vocab_size + 1
        self.sequence_len = sequence_len
        self.model_type = model_type
        self.batch_size = batch_size
        self.nn_epochs = nn_epochs
        self.processed_data_folder = "../preprocessed_data/"
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self._load_data()
        self._get_model()
        # Only the neural models are trained with a PyTorch optimizer.
        if self.model_type in ("lstm", "bilstm"):
            self._setup_optimizer()

    def _load_data(self):
        # id_to_word is an array of vocabulary words indexed by word id;
        # invert it to map tokens back to ids.
        dict_fn = "yelp_dictionary.npy"
        id_to_word = np.load(dict_fn, allow_pickle=True)
        self.word_to_id = {id_to_word[idx]: idx for idx in range(len(id_to_word))}

        # Pre-tokenized train/test splits produced by the preprocessing step.
        x_train_path = os.path.join(self.processed_data_folder, "x_train.npy")
        y_train_path = os.path.join(self.processed_data_folder, "y_train.npy")
        x_test_path = os.path.join(self.processed_data_folder, "x_test.npy")
        y_test_path = os.path.join(self.processed_data_folder, "y_test.npy")
        self.x_train = np.load(x_train_path)
        self.y_train = np.load(y_train_path)
        self.x_test = np.load(x_test_path)
        self.y_test = np.load(y_test_path)

    def _get_model(self):
        if self.model_type == "lstm":
            self.model = LSTM_model(self.vocab_sizeb, 800)
        elif self.model_type == "bilstm":
            self.model = BiLSTMModel(self.vocab_sizeb, 800)
        elif self.model_type == "max_ent":
            self.model = MaxEntropyModel()
        elif self.model_type == "svm":
            self.model = SVM()
        else:
            raise ValueError("Model type not supported")

        # Restore pretrained parameters for the selected model.
        if self.model_type in ("lstm", "bilstm"):
            model_ckpt_fn = f"{self.model_type}.pth"
            self.model.load_state_dict(torch.load(model_ckpt_fn, map_location=self.device))
            self.model = self.model.to(self.device)
        elif self.model_type == "max_ent":
            model_ckpt_fn = f"{self.model_type}_ckpt.npy"
            model_params = np.load(model_ckpt_fn, allow_pickle=True).item()
            self.model.weights = model_params["weights"]
            self.model.last_weights = model_params["weights"]
            self.model.features = model_params["features"]
        elif self.model_type == "svm":
            model_ckpt_fn = f"{self.model_type}_weights.npy"
            model_params = np.load(model_ckpt_fn, allow_pickle=True).item()
            self.model.svm_model.w = model_params["w"]
            self.model.svm_model.b = model_params["b"]
        else:
            raise ValueError("Model type not supported")

    def _setup_optimizer(self):
        self.lr = 0.001
        self.opt = optim.Adam(self.model.parameters(), lr=self.lr)

    def _train(self):
        train_losses = []
        train_accs = []
        for epoch in range(self.nn_epochs):
            print(f"Epoch: {epoch}")
            self.model.train()
            nn_acc = 0
            nn_total = 0
            epoch_loss = 0.0
            num_batches = 0
            # Shuffle the training set at the start of every epoch.
            train_permutation_idxes = np.random.permutation(self.y_train.shape[0])
            for i in tqdm(range(0, len(self.y_train), self.batch_size)):
                batched_x = self.x_train[train_permutation_idxes[i: i + self.batch_size]]
                batched_y = self.y_train[train_permutation_idxes[i: i + self.batch_size]]
                data = torch.from_numpy(batched_x).long().to(self.device)
                target = torch.from_numpy(batched_y).float().to(self.device)

                self.opt.zero_grad()
                loss, predicted_labels = self.model(data, target)
                loss.backward()
                nn.utils.clip_grad_norm_(self.model.parameters(), 2.0)
                self.opt.step()

                # The model returns logits; a logit >= 0 is a positive prediction.
                predicted_labels = predicted_labels >= 0
                gts = target >= 0.5
                nn_acc += torch.sum((predicted_labels == gts).float()).item()
                epoch_loss += loss.item()
                nn_total += len(batched_y)
                num_batches += 1

            train_acc = float(nn_acc) / float(nn_total)
            train_loss = epoch_loss / float(num_batches)
            train_losses.append(train_loss)
            train_accs.append(train_acc)
            print(f"[Epoch {epoch}] Train Loss: {train_loss}, Train Acc: {train_acc}")
            self._test()

    def _process_text(self, input_text):
        # input_text is an iterable of strings; drop escaped newlines and every
        # character except letters, spaces, and apostrophes, then lowercase.
        text = re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(input_text))).lower()
        tokens = nltk.word_tokenize(text)
        # Shift word ids by one so 0 can be used for unknown/padding tokens.
        token_ids = [self.word_to_id.get(token, -1) + 1 for token in tokens]
        token_ids = np.array(token_ids)
        token_ids[token_ids > self.vocab_size] = 0
        if token_ids.shape[0] > self.sequence_len:
            # Take a random window of sequence_len tokens.
            start_index = np.random.randint(token_ids.shape[0] - self.sequence_len + 1)
            token_ids = token_ids[start_index: (start_index + self.sequence_len)]
        else:
            # Pad short reviews with zeros up to sequence_len.
            token_ids = np.concatenate(
                [token_ids, np.zeros(self.sequence_len - token_ids.shape[0], dtype=token_ids.dtype)])
        return token_ids

    def _process_text_maxent(self, input_text):
        text = re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(input_text))).lower()
        tokens = nltk.word_tokenize(text)
        # The max-entropy model expects string-valued feature ids.
        token_ids = [self.word_to_id.get(token, -1) + 1 for token in tokens]
        token_ids = [str(word_idx) for word_idx in token_ids]
        return token_ids

    def _process_text_svm(self, input_text):
        text = re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(input_text))).lower()
        # The SVM uses its own fitted vectorizer rather than the word-id dictionary.
        tokens = self.model.vectorizer.transform([text]).toarray()
        return tokens

    def predict_maxent(self, input_text):
        text_ids = self._process_text_maxent(input_text)
        # calculate_probability returns (probability, label) pairs; sort so the
        # most likely label comes first.
        prob = self.model.calculate_probability(text_ids)
        prob.sort(reverse=True)
        print(prob)
        max_prob_idx = int(prob[0][1])
        if max_prob_idx == 2:
            return "Positive"
        else:
            return "Negative"

    def predict_svm(self, input_text):
        text_ids = self._process_text_svm(input_text)
        predicted_label = self.model.svm_model.predict(text_ids)
        if float(predicted_label[0]) > 0:
            return "Positive"
        else:
            return "Negative"
    def predict(self, input_text):
        text_ids = self._process_text(input_text)
        data = torch.from_numpy(text_ids).long().to(self.device)
        data = data.unsqueeze(0)
        # Dummy target so the model's forward pass can compute a loss we ignore here.
        target = torch.zeros((data.size(0),), dtype=torch.float, device=self.device)
        with torch.no_grad():
            loss, predicted_labels = self.model(data, target)
        predicted_labels = predicted_labels >= 0
        if predicted_labels.item():
            return "Positive"
        else:
            return "Negative"

    def _test(self):
        self.model.eval()
        nn_acc = 0
        nn_total = 0
        test_permutation_idxes = np.random.permutation(self.y_test.shape[0])
        for i in tqdm(range(0, len(self.y_test), self.batch_size)):
            batched_x = self.x_test[test_permutation_idxes[i: i + self.batch_size]]
            batched_y = self.y_test[test_permutation_idxes[i: i + self.batch_size]]
            data = torch.from_numpy(batched_x).long().to(self.device)
            target = torch.from_numpy(batched_y).float().to(self.device)
            with torch.no_grad():
                loss, predicted_labels = self.model(data, target)
            predicted_labels = predicted_labels >= 0
            gts = target >= 0.5
            nn_acc += torch.sum((predicted_labels == gts).float()).item()
            nn_total += len(batched_y)
        acc = float(nn_acc) / float(nn_total)
        print(f"Test Acc: {acc}")


if __name__ == '__main__':
    vocab_size = 8000
    sequence_len = 150
    batch_size = 256  # batch_size = 1024
    nn_epochs = 20
    # model_type = "lstm"
    model_type = "bilstm"
    trainer = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)
    trainer._train()

# CUDA_VISIBLE_DEVICES=0 python trainer.py
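
# A minimal inference sketch (not part of the original script): assuming the
# preprocessed .npy files and the pretrained checkpoints loaded in _get_model()
# exist next to this file, the predict helpers can be called directly on raw
# review text. The review strings below are made up for illustration; each
# predict method expects an iterable of strings.
#
#   trainer = Trainer(8000, 150, 256, 20, model_type="bilstm")
#   print(trainer.predict(["The food was great and the staff were friendly."]))
#
#   svm_trainer = Trainer(8000, 150, 256, 20, model_type="svm")
#   print(svm_trainer.predict_svm(["Terrible service, would not come back."]))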