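"""Wrapper around several pretrained Yelp sentiment models.

Loads a word-id dictionary plus a pretrained LSTM/BiLSTM (PyTorch), maximum-entropy,
or SVM model and exposes predict()/predict_maxent()/predict_svm() helpers that map
raw review text to a "Positive"/"Negative" label. _train()/_test() are kept for
reference but need the preprocessed data arrays re-enabled before use.
"""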
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import os
import re
from tqdm import tqdm
import nltk

from lstm_model_new import LSTM_model, BiLSTMModel
from max_ent_model import MaxEntropyModel
from svm_model import SVM

nltk.download('punkt')


class Trainer(object):
    def __init__(self, vocab_size, sequence_len, batch_size, nn_epochs, model_type):
        self.vocab_size = vocab_size
        self.vocab_sizeb = self.vocab_size + 1  # +1 so id 0 can serve as padding/unknown
        self.sequence_len = sequence_len
        self.model_type = model_type
        self.batch_size = batch_size
        self.nn_epochs = nn_epochs
        self.processed_data_folder = "../preprocessed_data/"
        self._load_data()
        self._get_model()
        # self._setup_optimizer()  # only needed when training; see _train()

    def _load_data(self):
        # The dictionary file maps word ids (array indices) to words; invert it
        # so raw tokens can be looked up during prediction.
        dict_fn = "yelp_dictionary.npy"
        id_to_word = np.load(dict_fn, allow_pickle=True)
        self.word_to_id = {id_to_word[idx]: idx for idx in range(len(id_to_word))}
        # The preprocessed train/test arrays are only needed by _train()/_test();
        # they are left disabled here for inference-only use.
        # x_train = np.load(os.path.join(self.processed_data_folder, "x_train.npy"))
        # y_train = np.load(os.path.join(self.processed_data_folder, "y_train.npy"))
        # x_test = np.load(os.path.join(self.processed_data_folder, "x_test.npy"))
        # y_test = np.load(os.path.join(self.processed_data_folder, "y_test.npy"))
        # self.x_train, self.y_train = x_train, y_train
        # self.x_test, self.y_test = x_test, y_test

    def _get_model(self):
        # Build the requested model architecture.
        if self.model_type == "lstm":
            self.model = LSTM_model(self.vocab_sizeb, 800)
        elif self.model_type == "bilstm":
            self.model = BiLSTMModel(self.vocab_sizeb, 800)
        elif self.model_type == "max_ent":
            self.model = MaxEntropyModel()
        elif self.model_type == "svm":
            self.model = SVM()
        else:
            raise ValueError("Model type not supported")

        # Restore pretrained parameters for the chosen model.
        if self.model_type in ['lstm', 'bilstm']:
            # self.model = self.model.cuda()  # enable for GPU training
            model_ckpt_fn = f"{self.model_type}.pth"
            self.model.load_state_dict(torch.load(model_ckpt_fn, map_location=torch.device('cpu')))
        elif self.model_type in ['max_ent']:
            model_ckpt_fn = f"{self.model_type}_ckpt.npy"
            model_params = np.load(model_ckpt_fn, allow_pickle=True).item()
            self.model.features = model_params["features"]
            self.model.weights = model_params["weights"]
            self.model.last_weights = model_params["weights"]
        elif self.model_type in ['svm']:
            model_ckpt_fn = f"{self.model_type}_weights.npy"
            model_params = np.load(model_ckpt_fn, allow_pickle=True).item()
            self.model.svm_model.w = model_params['w']
            self.model.svm_model.b = model_params['b']
        else:
            raise ValueError("Model type not supported")

    def _setup_optimizer(self):
        self.lr = 0.001
        self.opt = optim.Adam(self.model.parameters(), lr=self.lr)

    def _train(self):
        # Training requires the preprocessed arrays in _load_data() and the optimizer
        # from _setup_optimizer() to be enabled, and assumes a CUDA device is available.
        train_losses = []
        train_accs = []
        for epoch in range(self.nn_epochs):
            print(f"Epoch: {epoch}")
            self.model.train()
            nn_acc = 0
            nn_total = 0
            epoch_loss = 0.0
            num_batches = 0
            train_permutation_idxes = np.random.permutation(self.y_train.shape[0])
            for i in tqdm(range(0, len(self.y_train), self.batch_size)):
                batched_x = self.x_train[train_permutation_idxes[i: i + self.batch_size]]
                batched_y = self.y_train[train_permutation_idxes[i: i + self.batch_size]]
                data = torch.from_numpy(batched_x).long().cuda()
                target = torch.from_numpy(batched_y).float().cuda()
                self.opt.zero_grad()
                loss, predicted_labels = self.model(data, target)
                loss.backward()
                nn.utils.clip_grad_norm_(self.model.parameters(), 2.0)
                self.opt.step()
                # Scores >= 0 count as the positive class; targets are 0/1.
                predicted_labels = predicted_labels >= 0
                gts = target >= 0.5
                nn_acc += torch.sum((predicted_labels == gts).float()).item()
                epoch_loss += loss.item()
                nn_total += len(batched_y)
                num_batches += 1
            train_acc = float(nn_acc) / float(nn_total)
            train_loss = epoch_loss / float(num_batches)  # average loss per batch
            train_losses.append(train_loss)
            train_accs.append(train_acc)
            print(f"[Epoch {epoch}] Train Loss: {train_loss}, Train Acc: {train_acc}")
            self._test()

    def _process_text(self, input_text):
        # Remove escaped newlines and any character other than letters, spaces and
        # apostrophes, then lower-case and tokenize.
        text = re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(input_text))).lower()
        tokens = nltk.word_tokenize(text)
        # Map tokens to ids shifted by +1 so that 0 is reserved for unknown/padding;
        # ids beyond the vocabulary size are also mapped to 0.
        token_ids = [self.word_to_id.get(token, -1) + 1 for token in tokens]
        token_ids = np.array(token_ids)
        token_ids[token_ids > self.vocab_size] = 0
        # Randomly crop long sequences and zero-pad short ones to sequence_len.
        if token_ids.shape[0] > self.sequence_len:
            start_index = np.random.randint(token_ids.shape[0] - self.sequence_len + 1)
            token_ids = token_ids[start_index: (start_index + self.sequence_len)]
        else:
            token_ids = np.concatenate([token_ids, np.zeros(self.sequence_len - token_ids.shape[0])])
        return token_ids

    def _process_text_maxent(self, input_text):
        text = re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(input_text))).lower()
        tokens = nltk.word_tokenize(text)
        # The max-ent model consumes word ids as strings.
        token_ids = [self.word_to_id.get(token, -1) + 1 for token in tokens]
        token_ids = [str(word_idx) for word_idx in token_ids]
        return token_ids

    def _process_text_svm(self, input_text):
        text = re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(input_text))).lower()
        # Reuse the vectorizer fitted when the SVM was trained.
        tokens = self.model.vectorizer.transform([text]).toarray()
        return tokens

    def predict_maxent(self, input_text):
        text_ids = self._process_text_maxent(input_text)
        # calculate_probability is expected to return (probability, label) pairs;
        # sort so the most probable label comes first.
        prob = self.model.calculate_probability(text_ids)
        prob.sort(reverse=True)
        print(prob)
        max_prob_label = int(prob[0][1])
        # Label 2 is treated as the positive class.
        if max_prob_label == 2:
            return "Positive"
        else:
            return "Negative"

    def predict_svm(self, input_text):
        text_ids = self._process_text_svm(input_text)
        predicted_label = self.model.svm_model.predict(text_ids)
        # The SVM outputs a signed label; positive values mean positive sentiment.
        if float(predicted_label[0]) > 0:
            return "Positive"
        else:
            return "Negative"

    def predict(self, input_text):
        text_ids = self._process_text(input_text)
        data = torch.from_numpy(text_ids).long()
        data = data.unsqueeze(0)
        # The forward pass expects a target; pass a dummy one since only the
        # prediction is used here.
        target = torch.zeros((data.size(0), ), dtype=torch.float)
        with torch.no_grad():
            _, predicted_labels = self.model(data, target)
        predicted_labels = predicted_labels >= 0
        if predicted_labels.item():
            return "Positive"
        else:
            return "Negative"

    def _test(self):
        # Evaluate on the held-out test set; requires the test arrays in _load_data()
        # to be enabled and a CUDA device.
        self.model.eval()
        nn_acc = 0
        nn_total = 0
        test_permutation_idxes = np.random.permutation(self.y_test.shape[0])
        for i in tqdm(range(0, len(self.y_test), self.batch_size)):
            batched_x = self.x_test[test_permutation_idxes[i: i + self.batch_size]]
            batched_y = self.y_test[test_permutation_idxes[i: i + self.batch_size]]
            data = torch.from_numpy(batched_x).long().cuda()
            target = torch.from_numpy(batched_y).float().cuda()
            with torch.no_grad():
                loss, predicted_labels = self.model(data, target)
            predicted_labels = predicted_labels >= 0
            gts = target >= 0.5
            nn_acc += torch.sum((predicted_labels == gts).float()).item()
            nn_total += len(batched_y)
        acc = float(nn_acc) / float(nn_total)
        print(f"Test Acc: {acc}")


if __name__ == '__main__':
    vocab_size = 8000
    sequence_len = 150
    batch_size = 256
    nn_epochs = 20
    # model_type = "lstm"
    model_type = "bilstm"
    trainer = Trainer(vocab_size, sequence_len, batch_size, nn_epochs, model_type)
    # NOTE: _train() only works after re-enabling the data loading in _load_data()
    # and the self._setup_optimizer() call in __init__.
    trainer._train()
    # CUDA_VISIBLE_DEVICES=0 python trainer.py
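    # Illustrative inference-only usage (an assumption; the actual entry point,
    # e.g. the Space's app script, is not shown here):
    #     trainer = Trainer(8000, 150, 256, 20, "bilstm")
    #     print(trainer.predict("The food was great and the service was friendly."))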