Spaces:

xymeow7
/

text-classification-demo-2

Sleeping

File size: 12,141 Bytes

import numpy as np
import torch
import torch.nn as nn
# import torch.nn.functional as F
import torch.optim as optim
# from torch.autograd import Variable
#import torch.distributed as dist

# import time
import os
import re
# import sys
# import io

from tqdm import tqdm
import nltk

from lstm_model_new import LSTM_model, BiLSTMModel
from max_ent_model import MaxEntropyModel
from svm_model import SVM

# Fetch the NLTK "punkt" tokenizer data when this module is imported;
# nltk.word_tokenize is called by the text pre-processing methods below.
# NOTE(review): newer NLTK releases may look up "punkt_tab" instead of
# "punkt" -- confirm against the installed NLTK version.
nltk.download('punkt')


class Trainer(object):
    def __init__(self, vocab_size, sequence_len, batch_size, nn_epochs, model_type):
        
        # vocab_size = 8000
        # sequence_len = 150
        
        self.vocab_size = vocab_size
        self.vocab_sizeb = self.vocab_size + 1
        
        self.sequence_len = sequence_len
        self.model_type = model_type
        
        self.batch_size = batch_size
        self.nn_epochs = nn_epochs
        
        self.processed_data_folder = "../preprocessed_data/"
        
        self._load_data()
        
        self._get_model()
        
        # self._setup_optimizer()
        
        
        pass
    
    
    
    def _load_data(self, ):
        
        dict_fn = "yelp_dictionary.npy"
        
        id_to_word = np.load(dict_fn, allow_pickle=True) # .item()
        
        print(type(id_to_word))
        print(id_to_word[0], len(id_to_word))
        
        word_to_id = {
            id_to_word[idx]: idx for idx in range(len(id_to_word))
        }
        
        # word_to_id = {v: k for k, v in id_to_word.items()}
        self.word_to_id = word_to_id
        
        # x_train = np.load('../preprocessed_data/x_train.npy')
        # y_train = np.load('../preprocessed_data/y_train.npy')

        # #x_train = x_train[:10000]
        # #y_train = y_train[:10000]
        # x_test = np.load('../preprocessed_data/x_test.npy')
        # y_test = np.load('../preprocessed_data/y_test.npy')


        # x_train_path = os.path.join(self.processed_data_folder, "x_train.npy")
        # y_train_path = os.path.join(self.processed_data_folder, "y_train.npy")
        # x_test_path = os.path.join(self.processed_data_folder, "x_test.npy")
        # y_test_path = os.path.join(self.processed_data_folder, "y_test.npy")
        
        # x_train = np.load(x_train_path)
        # y_train = np.load(y_train_path)
        # x_test = np.load(x_test_path)
        # y_test = np.load(y_test_path)
        # self.x_train = x_train
        # self.y_train = y_train
        # self.x_test = x_test
        # self.y_test = y_test
        
    def _get_model(self, ):
        if self.model_type == "lstm":
            self.model = LSTM_model(self.vocab_sizeb, 800)
        elif self.model_type == "bilstm":
            self.model = BiLSTMModel(self.vocab_sizeb, 800)
        elif self.model_type == "max_ent":
            self.model = MaxEntropyModel()
        elif self.model_type == "svm":
            self.model = SVM()
        else:
            raise ValueError("Model type not supported")

        # self.model.cuda()
        
        if self.model_type in ['lstm', 'bilstm']:
            # self.model = self.model.cuda()
        
            model_ckpt_fn = f"{self.model_type}.pth"
            self.model.load_state_dict(torch.load(model_ckpt_fn, map_location=torch.device('cpu')))
        elif self.model_type in ['max_ent']:
            model_ckpt_fn = f"{self.model_type}_ckpt.npy" # max_ent #
            model_params = np.load(model_ckpt_fn, allow_pickle=True).item()
            features = model_params["features"]
            weights = model_params["weights"]
            
            self.model.weights = weights # .tolist()
            # print(f"self.model.weights: {self.model.weights[:10]}")
            self.model.last_weights = weights # .tolist()
            
            self.model.features = features
            # print(f"self.model.features: {list(self.model.features.keys())[:10]}")
        
        elif self.model_type in ['svm']:
            model_ckpt_fn = f"{self.model_type}_weights.npy"
            model_params = np.load(model_ckpt_fn, allow_pickle=True).item()
            w = model_params['w']
            b = model_params['b']
            self.model.svm_model.w = w
            self.model.svm_model.b = b
        
        else:
            raise ValueError("Model type not supported")
            
            
            
            
    
    def _setup_optimizer(self, ):
        self.lr = 0.001
        self.opt = optim.Adam(self.model.parameters(), lr=self.lr)
        
    def _train(self, ):
        """Train `self.model` for `self.nn_epochs` epochs, evaluating after each.

        Every epoch shuffles the training set, feeds mini-batches of
        `self.batch_size` samples to the model, clips the gradient norm to 2.0
        and takes one optimizer step per batch. After the epoch the mean batch
        loss and the training accuracy are printed and `_test` is run.

        Requires `self.x_train` and `self.y_train` (NumPy arrays) to have been
        set by the caller, and `self.x_test` / `self.y_test` for `_test`.

        Returns:
            tuple[list[float], list[float]]: Per-epoch mean batch loss and
            per-epoch training accuracy.

        Raises:
            ValueError: If the training set is empty.
        """
        n_samples = len(self.y_train)
        if n_samples == 0:
            raise ValueError("Training set is empty")

        # __init__ does not create the optimizer, so build it on first use.
        if not hasattr(self, "opt"):
            self._setup_optimizer()

        # Put the batches wherever the model lives rather than assuming CUDA
        # (`_get_model` loads the weights onto the CPU).
        first_param = next(self.model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

        train_losses = []
        train_accs = []

        for epoch in range(self.nn_epochs):
            print(f"Epoch: {epoch}")
            self.model.train()

            n_correct = 0
            n_seen = 0
            n_batches = 0
            epoch_loss = 0.0

            order = np.random.permutation(n_samples)

            for start in tqdm(range(0, n_samples, self.batch_size)):
                batch_idx = order[start: start + self.batch_size]
                batched_x = self.x_train[batch_idx]
                batched_y = self.y_train[batch_idx]

                data = torch.from_numpy(batched_x).long().to(device)
                target = torch.from_numpy(batched_y).float().to(device)

                self.opt.zero_grad()
                loss, predicted_labels = self.model(data, target)
                loss.backward()

                nn.utils.clip_grad_norm_(self.model.parameters(), 2.0)
                self.opt.step()

                # Model outputs are thresholded at 0, targets at 0.5.
                hits = (predicted_labels >= 0) == (target >= 0.5)

                n_correct += torch.sum(hits.float()).item()
                epoch_loss += loss.item()
                n_seen += len(batched_y)
                n_batches += 1

            train_acc = float(n_correct) / float(n_seen)
            # Average the per-batch losses over the number of batches (the
            # batch size is not the right denominator for a sum over batches).
            train_loss = epoch_loss / float(n_batches)

            train_losses.append(train_loss)
            train_accs.append(train_acc)

            print(f"[Epoch {epoch}] Train Loss: {train_loss}, Train Acc: {train_acc}")

            self._test()

        return train_losses, train_accs
    
    
    def _process_text(self, input_text):
        """Convert raw text into a fixed-length array of token ids.

        The text is comma-joined, literal backslash-n sequences are replaced by
        spaces, every character other than ASCII letters, spaces and
        apostrophes is removed, and the result is lower-cased and tokenized
        with `nltk.word_tokenize`. Each token becomes `word_to_id[token] + 1`;
        unknown tokens and ids above `self.vocab_size` become 0.

        Sequences longer than `self.sequence_len` are cropped to a window
        starting at a random offset (so long inputs are not processed
        deterministically); shorter ones are right-padded with 0.

        Args:
            input_text: A string, or an iterable of strings that is joined
                with commas.

        Returns:
            np.ndarray: int64 array of shape (sequence_len,).
        """
        text = re.sub(r"[^a-zA-Z ']", '', re.sub(r'\\n', ' ', ','.join(input_text))).lower()
        tokens = nltk.word_tokenize(text)

        # Missing words map to -1 + 1 == 0. The dtype is pinned so the padded
        # and the cropped path both return integer ids (padding with the
        # default float zeros used to turn the ids into float64).
        token_ids = np.array(
            [self.word_to_id.get(token, -1) + 1 for token in tokens],
            dtype=np.int64,
        )
        token_ids[token_ids > self.vocab_size] = 0

        n_tokens = token_ids.shape[0]
        if n_tokens > self.sequence_len:
            start_index = np.random.randint(n_tokens - self.sequence_len + 1)
            return token_ids[start_index: start_index + self.sequence_len]

        padding = np.zeros(self.sequence_len - n_tokens, dtype=token_ids.dtype)
        return np.concatenate([token_ids, padding])
    
    def _process_text_maxent(self, input_text):
        """Turn raw text into the string-valued token ids passed to the max-ent model.

        Cleaning matches `_process_text` (comma-join, literal backslash-n ->
        space, keep only ASCII letters / spaces / apostrophes, lower-case,
        `nltk.word_tokenize`). Each token becomes `str(word_to_id[token] + 1)`,
        with unknown tokens giving "0". No vocabulary clipping, cropping or
        padding is applied.

        Args:
            input_text: A string, or an iterable of strings that is joined
                with commas.

        Returns:
            list[str]: One stringified id per token.
        """
        joined = ','.join(input_text)
        unescaped = re.sub('\\\\n', ' ', joined)
        cleaned = re.sub('[^a-zA-Z \']', '', unescaped).lower()

        features = []
        for token in nltk.word_tokenize(cleaned):
            word_idx = self.word_to_id.get(token, -1) + 1
            features.append(str(word_idx))
        return features
    
    def _process_text_svm(self, input_text):
        text = re.sub('[^a-zA-Z \']', '', re.sub('\\\\n', ' ', ','.join(input_text))).lower()
        tokens = self.model.vectorizer.transform([text]).toarray()
        # tokens = nltk.word_tokenize(text)
        # token_ids = [ self.word_to_id.get(token, -1) + 1 for token in tokens ]
        # # token_ids = np.array(token_ids)
        # token_ids = [ str(word_idx) for word_idx in token_ids ]
        
        return tokens
        
    def predict_maxent(self, input_text):
        
        text_ids = self._process_text_maxent(input_text)
        
        prob = self.model.calculate_probability(text_ids)
        prob.sort(reverse=True)
        # print(label, prob)
        print(prob)
        ##### Calculate whether the prediction is correct #####
        maxx_prob_idx = int(prob[0][1])
        
        # data = torch.from_numpy(text_ids).long() # .cuda()
        # data = data.unsqueeze(0)
        
        
        # target = torch.zeros((data.size(0), ), dtype=torch.float)
        
        # # print(f"data: {data.shape}, target: {target.shape}")
        
        # with torch.no_grad():
        #     loss, predicted_labels = self.model(data, target)
        # predicted_labels = predicted_labels >= 0
        
        if maxx_prob_idx == 2:
            return "Positive"
        else:
            return "Negative"
        
    def predict_svm(self, input_text):
        
        text_ids = self._process_text_svm(input_text)
        
        predicted_label = self.model.svm_model.predict(text_ids)
        
        if float(predicted_label[0]) > 0:
            return "Positive"
        else:
            return "Negative"
        
        # prob = self.model.calculate_probability(text_ids)
        # prob.sort(reverse=True)
        # # print(label, prob)
        # print(prob)
        # ##### Calculate whether the prediction is correct #####
        # maxx_prob_idx = int(prob[0][1])
        
        # # data = torch.from_numpy(text_ids).long() # .cuda()
        # # data = data.unsqueeze(0)
        
        
        # # target = torch.zeros((data.size(0), ), dtype=torch.float)
        
        # # # print(f"data: {data.shape}, target: {target.shape}")
        
        # # with torch.no_grad():
        # #     loss, predicted_labels = self.model(data, target)
        # # predicted_labels = predicted_labels >= 0
        
        # if maxx_prob_idx == 2:
        #     return "Positive"
        # else:
        #     return "Negative"
        
    
    def predict(self, input_text):
        
        text_ids = self._process_text(input_text)
        
        data = torch.from_numpy(text_ids).long() # .cuda()
        data = data.unsqueeze(0)
        
        
        target = torch.zeros((data.size(0), ), dtype=torch.float)
        
        # print(f"data: {data.shape}, target: {target.shape}")
        
        with torch.no_grad():
            loss, predicted_labels = self.model(data, target)
        predicted_labels = predicted_labels >= 0
        
        if predicted_labels.item():
            return "Positive"
        else:
            return "Negative"
        
        # return predicted_labels.item()
        
        
    def _test(self, ):
        """Evaluate `self.model` on the test set and report the accuracy.

        Puts the model in evaluation mode, runs `self.x_test` / `self.y_test`
        through it in mini-batches of `self.batch_size` without gradients, and
        prints "Test Acc: <value>".

        Requires `self.x_test` and `self.y_test` (NumPy arrays) to have been
        set by the caller.

        Returns:
            float: Fraction of test samples classified correctly.

        Raises:
            ValueError: If the test set is empty.
        """
        n_samples = len(self.y_test)
        if n_samples == 0:
            raise ValueError("Test set is empty")

        self.model.eval()

        # Put the batches wherever the model lives rather than assuming CUDA
        # (`_get_model` loads the weights onto the CPU).
        first_param = next(self.model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

        n_correct = 0
        n_seen = 0

        # The shuffle does not change the accuracy; it is kept so the NumPy
        # random stream advances exactly as before.
        order = np.random.permutation(self.y_test.shape[0])
        for start in tqdm(range(0, n_samples, self.batch_size)):
            batch_idx = order[start: start + self.batch_size]
            batched_x = self.x_test[batch_idx]
            batched_y = self.y_test[batch_idx]

            data = torch.from_numpy(batched_x).long().to(device)
            target = torch.from_numpy(batched_y).float().to(device)

            with torch.no_grad():
                _, predicted_labels = self.model(data, target)

            # Model outputs are thresholded at 0, targets at 0.5.
            hits = (predicted_labels >= 0) == (target >= 0.5)

            n_correct += torch.sum(hits.float()).item()
            n_seen += len(batched_y)

        acc = float(n_correct) / float(n_seen)
        print(f"Test Acc: {acc}")
        return acc

if __name__=='__main__':
    # Script entry point: build a Trainer with the settings below and train it.
    # Example launch: CUDA_VISIBLE_DEVICES=0 python trainer.py

    vocab_size = 8000
    sequence_len = 150

    batch_size = 256  # alternative noted in the file: 1024
    nn_epochs = 20

    # "lstm" is the other recurrent option handled by Trainer._get_model.
    model_type = "bilstm"

    trainer = Trainer(
        vocab_size=vocab_size,
        sequence_len=sequence_len,
        batch_size=batch_size,
        nn_epochs=nn_epochs,
        model_type=model_type,
    )
    trainer._train()