# -*- coding: utf-8 -*-
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Character-to-index vocabulary built at training time.
char_to_ix = pickle.load(open("chardict.pickle", "rb"))
# print(char_to_ix)

# BIN tagging scheme: B = beginning of a token, I = inside a token,
# N = not part of any token (e.g. whitespace).
tag_to_ix = {'N': 0, 'B': 1, 'I': 2}
ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}

torch.manual_seed(1)


class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.char_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The LSTM takes character embeddings as inputs and outputs hidden
        # states with dimensionality hidden_dim per direction.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        # The linear layer maps from hidden state space to tag space; its
        # input size is hidden_dim * 2 because the LSTM is bidirectional.
        self.hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)

    def forward(self, sentence):
        embeds = self.char_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores


# Instantiate the model with the hyperparameters it was trained with
# (embedding_dim=100, hidden_dim=256) and load the saved weights.
# The loss function and optimizer are only needed if the model is
# retrained; they are unused during inference.
model = LSTMTagger(100, 256, len(char_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

model_save_name = 'classifier_bidirectional_emb100_hid256_epoch20.pt'
model.load_state_dict(torch.load(model_save_name))
model.eval()  # switch to evaluation mode for inference


def prepare_sequence(seq, to_ix):
    # Raises KeyError for characters that were not in the training vocabulary.
    idxs = [to_ix[ch] for ch in seq]
    return torch.tensor(idxs, dtype=torch.long)


def prob_to_tag(out):
    # out has shape (seq_len, tagset_size): log-probabilities per character.
    # Pick the highest-scoring tag for each character.
    tag_ids = torch.argmax(out, dim=1)
    return [[ix_to_tag[ix.item()] for ix in tag_ids]]


def _char_to_token(samplesent, _sentence_tag_list):
    # Group the characters of the sentence into tokens according to the
    # predicted BIN tags.
    token_list = []
    token = []
    for ch, ach in zip(_sentence_tag_list[0], samplesent):
        if ch == 'B':        # a new token begins: flush the previous one
            if token:
                token_list.append(token)
            token = [ach]
        elif ch == 'I':      # the current token continues
            token.append(ach)
        else:                # 'N': outside any token; flush the pending one
            if token:
                token_list.append(token)
            token = []
    if token:                # flush a token that runs to the end of the input
        token_list.append(token)
    return token_list


def char_unifier(_token_list):
    # Join each token's list of characters into a single string.
    return [''.join(token) for token in _token_list]


def tokenize(sentence):
    inputs = prepare_sequence(sentence, char_to_ix)
    with torch.no_grad():  # no gradients are needed at inference time
        out = model(inputs)
    sentence_tag_list = prob_to_tag(out)
    token_char_list = _char_to_token(sentence, sentence_tag_list)
    return char_unifier(token_char_list)
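

# -----------------------------------------------------------------------
# For reference, a minimal sketch of the kind of loop that could have
# produced the checkpoint loaded above (this script only runs inference).
# `training_data` is a hypothetical list of (sentence, tag_string) pairs
# and is NOT defined here; the epoch count is an assumption taken from
# the checkpoint's file name.
#
# for epoch in range(20):
#     for sentence, tags in training_data:
#         model.zero_grad()
#         sentence_in = prepare_sequence(sentence, char_to_ix)
#         targets = prepare_sequence(tags, tag_to_ix)
#         tag_scores = model(sentence_in)
#         loss = loss_function(tag_scores, targets)
#         loss.backward()
#         optimizer.step()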
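

# -----------------------------------------------------------------------
# Example usage (a minimal sketch): the sample string below is
# hypothetical, and every character in it must already appear in
# char_to_ix, otherwise prepare_sequence raises a KeyError.
if __name__ == '__main__':
    sample_sentence = "this is a test"      # hypothetical input
    print(tokenize(sample_sentence))
    # Expected: a list of token strings recovered from the BIN tags,
    # e.g. ['this', 'is', 'a', 'test'] if the model tags correctly.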