# -*- coding: utf-8 -*-
import pickle
with open("chardict.pickle", "rb") as f:
    char_to_ix = pickle.load(f)
# print(char_to_ix)
tag_to_ix = {'N':0, 'B':1, 'I':2}
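
# Character-level tag scheme (as used by the functions below): 'B' marks the
# first character of a token, 'I' a character inside a token, and 'N' a
# character that belongs to no token (e.g. whitespace between tokens).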
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
torch.manual_seed(1)
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.char_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The LSTM takes character embeddings as inputs and outputs hidden states
        # with dimensionality hidden_dim per direction.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        # The linear layer maps the concatenated forward/backward hidden states
        # (2 * hidden_dim) to tag space.
        self.hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)

    def forward(self, sentence):
        embeds = self.char_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores
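
# Shape sketch: a sentence of L characters encoded as a LongTensor of shape (L,)
# yields log-probabilities of shape (L, len(tag_to_ix)), one row per character
# over the tags {'N', 'B', 'I'}.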

# Instantiate the model and load the pretrained weights (this section only runs
# inference; loss_function and optimizer are defined but not used by tokenize()).
model = LSTMTagger(100, 256, len(char_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

model_save_name = 'classifier_bidirectional_emb100_hid256_epoch20.pt'
model.load_state_dict(torch.load(model_save_name))
model.eval()  # switch to evaluation mode for inference
def prepare_sequence(seq, to_ix):
    # Map each character of the input string to its index in the dictionary.
    idxs = [to_ix[ch] for ch in seq]
    return torch.tensor(idxs, dtype=torch.long)
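
# Example (sketch, assuming both characters are keys in char_to_ix):
#   prepare_sequence("al", char_to_ix) -> tensor([char_to_ix['a'], char_to_ix['l']])
# Characters missing from char_to_ix raise a KeyError here.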
def prob_to_tag(out):
    # Convert per-character log-probabilities into tag labels by picking the
    # highest-scoring tag for each character.
    _sentence_tag_list = []
    _prob_to_tag = []
    for ch in out:
        chlist = list(ch)
        maxi = max(chlist)
        ind = chlist.index(maxi)
        _prob_to_tag.append(list(tag_to_ix.keys())[ind])
    _sentence_tag_list.append(_prob_to_tag)
    return _sentence_tag_list
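
# Equivalent one-liner using torch.argmax (an illustrative sketch, not the code
# path used above; it relies on tag_to_ix preserving insertion order, which
# holds for Python 3.7+ dicts):
#   tags = [list(tag_to_ix)[i] for i in out.argmax(dim=1).tolist()]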
def _char_to_token(samplesent, _sentence_tag_list):
    # Group characters into tokens: 'B' starts a new token, 'I' extends the
    # current one, and 'N' characters are skipped.
    token_list = []
    token = []
    for j in range(len(_sentence_tag_list[0])):  # for each character of the sentence
        ch = _sentence_tag_list[0][j]   # predicted tag at position j
        ach = samplesent[j]             # the character itself
        if ch == 'I':
            token.append(ach)
            if j == len(_sentence_tag_list[0]) - 1:
                token_list.append(token)
        else:
            if ch == 'B':
                if j == 0:
                    token.append(ach)
                else:
                    token_list.append(token)
                    token = []
                    token.append(ach)
                if j == len(_sentence_tag_list[0]) - 1:
                    token_list.append(token)
            elif ch == 'N':
                continue
    return token_list
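
# Worked example (illustrative): for samplesent = "ali geldi" with predicted
# tags ['B','I','I','N','B','I','I','I','I'], the function returns
# [['a','l','i'], ['g','e','l','d','i']].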
def char_unifier(_token_list):
    # Join each token's character list back into a single string.
    for item in range(len(_token_list)):
        _token_list[item] = ''.join(_token_list[item])
    return _token_list
def tokenize(sentence):
    inputs = prepare_sequence(sentence, char_to_ix)
    with torch.no_grad():  # inference only, no gradients needed
        out = model(inputs)
    sentence_tag_list = prob_to_tag(out)
    token_char_list = _char_to_token(sentence, sentence_tag_list)
    token_list = char_unifier(token_char_list)
    return token_list
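
# Example usage (a minimal sketch): the sentence below is hypothetical, and every
# character in it, including the space, must already be a key in char_to_ix;
# chardict.pickle and the saved .pt file must sit next to this script.
if __name__ == "__main__":
    sample_sentence = "ali geldi"
    print(tokenize(sample_sentence))  # expected output along the lines of ['ali', 'geldi']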