# NER_RC/src/scripts/model_rc.py
# -*- coding: utf-8 -*-
"""
Created on Thu May 4 11:19:59 2023
@author: gita
"""
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import json
import os
import gc
from distutils.dir_util import copy_tree
import argparse
import flair
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, TransformerWordEmbeddings
from torch import tanh, stack
from flair.data import Sentence
default_path = os.path.dirname(os.path.abspath(__file__))
tagger_document = 0
embeddings = 0
json_data = 0
train_loader = 0
val_loader = 0
cnn = 0
optimizer = 0
criterion = 0
device = 0
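# Most of the module-level names above are placeholders that the functions
# below fill in: usage_cuda_rc() sets device, create_embbedings() builds
# embeddings, prepare_data() fills json_data and the train/val loaders, and
# define_model() builds cnn, optimizer and criterion.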
class MyDataset(Dataset):
    """Dataset of per-sentence relation-classification examples built from json_data."""
    def __init__(self, len_c1=7, len_c2=5, len_c3=11):
        global json_data

        def create_vector(c1, sentence):
            # Append along dim 0, starting from an empty accumulator.
            if len(c1): c1 = torch.cat([c1, sentence], dim=0)
            else: c1 = sentence
            return c1

        def fix_tensor(tensor, size):
            # Pad with zero vectors or truncate the token axis to a fixed length.
            while tensor.shape[2] < size:
                tensor = torch.cat([tensor, torch.zeros(1, 1, 1, 1024)], dim=2)
            tensor = tensor[:, :, :size, :]
            return tensor

        # Accumulators for the three context windows, the two entity vectors
        # and the relation targets.
        self.c1 = self.h1 = self.c2 = self.h2 = self.c3 = self.targets = torch.Tensor()

        tensor_temp = torch.Tensor(json_data['flat_emb'])
        data = tensor_temp.reshape((tensor_temp.shape[0], 1, -1, 1024))
        self.targets = create_vector(self.targets, torch.Tensor(json_data['relation']))
        for n_sen in range(tensor_temp.shape[0]):
            # Left context: tokens before the head entity.
            tensor_temp = data[n_sen, 0, :json_data['h_pos'][n_sen][0], :].reshape((1, 1, -1, 1024))
            self.c1 = create_vector(self.c1, fix_tensor(tensor_temp, len_c1))
            # Head entity: mean of its token embeddings.
            tensor_temp = data[n_sen, 0, json_data['h_pos'][n_sen][0]:json_data['h_pos'][n_sen][-1]+1, :].mean(dim=0).reshape((1, 1024))
            self.h1 = create_vector(self.h1, tensor_temp)
            # Middle context: tokens between the head and tail entities.
            tensor_temp = data[n_sen, 0, json_data['h_pos'][n_sen][-1]+1:json_data['t_pos'][n_sen][0], :].reshape((1, 1, -1, 1024))
            self.c2 = create_vector(self.c2, fix_tensor(tensor_temp, len_c2))
            # Tail entity: mean of its token embeddings.
            tensor_temp = data[n_sen, 0, json_data['t_pos'][n_sen][0]:json_data['t_pos'][n_sen][-1]+1, :].mean(dim=0).reshape((1, 1024))
            self.h2 = create_vector(self.h2, tensor_temp)
            # Right context: tokens after the tail entity.
            tensor_temp = data[n_sen, 0, json_data['t_pos'][n_sen][-1]+1:, :].reshape((1, 1, -1, 1024))
            self.c3 = create_vector(self.c3, fix_tensor(tensor_temp, len_c3))
        del data
        del tensor_temp
        del json_data
        gc.collect()
        self.targets = self.targets.to(torch.int64)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, index):
        c1x = self.c1[index]
        h1x = self.h1[index]
        c2x = self.c2[index]
        h2x = self.h2[index]
        c3x = self.c3[index]
        y = self.targets[index]
        return c1x, h1x, c2x, h2x, c3x, y
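def _example_dataset_from_fake_embeddings():
    # Illustration only, not part of the original training flow: a minimal
    # sketch of the json_data layout MyDataset expects, using all-zero
    # "embeddings" instead of real XLM-RoBERTa vectors. Each sentence is a
    # flattened sequence of 16 tokens x 1024 dims, with the head entity at
    # positions 1-2 and the tail entity at position 5 (made-up values).
    global json_data
    json_data = {
        'flat_emb': [[0.0] * (16 * 1024) for _ in range(3)],
        'relation': [0, 1, 0],
        'h_pos': [[1, 2]] * 3,
        't_pos': [[5]] * 3,
    }
    ds = MyDataset()
    c1, h1, c2, h2, c3, y = ds[0]
    # Expected shapes: (1, 7, 1024), (1024,), (1, 5, 1024), (1024,), (1, 11, 1024)
    print(c1.shape, h1.shape, c2.shape, h2.shape, c3.shape, y)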
def update_step(c1, h1, c2, h2, c3, label):
    """Run one optimizer step on a single batch; return (loss, n_correct)."""
    global cnn
    global optimizer
    global criterion
    prediction = cnn(c1, h1, c2, h2, c3)
    optimizer.zero_grad()
    loss = criterion(prediction, label)
    loss.backward()
    optimizer.step()
    acc = (nn.Softmax(dim=1)(prediction).detach().argmax(dim=1) == label).type(torch.float).sum().item()
    return loss.item(), acc

def evaluate_step(c1, h1, c2, h2, c3, label):
    """Compute loss and number of correct predictions without updating weights."""
    global cnn
    global optimizer
    global criterion
    prediction = cnn(c1, h1, c2, h2, c3)
    loss = criterion(prediction, label)
    acc = (nn.Softmax(dim=1)(prediction).detach().argmax(dim=1) == label).type(torch.float).sum().item()
    return loss.item(), acc
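def _example_single_update():
    # Illustration only: a minimal sketch of one training step, assuming
    # define_model() has been called and using random tensors in place of the
    # real embedded batches (shapes follow the default context lengths).
    define_model()
    c1, c2, c3 = torch.randn(2, 1, 7, 1024), torch.randn(2, 1, 5, 1024), torch.randn(2, 1, 11, 1024)
    h1, h2 = torch.randn(2, 1024), torch.randn(2, 1024)
    labels = torch.randint(0, 10, (2,))
    loss, correct = update_step(c1, h1, c2, h2, c3, labels)
    print(loss, correct)  # scalar loss value and number of correct predictions in the batch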
def train_one_epoch(epoch):
    global train_loader
    global val_loader
    global device
    if (device == torch.device('cuda:0')): cnn.cuda()
    train_loss, valid_loss, acc_train, acc_valid = 0.0, 0.0, 0.0, 0.0
    for batch_idx, (c1, h1, c2, h2, c3, targets) in enumerate(train_loader):
        train_loss_temp, acc_train_temp = update_step(c1.to(device), h1.to(device), c2.to(device), h2.to(device), c3.to(device), targets.to(device))
        train_loss += train_loss_temp
        acc_train += acc_train_temp
    for batch_idx, (c1, h1, c2, h2, c3, targets) in enumerate(val_loader):
        valid_loss_temp, acc_valid_temp = evaluate_step(c1.to(device), h1.to(device), c2.to(device), h2.to(device), c3.to(device), targets.to(device))
        valid_loss += valid_loss_temp
        acc_valid += acc_valid_temp
    # Save a checkpoint if this is the best validation loss seen so far.
    global best_valid_loss
    if epoch % 10 == 0:
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save({'epoca': epoch,
                        'model_state_dict': cnn.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': valid_loss},
                       default_path + '/../../RC/model/best_model.pt')
    return train_loss/len(train_loader.dataset), valid_loss/len(val_loader.dataset), acc_train/len(train_loader.dataset), acc_valid/len(val_loader.dataset)
def FocalLoss(input, target, gamma=0, alpha=None, size_average=True):
    # Focal loss: FL(pt) = -(1 - pt)**gamma * log(pt); gamma=0 reduces to cross-entropy.
    from torch.autograd import Variable
    if input.dim() > 2:
        input = input.view(input.size(0), input.size(1), -1)  # N,C,H,W => N,C,H*W
        input = input.transpose(1, 2)                         # N,C,H*W => N,H*W,C
        input = input.contiguous().view(-1, input.size(2))    # N,H*W,C => N*H*W,C
    target = target.view(-1, 1)
    logpt = nn.functional.log_softmax(input, dim=1)
    logpt = logpt.gather(1, target)
    logpt = logpt.view(-1)
    pt = Variable(logpt.data.exp())
    if alpha is not None:
        if alpha.type() != input.data.type():
            alpha = alpha.type_as(input.data)
        at = alpha.gather(0, target.data.view(-1))
        logpt = logpt * Variable(at)
    loss = -1 * (1 - pt)**gamma * logpt
    if size_average: return loss.mean()
    else: return loss.sum()
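def _example_focal_loss_check():
    # Illustration only: a sanity check on random logits showing that
    # FocalLoss with gamma=0 reduces to the mean cross-entropy, as in the
    # formula above (the tensor sizes here are arbitrary assumptions).
    logits = torch.randn(4, 13)
    labels = torch.randint(0, 13, (4,))
    fl = FocalLoss(logits, labels, gamma=0)
    ce = nn.functional.cross_entropy(logits, labels)
    assert torch.allclose(fl, ce, atol=1e-5)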
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf
        self.early_stop = False

    def __call__(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
            self.early_stop = False
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            print(f'Validation loss did not improve ({self.counter + 1}/{self.patience})')
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
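def _example_early_stopping():
    # Illustration only, with a made-up loss curve: the counter grows only
    # while the loss stays more than min_delta above the best value seen,
    # and the flag trips once it reaches the patience.
    stopper = EarlyStopping(patience=2, min_delta=0.0)
    for loss in [1.0, 0.8, 0.9, 0.95, 0.97]:
        stopper(loss)
    print(stopper.early_stop)  # True: two non-improving epochs after the best (0.8)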
def SoftmaxModified(x):
    # Softmax over dim 0 (the stacked-modalities axis) instead of the last dim.
    input_softmax = x.transpose(0, 1)
    function_activation = nn.Softmax(dim=1)
    output = function_activation(input_softmax)
    output = output.transpose(0, 1)
    return output
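def _example_softmax_modified():
    # Illustration only, on a random tensor of assumed shape
    # (modalities, batch, hidden): the outputs sum to 1 across dim 0.
    x = torch.randn(5, 2, 1024)
    out = SoftmaxModified(x)
    print(out.sum(dim=0)[0, :3])  # approximately tensor([1., 1., 1.])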
class MultiModalGMUAdapted(nn.Module):
    """Gated Multimodal Unit adapted to five fixed-size input modalities."""
    def __init__(self, input_size_array, hidden_size, dropoutProbability):
        """Initialize params."""
        super(MultiModalGMUAdapted, self).__init__()
        self.input_size_array = input_size_array
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(dropoutProbability)
        # Per-modality hidden projections.
        self.h_1_layer = nn.Linear(input_size_array[0], hidden_size, bias=False)
        self.h_2_layer = nn.Linear(input_size_array[1], hidden_size, bias=False)
        self.h_3_layer = nn.Linear(input_size_array[2], hidden_size, bias=False)
        self.h_4_layer = nn.Linear(input_size_array[3], hidden_size, bias=False)
        self.h_5_layer = nn.Linear(input_size_array[4], hidden_size, bias=False)
        # Per-modality gate projections.
        self.z_1_layer = nn.Linear(input_size_array[0], hidden_size, bias=False)
        self.z_2_layer = nn.Linear(input_size_array[1], hidden_size, bias=False)
        self.z_3_layer = nn.Linear(input_size_array[2], hidden_size, bias=False)
        self.z_4_layer = nn.Linear(input_size_array[3], hidden_size, bias=False)
        self.z_5_layer = nn.Linear(input_size_array[4], hidden_size, bias=False)

    def forward(self, inputModalities):
        """Propagate input through the network."""
        h1 = tanh(self.dropout(self.h_1_layer(inputModalities[0])))
        h2 = tanh(self.dropout(self.h_2_layer(inputModalities[1])))
        h3 = tanh(self.dropout(self.h_3_layer(inputModalities[2])))
        h4 = tanh(self.dropout(self.h_4_layer(inputModalities[3])))
        h5 = tanh(self.dropout(self.h_5_layer(inputModalities[4])))
        z1 = self.dropout(self.z_1_layer(inputModalities[0]))
        z2 = self.dropout(self.z_2_layer(inputModalities[1]))
        z3 = self.dropout(self.z_3_layer(inputModalities[2]))
        z4 = self.dropout(self.z_4_layer(inputModalities[3]))
        z5 = self.dropout(self.z_5_layer(inputModalities[4]))
        # Normalize the gates across modalities and fuse the hidden projections.
        z_modalities = stack([z1, z2, z3, z4, z5])
        z_normalized = SoftmaxModified(z_modalities)
        final = z_normalized[0] * h1 + z_normalized[1] * h2 + z_normalized[2] * h3 + z_normalized[3] * h4 + z_normalized[4] * h5
        return final, z_normalized
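def _example_gmu_fusion():
    # Illustration only, on random inputs: five 1024-dim modality vectors are
    # gated and fused into a single 1024-dim representation, and the returned
    # gate weights sum to 1 across the five modalities for every hidden unit.
    gmu = MultiModalGMUAdapted([1024] * 5, 1024, dropoutProbability=0.5)
    gmu.eval()  # disable dropout for a deterministic check
    modalities = [torch.randn(2, 1024) for _ in range(5)]
    fused, gates = gmu(modalities)
    print(fused.shape)              # torch.Size([2, 1024])
    print(gates.sum(dim=0)[0, :3])  # approximately 1.0 for each hidden unit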
class MyCNN(nn.Module):
    def __init__(self, num_classes=10, len_c1=7, len_c2=5, len_c3=11):
        super(MyCNN, self).__init__()
        # Output height of each (3,1) convolution, used as the pooling size so
        # every context branch collapses to a single 1024-dim vector.
        shape1 = len_c1 - 2
        shape2 = len_c2 - 2
        shape3 = len_c3 - 2
        # Convolutional branches for the left, middle and right contexts.
        self.conv_layers1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(3, 1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(shape1, 1)),
        )
        self.conv_layers2 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(3, 1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(shape2, 1)),
        )
        self.conv_layers3 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(3, 1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(shape3, 1)),
        )
        # Gated fusion of the five 1024-dim modalities, then a small classifier head.
        self.multi_gmu = MultiModalGMUAdapted([1024, 1024, 1024, 1024, 1024], 1024, 0.5)
        self.fc_simple_layers_multi = nn.Sequential(
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, c1, h1, c2, h2, c3):
        # Pass the context windows through their convolutional branches.
        c1 = self.conv_layers1(c1)
        c2 = self.conv_layers2(c2)
        c3 = self.conv_layers3(c3)
        h1 = tanh(h1)
        h2 = tanh(h2)
        c1 = torch.flatten(c1, start_dim=1)
        c2 = torch.flatten(c2, start_dim=1)
        c3 = torch.flatten(c3, start_dim=1)
        # Multi GMU fusion followed by the classifier head.
        mgmu_out, mgmu_weights = self.multi_gmu([c1, h1, c2, h2, c3])
        x = self.fc_simple_layers_multi(mgmu_out)
        # Return final output (unnormalized class logits).
        return x
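def _example_cnn_forward():
    # Illustration only, with random inputs shaped to the default context
    # lengths (len_c1=7, len_c2=5, len_c3=11) and the default num_classes=10.
    model = MyCNN(num_classes=10)
    model.eval()
    c1 = torch.randn(2, 1, 7, 1024)   # left context
    h1 = torch.randn(2, 1024)         # head entity (mean-pooled)
    c2 = torch.randn(2, 1, 5, 1024)   # middle context
    h2 = torch.randn(2, 1024)         # tail entity (mean-pooled)
    c3 = torch.randn(2, 1, 11, 1024)  # right context
    logits = model(c1, h1, c2, h2, c3)
    print(logits.shape)               # torch.Size([2, 10])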
def define_model():
    global cnn
    global optimizer
    global criterion
    cnn = MyCNN()
    optimizer = torch.optim.Adam(cnn.parameters(), lr=0.001)
    criterion = lambda pred, tar: FocalLoss(input=pred, target=tar, gamma=0.7)
def train_model():
    global best_valid_loss
    max_epochs, best_valid_loss = 200, np.inf
    running_loss = np.zeros(shape=(max_epochs, 4))
    early_stopping = EarlyStopping(patience=10, min_delta=0.01)
    for epoch in range(max_epochs):
        running_loss[epoch] = train_one_epoch(epoch)
        early_stopping(running_loss[epoch, 1])
        print(f"Epoch {epoch} \t Train_loss = {running_loss[epoch, 0]:.4f} \t Valid_loss = {running_loss[epoch, 1]:.4f} \n\t\t\t Train_acc = {running_loss[epoch, 2]:.4f} \t Valid_acc = {running_loss[epoch, 3]:.4f}")
        if early_stopping.early_stop:
            print("We are at epoch:", epoch)
            break
def usage_cuda_rc(cuda):
    global device
    if cuda:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        flair.device = device
        if flair.device == torch.device('cpu'): return 'Error handling GPU, CPU will be used'
        elif flair.device == torch.device('cuda:0'): return 'GPU detected, GPU will be used'
    else:
        device = torch.device('cpu')
        flair.device = device
        return 'CPU will be used'
def create_embbedings():
    global embeddings
    if (not embeddings):
        embeddings = TransformerWordEmbeddings(
            model='xlm-roberta-large',
            layers="-1",
            subtoken_pooling="first",
            fine_tune=True,
            use_context=True,
        )
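def _example_embed_sentence():
    # Illustration only, assuming network access to download xlm-roberta-large
    # on first use: each token receives a 1024-dim contextual embedding, which
    # is what prepare_data() flattens and pads per sentence.
    create_embbedings()
    s = Sentence(['Marie', 'Curie', 'was', 'born', 'in', 'Warsaw'])
    embeddings.embed(s)
    print(len(s.tokens), s[0].embedding.shape)  # 6 torch.Size([1024])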
def prepare_data():
    create_embbedings()
    global embeddings
    global json_data
    # Embed the training data.
    path_files = default_path + '/../../data/RC/'
    rel2id_file = path_files + 'rel2id.json'
    with open(rel2id_file, mode='r') as f:
        rel2id = json.load(f)
    path_data = path_files + "train.txt"
    # Dict that accumulates the embedded sentences and their annotations.
    json_data = {'flat_emb': [], 'relation': [], 'h_pos': [], 't_pos': []}
    PADDING = np.zeros(1024)
    doc = 0
    with open(path_data, mode='r', encoding='utf-8') as f:
        sentence_temp = []
        h_pos = []
        t_pos = []
        current_ent = ''
        cont = 0
        for line in f:
            if line != '\n':
                # Each line is "token \t entity_tag \t relation".
                parts = line.rstrip('\n').split('\t')
                sentence_temp.append(parts[0])
                if parts[1] != 'O':
                    if current_ent == '':
                        # First tagged span is the head entity.
                        h_pos.append(cont)
                        current_ent = parts[1]
                    elif parts[1] == current_ent:
                        h_pos.append(cont)
                    else:
                        # A different tag marks the tail entity.
                        t_pos.append(cont)
                if parts[2] != '-': relation = parts[2]
                cont += 1
            else:
                # Blank line: embed the finished sentence.
                sentence = Sentence(sentence_temp)
                embeddings.embed(sentence)
                sentence_emb_flatten = []
                for tk in sentence:
                    # Flatten the token embeddings into one long vector.
                    if len(sentence_emb_flatten): sentence_emb_flatten = np.hstack((sentence_emb_flatten,
                                                                                    tk.embedding.detach().to('cpu').numpy()))
                    else: sentence_emb_flatten = tk.embedding.detach().to('cpu').numpy()
                # Pad every sentence to 100 tokens of 1024 dims.
                number_padding = 100 - len(sentence)
                if number_padding > 0:
                    for _ in range(number_padding):
                        sentence_emb_flatten = np.hstack((sentence_emb_flatten,
                                                          PADDING))
                # Store the embedded sentence and its annotations.
                json_data['flat_emb'].append(list(sentence_emb_flatten))
                json_data['h_pos'].append(h_pos)
                json_data['t_pos'].append(t_pos)
                json_data['relation'].append(rel2id[relation])
                sentence_temp = []
                h_pos = []
                t_pos = []
                current_ent = ''
                cont = 0
    dataset = MyDataset()
    train_set_size = int(len(dataset) * 0.9)
    valid_set_size = len(dataset) - train_set_size
    train_dataset, val_dataset = random_split(dataset, [train_set_size, valid_set_size])
    del dataset
    global train_loader
    global val_loader
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=True)
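def _example_training_pipeline():
    # Illustration only, assuming data/RC/rel2id.json and data/RC/train.txt
    # exist at the relative locations expected by prepare_data(). Typical
    # call order for this module:
    print(usage_cuda_rc(True))  # falls back to CPU if no GPU is available
    prepare_data()              # embeds train.txt and builds the data loaders
    define_model()              # builds cnn, optimizer and the focal-loss criterion
    train_model()               # trains with early stopping, checkpointing the best model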