# -*- coding: utf-8 -*-
"""
Created on Thu May 4 11:19:59 2023

@author: gita
"""
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import json
import os
import gc
from distutils.dir_util import copy_tree
import argparse
import flair
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, TransformerWordEmbeddings
from torch import tanh, stack
from flair.data import Sentence

default_path = os.path.dirname(os.path.abspath(__file__))

# Module-level state shared by the helper functions below
tagger_document = 0
embeddings = 0
json_data = 0
train_loader = 0
val_loader = 0
cnn = 0
optimizer = 0
criterion = 0
device = 0
best_valid_loss = np.inf


class MyDataset(Dataset):
    def __init__(self, len_c1=7, len_c2=5, len_c3=11):
        global json_data

        def create_vector(c1, sentence):
            # Append along the batch dimension, starting from an empty tensor
            if len(c1):
                c1 = torch.cat([c1, sentence], dim=0)
            else:
                c1 = sentence
            return c1

        def fix_tensor(tensor, size):
            # Pad with zero vectors (or truncate) so the token dimension has a fixed size
            while tensor.shape[2] < size:
                tensor = torch.cat([tensor, torch.zeros(1, 1, 1, 1024)], dim=2)
            tensor = tensor[:, :, :size, :]
            return tensor

        # Accumulators for the five input representations and the labels
        self.c1 = torch.Tensor()
        self.h1 = torch.Tensor()
        self.c2 = torch.Tensor()
        self.h2 = torch.Tensor()
        self.c3 = torch.Tensor()
        self.targets = torch.Tensor()

        tensor_temp = torch.Tensor(json_data['flat_emb'])
        data = tensor_temp.reshape((tensor_temp.shape[0], 1, -1, 1024))
        self.targets = create_vector(self.targets, torch.Tensor(json_data['relation']))
        for n_sen in range(tensor_temp.shape[0]):
            # Context before the head entity
            tensor_temp = data[n_sen, 0, :json_data['h_pos'][n_sen][0], :].reshape((1, 1, -1, 1024))
            self.c1 = create_vector(self.c1, fix_tensor(tensor_temp, len_c1))
            # Head entity representation (mean over its tokens)
            tensor_temp = data[n_sen, 0, json_data['h_pos'][n_sen][0]:json_data['h_pos'][n_sen][-1] + 1, :].mean(dim=0).reshape((1, 1024))
            self.h1 = create_vector(self.h1, tensor_temp)
            # Context between the head and tail entities
            tensor_temp = data[n_sen, 0, json_data['h_pos'][n_sen][-1] + 1:json_data['t_pos'][n_sen][0], :].reshape((1, 1, -1, 1024))
            self.c2 = create_vector(self.c2, fix_tensor(tensor_temp, len_c2))
            # Tail entity representation (mean over its tokens)
            tensor_temp = data[n_sen, 0, json_data['t_pos'][n_sen][0]:json_data['t_pos'][n_sen][-1] + 1, :].mean(dim=0).reshape((1, 1024))
            self.h2 = create_vector(self.h2, tensor_temp)
            # Context after the tail entity
            tensor_temp = data[n_sen, 0, json_data['t_pos'][n_sen][-1] + 1:, :].reshape((1, 1, -1, 1024))
            self.c3 = create_vector(self.c3, fix_tensor(tensor_temp, len_c3))
        del data
        del tensor_temp
        del json_data
        gc.collect()
        self.targets = self.targets.to(torch.int64)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, index):
        c1x = self.c1[index]
        h1x = self.h1[index]
        c2x = self.c2[index]
        h2x = self.h2[index]
        c3x = self.c3[index]
        y = self.targets[index]
        return c1x, h1x, c2x, h2x, c3x, y


def update_step(c1, h1, c2, h2, c3, label):
    global cnn
    global optimizer
    global criterion
    prediction = cnn(c1, h1, c2, h2, c3)
    optimizer.zero_grad()
    loss = criterion(prediction, label)
    loss.backward()
    optimizer.step()
    acc = (nn.Softmax(dim=1)(prediction).detach().argmax(dim=1) == label).type(torch.float).sum().item()
    return loss.item(), acc


def evaluate_step(c1, h1, c2, h2, c3, label):
    global cnn
    global optimizer
    global criterion
    prediction = cnn(c1, h1, c2, h2, c3)
    loss = criterion(prediction, label)
    acc = (nn.Softmax(dim=1)(prediction).detach().argmax(dim=1) == label).type(torch.float).sum().item()
    return loss.item(), acc


def train_one_epoch(epoch):
    global train_loader
    global val_loader
    global device
    if device == torch.device('cuda:0'):
        cnn.cuda()
    train_loss, valid_loss, acc_train, acc_valid = 0.0, 0.0, 0.0, 0.0
    for batch_idx, (c1, h1, c2, h2, c3, targets) in enumerate(train_loader):
        train_loss_temp, acc_train_temp = update_step(c1.to(device), h1.to(device), c2.to(device),
                                                      h2.to(device), c3.to(device), targets.to(device))
        train_loss += train_loss_temp
        acc_train += acc_train_temp
    for batch_idx, (c1, h1, c2, h2, c3, targets) in enumerate(val_loader):
        valid_loss_temp, acc_valid_temp = evaluate_step(c1.to(device), h1.to(device), c2.to(device),
                                                        h2.to(device), c3.to(device), targets.to(device))
        valid_loss += valid_loss_temp
        acc_valid += acc_valid_temp

    # Save the model if it is the best seen so far
    global best_valid_loss
    if epoch % 10 == 0:
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save({'epoca': epoch,
                        'model_state_dict': cnn.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': valid_loss},
                       default_path + '/../../RC/model/best_model.pt')
    return train_loss / len(train_loader.dataset), valid_loss / len(val_loader.dataset), acc_train / len(train_loader.dataset), acc_valid / len(val_loader.dataset)


def FocalLoss(input, target, gamma=0, alpha=None, size_average=True):
    if input.dim() > 2:
        input = input.view(input.size(0), input.size(1), -1)  # N,C,H,W => N,C,H*W
        input = input.transpose(1, 2)                         # N,C,H*W => N,H*W,C
        input = input.contiguous().view(-1, input.size(2))    # N,H*W,C => N*H*W,C
    target = target.view(-1, 1)

    logpt = nn.functional.log_softmax(input, dim=1)
    logpt = logpt.gather(1, target)
    logpt = logpt.view(-1)
    pt = logpt.detach().exp()

    if alpha is not None:
        if alpha.type() != input.data.type():
            alpha = alpha.type_as(input.data)
        at = alpha.gather(0, target.data.view(-1))
        logpt = logpt * at

    loss = -1 * (1 - pt) ** gamma * logpt
    if size_average:
        return loss.mean()
    else:
        return loss.sum()


class EarlyStopping:
    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf
        self.early_stop = False

    def __call__(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
            self.early_stop = False
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            print('Validation loss did not improve')
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True


def SoftmaxModified(x):
    # Softmax over the modality dimension (dim 0): transpose, apply Softmax(dim=1), transpose back
    input_softmax = x.transpose(0, 1)
    function_activation = nn.Softmax(dim=1)
    output = function_activation(input_softmax)
    output = output.transpose(0, 1)
    return output


class MultiModalGMUAdapted(nn.Module):
    def __init__(self, input_size_array, hidden_size, dropoutProbability):
        """Initialize params."""
        super(MultiModalGMUAdapted, self).__init__()
        self.input_size_array = input_size_array
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(dropoutProbability)

        self.h_1_layer = nn.Linear(input_size_array[0], hidden_size, bias=False)
        self.h_2_layer = nn.Linear(input_size_array[1], hidden_size, bias=False)
        self.h_3_layer = nn.Linear(input_size_array[2], hidden_size, bias=False)
        self.h_4_layer = nn.Linear(input_size_array[3], hidden_size, bias=False)
        self.h_5_layer = nn.Linear(input_size_array[4], hidden_size, bias=False)

        self.z_1_layer = nn.Linear(input_size_array[0], hidden_size, bias=False)
        self.z_2_layer = nn.Linear(input_size_array[1], hidden_size, bias=False)
        self.z_3_layer = nn.Linear(input_size_array[2], hidden_size, bias=False)
        self.z_4_layer = nn.Linear(input_size_array[3], hidden_size, bias=False)
        self.z_5_layer = nn.Linear(input_size_array[4], hidden_size, bias=False)
        #self.z_weights = [nn.Linear(input_size_array[m], hidden_size, bias=False) for m in range(modalities_number)]
        #self.input_weights = [nn.Linear(size, hidden_size, bias=False) for size in input_size_array]
    def forward(self, inputModalities):
        """Propagate input through the network."""
        # h_modalities = [self.dropout(self.input_weights[i](i_mod)) for i,i_mod in enumerate(inputModalities)]
        # h_modalities = [tanh(h) for h in h_modalities]
        h1 = tanh(self.dropout(self.h_1_layer(inputModalities[0])))
        h2 = tanh(self.dropout(self.h_2_layer(inputModalities[1])))
        h3 = tanh(self.dropout(self.h_3_layer(inputModalities[2])))
        h4 = tanh(self.dropout(self.h_4_layer(inputModalities[3])))
        h5 = tanh(self.dropout(self.h_5_layer(inputModalities[4])))

        z1 = self.dropout(self.z_1_layer(inputModalities[0]))
        z2 = self.dropout(self.z_2_layer(inputModalities[1]))
        z3 = self.dropout(self.z_3_layer(inputModalities[2]))
        z4 = self.dropout(self.z_4_layer(inputModalities[3]))
        z5 = self.dropout(self.z_5_layer(inputModalities[4]))
        #z_modalities = [self.dropout(self.z_weights[i](i_mod)) for i,i_mod in enumerate(inputModalities)]

        # Normalize the gates across modalities and fuse the hidden representations
        z_modalities = stack([z1, z2, z3, z4, z5])
        z_normalized = SoftmaxModified(z_modalities)
        final = z_normalized[0] * h1 + z_normalized[1] * h2 + z_normalized[2] * h3 + z_normalized[3] * h4 + z_normalized[4] * h5
        return final, z_normalized


class MyCNN(nn.Module):
    def __init__(self, num_classes=10, len_c1=7, len_c2=5, len_c3=11):
        super(MyCNN, self).__init__()
        # Heights after the (3,1) convolution, used as pooling kernel sizes
        shape1 = len_c1 - 2
        shape2 = len_c2 - 2
        shape3 = len_c3 - 2

        # Define convolutional layers
        self.conv_layers1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(3, 1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(shape1, 1)),
        )
        self.conv_layers2 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(3, 1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(shape2, 1)),
        )
        self.conv_layers3 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(3, 1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(shape3, 1)),
        )

        self.multi_gmu = MultiModalGMUAdapted([1024, 1024, 1024, 1024, 1024], 1024, 0.5)

        self.fc_simple_layers_multi = nn.Sequential(
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, c1, h1, c2, h2, c3):
        # Pass the three context windows through their convolutional layers
        c1 = self.conv_layers1(c1)
        c2 = self.conv_layers2(c2)
        c3 = self.conv_layers3(c3)

        h1 = tanh(h1)
        h2 = tanh(h2)

        c1 = torch.flatten(c1, start_dim=1)
        c2 = torch.flatten(c2, start_dim=1)
        c3 = torch.flatten(c3, start_dim=1)

        # Multi GMU fuses the five representations
        mgmu_out, mgmu_weights = self.multi_gmu([c1, h1, c2, h2, c3])
        x = self.fc_simple_layers_multi(mgmu_out)

        # Return final output
        return x


def define_model():
    global cnn
    global optimizer
    global criterion
    cnn = MyCNN()
    optimizer = torch.optim.Adam(cnn.parameters(), lr=0.001)
    criterion = lambda pred, tar: FocalLoss(input=pred, target=tar, gamma=0.7)


def train_model():
    global best_valid_loss
    max_epochs = 200
    best_valid_loss = np.inf
    running_loss = np.zeros(shape=(max_epochs, 4))
    early_stopping = EarlyStopping(patience=10, min_delta=0.01)
    for epoch in range(max_epochs):
        running_loss[epoch] = train_one_epoch(epoch)
        early_stopping(running_loss[epoch, 1])
        print(f"Epoch {epoch} \t Train_loss = {running_loss[epoch, 0]:.4f} \t Valid_loss = {running_loss[epoch, 1]:.4f} \n\t\t\t Train_acc = {running_loss[epoch, 2]:.4f} \t Valid_acc = {running_loss[epoch, 3]:.4f}")
        if early_stopping.early_stop:
            print("We are at epoch:", epoch)
            break


def usage_cuda_rc(cuda):
    global device
    if cuda:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        flair.device = device
        if flair.device == torch.device('cpu'):
            return 'Error handling GPU, CPU will be used'
        elif flair.device == torch.device('cuda:0'):
            return 'GPU detected, GPU will be used'
    else:
        device = torch.device('cpu')
        flair.device = device
        return 'CPU will be used'


def create_embbedings():
    global embeddings
    if not embeddings:
        embeddings = TransformerWordEmbeddings(
            model='xlm-roberta-large',
            layers="-1",
            subtoken_pooling="first",
            fine_tune=True,
            use_context=True,
        )


def prepare_data():
    create_embbedings()
    global embeddings
    global json_data

    # Embed the data
    path_files = default_path + '/../../data/RC/'
    rel2id_file = path_files + 'rel2id.json'
    with open(rel2id_file, mode='r') as f:
        rel2id = json.load(f)
    path_data = path_files + "train.txt"

    # Dictionary that collects the embedded data
    json_data = {'flat_emb': [], 'relation': [], 'h_pos': [], 't_pos': []}
    PADDING = np.zeros(1024)
    doc = 0
    with open(path_data, mode='r', encoding='utf-8') as f:
        sentence_temp = []
        h_pos = []
        t_pos = []
        current_ent = ''
        cont = 0
        for n, line in enumerate(f.readlines()):
            if line != '\n':
                sentence_temp.append(line.split('\t')[0])
                if line.split('\t')[1] != 'O':
                    if current_ent == '':
                        h_pos.append(cont)
                        current_ent = line.split('\t')[1]
                    elif line.split('\t')[1] == current_ent:
                        h_pos.append(cont)
                    else:
                        t_pos.append(cont)
                if line.split('\t')[2].replace('\n', '') != '-':
                    relation = line.split('\t')[2].replace('\n', '')
                cont += 1
            else:
                # Embed the completed sentence
                sentence = Sentence(sentence_temp)
                embeddings.embed(sentence)
                sentence_emb_flatten = []
                for tk in sentence:
                    # Flatten the token embeddings into a single vector
                    if len(sentence_emb_flatten):
                        sentence_emb_flatten = np.hstack((sentence_emb_flatten, tk.embedding.detach().to('cpu').numpy()))
                    else:
                        sentence_emb_flatten = tk.embedding.detach().to('cpu').numpy()
                number_padding = 100 - len(sentence)
                if number_padding > 0:
                    for _ in range(number_padding):
                        sentence_emb_flatten = np.hstack((sentence_emb_flatten, PADDING))
                # Save the embedding information for this sentence
                json_data['flat_emb'].append(list(sentence_emb_flatten))
                json_data['h_pos'].append(h_pos)
                json_data['t_pos'].append(t_pos)
                json_data['relation'].append(rel2id[relation])
                sentence_temp = []
                h_pos = []
                t_pos = []
                current_ent = ''
                cont = 0

    dataset = MyDataset()
    train_set_size = int(len(dataset) * 0.9)
    valid_set_size = len(dataset) - train_set_size
    train_dataset, val_dataset = random_split(dataset, [train_set_size, valid_set_size])
    del dataset

    global train_loader
    global val_loader
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=True)
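
# A minimal shape sanity check for MyCNN, assuming the default context lengths
# used by MyDataset (len_c1=7, len_c2=5, len_c3=11) and 1024-dimensional
# embeddings. The helper name, batch size of 2 and the random inputs are
# illustrative only and are not part of the original pipeline.
def _smoke_test_mycnn(batch_size=2, num_classes=10):
    model = MyCNN(num_classes=num_classes)
    c1 = torch.rand(batch_size, 1, 7, 1024)    # context before the head entity
    h1 = torch.rand(batch_size, 1024)          # mean-pooled head entity embedding
    c2 = torch.rand(batch_size, 1, 5, 1024)    # context between the entities
    h2 = torch.rand(batch_size, 1024)          # mean-pooled tail entity embedding
    c3 = torch.rand(batch_size, 1, 11, 1024)   # context after the tail entity
    with torch.no_grad():
        out = model(c1, h1, c2, h2, c3)
    assert out.shape == (batch_size, num_classes)
    return out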
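
# The functions above are meant to be driven from an external caller; the order
# below (device selection, data preparation, model definition, training) is an
# assumed usage sketch, not part of the original pipeline code.
if __name__ == '__main__':
    print(usage_cuda_rc(cuda=True))   # pick the GPU if available, otherwise the CPU
    prepare_data()                    # embed train.txt and build the DataLoaders
    define_model()                    # instantiate MyCNN, Adam and the focal-loss criterion
    train_model()                     # run the training loop with early stopping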