# -*- coding: utf-8 -*-
"""
Created on Mon May  1 20:54:14 2023

@author: sanmo
"""
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import json
import os
import gc
import shutil
import argparse

import flair
from flair.embeddings import TransformerWordEmbeddings
from flair.data import Sentence
from torch import tanh, stack

# Module-level state shared by the training / inference helpers below.
default_path = None
entities = None
tagger_document = None
embeddings = None
json_data = None
train_loader = None
val_loader = None
test_loader = None
cnn = None
optimizer = None
criterion = None
device = None
test_sentences = None
best_valid_loss = np.inf


def check_create(path):
    """Create the directory `path` if it does not exist yet."""
    if not os.path.isdir(path):
        os.makedirs(path)


def str2bool(v):
    """Parse a command-line flag into a boolean."""
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


class MyDataset(Dataset):
    """Split each sentence embedding into five segments around the two entities:
    left context (c1), head entity (h1), middle context (c2), tail entity (h2)
    and right context (c3)."""

    def __init__(self, len_c1=7, len_c2=5, len_c3=11):
        global json_data

        def create_vector(acc, sentence):
            # Append `sentence` to the accumulated tensor (or start it).
            if len(acc):
                acc = torch.cat([acc, sentence], dim=0)
            else:
                acc = sentence
            return acc

        def fix_tensor(tensor, size):
            # Pad (with zero vectors) or truncate the token dimension to `size`.
            while tensor.shape[2] < size:
                tensor = torch.cat([tensor, torch.zeros(1, 1, 1, 1024)], dim=2)
            tensor = tensor[:, :, :size, :]
            return tensor

        self.targets = []
        self.c1 = []
        self.h1 = []
        self.c2 = []
        self.h2 = []
        self.c3 = []

        tensor_temp = torch.Tensor(json_data['flat_emb'])
        data = tensor_temp.reshape((tensor_temp.shape[0], 1, -1, 1024))
        self.targets = create_vector(self.targets, torch.Tensor(json_data['relation']))
        for n_sen in range(tensor_temp.shape[0]):
            # Left context: tokens before the head entity.
            tensor_temp = data[n_sen, 0, :json_data['h_pos'][n_sen][0], :].reshape((1, 1, -1, 1024))
            self.c1 = create_vector(self.c1, fix_tensor(tensor_temp, len_c1))
            # Head entity: average of its token embeddings.
            tensor_temp = data[n_sen, 0, json_data['h_pos'][n_sen][0]:json_data['h_pos'][n_sen][-1] + 1, :].mean(dim=0).reshape((1, 1024))
            self.h1 = create_vector(self.h1, tensor_temp)
            # Middle context: tokens between the two entities.
            tensor_temp = data[n_sen, 0, json_data['h_pos'][n_sen][-1] + 1:json_data['t_pos'][n_sen][0], :].reshape((1, 1, -1, 1024))
            self.c2 = create_vector(self.c2, fix_tensor(tensor_temp, len_c2))
            # Tail entity: average of its token embeddings.
            tensor_temp = data[n_sen, 0, json_data['t_pos'][n_sen][0]:json_data['t_pos'][n_sen][-1] + 1, :].mean(dim=0).reshape((1, 1024))
            self.h2 = create_vector(self.h2, tensor_temp)
            # Right context: tokens after the tail entity.
            tensor_temp = data[n_sen, 0, json_data['t_pos'][n_sen][-1] + 1:, :].reshape((1, 1, -1, 1024))
            self.c3 = create_vector(self.c3, fix_tensor(tensor_temp, len_c3))
        del data
        del tensor_temp
        del json_data
        gc.collect()
        self.targets = self.targets.to(torch.int64)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, index):
        c1x = self.c1[index]
        h1x = self.h1[index]
        c2x = self.c2[index]
        h2x = self.h2[index]
        c3x = self.c3[index]
        y = self.targets[index]
        return c1x, h1x, c2x, h2x, c3x, y


def update_step(c1, h1, c2, h2, c3, label):
    """Run one optimisation step; return (loss, number of correct predictions)."""
    global cnn
    global optimizer
    global criterion
    prediction = cnn(c1, h1, c2, h2, c3)
    optimizer.zero_grad()
    loss = criterion(prediction, label)
    loss.backward()
    optimizer.step()
    acc = (nn.Softmax(dim=1)(prediction).detach().argmax(dim=1) == label).type(torch.float).sum().item()
    return loss.item(), acc


def evaluate_step(c1, h1, c2, h2, c3, label):
    """Run one forward pass without updating weights; return (loss, correct count)."""
    global cnn
    global criterion
    prediction = cnn(c1, h1, c2, h2, c3)
    loss = criterion(prediction, label)
    acc = (nn.Softmax(dim=1)(prediction).detach().argmax(dim=1) == label).type(torch.float).sum().item()
    return loss.item(), acc
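
# Illustration only (hypothetical helper, never called by the pipeline): a
# minimal sketch of the five-way split that MyDataset.__init__ performs,
# assuming a toy sentence of 10 tokens with 1024-dim embeddings, a head entity
# at positions [2, 3] and a tail entity at position [6].
def _segmentation_example():
    emb = torch.zeros(10, 1024)                    # toy token embeddings
    h_pos, t_pos = [2, 3], [6]
    c1 = emb[:h_pos[0]]                            # left context  -> tokens 0..1
    h1 = emb[h_pos[0]:h_pos[-1] + 1].mean(dim=0)   # head entity average
    c2 = emb[h_pos[-1] + 1:t_pos[0]]               # middle context -> tokens 4..5
    h2 = emb[t_pos[0]:t_pos[-1] + 1].mean(dim=0)   # tail entity average
    c3 = emb[t_pos[-1] + 1:]                       # right context -> tokens 7..9
    return c1.shape, h1.shape, c2.shape, h2.shape, c3.shape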
def train_one_epoch(epoch, name, rel2id_file):
    """Train and validate for a single epoch, checkpointing the best model."""
    global train_loader
    global val_loader
    global device
    global best_valid_loss
    global optimizer
    global cnn
    global default_path
    if device == torch.device('cuda:0'):
        cnn.cuda()
    train_loss, valid_loss, acc_train, acc_valid = 0.0, 0.0, 0.0, 0.0
    for batch_idx, (c1, h1, c2, h2, c3, targets) in enumerate(train_loader):
        train_loss_temp, acc_train_temp = update_step(c1.to(device), h1.to(device), c2.to(device), h2.to(device), c3.to(device), targets.to(device))
        train_loss += train_loss_temp
        acc_train += acc_train_temp
    for batch_idx, (c1, h1, c2, h2, c3, targets) in enumerate(val_loader):
        valid_loss_temp, acc_valid_temp = evaluate_step(c1.to(device), h1.to(device), c2.to(device), h2.to(device), c3.to(device), targets.to(device))
        valid_loss += valid_loss_temp
        acc_valid += acc_valid_temp
    # Save the model if it is the best seen so far (checked every 10 epochs).
    if epoch % 10 == 0:
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save({'epoca': epoch,
                        'model_state_dict': cnn.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': valid_loss},
                       default_path + '/../../models/RC/{}/best_model.pt'.format(name))
    return train_loss / len(train_loader.dataset), valid_loss / len(val_loader.dataset), acc_train / len(train_loader.dataset), acc_valid / len(val_loader.dataset)


def FocalLoss(input, target, gamma=0, alpha=None, size_average=True):
    """Focal loss: cross-entropy scaled by (1 - p_t)**gamma, optionally weighted
    per class by `alpha`."""
    if input.dim() > 2:
        input = input.view(input.size(0), input.size(1), -1)  # N,C,H,W => N,C,H*W
        input = input.transpose(1, 2)                          # N,C,H*W => N,H*W,C
        input = input.contiguous().view(-1, input.size(2))     # N,H*W,C => N*H*W,C
    target = target.view(-1, 1)

    logpt = nn.functional.log_softmax(input, dim=-1)
    logpt = logpt.gather(1, target)
    logpt = logpt.view(-1)
    pt = logpt.detach().exp()

    if alpha is not None:
        if alpha.type() != input.data.type():
            alpha = alpha.type_as(input.data)
        at = alpha.gather(0, target.data.view(-1))
        logpt = logpt * at

    loss = -1 * (1 - pt) ** gamma * logpt
    if size_average:
        return loss.mean()
    else:
        return loss.sum()
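
# Sanity sketch (illustrative helper, never called by the pipeline): with
# gamma=0 and alpha=None the focal loss reduces to plain cross-entropy,
# FL(p_t) = -(1 - p_t)**gamma * log(p_t) = -log(p_t).
def _focal_loss_reduces_to_cross_entropy():
    logits = torch.randn(4, 5)
    labels = torch.randint(0, 5, (4,))
    return torch.allclose(FocalLoss(logits, labels, gamma=0),
                          nn.functional.cross_entropy(logits, labels))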
class EarlyStopping:
    """Stop training when the validation loss has not improved for `patience` epochs."""

    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf
        self.early_stop = False

    def __call__(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
            self.early_stop = False
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            print('No improvement in validation loss ({}/{})'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True


def SoftmaxModified(x):
    """Apply a softmax across the first dimension (the modalities) of `x`."""
    input_softmax = x.transpose(0, 1)
    function_activation = nn.Softmax(dim=1)
    output = function_activation(input_softmax)
    output = output.transpose(0, 1)
    return output


class MultiModalGMUAdapted(nn.Module):
    """Gated Multimodal Unit adapted to five input modalities: each modality is
    projected to a hidden representation, and the five representations are mixed
    with gates normalised across modalities."""

    def __init__(self, input_size_array, hidden_size, dropoutProbability):
        """Initialize params."""
        super(MultiModalGMUAdapted, self).__init__()
        self.input_size_array = input_size_array
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(dropoutProbability)

        # Hidden projections, one per modality.
        self.h_1_layer = nn.Linear(input_size_array[0], hidden_size, bias=False)
        self.h_2_layer = nn.Linear(input_size_array[1], hidden_size, bias=False)
        self.h_3_layer = nn.Linear(input_size_array[2], hidden_size, bias=False)
        self.h_4_layer = nn.Linear(input_size_array[3], hidden_size, bias=False)
        self.h_5_layer = nn.Linear(input_size_array[4], hidden_size, bias=False)

        # Gate projections, one per modality.
        self.z_1_layer = nn.Linear(input_size_array[0], hidden_size, bias=False)
        self.z_2_layer = nn.Linear(input_size_array[1], hidden_size, bias=False)
        self.z_3_layer = nn.Linear(input_size_array[2], hidden_size, bias=False)
        self.z_4_layer = nn.Linear(input_size_array[3], hidden_size, bias=False)
        self.z_5_layer = nn.Linear(input_size_array[4], hidden_size, bias=False)

    def forward(self, inputModalities):
        """Propagate input through the network."""
        h1 = tanh(self.dropout(self.h_1_layer(inputModalities[0])))
        h2 = tanh(self.dropout(self.h_2_layer(inputModalities[1])))
        h3 = tanh(self.dropout(self.h_3_layer(inputModalities[2])))
        h4 = tanh(self.dropout(self.h_4_layer(inputModalities[3])))
        h5 = tanh(self.dropout(self.h_5_layer(inputModalities[4])))

        z1 = self.dropout(self.z_1_layer(inputModalities[0]))
        z2 = self.dropout(self.z_2_layer(inputModalities[1]))
        z3 = self.dropout(self.z_3_layer(inputModalities[2]))
        z4 = self.dropout(self.z_4_layer(inputModalities[3]))
        z5 = self.dropout(self.z_5_layer(inputModalities[4]))

        # Normalise the gates across the five modalities and mix the hidden
        # representations as a per-dimension convex combination.
        z_modalities = stack([z1, z2, z3, z4, z5])
        z_normalized = SoftmaxModified(z_modalities)
        final = z_normalized[0] * h1 + z_normalized[1] * h2 + z_normalized[2] * h3 + z_normalized[3] * h4 + z_normalized[4] * h5
        return final
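
# Minimal sketch (hypothetical helper, never called by the pipeline): the gates
# produced inside MultiModalGMUAdapted are normalised with SoftmaxModified so
# that, for every hidden unit, the five gate values sum to 1.
def _gmu_gate_normalisation_example():
    gates = torch.rand(5, 2, 1024)            # (modalities, batch, hidden)
    normalised = SoftmaxModified(gates)
    return torch.allclose(normalised.sum(dim=0), torch.ones(2, 1024))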
class MyCNN(nn.Module):
    """CNN over the three context segments, fused with the two entity embeddings
    through a gated multimodal unit and classified with an MLP head."""

    def __init__(self, num_classes=10, len_c1=7, len_c2=5, len_c3=11):
        super(MyCNN, self).__init__()
        # Token dimension left after a (3, 1) convolution with no padding.
        shape1 = len_c1 - 2
        shape2 = len_c2 - 2
        shape3 = len_c3 - 2
        # One convolutional branch per context segment.
        self.conv_layers1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(3, 1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(shape1, 1)),
        )
        self.conv_layers2 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(3, 1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(shape2, 1)),
        )
        self.conv_layers3 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(3, 1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(shape3, 1)),
        )
        self.multi_gmu = MultiModalGMUAdapted([1024, 1024, 1024, 1024, 1024], 1024, 0.5)
        self.fc_simple_layers_multi = nn.Sequential(
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, c1, h1, c2, h2, c3):
        # Convolve each context segment down to a single 1024-dim vector.
        c1 = self.conv_layers1(c1)
        c2 = self.conv_layers2(c2)
        c3 = self.conv_layers3(c3)
        h1 = tanh(h1)
        h2 = tanh(h2)
        c1 = torch.flatten(c1, start_dim=1)
        c2 = torch.flatten(c2, start_dim=1)
        c3 = torch.flatten(c3, start_dim=1)
        # Fuse the five modalities and classify.
        mgmu_out = self.multi_gmu([c1, h1, c2, h2, c3])
        x = self.fc_simple_layers_multi(mgmu_out)
        return x


def define_model():
    """Instantiate the CNN, the Adam optimizer and the focal-loss criterion."""
    global cnn
    global optimizer
    global criterion
    cnn = MyCNN()
    optimizer = torch.optim.Adam(cnn.parameters(), lr=0.001)
    criterion = lambda pred, tar: FocalLoss(input=pred, target=tar, gamma=0.7)


def train_model(name, epochs, rel2id_path):
    """Train for at most `epochs` epochs with early stopping on the validation loss."""
    global best_valid_loss
    best_valid_loss = np.inf
    max_epochs = epochs
    running_loss = np.zeros(shape=(max_epochs, 4))
    early_stopping = EarlyStopping(patience=10, min_delta=0.01)
    for epoch in range(max_epochs):
        running_loss[epoch] = train_one_epoch(epoch, name, rel2id_path)
        early_stopping(running_loss[epoch, 1])
        print(f"Epoch {epoch} \t Train_loss = {running_loss[epoch, 0]:.4f} \t Valid_loss = {running_loss[epoch, 1]:.4f} \n\t\t\t Train_acc = {running_loss[epoch, 2]:.4f} \t Valid_acc = {running_loss[epoch, 3]:.4f}")
        if early_stopping.early_stop:
            print("We are at epoch:", epoch)
            break


def usage_cuda_rc(cuda):
    """Select the device for training/inference and report which one will be used."""
    global device
    if cuda:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        flair.device = device
        if flair.device == torch.device('cpu'):
            return 'Error handling GPU, CPU will be used'
        elif flair.device == torch.device('cuda:0'):
            return 'GPU detected, GPU will be used'
    else:
        device = torch.device('cpu')
        flair.device = device
        return 'CPU will be used'


def create_embeddings():
    """Lazily create the XLM-RoBERTa transformer word embeddings."""
    global embeddings
    if not embeddings:
        embeddings = TransformerWordEmbeddings(
            model='xlm-roberta-large',
            layers="-1",
            subtoken_pooling="first",
            fine_tune=True,
            use_context=True,
        )


def prepare_data(rel2id_path, path_data):
    """Embed the training file with flair and build the train/validation loaders.

    The input file is expected to contain one token per line with three
    tab-separated columns (token, entity tag, relation), a blank line between
    sentences; every sentence is zero-padded to 100 tokens.
    """
    create_embeddings()
    global embeddings
    global json_data
    global default_path

    with open(rel2id_path, mode='r') as f:
        rel2id = json.load(f)

    # Dict used to hand the embedded data over to MyDataset.
    json_data = {'flat_emb': [], 'relation': [], 'h_pos': [], 't_pos': []}
    PADDING = np.zeros(1024)
    with open(path_data, mode='r', encoding='utf-8') as f:
        sentence_temp = []
        h_pos = []
        t_pos = []
        current_ent = ''
        cont = 0
        for n, line in enumerate(f.readlines()):
            if line != '\n':
                sentence_temp.append(line.split('\t')[0])
                if line.split('\t')[1] != 'O':
                    # Token positions of the head (first) and tail (second) entity.
                    if current_ent == '':
                        h_pos.append(cont)
                        current_ent = line.split('\t')[1]
                    elif line.split('\t')[1] == current_ent:
                        h_pos.append(cont)
                    else:
                        t_pos.append(cont)
                if line.split('\t')[2].replace('\n', '') != '-':
                    relation = line.split('\t')[2].replace('\n', '')
                cont += 1
            else:
                # Embed the finished sentence.
                sentence = Sentence(sentence_temp)
                embeddings.embed(sentence)
                sentence_emb_flatten = []
                for tk in sentence:
                    # Flatten the token embeddings into one long vector.
                    if len(sentence_emb_flatten):
                        sentence_emb_flatten = np.hstack((sentence_emb_flatten, tk.embedding.detach().to('cpu').numpy()))
                    else:
                        sentence_emb_flatten = tk.embedding.detach().to('cpu').numpy()
                # Zero-pad every sentence to 100 tokens.
                number_padding = 100 - len(sentence)
                if number_padding > 0:
                    for pd in range(number_padding):
                        sentence_emb_flatten = np.hstack((sentence_emb_flatten, PADDING))
                # Store the embedding information for this sentence.
                json_data['flat_emb'].append(list(sentence_emb_flatten))
                json_data['h_pos'].append(h_pos)
                json_data['t_pos'].append(t_pos)
                json_data['relation'].append(rel2id[relation])
                sentence_temp = []
                h_pos = []
                t_pos = []
                current_ent = ''
                cont = 0

    dataset = MyDataset()
    train_set_size = int(len(dataset) * 0.9)
    valid_set_size = len(dataset) - train_set_size
    train_dataset, val_dataset = random_split(dataset, [train_set_size, valid_set_size])
    del dataset

    global train_loader
    global val_loader
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=True)
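
# Expected input format, as parsed above: one token per line with three
# tab-separated columns -- token, entity tag ('O' for non-entity tokens), and
# the relation label ('-' on lines that do not carry the sentence's relation) --
# and a blank line between sentences. Illustrative example (the tag names and
# relation label below are made up):
#
#   John    PER    -
#   works   O      -
#   at      O      -
#   Acme    ORG    works_at
#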
def prepare_data_test(name, path_data):
    """Embed the test file with flair and build the test loader.

    Same format as `prepare_data`, except that the relation column is ignored
    (a placeholder label is stored) and the raw tokens and entity tags are kept
    for the tagged output.
    """
    create_embeddings()
    global embeddings
    global json_data
    global test_sentences
    global entities
    global default_path
    test_sentences = []
    entities = []

    # Dict used to hand the embedded data over to MyDataset.
    json_data = {'flat_emb': [], 'relation': [], 'h_pos': [], 't_pos': []}
    PADDING = np.zeros(1024)
    with open(path_data, mode='r', encoding='utf-8') as f:
        sentence_temp = []
        entities_temp = []
        h_pos = []
        t_pos = []
        current_ent = ''
        cont = 0
        for n, line in enumerate(f.readlines()):
            if line != '\n':
                sentence_temp.append(line.split('\t')[0])
                entities_temp.append(line.split('\t')[1])
                if line.split('\t')[1] != 'O':
                    # Token positions of the head (first) and tail (second) entity.
                    if current_ent == '':
                        h_pos.append(cont)
                        current_ent = line.split('\t')[1]
                    elif line.split('\t')[1] == current_ent:
                        h_pos.append(cont)
                    else:
                        t_pos.append(cont)
                relation = '-'
                cont += 1
            else:
                # Embed the finished sentence.
                sentence = Sentence(sentence_temp)
                test_sentences.append(sentence_temp)
                entities.append(entities_temp)
                embeddings.embed(sentence)
                sentence_emb_flatten = []
                for tk in sentence:
                    # Flatten the token embeddings into one long vector.
                    if len(sentence_emb_flatten):
                        sentence_emb_flatten = np.hstack((sentence_emb_flatten, tk.embedding.detach().to('cpu').numpy()))
                    else:
                        sentence_emb_flatten = tk.embedding.detach().to('cpu').numpy()
                # Zero-pad every sentence to 100 tokens.
                number_padding = 100 - len(sentence)
                if number_padding > 0:
                    for pd in range(number_padding):
                        sentence_emb_flatten = np.hstack((sentence_emb_flatten, PADDING))
                # Store the embedding information for this sentence.
                json_data['flat_emb'].append(list(sentence_emb_flatten))
                json_data['h_pos'].append(h_pos)
                json_data['t_pos'].append(t_pos)
                json_data['relation'].append(1)  # placeholder label for test data
                sentence_temp = []
                entities_temp = []
                h_pos = []
                t_pos = []
                current_ent = ''
                cont = 0

    dataset = MyDataset()
    global test_loader
    test_loader = DataLoader(dataset, batch_size=64, shuffle=False)
    del dataset


# ------------------ Backend functions ----------------------------------------
def training_model_rc(name, path_data, rel2id_path, epochs=200):
    """Train a relation-classification model and save it under models/RC/<name>/.

    Returns a success message, or an integer error code when a step fails.
    """
    global default_path
    default_path = os.path.dirname(os.path.abspath(__file__))
    default_path = default_path.replace('\\', '/')
    print(name)
    try:
        define_model()
    except:
        return 13
    print('Model defined')
    check_create(default_path + '/../../models/RC/{}/'.format(name))
    try:
        prepare_data(rel2id_path, path_data)
    except:
        return 12
    print('Data prepared')
    # Train the model.
    try:
        train_model(name, epochs, rel2id_path)
    except:
        return 7
    # The best checkpoint is written by train_one_epoch at this path.
    path_model = default_path + '/../../models/RC/{}/best_model.pt'.format(name)
    shutil.copy(rel2id_path, default_path + '/../../models/RC/{}/rel2id.json'.format(name))
    return "model trained and saved at {}".format(path_model)
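
# Minimal usage sketch (the file names below are hypothetical): train a model
# from a tab-separated training file and a rel2id.json label mapping.
#
#   usage_cuda_rc(True)                     # pick the GPU if one is available
#   msg = training_model_rc('my_rc_model', 'train.txt', 'rel2id.json', epochs=200)
#   print(msg)                              # success message or an error code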
def use_model_rc(name, path_data, output_dir):
    """Load a trained model, predict relations for `path_data` and write the
    tagged results to `output_dir`.

    Returns the results dict, or an integer error code when a step fails.
    """
    global default_path
    default_path = os.path.dirname(os.path.abspath(__file__))
    default_path = default_path.replace('\\', '/')
    # -------------- Load the trained model -------------------------
    path_model = default_path + '/../../models/RC/{}/best_model.pt'.format(name)
    rel2id_file = default_path + '/../../models/RC/{}/rel2id.json'.format(name)
    with open(rel2id_file, mode='r') as f:
        rel2id = json.load(f)
    # Invert the mapping: class index -> relation name.
    id2rel = [m for _, m in sorted(zip(list(rel2id.values()), list(rel2id.keys())), key=lambda pair: pair[0])]
    if not os.path.isfile(path_model):
        print('Model does not exist')
        return 10
    print(path_data)
    if not os.path.isfile(path_data):
        print('Input file is not a file')
        return 9
    global cnn
    try:
        cnn = MyCNN()
    except:
        return 13
    print('Model defined')
    try:
        saved_model = torch.load(path_model)
        cnn.load_state_dict(saved_model['model_state_dict'])
    except:
        return 1
    cnn.eval()  # disable dropout for inference
    print('Model loaded')
    # ----------------- Load the document -------------------------
    try:
        prepare_data_test(name, path_data)
    except:
        return 12
    global json_data
    print('Data prepared')
    # ----------------- Predict -------------------------
    global test_loader
    ypred = []
    relations = []
    with torch.no_grad():
        for batch_idx, (c1, h1, c2, h2, c3, targets) in enumerate(test_loader):
            x = cnn(c1, h1, c2, h2, c3)
            ypred.append(nn.Softmax(dim=1)(x).detach().argmax(dim=1))
    ypred = np.concatenate(ypred)
    relations = [id2rel[rel] for rel in ypred]
    print('prediction')
    # ----------------- Tag the document -------------------------
    global test_sentences
    global entities
    results = {'sentences': {'tokens': test_sentences, 'entities': entities}, 'relations': relations}
    # ----------------- Save the results -------------------------
    try:
        with open(output_dir, "w", encoding='utf-8') as write_file:
            json.dump(results, write_file)
        print('-' * 20, 'Tagging complete', '-' * 20)
        print('Tagged document saved in {}'.format(output_dir))
    except:
        print('Error in output file')
        return 11
    return results
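
# Minimal usage sketch (the file names below are hypothetical): tag a test file
# with a previously trained model and write the predictions to a JSON file.
#
#   results = use_model_rc('my_rc_model', 'test.txt', 'output.json')
#   # results = {'sentences': {'tokens': [...], 'entities': [...]},
#   #            'relations': [...]}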