Spaces:

meghanaraok
/

LongLAT

Running

App Files Files Community

LongLAT / LongHiLATmain /models /modeling.py

meghanaraok

Update LongHiLATmain/models/modeling.py

f3e474e verified 6 months ago

raw

history blame contribute delete

No virus

17.3 kB

	import collections
	import logging

	import torch
	from torch.nn import BCEWithLogitsLoss, Dropout, Linear
	from transformers import AutoModel, XLNetModel, LongformerConfig
	from transformers.models.longformer.modeling_longformer import LongformerEncoder
	from huggingface_hub import PyTorchModelHubMixin
	from LongLAT.models.utils import initial_code_title_vectors

	# Module-level logger, registered under the fixed name "lwat" (not derived from __name__).
	logger = logging.getLogger("lwat")


	class CodingModelConfig:
	    """Plain settings container for the coding model.

	    Every constructor argument is stored, unchanged and unvalidated, on the
	    instance under an attribute of the same name.
	    """

	    def __init__(
	            self,
	            transformer_model_name_or_path,
	            transformer_tokenizer_name,
	            transformer_layer_update_strategy,
	            num_chunks,
	            max_seq_length,
	            dropout,
	            dropout_att,
	            d_model,
	            label_dictionary,
	            num_labels,
	            use_code_representation,
	            code_max_seq_length,
	            code_batch_size,
	            multi_head_att,
	            chunk_att,
	            linear_init_mean,
	            linear_init_std,
	            document_pooling_strategy,
	            multi_head_chunk_attention,
	            num_hidden_layers):
	        super().__init__()

	        # Backbone transformer and how its weights are updated.
	        self.transformer_model_name_or_path = transformer_model_name_or_path
	        self.transformer_tokenizer_name = transformer_tokenizer_name
	        self.transformer_layer_update_strategy = transformer_layer_update_strategy

	        # Document chunking and regularisation.
	        self.num_chunks = num_chunks
	        self.max_seq_length = max_seq_length
	        self.dropout = dropout
	        self.dropout_att = dropout_att
	        self.d_model = d_model

	        # Label set. Per the original note, label_dictionary is a dataframe
	        # with the columns icd9_code and long_title.
	        self.label_dictionary = label_dictionary
	        self.num_labels = num_labels

	        # Code-title based initialisation of the label attention.
	        self.use_code_representation = use_code_representation
	        self.code_max_seq_length = code_max_seq_length
	        self.code_batch_size = code_batch_size

	        # Attention layout and linear-layer initialisation.
	        self.multi_head_att = multi_head_att
	        self.chunk_att = chunk_att
	        self.linear_init_mean = linear_init_mean
	        self.linear_init_std = linear_init_std

	        # Document-level aggregation.
	        self.document_pooling_strategy = document_pooling_strategy
	        self.multi_head_chunk_attention = multi_head_chunk_attention
	        self.num_hidden_layers = num_hidden_layers


	class LableWiseAttentionLayer(torch.nn.Module):
	    """Label-wise attention: one attention distribution over tokens per label.

	    For an input ``H`` of shape (batch_size, seq_length, d_model) the layer
	    computes ``Z = tanh(W H)``, scores every token against every label with
	    ``U Z``, normalises the scores over the token axis and returns the
	    attention-weighted sum of ``H`` for each label.
	    """

	    def __init__(self, coding_model_config, args):
	        """Build the two bias-free projections and initialise their weights.

	        Args:
	            coding_model_config: settings object; ``d_model``, ``num_labels``,
	                ``linear_init_mean``, ``linear_init_std`` and
	                ``use_code_representation`` are read here.
	            args: runtime arguments; only ``args.device`` is read, and only
	                when ``use_code_representation`` is enabled.
	        """
	        super(LableWiseAttentionLayer, self).__init__()

	        self.config = coding_model_config
	        self.args = args

	        # layers
	        self.l1_linear = torch.nn.Linear(self.config.d_model,
	                                         self.config.d_model, bias=False)
	        self.tanh = torch.nn.Tanh()
	        self.l2_linear = torch.nn.Linear(self.config.d_model, self.config.num_labels, bias=False)
	        # dim=1 is the token axis of the (batch, seq, labels) score tensor.
	        self.softmax = torch.nn.Softmax(dim=1)

	        self._init_linear_weights(mean=self.config.linear_init_mean, std=self.config.linear_init_std)

	    def _init_linear_weights(self, mean, std):
	        """Initialise both projections.

	        ``l1_linear`` is always drawn from N(mean, std). ``l2_linear`` is set
	        to the code-title vectors when ``use_code_representation`` is enabled
	        and drawn from N(mean, std) otherwise.
	        """
	        torch.nn.init.normal_(self.l1_linear.weight, mean, std)
	        if self.l1_linear.bias is not None:
	            self.l1_linear.bias.data.fill_(0)

	        if self.config.use_code_representation:
	            # Fall back to the model name when no tokenizer name is configured.
	            tokenizer_name = (self.config.transformer_tokenizer_name
	                              or self.config.transformer_model_name_or_path)
	            code_vectors = initial_code_title_vectors(self.config.label_dictionary,
	                                                      self.config.transformer_model_name_or_path,
	                                                      tokenizer_name,
	                                                      self.config.code_max_seq_length,
	                                                      self.config.code_batch_size,
	                                                      self.config.d_model,
	                                                      self.args.device)
	            # NOTE(review): code_vectors is presumably (num_labels, d_model),
	            # the shape of l2_linear.weight - confirm against the helper.
	            self.l2_linear.weight = torch.nn.Parameter(code_vectors, requires_grad=True)
	        else:
	            # Random init must not run after the assignment above: normal_
	            # works in place and would discard the code-title vectors.
	            torch.nn.init.normal_(self.l2_linear.weight, mean, std)
	        if self.l2_linear.bias is not None:
	            self.l2_linear.bias.data.fill_(0)

	    def forward(self, x):
	        """Attend over the tokens of ``x`` once per label.

	        Args:
	            x: tensor of shape (batch_size, seq_length, d_model).

	        Returns:
	            Tuple ``(attention_output, attention_weight)`` with shapes
	            (batch_size, num_labels, d_model) and
	            (batch_size, num_labels, seq_length).
	        """
	        # Z = tanh(W H)
	        l1_output = self.tanh(self.l1_linear(x))
	        # softmax(U Z): scores are (batch_size, seq_length, num_labels),
	        # normalised over the token axis and then transposed to
	        # (batch_size, num_labels, seq_length).
	        attention_weight = self.softmax(self.l2_linear(l1_output)).transpose(1, 2)
	        # (batch_size, num_labels, d_model)
	        attention_output = torch.matmul(attention_weight, x)

	        return attention_output, attention_weight

	class ChunkAttentionLayer(torch.nn.Module):
	    """Attention over the chunk axis that collapses all chunks into one vector.

	    A single score is produced per chunk (``tanh`` projection followed by a
	    projection to one unit); the scores are normalised over the chunk axis
	    and used to form a weighted sum of the chunk vectors.
	    """

	    def __init__(self, coding_model_config, args):
	        """Create the two bias-free projections and initialise their weights.

	        Args:
	            coding_model_config: settings object; ``d_model``,
	                ``linear_init_mean`` and ``linear_init_std`` are read.
	            args: runtime arguments, stored on the instance as ``self.args``.
	        """
	        super().__init__()
	        self.config = coding_model_config
	        self.args = args

	        hidden_size = self.config.d_model
	        self.l1_linear = torch.nn.Linear(hidden_size, hidden_size, bias=False)
	        self.tanh = torch.nn.Tanh()
	        self.l2_linear = torch.nn.Linear(hidden_size, 1, bias=False)
	        # dim=1 is the chunk axis of the (batch, chunks, 1) score tensor.
	        self.softmax = torch.nn.Softmax(dim=1)

	        self._init_linear_weights(mean=self.config.linear_init_mean,
	                                  std=self.config.linear_init_std)

	    def _init_linear_weights(self, mean, std):
	        """Draw both projection weights from N(mean, std); zero any bias."""
	        for projection in (self.l1_linear, self.l2_linear):
	            torch.nn.init.normal_(projection.weight, mean, std)
	            if projection.bias is not None:
	                projection.bias.data.fill_(0)

	    def forward(self, x):
	        """Pool the chunk vectors in ``x`` with learned attention weights.

	        Args:
	            x: tensor of shape (batch_size, num_chunks, d_model).

	        Returns:
	            Tuple ``(attention_output, attention_weight)`` with shapes
	            (batch_size, 1, d_model) and (batch_size, 1, num_chunks).
	        """
	        # One score per chunk: (batch_size, num_chunks, 1).
	        chunk_scores = self.l2_linear(self.tanh(self.l1_linear(x)))
	        # Normalise over chunks, then move the chunk axis last.
	        attention_weight = self.softmax(chunk_scores).transpose(1, 2)
	        attention_output = torch.matmul(attention_weight, x)
	        return attention_output, attention_weight

	# define the model class
	class CodingModel(torch.nn.Module, PyTorchModelHubMixin):
	    """Multi-label classifier over documents that are split into chunks.

	    ``forward`` runs this pipeline: every chunk is encoded by the pretrained
	    transformer, the concatenated chunk states pass through a small
	    Longformer encoder, label-wise attention is applied per chunk, the chunk
	    axis is reduced (chunk attention or flat/max/mean pooling) and one linear
	    scorer per label produces the logits.

	    NOTE(review): the Longformer encoder is a registered sub-module
	    (``longformer_layer``) built once in ``__init__``. It used to be
	    re-created with fresh random weights inside every ``forward`` call, so it
	    was never trained, saved or moved to the model device. Checkpoints
	    written before this change contain no ``longformer_layer.*`` entries;
	    loading them with ``strict=True`` will report those keys as missing.
	    """

	    def __init__(self, coding_model_config, args, **kwargs):
	        """Assemble all layers.

	        Args:
	            coding_model_config: ``CodingModelConfig``-like settings object.
	            args: runtime arguments, forwarded to the attention layers.
	            **kwargs: accepted and ignored.
	        """
	        super(CodingModel, self).__init__()
	        self.coding_model_config = coding_model_config
	        self.args = args
	        config = self.coding_model_config

	        # layers
	        self.transformer_layer = AutoModel.from_pretrained(config.transformer_model_name_or_path)
	        if isinstance(self.transformer_layer, XLNetModel):
	            self.transformer_layer.config.use_mems_eval = False
	        self.dropout = Dropout(p=config.dropout)

	        if config.multi_head_att:
	            # one label-wise attention head per chunk
	            self.label_wise_attention_layer = torch.nn.ModuleList(
	                [LableWiseAttentionLayer(coding_model_config, args)
	                 for _ in range(config.num_chunks)])
	        else:
	            self.label_wise_attention_layer = LableWiseAttentionLayer(coding_model_config, args)
	        self.dropout_att = Dropout(p=config.dropout_att)

	        if config.chunk_att:
	            if config.multi_head_chunk_attention:
	                # one chunk attention head per label
	                self.chunk_attention_layer = torch.nn.ModuleList(
	                    [ChunkAttentionLayer(coding_model_config, args)
	                     for _ in range(config.num_labels)])
	            else:
	                self.chunk_attention_layer = ChunkAttentionLayer(coding_model_config, args)
	            self.classifier_layer = Linear(config.d_model, config.num_labels)
	        elif config.document_pooling_strategy == "flat":
	            # chunk vectors are concatenated per label
	            self.classifier_layer = Linear(config.num_chunks * config.d_model, config.num_labels)
	        else:  # max or mean pooling
	            self.classifier_layer = Linear(config.d_model, config.num_labels)
	        self.sigmoid = torch.nn.Sigmoid()

	        if config.transformer_layer_update_strategy == "no":
	            self.freeze_all_transformer_layers()
	        elif config.transformer_layer_update_strategy == "last":
	            self.freeze_all_transformer_layers()
	            self.unfreeze_transformer_last_layers()

	        # initialize the weights of classifier
	        self._init_linear_weights(mean=config.linear_init_mean, std=config.linear_init_std)

	        # Built last so the random initialisation of the layers above is
	        # unchanged for a fixed seed.
	        self.longformer_layer = self._build_longformer_encoder()

	    def _build_longformer_encoder(self):
	        """Create the randomly initialised 2-layer Longformer encoder."""
	        encoder_config = LongformerConfig.from_pretrained("allenai/longformer-base-4096")
	        # NOTE(review): the depth is fixed at 2 as in the original code;
	        # coding_model_config.num_hidden_layers is not consulted.
	        encoder_config.num_hidden_layers = 2
	        encoder_config.hidden_size = self.coding_model_config.d_model
	        # one attention window size per encoder layer
	        encoder_config.attention_window = [512, 512]
	        return LongformerEncoder(encoder_config)

	    def _init_linear_weights(self, mean, std):
	        """Draw the classifier weights from N(mean, std); the bias is untouched."""
	        torch.nn.init.normal_(self.classifier_layer.weight, mean, std)

	    def _merge_to_attention_mask(self, attention_mask: torch.Tensor, global_attention_mask: torch.Tensor):
	        """Combine a padding mask and a global-attention mask.

	        Returns a mask holding 0 (no attention), 1 (local attention) or
	        2 (global attention). When ``attention_mask`` is None every position
	        is treated as attended.
	        """
	        # (global_attention_mask + 1) => 1 for local attention, 2 for global attention
	        if attention_mask is not None:
	            attention_mask = attention_mask * (global_attention_mask + 1)
	        else:
	            attention_mask = global_attention_mask + 1
	        return attention_mask

	    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, targets=None):
	        """Score every label for a batch of chunked documents.

	        Args:
	            input_ids: (batch_size, num_chunks, max_seq_length) token ids.
	            attention_mask: same shape as ``input_ids``, 1 for real tokens
	                and 0 for padding; all ones are assumed when None.
	            token_type_ids: same shape as ``input_ids`` or None.
	            targets: (batch_size, num_labels) label matrix, or None to skip
	                the loss (inference).

	        Returns:
	            dict with ``loss`` (BCE-with-logits loss, or None without
	            targets), ``logits`` (batch_size, num_labels),
	            ``label_attention_weights``
	            (batch_size, num_chunks, num_labels, max_seq_length) and
	            ``chunk_attention_weights`` (batch_size, num_labels, num_chunks),
	            which is an empty list when chunk attention is disabled.

	        Raises:
	            ValueError: chunk attention is disabled and the pooling strategy
	                is not one of "flat", "max" or "mean".
	        """
	        config = self.coding_model_config
	        num_chunks = config.num_chunks
	        if attention_mask is None:
	            attention_mask = torch.ones_like(input_ids)

	        # Encode chunk by chunk; each call sees (batch_size, max_seq_length).
	        chunk_states = []
	        for i in range(num_chunks):
	            chunk_output = self.transformer_layer(
	                input_ids=input_ids[:, i, :],
	                attention_mask=attention_mask[:, i, :],
	                token_type_ids=None if token_type_ids is None else token_type_ids[:, i, :])
	            # hidden states: (batch_size, max_seq_length, hidden_size)
	            chunk_states.append(chunk_output[0])
	        # (batch_size, num_chunks, max_seq_length, hidden_size)
	        l2_dropout = self.dropout(torch.stack(chunk_states, dim=1))

	        # Concatenate the chunks into one long sequence for the Longformer.
	        batch_size, _, chunk_length, hidden_size = l2_dropout.shape
	        total_length = num_chunks * chunk_length
	        document_states = l2_dropout.reshape(batch_size, total_length, hidden_size)
	        document_mask = attention_mask.reshape(batch_size, total_length)

	        # Global attention on the first token of every chunk and on the last
	        # token of the document; for 8 x 512 tokens these are exactly
	        # [0, 512, ..., 3584, 4095].
	        global_positions = [c * chunk_length for c in range(num_chunks)]
	        global_positions.append(total_length - 1)
	        global_attention_mask = torch.zeros_like(document_mask)
	        global_attention_mask[:, global_positions] = 1
	        merged_mask = self._merge_to_attention_mask(document_mask, global_attention_mask)

	        # LongformerEncoder reads the mask by sign (negative = masked,
	        # zero = local, positive = global), so the merged 0/1/2 values are
	        # shifted to -10000 / 0 / +10000, as LongformerModel does before
	        # calling its encoder.
	        encoder_mask = (merged_mask.to(document_states.dtype) - 1.0) * 10000.0
	        encoder_output = self.longformer_layer(document_states, attention_mask=encoder_mask)
	        l2_dropout = self.dropout_att(encoder_output[0])
	        l2_dropout = l2_dropout.reshape(batch_size, num_chunks, chunk_length, hidden_size)

	        # Label-wise attention, one chunk at a time.
	        attention_output = []
	        attention_weights = []
	        for i in range(num_chunks):
	            if config.multi_head_att:
	                attention_layer = self.label_wise_attention_layer[i]
	            else:
	                attention_layer = self.label_wise_attention_layer
	            # l3_attention: (batch_size, num_labels, hidden_size)
	            # attention_weight: (batch_size, num_labels, max_seq_length)
	            l3_attention, attention_weight = attention_layer(l2_dropout[:, i])
	            attention_output.append(l3_attention)
	            attention_weights.append(attention_weight)

	        # (batch_size, num_chunks, num_labels, ...)
	        attention_output = torch.stack(attention_output, dim=1)
	        attention_weights = torch.stack(attention_weights, dim=1)
	        l3_dropout = self.dropout_att(attention_output)

	        if config.chunk_att:
	            # Chunk attention, one label at a time.
	            chunk_attention_output = []
	            chunk_attention_weights = []
	            for i in range(config.num_labels):
	                if config.multi_head_chunk_attention:
	                    chunk_attention = self.chunk_attention_layer[i]
	                else:
	                    chunk_attention = self.chunk_attention_layer
	                l4_chunk_attention, l4_chunk_attention_weights = chunk_attention(l3_dropout[:, :, i])
	                chunk_attention_output.append(l4_chunk_attention.squeeze(dim=1))
	                chunk_attention_weights.append(l4_chunk_attention_weights.squeeze(dim=1))

	            # (batch_size, num_labels, hidden_size)
	            chunk_attention_output = torch.stack(chunk_attention_output, dim=1)
	            # (batch_size, num_labels, num_chunks)
	            chunk_attention_weights = torch.stack(chunk_attention_weights, dim=1)
	            l4_dropout = self.dropout_att(chunk_attention_output)
	        else:
	            chunk_attention_weights = []
	            # (batch_size, num_labels, num_chunks, hidden_size)
	            l4_dropout = l3_dropout.transpose(1, 2)
	            strategy = config.document_pooling_strategy
	            if strategy == "flat":
	                # concatenate the chunk vectors of every label
	                l4_dropout = torch.flatten(l4_dropout, start_dim=2)
	            elif strategy == "max":
	                l4_dropout = torch.amax(l4_dropout, 2)
	            elif strategy == "mean":
	                l4_dropout = torch.mean(l4_dropout, 2)
	            else:
	                raise ValueError("Not supported pooling strategy")

	        # Each label has its own binary linear scorer: row k of the classifier
	        # weight is applied only to the representation of label k.
	        logits = self.classifier_layer.weight.mul(l4_dropout).sum(dim=2).add(self.classifier_layer.bias)

	        # No targets at inference time, hence no loss.
	        loss = None
	        if targets is not None:
	            loss = BCEWithLogitsLoss()(logits, targets)

	        return {
	            "loss": loss,
	            "logits": logits,
	            "label_attention_weights": attention_weights,
	            "chunk_attention_weights": chunk_attention_weights
	        }

	    def freeze_all_transformer_layers(self):
	        """
	        Freeze all layer weight parameters. They will not be updated during training.
	        """
	        for param in self.transformer_layer.parameters():
	            param.requires_grad = False

	    def unfreeze_all_transformer_layers(self):
	        """
	        Unfreeze all layers weight parameters. They will be updated during training.
	        """
	        for param in self.transformer_layer.parameters():
	            param.requires_grad = True

	    def unfreeze_transformer_last_layers(self):
	        """
	        Unfreeze the last transformer layer and the pooler.

	        The last layer index comes from the backbone config
	        (``num_hidden_layers``); 12 layers are assumed when the config does
	        not expose it, which matches the former hard-coded "layer.11".
	        """
	        backbone_config = getattr(self.transformer_layer, "config", None)
	        num_layers = getattr(backbone_config, "num_hidden_layers", None) or 12
	        last_layer_tag = f"layer.{num_layers - 1}."
	        for name, param in self.transformer_layer.named_parameters():
	            if last_layer_tag in name or "pooler" in name:
	                param.requires_grad = True