# FateFormerExplorer/models/transformers.py
# (origin: kaveh — commit "init", ef814bf)
import torch
from torch import nn
import math
class CustomTransformerEncoderLayer(nn.TransformerEncoderLayer):
    """Encoder layer that also returns its per-head self-attention weights.

    Identical to ``nn.TransformerEncoderLayer`` in post-norm mode, except
    ``forward`` returns ``(output, attn_weights)`` instead of output alone.

    NOTE(review): this forward always runs the post-norm path; a layer
    constructed with ``norm_first=True`` would have that flag silently
    ignored — confirm callers only use the default post-norm mode.

    (The original no-op ``__init__`` that merely forwarded to super() has
    been removed; construction is unchanged.)
    """

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        """Apply self-attention + feed-forward, exposing attention weights.

        Args:
            src: Input tensor, (batch, seq, d_model) when batch_first=True.
            src_mask: Optional attention mask.
            src_key_padding_mask: Optional (batch, seq) padding mask.

        Returns:
            Tuple ``(output, attn_weights)``; with
            ``average_attn_weights=False`` the weights have shape
            (batch, n_heads, seq, seq).
        """
        # Self-attention sub-layer; need_weights=True exposes the weights.
        src2, attn_weights = self.self_attn(
            src, src, src,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask,
            average_attn_weights=False,
            need_weights=True,
        )
        src = self.norm1(src + self.dropout1(src2))
        # Position-wise feed-forward sub-layer.
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = self.norm2(src + self.dropout2(src2))
        return src, attn_weights
class SingleTransformer(nn.Module):
    """Transformer encoder for a single modality ('RNA', 'ATAC' or 'Flux').

    RNA inputs are integer counts looked up in a fixed sinusoidal embedding
    table; ATAC/Flux inputs are continuous values projected with a linear
    layer. A learned per-sample batch embedding is appended as one extra
    token, and a learned CLS query pools the encoded tokens through a single
    cross-attention step into a scalar (sigmoid) prediction.

    Args:
        model_type (str): One of 'RNA', 'ATAC', 'Flux'.
        vocab_size (int): Maximum count for the RNA embedding table and the
            output dimension of the masked-LM head. Set to 1 when the linear
            projection is used instead of count embeddings.
        seq_len (int): Number of tokens (features) per sample.
        n_encoder_layers (int): Number of transformer encoder layers.
        n_heads (int): Number of attention heads.
        n_batches (int): Number of experimental batches (embedding rows).
        d_model (int): Token / model embedding dimension.
        d_ff (int): Feed-forward hidden dimension.
        dropout_rate (float, optional): Dropout probability. Defaults to 0.0.
    """

    def __init__(self, model_type, vocab_size, seq_len,
                 n_encoder_layers, n_heads, n_batches,
                 d_model, d_ff,
                 dropout_rate=0.0):
        super().__init__()
        if model_type not in ('RNA', 'ATAC', 'Flux'):
            raise ValueError("model_type must be one of 'RNA', 'ATAC', 'Flux'")
        self.model_type = model_type
        if self.model_type == 'RNA':
            # Fixed (non-trainable) sinusoidal embedding per integer count.
            self.count_embedding_fix = self.create_count_embeddings(vocab_size, d_model)
        else:
            # Continuous inputs: project each scalar to d_model.
            self.count_embedding_proj = nn.Linear(1, d_model)
        # Learned identity/positional embedding per feature.
        self.id_embeddings = nn.Parameter(torch.zeros(1, seq_len, d_model))
        nn.init.normal_(self.id_embeddings, mean=0.0, std=0.02)
        self.batch_embedding = nn.Embedding(n_batches, d_model)
        self.layer_norm = nn.LayerNorm(d_model)
        # NOTE(review): the two norms below are currently unused by forward();
        # kept so existing checkpoints' state_dict keys still match.
        self.token_layer_norm = nn.LayerNorm(d_model)
        self.batch_layer_norm = nn.LayerNorm(d_model)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model))
        nn.init.normal_(self.cls_token, mean=0.0, std=0.02)
        encoder_layer = CustomTransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout_rate,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_encoder_layers)
        # Per-token head used for masked-LM pretraining.
        self.mask_output_layer = nn.Linear(d_model, vocab_size)
        # CLS pooling: one cross-attention step from a learned query token.
        self.cls_attention = nn.MultiheadAttention(embed_dim=d_model, num_heads=n_heads, batch_first=True)
        self.cls_norm1 = nn.LayerNorm(d_model)
        self.cls_norm2 = nn.LayerNorm(d_model)
        self.cls_ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(d_ff, d_model),
        )
        self.dropout = nn.Dropout(dropout_rate)
        self.cls_output_layer = nn.Linear(d_model, 1)
        # Flag flipped by (un)freeze_pretrained_weights; initialized here so
        # reading it never raises AttributeError.
        self.pretrained = False

    def forward(self, x, batch_indices, masked_lm=False, return_attention=False,
                return_embeddings=False, return_flow_attention=False):
        """Encode one modality and (optionally) predict.

        Args:
            x (torch.Tensor): (batch, seq) inputs — integer counts for RNA,
                continuous values otherwise.
            batch_indices (torch.Tensor): (batch,) experimental-batch ids.
            masked_lm (bool): Return per-token logits for masked-LM training.
            return_attention (bool): Also return CLS attention weights.
            return_embeddings (bool): Return encoder tokens and per-layer
                attention instead of predictions.
            return_flow_attention (bool): Collect attention from every
                encoder layer.

        Returns:
            Depending on the flags: (tokens, attention_flow);
            per-token logits (batch, seq, vocab_size);
            (preds, cls_output[, attention_weights[, attention_flow]]).
        """
        # Embed inputs -> (batch, seq, d_model).
        if self.model_type == 'RNA':
            # Lazily move the fixed embedding table onto the input device.
            self.count_embedding_fix = self.count_embedding_fix.to(x.device)
            x = x.long()
            x = self.count_embedding_fix[x]
        else:
            x = self.count_embedding_proj(x.unsqueeze(-1).float())
        x = x + self.id_embeddings[:, :x.size(1), :]
        # Append the batch embedding as one extra token at the end.
        batch_embeddings = self.batch_embedding(batch_indices).unsqueeze(1)
        x = torch.cat((x, batch_embeddings), dim=1)
        x = self.layer_norm(x)
        # Run the encoder layer-by-layer so per-layer attention can be kept.
        attention_flow = []
        for layer in self.encoder.layers:
            x, attn_weights = layer(x)
            if return_flow_attention:
                attention_flow.append(attn_weights)
        other_tokens = x
        if return_embeddings:
            return other_tokens, attention_flow
        if masked_lm:
            # Drop the trailing batch-embedding token before the LM head.
            return self.mask_output_layer(other_tokens[:, :-1, :])
        # CLS pooling: the learned query attends over all encoded tokens.
        cls_token = self.cls_token.expand(x.size(0), -1, -1)
        attended_cls, attention_weights = self.cls_attention(
            cls_token, other_tokens, other_tokens,
            need_weights=True, average_attn_weights=False)
        attended_cls = attended_cls.squeeze(1)
        cls_output = self.cls_norm1(cls_token.squeeze(1) + self.dropout(attended_cls))
        cls_output = self.cls_norm2(cls_output + self.dropout(self.cls_ffn(cls_output)))
        preds = torch.sigmoid(self.cls_output_layer(cls_output))
        if return_flow_attention:
            return preds, cls_output, attention_weights, attention_flow
        if return_attention:
            return preds, cls_output, attention_weights
        return preds, cls_output

    def freeze_pretrained_weights(self):
        """Freeze everything except the CLS head (attention/norms/FFN/output)."""
        for name, param in self.named_parameters():
            if not any(x in name for x in ['cls_attention', 'cls_norm', 'cls_ffn', 'cls_token', 'cls_ff_dim', 'cls_output_layer']):
                param.requires_grad = False
        self.pretrained = True

    def unfreeze_pretrained_weights(self):
        """Make every parameter trainable again."""
        for param in self.parameters():
            param.requires_grad = True
        self.pretrained = False

    def create_count_embeddings(self, max_count, embed_size):
        """Build fixed sinusoidal embeddings for counts 0..max_count.

        Vectorized equivalent of the classic per-element formula
        sin(i / 10000**(2*(j//2)/embed_size)) for even j, cos(...) for odd j.

        Returns:
            torch.Tensor of shape (max_count + 1, embed_size).
        """
        positions = torch.arange(max_count + 1, dtype=torch.float32).unsqueeze(1)
        dims = torch.arange(embed_size, dtype=torch.float32)
        denom = torch.pow(10000.0, 2.0 * torch.floor(dims / 2.0) / embed_size)
        angles = positions / denom  # (max_count + 1, embed_size)
        # Even columns take sin, odd columns take cos.
        return torch.where(dims.long() % 2 == 0, torch.sin(angles), torch.cos(angles))

    def get_latent_space(self, inputs, batch_indices, batch_size=32):
        """Get the latent space representation and predictions.

        Runs the model in eval mode, in batches, without gradients.

        Args:
            inputs (torch.Tensor): Input tensor, (n_samples, seq).
            batch_indices (torch.Tensor): Batch indices, (n_samples,).
            batch_size (int, optional): Mini-batch size. Defaults to 32.

        Returns:
            Tuple (latent_space, preds): the CLS outputs and sigmoid
            predictions, each concatenated over all samples.
        """
        self.eval()
        latent_space_list, preds_list = [], []
        with torch.no_grad():
            for i in range(0, inputs.shape[0], batch_size):
                inputs_batch = inputs[i:i + batch_size].float()
                batch_indices_batch = batch_indices[i:i + batch_size].int()
                preds, reduced_dim = self(inputs_batch, batch_indices_batch)
                latent_space_list.append(reduced_dim)
                preds_list.append(preds)
        latent_space = torch.cat(latent_space_list, dim=0)
        preds = torch.cat(preds_list, dim=0)
        return latent_space, preds
class MultiModalTransformer(nn.Module):
    """Late-fusion transformer over RNA, ATAC and Flux modality encoders.

    Each single-modality model produces per-token embeddings; the tokens are
    concatenated along the sequence dimension and pooled with one
    cross-attention step from a learned CLS query. A modality whose input is
    entirely zero for a sample is masked out of the pooling for that sample.

    Args:
        rna_model / atac_model / flux_model: Modality encoders whose forward
            supports return_embeddings / return_flow_attention (e.g.
            SingleTransformer instances).
        d_model (int): Token embedding dimension shared by all modalities.
        n_heads_cls (int): Attention heads for the CLS pooling attention.
        d_ff_cls (int): Hidden dimension of the CLS feed-forward block.
        dropout_rate (float, optional): Dropout probability. Defaults to 0.0.
    """

    def __init__(self, rna_model, atac_model, flux_model, d_model,
                 n_heads_cls, d_ff_cls, dropout_rate=0.0):
        super().__init__()
        self.rna_model = rna_model
        self.atac_model = atac_model
        self.flux_model = flux_model
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model))
        nn.init.normal_(self.cls_token, mean=0.0, std=0.02)
        self.layer_norm = nn.LayerNorm(d_model)
        self.cls_attention = nn.MultiheadAttention(embed_dim=d_model, num_heads=n_heads_cls, dropout=dropout_rate, batch_first=True)
        self.cls_norm1 = nn.LayerNorm(d_model)
        self.cls_norm2 = nn.LayerNorm(d_model)
        self.cls_ffn = nn.Sequential(
            nn.Linear(d_model, d_ff_cls),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(d_ff_cls, d_model),
        )
        self.cls_output_layer = nn.Linear(d_model, 1)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, batch_indices, return_attention=False,
                return_embeddings=False, return_flow_attention=False):
        """Fuse the three modalities and predict.

        Args:
            x: Tuple/list (rna_input, atac_input, flux_input), each a
               (batch, seq_i) tensor.
            batch_indices (torch.Tensor): (batch,) experimental-batch ids.
            return_attention (bool): Also return CLS attention weights.
            return_embeddings (bool): Return concatenated tokens only.
            return_flow_attention (bool): Also return per-layer attention
                from each sub-model plus the CLS attention, as a dict.

        Returns:
            Concatenated tokens, or (preds, cls_output[, attention...]).
        """
        rna_input, atac_input, flux_input = x[0], x[1], x[2]
        # Per-modality token embeddings (per-layer attention only collected
        # when requested).
        rna_tokens, rna_attention = self.rna_model(rna_input, batch_indices, return_embeddings=True, return_flow_attention=return_flow_attention)
        atac_tokens, atac_attention = self.atac_model(atac_input, batch_indices, return_embeddings=True, return_flow_attention=return_flow_attention)
        flux_tokens, flux_attention = self.flux_model(flux_input, batch_indices, return_embeddings=True, return_flow_attention=return_flow_attention)
        # Concatenate along the token dimension.
        other_tokens = torch.cat((rna_tokens, atac_tokens, flux_tokens), dim=-2)
        if return_embeddings:
            return other_tokens
        # A modality is "present" for a sample iff its input is not all zero;
        # absent modalities are padded out of the CLS attention below.
        rna_mask = (rna_input.sum(dim=1) != 0).float()
        atac_mask = (atac_input.sum(dim=1) != 0).float()
        flux_mask = (flux_input.sum(dim=1) != 0).float()
        rna_mask = rna_mask.unsqueeze(-1).expand(-1, rna_tokens.size(1))
        atac_mask = atac_mask.unsqueeze(-1).expand(-1, atac_tokens.size(1))
        flux_mask = flux_mask.unsqueeze(-1).expand(-1, flux_tokens.size(1))
        other_tokens_mask = torch.cat((rna_mask, atac_mask, flux_mask), dim=1)
        other_tokens = self.layer_norm(other_tokens)
        # CLS pooling: the learned query attends over all modality tokens;
        # key_padding_mask is True where a token must be ignored.
        cls_token = self.cls_token.expand(other_tokens.size(0), -1, -1)
        attended_cls, attention_weights = self.cls_attention(
            cls_token, other_tokens, other_tokens,
            key_padding_mask=(1 - other_tokens_mask).bool(),
            need_weights=True, average_attn_weights=False)
        attended_cls = attended_cls.squeeze(1)
        cls_output = self.cls_norm1(cls_token.squeeze(1) + self.dropout(attended_cls))
        cls_output = self.cls_norm2(cls_output + self.dropout(self.cls_ffn(cls_output)))
        preds = torch.sigmoid(self.cls_output_layer(cls_output))
        if return_flow_attention:
            return preds, cls_output, {
                'rna': rna_attention,
                'atac': atac_attention,
                'flux': flux_attention,
                'cls': attention_weights,
            }
        if return_attention:
            return preds, cls_output, attention_weights
        return preds, cls_output

    def freeze_pretrained_weights(self):
        """Freeze sub-models and everything here except the CLS head."""
        self.rna_model.freeze_pretrained_weights()
        self.atac_model.freeze_pretrained_weights()
        self.flux_model.freeze_pretrained_weights()
        for name, param in self.named_parameters():
            if not any(x in name for x in ['cls_attention', 'cls_norm', 'cls_ffn', 'cls_token', 'cls_output_layer']):
                param.requires_grad = False

    def unfreeze_pretrained_weights(self):
        """Make every parameter (including sub-models) trainable again."""
        self.rna_model.unfreeze_pretrained_weights()
        self.atac_model.unfreeze_pretrained_weights()
        self.flux_model.unfreeze_pretrained_weights()
        for param in self.parameters():
            param.requires_grad = True

    def get_latent_space(self, X, batch_indices, batch_size=32):
        """Return (latent_space, preds) for all samples, batched, no grad.

        Args:
            X: Tuple/list (rna, atac, flux) of full-dataset tensors.
            batch_indices (torch.Tensor): (n_samples,) batch ids.
            batch_size (int, optional): Mini-batch size. Defaults to 32.
        """
        self.eval()
        latent_space_list, preds_list = [], []
        rna_input, atac_input, flux_input = X[0], X[1], X[2]
        with torch.no_grad():
            for i in range(0, rna_input.shape[0], batch_size):
                rna_input_batch = rna_input[i:i + batch_size].float()
                atac_input_batch = atac_input[i:i + batch_size].float()
                flux_input_batch = flux_input[i:i + batch_size].float()
                batch_indices_batch = batch_indices[i:i + batch_size].int()
                preds, reduced_dim = self((rna_input_batch, atac_input_batch, flux_input_batch), batch_indices_batch)
                latent_space_list.append(reduced_dim)
                preds_list.append(preds)
        latent_space = torch.cat(latent_space_list, dim=0)
        preds = torch.cat(preds_list, dim=0)
        return latent_space, preds
if __name__ == '__main__':
    # Smoke test. The original called SingleTransformer with kwargs that do
    # not exist in its signature (d_tokens, d_batch; d_model was missing) and
    # took .shape of tuple returns; both fixed here.
    model = SingleTransformer(model_type='ATAC', vocab_size=1, seq_len=883,
                              n_encoder_layers=2, n_heads=2, n_batches=3,
                              d_model=508, d_ff=128)
    x = torch.rand(32, 883)
    batch_indices = torch.randint(1, 3, (32,))
    print(model(x, batch_indices, masked_lm=True).shape)
    # return_attention / default calls return (preds, cls_output, ...).
    print(model(x, batch_indices, return_attention=True)[0].shape)
    # return_embeddings returns (tokens, attention_flow).
    print(model(x, batch_indices, return_embeddings=True)[0].shape)
    print(model(x, batch_indices)[0].shape)