import torch
from torch import nn
import timm
import config as CFG
class TextEncoder(nn.Module):
"""
Text/Poem encoder used in PoemTextModel and CLIPModel
...
Attributes:
-----------
model : a torch.nn.Module model
The text/poem encoder model (a huggingface transformers model)
Methods:
--------
forward(input_ids, attention_mask)
returns the encoder's CLS-token embeddings for a batch of tokenized texts/poems
__init__()
creates the encoder model using huggingface transformers,
also freezes the model if it's not trainable.
"""
def __init__(self, encoder_model, encoder_pretrained_name, pretrained, trainable):
"""
Creates the poem/text encoder model using huggingface transformers, loading pretrained weights if requested.
Also freezes the model if it's not trainable.
Parameters:
-----------
encoder_model: str
text/poem encoder model key, used to pick the right model class from the configs.
encoder_pretrained_name: str
name of the pretrained text/poem encoder to load weights from. (not used when pretrained=False)
pretrained: bool
if pretrained=True, load the pretrained model's weights. else create a fresh untrained model.
trainable: bool
if trainable=False, the model's weights will be frozen.
"""
super().__init__()
if pretrained:
self.model = CFG.encoders[encoder_model].from_pretrained(encoder_pretrained_name)
else:
self.model = CFG.encoders[encoder_model](config=CFG.configs[encoder_model]())
for p in self.model.parameters():
p.requires_grad = trainable
# Using the CLS token hidden representation as the sentence's embedding
self.target_token_idx = 0
def forward(self, input_ids, attention_mask):
"""
Forwards and calculates embeddings of the input using the attention mask.
Parameters:
-----------
input_ids: input ids (output of a tokenizer)
attention_mask: attention masks (padding tokens, for example, are masked out)
Returns:
--------
the embedding of the CLS (or target) token of the encoder's last hidden state
"""
output = self.model(input_ids=input_ids, attention_mask=attention_mask)
last_hidden_state = output.last_hidden_state
return last_hidden_state[:, self.target_token_idx, :]
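# Illustrative usage sketch (assumptions, not part of the module's API): this
# assumes CFG.encoders maps a key such as "bert" to a transformers model class
# (e.g. transformers.BertModel) and that a matching tokenizer exists; the model
# name and key below are hypothetical placeholders.
#
#   from transformers import AutoTokenizer
#   tokenizer = AutoTokenizer.from_pretrained("some-persian-bert")        # hypothetical checkpoint
#   encoder = TextEncoder(
#       encoder_model="bert",                                             # hypothetical key in CFG.encoders
#       encoder_pretrained_name="some-persian-bert",
#       pretrained=True,
#       trainable=False,
#   )
#   batch = tokenizer(["a short poem line"], padding=True, return_tensors="pt")
#   cls_emb = encoder(batch["input_ids"], batch["attention_mask"])
#   # cls_emb has shape (batch_size, hidden_size), e.g. (1, 768) for a BERT-base backbone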
class ProjectionHead(nn.Module):
"""
Projection head used to project embeddings from each encoder to a shared embedding space
...
Attributes:
-----------
projection : torch.nn.Linear
The main dense (linear) projection, from the encoder's embedding dim to the shared projection dim
gelu: torch.nn.GELU
activation function
fc: torch.nn.Linear
a dense layer after projection (projection_dim to projection_dim)
dropout: torch.nn.Dropout
dropout after fc
layer_norm: torch.nn.LayerNorm
layer norm after dropout
Methods:
--------
forward(x)
returns projected embeddings of x (the encoder's output embeddings)
__init__()
creates the projection head
"""
def __init__(
self,
embedding_dim,
projection_dim=CFG.projection_dim,
dropout=CFG.dropout
):
"""
Creates the projection head used after an encoder.
Parameters:
-----------
embedding_dim: int
dimension of the output embeddings of the encoder.
projection_dim: int, optional
dimension to project embeddings to.
dropout: float, optional
fraction of the fc layer's output to be zeroed.
"""
super().__init__()
self.projection = nn.Linear(embedding_dim, projection_dim)
self.gelu = nn.GELU()
self.fc = nn.Linear(projection_dim, projection_dim)
self.dropout = nn.Dropout(dropout)
self.layer_norm = nn.LayerNorm(projection_dim)
def forward(self, x):
"""
Forwards and calculates projected embeddings from encoder embeddings.
Parameters:
-----------
x: input (of shape (batch_size, embedding_dim))
the output embedding of this projection head's encoder
Returns:
--------
the embeddings in a shared embedding space (of shape (batch_size, projection_dim))
"""
projected = self.projection(x)  # main projection layer
x = self.gelu(projected)
x = self.fc(x)
x = self.dropout(x)
# the projected outputs are added to x as a residual connection
x = x + projected
x = self.layer_norm(x)
return x
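# Minimal sketch of the projection head in isolation (dimensions are
# illustrative, not the project's config values): encoder embeddings are
# projected to the shared space, with a residual connection around the
# gelu -> fc -> dropout branch followed by layer norm.
#
#   head = ProjectionHead(embedding_dim=768, projection_dim=256, dropout=0.1)
#   dummy = torch.randn(4, 768)       # (batch_size, embedding_dim)
#   shared = head(dummy)              # (batch_size, projection_dim) == (4, 256)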
class ImageEncoder(nn.Module):
"""
Image encoder used in CLIPModel
...
Attributes:
-----------
model : a torch.nn.Module model from timm (pytorch-image-models)
The image encoder model
Methods:
--------
forward(x)
returns model embeddings of x (batch of images)
__init__()
creates the encoder model using timm and loads fine-tuned model's state dict if needed.
also freezes the model if it's not trainable.
"""
def __init__(
self, pretrained, trainable, model_name=CFG.image_encoder_model
):
"""
creates the encoder model using timm and loads fine-tuned model's state dict if needed.
Also freezes the model if it's not trainable.
Parameters:
-----------
pretrained: bool
if pretrained=True, load timm's pretrained weights, overridden by the fine-tuned weights saved at
CFG.image_encoder_weights_load_path if that path is set. else create a fresh untrained model.
trainable: bool
if trainable=False, the model's weights will be frozen.
model_name: str
image encoder model name used as input to timm.create_model.
"""
super().__init__()
self.model = timm.create_model(
model_name, pretrained=pretrained, num_classes=0, global_pool="avg"
)
if pretrained and CFG.image_encoder_weights_load_path:
self.model.load_state_dict(torch.load(CFG.image_encoder_weights_load_path, map_location=CFG.device))
for p in self.model.parameters():
p.requires_grad = trainable
def forward(self, x):
"""
Forwards and calculates embeddings of the input.
Parameters:
-----------
x: input (batch of transformed images)
Returns:
--------
embeddings of the model for the input (of shape (batch_size, image_embedding))
"""
return self.model(x)
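# Lightweight smoke test, a sketch under assumptions: "resnet18" is an arbitrary
# timm architecture chosen for the demo and the dimensions are illustrative; the
# project's real settings live in config.py.
if __name__ == "__main__":
    # Build an untrained image encoder (pretrained=False avoids any weight download).
    image_encoder = ImageEncoder(pretrained=False, trainable=False, model_name="resnet18")
    images = torch.randn(2, 3, 224, 224)             # dummy batch of 2 RGB images
    image_emb = image_encoder(images)                # (2, 512) for resnet18 with avg pooling
    # Project the image embeddings into a shared space of illustrative size 256.
    head = ProjectionHead(embedding_dim=image_emb.shape[1], projection_dim=256, dropout=0.1)
    shared_emb = head(image_emb)                     # (2, 256)
    print("image embeddings:", tuple(image_emb.shape))
    print("projected embeddings:", tuple(shared_emb.shape))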