| | |
| | |
| | |
| | |
| | |
| | |
| | import torch |
| | from transformers import PreTrainedModel, PretrainedConfig |
| | from src.model import SegmentationNetwork |
| | from src.model.config import ModelConfig, TransformerConfig, CoSeNetConfig |
| |
|
| |
|
class SentenceCoseNetConfig(PretrainedConfig):
    """
    Hyperparameter container for `SentenceCoseNet`.

    Implements the Hugging Face `PretrainedConfig` interface so the
    model can be serialized, reloaded, and shared through the Hub.

    Attributes:
        model_type (str):
            Registry identifier used by Hugging Face auto-classes.
        vocab_size (int):
            Tokenizer vocabulary size.
        emb_dim (int):
            Token-embedding dimensionality.
        seq_len (int):
            Maximum supported input sequence length.
        dropout (float):
            Dropout probability for Transformer blocks.
        valid_padding (bool):
            Whether padding tokens count as valid positions.
        cosenet (dict):
            Configuration of the cosine-similarity network head.
        transformers (list[dict]):
            One configuration dict per Transformer encoder block.
    """

    model_type = "sentence_cosenet"

    def __init__(
        self,
        vocab_size: int = 32768,
        emb_dim: int = 256,
        seq_len: int = 382,
        dropout: float = 0.0,
        valid_padding: bool = True,
        cosenet: dict | None = None,
        transformers: list | None = None,
        **kwargs,
    ):
        """
        Build a SentenceCoseNet configuration.

        Args:
            vocab_size:
                Tokenizer vocabulary size.
            emb_dim:
                Token-embedding dimension.
            seq_len:
                Maximum number of tokens per input sequence.
            dropout:
                Dropout probability used throughout the network.
            valid_padding:
                Whether padded tokens should be considered valid.
            cosenet:
                Optional configuration dict for the cosine-similarity
                head; a default is used when omitted (or falsy).
            transformers:
                Optional list of per-block Transformer configuration
                dicts; two default pre-norm blocks are used when
                omitted (or falsy).
            **kwargs:
                Forwarded to `PretrainedConfig`.
        """
        super().__init__(**kwargs)

        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.seq_len = seq_len
        self.dropout = dropout
        self.valid_padding = valid_padding

        # Fall back to the default CoSeNet head when no (truthy)
        # configuration is supplied.
        if cosenet:
            self.cosenet = cosenet
        else:
            self.cosenet = {"trainable": True, "init_scale": 5.0}

        # Default encoder stack: two identical pre-norm blocks.
        # Each entry gets its own dict instance to avoid shared state.
        default_block = {
            "attention_heads": 16,
            "feed_forward_multiplier": 8,
            "dropout": 0.0,
            "pre_normalize": True,
        }
        if transformers:
            self.transformers = transformers
        else:
            self.transformers = [dict(default_block) for _ in range(2)]

        # Aliases expected by parts of the Hugging Face ecosystem.
        self.hidden_size = emb_dim
        self.max_position_embeddings = seq_len
| |
|
| |
|
class SentenceCoseNet(PreTrainedModel):
    """
    Sentence-level encoder model based on CoseNet.

    Wraps a custom PyTorch `SegmentationNetwork` and exposes it as a
    Hugging Face `PreTrainedModel`, enabling interoperability with the
    Transformers ecosystem.

    The model is intended for:
        - Sentence embeddings
        - Semantic search
        - Information retrieval
        - Similarity learning
    """

    config_class = SentenceCoseNetConfig
    base_model_prefix = "cosenet"

    def __init__(self, config: SentenceCoseNetConfig):
        """
        Initialize the SentenceCoseNet model.

        Args:
            config:
                Instance of `SentenceCoseNetConfig` containing
                model hyperparameters.
        """
        super().__init__(config)

        # Core network built from the internal (non-HF) config.
        self.model = SegmentationNetwork(self.to_model_config(config))

        # Hugging Face weight init / tying hook.
        self.post_init()

        # The wrapper defaults to inference mode; callers that train
        # must switch back with `.train()`.
        self.model.eval()

    def encode(
        self,
        input_ids: torch.Tensor,
        attention_mask=None
    ) -> torch.Tensor:
        """
        Encode input token sequences into contextualized embeddings.

        Performs embedding lookup, positional encoding, and
        Transformer-based contextualization, returning token-level
        representations.

        Args:
            input_ids:
                Tensor of token IDs with shape
                `(batch_size, sequence_length)` or
                `(batch_size, sentences, sequence_length)`.
            attention_mask:
                Optional mask with 1 for valid and 0 for padded
                positions, same leading shape as `input_ids`.

        Returns:
            torch.Tensor:
                Contextualized token embeddings with shape
                `(..., sequence_length, emb_dim)`.

        Raises:
            ValueError:
                If `input_ids` is neither 2-D nor 3-D.
        """
        self.model.task = 'token_encoding'
        # Delegate the shape dispatch to `call` instead of duplicating
        # it here (the previous copy-paste body could silently drift).
        return self.call(input_ids, attention_mask)

    def get_sentence_embedding(
        self,
        input_ids: torch.Tensor,
        attention_mask=None,
        normalize: bool = False,
    ) -> torch.Tensor:
        """
        Compute sentence embeddings for zero-shot transfer and
        information retrieval.

        Args:
            input_ids (torch.Tensor):
                Tensor of shape (B, T) or (B, S, T).
            attention_mask (torch.Tensor, optional):
                Boolean or binary mask matching `input_ids`.
            normalize (bool, optional):
                Whether to L2-normalize the output embeddings.

        Returns:
            torch.Tensor:
                Sentence embeddings of shape (B, D) (or (B, S, D)
                for 3-D input).
        """
        self.model.task = 'sentence_encoding'
        output = self.call(input_ids, attention_mask)

        if normalize:
            output = torch.nn.functional.normalize(output, p=2, dim=-1)

        return output

    def similarity(self, embeddings_1: torch.Tensor, embeddings_2: torch.Tensor) -> torch.Tensor:
        """
        Compute cosine similarity scores between two sets of embeddings.

        Args:
            embeddings_1 (torch.Tensor):
                Tensor of shape (B, S, D) containing the first set of
                embeddings concatenated along the first dimension.
            embeddings_2 (torch.Tensor):
                Tensor of shape (B, S, D) containing the second set of
                embeddings concatenated along the first dimension.

        Returns:
            torch.Tensor:
                Similarity scores of shape (B, S).
        """
        # Pair the two sets along a new axis so the distance layer sees
        # a (…, 2, D) stack, then average the two off-diagonal scores
        # to make the result symmetric.
        embeddings = torch.stack([embeddings_1, embeddings_2], dim=-2)
        embeddings = self.model.distance_layer(embeddings)
        return (embeddings[..., 0, 1] + embeddings[..., 1, 0]) / 2

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask=None,
        candidate_mask=None,
        **kwargs,
    ):
        """
        Forward pass of the SentenceCoseNet model (segmentation task).

        Delegates execution to the underlying `SegmentationNetwork`.

        Args:
            input_ids:
                Tensor of token IDs with shape
                `(batch_size, sequence_length)`.
            attention_mask:
                Optional attention mask tensor.
            candidate_mask:
                Optional mask indicating candidate segments or spans.
            **kwargs:
                Additional arguments forwarded to the core model.

        Returns:
            Model-specific output as produced by `SegmentationNetwork`.
        """
        self.model.task = 'segmentation'
        return self.model(
            x=input_ids,
            mask=attention_mask,
            candidate_mask=candidate_mask,
            **kwargs,
        )

    def call(self, input_ids: torch.Tensor, attention_mask=None) -> torch.Tensor:
        """
        Internal dispatcher for 2-D and 3-D inputs (task already selected).

        Args:
            input_ids:
                Tensor of token IDs with shape
                `(batch_size, sequence_length)` or
                `(batch_size, sentences, sequence_length)`.
            attention_mask:
                Optional attention mask with the same leading shape.

        Returns:
            torch.Tensor:
                Core-model output; a 2-D input's singleton sentence
                axis is squeezed back out.

        Raises:
            ValueError:
                If `input_ids` is neither 2-D nor 3-D.
        """
        if input_ids.dim() == 2:
            # Promote (B, T) to (B, 1, T) for the core model, then
            # squeeze the sentence axis back out of the result.
            x = input_ids.int().unsqueeze(1)
            mask = attention_mask.unsqueeze(1) if attention_mask is not None else None
            output = self.model(x=x, mask=mask).squeeze(1)
        elif input_ids.dim() == 3:
            x = input_ids.int()
            output = self.model(x=x, mask=attention_mask)
        else:
            raise ValueError("Input tensor must be of shape (Batch, Tokens) or (Batch, Sentences, Tokens).")
        return output

    @staticmethod
    def to_model_config(config: SentenceCoseNetConfig) -> ModelConfig:
        """
        Convert a Hugging Face config to the internal `ModelConfig`.

        Args:
            config:
                The Hugging Face-facing configuration.

        Returns:
            ModelConfig:
                Internal configuration consumed by `SegmentationNetwork`.
        """
        mc = ModelConfig()

        mc.vocab_size = config.vocab_size
        mc.model_dim = config.emb_dim
        mc.valid_padding = config.valid_padding

        # NOTE(review): `config.dropout` and `config.seq_len` are not
        # forwarded here — dropout appears per-block in `transformers`
        # configs; confirm seq_len is handled inside ModelConfig.
        mc.cosenet = CoSeNetConfig(**config.cosenet)
        mc.transformers = [
            TransformerConfig(**cfg)
            for cfg in config.transformers
        ]

        return mc
| | |
| | |
| | |
| |
|