Upload text_encoder.py with huggingface_hub
text_encoder.py +344 -0
text_encoder.py
ADDED
@@ -0,0 +1,344 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Text encoder implementation in PyTorch."""

import typing as t

import numpy as np
import sentencepiece as spm
import torch
from torch import nn
import torch.nn.functional as F


class Tokenizer(object):
  """A simple tokenizer using SentencePiece."""

  def __init__(self, tokenizer_path: str):
    self.sp = spm.SentencePieceProcessor(model_file=tokenizer_path)
    # Match tensorflow_text.SentencepieceTokenizer(add_bos=False, add_eos=False)
    self.sp.SetEncodeExtraOptions("")
    # Explicitly disable BOS/EOS to match the reference Colab implementation.
    self._add_bos = False
    self._add_eos = False

  def tokenize(self, input_texts, max_len=64):
    if isinstance(input_texts, str):
      input_texts = [input_texts]
    batch_ids = [
        self.sp.encode(t.lower(), add_bos=self._add_bos, add_eos=self._add_eos)
        for t in input_texts
    ]
    tokens = np.zeros((len(batch_ids), max_len), dtype=np.int64)
    for i, ids in enumerate(batch_ids):
      length = min(len(ids), max_len)
      tokens[i, :length] = ids[:length]
    is_padding = (tokens == 0).astype(np.int32)
    return tokens, is_padding
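
# Illustrative usage of `Tokenizer` (editorial sketch; the SentencePiece model
# path below is a placeholder, not a file shipped with this repo):
#
#   tokenizer = Tokenizer("path/to/tokenizer.model")
#   tokens, paddings = tokenizer.tokenize(["a chest x-ray"], max_len=64)
#   # tokens: np.int64 array of shape (1, 64), zero-padded on the right.
#   # paddings: np.int32 array of the same shape, 1 at padded positions.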


class PositionalEmbedding(nn.Module):
  """Generates position embedding for a given 1-d sequence.

  Attributes:
    min_timescale: Start of the geometric index. Determines the periodicity of
      the added signal.
    max_timescale: End of the geometric index. Determines the frequency of the
      added signal.
    embedding_dim: Dimension of the embedding to be generated.
  """

  min_timescale: int = 1
  max_timescale: int = 10_000
  embedding_dim: int = 0

  def __init__(self, embedding_dim: int):
    super().__init__()
    self.embedding_dim = embedding_dim

  def __call__(self, seq_length: int = None, position: torch.tensor = None):
    """Generates a torch.tensor of sinusoids with different frequencies.

    Args:
      seq_length: an optional Python int defining the output sequence length.
        Not required if the `position` argument is specified.
      position: [B, seq_length], optional position for each token in the
        sequence, only required when the sequence is packed.

    Returns:
      [B, seqlen, D] if `position` is specified, else [1, seqlen, D]
    """
    if position is None:
      assert seq_length is not None
      # [1, seqlen]
      position = torch.arange(seq_length, dtype=torch.float32)[None, :]
    else:
      assert position.ndim == 2, position.shape

    num_timescales = self.embedding_dim // 2
    log_timescale_increment = torch.log(
        torch.tensor(float(self.max_timescale) / float(self.min_timescale))
    ) / torch.maximum(
        torch.tensor(num_timescales, dtype=torch.float32) - 1, torch.tensor(1)
    )
    inv_timescales = self.min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float32)
        * -log_timescale_increment
    )
    scaled_time = position[:, :, None] * inv_timescales[None, None, :]
    signal = torch.cat((torch.sin(scaled_time), torch.cos(scaled_time)), dim=2)
    # Zero-pad the last dimension so the output width matches `embedding_dim`
    # when it is odd.
    signal = F.pad(signal, (0, self.embedding_dim % 2, 0, 0, 0, 0))
    return signal
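
# Shape sketch for `PositionalEmbedding` (editorial example; the dimension and
# length below are arbitrary):
#
#   pos_emb = PositionalEmbedding(embedding_dim=768)
#   signal = pos_emb(seq_length=64)   # -> torch.Size([1, 64, 768])
#   # The first half of the last dim holds sines, the second half cosines.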


class MlpBlockWithMask(nn.Module):
  """Transformer MLP / feed-forward block that supports masking."""

  def __init__(
      self,
      mlp_dim: int,
      d_model: int,
      use_bias: bool = True,
      dtype: torch.dtype = torch.float32,
      activation_fn: nn.Module = nn.GELU,
  ):
    super().__init__()

    self.mlp_dim = mlp_dim
    self.d_model = d_model
    self.use_bias = use_bias
    self.dtype = dtype
    self.activation_fn = activation_fn

    self.c_fc = nn.Linear(
        in_features=self.d_model,
        out_features=self.mlp_dim,
        dtype=self.dtype,
        bias=self.use_bias,
    )
    self.c_proj = nn.Linear(
        in_features=self.mlp_dim,
        out_features=self.d_model,
        dtype=self.dtype,
        bias=self.use_bias,
    )

  def __call__(
      self, inputs: torch.Tensor, mlp_mask: torch.Tensor
  ) -> torch.Tensor:
    """Applies Transformer MlpBlock with mask module."""
    x = self.c_fc(inputs)
    x = self.activation_fn()(x)
    x = x * mlp_mask[..., None]  # First masking.
    x = self.c_proj(x)
    x = x * mlp_mask[..., None]  # Second masking.
    return x


class ResidualAttentionBlock(nn.Module):
  """Transformer residual attention block."""

  def __init__(
      self,
      d_model: int,
      n_head: int,
      mlp_dim: int,
      dtype: torch.dtype = torch.float32,
  ):
    super().__init__()
    self.d_model = d_model
    self.n_head = n_head
    self.mlp_dim = mlp_dim
    self.dtype = dtype

    self.attn = nn.MultiheadAttention(d_model, n_head, dtype=self.dtype)
    self.ln_1 = nn.LayerNorm(d_model, dtype=self.dtype)
    self.mlp = MlpBlockWithMask(
        self.mlp_dim,
        d_model,
        use_bias=True,
        dtype=self.dtype,
        activation_fn=nn.ReLU,
    )
    self.ln_2 = nn.LayerNorm(d_model, dtype=self.dtype)

  def attention(self, x: torch.Tensor, mask: torch.Tensor):
    attn_mask = (
        mask[:, None, None, :]
        .repeat(1, self.n_head, x.shape[0], 1)
        .flatten(0, 1)
    )
    attn_mask[attn_mask == 0] = float('-inf')
    attn_mask[attn_mask == 1] = 0
    return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask)[0]

  def forward(self, x: torch.Tensor, mask: torch.Tensor):
    x = x + self.attention(self.ln_1(x), mask.permute(1, 0))
    x = x + self.mlp(self.ln_2(x), mask)
    return x, mask
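
# Note on masking (editorial comment): inside `ResidualAttentionBlock.forward`,
# `mask` is [L, N] with 1.0 at valid tokens and 0.0 at padding. `attention`
# permutes it to [N, L] and converts it into an additive attention mask
# (0 at valid positions, -inf at padded keys), while `MlpBlockWithMask` zeroes
# the hidden activations at padded positions.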


class SequentialMultiInput(nn.Sequential):
  """Sequential module that can take multiple inputs."""

  def forward(self, *inputs):
    for module in self._modules.values():
      if isinstance(inputs, tuple):
        inputs = module(*inputs)
      else:
        inputs = module(inputs)
    return inputs


class Transformer(nn.Module):
  """Transformer implementation."""

  def __init__(
      self,
      width: int,
      layers: int,
      heads: int,
      mlp_dim: int,
      dtype: torch.dtype = torch.float32,
  ):
    super().__init__()
    self.width = width
    self.layers = layers
    self.heads = heads
    self.mlp_dim = mlp_dim
    self.dtype = dtype

    self.resblocks = SequentialMultiInput(*[
        ResidualAttentionBlock(self.width, self.heads, self.mlp_dim, self.dtype)
        for _ in range(self.layers)
    ])

  def forward(self, x: torch.Tensor, mask: torch.Tensor):
    return self.resblocks(x, mask)[0]


class GlobalAvgPooling(nn.Module):
  """Performs a simple global pooling over the input with optional paddings.

  Attributes:
    pooling_dims: A list of dims to perform pooling over.
    epsilon: Small constant added to the denominator to avoid division by zero
      when every position is padded.
  """

  pooling_dims: t.Sequence[int]
  epsilon: float = 1e-8

  def __init__(
      self, pooling_dims: t.Sequence[int], epsilon: float = 1e-8
  ):
    super().__init__()
    self.pooling_dims = pooling_dims
    self.epsilon = epsilon

    if not all([p_dims >= 0 for p_dims in self.pooling_dims]):
      raise ValueError('pooling_dims must be non-negative integers.')

  def __call__(
      self,
      inputs: torch.tensor,
      compatible_paddings: torch.tensor,
  ):
    """Applies global average spatial pooling to inputs.

    Args:
      inputs: An input tensor.
      compatible_paddings: paddings of inputs with shapes compatible with
        inputs, e.g. compatible_paddings with shape [B, 1] for inputs with
        shape [B, D].

    Returns:
      Output tensor with global pooling applied.
    """
    padded_value = torch.zeros_like(inputs)
    padded_value = torch.ones_like(inputs) * padded_value
    inputs = torch.where(compatible_paddings > 0, padded_value, inputs)
    valid_inputs = (
        torch.sum(
            1.0 - compatible_paddings,
            self.pooling_dims,
            keepdims=True,
            dtype=inputs.dtype,
        )
        + self.epsilon
    )
    inputs_sum = torch.sum(inputs, self.pooling_dims, keepdims=True)
    outputs = torch.divide(inputs_sum, valid_inputs).type(inputs.dtype)
    outputs = torch.squeeze(outputs, axis=self.pooling_dims)
    return outputs
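
# Worked example for `GlobalAvgPooling` (editorial sketch; values are
# arbitrary): averaging over the sequence axis while ignoring padded
# positions.
#
#   pool = GlobalAvgPooling(pooling_dims=[1])
#   x = torch.tensor([[[2.0], [4.0], [100.0]]])        # [B=1, L=3, D=1]
#   paddings = torch.tensor([[[0.0], [0.0], [1.0]]])   # last token is padding
#   pool(x, compatible_paddings=paddings)              # ~tensor([[3.0]])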


class TextEncoder(nn.Module):
  """Text encoder implementation."""

  def __init__(
      self,
      config: t.Dict[str, int],
      vocab_size: int,
      dtype: torch.dtype = torch.float32,
      scale_sqrt_depth: bool = True,
  ):
    super().__init__()
    self.vocab_size = vocab_size
    self.dtype = dtype
    self.scale_sqrt_depth = scale_sqrt_depth

    # The text tower layers are fixed, independent of the vision tower size.
    self.transformer_layers = config['num_layers']
    self.embedding_dim = config['hidden_size']
    self.transformer_width = config['hidden_size']
    self.mlp_dim = config['mlp_dim']
    self.transformer_heads = config['num_heads']

    self.token_embedding = nn.Embedding(
        self.vocab_size, self.embedding_dim, dtype=self.dtype
    )
    self.pos_embedder = PositionalEmbedding(embedding_dim=self.embedding_dim)
    self.transformer = Transformer(
        width=self.transformer_width,
        layers=self.transformer_layers,
        heads=self.transformer_heads,
        mlp_dim=self.mlp_dim,
        dtype=self.dtype,
    )
    self.pooling = GlobalAvgPooling(pooling_dims=[1])
    self.ln_final = nn.LayerNorm(self.transformer_width, dtype=self.dtype)

  def __call__(
      self,
      ids: torch.tensor,
      paddings: torch.tensor,
  ):
    """Applies TextEncoder module."""
    _, seq_length = ids.shape
    mask = (paddings == 0).type(torch.float32)
    mask = mask.permute(1, 0)  # NL -> LN
    x = self.token_embedding(ids)
    if self.scale_sqrt_depth:
      x = x * (self.embedding_dim**0.5)
    x = x + self.pos_embedder(seq_length=seq_length).to(x.device)
    x = x.permute(1, 0, 2)  # NLD -> LND
    x = self.transformer(x, mask)
    x = x.permute(1, 0, 2)  # LND -> NLD
    x = self.ln_final(x)
    x = self.pooling(x, compatible_paddings=paddings[:, :, None])
    return x
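
# End-to-end usage sketch (editorial example). The config keys below are the
# ones this module reads; the concrete values, vocab size, and tokenizer path
# are assumptions for illustration only:
#
#   config = {'num_layers': 12, 'hidden_size': 768, 'mlp_dim': 3072,
#             'num_heads': 12}
#   encoder = TextEncoder(config=config, vocab_size=32_000)
#   tokenizer = Tokenizer('path/to/tokenizer.model')
#   tokens, paddings = tokenizer.tokenize(['a photo of a cat'], max_len=64)
#   embeddings = encoder(
#       torch.from_numpy(tokens),
#       torch.from_numpy(paddings).to(torch.float32),
#   )
#   # embeddings: [batch, hidden_size] pooled text features.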