Upload AMPLIFY

Browse files

Files changed (7) hide show

README.md +199 -0
amplify.py +347 -0
config.json +37 -0
model.safetensors +3 -0
rmsnorm.py +34 -0
rotary.py +80 -0
tokenizer.py +133 -0

README.md ADDED Viewed

	@@ -0,0 +1,199 @@

+---
+library_name: transformers
+tags: []
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed to make further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]

amplify.py ADDED Viewed

	@@ -0,0 +1,347 @@

+# From https://stackoverflow.com/a/23689767
+# From https://github.com/pytorch/pytorch/issues/97899
+# From https://github.com/facebookresearch/llama/blob/main/llama/model.py
+import yaml
+import safetensors
+import torch
+from torch import nn
+from torch.nn.functional import scaled_dot_product_attention
+from flash_attn.flash_attn_interface import flash_attn_varlen_func
+from xformers.ops import SwiGLU
+from .rmsnorm import RMSNorm
+from .rotary import precompute_freqs_cis, apply_rotary_emb
+from .tokenizer import ProteinTokenizer
+from transformers import PreTrainedModel, PretrainedConfig
+from transformers.modeling_outputs import MaskedLMOutput
+class DotDict(dict):
+    """Dictionary that supports the dot notation to access attributes (similarly to HuggingFace)."""
+    __getattr__ = dict.get
+    __setattr__ = dict.__setitem__
+    __delattr__ = dict.__delitem__
+class AMPLIFYConfig(PretrainedConfig):
+    model_type = "AMPLIFY"
+    # All config parameters must have a default value.
+    def __init__(
+        self,
+        hidden_size: int = 960,
+        num_hidden_layers: int = 32,
+        num_attention_heads: int = 15,
+        intermediate_size: int = 3840,
+        dropout_prob: float = 0,
+        embedding_init_range: float = 0.02,
+        decoder_init_range: float = 0.02,
+        rms_norm: bool = True,
+        norm_eps: float = 1e-05,
+        hidden_act: str = "SwiGLU",
+        layer_norm_after_embedding: bool = False,
+        layer_norm_before_last_layer: bool = True,
+        vocab_size: int = 27,
+        ffn_bias: bool = False,
+        att_bias: bool = False,
+        pad_token_id: int = 0,
+        max_length: int = 2048,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.dropout_prob = dropout_prob
+        self.embedding_init_range = embedding_init_range
+        self.decoder_init_range = decoder_init_range
+        self.rms_norm = rms_norm
+        self.norm_eps = norm_eps
+        self.hidden_act = hidden_act
+        self.layer_norm_after_embedding = layer_norm_after_embedding
+        self.layer_norm_before_last_layer = layer_norm_before_last_layer
+        self.vocab_size = vocab_size
+        self.ffn_bias = ffn_bias
+        self.att_bias = att_bias
+        self.pad_token_id = pad_token_id
+        self.max_length = max_length
+class EncoderBlock(nn.Module):
+    """Transformer encoder block."""
+    def __init__(self, config: AMPLIFYConfig):
+        """Initialize a EncoderBlock.
+        Args:
+            hidden_size (int): _description_
+            num_attention_heads (int): _description_
+            intermediate_size (int, optional): _description_. Defaults to 2048.
+            dropout_prob (float, optional): _description_. Defaults to 0.1.
+            activation (str, optional): _description_. Defaults to "relu".
+            rms_norm (bool, optional): _description_. Defaults to True.
+            norm_eps (float, optional): _description_. Defaults to 1e-5.
+            pad_token_id (int, optional): _description_. Defaults to 0.
+            max_length (int, optional): _description_. Defaults to 2048.
+            ffn_bias (bool, optional): _description_. Defaults to False.
+            att_bias (bool, optional): _description_. Defaults to False.
+        """
+        super().__init__()
+        self.config = config
+        self.d_head = config.hidden_size // config.num_attention_heads
+        # Attention
+        self.q = nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size, bias=config.att_bias)
+        self.k = nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size, bias=config.att_bias)
+        self.v = nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size, bias=config.att_bias)
+        self.wo = nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size, bias=config.att_bias)
+        self.resid_dropout = nn.Dropout(config.dropout_prob)
+        # Feedforward network
+        act = config.hidden_act.lower()
+        if act == "swiglu":
+            # To keep the number of parameters and the amount of computation constant, we reduce the number of
+            # hidden units by a factor of 2/3 (https://arxiv.org/pdf/2002.05202.pdf) and make it a multiple of 8 to
+            # avoid RuntimeError due to misaligned operand
+            multiple_of = 8
+            intermediate_size = int(2 * config.intermediate_size / 3)
+            intermediate_size = multiple_of * ((intermediate_size + multiple_of - 1) // multiple_of)
+            self.ffn = SwiGLU(config.hidden_size, intermediate_size, config.hidden_size, bias=config.ffn_bias)
+        elif act == "relu":
+            self.ffn = nn.Sequential(
+                nn.Linear(config.hidden_size, config.intermediate_size, bias=config.ffn_bias),
+                nn.ReLU(),
+                nn.Linear(config.intermediate_size, config.hidden_size, bias=config.ffn_bias),
+            )
+        elif act == "gelu":
+            self.ffn = nn.Sequential(
+                nn.Linear(config.hidden_size, config.intermediate_size, bias=config.ffn_bias),
+                nn.GELU(),
+                nn.Linear(config.intermediate_size, config.hidden_size, bias=config.ffn_bias),
+            )
+        else:
+            raise ValueError(f"Unsupported hidden_act: {config.hidden_act}")
+        self.attention_norm = (
+            RMSNorm(config.hidden_size, config.norm_eps) if config.rms_norm else nn.LayerNorm(config.hidden_size, config.norm_eps)
+        )
+        self.ffn_norm = (
+            RMSNorm(config.hidden_size, config.norm_eps) if config.rms_norm else nn.LayerNorm(config.hidden_size, config.norm_eps)
+        )
+        self.ffn_dropout = nn.Dropout(config.dropout_prob)
+    def forward(
+        self,
+        x: torch.Tensor,
+        pad_mask: torch.Tensor,
+        freqs_cis: torch.Tensor,
+        output_attentions: bool,
+        max_seqlen: int = None,
+        cu_seqlens: torch.Tensor = None,
+    ):
+        attn, contact = self._att_block(self.attention_norm(x), pad_mask, freqs_cis, output_attentions, max_seqlen, cu_seqlens)
+        x = x + attn
+        x = x + self._ff_block(self.ffn_norm(x))
+        return x, contact
+    def _att_block(
+        self,
+        x: torch.Tensor,
+        pad_mask: torch.Tensor,
+        freqs_cis: torch.Tensor,
+        output_attentions: bool,
+        max_seqlen: int = None,
+        cu_seqlens: torch.Tensor = None,
+    ):
+        batch_size, seq_len, _ = x.shape
+        xq, xk, xv = self.q(x), self.k(x), self.v(x)
+        # Reshape for rotary embeddings
+        xq = xq.view(batch_size, seq_len, self.config.num_attention_heads, self.d_head)
+        xk = xk.view(batch_size, seq_len, self.config.num_attention_heads, self.d_head)
+        xv = xv.view(batch_size, seq_len, self.config.num_attention_heads, self.d_head)
+        xq, xk = apply_rotary_emb(xq, xk, freqs_cis)
+        # Attn block
+        attn_weights = None
+        # Flash attention if the tensors are packed
+        if cu_seqlens is not None:
+            attn = flash_attn_varlen_func(
+                q=xq.squeeze(0),
+                k=xk.squeeze(0),
+                v=xv.squeeze(0),
+                cu_seqlens_q=cu_seqlens,
+                cu_seqlens_k=cu_seqlens,
+                max_seqlen_q=max_seqlen,
+                max_seqlen_k=max_seqlen,
+                dropout_p=0.0,
+                causal=False,
+            )
+        # Eager attention if attention weights are needed in the output
+        elif output_attentions:
+            attn_weights = xq.permute(0, 2, 1, 3) @ xk.permute(0, 2, 3, 1) / (xq.size(-1) ** 0.5)
+            if pad_mask is not None:
+                attn_weights = attn_weights + pad_mask.type(attn_weights.dtype)
+            attn_weights = attn_weights.softmax(-1)
+            attn = attn_weights @ xv.permute(0, 2, 1, 3)
+            attn = attn.transpose(1, 2)
+        # SDPA will pick an appropriate backend otherwise
+        else:
+            attn = scaled_dot_product_attention(
+                query=xq.transpose(1, 2),
+                key=xk.transpose(1, 2),
+                value=xv.transpose(1, 2),
+                attn_mask=pad_mask,
+                dropout_p=0,
+            ).transpose(1, 2)
+        attn_scores = self.wo(attn.reshape(batch_size, seq_len, self.config.num_attention_heads * self.d_head))
+        return (self.resid_dropout(attn_scores), attn_weights)
+    def _ff_block(self, x: torch.Tensor):
+        return self.ffn_dropout(self.ffn(x))
+class AMPLIFYPreTrainedModel(PreTrainedModel):
+    config_class = AMPLIFYConfig
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            module.weight.data.uniform_(-self.config.decoder_init_range, self.config.decoder_init_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.uniform_(-self.config.embedding_init_range, self.config.embedding_init_range)
+class AMPLIFY(AMPLIFYPreTrainedModel):
+    """The main model class.
+    Args:
+       config (amplify.model.amplify.AMPLIFYConfig): model configuration, usually defined from the Hydra configuration.
+    """
+    def __init__(self, config: AMPLIFYConfig, **kwargs):
+        super().__init__(config)
+        self.config = config
+        self.encoder = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        if config.layer_norm_after_embedding:
+            self.layer_norm_1 = (
+                RMSNorm(config.hidden_size, config.norm_eps) if config.rms_norm else nn.LayerNorm(config.hidden_size, config.norm_eps)
+            )
+        self.transformer_encoder = nn.ModuleList()
+        for _ in range(config.num_hidden_layers):
+            self.transformer_encoder.append(EncoderBlock(config))
+        if config.layer_norm_before_last_layer:
+            self.layer_norm_2 = (
+                RMSNorm(config.hidden_size, config.norm_eps) if config.rms_norm else nn.LayerNorm(config.hidden_size, config.norm_eps)
+            )
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
+        freqs_cis = precompute_freqs_cis(config.hidden_size // config.num_attention_heads, config.max_length)
+        # Ensures freqs_cis is moved to the same devices as the model. Non-persistent buffers are not saved in the state_dict.
+        self.register_buffer("freqs_cis", freqs_cis, persistent=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    @classmethod
+    def load(cls, checkpoint_path: str, config_path: str):
+        with open(config_path, "r") as file:
+            cfg = yaml.safe_load(file)
+        model = AMPLIFY(AMPLIFYConfig(**cfg["model"], **cfg["tokenizer"]))
+        if checkpoint_path.endswith(".safetensors"):
+            state_dict = safetensors.torch.load_file(checkpoint_path)
+        elif checkpoint_path.endswith(".pt"):
+            state_dict = torch.load(checkpoint_path)
+        else:
+            raise ValueError(f"Expected checkpoint to be a `.pt` or `.safetensors` file.")
+        model.load_state_dict(state_dict)
+        cfg["tokenizer"]["vocab_path"] = "/home/mila/l/lola.lebreton/AMPLIFY/conf/tokenizer/amplify_vocab.txt"
+        tokenizer = ProteinTokenizer(**cfg["tokenizer"])
+        return model, tokenizer
+    def forward(
+        self,
+        src,
+        position_ids: torch.Tensor = None,
+        max_seqlen: int = None,
+        cu_seqlens: torch.Tensor = None,
+        pad_mask=None,
+        output_hidden_states=False,
+        output_attentions=False,
+    ):
+        # Initialize
+        hidden_states, attentions = [], []
+        # We will output all the hidden_states that have an index higher than output_hidden_index
+        if type(output_hidden_states) == bool and not output_hidden_states:
+            output_hidden_index = self.config.num_hidden_layers + 1
+        elif type(output_hidden_states) == int:
+            output_hidden_index = output_hidden_states
+        else:
+            output_hidden_index = 0
+        # Expand and repeat: (Batch, Length) -> (Batch, Heads, Length, Length)
+        if pad_mask is not None:
+            pad_mask = pad_mask.unsqueeze(1).unsqueeze(1).repeat(1, self.config.num_attention_heads, pad_mask.size(-1), 1)
+            if output_attentions:
+                pad_mask = torch.where(pad_mask == 1, float(0.0), float("-inf"))
+        # Checks to be done if inputs are packed sequences
+        if cu_seqlens is not None:
+            assert not output_attentions, "Output attentions is not supported when sequences are packed."
+            assert max_seqlen is not None, "Missing max_seqlen. It must be provided when cu_seqlens are not None."
+            assert src.shape[0] == 1, "Cumulative sequence lengths are provided but src are not packed."
+            assert src.is_cuda, "Packing uses an implementation of flash-attention and is only supported on GPU."
+            # Create position_ids if not provided
+            if position_ids is None:
+                position_ids = torch.stack([torch.arange(0, seqlen, device=src.device) for seqlen in cu_seqlens], dim=0)
+        # RoPE
+        if position_ids is not None:
+            freqs_cis = self.freqs_cis[position_ids]
+        else:
+            freqs_cis = self.freqs_cis[: src.shape[1]]
+        # Embedding
+        x = self.encoder(src)
+        if self.config.layer_norm_after_embedding:
+            x = self.layer_norm_1(x)
+        # Transformer encoder
+        for idx, layer in enumerate(self.transformer_encoder):
+            x, attn = layer(x, pad_mask, freqs_cis, output_attentions, max_seqlen, cu_seqlens)
+            if idx >= output_hidden_index:
+                hidden_states.append(x)
+            if output_attentions:
+                attentions.append(attn)
+        # Classification head with layer norm
+        logits = self.decoder(self.layer_norm_2(x) if self.config.layer_norm_before_last_layer else x)
+        # Return logits or the output of the last hidden layer
+        return MaskedLMOutput(logits=logits, hidden_states=hidden_states, attentions=attentions)

config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "_name_": "PLM",
+  "architectures": [
+    "AMPLIFY"
+  ],
+  "att_bias": false,
+  "auto_map": {
+    "AutoConfig": "amplify.AMPLIFYConfig",
+    "AutoModel": "amplify.AMPLIFY"
+  },
+  "bos_token_id": 3,
+  "decoder_init_range": 0.02,
+  "dropout_prob": 0,
+  "embedding_init_range": 0.02,
+  "eos_token_id": 4,
+  "ffn_bias": false,
+  "hidden_act": "SwiGLU",
+  "hidden_size": 640,
+  "intermediate_size": 2560,
+  "layer_norm_after_embedding": false,
+  "layer_norm_before_last_layer": true,
+  "mask_token_id": 2,
+  "max_length": 2048,
+  "model_type": "AMPLIFY",
+  "norm_eps": 1e-05,
+  "num_attention_heads": 10,
+  "num_hidden_layers": 24,
+  "other_special_token_ids": null,
+  "pad_token_id": 0,
+  "pre_activation_layer_norm": true,
+  "rms_norm": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.46.3",
+  "unk_token_id": 1,
+  "vocab_path": "conf/tokenizer/plm_vocab.txt",
+  "vocab_size": 27
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a2375f1f54cbe00bdbe27eedcd039c92d12f165720c0349bc582a6eb42c099ce
+size 473126988

rmsnorm.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import torch
+from torch import nn
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        """
+        Initialize the RMSNorm normalization layer.
+        Args:
+            dim (int): The dimension of the input tensor.
+            eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
+        Attributes:
+            eps (float): A small value added to the denominator for numerical stability.
+            weight (nn.Parameter): Learnable scaling parameter.
+        """
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+    def forward(self, x):
+        """
+        Forward pass through the RMSNorm layer.
+        Args:
+            x (torch.Tensor): The input tensor.
+        Returns:
+            torch.Tensor: The output tensor after applying RMSNorm.
+        """
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) * self.weight

rotary.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import torch
+from typing import Tuple
+def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
+    """
+    Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
+    This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
+    and the end index 'end'. The 'theta' parameter scales the frequencies.
+    The returned tensor contains complex values in complex64 data type.
+    Args:
+        dim (int): Dimension of the frequency tensor.
+        end (int): End index for precomputing frequencies.
+        theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
+    Returns:
+        torch.Tensor: Precomputed frequency tensor with complex exponentials.
+    """
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+    t = torch.arange(end, device=freqs.device)  # type: ignore
+    freqs = torch.outer(t, freqs).float()  # type: ignore
+    return torch.polar(torch.ones_like(freqs), freqs)  # complex64
+def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
+    """
+    Reshape frequency tensor for broadcasting it with another tensor.
+    This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
+    for the purpose of broadcasting the frequency tensor during element-wise operations.
+    Args:
+        freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
+        x (torch.Tensor): Target tensor for broadcasting compatibility.
+    Returns:
+        torch.Tensor: Reshaped frequency tensor.
+    Raises:
+        AssertionError: If the frequency tensor doesn't match the expected shape.
+        AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions.
+    """
+    ndim = x.ndim
+    assert 0 <= 1 < ndim
+    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
+    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
+    return freqs_cis.view(*shape)
+def apply_rotary_emb(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply rotary embeddings to input tensors using the given frequency tensor.
+    This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
+    frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
+    is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
+    returned as real tensors.
+    Args:
+        xq (torch.Tensor): Query tensor to apply rotary embeddings.
+        xk (torch.Tensor): Key tensor to apply rotary embeddings.
+        freqs_cis (torch.Tensor): Precomputed frequency tensor for complex exponentials.
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
+    """
+    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+    freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
+    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
+    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
+    return xq_out.type_as(xq), xk_out.type_as(xk)

tokenizer.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import torch
+from typing import List, Optional, Union
+from torch import Tensor
+class ProteinTokenizer(object):
+    def __init__(
+        self,
+        vocab_path: str,
+        pad_token_id: int,
+        mask_token_id: int,
+        bos_token_id: int,
+        eos_token_id: int,
+        unk_token_id: int,
+        other_special_token_ids: Optional[List[int]],
+        **kwargs,
+    ):
+        """Vocabulary comprising the amino acids, and the special tokens <unk>, <bos>, <eos>, <pad> and <mask>.
+        Args:
+            vocab_path (str): Path to the vocabulary file to load.
+            pad_token_id (int): <PAD> token index.
+            mask_token_id (int): <MASK> token index.
+            bos_token_id (int): <BOS> token index.
+            eos_token_id (int): <EOS> token index.
+            unk_token_id (int): <UNK> token index.
+            other_special_token_ids (Optional[List[int]]): List of additional special tokens.
+        """
+        self._token_to_id = dict()
+        self._id_to_token = dict()
+        with open(vocab_path, "r") as vocab_file:
+            for i, token in enumerate(vocab_file):
+                token = token.strip()
+                self._token_to_id[token] = i
+                self._id_to_token[i] = token
+        # Padding token
+        self.pad_token_id = pad_token_id
+        self.pad_token = self._token_to_id.get(pad_token_id)
+        # Beginning and end of sequence
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.bos_token = self._token_to_id.get(bos_token_id)
+        self.eos_token = self._token_to_id.get(eos_token_id)
+        # Mask token
+        self.mask_token_id = mask_token_id
+        self.mask_token = self._token_to_id.get(mask_token_id)
+        # Unknown token
+        self.unk_token_id = unk_token_id
+        self.unk_token = self._id_to_token.get(unk_token_id)
+        # Set of all special token indices
+        self.special_token_ids = set()
+        self.special_token_ids.add(pad_token_id)
+        self.special_token_ids.add(mask_token_id)
+        self.special_token_ids.add(bos_token_id)
+        self.special_token_ids.add(eos_token_id)
+        self.special_token_ids.add(unk_token_id)
+        if other_special_token_ids is not None:
+            self.special_token_ids.update(other_special_token_ids)
+    def __len__(self) -> int:
+        return len(self._token_to_id)
+    def token_to_id(self, token: str) -> int:
+        return self._token_to_id.get(token, self.unk_token_id)
+    def id_to_token(self, index: int) -> str:
+        return self._id_to_token.get(index, self.unk_token)
+    def encode(
+        self,
+        tokens: List[str],
+        max_length: Optional[int] = None,
+        add_special_tokens: bool = True,
+        random_truncate: bool = True,
+        **kwargs,
+    ) -> Union[List[int], Tensor]:
+        """Encodes a list of tokens into a list or tensor of token indices.
+        Args:
+            tokens (List[str]): Sequence of tokens to encode.
+            max_length (Optional[int], optional): Truncate the sequence to the specified length. Defaults to None.
+            add_special_tokens (bool, optional): Add special tokens <bos> and <eos> at the start and end.. Defaults to True.
+            random_truncate (bool, optional): Truncate the sequence to a random subsequence of if longer than truncate.
+            Defaults to True.
+        Returns:
+            Union[List[int], Tensor]: Token indices.
+        """
+        token_ids = list(map(self.token_to_id, tokens))
+        if add_special_tokens:
+            token_ids = [self.bos_token_id] + token_ids + [self.eos_token_id]
+        if max_length is not None and max_length < len(token_ids):
+            if random_truncate:
+                offset = int(torch.randint(0, len(token_ids) - max_length, (1,)).item())
+            else:
+                offset = 0
+            token_ids = token_ids[offset : offset + max_length]
+        return torch.as_tensor(token_ids, dtype=torch.long)
+    def decode(
+        self,
+        token_ids: List[int],
+        skip_special_tokens: bool = True,
+        **kwargs,
+    ) -> Union[List[str], str]:
+        """Decodes a list or tensor of token ids into a list or string of tokens.
+        Args:
+            token_ids (List[int]): Token indices to decode.
+            skip_special_tokens (bool, optional): Skip the special tokens <bos> and <eos> at the start and end.
+            Defaults to True.
+        Returns:
+            Union[List[str], str]: Protein.
+        """
+        if torch.is_tensor(token_ids):
+            token_ids = token_ids.tolist()
+        if skip_special_tokens:
+            if len(token_ids) > 0 and token_ids[0] in self.special_token_ids:
+                token_ids = token_ids[1:]
+            if len(token_ids) > 0 and token_ids[-1] in self.special_token_ids:
+                token_ids = token_ids[:-1]
+        tokens = " ".join(map(self.id_to_token, token_ids))
+        return tokens