qfournier committed
Commit 746496b
1 Parent(s): 0db6ef6

initial commit

Files changed (9):
  1. README.md +117 -3
  2. amplify.py +238 -0
  3. config.json +38 -0
  4. model.safetensors +3 -0
  5. rmsnorm.py +34 -0
  6. rotary.py +80 -0
  7. special_tokens_map.json +7 -0
  8. tokenizer.json +154 -0
  9. tokenizer_config.json +58 -0
README.md CHANGED
@@ -1,3 +1,117 @@
- ---
- license: mit
- ---
+ ---
+ license: mit
+ datasets:
+ - drug-discovery/UR100P
+ language:
+ - en
+ tags:
+ - biology
+ ---
+
+ ## AMPLIFY
+
+ AMPLIFY is an efficient, state-of-the-art protein language model pre-trained using masked language modeling on UniRef100, OAS, and SCOP ([UR100P](https://huggingface.co/datasets/drug-discovery/UR100P)). AMPLIFY can generate residue and protein embeddings, suggest mutations, differentiate disordered proteins from non-protein sequences, and much more. AMPLIFY is available in two sizes, 120M and 350M parameters, with the `_base` models not extended beyond 512 residues (Stage 1). The model architecture and pre-training procedure are summarized below; for full details, please refer to the [accompanying paper](https://www.biorxiv.org/content/10.1101/2024.09.23.614603v1).
+
+ - [`AMPLIFY_350M`](https://huggingface.co/drug-discovery/AMPLIFY_350M)
+ - [`AMPLIFY_350M_base`](https://huggingface.co/drug-discovery/AMPLIFY_350M_base)
+ - [`AMPLIFY_120M`](https://huggingface.co/drug-discovery/AMPLIFY_120M)
+ - [`AMPLIFY_120M_base`](https://huggingface.co/drug-discovery/AMPLIFY_120M_base)
+
+ ### Model Description
+
+ | | AMPLIFY 120M | AMPLIFY 350M |
+ | :----------------------------- | -----------: | -----------: |
+ | `hidden-size` | 640 | 960 |
+ | `num-hidden-layers` | 24 | 32 |
+ | `num-attention-heads` | 10 | 15 |
+ | `intermediate-size` | 2560 | 3840 |
+ | `max-position-embeddings` | 2048 | 2048 |
+ | `vocab-size` | 27 | 27 |
+ | `rope-theta` | 10000 | 10000 |
+ | `dropout-prob` | 0 | 0 |
+ | `embedding-init-range` | 0.02 | 0.02 |
+ | `norm-eps` | 1.0e-05 | 1.0e-05 |
+ | `hidden-act` | swiglu | swiglu |
+ | `pre-activation-layer-norm` | true | true |
+ | `layer-norm-after-embedding` | false | false |
+ | `layer-norm-before-last-layer` | true | true |
+ | `rms-norm` | true | true |
+ | `ffn-bias` | false | false |
+ | `attn-bias` | false | false |
+
+ ### Training Description
+
+ | | Stage 1 | Stage 2 |
+ | :------------------ | ----------: | -------------------------------: |
+ | `dataset` | UR100P | UR100P |
+ | `max-steps` | 1,000,000 | 25,000 (120M) or 50,000 (350M) |
+ | `max-length` | 512 | 2048 |
+ | `optimizer` | adamw | adamw |
+ | `lr` | 0.001 | 0.001 |
+ | `betas` | (0.9, 0.95) | (0.9, 0.95) |
+ | `eps` | 1.0e-08 | 1.0e-08 |
+ | `weight-decay` | 0.01 | 0.01 |
+ | `scheduler` | cosinedecay | none |
+ | `warmup-steps` | 1,000 | none |
+ | `final-step` | 900,000 | none |
+ | `gradient-clipping` | 1.0 | 1.0 |
+ | `tf32` | true | true |
+ | `mixed-precision` | bf16 | bf16 |
+ | `padding` | max-length | max-length |
+ | `random-truncate` | true | true |
+ | `mask-probability` | 0.15 | 0.15 |
+ | `total-batch-size` | 4096 | 4096 |
+ | `deepspeed` | true | true |
+ | `zero-stage` | 3 | 3 |
+
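The Stage 1 column above maps onto a standard PyTorch setup: AdamW with the listed betas and weight decay, a short warmup, then cosine decay ending at step 900,000. A minimal, hypothetical sketch of that schedule follows; the linear warmup shape and the `model` stand-in are assumptions, not code from this commit.

```python
import math
import torch

model = torch.nn.Linear(960, 27)  # stand-in for AMPLIFY, for illustration only

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.001,
    betas=(0.9, 0.95),
    eps=1e-08,
    weight_decay=0.01,
)

warmup_steps, final_step = 1_000, 900_000

def lr_lambda(step: int) -> float:
    # Assumed linear warmup, then cosine decay until `final-step`.
    if step < warmup_steps:
        return step / warmup_steps
    progress = min(1.0, (step - warmup_steps) / (final_step - warmup_steps))
    return 0.5 * (1.0 + math.cos(math.pi * progress))

# Call scheduler.step() once per optimization step during training.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
```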
+ ## Get Started
+
+ ```python
+ from transformers import AutoModel
+ from transformers import AutoTokenizer
+ from datasets import load_dataset
+
+ # Load AMPLIFY and tokenizer
+ model = AutoModel.from_pretrained("drug-discovery/AMPLIFY_350M", trust_remote_code=True)
+ tokenizer = AutoTokenizer.from_pretrained("drug-discovery/AMPLIFY_350M", trust_remote_code=True)
+
+ # Move the model to GPU (required due to Flash Attention)
+ model = model.to("cuda")
+
+ # Load the UniProt validation set
+ dataset = load_dataset("drug-discovery/UR100P", data_dir="UniProt", split="test")
+
+ for sample in dataset:
+     # Protein
+     print("Sample: ", sample["name"], sample["sequence"])
+
+     # Tokenize the protein
+     input = tokenizer.encode(sample["sequence"], return_tensors="pt")
+     print("Input: ", input)
+
+     # Move to the GPU and make a prediction
+     input = input.to("cuda")
+     output = model(input)
+     print("Output: ", output)
+
+     break
+ ```
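Since the intro mentions residue and protein embeddings, here is one hedged way to get them with the checkpoint loaded above: pass `output_hidden_states=True` and pool the last hidden layer over the sequence. Mean pooling is an illustrative choice, not one prescribed by the repository, and the sequence below is hypothetical.

```python
import torch

sequence = "MSVVGIDLGFQSCYVAVARAGG"  # hypothetical short protein
inputs = tokenizer.encode(sequence, return_tensors="pt").to("cuda")

with torch.no_grad():
    output = model(inputs, output_hidden_states=True)

residue_embeddings = output.hidden_states[-1]       # (1, length, hidden_size)
protein_embedding = residue_embeddings.mean(dim=1)  # (1, hidden_size)
```

Note that the hidden states include the `<bos>`/`<eos>` positions added by the tokenizer; exclude them before pooling if that matters for your use case.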
+
+ ## Citations
+
+ If you find the models useful in your research, we ask that you cite the paper:
+
+ ```bibtex
+ @article{Fournier2024.09.23.614603,
+     title = {Protein Language Models: Is Scaling Necessary?},
+     author = {Fournier, Quentin and Vernon, Robert M. and van der Sloot, Almer and Schulz, Benjamin and Chandar, Sarath and Langmead, Christopher James},
+     year = {2024},
+     journal = {bioRxiv},
+     publisher = {Cold Spring Harbor Laboratory},
+     doi = {10.1101/2024.09.23.614603},
+     url = {https://www.biorxiv.org/content/early/2024/09/23/2024.09.23.614603},
+     elocation-id = {2024.09.23.614603},
+     eprint = {https://www.biorxiv.org/content/early/2024/09/23/2024.09.23.614603.full.pdf}
+ }
+ ```
amplify.py ADDED
@@ -0,0 +1,238 @@
+ # From https://stackoverflow.com/a/23689767
+ # From https://github.com/pytorch/pytorch/issues/97899
+ # From https://github.com/facebookresearch/llama/blob/main/llama/model.py
+
+ import torch
+ from torch import nn
+
+ from xformers.ops import SwiGLU, memory_efficient_attention
+
+ from .rmsnorm import RMSNorm
+ from .rotary import precompute_freqs_cis, apply_rotary_emb
+
+ from transformers import PreTrainedModel, PretrainedConfig
+ from transformers.modeling_outputs import MaskedLMOutput
+
+ class DotDict(dict):
+     """Dictionary that supports the dot notation to access attributes (similarly to HuggingFace)."""
+
+     __getattr__ = dict.get
+     __setattr__ = dict.__setitem__
+     __delattr__ = dict.__delitem__
+
+ class AMPLIFYConfig(PretrainedConfig):
+     model_type = "AMPLIFY"
+     # All config parameters must have a default value.
+     def __init__(
+         self,
+         hidden_size: int = 960,
+         num_hidden_layers: int = 32,
+         num_attention_heads: int = 15,
+         intermediate_size: int = 3840,
+         dropout_prob: float = 0,
+         embedding_init_range: float = 0.02,
+         decoder_init_range: float = 0.02,
+         rms_norm: bool = True,
+         norm_eps: float = 1e-05,
+         hidden_act: str = "SwiGLU",
+         layer_norm_after_embedding: bool = False,
+         layer_norm_before_last_layer: bool = True,
+         vocab_size: int = 27,
+         ffn_bias: bool = False,
+         att_bias: bool = False,
+         pad_token_id: int = 0,
+         max_length: int = 2048,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.intermediate_size = intermediate_size
+         self.dropout_prob = dropout_prob
+         self.embedding_init_range = embedding_init_range
+         self.decoder_init_range = decoder_init_range
+         self.rms_norm = rms_norm
+         self.norm_eps = norm_eps
+         self.hidden_act = hidden_act
+         self.layer_norm_after_embedding = layer_norm_after_embedding
+         self.layer_norm_before_last_layer = layer_norm_before_last_layer
+         self.vocab_size = vocab_size
+         self.ffn_bias = ffn_bias
+         self.att_bias = att_bias
+         self.pad_token_id = pad_token_id
+         self.max_length = max_length
+
+
+ class EncoderBlock(nn.Module):
+     """Transformer encoder block."""
+
+     def __init__(self, config: AMPLIFYConfig):
+         """Initialize an EncoderBlock.
+
+         Args:
+             config (AMPLIFYConfig): model configuration, including the hidden size,
+                 number of attention heads, feedforward size and activation, dropout,
+                 normalization, and bias settings.
+         """
+         super().__init__()
+
+         self.config = config
+         self.d_head = config.hidden_size // config.num_attention_heads
+
+         # Attention
+         self.q = nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size, bias=config.att_bias)
+         self.k = nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size, bias=config.att_bias)
+         self.v = nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size, bias=config.att_bias)
+         self.wo = nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size, bias=config.att_bias)
+         self.resid_dropout = nn.Dropout(config.dropout_prob)
+
+         # Feedforward network
+         match config.hidden_act.lower():
+             case "swiglu":
+                 # To keep the number of parameters and the amount of computation constant, we reduce the number of
+                 # hidden units by a factor of 2/3 (https://arxiv.org/pdf/2002.05202.pdf) and make it a multiple of 8 to
+                 # avoid RuntimeError due to misaligned operand
+                 multiple_of = 8
+                 intermediate_size = int(2 * config.intermediate_size / 3)
+                 intermediate_size = multiple_of * ((intermediate_size + multiple_of - 1) // multiple_of)
+                 self.ffn = SwiGLU(config.hidden_size, intermediate_size, config.hidden_size, bias=config.ffn_bias)
+             case "relu":
+                 self.ffn = nn.Sequential(
+                     nn.Linear(config.hidden_size, config.intermediate_size, bias=config.ffn_bias),
+                     nn.ReLU(),
+                     nn.Linear(config.intermediate_size, config.hidden_size, bias=config.ffn_bias),
+                 )
+             case "gelu":
+                 self.ffn = nn.Sequential(
+                     nn.Linear(config.hidden_size, config.intermediate_size, bias=config.ffn_bias),
+                     nn.GELU(),
+                     nn.Linear(config.intermediate_size, config.hidden_size, bias=config.ffn_bias),
+                 )
+
+         self.attention_norm = RMSNorm(config.hidden_size, config.norm_eps) if config.rms_norm else nn.LayerNorm(config.hidden_size, config.norm_eps)
+         self.ffn_norm = RMSNorm(config.hidden_size, config.norm_eps) if config.rms_norm else nn.LayerNorm(config.hidden_size, config.norm_eps)
+
+         self.ffn_dropout = nn.Dropout(config.dropout_prob)
+
+     def forward(self, x: torch.Tensor, pad_mask: torch.Tensor, freqs_cis: torch.Tensor, output_attentions: bool):
+         attn, contact = self._att_block(self.attention_norm(x), pad_mask, freqs_cis, output_attentions)
+         x = x + attn
+         x = x + self._ff_block(self.ffn_norm(x))
+         return x, contact
+
+     def _att_block(self, x: torch.Tensor, pad_mask: torch.Tensor, freqs_cis: torch.Tensor, output_attentions: bool):
+         batch_size, seq_len, _ = x.shape
+         xq, xk, xv = self.q(x), self.k(x), self.v(x)
+
+         # Reshape for rotary embeddings
+         xq = xq.view(batch_size, seq_len, self.config.num_attention_heads, self.d_head)
+         xk = xk.view(batch_size, seq_len, self.config.num_attention_heads, self.d_head)
+         xv = xv.view(batch_size, seq_len, self.config.num_attention_heads, self.d_head)
+         xq, xk = apply_rotary_emb(xq, xk, freqs_cis)
+
+         attn = memory_efficient_attention(
+             query=xq,
+             key=xk,
+             value=xv,
+             attn_bias=pad_mask,
+             p=self.config.dropout_prob if self.training else 0,
+         )
+
+         _attn = None
+         if output_attentions:
+             _attn = xq.permute(0, 2, 1, 3) @ xk.permute(0, 2, 3, 1) / (xq.size(-1) ** 0.5)
+             if pad_mask is not None:
+                 _attn = _attn + pad_mask
+             _attn = _attn.softmax(-1)
+
+         return self.resid_dropout(self.wo(attn.view(batch_size, seq_len, self.config.num_attention_heads * self.d_head))), _attn
+
+     def _ff_block(self, x: torch.Tensor):
+         return self.ffn_dropout(self.ffn(x))
+
+
+ class AMPLIFYPreTrainedModel(PreTrainedModel):
+     config_class = AMPLIFYConfig
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             module.weight.data.uniform_(-self.config.decoder_init_range, self.config.decoder_init_range)
+             if module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.Embedding):
+             module.weight.data.uniform_(-self.config.embedding_init_range, self.config.embedding_init_range)
+
+
+ class AMPLIFY(AMPLIFYPreTrainedModel):
+     """The main model class.
+
+     Args:
+         config (amplify.model.amplify.AMPLIFYConfig): model configuration, usually defined from the Hydra configuration.
+     """
+     def __init__(self, config: AMPLIFYConfig, **kwargs):
+         super().__init__(config)
+
+         self.config = config
+
+         self.encoder = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+
+         if config.layer_norm_after_embedding:
+             self.layer_norm_1 = RMSNorm(config.hidden_size, config.norm_eps) if config.rms_norm else nn.LayerNorm(config.hidden_size, config.norm_eps)
+
+         self.transformer_encoder = nn.ModuleList()
+         for _ in range(config.num_hidden_layers):
+             self.transformer_encoder.append(EncoderBlock(config))
+
+         if config.layer_norm_before_last_layer:
+             self.layer_norm_2 = RMSNorm(config.hidden_size, config.norm_eps) if config.rms_norm else nn.LayerNorm(config.hidden_size, config.norm_eps)
+
+         self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
+
+         self.freqs_cis = precompute_freqs_cis(config.hidden_size // config.num_attention_heads, config.max_length)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def forward(self, src, pad_mask=None, output_hidden_states=False, output_attentions=False):
+         # Initialize
+         hidden_states, attentions = [], []
+
+         # Expand and repeat: (Batch, Length) -> (Batch, Heads, Length, Length)
+         if pad_mask is not None and not torch.all(pad_mask == 0):
+             pad_mask = pad_mask.unsqueeze(1).unsqueeze(1).repeat(1, self.config.num_attention_heads, pad_mask.size(-1), 1)
+         else:
+             pad_mask = None
+
+         # RoPE
+         self.freqs_cis = self.freqs_cis.to(src.device, non_blocking=True)
+         freqs_cis = self.freqs_cis[: src.shape[1]]
+
+         # Embedding
+         x = self.encoder(src)
+         if self.config.layer_norm_after_embedding:
+             x = self.layer_norm_1(x)
+
+         # Transformer encoder
+         for layer in self.transformer_encoder:
+             x, attn = layer(x, pad_mask, freqs_cis, output_attentions)
+             if output_hidden_states:
+                 hidden_states.append(x)
+             if output_attentions:
+                 attentions.append(attn)
+
+         # Classification head with layer norm
+         logits = self.decoder(self.layer_norm_2(x) if self.config.layer_norm_before_last_layer else x)
+
+         # Return logits or the output of the last hidden layer
+         return MaskedLMOutput(logits=logits, hidden_states=hidden_states, attentions=attentions)
+
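To make the shapes concrete, here is a small, hypothetical smoke test of the classes above. The tiny configuration is for illustration only (it is not a released checkpoint), the import path is an assumption (the file's relative imports mean the three modules must live in a package), and a GPU is required because `memory_efficient_attention` runs on CUDA.

```python
import torch
from amplify import AMPLIFY, AMPLIFYConfig  # assumed package layout, not part of this commit

# Tiny, illustrative configuration.
config = AMPLIFYConfig(
    hidden_size=64, num_hidden_layers=2, num_attention_heads=4,
    intermediate_size=256, vocab_size=27, max_length=128,
)
model = AMPLIFY(config).to("cuda").eval()  # xformers attention requires a GPU

tokens = torch.randint(5, 27, (2, 16), device="cuda")  # (batch, length) residue IDs
pad_mask = torch.zeros(2, 16, device="cuda")           # additive mask; all-zero means no padding

with torch.no_grad():
    out = model(tokens, pad_mask)

print(out.logits.shape)  # torch.Size([2, 16, 27])
```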
config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "_name_": "AMPLIFY",
+   "architectures": [
+     "AMPLIFY"
+   ],
+   "att_bias": false,
+   "auto_map": {
+     "AutoConfig": "amplify.AMPLIFYConfig",
+     "AutoModel": "amplify.AMPLIFY"
+   },
+   "bias": false,
+   "bos_token_id": 3,
+   "decoder_init_range": 0.02,
+   "dropout_prob": 0,
+   "embedding_init_range": 0.02,
+   "eos_token_id": 4,
+   "ffn_bias": false,
+   "hidden_act": "SwiGLU",
+   "hidden_size": 640,
+   "intermediate_size": 2560,
+   "layer_norm_after_embedding": false,
+   "layer_norm_before_last_layer": true,
+   "mask_token_id": 2,
+   "max_length": 2048,
+   "model_type": "AMPLIFY",
+   "norm_eps": 1e-05,
+   "num_attention_heads": 10,
+   "num_hidden_layers": 24,
+   "other_special_token_ids": null,
+   "pad_token_id": 0,
+   "pre_activation_layer_norm": true,
+   "rms_norm": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.38.2",
+   "unk_token_id": 1,
+   "vocab_path": "conf/tokenizer/amplify_vocab.txt",
+   "vocab_size": 27
+ }
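The values above (`hidden_size` 640, 24 layers, 10 heads) correspond to the 120M variant, and the `auto_map` entries route `AutoConfig`/`AutoModel` to the classes in `amplify.py`. A minimal sketch of loading it; the repo id here is an assumption inferred from the config values:

```python
from transformers import AutoConfig

# trust_remote_code lets AutoConfig follow auto_map to amplify.AMPLIFYConfig.
config = AutoConfig.from_pretrained("drug-discovery/AMPLIFY_120M", trust_remote_code=True)
print(config.hidden_size, config.num_hidden_layers, config.num_attention_heads)  # 640 24 10
```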
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5cdd05fcfa647ed4861c13fc5bb6f94c49acf0c0510dbc5ea75a10aaec558170
+ size 473126988
rmsnorm.py ADDED
@@ -0,0 +1,34 @@
+ import torch
+ from torch import nn
+
+
+ class RMSNorm(nn.Module):
+     def __init__(self, dim: int, eps: float = 1e-6):
+         """
+         Initialize the RMSNorm normalization layer.
+
+         Args:
+             dim (int): The dimension of the input tensor.
+             eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
+
+         Attributes:
+             eps (float): A small value added to the denominator for numerical stability.
+             weight (nn.Parameter): Learnable scaling parameter.
+
+         """
+         super().__init__()
+         self.eps = eps
+         self.weight = nn.Parameter(torch.ones(dim))
+
+     def forward(self, x):
+         """
+         Forward pass through the RMSNorm layer.
+
+         Args:
+             x (torch.Tensor): The input tensor.
+
+         Returns:
+             torch.Tensor: The output tensor after applying RMSNorm.
+
+         """
+         return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) * self.weight
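As a quick, hypothetical sanity check of the formula above (divide by the root-mean-square of the last dimension, then scale by `weight`): with `weight` at its initial value of ones, the output should have unit RMS. This snippet assumes `rmsnorm.py` is on the Python path.

```python
import torch
from rmsnorm import RMSNorm  # assumes rmsnorm.py is importable

norm = RMSNorm(dim=8)
x = torch.randn(4, 8) * 3.0  # arbitrary scale; RMSNorm should remove it

y = norm(x)
rms = y.pow(2).mean(-1).sqrt()
print(torch.allclose(rms, torch.ones(4), atol=1e-4))  # True (weight is all ones)
```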
rotary.py ADDED
@@ -0,0 +1,80 @@
+ import torch
+ from typing import Tuple
+
+
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
+     """
+     Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
+
+     This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
+     and the end index 'end'. The 'theta' parameter scales the frequencies.
+     The returned tensor contains complex values in complex64 data type.
+
+     Args:
+         dim (int): Dimension of the frequency tensor.
+         end (int): End index for precomputing frequencies.
+         theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
+
+     Returns:
+         torch.Tensor: Precomputed frequency tensor with complex exponentials.
+     """
+
+     freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+     t = torch.arange(end, device=freqs.device)  # type: ignore
+     freqs = torch.outer(t, freqs).float()  # type: ignore
+     return torch.polar(torch.ones_like(freqs), freqs)  # complex64
+
+
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
+     """
+     Reshape frequency tensor for broadcasting it with another tensor.
+
+     This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
+     for the purpose of broadcasting the frequency tensor during element-wise operations.
+
+     Args:
+         freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
+         x (torch.Tensor): Target tensor for broadcasting compatibility.
+
+     Returns:
+         torch.Tensor: Reshaped frequency tensor.
+
+     Raises:
+         AssertionError: If the frequency tensor doesn't match the expected shape.
+         AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions.
+     """
+
+     ndim = x.ndim
+     assert 0 <= 1 < ndim
+     assert freqs_cis.shape == (x.shape[1], x.shape[-1])
+     shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
+     return freqs_cis.view(*shape)
+
+
+ def apply_rotary_emb(
+     xq: torch.Tensor,
+     xk: torch.Tensor,
+     freqs_cis: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     """
+     Apply rotary embeddings to input tensors using the given frequency tensor.
+
+     This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
+     frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
+     is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
+     returned as real tensors.
+
+     Args:
+         xq (torch.Tensor): Query tensor to apply rotary embeddings.
+         xk (torch.Tensor): Key tensor to apply rotary embeddings.
+         freqs_cis (torch.Tensor): Precomputed frequency tensor for complex exponentials.
+
+     Returns:
+         Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
+     """
+     xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+     xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+     freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
+     xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
+     xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
+     return xq_out.type_as(xq), xk_out.type_as(xk)
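A small, hypothetical shape check tying the two public helpers together: `precompute_freqs_cis` returns a complex tensor of shape `(end, dim // 2)`, and `apply_rotary_emb` expects `(batch, seq_len, heads, head_dim)` queries and keys with `freqs_cis` already sliced to the sequence length (as `AMPLIFY.forward` does). The shapes here are arbitrary; the snippet assumes `rotary.py` is importable.

```python
import torch
from rotary import precompute_freqs_cis, apply_rotary_emb  # assumes rotary.py is importable

head_dim, seq_len = 64, 16
freqs_cis = precompute_freqs_cis(head_dim, end=2048)  # (2048, 32), complex64
freqs_cis = freqs_cis[:seq_len]                       # slice to the sequence length

xq = torch.randn(2, seq_len, 4, head_dim)             # (batch, seq, heads, head_dim)
xk = torch.randn(2, seq_len, 4, head_dim)
xq_rot, xk_rot = apply_rotary_emb(xq, xk, freqs_cis)

print(xq_rot.shape)  # torch.Size([2, 16, 4, 64])
# Rotations have unit magnitude, so per-position vector norms are preserved.
print(torch.allclose(xq.norm(dim=-1), xq_rot.norm(dim=-1), atol=1e-5))  # True
```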
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "bos_token": "<bos>",
+   "eos_token": "<eos>",
+   "mask_token": "<mask>",
+   "pad_token": "<pad>",
+   "unk_token": "<unk>"
+ }
tokenizer.json ADDED
@@ -0,0 +1,154 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [
+     {
+       "id": 0,
+       "content": "<pad>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 1,
+       "content": "<unk>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 2,
+       "content": "<mask>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 3,
+       "content": "<bos>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 4,
+       "content": "<eos>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": null,
+   "pre_tokenizer": {
+     "type": "Split",
+     "pattern": {
+       "String": ""
+     },
+     "behavior": "Removed",
+     "invert": false
+   },
+   "post_processor": {
+     "type": "TemplateProcessing",
+     "single": [
+       {
+         "SpecialToken": {
+           "id": "<bos>",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "<eos>",
+           "type_id": 0
+         }
+       }
+     ],
+     "pair": [
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "B",
+           "type_id": 1
+         }
+       }
+     ],
+     "special_tokens": {
+       "<bos>": {
+         "id": "<bos>",
+         "ids": [
+           3
+         ],
+         "tokens": [
+           "<bos>"
+         ]
+       },
+       "<eos>": {
+         "id": "<eos>",
+         "ids": [
+           4
+         ],
+         "tokens": [
+           "<eos>"
+         ]
+       }
+     }
+   },
+   "decoder": null,
+   "model": {
+     "type": "WordPiece",
+     "unk_token": "<unk>",
+     "continuing_subword_prefix": "##",
+     "max_input_chars_per_word": 100,
+     "vocab": {
+       "<pad>": 0,
+       "<unk>": 1,
+       "<mask>": 2,
+       "<bos>": 3,
+       "<eos>": 4,
+       "|": 5,
+       "L": 6,
+       "A": 7,
+       "G": 8,
+       "V": 9,
+       "S": 10,
+       "E": 11,
+       "R": 12,
+       "T": 13,
+       "I": 14,
+       "D": 15,
+       "P": 16,
+       "K": 17,
+       "Q": 18,
+       "N": 19,
+       "F": 20,
+       "Y": 21,
+       "M": 22,
+       "H": 23,
+       "W": 24,
+       "C": 25,
+       "B": 26
+     }
+   }
+ }
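The tokenizer above is effectively character-level: the `Split` pre-tokenizer with an empty pattern breaks a sequence into single residues, the WordPiece vocab maps each amino acid to an ID, and the template post-processor wraps single sequences in `<bos>`/`<eos>` (IDs 3 and 4). A small, hypothetical check, assuming the files in this commit sit in the current directory:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")  # reads tokenizer.json + tokenizer_config.json

ids = tokenizer.encode("MSVK")
print(ids)                                    # [3, 22, 10, 9, 17, 4]
print(tokenizer.convert_ids_to_tokens(ids))   # ['<bos>', 'M', 'S', 'V', 'K', '<eos>']
```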
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<mask>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<bos>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<eos>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<bos>",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<eos>",
+   "mask_token": "<mask>",
+   "model_input_names": [
+     "input_ids",
+     "attention_mask"
+   ],
+   "model_max_length": 2048,
+   "pad_token": "<pad>",
+   "padding_side": "right",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "truncation_side": "right",
+   "unk_token": "<unk>"
+ }