Your Name committed on
Commit 39bff6f (1 parent: e6b48ce)

add GLM code

LICENSE.txt ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright Zhengxiao Du
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
added_tokens.json ADDED
@@ -0,0 +1,10 @@
+ {
+ "<|startofpiece|>": 50257,
+ "<|endofpiece|>": 50258,
+ "[CLS]": 50259,
+ "[MASK]": 50260,
+ "[SEP]": 50261,
+ "[UNUSED]": 50262,
+ "[gMASK]": 50263,
+ "[sMASK]": 50264
+ }
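
These added tokens extend the base GPT-2 BPE vocabulary (IDs 0-50256), so the GLM control tokens occupy IDs 50257-50264; the vocab_size of 50304 in config.json simply pads the embedding table past the last used ID. A minimal check, as a sketch only (the repo id below is a placeholder, and trust_remote_code is assumed because the tokenizer class ships with this repository):

    from transformers import AutoTokenizer

    # Placeholder repo id; point this at the actual model directory or Hub repo.
    tokenizer = AutoTokenizer.from_pretrained("your-org/WebGLM", trust_remote_code=True)

    # Each control token from added_tokens.json should resolve to the ID listed above.
    for tok in ["<|startofpiece|>", "<|endofpiece|>", "[MASK]", "[gMASK]", "[sMASK]"]:
        print(tok, tokenizer.convert_tokens_to_ids(tok))
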
config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "_name_or_path": "/workspace/hanyu/hanyu/WebGLM-HGF/WebGLM",
+ "architectures": [
+ "GLMForConditionalGeneration"
+ ],
+ "attention_dropout_prob": 0.1,
+ "attention_scale": 1.0,
+ "auto_map": {
+ "AutoConfig": "configuration_glm.GLMConfig",
+ "AutoModel": "modeling_glm.GLMModel",
+ "AutoModelForMultipleChoice": "modeling_glm.GLMForMultipleChoice",
+ "AutoModelForSeq2SeqLM": "modeling_glm.GLMForConditionalGeneration"
+ },
+ "block_position_encoding": true,
+ "checkpoint_activations": false,
+ "checkpoint_num_layers": 1,
+ "embedding_dropout_prob": 0.1,
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "glm",
+ "num_attention_heads": 64,
+ "num_layers": 48,
+ "output_dropout_prob": 0.1,
+ "output_predict": true,
+ "parallel_output": true,
+ "pool_token": "cls",
+ "relative_encoding": false,
+ "spell_func": "lstm",
+ "spell_length": null,
+ "torch_dtype": "float32",
+ "transformers_version": "4.27.4",
+ "vocab_size": 50304
+ }
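
The auto_map block above is what makes this a "remote code" checkpoint: AutoConfig and the Auto model classes are routed to configuration_glm.py and modeling_glm.py in this repository, which requires passing trust_remote_code=True. A loading sketch under that assumption (the repo id is a placeholder):

    from transformers import AutoConfig, AutoModelForSeq2SeqLM

    repo = "your-org/WebGLM"  # placeholder
    # auto_map sends AutoModelForSeq2SeqLM to modeling_glm.GLMForConditionalGeneration.
    config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(repo, trust_remote_code=True)
    print(config.num_layers, config.hidden_size, config.num_attention_heads)  # 48 4096 64
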
configuration_glm.py ADDED
@@ -0,0 +1,136 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 shunxing1234 and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ GLM model configuration """
16
+
17
+ from transformers.configuration_utils import PretrainedConfig
18
+ from transformers.utils import logging
19
+
20
+ logger = logging.get_logger(__name__)
21
+
22
+ GLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
23
+ "shunxing1234/GLM": "https://huggingface.co/shunxing1234/GLM/resolve/main/config.json",
24
+ # See all GLM models at https://huggingface.co/models?filter=glm
25
+ }
26
+
27
+
28
+ class GLMConfig(PretrainedConfig):
29
+ r"""
30
+ This is the configuration class to store the configuration of a [`~GLMModel`].
31
+ It is used to instantiate a GLM model according to the specified arguments, defining the model
32
+ architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
33
+ the GLM [shunxing1234/GLM-base-cased](https://huggingface.co/shunxing1234/GLM-base-cased) architecture.
34
+
35
+ Configuration objects inherit from [`PretrainedConfig`] and can be used
36
+ to control the model outputs. Read the documentation from [`PretrainedConfig`]
37
+ for more information.
38
+
39
+
40
+ Args:
41
+ vocab_size (`int`, *optional*, defaults to 30522):
42
+ Vocabulary size of the GLM model. Defines the number of different tokens that can be represented by the
43
+ `inputs_ids` passed when calling [`~GLMModel`] or
44
+ [`~TFGLMModel`].
45
+ hidden_size (`int`, *optional*, defaults to 768):
46
+ Dimension of the encoder layers and the pooler layer.
47
+ num_hidden_layers (`int`, *optional*, defaults to 12):
48
+ Number of hidden layers in the Transformer encoder.
49
+ num_attention_heads (`int`, *optional*, defaults to 12):
50
+ Number of attention heads for each attention layer in the Transformer encoder.
51
+ intermediate_size (`int`, *optional*, defaults to 3072):
52
+ Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
53
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
54
+ The non-linear activation function (function or string) in the encoder and pooler.
55
+ If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
56
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
57
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
58
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
59
+ The dropout ratio for the attention probabilities.
60
+ max_position_embeddings (`int`, *optional*, defaults to 512):
61
+ The maximum sequence length that this model might ever be used with.
62
+ Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
63
+ type_vocab_size (`int`, *optional*, defaults to 2):
64
+ The vocabulary size of the `token_type_ids` passed when calling [`~GLMModel`] or
65
+ [`~TFGLMModel`].
66
+ initializer_range (`float`, *optional*, defaults to 0.02):
67
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
68
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
69
+ The epsilon used by the layer normalization layers.
70
+ use_cache (`bool`, *optional*, defaults to `True`):
71
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
72
+ relevant if `config.is_decoder=True`.
73
+ Example:
74
+
75
+ ```python
76
+ >>> from transformers import GLMModel, GLMConfig
77
+
78
+ >>> # Initializing a GLM shunxing1234/GLM-base-cased style configuration
79
+ >>> configuration = GLMConfig()
80
+
81
+ >>> # Initializing a model from the shunxing1234/GLM-base-cased style configuration
82
+ >>> model = GLMModel(configuration)
83
+
84
+ >>> # Accessing the model configuration
85
+ >>> configuration = model.config
86
+ ```
87
+ """
88
+ model_type = "glm"
89
+ attribute_map = {
90
+ "num_hidden_layers": "num_layers"
91
+ }
92
+
93
+ def __init__(
94
+ self,
95
+ num_layers=24,
96
+ vocab_size=30592,
97
+ hidden_size=1024,
98
+ num_attention_heads=16,
99
+ embedding_dropout_prob=0.1,
100
+ attention_dropout_prob=0.1,
101
+ output_dropout_prob=0.1,
102
+ max_sequence_length=512,
103
+ checkpoint_activations=False,
104
+ checkpoint_num_layers=1,
105
+ parallel_output=True,
106
+ relative_encoding=False,
107
+ block_position_encoding=True,
108
+ output_predict=False,
109
+ spell_length=None,
110
+ spell_func="lstm",
111
+ attention_scale=1.0,
112
+ initializer_range=0.02,
113
+ pool_token="cls",
114
+ **kwargs
115
+ ):
116
+ self.num_layers = num_layers
117
+ self.vocab_size = vocab_size
118
+ self.hidden_size = hidden_size
119
+ self.num_attention_heads = num_attention_heads
120
+ self.embedding_dropout_prob = embedding_dropout_prob
121
+ self.attention_dropout_prob = attention_dropout_prob
122
+ self.output_dropout_prob = output_dropout_prob
123
+ self.max_sequence_length = max_sequence_length
124
+ self.checkpoint_activations = checkpoint_activations
125
+ self.checkpoint_num_layers = checkpoint_num_layers
126
+ self.parallel_output = parallel_output
127
+ self.relative_encoding = relative_encoding
128
+ self.block_position_encoding = block_position_encoding
129
+ self.output_predict = output_predict
130
+ self.spell_length = spell_length
131
+ self.spell_func = spell_func
132
+ self.attention_scale = attention_scale
133
+ self.initializer_range = initializer_range
134
+ self.pool_token = pool_token
135
+
136
+ super().__init__(**kwargs)
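
The defaults above describe a smaller base configuration; the checkpoint in this repository overrides them through config.json. A sketch showing how those shipped values map onto GLMConfig (assuming configuration_glm.py is importable from the working directory):

    from configuration_glm import GLMConfig

    # Mirrors the values in config.json of this repository.
    config = GLMConfig(
        num_layers=48,
        vocab_size=50304,
        hidden_size=4096,
        num_attention_heads=64,
        max_sequence_length=1024,
        block_position_encoding=True,
        output_predict=True,
        attention_scale=1.0,
        pool_token="cls",
    )
    # attribute_map exposes the HF-standard name as an alias for num_layers.
    assert config.num_hidden_layers == config.num_layers
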
generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "_from_model_config": true,
+ "transformers_version": "4.27.4"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
modeling_glm.py ADDED
@@ -0,0 +1,975 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 shunxing1234 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch GLM model. """
16
+
17
+ import math
18
+
19
+ import torch
20
+ import torch.utils.checkpoint
21
+ import torch.nn.functional as F
22
+ from torch.nn import init, LayerNorm, Linear, CrossEntropyLoss
23
+
24
+ from transformers.activations import gelu
25
+ from transformers.utils import (
26
+ add_code_sample_docstrings,
27
+ add_start_docstrings,
28
+ add_start_docstrings_to_model_forward,
29
+ )
30
+ from transformers.modeling_outputs import (
31
+ BaseModelOutputWithPastAndCrossAttentions,
32
+ ModelOutput,
33
+ SequenceClassifierOutput,
34
+ )
35
+
36
+ from transformers.modeling_utils import (
37
+ PreTrainedModel,
38
+ )
39
+ from .configuration_glm import GLMConfig
40
+ from torch.nn.parameter import Parameter
41
+
42
+ _CHECKPOINT_FOR_DOC = "shunxing1234/GLM"
43
+ _CONFIG_FOR_DOC = "GLMConfig"
44
+ _TOKENIZER_FOR_DOC = "GLMTokenizer"
45
+
46
+ GLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
47
+ "shunxing1234/GLM",
48
+ # See all GLM models at https://huggingface.co/models?filter=glm
49
+ ]
50
+
51
+
52
+ def unscaled_init_method(sigma):
53
+ """Init method based on N(0, sigma)."""
54
+
55
+ def init_(tensor):
56
+ return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
57
+
58
+ return init_
59
+
60
+
61
+ def scaled_init_method(mean, std, num_layers):
62
+ """Init method based on N(0, sigma/sqrt(2*num_layers)."""
63
+ std = std / math.sqrt(2.0 * num_layers)
64
+
65
+ def init_(tensor):
66
+ return torch.nn.init.normal_(tensor, mean=mean, std=std)
67
+
68
+ return init_
69
+
70
+
71
+ def ensure_divisibility(numerator, denominator):
72
+ """Ensure that numerator is divisible by the denominator."""
73
+ assert numerator % denominator == 0, '{} is not divisible by {}'.format(
74
+ numerator, denominator)
75
+
76
+
77
+ def divide(numerator, denominator):
78
+ """Ensure that numerator is divisible by the denominator and return
79
+ the division value."""
80
+ ensure_divisibility(numerator, denominator)
81
+ return numerator // denominator
82
+
83
+
84
+ def split_tensor_along_last_dim(tensor, num_partitions,
85
+ contiguous_split_chunks=False):
86
+ """Split a tensor along its last dimension.
87
+ Arguments:
88
+ tensor: input tensor.
89
+ num_partitions: number of partitions to split the tensor
90
+ contiguous_split_chunks: If True, make each chunk contiguous
91
+ in memory.
92
+ """
93
+ # Get the size and dimension.
94
+ last_dim = tensor.dim() - 1
95
+ last_dim_size = divide(tensor.size()[last_dim], num_partitions)
96
+ # Split.
97
+ tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
98
+ # Note: torch.split does not create contiguous tensors by default.
99
+ if contiguous_split_chunks:
100
+ return tuple(chunk.contiguous() for chunk in tensor_list)
101
+
102
+ return tensor_list
103
+
104
+
105
+ class MLP(torch.nn.Module):
106
+ """MLP for GPT2.
107
+
108
+ MLP will take the input with h hidden state, project it to 4*h
109
+ hidden dimension, perform gelu transformation, and project the
110
+ state back into h hidden dimension. At the end, dropout is also
111
+ applied.
112
+
113
+ Arguments:
114
+ hidden_size: The hidden size of the self attention.
115
+ output_dropout_prob: dropout probability for the outputs
116
+ after self attention and final output.
117
+ init_method: initialization method used for the weights. Note
118
+ that all biases are initialized to zero and
119
+ layernorm weights are initialized to one.
120
+ output_layer_init_method: output layer initialization. If None,
121
+ use `init_method`.
122
+ """
123
+
124
+ def __init__(self, hidden_size, output_dropout_prob, init_method,
125
+ output_layer_init_method=None):
126
+ super(MLP, self).__init__()
127
+ # Set output layer initialization if not provided.
128
+ if output_layer_init_method is None:
129
+ output_layer_init_method = init_method
130
+ # Project to 4h.
131
+ self.dense_h_to_4h = Linear(hidden_size, 4 * hidden_size)
132
+
133
+ # Project back to h.
134
+ self.dense_4h_to_h = Linear(
135
+ 4 * hidden_size,
136
+ hidden_size)
137
+
138
+ self.dropout = torch.nn.Dropout(output_dropout_prob)
139
+
140
+ def forward(self, hidden_states):
141
+ # [b, s, 4hp]
142
+ intermediate_parallel = self.dense_h_to_4h(hidden_states)
143
+ intermediate_parallel = gelu(intermediate_parallel)
144
+
145
+ # [b, s, h]
146
+ output = self.dense_4h_to_h(intermediate_parallel)
147
+ output = self.dropout(output)
148
+ return output
149
+
150
+
151
+ class VocabEmbedding(torch.nn.Module):
152
+ """Embedding parallelized in the vocabulary dimension.
153
+
154
+ This is mainly adapted from torch.nn.Embedding and all the default
155
+ values are kept.
156
+ Arguments:
157
+ num_embeddings: vocabulary size.
158
+ embedding_dim: size of hidden state.
159
+ init_method: method to initialize weights.
160
+ """
161
+
162
+ def __init__(self, config):
163
+ super(VocabEmbedding, self).__init__()
164
+ # Keep the input dimensions.
165
+ self.num_embeddings = config.vocab_size
166
+ self.embedding_dim = config.hidden_size
167
+ # Set the defaults for compatibility.
168
+ self.padding_idx = None
169
+ self.max_norm = None
170
+ self.norm_type = 2.
171
+ self.scale_grad_by_freq = False
172
+ self.sparse = False
173
+ self._weight = None
174
+
175
+ self.vocab_start_index = 0
176
+ self.vocab_end_index = self.num_embeddings
177
+
178
+ # Allocate weights.
179
+ self.weight = Parameter(torch.Tensor(self.num_embeddings,
180
+ self.embedding_dim))
181
+ # And initialize.
182
+ init.xavier_normal_(self.weight)
183
+
184
+ def forward(self, input_):
185
+ # Get the embeddings.
186
+ output = F.embedding(input_, self.weight,
187
+ self.padding_idx, self.max_norm,
188
+ self.norm_type, self.scale_grad_by_freq,
189
+ self.sparse)
190
+ return output
191
+
192
+
193
+ class PositionalEmbedding(torch.nn.Module):
194
+
195
+ def __init__(self, hidden_size):
196
+ super(PositionalEmbedding, self).__init__()
197
+
198
+ self.hidden_size = hidden_size
199
+
200
+ inv_freq = 1 / (10000 ** (torch.arange(0.0, hidden_size, 2.0) / hidden_size))
201
+ self.register_buffer('inv_freq', inv_freq)
202
+
203
+ def forward(self, pos_seq, bsz=None):
204
+ sinusoid_inp = torch.ger(pos_seq, self.inv_freq)
205
+ pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
206
+
207
+ if bsz is not None:
208
+ return pos_emb[None, :, :].expand(bsz, -1, -1)
209
+ else:
210
+ return pos_emb[None, :, :]
211
+
212
+
213
+ class SelfAttention(torch.nn.Module):
214
+ """self-attention layer for GLM.
215
+
216
+ Self-attention layer takes input with size [b, s, h] where b is
217
+ the batch size, s is the sequence length, and h is the hidden size
218
+ and creates output of the same size.
219
+ Arguments:
220
+ hidden_size: total hidden size of the layer (h).
221
+ num_attention_heads: number of attention heads (n). Note that we
222
+ require n to be divisible by number of GPUs
223
+ used to parallelize the model. Also, we
224
+ require hidden size to be divisible by n.
225
+ attention_dropout_prob: dropout probability for the attention scores.
226
+ init_method: weight initialization.
227
+ output_layer_init_method: output layer initialization. If None, use
228
+ `init_method`.
229
+ We use the following notation:
230
+ h: hidden_size
231
+ n: num_attention_heads
232
+ p: number of partitions
233
+ np: n/p
234
+ hp: h/p
235
+ hn: h/n
236
+ b: batch size
237
+ s: sequence length
238
+ """
239
+
240
+ def __init__(self, hidden_size, num_attention_heads,
241
+ attention_dropout_prob, output_dropout_prob,
242
+ init_method, output_layer_init_method=None,
243
+ attention_scale=1.0):
244
+ super(SelfAttention, self).__init__()
245
+ # Set output layer initialization if not provided.
246
+ if output_layer_init_method is None:
247
+ output_layer_init_method = init_method
248
+ # Per attention head and per partition values.
249
+ self.hidden_size = hidden_size
250
+ self.hidden_size_per_attention_head = divide(hidden_size,
251
+ num_attention_heads)
252
+
253
+ self.num_attention_heads = num_attention_heads
254
+ self.attention_scale = attention_scale
255
+ # Strided linear layer.
256
+ self.query_key_value = Linear(hidden_size, 3 * hidden_size)
257
+
258
+ # Dropout. Note that for a single iteration, this layer will generate
259
+ # different outputs on different number of parallel partitions but
260
+ # on average it should not be partition dependent.
261
+ self.attention_dropout = torch.nn.Dropout(attention_dropout_prob)
262
+
263
+ # Output.
264
+ self.dense = Linear(hidden_size,
265
+ hidden_size)
266
+ self.output_dropout = torch.nn.Dropout(output_dropout_prob)
267
+
268
+ def _transpose_for_scores(self, tensor):
269
+ """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
270
+ size [b, np, s, hn].
271
+ """
272
+ new_tensor_shape = tensor.size()[:-1] + \
273
+ (self.num_attention_heads,
274
+ self.hidden_size_per_attention_head)
275
+ tensor = tensor.view(*new_tensor_shape)
276
+ return tensor.permute(0, 2, 1, 3)
277
+
278
+ def forward(self, hidden_states, ltor_mask, mem=None):
279
+ # hidden_states: [b, s, h]
280
+ # ltor_mask: [b,1,s,s]
281
+
282
+ # Attention heads. [b, s, hp]
283
+ query_length = hidden_states.size(1)
284
+ # self attention
285
+ if mem is None:
286
+ mixed_x_layer = self.query_key_value(hidden_states)
287
+ (mixed_query_layer,
288
+ mixed_key_layer,
289
+ mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
290
+ else:
291
+ cat = torch.cat((mem, hidden_states), 1)
292
+ mixed_x_layer = self.query_key_value(cat)
293
+ (mixed_query_layer,
294
+ mixed_key_layer,
295
+ mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
296
+ mixed_query_layer = mixed_query_layer[:, -query_length:]
297
+
298
+ # Reshape and transpose [b, np, s, hn]
299
+ query_layer = self._transpose_for_scores(mixed_query_layer)
300
+ key_layer = self._transpose_for_scores(mixed_key_layer)
301
+ value_layer = self._transpose_for_scores(mixed_value_layer)
302
+
303
+ if self.attention_scale > 1.0:
304
+ # Raw attention scores. [b, np, s, s]
305
+ attention_scores = torch.matmul(query_layer / math.sqrt(self.attention_scale),
306
+ key_layer.transpose(-1, -2) / math.sqrt(
307
+ self.hidden_size_per_attention_head * self.attention_scale))
308
+ else:
309
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2) / math.sqrt(
310
+ self.hidden_size_per_attention_head))
311
+
312
+ # Apply the left to right attention mask.
313
+ ltor_mask = ltor_mask.type_as(attention_scores)
314
+ attention_scores = torch.mul(attention_scores, ltor_mask)
315
+ if self.attention_scale > 1.0:
316
+ max_attention_scores = attention_scores.max(dim=-1, keepdim=True)[0]
317
+ attention_scores -= max_attention_scores
318
+ attention_scores *= self.attention_scale
319
+
320
+ attention_scores = attention_scores + (-65504.0) * (1.0 - ltor_mask)
321
+ # Attention probabilities. [b, np, s, s]
322
+ attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
323
+ # This is actually dropping out entire tokens to attend to, which might
324
+ # seem a bit unusual, but is taken from the original Transformer paper.
325
+ # with get_cuda_rng_tracker().fork():
326
+ attention_probs = self.attention_dropout(attention_probs)
327
+
328
+ # Context layer.
329
+ # [b, np, s, hn]
330
+ context_layer = torch.matmul(attention_probs, value_layer)
331
+ # [b, s, np, hn]
332
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
333
+ new_context_layer_shape = context_layer.size()[:-2] + \
334
+ (self.hidden_size,)
335
+ # [b, s, hp]
336
+ context_layer = context_layer.view(*new_context_layer_shape)
337
+
338
+ # Output. [b, s, h]
339
+ output = self.dense(context_layer)
340
+ output = self.output_dropout(output)
341
+
342
+ return output
343
+
344
+
345
+ class GLMBlock(torch.nn.Module):
346
+ """A single layer transformer for GLM.
347
+
348
+ We use the following notation:
349
+ h: hidden size
350
+ n: number of attention heads
351
+ b: batch size
352
+ s: sequence length
353
+ Transformer layer takes input with size [b, s, h] and returns an
354
+ output of the same size.
355
+
356
+ Arguments:
357
+ hidden_size: The hidden size of the self attention.
358
+ num_attention_heads: number of attention head in the self
359
+ attention.
360
+ attention_dropout_prob: dropout probability of the attention
361
+ score in self attention.
362
+ output_dropout_prob: dropout probability for the outputs
363
+ after self attention and final output.
364
+ layernorm_epsilon: epsilon used in layernorm to avoid
365
+ division by zero.
366
+ init_method: initialization method used for the weights. Note
367
+ that all biases are initialized to zero and
368
+ layernorm weights are initialized to one.
369
+ output_layer_init_method: output layers (attention output and
370
+ mlp output) initialization. If None,
371
+ use `init_method`.
372
+ """
373
+
374
+ def __init__(self,
375
+ hidden_size,
376
+ num_attention_heads,
377
+ attention_dropout_prob,
378
+ output_dropout_prob,
379
+ layernorm_epsilon,
380
+ init_method,
381
+ output_layer_init_method=None,
382
+ attention_scale=1.0):
383
+ super(GLMBlock, self).__init__()
384
+ # Set output layer initialization if not provided.
385
+ if output_layer_init_method is None:
386
+ output_layer_init_method = init_method
387
+
388
+ # Layernorm on the input data.
389
+ self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
390
+
391
+ # Self attention.
392
+ self.attention = SelfAttention(
393
+ hidden_size,
394
+ num_attention_heads,
395
+ attention_dropout_prob,
396
+ output_dropout_prob,
397
+ init_method,
398
+ output_layer_init_method=output_layer_init_method,
399
+ attention_scale=attention_scale)
400
+
401
+ # Layernorm on the input data.
402
+ self.post_attention_layernorm = LayerNorm(hidden_size,
403
+ eps=layernorm_epsilon)
404
+
405
+ # MLP
406
+ self.mlp = MLP(
407
+ hidden_size,
408
+ output_dropout_prob,
409
+ init_method,
410
+ output_layer_init_method=output_layer_init_method)
411
+
412
+ def forward(self, hidden_states, ltor_mask, mem=None):
413
+ # hidden_states: [b, s, h]
414
+ # ltor_mask: [b,1, s,s]
415
+
416
+ # Layer norm at the beginning of the transformer layer.
417
+ layernorm_output = self.input_layernorm(hidden_states)
418
+ mem = self.input_layernorm(mem) if mem is not None else None
419
+ # Self attention.
420
+ attention_output = self.attention(layernorm_output, ltor_mask, mem)
421
+ # Residual connection.
422
+ layernorm_input = hidden_states + attention_output
423
+ # Layer norm post the self attention.
424
+ layernorm_output = self.post_attention_layernorm(layernorm_input)
425
+ # MLP.
426
+ mlp_output = self.mlp(layernorm_output)
427
+ # Second residual connection.
428
+ output = layernorm_input + mlp_output
429
+
430
+ return output
431
+
432
+
433
+ class GLMStack(torch.nn.Module):
434
+ """GLM transformer.
435
+
436
+ This module takes input from the embedding layer and its output can
437
+ be used directly by a logit layer. It consists of L (num-layers)
438
+ blocks of:
439
+ layer norm
440
+ self attention
441
+ residual connection
442
+ layer norm
443
+ mlp
444
+ residual connection
445
+ followed by a final layer norm.
446
+
447
+ Arguments:
448
+ num_layers: Number of transformer layers.
449
+ hidden_size: The hidden size of the self attention.
450
+ num_attention_heads: number of attention head in the self
451
+ attention.
452
+ attention_dropout_prob: dropout probability of the attention
453
+ score in self attention.
454
+ output_dropout_prob: dropout probability for the outputs
455
+ after self attention and final output.
456
+ checkpoint_activations: if True, checkpoint activations.
457
+ checkpoint_num_layers: number of layers to checkpoint. This
458
+ is basically the chunk size in checkpointing.
459
+ layernorm_epsilon: epsilon used in layernorm to avoid
460
+ division by zero.
461
+ init_method_std: standard deviation of the init method which has
462
+ the form N(0, std).
463
+ use_scaled_init_for_output_weights: If True, use 1/sqrt(2*num_layers)
464
+ scaling for the output weights (
465
+ output of self attention and mlp).
466
+ """
467
+
468
+ def __init__(self,
469
+ num_layers,
470
+ hidden_size,
471
+ num_attention_heads,
472
+ max_sequence_length,
473
+ embedding_dropout_prob,
474
+ attention_dropout_prob,
475
+ output_dropout_prob,
476
+ checkpoint_activations,
477
+ checkpoint_num_layers=1,
478
+ layernorm_epsilon=1.0e-5,
479
+ init_method_std=0.02,
480
+ use_scaled_init_for_output_weights=True,
481
+ block_position_encoding=False,
482
+ attention_scale=1.0,
483
+ ):
484
+ super(GLMStack, self).__init__()
485
+ self.hidden_size = hidden_size
486
+ # Store activation checkpointing flag.
487
+ self.checkpoint_activations = checkpoint_activations
488
+ self.checkpoint_num_layers = checkpoint_num_layers
489
+
490
+ output_layer_init_method = None
491
+ if use_scaled_init_for_output_weights:
492
+ output_layer_init_method = scaled_init_method(0.0, init_method_std,
493
+ num_layers)
494
+ # Embeddings dropout
495
+ self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
496
+ self.block_position_encoding = block_position_encoding
497
+
498
+ # Position embedding (serial).
499
+ if block_position_encoding:
500
+ self.position_embeddings = torch.nn.Embedding(max_sequence_length + 1, hidden_size)
501
+ self.block_position_embeddings = torch.nn.Embedding(max_sequence_length + 1, hidden_size)
502
+ torch.nn.init.normal_(self.block_position_embeddings.weight, mean=0.0, std=init_method_std)
503
+ else:
504
+ self.position_embeddings = torch.nn.Embedding(max_sequence_length, hidden_size)
505
+ # Initialize the position embeddings.
506
+ torch.nn.init.normal_(self.position_embeddings.weight, mean=0.0, std=init_method_std)
507
+
508
+ def get_layer():
509
+
510
+ return GLMBlock(
511
+ hidden_size,
512
+ num_attention_heads,
513
+ attention_dropout_prob,
514
+ output_dropout_prob,
515
+ layernorm_epsilon,
516
+ unscaled_init_method(init_method_std),
517
+ output_layer_init_method=output_layer_init_method,
518
+ attention_scale=attention_scale)
519
+
520
+ # Transformer layers.
521
+ self.layers = torch.nn.ModuleList(
522
+ [get_layer() for _ in range(num_layers)])
523
+
524
+ # Final layer norm before output.
525
+ self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
526
+
527
+ def forward(self, hidden_states, position_ids, attention_mask, memory_states=None):
528
+
529
+ batch_size, query_length = hidden_states.size()[:2]
530
+ memory_length = memory_states[0].size(1) if memory_states else 0
531
+ # attention mask is the beginning position of B region, \in [0, query_len)
532
+ is_scalar = torch.numel(attention_mask) == 1
533
+ is_sep = is_scalar or torch.numel(attention_mask) == batch_size
534
+ if is_sep:
535
+ sep = attention_mask.item() if is_scalar else attention_mask
536
+
537
+ # conventional transformer
538
+ def build_mask_matrix(seq_length, sep, memory_length=0):
539
+ m = hidden_states.new_ones((1, seq_length, seq_length))
540
+ m = torch.tril(m)
541
+ if is_scalar:
542
+ m[0, :, :int(sep)] = 1
543
+ else:
544
+ m = m.expand(batch_size, -1, -1)
545
+ ids = torch.arange(seq_length, device=sep.device, dtype=sep.dtype).view(1, -1)
546
+ mask = ids < sep.view(-1, 1)
547
+ m = m.masked_fill(mask.unsqueeze(1).expand_as(m), 1)
548
+ if memory_length > 0:
549
+ m = m.expand(batch_size, -1, -1)
550
+ m = torch.cat((hidden_states.new_ones((batch_size, seq_length, memory_length)), m), dim=2)
551
+ m = m.unsqueeze(1)
552
+ return m
553
+
554
+ attention_mask = build_mask_matrix(query_length, sep, memory_length=memory_length)
555
+ else:
556
+ if attention_mask.dim() == 2:
557
+ attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
558
+ attention_mask = attention_mask[:, :, :, -query_length - memory_length:]
559
+
560
+ if self.block_position_encoding:
561
+ position_ids, block_position_ids = position_ids[:, 0], position_ids[:, 1]
562
+ position_embeddings = self.position_embeddings(position_ids)
563
+
564
+ hidden_states = hidden_states + position_embeddings
565
+ if self.block_position_encoding:
566
+ block_position_embeddings = self.block_position_embeddings(block_position_ids)
567
+ hidden_states = hidden_states + block_position_embeddings
568
+ hidden_states = self.embedding_dropout(hidden_states)
569
+
570
+ def check_detach(_hidden_states):
571
+ return _hidden_states.detach()
572
+
573
+ mem_layers = [check_detach(hidden_states)]
574
+
575
+ for i, layer in enumerate(self.layers):
576
+
577
+ args = [hidden_states, attention_mask]
578
+
579
+ def create_custom_forward(module):
580
+ def custom_forward(*inputs):
581
+ # None for past_key_value
582
+ return module(*inputs)
583
+
584
+ return custom_forward
585
+
586
+ mem_i = memory_states[i] if memory_states else None
587
+
588
+ if self.checkpoint_activations:
589
+ hidden_states = torch.utils.checkpoint.checkpoint(
590
+ create_custom_forward(layer),
591
+ hidden_states,
592
+ mem=mem_i,
593
+ )
594
+ else:
595
+ hidden_states = layer(*args, mem=mem_i)
596
+ mem_layers.append(check_detach(hidden_states))
597
+
598
+ # Final layer norm.
599
+ output = self.final_layernorm(hidden_states)
600
+ mem_layers = self.update_mems(mem_layers, memory_states)
601
+ return (output, mem_layers)
602
+
603
+ def update_mems(self, hiddens, mems):
604
+ memory_length = mems[0].size(1) if mems else 0
605
+ query_length = hiddens[0].size(1)
606
+ new_memory_length = memory_length + query_length
607
+
608
+ new_mems = []
609
+ # with torch.no_grad():
610
+ for i in range(len(hiddens)):
611
+ if new_memory_length <= query_length:
612
+ new_mems.append(hiddens[i][:, -new_memory_length:])
613
+ else:
614
+ new_mems.append(torch.cat((mems[i][:, -new_memory_length + query_length:], hiddens[i]), dim=1))
615
+ return new_mems
616
+
617
+
618
+ class GLMPreTrainedModel(PreTrainedModel):
619
+ """
620
+ An abstract class to handle weights initialization and
621
+ a simple interface for downloading and loading pretrained models.
622
+ """
623
+
624
+ config_class = GLMConfig
625
+ base_model_prefix = "glm"
626
+ supports_gradient_checkpointing = True
627
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
628
+
629
+ def _init_weights(self, module):
630
+ """ Initialize the weights """
631
+ if isinstance(module, torch.nn.Linear):
632
+ # Slightly different from the TF version which uses truncated_normal for initialization
633
+ # cf https://github.com/pytorch/pytorch/pull/5617
634
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
635
+ if module.bias is not None:
636
+ module.bias.data.zero_()
637
+ elif isinstance(module, torch.nn.Embedding):
638
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
639
+ if module.padding_idx is not None:
640
+ module.weight.data[module.padding_idx].zero_()
641
+ elif isinstance(module, torch.nn.LayerNorm):
642
+ module.bias.data.zero_()
643
+ module.weight.data.fill_(1.0)
644
+
645
+ def _set_gradient_checkpointing(self, module, value=False):
646
+ if isinstance(module, GLMModel):
647
+ module.gradient_checkpointing = value
648
+
649
+
650
+ GLM_START_DOCSTRING = r"""
651
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class.
652
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
653
+ usage and behavior.
654
+
655
+ Parameters:
656
+ config ([`~GLMConfig`]): Model configuration class with all the parameters of the model.
657
+ Initializing with a config file does not load the weights associated with the model, only the configuration.
658
+ Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
659
+ """
660
+
661
+ GLM_INPUTS_DOCSTRING = r"""
662
+ Args:
663
+ input_ids (`torch.LongTensor` of shape `({0})`):
664
+ Indices of input sequence tokens in the vocabulary.
665
+
666
+ Indices can be obtained using [`GLMTokenizer`].
667
+ See [`PreTrainedTokenizer.encode`] and
668
+ [`PreTrainedTokenizer.__call__`] for details.
669
+
670
+ [What are input IDs?](../glossary#input-ids)
671
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
672
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
673
+
674
+ - 1 for tokens that are **not masked**,
675
+ - 0 for tokens that are **masked**.
676
+
677
+ [What are attention masks?](../glossary#attention-mask)
678
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
679
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
680
+
681
+ - 0 corresponds to a *sentence A* token,
682
+ - 1 corresponds to a *sentence B* token.
683
+
684
+ [What are token type IDs?](../glossary#token-type-ids)
685
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
686
+ Indices of positions of each input sequence tokens in the position embeddings.
687
+ Selected in the range `[0, config.max_position_embeddings - 1]`.
688
+
689
+ [What are position IDs?](../glossary#position-ids)
690
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
691
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
692
+
693
+ - 1 indicates the head is **not masked**,
694
+ - 0 indicates the head is **masked**.
695
+
696
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
697
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
698
+ This is useful if you want more control over how to convert *input_ids* indices into associated vectors
699
+ than the model's internal embedding lookup matrix.
700
+ output_attentions (`bool`, *optional*):
701
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
702
+ tensors for more detail.
703
+ output_hidden_states (`bool`, *optional*):
704
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
705
+ more detail.
706
+ return_dict (`bool`, *optional*):
707
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
708
+ """
709
+
710
+
711
+ @add_start_docstrings(
712
+ "The bare GLM Model transformer outputting raw hidden-states without any specific head on top.",
713
+ GLM_START_DOCSTRING,
714
+ )
715
+ class GLMModel(GLMPreTrainedModel):
716
+ """
717
+
718
+ The model can behave as an encoder (with only self-attention) as well
719
+ as a decoder, in which case a layer of cross-attention is added between
720
+ the self-attention layers, following the architecture described in [Attention is
721
+ all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani,
722
+ Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
723
+
724
+ To behave as a decoder the model needs to be initialized with the
725
+ `is_decoder` argument of the configuration set to `True`.
726
+ To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder`
727
+ argument and `add_cross_attention` set to `True`; an
728
+ `encoder_hidden_states` is then expected as an input to the forward pass.
729
+ """
730
+
731
+ def __init__(self, config):
732
+ super().__init__(config)
733
+ self.config = config
734
+ self.output_predict = config.output_predict
735
+ # Word embeddings (parallel).
736
+ self.word_embeddings = VocabEmbedding(config)
737
+
738
+ # Transformer
739
+ self.transformer = GLMStack(config.num_layers,
740
+ config.hidden_size,
741
+ config.num_attention_heads,
742
+ config.max_sequence_length,
743
+ config.embedding_dropout_prob,
744
+ config.attention_dropout_prob,
745
+ config.output_dropout_prob,
746
+ config.checkpoint_activations,
747
+ config.checkpoint_num_layers,
748
+ attention_scale=config.attention_scale,
749
+ block_position_encoding=config.block_position_encoding)
750
+
751
+ # Initialize weights and apply final processing
752
+ self.post_init()
753
+
754
+ @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
755
+ @add_code_sample_docstrings(
756
+ processor_class=_TOKENIZER_FOR_DOC,
757
+ checkpoint=_CHECKPOINT_FOR_DOC,
758
+ output_type=BaseModelOutputWithPastAndCrossAttentions,
759
+ config_class=_CONFIG_FOR_DOC,
760
+ )
761
+ def forward(
762
+ self,
763
+ input_ids=None,
764
+ position_ids=None,
765
+ attention_mask=None,
766
+ mems=None,
767
+ **kwargs
768
+ ):
769
+ batch_size = input_ids.size(0)
770
+ words_embeddings = self.word_embeddings(input_ids)
771
+ embeddings = words_embeddings
772
+
773
+ device = input_ids.device
774
+ input_shape = input_ids.size()
775
+
776
+ if position_ids is None:
777
+ position_ids = torch.arange(0, input_shape[-1], dtype=torch.long, device=device)
778
+ block_position_ids = torch.zeros(input_shape[-1], dtype=torch.long, device=device)
779
+ position_ids = torch.stack((position_ids, block_position_ids), dim=0).unsqueeze(0)
780
+ if attention_mask is None:
781
+ attention_mask = torch.zeros(batch_size)
782
+ # Transformer.
783
+ transformer_output = self.transformer(embeddings, position_ids, attention_mask, mems)
784
+ last_hidden_states, mems = transformer_output
785
+ logits = None
786
+ if self.output_predict:
787
+ logits = F.linear(last_hidden_states, self.word_embeddings.weight)
788
+
789
+ return ModelOutput(
790
+ last_hidden_states=last_hidden_states,
791
+ logits=logits,
792
+ mems=mems,
793
+ )
794
+
795
+
796
+ @add_start_docstrings(
797
+ """GLM Model transformer for multiple choice classification""",
798
+ GLM_START_DOCSTRING
799
+ )
800
+ class GLMForMultipleChoice(GLMPreTrainedModel):
801
+ def __init__(self, config):
802
+ super().__init__(config)
803
+ self.glm = GLMModel(config)
804
+ self.post_init()
805
+
806
+ def forward(
807
+ self,
808
+ input_ids=None,
809
+ position_ids=None,
810
+ attention_mask=None,
811
+ choice_ids=None,
812
+ choice_indices=None,
813
+ labels=None,
814
+ mems=None,
815
+ **kwargs
816
+ ):
817
+ model_output = self.glm(input_ids, position_ids, attention_mask, mems=mems, **kwargs)
818
+ lm_logits = model_output.logits
819
+ log_probs = []
820
+ for output, choices, choice_index in zip(F.log_softmax(lm_logits, dim=-1), choice_ids, choice_indices):
821
+ log_probs_single = []
822
+ for choice, choice_target_id in zip(choices, choice_index):
823
+ tmp = output[choice_target_id, choice]
824
+ log_probs_single.append(tmp.sum())
825
+ log_probs.append(torch.stack(log_probs_single))
826
+ log_probs = torch.stack(log_probs)
827
+ loss = None
828
+ if labels is not None:
829
+ loss_fct = CrossEntropyLoss()
830
+ loss = loss_fct(log_probs, labels)
831
+ return ModelOutput(
832
+ loss=loss,
833
+ logits=log_probs,
834
+             lm_logits=lm_logits,
+             mems=model_output.mems
+         )
+
+
+ @add_start_docstrings(
+     """GLM Model transformer with a `language modeling` head on top""",
+     GLM_START_DOCSTRING,
+ )
+ class GLMForConditionalGeneration(GLMPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.glm = GLMModel(config)
+         self.post_init()
+
+     def _reorder_cache(self, past, beam_idx):
+         # If the decoder past (mems) is not included in the output,
+         # speedy decoding is disabled and there is nothing to reorder.
+         if past is None:
+             return past
+         reordered_decoder_past = ()
+         for layer_past_states in past:
+             # select the correct batch indices from the layer past batch dimension
+             reordered_decoder_past = reordered_decoder_past + (
+                 layer_past_states.index_select(0, beam_idx.to(layer_past_states.device)),)
+         return reordered_decoder_past
+
+     def prepare_inputs_for_generation(self, input_ids, past=None, position_ids=None,
+                                       generation_attention_mask=None, **kwargs):
+         # only keep the last token of input_ids (and the matching mask/position slices) once past is defined
+         attention_mask = generation_attention_mask
+         seq_length = input_ids.shape[1]
+         if past:
+             if position_ids is not None:
+                 position_ids = position_ids[:, :, seq_length - 1].unsqueeze(-1)
+             if attention_mask is not None:
+                 attention_mask = attention_mask[:, :, seq_length - 1, :seq_length].unsqueeze(-2)
+             input_ids = input_ids[:, -1].unsqueeze(-1)
+         else:
+             if position_ids is not None:
+                 position_ids = position_ids[:, :, :seq_length]
+             if attention_mask is not None:
+                 attention_mask = attention_mask[:, :, :seq_length, :seq_length]
+         if position_ids is not None and input_ids.size(0) > position_ids.size(0):
+             # expand position ids across beams
+             batch_size = position_ids.size(0)
+             num_beams = input_ids.size(0) // batch_size
+             position_ids = position_ids.unsqueeze(1).expand(-1, num_beams, -1, -1)
+             position_ids = position_ids.reshape(batch_size * num_beams, *position_ids.shape[-2:])
+         if attention_mask is not None and input_ids.size(0) > attention_mask.size(0):
+             # expand the attention mask across beams
+             batch_size = attention_mask.size(0)
+             num_beams = input_ids.size(0) // batch_size
+             attention_mask = attention_mask.unsqueeze(1).expand(-1, num_beams, -1, -1, -1)
+             attention_mask = attention_mask.reshape(batch_size * num_beams, *attention_mask.shape[-3:])
+         return {
+             "input_ids": input_ids,
+             "position_ids": position_ids,
+             "attention_mask": attention_mask,
+             "mems": past,
+         }
+
+     def forward(
+             self,
+             input_ids=None,
+             position_ids=None,
+             attention_mask=None,
+             labels=None,
+             mems=None,
+             **kwargs
+     ):
+         model_output = self.glm(input_ids, position_ids, attention_mask, mems=mems, **kwargs)
+         lm_logits = model_output.logits
+         loss = None
+         if labels is not None:
+             loss_fct = CrossEntropyLoss(ignore_index=-100)
+             loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
+         return ModelOutput(
+             loss=loss,
+             logits=lm_logits,
+             mems=model_output.mems
+         )
+
+
+ @add_start_docstrings(
+     """GLM Model transformer with a sequence classification/regression head on top (a linear layer on top of
+     the pooled output) e.g. for GLUE tasks. """,
+     GLM_START_DOCSTRING,
+ )
+ class GLMForSequenceClassification(GLMPreTrainedModel):
+     def __init__(self, config: GLMConfig, hidden_dropout=None, num_class=1):
+         super().__init__(config)
+         self.pool_token = config.pool_token
+         self.glm = GLMModel(config)
+         self.glm.output_predict = False
+         self.num_class = num_class
+         # Multi-choice head.
+         self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
+         classifier_dropout = (
+             config.classifier_dropout if config.classifier_dropout is not None else config.output_dropout_prob
+         )
+         self.dropout = torch.nn.Dropout(classifier_dropout)
+         self.out_proj = torch.nn.Linear(config.hidden_size, config.num_labels)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+     @add_code_sample_docstrings(
+         processor_class=_TOKENIZER_FOR_DOC,
+         checkpoint=_CHECKPOINT_FOR_DOC,
+         output_type=SequenceClassifierOutput,
+         config_class=_CONFIG_FOR_DOC,
+     )
+     def forward(self,
+                 input_ids=None,
+                 position_ids=None,
+                 attention_mask=None,
+                 labels=None):
+
+         num_choices = None
+
+         if len(input_ids.shape) == 3:
+             # multiple-choice input: fold the choice dimension into the batch dimension
+             batch_size, num_choices = input_ids.shape[:2]
+             input_ids = input_ids.reshape(-1, input_ids.size(-1))
+             attention_mask = attention_mask.reshape(-1, *attention_mask.size()[2:])
+             position_ids = position_ids.reshape(-1, *position_ids.size()[2:])
+         model_out = self.glm(input_ids, position_ids, attention_mask)
+         outputs, mems = model_out.last_hidden_states, model_out.mems
+
+         # classify on the first (pooled) token
+         output = outputs[:, 0, :]
+         output = self.dropout(output)
+         output = torch.tanh(self.dense(output))
+         output = self.dropout(output)
+         logits = self.out_proj(output)
+         if num_choices is not None:
+             logits = logits.view(-1, num_choices)
+         loss = None
+         if labels is not None:
+             loss_fct = CrossEntropyLoss()
+             loss = loss_fct(logits, labels)
+         return SequenceClassifierOutput(loss=loss,
+                                         logits=logits,
+                                         hidden_states=outputs)
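For orientation, a minimal usage sketch for the conditional-generation head above. It assumes this repo's config.json maps AutoModelForSeq2SeqLM to GLMForConditionalGeneration (that mapping is not shown in this diff) and relies on the build_inputs_for_generation helper defined in tokenization_glm.py below; the checkpoint name and generation settings are illustrative, not part of this commit:

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Illustrative checkpoint; trust_remote_code pulls in the classes added by this commit.
tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-10b", trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained("THUDM/glm-10b", trust_remote_code=True)
if torch.cuda.is_available():
    model = model.half().cuda()
model.eval()

# [MASK] marks the blank that GLM fills in autoregressively.
inputs = tokenizer("Tsinghua University is located in [MASK].", return_tensors="pt")
inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=512)
inputs = inputs.to(model.device)
outputs = model.generate(**inputs, max_length=512, eos_token_id=tokenizer.eop_token_id)
print(tokenizer.decode(outputs[0].tolist()))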
tokenization_glm.py ADDED
@@ -0,0 +1,362 @@
+ import os
+ from typing import Optional, Tuple, List, Union
+ from shutil import copyfile
+ import torch
+
+ from transformers import PreTrainedTokenizer, RobertaTokenizer, GPT2Tokenizer, BertTokenizer
+ from transformers.utils import logging
+ from transformers.tokenization_utils_base import BatchEncoding
+ from transformers.models.auto.tokenization_auto import get_tokenizer_config
+ from transformers.utils.generic import _is_torch_device
+ import sentencepiece as spm
+
+ logger = logging.get_logger(__name__)
+
+
+ class GLMBatchEncoding(BatchEncoding):
+     def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding":
+         """
+         Send all values to device by calling `v.to(device)` (PyTorch only).
+
+         Args:
+             device (`str` or `torch.device`): The device to put the tensors on.
+
+         Returns:
+             [`BatchEncoding`]: The same instance after modification.
+         """
+         # This check catches things like APEX blindly calling "to" on all inputs to a module.
+         # Otherwise it passes the casts down and casts the LongTensor containing the token ids
+         # into a HalfTensor.
+         if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int):
+             self.data = {k: v.to(device=device) if torch.is_tensor(v) else v for k, v in self.data.items()}
+         else:
+             logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")
+         return self
+
+
+ class GLMTokenizerMixin:
+     @property
+     def sop_token(self) -> Optional[str]:
+         return "<|startofpiece|>"
+
+     @property
+     def sop_token_id(self) -> Optional[int]:
+         """
+         `Optional[int]`: Id of the start-of-piece token in the vocabulary, used when training a model with
+         autoregressive blank filling.
+         """
+         return self.convert_tokens_to_ids(self.sop_token)
+
+     @property
+     def eop_token(self) -> Optional[str]:
+         return "<|endofpiece|>"
+
+     @property
+     def eop_token_id(self) -> Optional[int]:
+         """
+         `Optional[int]`: Id of the end-of-piece token in the vocabulary, used when training a model with
+         autoregressive blank filling.
+         """
+         return self.convert_tokens_to_ids(self.eop_token)
+
+     @property
+     def gmask_token_id(self) -> int:
+         return self.convert_tokens_to_ids("[gMASK]")
+
+     @property
+     def smask_token_id(self) -> int:
+         return self.convert_tokens_to_ids("[sMASK]")
+
+     @property
+     def mask_token_ids(self):
+         return [self.mask_token_id, self.smask_token_id, self.gmask_token_id]
+
+     def _build_input_for_multiple_choice(self, context, choices):
+         context_id = context["input_ids"]
+         if torch.is_tensor(context_id):
+             context_id = context_id.tolist()
+
+         division = len(context_id)
+         mask_position = context_id.index(self.mask_token_id)
+
+         token = torch.tensor(context_id, dtype=torch.long)
+         attention_mask = [context["attention_mask"].expand(division, -1)]
+         position_id = torch.arange(division, dtype=torch.long)
+         block_position_id = torch.zeros(division, dtype=torch.long)
+
+         choice_ids, choice_indices = [], []
+
+         for choice_str in choices:
+             choice = torch.tensor(self(choice_str, add_special_tokens=False, padding=False)['input_ids'],
+                                   dtype=torch.long)
+             choice_ids.append(choice)
+             choice_indices.append(torch.arange(len(token), len(token) + len(choice), dtype=torch.long))
+             attention_mask.append(torch.tril(torch.ones((len(choice), len(choice)), dtype=torch.long)))
+
+             token = torch.cat((token, torch.tensor([self.sop_token_id], dtype=torch.long), choice[:-1]))
+             position_id = torch.cat((position_id, torch.tensor([mask_position] * len(choice), dtype=torch.long)))
+             block_position_id = torch.cat((block_position_id, torch.arange(1, 1 + len(choice), dtype=torch.long)))
+
+         attention_mask = torch.block_diag(*attention_mask)
+         attention_mask[division:, :division] = context["attention_mask"].unsqueeze(0)
+
+         return {
+             "input_ids": token,
+             "position_ids": torch.stack((position_id, block_position_id)),
+             "attention_mask": attention_mask,
+             "choice_ids": choice_ids,
+             "choice_indices": choice_indices
+         }
+
+     def _pad_batch(self, tokens, position_ids, attention_mask, max_seq_length):
+         pad_length = max_seq_length - len(tokens)
+         attention_mask = torch.nn.functional.pad(
+             attention_mask,
+             (0, pad_length, 0, pad_length),
+             mode="constant",
+             value=0,
+         )
+         tokens = torch.cat((tokens, torch.zeros(pad_length, dtype=torch.long)))
+         position_ids = torch.cat((position_ids, position_ids[..., -1:].expand(-1, pad_length)), dim=-1)
+         return tokens, position_ids, attention_mask
+
+     def _collate(self, samples):
+         TILE = 1
+         length_to_pad = (max(map(lambda spl: len(spl["input_ids"]), samples)) + TILE - 1) // TILE * TILE
+
+         token_batch, position_id_batch, attention_mask_batch = [], [], []
+         choices_batch, choice_target_ids_batch = [], []
+
+         for sample in samples:
+             token, position_id, attention_mask = self._pad_batch(
+                 sample["input_ids"], sample["position_ids"], sample["attention_mask"], length_to_pad
+             )
+             token_batch.append(token)
+             position_id_batch.append(position_id)
+             attention_mask_batch.append(attention_mask)
+             choices_batch.append(sample["choice_ids"])
+             choice_target_ids_batch.append(sample["choice_indices"])
+         return {
+             "input_ids": torch.stack(token_batch),
+             "position_ids": torch.stack(position_id_batch),
+             "attention_mask": torch.stack(attention_mask_batch).unsqueeze(1),
+             "choice_ids": choices_batch,
+             "choice_indices": choice_target_ids_batch,
+         }
+
+     def build_inputs_for_multiple_choice(self, model_input: BatchEncoding, choices, max_length=None):
+         samples = [{key: value[i] for key, value in model_input.items()} for i in range(len(model_input["input_ids"]))]
+         samples = [self._build_input_for_multiple_choice(sample, choice) for sample, choice in
+                    zip(samples, choices)]
+         inputs = self._collate(samples)
+         return GLMBatchEncoding(inputs)
+
+     def build_inputs_for_generation(self, model_input: BatchEncoding, max_gen_length=512, targets=None, padding=False):
+         mask_ids = self.mask_token_ids
+         input_ids = model_input.input_ids
+         batch_size, seq_length = input_ids.shape[:2]
+         position_id, block_position_id = list(range(seq_length)), [0 for _ in range(seq_length)]
+         position_ids, block_position_ids = [], []
+         labels = None
+         if targets is not None:
+             is_batched = isinstance(targets, (list, tuple))
+             targets = self(targets, add_special_tokens=False, padding=False).input_ids
+             if not is_batched:
+                 targets = [targets]
+             assert len(targets) == len(input_ids)
+             targets = [(target + [self.eop_token_id])[:max_gen_length] for target in targets]
+             if not padding:
+                 max_gen_length = max(map(len, targets))
+             targets = [[self.sop_token_id] + target for target in targets]
+             labels = [target[1:] for target in targets]
+             targets = [target + [self.pad_token_id] * (max_gen_length + 1 - len(target)) for target in targets]
+             labels = [label + [-100] * (max_gen_length - len(label)) for label in labels]
+             targets = torch.tensor(targets, dtype=input_ids.dtype, device=input_ids.device)
+             labels = torch.tensor(labels, dtype=input_ids.dtype, device=input_ids.device)
+             labels = torch.cat((input_ids.new_full((batch_size, seq_length), -100), labels), dim=1)
+         for i in range(batch_size):
+             mask_positions = []
+             for mask_id in mask_ids:
+                 mask_positions += (input_ids[i] == mask_id).nonzero(as_tuple=True)[0].tolist()
+             if not mask_positions:
+                 raise ValueError("Cannot find mask token in the input")
+             mask_positions.sort()
+             mask_pos = mask_positions[0]
+             position_ids.append(position_id + [mask_pos] * max_gen_length)
+             block_position_ids.append(block_position_id + list(range(1, max_gen_length + 1)))
+         position_ids = torch.tensor(position_ids, dtype=input_ids.dtype, device=input_ids.device)
+         block_position_ids = torch.tensor(block_position_ids, dtype=input_ids.dtype, device=input_ids.device)
+         position_ids = torch.stack((position_ids, block_position_ids), dim=1)
+         attention_mask = model_input.attention_mask
+         attention_mask = attention_mask.unsqueeze(1).expand(-1, seq_length + max_gen_length, -1)
+         generation_attention_mask = torch.cat([attention_mask.new_zeros((seq_length, max_gen_length)),
+                                                torch.tril(attention_mask.new_ones((max_gen_length, max_gen_length)))],
+                                               dim=0).unsqueeze(0).expand(batch_size, -1, -1)
+         attention_mask = torch.cat((attention_mask, generation_attention_mask), dim=2)
+         attention_mask = attention_mask.unsqueeze(1)
+         if targets is None:
+             input_ids = torch.cat((input_ids, input_ids.new_full((batch_size, 1), self.sop_token_id)), dim=-1)
+         else:
+             input_ids = torch.cat((input_ids, targets[:, :-1]), dim=1)
+         batch = {"input_ids": input_ids, "position_ids": position_ids}
+         if labels is None:
+             batch["generation_attention_mask"] = attention_mask
+         else:
+             batch["attention_mask"] = attention_mask
+             batch["labels"] = labels
+         return BatchEncoding(batch)
+
+
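The mixin above flattens a context and its candidate answers into one sequence with two-dimensional position ids and a block-diagonal attention mask. A small sketch of driving just the tokenizer side (checkpoint name illustrative; every prompt must contain a [MASK] token, otherwise _build_input_for_multiple_choice raises):

from transformers import AutoTokenizer

# Illustrative checkpoint; trust_remote_code loads the GLM tokenizer classes in this file.
tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-10b", trust_remote_code=True)

contexts = ["Tsinghua University is located in [MASK].",
            "One minus one equals [MASK]."]
choices = [["Beijing", "Shanghai"], ["zero", "one"]]

encoded = tokenizer(contexts, return_tensors="pt", padding=True)
batch = tokenizer.build_inputs_for_multiple_choice(encoded, choices)
# batch is a GLMBatchEncoding holding padded input_ids, stacked (position, block-position) ids,
# a 4D attention mask, and the per-sample choice token ids / positions used for scoring.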
+ class GLMRobertaTokenizer(RobertaTokenizer, GLMTokenizerMixin):
+     model_input_names = ["input_ids", "position_ids", "attention_mask"]
+     truncation_side: str = "left"
+
+     @property
+     def gmask_token_id(self) -> int:
+         raise NotImplementedError("The model doesn't support gMASK")
+
+     @property
+     def smask_token_id(self) -> int:
+         raise NotImplementedError("The model doesn't support sMASK")
+
+     @property
+     def mask_token_ids(self):
+         return [self.mask_token_id]
+
+
+ class GLMChineseTokenizer(PreTrainedTokenizer, GLMTokenizerMixin):
+     vocab_files_names = {"vocab_file": "cog-pretrain.model"}
+     truncation_side: str = "left"
+
+     def __init__(self, vocab_file, **kwargs):
+         super().__init__(**kwargs)
+         self.vocab_file = vocab_file
+         self.sp_model = spm.SentencePieceProcessor()
+         self.sp_model.Load(vocab_file)
+
+     @property
+     def vocab_size(self):
+         return len(self.sp_model)
+
+     def get_vocab(self):
+         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+         vocab.update(self.added_tokens_encoder)
+         return vocab
+
+     def _tokenize(self, text, **kwargs):
+         return self.sp_model.encode(text, out_type=str)
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) into an id using the vocab."""
+         return self.sp_model.PieceToId(token)
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) into a token (str) using the vocab."""
+         return self.sp_model.IdToPiece(index)
+
+     def convert_tokens_to_string(self, tokens):
+         return self.sp_model.decode(tokens)
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+             return
+         out_vocab_file = os.path.join(
+             save_directory, (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"]
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+         elif not os.path.isfile(self.vocab_file):
+             with open(out_vocab_file, "wb") as fi:
+                 content_spiece_model = self.sp_model.serialized_model_proto()
+                 fi.write(content_spiece_model)
+
+         return (out_vocab_file,)
+
+     def build_inputs_with_special_tokens(
+             self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a single sequence for sequence classification tasks by concatenating and adding
+         special tokens. A GLM sequence has the following format:
+
+         - single sequence: ``[CLS] X <eos>``
+
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (:obj:`List[int]`, `optional`):
+                 Optional second list of IDs for sequence pairs (not supported; must be ``None``).
+
+         Returns:
+             :obj:`List[int]`: List of input IDs with the appropriate special tokens.
+         """
+         assert token_ids_1 is None
+         cls = [self.cls_token_id]
+         eos = [self.eos_token_id]
+         return cls + token_ids_0 + eos
+
+
+ class GLMGPT2Tokenizer(GPT2Tokenizer, GLMTokenizerMixin):
+     model_input_names = ["input_ids", "position_ids", "attention_mask"]
+     truncation_side: str = "left"
+
+     def build_inputs_with_special_tokens(
+             self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a single sequence for sequence classification tasks by concatenating and adding
+         special tokens. A GLM sequence has the following format:
+
+         - single sequence: ``[CLS] X <eos>``
+
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (:obj:`List[int]`, `optional`):
+                 Optional second list of IDs for sequence pairs (not supported; must be ``None``).
+
+         Returns:
+             :obj:`List[int]`: List of input IDs with the appropriate special tokens.
+         """
+         assert token_ids_1 is None
+         cls = [self.cls_token_id]
+         eos = [self.eos_token_id]
+         return cls + token_ids_0 + eos
+
+
+ class GLMBertTokenizer(BertTokenizer, GLMTokenizerMixin):
+     model_input_names = ["input_ids", "position_ids", "attention_mask"]
+     truncation_side: str = "left"
+
+     @property
+     def gmask_token_id(self) -> int:
+         raise NotImplementedError("The model doesn't support gMASK")
+
+     @property
+     def smask_token_id(self) -> int:
+         raise NotImplementedError("The model doesn't support sMASK")
+
+     @property
+     def mask_token_ids(self):
+         return [self.mask_token_id]
+
+
+ class GLMTokenizer:
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
+         tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
+         config_tokenizer_class = tokenizer_config.get("tokenizer_class")
+         if config_tokenizer_class == "GLMRobertaTokenizer":
+             tokenizer_class = GLMRobertaTokenizer
+         elif config_tokenizer_class == "GLMChineseTokenizer":
+             tokenizer_class = GLMChineseTokenizer
+         elif config_tokenizer_class == "GLMGPT2Tokenizer":
+             tokenizer_class = GLMGPT2Tokenizer
+         elif config_tokenizer_class == "GLMBertTokenizer":
+             tokenizer_class = GLMBertTokenizer
+         else:
+             raise NotImplementedError(f"Unsupported tokenizer type: {config_tokenizer_class}")
+         return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
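GLMTokenizer.from_pretrained is a thin dispatcher: it reads the "tokenizer_class" field from the checkpoint's tokenizer_config.json and forwards to the matching subclass. A hedged sketch of the two equivalent entry points (checkpoint name illustrative; the direct import assumes tokenization_glm.py is on the Python path):

from transformers import AutoTokenizer
from tokenization_glm import GLMTokenizer

# AutoTokenizer resolves through the auto_map in tokenizer_config.json (remote code),
# while GLMTokenizer dispatches on the "tokenizer_class" field itself.
tok_auto = AutoTokenizer.from_pretrained("THUDM/glm-10b", trust_remote_code=True)
tok_glm = GLMTokenizer.from_pretrained("THUDM/glm-10b")
print(type(tok_auto).__name__, type(tok_glm).__name__)  # both expected to be GLMGPT2Tokenizer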
tokenizer_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "name_or_path": "THUDM/glm-10b",
+   "eos_token": "<|endoftext|>",
+   "pad_token": "<|endoftext|>",
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "unk_token": "[UNK]",
+   "additional_special_tokens": ["<|startofpiece|>", "<|endofpiece|>", "[gMASK]", "[sMASK]"],
+   "add_prefix_space": false,
+   "tokenizer_class": "GLMGPT2Tokenizer",
+   "use_fast": false,
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_glm.GLMGPT2Tokenizer",
+       null
+     ]
+   }
+ }
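The config above registers <|endoftext|> as both eos and pad token, adds the GLM blank-filling specials as additional special tokens, and points AutoTokenizer at the slow GLMGPT2Tokenizer through auto_map (no fast tokenizer is shipped). A quick sanity-check sketch; the printed ids depend on the vocab.json below:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("THUDM/glm-10b", trust_remote_code=True)
# Special tokens registered in tokenizer_config.json, resolved against vocab.json.
print(tok.eos_token, tok.pad_token, tok.cls_token, tok.mask_token)
# Ids the GLMTokenizerMixin helpers look up at runtime.
print(tok.mask_token_id, tok.smask_token_id, tok.gmask_token_id, tok.sop_token_id, tok.eop_token_id)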
vocab.json ADDED
The diff for this file is too large to render. See raw diff