pavan01729 committed
Commit
2dc9a76
1 Parent(s): 4ffdfc6

Upload ASVDOPTForCausalLM

config.json ADDED
@@ -0,0 +1,77 @@
+ {
+   "_name_or_path": "huggingface_repos/opt-125m-asvd90",
+   "_remove_final_layer_norm": false,
+   "activation_dropout": 0.0,
+   "activation_function": "relu",
+   "architectures": [
+     "ASVDOPTForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_asvd_opt.ASVDOPTConfig",
+     "AutoModelForCausalLM": "modeling_asvd_opt.ASVDOPTForCausalLM"
+   },
+   "bos_token_id": 2,
+   "do_layer_norm_before": true,
+   "dropout": 0.1,
+   "enable_bias": true,
+   "eos_token_id": 2,
+   "ffn_dim": 3072,
+   "hidden_size": 768,
+   "init_std": 0.02,
+   "layer_norm_elementwise_affine": true,
+   "layerdrop": 0.0,
+   "max_position_embeddings": 2048,
+   "model_type": "opt",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "prefix": "</s>",
+   "torch_dtype": "float16",
+   "transformers_version": "4.35.2",
+   "truncation_ranks": {
+     "model.decoder.layers.0.self_attn.k_proj": 230,
+     "model.decoder.layers.0.self_attn.out_proj": 345,
+     "model.decoder.layers.0.self_attn.q_proj": 230,
+     "model.decoder.layers.0.self_attn.v_proj": 153,
+     "model.decoder.layers.1.fc2": 491,
+     "model.decoder.layers.1.self_attn.k_proj": 115,
+     "model.decoder.layers.1.self_attn.out_proj": 192,
+     "model.decoder.layers.1.self_attn.q_proj": 115,
+     "model.decoder.layers.10.self_attn.k_proj": 268,
+     "model.decoder.layers.10.self_attn.q_proj": 230,
+     "model.decoder.layers.11.self_attn.k_proj": 268,
+     "model.decoder.layers.11.self_attn.q_proj": 307,
+     "model.decoder.layers.2.fc1": 307,
+     "model.decoder.layers.2.fc2": 368,
+     "model.decoder.layers.2.self_attn.k_proj": 307,
+     "model.decoder.layers.2.self_attn.out_proj": 268,
+     "model.decoder.layers.2.self_attn.q_proj": 268,
+     "model.decoder.layers.2.self_attn.v_proj": 268,
+     "model.decoder.layers.3.fc2": 307,
+     "model.decoder.layers.3.self_attn.k_proj": 153,
+     "model.decoder.layers.3.self_attn.q_proj": 230,
+     "model.decoder.layers.4.fc2": 430,
+     "model.decoder.layers.4.self_attn.q_proj": 230,
+     "model.decoder.layers.4.self_attn.v_proj": 307,
+     "model.decoder.layers.5.fc2": 491,
+     "model.decoder.layers.5.self_attn.out_proj": 345,
+     "model.decoder.layers.5.self_attn.q_proj": 268,
+     "model.decoder.layers.6.fc2": 430,
+     "model.decoder.layers.6.self_attn.out_proj": 345,
+     "model.decoder.layers.6.self_attn.q_proj": 268,
+     "model.decoder.layers.7.fc2": 552,
+     "model.decoder.layers.7.self_attn.k_proj": 345,
+     "model.decoder.layers.7.self_attn.out_proj": 230,
+     "model.decoder.layers.7.self_attn.q_proj": 345,
+     "model.decoder.layers.8.fc1": 430,
+     "model.decoder.layers.8.self_attn.k_proj": 268,
+     "model.decoder.layers.8.self_attn.out_proj": 345,
+     "model.decoder.layers.8.self_attn.q_proj": 307,
+     "model.decoder.layers.9.fc2": 552,
+     "model.decoder.layers.9.self_attn.k_proj": 345
+   },
+   "use_cache": true,
+   "vocab_size": 50272,
+   "word_embed_proj_dim": 768
+ }
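Because `auto_map` points `AutoConfig` and `AutoModelForCausalLM` at the custom classes shipped in this repo, the checkpoint is meant to be loaded with `trust_remote_code=True`, which lets `transformers` import `configuration_asvd_opt.py` and `modeling_asvd_opt.py` from the repo instead of the built-in OPT classes. A minimal loading sketch; the repo id is a placeholder assumption, since the commit only records the local `_name_or_path`:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder repo id (assumption): substitute the actual Hub repo this commit belongs to.
repo_id = "pavan01729/opt-125m-asvd90"

tokenizer = AutoTokenizer.from_pretrained(repo_id)
# trust_remote_code=True is required so the auto_map entries above are honored.
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```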
configuration_asvd_opt.py ADDED
@@ -0,0 +1,129 @@
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class ASVDOPTConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`OPTModel`]. It is used to instantiate an OPT model
+     according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+     defaults will yield a similar configuration to that of the OPT
+     [facebook/opt-350m](https://huggingface.co/facebook/opt-350m) architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 50272):
+             Vocabulary size of the OPT model. Defines the number of different tokens that can be represented by the
+             `input_ids` passed when calling [`OPTModel`]
+         hidden_size (`int`, *optional*, defaults to 768):
+             Dimensionality of the layers and the pooler layer.
+         num_hidden_layers (`int`, *optional*, defaults to 12):
+             Number of decoder layers.
+         ffn_dim (`int`, *optional*, defaults to 3072):
+             Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
+         num_attention_heads (`int`, *optional*, defaults to 12):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
+             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+             `"relu"`, `"silu"` and `"gelu_new"` are supported.
+         max_position_embeddings (`int`, *optional*, defaults to 2048):
+             The maximum sequence length that this model might ever be used with. Typically set this to something large
+             just in case (e.g., 512 or 1024 or 2048).
+         do_layer_norm_before (`bool`, *optional*, defaults to `True`):
+             Whether to perform layer normalization before the attention block.
+         word_embed_proj_dim (`int`, *optional*):
+             `word_embed_proj_dim` can be set to down-project word embeddings, *e.g.* `opt-350m`. Defaults to
+             `hidden_size`.
+         dropout (`float`, *optional*, defaults to 0.1):
+             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         layerdrop (`float`, *optional*, defaults to 0.0):
+             The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
+             details.
+         init_std (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models).
+         enable_bias (`bool`, *optional*, defaults to `True`):
+             Whether or not the linear layers in the attention blocks should use the bias term.
+         layer_norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+             Whether or not the layer norms should have learnable parameters.
+
+     Example:
+
+     ```python
+     >>> from transformers import OPTConfig, OPTModel
+
+     >>> # Initializing an OPT facebook/opt-large style configuration
+     >>> configuration = OPTConfig()
+
+     >>> # Initializing a model (with random weights) from the facebook/opt-large style configuration
+     >>> model = OPTModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+     model_type = "opt"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=50272,
+         hidden_size=768,
+         num_hidden_layers=12,
+         ffn_dim=3072,
+         max_position_embeddings=2048,
+         do_layer_norm_before=True,
+         _remove_final_layer_norm=False,
+         word_embed_proj_dim=None,
+         dropout=0.1,
+         attention_dropout=0.0,
+         num_attention_heads=12,
+         activation_function="relu",
+         layerdrop=0.0,
+         init_std=0.02,
+         use_cache=True,
+         pad_token_id=1,
+         bos_token_id=2,
+         eos_token_id=2,
+         enable_bias=True,
+         layer_norm_elementwise_affine=True,
+         truncation_ranks=None,
+         **kwargs,
+     ):
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             **kwargs,
+         )
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.num_attention_heads = num_attention_heads
+         self.word_embed_proj_dim = word_embed_proj_dim if word_embed_proj_dim is not None else hidden_size
+         self.ffn_dim = ffn_dim
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.dropout = dropout
+         self.attention_dropout = attention_dropout
+         self.activation_function = activation_function
+         self.init_std = init_std
+         self.layerdrop = layerdrop
+         self.use_cache = use_cache
+         self.do_layer_norm_before = do_layer_norm_before
+         # We keep these variables at `True` for backward compatibility.
+         self.enable_bias = enable_bias
+         self.layer_norm_elementwise_affine = layer_norm_elementwise_affine
+
+         # Note that the only purpose of `_remove_final_layer_norm` is to keep backward compatibility
+         # with checkpoints that have been fine-tuned before transformers v4.20.1
+         # see https://github.com/facebookresearch/metaseq/pull/164
+         self._remove_final_layer_norm = _remove_final_layer_norm
+
+         # For ASVD: per-module truncation ranks.
+         self.truncation_ranks = truncation_ranks
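The only field added on top of the stock OPT configuration is `truncation_ranks`, a mapping from module name to the rank kept for that module's low-rank factorization. A small sketch of how the field round-trips through `save_pretrained` / `from_pretrained` like any other config attribute; the ranks shown are copied from the `config.json` above, and the save directory name is arbitrary:

```python
# Assumes the working directory contains configuration_asvd_opt.py from this repo.
from configuration_asvd_opt import ASVDOPTConfig

config = ASVDOPTConfig(
    truncation_ranks={
        "model.decoder.layers.0.self_attn.q_proj": 230,
        "model.decoder.layers.0.self_attn.k_proj": 230,
    }
)

config.save_pretrained("asvd-config-demo")  # writes a config.json that includes truncation_ranks
reloaded = ASVDOPTConfig.from_pretrained("asvd-config-demo")
assert reloaded.truncation_ranks["model.decoder.layers.0.self_attn.q_proj"] == 230
```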
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 2,
+   "eos_token_id": 2,
+   "pad_token_id": 1,
+   "transformers_version": "4.35.2"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b5cf413204607f269a3b403edce27d31e032d1a02768e533e475ed5328640be
+ size 225654128
modeling_asvd_opt.py ADDED
@@ -0,0 +1,44 @@
+ from transformers import OPTForCausalLM
+ from .configuration_asvd_opt import ASVDOPTConfig
+ import torch.nn as nn
+
+ class ASVDLinear(nn.Module):
+     def __init__(self, in_features, out_features, rank, bias=True):
+         super().__init__()
+         self.BLinear = nn.Linear(in_features, rank, bias=False)
+         self.ALinear = nn.Linear(rank, out_features, bias=bias)
+
+     def forward(self, input):
+         return self.ALinear(self.BLinear(input))
+
+ class ASVDOPTForCausalLM(OPTForCausalLM):
+     config_class = ASVDOPTConfig
+
+     def __init__(self, config: ASVDOPTConfig):
+         super().__init__(config)
+         self.truncation_ranks = config.truncation_ranks
+
+         full_name_dict = {module: name for name, module in self.named_modules()}
+         linear_info = {}
+         modules = [self]
+         while len(modules) > 0:
+             submodule = modules.pop()
+             for name, raw_linear in submodule.named_children():
+                 if isinstance(raw_linear, nn.Linear):
+                     full_name = full_name_dict[raw_linear]
+                     linear_info[raw_linear] = {
+                         "father": submodule,
+                         "name": name,
+                         "full_name": full_name,
+                     }
+                 else:
+                     modules.append(raw_linear)
+
+
+         for name, module in self.named_modules():
+             if name in self.truncation_ranks:
+                 info = linear_info[module]
+                 new_layer = ASVDLinear(module.in_features, module.out_features, self.truncation_ranks[name], bias=module.bias is not None)
+                 setattr(info["father"], info["name"], new_layer)
+
+
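`ASVDLinear` replaces a dense `nn.Linear` with two smaller projections whose product approximates the original weight, and the `ASVDOPTForCausalLM` constructor swaps these in at the ranks recorded in `truncation_ranks` so that the factorized weights from `model.safetensors` can be loaded on top. The sketch below shows how such a pair could be produced with a plain truncated SVD. It is illustrative only: `factorize_linear` is not part of this repo, and the activation-aware scaling that ASVD applies before the SVD is omitted here.

```python
import torch
import torch.nn as nn


class ASVDLinear(nn.Module):
    """Same structure as the ASVDLinear in modeling_asvd_opt.py above (duplicated so this sketch runs standalone)."""

    def __init__(self, in_features, out_features, rank, bias=True):
        super().__init__()
        self.BLinear = nn.Linear(in_features, rank, bias=False)
        self.ALinear = nn.Linear(rank, out_features, bias=bias)

    def forward(self, x):
        return self.ALinear(self.BLinear(x))


def factorize_linear(layer: nn.Linear, rank: int) -> ASVDLinear:
    # layer.weight has shape (out_features, in_features); y = x @ W.T + b
    W = layer.weight.data.float()  # SVD in float32 for numerical stability
    U, S, Vh = torch.linalg.svd(W, full_matrices=False)
    U_r, S_r, Vh_r = U[:, :rank], S[:rank], Vh[:rank, :]

    new_layer = ASVDLinear(layer.in_features, layer.out_features, rank,
                           bias=layer.bias is not None)
    new_layer.BLinear.weight.data = torch.diag(S_r) @ Vh_r  # (rank, in_features)
    new_layer.ALinear.weight.data = U_r                     # (out_features, rank)
    if layer.bias is not None:
        new_layer.ALinear.bias.data = layer.bias.data.clone()
    return new_layer


# Example: rank 230 mirrors the rank stored for layers.0.self_attn.q_proj in config.json.
dense = nn.Linear(768, 768)
low_rank = factorize_linear(dense, rank=230)
x = torch.randn(1, 768)
print((dense(x) - low_rank(x)).abs().max())  # error introduced by the rank-230 truncation
```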