pavan01729 committed
Commit
2dc9a76
1 Parent(s): 4ffdfc6

Upload ASVDOPTForCausalLM

config.json ADDED
@@ -0,0 +1,77 @@
+ {
+   "_name_or_path": "huggingface_repos/opt-125m-asvd90",
+   "_remove_final_layer_norm": false,
+   "activation_dropout": 0.0,
+   "activation_function": "relu",
+   "architectures": [
+     "ASVDOPTForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_asvd_opt.ASVDOPTConfig",
+     "AutoModelForCausalLM": "modeling_asvd_opt.ASVDOPTForCausalLM"
+   },
+   "bos_token_id": 2,
+   "do_layer_norm_before": true,
+   "dropout": 0.1,
+   "enable_bias": true,
+   "eos_token_id": 2,
+   "ffn_dim": 3072,
+   "hidden_size": 768,
+   "init_std": 0.02,
+   "layer_norm_elementwise_affine": true,
+   "layerdrop": 0.0,
+   "max_position_embeddings": 2048,
+   "model_type": "opt",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "prefix": "</s>",
+   "torch_dtype": "float16",
+   "transformers_version": "4.35.2",
+   "truncation_ranks": {
+     "model.decoder.layers.0.self_attn.k_proj": 230,
+     "model.decoder.layers.0.self_attn.out_proj": 345,
+     "model.decoder.layers.0.self_attn.q_proj": 230,
+     "model.decoder.layers.0.self_attn.v_proj": 153,
+     "model.decoder.layers.1.fc2": 491,
+     "model.decoder.layers.1.self_attn.k_proj": 115,
+     "model.decoder.layers.1.self_attn.out_proj": 192,
+     "model.decoder.layers.1.self_attn.q_proj": 115,
+     "model.decoder.layers.10.self_attn.k_proj": 268,
+     "model.decoder.layers.10.self_attn.q_proj": 230,
+     "model.decoder.layers.11.self_attn.k_proj": 268,
+     "model.decoder.layers.11.self_attn.q_proj": 307,
+     "model.decoder.layers.2.fc1": 307,
+     "model.decoder.layers.2.fc2": 368,
+     "model.decoder.layers.2.self_attn.k_proj": 307,
+     "model.decoder.layers.2.self_attn.out_proj": 268,
+     "model.decoder.layers.2.self_attn.q_proj": 268,
+     "model.decoder.layers.2.self_attn.v_proj": 268,
+     "model.decoder.layers.3.fc2": 307,
+     "model.decoder.layers.3.self_attn.k_proj": 153,
+     "model.decoder.layers.3.self_attn.q_proj": 230,
+     "model.decoder.layers.4.fc2": 430,
+     "model.decoder.layers.4.self_attn.q_proj": 230,
+     "model.decoder.layers.4.self_attn.v_proj": 307,
+     "model.decoder.layers.5.fc2": 491,
+     "model.decoder.layers.5.self_attn.out_proj": 345,
+     "model.decoder.layers.5.self_attn.q_proj": 268,
+     "model.decoder.layers.6.fc2": 430,
+     "model.decoder.layers.6.self_attn.out_proj": 345,
+     "model.decoder.layers.6.self_attn.q_proj": 268,
+     "model.decoder.layers.7.fc2": 552,
+     "model.decoder.layers.7.self_attn.k_proj": 345,
+     "model.decoder.layers.7.self_attn.out_proj": 230,
+     "model.decoder.layers.7.self_attn.q_proj": 345,
+     "model.decoder.layers.8.fc1": 430,
+     "model.decoder.layers.8.self_attn.k_proj": 268,
+     "model.decoder.layers.8.self_attn.out_proj": 345,
+     "model.decoder.layers.8.self_attn.q_proj": 307,
+     "model.decoder.layers.9.fc2": 552,
+     "model.decoder.layers.9.self_attn.k_proj": 345
+   },
+   "use_cache": true,
+   "vocab_size": 50272,
+   "word_embed_proj_dim": 768
+ }
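Because `auto_map` points `AutoConfig` and `AutoModelForCausalLM` at the custom classes shipped in this repo, the checkpoint is meant to be loaded with `trust_remote_code=True`, which lets `transformers` import `configuration_asvd_opt.py` and `modeling_asvd_opt.py` from the repo instead of the built-in OPT classes. A minimal loading sketch; the repo id is a placeholder assumption, since the commit only records the local `_name_or_path`:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder repo id (assumption): substitute the actual Hub repo this commit belongs to.
repo_id = "pavan01729/opt-125m-asvd90"

tokenizer = AutoTokenizer.from_pretrained(repo_id)
# trust_remote_code=True is required so the auto_map entries above are honored.
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```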
configuration_asvd_opt.py ADDED
@@ -0,0 +1,129 @@
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class ASVDOPTConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`OPTModel`]. It is used to instantiate an OPT model
+     according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+     defaults will yield a similar configuration to that of the OPT
+     [facebook/opt-350m](https://huggingface.co/facebook/opt-350m) architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 50272):
+             Vocabulary size of the OPT model. Defines the number of different tokens that can be represented by the
+             `input_ids` passed when calling [`OPTModel`]
+         hidden_size (`int`, *optional*, defaults to 768):
+             Dimensionality of the layers and the pooler layer.
+         num_hidden_layers (`int`, *optional*, defaults to 12):
+             Number of decoder layers.
+         ffn_dim (`int`, *optional*, defaults to 3072):
+             Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
+         num_attention_heads (`int`, *optional*, defaults to 12):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
+             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+             `"relu"`, `"silu"` and `"gelu_new"` are supported.
+         max_position_embeddings (`int`, *optional*, defaults to 2048):
+             The maximum sequence length that this model might ever be used with. Typically set this to something large
+             just in case (e.g., 512 or 1024 or 2048).
+         do_layer_norm_before (`bool`, *optional*, defaults to `True`):
+             Whether to perform layer normalization before the attention block.
+         word_embed_proj_dim (`int`, *optional*):
+             `word_embed_proj_dim` can be set to down-project word embeddings, *e.g.* `opt-350m`. Defaults to
+             `hidden_size`.
+         dropout (`float`, *optional*, defaults to 0.1):
+             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         layerdrop (`float`, *optional*, defaults to 0.0):
+             The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
+             details.
+         init_std (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models).
+         enable_bias (`bool`, *optional*, defaults to `True`):
+             Whether or not the linear layers in the attention blocks should use the bias term.
+         layer_norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+             Whether or not the layer norms should have learnable parameters.
+
+     Example:
+
+     ```python
+     >>> from transformers import OPTConfig, OPTModel
+
+     >>> # Initializing an OPT facebook/opt-large style configuration
+     >>> configuration = OPTConfig()
+
+     >>> # Initializing a model (with random weights) from the facebook/opt-large style configuration
+     >>> model = OPTModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+     model_type = "opt"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=50272,
+         hidden_size=768,
+         num_hidden_layers=12,
+         ffn_dim=3072,
+         max_position_embeddings=2048,
+         do_layer_norm_before=True,
+         _remove_final_layer_norm=False,
+         word_embed_proj_dim=None,
+         dropout=0.1,
+         attention_dropout=0.0,
+         num_attention_heads=12,
+         activation_function="relu",
+         layerdrop=0.0,
+         init_std=0.02,
+         use_cache=True,
+         pad_token_id=1,
+         bos_token_id=2,
+         eos_token_id=2,
+         enable_bias=True,
+         layer_norm_elementwise_affine=True,
+         truncation_ranks=None,
+         **kwargs,
+     ):
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             **kwargs,
+         )
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.num_attention_heads = num_attention_heads
+         self.word_embed_proj_dim = word_embed_proj_dim if word_embed_proj_dim is not None else hidden_size
+         self.ffn_dim = ffn_dim
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.dropout = dropout
+         self.attention_dropout = attention_dropout
+         self.activation_function = activation_function
+         self.init_std = init_std
+         self.layerdrop = layerdrop
+         self.use_cache = use_cache
+         self.do_layer_norm_before = do_layer_norm_before
+         # We keep these variables at `True` for backward compatibility.
+         self.enable_bias = enable_bias
+         self.layer_norm_elementwise_affine = layer_norm_elementwise_affine
+
+         # Note that the only purpose of `_remove_final_layer_norm` is to keep backward compatibility
+         # with checkpoints that have been fine-tuned before transformers v4.20.1
+         # see https://github.com/facebookresearch/metaseq/pull/164
+         self._remove_final_layer_norm = _remove_final_layer_norm
+
+         # For ASVD: per-module truncation ranks.
+         self.truncation_ranks = truncation_ranks
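The only field added on top of the stock OPT configuration is `truncation_ranks`, a mapping from module name to the rank kept for that module's low-rank factorization. A small sketch of how the field round-trips through `save_pretrained` / `from_pretrained` like any other config attribute; the ranks shown are copied from the `config.json` above, and the save directory name is arbitrary:

```python
# Assumes the working directory contains configuration_asvd_opt.py from this repo.
from configuration_asvd_opt import ASVDOPTConfig

config = ASVDOPTConfig(
    truncation_ranks={
        "model.decoder.layers.0.self_attn.q_proj": 230,
        "model.decoder.layers.0.self_attn.k_proj": 230,
    }
)

config.save_pretrained("asvd-config-demo")  # writes a config.json that includes truncation_ranks
reloaded = ASVDOPTConfig.from_pretrained("asvd-config-demo")
assert reloaded.truncation_ranks["model.decoder.layers.0.self_attn.q_proj"] == 230
```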
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 2,
+   "eos_token_id": 2,
+   "pad_token_id": 1,
+   "transformers_version": "4.35.2"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b5cf413204607f269a3b403edce27d31e032d1a02768e533e475ed5328640be
+ size 225654128
modeling_asvd_opt.py ADDED
@@ -0,0 +1,44 @@
+ from transformers import OPTForCausalLM
+ from .configuration_asvd_opt import ASVDOPTConfig
+ import torch.nn as nn
+
+ class ASVDLinear(nn.Module):
+     def __init__(self, in_features, out_features, rank, bias=True):
+         super().__init__()
+         self.BLinear = nn.Linear(in_features, rank, bias=False)
+         self.ALinear = nn.Linear(rank, out_features, bias=bias)
+
+     def forward(self, input):
+         return self.ALinear(self.BLinear(input))
+
+ class ASVDOPTForCausalLM(OPTForCausalLM):
+     config_class = ASVDOPTConfig
+
+     def __init__(self, config: ASVDOPTConfig):
+         super().__init__(config)
+         self.truncation_ranks = config.truncation_ranks
+
+         full_name_dict = {module: name for name, module in self.named_modules()}
+         linear_info = {}
+         modules = [self]
+         while len(modules) > 0:
+             submodule = modules.pop()
+             for name, raw_linear in submodule.named_children():
+                 if isinstance(raw_linear, nn.Linear):
+                     full_name = full_name_dict[raw_linear]
+                     linear_info[raw_linear] = {
+                         "father": submodule,
+                         "name": name,
+                         "full_name": full_name,
+                     }
+                 else:
+                     modules.append(raw_linear)
+
+
+         for name, module in self.named_modules():
+             if name in self.truncation_ranks:
+                 info = linear_info[module]
+                 new_layer = ASVDLinear(module.in_features, module.out_features, self.truncation_ranks[name], bias=module.bias is not None)
+                 setattr(info["father"], info["name"], new_layer)
+
+
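`ASVDLinear` replaces a dense `nn.Linear` with two smaller projections whose product approximates the original weight, and the `ASVDOPTForCausalLM` constructor swaps these in at the ranks recorded in `truncation_ranks` so that the factorized weights from `model.safetensors` can be loaded on top. The sketch below shows how such a pair could be produced with a plain truncated SVD. It is illustrative only: `factorize_linear` is not part of this repo, and the activation-aware scaling that ASVD applies before the SVD is omitted here.

```python
import torch
import torch.nn as nn


class ASVDLinear(nn.Module):
    """Same structure as the ASVDLinear in modeling_asvd_opt.py above (duplicated so this sketch runs standalone)."""

    def __init__(self, in_features, out_features, rank, bias=True):
        super().__init__()
        self.BLinear = nn.Linear(in_features, rank, bias=False)
        self.ALinear = nn.Linear(rank, out_features, bias=bias)

    def forward(self, x):
        return self.ALinear(self.BLinear(x))


def factorize_linear(layer: nn.Linear, rank: int) -> ASVDLinear:
    # layer.weight has shape (out_features, in_features); y = x @ W.T + b
    W = layer.weight.data.float()  # SVD in float32 for numerical stability
    U, S, Vh = torch.linalg.svd(W, full_matrices=False)
    U_r, S_r, Vh_r = U[:, :rank], S[:rank], Vh[:rank, :]

    new_layer = ASVDLinear(layer.in_features, layer.out_features, rank,
                           bias=layer.bias is not None)
    new_layer.BLinear.weight.data = torch.diag(S_r) @ Vh_r  # (rank, in_features)
    new_layer.ALinear.weight.data = U_r                     # (out_features, rank)
    if layer.bias is not None:
        new_layer.ALinear.bias.data = layer.bias.data.clone()
    return new_layer


# Example: rank 230 mirrors the rank stored for layers.0.self_attn.q_proj in config.json.
dense = nn.Linear(768, 768)
low_rank = factorize_linear(dense, rank=230)
x = torch.randn(1, 768)
print((dense(x) - low_rank(x)).abs().max())  # error introduced by the rank-230 truncation
```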