denizyuret-shallowai committed on
Commit 3d9d407 · 1 Parent(s): 0730aee

Upload model

Files changed (4)
  1. config.json +32 -0
  2. configuration_custom4.py +182 -0
  3. modeling_custom4.py +56 -0
  4. pytorch_model.bin +3 -0
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_name_or_path": "EleutherAI/pythia-160m",
+   "architectures": [
+     "CustomModel4"
+   ],
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoModel": "modeling_custom4.CustomModel4"
+   },
+   "bos_token_id": 0,
+   "classifier_dropout": 0.1,
+   "eos_token_id": 0,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.0,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 2048,
+   "model_type": "gpt_neox",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "rope_scaling": null,
+   "rotary_emb_base": 10000,
+   "rotary_pct": 0.25,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float16",
+   "transformers_version": "4.31.0",
+   "use_cache": true,
+   "use_parallel_residual": true,
+   "vocab_size": 50304
+ }
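
Since `auto_map` points `AutoModel` at `modeling_custom4.CustomModel4`, loading this checkpoint pulls in the custom code from the repo and therefore needs `trust_remote_code=True`. A minimal loading sketch; the repo id below is a placeholder, since the actual Hub path of this upload is not shown in the commit:

```python
from transformers import AutoModel

# Placeholder repo id; substitute the actual Hub path of this upload.
model = AutoModel.from_pretrained(
    "denizyuret-shallowai/REPO_NAME",
    trust_remote_code=True,  # lets auto_map resolve modeling_custom4.CustomModel4
)
```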
configuration_custom4.py ADDED
@@ -0,0 +1,182 @@
+ # coding=utf-8
+ # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # """ GPTNeoX model configuration"""
+
+ # from ...configuration_utils import PretrainedConfig
+ # from ...utils import logging
+
+
+ # logger = logging.get_logger(__name__)
+
+ # GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+ #     "EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/config.json",
+ #     # See all GPTNeoX models at https://huggingface.co/models?filter=gpt_neox
+ # }
+
+
+ # class GPTNeoXConfig(PretrainedConfig):
+ # r"""
+ # This is the configuration class to store the configuration of a [`GPTNeoXModel`]. It is used to instantiate an
+ # GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a configuration
+ # with the defaults will yield a similar configuration to that of the GPTNeoX
+ # [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) architecture.
+
+ # Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ # documentation from [`PretrainedConfig`] for more information.
+
+
+ # Args:
+ # vocab_size (`int`, *optional*, defaults to 50432):
+ #     Vocabulary size of the GPTNeoX model. Defines the number of different tokens that can be represented by the
+ #     `inputs_ids` passed when calling [`GPTNeoXModel`].
+ # hidden_size (`int`, *optional*, defaults to 6144):
+ #     Dimension of the encoder layers and the pooler layer.
+ # num_hidden_layers (`int`, *optional*, defaults to 44):
+ #     Number of hidden layers in the Transformer encoder.
+ # num_attention_heads (`int`, *optional*, defaults to 64):
+ #     Number of attention heads for each attention layer in the Transformer encoder.
+ # intermediate_size (`int`, *optional*, defaults to 24576):
+ #     Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ # hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ #     The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ #     `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ # rotary_pct (`float`, *optional*, defaults to 0.25):
+ #     percentage of hidden dimensions to allocate to rotary embeddings
+ # rotary_emb_base (`int`, *optional*, defaults to 10000):
+ #     base for computing rotary embeddings frequency
+ # attention_dropout (`float`, *optional*, defaults to 0.0):
+ #     The dropout ratio probability of the attention score.
+ # hidden_dropout (`float`, *optional*, defaults to 0.0):
+ #     The dropout ratio of (1) the word embeddings, (2) the post-attention hidden states, and (3) the post-mlp
+ #     hidden states.
+ # classifier_dropout (`float`, *optional*, defaults to 0.1):
+ #     Argument used when doing token classification, used in the model [`GPTNeoXForTokenClassification`].
+
+ #     The dropout ratio for the hidden layer.
+ # max_position_embeddings (`int`, *optional*, defaults to 2048):
+ #     The maximum sequence length that this model might ever be used with. Typically set this to something large
+ #     just in case (e.g., 512 or 1024 or 2048).
+ # initializer_range (`float`, *optional*, defaults to 1e-5):
+ #     The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ # layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ #     The epsilon used by the layer normalization layers.
+ # use_cache (`bool`, *optional*, defaults to `True`):
+ #     Whether or not the model should return the last key/values attentions (not used by all models). Only
+ #     relevant if `config.is_decoder=True`.
+ # use_parallel_residual (`bool`, *optional*, defaults to `True`):
+ #     Whether to use a "parallel" formulation in each Transformer layer, which can provide a slight training
+ #     speedup at large scales (e.g. 20B).
+ # rope_scaling (`Dict`, *optional*):
+ #     Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+ #     strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
+ #     is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+ #     `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+ #     these scaling strategies behave:
+ #     https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+ #     experimental feature, subject to breaking API changes in future versions.
+
+ # Example:
+
+ # ```python
+ # >>> from transformers import GPTNeoXConfig, GPTNeoXModel
+
+ # >>> # Initializing a GPTNeoX gpt-neox-20b style configuration
+ # >>> configuration = GPTNeoXConfig()
+
+ # >>> # Initializing a model (with random weights) from the gpt-neox-20b style configuration
+ # >>> model = GPTNeoXModel(configuration)  # doctest: +SKIP
+
+ # >>> # Accessing the model configuration
+ # >>> configuration = model.config  # doctest: +SKIP
+ # ```"""
+ # model_type = "gpt_neox"
+
+ from transformers import PretrainedConfig
+
+ class CustomConfig4(PretrainedConfig):
+     model_type = "custom4"
+
+     def __init__(
+         self,
+         vocab_size=50432,
+         hidden_size=6144,
+         num_hidden_layers=44,
+         num_attention_heads=64,
+         intermediate_size=24576,
+         hidden_act="gelu",
+         rotary_pct=0.25,
+         rotary_emb_base=10000,
+         attention_dropout=0.0,
+         hidden_dropout=0.0,
+         classifier_dropout=0.1,
+         max_position_embeddings=2048,
+         initializer_range=0.02,
+         layer_norm_eps=1e-5,
+         use_cache=True,
+         bos_token_id=0,
+         eos_token_id=2,
+         tie_word_embeddings=False,
+         use_parallel_residual=True,
+         rope_scaling=None,
+         **kwargs,
+     ):
+         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.intermediate_size = intermediate_size
+         self.hidden_act = hidden_act
+         self.rotary_pct = rotary_pct
+         self.rotary_emb_base = rotary_emb_base
+         self.attention_dropout = attention_dropout
+         self.hidden_dropout = hidden_dropout
+         self.classifier_dropout = classifier_dropout
+         self.initializer_range = initializer_range
+         self.layer_norm_eps = layer_norm_eps
+         self.use_cache = use_cache
+         self.tie_word_embeddings = tie_word_embeddings
+         self.use_parallel_residual = use_parallel_residual
+         self.rope_scaling = rope_scaling
+         self._rope_scaling_validation()
+
+         if self.hidden_size % self.num_attention_heads != 0:
+             raise ValueError(
+                 "The hidden size is not divisible by the number of attention heads! Make sure to update them!"
+             )
+
+     # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
+     def _rope_scaling_validation(self):
+         """
+         Validate the `rope_scaling` configuration.
+         """
+         if self.rope_scaling is None:
+             return
+
+         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+             raise ValueError(
+                 "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                 f"got {self.rope_scaling}"
+             )
+         rope_scaling_type = self.rope_scaling.get("type", None)
+         rope_scaling_factor = self.rope_scaling.get("factor", None)
+         if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+             raise ValueError(
+                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+             )
+         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+             raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
+
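
A small usage sketch for `CustomConfig4`: the constructor defaults mirror GPT-NeoX-20B, the values below instead match the pythia-160m-sized `config.json` above, and `_rope_scaling_validation` only accepts a two-key dict with `type` in `['linear', 'dynamic']` and a float `factor` greater than 1. The plain import assumes the file is used as a standalone module rather than through the relative import inside the repo:

```python
from configuration_custom4 import CustomConfig4

# pythia-160m-sized values, matching the config.json in this commit
config = CustomConfig4(
    vocab_size=50304,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
)

# rope_scaling must look like {"type": "linear" or "dynamic", "factor": float > 1.0};
# anything else raises ValueError in _rope_scaling_validation().
scaled = CustomConfig4(rope_scaling={"type": "linear", "factor": 2.0})
```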
modeling_custom4.py ADDED
@@ -0,0 +1,56 @@
+ # https://huggingface.co/docs/transformers/custom_models
+
+ from transformers import PreTrainedModel, AutoModelForCausalLM, AutoTokenizer, AutoModel, AutoConfig
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+ from torch.nn.functional import log_softmax
+ from torch.nn import CrossEntropyLoss, ModuleList
+ from .configuration_custom4 import CustomConfig4
+
+ class CustomModel4(PreTrainedModel):
+     config_class = CustomConfig4
+
+     def __init__(self, config):
+         super().__init__(config)
+
+     def forward(self, *args, labels=None, **kwargs):
+         loss = None
+         logits = None
+         for model, coeff in zip(self.models, self.coeffs):
+             logp = log_softmax(model.forward(*args, **kwargs).logits, dim=-1)
+             logits = coeff * logp if logits is None else logits + coeff * logp
+         # The rest copied from modeling_llama.py:
+         if labels is not None:
+             # Shift so that tokens < n predict n
+             shift_logits = logits[..., :-1, :].contiguous()
+             shift_labels = labels[..., 1:].contiguous()
+             # Flatten the tokens
+             loss_fct = CrossEntropyLoss()
+             shift_logits = shift_logits.view(-1, self.config.vocab_size)
+             shift_labels = shift_labels.view(-1)
+             # Enable model parallelism
+             shift_labels = shift_labels.to(shift_logits.device)
+             loss = loss_fct(shift_logits, shift_labels)
+
+         return CausalLMOutputWithPast(loss=loss, logits=logits)
+
+
+     @classmethod
+     def combine_models(cls, *args, coeffs=[], **kwargs):
+         models = []
+         for model in args:
+             models.append(AutoModelForCausalLM.from_pretrained(model, **kwargs).eval())
+         if coeffs == []:
+             coeffs = [1/len(args)] * len(args)
+         m = cls(models[0].config)
+         m.models = ModuleList(models)
+         m.coeffs = coeffs
+         return m
+
+
+
+ CustomConfig4.register_for_auto_class()
+ CustomModel4.register_for_auto_class('AutoModelForCausalLM')
+ CustomModel4.register_for_auto_class('AutoModel')
+ AutoConfig.register("custom4", CustomConfig4)
+ AutoModel.register(CustomConfig4, CustomModel4)
+ AutoModelForCausalLM.register(CustomConfig4, CustomModel4)
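
`combine_models` builds an ensemble that sums the log-softmax outputs of several causal LMs, weighted by `coeffs`, and the registration calls at the bottom make the result loadable through the Auto classes. A usage sketch; the two checkpoints are placeholders (any causal LMs sharing a vocabulary should work), not something this commit prescribes:

```python
import torch
from modeling_custom4 import CustomModel4

# Placeholder checkpoints; both use the same GPT-NeoX vocabulary.
model = CustomModel4.combine_models(
    "EleutherAI/pythia-70m",
    "EleutherAI/pythia-160m",
    coeffs=[0.5, 0.5],  # omitted coeffs default to a uniform 1/len(models)
)

input_ids = torch.tensor([[0, 1, 2, 3]])
out = model(input_ids=input_ids, labels=input_ids)  # labels trigger the shifted cross-entropy loss
print(out.loss, out.logits.shape)
```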
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3ffa03b589263eccf2e09157196fab7b2abdaece84c8ed0f4b18f06540f48fd0
+ size 465579541