ruixie committed
Commit b394858
1 Parent(s): 2f83afe

Delete configuration_shell.py

Files changed (1)
  1. configuration_shell.py +0 -150
configuration_shell.py DELETED
@@ -1,150 +0,0 @@
- # coding=utf-8
- # Copyright 2023 The BigCode team and HuggingFace Inc. team.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """ Shell configuration"""
-
- from transformers.configuration_utils import PretrainedConfig
- from transformers.utils import logging
-
-
- logger = logging.get_logger(__name__)
-
-
- class ShellConfig(PretrainedConfig):
-     """
-     This is the configuration class to store the configuration of a [`ShellModel`]. It is used to instantiate a
-     Shell model according to the specified arguments, defining the model architecture. Instantiating a
-     configuration with the defaults will yield a similar configuration to that of the Shell
-     [gpt_bigcode](https://huggingface.co/gpt_bigcode) architecture.
-
-     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-     documentation from [`PretrainedConfig`] for more information.
-
-
-     Args:
-         vocab_size (`int`, *optional*, defaults to 50257):
-             Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
-             `inputs_ids` passed when calling [`ShellModel`].
-         n_positions (`int`, *optional*, defaults to 1024):
-             The maximum sequence length that this model might ever be used with. Typically set this to something large
-             just in case (e.g., 512 or 1024 or 2048).
-         n_embd (`int`, *optional*, defaults to 768):
-             Dimensionality of the embeddings and hidden states.
-         n_layer (`int`, *optional*, defaults to 12):
-             Number of hidden layers in the Transformer encoder.
-         n_head (`int`, *optional*, defaults to 12):
-             Number of attention heads for each attention layer in the Transformer encoder.
-         n_inner (`int`, *optional*, defaults to None):
-             Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
-         activation_function (`str`, *optional*, defaults to `"gelu_pytorch_tanh"`):
-             Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new",
-             "gelu_pytorch_tanh"]`.
-         resid_pdrop (`float`, *optional*, defaults to 0.1):
-             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-         embd_pdrop (`float`, *optional*, defaults to 0.1):
-             The dropout ratio for the embeddings.
-         attn_pdrop (`float`, *optional*, defaults to 0.1):
-             The dropout ratio for the attention.
-         layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
-             The epsilon to use in the layer normalization layers.
-         initializer_range (`float`, *optional*, defaults to 0.02):
-             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-         scale_attn_weights (`bool`, *optional*, defaults to `True`):
-             Scale attention weights by dividing by sqrt(hidden_size).
-         use_cache (`bool`, *optional*, defaults to `True`):
-             Whether or not the model should return the last key/values attentions (not used by all models).
-         attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
-             Whether to call the fused softmax in float32.
-         scale_attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
-             Whether to scale the attention softmax in float32.
-         group_query_attention (`bool`, *optional*, defaults to `True`):
-             Whether to use Grouped-Query Attention (`True`) or Multi-Head Attention (`False`); `num_query_groups=1` gives Multi-Query Attention.
-     Example:
-
-     ```python
-     >>> from transformers import ShellConfig, ShellModel
-
-     >>> # Initializing a Shell configuration
-     >>> configuration = ShellConfig()
-
-     >>> # Initializing a model (with random weights) from the configuration
-     >>> model = ShellModel(configuration)
-
-     >>> # Accessing the model configuration
-     >>> configuration = model.config
-     ```"""
-
-     model_type = "kclgpt"
-     keys_to_ignore_at_inference = ["past_key_values"]
-     attribute_map = {
-         "hidden_size": "n_embd",
-         "max_position_embeddings": "n_positions",
-         "num_attention_heads": "n_head",
-         "num_hidden_layers": "n_layer",
-     }
-
-     def __init__(
-         self,
-         vocab_size=50257,
-         n_positions=1024,
-         n_embd=768,
-         n_layer=12,
-         n_head=12,
-         n_inner=None,
-         activation_function="gelu_pytorch_tanh",
-         resid_pdrop=0.1,
-         embd_pdrop=0.1,
-         attn_pdrop=0.1,
-         layer_norm_epsilon=1e-5,
-         initializer_range=0.02,
-         scale_attn_weights=True,
-         use_cache=True,
-         bos_token_id=50256,
-         eos_token_id=50256,
-         attention_softmax_in_fp32=True,
-         scale_attention_softmax_in_fp32=True,
-         group_query_attention=True,
-         num_query_groups=1,
-         position_embedding_type="learned_absolute",
-         rope_scaling=None,
-         **kwargs,
-     ):
-         self.vocab_size = vocab_size
-         self.n_positions = n_positions
-         self.n_embd = n_embd
-         self.n_layer = n_layer
-         self.n_head = n_head
-         self.n_inner = n_inner
-         self.activation_function = activation_function
-         self.resid_pdrop = resid_pdrop
-         self.embd_pdrop = embd_pdrop
-         self.attn_pdrop = attn_pdrop
-         self.layer_norm_epsilon = layer_norm_epsilon
-         self.initializer_range = initializer_range
-         self.scale_attn_weights = scale_attn_weights
-         self.use_cache = use_cache
-         self.attention_softmax_in_fp32 = attention_softmax_in_fp32
-         self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32
-         self.group_query_attention = group_query_attention
-         self.num_query_groups = num_query_groups
-         self.position_embedding_type = position_embedding_type
-         self.rope_scaling = rope_scaling
-         assert self.position_embedding_type in [
-             "learned_absolute", "rope"
-         ], "position_embedding_type must be one of ['learned_absolute', 'rope']"
-
-         self.bos_token_id = bos_token_id
-         self.eos_token_id = eos_token_id
-
-         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
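
For context, the sketch below is not part of this commit: it assumes a pre-deletion copy of configuration_shell.py is kept next to the script, so `ShellConfig` can be imported directly rather than through transformers' auto classes.

```python
# Minimal usage sketch, assuming configuration_shell.py (as shown above) is
# available locally; nothing here comes from the commit itself.
from configuration_shell import ShellConfig

# Defaults from the class above: multi-query attention
# (group_query_attention=True, num_query_groups=1) and learned absolute
# position embeddings.
config = ShellConfig()

# attribute_map exposes the GPT-2-style fields under the canonical HF names.
print(config.hidden_size, config.num_attention_heads, config.num_hidden_layers)  # 768 12 12

# Rotary variant; __init__ asserts that position_embedding_type is one of
# ["learned_absolute", "rope"].
rope_config = ShellConfig(position_embedding_type="rope")

# Serializes to ./shell-config/config.json with "model_type": "kclgpt".
rope_config.save_pretrained("./shell-config")
```

Running this against the deleted class would simply reproduce the defaults documented in its docstring and the aliases provided by `attribute_map`.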