Eugene Cheah (picocreator) committed on
Commit
9789e00
Parent: bb01ae9

head divisor fix

Files changed (2)
  1. configuration_rwkv5.py +30 -23
  2. modeling_rwkv5.py +8 -11
configuration_rwkv5.py CHANGED
@@ -21,44 +21,46 @@ from transformers.utils import logging
 
 logger = logging.get_logger(__name__)
 
-RWKV5_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+
+}
 
 
 class Rwkv5Config(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a [`Rwkv5Model`]. It is used to instantiate a RWKV5
+    This is the configuration class to store the configuration of a [`RwkvModel`]. It is used to instantiate a RWKV
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
     defaults will yield a similar configuration to that of the RWVK-4
-    [RWKV/rwkv-5-world-1b5](https://huggingface.co/RWKV/rwkv-5-world-1b5) architecture.
+    [RWKV/rwkv-4-169m-pile](https://huggingface.co/RWKV/rwkv-4-169m-pile) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (`int`, *optional*, defaults to 65536):
-            Vocabulary size of the RWKV5 model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`Rwkv5Model`].
-        hidden_size (`int`, *optional*, defaults to 768):
+        vocab_size (`int`, *optional*, defaults to 50277):
+            Vocabulary size of the RWKV model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`RwkvModel`].
+        context_length (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model can be be used with in a single forward (using it in RNN mode
+            lets use any sequence length).
+        hidden_size (`int`, *optional*, defaults to 4096):
             Dimensionality of the embeddings and hidden states.
-        num_hidden_layers (`int`, *optional*, defaults to 24):
+        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the model.
        attention_hidden_size (`int`, *optional*):
            Dimensionality of the attention hidden states. Will default to `hidden_size` if unset.
-        num_attention_heads (`int`, *optional*, defaults to 64):
-            The attention heads to use in rwkv5 self_attention module.
-        head_size (`int`, *optional*, defaults to 64): head_size of rwkv5 self_attention module.
        intermediate_size (`int`, *optional*):
            Dimensionality of the inner feed-forward layers. Will default to 4 times `hidden_size` if unset.
-        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        bos_token_id (`int`, *optional*, defaults to 0):
-            The id of the beginning of sentence token in the vocabulary. Defaults to 0 as RWKV5 uses the same tokenizer
+            The id of the beginning of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer
            as GPTNeoX.
        eos_token_id (`int`, *optional*, defaults to 0):
-            The id of the end of sentence token in the vocabulary. Defaults to 0 as RWKV5 uses the same tokenizer as
+            The id of the end of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer as
            GPTNeoX.
-        rescale_every (`int`, *optional*, defaults to 6):
+        rescale_every (`int`, *optional*, default to 6):
            At inference, the hidden states (and weights of the correponding output layers) are divided by 2 every
            `rescale_every` layer. If set to 0 or a negative number, no rescale is done.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
@@ -70,28 +72,30 @@ class Rwkv5Config(PretrainedConfig):
     Example:
 
     ```python
-    >>> from transformers import Rwkv5Config, Rwkv5Model
+    >>> from transformers import RwkvConfig, RwkvModel
 
-    >>> # Initializing a Rwkv5 configuration
-    >>> configuration = Rwkv5Config()
+    >>> # Initializing a Rwkv configuration
+    >>> configuration = RwkvConfig()
 
     >>> # Initializing a model (with random weights) from the configuration
-    >>> model = Rwkv5Model(configuration)
+    >>> model = RwkvModel(configuration)
 
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
 
     model_type = "rwkv5"
+    attribute_map = {"max_position_embeddings": "context_length"}
 
-    def __init__(
+    def __init__( #1.5B World
         self,
         vocab_size=65536,
+        context_length=4096,
         hidden_size=768,
         num_hidden_layers=24,
         attention_hidden_size=None,
-        num_attention_heads=64,
         head_size=64,
+        head_size_divisor=8,
         intermediate_size=None,
         layer_norm_epsilon=1e-5,
         bos_token_id=0,
@@ -99,14 +103,16 @@ class Rwkv5Config(PretrainedConfig):
         rescale_every=6,
         tie_word_embeddings=False,
         use_cache=True,
+        model_version="5_2",
         **kwargs,
     ):
         self.vocab_size = vocab_size
+        self.context_length = context_length
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size
-        self.num_attention_heads = num_attention_heads
         self.head_size = head_size
+        self.head_size_divisor = head_size_divisor
         self.intermediate_size = None
         self.layer_norm_epsilon = layer_norm_epsilon
         self.rescale_every = rescale_every
@@ -114,7 +120,8 @@ class Rwkv5Config(PretrainedConfig):
 
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
+        self.model_version = model_version
 
         super().__init__(
             tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs
-        )
+        )
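On the configuration side, the commit drops the explicit `num_attention_heads` argument (the model now derives the head count from `hidden_size // head_size`) and adds the `context_length`, `head_size_divisor`, and `model_version` fields. Below is a minimal sketch of how the updated defaults fit together; it assumes `configuration_rwkv5.py` from this repo is importable as a plain module, which is an illustrative setup and not part of the diff:

```python
# Minimal sketch: construct the updated config and inspect the derived values.
# Assumes configuration_rwkv5.py (this repo) is on the Python path; requires transformers.
from configuration_rwkv5 import Rwkv5Config

config = Rwkv5Config()  # defaults as set in this commit

# modeling_rwkv5.py now derives the head count from head_size instead of a config field
num_attention_heads = config.hidden_size // config.head_size

print(config.head_size_divisor)  # 8 -- new field consumed by the attention group norm
print(config.context_length)     # 4096
print(num_attention_heads)       # 768 // 64 = 12
```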
modeling_rwkv5.py CHANGED
@@ -178,6 +178,7 @@ def rwkv_linear_attention_v5_cpu(
     gate,
     lxw,
     lxb,
+    head_size_divisor,
     ow,
     state,
 ):
@@ -199,7 +200,7 @@ def rwkv_linear_attention_v5_cpu(
         state = at + time_decay * state
 
     out = out.reshape(B * T, H * S)
-    out = F.group_norm(out, num_groups=H, weight=lxw, bias=lxb).reshape(B, T, H * S)
+    out = F.group_norm(out / head_size_divisor, num_groups=H, weight=lxw, bias=lxb).reshape(B, T, H * S)
     out = out.to(dtype=hidden.dtype) * gate
     out = out @ ow
 
@@ -221,6 +222,7 @@ def rwkv_linear_attention(
     gate,
     lxw,
     lxb,
+    head_size_divisor,
     ow,
     state,
 ):
@@ -244,13 +246,14 @@ def rwkv_linear_attention(
             gate,
             lxw,
             lxb,
+            head_size_divisor,
             ow,
             state,
         )
     else:
         out, state = WKV_5.apply(B, T, H * S, H, receptance, key, value, time_decay, time_first, state)
         out = out.reshape(B * T, H * S)
-        out = F.group_norm(out, num_groups=H, weight=lxw, bias=lxb).reshape(B, T, H * S)
+        out = F.group_norm(out / head_size_divisor, num_groups=H, weight=lxw, bias=lxb).reshape(B, T, H * S)
         out = out.to(dtype=hidden.dtype) * gate
         out = out @ ow
     return out, state
@@ -271,6 +274,7 @@ class RwkvSelfAttention(nn.Module):
         # https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4neo/src/model.py#L146
         num_attention_heads = hidden_size // config.head_size
         self.num_attention_heads = num_attention_heads
+        self.head_size_divisor = config.head_size_divisor
         attention_hidden_size = (
             config.attention_hidden_size if config.attention_hidden_size is not None else hidden_size
         )
@@ -343,6 +347,7 @@ class RwkvSelfAttention(nn.Module):
             gate,
             self.ln_x.weight,
             self.ln_x.bias,
+            self.head_size_divisor,
             self.output.weight.t(),
             state=layer_state,
         )
@@ -747,14 +752,6 @@ class Rwkv5Model(Rwkv5PreTrainedModel):
                     block.attention.output.weight.mul_(2 ** int(block_id // self.config.rescale_every))
                     block.feed_forward.value.weight.mul_(2 ** int(block_id // self.config.rescale_every))
                 else:
-                    # Deal with quantization statistics
-                    if hasattr(block.attention.output.weight, "SCB"):
-                        block.attention.output.weight.SCB.div_(2 ** int(block_id // self.config.rescale_every))
-                        block.feed_forward.value.weight.SCB.div_(2 ** int(block_id // self.config.rescale_every))
-                    elif hasattr(block.attention.output.weight, "quant_state"):
-                        self._bnb_4bit_dequantize_and_rescale(block.attention.output, block_id)
-                        self._bnb_4bit_dequantize_and_rescale(block.feed_forward.value, block_id)
-                    else:
                     block.attention.output.weight.div_(2 ** int(block_id // self.config.rescale_every))
                     block.feed_forward.value.weight.div_(2 ** int(block_id // self.config.rescale_every))
 
@@ -859,4 +856,4 @@ class Rwkv5ForCausalLM(Rwkv5PreTrainedModel):
             state=rwkv_outputs.state,
             hidden_states=rwkv_outputs.hidden_states,
             attentions=rwkv_outputs.attentions,
-        )
+        )
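The fix the commit is named for sits in the linear-attention output path: the per-head output is now divided by `head_size_divisor` before the group normalization, in both the CPU path and the CUDA-kernel path. Here is a self-contained sketch of that single step, with made-up tensor shapes following the `B`, `T`, `H`, `S` names used in the diff; it is an illustration of the normalization call, not a full forward pass:

```python
# Sketch of the fixed normalization step from rwkv_linear_attention_v5_cpu.
# B = batch, T = tokens, H = heads, S = head size; all values here are illustrative.
import torch
import torch.nn.functional as F

B, T, H, S = 1, 4, 12, 64
head_size_divisor = 8                 # new config field introduced in this commit

out = torch.randn(B, T, H, S)         # stand-in for the per-head attention output
lxw = torch.ones(H * S)               # stand-in for ln_x.weight
lxb = torch.zeros(H * S)              # stand-in for ln_x.bias

out = out.reshape(B * T, H * S)
# The fix: scale the output down by head_size_divisor before the per-head group norm,
# rather than normalizing the raw output.
out = F.group_norm(out / head_size_divisor, num_groups=H, weight=lxw, bias=lxb).reshape(B, T, H * S)
print(out.shape)                      # torch.Size([1, 4, 768])
```

Because group norm standardizes each head, dividing the input by a constant mostly changes the normalization's numerical behaviour (it is equivalent to scaling the norm's epsilon term by `head_size_divisor ** 2`) rather than the output magnitude, which is presumably why the divisor is exposed as a config field instead of being folded into the weights.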