Eugene Cheah (picocreator) committed
Commit: 1fe8f0d
Parent(s): 9789e00

using rwkv5 batch

Files changed (2):
  1. configuration_rwkv5.py +22 -29
  2. modeling_rwkv5.py +8 -0
configuration_rwkv5.py CHANGED
@@ -21,46 +21,42 @@ from transformers.utils import logging
 
 logger = logging.get_logger(__name__)
 
-RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-
-}
+RWKV5_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 
 
 class Rwkv5Config(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a [`RwkvModel`]. It is used to instantiate a RWKV
+    This is the configuration class to store the configuration of a [`Rwkv5Model`]. It is used to instantiate a RWKV5
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
     defaults will yield a similar configuration to that of the RWVK-4
-    [RWKV/rwkv-4-169m-pile](https://huggingface.co/RWKV/rwkv-4-169m-pile) architecture.
+    [RWKV/rwkv-5-world-1b5](https://huggingface.co/RWKV/rwkv-5-world-1b5) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (`int`, *optional*, defaults to 50277):
-            Vocabulary size of the RWKV model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`RwkvModel`].
-        context_length (`int`, *optional*, defaults to 1024):
-            The maximum sequence length that this model can be be used with in a single forward (using it in RNN mode
-            lets use any sequence length).
-        hidden_size (`int`, *optional*, defaults to 4096):
+        vocab_size (`int`, *optional*, defaults to 65536):
+            Vocabulary size of the RWKV5 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Rwkv5Model`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the embeddings and hidden states.
-        num_hidden_layers (`int`, *optional*, defaults to 32):
+        num_hidden_layers (`int`, *optional*, defaults to 24):
             Number of hidden layers in the model.
         attention_hidden_size (`int`, *optional*):
             Dimensionality of the attention hidden states. Will default to `hidden_size` if unset.
+        num_attention_heads (`int`, *optional*, defaults to 64):
+            The attention heads to use in rwkv5 self_attention module.
+        head_size (`int`, *optional*, defaults to 64): head_size of rwkv5 self_attention module.
         intermediate_size (`int`, *optional*):
             Dimensionality of the inner feed-forward layers. Will default to 4 times `hidden_size` if unset.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
             The epsilon to use in the layer normalization layers.
         bos_token_id (`int`, *optional*, defaults to 0):
-            The id of the beginning of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer
-            as GPTNeoX.
+            The id of the beginning of sentence token in the vocabulary. Defaults to 0.
         eos_token_id (`int`, *optional*, defaults to 0):
-            The id of the end of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer as
-            GPTNeoX.
-        rescale_every (`int`, *optional*, default to 6):
+            The id of the end of sentence token in the vocabulary. Defaults to 0.
+        rescale_every (`int`, *optional*, defaults to 6):
             At inference, the hidden states (and weights of the correponding output layers) are divided by 2 every
             `rescale_every` layer. If set to 0 or a negative number, no rescale is done.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
@@ -72,28 +68,27 @@ class Rwkv5Config(PretrainedConfig):
     Example:
 
     ```python
-    >>> from transformers import RwkvConfig, RwkvModel
+    >>> from transformers import Rwkv5Config, Rwkv5Model
 
-    >>> # Initializing a Rwkv configuration
-    >>> configuration = RwkvConfig()
+    >>> # Initializing a Rwkv5 configuration
+    >>> configuration = Rwkv5Config()
 
     >>> # Initializing a model (with random weights) from the configuration
-    >>> model = RwkvModel(configuration)
+    >>> model = Rwkv5Model(configuration)
 
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
 
     model_type = "rwkv5"
-    attribute_map = {"max_position_embeddings": "context_length"}
 
-    def __init__( #1.5B World
+    def __init__(
         self,
         vocab_size=65536,
-        context_length=4096,
         hidden_size=768,
         num_hidden_layers=24,
         attention_hidden_size=None,
+        num_attention_heads=64,
         head_size=64,
         head_size_divisor=8,
         intermediate_size=None,
@@ -103,14 +98,13 @@ class Rwkv5Config(PretrainedConfig):
         rescale_every=6,
         tie_word_embeddings=False,
         use_cache=True,
-        model_version="5_2",
         **kwargs,
     ):
         self.vocab_size = vocab_size
-        self.context_length = context_length
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size
+        self.num_attention_heads = num_attention_heads
         self.head_size = head_size
         self.head_size_divisor = head_size_divisor
         self.intermediate_size = None
@@ -120,7 +114,6 @@ class Rwkv5Config(PretrainedConfig):
 
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
-        self.model_version = model_version
 
         super().__init__(
            tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs
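
Note: the commit drops `context_length` and `model_version` from the constructor and adds `num_attention_heads`. Below is a minimal sketch of what instantiation looks like after this change; the import path is an assumption for a local checkout of this repo (the class ships with the model repo, not with the stock `transformers` release).

```python
# Sketch only: `configuration_rwkv5` is assumed to be importable from this repo's files.
from configuration_rwkv5 import Rwkv5Config

# Defaults after this commit: 65536-token vocab, 768-dim embeddings, 24 layers,
# 64 attention heads of head_size 64. `context_length` and `model_version` no longer exist.
config = Rwkv5Config()

print(config.vocab_size)             # 65536
print(config.num_attention_heads)    # 64
print(config.attention_hidden_size)  # 768 -- falls back to hidden_size when left unset
```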
modeling_rwkv5.py CHANGED
@@ -752,6 +752,14 @@ class Rwkv5Model(Rwkv5PreTrainedModel):
                         block.attention.output.weight.mul_(2 ** int(block_id // self.config.rescale_every))
                         block.feed_forward.value.weight.mul_(2 ** int(block_id // self.config.rescale_every))
                     else:
+                        # Deal with quantization statistics
+                        if hasattr(block.attention.output.weight, "SCB"):
+                            block.attention.output.weight.SCB.div_(2 ** int(block_id // self.config.rescale_every))
+                            block.feed_forward.value.weight.SCB.div_(2 ** int(block_id // self.config.rescale_every))
+                        elif hasattr(block.attention.output.weight, "quant_state"):
+                            self._bnb_4bit_dequantize_and_rescale(block.attention.output, block_id)
+                            self._bnb_4bit_dequantize_and_rescale(block.feed_forward.value, block_id)
+                        else:
                             block.attention.output.weight.div_(2 ** int(block_id // self.config.rescale_every))
                             block.feed_forward.value.weight.div_(2 ** int(block_id // self.config.rescale_every))
 
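
Note: this hunk keeps the inference-time weight rescaling working for quantized checkpoints: bitsandbytes 8-bit weights carry their quantization statistics in an `SCB` tensor, which is divided in place instead of the int8 weights, while 4-bit weights (detected via `quant_state`) go through the repo's `_bnb_4bit_dequantize_and_rescale` helper. The sketch below is not the repo's code, just the arithmetic implied by `2 ** int(block_id // self.config.rescale_every)`.

```python
# Standalone sketch of the rescale rule used above (pure arithmetic, no model needed).
def rescale_factor(block_id: int, rescale_every: int) -> int:
    """Factor the block's attention.output / feed_forward.value weights are divided by at inference."""
    if rescale_every <= 0:
        return 1  # rescaling disabled, per the config docstring
    return 2 ** (block_id // rescale_every)

# With the default rescale_every=6: blocks 0-5 keep their weights, blocks 6-11 are halved,
# blocks 12-17 are quartered, and so on.
assert [rescale_factor(b, 6) for b in (0, 5, 6, 12, 23)] == [1, 1, 2, 4, 8]
```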