jetaudio committed
Commit 6a64281
1 Parent(s): 61a554d

Update configuration_rwkv5.py

Files changed (1)
  1. configuration_rwkv5.py +23 -36
configuration_rwkv5.py CHANGED
@@ -21,79 +21,67 @@ from transformers.utils import logging
 
 logger = logging.get_logger(__name__)
 
-RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-
-}
+RWKV5_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 
 
 class Rwkv5Config(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a [`RwkvModel`]. It is used to instantiate a RWKV
+    This is the configuration class to store the configuration of a [`Rwkv5Model`]. It is used to instantiate a RWKV5
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
     defaults will yield a similar configuration to that of the RWVK-4
-    [RWKV/rwkv-4-169m-pile](https://huggingface.co/RWKV/rwkv-4-169m-pile) architecture.
-
+    [RWKV/rwkv-5-world-1b5](https://huggingface.co/RWKV/rwkv-5-world-1b5) architecture.
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
-
-
     Args:
-        vocab_size (`int`, *optional*, defaults to 50277):
-            Vocabulary size of the RWKV model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`RwkvModel`].
-        context_length (`int`, *optional*, defaults to 1024):
-            The maximum sequence length that this model can be be used with in a single forward (using it in RNN mode
-            lets use any sequence length).
-        hidden_size (`int`, *optional*, defaults to 4096):
+        vocab_size (`int`, *optional*, defaults to 65536):
+            Vocabulary size of the RWKV5 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Rwkv5Model`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the embeddings and hidden states.
-        num_hidden_layers (`int`, *optional*, defaults to 32):
+        num_hidden_layers (`int`, *optional*, defaults to 24):
             Number of hidden layers in the model.
         attention_hidden_size (`int`, *optional*):
             Dimensionality of the attention hidden states. Will default to `hidden_size` if unset.
+        num_attention_heads (`int`, *optional*, defaults to 64):
+            The attention heads to use in rwkv5 self_attention module.
+        head_size (`int`, *optional*, defaults to 64): head_size of rwkv5 self_attention module.
         intermediate_size (`int`, *optional*):
            Dimensionality of the inner feed-forward layers. Will default to 4 times `hidden_size` if unset.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
             The epsilon to use in the layer normalization layers.
         bos_token_id (`int`, *optional*, defaults to 0):
-            The id of the beginning of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer
+            The id of the beginning of sentence token in the vocabulary. Defaults to 0 as RWKV5 uses the same tokenizer
             as GPTNeoX.
         eos_token_id (`int`, *optional*, defaults to 0):
-            The id of the end of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer as
+            The id of the end of sentence token in the vocabulary. Defaults to 0 as RWKV5 uses the same tokenizer as
             GPTNeoX.
-        rescale_every (`int`, *optional*, default to 6):
+        rescale_every (`int`, *optional*, defaults to 6):
             At inference, the hidden states (and weights of the correponding output layers) are divided by 2 every
             `rescale_every` layer. If set to 0 or a negative number, no rescale is done.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether or not to tie the word embeddings with the input token embeddings.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last state.
-
-
     Example:
-
     ```python
-    >>> from transformers import RwkvConfig, RwkvModel
-
-    >>> # Initializing a Rwkv configuration
-    >>> configuration = RwkvConfig()
-
+    >>> from transformers import Rwkv5Config, Rwkv5Model
+    >>> # Initializing a Rwkv5 configuration
+    >>> configuration = Rwkv5Config()
     >>> # Initializing a model (with random weights) from the configuration
-    >>> model = RwkvModel(configuration)
-
+    >>> model = Rwkv5Model(configuration)
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
 
     model_type = "rwkv5"
-    attribute_map = {"max_position_embeddings": "context_length"}
 
-    def __init__( #1.5B World
+    def __init__(
         self,
         vocab_size=65536,
-        context_length=4096,
         hidden_size=768,
         num_hidden_layers=24,
         attention_hidden_size=None,
+        num_attention_heads=64,
         head_size=64,
         intermediate_size=None,
         layer_norm_epsilon=1e-5,
@@ -102,14 +90,13 @@ class Rwkv5Config(PretrainedConfig):
         rescale_every=6,
         tie_word_embeddings=False,
         use_cache=True,
-        model_version="5_2",
         **kwargs,
     ):
         self.vocab_size = vocab_size
-        self.context_length = context_length
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size
+        self.num_attention_heads = num_attention_heads
         self.head_size = head_size
         self.intermediate_size = None
         self.layer_norm_epsilon = layer_norm_epsilon
@@ -118,8 +105,8 @@ class Rwkv5Config(PretrainedConfig):
 
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
-        self.model_version = model_version
 
         super().__init__(
             tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs
         )
+
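For reference, a minimal usage sketch of the configuration as it stands after this commit. It assumes only that the updated `configuration_rwkv5.py` is importable (e.g. from a local clone of this repository with `transformers` installed); every argument shown is taken from the `__init__` signature in the diff above.

```python
# Minimal sketch, assuming the updated configuration_rwkv5.py from this commit
# is on the Python path (e.g. a local clone of the repository) and that
# transformers is installed, since Rwkv5Config subclasses PretrainedConfig.
from configuration_rwkv5 import Rwkv5Config

# The defaults now target the RWKV-5 "world" 1B5 setup:
# vocab_size=65536, num_hidden_layers=24, hidden_size=768.
config = Rwkv5Config()
print(config.vocab_size, config.num_hidden_layers, config.hidden_size)

# The commit adds num_attention_heads alongside head_size and drops
# context_length/model_version; attention_hidden_size still falls back
# to hidden_size when left unset.
custom = Rwkv5Config(
    hidden_size=1024,
    num_hidden_layers=32,
    num_attention_heads=64,
    head_size=64,
    rescale_every=6,
)
print(custom.attention_hidden_size)  # 1024, i.e. falls back to hidden_size
print(custom.num_attention_heads, custom.head_size)
```

Note that `intermediate_size` is still hard-coded to `None` in `__init__` (the body assigns `self.intermediate_size = None` regardless of the argument), so passing it has no effect; that behaviour is unchanged by this commit.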