Yixin Song committed on
Commit 8bb0c7b
1 Parent(s): 3267a95
README.md CHANGED
@@ -1,3 +1,32 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ language:
+ - en
+ ---
+
+ # Model Card for TurboSparse-Mixtral
+ The TurboSparse-Mixtral Large Language Model (LLM) is a sparsified version of Mixtral.
+
+ <img src="takeaway.png" alt="avatar" width="300" height="200"/>
+
+ The average performance is evaluated using benchmarks from the OpenLLM Leaderboard.
+
+ ## Inference
+
+ Our code for accelerating TurboSparse-Mixtral is currently being refined. Stay tuned! For now, you can run this model like a dense model (see the loading sketch after this diff).
+
+ ## Chat Template
+
+ During sparsification, we also utilize some SFT datasets.
+ We use ChatML as our chat template:
+ ```
+ <|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n
+ ```
+
+ ## Finetuning
+
+ Because we merged the predictors for FFN neurons into the model, you can finetune TurboSparse-Mixtral with any framework and algorithm.
+
+ ## License
+
+ The model is licensed under Apache-2.0. The model weights are fully open for academic research and also allow **free** commercial usage.
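As referenced in the Inference section above, here is a minimal loading sketch. It is not the accelerated inference path (that code is still being refined); it simply runs the checkpoint as a dense model through 🤗 Transformers and builds the ChatML prompt by hand. The repo ID `PowerInfer/TurboSparse-Mixtral`, the dtype/device settings, and the generation parameters are illustrative assumptions, not part of this commit.

```python
# Minimal sketch: run TurboSparse-Mixtral as a plain dense model with transformers.
# Assumptions: the repo ID, dtype/device settings, and generation length are illustrative.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "PowerInfer/TurboSparse-Mixtral"  # assumed repo ID; adjust to the actual one

tokenizer = AutoTokenizer.from_pretrained(model_id)
# trust_remote_code is needed because the config/modeling classes ship inside the repo (see auto_map).
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto", trust_remote_code=True
)

# Build the ChatML prompt exactly as documented in the README above.
prompt = "<|im_start|>user\nExplain activation sparsity in one sentence.<|im_end|>\n<|im_start|>assistant\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
```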
added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "<|im_end|>": 57000,
+   "<|im_start|>": 57001
+ }
config.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "architectures": [
+     "TurboSparseMixtralForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_turbosparsemixtral.TurboSparseMixtralConfig",
+     "AutoModel": "modeling_turbosparsemixtral.TurboSparseMixtralForCausalLM",
+     "AutoModelForCausalLM": "modeling_turbosparsemixtral.TurboSparseMixtralForCausalLM"
+   },
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "relu",
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 14336,
+   "max_position_embeddings": 32768,
+   "model_type": "turbosparsemixtral",
+   "num_attention_heads": 32,
+   "num_experts_per_tok": 2,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 8,
+   "num_local_experts": 8,
+   "output_router_logits": true,
+   "rms_norm_eps": 1e-05,
+   "rope_theta": 1000000.0,
+   "router_aux_loss_coef": 0.02,
+   "router_jitter_noise": 0.0,
+   "sliding_window": null,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.41.0",
+   "use_cache": false,
+   "vocab_size": 57024
+ }
configuration_turbosparsemixtral.py ADDED
@@ -0,0 +1,170 @@
+ # coding=utf-8
+ # Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """TurboSparseMixtral model configuration, adapted from the Mistral/Mixtral configuration."""
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class TurboSparseMixtralConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`MixtralModel`]. It is used to instantiate a
+     Mixtral model according to the specified arguments, defining the model architecture. Instantiating a configuration
+     with the defaults will yield a similar configuration to that of the Mixtral-7B-v0.1 or Mixtral-7B-Instruct-v0.1.
+
+     [mixtralai/Mixtral-8x7B](https://huggingface.co/mixtralai/Mixtral-8x7B)
+     [mixtralai/Mixtral-7B-Instruct-v0.1](https://huggingface.co/mixtralai/Mixtral-7B-Instruct-v0.1)
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 32000):
+             Vocabulary size of the Mixtral model. Defines the number of different tokens that can be represented by
+             the `inputs_ids` passed when calling [`MixtralModel`]
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 14336):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         num_key_value_heads (`int`, *optional*, defaults to 8):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
+             `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
+             constructed by meanpooling all the original heads within that group. For more details check out [this
+             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
+             The maximum sequence length that this model might ever be used with. Mixtral's sliding window attention
+             allows sequences of up to 4096*32 tokens.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         pad_token_id (`int`, *optional*):
+             The id of the padding token.
+         bos_token_id (`int`, *optional*, defaults to 1):
+             The id of the "beginning-of-sequence" token.
+         eos_token_id (`int`, *optional*, defaults to 2):
+             The id of the "end-of-sequence" token.
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether the model's input and output word embeddings should be tied.
+         rope_theta (`float`, *optional*, defaults to 1000000.0):
+             The base period of the RoPE embeddings.
+         sliding_window (`int`, *optional*):
+             Sliding window attention window size. If not specified, will default to `4096`.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         num_experts_per_tok (`int`, *optional*, defaults to 2):
+             The number of experts to route per-token; can also be interpreted as the `top-k` routing parameter.
+         num_local_experts (`int`, *optional*, defaults to 8):
+             Number of experts per Sparse MLP layer.
+         output_router_logits (`bool`, *optional*, defaults to `False`):
+             Whether or not the router logits should be returned by the model. Enabling this will also
+             allow the model to output the auxiliary loss. See [here]() for more details
+         router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+             The aux loss factor for the total loss.
+         router_jitter_noise (`float`, *optional*, defaults to 0.0):
+             Amount of noise to add to the router.
+
+     ```python
+     >>> from transformers import MixtralModel, MixtralConfig
+
+     >>> # Initializing a Mixtral 7B style configuration
+     >>> configuration = MixtralConfig()
+
+     >>> # Initializing a model from the Mixtral 7B style configuration
+     >>> model = MixtralModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "turbosparsemixtral"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=32000,
+         hidden_size=4096,
+         intermediate_size=14336,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         num_key_value_heads=8,
+         hidden_act="silu",
+         max_position_embeddings=4096 * 32,
+         initializer_range=0.02,
+         rms_norm_eps=1e-5,
+         use_cache=True,
+         pad_token_id=None,
+         bos_token_id=1,
+         eos_token_id=2,
+         tie_word_embeddings=False,
+         rope_theta=1e6,
+         sliding_window=None,
+         attention_dropout=0.0,
+         num_experts_per_tok=2,
+         num_local_experts=8,
+         output_router_logits=False,
+         router_aux_loss_coef=0.001,
+         router_jitter_noise=0.0,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.sliding_window = sliding_window
+
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.attention_dropout = attention_dropout
+
+         self.num_experts_per_tok = num_experts_per_tok
+         self.num_local_experts = num_local_experts
+         self.output_router_logits = output_router_logits
+         self.router_aux_loss_coef = router_aux_loss_coef
+         self.router_jitter_noise = router_jitter_noise
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
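Because `config.json` maps `AutoConfig` to the `TurboSparseMixtralConfig` class added above, the configuration can be loaded through the Auto classes with `trust_remote_code=True`. A short illustrative snippet follows; the repo ID is an assumption, and the printed values mirror the shipped `config.json` rather than the class defaults:

```python
# Sketch only: "PowerInfer/TurboSparse-Mixtral" is an assumed repo ID.
from transformers import AutoConfig

# Resolved to TurboSparseMixtralConfig via the auto_map entry in config.json.
config = AutoConfig.from_pretrained("PowerInfer/TurboSparse-Mixtral", trust_remote_code=True)

print(config.model_type)         # "turbosparsemixtral"
print(config.hidden_act)         # "relu" -- the sparsified checkpoint overrides the "silu" default
print(config.num_local_experts)  # 8 experts, with top-2 routing (num_experts_per_tok=2)
print(config.vocab_size)         # 57024, which includes the added <|im_start|>/<|im_end|> tokens
```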
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "transformers_version": "4.41.0",
+   "use_cache": false
+ }
modeling_turbosparsemixtral.py ADDED
The diff for this file is too large to render. See raw diff
 
takeaway.png ADDED
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:86840d604f9e18ebbdc35aa937cfc2486fe774534ceea0fd3f667a72bc7584b2
+ size 925420
tokenizer_config.json ADDED
@@ -0,0 +1,62 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "57000": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "57001": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>"
+   ],
+   "bos_token": "<s>",
+   "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "model_max_length": 32768,
+   "pad_token": "</s>",
+   "padding_side": "right",
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "split_special_tokens": false,
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
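As a quick sanity check of the Jinja `chat_template` above, the sketch below (assumed repo ID again) renders a short conversation with `apply_chat_template` and prints the resulting ChatML string, which should match the format documented in the README:

```python
# Sketch: verify that the shipped chat_template produces the documented ChatML layout.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("PowerInfer/TurboSparse-Mixtral")  # assumed repo ID

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hi!"},
]
# Note: this template already appends the assistant header after each user turn,
# so no extra generation-prompt flag is required.
text = tokenizer.apply_chat_template(messages, tokenize=False)
print(text)
# Expected output:
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Hi!<|im_end|>
# <|im_start|>assistant
```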