niuwz committed
Commit 059744b
Parent: d2528ab

upload model and config files for mini-Chinese-Phi3

config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_name_or_path": "fine_tuned/sft",
+   "architectures": [
+     "MiniPhi3"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 2,
+   "embd_pdrop": 0.0,
+   "eos_token_id": 1,
+   "hidden_act": "silu",
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 2048,
+   "max_position_embeddings": 512,
+   "model_type": "phi3",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "num_key_value_heads": 12,
+   "original_max_position_embeddings": 512,
+   "pad_token_id": 0,
+   "resid_pdrop": 0.0,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.41.2",
+   "use_cache": true,
+   "use_cope": false,
+   "vocab_size": 32064
+ }
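
For reference, a minimal sketch of loading this configuration through the custom config class added in this commit. It assumes the repository files have been downloaded to a local directory; the path is illustrative, not part of the commit:

from configuation_miniPhi3 import MiniPhiConfig

# reads config.json from a local copy of the repo; the path is an assumption
config = MiniPhiConfig.from_pretrained("./mini-Chinese-Phi3")
print(config.hidden_size, config.num_hidden_layers, config.use_cope)  # expected from config.json: 768 12 False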
configuation_miniPhi3.py ADDED
@@ -0,0 +1,111 @@
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+
+ class MiniPhiConfig(PretrainedConfig):
+
+     model_type = "phi3"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=32000,
+         hidden_size=768,
+         intermediate_size=2048,
+         num_hidden_layers=12,
+         num_attention_heads=12,
+         num_key_value_heads=None,
+         resid_pdrop=0.0,
+         embd_pdrop=0.0,
+         attention_dropout=0.0,
+         hidden_act="silu",
+         max_position_embeddings=512,
+         original_max_position_embeddings=512,
+         initializer_range=0.02,
+         rms_norm_eps=1e-5,
+         use_cache=True,
+         tie_word_embeddings=False,
+         rope_theta=10000.0,
+         rope_scaling=None,
+         bos_token_id=2,
+         eos_token_id=1,
+         pad_token_id=0,
+         sliding_window=None,
+         use_cope=True,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.resid_pdrop = resid_pdrop
+         self.embd_pdrop = embd_pdrop
+         self.attention_dropout = attention_dropout
+         self.hidden_act = hidden_act
+         self.max_position_embeddings = max_position_embeddings
+         self.original_max_position_embeddings = original_max_position_embeddings
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self._rope_scaling_validation()
+         self.sliding_window = sliding_window
+         self.use_cope = use_cope
+
+         super().__init__(
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             pad_token_id=pad_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+     def _rope_scaling_validation(self):
+         """
+         Validate the `rope_scaling` configuration.
+         """
+         if self.rope_scaling is None:
+             return
+
+         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
+             raise ValueError(
+                 "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
+                 f"got {self.rope_scaling}"
+             )
+         rope_scaling_type = self.rope_scaling.get("type", None)
+         rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
+         rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
+         if rope_scaling_type is None or rope_scaling_type not in ["su", "yarn"]:
+             raise ValueError(
+                 f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")
+         if not (
+             isinstance(rope_scaling_short_factor, list)
+             and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
+         ):
+             raise ValueError(
+                 f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
+             )
+         if not len(rope_scaling_short_factor) == self.hidden_size // self.num_attention_heads // 2:
+             raise ValueError(
+                 f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}"
+             )
+         if not (
+             isinstance(rope_scaling_long_factor, list)
+             and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
+         ):
+             raise ValueError(
+                 f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
+             )
+         if not len(rope_scaling_long_factor) == self.hidden_size // self.num_attention_heads // 2:
+             raise ValueError(
+                 f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}"
+             )
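
A small sketch of how the validator above behaves when the config is instantiated directly. With hidden_size=768 and num_attention_heads=12, head_dim // 2 = 768 // 12 // 2 = 32, so any rope_scaling dict must provide short_factor and long_factor lists of exactly 32 numbers. The values below are illustrative, not taken from the released checkpoint:

from configuation_miniPhi3 import MiniPhiConfig

# defaults mirror config.json except use_cope (True here, False in the shipped config.json)
cfg = MiniPhiConfig(vocab_size=32064, use_cope=False)

# rope_scaling must carry "type", "short_factor" and "long_factor";
# both factor lists need hidden_size // num_attention_heads // 2 = 32 entries
cfg_scaled = MiniPhiConfig(
    vocab_size=32064,
    rope_scaling={"type": "su", "short_factor": [1.0] * 32, "long_factor": [1.0] * 32},
)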
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 2,
+   "eos_token_id": 1,
+   "pad_token_id": 0,
+   "transformers_version": "4.41.2"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:115b4d2487314a3395318200afd5c4f952d1a38a4bf28835ba38f7300eeeadfb
+ size 536825040
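
As a rough sanity check (an estimate, not stated in the commit itself): at the declared torch_dtype of float32, 536,825,040 bytes / 4 bytes per parameter ≈ 134.2M parameters, consistent with the "approximately 0.13B parameters" note in the MiniPhi3 docstring below.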
modeling_miniphi3.py ADDED
@@ -0,0 +1,369 @@
+ from transformers.cache_utils import Cache
+ from transformers.models.phi3.configuration_phi3 import Phi3Config
+ from transformers.models.phi3.modeling_phi3 import repeat_kv, Phi3Attention, Phi3Model, Phi3ForCausalLM, apply_rotary_pos_emb, Phi3FlashAttention2
+ from configuation_miniPhi3 import MiniPhiConfig
+ from typing import List, Optional, Tuple, Union
+ from transformers.utils import (
+     add_code_sample_docstrings,
+     add_start_docstrings,
+     add_start_docstrings_to_model_forward,
+     is_flash_attn_2_available,
+     is_flash_attn_greater_or_equal_2_10,
+     logging,
+     replace_return_docstrings,
+ )
+ import warnings
+
+ import inspect
+ if is_flash_attn_2_available():
+     from flash_attn import flash_attn_func, flash_attn_varlen_func
+     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+
+     _flash_supports_window_size = "window_size" in list(
+         inspect.signature(flash_attn_func).parameters)
+
+ import math
+ logger = logging.get_logger(__name__)
+ import torch
+ import torch.nn as nn
+
+ from einops import einsum
+
+
+ class CoPE(nn.Module):
+     def __init__(self, npos_max, head_dim):
+         super().__init__()
+         self.npos_max = npos_max
+         self.pos_emb = nn.parameter.Parameter(
+             torch.zeros(1, head_dim, npos_max))
+
+     def forward(self, query, attn_logits):
+         # compute positions
+         gates = torch.sigmoid(attn_logits)
+         pos = gates.flip(-1).cumsum(dim=-1).flip(-1)
+         pos = pos.clamp(max=self.npos_max - 1)
+         # interpolate from integer positions
+         pos_ceil = pos.ceil().long()
+         pos_floor = pos.floor().long()
+         logits_int = torch.matmul(query, self.pos_emb)
+         logits_ceil = logits_int.gather(-1, pos_ceil)
+         logits_floor = logits_int.gather(-1, pos_floor)
+         w = pos - pos_floor
+         return logits_ceil * w + logits_floor * (1 - w)
+
+
+ class MiniPhi3Attention(Phi3Attention):
+     def __init__(self, config: MiniPhiConfig, origin_params):
+         super().__init__(config, layer_idx=0)
+         self.__replace_param(origin_params)
+         self.cope = CoPE(self.max_position_embeddings, self.head_dim)
+
+     def __replace_param(self, origin_params: dict):
+         self.__dict__.update(origin_params)
+         del self.rotary_emb
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_value=None,
+         output_attentions: bool = False,
+         use_cache: bool = False,
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+
+         bsz, q_len, _ = hidden_states.size()
+
+         qkv = self.qkv_proj(hidden_states)
+         query_pos = self.num_heads * self.head_dim
+         query_states = qkv[..., :query_pos]
+         key_states = qkv[..., query_pos: query_pos +
+                          self.num_key_value_heads * self.head_dim]
+         value_states = qkv[..., query_pos +
+                            self.num_key_value_heads * self.head_dim:]
+
+         query_states = query_states.view(
+             bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+         key_states = key_states.view(
+             bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+         value_states = value_states.view(
+             bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+         kv_seq_len = key_states.shape[-2]
+         if past_key_value is not None:
+             if self.layer_idx is None:
+                 raise ValueError(
+                     f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                     "with a layer index."
+                 )
+             kv_seq_len += past_key_value.get_usable_length(
+                 kv_seq_len, self.layer_idx)
+         # cos, sin = self.rotary_emb(
+         #     value_states, position_ids, seq_len=kv_seq_len)
+
+         # query_states, key_states = apply_rotary_pos_emb(
+         #     query_states, key_states, cos, sin, position_ids)
+
+         if past_key_value is not None:
+             # cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+             # key_states, value_states = past_key_value.update(
+             #     key_states, value_states, self.layer_idx, cache_kwargs)
+             key_states, value_states = past_key_value.update(
+                 key_states, value_states, self.layer_idx)
+
+         # repeat k/v heads if n_kv_heads < n_heads
+         key_states = repeat_kv(key_states, self.num_key_value_groups)
+         value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+         attn_weights = torch.matmul(
+             query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+         if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+             raise ValueError(
+                 f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                 f" {attn_weights.size()}"
+             )
+
+         if attention_mask is not None:
+             if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                 raise ValueError(
+                     f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                 )
+             attn_weights = attn_weights + attention_mask
+
+         attn_weights = self.cope(query_states, attn_weights)
+         # upcast attention to fp32
+         attn_weights = nn.functional.softmax(
+             attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype)
+         attn_weights = nn.functional.dropout(
+             attn_weights, p=self.attention_dropout, training=self.training)
+
+         attn_output = torch.matmul(attn_weights, value_states)
+
+         if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+             raise ValueError(
+                 f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                 f" {attn_output.size()}"
+             )
+
+         attn_output = attn_output.transpose(1, 2).contiguous()
+         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+         attn_output = self.o_proj(attn_output)
+
+         if not output_attentions:
+             attn_weights = None
+
+         return attn_output, attn_weights, past_key_value
+
+
+ class MiniPhi3FlashAttention2(Phi3FlashAttention2):
+     def __init__(self, config: MiniPhiConfig, origin_params):
+         super().__init__(config, layer_idx=0)
+         self.__replace_param(origin_params)
+         # Flash attention does not support CoPE
+         self.cope = CoPE(self.max_position_embeddings, self.head_dim)
+
+     def __replace_param(self, origin_params: dict):
+         self.__dict__.update(origin_params)
+         del self.rotary_emb
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.LongTensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_value: Optional[Cache] = None,
+         output_attentions: bool = False,
+         use_cache: bool = False,
+         **kwargs,
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+         # Phi3FlashAttention2 attention does not support output_attentions
+
+         if not _flash_supports_window_size:
+             logger.warning_once(
+                 "The current flash attention version does not support sliding window attention. Please use `attn_implementation='eager'` or upgrade flash-attn library."
+             )
+             raise ValueError(
+                 "The current flash attention version does not support sliding window attention.")
+
+         output_attentions = False
+
+         if "padding_mask" in kwargs:
+             warnings.warn(
+                 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead."
+             )
+
+             # overwrite attention_mask with padding_mask
+             attention_mask = kwargs.pop("padding_mask")
+
+         bsz, q_len, _ = hidden_states.size()
+
+         qkv = self.qkv_proj(hidden_states)
+         query_pos = self.num_heads * self.head_dim
+         query_states = qkv[..., :query_pos]
+         key_states = qkv[..., query_pos: query_pos +
+                          self.num_key_value_heads * self.head_dim]
+         value_states = qkv[..., query_pos +
+                            self.num_key_value_heads * self.head_dim:]
+
+         # Flash attention requires the input to have the shape
+         # batch_size x seq_length x head_dim x hidden_dim
+         # therefore we just need to keep the original shape
+         query_states = query_states.view(
+             bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+         key_states = key_states.view(
+             bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+         value_states = value_states.view(
+             bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+         kv_seq_len = key_states.shape[-2]
+         if past_key_value is not None:
+             if self.layer_idx is None:
+                 raise ValueError(
+                     f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                     "with a layer index."
+                 )
+             kv_seq_len += past_key_value.get_usable_length(
+                 kv_seq_len, self.layer_idx)
+
+         # Because the input can be padded, the absolute sequence length depends on the max position id.
+         rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
+         # cos, sin = self.rotary_emb(
+         #     value_states, position_ids, seq_len=rotary_seq_len)
+
+         # query_states, key_states = apply_rotary_pos_emb(
+         #     query_states, key_states, cos, sin, position_ids)
+
+         use_sliding_windows = (
+             _flash_supports_window_size
+             and getattr(self.config, "sliding_window", None) is not None
+             and kv_seq_len > self.config.sliding_window
+         )
+
+         if past_key_value is not None:
+             # Activate slicing cache only if the config has a value `sliding_windows` attribute
+             cache_has_contents = past_key_value.get_seq_length(
+                 self.layer_idx) > 0
+             if (
+                 getattr(self.config, "sliding_window", None) is not None
+                 and kv_seq_len > self.config.sliding_window
+                 and cache_has_contents
+             ):
+                 slicing_tokens = 1 - self.config.sliding_window
+
+                 past_key = past_key_value[self.layer_idx][0]
+                 past_value = past_key_value[self.layer_idx][1]
+
+                 past_key = past_key[:, :, slicing_tokens:, :].contiguous()
+                 past_value = past_value[:, :, slicing_tokens:, :].contiguous()
+
+                 if past_key.shape[-2] != self.config.sliding_window - 1:
+                     raise ValueError(
+                         f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
+                         f" {past_key.shape}"
+                     )
+
+                 if attention_mask is not None:
+                     attention_mask = attention_mask[:, slicing_tokens:]
+                     attention_mask = torch.cat(
+                         [attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
+
+             # cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+             key_states, value_states = past_key_value.update(
+                 key_states, value_states, self.layer_idx)
+
+         # repeat k/v heads if n_kv_heads < n_heads
+         key_states = repeat_kv(key_states, self.num_key_value_groups)
+         value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+         attn_dropout = self.attention_dropout if self.training else 0.0
+
+         # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+         # therefore the input hidden states gets silently casted in float32. Hence, we need
+         # cast them back in the correct dtype just to be sure everything works as expected.
+         # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+         # in fp32.
+
+         if query_states.dtype == torch.float32:
+             if torch.is_autocast_enabled():
+                 target_dtype = torch.get_autocast_gpu_dtype()
+             # Handle the case where the model is quantized
+             elif hasattr(self.config, "_pre_quantization_dtype"):
+                 target_dtype = self.config._pre_quantization_dtype
+             else:
+                 target_dtype = self.qkv_proj.weight.dtype
+
+             logger.warning_once(
+                 f"The input hidden states seems to be silently casted in float32, this might be related to"
+                 f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                 f" {target_dtype}."
+             )
+
+             query_states = query_states.to(target_dtype)
+             key_states = key_states.to(target_dtype)
+             value_states = value_states.to(target_dtype)
+
+         # Reshape to the expected shape for Flash Attention
+         query_states = query_states.transpose(1, 2)
+         key_states = key_states.transpose(1, 2)
+         value_states = value_states.transpose(1, 2)
+
+         attn_output = self._flash_attention_forward(
+             query_states,
+             key_states,
+             value_states,
+             attention_mask,
+             q_len,
+             dropout=attn_dropout,
+             use_sliding_windows=use_sliding_windows,
+         )
+
+         attn_output = attn_output.reshape(
+             bsz, q_len, self.hidden_size).contiguous()
+         attn_output = self.o_proj(attn_output)
+
+         if not output_attentions:
+             attn_weights = None
+
+         return attn_output, attn_weights, past_key_value
+
+
+ class MiniPhi3(Phi3ForCausalLM):
+     """
+     Approximately 0.13B parameters.
+     MiniPhi3(
+       (embed_tokens): Embedding(32000, 768, padding_idx=0)
+       (embed_dropout): Dropout(p=0.0, inplace=False)
+       (layers): ModuleList(
+         (0-11): 12 x Phi3DecoderLayer(
+           (self_attn): Phi3Attention(
+             (o_proj): Linear(in_features=768, out_features=768, bias=False)
+             (qkv_proj): Linear(in_features=768, out_features=2304, bias=False)
+             (rotary_emb): Phi3RotaryEmbedding()
+           )
+           (mlp): Phi3MLP(
+             (gate_up_proj): Linear(in_features=768, out_features=4096, bias=False)
+             (down_proj): Linear(in_features=2048, out_features=768, bias=False)
+             (activation_fn): SiLU()
+           )
+           (input_layernorm): Phi3RMSNorm()
+           (resid_attn_dropout): Dropout(p=0.0, inplace=False)
+           (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
+           (post_attention_layernorm): Phi3RMSNorm()
+         )
+       )
+       (norm): Phi3RMSNorm()
+     )
+     """
+
+     def __init__(self, config: MiniPhiConfig):
+         super().__init__(config)
+         # CoPE was originally planned to be added to Phi3, but it is shelved for now because it does not yet work with Flash Attention.
+         if config.use_cope:
+             ATTN_CLS = MiniPhi3FlashAttention2 if config._attn_implementation == "flash_attention_2" else MiniPhi3Attention
+             for i, layer in enumerate(self.model.layers):
+                 layer.self_attn = ATTN_CLS(
+                     config, layer.self_attn.__dict__)
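
A hedged end-to-end sketch of loading this checkpoint from a local copy of the repository. The custom classes are not registered via auto_map in config.json, so they are imported directly here; the local path and the prompt layout are assumptions for illustration only (no chat template ships in this commit):

import torch
from transformers import AutoTokenizer
from configuation_miniPhi3 import MiniPhiConfig
from modeling_miniphi3 import MiniPhi3

path = "./mini-Chinese-Phi3"  # assumed local clone of this repo
config = MiniPhiConfig.from_pretrained(path)
model = MiniPhi3.from_pretrained(path, config=config, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained(path)

# [user]/[end]/[assistant] come from special_tokens_map.json; this prompt layout is a guess
prompt = "[user]你好[end][assistant]"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64, eos_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))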
special_tokens_map.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "additional_special_tokens": [
+     "[user]",
+     "[end]",
+     "[assistant]"
+   ],
+   "bos_token": {
+     "content": "[BOS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "[EOS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,79 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[EOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[BOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "31998": {
+       "content": "\t",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "31999": {
+       "content": "\n",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "32000": {
+       "content": "[user]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32001": {
+       "content": "[end]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32002": {
+       "content": "[assistant]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "[user]",
+     "[end]",
+     "[assistant]"
+   ],
+   "bos_token": "[BOS]",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "[EOS]",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "tokenizer_class": "PreTrainedTokenizerFast"
+ }
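
For quick verification, a small sketch (again assuming a local copy of the repository) that the tokenizer exposes the ids declared in added_tokens_decoder above, which also match the bos/eos/pad ids in config.json and generation_config.json:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./mini-Chinese-Phi3")  # path is an assumption
print(tok.pad_token_id, tok.eos_token_id, tok.bos_token_id)             # expected: 0 1 2
print(tok.convert_tokens_to_ids(["[user]", "[end]", "[assistant]"]))    # expected: [32000, 32001, 32002]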