Jan Philipp Harries commited on
Commit
401cb9c
0 Parent(s):
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ /pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/workspace/models/phi-1_5",
3
+ "activation_function": "gelu_new",
4
+ "architecture": {
5
+ "block_cls": "parallel",
6
+ "mixer": {},
7
+ "mlp": {
8
+ "mlp_cls": "mlp"
9
+ }
10
+ },
11
+ "architectures": [
12
+ "MixFormerSequentialForCausalLM"
13
+ ],
14
+ "auto_map": {
15
+ "AutoConfig": "configuration_mixformer_sequential.MixFormerSequentialConfig",
16
+ "AutoModelForCausalLM": "modeling_mixformer_sequential.MixFormerSequentialForCausalLM"
17
+ },
18
+ "embd_layer": "default",
19
+ "embd_pdrop": 0.0,
20
+ "initializer_range": 0.02,
21
+ "layer_norm_epsilon": 1e-05,
22
+ "model_type": "mixformer-sequential",
23
+ "n_embd": 2048,
24
+ "n_head": 32,
25
+ "n_inner": null,
26
+ "n_layer": 24,
27
+ "n_positions": 2048,
28
+ "phyagi_version": "0.0.4.dev",
29
+ "resid_pdrop": 0.0,
30
+ "rotary_dim": 32,
31
+ "tie_word_embeddings": false,
32
+ "torch_dtype": "bfloat16",
33
+ "transformers_version": "4.33.1",
34
+ "use_cache": false,
35
+ "vocab_size": 50304
36
+ }
configuration_mixformer_sequential.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+
4
+ import math
5
+ from typing import Any, Dict, List, Optional, Union
6
+
7
+ from transformers import PretrainedConfig
8
+
9
+
10
+ class MixFormerSequentialConfig(PretrainedConfig):
11
+ """MixFormer (sequential for DeepSpeed) configuration."""
12
+
13
+ model_type = "mixformer-sequential"
14
+
15
+ attribute_map = {
16
+ "max_position_embeddings": "n_positions",
17
+ "hidden_size": "n_embd",
18
+ "num_attention_heads": "n_head",
19
+ "num_hidden_layers": "n_layer",
20
+ "input_emb_layer": "embd_layer", # `input_emb_layer` key is for backward compatibility
21
+ "blocks": "architecture", # `blocks` key is for backward compatibility
22
+ }
23
+
24
+ def __init__(
25
+ self,
26
+ vocab_size: Optional[int] = 50304,
27
+ n_positions: Optional[int] = 2048,
28
+ n_embd: Optional[int] = 1024,
29
+ n_layer: Optional[int] = 20,
30
+ n_inner: Optional[int] = None,
31
+ n_head: Optional[int] = 16,
32
+ rotary_dim: Optional[int] = 32,
33
+ activation_function: Optional[str] = "gelu_new",
34
+ embd_layer: Optional[str] = "default",
35
+ architecture: Union[Dict[str, Any], List[Dict[str, Any]]] = None,
36
+ embd_pdrop: Optional[float] = 0.0,
37
+ resid_pdrop: Optional[float] = 0.0,
38
+ layer_norm_epsilon: Optional[float] = 1e-5,
39
+ initializer_range: Optional[float] = 0.02,
40
+ tie_word_embeddings: Optional[bool] = False,
41
+ pad_vocab_size_multiple: Optional[int] = 64,
42
+ **kwargs
43
+ ) -> None:
44
+ #self.vocab_size = int(math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple)
45
+ #see https://huggingface.co/teknium/Puffin-Phi/commit/4648d063244250ea9612c241ff996a41b101c9ad
46
+ self.vocab_size = vocab_size
47
+ self.n_positions = n_positions
48
+ self.n_embd = n_embd
49
+ self.n_layer = n_layer
50
+ self.n_inner = n_inner
51
+ self.n_head = n_head
52
+ self.rotary_dim = min(rotary_dim, n_embd // n_head)
53
+ self.activation_function = activation_function
54
+ self.embd_layer = embd_layer
55
+ self.architecture = architecture
56
+ self.embd_pdrop = embd_pdrop
57
+ self.resid_pdrop = resid_pdrop
58
+ self.layer_norm_epsilon = layer_norm_epsilon
59
+ self.initializer_range = initializer_range
60
+
61
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.33.1"
4
+ }
latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step2250
modeling_mixformer_sequential.py ADDED
@@ -0,0 +1,778 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+
4
+ # BSD 3-Clause License
5
+ #
6
+ # Copyright (c) 2022, Tri Dao, trid@cs.stanford.edu.
7
+ # All rights reserved.
8
+ #
9
+ # Redistribution and use in source and binary forms, with or without
10
+ # modification, are permitted provided that the following conditions are met:
11
+ #
12
+ # * Redistributions of source code must retain the above copyright notice, this
13
+ # list of conditions and the following disclaimer.
14
+ #
15
+ # * Redistributions in binary form must reproduce the above copyright notice,
16
+ # this list of conditions and the following disclaimer in the documentation
17
+ # and/or other materials provided with the distribution.
18
+ #
19
+ # * Neither the name of the copyright holder nor the names of its
20
+ # contributors may be used to endorse or promote products derived from
21
+ # this software without specific prior written permission.
22
+ #
23
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
27
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
+
34
+ from __future__ import annotations
35
+
36
+ import math
37
+ import copy
38
+ from typing import Any, Dict, Optional, Tuple
39
+ from dataclasses import dataclass, field
40
+
41
+ import torch
42
+ import torch.nn as nn
43
+
44
+ from einops import rearrange
45
+ from transformers.activations import ACT2FN
46
+ from transformers import PretrainedConfig, PreTrainedModel
47
+ from transformers.modeling_outputs import CausalLMOutputWithPast
48
+
49
+ from .configuration_mixformer_sequential import MixFormerSequentialConfig
50
+
51
+ @dataclass
52
+ class InferenceParams:
53
+ """Inference parameters that are passed to the main model in order
54
+ to efficienly calculate and store the context during inference.
55
+ Adapted from https://github.com/Dao-AILab/flash-attention."""
56
+ max_sequence_len: int
57
+ max_batch_size: int
58
+ sequence_len_offset: int = 0
59
+ batch_size_offset: int = 0
60
+ key_value_memory_dict: dict = field(default_factory=dict)
61
+ fused_ft_kernel: bool = False
62
+ lengths_per_sample: Optional[torch.Tensor] = None
63
+
64
+
65
+ class Embedding(nn.Module):
66
+ """Token embedding with dropout."""
67
+
68
+ def __init__(self, config: PretrainedConfig) -> None:
69
+ super().__init__()
70
+
71
+ self.wte = nn.Embedding(config.vocab_size, config.n_embd)
72
+ self.drop = nn.Dropout(config.embd_pdrop)
73
+
74
+ def forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
75
+ input_shape = input_ids.size()
76
+ input_ids = input_ids.view(-1, input_shape[-1])
77
+
78
+ hidden_states = self.wte(input_ids)
79
+ hidden_states = self.drop(hidden_states)
80
+
81
+ return hidden_states
82
+
83
+ class RotaryEmbedding(nn.Module):
84
+ """PyTorch implementation of `flash-attn` RotaryEmbedding layer.
85
+ Adapted from https://github.com/Dao-AILab/flash-attention."""
86
+
87
+ def __init__(
88
+ self,
89
+ dim: int,
90
+ base: Optional[int] = 10000,
91
+ scale_base: Optional[float] = None,
92
+ device: Optional[str] = None,
93
+ **kwargs,
94
+ ) -> None:
95
+ super().__init__()
96
+
97
+ if scale_base is not None:
98
+ raise NotImplementedError
99
+
100
+ # Generate and save the inverse frequency buffer (non-trainable)
101
+ self.dim = dim
102
+ self.base = base
103
+ self.scale_base = scale_base
104
+ self.device = device
105
+
106
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
107
+ self.register_buffer("inv_freq", inv_freq)
108
+
109
+ scale = (
110
+ (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
111
+ if scale_base is not None
112
+ else None
113
+ )
114
+ self.register_buffer("scale", scale)
115
+
116
+ self._seq_len_cached = 0
117
+ self._cos_cached = None
118
+ self._sin_cached = None
119
+ self._cos_k_cached = None
120
+ self._sin_k_cached = None
121
+
122
+ def _update_cos_sin_cache(self, x: torch.FloatTensor, seqlen_offset: Optional[int] = 0) -> None:
123
+ # Reset the tables if the sequence length has changed,
124
+ # or if we're on a new device (possibly due to tracing for instance)
125
+ seqlen = x.shape[1] + seqlen_offset
126
+
127
+ # Re-generate the inverse frequency buffer if it's not fp32
128
+ # (for instance if model.half() was called)
129
+ if self.inv_freq.dtype != "torch.float32":
130
+ self.inv_freq = 1.0 / (
131
+ self.base ** (torch.arange(0, self.dim, 2, device=self.device, dtype=torch.float32) / self.dim)
132
+ )
133
+
134
+ if seqlen > self._seq_len_cached or self._cos_cached.device != x.device or self._cos_cached.dtype != x.dtype:
135
+ self._seq_len_cached = seqlen
136
+ t = torch.arange(seqlen, device=x.device, dtype=torch.float32)
137
+
138
+ # Don't do einsum, it converts fp32 to fp16
139
+ # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
140
+ freqs = torch.outer(t, self.inv_freq.to(device=t.device, dtype=torch.float32))
141
+ if self.scale is None:
142
+ self._cos_cached = torch.cos(freqs).to(x.dtype)
143
+ self._sin_cached = torch.sin(freqs).to(x.dtype)
144
+ else:
145
+ power = (
146
+ torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device) - seqlen // 2
147
+ ) / self.scale_base
148
+ scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
149
+
150
+ # We want the multiplication by scale to happen in fp32
151
+ self._cos_cached = (torch.cos(freqs) * scale).to(x.dtype)
152
+ self._sin_cached = (torch.sin(freqs) * scale).to(x.dtype)
153
+ self._cos_k_cached = (torch.cos(freqs) / scale).to(x.dtype)
154
+ self._sin_k_cached = (torch.sin(freqs) / scale).to(x.dtype)
155
+
156
+ def apply_rotary_emb_qkv(
157
+ self,
158
+ qkv: torch.FloatTensor,
159
+ sin: torch.FloatTensor,
160
+ cos: torch.FloatTensor,
161
+ sin_k: Optional[torch.FloatTensor] = None,
162
+ cos_k: Optional[torch.FloatTensor] = None,
163
+ ) -> torch.FloatTensor:
164
+ _, seqlen, three, _, headdim = qkv.shape
165
+ assert three == 3
166
+
167
+ rotary_seqlen, rotary_dim = cos.shape
168
+ rotary_dim *= 2
169
+ assert rotary_dim <= headdim
170
+ assert seqlen <= rotary_seqlen
171
+
172
+ cos_k = cos if cos_k is None else cos_k
173
+ sin_k = sin if sin_k is None else sin_k
174
+ assert sin.shape == cos_k.shape == sin_k.shape == (rotary_seqlen, rotary_dim // 2)
175
+
176
+ q_rot = qkv[:, :, 0, :, :rotary_dim]
177
+ q_pass = qkv[:, :, 0, :, rotary_dim:]
178
+
179
+ k_rot = qkv[:, :, 1, :, :rotary_dim]
180
+ k_pass = qkv[:, :, 1, :, rotary_dim:]
181
+
182
+ # Splits the queries and keys in half
183
+ q1, q2 = q_rot.chunk(2, dim=-1)
184
+ k1, k2 = k_rot.chunk(2, dim=-1)
185
+ c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(sin[:seqlen], "s d -> s 1 d")
186
+
187
+ # Casts to fp32 are necessary to prevent fp16 overflow issues
188
+ q1, q2, k1, k2, c, s = [t.to(dtype=torch.float32) for t in [q1, q2, k1, k2, c, s]]
189
+
190
+ # Computes the new keys and queries, recasting to original dtype
191
+ q_rot = torch.cat([q1 * c - q2 * s, q1 * s + q2 * c], axis=-1).to(qkv.dtype)
192
+
193
+ k_rot = torch.cat([k1 * c - k2 * s, k1 * s + k2 * c], axis=-1).to(qkv.dtype)
194
+
195
+ return torch.cat(
196
+ [
197
+ torch.cat([q_rot, q_pass], axis=-1).unsqueeze(2),
198
+ torch.cat([k_rot, k_pass], axis=-1).unsqueeze(2),
199
+ qkv[:, :, 2:3, :, :],
200
+ ],
201
+ axis=2,
202
+ )
203
+
204
+ def forward(self, qkv: torch.Tensor, seqlen_offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]:
205
+ """Perform the forward pass.
206
+
207
+ Args:
208
+ qkv: Query, key and value tensors of shape (batch, seqlen, nheads, headdim) or (batch, seqlen, 3, nheads, headdim).
209
+ seqlen_offset: Used in generation where the passed `qkv` is only the last token in the batch.
210
+
211
+ Returns:
212
+ New `qkv` and the cached sinusoids.
213
+
214
+ """
215
+
216
+ self._update_cos_sin_cache(qkv, seqlen_offset)
217
+
218
+ return self.apply_rotary_emb_qkv(qkv, self._sin_cached[seqlen_offset:], self._cos_cached[seqlen_offset:])
219
+
220
+ def _update_kv_cache(kv, inference_params, layer_idx):
221
+ """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)
222
+ Adapted from https://github.com/Dao-AILab/flash-attention."""
223
+ # Pre-allocate memory for key-values for inference.
224
+ num_heads, head_dim = kv.shape[-2:]
225
+ if layer_idx not in inference_params.key_value_memory_dict:
226
+ kv_cache = torch.empty(
227
+ inference_params.max_batch_size, inference_params.max_sequence_len, 2,
228
+ num_heads, head_dim, dtype=kv.dtype, device=kv.device
229
+ )
230
+ inference_params.key_value_memory_dict[layer_idx] = kv_cache
231
+ else:
232
+ kv_cache = inference_params.key_value_memory_dict[layer_idx]
233
+
234
+ # Adjust key and value for inference
235
+ batch_start = inference_params.batch_size_offset
236
+ batch_end = batch_start + kv.shape[0]
237
+ sequence_start = inference_params.sequence_len_offset
238
+ sequence_end = sequence_start + kv.shape[1]
239
+ assert batch_end <= (kv_cache.shape[0] if kv_cache is not None else v_cache.shape[0])
240
+ assert sequence_end <= (kv_cache.shape[1] if kv_cache is not None else v_cache.shape[2])
241
+
242
+ assert kv_cache is not None
243
+ kv_cache[batch_start:batch_end, sequence_start:sequence_end, ...] = kv
244
+ kv = kv_cache[batch_start:batch_end, :sequence_end, ...]
245
+ return kv
246
+
247
+
248
+ class MLP(nn.Module):
249
+ """Multi-Layer Perceptron.
250
+
251
+ Reference:
252
+ Attention Is All You Need.
253
+ https://arxiv.org/pdf/1706.03762.pdf.
254
+
255
+ """
256
+
257
+ def __init__(self, config: PretrainedConfig, n_inner: Optional[int] = None, act_fn: Optional[str] = None) -> None:
258
+ super().__init__()
259
+
260
+ act_fn = config.activation_function if act_fn is None else act_fn
261
+ assert act_fn in ACT2FN.keys(), f"`act_fn` must be one of: {ACT2FN.keys()}."
262
+
263
+ n_inner = getattr(config, "n_inner", None) if n_inner is None else n_inner
264
+ n_inner = n_inner if n_inner is not None else 4 * config.n_embd
265
+
266
+ self.fc1 = nn.Linear(config.n_embd, n_inner)
267
+ self.fc2 = nn.Linear(n_inner, config.n_embd)
268
+ self.act = ACT2FN[act_fn]
269
+
270
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
271
+ old_keys = [prefix + "fc_in.weight", prefix + "fc_out.weight", prefix + "fc_in.bias", prefix + "fc_out.bias"]
272
+ new_keys = [prefix + "fc1.weight", prefix + "fc2.weight", prefix + "fc1.bias", prefix + "fc2.bias"]
273
+
274
+ if all(k in state_dict for k in old_keys) and not all(k in state_dict for k in new_keys):
275
+ # Older version of `MLP` saved with different key names.
276
+ for old_key, new_key in zip(old_keys, new_keys):
277
+ state_dict[new_key] = state_dict.pop(old_key)
278
+
279
+ return super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
280
+
281
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
282
+ hidden_states = self.fc1(hidden_states)
283
+ hidden_states = self.act(hidden_states)
284
+ hidden_states = self.fc2(hidden_states)
285
+
286
+ return hidden_states
287
+
288
+
289
+ class FusedMLP(nn.Module):
290
+ """Fused Multi-Layer Perceptron from `flash-attn`.
291
+
292
+ Reference:
293
+ https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/ops/fused_dense.py.
294
+
295
+ """
296
+ def __init__(self, config: PretrainedConfig, n_inner: Optional[int] = None, act_fn: Optional[str] = None,
297
+ raise_on_missing: bool = False) -> None:
298
+ super().__init__()
299
+
300
+ act_fn = config.activation_function if act_fn is None else act_fn
301
+ assert act_fn in ACT2FN.keys(), f"`act_fn` must be one of: {ACT2FN.keys()}."
302
+
303
+ n_inner = getattr(config, "n_inner", None) if n_inner is None else n_inner
304
+ n_inner = n_inner if n_inner is not None else 4 * config.n_embd
305
+
306
+ gelu_activations = ["gelu_new", "gelu_fast", "gelu_approx"]
307
+ activation = "gelu_approx" if act_fn in gelu_activations else "relu"
308
+
309
+ self.mlp = MLP(config, n_inner=n_inner, act_fn=act_fn)
310
+
311
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
312
+ return self.mlp(hidden_states)
313
+
314
+ class SelfAttention(nn.Module):
315
+ """Implement the scaled dot product attention with softmax.
316
+ Adapted from https://github.com/Dao-AILab/flash-attention.
317
+ Arguments
318
+ ---------
319
+ softmax_scale: The temperature to use for the softmax attention.
320
+ (default: 1/sqrt(d_keys) where d_keys is computed at
321
+ runtime)
322
+ attention_dropout: The dropout rate to apply to the attention
323
+ (default: 0.0)
324
+ """
325
+ def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0):
326
+ super().__init__()
327
+ self.causal = causal
328
+ self.softmax_scale = softmax_scale
329
+ self.drop = nn.Dropout(attention_dropout)
330
+
331
+ def forward(self, qkv, causal=None, key_padding_mask=None):
332
+ """Implements the multihead softmax attention.
333
+ Arguments
334
+ ---------
335
+ qkv: The tensor containing the query, key, and value. (B, S, 3, H, D)
336
+ causal: if passed, will override self.causal
337
+ key_padding_mask: boolean mask to apply to the attention weights. True means to keep,
338
+ False means to mask out. (B, S)
339
+ """
340
+ batch_size, seqlen = qkv.shape[0], qkv.shape[1]
341
+ causal = self.causal if causal is None else causal
342
+ q, k, v = qkv.unbind(dim=2)
343
+ softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
344
+ scores = torch.einsum('bthd,bshd->bhts', q, k * softmax_scale)
345
+ if key_padding_mask is not None:
346
+ padding_mask = torch.full((batch_size, seqlen), -10000.0, dtype=scores.dtype,
347
+ device=scores.device)
348
+ padding_mask.masked_fill_(key_padding_mask, 0.0)
349
+ # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
350
+ scores = scores + rearrange(padding_mask, 'b s -> b 1 1 s')
351
+ if causal:
352
+ # "triu_tril_cuda_template" not implemented for 'BFloat16'
353
+ # So we have to construct the mask in float
354
+ causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
355
+ # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
356
+ scores = scores + causal_mask.to(dtype=scores.dtype)
357
+ attention = torch.softmax(scores, dim=-1, dtype=v.dtype)
358
+ attention_drop = self.drop(attention)
359
+ output = torch.einsum('bhts,bshd->bthd', attention_drop, v)
360
+ return output
361
+
362
+
363
+ class CrossAttention(nn.Module):
364
+ """Implement the scaled dot product attention with softmax.
365
+ Adapted from https://github.com/Dao-AILab/flash-attention.
366
+ Arguments
367
+ ---------
368
+ softmax_scale: The temperature to use for the softmax attention.
369
+ (default: 1/sqrt(d_keys) where d_keys is computed at
370
+ runtime)
371
+ attention_dropout: The dropout rate to apply to the attention
372
+ (default: 0.0)
373
+ """
374
+ def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0):
375
+ super().__init__()
376
+ self.causal = causal
377
+ self.softmax_scale = softmax_scale
378
+ self.drop = nn.Dropout(attention_dropout)
379
+
380
+ def forward(self, q, kv, causal=None, key_padding_mask=None):
381
+ """Implements the multihead softmax attention.
382
+ Arguments
383
+ ---------
384
+ q: The tensor containing the query. (B, Sq, H, D)
385
+ kv: The tensor containing the key and value. (B, Sk, 2, H, D)
386
+ causal: if passed, will override self.causal
387
+ key_padding_mask: boolean mask to apply to the attention weights. True means to keep,
388
+ False means to mask out. (B, Sk)
389
+ """
390
+ batch_size, seqlen_q = q.shape[0], q.shape[1]
391
+ causal = self.causal if causal is None else causal
392
+ seqlen_k = kv.shape[1]
393
+ assert kv.shape[0] == batch_size and kv.shape[3] == q.shape[2] and kv.shape[4] == q.shape[3]
394
+ k, v = kv.unbind(dim=2)
395
+ softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
396
+ scores = torch.einsum('bthd,bshd->bhts', q, k * softmax_scale)
397
+ if key_padding_mask is not None:
398
+ padding_mask = torch.full((batch_size, seqlen_k), -10000.0, dtype=scores.dtype,
399
+ device=scores.device)
400
+ padding_mask.masked_fill_(key_padding_mask, 0.0)
401
+ # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
402
+ scores = scores + rearrange(padding_mask, 'b s -> b 1 1 s')
403
+ if causal:
404
+ # "triu_tril_cuda_template" not implemented for 'BFloat16'
405
+ # So we have to construct the mask in float
406
+ causal_mask = torch.triu(torch.full((seqlen_q, seqlen_k), -10000.0,
407
+ device=scores.device), 1)
408
+ # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
409
+ scores = scores + causal_mask.to(dtype=scores.dtype)
410
+ attention = torch.softmax(scores, dim=-1, dtype=v.dtype)
411
+ attention_drop = self.drop(attention)
412
+ output = torch.einsum('bhts,bshd->bthd', attention_drop, v)
413
+ return output
414
+
415
+ def find_mha_dims(
416
+ config: PretrainedConfig, n_head: Optional[int] = None, head_dim: Optional[int] = None
417
+ ) -> Tuple[int, int]:
418
+ """Validate and return the number of heads and head dimension for multi-head attention.
419
+
420
+ Args:
421
+ config: Model configuration.
422
+ n_head: Number of heads.
423
+ head_dim: Head dimension.
424
+
425
+ Returns:
426
+ Number of heads and head dimension.
427
+
428
+ """
429
+
430
+ assert all(
431
+ hasattr(config, attr) for attr in ["n_embd", "n_head"]
432
+ ), "`config` must have `n_embd` and `n_head` attributes."
433
+
434
+ if head_dim is None:
435
+ assert (
436
+ config.n_embd % config.n_head == 0
437
+ ), f"Hidden size ({config.n_embd}) must be divisible by the number of heads ({config.n_head})."
438
+
439
+ if n_head is None and head_dim is None:
440
+ head_dim = config.n_embd // config.n_head
441
+ n_head = config.n_head
442
+ elif n_head is None or head_dim is None:
443
+ raise ValueError("`n_head` and `head_dim` must be both specified or `None`.")
444
+
445
+ return n_head, head_dim
446
+
447
+
448
+ class MHA(nn.Module):
449
+ """Multi-head attention layer.
450
+ Adapted from https://github.com/Dao-AILab/flash-attention."""
451
+
452
+ def __init__(
453
+ self,
454
+ config: PretrainedConfig,
455
+ rotary_dim: Optional[int] = None,
456
+ n_head: Optional[int] = None,
457
+ head_dim: Optional[int] = None,
458
+ bias: Optional[bool] = True,
459
+ dropout: Optional[float] = 0.0,
460
+ softmax_scale: Optional[float] = None,
461
+ causal: Optional[bool] = True,
462
+ layer_idx: Optional[int] = None,
463
+ rotary_emb_scale_base: Optional[float] = None,
464
+ return_residual: Optional[bool] = False,
465
+ checkpointing: Optional[bool] = False,
466
+ device: Optional[str] = None,
467
+ dtype: Optional[torch.dtype] = None,
468
+ fused_dense: Optional[bool] = True,
469
+ flash_attn: Optional[bool] = True,
470
+ cutlass_attn: Optional[bool] = False,
471
+ flash_rotary: Optional[bool] = True,
472
+ raise_on_missing: Optional[bool] = False
473
+ ) -> None:
474
+ super().__init__()
475
+
476
+ factory_kwargs = {"device": device, "dtype": dtype}
477
+ n_head, head_dim = find_mha_dims(config, n_head, head_dim)
478
+
479
+ self.hidden_size = config.n_embd
480
+ self.n_head = n_head
481
+ self.head_dim = head_dim
482
+ self.op_size = n_head * head_dim
483
+
484
+ self.causal = causal
485
+ self.layer_idx = layer_idx
486
+ self.rotary_emb_dim = rotary_dim if rotary_dim is not None else getattr(config, "rotary_dim", 0)
487
+ self.fused_dense = fused_dense
488
+ self.flash_attn = flash_attn
489
+ self.cutlass_attn = cutlass_attn
490
+ self.flash_rotary = flash_rotary
491
+ self.return_residual = return_residual
492
+ self.checkpointing = checkpointing
493
+
494
+ if self.rotary_emb_dim > 0:
495
+ rotary_kwargs = {"device": device}
496
+ if rotary_emb_scale_base is not None and rotary_emb_scale_base > 0.0:
497
+ rotary_kwargs["scale_base"] = rotary_emb_scale_base
498
+
499
+ self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, **rotary_kwargs)
500
+ else:
501
+ pass
502
+
503
+ self.Wqkv = nn.Linear(self.hidden_size, 3 * self.op_size, bias=bias, **factory_kwargs)
504
+ self.out_proj = nn.Linear(self.op_size, self.hidden_size, bias=bias, **factory_kwargs)
505
+
506
+ self.inner_attn = SelfAttention(causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout)
507
+ self.inner_cross_attn = CrossAttention(causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout)
508
+
509
+ def _update_kv_cache(self, kv: torch.FloatTensor, inference_params: InferenceParams) -> None:
510
+ """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)
511
+ Adapted from https://github.com/Dao-AILab/flash-attention."""
512
+
513
+ assert self.layer_idx is not None, "Generation requires layer_idx in the constructor"
514
+
515
+ return _update_kv_cache(kv, inference_params, self.layer_idx)
516
+
517
+ def forward(
518
+ self,
519
+ x: torch.FloatTensor,
520
+ x_kv: Optional[torch.FloatTensor] = None,
521
+ key_padding_mask: Optional[torch.BoolTensor] = None,
522
+ cu_seqlens: Optional[torch.LongTensor] = None,
523
+ max_seqlen: Optional[int] = None,
524
+ mixer_subset: Optional[torch.LongTensor] = None,
525
+ past_cache: Optional[InferenceParams] = None,
526
+ **kwargs
527
+ ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
528
+ """Perform the forward pass.
529
+
530
+ Args:
531
+ x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if
532
+ cu_seqlens is None and max_seqlen is None, else (total, hidden_dim) where total
533
+ is the is the sum of the sequence lengths in the batch.
534
+ x_kv: (batch, seqlen, hidden_dim), only applicable for cross-attention. If None, use x.
535
+ key_padding_mask: boolean mask, True means to keep, False means to mask out.
536
+ (batch, seqlen). Only applicable when not using FlashAttention.
537
+ cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
538
+ of the sequences in the batch, used to index into x. Only applicable when using
539
+ FlashAttention.
540
+ max_seqlen: int. Maximum sequence length in the batch.
541
+ mixer_subset: for cross-attention only. If not None, will take a subset of x
542
+ before applying the query projection. Useful for e.g., ViT where we only care
543
+ about the CLS token in the last layer.
544
+ past_cache: For generation only.
545
+
546
+ Returns:
547
+ (batch, seqlen, hidden_dim) if cu_seqlens is None and max_seqlen is None,
548
+ else (total, hidden_dim) where total is the is the sum of the sequence lengths
549
+ in the batch.
550
+
551
+ """
552
+
553
+ if cu_seqlens is not None:
554
+ assert max_seqlen is not None
555
+ assert key_padding_mask is None
556
+ assert self.flash_attn
557
+ assert self.rotary_emb_dim == 0
558
+
559
+ if key_padding_mask is not None:
560
+ assert cu_seqlens is None
561
+ assert max_seqlen is None
562
+ assert not self.flash_attn
563
+
564
+ if past_cache is not None:
565
+ assert key_padding_mask is None
566
+ assert cu_seqlens is None and max_seqlen is None
567
+
568
+ attn_kwargs = {"key_padding_mask": key_padding_mask}
569
+
570
+ assert x_kv is None and mixer_subset is None
571
+
572
+ qkv = self.Wqkv(x)
573
+ qkv = rearrange(qkv, "... (three h d) -> ... three h d", three=3, d=self.head_dim)
574
+
575
+ if past_cache is None:
576
+ if self.rotary_emb_dim > 0:
577
+ qkv = self.rotary_emb(qkv)
578
+ context = self.inner_attn(qkv, **attn_kwargs)
579
+
580
+ else:
581
+ if self.rotary_emb_dim > 0:
582
+ qkv = self.rotary_emb(qkv, seqlen_offset=past_cache.sequence_len_offset)
583
+ q = qkv[:, :, 0]
584
+ kv = self._update_kv_cache(qkv[:, :, 1:], past_cache)
585
+ # If we're processing the prompt, causal=None (use self.causal).
586
+ # If we're decoding, then causal=False.
587
+ causal = None if past_cache.sequence_len_offset == 0 else False
588
+ context = self.inner_cross_attn(q, kv, causal=causal)
589
+
590
+ out = rearrange(context, "... h d -> ... (h d)")
591
+ out = self.out_proj(out)
592
+
593
+ return out if not self.return_residual else (out, x)
594
+
595
+ class ParallelBlock(nn.Module):
596
+ """Parallel block.
597
+
598
+ This block applies parallel mixer and MLP layers to the input (used in GPT-J and CodeGen).
599
+
600
+ """
601
+
602
+ def __init__(
603
+ self,
604
+ config: PretrainedConfig,
605
+ mixer: Optional[Dict[str, Any]] = None,
606
+ mlp: Optional[Dict[str, Any]] = None,
607
+ block_idx: Optional[int] = None,
608
+ ) -> None:
609
+ super().__init__()
610
+
611
+ self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
612
+ self.resid_dropout = nn.Dropout(config.resid_pdrop)
613
+ self.block_idx = block_idx
614
+
615
+ self.mixer = MHA(config=config, **mixer, layer_idx=block_idx)
616
+ mlp_cls = mlp.pop('mlp_cls')
617
+ if mlp_cls == 'fused_mlp':
618
+ self.mlp = FusedMLP(config=config, **mlp)
619
+ else:
620
+ self.mlp = MLP(config=config, **mlp)
621
+
622
+ def forward(self, hidden_states: torch.FloatTensor,
623
+ past_cache: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
624
+ residual = hidden_states
625
+ hidden_states = self.ln(hidden_states)
626
+
627
+ attn_outputs = self.mixer(hidden_states, past_cache=past_cache)
628
+ if isinstance(attn_outputs, tuple):
629
+ attn_outputs = attn_outputs[0]
630
+
631
+ attn_outputs = self.resid_dropout(attn_outputs)
632
+ feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states))
633
+
634
+ hidden_states = attn_outputs + feed_forward_hidden_states + residual
635
+
636
+ return hidden_states
637
+
638
+ class CausalLMHead(nn.Module):
639
+ """Causal Language Modeling head.
640
+
641
+ Reference:
642
+ Improving Language Understanding by Generative Pre-Training.
643
+ https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf.
644
+
645
+ """
646
+
647
+ def __init__(self, config: PretrainedConfig) -> None:
648
+ super().__init__()
649
+
650
+ self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
651
+ self.linear = nn.Linear(config.n_embd, config.vocab_size)
652
+
653
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
654
+ hidden_states = self.ln(hidden_states)
655
+ logits = self.linear(hidden_states).to(torch.float32)
656
+
657
+ return logits
658
+
659
+
660
+ class CausalLMLoss(nn.Module):
661
+ """Causal Language Modeling loss.
662
+
663
+ Reference:
664
+ Improving Language Understanding by Generative Pre-Training.
665
+ https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf.
666
+
667
+ """
668
+
669
+ def __init__(self, shift_labels: Optional[bool] = True) -> None:
670
+ super().__init__()
671
+
672
+ self.shift_labels = shift_labels
673
+ self.loss_fct = nn.CrossEntropyLoss()
674
+
675
+ def forward(self, logits: torch.FloatTensor, labels: torch.LongTensor) -> torch.FloatTensor:
676
+ if self.shift_labels:
677
+ logits = logits[..., :-1, :].contiguous()
678
+ labels = labels[..., 1:].contiguous()
679
+
680
+ loss = self.loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
681
+
682
+ return loss
683
+
684
+ class MixFormerSequentialPreTrainedModel(PreTrainedModel):
685
+ """MixFormer (sequential for DeepSpeed) pre-trained model."""
686
+
687
+ config_class = MixFormerSequentialConfig
688
+ base_model_prefix = "transformer"
689
+ supports_gradient_checkpointing = True
690
+
691
+ def __init__(self, *inputs, **kwargs) -> None:
692
+ super().__init__(*inputs, **kwargs)
693
+
694
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs) -> Dict[str, Any]:
695
+ if "use_cache" in kwargs and not kwargs["use_cache"]:
696
+ return {"input_ids": input_ids}
697
+
698
+ if past_key_values is None or not (isinstance(past_key_values, InferenceParams)):
699
+ past_key_values = InferenceParams(
700
+ max_batch_size=input_ids.shape[0],
701
+ max_sequence_len=self.config.n_positions,
702
+ sequence_len_offset=0,
703
+ batch_size_offset=0,
704
+ fused_ft_kernel=False,
705
+ key_value_memory_dict={},
706
+ )
707
+ else:
708
+ # assume past_key_values has cached all but last token in input_ids
709
+ past_key_values.sequence_len_offset = len(input_ids[0]) - 1
710
+ input_ids = input_ids[:, -1].unsqueeze(-1)
711
+
712
+ return {"input_ids": input_ids, "past_key_values": past_key_values, **kwargs}
713
+
714
+
715
+ class MixFormerSequentialForCausalLM(MixFormerSequentialPreTrainedModel):
716
+ """MixFormer (sequential for DeepSpeed) for Causal Language Modeling."""
717
+
718
+ _keys_to_ignore_on_load_missing = [""]
719
+ _keys_to_ignore_on_load_unexpected = [r"layers\.\d+\.mlp.(fc_in|fc_out)\.(weight|bias)"]
720
+
721
+ def __init__(self, config: MixFormerSequentialConfig) -> None:
722
+ super().__init__(config)
723
+
724
+ modules = [Embedding(config)]
725
+ block_config = config.architecture
726
+
727
+ if not isinstance(block_config, list):
728
+ block_config = [block_config for _ in range(config.n_layer)]
729
+
730
+ if config.n_layer != len(block_config):
731
+ config.n_layer = len(block_config)
732
+
733
+ for block_idx, block in enumerate(block_config):
734
+ # `block_cls` with `legacy` value is for backward compatibility
735
+ # `path` key is for backward compatibility
736
+ block = copy.deepcopy(block) or {"block_cls": "parallel"}
737
+ block_cls = block.pop("path", None) or block.pop("block_cls", None)
738
+
739
+ block["block_idx"] = block_idx
740
+ modules.append(ParallelBlock(config, **block))
741
+
742
+ modules.append(CausalLMHead(config))
743
+
744
+ self.layers = nn.Sequential(*modules)
745
+ self.loss = CausalLMLoss()
746
+
747
+ self.post_init()
748
+
749
+ def get_input_embeddings(self) -> nn.Embedding:
750
+ return self.layers[0].wte
751
+
752
+ def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
753
+ self.layers[0].wte = new_embeddings
754
+
755
+ def get_output_embeddings(self) -> nn.Linear:
756
+ return self.layers[-1].linear
757
+
758
+ def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
759
+ self.layers[-1].linear = new_embeddings
760
+
761
+ def forward(
762
+ self, input_ids: torch.LongTensor, labels: Optional[torch.LongTensor] = None,
763
+ past_key_values: Optional[torch.FloatTensor] = None, **kwargs
764
+ ) -> CausalLMOutputWithPast:
765
+
766
+ if not past_key_values:
767
+ lm_logits = self.layers(input_ids)
768
+ else:
769
+ hidden_layer = self.layers[0](input_ids)
770
+ for module in self.layers[1:-1]:
771
+ hidden_layer = module(hidden_layer, past_cache=past_key_values)
772
+ lm_logits = self.layers[-1](hidden_layer)
773
+
774
+ loss = None
775
+ if labels is not None:
776
+ loss = self.loss(lm_logits, labels)
777
+
778
+ return CausalLMOutputWithPast(loss=loss, logits=lm_logits, past_key_values=past_key_values)
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:853d337655539907af7065c571a1c84bc70cdc932182625223df61ca2c13804b
3
+ size 2829283838
rng_state_0.pth ADDED
Binary file (14.5 kB). View file
 
rng_state_1.pth ADDED
Binary file (14.5 kB). View file
 
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
Binary file (6.33 kB). View file
 
zero_to_fp32.py ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example: python zero_to_fp32.py . pytorch_model.bin
14
+
15
+ import argparse
16
+ import torch
17
+ import glob
18
+ import math
19
+ import os
20
+ import re
21
+ from collections import OrderedDict
22
+ from dataclasses import dataclass
23
+
24
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
25
+ # DeepSpeed data structures it has to be available in the current python environment.
26
+ from deepspeed.utils import logger
27
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
28
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
29
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
30
+
31
+
32
+ @dataclass
33
+ class zero_model_state:
34
+ buffers: dict()
35
+ param_shapes: dict()
36
+ shared_params: list
37
+ ds_version: int
38
+ frozen_param_shapes: dict()
39
+ frozen_param_fragments: dict()
40
+
41
+
42
+ debug = 0
43
+
44
+ # load to cpu
45
+ device = torch.device('cpu')
46
+
47
+
48
+ def atoi(text):
49
+ return int(text) if text.isdigit() else text
50
+
51
+
52
+ def natural_keys(text):
53
+ '''
54
+ alist.sort(key=natural_keys) sorts in human order
55
+ http://nedbatchelder.com/blog/200712/human_sorting.html
56
+ (See Toothy's implementation in the comments)
57
+ '''
58
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
59
+
60
+
61
+ def get_model_state_file(checkpoint_dir, zero_stage):
62
+ if not os.path.isdir(checkpoint_dir):
63
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
64
+
65
+ # there should be only one file
66
+ if zero_stage <= 2:
67
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
68
+ elif zero_stage == 3:
69
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
70
+
71
+ if not os.path.exists(file):
72
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
73
+
74
+ return file
75
+
76
+
77
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
78
+ # XXX: need to test that this simple glob rule works for multi-node setup too
79
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
80
+
81
+ if len(ckpt_files) == 0:
82
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
83
+
84
+ return ckpt_files
85
+
86
+
87
+ def get_optim_files(checkpoint_dir):
88
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
89
+
90
+
91
+ def get_model_state_files(checkpoint_dir):
92
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
93
+
94
+
95
+ def parse_model_states(files):
96
+ zero_model_states = []
97
+ for file in files:
98
+ state_dict = torch.load(file, map_location=device)
99
+
100
+ if BUFFER_NAMES not in state_dict:
101
+ raise ValueError(f"{file} is not a model state checkpoint")
102
+ buffer_names = state_dict[BUFFER_NAMES]
103
+ if debug:
104
+ print("Found buffers:", buffer_names)
105
+
106
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
107
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
108
+ param_shapes = state_dict[PARAM_SHAPES]
109
+
110
+ # collect parameters that are included in param_shapes
111
+ param_names = []
112
+ for s in param_shapes:
113
+ for name in s.keys():
114
+ param_names.append(name)
115
+
116
+ # update with frozen parameters
117
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
118
+ if frozen_param_shapes is not None:
119
+ if debug:
120
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
121
+ param_names += list(frozen_param_shapes.keys())
122
+
123
+ # handle shared params
124
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
125
+
126
+ ds_version = state_dict.get(DS_VERSION, None)
127
+
128
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
129
+
130
+ z_model_state = zero_model_state(buffers=buffers,
131
+ param_shapes=param_shapes,
132
+ shared_params=shared_params,
133
+ ds_version=ds_version,
134
+ frozen_param_shapes=frozen_param_shapes,
135
+ frozen_param_fragments=frozen_param_fragments)
136
+ zero_model_states.append(z_model_state)
137
+
138
+ return zero_model_states
139
+
140
+
141
+ def parse_optim_states(files, ds_checkpoint_dir):
142
+
143
+ total_files = len(files)
144
+ state_dicts = []
145
+ for f in files:
146
+ state_dict = torch.load(f, map_location=device)
147
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
148
+ # and also handle the case where it was already removed by another helper script
149
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
150
+ state_dicts.append(state_dict)
151
+
152
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
153
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
154
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
155
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
156
+
157
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
158
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
159
+ # use the max of the partition_count to get the dp world_size.
160
+
161
+ if type(world_size) is list:
162
+ world_size = max(world_size)
163
+
164
+ if world_size != total_files:
165
+ raise ValueError(
166
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
167
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
168
+ )
169
+
170
+ # the groups are named differently in each stage
171
+ if zero_stage <= 2:
172
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
173
+ elif zero_stage == 3:
174
+ fp32_groups_key = FP32_FLAT_GROUPS
175
+ else:
176
+ raise ValueError(f"unknown zero stage {zero_stage}")
177
+
178
+ if zero_stage <= 2:
179
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
180
+ elif zero_stage == 3:
181
+ # if there is more than one param group, there will be multiple flattened tensors - one
182
+ # flattened tensor per group - for simplicity merge them into a single tensor
183
+ #
184
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
185
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
186
+
187
+ fp32_flat_groups = [
188
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
189
+ ]
190
+
191
+ return zero_stage, world_size, fp32_flat_groups
192
+
193
+
194
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
195
+ """
196
+ Returns fp32 state_dict reconstructed from ds checkpoint
197
+
198
+ Args:
199
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
200
+
201
+ """
202
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
203
+
204
+ optim_files = get_optim_files(ds_checkpoint_dir)
205
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
206
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
207
+
208
+ model_files = get_model_state_files(ds_checkpoint_dir)
209
+
210
+ zero_model_states = parse_model_states(model_files)
211
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
212
+
213
+ if zero_stage <= 2:
214
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
215
+ elif zero_stage == 3:
216
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
217
+
218
+
219
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
220
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
221
+ return
222
+
223
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
224
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
225
+
226
+ if debug:
227
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
228
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
229
+
230
+ wanted_params = len(frozen_param_shapes)
231
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
232
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
233
+ print(f'Frozen params: Have {avail_numel} numels to process.')
234
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
235
+
236
+ total_params = 0
237
+ total_numel = 0
238
+ for name, shape in frozen_param_shapes.items():
239
+ total_params += 1
240
+ unpartitioned_numel = shape.numel()
241
+ total_numel += unpartitioned_numel
242
+
243
+ state_dict[name] = frozen_param_fragments[name]
244
+
245
+ if debug:
246
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
247
+
248
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
249
+
250
+
251
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
252
+ param_shapes = zero_model_states[0].param_shapes
253
+
254
+ # Reconstruction protocol:
255
+ #
256
+ # XXX: document this
257
+
258
+ if debug:
259
+ for i in range(world_size):
260
+ for j in range(len(fp32_flat_groups[0])):
261
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
262
+
263
+ # XXX: memory usage doubles here (zero2)
264
+ num_param_groups = len(fp32_flat_groups[0])
265
+ merged_single_partition_of_fp32_groups = []
266
+ for i in range(num_param_groups):
267
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
268
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
269
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
270
+ avail_numel = sum(
271
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
272
+
273
+ if debug:
274
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
275
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
276
+ # not asserting if there is a mismatch due to possible padding
277
+ print(f"Have {avail_numel} numels to process.")
278
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
279
+
280
+ # params
281
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
282
+ # out-of-core computing solution
283
+ total_numel = 0
284
+ total_params = 0
285
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
286
+ offset = 0
287
+ avail_numel = full_single_fp32_vector.numel()
288
+ for name, shape in shapes.items():
289
+
290
+ unpartitioned_numel = shape.numel()
291
+ total_numel += unpartitioned_numel
292
+ total_params += 1
293
+
294
+ if debug:
295
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
296
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
297
+ offset += unpartitioned_numel
298
+
299
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
300
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
301
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
302
+ # live optimizer object, so we are checking that the numbers are within the right range
303
+ align_to = 2 * world_size
304
+
305
+ def zero2_align(x):
306
+ return align_to * math.ceil(x / align_to)
307
+
308
+ if debug:
309
+ print(f"original offset={offset}, avail_numel={avail_numel}")
310
+
311
+ offset = zero2_align(offset)
312
+ avail_numel = zero2_align(avail_numel)
313
+
314
+ if debug:
315
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
316
+
317
+ # Sanity check
318
+ if offset != avail_numel:
319
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
320
+
321
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
322
+
323
+
324
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
325
+ state_dict = OrderedDict()
326
+
327
+ # buffers
328
+ buffers = zero_model_states[0].buffers
329
+ state_dict.update(buffers)
330
+ if debug:
331
+ print(f"added {len(buffers)} buffers")
332
+
333
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
334
+
335
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
336
+
337
+ # recover shared parameters
338
+ for pair in zero_model_states[0].shared_params:
339
+ if pair[1] in state_dict:
340
+ state_dict[pair[0]] = state_dict[pair[1]]
341
+
342
+ return state_dict
343
+
344
+
345
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
346
+ remainder = unpartitioned_numel % world_size
347
+ padding_numel = (world_size - remainder) if remainder else 0
348
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
349
+ return partitioned_numel, padding_numel
350
+
351
+
352
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
353
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
354
+ return
355
+
356
+ if debug:
357
+ for i in range(world_size):
358
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
359
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
360
+
361
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
362
+ wanted_params = len(frozen_param_shapes)
363
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
364
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
365
+ print(f'Frozen params: Have {avail_numel} numels to process.')
366
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
367
+
368
+ total_params = 0
369
+ total_numel = 0
370
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
371
+ total_params += 1
372
+ unpartitioned_numel = shape.numel()
373
+ total_numel += unpartitioned_numel
374
+
375
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
376
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
377
+
378
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
379
+
380
+ if debug:
381
+ print(
382
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
383
+ )
384
+
385
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
386
+
387
+
388
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
389
+ param_shapes = zero_model_states[0].param_shapes
390
+ avail_numel = fp32_flat_groups[0].numel() * world_size
391
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
392
+ # param, re-consolidating each param, while dealing with padding if any
393
+
394
+ # merge list of dicts, preserving order
395
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
396
+
397
+ if debug:
398
+ for i in range(world_size):
399
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
400
+
401
+ wanted_params = len(param_shapes)
402
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
403
+ # not asserting if there is a mismatch due to possible padding
404
+ avail_numel = fp32_flat_groups[0].numel() * world_size
405
+ print(f"Trainable params: Have {avail_numel} numels to process.")
406
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
407
+
408
+ # params
409
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
410
+ # out-of-core computing solution
411
+ offset = 0
412
+ total_numel = 0
413
+ total_params = 0
414
+ for name, shape in param_shapes.items():
415
+
416
+ unpartitioned_numel = shape.numel()
417
+ total_numel += unpartitioned_numel
418
+ total_params += 1
419
+
420
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
421
+
422
+ if debug:
423
+ print(
424
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
425
+ )
426
+
427
+ # XXX: memory usage doubles here
428
+ state_dict[name] = torch.cat(
429
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
430
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
431
+ offset += partitioned_numel
432
+
433
+ offset *= world_size
434
+
435
+ # Sanity check
436
+ if offset != avail_numel:
437
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
438
+
439
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
440
+
441
+
442
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
443
+ state_dict = OrderedDict()
444
+
445
+ # buffers
446
+ buffers = zero_model_states[0].buffers
447
+ state_dict.update(buffers)
448
+ if debug:
449
+ print(f"added {len(buffers)} buffers")
450
+
451
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
452
+
453
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
454
+
455
+ # recover shared parameters
456
+ for pair in zero_model_states[0].shared_params:
457
+ if pair[1] in state_dict:
458
+ state_dict[pair[0]] = state_dict[pair[1]]
459
+
460
+ return state_dict
461
+
462
+
463
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
464
+ """
465
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
466
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
467
+ via a model hub.
468
+
469
+ Args:
470
+ - ``checkpoint_dir``: path to the desired checkpoint folder
471
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
472
+
473
+ Returns:
474
+ - pytorch ``state_dict``
475
+
476
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
477
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
478
+ the checkpoint.
479
+
480
+ A typical usage might be ::
481
+
482
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
483
+ # do the training and checkpoint saving
484
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
485
+ model = model.cpu() # move to cpu
486
+ model.load_state_dict(state_dict)
487
+ # submit to model hub or save the model to share with others
488
+
489
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
490
+ application. i.e. you will need to re-initialize the deepspeed engine, since
491
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
492
+
493
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
494
+
495
+ """
496
+ if tag is None:
497
+ latest_path = os.path.join(checkpoint_dir, 'latest')
498
+ if os.path.isfile(latest_path):
499
+ with open(latest_path, 'r') as fd:
500
+ tag = fd.read().strip()
501
+ else:
502
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
503
+
504
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
505
+
506
+ if not os.path.isdir(ds_checkpoint_dir):
507
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
508
+
509
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
510
+
511
+
512
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
513
+ """
514
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
515
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
516
+
517
+ Args:
518
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
519
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
520
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
521
+ """
522
+
523
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
524
+ print(f"Saving fp32 state dict to {output_file}")
525
+ torch.save(state_dict, output_file)
526
+
527
+
528
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
529
+ """
530
+ 1. Put the provided model to cpu
531
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
532
+ 3. Load it into the provided model
533
+
534
+ Args:
535
+ - ``model``: the model object to update
536
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
537
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
538
+
539
+ Returns:
540
+ - ``model`: modified model
541
+
542
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
543
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
544
+ conveniently placed for you in the checkpoint folder.
545
+
546
+ A typical usage might be ::
547
+
548
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
549
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
550
+ # submit to model hub or save the model to share with others
551
+
552
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
553
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
554
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
555
+
556
+ """
557
+ logger.info(f"Extracting fp32 weights")
558
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
559
+
560
+ logger.info(f"Overwriting model with fp32 weights")
561
+ model = model.cpu()
562
+ model.load_state_dict(state_dict, strict=False)
563
+
564
+ return model
565
+
566
+
567
+ if __name__ == "__main__":
568
+
569
+ parser = argparse.ArgumentParser()
570
+ parser.add_argument("checkpoint_dir",
571
+ type=str,
572
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
573
+ parser.add_argument(
574
+ "output_file",
575
+ type=str,
576
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
577
+ parser.add_argument("-t",
578
+ "--tag",
579
+ type=str,
580
+ default=None,
581
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
582
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
583
+ args = parser.parse_args()
584
+
585
+ debug = args.debug
586
+
587
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)