liang.zhao
committed on
Commit
•
67b506b
1
Parent(s):
8c1c087
update model and config
Browse files- .gitattributes +4 -0
- config.json +2 -2
- modeling_skywork.py +29 -1
- pytorch_model-00001-of-00002.bin → pytorch_model-00001-of-00004.bin +2 -2
- pytorch_model-00002-of-00002.bin → pytorch_model-00002-of-00004.bin +2 -2
- pytorch_model-00003-of-00004.bin +3 -0
- pytorch_model-00004-of-00004.bin +3 -0
- pytorch_model.bin.index.json +0 -0
- skywork_13b_sft.sh +128 -0
.gitattributes
CHANGED
@@ -35,3 +35,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
pytorch_model-00001-of-00002.bin filter=lfs diff=lfs merge=lfs -text
|
37 |
pytorch_model-00002-of-00002.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
pytorch_model-00001-of-00002.bin filter=lfs diff=lfs merge=lfs -text
|
37 |
pytorch_model-00002-of-00002.bin filter=lfs diff=lfs merge=lfs -text
|
38 |
+
pytorch_model-00001-of-00004.bin filter=lfs diff=lfs merge=lfs -text
|
39 |
+
pytorch_model-00002-of-00004.bin filter=lfs diff=lfs merge=lfs -text
|
40 |
+
pytorch_model-00003-of-00004.bin filter=lfs diff=lfs merge=lfs -text
|
41 |
+
pytorch_model-00004-of-00004.bin filter=lfs diff=lfs merge=lfs -text
|
config.json
CHANGED
@@ -33,7 +33,7 @@
|
|
33 |
"rms_norm_eps": 1e-06,
|
34 |
"tie_word_embeddings": false,
|
35 |
"torch_dtype": "bfloat16",
|
36 |
-
"transformers_version": "4.
|
37 |
"use_cache": true,
|
38 |
-
"vocab_size":
|
39 |
}
|
|
|
33 |
"rms_norm_eps": 1e-06,
|
34 |
"tie_word_embeddings": false,
|
35 |
"torch_dtype": "bfloat16",
|
36 |
+
"transformers_version": "4.34.0",
|
37 |
"use_cache": true,
|
38 |
+
"vocab_size": 65536
|
39 |
}
|
modeling_skywork.py
CHANGED
@@ -179,6 +179,27 @@ class SkyworkDynamicNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
|
|
179 |
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
|
180 |
|
181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
def rotate_half(x):
|
183 |
"""Rotates half the hidden dims of the input."""
|
184 |
x1 = x[..., : x.shape[-1] // 2]
|
@@ -189,7 +210,7 @@ def rotate_half(x):
|
|
189 |
# Copied from transformers.models.gpt_neox.modeling_gpt_neox.apply_rotary_pos_emb
|
190 |
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    """Apply rotary position embedding to the query and key tensors.

    Args:
        q: Query tensor.
        k: Key tensor.
        cos: Cosine table, indexed along its first axis by ``position_ids``.
        sin: Sine table, indexed along its first axis by ``position_ids``.
        position_ids: Indices selecting which rows of ``cos`` / ``sin`` to use.

    Returns:
        A ``(q_embed, k_embed)`` tuple holding the rotated query and key.
    """
    # Gather the table rows for the requested positions, then insert an axis
    # at dim 1 so the tables broadcast against q and k
    # ([seq_len, dim] -> [batch_size, 1, seq_len, head_dim]).
    cos_at_pos = cos[position_ids].unsqueeze(1)
    sin_at_pos = sin[position_ids].unsqueeze(1)

    # The same rotation is applied to q and to k.
    q_embed, k_embed = (
        (states * cos_at_pos) + (rotate_half(states) * sin_at_pos)
        for states in (q, k)
    )
    return q_embed, k_embed
|
@@ -290,6 +311,13 @@ class SkyworkAttention(nn.Module):
|
|
290 |
scaling_factor=scaling_factor,
|
291 |
base=self.rope_theta,
|
292 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
293 |
else:
|
294 |
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
|
295 |
|
|
|
179 |
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
|
180 |
|
181 |
|
182 |
+
class SkyworkNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
    """Rotary embedding whose frequency base is rescaled by a fixed factor.

    The cos/sin tables are built from a base of
    ``(base * scaling_factor) ** (dim / (dim - 2))`` instead of ``base``.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # Stored before delegating to the parent constructor; presumably the
        # parent builds the cache through _set_cos_sin_cache, which reads this
        # attribute (parent class is not visible here -- confirm).
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild ``inv_freq``, ``cos_cached`` and ``sin_cached`` for ``seq_len`` positions.

        Args:
            seq_len: Number of positions to cache.
            device: Device on which the tables are created.
            dtype: Dtype the cached cos/sin tables are cast to.
        """
        self.max_seq_len_cached = seq_len

        # NOTE(review): the exponent is applied to the product
        # base * scaling_factor, not to scaling_factor alone. Kept exactly as
        # written because changing it would alter the embedding values --
        # confirm this is the intended formula.
        exponent = self.dim / (self.dim - 2)
        scaled_base = (self.base * self.scaling_factor) ** exponent
        even_channels = torch.arange(0, self.dim, 2).float().to(device)
        inv_freq = 1.0 / (scaled_base ** (even_channels / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
        angles = torch.einsum("i,j->ij", positions, self.inv_freq)

        # Different from paper, but it uses a different permutation in order
        # to obtain the same calculation.
        table = torch.cat((angles, angles), dim=-1)
        for buffer_name, values in (("cos_cached", table.cos()), ("sin_cached", table.sin())):
            self.register_buffer(buffer_name, values.to(dtype), persistent=False)
|
201 |
+
|
202 |
+
|
203 |
def rotate_half(x):
|
204 |
"""Rotates half the hidden dims of the input."""
|
205 |
x1 = x[..., : x.shape[-1] // 2]
|
|
|
210 |
# Copied from transformers.models.gpt_neox.modeling_gpt_neox.apply_rotary_pos_emb
|
211 |
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    """Apply rotary position embedding to the query and key tensors.

    Args:
        q: Query tensor.
        k: Key tensor.
        cos: Cosine table, indexed along its first axis by ``position_ids``.
        sin: Sine table, indexed along its first axis by ``position_ids``.
        position_ids: Indices selecting which rows of ``cos`` / ``sin`` to use.

    Returns:
        A ``(q_embed, k_embed)`` tuple holding the rotated query and key.
    """
    # Gather the table rows for the requested positions, then insert an axis
    # at dim 1 so the tables broadcast against q and k
    # ([seq_len, dim] -> [batch_size, 1, seq_len, head_dim]).
    cos_at_pos = cos[position_ids].unsqueeze(1)
    sin_at_pos = sin[position_ids].unsqueeze(1)

    # The same rotation is applied to q and to k.
    q_embed, k_embed = (
        (states * cos_at_pos) + (rotate_half(states) * sin_at_pos)
        for states in (q, k)
    )
    return q_embed, k_embed
|
|
|
311 |
scaling_factor=scaling_factor,
|
312 |
base=self.rope_theta,
|
313 |
)
|
314 |
+
elif scaling_type == "ntk":
|
315 |
+
self.rotary_emb = SkyworkNTKScalingRotaryEmbedding(
|
316 |
+
self.head_dim,
|
317 |
+
max_position_embeddings=self.max_position_embeddings,
|
318 |
+
scaling_factor=scaling_factor,
|
319 |
+
base=self.rope_theta,
|
320 |
+
)
|
321 |
else:
|
322 |
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
|
323 |
|
pytorch_model-00001-of-00002.bin → pytorch_model-00001-of-00004.bin
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f344f1c62a065f471de22d3a9ac6a4a4d2c1b8f98a2251080e59be55f7d77632
|
3 |
+
size 3982977952
|
pytorch_model-00002-of-00002.bin → pytorch_model-00002-of-00004.bin
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2fcff58e56b6d24abd9588ba1b17c58fbfe76cbfe1c35014c5bd59aa69f8fb7a
|
3 |
+
size 3959875181
|
pytorch_model-00003-of-00004.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0b903e4ba9505d009982736e9eb21f77c7151f91d0bb8d846756929f147e3eb9
|
3 |
+
size 3966949023
|
pytorch_model-00004-of-00004.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:573ab5962dc857d263679e9e6e7493a0d150ec905ba5172af38cbb2eedac5f29
|
3 |
+
size 2559125753
|
pytorch_model.bin.index.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
skywork_13b_sft.sh
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Launch script for supervised fine-tuning (SFT) of Skywork-13B with torchrun.
#
# Environment overrides (all optional): WANDB_API_KEY, WANDB_ENTITY,
# WANDB_PROJECT, RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT, ROOT_PATH,
# MODEL_PATH, SFT_DATA_DIR, DATA_CACHE_DIR, DS_CONFIG.
set -x
# Without pipefail the script's exit status would be that of `tee`, hiding a
# failed training run from the caller.
set -o pipefail

export WANDB_API_KEY=${WANDB_API_KEY:-YOUR_WANDB_API_KEY}
export WANDB_ENTITY=${WANDB_ENTITY:-YOUR_WANDB_ENTITY}
export WANDB_PROJECT=${WANDB_PROJECT:-YOUR_WANDB_PROJECT}

GPUS_PER_NODE=8
# RANK / WORLD_SIZE are read from the environment; fall back to a
# single-node run (rank 0 of 1) when they are unset or empty.
NODE_RANK=${RANK:-0}
NNODES=${WORLD_SIZE:-1}

DEBUG="false"
USE_LORA="false"
TASK_TYPE="sft"

MAX_STEP=1000
LR=1e-4
MAX_LENGTH=4096

GLOBAL_BATCH_SIZE=32 # 8 * 4
MICRO_BATCH_SIZE=1
SAVE_STEP=500
EVAL_STEP=500
GRAD_ACC=$(( GLOBAL_BATCH_SIZE / (GPUS_PER_NODE * NNODES * MICRO_BATCH_SIZE) ))
# Integer division yields 0 once the total micro-batch across all GPUs exceeds
# GLOBAL_BATCH_SIZE; 0 is not a usable accumulation step count.
if [ "$GRAD_ACC" -lt 1 ]; then
    echo "WARNING: GLOBAL_BATCH_SIZE=$GLOBAL_BATCH_SIZE is smaller than the per-step batch across all GPUs; using GRAD_ACC=1" >&2
    GRAD_ACC=1
fi

FLAG=Skywork-13B-Base-sft-peaklr${LR}-steps${MAX_STEP}-gbs${GLOBAL_BATCH_SIZE}

ROOT_PATH=${ROOT_PATH:-/data/user/your_name}
MODEL_PATH=${MODEL_PATH:-SKYWORK_13B_BASE_MODEL_PATH}

SFT_DATA_DIR=${SFT_DATA_DIR:-"YOUR_DATA_DIR"}
DATA_CACHE_DIR=${DATA_CACHE_DIR:-"YOUR_DATA_CACHE_DIR"}

OUTPUT_DIR=$ROOT_PATH/run_output/skywork-13b-sft-trainer/$FLAG
# Load from MODEL_PATH when set, otherwise from the output directory.
LOAD_MODEL_PATH=${MODEL_PATH:-$OUTPUT_DIR}

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --master_port 29501"
if [[ $NNODES -gt 1 ]]; then

    export NCCL_IB_HCA=mlx5
    export NCCL_IB_TC=136
    export NCCL_IB_SL=5
    export NCCL_IB_GID_INDEX=3
    export NCCL_IB_TIMEOUT=22
    export NCCL_SOCKET_IFNAME=bond0
    export NCCL_DEBUG=INFO
    # Assign to the variable NAME: a leading "$" would expand the value and
    # try to execute "localhost=<hostname>" as a command.
    if [ "$MASTER_ADDR" = "localhost" ]; then MASTER_ADDR=$(hostname); fi
    # Reuse the single-node port when the launcher did not provide one.
    MASTER_PORT=${MASTER_PORT:-29501}

    echo $MASTER_ADDR
    echo $MASTER_PORT
    DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
fi

if [ "$DEBUG" = "true" ]; then
    EVAL_STEP=5
    GLOBAL_BATCH_SIZE=8
    GRAD_ACC=1

fi

DS_CONFIG=${DS_CONFIG:-train/ds_config/zero3_offload.json}

# The *_ARGS variables below are expanded unquoted on the torchrun command
# line on purpose, so each one word-splits into separate arguments.
LOG_ARGS="
    --logging_steps 1 \
    --logging_dir tensorboard/$FLAG \
    --logging_strategy steps \
    --logging_first_step True \
    --report_to wandb \
    --run_name $FLAG
"

OUTPUT_ARGS="
    --save_strategy steps \
    --save_total_limit 500 \
    --save_steps $SAVE_STEP \
    --output_dir $OUTPUT_DIR \
    --overwrite_output_dir
"

TRAIN_ARGS="
    --task_type $TASK_TYPE \
    --do_train \
    --max_seq_length $MAX_LENGTH \
    --max_steps $MAX_STEP \
    --lr_scheduler_type constant_with_warmup \
    --learning_rate $LR \
    --weight_decay 0.1 \
    --warmup_steps 20 \
    --adam_beta1 0.9 \
    --adam_beta2 0.95 \
    --gradient_accumulation_steps $GRAD_ACC \
    --per_device_train_batch_size $MICRO_BATCH_SIZE
"

EVAL_ARGS="
    --do_eval \
    --evaluation_strategy steps \
    --eval_steps $EVAL_STEP \
    --per_device_eval_batch_size 1
"

INPUT_ARGS="
    --model_name_or_path $LOAD_MODEL_PATH \
    --tokenizer_name_or_path $LOAD_MODEL_PATH \
    --sft_dataset_dir $SFT_DATA_DIR \
    --data_cache_dir $DATA_CACHE_DIR
"

EXTRA_ARGS="
    --seed 1234 \
    --deepspeed $DS_CONFIG \
    --gradient_checkpointing \
    --ddp_find_unused_parameters False \
    --preprocessing_num_workers 12 \
    --ddp_timeout 30000 \
    --torch_dtype bfloat16 \
    --bf16 \
    --load_in_kbits 16
"

# "true" is the shell builtin; "True" is not a command.
mkdir -p logs/$FLAG || true
# Log under NODE_RANK (defaults to 0) so a single-node run without RANK does
# not write to a file literally named ".log".
torchrun $DISTRIBUTED_ARGS train/train.py \
    $LOG_ARGS \
    $OUTPUT_ARGS \
    $TRAIN_ARGS \
    $EVAL_ARGS \
    $INPUT_ARGS \
    $EXTRA_ARGS 2>&1 | tee -a logs/$FLAG/$NODE_RANK.log
|
128 |
+
|