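# NeMo-Aligner PPO actor (policy) model configuration, apparently for a
# Llama-3-70B base model given the tokenizer and the MegatronGPTActorModel
# target at the bottom of the file.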
mcore_gpt: true
micro_batch_size: 1
global_batch_size: 128
tensor_model_parallel_size: 8
pipeline_model_parallel_size: 8
virtual_pipeline_model_parallel_size: null
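# NOTE: TP 8 x PP 8 means each model replica spans 64 GPUs; with
# micro_batch_size 1, the global batch of 128 is assembled via gradient
# accumulation across the data-parallel replicas.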
encoder_seq_length: 8192
max_position_embeddings: 8192
num_layers: 80
hidden_size: 8192
ffn_hidden_size: 28672
num_attention_heads: 64
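# 80 layers, 8192 hidden, 28672 FFN, 64 heads: the Llama-3-70B shape.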
init_method_std: 0.02
use_scaled_init_method: true
hidden_dropout: 0.0
attention_dropout: 0.0
ffn_dropout: 0.0
kv_channels: null
apply_query_key_layer_scaling: true
normalization: rmsnorm
layernorm_epsilon: 1.0e-05
do_layer_norm_weight_decay: false
make_vocab_size_divisible_by: 128
pre_process: true
post_process: true
persist_layer_norm: true
bias: false
activation: fast-swiglu
headscale: false
transformer_block_type: pre_ln
openai_gelu: false
normalize_attention_scores: true
position_embedding_type: rope
rotary_percentage: 1.0
attention_type: multihead
share_embeddings_and_output_weights: false
overlap_p2p_comm: false
batch_p2p_comm: true
num_query_groups: 8
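# Grouped-query attention: 64 query heads share 8 KV groups (8 heads per
# group), as in Llama-3-70B.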
tokenizer:
  library: huggingface
  type: meta-llama/Meta-Llama-3-70B
  use_fast: true
native_amp_init_scale: 4294967296
native_amp_growth_interval: 1000
hysteresis: 2
fp32_residual_connection: false
fp16_lm_cross_entropy: false
megatron_amp_O2: true
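# precision is bf16 (set further down), so the fp16 loss-scaler knobs
# (native_amp_init_scale etc.) are inert; megatron_amp_O2 keeps half-precision
# weights/grads with FP32 master copies.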
grad_allreduce_chunk_size_mb: 125
grad_div_ar_fusion: true
gradient_accumulation_fusion: false
bias_activation_fusion: false
bias_dropout_add_fusion: false
masked_softmax_fusion: true
get_attention_mask_from_fusion: true
apply_rope_fusion: false
seed: 1234
resume_from_checkpoint: null
use_cpu_initialization: false
onnx_safe: false
apex_transformer_log_level: 30
gradient_as_bucket_view: false
sync_batch_comm: false
activations_checkpoint_granularity: full
activations_checkpoint_method: uniform
activations_checkpoint_num_layers: 1
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
sequence_parallel: false
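# Full activation checkpointing: every layer's activations are recomputed in
# the backward pass (uniform chunks of 1 layer), trading extra compute for a
# large cut in activation memory on this 70B model.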
transformer_engine: true
fp8: false
fp8_e4m3: false
fp8_hybrid: true
fp8_margin: 0
fp8_interval: 1
fp8_amax_history_len: 1024
fp8_amax_compute_algo: max
reduce_amax: true
use_emha: false
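# Transformer Engine kernels run in bf16 here; the fp8_* recipe (hybrid
# E4M3-forward / E5M2-backward, amax history 1024) only takes effect if fp8
# is flipped to true.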
data:
  chat: true
  chat_prompt_tokens:
    system_turn_start: <extra_id_0>
    turn_start: <extra_id_1>
    label_start: <extra_id_2>
    end_of_turn: "\n"
    end_of_name: "\n"
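  # The <extra_id_*> strings delimit chat turns; end_of_turn and end_of_name
  # are single newline characters.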
  sample: true
  num_workers: 2
  dataloader_type: single
  train_ds:
    file_path: /dataset/train.jsonl
    global_batch_size: 384
    micro_batch_size: 1
    shuffle: true
    memmap_workers: null
    max_seq_length: 4096
    min_seq_length: 1
    drop_last: true
    concat_sampling_probabilities: null
    label_key: output
    add_eos: false
    add_sep: false
    add_bos: false
    truncation_field: input
    index_mapping_dir: /indexmap_dir
    prompt_template: |-
      <extra_id_0>System
      {system message}
      <extra_id_1>User
      {turn 1 user message}
      <extra_id_1>Assistant
      <extra_id_2>{turn 1 assistant label}
      {turn 1 assistant message}
      <extra_id_1>User
      {turn 2 user message}
      <extra_id_1>Assistant
      <extra_id_2>{turn 2 assistant label}
      {turn 2 assistant message}
      <extra_id_1>
    hf_dataset: true
    truncation_method: right
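  # Placeholders such as {system message} and {turn 1 user message} are filled
  # from each JSONL record; <extra_id_2> introduces the optional per-turn
  # assistant label.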
  validation_ds:
    file_path: /dataset/train.jsonl
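    # NOTE: validation reads the training file; point this at a held-out split
    # (e.g. /dataset/val.jsonl, used by data_prefix below) for a meaningful
    # validation loss.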
    names: null
    global_batch_size: 384
    micro_batch_size: 1
    shuffle: false
    memmap_workers: null
    max_seq_length: 4096
    min_seq_length: 1
    drop_last: false
    label_key: output
    add_eos: false
    add_sep: false
    add_bos: false
    write_predictions_to_file: false
    output_file_path_prefix: null
    truncation_field: input
    index_mapping_dir: /indexmap_dir
    prompt_template: |-
      <extra_id_0>System
      {system message}
      <extra_id_1>User
      {turn 1 user message}
      <extra_id_1>Assistant
      <extra_id_2>{turn 1 assistant label}
      {turn 1 assistant message}
      <extra_id_1>User
      {turn 2 user message}
      <extra_id_1>Assistant
      <extra_id_2>{turn 2 assistant label}
      {turn 2 assistant message}
      <extra_id_1>
    tokens_to_generate: 32
    hf_dataset: true
    truncation_method: right
    metric:
      name: loss
      average: null
      num_classes: null
  test_ds:
    prompt_template: |-
      <extra_id_0>System
      {system message}
      <extra_id_1>User
      {turn 1 user message}
      <extra_id_1>Assistant
      <extra_id_2>{turn 1 assistant label}
      {turn 1 assistant message}
      <extra_id_1>User
      {turn 2 user message}
      <extra_id_1>Assistant
      <extra_id_2>{turn 2 assistant label}
      {turn 2 assistant message}
      <extra_id_1>
  data_impl: jsonl
  splits_string: null
  seq_length: 8192
  skip_warmup: true
  reset_position_ids: false
  reset_attention_mask: false
  eod_mask_loss: false
  index_mapping_dir: null
  data_prefix:
    train:
    - /dataset/train.jsonl
    validation:
    - /dataset/val.jsonl
    test:
    - /dataset/val.jsonl
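  # The test split reuses the validation file.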
nsys_profile:
  enabled: false
  start_step: 10
  end_step: 10
  ranks:
  - 0
  gen_shape: false
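# Nsight Systems profiling is off; if enabled it would capture step 10 on rank 0.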
optim:
  name: distributed_fused_adam
  lr: 2.0e-07
  weight_decay: 0.1
  betas:
  - 0.9
  - 0.98
  sched:
    name: CosineAnnealing
    warmup_steps: 10
    constant_steps: 1000
    min_lr: 1.9999e-07
    max_steps: 88
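  # min_lr differs from lr by 0.005%, so after the 10 warmup steps the schedule
  # is effectively a constant 2e-7 for the whole 88-step run, a common choice
  # for PPO fine-tuning.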
  bucket_cap_mb: 200
  overlap_grad_sync: false
  contiguous_grad_buffer: true
rotary_base: 500000.0
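# RoPE base of 500000 matches Llama-3 (Llama-2 used 10000).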
precision: bf16
answer_only_loss: true
restore_from_path: /models/unpacked_llama3_70b_base
save_nemo_on_validation_end: true
use_flash_attention: null
pipeline_model_parallel_split_rank: 0
ppo:
  trt_llm:
    enable: true
    reshard: true
    max_input_len: 4096
    max_input_tokens: 32768
    model_type: LLaMAForCausalLM
    unload_engine_train: false
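  # Rollouts are generated with a TensorRT-LLM engine; reshard: true drops
  # pipeline parallelism in favor of pure tensor parallelism for inference.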
  rollout_micro_batch_size: 8
  num_rollout_samples: 128
  forward_micro_batch_size: 4
  val_rollout_micro_batch_size: 8
  num_val_samples: 128
  offload_adam_states: true
  entropy_bonus: 0.0
  ratio_eps: 0.2
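  # Each PPO step gathers 128 rollout samples (generated 8 per micro-batch);
  # ratio_eps 0.2 is the standard PPO clip range [0.8, 1.2] and the entropy
  # bonus is disabled. offload_adam_states moves Adam state to CPU during
  # generation to free GPU memory.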
  sampling_params:
    use_greedy: false
    temperature: 1.0
    top_k: 0
    top_p: 1.0
    repetition_penalty: 1.0
    add_BOS: false
    all_probs: false
    compute_logprob: false
    end_strings:
    - <|endoftext|>
    - <extra_id_1>
  length_params:
    max_length: 2048
    min_length: 1
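  # temperature 1.0 with top_k 0 and top_p 1.0 is untruncated sampling, keeping
  # rollouts faithful to the policy distribution; generation stops at either
  # end string or after 2048 new tokens.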
peft:
  peft_scheme: none
  restore_from_path: null
  restore_from_ckpt:
    checkpoint_dir: null
    checkpoint_name: null
  lora_tuning:
    target_modules:
    - attention_qkv
    adapter_dim: 32
    adapter_dropout: 0.0
    column_init_method: xavier
    row_init_method: zero
    layer_selection: null
    weight_tying: false
    position_embedding_strategy: null
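# peft_scheme: none means full-parameter PPO updates; the lora_tuning block
# holds inert defaults that would only apply under a LoRA scheme.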
offload_adam_states: true
enable_nge: true
target: nemo_aligner.models.nlp.gpt.megatron_gpt_ppo_actor.MegatronGPTActorModel
nemo_version: 2.0.0rc0