# NeMo-Aligner model configuration for RLHF (PPO actor) training of Llama 3 70B
# (NVIDIA; tags: NeMo, English, nvidia, rlhf, llama3)
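# Architecture and parallelism, as configured below (a reading of the values, not new
# settings): 80 layers, hidden size 8192, 64 attention heads with 8 query groups (GQA),
# 8192-token context with RoPE (rotary_base 500000). Tensor parallel 8 x pipeline
# parallel 8 gives a 64-way model-parallel replica; with micro_batch_size 1 and
# global_batch_size 128, the rest of the global batch comes from data-parallel replicas
# and gradient accumulation.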
mcore_gpt: true
micro_batch_size: 1
global_batch_size: 128
tensor_model_parallel_size: 8
pipeline_model_parallel_size: 8
virtual_pipeline_model_parallel_size: null
encoder_seq_length: 8192
max_position_embeddings: 8192
num_layers: 80
hidden_size: 8192
ffn_hidden_size: 28672
num_attention_heads: 64
init_method_std: 0.02
use_scaled_init_method: true
hidden_dropout: 0.0
attention_dropout: 0.0
ffn_dropout: 0.0
kv_channels: null
apply_query_key_layer_scaling: true
normalization: rmsnorm
layernorm_epsilon: 1.0e-05
do_layer_norm_weight_decay: false
make_vocab_size_divisible_by: 128
pre_process: true
post_process: true
persist_layer_norm: true
bias: false
activation: fast-swiglu
headscale: false
transformer_block_type: pre_ln
openai_gelu: false
normalize_attention_scores: true
position_embedding_type: rope
rotary_percentage: 1.0
attention_type: multihead
share_embeddings_and_output_weights: false
overlap_p2p_comm: false
batch_p2p_comm: true
num_query_groups: 8
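# The tokenizer is loaded from Hugging Face. Note that meta-llama/Meta-Llama-3-70B is a
# gated repository, so pulling it typically requires accepting Meta's license and
# authenticating with a Hugging Face token.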
tokenizer:
  library: huggingface
  type: meta-llama/Meta-Llama-3-70B
  use_fast: true
native_amp_init_scale: 4294967296
native_amp_growth_interval: 1000
hysteresis: 2
fp32_residual_connection: false
fp16_lm_cross_entropy: false
megatron_amp_O2: true
grad_allreduce_chunk_size_mb: 125
grad_div_ar_fusion: true
gradient_accumulation_fusion: false
bias_activation_fusion: false
bias_dropout_add_fusion: false
masked_softmax_fusion: true
get_attention_mask_from_fusion: true
apply_rope_fusion: false
seed: 1234
resume_from_checkpoint: null
use_cpu_initialization: false
onnx_safe: false
apex_transformer_log_level: 30
gradient_as_bucket_view: false
sync_batch_comm: false
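# Full activation recomputation (uniform method, 1 layer per checkpoint block) is enabled,
# trading extra compute for activation memory at the 8K sequence length.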
activations_checkpoint_granularity: full
activations_checkpoint_method: uniform
activations_checkpoint_num_layers: 1
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
sequence_parallel: false
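# Transformer Engine is enabled but FP8 is off (fp8: false), so compute runs in bf16 (see
# precision further down); the fp8_* recipe keys (hybrid format, amax history, etc.) only
# take effect if fp8 is switched on.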
transformer_engine: true
fp8: false
fp8_e4m3: false
fp8_hybrid: true
fp8_margin: 0
fp8_interval: 1
fp8_amax_history_len: 1024
fp8_amax_compute_algo: max
reduce_amax: true
use_emha: false
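# Chat-formatted data. The prompt tokens below delimit turns in the rendered prompt:
# <extra_id_0> opens the system turn, <extra_id_1> opens each user/assistant turn,
# <extra_id_2> precedes the assistant label, and end_of_turn / end_of_name are newlines.
# The same layout is spelled out in the prompt_template strings of train_ds,
# validation_ds, and test_ds.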
data:
  chat: true
  chat_prompt_tokens:
    system_turn_start: <extra_id_0>
    turn_start: <extra_id_1>
    label_start: <extra_id_2>
    end_of_turn: '

      '
    end_of_name: '

      '
  sample: true
  num_workers: 2
  dataloader_type: single
  train_ds:
    file_path: /dataset/train.jsonl
    global_batch_size: 384
    micro_batch_size: 1
    shuffle: true
    memmap_workers: null
    max_seq_length: 4096
    min_seq_length: 1
    drop_last: true
    concat_sampling_probabilities: null
    label_key: output
    add_eos: false
    add_sep: false
    add_bos: false
    truncation_field: input
    index_mapping_dir: /indexmap_dir
    prompt_template: '<extra_id_0>System

      {system message}

      <extra_id_1>User

      {turn 1 user message}

      <extra_id_1>Assistant

      <extra_id_2>{turn 1 assistant label}

      {turn 1 assistant message}

      <extra_id_1>User

      {turn 2 user message}

      <extra_id_1>Assistant

      <extra_id_2>{turn 2 assistant label}

      {turn 2 assistant message}

      <extra_id_1>'
    hf_dataset: true
    truncation_method: right
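  # A sketch of what a single record in /dataset/train.jsonl might look like. The field
  # names (system / conversations / mask / type) are an assumption based on NeMo's chat
  # SFT data format, not something this file specifies; verify against your dataset
  # loader before relying on it:
  #   {"system": "You are a helpful assistant.",
  #    "conversations": [{"from": "User", "value": "What is RLHF?", "label": null},
  #                      {"from": "Assistant", "value": "RLHF is ...", "label": null}],
  #    "mask": "User", "type": "VALUE_TO_TEXT"}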
  validation_ds:
    file_path: /dataset/train.jsonl
    names: null
    global_batch_size: 384
    micro_batch_size: 1
    shuffle: false
    memmap_workers: null
    max_seq_length: 4096
    min_seq_length: 1
    drop_last: false
    label_key: output
    add_eos: false
    add_sep: false
    add_bos: false
    write_predictions_to_file: false
    output_file_path_prefix: null
    truncation_field: input
    index_mapping_dir: /indexmap_dir
    prompt_template: '<extra_id_0>System

      {system message}

      <extra_id_1>User

      {turn 1 user message}

      <extra_id_1>Assistant

      <extra_id_2>{turn 1 assistant label}

      {turn 1 assistant message}

      <extra_id_1>User

      {turn 2 user message}

      <extra_id_1>Assistant

      <extra_id_2>{turn 2 assistant label}

      {turn 2 assistant message}

      <extra_id_1>'
    tokens_to_generate: 32
    hf_dataset: true
    truncation_method: right
    metric:
      name: loss
      average: null
      num_classes: null
  test_ds:
    prompt_template: '<extra_id_0>System

      {system message}

      <extra_id_1>User

      {turn 1 user message}

      <extra_id_1>Assistant

      <extra_id_2>{turn 1 assistant label}

      {turn 1 assistant message}

      <extra_id_1>User

      {turn 2 user message}

      <extra_id_1>Assistant

      <extra_id_2>{turn 2 assistant label}

      {turn 2 assistant message}

      <extra_id_1>'
  data_impl: jsonl
  splits_string: null
  seq_length: 8192
  skip_warmup: true
  reset_position_ids: false
  reset_attention_mask: false
  eod_mask_loss: false
  index_mapping_dir: null
  data_prefix:
    train:
    - /dataset/train.jsonl
    validation:
    - /dataset/val.jsonl
    test:
    - /dataset/val.jsonl
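# Optional Nsight Systems profiling: when enabled, capture runs from start_step to
# end_step (a single step, 10, here) on the listed ranks (rank 0 only).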
nsys_profile:
  enabled: false
  start_step: 10
  end_step: 10
  ranks:
  - 0
  gen_shape: false
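# Optimizer: distributed fused Adam with an essentially flat learning rate. The
# CosineAnnealing schedule decays from lr 2.0e-7 to min_lr 1.9999e-7 over max_steps 88
# (10 warmup steps), a change of only ~1e-11.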
optim:
  name: distributed_fused_adam
  lr: 2.0e-07
  weight_decay: 0.1
  betas:
  - 0.9
  - 0.98
  sched:
    name: CosineAnnealing
    warmup_steps: 10
    constant_steps: 1000
    min_lr: 1.9999e-07
    max_steps: 88
  bucket_cap_mb: 200
  overlap_grad_sync: false
  contiguous_grad_buffer: true
rotary_base: 500000.0
precision: bf16
answer_only_loss: true
restore_from_path: /models/unpacked_llama3_70b_base
save_nemo_on_validation_end: true
use_flash_attention: null
pipeline_model_parallel_split_rank: 0
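# PPO actor settings. ratio_eps is the clipping range of the standard clipped surrogate,
#   L_clip = E[ min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ]   with eps = 0.2,
# and entropy_bonus 0.0 disables entropy regularization. Rollout generation is served
# through TensorRT-LLM (trt_llm.enable: true) with resharding of the trained weights for
# inference (reshard: true); num_rollout_samples 128 matches the training
# global_batch_size of 128.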
ppo:
  trt_llm:
    enable: true
    reshard: true
    max_input_len: 4096
    max_input_tokens: 32768
    model_type: LLaMAForCausalLM
    unload_engine_train: false
  rollout_micro_batch_size: 8
  num_rollout_samples: 128
  forward_micro_batch_size: 4
  val_rollout_micro_batch_size: 8
  num_val_samples: 128
  offload_adam_states: true
  entropy_bonus: 0.0
  ratio_eps: 0.2
  sampling_params:
    use_greedy: false
    temperature: 1.0
    top_k: 0
    top_p: 1.0
    repetition_penalty: 1.0
    add_BOS: false
    all_probs: false
    compute_logprob: false
    end_strings:
    - <|endoftext|>
    - <extra_id_1>
  length_params:
    max_length: 2048
    min_length: 1
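# PEFT is disabled (peft_scheme: none), so the actor is updated with full-parameter
# training; the lora_tuning block below is inert unless a LoRA scheme is selected.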
peft:
  peft_scheme: none
  restore_from_path: null
  restore_from_ckpt:
    checkpoint_dir: null
    checkpoint_name: null
  lora_tuning:
    target_modules:
    - attention_qkv
    adapter_dim: 32
    adapter_dropout: 0.0
    column_init_method: xavier
    row_init_method: zero
    layer_selection: null
    weight_tying: false
    position_embedding_strategy: null
offload_adam_states: true
enable_nge: true
target: nemo_aligner.models.nlp.gpt.megatron_gpt_ppo_actor.MegatronGPTActorModel
nemo_version: 2.0.0rc0