ayrnb committed on
Commit
1ebceda
1 Parent(s): bcdd41e
Files changed (2)
  1. hparams.yaml +118 -0
  2. megatron_gpt_te_false_bf16.nemo +3 -0
hparams.yaml ADDED
@@ -0,0 +1,118 @@
+ cfg:
+   micro_batch_size: 4
+   global_batch_size: 32
+   rampup_batch_size: null
+   context_parallel_size: 1
+   tensor_model_parallel_size: 1
+   pipeline_model_parallel_size: 1
+   virtual_pipeline_model_parallel_size: null
+   resume_from_checkpoint: null
+   encoder_seq_length: 2048
+   max_position_embeddings: 2048
+   num_layers: 24
+   hidden_size: 4096
+   ffn_hidden_size: 16384
+   num_attention_heads: 32
+   init_method_std: 0.01
+   hidden_dropout: 0.1
+   attention_dropout: 0.1
+   kv_channels: null
+   apply_query_key_layer_scaling: true
+   layernorm_epsilon: 1.0e-05
+   make_vocab_size_divisible_by: 128
+   pre_process: true
+   post_process: true
+   persist_layer_norm: true
+   gradient_as_bucket_view: true
+   grad_div_ar_fusion: true
+   gradient_accumulation_fusion: true
+   bias_activation_fusion: true
+   bias_dropout_add_fusion: true
+   masked_softmax_fusion: true
+   activations_checkpoint_granularity: null
+   activations_checkpoint_method: null
+   activations_checkpoint_num_layers: null
+   num_micro_batches_with_partial_activation_checkpoints: null
+   activations_checkpoint_layers_per_pipeline: null
+   fsdp: false
+   fsdp_sharding_strategy: full
+   fsdp_grad_reduce_dtype: 32
+   fsdp_sharded_checkpoint: false
+   sequence_parallel: false
+   overlap_p2p_comm: false
+   batch_p2p_comm: true
+   num_query_groups: null
+   tokenizer:
+     library: megatron
+     type: GPT2BPETokenizer
+     model: null
+     delimiter: null
+     vocab_file: /gpt3_dataset//bpe/vocab.json
+     merge_file: /gpt3_dataset//bpe/merges.txt
+   native_amp_init_scale: 4294967296
+   native_amp_growth_interval: 1000
+   hysteresis: 2
+   fp32_residual_connection: false
+   fp16_lm_cross_entropy: false
+   megatron_amp_O2: true
+   grad_allreduce_chunk_size_mb: 125
+   sharp: false
+   mcore_gpt: true
+   transformer_engine: false
+   fp8: false
+   fp8_e4m3: false
+   fp8_hybrid: true
+   fp8_margin: 0
+   fp8_interval: 1
+   fp8_amax_history_len: 1024
+   fp8_amax_compute_algo: max
+   fp8_wgrad: true
+   ub_tp_comm_overlap: false
+   tp_comm_atomic_ag: false
+   tp_comm_atomic_rs: false
+   seed: 1234
+   sync_batch_comm: false
+   use_cpu_initialization: false
+   onnx_safe: false
+   apex_transformer_log_level: 30
+   nsys_profile:
+     enabled: false
+     trace:
+     - nvtx
+     - cuda
+     start_step: 10
+     end_step: 10
+     ranks:
+     - 0
+     gen_shape: false
+   optim:
+     name: distributed_fused_adam
+     bucket_cap_mb: 400
+     overlap_grad_sync: true
+     overlap_param_sync: true
+     contiguous_grad_buffer: true
+     lr: 0.00016
+     weight_decay: 0.1
+     betas:
+     - 0.9
+     - 0.95
+     sched:
+       name: CosineAnnealing
+       warmup_steps: 115
+       constant_steps: 12500
+       min_lr: 1.6e-05
+   data:
+     data_impl: mmap
+     splits_string: 99990,8,2
+     seq_length: 2048
+     skip_warmup: true
+     num_workers: 2
+     dataloader_type: single
+     reset_position_ids: false
+     reset_attention_mask: false
+     eod_mask_loss: false
+     index_mapping_dir: null
+     data_prefix:
+     - 0.0333
+     - /gpt3_dataset/wiki_text_document
+   precision: bf16-mixed
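
Usage note (not part of the commit): a minimal sketch of how the committed hparams.yaml could be inspected, assuming an environment with omegaconf installed; the file name and the printed keys come from the diff above, everything else is an assumption.

# Sketch: load the committed hyperparameters with OmegaConf and read a few fields.
from omegaconf import OmegaConf

cfg = OmegaConf.load("hparams.yaml")  # assumed path: the root of this repo
print(cfg.cfg.num_layers)   # 24
print(cfg.cfg.hidden_size)  # 4096
print(cfg.cfg.precision)    # bf16-mixed

# Sanity check implied by the config: global_batch_size (32) must be a
# multiple of micro_batch_size (4) times the data-parallel size.
assert cfg.cfg.global_batch_size % cfg.cfg.micro_batch_size == 0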
megatron_gpt_te_false_bf16.nemo ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:82cf10a38537f7335cfc30fd4e2b5048111a4499db46e27446dd27a45fee93b0
+ size 2825287680
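
Usage note (not part of the commit): a minimal sketch for verifying a downloaded copy of megatron_gpt_te_false_bf16.nemo against the Git LFS pointer above; the oid and size come from the pointer, while the local filename is an assumption.

# Sketch: compare a local file's size and sha256 with the LFS pointer values.
import hashlib
import os

EXPECTED_SHA256 = "82cf10a38537f7335cfc30fd4e2b5048111a4499db46e27446dd27a45fee93b0"
EXPECTED_SIZE = 2825287680
path = "megatron_gpt_te_false_bf16.nemo"  # assumed local filename after download

assert os.path.getsize(path) == EXPECTED_SIZE, "size does not match LFS pointer"

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
assert sha.hexdigest() == EXPECTED_SHA256, "sha256 does not match LFS pointer"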