# ayrnb
# /

# megatron_nemo

# Model card Files Files and versions Community

# megatron_nemo / hparams.yaml

# ayrnb's picture

# fix

# 1ebceda 3 months ago

# raw history blame contribute delete

# No virus

# 2.95 kB

	cfg:
	micro_batch_size: 4
	global_batch_size: 32
	rampup_batch_size: null
	context_parallel_size: 1
	tensor_model_parallel_size: 1
	pipeline_model_parallel_size: 1
	virtual_pipeline_model_parallel_size: null
	resume_from_checkpoint: null
	encoder_seq_length: 2048
	max_position_embeddings: 2048
	num_layers: 24
	hidden_size: 4096
	ffn_hidden_size: 16384
	num_attention_heads: 32
	init_method_std: 0.01
	hidden_dropout: 0.1
	attention_dropout: 0.1
	kv_channels: null
	apply_query_key_layer_scaling: true
	layernorm_epsilon: 1.0e-05
	make_vocab_size_divisible_by: 128
	pre_process: true
	post_process: true
	persist_layer_norm: true
	gradient_as_bucket_view: true
	grad_div_ar_fusion: true
	gradient_accumulation_fusion: true
	bias_activation_fusion: true
	bias_dropout_add_fusion: true
	masked_softmax_fusion: true
	activations_checkpoint_granularity: null
	activations_checkpoint_method: null
	activations_checkpoint_num_layers: null
	num_micro_batches_with_partial_activation_checkpoints: null
	activations_checkpoint_layers_per_pipeline: null
	fsdp: false
	fsdp_sharding_strategy: full
	fsdp_grad_reduce_dtype: 32
	fsdp_sharded_checkpoint: false
	sequence_parallel: false
	overlap_p2p_comm: false
	batch_p2p_comm: true
	num_query_groups: null
	tokenizer:
	library: megatron
	type: GPT2BPETokenizer
	model: null
	delimiter: null
	vocab_file: /gpt3_dataset//bpe/vocab.json
	merge_file: /gpt3_dataset//bpe/merges.txt
	native_amp_init_scale: 4294967296
	native_amp_growth_interval: 1000
	hysteresis: 2
	fp32_residual_connection: false
	fp16_lm_cross_entropy: false
	megatron_amp_O2: true
	grad_allreduce_chunk_size_mb: 125
	sharp: false
	mcore_gpt: true
	transformer_engine: false
	fp8: false
	fp8_e4m3: false
	fp8_hybrid: true
	fp8_margin: 0
	fp8_interval: 1
	fp8_amax_history_len: 1024
	fp8_amax_compute_algo: max
	fp8_wgrad: true
	ub_tp_comm_overlap: false
	tp_comm_atomic_ag: false
	tp_comm_atomic_rs: false
	seed: 1234
	sync_batch_comm: false
	use_cpu_initialization: false
	onnx_safe: false
	apex_transformer_log_level: 30
	nsys_profile:
	enabled: false
	trace:
	- nvtx
	- cuda
	start_step: 10
	end_step: 10
	ranks:
	- 0
	gen_shape: false
	optim:
	name: distributed_fused_adam
	bucket_cap_mb: 400
	overlap_grad_sync: true
	overlap_param_sync: true
	contiguous_grad_buffer: true
	lr: 0.00016
	weight_decay: 0.1
	betas:
	- 0.9
	- 0.95
	sched:
	name: CosineAnnealing
	warmup_steps: 115
	constant_steps: 12500
	min_lr: 1.6e-05
	data:
	data_impl: mmap
	splits_string: 99990,8,2
	seq_length: 2048
	skip_warmup: true
	num_workers: 2
	dataloader_type: single
	reset_position_ids: false
	reset_attention_mask: false
	eod_mask_loss: false
	index_mapping_dir: null
	data_prefix:
	- 0.0333
	- /gpt3_dataset/wiki_text_document
	precision: bf16-mixed