Aleph-Alpha
/

sp-baseline-research-1b-bf16

Text Generation

Model card | Files and versions | Community

sp-baseline-research-1b-bf16 / config.yml

GregorZiegltrumAA's picture

GregorZiegltrumAA

Add model

890a986 about 2 months ago

1.83 kB

	optimizer:
	allreduce_bucket_size: 500000000
	beta1: 0.9
	beta2: 0.95
	debug_log: false
	eps: 1e-08
	gradient_clipping: 1.0
	zero: true
	zero_save_static: false
	topology:
	activation_checkpointing_type: disabled
	global_batch_size: 1024
	gradient_accumulation_steps: 4
	micro_batch_size: 2
	model_parallel_size: 1
	pipe_parallel_size: 1
	pipe_partition_method: balanced
	pipe_partition_overwrite: null
	sequence_parallel: false
	trainer:
	seed: 42
	train_iterations: 72000
	training:
	allow_missing_params_in_optimizer: true
	training_groups:
	- group_name: param_group
	independent_weight_decay: false
	learning_rate_scheduler:
	learning_rate: 0.0006
	learning_rate_decay_iters: 72000
	learning_rate_decay_style: cosine
	learning_rate_minimum: 6e-05
	learning_rate_warmup_steps: 500
	parameters_exclude: null
	weight_decay: 0.1
	transformer_architecture:
	attention_bias: false
	attention_num_kv_heads: null
	attention_qkv_in_one: true
	dropout_after_attention: 0.0
	dropout_after_mlp: 0.0
	dropout_attention_probs: 0.0
	dropout_embedding: 0.0
	dropout_image_encoder: 0.0
	hidden_size: 2048
	image_encoder: false
	key_query_norm: false
	layernorm:
	layernorm_epsilon: 1e-05
	optimization_type: torch
	local_attention_window_size: null
	masked_softmax:
	kernel: flash_attention
	scale: 1.0
	softmax_in_fp32: false
	mlp_bias: false
	mlp_factor: 2.6640625
	mlp_type: swiglu
	norm_type: rms
	num_attention_heads: 16
	num_layers: 16
	num_local_attention_heads: 0
	precision: bfloat16
	relative_position_embedding_type: rotary_complex
	reset_attention_mask: false
	reset_position_ids: false
	rotary_embedding_base: 10000
	rotary_percentage: 1.0
	sequence_length: 4096
	umup:
	enable: false
	vocab_file: null
	vocab_size: 65536
	weight_tying: false