# llama_1b_init_model / config.yaml
# Provenance: uploaded by gsaltintas via huggingface_hub (revision 1ed190d, verified)
---
# Run identity and top-level training-loop settings.
name: flexitok_llama
dump_dir: /scratch/gsa/flexitok/init_models  # output directory for this run
seed: 777  # global seed (data loader and model carry their own seeds below)
grad_acc_steps: 8
gc_collect_freq: 1000  # steps between explicit garbage-collection calls
probe_freq: null  # probing disabled
steps: 100000  # presumably total optimizer steps — confirm against trainer
# Data pipeline: source mixture weights, packing, and tokenizer setup.
# NOTE(review): indentation reconstructed — flat keys would collide
# (duplicate `seed`/`add_bos`); verify nesting against the loader's schema.
data:
  root_dir: /scratch/craffel/lingua/data/flexitok/
  # Sampling weights per corpus; fw_edu (English FineWeb-Edu) at 0.4,
  # remainder spread over 20 languages.
  sources:
    fw_edu: 0.4
    dan_Latn: 0.0216582869670702
    swe_Latn: 0.0216359765418466
    vie_Latn: 0.0197485510268674
    hun_Latn: 0.0247194573562308
    fas_Arab: 0.0205634624231076
    tur_Latn: 0.0235455794841729
    ces_Latn: 0.0248024455266208
    arb_Arab: 0.0234323706569333
    ell_Grek: 0.0233670886888026
    ind_Latn: 0.0269322054593488
    nld_Latn: 0.0277796326621489
    pol_Latn: 0.0294120104572311
    por_Latn: 0.0301413168306825
    ita_Latn: 0.0324056371021865
    jpn_Jpan: 0.03553104151369
    fra_Latn: 0.0381835560678536
    spa_Latn: 0.0387222793083669
    deu_Latn: 0.0419925340453022
    cmn_Hani: 0.0454067521384114
    rus_Cyrl: 0.0500198157431261
  batch_size: 4
  seq_len: 4096
  n_views: 2
  seed: 42  # data-loader seed, independent of the top-level run seed
  add_bos: true
  add_eos: true
  load_async: true
  prefetch_size: 1024
  tokenizer:
    name: huggingface
    path: meta-llama/Llama-3.2-1B
    tokenizers: null  # no extra per-source tokenizers configured
    load_supermapping: false
    dropout: 0.0
    seed: 42
    superset_code_name: super_vocab
    n_words: null  # vocab size taken from the tokenizer itself
    # Per-source/per-task tokenizer routing; empty maps = default routing.
    routing:
      source_to_tokenizer: {}
      task_to_tokenizer: {}
      suitable_tokenizer_probability: 1.0
# Optimizer (AdamW-style betas/eps/weight-decay) and LR schedule.
# NOTE(review): indentation reconstructed from the flat scrape.
optim:
  lr: 0.001
  weight_decay: 0.1
  epsilon: 1.0e-08
  beta1: 0.9
  beta2: 0.95
  clip: 1.0  # gradient-norm clipping threshold
  scheduler: cosine
  warmup: 2000  # warmup steps before the cosine phase
  lr_min_ratio: 1.0e-06  # floor as a fraction of peak lr
  cycle_length: 1.0
  cosine_theta: 1.0
  annealing_step: 1000
  decay_fraction: 0.1
  exp_factor: 0.5
# Transformer architecture (~1B-parameter Llama-style: 2048 dim, 25 layers).
# NOTE(review): indentation reconstructed from the flat scrape.
model:
  dim: 2048
  n_layers: 25
  head_dim: null  # derived from dim / n_heads when null
  n_heads: 16
  n_kv_heads: null  # null = no grouped-query attention, kv heads = n_heads
  ffn_dim_multiplier: null
  multiple_of: 256  # FFN hidden size rounded up to this multiple
  norm_eps: 1.0e-05
  rope_theta: 10000.0
  init_base_std: null
  init_std_factor: disabled
  max_seqlen: 4096  # matches data.seq_len
  seed: 42
  vocab_size: 128256  # Llama-3 tokenizer vocabulary size
  weight_tying: false
  sliding_window: null  # full attention
  use_factorized_embeddings: false
  factorized_embedding_dim: 0
# Parallelism and runtime: single-shard FSDP, torch.compile on, bf16.
# NOTE(review): indentation reconstructed from the flat scrape.
distributed:
  dp_shard: 1
  dp_replicate: 1
  tp_size: 1
  selective_activation_checkpointing: false
  compile: true
  fsdp_type: full_shard
  model_dtype: bf16
  float8_recipe: null  # float8 disabled; filter below is inert until set
  # Regex selecting transformer layers for float8. Quoted so the backslash
  # is unmistakably literal and survives round-trip tooling (the plain
  # scalar parses to the same string, so the value is unchanged).
  float8_filter: 'layers\.[0-9]+\.'
  matmul_allow_tf32: false
  detect_anomaly: false
  compile_cache_size_limit: 8
  spawn_method: forkserver
# Environment variables exported to worker processes. Numeric values are
# quoted on purpose: env vars must reach the consumer as strings.
# NOTE(review): indentation reconstructed from the flat scrape.
env:
  MKL_SERVICE_FORCE_INTEL: GNU
  OMP_NUM_THREADS: '1'
  MKL_NUM_THREADS: '1'
  ENABLE_INTRA_NODE_COMM: '1'
  TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
  NCCL_IB_TIMEOUT: '22'
  NCCL_DEBUG: INFO
  TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
# Checkpointing: separate cadences for training dumps and eval snapshots;
# keep: -1 retains everything.
# NOTE(review): indentation reconstructed from the flat scrape.
checkpoint:
  dump:
    every: 10000
    keep: -1
  eval:
    every: 10000
    keep: -1
  path: null
  init_ckpt_path: null  # no warm-start checkpoint
  load_init_optimizer_state: false
  save_init_ckpt: false
# Torch profiler: short memory + trace windows after warmup.
# NOTE(review): indentation reconstructed from the flat scrape.
profiling:
  run: true
  trace_folder: profiling  # relative to dump_dir — TODO confirm
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4
# Metric logging; wandb disabled.
# NOTE(review): indentation reconstructed from the flat scrape;
# async_eval_gpus is a top-level key, not part of `logging` — confirm.
logging:
  freq: 1  # log every step
  acc_freq: null
  wandb: null
async_eval_gpus: 8  # GPUs reserved for asynchronous evaluation
# Evaluation: lm-eval-harness tasks plus generator settings.
# NOTE(review): indentation reconstructed from the flat scrape; `add_bos`
# is grouped under `generator` because it follows `dtype` — verify
# against the eval schema (it may belong to `harness` instead).
eval:
  harness:
    tasks:
      - hellaswag
      - xnli_vi
  generator:
    max_tokens: 16384
    dtype: bf16
    add_bos: false