model:
  task: text-generation
system_prompt: "๋๋ ์ฃผ์ด์ง Context์์ Question์ ๋ํ Answer๋ฅผ ์ฐพ๋ ์ฑ๋ด์ด์ผ. Context์์ Answer๊ฐ ๋ ์ ์๋ ๋ถ๋ถ์ ์ฐพ์์ ๊ทธ๋๋ก ์ ์ด์ค. ๋จ, Answer๋ ์ฃผ๊ด์์ด ์๋๋ผ ๋จ๋ตํ์ผ๋ก ์ ์ด์ผ ํด." | |
  path: MLP-KTLim/llama-3-Korean-Bllossom-8B
  torch_dtype: auto
  device_map: auto
  attn_implementation: sdpa
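# The fields above mirror the usual transformers loading kwargs. A minimal sketch of how a
# loader might consume them (the exact wiring in the accompanying script is an assumption):
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   model = AutoModelForCausalLM.from_pretrained(
#       "MLP-KTLim/llama-3-Korean-Bllossom-8B",
#       torch_dtype="auto",            # let transformers pick the checkpoint dtype
#       device_map="auto",             # spread layers across available devices
#       attn_implementation="sdpa",    # PyTorch scaled-dot-product attention
#   )
#   tokenizer = AutoTokenizer.from_pretrained("MLP-KTLim/llama-3-Korean-Bllossom-8B")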
dataset:
  path: jijihuny/economics_qa
  name: train
  shuffle: false
  test_size: null
  include_answer: true
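# Loading sketch for the dataset block, assuming `name` selects the split here (an assumption;
# it could equally be a dataset config name):
#   from datasets import load_dataset
#   ds = load_dataset("jijihuny/economics_qa", split="train")
#   # shuffle: false and test_size: null presumably mean: keep the original order, no held-out split.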
metric:
  path: jijihuny/ecqa
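# Presumably resolved through the evaluate library, e.g. evaluate.load("jijihuny/ecqa")
# for a Hub-hosted metric; the exact loading code is an assumption.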
generation:
  # Do not include the prompt in the returned text (false)
  return_full_text: false
  # Maximum number of tokens to generate
  max_new_tokens: null
  # Stochastic decoding (sampling) switch
  do_sample: false
  # Top-K: keep only the K most probable vocabulary tokens
  top_k: 1
  # Nucleus (top-p): smallest subset V' s.t. \sum_{v \in V'} p(v) \geq p
  top_p: 0.95
  # softmax(x / T)
  # T > 1 => smoother (uniform as T -> \infty)
  # 0 < T < 1 => sharper (deterministic as T -> 0+)
  temperature: 1.0
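  # Note: with do_sample: false, transformers performs greedy decoding, so the top_k / top_p /
  # temperature values above do not affect token selection; they only apply when sampling is on.
  # Temperature illustration: logits [2.0, 1.0] -> probs [0.73, 0.27] at T = 1, but [0.88, 0.12] at T = 0.5.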
  # Penalty on already generated tokens; should be set higher than temperature (values > 1.0 discourage repetition)
  repetition_penalty: null
  # Contrastive search
  # Degeneration penalty:
  # argmax_v (1 - alpha) * p(v | x_{<i}) - alpha * max_{j<i} similarity(h_v, h_{x_j})
  penalty_alpha: null
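  # Contrastive search only activates when penalty_alpha > 0 together with top_k > 1;
  # left at null here, so it is disabled.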
  # DoLa decoding: https://arxiv.org/abs/2309.03883
  dola_layers: null
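# A hedged sketch of how the generation block could be passed to a text-generation pipeline
# (the parameter names match transformers' generate(); the surrounding script is assumed):
#   from transformers import pipeline
#   pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
#   out = pipe(prompt, return_full_text=False, do_sample=False, top_k=1, top_p=0.95,
#              temperature=1.0, max_new_tokens=None)
#   print(out[0]["generated_text"])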
train:
  instruction_template: "<|start_header_id|>user<|end_header_id|>"
  response_template: "<|start_header_id|>assistant<|end_header_id|>"
  use_completion_only_data_collator: false
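  # If use_completion_only_data_collator were enabled, the two templates above would typically
  # feed TRL's completion-only collator so that loss is computed only on the assistant turn
  # (a sketch under that assumption):
  #   from trl import DataCollatorForCompletionOnlyLM
  #   collator = DataCollatorForCompletionOnlyLM(
  #       instruction_template="<|start_header_id|>user<|end_header_id|>",
  #       response_template="<|start_header_id|>assistant<|end_header_id|>",
  #       tokenizer=tokenizer,
  #   )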
quantization:
  load_in_4bit: true
  bnb_4bit_quant_type: nf4
  bnb_4bit_compute_dtype: bfloat16
  bnb_4bit_use_double_quant: true
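# The quantization block corresponds one-to-one to bitsandbytes 4-bit options; a minimal sketch
# of the equivalent transformers config object:
#   import torch
#   from transformers import BitsAndBytesConfig
#   bnb_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_quant_type="nf4",               # NormalFloat4 quantization
#       bnb_4bit_compute_dtype=torch.bfloat16,   # matmuls run in bf16
#       bnb_4bit_use_double_quant=True,          # also quantize the quantization constants
#   )
#   # then: AutoModelForCausalLM.from_pretrained(..., quantization_config=bnb_config)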
lora:
  r: 16
  lora_alpha: 32
  lora_dropout: 0.05
  bias: none
  target_modules:
    - up_proj
    - down_proj
    - gate_proj
    - k_proj
    - q_proj
    - v_proj
    - o_proj
    # - lm_head
  task_type: CAUSAL_LM
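# Equivalent PEFT adapter configuration for the lora block (a sketch; how the training script
# actually constructs it is assumed):
#   from peft import LoraConfig
#   lora_config = LoraConfig(
#       r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
#       target_modules=["up_proj", "down_proj", "gate_proj",
#                       "k_proj", "q_proj", "v_proj", "o_proj"],
#       task_type="CAUSAL_LM",
#   )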
args:
  output_dir: llama3-qlora-r16-a32
  run_name: llama3-qlora-r16-a32
  report_to: wandb
  # dataloader_num_workers: 4
  torch_empty_cache_steps: 3
  # group_by_length: true
  max_seq_length: 2048
  eval_strategy: steps
  per_device_train_batch_size: 16
  per_device_eval_batch_size: 32
  gradient_accumulation_steps: 1
  eval_accumulation_steps: 1
  optim: paged_adamw_8bit
  bf16: true
  bf16_full_eval: true
  learning_rate: 0.0002
  weight_decay: 0.01
  num_train_epochs: 3
  warmup_ratio: 0.005
  max_grad_norm: 2.0
  eval_steps: 0.2
  eval_on_start: false
  save_steps: 0.2
  logging_steps: 1
  push_to_hub: true
  # torch_compile: true
  seed: 42
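# The args block reads like TRL SFT training arguments (max_seq_length is an SFTConfig field,
# the rest are standard TrainingArguments). A hedged end-to-end sketch tying the sections together:
#   from trl import SFTConfig, SFTTrainer
#   sft_args = SFTConfig(
#       output_dir="llama3-qlora-r16-a32",
#       per_device_train_batch_size=16,
#       learning_rate=2e-4,
#       num_train_epochs=3,
#       bf16=True,
#       optim="paged_adamw_8bit",
#       max_seq_length=2048,
#       report_to="wandb",
#       # ...remaining keys map directly onto SFTConfig fields
#   )
#   trainer = SFTTrainer(model=model, args=sft_args, train_dataset=ds,
#                        peft_config=lora_config)
#   trainer.train()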