# llama3-qlora-r16-a32 / config.yaml
# jijihuny's picture
# Training in progress, step 2891
# 05ab353 verified
model:
task: text-generation
system_prompt: "๋„ˆ๋Š” ์ฃผ์–ด์ง„ Context์—์„œ Question์— ๋Œ€ํ•œ Answer๋ฅผ ์ฐพ๋Š” ์ฑ—๋ด‡์ด์•ผ. Context์—์„œ Answer๊ฐ€ ๋  ์ˆ˜ ์žˆ๋Š” ๋ถ€๋ถ„์„ ์ฐพ์•„์„œ ๊ทธ๋Œ€๋กœ ์ ์–ด์ค˜. ๋‹จ, Answer๋Š” ์ฃผ๊ด€์‹์ด ์•„๋‹ˆ๋ผ ๋‹จ๋‹ตํ˜•์œผ๋กœ ์ ์–ด์•ผ ํ•ด."
path: MLP-KTLim/llama-3-Korean-Bllossom-8B
torch_dtype: auto
device_map: auto
attn_implementation: sdpa
# Dataset selection and preprocessing switches.
# NOTE(review): exact meaning of each key depends on the project's dataset loader -- confirm.
dataset:
  # Dataset repository id. Kept nested so it stays distinct from the `path` keys of other sections.
  path: jijihuny/economics_qa
  name: train
  shuffle: false
  # null: no explicit test-split size is configured here.
  test_size: null
  include_answer: true
# Evaluation metric to load.
# NOTE(review): presumably a Hugging Face `evaluate` metric id -- confirm in the loader.
metric:
  # Kept nested so it stays distinct from the `path` keys of other sections.
  path: jijihuny/ecqa
# Text-generation (decoding) settings.
# NOTE(review): presumably forwarded to the Hugging Face text-generation pipeline /
# `generate()` -- confirm against the code that reads this file.
generation:
  # Do not include the prompt in the returned text (false).
  return_full_text: false
  # Maximum number of tokens to generate.
  max_new_tokens: null
  # Stochastic decoding algorithm (sampling) on/off.
  # NOTE(review): if these keys reach transformers' `generate()`, the sampling knobs below
  # (top_k, top_p, temperature) only take effect when do_sample is true -- confirm.
  do_sample: false
  # Restrict sampling to the top K vocabulary entries.
  top_k: 1
  # Nucleus sampling: smallest subset V' of the vocabulary V
  # s.t. \sum_{v \in V'} P(v) \geq p
  top_p: 0.95
  # softmax(x / T)
  # T > 1     => smoother (uniform as T -> \infty)
  # 0 < T < 1 => sharper (deterministic as T -> 0+); T = 0 is undefined (division by zero)
  temperature: 1.0
  # Penalty on already-generated tokens.
  # Original author's note: must be higher than temperature.
  # NOTE(review): the stated relation to temperature is unverified -- confirm before relying on it.
  repetition_penalty: null
  # Contrastive search
  # Degeneration penalty
  # argmax (1-alpha) * p(v, x_{<i}) - alpha * max_{j<i}(similarity(v, x_j))
  penalty_alpha: null
  # DoLa decoding: https://arxiv.org/abs/2309.03883
  dola_layers: null
# Fine-tuning settings (4-bit quantization + LoRA adapters + trainer arguments).
# NOTE(review): the original indentation was lost; nesting `quantization`, `lora` and `args`
# under `train` is reconstructed from the key names -- confirm against the config schema.
train:
  # Chat-template markers that delimit the user turn and the assistant turn.
  instruction_template: "<|start_header_id|>user<|end_header_id|>"
  response_template: "<|start_header_id|>assistant<|end_header_id|>"
  use_completion_only_data_collator: false

  # 4-bit quantization options (presumably a bitsandbytes config -- confirm).
  quantization:
    load_in_4bit: true
    bnb_4bit_quant_type: nf4
    bnb_4bit_compute_dtype: bfloat16
    bnb_4bit_use_double_quant: true

  # LoRA adapter options (presumably a PEFT LoraConfig -- confirm).
  lora:
    r: 16
    lora_alpha: 32
    lora_dropout: 0.05
    bias: none
    target_modules:
      - up_proj
      - down_proj
      - gate_proj
      - k_proj
      - q_proj
      - v_proj
      - o_proj
      # - lm_head
    task_type: CAUSAL_LM

  # Trainer arguments (presumably TrainingArguments / SFTConfig fields -- confirm).
  args:
    output_dir: llama3-qlora-r16-a32
    run_name: llama3-qlora-r16-a32
    report_to: wandb
    # dataloader_num_workers: 4
    torch_empty_cache_steps: 3
    # group_by_length: true
    max_seq_length: 2048
    eval_strategy: steps
    per_device_train_batch_size: 16
    per_device_eval_batch_size: 32
    gradient_accumulation_steps: 1
    eval_accumulation_steps: 1
    optim: paged_adamw_8bit
    bf16: true
    bf16_full_eval: true
    learning_rate: 0.0002
    weight_decay: 0.01
    num_train_epochs: 3
    warmup_ratio: 0.005
    max_grad_norm: 2.0
    # Fractional value: if this maps to transformers' TrainingArguments, a float in [0, 1)
    # is read as a ratio of total training steps -- confirm.
    eval_steps: 0.2
    eval_on_start: false
    # Same fractional convention as eval_steps; kept equal so saves line up with evals.
    save_steps: 0.2
    logging_steps: 1
    push_to_hub: true
    # torch_compile: true
    seed: 42