model:
  task: text-generation
system_prompt: "๋๋ ์ฃผ์ด์ง Context์์ Question์ ๋ํ Answer๋ฅผ ์ฐพ๋ ์ฑ๋ด์ด์ผ. Context์์ Answer๊ฐ ๋ ์ ์๋ ๋ถ๋ถ์ ์ฐพ์์ ๊ทธ๋๋ก ์ ์ด์ค. ๋จ, Answer๋ ์ฃผ๊ด์์ด ์๋๋ผ ๋จ๋ตํ์ผ๋ก ์ ์ด์ผ ํด." | |
  path: MLP-KTLim/llama-3-Korean-Bllossom-8B
  torch_dtype: auto
  device_map: auto
  attn_implementation: sdpa
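# The fields above mirror the usual transformers loading kwargs. A minimal sketch of how a
# loader might consume them (the exact wiring in the accompanying script is an assumption):
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   model = AutoModelForCausalLM.from_pretrained(
#       "MLP-KTLim/llama-3-Korean-Bllossom-8B",
#       torch_dtype="auto",            # let transformers pick the checkpoint dtype
#       device_map="auto",             # spread layers across available devices
#       attn_implementation="sdpa",    # PyTorch scaled-dot-product attention
#   )
#   tokenizer = AutoTokenizer.from_pretrained("MLP-KTLim/llama-3-Korean-Bllossom-8B")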
dataset:
  path: jijihuny/economics_qa
  name: train
  shuffle: false
  test_size: null
  include_answer: true
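# Loading sketch for the dataset block, assuming `name` selects the split here (an assumption;
# it could equally be a dataset config name):
#   from datasets import load_dataset
#   ds = load_dataset("jijihuny/economics_qa", split="train")
#   # shuffle: false and test_size: null presumably mean: keep the original order, no held-out split.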
metric:
  path: jijihuny/ecqa
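# Presumably resolved through the evaluate library, e.g. evaluate.load("jijihuny/ecqa")
# for a Hub-hosted metric; the exact loading code is an assumption.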
generation:
  # Do not include the prompt in the returned text (false)
  return_full_text: false
  # Maximum number of tokens to generate
  max_new_tokens: null
  # Stochastic decoding (sampling) switch
  do_sample: false
  # Top-K: keep only the K most probable vocabulary tokens
  top_k: 1
  # Nucleus (top-p): smallest subset V' s.t. \sum_{v \in V'} p(v) \geq p
  top_p: 0.95
  # softmax(x / T)
  # T > 1 => smoother (uniform as T -> \infty)
  # 0 < T < 1 => sharper (deterministic as T -> 0+)
  temperature: 1.0
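  # Note: with do_sample: false, transformers performs greedy decoding, so the top_k / top_p /
  # temperature values above do not affect token selection; they only apply when sampling is on.
  # Temperature illustration: logits [2.0, 1.0] -> probs [0.73, 0.27] at T = 1, but [0.88, 0.12] at T = 0.5.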
  # Penalty on already generated tokens; should be set higher than temperature (values > 1.0 discourage repetition)
  repetition_penalty: null
  # Contrastive search
  # Degeneration penalty:
  # argmax_v (1 - alpha) * p(v | x_{<i}) - alpha * max_{j<i} similarity(h_v, h_{x_j})
  penalty_alpha: null
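  # Contrastive search only activates when penalty_alpha > 0 together with top_k > 1;
  # left at null here, so it is disabled.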
  # DoLa decoding: https://arxiv.org/abs/2309.03883
  dola_layers: null
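# A hedged sketch of how the generation block could be passed to a text-generation pipeline
# (the parameter names match transformers' generate(); the surrounding script is assumed):
#   from transformers import pipeline
#   pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
#   out = pipe(prompt, return_full_text=False, do_sample=False, top_k=1, top_p=0.95,
#              temperature=1.0, max_new_tokens=None)
#   print(out[0]["generated_text"])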
train:
  instruction_template: "<|start_header_id|>user<|end_header_id|>"
  response_template: "<|start_header_id|>assistant<|end_header_id|>"
  use_completion_only_data_collator: false
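  # If use_completion_only_data_collator were enabled, the two templates above would typically
  # feed TRL's completion-only collator so that loss is computed only on the assistant turn
  # (a sketch under that assumption):
  #   from trl import DataCollatorForCompletionOnlyLM
  #   collator = DataCollatorForCompletionOnlyLM(
  #       instruction_template="<|start_header_id|>user<|end_header_id|>",
  #       response_template="<|start_header_id|>assistant<|end_header_id|>",
  #       tokenizer=tokenizer,
  #   )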
quantization:
  load_in_4bit: true
  bnb_4bit_quant_type: nf4
  bnb_4bit_compute_dtype: bfloat16
  bnb_4bit_use_double_quant: true
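# The quantization block corresponds one-to-one to bitsandbytes 4-bit options; a minimal sketch
# of the equivalent transformers config object:
#   import torch
#   from transformers import BitsAndBytesConfig
#   bnb_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_quant_type="nf4",               # NormalFloat4 quantization
#       bnb_4bit_compute_dtype=torch.bfloat16,   # matmuls run in bf16
#       bnb_4bit_use_double_quant=True,          # also quantize the quantization constants
#   )
#   # then: AutoModelForCausalLM.from_pretrained(..., quantization_config=bnb_config)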
lora:
  r: 16
  lora_alpha: 32
  lora_dropout: 0.05
  bias: none
  target_modules:
    - up_proj
    - down_proj
    - gate_proj
    - k_proj
    - q_proj
    - v_proj
    - o_proj
    # - lm_head
  task_type: CAUSAL_LM
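# Equivalent PEFT adapter configuration for the lora block (a sketch; how the training script
# actually constructs it is assumed):
#   from peft import LoraConfig
#   lora_config = LoraConfig(
#       r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
#       target_modules=["up_proj", "down_proj", "gate_proj",
#                       "k_proj", "q_proj", "v_proj", "o_proj"],
#       task_type="CAUSAL_LM",
#   )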
args:
  output_dir: llama3-qlora-r16-a32
  run_name: llama3-qlora-r16-a32
  report_to: wandb
  # dataloader_num_workers: 4
  torch_empty_cache_steps: 3
  # group_by_length: true
  max_seq_length: 2048
  eval_strategy: steps
  per_device_train_batch_size: 16
  per_device_eval_batch_size: 32
  gradient_accumulation_steps: 1
  eval_accumulation_steps: 1
  optim: paged_adamw_8bit
  bf16: true
  bf16_full_eval: true
  learning_rate: 0.0002
  weight_decay: 0.01
  num_train_epochs: 3
  warmup_ratio: 0.005
  max_grad_norm: 2.0
  eval_steps: 0.2
  eval_on_start: false
  save_steps: 0.2
  logging_steps: 1
  push_to_hub: true
  # torch_compile: true
  seed: 42
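# The args block reads like TRL SFT training arguments (max_seq_length is an SFTConfig field,
# the rest are standard TrainingArguments). A hedged end-to-end sketch tying the sections together:
#   from trl import SFTConfig, SFTTrainer
#   sft_args = SFTConfig(
#       output_dir="llama3-qlora-r16-a32",
#       per_device_train_batch_size=16,
#       learning_rate=2e-4,
#       num_train_epochs=3,
#       bf16=True,
#       optim="paged_adamw_8bit",
#       max_seq_length=2048,
#       report_to="wandb",
#       # ...remaining keys map directly onto SFTConfig fields
#   )
#   trainer = SFTTrainer(model=model, args=sft_args, train_dataset=ds,
#                        peft_config=lora_config)
#   trainer.train()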