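# Axolotl config: LoRA fine-tune of Qwen/Qwen1.5-0.5B on a mix of instruction datasets rendered as ChatML.

# Base model, tokenizer, and checkpoint format.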
base_model: Qwen/Qwen1.5-0.5B
model_type: Qwen2ForCausalLM
tokenizer_type: AutoTokenizer
trust_remote_code: true
save_safetensors: true

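# Load weights unquantized; compute runs in bf16 (see bf16: true below).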
load_in_8bit: false
load_in_4bit: false
strict: false

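# Training mixture: Open-Platypus (alpaca format), OpenHermes-2.5 (sharegpt format),
# and databricks-dolly-15k rendered with a hand-written ChatML template
# (format is used for rows with a context field, no_input_format for rows without one).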
datasets:
  - path: garage-bAInd/Open-Platypus
    type: alpaca
    prompt_style: chatml
  - path: teknium/OpenHermes-2.5
    type: sharegpt
    conversation: qwen-7b-chat
  - path: databricks/databricks-dolly-15k
    type:
      field_system: ""
      field_instruction: instruction
      field_input: context
      field_output: response
      format: |-
        <|im_start|>system
        You are a helpful assistant. Please give a concise and accurate answer.<|im_end|>
        <|im_start|>user
        {instruction} {input}<|im_end|>
        <|im_start|>assistant
      no_input_format: |-
        <|im_start|>system
        You are a helpful assistant. Please give a concise and accurate answer.<|im_end|>
        <|im_start|>user
        {instruction}<|im_end|>
        <|im_start|>assistant
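# Shuffle the merged mixture, hold out 4% as a validation split, and use the ChatML chat template throughout.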
shuffle_merged_datasets: true
val_set_size: 0.04
chat_template: chatml
default_system_message: "You are a helpful assistant. Please give a concise and accurate answer."
output_dir: ./qwen_out

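# 2048-token context with sample packing for training; evaluation batches stay unpacked.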
sequence_len: 2048
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

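# LoRA adapter. lora_target_linear: true targets every linear layer, which already covers the
# explicit q_proj/v_proj list; embed_tokens and lm_head are trained in full and saved with the adapter.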
adapter: lora
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
  - q_proj
  - v_proj
lora_target_linear: true
lora_modules_to_save:
  - embed_tokens
  - lm_head

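# Weights & Biases tracking; each saved checkpoint is also logged to W&B.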
wandb_project: qwen-0.5b-lora
wandb_name: qwen-lora
wandb_log_model: checkpoint

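# Optimization: fused AdamW with cosine decay; effective per-GPU batch = micro_batch_size * gradient_accumulation_steps = 16.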
gradient_accumulation_steps: 16
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002
max_grad_norm: 1.0

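# Compute loss only on assistant outputs (prompt tokens are masked); train in bf16.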
train_on_inputs: false
group_by_length: false
bf16: true

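# Runtime: log every step, no gradient checkpointing or flash attention;
# DeepSpeed ZeRO stage 1 (per the referenced JSON) handles multi-GPU training.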
gradient_checkpointing: false
logging_steps: 1
flash_attention: false
deepspeed: deepspeed_configs/zero1.json

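# 4 warmup steps; evals_per_epoch: 0 disables mid-training evaluation despite the validation split;
# one checkpoint is saved per epoch.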
warmup_steps: 4
evals_per_epoch: 0
saves_per_epoch: 1
weight_decay: 0.01