# accelerate launch ./scripts/finetune.py 2-PKTDC-llama-13B-gptq-lora-24gb.yml
#
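# (assumes `accelerate` has already been configured on this machine, e.g. by running
#  `accelerate config` once, so the launcher knows the GPU and mixed-precision setup)
#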
# base model settings (local or huggingface repo)
base_model: PocketDoc/llama-13b-gptq-4bit-128g
base_model_config: PocketDoc/llama-13b-gptq-4bit-128g
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
trust_remote_code:

# wandb configuration
wandb_project: llama-13b-gptq-4bit-128g-lora
wandb_watch:
wandb_run_id:
wandb_log_model: 

# where to save the finished model
output_dir: ./llama-13b-gptq-4bit-128g-lora

# dataset settings (local or huggingface repo)
datasets:
  - path: dansmeth.json
    type: pygmalion
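# rough sketch of what a record in dansmeth.json is assumed to look like for the
# pygmalion prompt format -- the field names here are an assumption and may differ
# between axolotl versions, so check the pygmalion prompt strategy you are running:
# {"conversations": [{"role": "system", "value": "..."},
#                    {"role": "human",  "value": "..."},
#                    {"role": "model",  "value": "..."}]}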

dataset_prepared_path: data/last_run_prepared

# fraction of the dataset to set aside for evaluation (0.02 = 2%)
val_set_size: 0.02

# maximum token length per prompt
sequence_len: 2048

# maximum sequence length to pack (concatenate) training samples together up to
# inspired by StackLLaMA; see https://huggingface.co/blog/stackllama#supervised-fine-tuning
max_packed_sequence_len: 2048

# quantized model loading settings
gptq: true
gptq_groupsize: 128 # group size the model was quantized with
gptq_model_v1: false # true for v1-format checkpoints, false for v2
strict: false

# attempt to load the model in 8-bit precision and use the 8-bit Adam optimizer
load_in_8bit: true

load_in_4bit: 

# Use CUDA bf16
bf16: false
# Use CUDA fp16
fp16: true
# Use CUDA tf32
tf32: true

# training hyperparameters
gradient_accumulation_steps: 30
micro_batch_size: 6
eval_batch_size: 6
num_epochs: 12
warmup_steps: 10
learning_rate: 0.000004
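# with these settings the effective batch size is
# micro_batch_size x gradient_accumulation_steps = 6 x 30 = 180 samples
# per optimizer step per GPU (multiply by GPU count for multi-GPU runs)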

logging_steps: 1
eval_steps: 5
save_steps: 10

# stop training after this many evaluation losses have increased in a row
# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
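# e.g. early_stopping_patience: 3 stops training after three consecutive evaluations without improvement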
early_stopping_patience: 
# learning rate scheduler to use with the optimizer, e.g. one_cycle or linear
lr_scheduler: linear
# specify optimizer
optimizer: paged_adamw_8bit
# specify weight decay
weight_decay: 0.0001


# if you already have a lora model trained that you want to load, put that here
lora_model_dir:

# LoRA hyperparameters
adapter: lora # blank for full finetune
lora_r: 32
lora_alpha: 64
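# note: standard LoRA scales the adapter update by lora_alpha / lora_r,
# so with the values above the scaling factor is 64 / 32 = 2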
lora_dropout: 0.05
lora_target_linear: 
lora_target_modules:
  - q_proj
  - v_proj
#  - k_proj
#  - o_proj
#  - gate_proj
#  - down_proj
#  - up_proj
lora_modules_to_save:
#  - embed_tokens
#  - lm_head
lora_out_dir:
lora_fan_in_fan_out: false


# whether to include the human's prompt in the training labels (false masks it out)
train_on_inputs: false
# groups samples of similar length together; anecdotal reports say this can make training wonky
group_by_length: true


# does not work with the current implementation of 4-bit LoRA
gradient_checkpointing: true


# whether to use the xformers attention patch (https://github.com/facebookresearch/xformers)
xformers_attention: true
# whether to use the flash attention patch (https://github.com/HazyResearch/flash-attention)
flash_attention:  # requires an A100 for LLaMA
# whether to use scaled-dot-product attention
# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
sdp_attention: 


# resume from a specific checkpoint dir
resume_from_checkpoint: 
# if resume_from_checkpoint isn't set and you simply want training to resume where it left off
# be careful leaving this enabled when switching between different models
auto_resume_from_checkpoints: false


# don't mess with this, it's here for accelerate and torchrun
local_rank:

# add or change special tokens
special_tokens:
  # sys_role_token: "<|system|>"
  # user_role_token: "<|user|>"
  # model_role_token: "<|model|>"
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
  
# add extra tokens
tokens:
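# example extra tokens (assumption: only needed if you want the role markers from the
# commented special_tokens above added to the tokenizer as regular tokens instead):
#   - "<|system|>"
#   - "<|user|>"
#   - "<|model|>"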


# FSDP
fsdp:

fsdp_config:

# Deepspeed
deepspeed: 
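# example (hypothetical path -- point this at whatever DeepSpeed ZeRO JSON config you use):
# deepspeed: ds_config_zero2.json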

# TODO
torchdistx_path:

# Debug mode
debug: