End of training

Files changed:
- README.md: +29 -181
- adapter_model.bin: +1 -1
README.md CHANGED

````diff
@@ -20,123 +20,15 @@ should probably proofread and complete it, then remove this comment. -->
 
 axolotl version: `0.6.0`
 ```yaml
-# base_model: meta-llama/Llama-3.2-1B-Instruct
-# # Automatically upload checkpoint and final model to HF
-# # hub_model_id: kweinmeister/Llama-3.2-1B-Instruct-MetaMathQA
-# hub_model_id: kweinmeister/Llama-3.2-1B-Instruct-gsm8k
-
-# load_in_8bit: false
-# load_in_4bit: true
-# strict: false
-
-
-# datasets:
-#   - path: openai/gsm8k
-#     type: alpaca_chat.load_qa
-#     name: "main"
-#     train_on_split: "train"
-
-
-# # datasets:
-# #   - path: meta-math/MetaMathQA
-# #     type:
-# #       field_instruction: query
-# #       field_output: response
-
-# val_set_size: 0.1
-# # output_dir: "/mnt/disks/gcs/axolotl/outputs/out"
-# output_dir: "/mnt/disks/gcs/axolotl/outputs/gsm8k-out"
-# # output_dir: "/mnt/disks/gcs/axolotl/outputs/MetaMathQA-out"
-
-# adapter: qlora
-# lora_model_dir:
-
-# sequence_len: 2048
-# sample_packing: true
-# eval_sample_packing: true
-# pad_to_sequence_len: true
-
-# lora_r: 32
-# lora_alpha: 16
-# lora_dropout: 0.05
-# lora_fan_in_fan_out:
-# lora_target_modules:
-#   - gate_proj
-#   - down_proj
-#   - up_proj
-#   - q_proj
-#   - v_proj
-#   - k_proj
-#   - o_proj
-
-# wandb_project:
-# wandb_entity:
-# wandb_watch:
-# wandb_name:
-# wandb_log_model:
-
-# gradient_accumulation_steps: 4
-# micro_batch_size: 2
-# num_epochs: 3
-# # optimizer: adamw_bnb_8bit
-# optimizer: adamw_torch
-# lr_scheduler: cosine
-# learning_rate: 2e-5
-
-# train_on_inputs: false
-# group_by_length: false
-# bf16: auto
-# fp16:
-# tf32: false
-
-# # gradient_checkpointing: true
-# gradient_checkpointing: false
-# early_stopping_patience:
-# resume_from_checkpoint:
-# local_rank:
-# logging_steps: 1
-# xformers_attention:
-# flash_attention: true
-
-# loss_watchdog_threshold: 5.0
-# loss_watchdog_patience: 3
-
-# warmup_steps: 10
-# evals_per_epoch: 4
-# eval_table_size:
-# eval_max_new_tokens: 128
-# saves_per_epoch: 1
-# debug:
-# deepspeed:
-# weight_decay: 0.0
-# # fsdp:
-# # fsdp_config:
-# fsdp:
-#   - full_shard
-#   - auto_wrap
-# fsdp_config:
-#   fsdp_limit_all_gathers: true
-#   fsdp_sync_module_states: true
-#   fsdp_offload_params: true
-#   fsdp_use_orig_params: false
-#   fsdp_cpu_ram_efficient_loading: true
-#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-#   fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
-#   fsdp_state_dict_type: FULL_STATE_DICT
-#   fsdp_sharding_strategy: FULL_SHARD
-#   fsdp_activation_checkpointing: true
-# special_tokens:
-#   # pad_token: "<|end_of_text|>"
-# special_tokens:
-#   bos_token: "<|begin_of_text|>"
-#   eos_token: "<|eot_id|>"
-#   pad_token: "<|finetune_right_pad_id|>"
-
 base_model: google/gemma-2-27b-it
-
-
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
 hub_model_id: kweinmeister/gemma-2-27b-it-dolly-15k
 
+# https://github.com/vllm-project/vllm/issues/10590
+bnb_config_kwargs:
+  bnb_4bit_quant_storage: uint8
+
 load_in_8bit: false
 load_in_4bit: true
 strict: false
@@ -152,7 +44,6 @@ val_set_size: 0.1
 output_dir: "/mnt/disks/gcs/axolotl/outputs/dolly-15k-out"
 
 adapter: qlora
-
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
@@ -160,26 +51,22 @@ lora_target_linear: true
 
 sequence_len: 2048
 sample_packing: true
-
+eval_sample_packing: false
 pad_to_sequence_len: true
 
 gradient_accumulation_steps: 4
-micro_batch_size:
+micro_batch_size: 1
 num_epochs: 3
-# optimizer: adamw_bnb_8bit
 optimizer: adamw_torch
 lr_scheduler: cosine
 learning_rate: 2e-5
 
-
 train_on_inputs: false
 group_by_length: false
 bf16: auto
 fp16:
-tf32:
-
+tf32: true
 
-# gradient_checkpointing: false
 gradient_checkpointing: true
 early_stopping_patience:
 resume_from_checkpoint:
@@ -188,56 +75,17 @@ logging_steps: 1
 xformers_attention:
 flash_attention: false
 
-# loss_watchdog_threshold: 5.0
-# loss_watchdog_patience: 3
-
-
 warmup_ratio: 0.1
 evals_per_epoch: 4
 eval_max_new_tokens: 128
 saves_per_epoch: 1
 debug:
-# deepspeed:
-weight_decay: 0.0
-
 deepspeed: deepspeed_configs/zero1.json
+weight_decay: 0.0
 
 fsdp:
 fsdp_config:
-
-#   - full_shard
-#   - auto_wrap
-
-# fsdp_config:
-#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-#   fsdp_backward_prefetch: BACKWARD_PRE
-#   fsdp_cpu_ram_efficient_loading: true
-#   fsdp_forward_prefetch: false
-#   fsdp_offload_params: true
-#   fsdp_sharding_strategy: FULL_SHARD
-#   fsdp_state_dict_type: SHARDED_STATE_DICT
-#   fsdp_transformer_layer_cls_to_wrap: GemmaDecoderLayer
-#   fsdp_sync_module_states: true
-#   fsdp_use_orig_params: true
-
-
-# fsdp_config:
-#   fsdp_limit_all_gathers: true
-#   fsdp_sync_module_states: true
-#   fsdp_offload_params: true
-#   fsdp_use_orig_params: false
-#   fsdp_cpu_ram_efficient_loading: true
-#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-#   fsdp_transformer_layer_cls_to_wrap: GemmaDecoderLayer
-#   fsdp_state_dict_type: FULL_STATE_DICT
-#   fsdp_sharding_strategy: FULL_SHARD
-#   fsdp_activation_checkpointing: true
-# special_tokens:
-#   # pad_token: "<|end_of_text|>"
-# special_tokens:
-#   bos_token: "<|begin_of_text|>"
-#   eos_token: "<|eot_id|>"
-#   pad_token: "<|finetune_right_pad_id|>"
+
 ```
 
 </details><br>
@@ -246,7 +94,7 @@ fsdp_config:
 
 This model is a fine-tuned version of [google/gemma-2-27b-it](https://huggingface.co/google/gemma-2-27b-it) on the databricks/databricks-dolly-15k dataset.
 It achieves the following results on the evaluation set:
-- Loss: 1.
+- Loss: 1.4649
 
 ## Model description
 
@@ -266,35 +114,35 @@ More information needed
 
 The following hyperparameters were used during training:
 - learning_rate: 2e-05
-- train_batch_size:
-- eval_batch_size:
+- train_batch_size: 1
+- eval_batch_size: 1
 - seed: 42
 - distributed_type: multi-GPU
 - num_devices: 2
 - gradient_accumulation_steps: 4
-- total_train_batch_size:
-- total_eval_batch_size:
+- total_train_batch_size: 8
+- total_eval_batch_size: 2
 - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
-- lr_scheduler_warmup_steps:
+- lr_scheduler_warmup_steps: 46
 - num_epochs: 3
 
 ### Training results
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
-|
-| 3.
-|
-|
-| 1.
-| 1.
-| 1.
-| 1.
-| 1.
-| 1.
-| 1.
-| 1.
+| 4.0853 | 0.0065 | 1 | 2.5485 |
+| 3.4071 | 0.2524 | 39 | 2.1938 |
+| 1.9159 | 0.5049 | 78 | 1.6474 |
+| 1.6968 | 0.7573 | 117 | 1.5546 |
+| 1.7757 | 1.0129 | 156 | 1.5193 |
+| 1.7768 | 1.2654 | 195 | 1.4965 |
+| 1.3735 | 1.5178 | 234 | 1.4835 |
+| 1.7285 | 1.7702 | 273 | 1.4744 |
+| 1.6601 | 2.0259 | 312 | 1.4701 |
+| 1.6477 | 2.2783 | 351 | 1.4657 |
+| 1.3795 | 2.5307 | 390 | 1.4645 |
+| 1.6575 | 2.7832 | 429 | 1.4649 |
 
 
 ### Framework versions
````
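A quick cross-check of the autogenerated hyperparameters against the config: the effective batch size is the per-device micro batch size times gradient accumulation times the number of GPUs, and the reported 46 warmup steps are roughly `warmup_ratio: 0.1` of the total optimizer steps implied by the results table. The sketch below is illustrative only and not part of the commit; the steps-per-epoch figure is estimated from the last results row.

```python
# Sanity check of the reported hyperparameters (illustrative, not part of the commit).
micro_batch_size = 1   # micro_batch_size in the config / train_batch_size in the card
grad_accum = 4         # gradient_accumulation_steps
num_devices = 2        # num_devices reported in the card

effective_batch = micro_batch_size * grad_accum * num_devices
print(effective_batch)                      # 8  -> total_train_batch_size

steps_per_epoch = 429 / 2.7832              # last results row: step 429 at epoch 2.7832
warmup_steps = round(0.1 * steps_per_epoch * 3)
print(warmup_steps)                         # 46 -> lr_scheduler_warmup_steps
```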
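The commit also replaces the QLoRA adapter weights tracked below. As a usage illustration only (nothing in this commit ships inference code), here is a minimal sketch of loading that adapter on top of the base model with `transformers` and `peft`; the 4-bit settings mirror the `load_in_4bit` / `adapter: qlora` config above, while the prompt and generation arguments are placeholders.

```python
# Hypothetical usage sketch -- not part of this commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

base_id = "google/gemma-2-27b-it"                     # base model from the config
adapter_id = "kweinmeister/gemma-2-27b-it-dolly-15k"  # this repository

# Load the base model in 4-bit, mirroring the QLoRA training setup.
bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(
    base_id, quantization_config=bnb, device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_id)  # attach the LoRA adapter

prompt = "What kind of dataset is databricks-dolly-15k?"  # placeholder prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```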
adapter_model.bin CHANGED

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:661b80aaae193a2bc65f5ebb67429f6c202da3bca1f700c37e0d8c4737584c7c
 size 456822394
```
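The updated Git LFS pointer above records the new adapter's SHA-256 object id and unchanged size. To confirm that a downloaded `adapter_model.bin` matches this revision, one option is to hash it locally (the local path is an assumption):

```python
import hashlib

# Compare a downloaded adapter_model.bin against the sha256 oid in the LFS pointer.
expected = "661b80aaae193a2bc65f5ebb67429f6c202da3bca1f700c37e0d8c4737584c7c"

h = hashlib.sha256()
with open("adapter_model.bin", "rb") as f:            # assumed local path
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
print(h.hexdigest() == expected)                      # True if the file matches this commit
```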