kweinmeister committed · Commit ebaadec · verified · 1 Parent(s): 11837e8

End of training

Files changed (2):
  1. README.md +29 -181
  2. adapter_model.bin +1 -1
README.md CHANGED
@@ -20,123 +20,15 @@ should probably proofread and complete it, then remove this comment. -->
 
 axolotl version: `0.6.0`
 ```yaml
-# base_model: meta-llama/Llama-3.2-1B-Instruct
-# # Automatically upload checkpoint and final model to HF
-# # hub_model_id: kweinmeister/Llama-3.2-1B-Instruct-MetaMathQA
-# hub_model_id: kweinmeister/Llama-3.2-1B-Instruct-gsm8k
-
-# load_in_8bit: false
-# load_in_4bit: true
-# strict: false
-
-
-# datasets:
-#   - path: openai/gsm8k
-#     type: alpaca_chat.load_qa
-#     name: "main"
-#     train_on_split: "train"
-
-
-# # datasets:
-# #   - path: meta-math/MetaMathQA
-# #     type:
-# #       field_instruction: query
-# #       field_output: response
-
-# val_set_size: 0.1
-# # output_dir: "/mnt/disks/gcs/axolotl/outputs/out"
-# output_dir: "/mnt/disks/gcs/axolotl/outputs/gsm8k-out"
-# # output_dir: "/mnt/disks/gcs/axolotl/outputs/MetaMathQA-out"
-
-# adapter: qlora
-# lora_model_dir:
-
-# sequence_len: 2048
-# sample_packing: true
-# eval_sample_packing: true
-# pad_to_sequence_len: true
-
-# lora_r: 32
-# lora_alpha: 16
-# lora_dropout: 0.05
-# lora_fan_in_fan_out:
-# lora_target_modules:
-#   - gate_proj
-#   - down_proj
-#   - up_proj
-#   - q_proj
-#   - v_proj
-#   - k_proj
-#   - o_proj
-
-# wandb_project:
-# wandb_entity:
-# wandb_watch:
-# wandb_name:
-# wandb_log_model:
-
-# gradient_accumulation_steps: 4
-# micro_batch_size: 2
-# num_epochs: 3
-# # optimizer: adamw_bnb_8bit
-# optimizer: adamw_torch
-# lr_scheduler: cosine
-# learning_rate: 2e-5
-
-# train_on_inputs: false
-# group_by_length: false
-# bf16: auto
-# fp16:
-# tf32: false
-
-# # gradient_checkpointing: true
-# gradient_checkpointing: false
-# early_stopping_patience:
-# resume_from_checkpoint:
-# local_rank:
-# logging_steps: 1
-# xformers_attention:
-# flash_attention: true
-
-# loss_watchdog_threshold: 5.0
-# loss_watchdog_patience: 3
-
-# warmup_steps: 10
-# evals_per_epoch: 4
-# eval_table_size:
-# eval_max_new_tokens: 128
-# saves_per_epoch: 1
-# debug:
-# deepspeed:
-# weight_decay: 0.0
-# # fsdp:
-# # fsdp_config:
-# fsdp:
-#   - full_shard
-#   - auto_wrap
-# fsdp_config:
-#   fsdp_limit_all_gathers: true
-#   fsdp_sync_module_states: true
-#   fsdp_offload_params: true
-#   fsdp_use_orig_params: false
-#   fsdp_cpu_ram_efficient_loading: true
-#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-#   fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
-#   fsdp_state_dict_type: FULL_STATE_DICT
-#   fsdp_sharding_strategy: FULL_SHARD
-#   fsdp_activation_checkpointing: true
-# special_tokens:
-# # pad_token: "<|end_of_text|>"
-# special_tokens:
-#   bos_token: "<|begin_of_text|>"
-#   eos_token: "<|eot_id|>"
-#   pad_token: "<|finetune_right_pad_id|>"
-
 base_model: google/gemma-2-27b-it
-# model_type: AutoModelForCausalLM
-# tokenizer_type: AutoTokenizer
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
 hub_model_id: kweinmeister/gemma-2-27b-it-dolly-15k
 
+# https://github.com/vllm-project/vllm/issues/10590
+bnb_config_kwargs:
+  bnb_4bit_quant_storage: uint8
+
 load_in_8bit: false
 load_in_4bit: true
 strict: false
@@ -152,7 +44,6 @@ val_set_size: 0.1
 output_dir: "/mnt/disks/gcs/axolotl/outputs/dolly-15k-out"
 
 adapter: qlora
-
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
@@ -160,26 +51,22 @@ lora_target_linear: true
 
 sequence_len: 2048
 sample_packing: true
-# eval_sample_packing: true
+eval_sample_packing: false
 pad_to_sequence_len: true
 
 gradient_accumulation_steps: 4
-micro_batch_size: 2
+micro_batch_size: 1
 num_epochs: 3
-# optimizer: adamw_bnb_8bit
 optimizer: adamw_torch
 lr_scheduler: cosine
 learning_rate: 2e-5
 
-
 train_on_inputs: false
 group_by_length: false
 bf16: auto
 fp16:
-tf32: false
-
+tf32: true
 
-# gradient_checkpointing: false
 gradient_checkpointing: true
 early_stopping_patience:
 resume_from_checkpoint:
@@ -188,56 +75,17 @@ logging_steps: 1
 xformers_attention:
 flash_attention: false
 
-# loss_watchdog_threshold: 5.0
-# loss_watchdog_patience: 3
-
-
 warmup_ratio: 0.1
 evals_per_epoch: 4
 eval_max_new_tokens: 128
 saves_per_epoch: 1
 debug:
-# deepspeed:
-weight_decay: 0.0
-
 deepspeed: deepspeed_configs/zero1.json
+weight_decay: 0.0
 
 fsdp:
 fsdp_config:
-# fsdp:
-#   - full_shard
-#   - auto_wrap
-
-# fsdp_config:
-#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-#   fsdp_backward_prefetch: BACKWARD_PRE
-#   fsdp_cpu_ram_efficient_loading: true
-#   fsdp_forward_prefetch: false
-#   fsdp_offload_params: true
-#   fsdp_sharding_strategy: FULL_SHARD
-#   fsdp_state_dict_type: SHARDED_STATE_DICT
-#   fsdp_transformer_layer_cls_to_wrap: GemmaDecoderLayer
-#   fsdp_sync_module_states: true
-#   fsdp_use_orig_params: true
-
-
-# fsdp_config:
-#   fsdp_limit_all_gathers: true
-#   fsdp_sync_module_states: true
-#   fsdp_offload_params: true
-#   fsdp_use_orig_params: false
-#   fsdp_cpu_ram_efficient_loading: true
-#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-#   fsdp_transformer_layer_cls_to_wrap: GemmaDecoderLayer
-#   fsdp_state_dict_type: FULL_STATE_DICT
-#   fsdp_sharding_strategy: FULL_SHARD
-#   fsdp_activation_checkpointing: true
-# special_tokens:
-# # pad_token: "<|end_of_text|>"
-# special_tokens:
-#   bos_token: "<|begin_of_text|>"
-#   eos_token: "<|eot_id|>"
-#   pad_token: "<|finetune_right_pad_id|>"
+
 ```
 
 </details><br>
@@ -246,7 +94,7 @@ fsdp_config:
 
 This model is a fine-tuned version of [google/gemma-2-27b-it](https://huggingface.co/google/gemma-2-27b-it) on the databricks/databricks-dolly-15k dataset.
 It achieves the following results on the evaluation set:
-- Loss: 1.6809
+- Loss: 1.4649
 
 ## Model description
 
@@ -266,35 +114,35 @@ More information needed
 
 The following hyperparameters were used during training:
 - learning_rate: 2e-05
-- train_batch_size: 2
-- eval_batch_size: 2
+- train_batch_size: 1
+- eval_batch_size: 1
 - seed: 42
 - distributed_type: multi-GPU
 - num_devices: 2
 - gradient_accumulation_steps: 4
-- total_train_batch_size: 16
-- total_eval_batch_size: 4
+- total_train_batch_size: 8
+- total_eval_batch_size: 2
 - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
-- lr_scheduler_warmup_steps: 23
+- lr_scheduler_warmup_steps: 46
 - num_epochs: 3
 
 ### Training results
 
 | Training Loss | Epoch  | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
-| 3.8741        | 0.0129 | 1    | 4.1287          |
-| 3.5275        | 0.2589 | 20   | 3.7627          |
-| 2.5496        | 0.5178 | 40   | 2.5361          |
-| 2.1047        | 0.7767 | 60   | 2.0215          |
-| 1.8435        | 1.0259 | 80   | 1.8475          |
-| 1.8821        | 1.2848 | 100  | 1.7748          |
-| 1.834         | 1.5437 | 120  | 1.7345          |
-| 1.7633        | 1.8026 | 140  | 1.7098          |
-| 1.6382        | 2.0647 | 160  | 1.6954          |
-| 1.9356        | 2.3236 | 180  | 1.6863          |
-| 1.6196        | 2.5825 | 200  | 1.6819          |
-| 1.7489        | 2.8414 | 220  | 1.6809          |
+| 4.0853        | 0.0065 | 1    | 2.5485          |
+| 3.4071        | 0.2524 | 39   | 2.1938          |
+| 1.9159        | 0.5049 | 78   | 1.6474          |
+| 1.6968        | 0.7573 | 117  | 1.5546          |
+| 1.7757        | 1.0129 | 156  | 1.5193          |
+| 1.7768        | 1.2654 | 195  | 1.4965          |
+| 1.3735        | 1.5178 | 234  | 1.4835          |
+| 1.7285        | 1.7702 | 273  | 1.4744          |
+| 1.6601        | 2.0259 | 312  | 1.4701          |
+| 1.6477        | 2.2783 | 351  | 1.4657          |
+| 1.3795        | 2.5307 | 390  | 1.4645          |
+| 1.6575        | 2.7832 | 429  | 1.4649          |
 
 
 ### Framework versions
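
For context on the hyperparameter changes above, the sketch below works through the batch-size and warmup arithmetic the new values imply. It assumes the usual Hugging Face Trainer convention (effective batch = per-device batch × gradient accumulation × device count) and an approximate steps-per-epoch figure read off the training-results table; the variable names are illustrative, not part of the config.

```python
# Batch-size and warmup arithmetic implied by the updated README values.
micro_batch_size = 1        # micro_batch_size: 1 (per-device train batch size)
gradient_accumulation = 4   # gradient_accumulation_steps: 4
num_devices = 2             # num_devices: 2
num_epochs = 3              # num_epochs: 3
warmup_ratio = 0.1          # warmup_ratio: 0.1

effective_batch = micro_batch_size * gradient_accumulation * num_devices
print(effective_batch)      # 8 -> matches "total_train_batch_size: 8"

# The results table logs step 429 at epoch ~2.78, i.e. roughly 154 optimizer
# steps per epoch, so total steps come to about 462 and a 0.1 warmup ratio
# rounds to ~46 steps, matching "lr_scheduler_warmup_steps: 46".
steps_per_epoch = 154       # approximate, inferred from the table
print(round(steps_per_epoch * num_epochs * warmup_ratio))  # 46
```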
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:89ea0874183b876bfbac6d55eff0dbfeaf282abd3be07d67d0ef8029990dc192
+oid sha256:661b80aaae193a2bc65f5ebb67429f6c202da3bca1f700c37e0d8c4737584c7c
 size 456822394
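
For anyone who wants to try the adapter uploaded in this commit, here is a minimal inference sketch assuming the `transformers`, `peft`, and `bitsandbytes` packages and a CUDA GPU; the repository ids come from the config above (`base_model`, `hub_model_id`), while the prompt and generation settings are illustrative assumptions.

```python
# Minimal sketch: load the 4-bit base model and apply the uploaded QLoRA adapter.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

base_id = "google/gemma-2-27b-it"                      # base_model in the config
adapter_id = "kweinmeister/gemma-2-27b-it-dolly-15k"   # hub_model_id in the config

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                        # load_in_4bit: true
    bnb_4bit_compute_dtype=torch.bfloat16,    # assumption, consistent with bf16: auto
    bnb_4bit_quant_storage=torch.uint8,       # mirrors bnb_4bit_quant_storage: uint8
)

tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(
    base_id, quantization_config=bnb_config, device_map="auto"
)
model = PeftModel.from_pretrained(base, adapter_id)

prompt = "Give me three tips for brewing better coffee."  # illustrative prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```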