juyongjiang committed
Commit 9d869fd
Parent: e2dc056

update model checkpoint

README.md CHANGED
@@ -1,14 +1,11 @@
 ---
-license: gemma
 library_name: peft
 tags:
 - alignment-handbook
-- trl
-- sft
 - generated_from_trainer
-base_model: google/gemma-7b
 datasets:
 - llama-duo/synth_summarize_dataset_dedup
+base_model: google/gemma-7b
 model-index:
 - name: gemma7b-summarize-gpt4o-1k
   results: []
@@ -21,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the llama-duo/synth_summarize_dataset_dedup dataset.
 It achieves the following results on the evaluation set:
-- Loss: 2.8104
+- Loss: 8.6199
 
 ## Model description
 
@@ -42,13 +39,13 @@ More information needed
 The following hyperparameters were used during training:
 - learning_rate: 0.0002
 - train_batch_size: 4
-- eval_batch_size: 4
+- eval_batch_size: 2
 - seed: 42
 - distributed_type: multi-GPU
-- num_devices: 2
+- num_devices: 8
 - gradient_accumulation_steps: 2
-- total_train_batch_size: 16
-- total_eval_batch_size: 8
+- total_train_batch_size: 64
+- total_eval_batch_size: 16
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
@@ -56,24 +53,24 @@ The following hyperparameters were used during training:
 
 ### Training results
 
-| Training Loss | Epoch  | Step | Validation Loss |
-|:-------------:|:------:|:----:|:---------------:|
-| 37.3003       | 0.9231 | 6    | 12.3240         |
-| 21.0453       | 2.0    | 13   | 7.5884          |
-| 17.809        | 2.9231 | 19   | 5.6549          |
-| 3.5386        | 4.0    | 26   | 3.1607          |
-| 2.02          | 4.9231 | 32   | 2.8709          |
-| 1.732         | 6.0    | 39   | 2.7893          |
-| 1.4072        | 6.9231 | 45   | 2.7970          |
-| 1.3061        | 8.0    | 52   | 2.8125          |
-| 1.2613        | 8.9231 | 58   | 2.8098          |
-| 1.2304        | 9.2308 | 60   | 2.8104          |
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 45.5656       | 1.0   | 2    | 16.5046         |
+| 45.5656       | 2.0   | 4    | 14.2000         |
+| 35.6654       | 3.0   | 6    | 12.9944         |
+| 35.6654       | 4.0   | 8    | 11.5695         |
+| 22.2461       | 5.0   | 10   | 10.3065         |
+| 22.2461       | 6.0   | 12   | 9.3645          |
+| 22.2461       | 7.0   | 14   | 8.9071          |
+| 19.7508       | 8.0   | 16   | 8.6934          |
+| 19.7508       | 9.0   | 18   | 8.6287          |
+| 19.172        | 10.0  | 20   | 8.6199          |
 
 
 ### Framework versions
 
 - PEFT 0.10.0
 - Transformers 4.40.0
-- Pytorch 2.2.1+cu121
+- Pytorch 2.1.2+cu121
 - Datasets 2.18.0
 - Tokenizers 0.19.1
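
A quick arithmetic check on the hyperparameter changes above: the total batch sizes follow directly from the per-device settings. A minimal sketch in plain Python:

```python
# Effective batch sizes implied by the updated hyperparameters.
train_batch_size = 4            # per-device train batch size
eval_batch_size = 2             # per-device eval batch size
num_devices = 8
gradient_accumulation_steps = 2

total_train = train_batch_size * num_devices * gradient_accumulation_steps
total_eval = eval_batch_size * num_devices  # no gradient accumulation at eval

print(total_train)  # 64 -> matches total_train_batch_size
print(total_eval)   # 16 -> matches total_eval_batch_size
```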
adapter_config.json CHANGED
@@ -20,12 +20,7 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "k_proj",
-    "up_proj",
-    "o_proj",
-    "down_proj",
     "q_proj",
-    "gate_proj",
     "v_proj"
   ],
   "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2b104e6f5540b08ace63471b211aa65fe88708966056627cf08b1936b4b1d53a
-size 50056096
+oid sha256:8ba5026dc8a4332f1bda391e69642e0e08f75c80cded8fc13e67619b6649812b
+size 6437384
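
The drop from ~50 MB to ~6.4 MB is consistent with the reduced `target_modules`. A back-of-the-envelope sketch, assuming rank-8 LoRA weights stored in a 2-byte dtype and the published Gemma-7B shapes (28 layers, hidden size 3072, 16 attention heads of dimension 256, intermediate size 24576); the rank and dtype are inferences, not shown in this diff:

```python
# LoRA adds r * (d_in + d_out) parameters per targeted matrix per layer.
r, layers, hidden = 8, 28, 3072  # assumed rank; Gemma-7B shapes
attn_out = 16 * 256              # 4096: q/k/v/o projection width
inter = 24576                    # Gemma-7B intermediate size

# New config: q_proj and v_proj only (both hidden -> attn_out).
new_params = layers * 2 * r * (hidden + attn_out)
print(new_params * 2)  # ~6.42e6 bytes, close to the new 6437384-byte file

# Old config additionally had k_proj, o_proj, gate_proj, up_proj, down_proj.
old_params = new_params + layers * r * (
    (hidden + attn_out)     # k_proj
    + (attn_out + hidden)   # o_proj
    + 2 * (hidden + inter)  # gate_proj, up_proj
    + (inter + hidden)      # down_proj
)
print(old_params * 2)  # ~5.00e7 bytes, close to the old 50056096-byte file
```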
all_results.json CHANGED
@@ -1,14 +1,14 @@
 {
-    "epoch": 9.23076923076923,
-    "eval_loss": 2.8104407787323,
-    "eval_runtime": 2.0654,
+    "epoch": 10.0,
+    "eval_loss": 8.61988639831543,
+    "eval_runtime": 0.2332,
     "eval_samples": 25,
-    "eval_samples_per_second": 4.842,
-    "eval_steps_per_second": 0.968,
-    "total_flos": 4.58661949299753e+16,
-    "train_loss": 8.682566889127095,
-    "train_runtime": 414.6045,
-    "train_samples": 923,
-    "train_samples_per_second": 2.484,
-    "train_steps_per_second": 0.145
+    "eval_samples_per_second": 42.89,
+    "eval_steps_per_second": 4.289,
+    "total_flos": 6.098355666236211e+16,
+    "train_loss": 24.70357437133789,
+    "train_runtime": 50.9898,
+    "train_samples": 1009,
+    "train_samples_per_second": 21.769,
+    "train_steps_per_second": 0.392
 }
config.json CHANGED
@@ -23,9 +23,9 @@
     "_load_in_4bit": true,
     "_load_in_8bit": false,
     "bnb_4bit_compute_dtype": "bfloat16",
-    "bnb_4bit_quant_storage": "bfloat16",
+    "bnb_4bit_quant_storage": "uint8",
     "bnb_4bit_quant_type": "nf4",
-    "bnb_4bit_use_double_quant": true,
+    "bnb_4bit_use_double_quant": false,
     "llm_int8_enable_fp32_cpu_offload": false,
     "llm_int8_has_fp16_weight": false,
     "llm_int8_skip_modules": null,
eval_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-    "epoch": 9.23076923076923,
-    "eval_loss": 2.8104407787323,
-    "eval_runtime": 2.0654,
+    "epoch": 10.0,
+    "eval_loss": 8.61988639831543,
+    "eval_runtime": 0.2332,
     "eval_samples": 25,
-    "eval_samples_per_second": 4.842,
-    "eval_steps_per_second": 0.968
+    "eval_samples_per_second": 42.89,
+    "eval_steps_per_second": 4.289
 }
runs/Jun13_05-28-24_gpu1-2/events.out.tfevents.1718227795.gpu1-2.1098203.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cdd0a506e72160437f30230d4c3227102b55ed318f1b0059505260852041fe02
+size 9490
runs/Jun13_05-28-24_gpu1-2/events.out.tfevents.1718227846.gpu1-2.1098203.1 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed3a6b5fed727579c9ab25cc18830e87be1f35058e8396aea0dbb0a11ab82028
+size 354
runs/Jun13_05-43-12_gpu1-2/events.out.tfevents.1718228630.gpu1-2.1115325.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:325dfcd53bfd352d0daeed642f6bd91679bc5b6b0a03e3f41a7e5df262dcdd05
+size 9490
runs/Jun13_05-43-12_gpu1-2/events.out.tfevents.1718228682.gpu1-2.1115325.1 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8af5f66d4099cd4f2e4d3a9e694905bf6b6039f28e4bd1d81bcd158ddf908bf0
+size 354
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
-    "epoch": 9.23076923076923,
-    "total_flos": 4.58661949299753e+16,
-    "train_loss": 8.682566889127095,
-    "train_runtime": 414.6045,
-    "train_samples": 923,
-    "train_samples_per_second": 2.484,
-    "train_steps_per_second": 0.145
+    "epoch": 10.0,
+    "total_flos": 6.098355666236211e+16,
+    "train_loss": 24.70357437133789,
+    "train_runtime": 50.9898,
+    "train_samples": 1009,
+    "train_samples_per_second": 21.769,
+    "train_steps_per_second": 0.392
 }
trainer_state.json CHANGED
@@ -1,200 +1,144 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 9.23076923076923,
+  "epoch": 10.0,
   "eval_steps": 500,
-  "global_step": 60,
+  "global_step": 20,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.15384615384615385,
-      "grad_norm": 390.0,
-      "learning_rate": 3.3333333333333335e-05,
-      "loss": 40.7928,
+      "epoch": 0.5,
+      "grad_norm": 173.0,
+      "learning_rate": 0.0001,
+      "loss": 45.5656,
       "step": 1
     },
     {
-      "epoch": 0.7692307692307693,
-      "grad_norm": 69.0,
-      "learning_rate": 0.0001666666666666667,
-      "loss": 37.3003,
-      "step": 5
-    },
-    {
-      "epoch": 0.9230769230769231,
-      "eval_loss": 12.323984146118164,
-      "eval_runtime": 2.0206,
-      "eval_samples_per_second": 4.949,
-      "eval_steps_per_second": 0.99,
-      "step": 6
-    },
-    {
-      "epoch": 1.5384615384615383,
-      "grad_norm": 26.0,
-      "learning_rate": 0.00019730448705798239,
-      "loss": 21.0453,
-      "step": 10
+      "epoch": 1.0,
+      "eval_loss": 16.504554748535156,
+      "eval_runtime": 0.2461,
+      "eval_samples_per_second": 40.635,
+      "eval_steps_per_second": 4.063,
+      "step": 2
     },
     {
       "epoch": 2.0,
-      "eval_loss": 7.588448524475098,
-      "eval_runtime": 2.0384,
-      "eval_samples_per_second": 4.906,
-      "eval_steps_per_second": 0.981,
-      "step": 13
+      "eval_loss": 14.199999809265137,
+      "eval_runtime": 0.2394,
+      "eval_samples_per_second": 41.776,
+      "eval_steps_per_second": 4.178,
+      "step": 4
     },
     {
-      "epoch": 2.3076923076923075,
-      "grad_norm": 16.75,
+      "epoch": 2.5,
+      "grad_norm": 31.5,
       "learning_rate": 0.00018660254037844388,
-      "loss": 17.809,
-      "step": 15
-    },
-    {
-      "epoch": 2.9230769230769234,
-      "eval_loss": 5.654947757720947,
-      "eval_runtime": 2.0499,
-      "eval_samples_per_second": 4.878,
-      "eval_steps_per_second": 0.976,
-      "step": 19
-    },
-    {
-      "epoch": 3.076923076923077,
-      "grad_norm": 58.0,
-      "learning_rate": 0.0001686241637868734,
-      "loss": 13.3008,
-      "step": 20
+      "loss": 35.6654,
+      "step": 5
     },
     {
-      "epoch": 3.8461538461538463,
-      "grad_norm": 8.0,
-      "learning_rate": 0.00014487991802004623,
-      "loss": 3.5386,
-      "step": 25
+      "epoch": 3.0,
+      "eval_loss": 12.994392395019531,
+      "eval_runtime": 0.2412,
+      "eval_samples_per_second": 41.452,
+      "eval_steps_per_second": 4.145,
+      "step": 6
     },
     {
       "epoch": 4.0,
-      "eval_loss": 3.160724401473999,
-      "eval_runtime": 2.0507,
-      "eval_samples_per_second": 4.876,
-      "eval_steps_per_second": 0.975,
-      "step": 26
+      "eval_loss": 11.569496154785156,
+      "eval_runtime": 0.2326,
+      "eval_samples_per_second": 42.992,
+      "eval_steps_per_second": 4.299,
+      "step": 8
     },
     {
-      "epoch": 4.615384615384615,
-      "grad_norm": 3.21875,
+      "epoch": 5.0,
+      "grad_norm": 10.5,
       "learning_rate": 0.00011736481776669306,
-      "loss": 2.02,
-      "step": 30
-    },
-    {
-      "epoch": 4.923076923076923,
-      "eval_loss": 2.8709278106689453,
-      "eval_runtime": 2.0507,
-      "eval_samples_per_second": 4.876,
-      "eval_steps_per_second": 0.975,
-      "step": 32
+      "loss": 22.2461,
+      "step": 10
     },
     {
-      "epoch": 5.384615384615385,
-      "grad_norm": 3.375,
-      "learning_rate": 8.839070858747697e-05,
-      "loss": 1.732,
-      "step": 35
+      "epoch": 5.0,
+      "eval_loss": 10.306487083435059,
+      "eval_runtime": 0.2322,
+      "eval_samples_per_second": 43.073,
+      "eval_steps_per_second": 4.307,
+      "step": 10
     },
     {
       "epoch": 6.0,
-      "eval_loss": 2.789313793182373,
-      "eval_runtime": 2.0444,
-      "eval_samples_per_second": 4.891,
-      "eval_steps_per_second": 0.978,
-      "step": 39
+      "eval_loss": 9.36452579498291,
+      "eval_runtime": 0.2332,
+      "eval_samples_per_second": 42.887,
+      "eval_steps_per_second": 4.289,
+      "step": 12
     },
     {
-      "epoch": 6.153846153846154,
-      "grad_norm": 4.0,
-      "learning_rate": 6.039202339608432e-05,
-      "loss": 1.5413,
-      "step": 40
+      "epoch": 7.0,
+      "eval_loss": 8.907111167907715,
+      "eval_runtime": 0.2345,
+      "eval_samples_per_second": 42.648,
+      "eval_steps_per_second": 4.265,
+      "step": 14
     },
     {
-      "epoch": 6.923076923076923,
-      "grad_norm": 2.015625,
+      "epoch": 7.5,
+      "grad_norm": 4.5,
       "learning_rate": 3.5721239031346066e-05,
-      "loss": 1.4072,
-      "step": 45
-    },
-    {
-      "epoch": 6.923076923076923,
-      "eval_loss": 2.797020435333252,
-      "eval_runtime": 2.049,
-      "eval_samples_per_second": 4.88,
-      "eval_steps_per_second": 0.976,
-      "step": 45
-    },
-    {
-      "epoch": 7.6923076923076925,
-      "grad_norm": 1.53125,
-      "learning_rate": 1.6451218858706374e-05,
-      "loss": 1.3061,
-      "step": 50
+      "loss": 19.7508,
+      "step": 15
     },
     {
       "epoch": 8.0,
-      "eval_loss": 2.8124804496765137,
-      "eval_runtime": 2.0441,
-      "eval_samples_per_second": 4.892,
-      "eval_steps_per_second": 0.978,
-      "step": 52
+      "eval_loss": 8.693410873413086,
+      "eval_runtime": 0.2425,
+      "eval_samples_per_second": 41.229,
+      "eval_steps_per_second": 4.123,
+      "step": 16
     },
     {
-      "epoch": 8.461538461538462,
-      "grad_norm": 1.453125,
-      "learning_rate": 4.20104876845111e-06,
-      "loss": 1.2613,
-      "step": 55
+      "epoch": 9.0,
+      "eval_loss": 8.628682136535645,
+      "eval_runtime": 0.2394,
+      "eval_samples_per_second": 41.765,
+      "eval_steps_per_second": 4.177,
+      "step": 18
     },
     {
-      "epoch": 8.923076923076923,
-      "eval_loss": 2.8097996711730957,
-      "eval_runtime": 2.0484,
-      "eval_samples_per_second": 4.882,
-      "eval_steps_per_second": 0.976,
-      "step": 58
-    },
-    {
-      "epoch": 9.23076923076923,
-      "grad_norm": 1.296875,
+      "epoch": 10.0,
+      "grad_norm": 4.125,
       "learning_rate": 0.0,
-      "loss": 1.2304,
-      "step": 60
+      "loss": 19.172,
+      "step": 20
     },
     {
-      "epoch": 9.23076923076923,
-      "eval_loss": 2.8104407787323,
-      "eval_runtime": 2.0425,
-      "eval_samples_per_second": 4.896,
-      "eval_steps_per_second": 0.979,
-      "step": 60
+      "epoch": 10.0,
+      "eval_loss": 8.61988639831543,
+      "eval_runtime": 0.232,
+      "eval_samples_per_second": 43.094,
+      "eval_steps_per_second": 4.309,
+      "step": 20
     },
     {
-      "epoch": 9.23076923076923,
-      "step": 60,
-      "total_flos": 4.58661949299753e+16,
-      "train_loss": 8.682566889127095,
-      "train_runtime": 414.6045,
-      "train_samples_per_second": 2.484,
-      "train_steps_per_second": 0.145
+      "epoch": 10.0,
+      "step": 20,
+      "total_flos": 6.098355666236211e+16,
+      "train_loss": 24.70357437133789,
+      "train_runtime": 50.9898,
+      "train_samples_per_second": 21.769,
+      "train_steps_per_second": 0.392
     }
   ],
   "logging_steps": 5,
-  "max_steps": 60,
+  "max_steps": 20,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 100,
-  "total_flos": 4.58661949299753e+16,
+  "total_flos": 6.098355666236211e+16,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d621e364c8574aed19ac641cf9318bb861f0211c8e6432fdb14829c73625eadd
+oid sha256:e21acb43b827015a9b74447a5fa950df5fd32ebdb581d53558420047a768950e
 size 5176