chansung committed on
Commit
dcd4ca3
1 Parent(s): 8642953

Model save

Browse files
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: gemma
3
+ library_name: peft
4
+ tags:
5
+ - trl
6
+ - sft
7
+ - generated_from_trainer
8
+ base_model: google/gemma-2b
9
+ datasets:
10
+ - generator
11
+ model-index:
12
+ - name: gemma2b-summarize-gemini1_5flash-8k
13
+ results: []
14
+ ---
15
+
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
+ # gemma2b-summarize-gemini1_5flash-8k
20
+
21
+ This model is a fine-tuned version of [google/gemma-2b](https://huggingface.co/google/gemma-2b) on the `generator` dataset (the dataset identifier recorded in this card's metadata).
22
+ It achieves the following results on the evaluation set:
23
+ - Loss: 2.5133
24
+
25
+ ## Model description
26
+
27
+ More information needed
28
+
29
+ ## Intended uses & limitations
30
+
31
+ More information needed
32
+
33
+ ## Training and evaluation data
34
+
35
+ More information needed
36
+
37
+ ## Training procedure
38
+
39
+ ### Training hyperparameters
40
+
41
+ The following hyperparameters were used during training:
42
+ - learning_rate: 0.0002
43
+ - train_batch_size: 8
44
+ - eval_batch_size: 8
45
+ - seed: 42
46
+ - distributed_type: multi-GPU
47
+ - num_devices: 4
48
+ - gradient_accumulation_steps: 2
49
+ - total_train_batch_size: 64
50
+ - total_eval_batch_size: 32
51
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
52
+ - lr_scheduler_type: cosine
53
+ - lr_scheduler_warmup_ratio: 0.1
54
+ - num_epochs: 10
55
+
56
+ ### Training results
57
+
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:------:|:----:|:---------------:|
60
+ | 2.7544 | 0.9630 | 13 | 2.8722 |
61
+ | 1.7723 | 2.0 | 27 | 2.6064 |
62
+ | 1.4023 | 2.9630 | 40 | 2.5710 |
63
+ | 1.2778 | 4.0 | 54 | 2.5349 |
64
+ | 1.1848 | 4.9630 | 67 | 2.5176 |
65
+ | 1.1522 | 6.0 | 81 | 2.5045 |
66
+ | 1.1305 | 6.9630 | 94 | 2.5065 |
67
+ | 1.1075 | 8.0 | 108 | 2.5136 |
68
+ | 1.1049 | 8.9630 | 121 | 2.5129 |
69
+ | 1.1048 | 9.6296 | 130 | 2.5133 |
70
+
71
+
72
+ ### Framework versions
73
+
74
+ - PEFT 0.11.1
75
+ - Transformers 4.40.1
76
+ - Pytorch 2.2.0+cu121
77
+ - Datasets 2.19.2
78
+ - Tokenizers 0.19.1
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04403c70702e213b40b51a08ec413dc89fe64a46a4924056d80717e7806f25d9
3
  size 19644912
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdcdbb5f08f94646f460cb0b194766f06ecf4dc3911c357818fe8b05fd0aa04e
3
  size 19644912
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.62962962962963,
3
+ "total_flos": 1.018113810235392e+17,
4
+ "train_loss": 1.4375356710874116,
5
+ "train_runtime": 456.268,
6
+ "train_samples": 7919,
7
+ "train_samples_per_second": 18.301,
8
+ "train_steps_per_second": 0.285
9
+ }
runs/Jun05_04-41-00_7bdd13775218/events.out.tfevents.1717562483.7bdd13775218.60620.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eeaa39eb350c1555f5aaf958591bb51edcab04a3523b709678c27703b5d15f36
3
- size 11656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fe1d5ed141ea5a428a7f0149c31d790110c7191fa46a1f8319e55041803abe5
3
+ size 14059
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.62962962962963,
3
+ "total_flos": 1.018113810235392e+17,
4
+ "train_loss": 1.4375356710874116,
5
+ "train_runtime": 456.268,
6
+ "train_samples": 7919,
7
+ "train_samples_per_second": 18.301,
8
+ "train_steps_per_second": 0.285
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 9.62962962962963,
5
+ "eval_steps": 500,
6
+ "global_step": 130,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.07407407407407407,
13
+ "grad_norm": 5.0625,
14
+ "learning_rate": 1.5384615384615387e-05,
15
+ "loss": 3.0102,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.37037037037037035,
20
+ "grad_norm": 2.390625,
21
+ "learning_rate": 7.692307692307693e-05,
22
+ "loss": 3.0215,
23
+ "step": 5
24
+ },
25
+ {
26
+ "epoch": 0.7407407407407407,
27
+ "grad_norm": 11.5,
28
+ "learning_rate": 0.00015384615384615385,
29
+ "loss": 2.7544,
30
+ "step": 10
31
+ },
32
+ {
33
+ "epoch": 0.9629629629629629,
34
+ "eval_loss": 2.8721797466278076,
35
+ "eval_runtime": 0.652,
36
+ "eval_samples_per_second": 15.338,
37
+ "eval_steps_per_second": 1.534,
38
+ "step": 13
39
+ },
40
+ {
41
+ "epoch": 1.1111111111111112,
42
+ "grad_norm": 1.3671875,
43
+ "learning_rate": 0.00019985583705641418,
44
+ "loss": 2.379,
45
+ "step": 15
46
+ },
47
+ {
48
+ "epoch": 1.4814814814814814,
49
+ "grad_norm": 2.609375,
50
+ "learning_rate": 0.00019823877374156647,
51
+ "loss": 2.0489,
52
+ "step": 20
53
+ },
54
+ {
55
+ "epoch": 1.8518518518518519,
56
+ "grad_norm": 1.5234375,
57
+ "learning_rate": 0.00019485364419471454,
58
+ "loss": 1.7723,
59
+ "step": 25
60
+ },
61
+ {
62
+ "epoch": 2.0,
63
+ "eval_loss": 2.60640549659729,
64
+ "eval_runtime": 0.5469,
65
+ "eval_samples_per_second": 18.283,
66
+ "eval_steps_per_second": 1.828,
67
+ "step": 27
68
+ },
69
+ {
70
+ "epoch": 2.2222222222222223,
71
+ "grad_norm": 0.66015625,
72
+ "learning_rate": 0.0001897613727639014,
73
+ "loss": 1.6022,
74
+ "step": 30
75
+ },
76
+ {
77
+ "epoch": 2.5925925925925926,
78
+ "grad_norm": 0.65234375,
79
+ "learning_rate": 0.00018305360832480117,
80
+ "loss": 1.4855,
81
+ "step": 35
82
+ },
83
+ {
84
+ "epoch": 2.962962962962963,
85
+ "grad_norm": 0.58203125,
86
+ "learning_rate": 0.00017485107481711012,
87
+ "loss": 1.4023,
88
+ "step": 40
89
+ },
90
+ {
91
+ "epoch": 2.962962962962963,
92
+ "eval_loss": 2.5709500312805176,
93
+ "eval_runtime": 0.7217,
94
+ "eval_samples_per_second": 13.857,
95
+ "eval_steps_per_second": 1.386,
96
+ "step": 40
97
+ },
98
+ {
99
+ "epoch": 3.3333333333333335,
100
+ "grad_norm": 0.458984375,
101
+ "learning_rate": 0.0001653013984983585,
102
+ "loss": 1.3253,
103
+ "step": 45
104
+ },
105
+ {
106
+ "epoch": 3.7037037037037037,
107
+ "grad_norm": 0.50390625,
108
+ "learning_rate": 0.00015457645101945046,
109
+ "loss": 1.2778,
110
+ "step": 50
111
+ },
112
+ {
113
+ "epoch": 4.0,
114
+ "eval_loss": 2.5349316596984863,
115
+ "eval_runtime": 0.547,
116
+ "eval_samples_per_second": 18.283,
117
+ "eval_steps_per_second": 1.828,
118
+ "step": 54
119
+ },
120
+ {
121
+ "epoch": 4.074074074074074,
122
+ "grad_norm": 0.376953125,
123
+ "learning_rate": 0.00014286925614030542,
124
+ "loss": 1.2498,
125
+ "step": 55
126
+ },
127
+ {
128
+ "epoch": 4.444444444444445,
129
+ "grad_norm": 0.267578125,
130
+ "learning_rate": 0.0001303905157574247,
131
+ "loss": 1.2221,
132
+ "step": 60
133
+ },
134
+ {
135
+ "epoch": 4.814814814814815,
136
+ "grad_norm": 0.296875,
137
+ "learning_rate": 0.00011736481776669306,
138
+ "loss": 1.1848,
139
+ "step": 65
140
+ },
141
+ {
142
+ "epoch": 4.962962962962963,
143
+ "eval_loss": 2.5175788402557373,
144
+ "eval_runtime": 0.6693,
145
+ "eval_samples_per_second": 14.942,
146
+ "eval_steps_per_second": 1.494,
147
+ "step": 67
148
+ },
149
+ {
150
+ "epoch": 5.185185185185185,
151
+ "grad_norm": 0.291015625,
152
+ "learning_rate": 0.00010402659401094152,
153
+ "loss": 1.1814,
154
+ "step": 70
155
+ },
156
+ {
157
+ "epoch": 5.555555555555555,
158
+ "grad_norm": 0.33203125,
159
+ "learning_rate": 9.061590105968208e-05,
160
+ "loss": 1.1574,
161
+ "step": 75
162
+ },
163
+ {
164
+ "epoch": 5.925925925925926,
165
+ "grad_norm": 0.349609375,
166
+ "learning_rate": 7.73740997570278e-05,
167
+ "loss": 1.1522,
168
+ "step": 80
169
+ },
170
+ {
171
+ "epoch": 6.0,
172
+ "eval_loss": 2.5044538974761963,
173
+ "eval_runtime": 0.5444,
174
+ "eval_samples_per_second": 18.369,
175
+ "eval_steps_per_second": 1.837,
176
+ "step": 81
177
+ },
178
+ {
179
+ "epoch": 6.296296296296296,
180
+ "grad_norm": 0.2412109375,
181
+ "learning_rate": 6.453951129574644e-05,
182
+ "loss": 1.1367,
183
+ "step": 85
184
+ },
185
+ {
186
+ "epoch": 6.666666666666667,
187
+ "grad_norm": 0.28515625,
188
+ "learning_rate": 5.234312799786921e-05,
189
+ "loss": 1.1305,
190
+ "step": 90
191
+ },
192
+ {
193
+ "epoch": 6.962962962962963,
194
+ "eval_loss": 2.506514310836792,
195
+ "eval_runtime": 0.685,
196
+ "eval_samples_per_second": 14.598,
197
+ "eval_steps_per_second": 1.46,
198
+ "step": 94
199
+ },
200
+ {
201
+ "epoch": 7.037037037037037,
202
+ "grad_norm": 0.2265625,
203
+ "learning_rate": 4.100445599768774e-05,
204
+ "loss": 1.1188,
205
+ "step": 95
206
+ },
207
+ {
208
+ "epoch": 7.407407407407407,
209
+ "grad_norm": 0.2099609375,
210
+ "learning_rate": 3.072756464904006e-05,
211
+ "loss": 1.1222,
212
+ "step": 100
213
+ },
214
+ {
215
+ "epoch": 7.777777777777778,
216
+ "grad_norm": 0.248046875,
217
+ "learning_rate": 2.1697413758237784e-05,
218
+ "loss": 1.1075,
219
+ "step": 105
220
+ },
221
+ {
222
+ "epoch": 8.0,
223
+ "eval_loss": 2.5136494636535645,
224
+ "eval_runtime": 0.5462,
225
+ "eval_samples_per_second": 18.307,
226
+ "eval_steps_per_second": 1.831,
227
+ "step": 108
228
+ },
229
+ {
230
+ "epoch": 8.148148148148149,
231
+ "grad_norm": 0.2470703125,
232
+ "learning_rate": 1.4076524743778319e-05,
233
+ "loss": 1.1126,
234
+ "step": 110
235
+ },
236
+ {
237
+ "epoch": 8.518518518518519,
238
+ "grad_norm": 0.236328125,
239
+ "learning_rate": 8.002055634117578e-06,
240
+ "loss": 1.1118,
241
+ "step": 115
242
+ },
243
+ {
244
+ "epoch": 8.88888888888889,
245
+ "grad_norm": 0.2275390625,
246
+ "learning_rate": 3.5833325466437694e-06,
247
+ "loss": 1.1049,
248
+ "step": 120
249
+ },
250
+ {
251
+ "epoch": 8.962962962962964,
252
+ "eval_loss": 2.512882709503174,
253
+ "eval_runtime": 0.6584,
254
+ "eval_samples_per_second": 15.188,
255
+ "eval_steps_per_second": 1.519,
256
+ "step": 121
257
+ },
258
+ {
259
+ "epoch": 9.25925925925926,
260
+ "grad_norm": 0.2216796875,
261
+ "learning_rate": 8.998820754091531e-07,
262
+ "loss": 1.1116,
263
+ "step": 125
264
+ },
265
+ {
266
+ "epoch": 9.62962962962963,
267
+ "grad_norm": 0.20703125,
268
+ "learning_rate": 0.0,
269
+ "loss": 1.1048,
270
+ "step": 130
271
+ },
272
+ {
273
+ "epoch": 9.62962962962963,
274
+ "eval_loss": 2.513336658477783,
275
+ "eval_runtime": 0.5375,
276
+ "eval_samples_per_second": 18.606,
277
+ "eval_steps_per_second": 1.861,
278
+ "step": 130
279
+ },
280
+ {
281
+ "epoch": 9.62962962962963,
282
+ "step": 130,
283
+ "total_flos": 1.018113810235392e+17,
284
+ "train_loss": 1.4375356710874116,
285
+ "train_runtime": 456.268,
286
+ "train_samples_per_second": 18.301,
287
+ "train_steps_per_second": 0.285
288
+ }
289
+ ],
290
+ "logging_steps": 5,
291
+ "max_steps": 130,
292
+ "num_input_tokens_seen": 0,
293
+ "num_train_epochs": 10,
294
+ "save_steps": 100,
295
+ "total_flos": 1.018113810235392e+17,
296
+ "train_batch_size": 8,
297
+ "trial_name": null,
298
+ "trial_params": null
299
+ }