lilferrit committed on
Commit
bfd0468
1 Parent(s): 0faa663

Training in progress, step 10000

Browse files
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "epoch": 0.354862233609357,
3
- "eval_bleu": 26.3346,
4
- "eval_gen_len": 26.6907,
5
- "eval_loss": 1.6253596544265747,
6
- "eval_runtime": 221.231,
7
  "eval_samples": 3000,
8
- "eval_samples_per_second": 13.56,
9
- "eval_steps_per_second": 1.695,
10
- "total_flos": 3.589813132276531e+16,
11
- "train_loss": 2.132816611328125,
12
- "train_runtime": 15981.9131,
13
- "train_samples": 4508785,
14
- "train_samples_per_second": 100.113,
15
- "train_steps_per_second": 6.257
16
  }
 
1
  {
2
+ "epoch": 2.7777777777777777,
3
+ "eval_bleu": 23.6596,
4
+ "eval_gen_len": 27.526,
5
+ "eval_loss": 1.7469114065170288,
6
+ "eval_runtime": 235.9606,
7
  "eval_samples": 3000,
8
+ "eval_samples_per_second": 12.714,
9
+ "eval_steps_per_second": 1.589,
10
+ "total_flos": 3.803274433029734e+16,
11
+ "train_loss": 1.5316169482421875,
12
+ "train_runtime": 15895.0874,
13
+ "train_samples": 576000,
14
+ "train_samples_per_second": 100.66,
15
+ "train_steps_per_second": 6.291
16
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 0.354862233609357,
3
- "eval_bleu": 26.3346,
4
- "eval_gen_len": 26.6907,
5
- "eval_loss": 1.6253596544265747,
6
- "eval_runtime": 221.231,
7
  "eval_samples": 3000,
8
- "eval_samples_per_second": 13.56,
9
- "eval_steps_per_second": 1.695
10
  }
 
1
  {
2
+ "epoch": 2.7777777777777777,
3
+ "eval_bleu": 23.6596,
4
+ "eval_gen_len": 27.526,
5
+ "eval_loss": 1.7469114065170288,
6
+ "eval_runtime": 235.9606,
7
  "eval_samples": 3000,
8
+ "eval_samples_per_second": 12.714,
9
+ "eval_steps_per_second": 1.589
10
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba60101c5760711223e20065dd911fd73d4bf74287240235d072840395e8aaaa
3
  size 241984552
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0e0cb2c8f5b00f2545b91fa799ac84d339f1cf8f11cbb40722e1f7b08bf1d74
3
  size 241984552
tokenizer.json CHANGED
@@ -1,6 +1,11 @@
1
  {
2
  "version": "1.0",
3
- "truncation": null,
 
 
 
 
 
4
  "padding": null,
5
  "added_tokens": [
6
  {
 
1
  {
2
  "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 128,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
  "padding": null,
10
  "added_tokens": [
11
  {
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.354862233609357,
3
- "total_flos": 3.589813132276531e+16,
4
- "train_loss": 2.132816611328125,
5
- "train_runtime": 15981.9131,
6
- "train_samples": 4508785,
7
- "train_samples_per_second": 100.113,
8
- "train_steps_per_second": 6.257
9
  }
 
1
  {
2
+ "epoch": 2.7777777777777777,
3
+ "total_flos": 3.803274433029734e+16,
4
+ "train_loss": 1.5316169482421875,
5
+ "train_runtime": 15895.0874,
6
+ "train_samples": 576000,
7
+ "train_samples_per_second": 100.66,
8
+ "train_steps_per_second": 6.291
9
  }
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "best_metric": 26.3346,
3
  "best_model_checkpoint": "/local1/hfs/gs_stuff/ft-wmt14/checkpoint-100000",
4
- "epoch": 0.354862233609357,
5
  "eval_steps": 10000,
6
  "global_step": 100000,
7
  "is_hyper_param_search": false,
@@ -9,261 +9,261 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.01774311168046785,
13
- "grad_norm": 1.5128649473190308,
14
  "learning_rate": 0.000475,
15
- "loss": 2.4524,
16
  "step": 5000
17
  },
18
  {
19
- "epoch": 0.0354862233609357,
20
- "grad_norm": 1.242777705192566,
21
  "learning_rate": 0.00045000000000000004,
22
- "loss": 2.3103,
23
  "step": 10000
24
  },
25
  {
26
- "epoch": 0.0354862233609357,
27
- "eval_bleu": 22.058,
28
- "eval_gen_len": 27.7263,
29
- "eval_loss": 1.8454290628433228,
30
- "eval_runtime": 238.9627,
31
- "eval_samples_per_second": 12.554,
32
- "eval_steps_per_second": 1.569,
33
  "step": 10000
34
  },
35
  {
36
- "epoch": 0.05322933504140355,
37
- "grad_norm": 0.988516628742218,
38
  "learning_rate": 0.000425,
39
- "loss": 2.2594,
40
  "step": 15000
41
  },
42
  {
43
- "epoch": 0.0709724467218714,
44
- "grad_norm": 1.4387503862380981,
45
  "learning_rate": 0.0004,
46
- "loss": 2.2141,
47
  "step": 20000
48
  },
49
  {
50
- "epoch": 0.0709724467218714,
51
- "eval_bleu": 23.339,
52
- "eval_gen_len": 26.7147,
53
- "eval_loss": 1.7811188697814941,
54
- "eval_runtime": 225.1917,
55
- "eval_samples_per_second": 13.322,
56
- "eval_steps_per_second": 1.665,
57
  "step": 20000
58
  },
59
  {
60
- "epoch": 0.08871555840233925,
61
- "grad_norm": 1.2291666269302368,
62
  "learning_rate": 0.000375,
63
- "loss": 2.1898,
64
  "step": 25000
65
  },
66
  {
67
- "epoch": 0.1064586700828071,
68
- "grad_norm": 1.1732761859893799,
69
  "learning_rate": 0.00035,
70
- "loss": 2.176,
71
  "step": 30000
72
  },
73
  {
74
- "epoch": 0.1064586700828071,
75
- "eval_bleu": 24.3234,
76
- "eval_gen_len": 27.125,
77
- "eval_loss": 1.7360602617263794,
78
- "eval_runtime": 227.357,
79
- "eval_samples_per_second": 13.195,
80
- "eval_steps_per_second": 1.649,
81
  "step": 30000
82
  },
83
  {
84
- "epoch": 0.12420178176327495,
85
- "grad_norm": 1.187321662902832,
86
  "learning_rate": 0.00032500000000000004,
87
- "loss": 2.1468,
88
  "step": 35000
89
  },
90
  {
91
- "epoch": 0.1419448934437428,
92
- "grad_norm": 1.3599053621292114,
93
  "learning_rate": 0.0003,
94
- "loss": 2.139,
95
  "step": 40000
96
  },
97
  {
98
- "epoch": 0.1419448934437428,
99
- "eval_bleu": 25.0888,
100
- "eval_gen_len": 26.8213,
101
- "eval_loss": 1.7130982875823975,
102
- "eval_runtime": 221.7983,
103
- "eval_samples_per_second": 13.526,
104
- "eval_steps_per_second": 1.691,
105
  "step": 40000
106
  },
107
  {
108
- "epoch": 0.15968800512421066,
109
- "grad_norm": 1.4392811059951782,
110
  "learning_rate": 0.000275,
111
- "loss": 2.1151,
112
  "step": 45000
113
  },
114
  {
115
- "epoch": 0.1774311168046785,
116
- "grad_norm": 1.4162044525146484,
117
  "learning_rate": 0.00025,
118
- "loss": 2.1084,
119
  "step": 50000
120
  },
121
  {
122
- "epoch": 0.1774311168046785,
123
- "eval_bleu": 24.9992,
124
- "eval_gen_len": 26.824,
125
- "eval_loss": 1.687427043914795,
126
- "eval_runtime": 224.0057,
127
- "eval_samples_per_second": 13.393,
128
- "eval_steps_per_second": 1.674,
129
  "step": 50000
130
  },
131
  {
132
- "epoch": 0.19517422848514634,
133
- "grad_norm": 1.2046048641204834,
134
  "learning_rate": 0.00022500000000000002,
135
- "loss": 2.0914,
136
  "step": 55000
137
  },
138
  {
139
- "epoch": 0.2129173401656142,
140
- "grad_norm": 1.2651879787445068,
141
  "learning_rate": 0.0002,
142
- "loss": 2.0826,
143
  "step": 60000
144
  },
145
  {
146
- "epoch": 0.2129173401656142,
147
- "eval_bleu": 25.7297,
148
- "eval_gen_len": 26.62,
149
- "eval_loss": 1.6685482263565063,
150
- "eval_runtime": 221.6914,
151
- "eval_samples_per_second": 13.532,
152
- "eval_steps_per_second": 1.692,
153
  "step": 60000
154
  },
155
  {
156
- "epoch": 0.23066045184608205,
157
- "grad_norm": 1.212643027305603,
158
  "learning_rate": 0.000175,
159
- "loss": 2.0778,
160
  "step": 65000
161
  },
162
  {
163
- "epoch": 0.2484035635265499,
164
- "grad_norm": 1.2400418519973755,
165
  "learning_rate": 0.00015,
166
- "loss": 2.068,
167
  "step": 70000
168
  },
169
  {
170
- "epoch": 0.2484035635265499,
171
- "eval_bleu": 25.9031,
172
- "eval_gen_len": 26.685,
173
- "eval_loss": 1.648539662361145,
174
- "eval_runtime": 223.211,
175
- "eval_samples_per_second": 13.44,
176
- "eval_steps_per_second": 1.68,
177
  "step": 70000
178
  },
179
  {
180
- "epoch": 0.26614667520701774,
181
- "grad_norm": 1.3389995098114014,
182
  "learning_rate": 0.000125,
183
- "loss": 2.0566,
184
  "step": 75000
185
  },
186
  {
187
- "epoch": 0.2838897868874856,
188
- "grad_norm": 1.1512677669525146,
189
  "learning_rate": 0.0001,
190
- "loss": 2.05,
191
  "step": 80000
192
  },
193
  {
194
- "epoch": 0.2838897868874856,
195
- "eval_bleu": 26.143,
196
- "eval_gen_len": 26.8693,
197
- "eval_loss": 1.6370748281478882,
198
- "eval_runtime": 225.2245,
199
- "eval_samples_per_second": 13.32,
200
- "eval_steps_per_second": 1.665,
201
  "step": 80000
202
  },
203
  {
204
- "epoch": 0.30163289856795344,
205
- "grad_norm": 1.1607016324996948,
206
  "learning_rate": 7.5e-05,
207
- "loss": 2.0235,
208
  "step": 85000
209
  },
210
  {
211
- "epoch": 0.3193760102484213,
212
- "grad_norm": 1.2967106103897095,
213
  "learning_rate": 5e-05,
214
- "loss": 2.0331,
215
  "step": 90000
216
  },
217
  {
218
- "epoch": 0.3193760102484213,
219
- "eval_bleu": 26.3038,
220
- "eval_gen_len": 26.5183,
221
- "eval_loss": 1.6311123371124268,
222
- "eval_runtime": 219.2546,
223
- "eval_samples_per_second": 13.683,
224
- "eval_steps_per_second": 1.71,
225
  "step": 90000
226
  },
227
  {
228
- "epoch": 0.33711912192888915,
229
- "grad_norm": 1.2956724166870117,
230
  "learning_rate": 2.5e-05,
231
- "loss": 2.0346,
232
  "step": 95000
233
  },
234
  {
235
- "epoch": 0.354862233609357,
236
- "grad_norm": 1.1822398900985718,
237
  "learning_rate": 0.0,
238
- "loss": 2.0273,
239
  "step": 100000
240
  },
241
  {
242
- "epoch": 0.354862233609357,
243
- "eval_bleu": 26.3346,
244
- "eval_gen_len": 26.6907,
245
- "eval_loss": 1.6253596544265747,
246
- "eval_runtime": 221.5579,
247
- "eval_samples_per_second": 13.54,
248
- "eval_steps_per_second": 1.693,
249
  "step": 100000
250
  },
251
  {
252
- "epoch": 0.354862233609357,
253
  "step": 100000,
254
- "total_flos": 3.589813132276531e+16,
255
- "train_loss": 2.132816611328125,
256
- "train_runtime": 15981.9131,
257
- "train_samples_per_second": 100.113,
258
- "train_steps_per_second": 6.257
259
  }
260
  ],
261
  "logging_steps": 5000,
262
  "max_steps": 100000,
263
  "num_input_tokens_seen": 0,
264
- "num_train_epochs": 1,
265
  "save_steps": 10000,
266
- "total_flos": 3.589813132276531e+16,
267
  "train_batch_size": 8,
268
  "trial_name": null,
269
  "trial_params": null
 
1
  {
2
+ "best_metric": 23.6596,
3
  "best_model_checkpoint": "/local1/hfs/gs_stuff/ft-wmt14/checkpoint-100000",
4
+ "epoch": 2.7777777777777777,
5
  "eval_steps": 10000,
6
  "global_step": 100000,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.1388888888888889,
13
+ "grad_norm": 1.066943645477295,
14
  "learning_rate": 0.000475,
15
+ "loss": 1.9627,
16
  "step": 5000
17
  },
18
  {
19
+ "epoch": 0.2777777777777778,
20
+ "grad_norm": 0.9774492383003235,
21
  "learning_rate": 0.00045000000000000004,
22
+ "loss": 1.7738,
23
  "step": 10000
24
  },
25
  {
26
+ "epoch": 0.2777777777777778,
27
+ "eval_bleu": 20.1598,
28
+ "eval_gen_len": 28.1563,
29
+ "eval_loss": 1.914583444595337,
30
+ "eval_runtime": 241.8013,
31
+ "eval_samples_per_second": 12.407,
32
+ "eval_steps_per_second": 1.551,
33
  "step": 10000
34
  },
35
  {
36
+ "epoch": 0.4166666666666667,
37
+ "grad_norm": 1.4306731224060059,
38
  "learning_rate": 0.000425,
39
+ "loss": 1.6951,
40
  "step": 15000
41
  },
42
  {
43
+ "epoch": 0.5555555555555556,
44
+ "grad_norm": 1.1782424449920654,
45
  "learning_rate": 0.0004,
46
+ "loss": 1.6498,
47
  "step": 20000
48
  },
49
  {
50
+ "epoch": 0.5555555555555556,
51
+ "eval_bleu": 21.4167,
52
+ "eval_gen_len": 27.853,
53
+ "eval_loss": 1.855008840560913,
54
+ "eval_runtime": 242.3949,
55
+ "eval_samples_per_second": 12.376,
56
+ "eval_steps_per_second": 1.547,
57
  "step": 20000
58
  },
59
  {
60
+ "epoch": 0.6944444444444444,
61
+ "grad_norm": 1.219376802444458,
62
  "learning_rate": 0.000375,
63
+ "loss": 1.6172,
64
  "step": 25000
65
  },
66
  {
67
+ "epoch": 0.8333333333333334,
68
+ "grad_norm": 1.2735612392425537,
69
  "learning_rate": 0.00035,
70
+ "loss": 1.5903,
71
  "step": 30000
72
  },
73
  {
74
+ "epoch": 0.8333333333333334,
75
+ "eval_bleu": 22.604,
76
+ "eval_gen_len": 27.7613,
77
+ "eval_loss": 1.8276705741882324,
78
+ "eval_runtime": 240.5149,
79
+ "eval_samples_per_second": 12.473,
80
+ "eval_steps_per_second": 1.559,
81
  "step": 30000
82
  },
83
  {
84
+ "epoch": 0.9722222222222222,
85
+ "grad_norm": 1.0282609462738037,
86
  "learning_rate": 0.00032500000000000004,
87
+ "loss": 1.5633,
88
  "step": 35000
89
  },
90
  {
91
+ "epoch": 1.1111111111111112,
92
+ "grad_norm": 1.406827688217163,
93
  "learning_rate": 0.0003,
94
+ "loss": 1.5151,
95
  "step": 40000
96
  },
97
  {
98
+ "epoch": 1.1111111111111112,
99
+ "eval_bleu": 22.1273,
100
+ "eval_gen_len": 27.3187,
101
+ "eval_loss": 1.8127936124801636,
102
+ "eval_runtime": 234.7049,
103
+ "eval_samples_per_second": 12.782,
104
+ "eval_steps_per_second": 1.598,
105
  "step": 40000
106
  },
107
  {
108
+ "epoch": 1.25,
109
+ "grad_norm": 1.174306035041809,
110
  "learning_rate": 0.000275,
111
+ "loss": 1.5004,
112
  "step": 45000
113
  },
114
  {
115
+ "epoch": 1.3888888888888888,
116
+ "grad_norm": 1.5665515661239624,
117
  "learning_rate": 0.00025,
118
+ "loss": 1.4866,
119
  "step": 50000
120
  },
121
  {
122
+ "epoch": 1.3888888888888888,
123
+ "eval_bleu": 22.8295,
124
+ "eval_gen_len": 27.419,
125
+ "eval_loss": 1.7999275922775269,
126
+ "eval_runtime": 233.8115,
127
+ "eval_samples_per_second": 12.831,
128
+ "eval_steps_per_second": 1.604,
129
  "step": 50000
130
  },
131
  {
132
+ "epoch": 1.5277777777777777,
133
+ "grad_norm": 1.1425319910049438,
134
  "learning_rate": 0.00022500000000000002,
135
+ "loss": 1.4799,
136
  "step": 55000
137
  },
138
  {
139
+ "epoch": 1.6666666666666665,
140
+ "grad_norm": 1.123904824256897,
141
  "learning_rate": 0.0002,
142
+ "loss": 1.4696,
143
  "step": 60000
144
  },
145
  {
146
+ "epoch": 1.6666666666666665,
147
+ "eval_bleu": 22.9923,
148
+ "eval_gen_len": 27.7387,
149
+ "eval_loss": 1.780959963798523,
150
+ "eval_runtime": 240.0938,
151
+ "eval_samples_per_second": 12.495,
152
+ "eval_steps_per_second": 1.562,
153
  "step": 60000
154
  },
155
  {
156
+ "epoch": 1.8055555555555556,
157
+ "grad_norm": 1.4292243719100952,
158
  "learning_rate": 0.000175,
159
+ "loss": 1.4613,
160
  "step": 65000
161
  },
162
  {
163
+ "epoch": 1.9444444444444444,
164
+ "grad_norm": 1.1662226915359497,
165
  "learning_rate": 0.00015,
166
+ "loss": 1.4508,
167
  "step": 70000
168
  },
169
  {
170
+ "epoch": 1.9444444444444444,
171
+ "eval_bleu": 23.1046,
172
+ "eval_gen_len": 27.7057,
173
+ "eval_loss": 1.7654317617416382,
174
+ "eval_runtime": 236.6367,
175
+ "eval_samples_per_second": 12.678,
176
+ "eval_steps_per_second": 1.585,
177
  "step": 70000
178
  },
179
  {
180
+ "epoch": 2.0833333333333335,
181
+ "grad_norm": 0.9245423674583435,
182
  "learning_rate": 0.000125,
183
+ "loss": 1.4235,
184
  "step": 75000
185
  },
186
  {
187
+ "epoch": 2.2222222222222223,
188
+ "grad_norm": 1.2502944469451904,
189
  "learning_rate": 0.0001,
190
+ "loss": 1.4053,
191
  "step": 80000
192
  },
193
  {
194
+ "epoch": 2.2222222222222223,
195
+ "eval_bleu": 23.5079,
196
+ "eval_gen_len": 27.643,
197
+ "eval_loss": 1.758699655532837,
198
+ "eval_runtime": 237.5663,
199
+ "eval_samples_per_second": 12.628,
200
+ "eval_steps_per_second": 1.579,
201
  "step": 80000
202
  },
203
  {
204
+ "epoch": 2.361111111111111,
205
+ "grad_norm": 0.9593023061752319,
206
  "learning_rate": 7.5e-05,
207
+ "loss": 1.408,
208
  "step": 85000
209
  },
210
  {
211
+ "epoch": 2.5,
212
+ "grad_norm": 1.440004825592041,
213
  "learning_rate": 5e-05,
214
+ "loss": 1.3956,
215
  "step": 90000
216
  },
217
  {
218
+ "epoch": 2.5,
219
+ "eval_bleu": 23.3848,
220
+ "eval_gen_len": 27.6637,
221
+ "eval_loss": 1.752461552619934,
222
+ "eval_runtime": 237.0184,
223
+ "eval_samples_per_second": 12.657,
224
+ "eval_steps_per_second": 1.582,
225
  "step": 90000
226
  },
227
  {
228
+ "epoch": 2.638888888888889,
229
+ "grad_norm": 1.1929932832717896,
230
  "learning_rate": 2.5e-05,
231
+ "loss": 1.3938,
232
  "step": 95000
233
  },
234
  {
235
+ "epoch": 2.7777777777777777,
236
+ "grad_norm": 1.0216492414474487,
237
  "learning_rate": 0.0,
238
+ "loss": 1.3903,
239
  "step": 100000
240
  },
241
  {
242
+ "epoch": 2.7777777777777777,
243
+ "eval_bleu": 23.6596,
244
+ "eval_gen_len": 27.526,
245
+ "eval_loss": 1.7469114065170288,
246
+ "eval_runtime": 235.9542,
247
+ "eval_samples_per_second": 12.714,
248
+ "eval_steps_per_second": 1.589,
249
  "step": 100000
250
  },
251
  {
252
+ "epoch": 2.7777777777777777,
253
  "step": 100000,
254
+ "total_flos": 3.803274433029734e+16,
255
+ "train_loss": 1.5316169482421875,
256
+ "train_runtime": 15895.0874,
257
+ "train_samples_per_second": 100.66,
258
+ "train_steps_per_second": 6.291
259
  }
260
  ],
261
  "logging_steps": 5000,
262
  "max_steps": 100000,
263
  "num_input_tokens_seen": 0,
264
+ "num_train_epochs": 3,
265
  "save_steps": 10000,
266
+ "total_flos": 3.803274433029734e+16,
267
  "train_batch_size": 8,
268
  "trial_name": null,
269
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c032e73464a2058a913f4433764c71c06f52d322360e96ce929b33d49f129624
3
- size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cca3b2463e1aca65e73c3842cf3e0162ea69920dc5160ae31c9b16756f63072a
3
+ size 5112