Daoguang committed on
Commit
011253a
1 Parent(s): 612c423

Upload 19 files

Browse files
latest CHANGED
@@ -1 +1 @@
1
- global_step100
 
1
+ global_step75
rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36d2a2034ebb05cb71c510897f2795b31164e50f17b270bc25d2be3ad9a17b22
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08282b46825aa78d10fe10e3fea89555c5b5a691b261a3ddfd58fcb58370edff
3
  size 15984
rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:060dfdb1c49102cbdc8868a6031e68787601b4ccd782f3fb9b137e20c1fd2c7a
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbab71d98a3a9a92df82a6bba463947327c3a1bcf35cd9f4f46114641fc42dd9
3
  size 15984
rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:af01895cb66e616591f2e4baa8dcd8151530eab133c73571ccb31c74f35422ce
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:caac82d57d878d30219a4f9ec289a97ff90c53afc160b968f251b3fd3454b8d8
3
  size 15984
rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:677921992b1e0cef3aee776f245975003d22f51d9bd6ed20f248ded1deb72fa9
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19762d2d370222b01817da11bbaa6665d542293373186d66f754e7246bb861ed
3
  size 15984
rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d69353c629541c690c5471f8ec05fdab2bfecf3d37afaa436bc45939da6db68f
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00c7508b346a7d3c5c23392845f1d013331114ade778794b76e919cb3ed5d33e
3
  size 15984
rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e40ba6668cc03c9162c68a933d164bf38ae2d196a9a6fec03ae615491201185
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b89de7d14dd20a191f56b74c816ef8b7fe5c171e31efbeadbf321c4539ed68c3
3
  size 15984
rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:870968fea834e24b2e099cf3e4fe1e3fb8caf38d8f8e5b790d7d47386d4d05f5
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c71152053553e6e22d670fbc4fd7550bf8a046b54cad7b71869787986a6a42c
3
  size 15984
rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e9e19618bee7c6ef43256fea25abe19bca88535eb1e7dc213cde8929ae4e8180
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b67db12a26a26ffe03d9afc84a43857eb2e5b2fec2dd189653b415f74208190
3
  size 15984
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:864006527e551a970ff14e22c9bcc63fb4197ebca7f1b26b2aceedaf636346f3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b1a13f720f74c7a907e66559bf8eb2582d7eef15e971987100e6a23b1264031
3
  size 1064
trainer_state.json CHANGED
@@ -1,320 +1,242 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.125,
5
  "eval_steps": 500,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.06,
13
- "learning_rate": 4.9980725906018074e-05,
14
- "loss": 0.7547,
15
  "step": 2
16
  },
17
  {
18
  "epoch": 0.12,
19
- "learning_rate": 4.99229333433282e-05,
20
- "loss": 0.566,
21
  "step": 4
22
  },
23
  {
24
  "epoch": 0.19,
25
- "learning_rate": 4.982671142387316e-05,
26
- "loss": 0.5632,
27
  "step": 6
28
  },
29
  {
30
  "epoch": 0.25,
31
- "learning_rate": 4.9692208514878444e-05,
32
- "loss": 0.5385,
33
  "step": 8
34
  },
35
  {
36
  "epoch": 0.31,
37
- "learning_rate": 4.951963201008076e-05,
38
- "loss": 0.5242,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.38,
43
- "learning_rate": 4.9309248009941914e-05,
44
- "loss": 0.4853,
45
  "step": 12
46
  },
47
  {
48
  "epoch": 0.44,
49
- "learning_rate": 4.906138091134118e-05,
50
- "loss": 0.5078,
51
  "step": 14
52
  },
53
  {
54
  "epoch": 0.5,
55
- "learning_rate": 4.877641290737884e-05,
56
- "loss": 0.519,
57
  "step": 16
58
  },
59
  {
60
  "epoch": 0.56,
61
- "learning_rate": 4.8454783398062106e-05,
62
- "loss": 0.5759,
63
  "step": 18
64
  },
65
  {
66
  "epoch": 0.62,
67
- "learning_rate": 4.8096988312782174e-05,
68
- "loss": 0.5073,
69
  "step": 20
70
  },
71
  {
72
  "epoch": 0.69,
73
- "learning_rate": 4.7703579345627035e-05,
74
- "loss": 0.4996,
75
  "step": 22
76
  },
77
  {
78
  "epoch": 0.75,
79
- "learning_rate": 4.72751631047092e-05,
80
- "loss": 0.498,
81
  "step": 24
82
  },
83
  {
84
  "epoch": 0.81,
85
- "learning_rate": 4.681240017681993e-05,
86
- "loss": 0.5313,
87
  "step": 26
88
  },
89
  {
90
  "epoch": 0.88,
91
- "learning_rate": 4.6316004108852305e-05,
92
- "loss": 0.51,
93
  "step": 28
94
  },
95
  {
96
  "epoch": 0.94,
97
- "learning_rate": 4.5786740307563636e-05,
98
- "loss": 0.5182,
99
  "step": 30
100
  },
101
  {
102
  "epoch": 1.0,
103
- "learning_rate": 4.522542485937369e-05,
104
- "loss": 0.5222,
105
  "step": 32
106
  },
107
  {
108
  "epoch": 1.06,
109
- "learning_rate": 4.463292327201862e-05,
110
- "loss": 0.3137,
111
  "step": 34
112
  },
113
  {
114
  "epoch": 1.12,
115
- "learning_rate": 4.401014914000078e-05,
116
- "loss": 0.2685,
117
  "step": 36
118
  },
119
  {
120
  "epoch": 1.19,
121
- "learning_rate": 4.335806273589214e-05,
122
- "loss": 0.2834,
123
  "step": 38
124
  },
125
  {
126
  "epoch": 1.25,
127
- "learning_rate": 4.267766952966369e-05,
128
- "loss": 0.2713,
129
  "step": 40
130
  },
131
  {
132
  "epoch": 1.31,
133
- "learning_rate": 4.197001863832355e-05,
134
- "loss": 0.2705,
135
  "step": 42
136
  },
137
  {
138
  "epoch": 1.38,
139
- "learning_rate": 4.123620120825459e-05,
140
- "loss": 0.2517,
141
  "step": 44
142
  },
143
  {
144
  "epoch": 1.44,
145
- "learning_rate": 4.047734873274586e-05,
146
- "loss": 0.2626,
147
  "step": 46
148
  },
149
  {
150
  "epoch": 1.5,
151
- "learning_rate": 3.969463130731183e-05,
152
- "loss": 0.2754,
153
  "step": 48
154
  },
155
  {
156
  "epoch": 1.56,
157
- "learning_rate": 3.888925582549006e-05,
158
- "loss": 0.2645,
159
  "step": 50
160
  },
161
  {
162
  "epoch": 1.62,
163
- "learning_rate": 3.8062464117898724e-05,
164
- "loss": 0.2405,
165
  "step": 52
166
  },
167
  {
168
  "epoch": 1.69,
169
- "learning_rate": 3.721553103742388e-05,
170
- "loss": 0.2412,
171
  "step": 54
172
  },
173
  {
174
  "epoch": 1.75,
175
- "learning_rate": 3.634976249348867e-05,
176
- "loss": 0.2519,
177
  "step": 56
178
  },
179
  {
180
  "epoch": 1.81,
181
- "learning_rate": 3.54664934384357e-05,
182
- "loss": 0.27,
183
  "step": 58
184
  },
185
  {
186
  "epoch": 1.88,
187
- "learning_rate": 3.456708580912725e-05,
188
- "loss": 0.2511,
189
  "step": 60
190
  },
191
  {
192
  "epoch": 1.94,
193
- "learning_rate": 3.365292642693732e-05,
194
- "loss": 0.2409,
195
  "step": 62
196
  },
197
  {
198
  "epoch": 2.0,
199
- "learning_rate": 3.272542485937369e-05,
200
- "loss": 0.2486,
201
  "step": 64
202
  },
203
  {
204
  "epoch": 2.06,
205
- "learning_rate": 3.178601124662686e-05,
206
- "loss": 0.1282,
207
  "step": 66
208
  },
209
  {
210
  "epoch": 2.12,
211
- "learning_rate": 3.083613409639764e-05,
212
- "loss": 0.1339,
213
  "step": 68
214
  },
215
  {
216
  "epoch": 2.19,
217
- "learning_rate": 2.9877258050403212e-05,
218
- "loss": 0.1175,
219
  "step": 70
220
  },
221
  {
222
  "epoch": 2.25,
223
- "learning_rate": 2.8910861626005776e-05,
224
- "loss": 0.109,
225
  "step": 72
226
  },
227
  {
228
  "epoch": 2.31,
229
- "learning_rate": 2.7938434936445945e-05,
230
- "loss": 0.12,
231
  "step": 74
232
- },
233
- {
234
- "epoch": 2.38,
235
- "learning_rate": 2.6961477393196126e-05,
236
- "loss": 0.1263,
237
- "step": 76
238
- },
239
- {
240
- "epoch": 2.44,
241
- "learning_rate": 2.598149539397672e-05,
242
- "loss": 0.1206,
243
- "step": 78
244
- },
245
- {
246
- "epoch": 2.5,
247
- "learning_rate": 2.5e-05,
248
- "loss": 0.1147,
249
- "step": 80
250
- },
251
- {
252
- "epoch": 2.56,
253
- "learning_rate": 2.4018504606023293e-05,
254
- "loss": 0.1148,
255
- "step": 82
256
- },
257
- {
258
- "epoch": 2.62,
259
- "learning_rate": 2.303852260680388e-05,
260
- "loss": 0.1162,
261
- "step": 84
262
- },
263
- {
264
- "epoch": 2.69,
265
- "learning_rate": 2.2061565063554064e-05,
266
- "loss": 0.1083,
267
- "step": 86
268
- },
269
- {
270
- "epoch": 2.75,
271
- "learning_rate": 2.1089138373994223e-05,
272
- "loss": 0.117,
273
- "step": 88
274
- },
275
- {
276
- "epoch": 2.81,
277
- "learning_rate": 2.0122741949596797e-05,
278
- "loss": 0.0933,
279
- "step": 90
280
- },
281
- {
282
- "epoch": 2.88,
283
- "learning_rate": 1.9163865903602374e-05,
284
- "loss": 0.1052,
285
- "step": 92
286
- },
287
- {
288
- "epoch": 2.94,
289
- "learning_rate": 1.8213988753373146e-05,
290
- "loss": 0.1112,
291
- "step": 94
292
- },
293
- {
294
- "epoch": 3.0,
295
- "learning_rate": 1.7274575140626318e-05,
296
- "loss": 0.1048,
297
- "step": 96
298
- },
299
- {
300
- "epoch": 3.06,
301
- "learning_rate": 1.6347073573062672e-05,
302
- "loss": 0.0463,
303
- "step": 98
304
- },
305
- {
306
- "epoch": 3.12,
307
- "learning_rate": 1.5432914190872757e-05,
308
- "loss": 0.0518,
309
- "step": 100
310
  }
311
  ],
312
  "logging_steps": 2,
313
- "max_steps": 160,
314
  "num_input_tokens_seen": 0,
315
- "num_train_epochs": 5,
316
- "save_steps": 20,
317
- "total_flos": 1.0844211055626813e+18,
318
  "train_batch_size": 2,
319
  "trial_name": null,
320
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.34375,
5
  "eval_steps": 500,
6
+ "global_step": 75,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.06,
13
+ "learning_rate": 4.996988640512931e-05,
14
+ "loss": 0.7489,
15
  "step": 2
16
  },
17
  {
18
  "epoch": 0.12,
19
+ "learning_rate": 4.987961816680492e-05,
20
+ "loss": 0.5822,
21
  "step": 4
22
  },
23
  {
24
  "epoch": 0.19,
25
+ "learning_rate": 4.972941274911953e-05,
26
+ "loss": 0.5713,
27
  "step": 6
28
  },
29
  {
30
  "epoch": 0.25,
31
+ "learning_rate": 4.951963201008076e-05,
32
+ "loss": 0.5457,
33
  "step": 8
34
  },
35
  {
36
  "epoch": 0.31,
37
+ "learning_rate": 4.9250781329863606e-05,
38
+ "loss": 0.5473,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.38,
43
+ "learning_rate": 4.892350839330522e-05,
44
+ "loss": 0.5339,
45
  "step": 12
46
  },
47
  {
48
  "epoch": 0.44,
49
+ "learning_rate": 4.853860162957552e-05,
50
+ "loss": 0.5368,
51
  "step": 14
52
  },
53
  {
54
  "epoch": 0.5,
55
+ "learning_rate": 4.8096988312782174e-05,
56
+ "loss": 0.5598,
57
  "step": 16
58
  },
59
  {
60
  "epoch": 0.56,
61
+ "learning_rate": 4.759973232808609e-05,
62
+ "loss": 0.517,
63
  "step": 18
64
  },
65
  {
66
  "epoch": 0.62,
67
+ "learning_rate": 4.7048031608708876e-05,
68
+ "loss": 0.5279,
69
  "step": 20
70
  },
71
  {
72
  "epoch": 0.69,
73
+ "learning_rate": 4.6443215250006806e-05,
74
+ "loss": 0.5079,
75
  "step": 22
76
  },
77
  {
78
  "epoch": 0.75,
79
+ "learning_rate": 4.5786740307563636e-05,
80
+ "loss": 0.5045,
81
  "step": 24
82
  },
83
  {
84
  "epoch": 0.81,
85
+ "learning_rate": 4.508018828701612e-05,
86
+ "loss": 0.5158,
87
  "step": 26
88
  },
89
  {
90
  "epoch": 0.88,
91
+ "learning_rate": 4.4325261334068426e-05,
92
+ "loss": 0.5064,
93
  "step": 28
94
  },
95
  {
96
  "epoch": 0.94,
97
+ "learning_rate": 4.352377813387398e-05,
98
+ "loss": 0.5072,
99
  "step": 30
100
  },
101
  {
102
  "epoch": 1.0,
103
+ "learning_rate": 4.267766952966369e-05,
104
+ "loss": 0.5087,
105
  "step": 32
106
  },
107
  {
108
  "epoch": 1.06,
109
+ "learning_rate": 4.178897387117546e-05,
110
+ "loss": 0.3132,
111
  "step": 34
112
  },
113
  {
114
  "epoch": 1.12,
115
+ "learning_rate": 4.085983210409114e-05,
116
+ "loss": 0.2972,
117
  "step": 36
118
  },
119
  {
120
  "epoch": 1.19,
121
+ "learning_rate": 3.9892482612310836e-05,
122
+ "loss": 0.2747,
123
  "step": 38
124
  },
125
  {
126
  "epoch": 1.25,
127
+ "learning_rate": 3.888925582549006e-05,
128
+ "loss": 0.2399,
129
  "step": 40
130
  },
131
  {
132
  "epoch": 1.31,
133
+ "learning_rate": 3.785256860483054e-05,
134
+ "loss": 0.2741,
135
  "step": 42
136
  },
137
  {
138
  "epoch": 1.38,
139
+ "learning_rate": 3.678491842064995e-05,
140
+ "loss": 0.2622,
141
  "step": 44
142
  },
143
  {
144
  "epoch": 1.44,
145
+ "learning_rate": 3.624028324136517e-05,
146
+ "loss": 0.258,
147
  "step": 46
148
  },
149
  {
150
  "epoch": 1.5,
151
+ "learning_rate": 3.513103285012475e-05,
152
+ "loss": 0.2574,
153
  "step": 48
154
  },
155
  {
156
  "epoch": 1.56,
157
+ "learning_rate": 3.399737591337471e-05,
158
+ "loss": 0.2551,
159
  "step": 50
160
  },
161
  {
162
  "epoch": 1.62,
163
+ "learning_rate": 3.284204350997229e-05,
164
+ "loss": 0.265,
165
  "step": 52
166
  },
167
  {
168
  "epoch": 1.69,
169
+ "learning_rate": 3.1667818936872465e-05,
170
+ "loss": 0.2641,
171
  "step": 54
172
  },
173
  {
174
  "epoch": 1.75,
175
+ "learning_rate": 3.0477531003921745e-05,
176
+ "loss": 0.2588,
177
  "step": 56
178
  },
179
  {
180
  "epoch": 1.81,
181
+ "learning_rate": 2.9274047219007534e-05,
182
+ "loss": 0.2366,
183
  "step": 58
184
  },
185
  {
186
  "epoch": 1.88,
187
+ "learning_rate": 2.8060266879980408e-05,
188
+ "loss": 0.2581,
189
  "step": 60
190
  },
191
  {
192
  "epoch": 1.94,
193
+ "learning_rate": 2.683911408999169e-05,
194
+ "loss": 0.2658,
195
  "step": 62
196
  },
197
  {
198
  "epoch": 2.0,
199
+ "learning_rate": 2.561353071307281e-05,
200
+ "loss": 0.2474,
201
  "step": 64
202
  },
203
  {
204
  "epoch": 2.06,
205
+ "learning_rate": 2.4386469286927196e-05,
206
+ "loss": 0.1347,
207
  "step": 66
208
  },
209
  {
210
  "epoch": 2.12,
211
+ "learning_rate": 2.3160885910008318e-05,
212
+ "loss": 0.135,
213
  "step": 68
214
  },
215
  {
216
  "epoch": 2.19,
217
+ "learning_rate": 2.1939733120019598e-05,
218
+ "loss": 0.1242,
219
  "step": 70
220
  },
221
  {
222
  "epoch": 2.25,
223
+ "learning_rate": 2.0725952780992468e-05,
224
+ "loss": 0.1286,
225
  "step": 72
226
  },
227
  {
228
  "epoch": 2.31,
229
+ "learning_rate": 1.9522468996078258e-05,
230
+ "loss": 0.1252,
231
  "step": 74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  }
233
  ],
234
  "logging_steps": 2,
235
+ "max_steps": 128,
236
  "num_input_tokens_seen": 0,
237
+ "num_train_epochs": 4,
238
+ "save_steps": 25,
239
+ "total_flos": 8.20785473084457e+17,
240
  "train_batch_size": 2,
241
  "trial_name": null,
242
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70b3c37f005a0c19e72439d1eb4b0e077d23fb67b5b2041a4b2911b018eafd1d
3
  size 6136
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d54f2c2e57d38a1e8d2652900b46927ec020d8523d9bb88688de54db84bd708
3
  size 6136