rakhman-llm commited on
Commit
7051856
1 Parent(s): 20b7cff

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:653c6bba71c5381d01bd043dc2c055bff5332dd7c7b2584287d7e14c2d7ee9d1
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2141320f9b8a2cd3772fb223d1131baebd71974e2bf87b6a1c24de6bf13c8fae
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ea6c528e61d3383dbf8b7d2e9e49dde698241c4bd67ffba410e3666e0023ede
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ee2854b2510e7dccd90f9604ed994b1eb7a8d1050daf4f2c17001c85658d2b8
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d492ae092e6f5b76a2098bc809b3c1bf0702a65bf8be974795457efc1921df9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce122efa58437e9eb4167be3235ff668c46393aa7e777fe0c234b56b42a65288
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32ed8f7e313bb93709f94e944fc8cc81a5cde7370033d4c148cc7bbc922ddb47
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd4b192ffac4efaba4f3cd35aa1226766929983bc37c1c74305bd3d40b48b106
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,224 +1,19 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
  "eval_steps": 500,
6
- "global_step": 14436,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.10390689941812137,
13
- "grad_norm": 1.8855979442596436,
14
- "learning_rate": 1.9310058187863676e-05,
15
- "loss": 0.8022,
16
  "step": 500
17
- },
18
- {
19
- "epoch": 0.20781379883624274,
20
- "grad_norm": 0.8622527122497559,
21
- "learning_rate": 1.86173455250762e-05,
22
- "loss": 0.5316,
23
- "step": 1000
24
- },
25
- {
26
- "epoch": 0.3117206982543641,
27
- "grad_norm": 1.404678225517273,
28
- "learning_rate": 1.7924632862288724e-05,
29
- "loss": 0.4832,
30
- "step": 1500
31
- },
32
- {
33
- "epoch": 0.41562759767248547,
34
- "grad_norm": 1.3559819459915161,
35
- "learning_rate": 1.7233305624826823e-05,
36
- "loss": 0.4518,
37
- "step": 2000
38
- },
39
- {
40
- "epoch": 0.5195344970906068,
41
- "grad_norm": 0.8163271546363831,
42
- "learning_rate": 1.6540592962039347e-05,
43
- "loss": 0.4389,
44
- "step": 2500
45
- },
46
- {
47
- "epoch": 0.6234413965087282,
48
- "grad_norm": 0.8109046816825867,
49
- "learning_rate": 1.584788029925187e-05,
50
- "loss": 0.4193,
51
- "step": 3000
52
- },
53
- {
54
- "epoch": 0.7273482959268496,
55
- "grad_norm": 1.0217444896697998,
56
- "learning_rate": 1.5155167636464397e-05,
57
- "loss": 0.4215,
58
- "step": 3500
59
- },
60
- {
61
- "epoch": 0.8312551953449709,
62
- "grad_norm": 1.6476292610168457,
63
- "learning_rate": 1.446245497367692e-05,
64
- "loss": 0.4062,
65
- "step": 4000
66
- },
67
- {
68
- "epoch": 0.9351620947630923,
69
- "grad_norm": 1.4694277048110962,
70
- "learning_rate": 1.3769742310889445e-05,
71
- "loss": 0.4029,
72
- "step": 4500
73
- },
74
- {
75
- "epoch": 1.0,
76
- "eval_loss": 0.3212089240550995,
77
- "eval_runtime": 31.4577,
78
- "eval_samples_per_second": 15.704,
79
- "eval_steps_per_second": 7.852,
80
- "step": 4812
81
- },
82
- {
83
- "epoch": 1.0390689941812137,
84
- "grad_norm": 0.6561925411224365,
85
- "learning_rate": 1.3077029648101969e-05,
86
- "loss": 0.4059,
87
- "step": 5000
88
- },
89
- {
90
- "epoch": 1.142975893599335,
91
- "grad_norm": 0.6741281747817993,
92
- "learning_rate": 1.2384316985314493e-05,
93
- "loss": 0.3594,
94
- "step": 5500
95
- },
96
- {
97
- "epoch": 1.2468827930174564,
98
- "grad_norm": 0.48619207739830017,
99
- "learning_rate": 1.1691604322527017e-05,
100
- "loss": 0.3736,
101
- "step": 6000
102
- },
103
- {
104
- "epoch": 1.3507896924355778,
105
- "grad_norm": 1.1009119749069214,
106
- "learning_rate": 1.099889165973954e-05,
107
- "loss": 0.3624,
108
- "step": 6500
109
- },
110
- {
111
- "epoch": 1.4546965918536992,
112
- "grad_norm": 0.3497615456581116,
113
- "learning_rate": 1.0306178996952066e-05,
114
- "loss": 0.3516,
115
- "step": 7000
116
- },
117
- {
118
- "epoch": 1.5586034912718203,
119
- "grad_norm": 1.4209001064300537,
120
- "learning_rate": 9.61346633416459e-06,
121
- "loss": 0.3565,
122
- "step": 7500
123
- },
124
- {
125
- "epoch": 1.6625103906899419,
126
- "grad_norm": 0.8116744160652161,
127
- "learning_rate": 8.920753671377114e-06,
128
- "loss": 0.3635,
129
- "step": 8000
130
- },
131
- {
132
- "epoch": 1.766417290108063,
133
- "grad_norm": 0.8015578985214233,
134
- "learning_rate": 8.228041008589638e-06,
135
- "loss": 0.3549,
136
- "step": 8500
137
- },
138
- {
139
- "epoch": 1.8703241895261846,
140
- "grad_norm": 0.7980790734291077,
141
- "learning_rate": 7.536713771127737e-06,
142
- "loss": 0.3495,
143
- "step": 9000
144
- },
145
- {
146
- "epoch": 1.9742310889443058,
147
- "grad_norm": 1.4501579999923706,
148
- "learning_rate": 6.845386533665836e-06,
149
- "loss": 0.3385,
150
- "step": 9500
151
- },
152
- {
153
- "epoch": 2.0,
154
- "eval_loss": 0.29613471031188965,
155
- "eval_runtime": 31.4504,
156
- "eval_samples_per_second": 15.707,
157
- "eval_steps_per_second": 7.854,
158
- "step": 9624
159
- },
160
- {
161
- "epoch": 2.0781379883624274,
162
- "grad_norm": 0.6130263209342957,
163
- "learning_rate": 6.15267387087836e-06,
164
- "loss": 0.3293,
165
- "step": 10000
166
- },
167
- {
168
- "epoch": 2.1820448877805485,
169
- "grad_norm": 1.2724053859710693,
170
- "learning_rate": 5.459961208090885e-06,
171
- "loss": 0.3369,
172
- "step": 10500
173
- },
174
- {
175
- "epoch": 2.28595178719867,
176
- "grad_norm": 0.7700533270835876,
177
- "learning_rate": 4.767248545303408e-06,
178
- "loss": 0.3387,
179
- "step": 11000
180
- },
181
- {
182
- "epoch": 2.3898586866167912,
183
- "grad_norm": 1.4450799226760864,
184
- "learning_rate": 4.0759213078415074e-06,
185
- "loss": 0.3411,
186
- "step": 11500
187
- },
188
- {
189
- "epoch": 2.493765586034913,
190
- "grad_norm": 0.8265316486358643,
191
- "learning_rate": 3.3832086450540318e-06,
192
- "loss": 0.317,
193
- "step": 12000
194
- },
195
- {
196
- "epoch": 2.597672485453034,
197
- "grad_norm": 1.4168757200241089,
198
- "learning_rate": 2.690495982266556e-06,
199
- "loss": 0.3176,
200
- "step": 12500
201
- },
202
- {
203
- "epoch": 2.7015793848711556,
204
- "grad_norm": 1.0274338722229004,
205
- "learning_rate": 1.99778331947908e-06,
206
- "loss": 0.3389,
207
- "step": 13000
208
- },
209
- {
210
- "epoch": 2.8054862842892767,
211
- "grad_norm": 1.6529736518859863,
212
- "learning_rate": 1.3050706566916044e-06,
213
- "loss": 0.3163,
214
- "step": 13500
215
- },
216
- {
217
- "epoch": 2.9093931837073983,
218
- "grad_norm": 0.7652114629745483,
219
- "learning_rate": 6.123579939041286e-07,
220
- "loss": 0.3267,
221
- "step": 14000
222
  }
223
  ],
224
  "logging_steps": 500,
@@ -233,12 +28,12 @@
233
  "should_evaluate": false,
234
  "should_log": false,
235
  "should_save": true,
236
- "should_training_stop": true
237
  },
238
  "attributes": {}
239
  }
240
  },
241
- "total_flos": 1.758000534257664e+16,
242
  "train_batch_size": 2,
243
  "trial_name": null,
244
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.10390689941812137,
5
  "eval_steps": 500,
6
+ "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.10390689941812137,
13
+ "grad_norm": 3.488266944885254,
14
+ "learning_rate": 1.9311443613189252e-05,
15
+ "loss": 0.8436,
16
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  }
18
  ],
19
  "logging_steps": 500,
 
28
  "should_evaluate": false,
29
  "should_log": false,
30
  "should_save": true,
31
+ "should_training_stop": false
32
  },
33
  "attributes": {}
34
  }
35
  },
36
+ "total_flos": 608957890560000.0,
37
  "train_batch_size": 2,
38
  "trial_name": null,
39
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:458592354564e0c56775e9ca6e222e6e9b0676dd27e2547cdd1181aaf3301fdb
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f1dbee35c77e893d48b1da8b35396070180d26658b9fe560d8ed118dfc2d009
3
  size 5432