Training in progress, step 6000
Browse files- model.safetensors +1 -1
- run-3/checkpoint-4000/model.safetensors +1 -1
- run-3/checkpoint-4000/optimizer.pt +1 -1
- run-3/checkpoint-4000/rng_state.pth +1 -1
- run-3/checkpoint-4000/scheduler.pt +1 -1
- run-3/checkpoint-4000/training_args.bin +1 -1
- run-3/checkpoint-4500/model.safetensors +1 -1
- run-3/checkpoint-4500/optimizer.pt +1 -1
- run-3/checkpoint-4500/rng_state.pth +1 -1
- run-3/checkpoint-4500/scheduler.pt +1 -1
- run-3/checkpoint-4500/training_args.bin +1 -1
- run-3/checkpoint-5000/model.safetensors +1 -1
- run-3/checkpoint-5000/optimizer.pt +1 -1
- run-3/checkpoint-5000/rng_state.pth +1 -1
- run-3/checkpoint-5000/scheduler.pt +1 -1
- run-3/checkpoint-5000/training_args.bin +1 -1
- run-3/checkpoint-5500/model.safetensors +1 -1
- run-3/checkpoint-5500/optimizer.pt +1 -1
- run-3/checkpoint-5500/rng_state.pth +1 -1
- run-3/checkpoint-5500/scheduler.pt +1 -1
- run-3/checkpoint-5500/training_args.bin +1 -1
- run-3/checkpoint-6000/model.safetensors +1 -1
- run-3/checkpoint-6000/optimizer.pt +1 -1
- run-3/checkpoint-6000/rng_state.pth +1 -1
- run-3/checkpoint-6000/scheduler.pt +1 -1
- run-3/checkpoint-6000/trainer_state.json +78 -78
- run-3/checkpoint-6000/training_args.bin +1 -1
- run-3/checkpoint-6500/model.safetensors +1 -1
- run-3/checkpoint-6500/optimizer.pt +1 -1
- run-3/checkpoint-6500/rng_state.pth +1 -1
- run-3/checkpoint-6500/scheduler.pt +1 -1
- run-3/checkpoint-6500/trainer_state.json +81 -81
- run-3/checkpoint-6500/training_args.bin +1 -1
- run-3/checkpoint-7000/model.safetensors +1 -1
- run-3/checkpoint-7000/optimizer.pt +1 -1
- run-3/checkpoint-7000/rng_state.pth +1 -1
- run-3/checkpoint-7000/scheduler.pt +1 -1
- run-3/checkpoint-7000/trainer_state.json +89 -89
- run-3/checkpoint-7000/training_args.bin +1 -1
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1340618660
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:357873f897ac28dcafef3fa3fa53be07c2ae1ce0ee7c067b30c199d7205c8456
|
3 |
size 1340618660
|
run-3/checkpoint-4000/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1340618660
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cf482efd461a8b92fa2ced7f16699a274ac312a962d4bb81259d9b23caa7fbe1
|
3 |
size 1340618660
|
run-3/checkpoint-4000/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 2681472237
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d2a7bf7ff3c0c0791b258d882964741d4dddac21bd018d264c509ef22e57d0aa
|
3 |
size 2681472237
|
run-3/checkpoint-4000/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 14244
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2d65eeaf2dd6e25acaa89c7669e032118233a7db49d07f0c99cc6439d496417b
|
3 |
size 14244
|
run-3/checkpoint-4000/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5b70e2b4949e2d396aca261edad5501b5c6cf802168746c892b94d2ef7d820e0
|
3 |
size 1064
|
run-3/checkpoint-4000/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5048
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
|
3 |
size 5048
|
run-3/checkpoint-4500/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1340618660
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:382a8f46eadf767ed8da97df16ca3df66a409ff23b26af6a2745ff907ca25530
|
3 |
size 1340618660
|
run-3/checkpoint-4500/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 2681472237
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:16b738c555c50c08e23cb5e22cf2b09d6275281380346d11079a373548a519e1
|
3 |
size 2681472237
|
run-3/checkpoint-4500/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 14244
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53e9fb867ae884a58402e7b3b9a9f22e8f411dc167b418dee588d4b62db82684
|
3 |
size 14244
|
run-3/checkpoint-4500/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:836e3991ee1cc34d19f43898c3076c25b8a0367ad6a1217c062230a80dc79d0d
|
3 |
size 1064
|
run-3/checkpoint-4500/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5048
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
|
3 |
size 5048
|
run-3/checkpoint-5000/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1340618660
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6f656c5e41f4db468459396594f52951edf346944c022c6a51fe91022d752880
|
3 |
size 1340618660
|
run-3/checkpoint-5000/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 2681472237
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9960712477ff26f5f82209fef0cfa0fdc1268e06a394b9a86f091b6cc09276c5
|
3 |
size 2681472237
|
run-3/checkpoint-5000/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 14244
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7464ebba6819e0c68b094da2227ebd7b7e48fa501069e61ff0c479a55d431d86
|
3 |
size 14244
|
run-3/checkpoint-5000/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dd000ad96d274229ffdddadc85494aa58efe28c1a588aed7e940403a55b33a50
|
3 |
size 1064
|
run-3/checkpoint-5000/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5048
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
|
3 |
size 5048
|
run-3/checkpoint-5500/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1340618660
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:59133785062b3f02adc682bde548abd9a14813e20e605e69c971f9e1cf743b44
|
3 |
size 1340618660
|
run-3/checkpoint-5500/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 2681472237
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:823a91a9a9f45cb34713826bb763fe820bfb642a342d0db495d61a7afa005c82
|
3 |
size 2681472237
|
run-3/checkpoint-5500/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 14244
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c4c5967a58f1402443b33894cea74c9f032e86a1e8454f41569028ccf79a7622
|
3 |
size 14244
|
run-3/checkpoint-5500/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:328f66a371abac671a66cc9d36cee10be25fd4d036bad87dd50b1a40a8805410
|
3 |
size 1064
|
run-3/checkpoint-5500/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5048
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
|
3 |
size 5048
|
run-3/checkpoint-6000/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1340618660
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:357873f897ac28dcafef3fa3fa53be07c2ae1ce0ee7c067b30c199d7205c8456
|
3 |
size 1340618660
|
run-3/checkpoint-6000/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 2681472237
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3ff50c71b14db9ed2dc9ab3c5630a214d2f8cd30274ef8942b8db0726ea1613c
|
3 |
size 2681472237
|
run-3/checkpoint-6000/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 14244
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:88211143abc4ea5f4e151fd815af9be01e8a86ec8565449bd20ccf3f1d4ddcb6
|
3 |
size 14244
|
run-3/checkpoint-6000/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5974e9076a3f51360e8ba5d82f806d211f25caaf2ff1e16f0a4a3a32639e126a
|
3 |
size 1064
|
run-3/checkpoint-6000/trainer_state.json
CHANGED
@@ -10,158 +10,158 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.6702412868632708,
|
13 |
-
"grad_norm":
|
14 |
-
"learning_rate":
|
15 |
-
"loss": 0.
|
16 |
"step": 500
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
-
"eval_accuracy": 0.
|
21 |
-
"eval_loss": 0.
|
22 |
-
"eval_runtime": 8.
|
23 |
-
"eval_samples_per_second":
|
24 |
-
"eval_steps_per_second": 21.
|
25 |
"step": 746
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.3404825737265416,
|
29 |
-
"grad_norm":
|
30 |
-
"learning_rate":
|
31 |
-
"loss": 0.
|
32 |
"step": 1000
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
-
"eval_accuracy": 0.
|
37 |
-
"eval_loss": 0.
|
38 |
-
"eval_runtime": 8.
|
39 |
-
"eval_samples_per_second":
|
40 |
-
"eval_steps_per_second": 21.
|
41 |
"step": 1492
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.0107238605898123,
|
45 |
-
"grad_norm":
|
46 |
-
"learning_rate":
|
47 |
-
"loss": 0.
|
48 |
"step": 1500
|
49 |
},
|
50 |
{
|
51 |
"epoch": 2.680965147453083,
|
52 |
-
"grad_norm":
|
53 |
-
"learning_rate": 1.
|
54 |
-
"loss": 0.
|
55 |
"step": 2000
|
56 |
},
|
57 |
{
|
58 |
"epoch": 3.0,
|
59 |
-
"eval_accuracy": 0.
|
60 |
-
"eval_loss": 0.
|
61 |
-
"eval_runtime": 8.
|
62 |
-
"eval_samples_per_second":
|
63 |
-
"eval_steps_per_second": 21.
|
64 |
"step": 2238
|
65 |
},
|
66 |
{
|
67 |
"epoch": 3.351206434316354,
|
68 |
-
"grad_norm":
|
69 |
-
"learning_rate": 1.
|
70 |
-
"loss": 0.
|
71 |
"step": 2500
|
72 |
},
|
73 |
{
|
74 |
"epoch": 4.0,
|
75 |
-
"eval_accuracy": 0.
|
76 |
-
"eval_loss": 1.
|
77 |
-
"eval_runtime": 8.
|
78 |
-
"eval_samples_per_second":
|
79 |
-
"eval_steps_per_second":
|
80 |
"step": 2984
|
81 |
},
|
82 |
{
|
83 |
"epoch": 4.021447721179625,
|
84 |
-
"grad_norm":
|
85 |
-
"learning_rate": 1.
|
86 |
-
"loss": 0.
|
87 |
"step": 3000
|
88 |
},
|
89 |
{
|
90 |
"epoch": 4.6916890080428955,
|
91 |
-
"grad_norm":
|
92 |
-
"learning_rate": 1.
|
93 |
-
"loss": 0.
|
94 |
"step": 3500
|
95 |
},
|
96 |
{
|
97 |
"epoch": 5.0,
|
98 |
-
"eval_accuracy": 0.
|
99 |
-
"eval_loss": 1.
|
100 |
-
"eval_runtime": 8.
|
101 |
-
"eval_samples_per_second":
|
102 |
-
"eval_steps_per_second": 21.
|
103 |
"step": 3730
|
104 |
},
|
105 |
{
|
106 |
"epoch": 5.361930294906166,
|
107 |
-
"grad_norm": 0.
|
108 |
-
"learning_rate":
|
109 |
-
"loss": 0.
|
110 |
"step": 4000
|
111 |
},
|
112 |
{
|
113 |
"epoch": 6.0,
|
114 |
-
"eval_accuracy": 0.
|
115 |
-
"eval_loss": 1.
|
116 |
-
"eval_runtime": 8.
|
117 |
-
"eval_samples_per_second":
|
118 |
-
"eval_steps_per_second": 21.
|
119 |
"step": 4476
|
120 |
},
|
121 |
{
|
122 |
"epoch": 6.032171581769437,
|
123 |
-
"grad_norm":
|
124 |
-
"learning_rate":
|
125 |
-
"loss": 0.
|
126 |
"step": 4500
|
127 |
},
|
128 |
{
|
129 |
"epoch": 6.702412868632708,
|
130 |
-
"grad_norm": 0.
|
131 |
-
"learning_rate":
|
132 |
-
"loss": 0.
|
133 |
"step": 5000
|
134 |
},
|
135 |
{
|
136 |
"epoch": 7.0,
|
137 |
-
"eval_accuracy": 0.
|
138 |
-
"eval_loss": 1.
|
139 |
-
"eval_runtime": 8.
|
140 |
-
"eval_samples_per_second":
|
141 |
-
"eval_steps_per_second": 21.
|
142 |
"step": 5222
|
143 |
},
|
144 |
{
|
145 |
"epoch": 7.372654155495979,
|
146 |
-
"grad_norm":
|
147 |
-
"learning_rate":
|
148 |
-
"loss": 0.
|
149 |
"step": 5500
|
150 |
},
|
151 |
{
|
152 |
"epoch": 8.0,
|
153 |
-
"eval_accuracy": 0.
|
154 |
-
"eval_loss": 1.
|
155 |
-
"eval_runtime": 8.
|
156 |
-
"eval_samples_per_second":
|
157 |
-
"eval_steps_per_second": 21.
|
158 |
"step": 5968
|
159 |
},
|
160 |
{
|
161 |
"epoch": 8.04289544235925,
|
162 |
-
"grad_norm": 0.
|
163 |
-
"learning_rate":
|
164 |
-
"loss": 0.
|
165 |
"step": 6000
|
166 |
}
|
167 |
],
|
@@ -170,11 +170,11 @@
|
|
170 |
"num_input_tokens_seen": 0,
|
171 |
"num_train_epochs": 10,
|
172 |
"save_steps": 500,
|
173 |
-
"total_flos": 1.
|
174 |
"train_batch_size": 16,
|
175 |
"trial_name": null,
|
176 |
"trial_params": {
|
177 |
-
"learning_rate": 2.
|
178 |
"per_device_train_batch_size": 16
|
179 |
}
|
180 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.6702412868632708,
|
13 |
+
"grad_norm": 5.694277763366699,
|
14 |
+
"learning_rate": 2.542005392773407e-05,
|
15 |
+
"loss": 0.557,
|
16 |
"step": 500
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
+
"eval_accuracy": 0.7254441976547241,
|
21 |
+
"eval_loss": 0.5171247720718384,
|
22 |
+
"eval_runtime": 8.773,
|
23 |
+
"eval_samples_per_second": 340.02,
|
24 |
+
"eval_steps_per_second": 21.315,
|
25 |
"step": 746
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.3404825737265416,
|
29 |
+
"grad_norm": 8.974740028381348,
|
30 |
+
"learning_rate": 2.359390062832789e-05,
|
31 |
+
"loss": 0.4156,
|
32 |
"step": 1000
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
+
"eval_accuracy": 0.7596379518508911,
|
37 |
+
"eval_loss": 0.6025224924087524,
|
38 |
+
"eval_runtime": 8.8883,
|
39 |
+
"eval_samples_per_second": 335.609,
|
40 |
+
"eval_steps_per_second": 21.039,
|
41 |
"step": 1492
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.0107238605898123,
|
45 |
+
"grad_norm": 7.7003068923950195,
|
46 |
+
"learning_rate": 2.1767747328921705e-05,
|
47 |
+
"loss": 0.2948,
|
48 |
"step": 1500
|
49 |
},
|
50 |
{
|
51 |
"epoch": 2.680965147453083,
|
52 |
+
"grad_norm": 20.24570655822754,
|
53 |
+
"learning_rate": 1.9941594029515523e-05,
|
54 |
+
"loss": 0.1262,
|
55 |
"step": 2000
|
56 |
},
|
57 |
{
|
58 |
"epoch": 3.0,
|
59 |
+
"eval_accuracy": 0.7703654170036316,
|
60 |
+
"eval_loss": 0.822274386882782,
|
61 |
+
"eval_runtime": 8.8709,
|
62 |
+
"eval_samples_per_second": 336.267,
|
63 |
+
"eval_steps_per_second": 21.08,
|
64 |
"step": 2238
|
65 |
},
|
66 |
{
|
67 |
"epoch": 3.351206434316354,
|
68 |
+
"grad_norm": 0.9093023538589478,
|
69 |
+
"learning_rate": 1.8115440730109338e-05,
|
70 |
+
"loss": 0.1012,
|
71 |
"step": 2500
|
72 |
},
|
73 |
{
|
74 |
"epoch": 4.0,
|
75 |
+
"eval_accuracy": 0.7683539986610413,
|
76 |
+
"eval_loss": 1.2840174436569214,
|
77 |
+
"eval_runtime": 8.9163,
|
78 |
+
"eval_samples_per_second": 334.557,
|
79 |
+
"eval_steps_per_second": 20.973,
|
80 |
"step": 2984
|
81 |
},
|
82 |
{
|
83 |
"epoch": 4.021447721179625,
|
84 |
+
"grad_norm": 29.010135650634766,
|
85 |
+
"learning_rate": 1.6289287430703153e-05,
|
86 |
+
"loss": 0.0675,
|
87 |
"step": 3000
|
88 |
},
|
89 |
{
|
90 |
"epoch": 4.6916890080428955,
|
91 |
+
"grad_norm": 5.461940288543701,
|
92 |
+
"learning_rate": 1.4463134131296973e-05,
|
93 |
+
"loss": 0.0379,
|
94 |
"step": 3500
|
95 |
},
|
96 |
{
|
97 |
"epoch": 5.0,
|
98 |
+
"eval_accuracy": 0.7700302004814148,
|
99 |
+
"eval_loss": 1.4166399240493774,
|
100 |
+
"eval_runtime": 8.8683,
|
101 |
+
"eval_samples_per_second": 336.367,
|
102 |
+
"eval_steps_per_second": 21.086,
|
103 |
"step": 3730
|
104 |
},
|
105 |
{
|
106 |
"epoch": 5.361930294906166,
|
107 |
+
"grad_norm": 0.004815839231014252,
|
108 |
+
"learning_rate": 1.2636980831890788e-05,
|
109 |
+
"loss": 0.034,
|
110 |
"step": 4000
|
111 |
},
|
112 |
{
|
113 |
"epoch": 6.0,
|
114 |
+
"eval_accuracy": 0.7720415592193604,
|
115 |
+
"eval_loss": 1.576446533203125,
|
116 |
+
"eval_runtime": 8.9004,
|
117 |
+
"eval_samples_per_second": 335.152,
|
118 |
+
"eval_steps_per_second": 21.01,
|
119 |
"step": 4476
|
120 |
},
|
121 |
{
|
122 |
"epoch": 6.032171581769437,
|
123 |
+
"grad_norm": 0.29464954137802124,
|
124 |
+
"learning_rate": 1.0810827532484605e-05,
|
125 |
+
"loss": 0.0175,
|
126 |
"step": 4500
|
127 |
},
|
128 |
{
|
129 |
"epoch": 6.702412868632708,
|
130 |
+
"grad_norm": 0.010658634826540947,
|
131 |
+
"learning_rate": 8.984674233078421e-06,
|
132 |
+
"loss": 0.0101,
|
133 |
"step": 5000
|
134 |
},
|
135 |
{
|
136 |
"epoch": 7.0,
|
137 |
+
"eval_accuracy": 0.7753939032554626,
|
138 |
+
"eval_loss": 1.5760776996612549,
|
139 |
+
"eval_runtime": 8.8735,
|
140 |
+
"eval_samples_per_second": 336.169,
|
141 |
+
"eval_steps_per_second": 21.074,
|
142 |
"step": 5222
|
143 |
},
|
144 |
{
|
145 |
"epoch": 7.372654155495979,
|
146 |
+
"grad_norm": 32.647804260253906,
|
147 |
+
"learning_rate": 7.158520933672239e-06,
|
148 |
+
"loss": 0.0101,
|
149 |
"step": 5500
|
150 |
},
|
151 |
{
|
152 |
"epoch": 8.0,
|
153 |
+
"eval_accuracy": 0.7733824849128723,
|
154 |
+
"eval_loss": 1.5171312093734741,
|
155 |
+
"eval_runtime": 8.9008,
|
156 |
+
"eval_samples_per_second": 335.137,
|
157 |
+
"eval_steps_per_second": 21.009,
|
158 |
"step": 5968
|
159 |
},
|
160 |
{
|
161 |
"epoch": 8.04289544235925,
|
162 |
+
"grad_norm": 0.006861701142042875,
|
163 |
+
"learning_rate": 5.332367634266056e-06,
|
164 |
+
"loss": 0.0064,
|
165 |
"step": 6000
|
166 |
}
|
167 |
],
|
|
|
170 |
"num_input_tokens_seen": 0,
|
171 |
"num_train_epochs": 10,
|
172 |
"save_steps": 500,
|
173 |
+
"total_flos": 1.425589883589798e+16,
|
174 |
"train_batch_size": 16,
|
175 |
"trial_name": null,
|
176 |
"trial_params": {
|
177 |
+
"learning_rate": 2.7246207227140256e-05,
|
178 |
"per_device_train_batch_size": 16
|
179 |
}
|
180 |
}
|
run-3/checkpoint-6000/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5048
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
|
3 |
size 5048
|
run-3/checkpoint-6500/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1340618660
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:31ec2188671ef60fb5d11cf43ed926a0f8ad799f26919a5a8a6693b9245fc4e5
|
3 |
size 1340618660
|
run-3/checkpoint-6500/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 2681472237
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2f6b8b3b7ccb46348a8b4d934977929d9fd3ca1af1cd003a247aa822c1bfb929
|
3 |
size 2681472237
|
run-3/checkpoint-6500/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 14244
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f5b81d476f21a8f359e4d3f42b921b67967019e455d484e4de9d785117a493ca
|
3 |
size 14244
|
run-3/checkpoint-6500/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:773f880e1ac6dc3230d622b85e4abbcadb78c0f0a15af651b1285dedb6e9a315
|
3 |
size 1064
|
run-3/checkpoint-6500/trainer_state.json
CHANGED
@@ -10,165 +10,165 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.6702412868632708,
|
13 |
-
"grad_norm":
|
14 |
-
"learning_rate":
|
15 |
-
"loss": 0.
|
16 |
"step": 500
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
-
"eval_accuracy": 0.
|
21 |
-
"eval_loss": 0.
|
22 |
-
"eval_runtime": 8.
|
23 |
-
"eval_samples_per_second":
|
24 |
-
"eval_steps_per_second": 21.
|
25 |
"step": 746
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.3404825737265416,
|
29 |
-
"grad_norm":
|
30 |
-
"learning_rate":
|
31 |
-
"loss": 0.
|
32 |
"step": 1000
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
-
"eval_accuracy": 0.
|
37 |
-
"eval_loss": 0.
|
38 |
-
"eval_runtime": 8.
|
39 |
-
"eval_samples_per_second":
|
40 |
-
"eval_steps_per_second": 21.
|
41 |
"step": 1492
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.0107238605898123,
|
45 |
-
"grad_norm":
|
46 |
-
"learning_rate":
|
47 |
-
"loss": 0.
|
48 |
"step": 1500
|
49 |
},
|
50 |
{
|
51 |
"epoch": 2.680965147453083,
|
52 |
-
"grad_norm":
|
53 |
-
"learning_rate": 1.
|
54 |
-
"loss": 0.
|
55 |
"step": 2000
|
56 |
},
|
57 |
{
|
58 |
"epoch": 3.0,
|
59 |
-
"eval_accuracy": 0.
|
60 |
-
"eval_loss": 0.
|
61 |
-
"eval_runtime": 8.
|
62 |
-
"eval_samples_per_second":
|
63 |
-
"eval_steps_per_second": 21.
|
64 |
"step": 2238
|
65 |
},
|
66 |
{
|
67 |
"epoch": 3.351206434316354,
|
68 |
-
"grad_norm":
|
69 |
-
"learning_rate": 1.
|
70 |
-
"loss": 0.
|
71 |
"step": 2500
|
72 |
},
|
73 |
{
|
74 |
"epoch": 4.0,
|
75 |
-
"eval_accuracy": 0.
|
76 |
-
"eval_loss": 1.
|
77 |
-
"eval_runtime": 8.
|
78 |
-
"eval_samples_per_second":
|
79 |
-
"eval_steps_per_second":
|
80 |
"step": 2984
|
81 |
},
|
82 |
{
|
83 |
"epoch": 4.021447721179625,
|
84 |
-
"grad_norm":
|
85 |
-
"learning_rate": 1.
|
86 |
-
"loss": 0.
|
87 |
"step": 3000
|
88 |
},
|
89 |
{
|
90 |
"epoch": 4.6916890080428955,
|
91 |
-
"grad_norm":
|
92 |
-
"learning_rate": 1.
|
93 |
-
"loss": 0.
|
94 |
"step": 3500
|
95 |
},
|
96 |
{
|
97 |
"epoch": 5.0,
|
98 |
-
"eval_accuracy": 0.
|
99 |
-
"eval_loss": 1.
|
100 |
-
"eval_runtime": 8.
|
101 |
-
"eval_samples_per_second":
|
102 |
-
"eval_steps_per_second": 21.
|
103 |
"step": 3730
|
104 |
},
|
105 |
{
|
106 |
"epoch": 5.361930294906166,
|
107 |
-
"grad_norm": 0.
|
108 |
-
"learning_rate":
|
109 |
-
"loss": 0.
|
110 |
"step": 4000
|
111 |
},
|
112 |
{
|
113 |
"epoch": 6.0,
|
114 |
-
"eval_accuracy": 0.
|
115 |
-
"eval_loss": 1.
|
116 |
-
"eval_runtime": 8.
|
117 |
-
"eval_samples_per_second":
|
118 |
-
"eval_steps_per_second": 21.
|
119 |
"step": 4476
|
120 |
},
|
121 |
{
|
122 |
"epoch": 6.032171581769437,
|
123 |
-
"grad_norm":
|
124 |
-
"learning_rate":
|
125 |
-
"loss": 0.
|
126 |
"step": 4500
|
127 |
},
|
128 |
{
|
129 |
"epoch": 6.702412868632708,
|
130 |
-
"grad_norm": 0.
|
131 |
-
"learning_rate":
|
132 |
-
"loss": 0.
|
133 |
"step": 5000
|
134 |
},
|
135 |
{
|
136 |
"epoch": 7.0,
|
137 |
-
"eval_accuracy": 0.
|
138 |
-
"eval_loss": 1.
|
139 |
-
"eval_runtime": 8.
|
140 |
-
"eval_samples_per_second":
|
141 |
-
"eval_steps_per_second": 21.
|
142 |
"step": 5222
|
143 |
},
|
144 |
{
|
145 |
"epoch": 7.372654155495979,
|
146 |
-
"grad_norm":
|
147 |
-
"learning_rate":
|
148 |
-
"loss": 0.
|
149 |
"step": 5500
|
150 |
},
|
151 |
{
|
152 |
"epoch": 8.0,
|
153 |
-
"eval_accuracy": 0.
|
154 |
-
"eval_loss": 1.
|
155 |
-
"eval_runtime": 8.
|
156 |
-
"eval_samples_per_second":
|
157 |
-
"eval_steps_per_second": 21.
|
158 |
"step": 5968
|
159 |
},
|
160 |
{
|
161 |
"epoch": 8.04289544235925,
|
162 |
-
"grad_norm": 0.
|
163 |
-
"learning_rate":
|
164 |
-
"loss": 0.
|
165 |
"step": 6000
|
166 |
},
|
167 |
{
|
168 |
"epoch": 8.71313672922252,
|
169 |
-
"grad_norm":
|
170 |
-
"learning_rate":
|
171 |
-
"loss": 0.
|
172 |
"step": 6500
|
173 |
}
|
174 |
],
|
@@ -177,11 +177,11 @@
|
|
177 |
"num_input_tokens_seen": 0,
|
178 |
"num_train_epochs": 10,
|
179 |
"save_steps": 500,
|
180 |
-
"total_flos": 1.
|
181 |
"train_batch_size": 16,
|
182 |
"trial_name": null,
|
183 |
"trial_params": {
|
184 |
-
"learning_rate": 2.
|
185 |
"per_device_train_batch_size": 16
|
186 |
}
|
187 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.6702412868632708,
|
13 |
+
"grad_norm": 5.694277763366699,
|
14 |
+
"learning_rate": 2.542005392773407e-05,
|
15 |
+
"loss": 0.557,
|
16 |
"step": 500
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
+
"eval_accuracy": 0.7254441976547241,
|
21 |
+
"eval_loss": 0.5171247720718384,
|
22 |
+
"eval_runtime": 8.773,
|
23 |
+
"eval_samples_per_second": 340.02,
|
24 |
+
"eval_steps_per_second": 21.315,
|
25 |
"step": 746
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.3404825737265416,
|
29 |
+
"grad_norm": 8.974740028381348,
|
30 |
+
"learning_rate": 2.359390062832789e-05,
|
31 |
+
"loss": 0.4156,
|
32 |
"step": 1000
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
+
"eval_accuracy": 0.7596379518508911,
|
37 |
+
"eval_loss": 0.6025224924087524,
|
38 |
+
"eval_runtime": 8.8883,
|
39 |
+
"eval_samples_per_second": 335.609,
|
40 |
+
"eval_steps_per_second": 21.039,
|
41 |
"step": 1492
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.0107238605898123,
|
45 |
+
"grad_norm": 7.7003068923950195,
|
46 |
+
"learning_rate": 2.1767747328921705e-05,
|
47 |
+
"loss": 0.2948,
|
48 |
"step": 1500
|
49 |
},
|
50 |
{
|
51 |
"epoch": 2.680965147453083,
|
52 |
+
"grad_norm": 20.24570655822754,
|
53 |
+
"learning_rate": 1.9941594029515523e-05,
|
54 |
+
"loss": 0.1262,
|
55 |
"step": 2000
|
56 |
},
|
57 |
{
|
58 |
"epoch": 3.0,
|
59 |
+
"eval_accuracy": 0.7703654170036316,
|
60 |
+
"eval_loss": 0.822274386882782,
|
61 |
+
"eval_runtime": 8.8709,
|
62 |
+
"eval_samples_per_second": 336.267,
|
63 |
+
"eval_steps_per_second": 21.08,
|
64 |
"step": 2238
|
65 |
},
|
66 |
{
|
67 |
"epoch": 3.351206434316354,
|
68 |
+
"grad_norm": 0.9093023538589478,
|
69 |
+
"learning_rate": 1.8115440730109338e-05,
|
70 |
+
"loss": 0.1012,
|
71 |
"step": 2500
|
72 |
},
|
73 |
{
|
74 |
"epoch": 4.0,
|
75 |
+
"eval_accuracy": 0.7683539986610413,
|
76 |
+
"eval_loss": 1.2840174436569214,
|
77 |
+
"eval_runtime": 8.9163,
|
78 |
+
"eval_samples_per_second": 334.557,
|
79 |
+
"eval_steps_per_second": 20.973,
|
80 |
"step": 2984
|
81 |
},
|
82 |
{
|
83 |
"epoch": 4.021447721179625,
|
84 |
+
"grad_norm": 29.010135650634766,
|
85 |
+
"learning_rate": 1.6289287430703153e-05,
|
86 |
+
"loss": 0.0675,
|
87 |
"step": 3000
|
88 |
},
|
89 |
{
|
90 |
"epoch": 4.6916890080428955,
|
91 |
+
"grad_norm": 5.461940288543701,
|
92 |
+
"learning_rate": 1.4463134131296973e-05,
|
93 |
+
"loss": 0.0379,
|
94 |
"step": 3500
|
95 |
},
|
96 |
{
|
97 |
"epoch": 5.0,
|
98 |
+
"eval_accuracy": 0.7700302004814148,
|
99 |
+
"eval_loss": 1.4166399240493774,
|
100 |
+
"eval_runtime": 8.8683,
|
101 |
+
"eval_samples_per_second": 336.367,
|
102 |
+
"eval_steps_per_second": 21.086,
|
103 |
"step": 3730
|
104 |
},
|
105 |
{
|
106 |
"epoch": 5.361930294906166,
|
107 |
+
"grad_norm": 0.004815839231014252,
|
108 |
+
"learning_rate": 1.2636980831890788e-05,
|
109 |
+
"loss": 0.034,
|
110 |
"step": 4000
|
111 |
},
|
112 |
{
|
113 |
"epoch": 6.0,
|
114 |
+
"eval_accuracy": 0.7720415592193604,
|
115 |
+
"eval_loss": 1.576446533203125,
|
116 |
+
"eval_runtime": 8.9004,
|
117 |
+
"eval_samples_per_second": 335.152,
|
118 |
+
"eval_steps_per_second": 21.01,
|
119 |
"step": 4476
|
120 |
},
|
121 |
{
|
122 |
"epoch": 6.032171581769437,
|
123 |
+
"grad_norm": 0.29464954137802124,
|
124 |
+
"learning_rate": 1.0810827532484605e-05,
|
125 |
+
"loss": 0.0175,
|
126 |
"step": 4500
|
127 |
},
|
128 |
{
|
129 |
"epoch": 6.702412868632708,
|
130 |
+
"grad_norm": 0.010658634826540947,
|
131 |
+
"learning_rate": 8.984674233078421e-06,
|
132 |
+
"loss": 0.0101,
|
133 |
"step": 5000
|
134 |
},
|
135 |
{
|
136 |
"epoch": 7.0,
|
137 |
+
"eval_accuracy": 0.7753939032554626,
|
138 |
+
"eval_loss": 1.5760776996612549,
|
139 |
+
"eval_runtime": 8.8735,
|
140 |
+
"eval_samples_per_second": 336.169,
|
141 |
+
"eval_steps_per_second": 21.074,
|
142 |
"step": 5222
|
143 |
},
|
144 |
{
|
145 |
"epoch": 7.372654155495979,
|
146 |
+
"grad_norm": 32.647804260253906,
|
147 |
+
"learning_rate": 7.158520933672239e-06,
|
148 |
+
"loss": 0.0101,
|
149 |
"step": 5500
|
150 |
},
|
151 |
{
|
152 |
"epoch": 8.0,
|
153 |
+
"eval_accuracy": 0.7733824849128723,
|
154 |
+
"eval_loss": 1.5171312093734741,
|
155 |
+
"eval_runtime": 8.9008,
|
156 |
+
"eval_samples_per_second": 335.137,
|
157 |
+
"eval_steps_per_second": 21.009,
|
158 |
"step": 5968
|
159 |
},
|
160 |
{
|
161 |
"epoch": 8.04289544235925,
|
162 |
+
"grad_norm": 0.006861701142042875,
|
163 |
+
"learning_rate": 5.332367634266056e-06,
|
164 |
+
"loss": 0.0064,
|
165 |
"step": 6000
|
166 |
},
|
167 |
{
|
168 |
"epoch": 8.71313672922252,
|
169 |
+
"grad_norm": 1.2482685633585788e-05,
|
170 |
+
"learning_rate": 3.5062143348598724e-06,
|
171 |
+
"loss": 0.0028,
|
172 |
"step": 6500
|
173 |
}
|
174 |
],
|
|
|
177 |
"num_input_tokens_seen": 0,
|
178 |
"num_train_epochs": 10,
|
179 |
"save_steps": 500,
|
180 |
+
"total_flos": 1.5455057465941884e+16,
|
181 |
"train_batch_size": 16,
|
182 |
"trial_name": null,
|
183 |
"trial_params": {
|
184 |
+
"learning_rate": 2.7246207227140256e-05,
|
185 |
"per_device_train_batch_size": 16
|
186 |
}
|
187 |
}
|
run-3/checkpoint-6500/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5048
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
|
3 |
size 5048
|
run-3/checkpoint-7000/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1340618660
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3c4a8a041c3286f74682e35df552e9fc99c021008e7d2b87738f3fd82618c362
|
3 |
size 1340618660
|
run-3/checkpoint-7000/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 2681472237
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8cde3998752916ce1a2e2a136402465ce51c3e96c07515ddae0b4246d99415b6
|
3 |
size 2681472237
|
run-3/checkpoint-7000/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 14244
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2d9034b167dd57fcb13ab131f7a6b12c7467166bd3b6746d19284f0b0fe4a597
|
3 |
size 14244
|
run-3/checkpoint-7000/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:089dd2cbb01d7d635a13cc85346cf219731871b043661ac60d6bbf8e6d664db2
|
3 |
size 1064
|
run-3/checkpoint-7000/trainer_state.json
CHANGED
@@ -10,181 +10,181 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.6702412868632708,
|
13 |
-
"grad_norm":
|
14 |
-
"learning_rate":
|
15 |
-
"loss": 0.
|
16 |
"step": 500
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
-
"eval_accuracy": 0.
|
21 |
-
"eval_loss": 0.
|
22 |
-
"eval_runtime": 8.
|
23 |
-
"eval_samples_per_second":
|
24 |
-
"eval_steps_per_second": 21.
|
25 |
"step": 746
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.3404825737265416,
|
29 |
-
"grad_norm":
|
30 |
-
"learning_rate":
|
31 |
-
"loss": 0.
|
32 |
"step": 1000
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
-
"eval_accuracy": 0.
|
37 |
-
"eval_loss": 0.
|
38 |
-
"eval_runtime": 8.
|
39 |
-
"eval_samples_per_second":
|
40 |
-
"eval_steps_per_second": 21.
|
41 |
"step": 1492
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.0107238605898123,
|
45 |
-
"grad_norm":
|
46 |
-
"learning_rate":
|
47 |
-
"loss": 0.
|
48 |
"step": 1500
|
49 |
},
|
50 |
{
|
51 |
"epoch": 2.680965147453083,
|
52 |
-
"grad_norm":
|
53 |
-
"learning_rate": 1.
|
54 |
-
"loss": 0.
|
55 |
"step": 2000
|
56 |
},
|
57 |
{
|
58 |
"epoch": 3.0,
|
59 |
-
"eval_accuracy": 0.
|
60 |
-
"eval_loss": 0.
|
61 |
-
"eval_runtime": 8.
|
62 |
-
"eval_samples_per_second":
|
63 |
-
"eval_steps_per_second": 21.
|
64 |
"step": 2238
|
65 |
},
|
66 |
{
|
67 |
"epoch": 3.351206434316354,
|
68 |
-
"grad_norm":
|
69 |
-
"learning_rate": 1.
|
70 |
-
"loss": 0.
|
71 |
"step": 2500
|
72 |
},
|
73 |
{
|
74 |
"epoch": 4.0,
|
75 |
-
"eval_accuracy": 0.
|
76 |
-
"eval_loss": 1.
|
77 |
-
"eval_runtime": 8.
|
78 |
-
"eval_samples_per_second":
|
79 |
-
"eval_steps_per_second":
|
80 |
"step": 2984
|
81 |
},
|
82 |
{
|
83 |
"epoch": 4.021447721179625,
|
84 |
-
"grad_norm":
|
85 |
-
"learning_rate": 1.
|
86 |
-
"loss": 0.
|
87 |
"step": 3000
|
88 |
},
|
89 |
{
|
90 |
"epoch": 4.6916890080428955,
|
91 |
-
"grad_norm":
|
92 |
-
"learning_rate": 1.
|
93 |
-
"loss": 0.
|
94 |
"step": 3500
|
95 |
},
|
96 |
{
|
97 |
"epoch": 5.0,
|
98 |
-
"eval_accuracy": 0.
|
99 |
-
"eval_loss": 1.
|
100 |
-
"eval_runtime": 8.
|
101 |
-
"eval_samples_per_second":
|
102 |
-
"eval_steps_per_second": 21.
|
103 |
"step": 3730
|
104 |
},
|
105 |
{
|
106 |
"epoch": 5.361930294906166,
|
107 |
-
"grad_norm": 0.
|
108 |
-
"learning_rate":
|
109 |
-
"loss": 0.
|
110 |
"step": 4000
|
111 |
},
|
112 |
{
|
113 |
"epoch": 6.0,
|
114 |
-
"eval_accuracy": 0.
|
115 |
-
"eval_loss": 1.
|
116 |
-
"eval_runtime": 8.
|
117 |
-
"eval_samples_per_second":
|
118 |
-
"eval_steps_per_second": 21.
|
119 |
"step": 4476
|
120 |
},
|
121 |
{
|
122 |
"epoch": 6.032171581769437,
|
123 |
-
"grad_norm":
|
124 |
-
"learning_rate":
|
125 |
-
"loss": 0.
|
126 |
"step": 4500
|
127 |
},
|
128 |
{
|
129 |
"epoch": 6.702412868632708,
|
130 |
-
"grad_norm": 0.
|
131 |
-
"learning_rate":
|
132 |
-
"loss": 0.
|
133 |
"step": 5000
|
134 |
},
|
135 |
{
|
136 |
"epoch": 7.0,
|
137 |
-
"eval_accuracy": 0.
|
138 |
-
"eval_loss": 1.
|
139 |
-
"eval_runtime": 8.
|
140 |
-
"eval_samples_per_second":
|
141 |
-
"eval_steps_per_second": 21.
|
142 |
"step": 5222
|
143 |
},
|
144 |
{
|
145 |
"epoch": 7.372654155495979,
|
146 |
-
"grad_norm":
|
147 |
-
"learning_rate":
|
148 |
-
"loss": 0.
|
149 |
"step": 5500
|
150 |
},
|
151 |
{
|
152 |
"epoch": 8.0,
|
153 |
-
"eval_accuracy": 0.
|
154 |
-
"eval_loss": 1.
|
155 |
-
"eval_runtime": 8.
|
156 |
-
"eval_samples_per_second":
|
157 |
-
"eval_steps_per_second": 21.
|
158 |
"step": 5968
|
159 |
},
|
160 |
{
|
161 |
"epoch": 8.04289544235925,
|
162 |
-
"grad_norm": 0.
|
163 |
-
"learning_rate":
|
164 |
-
"loss": 0.
|
165 |
"step": 6000
|
166 |
},
|
167 |
{
|
168 |
"epoch": 8.71313672922252,
|
169 |
-
"grad_norm":
|
170 |
-
"learning_rate":
|
171 |
-
"loss": 0.
|
172 |
"step": 6500
|
173 |
},
|
174 |
{
|
175 |
"epoch": 9.0,
|
176 |
-
"eval_accuracy": 0.
|
177 |
-
"eval_loss": 1.
|
178 |
-
"eval_runtime": 8.
|
179 |
-
"eval_samples_per_second":
|
180 |
-
"eval_steps_per_second": 21.
|
181 |
"step": 6714
|
182 |
},
|
183 |
{
|
184 |
"epoch": 9.383378016085791,
|
185 |
-
"grad_norm":
|
186 |
-
"learning_rate": 1.
|
187 |
-
"loss": 0.
|
188 |
"step": 7000
|
189 |
}
|
190 |
],
|
@@ -193,11 +193,11 @@
|
|
193 |
"num_input_tokens_seen": 0,
|
194 |
"num_train_epochs": 10,
|
195 |
"save_steps": 500,
|
196 |
-
"total_flos": 1.
|
197 |
"train_batch_size": 16,
|
198 |
"trial_name": null,
|
199 |
"trial_params": {
|
200 |
-
"learning_rate": 2.
|
201 |
"per_device_train_batch_size": 16
|
202 |
}
|
203 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.6702412868632708,
|
13 |
+
"grad_norm": 5.694277763366699,
|
14 |
+
"learning_rate": 2.542005392773407e-05,
|
15 |
+
"loss": 0.557,
|
16 |
"step": 500
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
+
"eval_accuracy": 0.7254441976547241,
|
21 |
+
"eval_loss": 0.5171247720718384,
|
22 |
+
"eval_runtime": 8.773,
|
23 |
+
"eval_samples_per_second": 340.02,
|
24 |
+
"eval_steps_per_second": 21.315,
|
25 |
"step": 746
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.3404825737265416,
|
29 |
+
"grad_norm": 8.974740028381348,
|
30 |
+
"learning_rate": 2.359390062832789e-05,
|
31 |
+
"loss": 0.4156,
|
32 |
"step": 1000
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
+
"eval_accuracy": 0.7596379518508911,
|
37 |
+
"eval_loss": 0.6025224924087524,
|
38 |
+
"eval_runtime": 8.8883,
|
39 |
+
"eval_samples_per_second": 335.609,
|
40 |
+
"eval_steps_per_second": 21.039,
|
41 |
"step": 1492
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.0107238605898123,
|
45 |
+
"grad_norm": 7.7003068923950195,
|
46 |
+
"learning_rate": 2.1767747328921705e-05,
|
47 |
+
"loss": 0.2948,
|
48 |
"step": 1500
|
49 |
},
|
50 |
{
|
51 |
"epoch": 2.680965147453083,
|
52 |
+
"grad_norm": 20.24570655822754,
|
53 |
+
"learning_rate": 1.9941594029515523e-05,
|
54 |
+
"loss": 0.1262,
|
55 |
"step": 2000
|
56 |
},
|
57 |
{
|
58 |
"epoch": 3.0,
|
59 |
+
"eval_accuracy": 0.7703654170036316,
|
60 |
+
"eval_loss": 0.822274386882782,
|
61 |
+
"eval_runtime": 8.8709,
|
62 |
+
"eval_samples_per_second": 336.267,
|
63 |
+
"eval_steps_per_second": 21.08,
|
64 |
"step": 2238
|
65 |
},
|
66 |
{
|
67 |
"epoch": 3.351206434316354,
|
68 |
+
"grad_norm": 0.9093023538589478,
|
69 |
+
"learning_rate": 1.8115440730109338e-05,
|
70 |
+
"loss": 0.1012,
|
71 |
"step": 2500
|
72 |
},
|
73 |
{
|
74 |
"epoch": 4.0,
|
75 |
+
"eval_accuracy": 0.7683539986610413,
|
76 |
+
"eval_loss": 1.2840174436569214,
|
77 |
+
"eval_runtime": 8.9163,
|
78 |
+
"eval_samples_per_second": 334.557,
|
79 |
+
"eval_steps_per_second": 20.973,
|
80 |
"step": 2984
|
81 |
},
|
82 |
{
|
83 |
"epoch": 4.021447721179625,
|
84 |
+
"grad_norm": 29.010135650634766,
|
85 |
+
"learning_rate": 1.6289287430703153e-05,
|
86 |
+
"loss": 0.0675,
|
87 |
"step": 3000
|
88 |
},
|
89 |
{
|
90 |
"epoch": 4.6916890080428955,
|
91 |
+
"grad_norm": 5.461940288543701,
|
92 |
+
"learning_rate": 1.4463134131296973e-05,
|
93 |
+
"loss": 0.0379,
|
94 |
"step": 3500
|
95 |
},
|
96 |
{
|
97 |
"epoch": 5.0,
|
98 |
+
"eval_accuracy": 0.7700302004814148,
|
99 |
+
"eval_loss": 1.4166399240493774,
|
100 |
+
"eval_runtime": 8.8683,
|
101 |
+
"eval_samples_per_second": 336.367,
|
102 |
+
"eval_steps_per_second": 21.086,
|
103 |
"step": 3730
|
104 |
},
|
105 |
{
|
106 |
"epoch": 5.361930294906166,
|
107 |
+
"grad_norm": 0.004815839231014252,
|
108 |
+
"learning_rate": 1.2636980831890788e-05,
|
109 |
+
"loss": 0.034,
|
110 |
"step": 4000
|
111 |
},
|
112 |
{
|
113 |
"epoch": 6.0,
|
114 |
+
"eval_accuracy": 0.7720415592193604,
|
115 |
+
"eval_loss": 1.576446533203125,
|
116 |
+
"eval_runtime": 8.9004,
|
117 |
+
"eval_samples_per_second": 335.152,
|
118 |
+
"eval_steps_per_second": 21.01,
|
119 |
"step": 4476
|
120 |
},
|
121 |
{
|
122 |
"epoch": 6.032171581769437,
|
123 |
+
"grad_norm": 0.29464954137802124,
|
124 |
+
"learning_rate": 1.0810827532484605e-05,
|
125 |
+
"loss": 0.0175,
|
126 |
"step": 4500
|
127 |
},
|
128 |
{
|
129 |
"epoch": 6.702412868632708,
|
130 |
+
"grad_norm": 0.010658634826540947,
|
131 |
+
"learning_rate": 8.984674233078421e-06,
|
132 |
+
"loss": 0.0101,
|
133 |
"step": 5000
|
134 |
},
|
135 |
{
|
136 |
"epoch": 7.0,
|
137 |
+
"eval_accuracy": 0.7753939032554626,
|
138 |
+
"eval_loss": 1.5760776996612549,
|
139 |
+
"eval_runtime": 8.8735,
|
140 |
+
"eval_samples_per_second": 336.169,
|
141 |
+
"eval_steps_per_second": 21.074,
|
142 |
"step": 5222
|
143 |
},
|
144 |
{
|
145 |
"epoch": 7.372654155495979,
|
146 |
+
"grad_norm": 32.647804260253906,
|
147 |
+
"learning_rate": 7.158520933672239e-06,
|
148 |
+
"loss": 0.0101,
|
149 |
"step": 5500
|
150 |
},
|
151 |
{
|
152 |
"epoch": 8.0,
|
153 |
+
"eval_accuracy": 0.7733824849128723,
|
154 |
+
"eval_loss": 1.5171312093734741,
|
155 |
+
"eval_runtime": 8.9008,
|
156 |
+
"eval_samples_per_second": 335.137,
|
157 |
+
"eval_steps_per_second": 21.009,
|
158 |
"step": 5968
|
159 |
},
|
160 |
{
|
161 |
"epoch": 8.04289544235925,
|
162 |
+
"grad_norm": 0.006861701142042875,
|
163 |
+
"learning_rate": 5.332367634266056e-06,
|
164 |
+
"loss": 0.0064,
|
165 |
"step": 6000
|
166 |
},
|
167 |
{
|
168 |
"epoch": 8.71313672922252,
|
169 |
+
"grad_norm": 1.2482685633585788e-05,
|
170 |
+
"learning_rate": 3.5062143348598724e-06,
|
171 |
+
"loss": 0.0028,
|
172 |
"step": 6500
|
173 |
},
|
174 |
{
|
175 |
"epoch": 9.0,
|
176 |
+
"eval_accuracy": 0.7717063426971436,
|
177 |
+
"eval_loss": 1.7022887468338013,
|
178 |
+
"eval_runtime": 8.8529,
|
179 |
+
"eval_samples_per_second": 336.951,
|
180 |
+
"eval_steps_per_second": 21.123,
|
181 |
"step": 6714
|
182 |
},
|
183 |
{
|
184 |
"epoch": 9.383378016085791,
|
185 |
+
"grad_norm": 0.0012023162562400103,
|
186 |
+
"learning_rate": 1.6800610354536887e-06,
|
187 |
+
"loss": 0.0055,
|
188 |
"step": 7000
|
189 |
}
|
190 |
],
|
|
|
193 |
"num_input_tokens_seen": 0,
|
194 |
"num_train_epochs": 10,
|
195 |
"save_steps": 500,
|
196 |
+
"total_flos": 1.6633284114604788e+16,
|
197 |
"train_batch_size": 16,
|
198 |
"trial_name": null,
|
199 |
"trial_params": {
|
200 |
+
"learning_rate": 2.7246207227140256e-05,
|
201 |
"per_device_train_batch_size": 16
|
202 |
}
|
203 |
}
|
run-3/checkpoint-7000/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5048
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
|
3 |
size 5048
|