inflaton commited on
Commit
472b12c
1 Parent(s): b8f963c

Training in progress, step 3000

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29808a477e6dfe819802c452b4eb8059a040db301b42679e25eb9cd4177ce709
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db0c754a09ef5d8f060aae9b2d0f9bdc0b4a43e371f56b329e9aad51b41edeb0
3
  size 1340618660
run-3/checkpoint-1500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c81a9f4242b6291e55b39407144fcc903a35dde7595a8f838278f2c3657269c2
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7887ff6fdcfcb9a94904c2ad7372830450b6301ec1dbce3f92b065eba06941a4
3
  size 1340618660
run-3/checkpoint-1500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:088951e1efcae1165260a06a5f79972a7473991f6a84cbe38dfde9bbc32ab8ce
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3c8c491aa2ef552b9b8f4452d20894cb90a050098ec0977ebc929d7c381bb39
3
  size 2681472237
run-3/checkpoint-1500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bdad94d4629168a2bf6ebe9809c6aa9b7f4c7bb49624257aed0f70092ede3df7
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b6599d5d79187827e050787339b527eef0d6c77ccebbad27127fd9a8877fcbb
3
  size 14244
run-3/checkpoint-1500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c4a08d061763b774416959da40fe22321436bf84b743529ed3e16bd678714ed4
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cbcf7ad6996437855c9692400a0f0ae25c519dcf1952481151980bbd0b8b052
3
  size 1064
run-3/checkpoint-1500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7873f643bbe0bcde81dd97f76bd7f35cc2bc225794540a61053f0708234413f
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
3
  size 5048
run-3/checkpoint-2000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d7e639e12fa05ab793737459f1a86157dfb76d767f1d5eb19931edb98017649
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d310e5f6a293ba94a6494ee07c9f37f5230a179f3e90988a6d2186344d88439c
3
  size 1340618660
run-3/checkpoint-2000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72206922f1b9081896184e0e81442f186f632d854eaa2a989a764da3289fafd7
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:258a4b43c754deffeca3aa94a6396e56b25a16cf6cf5e2da88d49e5fd6572574
3
  size 2681472237
run-3/checkpoint-2000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:28b3bc1d93c8e5d77d91605f4094eab1adea91fbfc16b782d04a37130f913871
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:025eebcb042d7b5cf8c379a84ed4c03984839f35c20062f1c7126a35ed7e9a3f
3
  size 14244
run-3/checkpoint-2000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c5bf5016802e656b127c6e2dfcd024ab9f999b8198f48afb1fbc97e7f70a98a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5326b95a612a9d55b41f1e7997ba3abf5d10b502b886d90e00f8d84768789c20
3
  size 1064
run-3/checkpoint-2000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7873f643bbe0bcde81dd97f76bd7f35cc2bc225794540a61053f0708234413f
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
3
  size 5048
run-3/checkpoint-2500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f888cc9b2d239afd1f796ecf442889dcb89847e1551d533f970a04b47784e205
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6aad66fef9c4a0979cc830ad430deca460b6d6d0fa6bf46b919be0daaa1b2f2
3
  size 1340618660
run-3/checkpoint-2500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:833200cadeb92b311e1e85963436525c1884e05d9f5dd523bc334344740d456a
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93abc2c4970fa10d9ee4cf65fe85e56a27bac9d889b457729ea8dfc99bd14b1b
3
  size 2681472237
run-3/checkpoint-2500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e901d2a4b6b7576bd2641f76be3ac251aed1d25d8496953ff1ca82ba6468b670
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89585a93d0d1163797229c71d3b2b10076d6e8ddc0ecb610145c0b5486d9ef45
3
  size 14244
run-3/checkpoint-2500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7420d9f04f235d5ca197b6ebe7418b52a8bf4c738ae163581d29c15a251463e0
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:781508431d34282ee15a4090d0bfb23ab89ba75cf1da881018817bf023ddb210
3
  size 1064
run-3/checkpoint-2500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7873f643bbe0bcde81dd97f76bd7f35cc2bc225794540a61053f0708234413f
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
3
  size 5048
run-3/checkpoint-3000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a80e1edfa34ca6c9f099aeb1b12228bd8f4731ff771d848cc487600202c82666
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db0c754a09ef5d8f060aae9b2d0f9bdc0b4a43e371f56b329e9aad51b41edeb0
3
  size 1340618660
run-3/checkpoint-3000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:af1ed3f623429f004ff11ec563af27a5c9e0edc66b8bb942ad852c8b20892568
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d64a422180a34fc6f488ee7a791a05a66c2c8deac159f561c7cd2c6d1193147e
3
  size 2681472237
run-3/checkpoint-3000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4fda71fd5f2516582f65b43df160d9b94f0e4b0fc28d9135bb5b0ac484f494d5
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73ddd981a153c8372979053534b577e7b34ff4a45e4a8f39b8176414035e6ee1
3
  size 14244
run-3/checkpoint-3000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f283f1f579c8656f5cf2b68f582eabd3dd9ab3ae1c7e0a2e5b7311b18f0970c5
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92458ac9f1c9b1daeec2b10312e11502ad473746484a7e588e99ff151678a289
3
  size 1064
run-3/checkpoint-3000/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 8.04289544235925,
5
  "eval_steps": 500,
6
  "global_step": 3000,
7
  "is_hyper_param_search": true,
@@ -9,130 +9,94 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "eval_accuracy": 0.7727120518684387,
14
- "eval_loss": 0.46057018637657166,
15
- "eval_runtime": 8.9113,
16
- "eval_samples_per_second": 334.743,
17
- "eval_steps_per_second": 20.985,
18
- "step": 373
19
- },
20
- {
21
- "epoch": 1.3404825737265416,
22
- "grad_norm": 8.923430442810059,
23
- "learning_rate": 2.587666867882196e-05,
24
- "loss": 0.4651,
25
  "step": 500
26
  },
27
  {
28
- "epoch": 2.0,
29
- "eval_accuracy": 0.7770700454711914,
30
- "eval_loss": 0.5375419855117798,
31
- "eval_runtime": 9.0162,
32
- "eval_samples_per_second": 330.848,
33
- "eval_steps_per_second": 20.74,
34
  "step": 746
35
  },
36
  {
37
- "epoch": 2.680965147453083,
38
- "grad_norm": 30.215883255004883,
39
- "learning_rate": 2.187099241275045e-05,
40
- "loss": 0.176,
41
  "step": 1000
42
  },
43
  {
44
- "epoch": 3.0,
45
- "eval_accuracy": 0.7777405381202698,
46
- "eval_loss": 0.948959469795227,
47
- "eval_runtime": 9.007,
48
- "eval_samples_per_second": 331.187,
49
- "eval_steps_per_second": 20.762,
50
- "step": 1119
51
- },
52
- {
53
- "epoch": 4.0,
54
- "eval_accuracy": 0.7760643362998962,
55
- "eval_loss": 1.1204984188079834,
56
- "eval_runtime": 9.0575,
57
- "eval_samples_per_second": 329.342,
58
- "eval_steps_per_second": 20.646,
59
  "step": 1492
60
  },
61
  {
62
- "epoch": 4.021447721179625,
63
- "grad_norm": 0.16835728287696838,
64
- "learning_rate": 1.7865316146678937e-05,
65
- "loss": 0.0608,
66
  "step": 1500
67
  },
68
  {
69
- "epoch": 5.0,
70
- "eval_accuracy": 0.7787462472915649,
71
- "eval_loss": 1.4213643074035645,
72
- "eval_runtime": 9.0611,
73
- "eval_samples_per_second": 329.211,
74
- "eval_steps_per_second": 20.638,
75
- "step": 1865
76
- },
77
- {
78
- "epoch": 5.361930294906166,
79
- "grad_norm": 0.16181084513664246,
80
- "learning_rate": 1.3859639880607426e-05,
81
- "loss": 0.0242,
82
  "step": 2000
83
  },
84
  {
85
- "epoch": 6.0,
86
- "eval_accuracy": 0.7784109711647034,
87
- "eval_loss": 1.251684308052063,
88
- "eval_runtime": 9.0244,
89
- "eval_samples_per_second": 330.548,
90
- "eval_steps_per_second": 20.722,
91
  "step": 2238
92
  },
93
  {
94
- "epoch": 6.702412868632708,
95
- "grad_norm": 2.160583734512329,
96
- "learning_rate": 9.853963614535916e-06,
97
- "loss": 0.0136,
98
  "step": 2500
99
  },
100
  {
101
- "epoch": 7.0,
102
- "eval_accuracy": 0.7753939032554626,
103
- "eval_loss": 1.3670138120651245,
104
- "eval_runtime": 9.0134,
105
- "eval_samples_per_second": 330.951,
106
- "eval_steps_per_second": 20.747,
107
- "step": 2611
108
- },
109
- {
110
- "epoch": 8.0,
111
- "eval_accuracy": 0.7794166803359985,
112
- "eval_loss": 1.4973613023757935,
113
- "eval_runtime": 9.0776,
114
- "eval_samples_per_second": 328.611,
115
- "eval_steps_per_second": 20.6,
116
  "step": 2984
117
  },
118
  {
119
- "epoch": 8.04289544235925,
120
- "grad_norm": 1.054295301437378,
121
- "learning_rate": 5.848287348464405e-06,
122
- "loss": 0.0045,
123
  "step": 3000
124
  }
125
  ],
126
  "logging_steps": 500,
127
- "max_steps": 3730,
128
  "num_input_tokens_seen": 0,
129
  "num_train_epochs": 10,
130
  "save_steps": 500,
131
- "total_flos": 1.5221292738223464e+16,
132
- "train_batch_size": 32,
133
  "trial_name": null,
134
  "trial_params": {
135
- "learning_rate": 2.988234494489347e-05,
136
- "per_device_train_batch_size": 32
137
  }
138
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 4.021447721179625,
5
  "eval_steps": 500,
6
  "global_step": 3000,
7
  "is_hyper_param_search": true,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.6702412868632708,
13
+ "grad_norm": 5.694277763366699,
14
+ "learning_rate": 2.542005392773407e-05,
15
+ "loss": 0.557,
 
 
 
 
 
 
 
 
 
16
  "step": 500
17
  },
18
  {
19
+ "epoch": 1.0,
20
+ "eval_accuracy": 0.7254441976547241,
21
+ "eval_loss": 0.5171247720718384,
22
+ "eval_runtime": 8.773,
23
+ "eval_samples_per_second": 340.02,
24
+ "eval_steps_per_second": 21.315,
25
  "step": 746
26
  },
27
  {
28
+ "epoch": 1.3404825737265416,
29
+ "grad_norm": 8.974740028381348,
30
+ "learning_rate": 2.359390062832789e-05,
31
+ "loss": 0.4156,
32
  "step": 1000
33
  },
34
  {
35
+ "epoch": 2.0,
36
+ "eval_accuracy": 0.7596379518508911,
37
+ "eval_loss": 0.6025224924087524,
38
+ "eval_runtime": 8.8883,
39
+ "eval_samples_per_second": 335.609,
40
+ "eval_steps_per_second": 21.039,
 
 
 
 
 
 
 
 
 
41
  "step": 1492
42
  },
43
  {
44
+ "epoch": 2.0107238605898123,
45
+ "grad_norm": 7.7003068923950195,
46
+ "learning_rate": 2.1767747328921705e-05,
47
+ "loss": 0.2948,
48
  "step": 1500
49
  },
50
  {
51
+ "epoch": 2.680965147453083,
52
+ "grad_norm": 20.24570655822754,
53
+ "learning_rate": 1.9941594029515523e-05,
54
+ "loss": 0.1262,
 
 
 
 
 
 
 
 
 
55
  "step": 2000
56
  },
57
  {
58
+ "epoch": 3.0,
59
+ "eval_accuracy": 0.7703654170036316,
60
+ "eval_loss": 0.822274386882782,
61
+ "eval_runtime": 8.8709,
62
+ "eval_samples_per_second": 336.267,
63
+ "eval_steps_per_second": 21.08,
64
  "step": 2238
65
  },
66
  {
67
+ "epoch": 3.351206434316354,
68
+ "grad_norm": 0.9093023538589478,
69
+ "learning_rate": 1.8115440730109338e-05,
70
+ "loss": 0.1012,
71
  "step": 2500
72
  },
73
  {
74
+ "epoch": 4.0,
75
+ "eval_accuracy": 0.7683539986610413,
76
+ "eval_loss": 1.2840174436569214,
77
+ "eval_runtime": 8.9163,
78
+ "eval_samples_per_second": 334.557,
79
+ "eval_steps_per_second": 20.973,
 
 
 
 
 
 
 
 
 
80
  "step": 2984
81
  },
82
  {
83
+ "epoch": 4.021447721179625,
84
+ "grad_norm": 29.010135650634766,
85
+ "learning_rate": 1.6289287430703153e-05,
86
+ "loss": 0.0675,
87
  "step": 3000
88
  }
89
  ],
90
  "logging_steps": 500,
91
+ "max_steps": 7460,
92
  "num_input_tokens_seen": 0,
93
  "num_train_epochs": 10,
94
  "save_steps": 500,
95
+ "total_flos": 7128413561883960.0,
96
+ "train_batch_size": 16,
97
  "trial_name": null,
98
  "trial_params": {
99
+ "learning_rate": 2.7246207227140256e-05,
100
+ "per_device_train_batch_size": 16
101
  }
102
  }
run-3/checkpoint-3000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7873f643bbe0bcde81dd97f76bd7f35cc2bc225794540a61053f0708234413f
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
3
  size 5048
run-3/checkpoint-3500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19572f5d46152241f93d2ce9ea0af917f6d4e31d54467419b231809c5c78820d
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b104e6a883139119bcd3c466a3f74b4b4faf1a438ba998995cf93b7459f5d75
3
  size 1340618660
run-3/checkpoint-3500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be76882b15fffb0814989424f9f49c8a80185c15fd13bbaa0448e30d2530166f
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47f7e34c32e17e68460af4e7f96b7d506989170dd403e106c7ed6a0cdfa9fd8b
3
  size 2681472237
run-3/checkpoint-3500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:28276f7287b26243a28773084d3832b67115e4dc44e251cd9263cf32052e7f9a
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b201aaa78b3795e16af7b9e8fadd8b4d82077458bbcb41c4bc25b6dd164e54e8
3
  size 14244
run-3/checkpoint-3500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf0b5d61230faf94be94de245eb52273abc7e93e8c0e9e2a11b6190fb32d452f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9f0108c0583a35063a374236b28c567f1d23932beaba0aaf428c8f8200ce6e1
3
  size 1064
run-3/checkpoint-3500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 9.383378016085791,
5
  "eval_steps": 500,
6
  "global_step": 3500,
7
  "is_hyper_param_search": true,
@@ -9,146 +9,101 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "eval_accuracy": 0.7727120518684387,
14
- "eval_loss": 0.46057018637657166,
15
- "eval_runtime": 8.9113,
16
- "eval_samples_per_second": 334.743,
17
- "eval_steps_per_second": 20.985,
18
- "step": 373
19
- },
20
- {
21
- "epoch": 1.3404825737265416,
22
- "grad_norm": 8.923430442810059,
23
- "learning_rate": 2.587666867882196e-05,
24
- "loss": 0.4651,
25
  "step": 500
26
  },
27
  {
28
- "epoch": 2.0,
29
- "eval_accuracy": 0.7770700454711914,
30
- "eval_loss": 0.5375419855117798,
31
- "eval_runtime": 9.0162,
32
- "eval_samples_per_second": 330.848,
33
- "eval_steps_per_second": 20.74,
34
  "step": 746
35
  },
36
  {
37
- "epoch": 2.680965147453083,
38
- "grad_norm": 30.215883255004883,
39
- "learning_rate": 2.187099241275045e-05,
40
- "loss": 0.176,
41
  "step": 1000
42
  },
43
  {
44
- "epoch": 3.0,
45
- "eval_accuracy": 0.7777405381202698,
46
- "eval_loss": 0.948959469795227,
47
- "eval_runtime": 9.007,
48
- "eval_samples_per_second": 331.187,
49
- "eval_steps_per_second": 20.762,
50
- "step": 1119
51
- },
52
- {
53
- "epoch": 4.0,
54
- "eval_accuracy": 0.7760643362998962,
55
- "eval_loss": 1.1204984188079834,
56
- "eval_runtime": 9.0575,
57
- "eval_samples_per_second": 329.342,
58
- "eval_steps_per_second": 20.646,
59
  "step": 1492
60
  },
61
  {
62
- "epoch": 4.021447721179625,
63
- "grad_norm": 0.16835728287696838,
64
- "learning_rate": 1.7865316146678937e-05,
65
- "loss": 0.0608,
66
  "step": 1500
67
  },
68
  {
69
- "epoch": 5.0,
70
- "eval_accuracy": 0.7787462472915649,
71
- "eval_loss": 1.4213643074035645,
72
- "eval_runtime": 9.0611,
73
- "eval_samples_per_second": 329.211,
74
- "eval_steps_per_second": 20.638,
75
- "step": 1865
76
- },
77
- {
78
- "epoch": 5.361930294906166,
79
- "grad_norm": 0.16181084513664246,
80
- "learning_rate": 1.3859639880607426e-05,
81
- "loss": 0.0242,
82
  "step": 2000
83
  },
84
  {
85
- "epoch": 6.0,
86
- "eval_accuracy": 0.7784109711647034,
87
- "eval_loss": 1.251684308052063,
88
- "eval_runtime": 9.0244,
89
- "eval_samples_per_second": 330.548,
90
- "eval_steps_per_second": 20.722,
91
  "step": 2238
92
  },
93
  {
94
- "epoch": 6.702412868632708,
95
- "grad_norm": 2.160583734512329,
96
- "learning_rate": 9.853963614535916e-06,
97
- "loss": 0.0136,
98
  "step": 2500
99
  },
100
  {
101
- "epoch": 7.0,
102
- "eval_accuracy": 0.7753939032554626,
103
- "eval_loss": 1.3670138120651245,
104
- "eval_runtime": 9.0134,
105
- "eval_samples_per_second": 330.951,
106
- "eval_steps_per_second": 20.747,
107
- "step": 2611
108
- },
109
- {
110
- "epoch": 8.0,
111
- "eval_accuracy": 0.7794166803359985,
112
- "eval_loss": 1.4973613023757935,
113
- "eval_runtime": 9.0776,
114
- "eval_samples_per_second": 328.611,
115
- "eval_steps_per_second": 20.6,
116
  "step": 2984
117
  },
118
  {
119
- "epoch": 8.04289544235925,
120
- "grad_norm": 1.054295301437378,
121
- "learning_rate": 5.848287348464405e-06,
122
- "loss": 0.0045,
123
  "step": 3000
124
  },
125
  {
126
- "epoch": 9.0,
127
- "eval_accuracy": 0.7797519564628601,
128
- "eval_loss": 1.4273899793624878,
129
- "eval_runtime": 9.0355,
130
- "eval_samples_per_second": 330.142,
131
- "eval_steps_per_second": 20.696,
132
- "step": 3357
133
- },
134
- {
135
- "epoch": 9.383378016085791,
136
- "grad_norm": 0.03208499401807785,
137
- "learning_rate": 1.8426110823928949e-06,
138
- "loss": 0.0039,
139
  "step": 3500
140
  }
141
  ],
142
  "logging_steps": 500,
143
- "max_steps": 3730,
144
  "num_input_tokens_seen": 0,
145
  "num_train_epochs": 10,
146
  "save_steps": 500,
147
- "total_flos": 1.774860924913164e+16,
148
- "train_batch_size": 32,
149
  "trial_name": null,
150
  "trial_params": {
151
- "learning_rate": 2.988234494489347e-05,
152
- "per_device_train_batch_size": 32
153
  }
154
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 4.6916890080428955,
5
  "eval_steps": 500,
6
  "global_step": 3500,
7
  "is_hyper_param_search": true,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.6702412868632708,
13
+ "grad_norm": 5.694277763366699,
14
+ "learning_rate": 2.542005392773407e-05,
15
+ "loss": 0.557,
 
 
 
 
 
 
 
 
 
16
  "step": 500
17
  },
18
  {
19
+ "epoch": 1.0,
20
+ "eval_accuracy": 0.7254441976547241,
21
+ "eval_loss": 0.5171247720718384,
22
+ "eval_runtime": 8.773,
23
+ "eval_samples_per_second": 340.02,
24
+ "eval_steps_per_second": 21.315,
25
  "step": 746
26
  },
27
  {
28
+ "epoch": 1.3404825737265416,
29
+ "grad_norm": 8.974740028381348,
30
+ "learning_rate": 2.359390062832789e-05,
31
+ "loss": 0.4156,
32
  "step": 1000
33
  },
34
  {
35
+ "epoch": 2.0,
36
+ "eval_accuracy": 0.7596379518508911,
37
+ "eval_loss": 0.6025224924087524,
38
+ "eval_runtime": 8.8883,
39
+ "eval_samples_per_second": 335.609,
40
+ "eval_steps_per_second": 21.039,
 
 
 
 
 
 
 
 
 
41
  "step": 1492
42
  },
43
  {
44
+ "epoch": 2.0107238605898123,
45
+ "grad_norm": 7.7003068923950195,
46
+ "learning_rate": 2.1767747328921705e-05,
47
+ "loss": 0.2948,
48
  "step": 1500
49
  },
50
  {
51
+ "epoch": 2.680965147453083,
52
+ "grad_norm": 20.24570655822754,
53
+ "learning_rate": 1.9941594029515523e-05,
54
+ "loss": 0.1262,
 
 
 
 
 
 
 
 
 
55
  "step": 2000
56
  },
57
  {
58
+ "epoch": 3.0,
59
+ "eval_accuracy": 0.7703654170036316,
60
+ "eval_loss": 0.822274386882782,
61
+ "eval_runtime": 8.8709,
62
+ "eval_samples_per_second": 336.267,
63
+ "eval_steps_per_second": 21.08,
64
  "step": 2238
65
  },
66
  {
67
+ "epoch": 3.351206434316354,
68
+ "grad_norm": 0.9093023538589478,
69
+ "learning_rate": 1.8115440730109338e-05,
70
+ "loss": 0.1012,
71
  "step": 2500
72
  },
73
  {
74
+ "epoch": 4.0,
75
+ "eval_accuracy": 0.7683539986610413,
76
+ "eval_loss": 1.2840174436569214,
77
+ "eval_runtime": 8.9163,
78
+ "eval_samples_per_second": 334.557,
79
+ "eval_steps_per_second": 20.973,
 
 
 
 
 
 
 
 
 
80
  "step": 2984
81
  },
82
  {
83
+ "epoch": 4.021447721179625,
84
+ "grad_norm": 29.010135650634766,
85
+ "learning_rate": 1.6289287430703153e-05,
86
+ "loss": 0.0675,
87
  "step": 3000
88
  },
89
  {
90
+ "epoch": 4.6916890080428955,
91
+ "grad_norm": 5.461940288543701,
92
+ "learning_rate": 1.4463134131296973e-05,
93
+ "loss": 0.0379,
 
 
 
 
 
 
 
 
 
94
  "step": 3500
95
  }
96
  ],
97
  "logging_steps": 500,
98
+ "max_steps": 7460,
99
  "num_input_tokens_seen": 0,
100
  "num_train_epochs": 10,
101
  "save_steps": 500,
102
+ "total_flos": 8321747640587064.0,
103
+ "train_batch_size": 16,
104
  "trial_name": null,
105
  "trial_params": {
106
+ "learning_rate": 2.7246207227140256e-05,
107
+ "per_device_train_batch_size": 16
108
  }
109
  }
run-3/checkpoint-3500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7873f643bbe0bcde81dd97f76bd7f35cc2bc225794540a61053f0708234413f
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1507950b2d37b737502f824dab70976a7fa7a07f6887612e84989d3ab0cc54db
3
  size 5048
run-3/checkpoint-4000/trainer_state.json CHANGED
@@ -10,103 +10,103 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.6702412868632708,
13
- "grad_norm": 13.834343910217285,
14
- "learning_rate": 1.8689758651553552e-05,
15
- "loss": 0.5491,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.762319803237915,
21
- "eval_loss": 0.4773792326450348,
22
- "eval_runtime": 8.5342,
23
- "eval_samples_per_second": 349.533,
24
- "eval_steps_per_second": 21.912,
25
  "step": 746
26
  },
27
  {
28
  "epoch": 1.3404825737265416,
29
- "grad_norm": 12.70506477355957,
30
- "learning_rate": 1.7347103576010912e-05,
31
- "loss": 0.4116,
32
  "step": 1000
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_accuracy": 0.7696949243545532,
37
- "eval_loss": 0.5922191739082336,
38
- "eval_runtime": 8.5618,
39
- "eval_samples_per_second": 348.409,
40
- "eval_steps_per_second": 21.841,
41
  "step": 1492
42
  },
43
  {
44
  "epoch": 2.0107238605898123,
45
- "grad_norm": 100.83161163330078,
46
- "learning_rate": 1.6004448500468272e-05,
47
- "loss": 0.2993,
48
  "step": 1500
49
  },
50
  {
51
  "epoch": 2.680965147453083,
52
- "grad_norm": 251.75213623046875,
53
- "learning_rate": 1.4661793424925633e-05,
54
- "loss": 0.1136,
55
  "step": 2000
56
  },
57
  {
58
  "epoch": 3.0,
59
- "eval_accuracy": 0.7676835656166077,
60
- "eval_loss": 0.9344700574874878,
61
- "eval_runtime": 8.6072,
62
- "eval_samples_per_second": 346.57,
63
- "eval_steps_per_second": 21.726,
64
  "step": 2238
65
  },
66
  {
67
  "epoch": 3.351206434316354,
68
- "grad_norm": 17.935895919799805,
69
- "learning_rate": 1.3319138349382991e-05,
70
- "loss": 0.1,
71
  "step": 2500
72
  },
73
  {
74
  "epoch": 4.0,
75
- "eval_accuracy": 0.777405321598053,
76
- "eval_loss": 1.230825662612915,
77
- "eval_runtime": 8.5403,
78
- "eval_samples_per_second": 349.285,
79
- "eval_steps_per_second": 21.896,
80
  "step": 2984
81
  },
82
  {
83
  "epoch": 4.021447721179625,
84
- "grad_norm": 0.8948413729667664,
85
- "learning_rate": 1.1976483273840351e-05,
86
- "loss": 0.0715,
87
  "step": 3000
88
  },
89
  {
90
  "epoch": 4.6916890080428955,
91
- "grad_norm": 0.7082040309906006,
92
- "learning_rate": 1.063382819829771e-05,
93
- "loss": 0.0294,
94
  "step": 3500
95
  },
96
  {
97
  "epoch": 5.0,
98
- "eval_accuracy": 0.7763996124267578,
99
- "eval_loss": 1.194653034210205,
100
- "eval_runtime": 8.5079,
101
- "eval_samples_per_second": 350.617,
102
- "eval_steps_per_second": 21.98,
103
  "step": 3730
104
  },
105
  {
106
  "epoch": 5.361930294906166,
107
- "grad_norm": 0.04226335510611534,
108
- "learning_rate": 9.29117312275507e-06,
109
- "loss": 0.025,
110
  "step": 4000
111
  }
112
  ],
@@ -115,11 +115,11 @@
115
  "num_input_tokens_seen": 0,
116
  "num_train_epochs": 10,
117
  "save_steps": 500,
118
- "total_flos": 9475245428463684.0,
119
  "train_batch_size": 16,
120
  "trial_name": null,
121
  "trial_params": {
122
- "learning_rate": 2.0032413727096193e-05,
123
  "per_device_train_batch_size": 16
124
  }
125
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.6702412868632708,
13
+ "grad_norm": 5.694277763366699,
14
+ "learning_rate": 2.542005392773407e-05,
15
+ "loss": 0.557,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.7254441976547241,
21
+ "eval_loss": 0.5171247720718384,
22
+ "eval_runtime": 8.773,
23
+ "eval_samples_per_second": 340.02,
24
+ "eval_steps_per_second": 21.315,
25
  "step": 746
26
  },
27
  {
28
  "epoch": 1.3404825737265416,
29
+ "grad_norm": 8.974740028381348,
30
+ "learning_rate": 2.359390062832789e-05,
31
+ "loss": 0.4156,
32
  "step": 1000
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.7596379518508911,
37
+ "eval_loss": 0.6025224924087524,
38
+ "eval_runtime": 8.8883,
39
+ "eval_samples_per_second": 335.609,
40
+ "eval_steps_per_second": 21.039,
41
  "step": 1492
42
  },
43
  {
44
  "epoch": 2.0107238605898123,
45
+ "grad_norm": 7.7003068923950195,
46
+ "learning_rate": 2.1767747328921705e-05,
47
+ "loss": 0.2948,
48
  "step": 1500
49
  },
50
  {
51
  "epoch": 2.680965147453083,
52
+ "grad_norm": 20.24570655822754,
53
+ "learning_rate": 1.9941594029515523e-05,
54
+ "loss": 0.1262,
55
  "step": 2000
56
  },
57
  {
58
  "epoch": 3.0,
59
+ "eval_accuracy": 0.7703654170036316,
60
+ "eval_loss": 0.822274386882782,
61
+ "eval_runtime": 8.8709,
62
+ "eval_samples_per_second": 336.267,
63
+ "eval_steps_per_second": 21.08,
64
  "step": 2238
65
  },
66
  {
67
  "epoch": 3.351206434316354,
68
+ "grad_norm": 0.9093023538589478,
69
+ "learning_rate": 1.8115440730109338e-05,
70
+ "loss": 0.1012,
71
  "step": 2500
72
  },
73
  {
74
  "epoch": 4.0,
75
+ "eval_accuracy": 0.7683539986610413,
76
+ "eval_loss": 1.2840174436569214,
77
+ "eval_runtime": 8.9163,
78
+ "eval_samples_per_second": 334.557,
79
+ "eval_steps_per_second": 20.973,
80
  "step": 2984
81
  },
82
  {
83
  "epoch": 4.021447721179625,
84
+ "grad_norm": 29.010135650634766,
85
+ "learning_rate": 1.6289287430703153e-05,
86
+ "loss": 0.0675,
87
  "step": 3000
88
  },
89
  {
90
  "epoch": 4.6916890080428955,
91
+ "grad_norm": 5.461940288543701,
92
+ "learning_rate": 1.4463134131296973e-05,
93
+ "loss": 0.0379,
94
  "step": 3500
95
  },
96
  {
97
  "epoch": 5.0,
98
+ "eval_accuracy": 0.7700302004814148,
99
+ "eval_loss": 1.4166399240493774,
100
+ "eval_runtime": 8.8683,
101
+ "eval_samples_per_second": 336.367,
102
+ "eval_steps_per_second": 21.086,
103
  "step": 3730
104
  },
105
  {
106
  "epoch": 5.361930294906166,
107
+ "grad_norm": 0.004815839231014252,
108
+ "learning_rate": 1.2636980831890788e-05,
109
+ "loss": 0.034,
110
  "step": 4000
111
  }
112
  ],
 
115
  "num_input_tokens_seen": 0,
116
  "num_train_epochs": 10,
117
  "save_steps": 500,
118
+ "total_flos": 9511987426390368.0,
119
  "train_batch_size": 16,
120
  "trial_name": null,
121
  "trial_params": {
122
+ "learning_rate": 2.7246207227140256e-05,
123
  "per_device_train_batch_size": 16
124
  }
125
  }
run-3/checkpoint-4500/trainer_state.json CHANGED
@@ -10,119 +10,119 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.6702412868632708,
13
- "grad_norm": 13.834343910217285,
14
- "learning_rate": 1.8689758651553552e-05,
15
- "loss": 0.5491,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.762319803237915,
21
- "eval_loss": 0.4773792326450348,
22
- "eval_runtime": 8.5342,
23
- "eval_samples_per_second": 349.533,
24
- "eval_steps_per_second": 21.912,
25
  "step": 746
26
  },
27
  {
28
  "epoch": 1.3404825737265416,
29
- "grad_norm": 12.70506477355957,
30
- "learning_rate": 1.7347103576010912e-05,
31
- "loss": 0.4116,
32
  "step": 1000
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_accuracy": 0.7696949243545532,
37
- "eval_loss": 0.5922191739082336,
38
- "eval_runtime": 8.5618,
39
- "eval_samples_per_second": 348.409,
40
- "eval_steps_per_second": 21.841,
41
  "step": 1492
42
  },
43
  {
44
  "epoch": 2.0107238605898123,
45
- "grad_norm": 100.83161163330078,
46
- "learning_rate": 1.6004448500468272e-05,
47
- "loss": 0.2993,
48
  "step": 1500
49
  },
50
  {
51
  "epoch": 2.680965147453083,
52
- "grad_norm": 251.75213623046875,
53
- "learning_rate": 1.4661793424925633e-05,
54
- "loss": 0.1136,
55
  "step": 2000
56
  },
57
  {
58
  "epoch": 3.0,
59
- "eval_accuracy": 0.7676835656166077,
60
- "eval_loss": 0.9344700574874878,
61
- "eval_runtime": 8.6072,
62
- "eval_samples_per_second": 346.57,
63
- "eval_steps_per_second": 21.726,
64
  "step": 2238
65
  },
66
  {
67
  "epoch": 3.351206434316354,
68
- "grad_norm": 17.935895919799805,
69
- "learning_rate": 1.3319138349382991e-05,
70
- "loss": 0.1,
71
  "step": 2500
72
  },
73
  {
74
  "epoch": 4.0,
75
- "eval_accuracy": 0.777405321598053,
76
- "eval_loss": 1.230825662612915,
77
- "eval_runtime": 8.5403,
78
- "eval_samples_per_second": 349.285,
79
- "eval_steps_per_second": 21.896,
80
  "step": 2984
81
  },
82
  {
83
  "epoch": 4.021447721179625,
84
- "grad_norm": 0.8948413729667664,
85
- "learning_rate": 1.1976483273840351e-05,
86
- "loss": 0.0715,
87
  "step": 3000
88
  },
89
  {
90
  "epoch": 4.6916890080428955,
91
- "grad_norm": 0.7082040309906006,
92
- "learning_rate": 1.063382819829771e-05,
93
- "loss": 0.0294,
94
  "step": 3500
95
  },
96
  {
97
  "epoch": 5.0,
98
- "eval_accuracy": 0.7763996124267578,
99
- "eval_loss": 1.194653034210205,
100
- "eval_runtime": 8.5079,
101
- "eval_samples_per_second": 350.617,
102
- "eval_steps_per_second": 21.98,
103
  "step": 3730
104
  },
105
  {
106
  "epoch": 5.361930294906166,
107
- "grad_norm": 0.04226335510611534,
108
- "learning_rate": 9.29117312275507e-06,
109
- "loss": 0.025,
110
  "step": 4000
111
  },
112
  {
113
  "epoch": 6.0,
114
- "eval_accuracy": 0.7713711261749268,
115
- "eval_loss": 1.458601713180542,
116
- "eval_runtime": 8.5492,
117
- "eval_samples_per_second": 348.92,
118
- "eval_steps_per_second": 21.873,
119
  "step": 4476
120
  },
121
  {
122
  "epoch": 6.032171581769437,
123
- "grad_norm": 1.8958851099014282,
124
- "learning_rate": 7.94851804721243e-06,
125
- "loss": 0.0138,
126
  "step": 4500
127
  }
128
  ],
@@ -131,11 +131,11 @@
131
  "num_input_tokens_seen": 0,
132
  "num_train_epochs": 10,
133
  "save_steps": 500,
134
- "total_flos": 1.065994825014864e+16,
135
  "train_batch_size": 16,
136
  "trial_name": null,
137
  "trial_params": {
138
- "learning_rate": 2.0032413727096193e-05,
139
  "per_device_train_batch_size": 16
140
  }
141
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.6702412868632708,
13
+ "grad_norm": 5.694277763366699,
14
+ "learning_rate": 2.542005392773407e-05,
15
+ "loss": 0.557,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.7254441976547241,
21
+ "eval_loss": 0.5171247720718384,
22
+ "eval_runtime": 8.773,
23
+ "eval_samples_per_second": 340.02,
24
+ "eval_steps_per_second": 21.315,
25
  "step": 746
26
  },
27
  {
28
  "epoch": 1.3404825737265416,
29
+ "grad_norm": 8.974740028381348,
30
+ "learning_rate": 2.359390062832789e-05,
31
+ "loss": 0.4156,
32
  "step": 1000
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.7596379518508911,
37
+ "eval_loss": 0.6025224924087524,
38
+ "eval_runtime": 8.8883,
39
+ "eval_samples_per_second": 335.609,
40
+ "eval_steps_per_second": 21.039,
41
  "step": 1492
42
  },
43
  {
44
  "epoch": 2.0107238605898123,
45
+ "grad_norm": 7.7003068923950195,
46
+ "learning_rate": 2.1767747328921705e-05,
47
+ "loss": 0.2948,
48
  "step": 1500
49
  },
50
  {
51
  "epoch": 2.680965147453083,
52
+ "grad_norm": 20.24570655822754,
53
+ "learning_rate": 1.9941594029515523e-05,
54
+ "loss": 0.1262,
55
  "step": 2000
56
  },
57
  {
58
  "epoch": 3.0,
59
+ "eval_accuracy": 0.7703654170036316,
60
+ "eval_loss": 0.822274386882782,
61
+ "eval_runtime": 8.8709,
62
+ "eval_samples_per_second": 336.267,
63
+ "eval_steps_per_second": 21.08,
64
  "step": 2238
65
  },
66
  {
67
  "epoch": 3.351206434316354,
68
+ "grad_norm": 0.9093023538589478,
69
+ "learning_rate": 1.8115440730109338e-05,
70
+ "loss": 0.1012,
71
  "step": 2500
72
  },
73
  {
74
  "epoch": 4.0,
75
+ "eval_accuracy": 0.7683539986610413,
76
+ "eval_loss": 1.2840174436569214,
77
+ "eval_runtime": 8.9163,
78
+ "eval_samples_per_second": 334.557,
79
+ "eval_steps_per_second": 20.973,
80
  "step": 2984
81
  },
82
  {
83
  "epoch": 4.021447721179625,
84
+ "grad_norm": 29.010135650634766,
85
+ "learning_rate": 1.6289287430703153e-05,
86
+ "loss": 0.0675,
87
  "step": 3000
88
  },
89
  {
90
  "epoch": 4.6916890080428955,
91
+ "grad_norm": 5.461940288543701,
92
+ "learning_rate": 1.4463134131296973e-05,
93
+ "loss": 0.0379,
94
  "step": 3500
95
  },
96
  {
97
  "epoch": 5.0,
98
+ "eval_accuracy": 0.7700302004814148,
99
+ "eval_loss": 1.4166399240493774,
100
+ "eval_runtime": 8.8683,
101
+ "eval_samples_per_second": 336.367,
102
+ "eval_steps_per_second": 21.086,
103
  "step": 3730
104
  },
105
  {
106
  "epoch": 5.361930294906166,
107
+ "grad_norm": 0.004815839231014252,
108
+ "learning_rate": 1.2636980831890788e-05,
109
+ "loss": 0.034,
110
  "step": 4000
111
  },
112
  {
113
  "epoch": 6.0,
114
+ "eval_accuracy": 0.7720415592193604,
115
+ "eval_loss": 1.576446533203125,
116
+ "eval_runtime": 8.9004,
117
+ "eval_samples_per_second": 335.152,
118
+ "eval_steps_per_second": 21.01,
119
  "step": 4476
120
  },
121
  {
122
  "epoch": 6.032171581769437,
123
+ "grad_norm": 0.29464954137802124,
124
+ "learning_rate": 1.0810827532484605e-05,
125
+ "loss": 0.0175,
126
  "step": 4500
127
  }
128
  ],
 
131
  "num_input_tokens_seen": 0,
132
  "num_train_epochs": 10,
133
  "save_steps": 500,
134
+ "total_flos": 1.0698816209314716e+16,
135
  "train_batch_size": 16,
136
  "trial_name": null,
137
  "trial_params": {
138
+ "learning_rate": 2.7246207227140256e-05,
139
  "per_device_train_batch_size": 16
140
  }
141
  }
run-3/checkpoint-5000/trainer_state.json CHANGED
@@ -10,126 +10,126 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.6702412868632708,
13
- "grad_norm": 13.834343910217285,
14
- "learning_rate": 1.8689758651553552e-05,
15
- "loss": 0.5491,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.762319803237915,
21
- "eval_loss": 0.4773792326450348,
22
- "eval_runtime": 8.5342,
23
- "eval_samples_per_second": 349.533,
24
- "eval_steps_per_second": 21.912,
25
  "step": 746
26
  },
27
  {
28
  "epoch": 1.3404825737265416,
29
- "grad_norm": 12.70506477355957,
30
- "learning_rate": 1.7347103576010912e-05,
31
- "loss": 0.4116,
32
  "step": 1000
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_accuracy": 0.7696949243545532,
37
- "eval_loss": 0.5922191739082336,
38
- "eval_runtime": 8.5618,
39
- "eval_samples_per_second": 348.409,
40
- "eval_steps_per_second": 21.841,
41
  "step": 1492
42
  },
43
  {
44
  "epoch": 2.0107238605898123,
45
- "grad_norm": 100.83161163330078,
46
- "learning_rate": 1.6004448500468272e-05,
47
- "loss": 0.2993,
48
  "step": 1500
49
  },
50
  {
51
  "epoch": 2.680965147453083,
52
- "grad_norm": 251.75213623046875,
53
- "learning_rate": 1.4661793424925633e-05,
54
- "loss": 0.1136,
55
  "step": 2000
56
  },
57
  {
58
  "epoch": 3.0,
59
- "eval_accuracy": 0.7676835656166077,
60
- "eval_loss": 0.9344700574874878,
61
- "eval_runtime": 8.6072,
62
- "eval_samples_per_second": 346.57,
63
- "eval_steps_per_second": 21.726,
64
  "step": 2238
65
  },
66
  {
67
  "epoch": 3.351206434316354,
68
- "grad_norm": 17.935895919799805,
69
- "learning_rate": 1.3319138349382991e-05,
70
- "loss": 0.1,
71
  "step": 2500
72
  },
73
  {
74
  "epoch": 4.0,
75
- "eval_accuracy": 0.777405321598053,
76
- "eval_loss": 1.230825662612915,
77
- "eval_runtime": 8.5403,
78
- "eval_samples_per_second": 349.285,
79
- "eval_steps_per_second": 21.896,
80
  "step": 2984
81
  },
82
  {
83
  "epoch": 4.021447721179625,
84
- "grad_norm": 0.8948413729667664,
85
- "learning_rate": 1.1976483273840351e-05,
86
- "loss": 0.0715,
87
  "step": 3000
88
  },
89
  {
90
  "epoch": 4.6916890080428955,
91
- "grad_norm": 0.7082040309906006,
92
- "learning_rate": 1.063382819829771e-05,
93
- "loss": 0.0294,
94
  "step": 3500
95
  },
96
  {
97
  "epoch": 5.0,
98
- "eval_accuracy": 0.7763996124267578,
99
- "eval_loss": 1.194653034210205,
100
- "eval_runtime": 8.5079,
101
- "eval_samples_per_second": 350.617,
102
- "eval_steps_per_second": 21.98,
103
  "step": 3730
104
  },
105
  {
106
  "epoch": 5.361930294906166,
107
- "grad_norm": 0.04226335510611534,
108
- "learning_rate": 9.29117312275507e-06,
109
- "loss": 0.025,
110
  "step": 4000
111
  },
112
  {
113
  "epoch": 6.0,
114
- "eval_accuracy": 0.7713711261749268,
115
- "eval_loss": 1.458601713180542,
116
- "eval_runtime": 8.5492,
117
- "eval_samples_per_second": 348.92,
118
- "eval_steps_per_second": 21.873,
119
  "step": 4476
120
  },
121
  {
122
  "epoch": 6.032171581769437,
123
- "grad_norm": 1.8958851099014282,
124
- "learning_rate": 7.94851804721243e-06,
125
- "loss": 0.0138,
126
  "step": 4500
127
  },
128
  {
129
  "epoch": 6.702412868632708,
130
- "grad_norm": 0.009470508433878422,
131
- "learning_rate": 6.60586297166979e-06,
132
- "loss": 0.0098,
133
  "step": 5000
134
  }
135
  ],
@@ -138,11 +138,11 @@
138
  "num_input_tokens_seen": 0,
139
  "num_train_epochs": 10,
140
  "save_steps": 500,
141
- "total_flos": 1.1851884436529952e+16,
142
  "train_batch_size": 16,
143
  "trial_name": null,
144
  "trial_params": {
145
- "learning_rate": 2.0032413727096193e-05,
146
  "per_device_train_batch_size": 16
147
  }
148
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.6702412868632708,
13
+ "grad_norm": 5.694277763366699,
14
+ "learning_rate": 2.542005392773407e-05,
15
+ "loss": 0.557,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.7254441976547241,
21
+ "eval_loss": 0.5171247720718384,
22
+ "eval_runtime": 8.773,
23
+ "eval_samples_per_second": 340.02,
24
+ "eval_steps_per_second": 21.315,
25
  "step": 746
26
  },
27
  {
28
  "epoch": 1.3404825737265416,
29
+ "grad_norm": 8.974740028381348,
30
+ "learning_rate": 2.359390062832789e-05,
31
+ "loss": 0.4156,
32
  "step": 1000
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.7596379518508911,
37
+ "eval_loss": 0.6025224924087524,
38
+ "eval_runtime": 8.8883,
39
+ "eval_samples_per_second": 335.609,
40
+ "eval_steps_per_second": 21.039,
41
  "step": 1492
42
  },
43
  {
44
  "epoch": 2.0107238605898123,
45
+ "grad_norm": 7.7003068923950195,
46
+ "learning_rate": 2.1767747328921705e-05,
47
+ "loss": 0.2948,
48
  "step": 1500
49
  },
50
  {
51
  "epoch": 2.680965147453083,
52
+ "grad_norm": 20.24570655822754,
53
+ "learning_rate": 1.9941594029515523e-05,
54
+ "loss": 0.1262,
55
  "step": 2000
56
  },
57
  {
58
  "epoch": 3.0,
59
+ "eval_accuracy": 0.7703654170036316,
60
+ "eval_loss": 0.822274386882782,
61
+ "eval_runtime": 8.8709,
62
+ "eval_samples_per_second": 336.267,
63
+ "eval_steps_per_second": 21.08,
64
  "step": 2238
65
  },
66
  {
67
  "epoch": 3.351206434316354,
68
+ "grad_norm": 0.9093023538589478,
69
+ "learning_rate": 1.8115440730109338e-05,
70
+ "loss": 0.1012,
71
  "step": 2500
72
  },
73
  {
74
  "epoch": 4.0,
75
+ "eval_accuracy": 0.7683539986610413,
76
+ "eval_loss": 1.2840174436569214,
77
+ "eval_runtime": 8.9163,
78
+ "eval_samples_per_second": 334.557,
79
+ "eval_steps_per_second": 20.973,
80
  "step": 2984
81
  },
82
  {
83
  "epoch": 4.021447721179625,
84
+ "grad_norm": 29.010135650634766,
85
+ "learning_rate": 1.6289287430703153e-05,
86
+ "loss": 0.0675,
87
  "step": 3000
88
  },
89
  {
90
  "epoch": 4.6916890080428955,
91
+ "grad_norm": 5.461940288543701,
92
+ "learning_rate": 1.4463134131296973e-05,
93
+ "loss": 0.0379,
94
  "step": 3500
95
  },
96
  {
97
  "epoch": 5.0,
98
+ "eval_accuracy": 0.7700302004814148,
99
+ "eval_loss": 1.4166399240493774,
100
+ "eval_runtime": 8.8683,
101
+ "eval_samples_per_second": 336.367,
102
+ "eval_steps_per_second": 21.086,
103
  "step": 3730
104
  },
105
  {
106
  "epoch": 5.361930294906166,
107
+ "grad_norm": 0.004815839231014252,
108
+ "learning_rate": 1.2636980831890788e-05,
109
+ "loss": 0.034,
110
  "step": 4000
111
  },
112
  {
113
  "epoch": 6.0,
114
+ "eval_accuracy": 0.7720415592193604,
115
+ "eval_loss": 1.576446533203125,
116
+ "eval_runtime": 8.9004,
117
+ "eval_samples_per_second": 335.152,
118
+ "eval_steps_per_second": 21.01,
119
  "step": 4476
120
  },
121
  {
122
  "epoch": 6.032171581769437,
123
+ "grad_norm": 0.29464954137802124,
124
+ "learning_rate": 1.0810827532484605e-05,
125
+ "loss": 0.0175,
126
  "step": 4500
127
  },
128
  {
129
  "epoch": 6.702412868632708,
130
+ "grad_norm": 0.010658634826540947,
131
+ "learning_rate": 8.984674233078421e-06,
132
+ "loss": 0.0101,
133
  "step": 5000
134
  }
135
  ],
 
138
  "num_input_tokens_seen": 0,
139
  "num_train_epochs": 10,
140
  "save_steps": 500,
141
+ "total_flos": 1.1885859772569756e+16,
142
  "train_batch_size": 16,
143
  "trial_name": null,
144
  "trial_params": {
145
+ "learning_rate": 2.7246207227140256e-05,
146
  "per_device_train_batch_size": 16
147
  }
148
  }
run-3/checkpoint-5500/trainer_state.json CHANGED
@@ -10,142 +10,142 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.6702412868632708,
13
- "grad_norm": 13.834343910217285,
14
- "learning_rate": 1.8689758651553552e-05,
15
- "loss": 0.5491,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.762319803237915,
21
- "eval_loss": 0.4773792326450348,
22
- "eval_runtime": 8.5342,
23
- "eval_samples_per_second": 349.533,
24
- "eval_steps_per_second": 21.912,
25
  "step": 746
26
  },
27
  {
28
  "epoch": 1.3404825737265416,
29
- "grad_norm": 12.70506477355957,
30
- "learning_rate": 1.7347103576010912e-05,
31
- "loss": 0.4116,
32
  "step": 1000
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_accuracy": 0.7696949243545532,
37
- "eval_loss": 0.5922191739082336,
38
- "eval_runtime": 8.5618,
39
- "eval_samples_per_second": 348.409,
40
- "eval_steps_per_second": 21.841,
41
  "step": 1492
42
  },
43
  {
44
  "epoch": 2.0107238605898123,
45
- "grad_norm": 100.83161163330078,
46
- "learning_rate": 1.6004448500468272e-05,
47
- "loss": 0.2993,
48
  "step": 1500
49
  },
50
  {
51
  "epoch": 2.680965147453083,
52
- "grad_norm": 251.75213623046875,
53
- "learning_rate": 1.4661793424925633e-05,
54
- "loss": 0.1136,
55
  "step": 2000
56
  },
57
  {
58
  "epoch": 3.0,
59
- "eval_accuracy": 0.7676835656166077,
60
- "eval_loss": 0.9344700574874878,
61
- "eval_runtime": 8.6072,
62
- "eval_samples_per_second": 346.57,
63
- "eval_steps_per_second": 21.726,
64
  "step": 2238
65
  },
66
  {
67
  "epoch": 3.351206434316354,
68
- "grad_norm": 17.935895919799805,
69
- "learning_rate": 1.3319138349382991e-05,
70
- "loss": 0.1,
71
  "step": 2500
72
  },
73
  {
74
  "epoch": 4.0,
75
- "eval_accuracy": 0.777405321598053,
76
- "eval_loss": 1.230825662612915,
77
- "eval_runtime": 8.5403,
78
- "eval_samples_per_second": 349.285,
79
- "eval_steps_per_second": 21.896,
80
  "step": 2984
81
  },
82
  {
83
  "epoch": 4.021447721179625,
84
- "grad_norm": 0.8948413729667664,
85
- "learning_rate": 1.1976483273840351e-05,
86
- "loss": 0.0715,
87
  "step": 3000
88
  },
89
  {
90
  "epoch": 4.6916890080428955,
91
- "grad_norm": 0.7082040309906006,
92
- "learning_rate": 1.063382819829771e-05,
93
- "loss": 0.0294,
94
  "step": 3500
95
  },
96
  {
97
  "epoch": 5.0,
98
- "eval_accuracy": 0.7763996124267578,
99
- "eval_loss": 1.194653034210205,
100
- "eval_runtime": 8.5079,
101
- "eval_samples_per_second": 350.617,
102
- "eval_steps_per_second": 21.98,
103
  "step": 3730
104
  },
105
  {
106
  "epoch": 5.361930294906166,
107
- "grad_norm": 0.04226335510611534,
108
- "learning_rate": 9.29117312275507e-06,
109
- "loss": 0.025,
110
  "step": 4000
111
  },
112
  {
113
  "epoch": 6.0,
114
- "eval_accuracy": 0.7713711261749268,
115
- "eval_loss": 1.458601713180542,
116
- "eval_runtime": 8.5492,
117
- "eval_samples_per_second": 348.92,
118
- "eval_steps_per_second": 21.873,
119
  "step": 4476
120
  },
121
  {
122
  "epoch": 6.032171581769437,
123
- "grad_norm": 1.8958851099014282,
124
- "learning_rate": 7.94851804721243e-06,
125
- "loss": 0.0138,
126
  "step": 4500
127
  },
128
  {
129
  "epoch": 6.702412868632708,
130
- "grad_norm": 0.009470508433878422,
131
- "learning_rate": 6.60586297166979e-06,
132
- "loss": 0.0098,
133
  "step": 5000
134
  },
135
  {
136
  "epoch": 7.0,
137
- "eval_accuracy": 0.7660073637962341,
138
- "eval_loss": 1.4296730756759644,
139
- "eval_runtime": 8.5064,
140
- "eval_samples_per_second": 350.675,
141
- "eval_steps_per_second": 21.983,
142
  "step": 5222
143
  },
144
  {
145
  "epoch": 7.372654155495979,
146
- "grad_norm": 0.5303798317909241,
147
- "learning_rate": 5.26320789612715e-06,
148
- "loss": 0.0083,
149
  "step": 5500
150
  }
151
  ],
@@ -154,11 +154,11 @@
154
  "num_input_tokens_seen": 0,
155
  "num_train_epochs": 10,
156
  "save_steps": 500,
157
- "total_flos": 1.3032055029202848e+16,
158
  "train_batch_size": 16,
159
  "trial_name": null,
160
  "trial_params": {
161
- "learning_rate": 2.0032413727096193e-05,
162
  "per_device_train_batch_size": 16
163
  }
164
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.6702412868632708,
13
+ "grad_norm": 5.694277763366699,
14
+ "learning_rate": 2.542005392773407e-05,
15
+ "loss": 0.557,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.7254441976547241,
21
+ "eval_loss": 0.5171247720718384,
22
+ "eval_runtime": 8.773,
23
+ "eval_samples_per_second": 340.02,
24
+ "eval_steps_per_second": 21.315,
25
  "step": 746
26
  },
27
  {
28
  "epoch": 1.3404825737265416,
29
+ "grad_norm": 8.974740028381348,
30
+ "learning_rate": 2.359390062832789e-05,
31
+ "loss": 0.4156,
32
  "step": 1000
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.7596379518508911,
37
+ "eval_loss": 0.6025224924087524,
38
+ "eval_runtime": 8.8883,
39
+ "eval_samples_per_second": 335.609,
40
+ "eval_steps_per_second": 21.039,
41
  "step": 1492
42
  },
43
  {
44
  "epoch": 2.0107238605898123,
45
+ "grad_norm": 7.7003068923950195,
46
+ "learning_rate": 2.1767747328921705e-05,
47
+ "loss": 0.2948,
48
  "step": 1500
49
  },
50
  {
51
  "epoch": 2.680965147453083,
52
+ "grad_norm": 20.24570655822754,
53
+ "learning_rate": 1.9941594029515523e-05,
54
+ "loss": 0.1262,
55
  "step": 2000
56
  },
57
  {
58
  "epoch": 3.0,
59
+ "eval_accuracy": 0.7703654170036316,
60
+ "eval_loss": 0.822274386882782,
61
+ "eval_runtime": 8.8709,
62
+ "eval_samples_per_second": 336.267,
63
+ "eval_steps_per_second": 21.08,
64
  "step": 2238
65
  },
66
  {
67
  "epoch": 3.351206434316354,
68
+ "grad_norm": 0.9093023538589478,
69
+ "learning_rate": 1.8115440730109338e-05,
70
+ "loss": 0.1012,
71
  "step": 2500
72
  },
73
  {
74
  "epoch": 4.0,
75
+ "eval_accuracy": 0.7683539986610413,
76
+ "eval_loss": 1.2840174436569214,
77
+ "eval_runtime": 8.9163,
78
+ "eval_samples_per_second": 334.557,
79
+ "eval_steps_per_second": 20.973,
80
  "step": 2984
81
  },
82
  {
83
  "epoch": 4.021447721179625,
84
+ "grad_norm": 29.010135650634766,
85
+ "learning_rate": 1.6289287430703153e-05,
86
+ "loss": 0.0675,
87
  "step": 3000
88
  },
89
  {
90
  "epoch": 4.6916890080428955,
91
+ "grad_norm": 5.461940288543701,
92
+ "learning_rate": 1.4463134131296973e-05,
93
+ "loss": 0.0379,
94
  "step": 3500
95
  },
96
  {
97
  "epoch": 5.0,
98
+ "eval_accuracy": 0.7700302004814148,
99
+ "eval_loss": 1.4166399240493774,
100
+ "eval_runtime": 8.8683,
101
+ "eval_samples_per_second": 336.367,
102
+ "eval_steps_per_second": 21.086,
103
  "step": 3730
104
  },
105
  {
106
  "epoch": 5.361930294906166,
107
+ "grad_norm": 0.004815839231014252,
108
+ "learning_rate": 1.2636980831890788e-05,
109
+ "loss": 0.034,
110
  "step": 4000
111
  },
112
  {
113
  "epoch": 6.0,
114
+ "eval_accuracy": 0.7720415592193604,
115
+ "eval_loss": 1.576446533203125,
116
+ "eval_runtime": 8.9004,
117
+ "eval_samples_per_second": 335.152,
118
+ "eval_steps_per_second": 21.01,
119
  "step": 4476
120
  },
121
  {
122
  "epoch": 6.032171581769437,
123
+ "grad_norm": 0.29464954137802124,
124
+ "learning_rate": 1.0810827532484605e-05,
125
+ "loss": 0.0175,
126
  "step": 4500
127
  },
128
  {
129
  "epoch": 6.702412868632708,
130
+ "grad_norm": 0.010658634826540947,
131
+ "learning_rate": 8.984674233078421e-06,
132
+ "loss": 0.0101,
133
  "step": 5000
134
  },
135
  {
136
  "epoch": 7.0,
137
+ "eval_accuracy": 0.7753939032554626,
138
+ "eval_loss": 1.5760776996612549,
139
+ "eval_runtime": 8.8735,
140
+ "eval_samples_per_second": 336.169,
141
+ "eval_steps_per_second": 21.074,
142
  "step": 5222
143
  },
144
  {
145
  "epoch": 7.372654155495979,
146
+ "grad_norm": 32.647804260253906,
147
+ "learning_rate": 7.158520933672239e-06,
148
+ "loss": 0.0101,
149
  "step": 5500
150
  }
151
  ],
 
154
  "num_input_tokens_seen": 0,
155
  "num_train_epochs": 10,
156
  "save_steps": 500,
157
+ "total_flos": 1.307182215348216e+16,
158
  "train_batch_size": 16,
159
  "trial_name": null,
160
  "trial_params": {
161
+ "learning_rate": 2.7246207227140256e-05,
162
  "per_device_train_batch_size": 16
163
  }
164
  }