inflaton commited on
Commit
8619e88
1 Parent(s): 3f7fbce

Training in progress, step 2500

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8aac62e03ade79bd140ad03d4165c4c7674c4b98432ea9d71cb5d14f49ccdee
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19c2e9b0d7dc747a8d8d78f5dc799a8808ec878848fb8586ab212cf9c8c81ffb
3
  size 1340618660
run-1/checkpoint-1000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c088729f3427ac5c1e2df68174b0854bf2324672574affd7736e3e6bb2a9acd
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99df09116ca6c07cd2ee50f92c83af65b6ed682f87444c81d56debdb15a67bc7
3
  size 1340618660
run-1/checkpoint-1000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24fabdf20c53024f38140ea6ff24d52a80b8508e81735b52184322551cd78897
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2d18b53a4e7f25346bd7c16172a4adef79a6edeb39f9f4050e609b09add8364
3
  size 2681472237
run-1/checkpoint-1000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:54bea121f6144566f091fc3db2c9498700b03cbc999f62b42941d6dfd7452436
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee5d4f5bf9cb098c9058ef37d6ea26594c6643d7293de0cc233fefe6dc2e9266
3
  size 14244
run-1/checkpoint-1000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:86e0387f8c1f0c455d9d47684623696b0dd21029c4ed1b59ee625cc8620c5218
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59e2c9d4f5aab1b41945071fcf783c8c98428267f9d726c5315008dd5ee958e5
3
  size 1064
run-1/checkpoint-1000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d4b8161881416da68cca372d0db6cfca426bea24c4463a15d5c25fafd1f2c02
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb93c6c6aec435a71a91d9e9d85d4dd9cc0b1363951552ae02212d34fb657545
3
  size 5048
run-1/checkpoint-1500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad04493da459f0813f0a12a3a1e6615890d448cab7cd6f98457af575a0587de8
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bd242d25db48d533f5b419e9accae96c0bbb75abac1929a7bc3ff3c8fa62507
3
  size 1340618660
run-1/checkpoint-1500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e8184cdbc47a88fc421339a73ca2bc3f26fda024fd480e96872d6a7d2950dabb
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57dc78f006df17d06afd83a6c804746d65526acad93c0007b6985d4008e130c9
3
  size 2681472237
run-1/checkpoint-1500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d94e9959fe10619a73c116e156a1d9aa901f533fcb5d140a1ce44038a0d2c6fd
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd348b5ed52326f9e7c0a4fc04aeede902c855e83469ebb30ebe75d48562d6f0
3
  size 14244
run-1/checkpoint-1500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f2c5a1d01f0f1f7e70b8c4640a04bfb69042c6b12da253e819ac542e2bbb605
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cac12569c89bd6af0a6c464c2ccb5502cfa80b78c591436907cf2144eff2ef7
3
  size 1064
run-1/checkpoint-1500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d4b8161881416da68cca372d0db6cfca426bea24c4463a15d5c25fafd1f2c02
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb93c6c6aec435a71a91d9e9d85d4dd9cc0b1363951552ae02212d34fb657545
3
  size 5048
run-1/checkpoint-2000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c02a37660ef6bf57a92f148d0dee9e3d689ee051360bce911faee8e264d385bc
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:459d423b4a4cab135485dbccd2081772edb7d23a6b21139598082433ac96a4c8
3
  size 1340618660
run-1/checkpoint-2000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7962c95fabee8dfa19bc5b350d007585f00550d1180e59d92317dafa5e3865ec
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:644bad49b0cb9c00f4eb96528debee31287df83b350f2841ba416a7119f87223
3
  size 2681472237
run-1/checkpoint-2000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75c227961e4ca5d0f7ba2c4abe0a18977107a0b2c234fd11994cf5a6ecf20ea8
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b49bea260ecfd05aebf3d6cf4bfc1d44acdc012600b03e899ea43dd79114c75d
3
  size 14244
run-1/checkpoint-2000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8aab70f36df8a8105cdb3296e9361f9efbddc35ccdc7d44c370ace77d4ff888
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e241ad10906a6203ac7c777fe41f5c867da1c3afcff02ec2e9f7a5fe7164cc6
3
  size 1064
run-1/checkpoint-2000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d4b8161881416da68cca372d0db6cfca426bea24c4463a15d5c25fafd1f2c02
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb93c6c6aec435a71a91d9e9d85d4dd9cc0b1363951552ae02212d34fb657545
3
  size 5048
run-1/checkpoint-2500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a41db24a10e574e2b78eeb6f866395aa88c5230db6759d5b95ca29c9d5a28a6
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19c2e9b0d7dc747a8d8d78f5dc799a8808ec878848fb8586ab212cf9c8c81ffb
3
  size 1340618660
run-1/checkpoint-2500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bafc3f7f7a6509710922db78c915ecfe67f71d01e55763c57cafa025f5b507ab
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98005b87b74e181ecf343bb98631f3732bad5e17c680437d5a2b007ef6db45a7
3
  size 2681472237
run-1/checkpoint-2500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:59660cfbae90c75aff22fbb936299b2868fa9acddf052cbe56f2d442145f3b8b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8cec4ebd0626f313f72eea7d5bf9683890b0894a82bb4c8ec29edc9e8142906
3
  size 14244
run-1/checkpoint-2500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:323f7ece6edd36dc80d4f36bf9473a3dc7924fe86832e5afe55cc73ff4cb517a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c0fea71d66caec6080c6e087ac6154fb81cbccfc41bd256c64df83f2553485e
3
  size 1064
run-1/checkpoint-2500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.351206434316354,
5
  "eval_steps": 500,
6
  "global_step": 2500,
7
  "is_hyper_param_search": true,
@@ -9,78 +9,105 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.6702412868632708,
13
- "grad_norm": 20.23386001586914,
14
- "learning_rate": 1.2656372669468497e-05,
15
- "loss": 0.5543,
 
 
 
 
 
 
 
 
 
16
  "step": 500
17
  },
18
  {
19
- "epoch": 1.0,
20
- "eval_accuracy": 0.7663425803184509,
21
- "eval_loss": 0.4670431315898895,
22
- "eval_runtime": 8.8872,
23
- "eval_samples_per_second": 335.653,
24
- "eval_steps_per_second": 21.042,
25
  "step": 746
26
  },
27
  {
28
- "epoch": 1.3404825737265416,
29
- "grad_norm": 15.927043914794922,
30
- "learning_rate": 1.1747150494937714e-05,
31
- "loss": 0.3962,
32
  "step": 1000
33
  },
34
  {
35
- "epoch": 2.0,
36
- "eval_accuracy": 0.7676835656166077,
37
- "eval_loss": 0.5088892579078674,
38
- "eval_runtime": 8.9675,
39
- "eval_samples_per_second": 332.647,
40
- "eval_steps_per_second": 20.853,
 
 
 
 
 
 
 
 
 
41
  "step": 1492
42
  },
43
  {
44
- "epoch": 2.0107238605898123,
45
- "grad_norm": 13.621393203735352,
46
- "learning_rate": 1.0837928320406931e-05,
47
- "loss": 0.3095,
48
  "step": 1500
49
  },
50
  {
51
- "epoch": 2.680965147453083,
52
- "grad_norm": 11.860713005065918,
53
- "learning_rate": 9.92870614587615e-06,
54
- "loss": 0.1321,
 
 
 
 
 
 
 
 
 
55
  "step": 2000
56
  },
57
  {
58
- "epoch": 3.0,
59
- "eval_accuracy": 0.7743881940841675,
60
- "eval_loss": 0.7907660007476807,
61
- "eval_runtime": 8.9795,
62
- "eval_samples_per_second": 332.2,
63
- "eval_steps_per_second": 20.825,
64
  "step": 2238
65
  },
66
  {
67
- "epoch": 3.351206434316354,
68
- "grad_norm": 4.521104335784912,
69
- "learning_rate": 9.019483971345365e-06,
70
- "loss": 0.0899,
71
  "step": 2500
72
  }
73
  ],
74
  "logging_steps": 500,
75
- "max_steps": 7460,
76
  "num_input_tokens_seen": 0,
77
  "num_train_epochs": 10,
78
  "save_steps": 500,
79
- "total_flos": 5932068918206580.0,
80
- "train_batch_size": 16,
81
  "trial_name": null,
82
  "trial_params": {
83
- "learning_rate": 1.356559484399928e-05,
84
- "per_device_train_batch_size": 16
85
  }
86
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 6.702412868632708,
5
  "eval_steps": 500,
6
  "global_step": 2500,
7
  "is_hyper_param_search": true,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 1.0,
13
+ "eval_accuracy": 0.7106939554214478,
14
+ "eval_loss": 0.538670003414154,
15
+ "eval_runtime": 8.938,
16
+ "eval_samples_per_second": 333.743,
17
+ "eval_steps_per_second": 20.922,
18
+ "step": 373
19
+ },
20
+ {
21
+ "epoch": 1.3404825737265416,
22
+ "grad_norm": 6.848696231842041,
23
+ "learning_rate": 2.0560750028839876e-06,
24
+ "loss": 0.5981,
25
  "step": 500
26
  },
27
  {
28
+ "epoch": 2.0,
29
+ "eval_accuracy": 0.7321488261222839,
30
+ "eval_loss": 0.5092849135398865,
31
+ "eval_runtime": 8.9404,
32
+ "eval_samples_per_second": 333.653,
33
+ "eval_steps_per_second": 20.916,
34
  "step": 746
35
  },
36
  {
37
+ "epoch": 2.680965147453083,
38
+ "grad_norm": 9.747313499450684,
39
+ "learning_rate": 1.7377971386604603e-06,
40
+ "loss": 0.4883,
41
  "step": 1000
42
  },
43
  {
44
+ "epoch": 3.0,
45
+ "eval_accuracy": 0.7412001490592957,
46
+ "eval_loss": 0.4964694678783417,
47
+ "eval_runtime": 8.8911,
48
+ "eval_samples_per_second": 335.504,
49
+ "eval_steps_per_second": 21.032,
50
+ "step": 1119
51
+ },
52
+ {
53
+ "epoch": 4.0,
54
+ "eval_accuracy": 0.7412001490592957,
55
+ "eval_loss": 0.5169216394424438,
56
+ "eval_runtime": 8.9728,
57
+ "eval_samples_per_second": 332.448,
58
+ "eval_steps_per_second": 20.841,
59
  "step": 1492
60
  },
61
  {
62
+ "epoch": 4.021447721179625,
63
+ "grad_norm": 6.287171363830566,
64
+ "learning_rate": 1.4195192744369326e-06,
65
+ "loss": 0.4333,
66
  "step": 1500
67
  },
68
  {
69
+ "epoch": 5.0,
70
+ "eval_accuracy": 0.7422058582305908,
71
+ "eval_loss": 0.5148842930793762,
72
+ "eval_runtime": 8.9557,
73
+ "eval_samples_per_second": 333.084,
74
+ "eval_steps_per_second": 20.881,
75
+ "step": 1865
76
+ },
77
+ {
78
+ "epoch": 5.361930294906166,
79
+ "grad_norm": 10.289645195007324,
80
+ "learning_rate": 1.101241410213405e-06,
81
+ "loss": 0.3746,
82
  "step": 2000
83
  },
84
  {
85
+ "epoch": 6.0,
86
+ "eval_accuracy": 0.744552493095398,
87
+ "eval_loss": 0.5290147662162781,
88
+ "eval_runtime": 8.9283,
89
+ "eval_samples_per_second": 334.107,
90
+ "eval_steps_per_second": 20.945,
91
  "step": 2238
92
  },
93
  {
94
+ "epoch": 6.702412868632708,
95
+ "grad_norm": 7.953075408935547,
96
+ "learning_rate": 7.829635459898776e-07,
97
+ "loss": 0.3503,
98
  "step": 2500
99
  }
100
  ],
101
  "logging_steps": 500,
102
+ "max_steps": 3730,
103
  "num_input_tokens_seen": 0,
104
  "num_train_epochs": 10,
105
  "save_steps": 500,
106
+ "total_flos": 1.2728741518130688e+16,
107
+ "train_batch_size": 32,
108
  "trial_name": null,
109
  "trial_params": {
110
+ "learning_rate": 2.374352867107515e-06,
111
+ "per_device_train_batch_size": 32
112
  }
113
  }
run-1/checkpoint-2500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d4b8161881416da68cca372d0db6cfca426bea24c4463a15d5c25fafd1f2c02
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb93c6c6aec435a71a91d9e9d85d4dd9cc0b1363951552ae02212d34fb657545
3
  size 5048
run-1/checkpoint-3000/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 4.021447721179625,
5
  "eval_steps": 500,
6
  "global_step": 3000,
7
  "is_hyper_param_search": true,
@@ -9,94 +9,130 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.6702412868632708,
13
- "grad_norm": 20.23386001586914,
14
- "learning_rate": 1.2656372669468497e-05,
15
- "loss": 0.5543,
 
 
 
 
 
 
 
 
 
16
  "step": 500
17
  },
18
  {
19
- "epoch": 1.0,
20
- "eval_accuracy": 0.7663425803184509,
21
- "eval_loss": 0.4670431315898895,
22
- "eval_runtime": 8.8872,
23
- "eval_samples_per_second": 335.653,
24
- "eval_steps_per_second": 21.042,
25
  "step": 746
26
  },
27
  {
28
- "epoch": 1.3404825737265416,
29
- "grad_norm": 15.927043914794922,
30
- "learning_rate": 1.1747150494937714e-05,
31
- "loss": 0.3962,
32
  "step": 1000
33
  },
34
  {
35
- "epoch": 2.0,
36
- "eval_accuracy": 0.7676835656166077,
37
- "eval_loss": 0.5088892579078674,
38
- "eval_runtime": 8.9675,
39
- "eval_samples_per_second": 332.647,
40
- "eval_steps_per_second": 20.853,
 
 
 
 
 
 
 
 
 
41
  "step": 1492
42
  },
43
  {
44
- "epoch": 2.0107238605898123,
45
- "grad_norm": 13.621393203735352,
46
- "learning_rate": 1.0837928320406931e-05,
47
- "loss": 0.3095,
48
  "step": 1500
49
  },
50
  {
51
- "epoch": 2.680965147453083,
52
- "grad_norm": 11.860713005065918,
53
- "learning_rate": 9.92870614587615e-06,
54
- "loss": 0.1321,
 
 
 
 
 
 
 
 
 
55
  "step": 2000
56
  },
57
  {
58
- "epoch": 3.0,
59
- "eval_accuracy": 0.7743881940841675,
60
- "eval_loss": 0.7907660007476807,
61
- "eval_runtime": 8.9795,
62
- "eval_samples_per_second": 332.2,
63
- "eval_steps_per_second": 20.825,
64
  "step": 2238
65
  },
66
  {
67
- "epoch": 3.351206434316354,
68
- "grad_norm": 4.521104335784912,
69
- "learning_rate": 9.019483971345365e-06,
70
- "loss": 0.0899,
71
  "step": 2500
72
  },
73
  {
74
- "epoch": 4.0,
75
- "eval_accuracy": 0.7730472683906555,
76
- "eval_loss": 1.2473280429840088,
77
- "eval_runtime": 9.0219,
78
- "eval_samples_per_second": 330.638,
79
- "eval_steps_per_second": 20.727,
 
 
 
 
 
 
 
 
 
80
  "step": 2984
81
  },
82
  {
83
- "epoch": 4.021447721179625,
84
- "grad_norm": 1.990652084350586,
85
- "learning_rate": 8.110261796814582e-06,
86
- "loss": 0.0644,
87
  "step": 3000
88
  }
89
  ],
90
  "logging_steps": 500,
91
- "max_steps": 7460,
92
  "num_input_tokens_seen": 0,
93
  "num_train_epochs": 10,
94
  "save_steps": 500,
95
- "total_flos": 7120029848297796.0,
96
- "train_batch_size": 16,
97
  "trial_name": null,
98
  "trial_params": {
99
- "learning_rate": 1.356559484399928e-05,
100
- "per_device_train_batch_size": 16
101
  }
102
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 8.04289544235925,
5
  "eval_steps": 500,
6
  "global_step": 3000,
7
  "is_hyper_param_search": true,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 1.0,
13
+ "eval_accuracy": 0.7106939554214478,
14
+ "eval_loss": 0.538670003414154,
15
+ "eval_runtime": 8.938,
16
+ "eval_samples_per_second": 333.743,
17
+ "eval_steps_per_second": 20.922,
18
+ "step": 373
19
+ },
20
+ {
21
+ "epoch": 1.3404825737265416,
22
+ "grad_norm": 6.848696231842041,
23
+ "learning_rate": 2.0560750028839876e-06,
24
+ "loss": 0.5981,
25
  "step": 500
26
  },
27
  {
28
+ "epoch": 2.0,
29
+ "eval_accuracy": 0.7321488261222839,
30
+ "eval_loss": 0.5092849135398865,
31
+ "eval_runtime": 8.9404,
32
+ "eval_samples_per_second": 333.653,
33
+ "eval_steps_per_second": 20.916,
34
  "step": 746
35
  },
36
  {
37
+ "epoch": 2.680965147453083,
38
+ "grad_norm": 9.747313499450684,
39
+ "learning_rate": 1.7377971386604603e-06,
40
+ "loss": 0.4883,
41
  "step": 1000
42
  },
43
  {
44
+ "epoch": 3.0,
45
+ "eval_accuracy": 0.7412001490592957,
46
+ "eval_loss": 0.4964694678783417,
47
+ "eval_runtime": 8.8911,
48
+ "eval_samples_per_second": 335.504,
49
+ "eval_steps_per_second": 21.032,
50
+ "step": 1119
51
+ },
52
+ {
53
+ "epoch": 4.0,
54
+ "eval_accuracy": 0.7412001490592957,
55
+ "eval_loss": 0.5169216394424438,
56
+ "eval_runtime": 8.9728,
57
+ "eval_samples_per_second": 332.448,
58
+ "eval_steps_per_second": 20.841,
59
  "step": 1492
60
  },
61
  {
62
+ "epoch": 4.021447721179625,
63
+ "grad_norm": 6.287171363830566,
64
+ "learning_rate": 1.4195192744369326e-06,
65
+ "loss": 0.4333,
66
  "step": 1500
67
  },
68
  {
69
+ "epoch": 5.0,
70
+ "eval_accuracy": 0.7422058582305908,
71
+ "eval_loss": 0.5148842930793762,
72
+ "eval_runtime": 8.9557,
73
+ "eval_samples_per_second": 333.084,
74
+ "eval_steps_per_second": 20.881,
75
+ "step": 1865
76
+ },
77
+ {
78
+ "epoch": 5.361930294906166,
79
+ "grad_norm": 10.289645195007324,
80
+ "learning_rate": 1.101241410213405e-06,
81
+ "loss": 0.3746,
82
  "step": 2000
83
  },
84
  {
85
+ "epoch": 6.0,
86
+ "eval_accuracy": 0.744552493095398,
87
+ "eval_loss": 0.5290147662162781,
88
+ "eval_runtime": 8.9283,
89
+ "eval_samples_per_second": 334.107,
90
+ "eval_steps_per_second": 20.945,
91
  "step": 2238
92
  },
93
  {
94
+ "epoch": 6.702412868632708,
95
+ "grad_norm": 7.953075408935547,
96
+ "learning_rate": 7.829635459898776e-07,
97
+ "loss": 0.3503,
98
  "step": 2500
99
  },
100
  {
101
+ "epoch": 7.0,
102
+ "eval_accuracy": 0.7505866289138794,
103
+ "eval_loss": 0.5396992564201355,
104
+ "eval_runtime": 8.8627,
105
+ "eval_samples_per_second": 336.58,
106
+ "eval_steps_per_second": 21.1,
107
+ "step": 2611
108
+ },
109
+ {
110
+ "epoch": 8.0,
111
+ "eval_accuracy": 0.7492457032203674,
112
+ "eval_loss": 0.5516654253005981,
113
+ "eval_runtime": 8.9319,
114
+ "eval_samples_per_second": 333.971,
115
+ "eval_steps_per_second": 20.936,
116
  "step": 2984
117
  },
118
  {
119
+ "epoch": 8.04289544235925,
120
+ "grad_norm": 8.618916511535645,
121
+ "learning_rate": 4.646856817663501e-07,
122
+ "loss": 0.3183,
123
  "step": 3000
124
  }
125
  ],
126
  "logging_steps": 500,
127
+ "max_steps": 3730,
128
  "num_input_tokens_seen": 0,
129
  "num_train_epochs": 10,
130
  "save_steps": 500,
131
+ "total_flos": 1.52601424956666e+16,
132
+ "train_batch_size": 32,
133
  "trial_name": null,
134
  "trial_params": {
135
+ "learning_rate": 2.374352867107515e-06,
136
+ "per_device_train_batch_size": 32
137
  }
138
  }
run-1/checkpoint-3500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 4.6916890080428955,
5
  "eval_steps": 500,
6
  "global_step": 3500,
7
  "is_hyper_param_search": true,
@@ -9,101 +9,146 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.6702412868632708,
13
- "grad_norm": 20.23386001586914,
14
- "learning_rate": 1.2656372669468497e-05,
15
- "loss": 0.5543,
 
 
 
 
 
 
 
 
 
16
  "step": 500
17
  },
18
  {
19
- "epoch": 1.0,
20
- "eval_accuracy": 0.7663425803184509,
21
- "eval_loss": 0.4670431315898895,
22
- "eval_runtime": 8.8872,
23
- "eval_samples_per_second": 335.653,
24
- "eval_steps_per_second": 21.042,
25
  "step": 746
26
  },
27
  {
28
- "epoch": 1.3404825737265416,
29
- "grad_norm": 15.927043914794922,
30
- "learning_rate": 1.1747150494937714e-05,
31
- "loss": 0.3962,
32
  "step": 1000
33
  },
34
  {
35
- "epoch": 2.0,
36
- "eval_accuracy": 0.7676835656166077,
37
- "eval_loss": 0.5088892579078674,
38
- "eval_runtime": 8.9675,
39
- "eval_samples_per_second": 332.647,
40
- "eval_steps_per_second": 20.853,
 
 
 
 
 
 
 
 
 
41
  "step": 1492
42
  },
43
  {
44
- "epoch": 2.0107238605898123,
45
- "grad_norm": 13.621393203735352,
46
- "learning_rate": 1.0837928320406931e-05,
47
- "loss": 0.3095,
48
  "step": 1500
49
  },
50
  {
51
- "epoch": 2.680965147453083,
52
- "grad_norm": 11.860713005065918,
53
- "learning_rate": 9.92870614587615e-06,
54
- "loss": 0.1321,
 
 
 
 
 
 
 
 
 
55
  "step": 2000
56
  },
57
  {
58
- "epoch": 3.0,
59
- "eval_accuracy": 0.7743881940841675,
60
- "eval_loss": 0.7907660007476807,
61
- "eval_runtime": 8.9795,
62
- "eval_samples_per_second": 332.2,
63
- "eval_steps_per_second": 20.825,
64
  "step": 2238
65
  },
66
  {
67
- "epoch": 3.351206434316354,
68
- "grad_norm": 4.521104335784912,
69
- "learning_rate": 9.019483971345365e-06,
70
- "loss": 0.0899,
71
  "step": 2500
72
  },
73
  {
74
- "epoch": 4.0,
75
- "eval_accuracy": 0.7730472683906555,
76
- "eval_loss": 1.2473280429840088,
77
- "eval_runtime": 9.0219,
78
- "eval_samples_per_second": 330.638,
79
- "eval_steps_per_second": 20.727,
 
 
 
 
 
 
 
 
 
80
  "step": 2984
81
  },
82
  {
83
- "epoch": 4.021447721179625,
84
- "grad_norm": 1.990652084350586,
85
- "learning_rate": 8.110261796814582e-06,
86
- "loss": 0.0644,
87
  "step": 3000
88
  },
89
  {
90
- "epoch": 4.6916890080428955,
91
- "grad_norm": 28.1745662689209,
92
- "learning_rate": 7.2010396222838e-06,
93
- "loss": 0.0335,
 
 
 
 
 
 
 
 
 
94
  "step": 3500
95
  }
96
  ],
97
  "logging_steps": 500,
98
- "max_steps": 7460,
99
  "num_input_tokens_seen": 0,
100
  "num_train_epochs": 10,
101
  "save_steps": 500,
102
- "total_flos": 8303869908315396.0,
103
- "train_batch_size": 16,
104
  "trial_name": null,
105
  "trial_params": {
106
- "learning_rate": 1.356559484399928e-05,
107
- "per_device_train_batch_size": 16
108
  }
109
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 9.383378016085791,
5
  "eval_steps": 500,
6
  "global_step": 3500,
7
  "is_hyper_param_search": true,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 1.0,
13
+ "eval_accuracy": 0.7106939554214478,
14
+ "eval_loss": 0.538670003414154,
15
+ "eval_runtime": 8.938,
16
+ "eval_samples_per_second": 333.743,
17
+ "eval_steps_per_second": 20.922,
18
+ "step": 373
19
+ },
20
+ {
21
+ "epoch": 1.3404825737265416,
22
+ "grad_norm": 6.848696231842041,
23
+ "learning_rate": 2.0560750028839876e-06,
24
+ "loss": 0.5981,
25
  "step": 500
26
  },
27
  {
28
+ "epoch": 2.0,
29
+ "eval_accuracy": 0.7321488261222839,
30
+ "eval_loss": 0.5092849135398865,
31
+ "eval_runtime": 8.9404,
32
+ "eval_samples_per_second": 333.653,
33
+ "eval_steps_per_second": 20.916,
34
  "step": 746
35
  },
36
  {
37
+ "epoch": 2.680965147453083,
38
+ "grad_norm": 9.747313499450684,
39
+ "learning_rate": 1.7377971386604603e-06,
40
+ "loss": 0.4883,
41
  "step": 1000
42
  },
43
  {
44
+ "epoch": 3.0,
45
+ "eval_accuracy": 0.7412001490592957,
46
+ "eval_loss": 0.4964694678783417,
47
+ "eval_runtime": 8.8911,
48
+ "eval_samples_per_second": 335.504,
49
+ "eval_steps_per_second": 21.032,
50
+ "step": 1119
51
+ },
52
+ {
53
+ "epoch": 4.0,
54
+ "eval_accuracy": 0.7412001490592957,
55
+ "eval_loss": 0.5169216394424438,
56
+ "eval_runtime": 8.9728,
57
+ "eval_samples_per_second": 332.448,
58
+ "eval_steps_per_second": 20.841,
59
  "step": 1492
60
  },
61
  {
62
+ "epoch": 4.021447721179625,
63
+ "grad_norm": 6.287171363830566,
64
+ "learning_rate": 1.4195192744369326e-06,
65
+ "loss": 0.4333,
66
  "step": 1500
67
  },
68
  {
69
+ "epoch": 5.0,
70
+ "eval_accuracy": 0.7422058582305908,
71
+ "eval_loss": 0.5148842930793762,
72
+ "eval_runtime": 8.9557,
73
+ "eval_samples_per_second": 333.084,
74
+ "eval_steps_per_second": 20.881,
75
+ "step": 1865
76
+ },
77
+ {
78
+ "epoch": 5.361930294906166,
79
+ "grad_norm": 10.289645195007324,
80
+ "learning_rate": 1.101241410213405e-06,
81
+ "loss": 0.3746,
82
  "step": 2000
83
  },
84
  {
85
+ "epoch": 6.0,
86
+ "eval_accuracy": 0.744552493095398,
87
+ "eval_loss": 0.5290147662162781,
88
+ "eval_runtime": 8.9283,
89
+ "eval_samples_per_second": 334.107,
90
+ "eval_steps_per_second": 20.945,
91
  "step": 2238
92
  },
93
  {
94
+ "epoch": 6.702412868632708,
95
+ "grad_norm": 7.953075408935547,
96
+ "learning_rate": 7.829635459898776e-07,
97
+ "loss": 0.3503,
98
  "step": 2500
99
  },
100
  {
101
+ "epoch": 7.0,
102
+ "eval_accuracy": 0.7505866289138794,
103
+ "eval_loss": 0.5396992564201355,
104
+ "eval_runtime": 8.8627,
105
+ "eval_samples_per_second": 336.58,
106
+ "eval_steps_per_second": 21.1,
107
+ "step": 2611
108
+ },
109
+ {
110
+ "epoch": 8.0,
111
+ "eval_accuracy": 0.7492457032203674,
112
+ "eval_loss": 0.5516654253005981,
113
+ "eval_runtime": 8.9319,
114
+ "eval_samples_per_second": 333.971,
115
+ "eval_steps_per_second": 20.936,
116
  "step": 2984
117
  },
118
  {
119
+ "epoch": 8.04289544235925,
120
+ "grad_norm": 8.618916511535645,
121
+ "learning_rate": 4.646856817663501e-07,
122
+ "loss": 0.3183,
123
  "step": 3000
124
  },
125
  {
126
+ "epoch": 9.0,
127
+ "eval_accuracy": 0.7472343444824219,
128
+ "eval_loss": 0.5633240938186646,
129
+ "eval_runtime": 8.9492,
130
+ "eval_samples_per_second": 333.326,
131
+ "eval_steps_per_second": 20.896,
132
+ "step": 3357
133
+ },
134
+ {
135
+ "epoch": 9.383378016085791,
136
+ "grad_norm": 10.000330924987793,
137
+ "learning_rate": 1.4640781754282265e-07,
138
+ "loss": 0.2984,
139
  "step": 3500
140
  }
141
  ],
142
  "logging_steps": 500,
143
+ "max_steps": 3730,
144
  "num_input_tokens_seen": 0,
145
  "num_train_epochs": 10,
146
  "save_steps": 500,
147
+ "total_flos": 1.781054971229646e+16,
148
+ "train_batch_size": 32,
149
  "trial_name": null,
150
  "trial_params": {
151
+ "learning_rate": 2.374352867107515e-06,
152
+ "per_device_train_batch_size": 32
153
  }
154
  }