inflaton commited on
Commit
e83a66f
1 Parent(s): 40e50c1

Training in progress, step 3500

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03f58060fd011a98d8f2d969998b3f2b03b259ce50532656dcac85415292b624
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b23bd5bcb6c3f20acbd47f46114a67d639b650403922c7d1cf092f251113025
3
  size 1340618660
run-2/checkpoint-2500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a4823de534c03cd8365c4d1063243e9562f9ad233f8e5b72c4faf9e5d30e9a9
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd03c9a052519a37796fd580798a1b7167c12e63f4edbe7fca408f06cff6e5e4
3
  size 1340618660
run-2/checkpoint-2500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:900bdd243dc7019f81b887d4d33268ba44e786303120f092f21956eaffde22c5
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd55af21bbe43a82736b823ac7d9272daaf1458487f7e4061546d9909d7ec30b
3
  size 2681472237
run-2/checkpoint-2500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:59660cfbae90c75aff22fbb936299b2868fa9acddf052cbe56f2d442145f3b8b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8cec4ebd0626f313f72eea7d5bf9683890b0894a82bb4c8ec29edc9e8142906
3
  size 14244
run-2/checkpoint-2500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc5b41e5527ab73bd9c8d7179f5eb7befb77760a1f96a4dd58ff327d4748a0c9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:009a2f5834038eb367fc915673c02337753238147294fc9ac84286149749ba11
3
  size 1064
run-2/checkpoint-2500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c68cfe34dae325096495265befbc04045f52f535badd4f779e9c272d3d0b0f80
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ed49fe002187d5ffed6d819c5938a827ebb9152529606096a657c65a1141ceb
3
  size 5048
run-2/checkpoint-3000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee681dc84fac79d1d9d643c003b15bd7ccdee1fb48cb870b660bb2ee31378926
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25be34950a69b17ba12b1d3ccfe75faee8c6694f3b41f2e87bd9a121a42947e0
3
  size 1340618660
run-2/checkpoint-3000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2782860aba135a87310c45e9585c94228cb176d6a10f199e3b982b07a1496824
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:334df518cbe61cc8ee49b8158465538643593267e658da6228234d6649832566
3
  size 2681472237
run-2/checkpoint-3000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6db288463401544c237747deaf541285079a7fac434ce2c87fca0252ed44b357
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:878fd85ac5cda037d90d9829e34a480865a8673564905ce79142723b0e6d8190
3
  size 14244
run-2/checkpoint-3000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6502a7770a12ff2e084bd7551eb23352078f9a6fdfa8b4cb93b3ae1994b447c
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf7c6fc647fdf7339fe77d6d702eedc9cb500398ae4245ed815ba9a6fa346f3d
3
  size 1064
run-2/checkpoint-3000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c68cfe34dae325096495265befbc04045f52f535badd4f779e9c272d3d0b0f80
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ed49fe002187d5ffed6d819c5938a827ebb9152529606096a657c65a1141ceb
3
  size 5048
run-2/checkpoint-3500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f507d33e2ac708b2aebe24a9e2832e2c9ca8ebfb0f277b02a31df18d9f9893b7
3
  size 1340618660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b23bd5bcb6c3f20acbd47f46114a67d639b650403922c7d1cf092f251113025
3
  size 1340618660
run-2/checkpoint-3500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4e47b93b237f3238d088c97ebb8c296c7735b81a7bc2954c8902b00452643ff
3
  size 2681472237
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d95c4d377f38ddd2fe3f1efdf0181471a229d634f0504184b4364f9903406c9
3
  size 2681472237
run-2/checkpoint-3500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f4337773f0e3eb3ebd819153d5c9547815b2eb781fb1b3b8acb5f2d3c9910ed
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f89a7fe4a55f5798a402c40524f63ed4391bf96f4737fa9c1c3b85a3eb033165
3
  size 14244
run-2/checkpoint-3500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b75a1a9f7ae761e7eb2cdacbdb01d35d249e5b0ce9653f5183e3f1e93019ba13
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77616bc41c3c2bf2895df13b27557d69c18633a5408ba8760434113beac7d02a
3
  size 1064
run-2/checkpoint-3500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 4.6916890080428955,
5
  "eval_steps": 500,
6
  "global_step": 3500,
7
  "is_hyper_param_search": true,
@@ -9,101 +9,146 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.6702412868632708,
13
- "grad_norm": 17.923147201538086,
14
- "learning_rate": 1.3001640168623715e-05,
15
- "loss": 0.5514,
 
 
 
 
 
 
 
 
 
16
  "step": 500
17
  },
18
  {
19
- "epoch": 1.0,
20
- "eval_accuracy": 0.7683539986610413,
21
- "eval_loss": 0.46452510356903076,
22
- "eval_runtime": 8.9706,
23
- "eval_samples_per_second": 332.531,
24
- "eval_steps_per_second": 20.846,
25
  "step": 746
26
  },
27
  {
28
- "epoch": 1.3404825737265416,
29
- "grad_norm": 19.32151222229004,
30
- "learning_rate": 1.2067614294440976e-05,
31
- "loss": 0.3969,
32
  "step": 1000
33
  },
34
  {
35
- "epoch": 2.0,
36
- "eval_accuracy": 0.7643312215805054,
37
- "eval_loss": 0.5671606063842773,
38
- "eval_runtime": 9.0896,
39
- "eval_samples_per_second": 328.177,
40
- "eval_steps_per_second": 20.573,
 
 
 
 
 
 
 
 
 
41
  "step": 1492
42
  },
43
  {
44
- "epoch": 2.0107238605898123,
45
- "grad_norm": 17.702476501464844,
46
- "learning_rate": 1.1133588420258238e-05,
47
- "loss": 0.2998,
48
  "step": 1500
49
  },
50
  {
51
- "epoch": 2.680965147453083,
52
- "grad_norm": 6.516891956329346,
53
- "learning_rate": 1.0199562546075501e-05,
54
- "loss": 0.1319,
 
 
 
 
 
 
 
 
 
55
  "step": 2000
56
  },
57
  {
58
- "epoch": 3.0,
59
- "eval_accuracy": 0.7770700454711914,
60
- "eval_loss": 0.7936307191848755,
61
- "eval_runtime": 9.0031,
62
- "eval_samples_per_second": 331.332,
63
- "eval_steps_per_second": 20.771,
64
  "step": 2238
65
  },
66
  {
67
- "epoch": 3.351206434316354,
68
- "grad_norm": 0.19944968819618225,
69
- "learning_rate": 9.265536671892763e-06,
70
- "loss": 0.0932,
71
  "step": 2500
72
  },
73
  {
74
- "epoch": 4.0,
75
- "eval_accuracy": 0.7750586867332458,
76
- "eval_loss": 1.1848183870315552,
77
- "eval_runtime": 9.064,
78
- "eval_samples_per_second": 329.104,
79
- "eval_steps_per_second": 20.631,
 
 
 
 
 
 
 
 
 
80
  "step": 2984
81
  },
82
  {
83
- "epoch": 4.021447721179625,
84
- "grad_norm": 0.03240065276622772,
85
- "learning_rate": 8.331510797710023e-06,
86
- "loss": 0.0618,
87
  "step": 3000
88
  },
89
  {
90
- "epoch": 4.6916890080428955,
91
- "grad_norm": 23.173229217529297,
92
- "learning_rate": 7.397484923527286e-06,
93
- "loss": 0.0314,
 
 
 
 
 
 
 
 
 
94
  "step": 3500
95
  }
96
  ],
97
  "logging_steps": 500,
98
- "max_steps": 7460,
99
  "num_input_tokens_seen": 0,
100
  "num_train_epochs": 10,
101
  "save_steps": 500,
102
- "total_flos": 8303869908315396.0,
103
- "train_batch_size": 16,
104
  "trial_name": null,
105
  "trial_params": {
106
- "learning_rate": 1.3935666042806453e-05,
107
- "per_device_train_batch_size": 16
108
  }
109
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 9.383378016085791,
5
  "eval_steps": 500,
6
  "global_step": 3500,
7
  "is_hyper_param_search": true,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 1.0,
13
+ "eval_accuracy": 0.7525980472564697,
14
+ "eval_loss": 0.4786739945411682,
15
+ "eval_runtime": 8.9408,
16
+ "eval_samples_per_second": 333.638,
17
+ "eval_steps_per_second": 20.915,
18
+ "step": 373
19
+ },
20
+ {
21
+ "epoch": 1.3404825737265416,
22
+ "grad_norm": 76.13782501220703,
23
+ "learning_rate": 1.6728270896334427e-05,
24
+ "loss": 0.4795,
25
  "step": 500
26
  },
27
  {
28
+ "epoch": 2.0,
29
+ "eval_accuracy": 0.7680187821388245,
30
+ "eval_loss": 0.5211130976676941,
31
+ "eval_runtime": 8.9873,
32
+ "eval_samples_per_second": 331.913,
33
+ "eval_steps_per_second": 20.807,
34
  "step": 746
35
  },
36
  {
37
+ "epoch": 2.680965147453083,
38
+ "grad_norm": 7.740549087524414,
39
+ "learning_rate": 1.4138755277706807e-05,
40
+ "loss": 0.2127,
41
  "step": 1000
42
  },
43
  {
44
+ "epoch": 3.0,
45
+ "eval_accuracy": 0.7683539986610413,
46
+ "eval_loss": 0.8232186436653137,
47
+ "eval_runtime": 8.9717,
48
+ "eval_samples_per_second": 332.49,
49
+ "eval_steps_per_second": 20.843,
50
+ "step": 1119
51
+ },
52
+ {
53
+ "epoch": 4.0,
54
+ "eval_accuracy": 0.7660073637962341,
55
+ "eval_loss": 1.132562518119812,
56
+ "eval_runtime": 9.1516,
57
+ "eval_samples_per_second": 325.954,
58
+ "eval_steps_per_second": 20.434,
59
  "step": 1492
60
  },
61
  {
62
+ "epoch": 4.021447721179625,
63
+ "grad_norm": 2.1688199043273926,
64
+ "learning_rate": 1.1549239659079185e-05,
65
+ "loss": 0.0802,
66
  "step": 1500
67
  },
68
  {
69
+ "epoch": 5.0,
70
+ "eval_accuracy": 0.7693597078323364,
71
+ "eval_loss": 1.26638662815094,
72
+ "eval_runtime": 9.0164,
73
+ "eval_samples_per_second": 330.843,
74
+ "eval_steps_per_second": 20.74,
75
+ "step": 1865
76
+ },
77
+ {
78
+ "epoch": 5.361930294906166,
79
+ "grad_norm": 3.62616229057312,
80
+ "learning_rate": 8.959724040451565e-06,
81
+ "loss": 0.0363,
82
  "step": 2000
83
  },
84
  {
85
+ "epoch": 6.0,
86
+ "eval_accuracy": 0.7639960050582886,
87
+ "eval_loss": 1.3062156438827515,
88
+ "eval_runtime": 9.0187,
89
+ "eval_samples_per_second": 330.757,
90
+ "eval_steps_per_second": 20.735,
91
  "step": 2238
92
  },
93
  {
94
+ "epoch": 6.702412868632708,
95
+ "grad_norm": 0.06093325465917587,
96
+ "learning_rate": 6.370208421823946e-06,
97
+ "loss": 0.0173,
98
  "step": 2500
99
  },
100
  {
101
+ "epoch": 7.0,
102
+ "eval_accuracy": 0.7693597078323364,
103
+ "eval_loss": 1.458601713180542,
104
+ "eval_runtime": 8.9619,
105
+ "eval_samples_per_second": 332.855,
106
+ "eval_steps_per_second": 20.866,
107
+ "step": 2611
108
+ },
109
+ {
110
+ "epoch": 8.0,
111
+ "eval_accuracy": 0.7727120518684387,
112
+ "eval_loss": 1.3809081315994263,
113
+ "eval_runtime": 9.0485,
114
+ "eval_samples_per_second": 329.668,
115
+ "eval_steps_per_second": 20.666,
116
  "step": 2984
117
  },
118
  {
119
+ "epoch": 8.04289544235925,
120
+ "grad_norm": 0.044611748307943344,
121
+ "learning_rate": 3.7806928031963256e-06,
122
+ "loss": 0.0078,
123
  "step": 3000
124
  },
125
  {
126
+ "epoch": 9.0,
127
+ "eval_accuracy": 0.7717063426971436,
128
+ "eval_loss": 1.5632531642913818,
129
+ "eval_runtime": 9.0494,
130
+ "eval_samples_per_second": 329.634,
131
+ "eval_steps_per_second": 20.664,
132
+ "step": 3357
133
+ },
134
+ {
135
+ "epoch": 9.383378016085791,
136
+ "grad_norm": 0.052517324686050415,
137
+ "learning_rate": 1.1911771845687053e-06,
138
+ "loss": 0.0055,
139
  "step": 3500
140
  }
141
  ],
142
  "logging_steps": 500,
143
+ "max_steps": 3730,
144
  "num_input_tokens_seen": 0,
145
  "num_train_epochs": 10,
146
  "save_steps": 500,
147
+ "total_flos": 1.781054971229646e+16,
148
+ "train_batch_size": 32,
149
  "trial_name": null,
150
  "trial_params": {
151
+ "learning_rate": 1.9317786514962047e-05,
152
+ "per_device_train_batch_size": 32
153
  }
154
  }
run-2/checkpoint-3500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c68cfe34dae325096495265befbc04045f52f535badd4f779e9c272d3d0b0f80
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ed49fe002187d5ffed6d819c5938a827ebb9152529606096a657c65a1141ceb
3
  size 5048