sanguyen commited on
Commit
cbbce8b
1 Parent(s): dc593cf

End of training

Browse files
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 5.0,
3
- "eval_accuracy": 0.7108753315649867,
4
- "eval_loss": 1.1812396049499512,
5
- "eval_runtime": 5.9051,
6
  "eval_samples": 377,
7
- "eval_samples_per_second": 63.844,
8
- "eval_steps_per_second": 8.129,
9
- "total_flos": 2285918447616000.0,
10
- "train_loss": 1.2526328404744467,
11
- "train_runtime": 557.6223,
12
- "train_samples": 3450,
13
- "train_samples_per_second": 30.935,
14
- "train_steps_per_second": 0.968
15
  }
 
1
  {
2
+ "epoch": 9.991589571068124,
3
+ "eval_accuracy": 0.7294429708222812,
4
+ "eval_loss": 0.7049754858016968,
5
+ "eval_runtime": 5.8794,
6
  "eval_samples": 377,
7
+ "eval_samples_per_second": 64.122,
8
+ "eval_steps_per_second": 8.164,
9
+ "total_flos": 2.518883353755648e+16,
10
+ "train_loss": 1.114941159643308,
11
+ "train_runtime": 5818.2377,
12
+ "train_samples": 19024,
13
+ "train_samples_per_second": 32.697,
14
+ "train_steps_per_second": 1.021
15
  }
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 5.0,
3
- "eval_accuracy": 0.7108753315649867,
4
- "eval_loss": 1.1812396049499512,
5
- "eval_runtime": 5.9051,
6
  "eval_samples": 377,
7
- "eval_samples_per_second": 63.844,
8
- "eval_steps_per_second": 8.129
9
  }
 
1
  {
2
+ "epoch": 9.991589571068124,
3
+ "eval_accuracy": 0.7294429708222812,
4
+ "eval_loss": 0.7049754858016968,
5
+ "eval_runtime": 5.8794,
6
  "eval_samples": 377,
7
+ "eval_samples_per_second": 64.122,
8
+ "eval_steps_per_second": 8.164
9
  }
predict_results.txt CHANGED
The diff for this file is too large to render. See raw diff
 
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 5.0,
3
- "total_flos": 2285918447616000.0,
4
- "train_loss": 1.2526328404744467,
5
- "train_runtime": 557.6223,
6
- "train_samples": 3450,
7
- "train_samples_per_second": 30.935,
8
- "train_steps_per_second": 0.968
9
  }
 
1
  {
2
+ "epoch": 9.991589571068124,
3
+ "total_flos": 2.518883353755648e+16,
4
+ "train_loss": 1.114941159643308,
5
+ "train_runtime": 5818.2377,
6
+ "train_samples": 19024,
7
+ "train_samples_per_second": 32.697,
8
+ "train_steps_per_second": 1.021
9
  }
trainer_state.json CHANGED
@@ -1,79 +1,194 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.0,
5
  "eval_steps": 500,
6
- "global_step": 540,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "eval_accuracy": 0.6472148541114059,
14
- "eval_loss": 1.6224454641342163,
15
- "eval_runtime": 5.881,
16
- "eval_samples_per_second": 64.105,
17
- "eval_steps_per_second": 8.162,
18
- "step": 108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  },
20
  {
21
  "epoch": 2.0,
22
- "eval_accuracy": 0.6604774535809018,
23
- "eval_loss": 1.332472801208496,
24
- "eval_runtime": 5.9014,
25
- "eval_samples_per_second": 63.884,
26
- "eval_steps_per_second": 8.134,
27
- "step": 216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  },
29
  {
30
- "epoch": 3.0,
31
- "eval_accuracy": 0.6923076923076923,
32
- "eval_loss": 1.231541633605957,
33
- "eval_runtime": 5.9048,
34
- "eval_samples_per_second": 63.846,
35
- "eval_steps_per_second": 8.129,
36
- "step": 324
37
  },
38
  {
39
  "epoch": 4.0,
40
- "eval_accuracy": 0.7108753315649867,
41
- "eval_loss": 1.1934351921081543,
42
- "eval_runtime": 5.899,
43
- "eval_samples_per_second": 63.909,
44
- "eval_steps_per_second": 8.137,
45
- "step": 432
46
- },
47
- {
48
- "epoch": 4.62962962962963,
49
- "grad_norm": 7.095102787017822,
50
- "learning_rate": 2.222222222222222e-06,
51
- "loss": 1.2712,
52
- "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  },
54
  {
55
- "epoch": 5.0,
56
- "eval_accuracy": 0.7108753315649867,
57
- "eval_loss": 1.1812396049499512,
58
- "eval_runtime": 5.917,
59
- "eval_samples_per_second": 63.715,
60
- "eval_steps_per_second": 8.112,
61
- "step": 540
62
  },
63
  {
64
- "epoch": 5.0,
65
- "step": 540,
66
- "total_flos": 2285918447616000.0,
67
- "train_loss": 1.2526328404744467,
68
- "train_runtime": 557.6223,
69
- "train_samples_per_second": 30.935,
70
- "train_steps_per_second": 0.968
71
  }
72
  ],
73
  "logging_steps": 500,
74
- "max_steps": 540,
75
  "num_input_tokens_seen": 0,
76
- "num_train_epochs": 5,
77
  "save_steps": 500,
78
  "stateful_callbacks": {
79
  "TrainerControl": {
@@ -81,13 +196,13 @@
81
  "should_epoch_stop": false,
82
  "should_evaluate": false,
83
  "should_log": false,
84
- "should_save": true,
85
- "should_training_stop": true
86
  },
87
  "attributes": {}
88
  }
89
  },
90
- "total_flos": 2285918447616000.0,
91
  "train_batch_size": 16,
92
  "trial_name": null,
93
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 9.991589571068124,
5
  "eval_steps": 500,
6
+ "global_step": 5940,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.8410428931875525,
13
+ "grad_norm": 3.157212972640991,
14
+ "learning_rate": 9.15824915824916e-05,
15
+ "loss": 1.7278,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.9991589571068125,
20
+ "eval_accuracy": 0.5517241379310345,
21
+ "eval_loss": 1.2574303150177002,
22
+ "eval_runtime": 5.9277,
23
+ "eval_samples_per_second": 63.6,
24
+ "eval_steps_per_second": 8.098,
25
+ "step": 594
26
+ },
27
+ {
28
+ "epoch": 1.682085786375105,
29
+ "grad_norm": 1.998612880706787,
30
+ "learning_rate": 8.316498316498317e-05,
31
+ "loss": 1.3236,
32
+ "step": 1000
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.6392572944297082,
37
+ "eval_loss": 0.9894065856933594,
38
+ "eval_runtime": 5.8845,
39
+ "eval_samples_per_second": 64.067,
40
+ "eval_steps_per_second": 8.157,
41
+ "step": 1189
42
+ },
43
+ {
44
+ "epoch": 2.5231286795626575,
45
+ "grad_norm": 1.9598954916000366,
46
+ "learning_rate": 7.474747474747475e-05,
47
+ "loss": 1.1827,
48
+ "step": 1500
49
+ },
50
+ {
51
+ "epoch": 2.9991589571068125,
52
+ "eval_accuracy": 0.6896551724137931,
53
+ "eval_loss": 0.8649284839630127,
54
+ "eval_runtime": 5.8972,
55
+ "eval_samples_per_second": 63.929,
56
+ "eval_steps_per_second": 8.139,
57
+ "step": 1783
58
  },
59
  {
60
+ "epoch": 3.36417157275021,
61
+ "grad_norm": 2.8274946212768555,
62
+ "learning_rate": 6.632996632996633e-05,
63
+ "loss": 1.1147,
64
+ "step": 2000
 
 
65
  },
66
  {
67
  "epoch": 4.0,
68
+ "eval_accuracy": 0.6896551724137931,
69
+ "eval_loss": 0.859340488910675,
70
+ "eval_runtime": 5.894,
71
+ "eval_samples_per_second": 63.964,
72
+ "eval_steps_per_second": 8.144,
73
+ "step": 2378
74
+ },
75
+ {
76
+ "epoch": 4.205214465937763,
77
+ "grad_norm": 2.467745542526245,
78
+ "learning_rate": 5.7912457912457915e-05,
79
+ "loss": 1.0552,
80
+ "step": 2500
81
+ },
82
+ {
83
+ "epoch": 4.999158957106813,
84
+ "eval_accuracy": 0.6976127320954907,
85
+ "eval_loss": 0.774470865726471,
86
+ "eval_runtime": 5.9015,
87
+ "eval_samples_per_second": 63.882,
88
+ "eval_steps_per_second": 8.134,
89
+ "step": 2972
90
+ },
91
+ {
92
+ "epoch": 5.046257359125315,
93
+ "grad_norm": 3.3015456199645996,
94
+ "learning_rate": 4.94949494949495e-05,
95
+ "loss": 1.0474,
96
+ "step": 3000
97
+ },
98
+ {
99
+ "epoch": 5.887300252312868,
100
+ "grad_norm": 2.705965995788574,
101
+ "learning_rate": 4.1077441077441085e-05,
102
+ "loss": 1.0143,
103
+ "step": 3500
104
+ },
105
+ {
106
+ "epoch": 6.0,
107
+ "eval_accuracy": 0.713527851458886,
108
+ "eval_loss": 0.7180978655815125,
109
+ "eval_runtime": 5.9009,
110
+ "eval_samples_per_second": 63.889,
111
+ "eval_steps_per_second": 8.134,
112
+ "step": 3567
113
+ },
114
+ {
115
+ "epoch": 6.72834314550042,
116
+ "grad_norm": 1.9816116094589233,
117
+ "learning_rate": 3.265993265993266e-05,
118
+ "loss": 0.9872,
119
+ "step": 4000
120
+ },
121
+ {
122
+ "epoch": 6.999158957106813,
123
+ "eval_accuracy": 0.7506631299734748,
124
+ "eval_loss": 0.7037277221679688,
125
+ "eval_runtime": 5.9082,
126
+ "eval_samples_per_second": 63.81,
127
+ "eval_steps_per_second": 8.124,
128
+ "step": 4161
129
+ },
130
+ {
131
+ "epoch": 7.569386038687973,
132
+ "grad_norm": 2.6848223209381104,
133
+ "learning_rate": 2.4242424242424244e-05,
134
+ "loss": 1.002,
135
+ "step": 4500
136
+ },
137
+ {
138
+ "epoch": 8.0,
139
+ "eval_accuracy": 0.7347480106100795,
140
+ "eval_loss": 0.7111003398895264,
141
+ "eval_runtime": 5.9138,
142
+ "eval_samples_per_second": 63.749,
143
+ "eval_steps_per_second": 8.117,
144
+ "step": 4756
145
+ },
146
+ {
147
+ "epoch": 8.410428931875526,
148
+ "grad_norm": 2.5223660469055176,
149
+ "learning_rate": 1.5824915824915825e-05,
150
+ "loss": 0.9816,
151
+ "step": 5000
152
+ },
153
+ {
154
+ "epoch": 8.999158957106813,
155
+ "eval_accuracy": 0.7241379310344828,
156
+ "eval_loss": 0.6930974125862122,
157
+ "eval_runtime": 5.8939,
158
+ "eval_samples_per_second": 63.965,
159
+ "eval_steps_per_second": 8.144,
160
+ "step": 5350
161
+ },
162
+ {
163
+ "epoch": 9.251471825063078,
164
+ "grad_norm": 2.612992286682129,
165
+ "learning_rate": 7.4074074074074075e-06,
166
+ "loss": 0.9602,
167
+ "step": 5500
168
  },
169
  {
170
+ "epoch": 9.991589571068124,
171
+ "eval_accuracy": 0.7294429708222812,
172
+ "eval_loss": 0.7049754858016968,
173
+ "eval_runtime": 5.8891,
174
+ "eval_samples_per_second": 64.017,
175
+ "eval_steps_per_second": 8.151,
176
+ "step": 5940
177
  },
178
  {
179
+ "epoch": 9.991589571068124,
180
+ "step": 5940,
181
+ "total_flos": 2.518883353755648e+16,
182
+ "train_loss": 1.114941159643308,
183
+ "train_runtime": 5818.2377,
184
+ "train_samples_per_second": 32.697,
185
+ "train_steps_per_second": 1.021
186
  }
187
  ],
188
  "logging_steps": 500,
189
+ "max_steps": 5940,
190
  "num_input_tokens_seen": 0,
191
+ "num_train_epochs": 10,
192
  "save_steps": 500,
193
  "stateful_callbacks": {
194
  "TrainerControl": {
 
196
  "should_epoch_stop": false,
197
  "should_evaluate": false,
198
  "should_log": false,
199
+ "should_save": false,
200
+ "should_training_stop": false
201
  },
202
  "attributes": {}
203
  }
204
  },
205
+ "total_flos": 2.518883353755648e+16,
206
  "train_batch_size": 16,
207
  "trial_name": null,
208
  "trial_params": null