AlekseyKorshuk committed on
Commit
8e62039
1 Parent(s): c4bcd05

Model save

Browse files
README.md CHANGED
@@ -7,7 +7,7 @@ datasets:
7
  metrics:
8
  - accuracy
9
  model-index:
10
- - name: dalio-handwritten-io-1.3b
11
  results:
12
  - task:
13
  name: Causal Language Modeling
@@ -18,18 +18,18 @@ model-index:
18
  metrics:
19
  - name: Accuracy
20
  type: accuracy
21
- value: 0.055489496630994846
22
  ---
23
 
24
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
25
  should probably proofread and complete it, then remove this comment. -->
26
 
27
- # dalio-handwritten-io-1.3b
28
 
29
  This model is a fine-tuned version of [facebook/opt-1.3b](https://huggingface.co/facebook/opt-1.3b) on the AlekseyKorshuk/dalio-handwritten-io dataset.
30
  It achieves the following results on the evaluation set:
31
- - Loss: 2.6172
32
- - Accuracy: 0.0555
33
 
34
  ## Model description
35
 
@@ -49,24 +49,71 @@ More information needed
49
 
50
  The following hyperparameters were used during training:
51
  - learning_rate: 3e-05
52
- - train_batch_size: 8
53
- - eval_batch_size: 8
54
  - seed: 42
55
  - distributed_type: multi-GPU
56
  - num_devices: 8
57
- - total_train_batch_size: 64
58
- - total_eval_batch_size: 64
59
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
60
- - lr_scheduler_type: linear
61
- - num_epochs: 1.0
62
 
63
  ### Training results
64
 
65
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
66
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
67
- | 2.6917 | 0.33 | 1 | 2.6680 | 0.0530 |
68
- | 2.8435 | 0.67 | 2 | 2.6680 | 0.0530 |
69
- | 2.7502 | 1.0 | 3 | 2.6172 | 0.0555 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
 
72
  ### Framework versions
 
7
  metrics:
8
  - accuracy
9
  model-index:
10
+ - name: test-clm
11
  results:
12
  - task:
13
  name: Causal Language Modeling
 
18
  metrics:
19
  - name: Accuracy
20
  type: accuracy
21
+ value: 0.06414321574844761
22
  ---
23
 
24
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
25
  should probably proofread and complete it, then remove this comment. -->
26
 
27
+ # test-clm
28
 
29
  This model is a fine-tuned version of [facebook/opt-1.3b](https://huggingface.co/facebook/opt-1.3b) on the AlekseyKorshuk/dalio-handwritten-io dataset.
30
  It achieves the following results on the evaluation set:
31
+ - Loss: 2.5547
32
+ - Accuracy: 0.0641
33
 
34
  ## Model description
35
 
 
49
 
50
  The following hyperparameters were used during training:
51
  - learning_rate: 3e-05
52
+ - train_batch_size: 2
53
+ - eval_batch_size: 2
54
  - seed: 42
55
  - distributed_type: multi-GPU
56
  - num_devices: 8
57
+ - total_train_batch_size: 16
58
+ - total_eval_batch_size: 16
59
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
60
+ - lr_scheduler_type: cosine
61
+ - num_epochs: 5.0
62
 
63
  ### Training results
64
 
65
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
66
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
67
+ | 2.9219 | 0.1 | 1 | 2.6484 | 0.0529 |
68
+ | 2.6938 | 0.2 | 2 | 2.6484 | 0.0529 |
69
+ | 2.6365 | 0.3 | 3 | 2.5508 | 0.0560 |
70
+ | 2.5088 | 0.4 | 4 | 2.5332 | 0.0563 |
71
+ | 2.7297 | 0.5 | 5 | 2.5176 | 0.0567 |
72
+ | 2.9702 | 0.6 | 6 | 2.4941 | 0.0572 |
73
+ | 2.729 | 0.7 | 7 | 2.4883 | 0.0568 |
74
+ | 2.6172 | 0.8 | 8 | 2.4785 | 0.0578 |
75
+ | 2.6428 | 0.9 | 9 | 2.4590 | 0.0581 |
76
+ | 2.5681 | 1.0 | 10 | 2.4355 | 0.0590 |
77
+ | 2.1885 | 1.1 | 11 | 2.4238 | 0.0587 |
78
+ | 1.981 | 1.2 | 12 | 2.4219 | 0.0587 |
79
+ | 1.8673 | 1.3 | 13 | 2.4180 | 0.0591 |
80
+ | 1.7321 | 1.4 | 14 | 2.4180 | 0.0596 |
81
+ | 1.6355 | 1.5 | 15 | 2.4180 | 0.0601 |
82
+ | 1.7758 | 1.6 | 16 | 2.4199 | 0.0602 |
83
+ | 2.0162 | 1.7 | 17 | 2.4082 | 0.0605 |
84
+ | 1.8037 | 1.8 | 18 | 2.3965 | 0.0605 |
85
+ | 1.7204 | 1.9 | 19 | 2.375 | 0.0608 |
86
+ | 1.7831 | 2.0 | 20 | 2.3574 | 0.0609 |
87
+ | 1.299 | 2.1 | 21 | 2.3496 | 0.0616 |
88
+ | 1.4463 | 2.2 | 22 | 2.3496 | 0.0620 |
89
+ | 1.1733 | 2.3 | 23 | 2.3652 | 0.0617 |
90
+ | 1.1142 | 2.4 | 24 | 2.3887 | 0.0626 |
91
+ | 1.3107 | 2.5 | 25 | 2.4219 | 0.0627 |
92
+ | 1.011 | 2.6 | 26 | 2.4551 | 0.0622 |
93
+ | 1.3403 | 2.7 | 27 | 2.4766 | 0.0616 |
94
+ | 1.3108 | 2.8 | 28 | 2.4766 | 0.0616 |
95
+ | 1.0076 | 2.9 | 29 | 2.4609 | 0.0619 |
96
+ | 0.8656 | 3.0 | 30 | 2.4512 | 0.0624 |
97
+ | 0.6635 | 3.1 | 31 | 2.4512 | 0.0628 |
98
+ | 0.9996 | 3.2 | 32 | 2.4434 | 0.0635 |
99
+ | 0.9029 | 3.3 | 33 | 2.4473 | 0.0637 |
100
+ | 0.8329 | 3.4 | 34 | 2.4551 | 0.0637 |
101
+ | 0.8012 | 3.5 | 35 | 2.4648 | 0.0639 |
102
+ | 0.5814 | 3.6 | 36 | 2.4902 | 0.0640 |
103
+ | 1.0688 | 3.7 | 37 | 2.5098 | 0.0638 |
104
+ | 0.8688 | 3.8 | 38 | 2.5176 | 0.0635 |
105
+ | 0.7341 | 3.9 | 39 | 2.5195 | 0.0638 |
106
+ | 0.7102 | 4.0 | 40 | 2.5195 | 0.0640 |
107
+ | 0.7079 | 4.1 | 41 | 2.5195 | 0.0641 |
108
+ | 0.7656 | 4.2 | 42 | 2.5195 | 0.0643 |
109
+ | 0.6377 | 4.3 | 43 | 2.5273 | 0.0645 |
110
+ | 0.5898 | 4.4 | 44 | 2.5352 | 0.0641 |
111
+ | 0.5958 | 4.5 | 45 | 2.5430 | 0.0641 |
112
+ | 0.7048 | 4.6 | 46 | 2.5488 | 0.0640 |
113
+ | 0.5435 | 4.7 | 47 | 2.5527 | 0.0641 |
114
+ | 0.4769 | 4.8 | 48 | 2.5527 | 0.0640 |
115
+ | 0.6583 | 4.9 | 49 | 2.5547 | 0.0642 |
116
+ | 0.7168 | 5.0 | 50 | 2.5547 | 0.0641 |
117
 
118
 
119
  ### Framework versions
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.0,
3
- "train_loss": 2.7618001302083335,
4
- "train_runtime": 2178.2199,
5
  "train_samples": 156,
6
- "train_samples_per_second": 0.072,
7
- "train_steps_per_second": 0.001
8
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "train_loss": 1.4392181396484376,
4
+ "train_runtime": 183.8573,
5
  "train_samples": 156,
6
+ "train_samples_per_second": 4.242,
7
+ "train_steps_per_second": 0.272
8
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30e0b0639422ce0d828c4911f942e189e8f882b1fb6682952aac1b28fbdcf62c
3
  size 2631610617
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6206eb9213367136975c480c7b17e06148f1ce3be73c7b78ca2c2373ce011857
3
  size 2631610617
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.0,
3
- "train_loss": 2.7618001302083335,
4
- "train_runtime": 2178.2199,
5
  "train_samples": 156,
6
- "train_samples_per_second": 0.072,
7
- "train_steps_per_second": 0.001
8
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "train_loss": 1.4392181396484376,
4
+ "train_runtime": 183.8573,
5
  "train_samples": 156,
6
+ "train_samples_per_second": 4.242,
7
+ "train_steps_per_second": 0.272
8
  }
trainer_state.json CHANGED
@@ -1,70 +1,775 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
- "global_step": 3,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.33,
12
  "learning_rate": 3e-05,
13
- "loss": 2.6917,
14
  "step": 1
15
  },
16
  {
17
- "epoch": 0.33,
18
- "eval_accuracy": 0.05304531642224865,
19
- "eval_loss": 2.66796875,
20
- "eval_runtime": 1.9177,
21
- "eval_samples_per_second": 15.122,
22
- "eval_steps_per_second": 0.521,
23
  "step": 1
24
  },
25
  {
26
- "epoch": 0.67,
27
  "learning_rate": 3e-05,
28
- "loss": 2.8435,
29
  "step": 2
30
  },
31
  {
32
- "epoch": 0.67,
33
- "eval_accuracy": 0.05304531642224865,
34
- "eval_loss": 2.66796875,
35
- "eval_runtime": 1.7979,
36
- "eval_samples_per_second": 16.13,
37
- "eval_steps_per_second": 0.556,
38
  "step": 2
39
  },
40
  {
41
- "epoch": 1.0,
42
- "learning_rate": 1.9999999999999998e-05,
43
- "loss": 2.7502,
44
  "step": 3
45
  },
46
  {
47
- "epoch": 1.0,
48
- "eval_accuracy": 0.055489496630994846,
49
- "eval_loss": 2.6171875,
50
- "eval_runtime": 1.4088,
51
- "eval_samples_per_second": 20.586,
52
- "eval_steps_per_second": 0.71,
53
  "step": 3
54
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  {
56
  "epoch": 1.0,
57
- "step": 3,
58
- "total_flos": 303365357568.0,
59
- "train_loss": 2.7618001302083335,
60
- "train_runtime": 2178.2199,
61
- "train_samples_per_second": 0.072,
62
- "train_steps_per_second": 0.001
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  }
64
  ],
65
- "max_steps": 3,
66
- "num_train_epochs": 1,
67
- "total_flos": 303365357568.0,
68
  "trial_name": null,
69
  "trial_params": null
70
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
+ "global_step": 50,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.1,
12
  "learning_rate": 3e-05,
13
+ "loss": 2.9219,
14
  "step": 1
15
  },
16
  {
17
+ "epoch": 0.1,
18
+ "eval_accuracy": 0.05291319857312723,
19
+ "eval_loss": 2.6484375,
20
+ "eval_runtime": 1.5576,
21
+ "eval_samples_per_second": 18.618,
22
+ "eval_steps_per_second": 1.284,
23
  "step": 1
24
  },
25
  {
26
+ "epoch": 0.2,
27
  "learning_rate": 3e-05,
28
+ "loss": 2.6938,
29
  "step": 2
30
  },
31
  {
32
+ "epoch": 0.2,
33
+ "eval_accuracy": 0.05291319857312723,
34
+ "eval_loss": 2.6484375,
35
+ "eval_runtime": 1.275,
36
+ "eval_samples_per_second": 22.746,
37
+ "eval_steps_per_second": 1.569,
38
  "step": 2
39
  },
40
  {
41
+ "epoch": 0.3,
42
+ "learning_rate": 2.9970400926424075e-05,
43
+ "loss": 2.6365,
44
  "step": 3
45
  },
46
  {
47
+ "epoch": 0.3,
48
+ "eval_accuracy": 0.05601796802748051,
49
+ "eval_loss": 2.55078125,
50
+ "eval_runtime": 1.1095,
51
+ "eval_samples_per_second": 26.137,
52
+ "eval_steps_per_second": 1.803,
53
  "step": 3
54
  },
55
+ {
56
+ "epoch": 0.4,
57
+ "learning_rate": 2.988172051971717e-05,
58
+ "loss": 2.5088,
59
+ "step": 4
60
+ },
61
+ {
62
+ "epoch": 0.4,
63
+ "eval_accuracy": 0.056282203725723345,
64
+ "eval_loss": 2.533203125,
65
+ "eval_runtime": 1.0069,
66
+ "eval_samples_per_second": 28.8,
67
+ "eval_steps_per_second": 1.986,
68
+ "step": 4
69
+ },
70
+ {
71
+ "epoch": 0.5,
72
+ "learning_rate": 2.9734308760930333e-05,
73
+ "loss": 2.7297,
74
+ "step": 5
75
+ },
76
+ {
77
+ "epoch": 0.5,
78
+ "eval_accuracy": 0.05667855727308759,
79
+ "eval_loss": 2.517578125,
80
+ "eval_runtime": 1.0127,
81
+ "eval_samples_per_second": 28.637,
82
+ "eval_steps_per_second": 1.975,
83
+ "step": 5
84
+ },
85
+ {
86
+ "epoch": 0.6,
87
+ "learning_rate": 2.9528747416929467e-05,
88
+ "loss": 2.9702,
89
+ "step": 6
90
+ },
91
+ {
92
+ "epoch": 0.6,
93
+ "eval_accuracy": 0.05720702866957326,
94
+ "eval_loss": 2.494140625,
95
+ "eval_runtime": 1.2159,
96
+ "eval_samples_per_second": 23.851,
97
+ "eval_steps_per_second": 1.645,
98
+ "step": 6
99
+ },
100
+ {
101
+ "epoch": 0.7,
102
+ "learning_rate": 2.9265847744427305e-05,
103
+ "loss": 2.729,
104
+ "step": 7
105
+ },
106
+ {
107
+ "epoch": 0.7,
108
+ "eval_accuracy": 0.05681067512220901,
109
+ "eval_loss": 2.48828125,
110
+ "eval_runtime": 1.0262,
111
+ "eval_samples_per_second": 28.259,
112
+ "eval_steps_per_second": 1.949,
113
+ "step": 7
114
+ },
115
+ {
116
+ "epoch": 0.8,
117
+ "learning_rate": 2.894664728832377e-05,
118
+ "loss": 2.6172,
119
+ "step": 8
120
+ },
121
+ {
122
+ "epoch": 0.8,
123
+ "eval_accuracy": 0.05780155899061963,
124
+ "eval_loss": 2.478515625,
125
+ "eval_runtime": 1.3127,
126
+ "eval_samples_per_second": 22.092,
127
+ "eval_steps_per_second": 1.524,
128
+ "step": 8
129
+ },
130
+ {
131
+ "epoch": 0.9,
132
+ "learning_rate": 2.8572405786990293e-05,
133
+ "loss": 2.6428,
134
+ "step": 9
135
+ },
136
+ {
137
+ "epoch": 0.9,
138
+ "eval_accuracy": 0.058065794688862464,
139
+ "eval_loss": 2.458984375,
140
+ "eval_runtime": 1.0971,
141
+ "eval_samples_per_second": 26.432,
142
+ "eval_steps_per_second": 1.823,
143
+ "step": 9
144
+ },
145
+ {
146
+ "epoch": 1.0,
147
+ "learning_rate": 2.8144600200657953e-05,
148
+ "loss": 2.5681,
149
+ "step": 10
150
+ },
151
  {
152
  "epoch": 1.0,
153
+ "eval_accuracy": 0.05899061963271238,
154
+ "eval_loss": 2.435546875,
155
+ "eval_runtime": 1.3171,
156
+ "eval_samples_per_second": 22.018,
157
+ "eval_steps_per_second": 1.518,
158
+ "step": 10
159
+ },
160
+ {
161
+ "epoch": 1.1,
162
+ "learning_rate": 2.7664918882530227e-05,
163
+ "loss": 2.1885,
164
+ "step": 11
165
+ },
166
+ {
167
+ "epoch": 1.1,
168
+ "eval_accuracy": 0.05866032500990884,
169
+ "eval_loss": 2.423828125,
170
+ "eval_runtime": 1.2181,
171
+ "eval_samples_per_second": 23.807,
172
+ "eval_steps_per_second": 1.642,
173
+ "step": 11
174
+ },
175
+ {
176
+ "epoch": 1.2,
177
+ "learning_rate": 2.7135254915624213e-05,
178
+ "loss": 1.981,
179
+ "step": 12
180
+ },
181
+ {
182
+ "epoch": 1.2,
183
+ "eval_accuracy": 0.05872638393446954,
184
+ "eval_loss": 2.421875,
185
+ "eval_runtime": 1.6039,
186
+ "eval_samples_per_second": 18.081,
187
+ "eval_steps_per_second": 1.247,
188
+ "step": 12
189
+ },
190
+ {
191
+ "epoch": 1.3,
192
+ "learning_rate": 2.655769864163684e-05,
193
+ "loss": 1.8673,
194
+ "step": 13
195
+ },
196
+ {
197
+ "epoch": 1.3,
198
+ "eval_accuracy": 0.0591227374818338,
199
+ "eval_loss": 2.41796875,
200
+ "eval_runtime": 1.0123,
201
+ "eval_samples_per_second": 28.649,
202
+ "eval_steps_per_second": 1.976,
203
+ "step": 13
204
+ },
205
+ {
206
+ "epoch": 1.4,
207
+ "learning_rate": 2.5934529411321174e-05,
208
+ "loss": 1.7321,
209
+ "step": 14
210
+ },
211
+ {
212
+ "epoch": 1.4,
213
+ "eval_accuracy": 0.05958514995375875,
214
+ "eval_loss": 2.41796875,
215
+ "eval_runtime": 1.0152,
216
+ "eval_samples_per_second": 28.567,
217
+ "eval_steps_per_second": 1.97,
218
+ "step": 14
219
+ },
220
+ {
221
+ "epoch": 1.5,
222
+ "learning_rate": 2.5268206588930332e-05,
223
+ "loss": 1.6355,
224
+ "step": 15
225
+ },
226
+ {
227
+ "epoch": 1.5,
228
+ "eval_accuracy": 0.060113621350244416,
229
+ "eval_loss": 2.41796875,
230
+ "eval_runtime": 1.4219,
231
+ "eval_samples_per_second": 20.396,
232
+ "eval_steps_per_second": 1.407,
233
+ "step": 15
234
+ },
235
+ {
236
+ "epoch": 1.6,
237
+ "learning_rate": 2.4561359846230346e-05,
238
+ "loss": 1.7758,
239
+ "step": 16
240
+ },
241
+ {
242
+ "epoch": 1.6,
243
+ "eval_accuracy": 0.06017968027480513,
244
+ "eval_loss": 2.419921875,
245
+ "eval_runtime": 1.618,
246
+ "eval_samples_per_second": 17.923,
247
+ "eval_steps_per_second": 1.236,
248
+ "step": 16
249
+ },
250
+ {
251
+ "epoch": 1.7,
252
+ "learning_rate": 2.3816778784387097e-05,
253
+ "loss": 2.0162,
254
+ "step": 17
255
+ },
256
+ {
257
+ "epoch": 1.7,
258
+ "eval_accuracy": 0.06050997489760867,
259
+ "eval_loss": 2.408203125,
260
+ "eval_runtime": 1.1225,
261
+ "eval_samples_per_second": 25.835,
262
+ "eval_steps_per_second": 1.782,
263
+ "step": 17
264
+ },
265
+ {
266
+ "epoch": 1.8,
267
+ "learning_rate": 2.303740192468495e-05,
268
+ "loss": 1.8037,
269
+ "step": 18
270
+ },
271
+ {
272
+ "epoch": 1.8,
273
+ "eval_accuracy": 0.06050997489760867,
274
+ "eval_loss": 2.396484375,
275
+ "eval_runtime": 0.912,
276
+ "eval_samples_per_second": 31.8,
277
+ "eval_steps_per_second": 2.193,
278
+ "step": 18
279
+ },
280
+ {
281
+ "epoch": 1.9,
282
+ "learning_rate": 2.222630511152573e-05,
283
+ "loss": 1.7204,
284
+ "step": 19
285
+ },
286
+ {
287
+ "epoch": 1.9,
288
+ "eval_accuracy": 0.0607742105958515,
289
+ "eval_loss": 2.375,
290
+ "eval_runtime": 1.2202,
291
+ "eval_samples_per_second": 23.767,
292
+ "eval_steps_per_second": 1.639,
293
+ "step": 19
294
+ },
295
+ {
296
+ "epoch": 2.0,
297
+ "learning_rate": 2.138668937347609e-05,
298
+ "loss": 1.7831,
299
+ "step": 20
300
+ },
301
+ {
302
+ "epoch": 2.0,
303
+ "eval_accuracy": 0.060906328444972915,
304
+ "eval_loss": 2.357421875,
305
+ "eval_runtime": 1.0082,
306
+ "eval_samples_per_second": 28.765,
307
+ "eval_steps_per_second": 1.984,
308
+ "step": 20
309
+ },
310
+ {
311
+ "epoch": 2.1,
312
+ "learning_rate": 2.052186829027017e-05,
313
+ "loss": 1.299,
314
+ "step": 21
315
+ },
316
+ {
317
+ "epoch": 2.1,
318
+ "eval_accuracy": 0.06163297661514071,
319
+ "eval_loss": 2.349609375,
320
+ "eval_runtime": 1.2114,
321
+ "eval_samples_per_second": 23.94,
322
+ "eval_steps_per_second": 1.651,
323
+ "step": 21
324
+ },
325
+ {
326
+ "epoch": 2.2,
327
+ "learning_rate": 1.963525491562421e-05,
328
+ "loss": 1.4463,
329
+ "step": 22
330
+ },
331
+ {
332
+ "epoch": 2.2,
333
+ "eval_accuracy": 0.06196327123794425,
334
+ "eval_loss": 2.349609375,
335
+ "eval_runtime": 1.5179,
336
+ "eval_samples_per_second": 19.105,
337
+ "eval_steps_per_second": 1.318,
338
+ "step": 22
339
+ },
340
+ {
341
+ "epoch": 2.3,
342
+ "learning_rate": 1.8730348307472828e-05,
343
+ "loss": 1.1733,
344
+ "step": 23
345
+ },
346
+ {
347
+ "epoch": 2.3,
348
+ "eval_accuracy": 0.061699035539701415,
349
+ "eval_loss": 2.365234375,
350
+ "eval_runtime": 1.3077,
351
+ "eval_samples_per_second": 22.176,
352
+ "eval_steps_per_second": 1.529,
353
+ "step": 23
354
+ },
355
+ {
356
+ "epoch": 2.4,
357
+ "learning_rate": 1.781071971878587e-05,
358
+ "loss": 1.1142,
359
+ "step": 24
360
+ },
361
+ {
362
+ "epoch": 2.4,
363
+ "eval_accuracy": 0.06255780155899061,
364
+ "eval_loss": 2.388671875,
365
+ "eval_runtime": 1.1042,
366
+ "eval_samples_per_second": 26.264,
367
+ "eval_steps_per_second": 1.811,
368
+ "step": 24
369
+ },
370
+ {
371
+ "epoch": 2.5,
372
+ "learning_rate": 1.6879998503464565e-05,
373
+ "loss": 1.3107,
374
+ "step": 25
375
+ },
376
+ {
377
+ "epoch": 2.5,
378
+ "eval_accuracy": 0.06268991940811204,
379
+ "eval_loss": 2.421875,
380
+ "eval_runtime": 0.9167,
381
+ "eval_samples_per_second": 31.634,
382
+ "eval_steps_per_second": 2.182,
383
+ "step": 25
384
+ },
385
+ {
386
+ "epoch": 2.6,
387
+ "learning_rate": 1.5941857792939702e-05,
388
+ "loss": 1.011,
389
+ "step": 26
390
+ },
391
+ {
392
+ "epoch": 2.6,
393
+ "eval_accuracy": 0.06216144801162637,
394
+ "eval_loss": 2.455078125,
395
+ "eval_runtime": 0.9207,
396
+ "eval_samples_per_second": 31.497,
397
+ "eval_steps_per_second": 2.172,
398
+ "step": 26
399
+ },
400
+ {
401
+ "epoch": 2.7,
402
+ "learning_rate": 1.5e-05,
403
+ "loss": 1.3403,
404
+ "step": 27
405
+ },
406
+ {
407
+ "epoch": 2.7,
408
+ "eval_accuracy": 0.061566917690579995,
409
+ "eval_loss": 2.4765625,
410
+ "eval_runtime": 1.5266,
411
+ "eval_samples_per_second": 18.997,
412
+ "eval_steps_per_second": 1.31,
413
+ "step": 27
414
+ },
415
+ {
416
+ "epoch": 2.8,
417
+ "learning_rate": 1.5e-05,
418
+ "loss": 1.3108,
419
+ "step": 28
420
+ },
421
+ {
422
+ "epoch": 2.8,
423
+ "eval_accuracy": 0.061566917690579995,
424
+ "eval_loss": 2.4765625,
425
+ "eval_runtime": 1.252,
426
+ "eval_samples_per_second": 23.163,
427
+ "eval_steps_per_second": 1.597,
428
+ "step": 28
429
+ },
430
+ {
431
+ "epoch": 2.9,
432
+ "learning_rate": 1.40581422070603e-05,
433
+ "loss": 1.0076,
434
+ "step": 29
435
+ },
436
+ {
437
+ "epoch": 2.9,
438
+ "eval_accuracy": 0.06189721231338354,
439
+ "eval_loss": 2.4609375,
440
+ "eval_runtime": 0.9112,
441
+ "eval_samples_per_second": 31.825,
442
+ "eval_steps_per_second": 2.195,
443
+ "step": 29
444
+ },
445
+ {
446
+ "epoch": 3.0,
447
+ "learning_rate": 1.3120001496535434e-05,
448
+ "loss": 0.8656,
449
+ "step": 30
450
+ },
451
+ {
452
+ "epoch": 3.0,
453
+ "eval_accuracy": 0.062359624785308494,
454
+ "eval_loss": 2.451171875,
455
+ "eval_runtime": 1.5156,
456
+ "eval_samples_per_second": 19.134,
457
+ "eval_steps_per_second": 1.32,
458
+ "step": 30
459
+ },
460
+ {
461
+ "epoch": 3.1,
462
+ "learning_rate": 1.2189280281214128e-05,
463
+ "loss": 0.6635,
464
+ "step": 31
465
+ },
466
+ {
467
+ "epoch": 3.1,
468
+ "eval_accuracy": 0.06282203725723345,
469
+ "eval_loss": 2.451171875,
470
+ "eval_runtime": 1.313,
471
+ "eval_samples_per_second": 22.087,
472
+ "eval_steps_per_second": 1.523,
473
+ "step": 31
474
+ },
475
+ {
476
+ "epoch": 3.2,
477
+ "learning_rate": 1.1269651692527181e-05,
478
+ "loss": 0.9996,
479
+ "step": 32
480
+ },
481
+ {
482
+ "epoch": 3.2,
483
+ "eval_accuracy": 0.06348262650284053,
484
+ "eval_loss": 2.443359375,
485
+ "eval_runtime": 1.01,
486
+ "eval_samples_per_second": 28.711,
487
+ "eval_steps_per_second": 1.98,
488
+ "step": 32
489
+ },
490
+ {
491
+ "epoch": 3.3,
492
+ "learning_rate": 1.036474508437579e-05,
493
+ "loss": 0.9029,
494
+ "step": 33
495
+ },
496
+ {
497
+ "epoch": 3.3,
498
+ "eval_accuracy": 0.06368080327652266,
499
+ "eval_loss": 2.447265625,
500
+ "eval_runtime": 1.4214,
501
+ "eval_samples_per_second": 20.402,
502
+ "eval_steps_per_second": 1.407,
503
+ "step": 33
504
+ },
505
+ {
506
+ "epoch": 3.4,
507
+ "learning_rate": 9.478131709729831e-06,
508
+ "loss": 0.8329,
509
+ "step": 34
510
+ },
511
+ {
512
+ "epoch": 3.4,
513
+ "eval_accuracy": 0.06374686220108336,
514
+ "eval_loss": 2.455078125,
515
+ "eval_runtime": 0.9136,
516
+ "eval_samples_per_second": 31.742,
517
+ "eval_steps_per_second": 2.189,
518
+ "step": 34
519
+ },
520
+ {
521
+ "epoch": 3.5,
522
+ "learning_rate": 8.61331062652391e-06,
523
+ "loss": 0.8012,
524
+ "step": 35
525
+ },
526
+ {
527
+ "epoch": 3.5,
528
+ "eval_accuracy": 0.06387898005020479,
529
+ "eval_loss": 2.46484375,
530
+ "eval_runtime": 1.6062,
531
+ "eval_samples_per_second": 18.055,
532
+ "eval_steps_per_second": 1.245,
533
+ "step": 35
534
+ },
535
+ {
536
+ "epoch": 3.6,
537
+ "learning_rate": 7.773694888474268e-06,
538
+ "loss": 0.5814,
539
+ "step": 36
540
+ },
541
+ {
542
+ "epoch": 3.6,
543
+ "eval_accuracy": 0.0640110978993262,
544
+ "eval_loss": 2.490234375,
545
+ "eval_runtime": 1.6209,
546
+ "eval_samples_per_second": 17.891,
547
+ "eval_steps_per_second": 1.234,
548
+ "step": 36
549
+ },
550
+ {
551
+ "epoch": 3.7,
552
+ "learning_rate": 6.962598075315047e-06,
553
+ "loss": 1.0688,
554
+ "step": 37
555
+ },
556
+ {
557
+ "epoch": 3.7,
558
+ "eval_accuracy": 0.06381292112564407,
559
+ "eval_loss": 2.509765625,
560
+ "eval_runtime": 1.2235,
561
+ "eval_samples_per_second": 23.703,
562
+ "eval_steps_per_second": 1.635,
563
+ "step": 37
564
+ },
565
+ {
566
+ "epoch": 3.8,
567
+ "learning_rate": 6.1832212156129045e-06,
568
+ "loss": 0.8688,
569
+ "step": 38
570
+ },
571
+ {
572
+ "epoch": 3.8,
573
+ "eval_accuracy": 0.06348262650284053,
574
+ "eval_loss": 2.517578125,
575
+ "eval_runtime": 1.5088,
576
+ "eval_samples_per_second": 19.221,
577
+ "eval_steps_per_second": 1.326,
578
+ "step": 38
579
+ },
580
+ {
581
+ "epoch": 3.9,
582
+ "learning_rate": 5.438640153769654e-06,
583
+ "loss": 0.7341,
584
+ "step": 39
585
+ },
586
+ {
587
+ "epoch": 3.9,
588
+ "eval_accuracy": 0.06381292112564407,
589
+ "eval_loss": 2.51953125,
590
+ "eval_runtime": 1.0209,
591
+ "eval_samples_per_second": 28.406,
592
+ "eval_steps_per_second": 1.959,
593
+ "step": 39
594
+ },
595
+ {
596
+ "epoch": 4.0,
597
+ "learning_rate": 4.731793411069669e-06,
598
+ "loss": 0.7102,
599
+ "step": 40
600
+ },
601
+ {
602
+ "epoch": 4.0,
603
+ "eval_accuracy": 0.0640110978993262,
604
+ "eval_loss": 2.51953125,
605
+ "eval_runtime": 1.3112,
606
+ "eval_samples_per_second": 22.116,
607
+ "eval_steps_per_second": 1.525,
608
+ "step": 40
609
+ },
610
+ {
611
+ "epoch": 4.1,
612
+ "learning_rate": 4.06547058867883e-06,
613
+ "loss": 0.7079,
614
+ "step": 41
615
+ },
616
+ {
617
+ "epoch": 4.1,
618
+ "eval_accuracy": 0.06414321574844761,
619
+ "eval_loss": 2.51953125,
620
+ "eval_runtime": 1.0046,
621
+ "eval_samples_per_second": 28.868,
622
+ "eval_steps_per_second": 1.991,
623
+ "step": 41
624
+ },
625
+ {
626
+ "epoch": 4.2,
627
+ "learning_rate": 3.442301358363163e-06,
628
+ "loss": 0.7656,
629
+ "step": 42
630
+ },
631
+ {
632
+ "epoch": 4.2,
633
+ "eval_accuracy": 0.06427533359756903,
634
+ "eval_loss": 2.51953125,
635
+ "eval_runtime": 1.3174,
636
+ "eval_samples_per_second": 22.012,
637
+ "eval_steps_per_second": 1.518,
638
+ "step": 42
639
+ },
640
+ {
641
+ "epoch": 4.3,
642
+ "learning_rate": 2.86474508437579e-06,
643
+ "loss": 0.6377,
644
+ "step": 43
645
+ },
646
+ {
647
+ "epoch": 4.3,
648
+ "eval_accuracy": 0.06447351037125115,
649
+ "eval_loss": 2.52734375,
650
+ "eval_runtime": 1.1043,
651
+ "eval_samples_per_second": 26.261,
652
+ "eval_steps_per_second": 1.811,
653
+ "step": 43
654
+ },
655
+ {
656
+ "epoch": 4.4,
657
+ "learning_rate": 2.335081117469777e-06,
658
+ "loss": 0.5898,
659
+ "step": 44
660
+ },
661
+ {
662
+ "epoch": 4.4,
663
+ "eval_accuracy": 0.06414321574844761,
664
+ "eval_loss": 2.53515625,
665
+ "eval_runtime": 1.0226,
666
+ "eval_samples_per_second": 28.359,
667
+ "eval_steps_per_second": 1.956,
668
+ "step": 44
669
+ },
670
+ {
671
+ "epoch": 4.5,
672
+ "learning_rate": 1.8553997993420495e-06,
673
+ "loss": 0.5958,
674
+ "step": 45
675
+ },
676
+ {
677
+ "epoch": 4.5,
678
+ "eval_accuracy": 0.06407715682388691,
679
+ "eval_loss": 2.54296875,
680
+ "eval_runtime": 0.91,
681
+ "eval_samples_per_second": 31.867,
682
+ "eval_steps_per_second": 2.198,
683
+ "step": 45
684
+ },
685
+ {
686
+ "epoch": 4.6,
687
+ "learning_rate": 1.4275942130097097e-06,
688
+ "loss": 0.7048,
689
+ "step": 46
690
+ },
691
+ {
692
+ "epoch": 4.6,
693
+ "eval_accuracy": 0.0640110978993262,
694
+ "eval_loss": 2.548828125,
695
+ "eval_runtime": 1.2089,
696
+ "eval_samples_per_second": 23.989,
697
+ "eval_steps_per_second": 1.654,
698
+ "step": 46
699
+ },
700
+ {
701
+ "epoch": 4.7,
702
+ "learning_rate": 1.0533527116762298e-06,
703
+ "loss": 0.5435,
704
+ "step": 47
705
+ },
706
+ {
707
+ "epoch": 4.7,
708
+ "eval_accuracy": 0.06414321574844761,
709
+ "eval_loss": 2.552734375,
710
+ "eval_runtime": 1.3311,
711
+ "eval_samples_per_second": 21.786,
712
+ "eval_steps_per_second": 1.502,
713
+ "step": 47
714
+ },
715
+ {
716
+ "epoch": 4.8,
717
+ "learning_rate": 7.341522555726971e-07,
718
+ "loss": 0.4769,
719
+ "step": 48
720
+ },
721
+ {
722
+ "epoch": 4.8,
723
+ "eval_accuracy": 0.0640110978993262,
724
+ "eval_loss": 2.552734375,
725
+ "eval_runtime": 1.0089,
726
+ "eval_samples_per_second": 28.743,
727
+ "eval_steps_per_second": 1.982,
728
+ "step": 48
729
+ },
730
+ {
731
+ "epoch": 4.9,
732
+ "learning_rate": 4.7125258307053385e-07,
733
+ "loss": 0.6583,
734
+ "step": 49
735
+ },
736
+ {
737
+ "epoch": 4.9,
738
+ "eval_accuracy": 0.06420927467300833,
739
+ "eval_loss": 2.5546875,
740
+ "eval_runtime": 1.0099,
741
+ "eval_samples_per_second": 28.715,
742
+ "eval_steps_per_second": 1.98,
743
+ "step": 49
744
+ },
745
+ {
746
+ "epoch": 5.0,
747
+ "learning_rate": 2.6569123906967083e-07,
748
+ "loss": 0.7168,
749
+ "step": 50
750
+ },
751
+ {
752
+ "epoch": 5.0,
753
+ "eval_accuracy": 0.06414321574844761,
754
+ "eval_loss": 2.5546875,
755
+ "eval_runtime": 1.5134,
756
+ "eval_samples_per_second": 19.163,
757
+ "eval_steps_per_second": 1.322,
758
+ "step": 50
759
+ },
760
+ {
761
+ "epoch": 5.0,
762
+ "step": 50,
763
+ "total_flos": 2477483753472.0,
764
+ "train_loss": 1.4392181396484376,
765
+ "train_runtime": 183.8573,
766
+ "train_samples_per_second": 4.242,
767
+ "train_steps_per_second": 0.272
768
  }
769
  ],
770
+ "max_steps": 50,
771
+ "num_train_epochs": 5,
772
+ "total_flos": 2477483753472.0,
773
  "trial_name": null,
774
  "trial_params": null
775
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b9ff01ac6de6e77d1f6de3f89d7538f083c7fc7f4f1608fcdedfeedea383dd46
3
  size 4591
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9567bff4e6027ed7913ec27f9c92249df3e60ca2394ff65e8c89def0feb5eac1
3
  size 4591