avuhong committed on
Commit 63b1277
1 Parent(s): 7d826ad

Upload 16 files

README.md CHANGED
@@ -5,19 +5,19 @@ tags:
5
  metrics:
6
  - accuracy
7
  model-index:
8
- - name: output_v2
9
  results: []
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
  should probably proofread and complete it, then remove this comment. -->
14
 
15
- # output_v2
16
 
17
  This model is a fine-tuned version of [avuhong/ParvoGPT2](https://huggingface.co/avuhong/ParvoGPT2) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 0.9835
20
- - Accuracy: 0.8502
21
 
22
  ## Model description
23
 
@@ -36,7 +36,7 @@ More information needed
36
  ### Training hyperparameters
37
 
38
  The following hyperparameters were used during training:
39
- - learning_rate: 1e-05
40
  - train_batch_size: 1
41
  - eval_batch_size: 1
42
  - seed: 42
@@ -47,29 +47,45 @@ The following hyperparameters were used during training:
47
  - total_eval_batch_size: 2
48
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
49
  - lr_scheduler_type: linear
50
- - num_epochs: 16.0
51
  - mixed_precision_training: Native AMP
52
 
53
  ### Training results
54
 
55
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
56
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
57
- | No log | 1.0 | 220 | 1.5038 | 0.7795 |
58
- | No log | 2.0 | 440 | 1.3765 | 0.7965 |
59
- | 1.5308 | 3.0 | 660 | 1.2920 | 0.8075 |
60
- | 1.5308 | 4.0 | 880 | 1.2308 | 0.8156 |
61
- | 1.2695 | 5.0 | 1100 | 1.1788 | 0.8226 |
62
- | 1.2695 | 6.0 | 1320 | 1.1363 | 0.8279 |
63
- | 1.1353 | 7.0 | 1540 | 1.1027 | 0.8324 |
64
- | 1.1353 | 8.0 | 1760 | 1.0726 | 0.8373 |
65
- | 1.1353 | 9.0 | 1980 | 1.0481 | 0.8405 |
66
- | 1.0713 | 10.0 | 2200 | 1.0299 | 0.8433 |
67
- | 1.0713 | 11.0 | 2420 | 1.0174 | 0.8455 |
68
- | 1.0233 | 12.0 | 2640 | 1.0028 | 0.8477 |
69
- | 1.0233 | 13.0 | 2860 | 0.9939 | 0.8488 |
70
- | 0.9811 | 14.0 | 3080 | 0.9889 | 0.8497 |
71
- | 0.9811 | 15.0 | 3300 | 0.9854 | 0.8500 |
72
- | 0.9696 | 16.0 | 3520 | 0.9835 | 0.8502 |
73
 
74
 
75
  ### Framework versions
 
5
  metrics:
6
  - accuracy
7
  model-index:
8
+ - name: output_v3
9
  results: []
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
  should probably proofread and complete it, then remove this comment. -->
14
 
15
+ # output_v3
16
 
17
  This model is a fine-tuned version of [avuhong/ParvoGPT2](https://huggingface.co/avuhong/ParvoGPT2) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 0.4775
20
+ - Accuracy: 0.9290
21
 
22
  ## Model description
23
 
 
36
  ### Training hyperparameters
37
 
38
  The following hyperparameters were used during training:
39
+ - learning_rate: 5e-05
40
  - train_batch_size: 1
41
  - eval_batch_size: 1
42
  - seed: 42
 
47
  - total_eval_batch_size: 2
48
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
49
  - lr_scheduler_type: linear
50
+ - num_epochs: 32.0
51
  - mixed_precision_training: Native AMP
52
 
53
  ### Training results
54
 
55
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
56
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
57
+ | No log | 1.0 | 220 | 1.1623 | 0.8225 |
58
+ | No log | 2.0 | 440 | 0.9566 | 0.8539 |
59
+ | 1.1942 | 3.0 | 660 | 0.8456 | 0.8709 |
60
+ | 1.1942 | 4.0 | 880 | 0.7719 | 0.8801 |
61
+ | 0.7805 | 5.0 | 1100 | 0.7224 | 0.8872 |
62
+ | 0.7805 | 6.0 | 1320 | 0.6895 | 0.8928 |
63
+ | 0.6257 | 7.0 | 1540 | 0.6574 | 0.8972 |
64
+ | 0.6257 | 8.0 | 1760 | 0.6289 | 0.9014 |
65
+ | 0.6257 | 9.0 | 1980 | 0.6054 | 0.9045 |
66
+ | 0.5385 | 10.0 | 2200 | 0.5881 | 0.9077 |
67
+ | 0.5385 | 11.0 | 2420 | 0.5709 | 0.9102 |
68
+ | 0.4778 | 12.0 | 2640 | 0.5591 | 0.9121 |
69
+ | 0.4778 | 13.0 | 2860 | 0.5497 | 0.9143 |
70
+ | 0.427 | 14.0 | 3080 | 0.5385 | 0.9161 |
71
+ | 0.427 | 15.0 | 3300 | 0.5258 | 0.9180 |
72
+ | 0.394 | 16.0 | 3520 | 0.5170 | 0.9195 |
73
+ | 0.394 | 17.0 | 3740 | 0.5157 | 0.9212 |
74
+ | 0.394 | 18.0 | 3960 | 0.5038 | 0.9221 |
75
+ | 0.363 | 19.0 | 4180 | 0.4977 | 0.9234 |
76
+ | 0.363 | 20.0 | 4400 | 0.4976 | 0.9236 |
77
+ | 0.3392 | 21.0 | 4620 | 0.4924 | 0.9247 |
78
+ | 0.3392 | 22.0 | 4840 | 0.4888 | 0.9255 |
79
+ | 0.33 | 23.0 | 5060 | 0.4890 | 0.9262 |
80
+ | 0.33 | 24.0 | 5280 | 0.4856 | 0.9268 |
81
+ | 0.3058 | 25.0 | 5500 | 0.4803 | 0.9275 |
82
+ | 0.3058 | 26.0 | 5720 | 0.4785 | 0.9277 |
83
+ | 0.3058 | 27.0 | 5940 | 0.4813 | 0.9281 |
84
+ | 0.2973 | 28.0 | 6160 | 0.4799 | 0.9282 |
85
+ | 0.2973 | 29.0 | 6380 | 0.4773 | 0.9285 |
86
+ | 0.2931 | 30.0 | 6600 | 0.4778 | 0.9286 |
87
+ | 0.2931 | 31.0 | 6820 | 0.4756 | 0.9290 |
88
+ | 0.2879 | 32.0 | 7040 | 0.4775 | 0.9290 |
89
 
90
 
91
  ### Framework versions
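
For readers who want to try the checkpoint described in this card: the config and tokenizer files further down in this commit identify a standard GPT2LMHeadModel with a GPT2Tokenizer, so it loads with 🤗 Transformers in the usual way. The sketch below is illustrative only; the Hub id of the fine-tuned model is not stated in this diff, so the base checkpoint avuhong/ParvoGPT2 named in the card is used as a placeholder.

```python
# Illustrative sketch only. MODEL_ID is a placeholder: the card names the base
# checkpoint avuhong/ParvoGPT2; substitute the repo id of the fine-tuned model.
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "avuhong/ParvoGPT2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)   # GPT2Tokenizer, <|endoftext|> specials
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)  # GPT2LMHeadModel per config.json

# Causal-LM generation starting from the BOS token.
inputs = tokenizer("<|endoftext|>", return_tensors="pt")
outputs = model.generate(**inputs, do_sample=True, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
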
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 16.0,
3
- "eval_accuracy": 0.8501928179347534,
4
- "eval_loss": 0.9834597110748291,
5
- "eval_runtime": 5.6916,
6
  "eval_samples": 91,
7
- "eval_samples_per_second": 15.988,
8
- "eval_steps_per_second": 8.082,
9
- "perplexity": 2.6736904553424052,
10
- "train_loss": 1.138753395730799,
11
- "train_runtime": 5180.444,
12
  "train_samples": 1762,
13
  "train_samples_per_second": 5.442,
14
- "train_steps_per_second": 0.679
15
  }
 
1
  {
2
+ "epoch": 32.0,
3
+ "eval_accuracy": 0.9289527676624451,
4
+ "eval_loss": 0.47753414511680603,
5
+ "eval_runtime": 6.0712,
6
  "eval_samples": 91,
7
+ "eval_samples_per_second": 14.989,
8
+ "eval_steps_per_second": 7.577,
9
+ "perplexity": 1.6120943064226643,
10
+ "train_loss": 0.47427067851478405,
11
+ "train_runtime": 10360.152,
12
  "train_samples": 1762,
13
  "train_samples_per_second": 5.442,
14
+ "train_steps_per_second": 0.68
15
  }
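
The perplexity values in all_results.json (and in eval_results.json below) follow directly from the evaluation loss, perplexity = exp(eval_loss), which gives a quick sanity check of the reported numbers:

```python
import math

# perplexity = exp(eval_loss) for both runs recorded in all_results.json
print(math.exp(0.9834597110748291))   # ~2.6737  (old run, epoch 16)
print(math.exp(0.47753414511680603))  # ~1.6121  (new run, epoch 32)
```
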
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "output_v3",
3
  "activation_function": "gelu_new",
4
  "architectures": [
5
  "GPT2LMHeadModel"
 
1
  {
2
+ "_name_or_path": "avuhong/ParvoGPT2",
3
  "activation_function": "gelu_new",
4
  "architectures": [
5
  "GPT2LMHeadModel"
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 16.0,
3
- "eval_accuracy": 0.8501928179347534,
4
- "eval_loss": 0.9834597110748291,
5
- "eval_runtime": 5.6916,
6
  "eval_samples": 91,
7
- "eval_samples_per_second": 15.988,
8
- "eval_steps_per_second": 8.082,
9
- "perplexity": 2.6736904553424052
10
  }
 
1
  {
2
+ "epoch": 32.0,
3
+ "eval_accuracy": 0.9289527676624451,
4
+ "eval_loss": 0.47753414511680603,
5
+ "eval_runtime": 6.0712,
6
  "eval_samples": 91,
7
+ "eval_samples_per_second": 14.989,
8
+ "eval_steps_per_second": 7.577,
9
+ "perplexity": 1.6120943064226643
10
  }
runs/Mar15_17-26-14_srvgpu/1678901187.8142433/events.out.tfevents.1678901187.srvgpu.2936559.1 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:115496e7d79030d313da0cb77622a6aa7a29c7c51cc9c631d90574c2ad318d67
3
+ size 5643
runs/Mar15_17-26-14_srvgpu/events.out.tfevents.1678901187.srvgpu.2936559.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fdcee8d6bf9c043faf357536cbe68e370e845f5a47aa1679291e96c05b51d5d
3
+ size 16954
runs/Mar15_17-26-14_srvgpu/events.out.tfevents.1678911557.srvgpu.2936559.2 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bc6a5fd4556ffd4c7220a94911b8d060aa146107d8ab1774b9a6c3c343b2534
3
+ size 363
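
The three files added under runs/ are Git LFS pointer files (a version line, a sha256 oid, and a size in bytes) for TensorBoard event logs; the binaries themselves live in LFS storage. A hedged sketch for fetching one of them with huggingface_hub is shown below; the repo id is a placeholder, since this diff does not name the repository.

```python
# Sketch only: resolve one of the TensorBoard event files added in this commit.
# REPO_ID is a placeholder -- the diff does not show the model repository name.
from huggingface_hub import hf_hub_download

REPO_ID = "avuhong/<model-repo>"  # placeholder
local_path = hf_hub_download(
    repo_id=REPO_ID,
    filename="runs/Mar15_17-26-14_srvgpu/events.out.tfevents.1678901187.srvgpu.2936559.0",
)
print(local_path)  # cached path of the actual event file, not the LFS pointer
```
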
tokenizer_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "bos_token": "<|endoftext|>",
4
  "eos_token": "<|endoftext|>",
5
  "model_max_length": 1000000000000000019884624838656,
6
- "name_or_path": "output_v3",
7
  "special_tokens_map_file": "/home/avuhong/.cache/huggingface/hub/models--nferruz--ProtGPT2/snapshots/afbc64cbb1e9c2b11f66f1f7c66d0be28df32b1b/special_tokens_map.json",
8
  "tokenizer_class": "GPT2Tokenizer",
9
  "unk_token": "<|endoftext|>"
 
3
  "bos_token": "<|endoftext|>",
4
  "eos_token": "<|endoftext|>",
5
  "model_max_length": 1000000000000000019884624838656,
6
+ "name_or_path": "avuhong/ParvoGPT2",
7
  "special_tokens_map_file": "/home/avuhong/.cache/huggingface/hub/models--nferruz--ProtGPT2/snapshots/afbc64cbb1e9c2b11f66f1f7c66d0be28df32b1b/special_tokens_map.json",
8
  "tokenizer_class": "GPT2Tokenizer",
9
  "unk_token": "<|endoftext|>"
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 16.0,
3
- "train_loss": 1.138753395730799,
4
- "train_runtime": 5180.444,
5
  "train_samples": 1762,
6
  "train_samples_per_second": 5.442,
7
- "train_steps_per_second": 0.679
8
  }
 
1
  {
2
+ "epoch": 32.0,
3
+ "train_loss": 0.47427067851478405,
4
+ "train_runtime": 10360.152,
5
  "train_samples": 1762,
6
  "train_samples_per_second": 5.442,
7
+ "train_steps_per_second": 0.68
8
  }
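
The throughput figures in train_results.json are consistent with the runtime and step counts logged in trainer_state.json below (7040 optimizer steps, 1762 samples × 32 epochs):

```python
# Consistency check against train_results.json (new run).
train_runtime = 10360.152       # seconds
total_steps = 7040              # "global_step" in trainer_state.json
total_samples = 1762 * 32       # train_samples x num_epochs

print(round(total_steps / train_runtime, 2))    # 0.68  == train_steps_per_second
print(round(total_samples / train_runtime, 3))  # 5.442 == train_samples_per_second
```
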
trainer_state.json CHANGED
@@ -1,211 +1,397 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 15.998864926220204,
5
- "global_step": 3520,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 1.0,
12
- "eval_accuracy": 0.7795322956613279,
13
- "eval_loss": 1.5038145780563354,
14
- "eval_runtime": 5.9639,
15
- "eval_samples_per_second": 15.258,
16
- "eval_steps_per_second": 7.713,
17
  "step": 220
18
  },
19
  {
20
  "epoch": 2.0,
21
- "eval_accuracy": 0.7964938287518932,
22
- "eval_loss": 1.3765002489089966,
23
- "eval_runtime": 5.973,
24
- "eval_samples_per_second": 15.235,
25
- "eval_steps_per_second": 7.701,
26
  "step": 440
27
  },
28
  {
29
  "epoch": 2.27,
30
- "learning_rate": 8.579545454545455e-06,
31
- "loss": 1.5308,
32
  "step": 500
33
  },
34
  {
35
  "epoch": 3.0,
36
- "eval_accuracy": 0.8075258075258075,
37
- "eval_loss": 1.2920387983322144,
38
- "eval_runtime": 5.9722,
39
- "eval_samples_per_second": 15.237,
40
- "eval_steps_per_second": 7.702,
41
  "step": 660
42
  },
43
  {
44
  "epoch": 4.0,
45
- "eval_accuracy": 0.8155607833027188,
46
- "eval_loss": 1.230821967124939,
47
- "eval_runtime": 5.6615,
48
- "eval_samples_per_second": 16.073,
49
- "eval_steps_per_second": 8.125,
50
  "step": 880
51
  },
52
  {
53
  "epoch": 4.54,
54
- "learning_rate": 7.161931818181819e-06,
55
- "loss": 1.2695,
56
  "step": 1000
57
  },
58
  {
59
  "epoch": 5.0,
60
- "eval_accuracy": 0.8225860161344032,
61
- "eval_loss": 1.1788371801376343,
62
- "eval_runtime": 5.9852,
63
- "eval_samples_per_second": 15.204,
64
- "eval_steps_per_second": 7.686,
65
  "step": 1100
66
  },
67
  {
68
  "epoch": 6.0,
69
- "eval_accuracy": 0.8278817956237311,
70
- "eval_loss": 1.136326789855957,
71
- "eval_runtime": 5.9723,
72
- "eval_samples_per_second": 15.237,
73
- "eval_steps_per_second": 7.702,
74
  "step": 1320
75
  },
76
  {
77
  "epoch": 6.82,
78
- "learning_rate": 5.741477272727272e-06,
79
- "loss": 1.1353,
80
  "step": 1500
81
  },
82
  {
83
  "epoch": 7.0,
84
- "eval_accuracy": 0.8323826710923485,
85
- "eval_loss": 1.102668285369873,
86
- "eval_runtime": 5.9851,
87
- "eval_samples_per_second": 15.204,
88
- "eval_steps_per_second": 7.686,
89
  "step": 1540
90
  },
91
  {
92
  "epoch": 8.0,
93
- "eval_accuracy": 0.8373239663562244,
94
- "eval_loss": 1.072572112083435,
95
- "eval_runtime": 5.9939,
96
- "eval_samples_per_second": 15.182,
97
- "eval_steps_per_second": 7.674,
98
  "step": 1760
99
  },
100
  {
101
  "epoch": 9.0,
102
- "eval_accuracy": 0.8404928404928405,
103
- "eval_loss": 1.0481319427490234,
104
- "eval_runtime": 5.9927,
105
- "eval_samples_per_second": 15.185,
106
- "eval_steps_per_second": 7.676,
107
  "step": 1980
108
  },
109
  {
110
  "epoch": 9.09,
111
- "learning_rate": 4.321022727272728e-06,
112
- "loss": 1.0713,
113
  "step": 2000
114
  },
115
  {
116
  "epoch": 10.0,
117
- "eval_accuracy": 0.8432642626191014,
118
- "eval_loss": 1.0299291610717773,
119
- "eval_runtime": 5.6745,
120
- "eval_samples_per_second": 16.037,
121
- "eval_steps_per_second": 8.106,
122
  "step": 2200
123
  },
124
  {
125
  "epoch": 11.0,
126
- "eval_accuracy": 0.8455415552189746,
127
- "eval_loss": 1.0174473524093628,
128
- "eval_runtime": 5.9763,
129
- "eval_samples_per_second": 15.227,
130
- "eval_steps_per_second": 7.697,
131
  "step": 2420
132
  },
133
  {
134
  "epoch": 11.36,
135
- "learning_rate": 2.900568181818182e-06,
136
- "loss": 1.0233,
137
  "step": 2500
138
  },
139
  {
140
  "epoch": 12.0,
141
- "eval_accuracy": 0.8477006864103638,
142
- "eval_loss": 1.0027512311935425,
143
- "eval_runtime": 5.9881,
144
- "eval_samples_per_second": 15.197,
145
- "eval_steps_per_second": 7.682,
146
  "step": 2640
147
  },
148
  {
149
  "epoch": 13.0,
150
- "eval_accuracy": 0.8488178488178488,
151
- "eval_loss": 0.9938735961914062,
152
- "eval_runtime": 5.9759,
153
- "eval_samples_per_second": 15.228,
154
- "eval_steps_per_second": 7.698,
155
  "step": 2860
156
  },
157
  {
158
  "epoch": 13.64,
159
- "learning_rate": 1.4829545454545454e-06,
160
- "loss": 0.9811,
161
  "step": 3000
162
  },
163
  {
164
  "epoch": 14.0,
165
- "eval_accuracy": 0.8496772045159142,
166
- "eval_loss": 0.9889363646507263,
167
- "eval_runtime": 5.9819,
168
- "eval_samples_per_second": 15.213,
169
- "eval_steps_per_second": 7.69,
170
  "step": 3080
171
  },
172
  {
173
  "epoch": 15.0,
174
- "eval_accuracy": 0.8499779790102371,
175
- "eval_loss": 0.9854440093040466,
176
- "eval_runtime": 5.9887,
177
- "eval_samples_per_second": 15.195,
178
- "eval_steps_per_second": 7.681,
179
  "step": 3300
180
  },
181
  {
182
  "epoch": 15.91,
183
- "learning_rate": 6.250000000000001e-08,
184
- "loss": 0.9696,
185
  "step": 3500
186
  },
187
  {
188
  "epoch": 16.0,
189
- "eval_accuracy": 0.8501928179347534,
190
- "eval_loss": 0.9834597110748291,
191
- "eval_runtime": 5.978,
192
- "eval_samples_per_second": 15.223,
193
- "eval_steps_per_second": 7.695,
194
  "step": 3520
195
  },
196
  {
197
- "epoch": 16.0,
198
- "step": 3520,
199
- "total_flos": 1.2269276173959168e+17,
200
- "train_loss": 1.138753395730799,
201
- "train_runtime": 5180.444,
202
  "train_samples_per_second": 5.442,
203
- "train_steps_per_second": 0.679
204
  }
205
  ],
206
- "max_steps": 3520,
207
- "num_train_epochs": 16,
208
- "total_flos": 1.2269276173959168e+17,
209
  "trial_name": null,
210
  "trial_params": null
211
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 31.998864926220204,
5
+ "global_step": 7040,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 1.0,
12
+ "eval_accuracy": 0.8224893386183709,
13
+ "eval_loss": 1.1622651815414429,
14
+ "eval_runtime": 5.9816,
15
+ "eval_samples_per_second": 15.213,
16
+ "eval_steps_per_second": 7.69,
17
  "step": 220
18
  },
19
  {
20
  "epoch": 2.0,
21
+ "eval_accuracy": 0.8538665635439829,
22
+ "eval_loss": 0.9566460251808167,
23
+ "eval_runtime": 5.9768,
24
+ "eval_samples_per_second": 15.226,
25
+ "eval_steps_per_second": 7.696,
26
  "step": 440
27
  },
28
  {
29
  "epoch": 2.27,
30
+ "learning_rate": 4.6448863636363636e-05,
31
+ "loss": 1.1942,
32
  "step": 500
33
  },
34
  {
35
  "epoch": 3.0,
36
+ "eval_accuracy": 0.8709140322043548,
37
+ "eval_loss": 0.8456286191940308,
38
+ "eval_runtime": 5.9871,
39
+ "eval_samples_per_second": 15.199,
40
+ "eval_steps_per_second": 7.683,
41
  "step": 660
42
  },
43
  {
44
  "epoch": 4.0,
45
+ "eval_accuracy": 0.8801198801198801,
46
+ "eval_loss": 0.7718582153320312,
47
+ "eval_runtime": 5.6728,
48
+ "eval_samples_per_second": 16.041,
49
+ "eval_steps_per_second": 8.109,
50
  "step": 880
51
  },
52
  {
53
  "epoch": 4.54,
54
+ "learning_rate": 4.289772727272727e-05,
55
+ "loss": 0.7805,
56
  "step": 1000
57
  },
58
  {
59
  "epoch": 5.0,
60
+ "eval_accuracy": 0.8871880807364678,
61
+ "eval_loss": 0.7224407196044922,
62
+ "eval_runtime": 5.988,
63
+ "eval_samples_per_second": 15.197,
64
+ "eval_steps_per_second": 7.682,
65
  "step": 1100
66
  },
67
  {
68
  "epoch": 6.0,
69
+ "eval_accuracy": 0.892816860558796,
70
+ "eval_loss": 0.6894707679748535,
71
+ "eval_runtime": 5.9958,
72
+ "eval_samples_per_second": 15.177,
73
+ "eval_steps_per_second": 7.672,
74
  "step": 1320
75
  },
76
  {
77
  "epoch": 6.82,
78
+ "learning_rate": 3.934659090909091e-05,
79
+ "loss": 0.6257,
80
  "step": 1500
81
  },
82
  {
83
  "epoch": 7.0,
84
+ "eval_accuracy": 0.8972103165651553,
85
+ "eval_loss": 0.6574080586433411,
86
+ "eval_runtime": 5.9909,
87
+ "eval_samples_per_second": 15.19,
88
+ "eval_steps_per_second": 7.678,
89
  "step": 1540
90
  },
91
  {
92
  "epoch": 8.0,
93
+ "eval_accuracy": 0.9014426433781273,
94
+ "eval_loss": 0.6289474368095398,
95
+ "eval_runtime": 5.6911,
96
+ "eval_samples_per_second": 15.99,
97
+ "eval_steps_per_second": 8.083,
98
  "step": 1760
99
  },
100
  {
101
  "epoch": 9.0,
102
+ "eval_accuracy": 0.9045470658373884,
103
+ "eval_loss": 0.6054205298423767,
104
+ "eval_runtime": 5.6786,
105
+ "eval_samples_per_second": 16.025,
106
+ "eval_steps_per_second": 8.101,
107
  "step": 1980
108
  },
109
  {
110
  "epoch": 9.09,
111
+ "learning_rate": 3.579545454545455e-05,
112
+ "loss": 0.5385,
113
  "step": 2000
114
  },
115
  {
116
  "epoch": 10.0,
117
+ "eval_accuracy": 0.9076622302428754,
118
+ "eval_loss": 0.5881273746490479,
119
+ "eval_runtime": 6.0014,
120
+ "eval_samples_per_second": 15.163,
121
+ "eval_steps_per_second": 7.665,
122
  "step": 2200
123
  },
124
  {
125
  "epoch": 11.0,
126
+ "eval_accuracy": 0.9101865876059424,
127
+ "eval_loss": 0.5709272623062134,
128
+ "eval_runtime": 5.9886,
129
+ "eval_samples_per_second": 15.195,
130
+ "eval_steps_per_second": 7.681,
131
  "step": 2420
132
  },
133
  {
134
  "epoch": 11.36,
135
+ "learning_rate": 3.2244318181818185e-05,
136
+ "loss": 0.4778,
137
  "step": 2500
138
  },
139
  {
140
  "epoch": 12.0,
141
+ "eval_accuracy": 0.9120986540341379,
142
+ "eval_loss": 0.5591339468955994,
143
+ "eval_runtime": 5.9874,
144
+ "eval_samples_per_second": 15.199,
145
+ "eval_steps_per_second": 7.683,
146
  "step": 2640
147
  },
148
  {
149
  "epoch": 13.0,
150
+ "eval_accuracy": 0.9142900110642046,
151
+ "eval_loss": 0.5496613383293152,
152
+ "eval_runtime": 5.9826,
153
+ "eval_samples_per_second": 15.211,
154
+ "eval_steps_per_second": 7.689,
155
  "step": 2860
156
  },
157
  {
158
  "epoch": 13.64,
159
+ "learning_rate": 2.870028409090909e-05,
160
+ "loss": 0.427,
161
  "step": 3000
162
  },
163
  {
164
  "epoch": 14.0,
165
+ "eval_accuracy": 0.9161053999763678,
166
+ "eval_loss": 0.5385328531265259,
167
+ "eval_runtime": 5.9885,
168
+ "eval_samples_per_second": 15.196,
169
+ "eval_steps_per_second": 7.681,
170
  "step": 3080
171
  },
172
  {
173
  "epoch": 15.0,
174
+ "eval_accuracy": 0.9179637566734341,
175
+ "eval_loss": 0.5258467793464661,
176
+ "eval_runtime": 5.9812,
177
+ "eval_samples_per_second": 15.214,
178
+ "eval_steps_per_second": 7.691,
179
  "step": 3300
180
  },
181
  {
182
  "epoch": 15.91,
183
+ "learning_rate": 2.5149147727272725e-05,
184
+ "loss": 0.394,
185
  "step": 3500
186
  },
187
  {
188
  "epoch": 16.0,
189
+ "eval_accuracy": 0.9195428227686292,
190
+ "eval_loss": 0.5170450806617737,
191
+ "eval_runtime": 5.6775,
192
+ "eval_samples_per_second": 16.028,
193
+ "eval_steps_per_second": 8.102,
194
  "step": 3520
195
  },
196
  {
197
+ "epoch": 17.0,
198
+ "eval_accuracy": 0.9211648566487276,
199
+ "eval_loss": 0.5156892538070679,
200
+ "eval_runtime": 5.9945,
201
+ "eval_samples_per_second": 15.18,
202
+ "eval_steps_per_second": 7.674,
203
+ "step": 3740
204
+ },
205
+ {
206
+ "epoch": 18.0,
207
+ "eval_accuracy": 0.9220671801316963,
208
+ "eval_loss": 0.5037761926651001,
209
+ "eval_runtime": 5.687,
210
+ "eval_samples_per_second": 16.001,
211
+ "eval_steps_per_second": 8.089,
212
+ "step": 3960
213
+ },
214
+ {
215
+ "epoch": 18.18,
216
+ "learning_rate": 2.1598011363636363e-05,
217
+ "loss": 0.363,
218
+ "step": 4000
219
+ },
220
+ {
221
+ "epoch": 19.0,
222
+ "eval_accuracy": 0.9233884395174717,
223
+ "eval_loss": 0.49766021966934204,
224
+ "eval_runtime": 5.6849,
225
+ "eval_samples_per_second": 16.007,
226
+ "eval_steps_per_second": 8.092,
227
+ "step": 4180
228
+ },
229
+ {
230
+ "epoch": 20.0,
231
+ "eval_accuracy": 0.9236462462268914,
232
+ "eval_loss": 0.4975946843624115,
233
+ "eval_runtime": 5.9923,
234
+ "eval_samples_per_second": 15.186,
235
+ "eval_steps_per_second": 7.677,
236
+ "step": 4400
237
+ },
238
+ {
239
+ "epoch": 20.45,
240
+ "learning_rate": 1.8046875000000003e-05,
241
+ "loss": 0.3392,
242
+ "step": 4500
243
+ },
244
+ {
245
+ "epoch": 21.0,
246
+ "eval_accuracy": 0.9246882150107957,
247
+ "eval_loss": 0.49241966009140015,
248
+ "eval_runtime": 5.9923,
249
+ "eval_samples_per_second": 15.186,
250
+ "eval_steps_per_second": 7.677,
251
+ "step": 4620
252
+ },
253
+ {
254
+ "epoch": 22.0,
255
+ "eval_accuracy": 0.9255046029239578,
256
+ "eval_loss": 0.4887617826461792,
257
+ "eval_runtime": 5.69,
258
+ "eval_samples_per_second": 15.993,
259
+ "eval_steps_per_second": 8.084,
260
+ "step": 4840
261
+ },
262
+ {
263
+ "epoch": 22.73,
264
+ "learning_rate": 1.4495738636363637e-05,
265
+ "loss": 0.33,
266
+ "step": 5000
267
+ },
268
+ {
269
+ "epoch": 23.0,
270
+ "eval_accuracy": 0.9262028294286359,
271
+ "eval_loss": 0.4889785051345825,
272
+ "eval_runtime": 6.0024,
273
+ "eval_samples_per_second": 15.161,
274
+ "eval_steps_per_second": 7.664,
275
+ "step": 5060
276
+ },
277
+ {
278
+ "epoch": 24.0,
279
+ "eval_accuracy": 0.9267936364710558,
280
+ "eval_loss": 0.4856303334236145,
281
+ "eval_runtime": 5.9867,
282
+ "eval_samples_per_second": 15.2,
283
+ "eval_steps_per_second": 7.684,
284
+ "step": 5280
285
+ },
286
+ {
287
+ "epoch": 25.0,
288
+ "learning_rate": 1.0951704545454545e-05,
289
+ "loss": 0.3058,
290
+ "step": 5500
291
+ },
292
+ {
293
+ "epoch": 25.0,
294
+ "eval_accuracy": 0.9275348307606373,
295
+ "eval_loss": 0.4802783131599426,
296
+ "eval_runtime": 5.6869,
297
+ "eval_samples_per_second": 16.002,
298
+ "eval_steps_per_second": 8.089,
299
+ "step": 5500
300
+ },
301
+ {
302
+ "epoch": 26.0,
303
+ "eval_accuracy": 0.9277389277389277,
304
+ "eval_loss": 0.47845765948295593,
305
+ "eval_runtime": 5.9773,
306
+ "eval_samples_per_second": 15.224,
307
+ "eval_steps_per_second": 7.696,
308
+ "step": 5720
309
+ },
310
+ {
311
+ "epoch": 27.0,
312
+ "eval_accuracy": 0.9280611861257022,
313
+ "eval_loss": 0.4813348948955536,
314
+ "eval_runtime": 5.6869,
315
+ "eval_samples_per_second": 16.002,
316
+ "eval_steps_per_second": 8.089,
317
+ "step": 5940
318
+ },
319
+ {
320
+ "epoch": 27.27,
321
+ "learning_rate": 7.407670454545455e-06,
322
+ "loss": 0.2973,
323
+ "step": 6000
324
+ },
325
+ {
326
+ "epoch": 28.0,
327
+ "eval_accuracy": 0.9281793475341863,
328
+ "eval_loss": 0.4798637628555298,
329
+ "eval_runtime": 5.6812,
330
+ "eval_samples_per_second": 16.018,
331
+ "eval_steps_per_second": 8.097,
332
+ "step": 6160
333
+ },
334
+ {
335
+ "epoch": 29.0,
336
+ "eval_accuracy": 0.9285230898134124,
337
+ "eval_loss": 0.47730037569999695,
338
+ "eval_runtime": 5.9912,
339
+ "eval_samples_per_second": 15.189,
340
+ "eval_steps_per_second": 7.678,
341
+ "step": 6380
342
+ },
343
+ {
344
+ "epoch": 29.54,
345
+ "learning_rate": 3.856534090909091e-06,
346
+ "loss": 0.2931,
347
+ "step": 6500
348
+ },
349
+ {
350
+ "epoch": 30.0,
351
+ "eval_accuracy": 0.9285982834369931,
352
+ "eval_loss": 0.4778377413749695,
353
+ "eval_runtime": 5.9957,
354
+ "eval_samples_per_second": 15.177,
355
+ "eval_steps_per_second": 7.672,
356
+ "step": 6600
357
+ },
358
+ {
359
+ "epoch": 31.0,
360
+ "eval_accuracy": 0.9289527676624451,
361
+ "eval_loss": 0.4756244122982025,
362
+ "eval_runtime": 5.6812,
363
+ "eval_samples_per_second": 16.018,
364
+ "eval_steps_per_second": 8.097,
365
+ "step": 6820
366
+ },
367
+ {
368
+ "epoch": 31.82,
369
+ "learning_rate": 3.053977272727273e-07,
370
+ "loss": 0.2879,
371
+ "step": 7000
372
+ },
373
+ {
374
+ "epoch": 32.0,
375
+ "eval_accuracy": 0.9289527676624451,
376
+ "eval_loss": 0.47753414511680603,
377
+ "eval_runtime": 5.6774,
378
+ "eval_samples_per_second": 16.029,
379
+ "eval_steps_per_second": 8.102,
380
+ "step": 7040
381
+ },
382
+ {
383
+ "epoch": 32.0,
384
+ "step": 7040,
385
+ "total_flos": 2.4539422830415053e+17,
386
+ "train_loss": 0.47427067851478405,
387
+ "train_runtime": 10360.152,
388
  "train_samples_per_second": 5.442,
389
+ "train_steps_per_second": 0.68
390
  }
391
  ],
392
+ "max_steps": 7040,
393
+ "num_train_epochs": 32,
394
+ "total_flos": 2.4539422830415053e+17,
395
  "trial_name": null,
396
  "trial_params": null
397
  }
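
trainer_state.json keeps the full log_history shown above; a short sketch (assuming a local copy of the file) for extracting the per-epoch validation curve:

```python
import json

# Load a local copy of trainer_state.json and print the evaluation curve.
with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_loss" in entry:
        print(f'epoch {entry["epoch"]:>4}: '
              f'loss {entry["eval_loss"]:.4f}, acc {entry["eval_accuracy"]:.4f}')
```
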
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d279c7dbc43acd518e710d89f64a4cc417adfa56edacc6d0708f9864295a4747
3
  size 3579
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2dff5363843946656ccbba4034725ba3135ad064c49c180c73190d34e5b0a9d
3
  size 3579