Elron committed on
Commit
a029410
1 Parent(s): 4c02cc6

Pushing deberta-v3-large-emotion to hub

Browse files
README.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ tags:
4
+ - generated_from_trainer
5
+ metrics:
6
+ - accuracy
7
+ model-index:
8
+ - name: deberta-v3-large-emotion-lr7e-6-gas1-ls0.1
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ # deberta-v3-large-emotion-lr7e-6-gas1-ls0.1
16
+
17
+ This model is a fine-tuned version of [microsoft/deberta-v3-large](https://huggingface.co/microsoft/deberta-v3-large) on the TweetEval emotion dataset.
18
+ It achieves the following results on the evaluation set:
19
+ - Loss: 0.8311
20
+ - Accuracy: 0.8235
21
+
22
+ ## Model description
23
+
24
+ More information needed
25
+
26
+ ## Intended uses & limitations
27
+
28
+ More information needed
29
+
30
+ ## Training and evaluation data
31
+
32
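+ The model was trained and evaluated on the emotion task of TweetEval (4 classes), using 3,257 training examples and 374 validation examples.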
33
+
34
+ ## Training procedure
35
+
36
+ ### Training hyperparameters
37
+
38
+ The following hyperparameters were used during training:
39
+ - learning_rate: 7e-06
40
+ - train_batch_size: 16
41
+ - eval_batch_size: 16
42
+ - seed: 42
43
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
+ - lr_scheduler_type: linear
45
+ - lr_scheduler_warmup_steps: 50
46
+ - num_epochs: 10.0
47
+ - label_smoothing_factor: 0.1
48
+
49
+ ### Training results
50
+
51
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
52
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|
53
+ | 1.2787 | 0.49 | 100 | 1.1127 | 0.4866 |
54
+ | 1.089 | 0.98 | 200 | 0.9668 | 0.7139 |
55
+ | 0.9134 | 1.47 | 300 | 0.8720 | 0.7834 |
56
+ | 0.8618 | 1.96 | 400 | 0.7726 | 0.7941 |
57
+ | 0.686 | 2.45 | 500 | 0.7337 | 0.8209 |
58
+ | 0.6333 | 2.94 | 600 | 0.7350 | 0.8235 |
59
+ | 0.5765 | 3.43 | 700 | 0.7561 | 0.8235 |
60
+ | 0.5502 | 3.92 | 800 | 0.7273 | 0.8476 |
61
+ | 0.5049 | 4.41 | 900 | 0.8137 | 0.8102 |
62
+ | 0.4695 | 4.9 | 1000 | 0.7581 | 0.8289 |
63
+ | 0.4657 | 5.39 | 1100 | 0.8404 | 0.8048 |
64
+ | 0.4549 | 5.88 | 1200 | 0.7800 | 0.8369 |
65
+ | 0.4305 | 6.37 | 1300 | 0.8575 | 0.8235 |
66
+ | 0.4209 | 6.86 | 1400 | 0.8572 | 0.8102 |
67
+ | 0.3983 | 7.35 | 1500 | 0.8392 | 0.8316 |
68
+ | 0.4139 | 7.84 | 1600 | 0.8152 | 0.8209 |
69
+ | 0.393 | 8.33 | 1700 | 0.8261 | 0.8289 |
70
+ | 0.3979 | 8.82 | 1800 | 0.8328 | 0.8235 |
71
+ | 0.3928 | 9.31 | 1900 | 0.8364 | 0.8209 |
72
+ | 0.3848 | 9.8 | 2000 | 0.8322 | 0.8235 |
73
+
74
+
75
+ ### Framework versions
76
+
77
+ - Transformers 4.20.0.dev0
78
+ - Pytorch 1.9.0
79
+ - Datasets 2.2.2
80
+ - Tokenizers 0.11.6
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
all_results.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.8235294222831726,
4
+ "eval_loss": 0.8311316967010498,
5
+ "eval_runtime": 2.1275,
6
+ "eval_samples": 374,
7
+ "eval_samples_per_second": 175.794,
8
+ "eval_steps_per_second": 11.281,
9
+ "train_loss": 0.5818920486113605,
10
+ "train_runtime": 787.9964,
11
+ "train_samples": 3257,
12
+ "train_samples_per_second": 41.333,
13
+ "train_steps_per_second": 2.589
14
+ }
config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "microsoft/deberta-v3-large",
3
+ "architectures": [
4
+ "DebertaV2ForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 1024,
10
+ "id2label": {
11
+ "0": 0,
12
+ "1": 1,
13
+ "2": 2,
14
+ "3": 3
15
+ },
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 4096,
18
+ "label2id": {
19
+ "0": 0,
20
+ "1": 1,
21
+ "2": 2,
22
+ "3": 3
23
+ },
24
+ "layer_norm_eps": 1e-07,
25
+ "max_position_embeddings": 512,
26
+ "max_relative_positions": -1,
27
+ "model_type": "deberta-v2",
28
+ "norm_rel_ebd": "layer_norm",
29
+ "num_attention_heads": 16,
30
+ "num_hidden_layers": 24,
31
+ "pad_token_id": 0,
32
+ "pooler_dropout": 0,
33
+ "pooler_hidden_act": "gelu",
34
+ "pooler_hidden_size": 1024,
35
+ "pos_att_type": [
36
+ "p2c",
37
+ "c2p"
38
+ ],
39
+ "position_biased_input": false,
40
+ "position_buckets": 256,
41
+ "relative_attention": true,
42
+ "share_att_key": true,
43
+ "torch_dtype": "float32",
44
+ "transformers_version": "4.20.0.dev0",
45
+ "type_vocab_size": 0,
46
+ "vocab_size": 128100
47
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_accuracy": 0.8475936055183411,
3
+ "eval_loss": 0.5155109763145447,
4
+ "eval_runtime": 3.3452,
5
+ "eval_samples": 374,
6
+ "eval_samples_per_second": 111.803,
7
+ "eval_steps_per_second": 7.174
8
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e182efbb57bc6e4ecda054f4a41e886d02bcafcf1de64adf784d0e47c61f27ab
3
+ size 1740401579
run_test.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ jbsub -queue x86_1h -cores 4+1 -mem 30g -require a100 -o outputs/train/tweet_eval2/emotion/deberta-v3-large-emotion-lr7e-6-gas1-ls0.1/test.log /dccstor/tslm/envs/anaconda3/envs/tslm-gen/bin/python train_clf.py --model_name_or_path outputs/train/tweet_eval2/emotion/deberta-v3-large-emotion-lr7e-6-gas1-ls0.1/best_checkpoint --train_file data/tweet_eval/emotion/train.csv --validation_file data/tweet_eval/emotion/validation.csv --test_file data/tweet_eval/emotion/test.csv --do_eval --do_predict --report_to none --per_device_eval_batch_size 16 --max_seq_length 256 --output_dir outputs/train/tweet_eval2/emotion/deberta-v3-large-emotion-lr7e-6-gas1-ls0.1/best_checkpoint
run_train.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ jbsub -queue x86_6h -cores 4+1 -mem 30g -require a100 -o outputs/train/tweet_eval2/emotion/deberta-v3-large-emotion-lr7e-6-gas1-ls0.1/train.log /dccstor/tslm/envs/anaconda3/envs/tslm-gen/bin/python train_clf.py --model_name_or_path microsoft/deberta-v3-large --train_file data/tweet_eval/emotion/train.csv --validation_file data/tweet_eval/emotion/validation.csv --do_train --do_eval --per_device_train_batch_size 16 --per_device_eval_batch_size 16 --max_seq_length 256 --learning_rate 7e-6 --output_dir outputs/train/tweet_eval2/emotion/deberta-v3-large-emotion-lr7e-6-gas1-ls0.1 --evaluation_strategy steps --save_strategy no --warmup_steps 50 --num_train_epochs 10 --overwrite_output_dir --logging_steps 100 --gradient_accumulation_steps 1 --label_smoothing_factor 0.1 --report_to clearml --metric_for_best_model accuracy --logging_dir outputs/train/tweet_eval2/emotion/deberta-v3-large-emotion-lr7e-6-gas1-ls0.1/tb \; rm -rf outputs/train/tweet_eval2/emotion/deberta-v3-large-emotion-lr7e-6-gas1-ls0.1/tb \; rm -rf outputs/train/tweet_eval2/emotion/deberta-v3-large-emotion-lr7e-6-gas1-ls0.1/checkpoint-* \; . outputs/train/tweet_eval2/emotion/deberta-v3-large-emotion-lr7e-6-gas1-ls0.1/run_test.sh
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": "[UNK]"
9
+ }
spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
test_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_accuracy": 0.8627727031707764,
3
+ "eval_loss": 0.46840450167655945,
4
+ "eval_runtime": 8.8065,
5
+ "eval_samples_per_second": 161.359,
6
+ "eval_steps_per_second": 10.106,
7
+ "test_samples": 1421
8
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": false,
5
+ "eos_token": "[SEP]",
6
+ "mask_token": "[MASK]",
7
+ "name_or_path": "microsoft/deberta-v3-large",
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "sp_model_kwargs": {},
11
+ "special_tokens_map_file": null,
12
+ "split_by_punct": false,
13
+ "tokenizer_class": "DebertaV2Tokenizer",
14
+ "unk_token": "[UNK]",
15
+ "vocab_type": "spm"
16
+ }
trainer_state.json ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "global_step": 2040,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.49,
12
+ "learning_rate": 6.824120603015075e-06,
13
+ "loss": 1.2787,
14
+ "step": 100
15
+ },
16
+ {
17
+ "epoch": 0.49,
18
+ "eval_accuracy": 0.48663100600242615,
19
+ "eval_loss": 1.1127219200134277,
20
+ "eval_runtime": 2.143,
21
+ "eval_samples_per_second": 174.524,
22
+ "eval_steps_per_second": 11.199,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.98,
27
+ "learning_rate": 6.472361809045226e-06,
28
+ "loss": 1.089,
29
+ "step": 200
30
+ },
31
+ {
32
+ "epoch": 0.98,
33
+ "eval_accuracy": 0.7139037251472473,
34
+ "eval_loss": 0.9668397903442383,
35
+ "eval_runtime": 2.1362,
36
+ "eval_samples_per_second": 175.077,
37
+ "eval_steps_per_second": 11.235,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 1.47,
42
+ "learning_rate": 6.1206030150753765e-06,
43
+ "loss": 0.9134,
44
+ "step": 300
45
+ },
46
+ {
47
+ "epoch": 1.47,
48
+ "eval_accuracy": 0.7834224700927734,
49
+ "eval_loss": 0.8720457553863525,
50
+ "eval_runtime": 2.1335,
51
+ "eval_samples_per_second": 175.301,
52
+ "eval_steps_per_second": 11.249,
53
+ "step": 300
54
+ },
55
+ {
56
+ "epoch": 1.96,
57
+ "learning_rate": 5.7688442211055275e-06,
58
+ "loss": 0.8618,
59
+ "step": 400
60
+ },
61
+ {
62
+ "epoch": 1.96,
63
+ "eval_accuracy": 0.7941176295280457,
64
+ "eval_loss": 0.772637665271759,
65
+ "eval_runtime": 2.1363,
66
+ "eval_samples_per_second": 175.072,
67
+ "eval_steps_per_second": 11.235,
68
+ "step": 400
69
+ },
70
+ {
71
+ "epoch": 2.45,
72
+ "learning_rate": 5.4170854271356785e-06,
73
+ "loss": 0.686,
74
+ "step": 500
75
+ },
76
+ {
77
+ "epoch": 2.45,
78
+ "eval_accuracy": 0.8208556175231934,
79
+ "eval_loss": 0.7337380647659302,
80
+ "eval_runtime": 2.1375,
81
+ "eval_samples_per_second": 174.975,
82
+ "eval_steps_per_second": 11.228,
83
+ "step": 500
84
+ },
85
+ {
86
+ "epoch": 2.94,
87
+ "learning_rate": 5.0653266331658295e-06,
88
+ "loss": 0.6333,
89
+ "step": 600
90
+ },
91
+ {
92
+ "epoch": 2.94,
93
+ "eval_accuracy": 0.8235294222831726,
94
+ "eval_loss": 0.7350101470947266,
95
+ "eval_runtime": 2.1343,
96
+ "eval_samples_per_second": 175.234,
97
+ "eval_steps_per_second": 11.245,
98
+ "step": 600
99
+ },
100
+ {
101
+ "epoch": 3.43,
102
+ "learning_rate": 4.71356783919598e-06,
103
+ "loss": 0.5765,
104
+ "step": 700
105
+ },
106
+ {
107
+ "epoch": 3.43,
108
+ "eval_accuracy": 0.8235294222831726,
109
+ "eval_loss": 0.7560638785362244,
110
+ "eval_runtime": 2.1248,
111
+ "eval_samples_per_second": 176.019,
112
+ "eval_steps_per_second": 11.295,
113
+ "step": 700
114
+ },
115
+ {
116
+ "epoch": 3.92,
117
+ "learning_rate": 4.361809045226131e-06,
118
+ "loss": 0.5502,
119
+ "step": 800
120
+ },
121
+ {
122
+ "epoch": 3.92,
123
+ "eval_accuracy": 0.8475936055183411,
124
+ "eval_loss": 0.727317750453949,
125
+ "eval_runtime": 2.1509,
126
+ "eval_samples_per_second": 173.882,
127
+ "eval_steps_per_second": 11.158,
128
+ "step": 800
129
+ },
130
+ {
131
+ "epoch": 4.41,
132
+ "learning_rate": 4.010050251256282e-06,
133
+ "loss": 0.5049,
134
+ "step": 900
135
+ },
136
+ {
137
+ "epoch": 4.41,
138
+ "eval_accuracy": 0.8101603984832764,
139
+ "eval_loss": 0.8136795163154602,
140
+ "eval_runtime": 2.1343,
141
+ "eval_samples_per_second": 175.233,
142
+ "eval_steps_per_second": 11.245,
143
+ "step": 900
144
+ },
145
+ {
146
+ "epoch": 4.9,
147
+ "learning_rate": 3.6582914572864323e-06,
148
+ "loss": 0.4695,
149
+ "step": 1000
150
+ },
151
+ {
152
+ "epoch": 4.9,
153
+ "eval_accuracy": 0.8288770318031311,
154
+ "eval_loss": 0.7581244707107544,
155
+ "eval_runtime": 2.1366,
156
+ "eval_samples_per_second": 175.041,
157
+ "eval_steps_per_second": 11.233,
158
+ "step": 1000
159
+ },
160
+ {
161
+ "epoch": 5.39,
162
+ "learning_rate": 3.306532663316583e-06,
163
+ "loss": 0.4657,
164
+ "step": 1100
165
+ },
166
+ {
167
+ "epoch": 5.39,
168
+ "eval_accuracy": 0.8048128485679626,
169
+ "eval_loss": 0.8404039144515991,
170
+ "eval_runtime": 2.1382,
171
+ "eval_samples_per_second": 174.912,
172
+ "eval_steps_per_second": 11.224,
173
+ "step": 1100
174
+ },
175
+ {
176
+ "epoch": 5.88,
177
+ "learning_rate": 2.9547738693467334e-06,
178
+ "loss": 0.4549,
179
+ "step": 1200
180
+ },
181
+ {
182
+ "epoch": 5.88,
183
+ "eval_accuracy": 0.8368983864784241,
184
+ "eval_loss": 0.7800447940826416,
185
+ "eval_runtime": 2.143,
186
+ "eval_samples_per_second": 174.523,
187
+ "eval_steps_per_second": 11.199,
188
+ "step": 1200
189
+ },
190
+ {
191
+ "epoch": 6.37,
192
+ "learning_rate": 2.6030150753768844e-06,
193
+ "loss": 0.4305,
194
+ "step": 1300
195
+ },
196
+ {
197
+ "epoch": 6.37,
198
+ "eval_accuracy": 0.8235294222831726,
199
+ "eval_loss": 0.8575029969215393,
200
+ "eval_runtime": 2.1375,
201
+ "eval_samples_per_second": 174.967,
202
+ "eval_steps_per_second": 11.228,
203
+ "step": 1300
204
+ },
205
+ {
206
+ "epoch": 6.86,
207
+ "learning_rate": 2.251256281407035e-06,
208
+ "loss": 0.4209,
209
+ "step": 1400
210
+ },
211
+ {
212
+ "epoch": 6.86,
213
+ "eval_accuracy": 0.8101603984832764,
214
+ "eval_loss": 0.8572390675544739,
215
+ "eval_runtime": 2.1484,
216
+ "eval_samples_per_second": 174.083,
217
+ "eval_steps_per_second": 11.171,
218
+ "step": 1400
219
+ },
220
+ {
221
+ "epoch": 7.35,
222
+ "learning_rate": 1.899497487437186e-06,
223
+ "loss": 0.3983,
224
+ "step": 1500
225
+ },
226
+ {
227
+ "epoch": 7.35,
228
+ "eval_accuracy": 0.8315507769584656,
229
+ "eval_loss": 0.8391810655593872,
230
+ "eval_runtime": 2.1399,
231
+ "eval_samples_per_second": 174.771,
232
+ "eval_steps_per_second": 11.215,
233
+ "step": 1500
234
+ },
235
+ {
236
+ "epoch": 7.84,
237
+ "learning_rate": 1.5477386934673368e-06,
238
+ "loss": 0.4139,
239
+ "step": 1600
240
+ },
241
+ {
242
+ "epoch": 7.84,
243
+ "eval_accuracy": 0.8208556175231934,
244
+ "eval_loss": 0.8151516318321228,
245
+ "eval_runtime": 2.1644,
246
+ "eval_samples_per_second": 172.797,
247
+ "eval_steps_per_second": 11.089,
248
+ "step": 1600
249
+ },
250
+ {
251
+ "epoch": 8.33,
252
+ "learning_rate": 1.1959798994974873e-06,
253
+ "loss": 0.393,
254
+ "step": 1700
255
+ },
256
+ {
257
+ "epoch": 8.33,
258
+ "eval_accuracy": 0.8288770318031311,
259
+ "eval_loss": 0.8261328935623169,
260
+ "eval_runtime": 2.171,
261
+ "eval_samples_per_second": 172.27,
262
+ "eval_steps_per_second": 11.055,
263
+ "step": 1700
264
+ },
265
+ {
266
+ "epoch": 8.82,
267
+ "learning_rate": 8.442211055276381e-07,
268
+ "loss": 0.3979,
269
+ "step": 1800
270
+ },
271
+ {
272
+ "epoch": 8.82,
273
+ "eval_accuracy": 0.8235294222831726,
274
+ "eval_loss": 0.8327566385269165,
275
+ "eval_runtime": 2.1392,
276
+ "eval_samples_per_second": 174.828,
277
+ "eval_steps_per_second": 11.219,
278
+ "step": 1800
279
+ },
280
+ {
281
+ "epoch": 9.31,
282
+ "learning_rate": 4.924623115577889e-07,
283
+ "loss": 0.3928,
284
+ "step": 1900
285
+ },
286
+ {
287
+ "epoch": 9.31,
288
+ "eval_accuracy": 0.8208556175231934,
289
+ "eval_loss": 0.8364331126213074,
290
+ "eval_runtime": 2.138,
291
+ "eval_samples_per_second": 174.931,
292
+ "eval_steps_per_second": 11.226,
293
+ "step": 1900
294
+ },
295
+ {
296
+ "epoch": 9.8,
297
+ "learning_rate": 1.4070351758793969e-07,
298
+ "loss": 0.3848,
299
+ "step": 2000
300
+ },
301
+ {
302
+ "epoch": 9.8,
303
+ "eval_accuracy": 0.8235294222831726,
304
+ "eval_loss": 0.8322352170944214,
305
+ "eval_runtime": 2.142,
306
+ "eval_samples_per_second": 174.602,
307
+ "eval_steps_per_second": 11.204,
308
+ "step": 2000
309
+ },
310
+ {
311
+ "epoch": 10.0,
312
+ "step": 2040,
313
+ "total_flos": 1.517670726457344e+16,
314
+ "train_loss": 0.5818920486113605,
315
+ "train_runtime": 787.9964,
316
+ "train_samples_per_second": 41.333,
317
+ "train_steps_per_second": 2.589
318
+ }
319
+ ],
320
+ "max_steps": 2040,
321
+ "num_train_epochs": 10,
322
+ "total_flos": 1.517670726457344e+16,
323
+ "trial_name": null,
324
+ "trial_params": null
325
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c4d0aab9d5096465be0dc70b5c8ad406c93d795069341635b649369d4c01830
3
+ size 3311