Elron committed on
Commit
5b50f24
1 Parent(s): d287d46

Pushing deberta-v3-large-hate to hub

Browse files
README.md ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ tags:
4
+ - generated_from_trainer
5
+ metrics:
6
+ - accuracy
7
+ model-index:
8
+ - name: deberta-v3-large-hate-lr7e-6-gas1-ls0.0
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ # deberta-v3-large-hate-lr7e-6-gas1-ls0.0
16
+
17
+ This model is a fine-tuned version of [microsoft/deberta-v3-large](https://huggingface.co/microsoft/deberta-v3-large) on the TweetEval hate-speech detection dataset (`tweet_eval`, `hate` subset).
18
+ It achieves the following results on the evaluation set at the end of training (epoch 10.0):
19
+ - Loss: 1.5689
20
+ - Accuracy: 0.8028
21
+
22
+ ## Model description
23
+
24
+ More information needed
25
+
26
+ ## Intended uses & limitations
27
+
28
+ More information needed
29
+
30
+ ## Training and evaluation data
31
+
32
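+ The model was trained on `data/tweet_eval/hate/train.csv` (9000 samples) and evaluated during training on `data/tweet_eval/hate/validation.csv` (999 samples), with inputs limited to a maximum sequence length of 256. A separate test run on `data/tweet_eval/hate/test.csv` (2970 samples) reached an accuracy of 0.6128.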
33
+
34
+ ## Training procedure
35
+
36
+ ### Training hyperparameters
37
+
38
+ The following hyperparameters were used during training:
39
+ - learning_rate: 7e-06
40
+ - train_batch_size: 16
41
+ - eval_batch_size: 16
42
+ - seed: 42
43
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
+ - lr_scheduler_type: linear
45
+ - lr_scheduler_warmup_steps: 50
46
+ - num_epochs: 10.0
47
+
48
+ ### Training results
49
+
50
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
51
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|
52
+ | 0.6362 | 0.18 | 100 | 0.5481 | 0.7197 |
53
+ | 0.4264 | 0.36 | 200 | 0.4550 | 0.8008 |
54
+ | 0.4174 | 0.53 | 300 | 0.4524 | 0.7868 |
55
+ | 0.4197 | 0.71 | 400 | 0.4586 | 0.7918 |
56
+ | 0.3819 | 0.89 | 500 | 0.4368 | 0.8078 |
57
+ | 0.3558 | 1.07 | 600 | 0.4525 | 0.8068 |
58
+ | 0.2982 | 1.24 | 700 | 0.4999 | 0.7928 |
59
+ | 0.2885 | 1.42 | 800 | 0.5129 | 0.8108 |
60
+ | 0.253 | 1.6 | 900 | 0.5873 | 0.8208 |
61
+ | 0.3354 | 1.78 | 1000 | 0.4244 | 0.8178 |
62
+ | 0.3083 | 1.95 | 1100 | 0.4853 | 0.8058 |
63
+ | 0.2301 | 2.13 | 1200 | 0.7209 | 0.8018 |
64
+ | 0.2167 | 2.31 | 1300 | 0.8090 | 0.7778 |
65
+ | 0.1863 | 2.49 | 1400 | 0.6812 | 0.8038 |
66
+ | 0.2181 | 2.66 | 1500 | 0.6958 | 0.8138 |
67
+ | 0.2159 | 2.84 | 1600 | 0.6315 | 0.8118 |
68
+ | 0.1828 | 3.02 | 1700 | 0.7173 | 0.8138 |
69
+ | 0.1287 | 3.2 | 1800 | 0.9081 | 0.8018 |
70
+ | 0.1711 | 3.37 | 1900 | 0.8858 | 0.8068 |
71
+ | 0.1598 | 3.55 | 2000 | 0.7878 | 0.8028 |
72
+ | 0.1467 | 3.73 | 2100 | 0.9003 | 0.7948 |
73
+ | 0.127 | 3.91 | 2200 | 0.9066 | 0.8048 |
74
+ | 0.1134 | 4.09 | 2300 | 0.9646 | 0.8118 |
75
+ | 0.1017 | 4.26 | 2400 | 0.9778 | 0.8048 |
76
+ | 0.085 | 4.44 | 2500 | 1.0529 | 0.8088 |
77
+ | 0.0996 | 4.62 | 2600 | 1.0082 | 0.8058 |
78
+ | 0.1054 | 4.8 | 2700 | 0.9698 | 0.8108 |
79
+ | 0.1375 | 4.97 | 2800 | 0.9334 | 0.8048 |
80
+ | 0.0487 | 5.15 | 2900 | 1.1273 | 0.8108 |
81
+ | 0.0611 | 5.33 | 3000 | 1.1528 | 0.8058 |
82
+ | 0.0668 | 5.51 | 3100 | 1.0148 | 0.8118 |
83
+ | 0.0582 | 5.68 | 3200 | 1.1333 | 0.8108 |
84
+ | 0.0869 | 5.86 | 3300 | 1.0607 | 0.8088 |
85
+ | 0.0623 | 6.04 | 3400 | 1.1880 | 0.8068 |
86
+ | 0.0317 | 6.22 | 3500 | 1.2836 | 0.8008 |
87
+ | 0.0546 | 6.39 | 3600 | 1.2148 | 0.8058 |
88
+ | 0.0486 | 6.57 | 3700 | 1.3348 | 0.8008 |
89
+ | 0.0332 | 6.75 | 3800 | 1.3734 | 0.8018 |
90
+ | 0.051 | 6.93 | 3900 | 1.2966 | 0.7978 |
91
+ | 0.0217 | 7.1 | 4000 | 1.3853 | 0.8048 |
92
+ | 0.0109 | 7.28 | 4100 | 1.4803 | 0.8068 |
93
+ | 0.0345 | 7.46 | 4200 | 1.4906 | 0.7998 |
94
+ | 0.0365 | 7.64 | 4300 | 1.4347 | 0.8028 |
95
+ | 0.0265 | 7.82 | 4400 | 1.3977 | 0.8128 |
96
+ | 0.0257 | 7.99 | 4500 | 1.3705 | 0.8108 |
97
+ | 0.0036 | 8.17 | 4600 | 1.4353 | 0.8168 |
98
+ | 0.0269 | 8.35 | 4700 | 1.4826 | 0.8068 |
99
+ | 0.0231 | 8.53 | 4800 | 1.4811 | 0.8118 |
100
+ | 0.0204 | 8.7 | 4900 | 1.5245 | 0.8028 |
101
+ | 0.0263 | 8.88 | 5000 | 1.5123 | 0.8018 |
102
+ | 0.0138 | 9.06 | 5100 | 1.5113 | 0.8028 |
103
+ | 0.0089 | 9.24 | 5200 | 1.5846 | 0.7978 |
104
+ | 0.029 | 9.41 | 5300 | 1.5362 | 0.8008 |
105
+ | 0.0058 | 9.59 | 5400 | 1.5759 | 0.8018 |
106
+ | 0.0084 | 9.77 | 5500 | 1.5679 | 0.8018 |
107
+ | 0.0065 | 9.95 | 5600 | 1.5683 | 0.8028 |
108
+
109
+
110
+ ### Framework versions
111
+
112
+ - Transformers 4.20.0.dev0
113
+ - Pytorch 1.9.0
114
+ - Datasets 2.2.2
115
+ - Tokenizers 0.11.6
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
all_results.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.8028028011322021,
4
+ "eval_loss": 1.568860650062561,
5
+ "eval_runtime": 6.2145,
6
+ "eval_samples": 999,
7
+ "eval_samples_per_second": 160.753,
8
+ "eval_steps_per_second": 10.138,
9
+ "train_loss": 0.13640729715885533,
10
+ "train_runtime": 2182.3127,
11
+ "train_samples": 9000,
12
+ "train_samples_per_second": 41.241,
13
+ "train_steps_per_second": 2.58
14
+ }
config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "microsoft/deberta-v3-large",
3
+ "architectures": [
4
+ "DebertaV2ForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 1024,
10
+ "id2label": {
11
+ "0": 0,
12
+ "1": 1
13
+ },
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 4096,
16
+ "label2id": {
17
+ "0": 0,
18
+ "1": 1
19
+ },
20
+ "layer_norm_eps": 1e-07,
21
+ "max_position_embeddings": 512,
22
+ "max_relative_positions": -1,
23
+ "model_type": "deberta-v2",
24
+ "norm_rel_ebd": "layer_norm",
25
+ "num_attention_heads": 16,
26
+ "num_hidden_layers": 24,
27
+ "pad_token_id": 0,
28
+ "pooler_dropout": 0,
29
+ "pooler_hidden_act": "gelu",
30
+ "pooler_hidden_size": 1024,
31
+ "pos_att_type": [
32
+ "p2c",
33
+ "c2p"
34
+ ],
35
+ "position_biased_input": false,
36
+ "position_buckets": 256,
37
+ "relative_attention": true,
38
+ "share_att_key": true,
39
+ "torch_dtype": "float32",
40
+ "transformers_version": "4.20.0.dev0",
41
+ "type_vocab_size": 0,
42
+ "vocab_size": 128100
43
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_accuracy": 0.8208208084106445,
3
+ "eval_loss": 0.5872611403465271,
4
+ "eval_runtime": 6.1253,
5
+ "eval_samples": 999,
6
+ "eval_samples_per_second": 163.093,
7
+ "eval_steps_per_second": 10.285
8
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab5223d2f36ffa1a5fa517d7faaf2c139d4aa6881beafc0a86c86709ddeb5fbb
3
+ size 1740393387
run_test.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ jbsub -queue x86_1h -cores 4+1 -mem 30g -require a100 -o outputs/train/tweet_eval2/hate/deberta-v3-large-hate-lr7e-6-gas1-ls0.0/test.log /dccstor/tslm/envs/anaconda3/envs/tslm-gen/bin/python train_clf.py --model_name_or_path outputs/train/tweet_eval2/hate/deberta-v3-large-hate-lr7e-6-gas1-ls0.0/best_checkpoint --train_file data/tweet_eval/hate/train.csv --validation_file data/tweet_eval/hate/validation.csv --test_file data/tweet_eval/hate/test.csv --do_eval --do_predict --report_to none --per_device_eval_batch_size 16 --max_seq_length 256 --output_dir outputs/train/tweet_eval2/hate/deberta-v3-large-hate-lr7e-6-gas1-ls0.0/best_checkpoint
run_train.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ jbsub -queue x86_6h -cores 4+1 -mem 30g -require a100 -o outputs/train/tweet_eval2/hate/deberta-v3-large-hate-lr7e-6-gas1-ls0.0/train.log /dccstor/tslm/envs/anaconda3/envs/tslm-gen/bin/python train_clf.py --model_name_or_path microsoft/deberta-v3-large --train_file data/tweet_eval/hate/train.csv --validation_file data/tweet_eval/hate/validation.csv --do_train --do_eval --per_device_train_batch_size 16 --per_device_eval_batch_size 16 --max_seq_length 256 --learning_rate 7e-6 --output_dir outputs/train/tweet_eval2/hate/deberta-v3-large-hate-lr7e-6-gas1-ls0.0 --evaluation_strategy steps --save_strategy no --warmup_steps 50 --num_train_epochs 10 --overwrite_output_dir --logging_steps 100 --gradient_accumulation_steps 1 --label_smoothing_factor 0.0 --report_to clearml --metric_for_best_model accuracy --logging_dir outputs/train/tweet_eval2/hate/deberta-v3-large-hate-lr7e-6-gas1-ls0.0/tb \; rm -rf outputs/train/tweet_eval2/hate/deberta-v3-large-hate-lr7e-6-gas1-ls0.0/tb \; rm -rf outputs/train/tweet_eval2/hate/deberta-v3-large-hate-lr7e-6-gas1-ls0.0/checkpoint-* \; . outputs/train/tweet_eval2/hate/deberta-v3-large-hate-lr7e-6-gas1-ls0.0/run_test.sh
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": "[UNK]"
9
+ }
spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
test_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_accuracy": 0.6127946376800537,
3
+ "eval_loss": 1.4348669052124023,
4
+ "eval_runtime": 17.0067,
5
+ "eval_samples_per_second": 174.637,
6
+ "eval_steps_per_second": 10.937,
7
+ "test_samples": 2970
8
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": false,
5
+ "eos_token": "[SEP]",
6
+ "mask_token": "[MASK]",
7
+ "name_or_path": "microsoft/deberta-v3-large",
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "sp_model_kwargs": {},
11
+ "special_tokens_map_file": null,
12
+ "split_by_punct": false,
13
+ "tokenizer_class": "DebertaV2Tokenizer",
14
+ "unk_token": "[UNK]",
15
+ "vocab_type": "spm"
16
+ }
trainer_state.json ADDED
@@ -0,0 +1,865 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "global_step": 5630,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.18,
12
+ "learning_rate": 6.937275985663082e-06,
13
+ "loss": 0.6362,
14
+ "step": 100
15
+ },
16
+ {
17
+ "epoch": 0.18,
18
+ "eval_accuracy": 0.7197197079658508,
19
+ "eval_loss": 0.5481122136116028,
20
+ "eval_runtime": 6.2072,
21
+ "eval_samples_per_second": 160.941,
22
+ "eval_steps_per_second": 10.149,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.36,
27
+ "learning_rate": 6.811827956989247e-06,
28
+ "loss": 0.4264,
29
+ "step": 200
30
+ },
31
+ {
32
+ "epoch": 0.36,
33
+ "eval_accuracy": 0.8008008003234863,
34
+ "eval_loss": 0.4550396203994751,
35
+ "eval_runtime": 6.2195,
36
+ "eval_samples_per_second": 160.623,
37
+ "eval_steps_per_second": 10.129,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.53,
42
+ "learning_rate": 6.6863799283154114e-06,
43
+ "loss": 0.4174,
44
+ "step": 300
45
+ },
46
+ {
47
+ "epoch": 0.53,
48
+ "eval_accuracy": 0.7867867946624756,
49
+ "eval_loss": 0.452409952878952,
50
+ "eval_runtime": 6.2183,
51
+ "eval_samples_per_second": 160.655,
52
+ "eval_steps_per_second": 10.131,
53
+ "step": 300
54
+ },
55
+ {
56
+ "epoch": 0.71,
57
+ "learning_rate": 6.560931899641577e-06,
58
+ "loss": 0.4197,
59
+ "step": 400
60
+ },
61
+ {
62
+ "epoch": 0.71,
63
+ "eval_accuracy": 0.7917917966842651,
64
+ "eval_loss": 0.4586125910282135,
65
+ "eval_runtime": 6.2441,
66
+ "eval_samples_per_second": 159.991,
67
+ "eval_steps_per_second": 10.09,
68
+ "step": 400
69
+ },
70
+ {
71
+ "epoch": 0.89,
72
+ "learning_rate": 6.435483870967742e-06,
73
+ "loss": 0.3819,
74
+ "step": 500
75
+ },
76
+ {
77
+ "epoch": 0.89,
78
+ "eval_accuracy": 0.8078078031539917,
79
+ "eval_loss": 0.4367608428001404,
80
+ "eval_runtime": 6.2213,
81
+ "eval_samples_per_second": 160.577,
82
+ "eval_steps_per_second": 10.126,
83
+ "step": 500
84
+ },
85
+ {
86
+ "epoch": 1.07,
87
+ "learning_rate": 6.310035842293907e-06,
88
+ "loss": 0.3558,
89
+ "step": 600
90
+ },
91
+ {
92
+ "epoch": 1.07,
93
+ "eval_accuracy": 0.8068068027496338,
94
+ "eval_loss": 0.4524727463722229,
95
+ "eval_runtime": 6.2342,
96
+ "eval_samples_per_second": 160.246,
97
+ "eval_steps_per_second": 10.106,
98
+ "step": 600
99
+ },
100
+ {
101
+ "epoch": 1.24,
102
+ "learning_rate": 6.184587813620071e-06,
103
+ "loss": 0.2982,
104
+ "step": 700
105
+ },
106
+ {
107
+ "epoch": 1.24,
108
+ "eval_accuracy": 0.792792797088623,
109
+ "eval_loss": 0.49992287158966064,
110
+ "eval_runtime": 6.206,
111
+ "eval_samples_per_second": 160.973,
112
+ "eval_steps_per_second": 10.151,
113
+ "step": 700
114
+ },
115
+ {
116
+ "epoch": 1.42,
117
+ "learning_rate": 6.059139784946236e-06,
118
+ "loss": 0.2885,
119
+ "step": 800
120
+ },
121
+ {
122
+ "epoch": 1.42,
123
+ "eval_accuracy": 0.8108108043670654,
124
+ "eval_loss": 0.5129059553146362,
125
+ "eval_runtime": 6.2199,
126
+ "eval_samples_per_second": 160.613,
127
+ "eval_steps_per_second": 10.129,
128
+ "step": 800
129
+ },
130
+ {
131
+ "epoch": 1.6,
132
+ "learning_rate": 5.933691756272401e-06,
133
+ "loss": 0.253,
134
+ "step": 900
135
+ },
136
+ {
137
+ "epoch": 1.6,
138
+ "eval_accuracy": 0.8208208084106445,
139
+ "eval_loss": 0.5872611403465271,
140
+ "eval_runtime": 6.2332,
141
+ "eval_samples_per_second": 160.27,
142
+ "eval_steps_per_second": 10.107,
143
+ "step": 900
144
+ },
145
+ {
146
+ "epoch": 1.78,
147
+ "learning_rate": 5.8082437275985665e-06,
148
+ "loss": 0.3354,
149
+ "step": 1000
150
+ },
151
+ {
152
+ "epoch": 1.78,
153
+ "eval_accuracy": 0.8178178071975708,
154
+ "eval_loss": 0.4244420826435089,
155
+ "eval_runtime": 6.2275,
156
+ "eval_samples_per_second": 160.417,
157
+ "eval_steps_per_second": 10.116,
158
+ "step": 1000
159
+ },
160
+ {
161
+ "epoch": 1.95,
162
+ "learning_rate": 5.682795698924731e-06,
163
+ "loss": 0.3083,
164
+ "step": 1100
165
+ },
166
+ {
167
+ "epoch": 1.95,
168
+ "eval_accuracy": 0.8058058023452759,
169
+ "eval_loss": 0.4852960705757141,
170
+ "eval_runtime": 6.2193,
171
+ "eval_samples_per_second": 160.63,
172
+ "eval_steps_per_second": 10.13,
173
+ "step": 1100
174
+ },
175
+ {
176
+ "epoch": 2.13,
177
+ "learning_rate": 5.557347670250896e-06,
178
+ "loss": 0.2301,
179
+ "step": 1200
180
+ },
181
+ {
182
+ "epoch": 2.13,
183
+ "eval_accuracy": 0.8018018007278442,
184
+ "eval_loss": 0.7208853960037231,
185
+ "eval_runtime": 6.2021,
186
+ "eval_samples_per_second": 161.075,
187
+ "eval_steps_per_second": 10.158,
188
+ "step": 1200
189
+ },
190
+ {
191
+ "epoch": 2.31,
192
+ "learning_rate": 5.431899641577061e-06,
193
+ "loss": 0.2167,
194
+ "step": 1300
195
+ },
196
+ {
197
+ "epoch": 2.31,
198
+ "eval_accuracy": 0.7777777910232544,
199
+ "eval_loss": 0.8089737892150879,
200
+ "eval_runtime": 6.2037,
201
+ "eval_samples_per_second": 161.034,
202
+ "eval_steps_per_second": 10.155,
203
+ "step": 1300
204
+ },
205
+ {
206
+ "epoch": 2.49,
207
+ "learning_rate": 5.306451612903225e-06,
208
+ "loss": 0.1863,
209
+ "step": 1400
210
+ },
211
+ {
212
+ "epoch": 2.49,
213
+ "eval_accuracy": 0.8038038015365601,
214
+ "eval_loss": 0.6812323927879333,
215
+ "eval_runtime": 6.2398,
216
+ "eval_samples_per_second": 160.102,
217
+ "eval_steps_per_second": 10.097,
218
+ "step": 1400
219
+ },
220
+ {
221
+ "epoch": 2.66,
222
+ "learning_rate": 5.181003584229391e-06,
223
+ "loss": 0.2181,
224
+ "step": 1500
225
+ },
226
+ {
227
+ "epoch": 2.66,
228
+ "eval_accuracy": 0.8138138055801392,
229
+ "eval_loss": 0.6958026885986328,
230
+ "eval_runtime": 6.2122,
231
+ "eval_samples_per_second": 160.812,
232
+ "eval_steps_per_second": 10.141,
233
+ "step": 1500
234
+ },
235
+ {
236
+ "epoch": 2.84,
237
+ "learning_rate": 5.0555555555555555e-06,
238
+ "loss": 0.2159,
239
+ "step": 1600
240
+ },
241
+ {
242
+ "epoch": 2.84,
243
+ "eval_accuracy": 0.8118118047714233,
244
+ "eval_loss": 0.6314735412597656,
245
+ "eval_runtime": 6.2306,
246
+ "eval_samples_per_second": 160.337,
247
+ "eval_steps_per_second": 10.111,
248
+ "step": 1600
249
+ },
250
+ {
251
+ "epoch": 3.02,
252
+ "learning_rate": 4.930107526881721e-06,
253
+ "loss": 0.1828,
254
+ "step": 1700
255
+ },
256
+ {
257
+ "epoch": 3.02,
258
+ "eval_accuracy": 0.8138138055801392,
259
+ "eval_loss": 0.7173236608505249,
260
+ "eval_runtime": 6.2107,
261
+ "eval_samples_per_second": 160.851,
262
+ "eval_steps_per_second": 10.144,
263
+ "step": 1700
264
+ },
265
+ {
266
+ "epoch": 3.2,
267
+ "learning_rate": 4.804659498207885e-06,
268
+ "loss": 0.1287,
269
+ "step": 1800
270
+ },
271
+ {
272
+ "epoch": 3.2,
273
+ "eval_accuracy": 0.8018018007278442,
274
+ "eval_loss": 0.9080932140350342,
275
+ "eval_runtime": 6.2027,
276
+ "eval_samples_per_second": 161.06,
277
+ "eval_steps_per_second": 10.157,
278
+ "step": 1800
279
+ },
280
+ {
281
+ "epoch": 3.37,
282
+ "learning_rate": 4.67921146953405e-06,
283
+ "loss": 0.1711,
284
+ "step": 1900
285
+ },
286
+ {
287
+ "epoch": 3.37,
288
+ "eval_accuracy": 0.8068068027496338,
289
+ "eval_loss": 0.8858422040939331,
290
+ "eval_runtime": 6.2188,
291
+ "eval_samples_per_second": 160.641,
292
+ "eval_steps_per_second": 10.131,
293
+ "step": 1900
294
+ },
295
+ {
296
+ "epoch": 3.55,
297
+ "learning_rate": 4.553763440860215e-06,
298
+ "loss": 0.1598,
299
+ "step": 2000
300
+ },
301
+ {
302
+ "epoch": 3.55,
303
+ "eval_accuracy": 0.8028028011322021,
304
+ "eval_loss": 0.7877860069274902,
305
+ "eval_runtime": 6.2062,
306
+ "eval_samples_per_second": 160.967,
307
+ "eval_steps_per_second": 10.151,
308
+ "step": 2000
309
+ },
310
+ {
311
+ "epoch": 3.73,
312
+ "learning_rate": 4.42831541218638e-06,
313
+ "loss": 0.1467,
314
+ "step": 2100
315
+ },
316
+ {
317
+ "epoch": 3.73,
318
+ "eval_accuracy": 0.7947947978973389,
319
+ "eval_loss": 0.900332510471344,
320
+ "eval_runtime": 6.2358,
321
+ "eval_samples_per_second": 160.203,
322
+ "eval_steps_per_second": 10.103,
323
+ "step": 2100
324
+ },
325
+ {
326
+ "epoch": 3.91,
327
+ "learning_rate": 4.302867383512545e-06,
328
+ "loss": 0.127,
329
+ "step": 2200
330
+ },
331
+ {
332
+ "epoch": 3.91,
333
+ "eval_accuracy": 0.804804801940918,
334
+ "eval_loss": 0.9066368341445923,
335
+ "eval_runtime": 6.2129,
336
+ "eval_samples_per_second": 160.795,
337
+ "eval_steps_per_second": 10.14,
338
+ "step": 2200
339
+ },
340
+ {
341
+ "epoch": 4.09,
342
+ "learning_rate": 4.17741935483871e-06,
343
+ "loss": 0.1134,
344
+ "step": 2300
345
+ },
346
+ {
347
+ "epoch": 4.09,
348
+ "eval_accuracy": 0.8118118047714233,
349
+ "eval_loss": 0.9645766615867615,
350
+ "eval_runtime": 6.2157,
351
+ "eval_samples_per_second": 160.721,
352
+ "eval_steps_per_second": 10.136,
353
+ "step": 2300
354
+ },
355
+ {
356
+ "epoch": 4.26,
357
+ "learning_rate": 4.051971326164874e-06,
358
+ "loss": 0.1017,
359
+ "step": 2400
360
+ },
361
+ {
362
+ "epoch": 4.26,
363
+ "eval_accuracy": 0.804804801940918,
364
+ "eval_loss": 0.9778422713279724,
365
+ "eval_runtime": 6.2303,
366
+ "eval_samples_per_second": 160.346,
367
+ "eval_steps_per_second": 10.112,
368
+ "step": 2400
369
+ },
370
+ {
371
+ "epoch": 4.44,
372
+ "learning_rate": 3.926523297491039e-06,
373
+ "loss": 0.085,
374
+ "step": 2500
375
+ },
376
+ {
377
+ "epoch": 4.44,
378
+ "eval_accuracy": 0.8088088035583496,
379
+ "eval_loss": 1.0528582334518433,
380
+ "eval_runtime": 6.238,
381
+ "eval_samples_per_second": 160.149,
382
+ "eval_steps_per_second": 10.099,
383
+ "step": 2500
384
+ },
385
+ {
386
+ "epoch": 4.62,
387
+ "learning_rate": 3.801075268817204e-06,
388
+ "loss": 0.0996,
389
+ "step": 2600
390
+ },
391
+ {
392
+ "epoch": 4.62,
393
+ "eval_accuracy": 0.8058058023452759,
394
+ "eval_loss": 1.0082268714904785,
395
+ "eval_runtime": 6.2065,
396
+ "eval_samples_per_second": 160.961,
397
+ "eval_steps_per_second": 10.151,
398
+ "step": 2600
399
+ },
400
+ {
401
+ "epoch": 4.8,
402
+ "learning_rate": 3.6756272401433694e-06,
403
+ "loss": 0.1054,
404
+ "step": 2700
405
+ },
406
+ {
407
+ "epoch": 4.8,
408
+ "eval_accuracy": 0.8108108043670654,
409
+ "eval_loss": 0.9697705507278442,
410
+ "eval_runtime": 6.2348,
411
+ "eval_samples_per_second": 160.231,
412
+ "eval_steps_per_second": 10.105,
413
+ "step": 2700
414
+ },
415
+ {
416
+ "epoch": 4.97,
417
+ "learning_rate": 3.5501792114695336e-06,
418
+ "loss": 0.1375,
419
+ "step": 2800
420
+ },
421
+ {
422
+ "epoch": 4.97,
423
+ "eval_accuracy": 0.804804801940918,
424
+ "eval_loss": 0.9333746433258057,
425
+ "eval_runtime": 6.2109,
426
+ "eval_samples_per_second": 160.846,
427
+ "eval_steps_per_second": 10.143,
428
+ "step": 2800
429
+ },
430
+ {
431
+ "epoch": 5.15,
432
+ "learning_rate": 3.4247311827956988e-06,
433
+ "loss": 0.0487,
434
+ "step": 2900
435
+ },
436
+ {
437
+ "epoch": 5.15,
438
+ "eval_accuracy": 0.8108108043670654,
439
+ "eval_loss": 1.1273365020751953,
440
+ "eval_runtime": 6.2065,
441
+ "eval_samples_per_second": 160.961,
442
+ "eval_steps_per_second": 10.151,
443
+ "step": 2900
444
+ },
445
+ {
446
+ "epoch": 5.33,
447
+ "learning_rate": 3.299283154121864e-06,
448
+ "loss": 0.0611,
449
+ "step": 3000
450
+ },
451
+ {
452
+ "epoch": 5.33,
453
+ "eval_accuracy": 0.8058058023452759,
454
+ "eval_loss": 1.1528337001800537,
455
+ "eval_runtime": 6.2119,
456
+ "eval_samples_per_second": 160.821,
457
+ "eval_steps_per_second": 10.142,
458
+ "step": 3000
459
+ },
460
+ {
461
+ "epoch": 5.51,
462
+ "learning_rate": 3.1738351254480286e-06,
463
+ "loss": 0.0668,
464
+ "step": 3100
465
+ },
466
+ {
467
+ "epoch": 5.51,
468
+ "eval_accuracy": 0.8118118047714233,
469
+ "eval_loss": 1.0147671699523926,
470
+ "eval_runtime": 6.2218,
471
+ "eval_samples_per_second": 160.564,
472
+ "eval_steps_per_second": 10.126,
473
+ "step": 3100
474
+ },
475
+ {
476
+ "epoch": 5.68,
477
+ "learning_rate": 3.0483870967741937e-06,
478
+ "loss": 0.0582,
479
+ "step": 3200
480
+ },
481
+ {
482
+ "epoch": 5.68,
483
+ "eval_accuracy": 0.8108108043670654,
484
+ "eval_loss": 1.1332666873931885,
485
+ "eval_runtime": 6.2186,
486
+ "eval_samples_per_second": 160.648,
487
+ "eval_steps_per_second": 10.131,
488
+ "step": 3200
489
+ },
490
+ {
491
+ "epoch": 5.86,
492
+ "learning_rate": 2.9229390681003584e-06,
493
+ "loss": 0.0869,
494
+ "step": 3300
495
+ },
496
+ {
497
+ "epoch": 5.86,
498
+ "eval_accuracy": 0.8088088035583496,
499
+ "eval_loss": 1.060727596282959,
500
+ "eval_runtime": 6.1932,
501
+ "eval_samples_per_second": 161.305,
502
+ "eval_steps_per_second": 10.172,
503
+ "step": 3300
504
+ },
505
+ {
506
+ "epoch": 6.04,
507
+ "learning_rate": 2.797491039426523e-06,
508
+ "loss": 0.0623,
509
+ "step": 3400
510
+ },
511
+ {
512
+ "epoch": 6.04,
513
+ "eval_accuracy": 0.8068068027496338,
514
+ "eval_loss": 1.1880476474761963,
515
+ "eval_runtime": 6.2192,
516
+ "eval_samples_per_second": 160.631,
517
+ "eval_steps_per_second": 10.13,
518
+ "step": 3400
519
+ },
520
+ {
521
+ "epoch": 6.22,
522
+ "learning_rate": 2.6720430107526883e-06,
523
+ "loss": 0.0317,
524
+ "step": 3500
525
+ },
526
+ {
527
+ "epoch": 6.22,
528
+ "eval_accuracy": 0.8008008003234863,
529
+ "eval_loss": 1.2836244106292725,
530
+ "eval_runtime": 6.2079,
531
+ "eval_samples_per_second": 160.925,
532
+ "eval_steps_per_second": 10.148,
533
+ "step": 3500
534
+ },
535
+ {
536
+ "epoch": 6.39,
537
+ "learning_rate": 2.546594982078853e-06,
538
+ "loss": 0.0546,
539
+ "step": 3600
540
+ },
541
+ {
542
+ "epoch": 6.39,
543
+ "eval_accuracy": 0.8058058023452759,
544
+ "eval_loss": 1.2147704362869263,
545
+ "eval_runtime": 6.2243,
546
+ "eval_samples_per_second": 160.501,
547
+ "eval_steps_per_second": 10.122,
548
+ "step": 3600
549
+ },
550
+ {
551
+ "epoch": 6.57,
552
+ "learning_rate": 2.4211469534050177e-06,
553
+ "loss": 0.0486,
554
+ "step": 3700
555
+ },
556
+ {
557
+ "epoch": 6.57,
558
+ "eval_accuracy": 0.8008008003234863,
559
+ "eval_loss": 1.334807276725769,
560
+ "eval_runtime": 6.1963,
561
+ "eval_samples_per_second": 161.225,
562
+ "eval_steps_per_second": 10.167,
563
+ "step": 3700
564
+ },
565
+ {
566
+ "epoch": 6.75,
567
+ "learning_rate": 2.2956989247311828e-06,
568
+ "loss": 0.0332,
569
+ "step": 3800
570
+ },
571
+ {
572
+ "epoch": 6.75,
573
+ "eval_accuracy": 0.8018018007278442,
574
+ "eval_loss": 1.3734461069107056,
575
+ "eval_runtime": 6.3321,
576
+ "eval_samples_per_second": 157.768,
577
+ "eval_steps_per_second": 9.949,
578
+ "step": 3800
579
+ },
580
+ {
581
+ "epoch": 6.93,
582
+ "learning_rate": 2.1702508960573475e-06,
583
+ "loss": 0.051,
584
+ "step": 3900
585
+ },
586
+ {
587
+ "epoch": 6.93,
588
+ "eval_accuracy": 0.7977977991104126,
589
+ "eval_loss": 1.2966439723968506,
590
+ "eval_runtime": 6.2073,
591
+ "eval_samples_per_second": 160.94,
592
+ "eval_steps_per_second": 10.149,
593
+ "step": 3900
594
+ },
595
+ {
596
+ "epoch": 7.1,
597
+ "learning_rate": 2.044802867383512e-06,
598
+ "loss": 0.0217,
599
+ "step": 4000
600
+ },
601
+ {
602
+ "epoch": 7.1,
603
+ "eval_accuracy": 0.804804801940918,
604
+ "eval_loss": 1.385273814201355,
605
+ "eval_runtime": 6.2117,
606
+ "eval_samples_per_second": 160.826,
607
+ "eval_steps_per_second": 10.142,
608
+ "step": 4000
609
+ },
610
+ {
611
+ "epoch": 7.28,
612
+ "learning_rate": 1.9193548387096773e-06,
613
+ "loss": 0.0109,
614
+ "step": 4100
615
+ },
616
+ {
617
+ "epoch": 7.28,
618
+ "eval_accuracy": 0.8068068027496338,
619
+ "eval_loss": 1.480326533317566,
620
+ "eval_runtime": 6.2106,
621
+ "eval_samples_per_second": 160.854,
622
+ "eval_steps_per_second": 10.144,
623
+ "step": 4100
624
+ },
625
+ {
626
+ "epoch": 7.46,
627
+ "learning_rate": 1.793906810035842e-06,
628
+ "loss": 0.0345,
629
+ "step": 4200
630
+ },
631
+ {
632
+ "epoch": 7.46,
633
+ "eval_accuracy": 0.7997997999191284,
634
+ "eval_loss": 1.4906260967254639,
635
+ "eval_runtime": 6.2002,
636
+ "eval_samples_per_second": 161.124,
637
+ "eval_steps_per_second": 10.161,
638
+ "step": 4200
639
+ },
640
+ {
641
+ "epoch": 7.64,
642
+ "learning_rate": 1.6684587813620071e-06,
643
+ "loss": 0.0365,
644
+ "step": 4300
645
+ },
646
+ {
647
+ "epoch": 7.64,
648
+ "eval_accuracy": 0.8028028011322021,
649
+ "eval_loss": 1.4347106218338013,
650
+ "eval_runtime": 6.2133,
651
+ "eval_samples_per_second": 160.783,
652
+ "eval_steps_per_second": 10.139,
653
+ "step": 4300
654
+ },
655
+ {
656
+ "epoch": 7.82,
657
+ "learning_rate": 1.543010752688172e-06,
658
+ "loss": 0.0265,
659
+ "step": 4400
660
+ },
661
+ {
662
+ "epoch": 7.82,
663
+ "eval_accuracy": 0.8128128051757812,
664
+ "eval_loss": 1.3976863622665405,
665
+ "eval_runtime": 6.224,
666
+ "eval_samples_per_second": 160.508,
667
+ "eval_steps_per_second": 10.122,
668
+ "step": 4400
669
+ },
670
+ {
671
+ "epoch": 7.99,
672
+ "learning_rate": 1.417562724014337e-06,
673
+ "loss": 0.0257,
674
+ "step": 4500
675
+ },
676
+ {
677
+ "epoch": 7.99,
678
+ "eval_accuracy": 0.8108108043670654,
679
+ "eval_loss": 1.370467185974121,
680
+ "eval_runtime": 6.2313,
681
+ "eval_samples_per_second": 160.321,
682
+ "eval_steps_per_second": 10.11,
683
+ "step": 4500
684
+ },
685
+ {
686
+ "epoch": 8.17,
687
+ "learning_rate": 1.2921146953405017e-06,
688
+ "loss": 0.0036,
689
+ "step": 4600
690
+ },
691
+ {
692
+ "epoch": 8.17,
693
+ "eval_accuracy": 0.8168168067932129,
694
+ "eval_loss": 1.4352822303771973,
695
+ "eval_runtime": 6.2072,
696
+ "eval_samples_per_second": 160.943,
697
+ "eval_steps_per_second": 10.15,
698
+ "step": 4600
699
+ },
700
+ {
701
+ "epoch": 8.35,
702
+ "learning_rate": 1.1666666666666666e-06,
703
+ "loss": 0.0269,
704
+ "step": 4700
705
+ },
706
+ {
707
+ "epoch": 8.35,
708
+ "eval_accuracy": 0.8068068027496338,
709
+ "eval_loss": 1.4826140403747559,
710
+ "eval_runtime": 6.2178,
711
+ "eval_samples_per_second": 160.669,
712
+ "eval_steps_per_second": 10.132,
713
+ "step": 4700
714
+ },
715
+ {
716
+ "epoch": 8.53,
717
+ "learning_rate": 1.0412186379928315e-06,
718
+ "loss": 0.0231,
719
+ "step": 4800
720
+ },
721
+ {
722
+ "epoch": 8.53,
723
+ "eval_accuracy": 0.8118118047714233,
724
+ "eval_loss": 1.4810999631881714,
725
+ "eval_runtime": 6.3061,
726
+ "eval_samples_per_second": 158.417,
727
+ "eval_steps_per_second": 9.99,
728
+ "step": 4800
729
+ },
730
+ {
731
+ "epoch": 8.7,
732
+ "learning_rate": 9.157706093189965e-07,
733
+ "loss": 0.0204,
734
+ "step": 4900
735
+ },
736
+ {
737
+ "epoch": 8.7,
738
+ "eval_accuracy": 0.8028028011322021,
739
+ "eval_loss": 1.5245323181152344,
740
+ "eval_runtime": 6.2057,
741
+ "eval_samples_per_second": 160.982,
742
+ "eval_steps_per_second": 10.152,
743
+ "step": 4900
744
+ },
745
+ {
746
+ "epoch": 8.88,
747
+ "learning_rate": 7.903225806451612e-07,
748
+ "loss": 0.0263,
749
+ "step": 5000
750
+ },
751
+ {
752
+ "epoch": 8.88,
753
+ "eval_accuracy": 0.8018018007278442,
754
+ "eval_loss": 1.5123308897018433,
755
+ "eval_runtime": 6.2053,
756
+ "eval_samples_per_second": 160.991,
757
+ "eval_steps_per_second": 10.153,
758
+ "step": 5000
759
+ },
760
+ {
761
+ "epoch": 9.06,
762
+ "learning_rate": 6.648745519713261e-07,
763
+ "loss": 0.0138,
764
+ "step": 5100
765
+ },
766
+ {
767
+ "epoch": 9.06,
768
+ "eval_accuracy": 0.8028028011322021,
769
+ "eval_loss": 1.51128089427948,
770
+ "eval_runtime": 6.2898,
771
+ "eval_samples_per_second": 158.83,
772
+ "eval_steps_per_second": 10.016,
773
+ "step": 5100
774
+ },
775
+ {
776
+ "epoch": 9.24,
777
+ "learning_rate": 5.39426523297491e-07,
778
+ "loss": 0.0089,
779
+ "step": 5200
780
+ },
781
+ {
782
+ "epoch": 9.24,
783
+ "eval_accuracy": 0.7977977991104126,
784
+ "eval_loss": 1.5846397876739502,
785
+ "eval_runtime": 6.2124,
786
+ "eval_samples_per_second": 160.808,
787
+ "eval_steps_per_second": 10.141,
788
+ "step": 5200
789
+ },
790
+ {
791
+ "epoch": 9.41,
792
+ "learning_rate": 4.1397849462365595e-07,
793
+ "loss": 0.029,
794
+ "step": 5300
795
+ },
796
+ {
797
+ "epoch": 9.41,
798
+ "eval_accuracy": 0.8008008003234863,
799
+ "eval_loss": 1.5361814498901367,
800
+ "eval_runtime": 6.2541,
801
+ "eval_samples_per_second": 159.736,
802
+ "eval_steps_per_second": 10.073,
803
+ "step": 5300
804
+ },
805
+ {
806
+ "epoch": 9.59,
807
+ "learning_rate": 2.8853046594982076e-07,
808
+ "loss": 0.0058,
809
+ "step": 5400
810
+ },
811
+ {
812
+ "epoch": 9.59,
813
+ "eval_accuracy": 0.8018018007278442,
814
+ "eval_loss": 1.5759379863739014,
815
+ "eval_runtime": 6.221,
816
+ "eval_samples_per_second": 160.585,
817
+ "eval_steps_per_second": 10.127,
818
+ "step": 5400
819
+ },
820
+ {
821
+ "epoch": 9.77,
822
+ "learning_rate": 1.6308243727598568e-07,
823
+ "loss": 0.0084,
824
+ "step": 5500
825
+ },
826
+ {
827
+ "epoch": 9.77,
828
+ "eval_accuracy": 0.8018018007278442,
829
+ "eval_loss": 1.5678976774215698,
830
+ "eval_runtime": 6.2009,
831
+ "eval_samples_per_second": 161.105,
832
+ "eval_steps_per_second": 10.16,
833
+ "step": 5500
834
+ },
835
+ {
836
+ "epoch": 9.95,
837
+ "learning_rate": 3.763440860215054e-08,
838
+ "loss": 0.0065,
839
+ "step": 5600
840
+ },
841
+ {
842
+ "epoch": 9.95,
843
+ "eval_accuracy": 0.8028028011322021,
844
+ "eval_loss": 1.568334937095642,
845
+ "eval_runtime": 6.2439,
846
+ "eval_samples_per_second": 159.996,
847
+ "eval_steps_per_second": 10.09,
848
+ "step": 5600
849
+ },
850
+ {
851
+ "epoch": 10.0,
852
+ "step": 5630,
853
+ "total_flos": 4.193719446528e+16,
854
+ "train_loss": 0.13640729715885533,
855
+ "train_runtime": 2182.3127,
856
+ "train_samples_per_second": 41.241,
857
+ "train_steps_per_second": 2.58
858
+ }
859
+ ],
860
+ "max_steps": 5630,
861
+ "num_train_epochs": 10,
862
+ "total_flos": 4.193719446528e+16,
863
+ "trial_name": null,
864
+ "trial_params": null
865
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:905229a66e041e28383381e29a8c40d438d511982854d7195ae501648f5d83ad
3
+ size 3311