KexuanShi commited on
Commit
9fd932d
·
verified ·
1 Parent(s): 793cf6c

Upload 84 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. cola/bert-base-uncased_lr1e-05/config.json +26 -0
  2. cola/bert-base-uncased_lr1e-05/model.safetensors +3 -0
  3. cola/bert-base-uncased_lr1e-05/special_tokens_map.json +7 -0
  4. cola/bert-base-uncased_lr1e-05/tokenizer.json +0 -0
  5. cola/bert-base-uncased_lr1e-05/tokenizer_config.json +55 -0
  6. cola/bert-base-uncased_lr1e-05/trainer_state.json +195 -0
  7. cola/bert-base-uncased_lr1e-05/training_args.bin +3 -0
  8. cola/bert-base-uncased_lr1e-05/vocab.txt +0 -0
  9. cola/roberta-base_lr1e-05/classifier_head.pt +3 -0
  10. cola/roberta-base_lr1e-05/config.json +27 -0
  11. cola/roberta-base_lr1e-05/merges.txt +0 -0
  12. cola/roberta-base_lr1e-05/model.safetensors +3 -0
  13. cola/roberta-base_lr1e-05/special_tokens_map.json +15 -0
  14. cola/roberta-base_lr1e-05/tokenizer.json +0 -0
  15. cola/roberta-base_lr1e-05/tokenizer_config.json +57 -0
  16. cola/roberta-base_lr1e-05/trainer_state.json +195 -0
  17. cola/roberta-base_lr1e-05/training_args.bin +3 -0
  18. cola/roberta-base_lr1e-05/vocab.json +0 -0
  19. cola/roberta-large_lr1e-05/classifier_head.pt +3 -0
  20. cola/roberta-large_lr1e-05/config.json +27 -0
  21. cola/roberta-large_lr1e-05/merges.txt +0 -0
  22. cola/roberta-large_lr1e-05/model.safetensors +3 -0
  23. cola/roberta-large_lr1e-05/special_tokens_map.json +15 -0
  24. cola/roberta-large_lr1e-05/tokenizer.json +0 -0
  25. cola/roberta-large_lr1e-05/tokenizer_config.json +57 -0
  26. cola/roberta-large_lr1e-05/trainer_state.json +195 -0
  27. cola/roberta-large_lr1e-05/training_args.bin +3 -0
  28. cola/roberta-large_lr1e-05/vocab.json +0 -0
  29. mnli/bert-base-uncased_lr1e-05/config.json +36 -0
  30. mnli/bert-base-uncased_lr1e-05/model.safetensors +3 -0
  31. mnli/bert-base-uncased_lr1e-05/special_tokens_map.json +7 -0
  32. mnli/bert-base-uncased_lr1e-05/tokenizer.json +0 -0
  33. mnli/bert-base-uncased_lr1e-05/tokenizer_config.json +55 -0
  34. mnli/bert-base-uncased_lr1e-05/trainer_state.json +3219 -0
  35. mnli/bert-base-uncased_lr1e-05/training_args.bin +3 -0
  36. mnli/bert-base-uncased_lr1e-05/vocab.txt +0 -0
  37. mnli/roberta-base_lr1e-05/classifier_head.pt +3 -0
  38. mnli/roberta-base_lr1e-05/config.json +37 -0
  39. mnli/roberta-base_lr1e-05/merges.txt +0 -0
  40. mnli/roberta-base_lr1e-05/model.safetensors +3 -0
  41. mnli/roberta-base_lr1e-05/special_tokens_map.json +15 -0
  42. mnli/roberta-base_lr1e-05/tokenizer.json +0 -0
  43. mnli/roberta-base_lr1e-05/tokenizer_config.json +57 -0
  44. mnli/roberta-base_lr1e-05/trainer_state.json +3219 -0
  45. mnli/roberta-base_lr1e-05/training_args.bin +3 -0
  46. mnli/roberta-base_lr1e-05/vocab.json +0 -0
  47. mnli/roberta-large_lr1e-05/classifier_head.pt +3 -0
  48. mnli/roberta-large_lr1e-05/config.json +37 -0
  49. mnli/roberta-large_lr1e-05/merges.txt +0 -0
  50. mnli/roberta-large_lr1e-05/model.safetensors +3 -0
cola/bert-base-uncased_lr1e-05/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/shikexuan/nlu_model/bert-base-uncased",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.45.2",
23
+ "type_vocab_size": 2,
24
+ "use_cache": true,
25
+ "vocab_size": 30522
26
+ }
cola/bert-base-uncased_lr1e-05/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e65214c1cd06c7ace891c6b813034486238cf891179509facb05ae5e592b49dc
3
+ size 437958648
cola/bert-base-uncased_lr1e-05/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
cola/bert-base-uncased_lr1e-05/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
cola/bert-base-uncased_lr1e-05/tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "mask_token": "[MASK]",
48
+ "model_max_length": 512,
49
+ "pad_token": "[PAD]",
50
+ "sep_token": "[SEP]",
51
+ "strip_accents": null,
52
+ "tokenize_chinese_chars": true,
53
+ "tokenizer_class": "BertTokenizer",
54
+ "unk_token": "[UNK]"
55
+ }
cola/bert-base-uncased_lr1e-05/trainer_state.json ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6033268648340611,
3
+ "best_model_checkpoint": "./nlu_finetuned_models/cola/bert-base-uncased_lr1e-05/checkpoint-3367",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 4810,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_loss": 0.4246382415294647,
14
+ "eval_matthews_correlation": 0.5212265735601975,
15
+ "eval_runtime": 0.4359,
16
+ "eval_samples_per_second": 1963.789,
17
+ "eval_steps_per_second": 123.884,
18
+ "step": 481
19
+ },
20
+ {
21
+ "epoch": 1.0395010395010396,
22
+ "grad_norm": 22.556371688842773,
23
+ "learning_rate": 9.533289095332891e-06,
24
+ "loss": 0.524,
25
+ "step": 500
26
+ },
27
+ {
28
+ "epoch": 2.0,
29
+ "eval_loss": 0.4183749258518219,
30
+ "eval_matthews_correlation": 0.5182707021157785,
31
+ "eval_runtime": 0.4174,
32
+ "eval_samples_per_second": 2050.804,
33
+ "eval_steps_per_second": 129.373,
34
+ "step": 962
35
+ },
36
+ {
37
+ "epoch": 2.079002079002079,
38
+ "grad_norm": 11.332476615905762,
39
+ "learning_rate": 8.427339084273391e-06,
40
+ "loss": 0.3607,
41
+ "step": 1000
42
+ },
43
+ {
44
+ "epoch": 3.0,
45
+ "eval_loss": 0.4737670123577118,
46
+ "eval_matthews_correlation": 0.5528480933747183,
47
+ "eval_runtime": 0.4152,
48
+ "eval_samples_per_second": 2061.677,
49
+ "eval_steps_per_second": 130.059,
50
+ "step": 1443
51
+ },
52
+ {
53
+ "epoch": 3.1185031185031185,
54
+ "grad_norm": 1.7414240837097168,
55
+ "learning_rate": 7.3213890732138915e-06,
56
+ "loss": 0.2474,
57
+ "step": 1500
58
+ },
59
+ {
60
+ "epoch": 4.0,
61
+ "eval_loss": 0.604235053062439,
62
+ "eval_matthews_correlation": 0.5775461143960999,
63
+ "eval_runtime": 0.4144,
64
+ "eval_samples_per_second": 2065.494,
65
+ "eval_steps_per_second": 130.3,
66
+ "step": 1924
67
+ },
68
+ {
69
+ "epoch": 4.158004158004158,
70
+ "grad_norm": 1.0630062818527222,
71
+ "learning_rate": 6.2154390621543915e-06,
72
+ "loss": 0.1789,
73
+ "step": 2000
74
+ },
75
+ {
76
+ "epoch": 5.0,
77
+ "eval_loss": 0.6851717233657837,
78
+ "eval_matthews_correlation": 0.5868698844413494,
79
+ "eval_runtime": 0.4073,
80
+ "eval_samples_per_second": 2101.732,
81
+ "eval_steps_per_second": 132.586,
82
+ "step": 2405
83
+ },
84
+ {
85
+ "epoch": 5.197505197505198,
86
+ "grad_norm": 35.31257247924805,
87
+ "learning_rate": 5.1094890510948916e-06,
88
+ "loss": 0.152,
89
+ "step": 2500
90
+ },
91
+ {
92
+ "epoch": 6.0,
93
+ "eval_loss": 0.7323755025863647,
94
+ "eval_matthews_correlation": 0.5776036075873803,
95
+ "eval_runtime": 0.4074,
96
+ "eval_samples_per_second": 2101.249,
97
+ "eval_steps_per_second": 132.555,
98
+ "step": 2886
99
+ },
100
+ {
101
+ "epoch": 6.237006237006237,
102
+ "grad_norm": 24.80868911743164,
103
+ "learning_rate": 4.003539040035391e-06,
104
+ "loss": 0.1163,
105
+ "step": 3000
106
+ },
107
+ {
108
+ "epoch": 7.0,
109
+ "eval_loss": 0.8122499585151672,
110
+ "eval_matthews_correlation": 0.6033268648340611,
111
+ "eval_runtime": 0.4037,
112
+ "eval_samples_per_second": 2120.501,
113
+ "eval_steps_per_second": 133.77,
114
+ "step": 3367
115
+ },
116
+ {
117
+ "epoch": 7.276507276507276,
118
+ "grad_norm": 0.17492461204528809,
119
+ "learning_rate": 2.897589028975891e-06,
120
+ "loss": 0.085,
121
+ "step": 3500
122
+ },
123
+ {
124
+ "epoch": 8.0,
125
+ "eval_loss": 0.8224215507507324,
126
+ "eval_matthews_correlation": 0.5901838943346132,
127
+ "eval_runtime": 0.4068,
128
+ "eval_samples_per_second": 2104.297,
129
+ "eval_steps_per_second": 132.748,
130
+ "step": 3848
131
+ },
132
+ {
133
+ "epoch": 8.316008316008316,
134
+ "grad_norm": 0.08201944828033447,
135
+ "learning_rate": 1.7916390179163902e-06,
136
+ "loss": 0.0786,
137
+ "step": 4000
138
+ },
139
+ {
140
+ "epoch": 9.0,
141
+ "eval_loss": 0.8545462489128113,
142
+ "eval_matthews_correlation": 0.5823814386000223,
143
+ "eval_runtime": 0.4115,
144
+ "eval_samples_per_second": 2079.997,
145
+ "eval_steps_per_second": 131.215,
146
+ "step": 4329
147
+ },
148
+ {
149
+ "epoch": 9.355509355509355,
150
+ "grad_norm": 0.11403591185808182,
151
+ "learning_rate": 6.856890068568902e-07,
152
+ "loss": 0.0562,
153
+ "step": 4500
154
+ },
155
+ {
156
+ "epoch": 10.0,
157
+ "eval_loss": 0.8709535598754883,
158
+ "eval_matthews_correlation": 0.5859609460754699,
159
+ "eval_runtime": 0.4136,
160
+ "eval_samples_per_second": 2069.416,
161
+ "eval_steps_per_second": 130.547,
162
+ "step": 4810
163
+ },
164
+ {
165
+ "epoch": 10.0,
166
+ "step": 4810,
167
+ "total_flos": 816557451978060.0,
168
+ "train_loss": 0.19049413486726566,
169
+ "train_runtime": 286.7383,
170
+ "train_samples_per_second": 268.363,
171
+ "train_steps_per_second": 16.775
172
+ }
173
+ ],
174
+ "logging_steps": 500,
175
+ "max_steps": 4810,
176
+ "num_input_tokens_seen": 0,
177
+ "num_train_epochs": 10,
178
+ "save_steps": 500,
179
+ "stateful_callbacks": {
180
+ "TrainerControl": {
181
+ "args": {
182
+ "should_epoch_stop": false,
183
+ "should_evaluate": false,
184
+ "should_log": false,
185
+ "should_save": true,
186
+ "should_training_stop": true
187
+ },
188
+ "attributes": {}
189
+ }
190
+ },
191
+ "total_flos": 816557451978060.0,
192
+ "train_batch_size": 16,
193
+ "trial_name": null,
194
+ "trial_params": null
195
+ }
cola/bert-base-uncased_lr1e-05/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b94e2089e8c9a32b4763aad585eccb5567b65ac5c24c8965ebc02c346d72bd3
3
+ size 5304
cola/bert-base-uncased_lr1e-05/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
cola/roberta-base_lr1e-05/classifier_head.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c341999cc75104616e5425dd4e782119c49075c641490a4c89f71ee598cc653
3
+ size 2371920
cola/roberta-base_lr1e-05/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/shikexuan/nlu_model/roberta-base",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "roberta",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.45.2",
24
+ "type_vocab_size": 1,
25
+ "use_cache": true,
26
+ "vocab_size": 50265
27
+ }
cola/roberta-base_lr1e-05/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
cola/roberta-base_lr1e-05/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f8bb689ac1932cdf913bcf84b3a894830a5955f925c375af73b17fe8e531a49
3
+ size 498612824
cola/roberta-base_lr1e-05/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
cola/roberta-base_lr1e-05/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
cola/roberta-base_lr1e-05/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
+ "model_max_length": 512,
52
+ "pad_token": "<pad>",
53
+ "sep_token": "</s>",
54
+ "tokenizer_class": "RobertaTokenizer",
55
+ "trim_offsets": true,
56
+ "unk_token": "<unk>"
57
+ }
cola/roberta-base_lr1e-05/trainer_state.json ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6476547574288157,
3
+ "best_model_checkpoint": "./nlu_finetuned_models/cola/roberta-base_lr1e-05/checkpoint-1924",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 4810,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_loss": 0.39855843782424927,
14
+ "eval_matthews_correlation": 0.551445188783165,
15
+ "eval_runtime": 0.4311,
16
+ "eval_samples_per_second": 1985.797,
17
+ "eval_steps_per_second": 125.272,
18
+ "step": 481
19
+ },
20
+ {
21
+ "epoch": 1.0395010395010396,
22
+ "grad_norm": 23.592742919921875,
23
+ "learning_rate": 9.533289095332891e-06,
24
+ "loss": 0.537,
25
+ "step": 500
26
+ },
27
+ {
28
+ "epoch": 2.0,
29
+ "eval_loss": 0.3867790997028351,
30
+ "eval_matthews_correlation": 0.5922946478625156,
31
+ "eval_runtime": 0.4252,
32
+ "eval_samples_per_second": 2013.273,
33
+ "eval_steps_per_second": 127.006,
34
+ "step": 962
35
+ },
36
+ {
37
+ "epoch": 2.079002079002079,
38
+ "grad_norm": 24.047794342041016,
39
+ "learning_rate": 8.427339084273391e-06,
40
+ "loss": 0.3663,
41
+ "step": 1000
42
+ },
43
+ {
44
+ "epoch": 3.0,
45
+ "eval_loss": 0.44944295287132263,
46
+ "eval_matthews_correlation": 0.5697569536882547,
47
+ "eval_runtime": 0.432,
48
+ "eval_samples_per_second": 1981.347,
49
+ "eval_steps_per_second": 124.992,
50
+ "step": 1443
51
+ },
52
+ {
53
+ "epoch": 3.1185031185031185,
54
+ "grad_norm": 19.57173728942871,
55
+ "learning_rate": 7.3213890732138915e-06,
56
+ "loss": 0.2692,
57
+ "step": 1500
58
+ },
59
+ {
60
+ "epoch": 4.0,
61
+ "eval_loss": 0.5625013709068298,
62
+ "eval_matthews_correlation": 0.6476547574288157,
63
+ "eval_runtime": 0.4183,
64
+ "eval_samples_per_second": 2046.415,
65
+ "eval_steps_per_second": 129.096,
66
+ "step": 1924
67
+ },
68
+ {
69
+ "epoch": 4.158004158004158,
70
+ "grad_norm": 18.871004104614258,
71
+ "learning_rate": 6.2154390621543915e-06,
72
+ "loss": 0.2002,
73
+ "step": 2000
74
+ },
75
+ {
76
+ "epoch": 5.0,
77
+ "eval_loss": 0.6681837439537048,
78
+ "eval_matthews_correlation": 0.5953927103011889,
79
+ "eval_runtime": 0.4264,
80
+ "eval_samples_per_second": 2007.326,
81
+ "eval_steps_per_second": 126.63,
82
+ "step": 2405
83
+ },
84
+ {
85
+ "epoch": 5.197505197505198,
86
+ "grad_norm": 12.891629219055176,
87
+ "learning_rate": 5.1094890510948916e-06,
88
+ "loss": 0.1761,
89
+ "step": 2500
90
+ },
91
+ {
92
+ "epoch": 6.0,
93
+ "eval_loss": 0.7049034237861633,
94
+ "eval_matthews_correlation": 0.6376896027079079,
95
+ "eval_runtime": 0.4161,
96
+ "eval_samples_per_second": 2057.2,
97
+ "eval_steps_per_second": 129.777,
98
+ "step": 2886
99
+ },
100
+ {
101
+ "epoch": 6.237006237006237,
102
+ "grad_norm": 0.21968619525432587,
103
+ "learning_rate": 4.003539040035391e-06,
104
+ "loss": 0.1486,
105
+ "step": 3000
106
+ },
107
+ {
108
+ "epoch": 7.0,
109
+ "eval_loss": 0.8099717497825623,
110
+ "eval_matthews_correlation": 0.6372565507596143,
111
+ "eval_runtime": 0.4311,
112
+ "eval_samples_per_second": 1985.553,
113
+ "eval_steps_per_second": 125.257,
114
+ "step": 3367
115
+ },
116
+ {
117
+ "epoch": 7.276507276507276,
118
+ "grad_norm": 0.38824957609176636,
119
+ "learning_rate": 2.897589028975891e-06,
120
+ "loss": 0.1261,
121
+ "step": 3500
122
+ },
123
+ {
124
+ "epoch": 8.0,
125
+ "eval_loss": 0.8209366202354431,
126
+ "eval_matthews_correlation": 0.6409103987573772,
127
+ "eval_runtime": 0.4313,
128
+ "eval_samples_per_second": 1984.868,
129
+ "eval_steps_per_second": 125.214,
130
+ "step": 3848
131
+ },
132
+ {
133
+ "epoch": 8.316008316008316,
134
+ "grad_norm": 0.6946465969085693,
135
+ "learning_rate": 1.7916390179163902e-06,
136
+ "loss": 0.0963,
137
+ "step": 4000
138
+ },
139
+ {
140
+ "epoch": 9.0,
141
+ "eval_loss": 0.866554319858551,
142
+ "eval_matthews_correlation": 0.6465132548864322,
143
+ "eval_runtime": 0.4204,
144
+ "eval_samples_per_second": 2036.18,
145
+ "eval_steps_per_second": 128.451,
146
+ "step": 4329
147
+ },
148
+ {
149
+ "epoch": 9.355509355509355,
150
+ "grad_norm": 0.8278540968894958,
151
+ "learning_rate": 6.856890068568902e-07,
152
+ "loss": 0.0918,
153
+ "step": 4500
154
+ },
155
+ {
156
+ "epoch": 10.0,
157
+ "eval_loss": 0.9010610580444336,
158
+ "eval_matthews_correlation": 0.6344646755477841,
159
+ "eval_runtime": 0.4206,
160
+ "eval_samples_per_second": 2035.283,
161
+ "eval_steps_per_second": 128.394,
162
+ "step": 4810
163
+ },
164
+ {
165
+ "epoch": 10.0,
166
+ "step": 4810,
167
+ "total_flos": 806520176327100.0,
168
+ "train_loss": 0.2141543533103134,
169
+ "train_runtime": 285.3143,
170
+ "train_samples_per_second": 269.703,
171
+ "train_steps_per_second": 16.859
172
+ }
173
+ ],
174
+ "logging_steps": 500,
175
+ "max_steps": 4810,
176
+ "num_input_tokens_seen": 0,
177
+ "num_train_epochs": 10,
178
+ "save_steps": 500,
179
+ "stateful_callbacks": {
180
+ "TrainerControl": {
181
+ "args": {
182
+ "should_epoch_stop": false,
183
+ "should_evaluate": false,
184
+ "should_log": false,
185
+ "should_save": true,
186
+ "should_training_stop": true
187
+ },
188
+ "attributes": {}
189
+ }
190
+ },
191
+ "total_flos": 806520176327100.0,
192
+ "train_batch_size": 16,
193
+ "trial_name": null,
194
+ "trial_params": null
195
+ }
cola/roberta-base_lr1e-05/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c366ac50b12c62e8d32aece12e2d9a8214ab559f21c1da232666f859c3938eeb
3
+ size 5240
cola/roberta-base_lr1e-05/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
cola/roberta-large_lr1e-05/classifier_head.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68ea289932b18061036e5b723c08a4d4e38b5c19ef439ce78da8beb3aae26de0
3
+ size 4210000
cola/roberta-large_lr1e-05/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/projects/shikexuan/nlu_model/roberta-large",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 4096,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "roberta",
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 24,
20
+ "pad_token_id": 1,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.44.1",
24
+ "type_vocab_size": 1,
25
+ "use_cache": true,
26
+ "vocab_size": 50265
27
+ }
cola/roberta-large_lr1e-05/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
cola/roberta-large_lr1e-05/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d86c69428bcafb99a33526a826aeed7d41df8b7b872f0d384ba86b8f0ab1bcac
3
+ size 1421495416
cola/roberta-large_lr1e-05/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
cola/roberta-large_lr1e-05/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
cola/roberta-large_lr1e-05/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": true,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
+ "model_max_length": 512,
52
+ "pad_token": "<pad>",
53
+ "sep_token": "</s>",
54
+ "tokenizer_class": "RobertaTokenizer",
55
+ "trim_offsets": true,
56
+ "unk_token": "<unk>"
57
+ }
cola/roberta-large_lr1e-05/trainer_state.json ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7028411284769034,
3
+ "best_model_checkpoint": "./nlu_finetuned_models/cola/roberta-large_lr1e-05/checkpoint-3848",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 4810,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_loss": 0.3714715838432312,
14
+ "eval_matthews_correlation": 0.6178566076184573,
15
+ "eval_runtime": 0.9181,
16
+ "eval_samples_per_second": 932.378,
17
+ "eval_steps_per_second": 58.818,
18
+ "step": 481
19
+ },
20
+ {
21
+ "epoch": 1.0395010395010396,
22
+ "grad_norm": 31.42021942138672,
23
+ "learning_rate": 9.533289095332891e-06,
24
+ "loss": 0.5165,
25
+ "step": 500
26
+ },
27
+ {
28
+ "epoch": 2.0,
29
+ "eval_loss": 0.36615189909935,
30
+ "eval_matthews_correlation": 0.672732946200059,
31
+ "eval_runtime": 0.8969,
32
+ "eval_samples_per_second": 954.433,
33
+ "eval_steps_per_second": 60.21,
34
+ "step": 962
35
+ },
36
+ {
37
+ "epoch": 2.079002079002079,
38
+ "grad_norm": 6.862401962280273,
39
+ "learning_rate": 8.427339084273391e-06,
40
+ "loss": 0.3468,
41
+ "step": 1000
42
+ },
43
+ {
44
+ "epoch": 3.0,
45
+ "eval_loss": 0.39393267035484314,
46
+ "eval_matthews_correlation": 0.6436994142086258,
47
+ "eval_runtime": 0.8844,
48
+ "eval_samples_per_second": 967.838,
49
+ "eval_steps_per_second": 61.055,
50
+ "step": 1443
51
+ },
52
+ {
53
+ "epoch": 3.1185031185031185,
54
+ "grad_norm": 8.505253791809082,
55
+ "learning_rate": 7.3213890732138915e-06,
56
+ "loss": 0.2448,
57
+ "step": 1500
58
+ },
59
+ {
60
+ "epoch": 4.0,
61
+ "eval_loss": 0.5278908014297485,
62
+ "eval_matthews_correlation": 0.7005174840601028,
63
+ "eval_runtime": 0.8814,
64
+ "eval_samples_per_second": 971.171,
65
+ "eval_steps_per_second": 61.265,
66
+ "step": 1924
67
+ },
68
+ {
69
+ "epoch": 4.158004158004158,
70
+ "grad_norm": 61.827049255371094,
71
+ "learning_rate": 6.2154390621543915e-06,
72
+ "loss": 0.1829,
73
+ "step": 2000
74
+ },
75
+ {
76
+ "epoch": 5.0,
77
+ "eval_loss": 0.5846863389015198,
78
+ "eval_matthews_correlation": 0.6887827880453191,
79
+ "eval_runtime": 0.8956,
80
+ "eval_samples_per_second": 955.76,
81
+ "eval_steps_per_second": 60.293,
82
+ "step": 2405
83
+ },
84
+ {
85
+ "epoch": 5.197505197505198,
86
+ "grad_norm": 0.20127160847187042,
87
+ "learning_rate": 5.1094890510948916e-06,
88
+ "loss": 0.149,
89
+ "step": 2500
90
+ },
91
+ {
92
+ "epoch": 6.0,
93
+ "eval_loss": 0.7285173535346985,
94
+ "eval_matthews_correlation": 0.6970512081270855,
95
+ "eval_runtime": 0.88,
96
+ "eval_samples_per_second": 972.692,
97
+ "eval_steps_per_second": 61.361,
98
+ "step": 2886
99
+ },
100
+ {
101
+ "epoch": 6.237006237006237,
102
+ "grad_norm": 0.9558518528938293,
103
+ "learning_rate": 4.003539040035391e-06,
104
+ "loss": 0.1009,
105
+ "step": 3000
106
+ },
107
+ {
108
+ "epoch": 7.0,
109
+ "eval_loss": 0.8499833345413208,
110
+ "eval_matthews_correlation": 0.6829154400379273,
111
+ "eval_runtime": 0.8827,
112
+ "eval_samples_per_second": 969.703,
113
+ "eval_steps_per_second": 61.173,
114
+ "step": 3367
115
+ },
116
+ {
117
+ "epoch": 7.276507276507276,
118
+ "grad_norm": 9.234942436218262,
119
+ "learning_rate": 2.897589028975891e-06,
120
+ "loss": 0.083,
121
+ "step": 3500
122
+ },
123
+ {
124
+ "epoch": 8.0,
125
+ "eval_loss": 0.7806959748268127,
126
+ "eval_matthews_correlation": 0.7028411284769034,
127
+ "eval_runtime": 0.8826,
128
+ "eval_samples_per_second": 969.849,
129
+ "eval_steps_per_second": 61.182,
130
+ "step": 3848
131
+ },
132
+ {
133
+ "epoch": 8.316008316008316,
134
+ "grad_norm": 0.022788817062973976,
135
+ "learning_rate": 1.7916390179163902e-06,
136
+ "loss": 0.056,
137
+ "step": 4000
138
+ },
139
+ {
140
+ "epoch": 9.0,
141
+ "eval_loss": 0.8418498039245605,
142
+ "eval_matthews_correlation": 0.7012056940021191,
143
+ "eval_runtime": 0.8786,
144
+ "eval_samples_per_second": 974.326,
145
+ "eval_steps_per_second": 61.464,
146
+ "step": 4329
147
+ },
148
+ {
149
+ "epoch": 9.355509355509355,
150
+ "grad_norm": 0.3841009736061096,
151
+ "learning_rate": 6.856890068568902e-07,
152
+ "loss": 0.051,
153
+ "step": 4500
154
+ },
155
+ {
156
+ "epoch": 10.0,
157
+ "eval_loss": 0.8874779939651489,
158
+ "eval_matthews_correlation": 0.6993251302364928,
159
+ "eval_runtime": 0.8567,
160
+ "eval_samples_per_second": 999.194,
161
+ "eval_steps_per_second": 63.033,
162
+ "step": 4810
163
+ },
164
+ {
165
+ "epoch": 10.0,
166
+ "step": 4810,
167
+ "total_flos": 2856669958043580.0,
168
+ "train_loss": 0.18191755675476454,
169
+ "train_runtime": 542.0493,
170
+ "train_samples_per_second": 141.961,
171
+ "train_steps_per_second": 8.874
172
+ }
173
+ ],
174
+ "logging_steps": 500,
175
+ "max_steps": 4810,
176
+ "num_input_tokens_seen": 0,
177
+ "num_train_epochs": 10,
178
+ "save_steps": 500,
179
+ "stateful_callbacks": {
180
+ "TrainerControl": {
181
+ "args": {
182
+ "should_epoch_stop": false,
183
+ "should_evaluate": false,
184
+ "should_log": false,
185
+ "should_save": true,
186
+ "should_training_stop": true
187
+ },
188
+ "attributes": {}
189
+ }
190
+ },
191
+ "total_flos": 2856669958043580.0,
192
+ "train_batch_size": 16,
193
+ "trial_name": null,
194
+ "trial_params": null
195
+ }
cola/roberta-large_lr1e-05/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e5d3fff2e2c5906af773a16627b3247fc0af9d89a4d9fe671c652ce2ab6c087
3
+ size 5240
cola/roberta-large_lr1e-05/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
mnli/bert-base-uncased_lr1e-05/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/shikexuan/nlu_model/bert-base-uncased",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "LABEL_0",
14
+ "1": "LABEL_1",
15
+ "2": "LABEL_2"
16
+ },
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 3072,
19
+ "label2id": {
20
+ "LABEL_0": 0,
21
+ "LABEL_1": 1,
22
+ "LABEL_2": 2
23
+ },
24
+ "layer_norm_eps": 1e-12,
25
+ "max_position_embeddings": 512,
26
+ "model_type": "bert",
27
+ "num_attention_heads": 12,
28
+ "num_hidden_layers": 12,
29
+ "pad_token_id": 0,
30
+ "position_embedding_type": "absolute",
31
+ "torch_dtype": "float32",
32
+ "transformers_version": "4.45.2",
33
+ "type_vocab_size": 2,
34
+ "use_cache": true,
35
+ "vocab_size": 30522
36
+ }
mnli/bert-base-uncased_lr1e-05/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa573f005d822ed28a352bf3a40e32fdb89c838d7cb3c807570b9871f89543b0
3
+ size 437961724
mnli/bert-base-uncased_lr1e-05/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
mnli/bert-base-uncased_lr1e-05/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
mnli/bert-base-uncased_lr1e-05/tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "mask_token": "[MASK]",
48
+ "model_max_length": 512,
49
+ "pad_token": "[PAD]",
50
+ "sep_token": "[SEP]",
51
+ "strip_accents": null,
52
+ "tokenize_chinese_chars": true,
53
+ "tokenizer_class": "BertTokenizer",
54
+ "unk_token": "[UNK]"
55
+ }
mnli/bert-base-uncased_lr1e-05/trainer_state.json ADDED
@@ -0,0 +1,3219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8352473835654809,
3
+ "best_model_checkpoint": "./nlu_finetuned_models/mnli/bert-base-uncased_lr1e-05/checkpoint-66270",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 220900,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.022634676324128564,
13
+ "grad_norm": 3.446760416030884,
14
+ "learning_rate": 3.772446054021428e-07,
15
+ "loss": 1.173,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.04526935264825713,
20
+ "grad_norm": 5.33619499206543,
21
+ "learning_rate": 7.544892108042856e-07,
22
+ "loss": 1.1321,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.06790402897238569,
27
+ "grad_norm": 8.791085243225098,
28
+ "learning_rate": 1.1317338162064282e-06,
29
+ "loss": 1.0749,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.09053870529651425,
34
+ "grad_norm": 6.260791301727295,
35
+ "learning_rate": 1.5089784216085712e-06,
36
+ "loss": 1.015,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.11317338162064282,
41
+ "grad_norm": 6.4760637283325195,
42
+ "learning_rate": 1.886223027010714e-06,
43
+ "loss": 0.9274,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.13580805794477138,
48
+ "grad_norm": 12.70169448852539,
49
+ "learning_rate": 2.2634676324128565e-06,
50
+ "loss": 0.8618,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.15844273426889996,
55
+ "grad_norm": 13.638602256774902,
56
+ "learning_rate": 2.6407122378149996e-06,
57
+ "loss": 0.811,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.1810774105930285,
62
+ "grad_norm": 6.7600932121276855,
63
+ "learning_rate": 3.0179568432171424e-06,
64
+ "loss": 0.7514,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.2037120869171571,
69
+ "grad_norm": 10.7721586227417,
70
+ "learning_rate": 3.395201448619285e-06,
71
+ "loss": 0.7147,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.22634676324128564,
76
+ "grad_norm": 11.066295623779297,
77
+ "learning_rate": 3.772446054021428e-06,
78
+ "loss": 0.6998,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.24898143956541421,
83
+ "grad_norm": 11.129741668701172,
84
+ "learning_rate": 4.149690659423571e-06,
85
+ "loss": 0.6902,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 0.27161611588954276,
90
+ "grad_norm": 9.823887825012207,
91
+ "learning_rate": 4.526935264825713e-06,
92
+ "loss": 0.6694,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 0.29425079221367134,
97
+ "grad_norm": 9.396687507629395,
98
+ "learning_rate": 4.904179870227856e-06,
99
+ "loss": 0.6444,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 0.3168854685377999,
104
+ "grad_norm": 10.890174865722656,
105
+ "learning_rate": 5.281424475629999e-06,
106
+ "loss": 0.6555,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 0.3395201448619285,
111
+ "grad_norm": 8.391996383666992,
112
+ "learning_rate": 5.658669081032142e-06,
113
+ "loss": 0.635,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 0.362154821186057,
118
+ "grad_norm": 10.972713470458984,
119
+ "learning_rate": 6.035913686434285e-06,
120
+ "loss": 0.6369,
121
+ "step": 8000
122
+ },
123
+ {
124
+ "epoch": 0.3847894975101856,
125
+ "grad_norm": 7.503176212310791,
126
+ "learning_rate": 6.4131582918364275e-06,
127
+ "loss": 0.5998,
128
+ "step": 8500
129
+ },
130
+ {
131
+ "epoch": 0.4074241738343142,
132
+ "grad_norm": 12.73490047454834,
133
+ "learning_rate": 6.79040289723857e-06,
134
+ "loss": 0.5965,
135
+ "step": 9000
136
+ },
137
+ {
138
+ "epoch": 0.43005885015844275,
139
+ "grad_norm": 12.150643348693848,
140
+ "learning_rate": 7.167647502640713e-06,
141
+ "loss": 0.593,
142
+ "step": 9500
143
+ },
144
+ {
145
+ "epoch": 0.4526935264825713,
146
+ "grad_norm": 13.659591674804688,
147
+ "learning_rate": 7.544892108042856e-06,
148
+ "loss": 0.572,
149
+ "step": 10000
150
+ },
151
+ {
152
+ "epoch": 0.47532820280669985,
153
+ "grad_norm": 19.394298553466797,
154
+ "learning_rate": 7.922136713445e-06,
155
+ "loss": 0.5832,
156
+ "step": 10500
157
+ },
158
+ {
159
+ "epoch": 0.49796287913082843,
160
+ "grad_norm": 10.426199913024902,
161
+ "learning_rate": 8.299381318847142e-06,
162
+ "loss": 0.5735,
163
+ "step": 11000
164
+ },
165
+ {
166
+ "epoch": 0.520597555454957,
167
+ "grad_norm": 12.519038200378418,
168
+ "learning_rate": 8.676625924249283e-06,
169
+ "loss": 0.5782,
170
+ "step": 11500
171
+ },
172
+ {
173
+ "epoch": 0.5432322317790855,
174
+ "grad_norm": 11.444819450378418,
175
+ "learning_rate": 9.053870529651426e-06,
176
+ "loss": 0.5637,
177
+ "step": 12000
178
+ },
179
+ {
180
+ "epoch": 0.5658669081032142,
181
+ "grad_norm": 9.843756675720215,
182
+ "learning_rate": 9.431115135053569e-06,
183
+ "loss": 0.5546,
184
+ "step": 12500
185
+ },
186
+ {
187
+ "epoch": 0.5885015844273427,
188
+ "grad_norm": 13.871683120727539,
189
+ "learning_rate": 9.808359740455711e-06,
190
+ "loss": 0.5449,
191
+ "step": 13000
192
+ },
193
+ {
194
+ "epoch": 0.6111362607514712,
195
+ "grad_norm": 14.887154579162598,
196
+ "learning_rate": 9.98815291409418e-06,
197
+ "loss": 0.5496,
198
+ "step": 13500
199
+ },
200
+ {
201
+ "epoch": 0.6337709370755998,
202
+ "grad_norm": 11.629434585571289,
203
+ "learning_rate": 9.964073471196171e-06,
204
+ "loss": 0.5468,
205
+ "step": 14000
206
+ },
207
+ {
208
+ "epoch": 0.6564056133997284,
209
+ "grad_norm": 9.616654396057129,
210
+ "learning_rate": 9.939994028298163e-06,
211
+ "loss": 0.5358,
212
+ "step": 14500
213
+ },
214
+ {
215
+ "epoch": 0.679040289723857,
216
+ "grad_norm": 5.07706356048584,
217
+ "learning_rate": 9.915914585400153e-06,
218
+ "loss": 0.5348,
219
+ "step": 15000
220
+ },
221
+ {
222
+ "epoch": 0.7016749660479855,
223
+ "grad_norm": 12.040781021118164,
224
+ "learning_rate": 9.891835142502145e-06,
225
+ "loss": 0.5255,
226
+ "step": 15500
227
+ },
228
+ {
229
+ "epoch": 0.724309642372114,
230
+ "grad_norm": 10.14538860321045,
231
+ "learning_rate": 9.867755699604135e-06,
232
+ "loss": 0.5359,
233
+ "step": 16000
234
+ },
235
+ {
236
+ "epoch": 0.7469443186962427,
237
+ "grad_norm": 4.373657703399658,
238
+ "learning_rate": 9.843676256706126e-06,
239
+ "loss": 0.516,
240
+ "step": 16500
241
+ },
242
+ {
243
+ "epoch": 0.7695789950203712,
244
+ "grad_norm": 7.4355950355529785,
245
+ "learning_rate": 9.819596813808116e-06,
246
+ "loss": 0.5272,
247
+ "step": 17000
248
+ },
249
+ {
250
+ "epoch": 0.7922136713444998,
251
+ "grad_norm": 9.89004135131836,
252
+ "learning_rate": 9.795517370910108e-06,
253
+ "loss": 0.5103,
254
+ "step": 17500
255
+ },
256
+ {
257
+ "epoch": 0.8148483476686283,
258
+ "grad_norm": 16.285531997680664,
259
+ "learning_rate": 9.771437928012098e-06,
260
+ "loss": 0.5164,
261
+ "step": 18000
262
+ },
263
+ {
264
+ "epoch": 0.8374830239927569,
265
+ "grad_norm": 12.120766639709473,
266
+ "learning_rate": 9.74735848511409e-06,
267
+ "loss": 0.5295,
268
+ "step": 18500
269
+ },
270
+ {
271
+ "epoch": 0.8601177003168855,
272
+ "grad_norm": 8.174781799316406,
273
+ "learning_rate": 9.723279042216081e-06,
274
+ "loss": 0.5126,
275
+ "step": 19000
276
+ },
277
+ {
278
+ "epoch": 0.882752376641014,
279
+ "grad_norm": 12.264520645141602,
280
+ "learning_rate": 9.699199599318071e-06,
281
+ "loss": 0.5028,
282
+ "step": 19500
283
+ },
284
+ {
285
+ "epoch": 0.9053870529651425,
286
+ "grad_norm": 13.481684684753418,
287
+ "learning_rate": 9.675120156420061e-06,
288
+ "loss": 0.5083,
289
+ "step": 20000
290
+ },
291
+ {
292
+ "epoch": 0.9280217292892712,
293
+ "grad_norm": 5.885311603546143,
294
+ "learning_rate": 9.651040713522053e-06,
295
+ "loss": 0.508,
296
+ "step": 20500
297
+ },
298
+ {
299
+ "epoch": 0.9506564056133997,
300
+ "grad_norm": 11.39742374420166,
301
+ "learning_rate": 9.626961270624043e-06,
302
+ "loss": 0.5076,
303
+ "step": 21000
304
+ },
305
+ {
306
+ "epoch": 0.9732910819375283,
307
+ "grad_norm": 12.728748321533203,
308
+ "learning_rate": 9.602881827726035e-06,
309
+ "loss": 0.5109,
310
+ "step": 21500
311
+ },
312
+ {
313
+ "epoch": 0.9959257582616569,
314
+ "grad_norm": 7.6700592041015625,
315
+ "learning_rate": 9.578802384828026e-06,
316
+ "loss": 0.4849,
317
+ "step": 22000
318
+ },
319
+ {
320
+ "epoch": 1.0,
321
+ "eval_accuracy": 0.8162511777138346,
322
+ "eval_loss": 0.48325616121292114,
323
+ "eval_runtime": 25.2613,
324
+ "eval_samples_per_second": 1554.593,
325
+ "eval_steps_per_second": 97.184,
326
+ "step": 22090
327
+ },
328
+ {
329
+ "epoch": 1.0185604345857855,
330
+ "grad_norm": 12.493118286132812,
331
+ "learning_rate": 9.554722941930016e-06,
332
+ "loss": 0.4426,
333
+ "step": 22500
334
+ },
335
+ {
336
+ "epoch": 1.041195110909914,
337
+ "grad_norm": 6.584425926208496,
338
+ "learning_rate": 9.530643499032008e-06,
339
+ "loss": 0.4212,
340
+ "step": 23000
341
+ },
342
+ {
343
+ "epoch": 1.0638297872340425,
344
+ "grad_norm": 8.161721229553223,
345
+ "learning_rate": 9.506564056133998e-06,
346
+ "loss": 0.4266,
347
+ "step": 23500
348
+ },
349
+ {
350
+ "epoch": 1.086464463558171,
351
+ "grad_norm": 18.355859756469727,
352
+ "learning_rate": 9.48248461323599e-06,
353
+ "loss": 0.4191,
354
+ "step": 24000
355
+ },
356
+ {
357
+ "epoch": 1.1090991398822996,
358
+ "grad_norm": 10.349095344543457,
359
+ "learning_rate": 9.45840517033798e-06,
360
+ "loss": 0.4223,
361
+ "step": 24500
362
+ },
363
+ {
364
+ "epoch": 1.1317338162064283,
365
+ "grad_norm": 16.80712890625,
366
+ "learning_rate": 9.43432572743997e-06,
367
+ "loss": 0.4393,
368
+ "step": 25000
369
+ },
370
+ {
371
+ "epoch": 1.1543684925305568,
372
+ "grad_norm": 14.750687599182129,
373
+ "learning_rate": 9.410246284541961e-06,
374
+ "loss": 0.4254,
375
+ "step": 25500
376
+ },
377
+ {
378
+ "epoch": 1.1770031688546854,
379
+ "grad_norm": 3.8574748039245605,
380
+ "learning_rate": 9.386166841643953e-06,
381
+ "loss": 0.4326,
382
+ "step": 26000
383
+ },
384
+ {
385
+ "epoch": 1.1996378451788139,
386
+ "grad_norm": 8.065531730651855,
387
+ "learning_rate": 9.362087398745945e-06,
388
+ "loss": 0.4304,
389
+ "step": 26500
390
+ },
391
+ {
392
+ "epoch": 1.2222725215029424,
393
+ "grad_norm": 7.019877910614014,
394
+ "learning_rate": 9.338007955847935e-06,
395
+ "loss": 0.421,
396
+ "step": 27000
397
+ },
398
+ {
399
+ "epoch": 1.2449071978270712,
400
+ "grad_norm": 5.952245235443115,
401
+ "learning_rate": 9.313928512949925e-06,
402
+ "loss": 0.4296,
403
+ "step": 27500
404
+ },
405
+ {
406
+ "epoch": 1.2675418741511997,
407
+ "grad_norm": 8.255633354187012,
408
+ "learning_rate": 9.289849070051916e-06,
409
+ "loss": 0.4256,
410
+ "step": 28000
411
+ },
412
+ {
413
+ "epoch": 1.2901765504753282,
414
+ "grad_norm": 6.78666877746582,
415
+ "learning_rate": 9.265769627153906e-06,
416
+ "loss": 0.427,
417
+ "step": 28500
418
+ },
419
+ {
420
+ "epoch": 1.3128112267994567,
421
+ "grad_norm": 10.57790470123291,
422
+ "learning_rate": 9.241690184255898e-06,
423
+ "loss": 0.4095,
424
+ "step": 29000
425
+ },
426
+ {
427
+ "epoch": 1.3354459031235852,
428
+ "grad_norm": 11.112567901611328,
429
+ "learning_rate": 9.217610741357888e-06,
430
+ "loss": 0.4258,
431
+ "step": 29500
432
+ },
433
+ {
434
+ "epoch": 1.358080579447714,
435
+ "grad_norm": 3.372868776321411,
436
+ "learning_rate": 9.19353129845988e-06,
437
+ "loss": 0.4133,
438
+ "step": 30000
439
+ },
440
+ {
441
+ "epoch": 1.3807152557718425,
442
+ "grad_norm": 9.852280616760254,
443
+ "learning_rate": 9.169451855561871e-06,
444
+ "loss": 0.4207,
445
+ "step": 30500
446
+ },
447
+ {
448
+ "epoch": 1.403349932095971,
449
+ "grad_norm": 5.258566856384277,
450
+ "learning_rate": 9.145372412663861e-06,
451
+ "loss": 0.4209,
452
+ "step": 31000
453
+ },
454
+ {
455
+ "epoch": 1.4259846084200996,
456
+ "grad_norm": 13.098512649536133,
457
+ "learning_rate": 9.121292969765853e-06,
458
+ "loss": 0.4219,
459
+ "step": 31500
460
+ },
461
+ {
462
+ "epoch": 1.448619284744228,
463
+ "grad_norm": 10.451237678527832,
464
+ "learning_rate": 9.097213526867843e-06,
465
+ "loss": 0.4214,
466
+ "step": 32000
467
+ },
468
+ {
469
+ "epoch": 1.4712539610683568,
470
+ "grad_norm": 10.141883850097656,
471
+ "learning_rate": 9.073134083969835e-06,
472
+ "loss": 0.4149,
473
+ "step": 32500
474
+ },
475
+ {
476
+ "epoch": 1.4938886373924853,
477
+ "grad_norm": 13.736973762512207,
478
+ "learning_rate": 9.049054641071825e-06,
479
+ "loss": 0.4078,
480
+ "step": 33000
481
+ },
482
+ {
483
+ "epoch": 1.5165233137166139,
484
+ "grad_norm": 19.68891143798828,
485
+ "learning_rate": 9.024975198173815e-06,
486
+ "loss": 0.3956,
487
+ "step": 33500
488
+ },
489
+ {
490
+ "epoch": 1.5391579900407424,
491
+ "grad_norm": 7.632669925689697,
492
+ "learning_rate": 9.000895755275806e-06,
493
+ "loss": 0.4281,
494
+ "step": 34000
495
+ },
496
+ {
497
+ "epoch": 1.561792666364871,
498
+ "grad_norm": 7.729524612426758,
499
+ "learning_rate": 8.976816312377798e-06,
500
+ "loss": 0.4187,
501
+ "step": 34500
502
+ },
503
+ {
504
+ "epoch": 1.5844273426889997,
505
+ "grad_norm": 9.902143478393555,
506
+ "learning_rate": 8.95273686947979e-06,
507
+ "loss": 0.4175,
508
+ "step": 35000
509
+ },
510
+ {
511
+ "epoch": 1.607062019013128,
512
+ "grad_norm": 7.277866363525391,
513
+ "learning_rate": 8.92865742658178e-06,
514
+ "loss": 0.4168,
515
+ "step": 35500
516
+ },
517
+ {
518
+ "epoch": 1.6296966953372567,
519
+ "grad_norm": 12.389451026916504,
520
+ "learning_rate": 8.90457798368377e-06,
521
+ "loss": 0.4117,
522
+ "step": 36000
523
+ },
524
+ {
525
+ "epoch": 1.6523313716613852,
526
+ "grad_norm": 7.954280376434326,
527
+ "learning_rate": 8.880498540785761e-06,
528
+ "loss": 0.4257,
529
+ "step": 36500
530
+ },
531
+ {
532
+ "epoch": 1.6749660479855137,
533
+ "grad_norm": 10.025848388671875,
534
+ "learning_rate": 8.856419097887751e-06,
535
+ "loss": 0.4305,
536
+ "step": 37000
537
+ },
538
+ {
539
+ "epoch": 1.6976007243096425,
540
+ "grad_norm": 7.972113609313965,
541
+ "learning_rate": 8.832339654989743e-06,
542
+ "loss": 0.4268,
543
+ "step": 37500
544
+ },
545
+ {
546
+ "epoch": 1.7202354006337708,
547
+ "grad_norm": 12.047213554382324,
548
+ "learning_rate": 8.808260212091733e-06,
549
+ "loss": 0.4028,
550
+ "step": 38000
551
+ },
552
+ {
553
+ "epoch": 1.7428700769578995,
554
+ "grad_norm": 16.330598831176758,
555
+ "learning_rate": 8.784180769193725e-06,
556
+ "loss": 0.4074,
557
+ "step": 38500
558
+ },
559
+ {
560
+ "epoch": 1.765504753282028,
561
+ "grad_norm": 10.4528226852417,
562
+ "learning_rate": 8.760101326295716e-06,
563
+ "loss": 0.411,
564
+ "step": 39000
565
+ },
566
+ {
567
+ "epoch": 1.7881394296061566,
568
+ "grad_norm": 12.689846992492676,
569
+ "learning_rate": 8.736021883397706e-06,
570
+ "loss": 0.4123,
571
+ "step": 39500
572
+ },
573
+ {
574
+ "epoch": 1.8107741059302853,
575
+ "grad_norm": 12.51934814453125,
576
+ "learning_rate": 8.711942440499698e-06,
577
+ "loss": 0.4086,
578
+ "step": 40000
579
+ },
580
+ {
581
+ "epoch": 1.8334087822544136,
582
+ "grad_norm": 14.149301528930664,
583
+ "learning_rate": 8.687862997601688e-06,
584
+ "loss": 0.412,
585
+ "step": 40500
586
+ },
587
+ {
588
+ "epoch": 1.8560434585785424,
589
+ "grad_norm": 7.224659442901611,
590
+ "learning_rate": 8.663783554703678e-06,
591
+ "loss": 0.4224,
592
+ "step": 41000
593
+ },
594
+ {
595
+ "epoch": 1.8786781349026709,
596
+ "grad_norm": 9.472103118896484,
597
+ "learning_rate": 8.63970411180567e-06,
598
+ "loss": 0.4133,
599
+ "step": 41500
600
+ },
601
+ {
602
+ "epoch": 1.9013128112267994,
603
+ "grad_norm": 9.505095481872559,
604
+ "learning_rate": 8.615624668907661e-06,
605
+ "loss": 0.4146,
606
+ "step": 42000
607
+ },
608
+ {
609
+ "epoch": 1.9239474875509281,
610
+ "grad_norm": 5.6151299476623535,
611
+ "learning_rate": 8.591545226009653e-06,
612
+ "loss": 0.4099,
613
+ "step": 42500
614
+ },
615
+ {
616
+ "epoch": 1.9465821638750564,
617
+ "grad_norm": 9.442501068115234,
618
+ "learning_rate": 8.567465783111643e-06,
619
+ "loss": 0.4016,
620
+ "step": 43000
621
+ },
622
+ {
623
+ "epoch": 1.9692168401991852,
624
+ "grad_norm": 7.8139777183532715,
625
+ "learning_rate": 8.543386340213633e-06,
626
+ "loss": 0.3923,
627
+ "step": 43500
628
+ },
629
+ {
630
+ "epoch": 1.9918515165233137,
631
+ "grad_norm": 8.070769309997559,
632
+ "learning_rate": 8.519306897315625e-06,
633
+ "loss": 0.4119,
634
+ "step": 44000
635
+ },
636
+ {
637
+ "epoch": 2.0,
638
+ "eval_accuracy": 0.8304346718952917,
639
+ "eval_loss": 0.44952502846717834,
640
+ "eval_runtime": 25.1503,
641
+ "eval_samples_per_second": 1561.452,
642
+ "eval_steps_per_second": 97.613,
643
+ "step": 44180
644
+ },
645
+ {
646
+ "epoch": 2.0144861928474422,
647
+ "grad_norm": 13.169942855834961,
648
+ "learning_rate": 8.495227454417615e-06,
649
+ "loss": 0.3237,
650
+ "step": 44500
651
+ },
652
+ {
653
+ "epoch": 2.037120869171571,
654
+ "grad_norm": 14.817279815673828,
655
+ "learning_rate": 8.471148011519606e-06,
656
+ "loss": 0.3116,
657
+ "step": 45000
658
+ },
659
+ {
660
+ "epoch": 2.0597555454956993,
661
+ "grad_norm": 19.022254943847656,
662
+ "learning_rate": 8.447068568621596e-06,
663
+ "loss": 0.2983,
664
+ "step": 45500
665
+ },
666
+ {
667
+ "epoch": 2.082390221819828,
668
+ "grad_norm": 8.058989524841309,
669
+ "learning_rate": 8.422989125723588e-06,
670
+ "loss": 0.3023,
671
+ "step": 46000
672
+ },
673
+ {
674
+ "epoch": 2.1050248981439568,
675
+ "grad_norm": 29.2261962890625,
676
+ "learning_rate": 8.39890968282558e-06,
677
+ "loss": 0.3017,
678
+ "step": 46500
679
+ },
680
+ {
681
+ "epoch": 2.127659574468085,
682
+ "grad_norm": 24.260656356811523,
683
+ "learning_rate": 8.37483023992757e-06,
684
+ "loss": 0.3115,
685
+ "step": 47000
686
+ },
687
+ {
688
+ "epoch": 2.150294250792214,
689
+ "grad_norm": 12.767081260681152,
690
+ "learning_rate": 8.350750797029561e-06,
691
+ "loss": 0.2936,
692
+ "step": 47500
693
+ },
694
+ {
695
+ "epoch": 2.172928927116342,
696
+ "grad_norm": 19.231029510498047,
697
+ "learning_rate": 8.326671354131551e-06,
698
+ "loss": 0.3012,
699
+ "step": 48000
700
+ },
701
+ {
702
+ "epoch": 2.195563603440471,
703
+ "grad_norm": 5.552412986755371,
704
+ "learning_rate": 8.302591911233543e-06,
705
+ "loss": 0.31,
706
+ "step": 48500
707
+ },
708
+ {
709
+ "epoch": 2.218198279764599,
710
+ "grad_norm": 15.026745796203613,
711
+ "learning_rate": 8.278512468335533e-06,
712
+ "loss": 0.2869,
713
+ "step": 49000
714
+ },
715
+ {
716
+ "epoch": 2.240832956088728,
717
+ "grad_norm": 18.39385223388672,
718
+ "learning_rate": 8.254433025437523e-06,
719
+ "loss": 0.293,
720
+ "step": 49500
721
+ },
722
+ {
723
+ "epoch": 2.2634676324128566,
724
+ "grad_norm": 10.1619873046875,
725
+ "learning_rate": 8.230353582539515e-06,
726
+ "loss": 0.3022,
727
+ "step": 50000
728
+ },
729
+ {
730
+ "epoch": 2.286102308736985,
731
+ "grad_norm": 3.5289433002471924,
732
+ "learning_rate": 8.206274139641506e-06,
733
+ "loss": 0.3004,
734
+ "step": 50500
735
+ },
736
+ {
737
+ "epoch": 2.3087369850611137,
738
+ "grad_norm": 14.791669845581055,
739
+ "learning_rate": 8.182194696743498e-06,
740
+ "loss": 0.3219,
741
+ "step": 51000
742
+ },
743
+ {
744
+ "epoch": 2.3313716613852424,
745
+ "grad_norm": 17.115280151367188,
746
+ "learning_rate": 8.158115253845488e-06,
747
+ "loss": 0.3295,
748
+ "step": 51500
749
+ },
750
+ {
751
+ "epoch": 2.3540063377093707,
752
+ "grad_norm": 2.3479862213134766,
753
+ "learning_rate": 8.134035810947478e-06,
754
+ "loss": 0.3119,
755
+ "step": 52000
756
+ },
757
+ {
758
+ "epoch": 2.3766410140334995,
759
+ "grad_norm": 9.805758476257324,
760
+ "learning_rate": 8.10995636804947e-06,
761
+ "loss": 0.2982,
762
+ "step": 52500
763
+ },
764
+ {
765
+ "epoch": 2.3992756903576278,
766
+ "grad_norm": 41.41348648071289,
767
+ "learning_rate": 8.08587692515146e-06,
768
+ "loss": 0.3063,
769
+ "step": 53000
770
+ },
771
+ {
772
+ "epoch": 2.4219103666817565,
773
+ "grad_norm": 9.925851821899414,
774
+ "learning_rate": 8.061797482253451e-06,
775
+ "loss": 0.3075,
776
+ "step": 53500
777
+ },
778
+ {
779
+ "epoch": 2.444545043005885,
780
+ "grad_norm": 13.075074195861816,
781
+ "learning_rate": 8.037718039355441e-06,
782
+ "loss": 0.3143,
783
+ "step": 54000
784
+ },
785
+ {
786
+ "epoch": 2.4671797193300136,
787
+ "grad_norm": 25.823867797851562,
788
+ "learning_rate": 8.013638596457433e-06,
789
+ "loss": 0.3033,
790
+ "step": 54500
791
+ },
792
+ {
793
+ "epoch": 2.4898143956541423,
794
+ "grad_norm": 13.127443313598633,
795
+ "learning_rate": 7.989559153559425e-06,
796
+ "loss": 0.3252,
797
+ "step": 55000
798
+ },
799
+ {
800
+ "epoch": 2.5124490719782706,
801
+ "grad_norm": 9.526931762695312,
802
+ "learning_rate": 7.965479710661415e-06,
803
+ "loss": 0.3209,
804
+ "step": 55500
805
+ },
806
+ {
807
+ "epoch": 2.5350837483023994,
808
+ "grad_norm": 21.300325393676758,
809
+ "learning_rate": 7.941400267763406e-06,
810
+ "loss": 0.3111,
811
+ "step": 56000
812
+ },
813
+ {
814
+ "epoch": 2.557718424626528,
815
+ "grad_norm": 14.492232322692871,
816
+ "learning_rate": 7.917320824865396e-06,
817
+ "loss": 0.2963,
818
+ "step": 56500
819
+ },
820
+ {
821
+ "epoch": 2.5803531009506564,
822
+ "grad_norm": 19.685089111328125,
823
+ "learning_rate": 7.893241381967386e-06,
824
+ "loss": 0.3046,
825
+ "step": 57000
826
+ },
827
+ {
828
+ "epoch": 2.6029877772747847,
829
+ "grad_norm": 23.311918258666992,
830
+ "learning_rate": 7.869161939069378e-06,
831
+ "loss": 0.3119,
832
+ "step": 57500
833
+ },
834
+ {
835
+ "epoch": 2.6256224535989134,
836
+ "grad_norm": 15.841225624084473,
837
+ "learning_rate": 7.84508249617137e-06,
838
+ "loss": 0.3151,
839
+ "step": 58000
840
+ },
841
+ {
842
+ "epoch": 2.648257129923042,
843
+ "grad_norm": 6.224856853485107,
844
+ "learning_rate": 7.82100305327336e-06,
845
+ "loss": 0.298,
846
+ "step": 58500
847
+ },
848
+ {
849
+ "epoch": 2.6708918062471705,
850
+ "grad_norm": 16.06321907043457,
851
+ "learning_rate": 7.796923610375351e-06,
852
+ "loss": 0.3109,
853
+ "step": 59000
854
+ },
855
+ {
856
+ "epoch": 2.6935264825712992,
857
+ "grad_norm": 30.554990768432617,
858
+ "learning_rate": 7.772844167477341e-06,
859
+ "loss": 0.2959,
860
+ "step": 59500
861
+ },
862
+ {
863
+ "epoch": 2.716161158895428,
864
+ "grad_norm": 13.276334762573242,
865
+ "learning_rate": 7.748764724579333e-06,
866
+ "loss": 0.3186,
867
+ "step": 60000
868
+ },
869
+ {
870
+ "epoch": 2.7387958352195563,
871
+ "grad_norm": 28.817773818969727,
872
+ "learning_rate": 7.724685281681323e-06,
873
+ "loss": 0.3142,
874
+ "step": 60500
875
+ },
876
+ {
877
+ "epoch": 2.761430511543685,
878
+ "grad_norm": 14.737340927124023,
879
+ "learning_rate": 7.700605838783315e-06,
880
+ "loss": 0.3114,
881
+ "step": 61000
882
+ },
883
+ {
884
+ "epoch": 2.7840651878678138,
885
+ "grad_norm": 17.803508758544922,
886
+ "learning_rate": 7.676526395885305e-06,
887
+ "loss": 0.3119,
888
+ "step": 61500
889
+ },
890
+ {
891
+ "epoch": 2.806699864191942,
892
+ "grad_norm": 28.533275604248047,
893
+ "learning_rate": 7.652446952987296e-06,
894
+ "loss": 0.3155,
895
+ "step": 62000
896
+ },
897
+ {
898
+ "epoch": 2.8293345405160704,
899
+ "grad_norm": 15.924571990966797,
900
+ "learning_rate": 7.628367510089287e-06,
901
+ "loss": 0.3054,
902
+ "step": 62500
903
+ },
904
+ {
905
+ "epoch": 2.851969216840199,
906
+ "grad_norm": 7.778402328491211,
907
+ "learning_rate": 7.604288067191278e-06,
908
+ "loss": 0.3064,
909
+ "step": 63000
910
+ },
911
+ {
912
+ "epoch": 2.874603893164328,
913
+ "grad_norm": 14.384367942810059,
914
+ "learning_rate": 7.58020862429327e-06,
915
+ "loss": 0.3315,
916
+ "step": 63500
917
+ },
918
+ {
919
+ "epoch": 2.897238569488456,
920
+ "grad_norm": 16.093135833740234,
921
+ "learning_rate": 7.55612918139526e-06,
922
+ "loss": 0.3099,
923
+ "step": 64000
924
+ },
925
+ {
926
+ "epoch": 2.919873245812585,
927
+ "grad_norm": 9.859313011169434,
928
+ "learning_rate": 7.532049738497251e-06,
929
+ "loss": 0.3004,
930
+ "step": 64500
931
+ },
932
+ {
933
+ "epoch": 2.9425079221367136,
934
+ "grad_norm": 28.369918823242188,
935
+ "learning_rate": 7.507970295599241e-06,
936
+ "loss": 0.3186,
937
+ "step": 65000
938
+ },
939
+ {
940
+ "epoch": 2.965142598460842,
941
+ "grad_norm": 11.795418739318848,
942
+ "learning_rate": 7.483890852701232e-06,
943
+ "loss": 0.3064,
944
+ "step": 65500
945
+ },
946
+ {
947
+ "epoch": 2.9877772747849707,
948
+ "grad_norm": 9.499896049499512,
949
+ "learning_rate": 7.459811409803224e-06,
950
+ "loss": 0.3291,
951
+ "step": 66000
952
+ },
953
+ {
954
+ "epoch": 3.0,
955
+ "eval_accuracy": 0.8352473835654809,
956
+ "eval_loss": 0.464701771736145,
957
+ "eval_runtime": 25.2286,
958
+ "eval_samples_per_second": 1556.605,
959
+ "eval_steps_per_second": 97.31,
960
+ "step": 66270
961
+ },
962
+ {
963
+ "epoch": 3.010411951109099,
964
+ "grad_norm": 45.7053108215332,
965
+ "learning_rate": 7.435731966905214e-06,
966
+ "loss": 0.2729,
967
+ "step": 66500
968
+ },
969
+ {
970
+ "epoch": 3.0330466274332277,
971
+ "grad_norm": 16.285181045532227,
972
+ "learning_rate": 7.4116525240072056e-06,
973
+ "loss": 0.2225,
974
+ "step": 67000
975
+ },
976
+ {
977
+ "epoch": 3.0556813037573565,
978
+ "grad_norm": 2.762075662612915,
979
+ "learning_rate": 7.387573081109196e-06,
980
+ "loss": 0.2222,
981
+ "step": 67500
982
+ },
983
+ {
984
+ "epoch": 3.0783159800814848,
985
+ "grad_norm": 14.039849281311035,
986
+ "learning_rate": 7.363493638211186e-06,
987
+ "loss": 0.2286,
988
+ "step": 68000
989
+ },
990
+ {
991
+ "epoch": 3.1009506564056135,
992
+ "grad_norm": 18.8682918548584,
993
+ "learning_rate": 7.339414195313178e-06,
994
+ "loss": 0.2256,
995
+ "step": 68500
996
+ },
997
+ {
998
+ "epoch": 3.123585332729742,
999
+ "grad_norm": 17.37663459777832,
1000
+ "learning_rate": 7.315334752415169e-06,
1001
+ "loss": 0.2248,
1002
+ "step": 69000
1003
+ },
1004
+ {
1005
+ "epoch": 3.1462200090538706,
1006
+ "grad_norm": 3.116631031036377,
1007
+ "learning_rate": 7.29125530951716e-06,
1008
+ "loss": 0.2138,
1009
+ "step": 69500
1010
+ },
1011
+ {
1012
+ "epoch": 3.1688546853779993,
1013
+ "grad_norm": 9.453999519348145,
1014
+ "learning_rate": 7.2671758666191506e-06,
1015
+ "loss": 0.2267,
1016
+ "step": 70000
1017
+ },
1018
+ {
1019
+ "epoch": 3.1914893617021276,
1020
+ "grad_norm": 25.803009033203125,
1021
+ "learning_rate": 7.2430964237211406e-06,
1022
+ "loss": 0.218,
1023
+ "step": 70500
1024
+ },
1025
+ {
1026
+ "epoch": 3.2141240380262563,
1027
+ "grad_norm": 27.520082473754883,
1028
+ "learning_rate": 7.219016980823132e-06,
1029
+ "loss": 0.2334,
1030
+ "step": 71000
1031
+ },
1032
+ {
1033
+ "epoch": 3.2367587143503846,
1034
+ "grad_norm": 1.3826326131820679,
1035
+ "learning_rate": 7.194937537925123e-06,
1036
+ "loss": 0.2361,
1037
+ "step": 71500
1038
+ },
1039
+ {
1040
+ "epoch": 3.2593933906745134,
1041
+ "grad_norm": 30.99222183227539,
1042
+ "learning_rate": 7.170858095027115e-06,
1043
+ "loss": 0.2369,
1044
+ "step": 72000
1045
+ },
1046
+ {
1047
+ "epoch": 3.2820280669986417,
1048
+ "grad_norm": 30.45430564880371,
1049
+ "learning_rate": 7.146778652129105e-06,
1050
+ "loss": 0.2169,
1051
+ "step": 72500
1052
+ },
1053
+ {
1054
+ "epoch": 3.3046627433227704,
1055
+ "grad_norm": 0.6857367753982544,
1056
+ "learning_rate": 7.1226992092310956e-06,
1057
+ "loss": 0.2427,
1058
+ "step": 73000
1059
+ },
1060
+ {
1061
+ "epoch": 3.327297419646899,
1062
+ "grad_norm": 16.717971801757812,
1063
+ "learning_rate": 7.098619766333087e-06,
1064
+ "loss": 0.2194,
1065
+ "step": 73500
1066
+ },
1067
+ {
1068
+ "epoch": 3.3499320959710275,
1069
+ "grad_norm": 21.98202896118164,
1070
+ "learning_rate": 7.074540323435077e-06,
1071
+ "loss": 0.2464,
1072
+ "step": 74000
1073
+ },
1074
+ {
1075
+ "epoch": 3.3725667722951562,
1076
+ "grad_norm": 14.20096206665039,
1077
+ "learning_rate": 7.050460880537069e-06,
1078
+ "loss": 0.2426,
1079
+ "step": 74500
1080
+ },
1081
+ {
1082
+ "epoch": 3.395201448619285,
1083
+ "grad_norm": 31.20425033569336,
1084
+ "learning_rate": 7.026381437639059e-06,
1085
+ "loss": 0.239,
1086
+ "step": 75000
1087
+ },
1088
+ {
1089
+ "epoch": 3.4178361249434133,
1090
+ "grad_norm": 2.355678081512451,
1091
+ "learning_rate": 7.00230199474105e-06,
1092
+ "loss": 0.2286,
1093
+ "step": 75500
1094
+ },
1095
+ {
1096
+ "epoch": 3.440470801267542,
1097
+ "grad_norm": 1.089072823524475,
1098
+ "learning_rate": 6.9782225518430414e-06,
1099
+ "loss": 0.2202,
1100
+ "step": 76000
1101
+ },
1102
+ {
1103
+ "epoch": 3.4631054775916703,
1104
+ "grad_norm": 13.892812728881836,
1105
+ "learning_rate": 6.954143108945031e-06,
1106
+ "loss": 0.2401,
1107
+ "step": 76500
1108
+ },
1109
+ {
1110
+ "epoch": 3.485740153915799,
1111
+ "grad_norm": 10.377739906311035,
1112
+ "learning_rate": 6.930063666047023e-06,
1113
+ "loss": 0.2376,
1114
+ "step": 77000
1115
+ },
1116
+ {
1117
+ "epoch": 3.5083748302399274,
1118
+ "grad_norm": 25.727222442626953,
1119
+ "learning_rate": 6.905984223149014e-06,
1120
+ "loss": 0.2446,
1121
+ "step": 77500
1122
+ },
1123
+ {
1124
+ "epoch": 3.531009506564056,
1125
+ "grad_norm": 21.669788360595703,
1126
+ "learning_rate": 6.881904780251004e-06,
1127
+ "loss": 0.2301,
1128
+ "step": 78000
1129
+ },
1130
+ {
1131
+ "epoch": 3.553644182888185,
1132
+ "grad_norm": 15.954744338989258,
1133
+ "learning_rate": 6.857825337352996e-06,
1134
+ "loss": 0.2325,
1135
+ "step": 78500
1136
+ },
1137
+ {
1138
+ "epoch": 3.576278859212313,
1139
+ "grad_norm": 20.296842575073242,
1140
+ "learning_rate": 6.8337458944549864e-06,
1141
+ "loss": 0.2405,
1142
+ "step": 79000
1143
+ },
1144
+ {
1145
+ "epoch": 3.598913535536442,
1146
+ "grad_norm": 21.685443878173828,
1147
+ "learning_rate": 6.809666451556978e-06,
1148
+ "loss": 0.2223,
1149
+ "step": 79500
1150
+ },
1151
+ {
1152
+ "epoch": 3.6215482118605706,
1153
+ "grad_norm": 20.43084144592285,
1154
+ "learning_rate": 6.785587008658968e-06,
1155
+ "loss": 0.2323,
1156
+ "step": 80000
1157
+ },
1158
+ {
1159
+ "epoch": 3.644182888184699,
1160
+ "grad_norm": 25.059410095214844,
1161
+ "learning_rate": 6.761507565760959e-06,
1162
+ "loss": 0.2429,
1163
+ "step": 80500
1164
+ },
1165
+ {
1166
+ "epoch": 3.6668175645088277,
1167
+ "grad_norm": 7.98044490814209,
1168
+ "learning_rate": 6.73742812286295e-06,
1169
+ "loss": 0.2337,
1170
+ "step": 81000
1171
+ },
1172
+ {
1173
+ "epoch": 3.689452240832956,
1174
+ "grad_norm": 16.71809196472168,
1175
+ "learning_rate": 6.713348679964941e-06,
1176
+ "loss": 0.2415,
1177
+ "step": 81500
1178
+ },
1179
+ {
1180
+ "epoch": 3.7120869171570847,
1181
+ "grad_norm": 10.189652442932129,
1182
+ "learning_rate": 6.689269237066932e-06,
1183
+ "loss": 0.2398,
1184
+ "step": 82000
1185
+ },
1186
+ {
1187
+ "epoch": 3.734721593481213,
1188
+ "grad_norm": 11.444443702697754,
1189
+ "learning_rate": 6.665189794168922e-06,
1190
+ "loss": 0.242,
1191
+ "step": 82500
1192
+ },
1193
+ {
1194
+ "epoch": 3.7573562698053418,
1195
+ "grad_norm": 26.375669479370117,
1196
+ "learning_rate": 6.641110351270914e-06,
1197
+ "loss": 0.238,
1198
+ "step": 83000
1199
+ },
1200
+ {
1201
+ "epoch": 3.7799909461294705,
1202
+ "grad_norm": 18.25746726989746,
1203
+ "learning_rate": 6.617030908372905e-06,
1204
+ "loss": 0.2372,
1205
+ "step": 83500
1206
+ },
1207
+ {
1208
+ "epoch": 3.802625622453599,
1209
+ "grad_norm": 24.402315139770508,
1210
+ "learning_rate": 6.592951465474895e-06,
1211
+ "loss": 0.2251,
1212
+ "step": 84000
1213
+ },
1214
+ {
1215
+ "epoch": 3.8252602987777276,
1216
+ "grad_norm": 9.18320083618164,
1217
+ "learning_rate": 6.5688720225768865e-06,
1218
+ "loss": 0.2419,
1219
+ "step": 84500
1220
+ },
1221
+ {
1222
+ "epoch": 3.8478949751018563,
1223
+ "grad_norm": 16.251399993896484,
1224
+ "learning_rate": 6.544792579678877e-06,
1225
+ "loss": 0.2447,
1226
+ "step": 85000
1227
+ },
1228
+ {
1229
+ "epoch": 3.8705296514259846,
1230
+ "grad_norm": 18.18793296813965,
1231
+ "learning_rate": 6.520713136780868e-06,
1232
+ "loss": 0.2388,
1233
+ "step": 85500
1234
+ },
1235
+ {
1236
+ "epoch": 3.893164327750113,
1237
+ "grad_norm": 6.440770626068115,
1238
+ "learning_rate": 6.496633693882859e-06,
1239
+ "loss": 0.2349,
1240
+ "step": 86000
1241
+ },
1242
+ {
1243
+ "epoch": 3.9157990040742416,
1244
+ "grad_norm": 23.809585571289062,
1245
+ "learning_rate": 6.472554250984849e-06,
1246
+ "loss": 0.2432,
1247
+ "step": 86500
1248
+ },
1249
+ {
1250
+ "epoch": 3.9384336803983704,
1251
+ "grad_norm": 13.243400573730469,
1252
+ "learning_rate": 6.448474808086841e-06,
1253
+ "loss": 0.2472,
1254
+ "step": 87000
1255
+ },
1256
+ {
1257
+ "epoch": 3.9610683567224987,
1258
+ "grad_norm": 4.604409694671631,
1259
+ "learning_rate": 6.4243953651888315e-06,
1260
+ "loss": 0.2508,
1261
+ "step": 87500
1262
+ },
1263
+ {
1264
+ "epoch": 3.9837030330466274,
1265
+ "grad_norm": 16.728988647460938,
1266
+ "learning_rate": 6.400315922290823e-06,
1267
+ "loss": 0.2401,
1268
+ "step": 88000
1269
+ },
1270
+ {
1271
+ "epoch": 4.0,
1272
+ "eval_accuracy": 0.82967074940796,
1273
+ "eval_loss": 0.5707918405532837,
1274
+ "eval_runtime": 25.1516,
1275
+ "eval_samples_per_second": 1561.373,
1276
+ "eval_steps_per_second": 97.608,
1277
+ "step": 88360
1278
+ },
1279
+ {
1280
+ "epoch": 4.006337709370756,
1281
+ "grad_norm": 12.84592342376709,
1282
+ "learning_rate": 6.376236479392813e-06,
1283
+ "loss": 0.2232,
1284
+ "step": 88500
1285
+ },
1286
+ {
1287
+ "epoch": 4.0289723856948845,
1288
+ "grad_norm": 35.41975021362305,
1289
+ "learning_rate": 6.352157036494804e-06,
1290
+ "loss": 0.169,
1291
+ "step": 89000
1292
+ },
1293
+ {
1294
+ "epoch": 4.051607062019013,
1295
+ "grad_norm": 9.213374137878418,
1296
+ "learning_rate": 6.328077593596796e-06,
1297
+ "loss": 0.1641,
1298
+ "step": 89500
1299
+ },
1300
+ {
1301
+ "epoch": 4.074241738343142,
1302
+ "grad_norm": 12.796171188354492,
1303
+ "learning_rate": 6.303998150698786e-06,
1304
+ "loss": 0.1806,
1305
+ "step": 90000
1306
+ },
1307
+ {
1308
+ "epoch": 4.09687641466727,
1309
+ "grad_norm": 2.487410068511963,
1310
+ "learning_rate": 6.279918707800777e-06,
1311
+ "loss": 0.1846,
1312
+ "step": 90500
1313
+ },
1314
+ {
1315
+ "epoch": 4.119511090991399,
1316
+ "grad_norm": 30.724252700805664,
1317
+ "learning_rate": 6.255839264902767e-06,
1318
+ "loss": 0.1741,
1319
+ "step": 91000
1320
+ },
1321
+ {
1322
+ "epoch": 4.142145767315528,
1323
+ "grad_norm": 13.641295433044434,
1324
+ "learning_rate": 6.231759822004758e-06,
1325
+ "loss": 0.1817,
1326
+ "step": 91500
1327
+ },
1328
+ {
1329
+ "epoch": 4.164780443639656,
1330
+ "grad_norm": 4.760425567626953,
1331
+ "learning_rate": 6.20768037910675e-06,
1332
+ "loss": 0.1886,
1333
+ "step": 92000
1334
+ },
1335
+ {
1336
+ "epoch": 4.187415119963784,
1337
+ "grad_norm": 17.544599533081055,
1338
+ "learning_rate": 6.18360093620874e-06,
1339
+ "loss": 0.1895,
1340
+ "step": 92500
1341
+ },
1342
+ {
1343
+ "epoch": 4.2100497962879135,
1344
+ "grad_norm": 22.361560821533203,
1345
+ "learning_rate": 6.1595214933107315e-06,
1346
+ "loss": 0.1768,
1347
+ "step": 93000
1348
+ },
1349
+ {
1350
+ "epoch": 4.232684472612042,
1351
+ "grad_norm": 0.7933844327926636,
1352
+ "learning_rate": 6.135442050412722e-06,
1353
+ "loss": 0.1835,
1354
+ "step": 93500
1355
+ },
1356
+ {
1357
+ "epoch": 4.25531914893617,
1358
+ "grad_norm": 21.576366424560547,
1359
+ "learning_rate": 6.111362607514712e-06,
1360
+ "loss": 0.1769,
1361
+ "step": 94000
1362
+ },
1363
+ {
1364
+ "epoch": 4.277953825260298,
1365
+ "grad_norm": 19.043777465820312,
1366
+ "learning_rate": 6.087283164616704e-06,
1367
+ "loss": 0.191,
1368
+ "step": 94500
1369
+ },
1370
+ {
1371
+ "epoch": 4.300588501584428,
1372
+ "grad_norm": 34.29130935668945,
1373
+ "learning_rate": 6.063203721718695e-06,
1374
+ "loss": 0.1764,
1375
+ "step": 95000
1376
+ },
1377
+ {
1378
+ "epoch": 4.323223177908556,
1379
+ "grad_norm": 3.8550281524658203,
1380
+ "learning_rate": 6.039124278820686e-06,
1381
+ "loss": 0.1808,
1382
+ "step": 95500
1383
+ },
1384
+ {
1385
+ "epoch": 4.345857854232684,
1386
+ "grad_norm": 6.541236877441406,
1387
+ "learning_rate": 6.0150448359226765e-06,
1388
+ "loss": 0.1909,
1389
+ "step": 96000
1390
+ },
1391
+ {
1392
+ "epoch": 4.368492530556813,
1393
+ "grad_norm": 2.227952480316162,
1394
+ "learning_rate": 5.990965393024667e-06,
1395
+ "loss": 0.1824,
1396
+ "step": 96500
1397
+ },
1398
+ {
1399
+ "epoch": 4.391127206880942,
1400
+ "grad_norm": 10.062329292297363,
1401
+ "learning_rate": 5.966885950126658e-06,
1402
+ "loss": 0.1935,
1403
+ "step": 97000
1404
+ },
1405
+ {
1406
+ "epoch": 4.41376188320507,
1407
+ "grad_norm": 24.6075439453125,
1408
+ "learning_rate": 5.942806507228649e-06,
1409
+ "loss": 0.1785,
1410
+ "step": 97500
1411
+ },
1412
+ {
1413
+ "epoch": 4.436396559529198,
1414
+ "grad_norm": 7.842912197113037,
1415
+ "learning_rate": 5.918727064330641e-06,
1416
+ "loss": 0.1875,
1417
+ "step": 98000
1418
+ },
1419
+ {
1420
+ "epoch": 4.4590312358533275,
1421
+ "grad_norm": 61.13163757324219,
1422
+ "learning_rate": 5.894647621432631e-06,
1423
+ "loss": 0.1917,
1424
+ "step": 98500
1425
+ },
1426
+ {
1427
+ "epoch": 4.481665912177456,
1428
+ "grad_norm": 18.254608154296875,
1429
+ "learning_rate": 5.8705681785346215e-06,
1430
+ "loss": 0.1954,
1431
+ "step": 99000
1432
+ },
1433
+ {
1434
+ "epoch": 4.504300588501584,
1435
+ "grad_norm": 24.859031677246094,
1436
+ "learning_rate": 5.846488735636613e-06,
1437
+ "loss": 0.1916,
1438
+ "step": 99500
1439
+ },
1440
+ {
1441
+ "epoch": 4.526935264825713,
1442
+ "grad_norm": 28.81034278869629,
1443
+ "learning_rate": 5.822409292738603e-06,
1444
+ "loss": 0.1829,
1445
+ "step": 100000
1446
+ },
1447
+ {
1448
+ "epoch": 4.549569941149842,
1449
+ "grad_norm": 46.6638069152832,
1450
+ "learning_rate": 5.798329849840595e-06,
1451
+ "loss": 0.176,
1452
+ "step": 100500
1453
+ },
1454
+ {
1455
+ "epoch": 4.57220461747397,
1456
+ "grad_norm": 76.71381378173828,
1457
+ "learning_rate": 5.774250406942586e-06,
1458
+ "loss": 0.1879,
1459
+ "step": 101000
1460
+ },
1461
+ {
1462
+ "epoch": 4.594839293798099,
1463
+ "grad_norm": 1.6686662435531616,
1464
+ "learning_rate": 5.7501709640445765e-06,
1465
+ "loss": 0.199,
1466
+ "step": 101500
1467
+ },
1468
+ {
1469
+ "epoch": 4.617473970122227,
1470
+ "grad_norm": 39.20078659057617,
1471
+ "learning_rate": 5.726091521146567e-06,
1472
+ "loss": 0.18,
1473
+ "step": 102000
1474
+ },
1475
+ {
1476
+ "epoch": 4.640108646446356,
1477
+ "grad_norm": 10.581939697265625,
1478
+ "learning_rate": 5.702012078248557e-06,
1479
+ "loss": 0.1829,
1480
+ "step": 102500
1481
+ },
1482
+ {
1483
+ "epoch": 4.662743322770485,
1484
+ "grad_norm": 41.204193115234375,
1485
+ "learning_rate": 5.677932635350549e-06,
1486
+ "loss": 0.196,
1487
+ "step": 103000
1488
+ },
1489
+ {
1490
+ "epoch": 4.685377999094613,
1491
+ "grad_norm": 0.44744327664375305,
1492
+ "learning_rate": 5.65385319245254e-06,
1493
+ "loss": 0.2079,
1494
+ "step": 103500
1495
+ },
1496
+ {
1497
+ "epoch": 4.7080126754187415,
1498
+ "grad_norm": 56.338253021240234,
1499
+ "learning_rate": 5.6297737495545315e-06,
1500
+ "loss": 0.1768,
1501
+ "step": 104000
1502
+ },
1503
+ {
1504
+ "epoch": 4.73064735174287,
1505
+ "grad_norm": 35.820560455322266,
1506
+ "learning_rate": 5.6056943066565215e-06,
1507
+ "loss": 0.1828,
1508
+ "step": 104500
1509
+ },
1510
+ {
1511
+ "epoch": 4.753282028066999,
1512
+ "grad_norm": 24.01755142211914,
1513
+ "learning_rate": 5.581614863758512e-06,
1514
+ "loss": 0.1933,
1515
+ "step": 105000
1516
+ },
1517
+ {
1518
+ "epoch": 4.775916704391127,
1519
+ "grad_norm": 0.4996182322502136,
1520
+ "learning_rate": 5.557535420860504e-06,
1521
+ "loss": 0.1958,
1522
+ "step": 105500
1523
+ },
1524
+ {
1525
+ "epoch": 4.7985513807152556,
1526
+ "grad_norm": 14.128357887268066,
1527
+ "learning_rate": 5.533455977962494e-06,
1528
+ "loss": 0.1854,
1529
+ "step": 106000
1530
+ },
1531
+ {
1532
+ "epoch": 4.821186057039384,
1533
+ "grad_norm": 32.995784759521484,
1534
+ "learning_rate": 5.509376535064486e-06,
1535
+ "loss": 0.1957,
1536
+ "step": 106500
1537
+ },
1538
+ {
1539
+ "epoch": 4.843820733363513,
1540
+ "grad_norm": 33.00492477416992,
1541
+ "learning_rate": 5.485297092166476e-06,
1542
+ "loss": 0.2023,
1543
+ "step": 107000
1544
+ },
1545
+ {
1546
+ "epoch": 4.866455409687641,
1547
+ "grad_norm": 12.003284454345703,
1548
+ "learning_rate": 5.4612176492684665e-06,
1549
+ "loss": 0.1922,
1550
+ "step": 107500
1551
+ },
1552
+ {
1553
+ "epoch": 4.88909008601177,
1554
+ "grad_norm": 48.15534210205078,
1555
+ "learning_rate": 5.437138206370458e-06,
1556
+ "loss": 0.1797,
1557
+ "step": 108000
1558
+ },
1559
+ {
1560
+ "epoch": 4.911724762335899,
1561
+ "grad_norm": 9.628497123718262,
1562
+ "learning_rate": 5.413058763472448e-06,
1563
+ "loss": 0.189,
1564
+ "step": 108500
1565
+ },
1566
+ {
1567
+ "epoch": 4.934359438660027,
1568
+ "grad_norm": 1.054288387298584,
1569
+ "learning_rate": 5.38897932057444e-06,
1570
+ "loss": 0.1942,
1571
+ "step": 109000
1572
+ },
1573
+ {
1574
+ "epoch": 4.956994114984155,
1575
+ "grad_norm": 1.9454457759857178,
1576
+ "learning_rate": 5.364899877676431e-06,
1577
+ "loss": 0.1981,
1578
+ "step": 109500
1579
+ },
1580
+ {
1581
+ "epoch": 4.979628791308285,
1582
+ "grad_norm": 12.427486419677734,
1583
+ "learning_rate": 5.340820434778421e-06,
1584
+ "loss": 0.1909,
1585
+ "step": 110000
1586
+ },
1587
+ {
1588
+ "epoch": 5.0,
1589
+ "eval_accuracy": 0.8328792238547529,
1590
+ "eval_loss": 0.7050167918205261,
1591
+ "eval_runtime": 25.1358,
1592
+ "eval_samples_per_second": 1562.355,
1593
+ "eval_steps_per_second": 97.67,
1594
+ "step": 110450
1595
+ },
1596
+ {
1597
+ "epoch": 5.002263467632413,
1598
+ "grad_norm": 6.582240581512451,
1599
+ "learning_rate": 5.316740991880412e-06,
1600
+ "loss": 0.1962,
1601
+ "step": 110500
1602
+ },
1603
+ {
1604
+ "epoch": 5.024898143956541,
1605
+ "grad_norm": 0.4056040048599243,
1606
+ "learning_rate": 5.292661548982403e-06,
1607
+ "loss": 0.1323,
1608
+ "step": 111000
1609
+ },
1610
+ {
1611
+ "epoch": 5.04753282028067,
1612
+ "grad_norm": 0.18058906495571136,
1613
+ "learning_rate": 5.268582106084394e-06,
1614
+ "loss": 0.1383,
1615
+ "step": 111500
1616
+ },
1617
+ {
1618
+ "epoch": 5.070167496604799,
1619
+ "grad_norm": 0.056140393018722534,
1620
+ "learning_rate": 5.244502663186385e-06,
1621
+ "loss": 0.1325,
1622
+ "step": 112000
1623
+ },
1624
+ {
1625
+ "epoch": 5.092802172928927,
1626
+ "grad_norm": 20.949678421020508,
1627
+ "learning_rate": 5.220423220288376e-06,
1628
+ "loss": 0.1564,
1629
+ "step": 112500
1630
+ },
1631
+ {
1632
+ "epoch": 5.115436849253055,
1633
+ "grad_norm": 7.811840534210205,
1634
+ "learning_rate": 5.1963437773903666e-06,
1635
+ "loss": 0.1415,
1636
+ "step": 113000
1637
+ },
1638
+ {
1639
+ "epoch": 5.1380715255771845,
1640
+ "grad_norm": 0.05319051817059517,
1641
+ "learning_rate": 5.172264334492357e-06,
1642
+ "loss": 0.1566,
1643
+ "step": 113500
1644
+ },
1645
+ {
1646
+ "epoch": 5.160706201901313,
1647
+ "grad_norm": 49.17327880859375,
1648
+ "learning_rate": 5.148184891594349e-06,
1649
+ "loss": 0.1564,
1650
+ "step": 114000
1651
+ },
1652
+ {
1653
+ "epoch": 5.183340878225441,
1654
+ "grad_norm": 44.031063079833984,
1655
+ "learning_rate": 5.124105448696339e-06,
1656
+ "loss": 0.1202,
1657
+ "step": 114500
1658
+ },
1659
+ {
1660
+ "epoch": 5.20597555454957,
1661
+ "grad_norm": 12.423316955566406,
1662
+ "learning_rate": 5.10002600579833e-06,
1663
+ "loss": 0.1553,
1664
+ "step": 115000
1665
+ },
1666
+ {
1667
+ "epoch": 5.228610230873699,
1668
+ "grad_norm": 0.1754394769668579,
1669
+ "learning_rate": 5.075946562900322e-06,
1670
+ "loss": 0.1381,
1671
+ "step": 115500
1672
+ },
1673
+ {
1674
+ "epoch": 5.251244907197827,
1675
+ "grad_norm": 9.577567100524902,
1676
+ "learning_rate": 5.0518671200023116e-06,
1677
+ "loss": 0.1457,
1678
+ "step": 116000
1679
+ },
1680
+ {
1681
+ "epoch": 5.273879583521955,
1682
+ "grad_norm": 106.29130554199219,
1683
+ "learning_rate": 5.027787677104303e-06,
1684
+ "loss": 0.1527,
1685
+ "step": 116500
1686
+ },
1687
+ {
1688
+ "epoch": 5.296514259846084,
1689
+ "grad_norm": 20.000438690185547,
1690
+ "learning_rate": 5.003708234206294e-06,
1691
+ "loss": 0.1527,
1692
+ "step": 117000
1693
+ },
1694
+ {
1695
+ "epoch": 5.319148936170213,
1696
+ "grad_norm": 20.148639678955078,
1697
+ "learning_rate": 4.979628791308285e-06,
1698
+ "loss": 0.1424,
1699
+ "step": 117500
1700
+ },
1701
+ {
1702
+ "epoch": 5.341783612494341,
1703
+ "grad_norm": 0.2889052629470825,
1704
+ "learning_rate": 4.955549348410276e-06,
1705
+ "loss": 0.1616,
1706
+ "step": 118000
1707
+ },
1708
+ {
1709
+ "epoch": 5.36441828881847,
1710
+ "grad_norm": 0.16069917380809784,
1711
+ "learning_rate": 4.931469905512267e-06,
1712
+ "loss": 0.158,
1713
+ "step": 118500
1714
+ },
1715
+ {
1716
+ "epoch": 5.3870529651425985,
1717
+ "grad_norm": 1.3615031242370605,
1718
+ "learning_rate": 4.9073904626142574e-06,
1719
+ "loss": 0.1527,
1720
+ "step": 119000
1721
+ },
1722
+ {
1723
+ "epoch": 5.409687641466727,
1724
+ "grad_norm": 58.8338737487793,
1725
+ "learning_rate": 4.883311019716248e-06,
1726
+ "loss": 0.1387,
1727
+ "step": 119500
1728
+ },
1729
+ {
1730
+ "epoch": 5.432322317790856,
1731
+ "grad_norm": 0.8877764344215393,
1732
+ "learning_rate": 4.859231576818239e-06,
1733
+ "loss": 0.1394,
1734
+ "step": 120000
1735
+ },
1736
+ {
1737
+ "epoch": 5.454956994114984,
1738
+ "grad_norm": 30.472936630249023,
1739
+ "learning_rate": 4.83515213392023e-06,
1740
+ "loss": 0.1544,
1741
+ "step": 120500
1742
+ },
1743
+ {
1744
+ "epoch": 5.4775916704391125,
1745
+ "grad_norm": 0.35457274317741394,
1746
+ "learning_rate": 4.811072691022221e-06,
1747
+ "loss": 0.1551,
1748
+ "step": 121000
1749
+ },
1750
+ {
1751
+ "epoch": 5.500226346763242,
1752
+ "grad_norm": 0.993865430355072,
1753
+ "learning_rate": 4.7869932481242124e-06,
1754
+ "loss": 0.1707,
1755
+ "step": 121500
1756
+ },
1757
+ {
1758
+ "epoch": 5.52286102308737,
1759
+ "grad_norm": 62.242698669433594,
1760
+ "learning_rate": 4.7629138052262024e-06,
1761
+ "loss": 0.1433,
1762
+ "step": 122000
1763
+ },
1764
+ {
1765
+ "epoch": 5.545495699411498,
1766
+ "grad_norm": 35.37886047363281,
1767
+ "learning_rate": 4.738834362328193e-06,
1768
+ "loss": 0.1509,
1769
+ "step": 122500
1770
+ },
1771
+ {
1772
+ "epoch": 5.568130375735627,
1773
+ "grad_norm": 11.451070785522461,
1774
+ "learning_rate": 4.714754919430184e-06,
1775
+ "loss": 0.1392,
1776
+ "step": 123000
1777
+ },
1778
+ {
1779
+ "epoch": 5.590765052059756,
1780
+ "grad_norm": 22.20694923400879,
1781
+ "learning_rate": 4.690675476532176e-06,
1782
+ "loss": 0.1572,
1783
+ "step": 123500
1784
+ },
1785
+ {
1786
+ "epoch": 5.613399728383884,
1787
+ "grad_norm": 18.609609603881836,
1788
+ "learning_rate": 4.666596033634167e-06,
1789
+ "loss": 0.1598,
1790
+ "step": 124000
1791
+ },
1792
+ {
1793
+ "epoch": 5.636034404708012,
1794
+ "grad_norm": 15.019495010375977,
1795
+ "learning_rate": 4.642516590736157e-06,
1796
+ "loss": 0.1582,
1797
+ "step": 124500
1798
+ },
1799
+ {
1800
+ "epoch": 5.658669081032142,
1801
+ "grad_norm": 1.4315303564071655,
1802
+ "learning_rate": 4.618437147838148e-06,
1803
+ "loss": 0.1515,
1804
+ "step": 125000
1805
+ },
1806
+ {
1807
+ "epoch": 5.68130375735627,
1808
+ "grad_norm": 29.807109832763672,
1809
+ "learning_rate": 4.594357704940139e-06,
1810
+ "loss": 0.1471,
1811
+ "step": 125500
1812
+ },
1813
+ {
1814
+ "epoch": 5.703938433680398,
1815
+ "grad_norm": 0.13336171209812164,
1816
+ "learning_rate": 4.57027826204213e-06,
1817
+ "loss": 0.1294,
1818
+ "step": 126000
1819
+ },
1820
+ {
1821
+ "epoch": 5.7265731100045265,
1822
+ "grad_norm": 12.578352928161621,
1823
+ "learning_rate": 4.546198819144121e-06,
1824
+ "loss": 0.1653,
1825
+ "step": 126500
1826
+ },
1827
+ {
1828
+ "epoch": 5.749207786328656,
1829
+ "grad_norm": 67.32268524169922,
1830
+ "learning_rate": 4.522119376246112e-06,
1831
+ "loss": 0.1358,
1832
+ "step": 127000
1833
+ },
1834
+ {
1835
+ "epoch": 5.771842462652784,
1836
+ "grad_norm": 13.508434295654297,
1837
+ "learning_rate": 4.4980399333481025e-06,
1838
+ "loss": 0.1592,
1839
+ "step": 127500
1840
+ },
1841
+ {
1842
+ "epoch": 5.794477138976912,
1843
+ "grad_norm": 47.93122863769531,
1844
+ "learning_rate": 4.473960490450093e-06,
1845
+ "loss": 0.167,
1846
+ "step": 128000
1847
+ },
1848
+ {
1849
+ "epoch": 5.8171118153010415,
1850
+ "grad_norm": 22.402637481689453,
1851
+ "learning_rate": 4.449881047552084e-06,
1852
+ "loss": 0.1565,
1853
+ "step": 128500
1854
+ },
1855
+ {
1856
+ "epoch": 5.83974649162517,
1857
+ "grad_norm": 1.559098720550537,
1858
+ "learning_rate": 4.425801604654075e-06,
1859
+ "loss": 0.1476,
1860
+ "step": 129000
1861
+ },
1862
+ {
1863
+ "epoch": 5.862381167949298,
1864
+ "grad_norm": 0.17111951112747192,
1865
+ "learning_rate": 4.401722161756066e-06,
1866
+ "loss": 0.1579,
1867
+ "step": 129500
1868
+ },
1869
+ {
1870
+ "epoch": 5.885015844273427,
1871
+ "grad_norm": 39.580440521240234,
1872
+ "learning_rate": 4.377642718858057e-06,
1873
+ "loss": 0.1617,
1874
+ "step": 130000
1875
+ },
1876
+ {
1877
+ "epoch": 5.907650520597556,
1878
+ "grad_norm": 0.1319379210472107,
1879
+ "learning_rate": 4.3535632759600475e-06,
1880
+ "loss": 0.1687,
1881
+ "step": 130500
1882
+ },
1883
+ {
1884
+ "epoch": 5.930285196921684,
1885
+ "grad_norm": 0.8287597894668579,
1886
+ "learning_rate": 4.329483833062038e-06,
1887
+ "loss": 0.1587,
1888
+ "step": 131000
1889
+ },
1890
+ {
1891
+ "epoch": 5.952919873245813,
1892
+ "grad_norm": 4.215353488922119,
1893
+ "learning_rate": 4.30540439016403e-06,
1894
+ "loss": 0.1614,
1895
+ "step": 131500
1896
+ },
1897
+ {
1898
+ "epoch": 5.975554549569941,
1899
+ "grad_norm": 40.749576568603516,
1900
+ "learning_rate": 4.281324947266021e-06,
1901
+ "loss": 0.1765,
1902
+ "step": 132000
1903
+ },
1904
+ {
1905
+ "epoch": 5.99818922589407,
1906
+ "grad_norm": 0.21512405574321747,
1907
+ "learning_rate": 4.257245504368011e-06,
1908
+ "loss": 0.1484,
1909
+ "step": 132500
1910
+ },
1911
+ {
1912
+ "epoch": 6.0,
1913
+ "eval_accuracy": 0.8314786992946449,
1914
+ "eval_loss": 0.8563234806060791,
1915
+ "eval_runtime": 25.1644,
1916
+ "eval_samples_per_second": 1560.576,
1917
+ "eval_steps_per_second": 97.558,
1918
+ "step": 132540
1919
+ },
1920
+ {
1921
+ "epoch": 6.020823902218198,
1922
+ "grad_norm": 1.4446039199829102,
1923
+ "learning_rate": 4.2331660614700025e-06,
1924
+ "loss": 0.1143,
1925
+ "step": 133000
1926
+ },
1927
+ {
1928
+ "epoch": 6.043458578542327,
1929
+ "grad_norm": 0.6583614945411682,
1930
+ "learning_rate": 4.209086618571993e-06,
1931
+ "loss": 0.1107,
1932
+ "step": 133500
1933
+ },
1934
+ {
1935
+ "epoch": 6.0660932548664555,
1936
+ "grad_norm": 28.823888778686523,
1937
+ "learning_rate": 4.185007175673984e-06,
1938
+ "loss": 0.1086,
1939
+ "step": 134000
1940
+ },
1941
+ {
1942
+ "epoch": 6.088727931190584,
1943
+ "grad_norm": 42.349857330322266,
1944
+ "learning_rate": 4.160927732775975e-06,
1945
+ "loss": 0.1161,
1946
+ "step": 134500
1947
+ },
1948
+ {
1949
+ "epoch": 6.111362607514713,
1950
+ "grad_norm": 93.61304473876953,
1951
+ "learning_rate": 4.136848289877966e-06,
1952
+ "loss": 0.1281,
1953
+ "step": 135000
1954
+ },
1955
+ {
1956
+ "epoch": 6.133997283838841,
1957
+ "grad_norm": 24.18094253540039,
1958
+ "learning_rate": 4.112768846979957e-06,
1959
+ "loss": 0.1161,
1960
+ "step": 135500
1961
+ },
1962
+ {
1963
+ "epoch": 6.1566319601629695,
1964
+ "grad_norm": 2.7703983783721924,
1965
+ "learning_rate": 4.0886894040819475e-06,
1966
+ "loss": 0.1208,
1967
+ "step": 136000
1968
+ },
1969
+ {
1970
+ "epoch": 6.179266636487098,
1971
+ "grad_norm": 2.829185962677002,
1972
+ "learning_rate": 4.064609961183938e-06,
1973
+ "loss": 0.1299,
1974
+ "step": 136500
1975
+ },
1976
+ {
1977
+ "epoch": 6.201901312811227,
1978
+ "grad_norm": 0.07901770621538162,
1979
+ "learning_rate": 4.040530518285929e-06,
1980
+ "loss": 0.1222,
1981
+ "step": 137000
1982
+ },
1983
+ {
1984
+ "epoch": 6.224535989135355,
1985
+ "grad_norm": 17.905927658081055,
1986
+ "learning_rate": 4.01645107538792e-06,
1987
+ "loss": 0.1104,
1988
+ "step": 137500
1989
+ },
1990
+ {
1991
+ "epoch": 6.247170665459484,
1992
+ "grad_norm": 38.864986419677734,
1993
+ "learning_rate": 3.992371632489911e-06,
1994
+ "loss": 0.1231,
1995
+ "step": 138000
1996
+ },
1997
+ {
1998
+ "epoch": 6.269805341783613,
1999
+ "grad_norm": 0.39027053117752075,
2000
+ "learning_rate": 3.968292189591902e-06,
2001
+ "loss": 0.1131,
2002
+ "step": 138500
2003
+ },
2004
+ {
2005
+ "epoch": 6.292440018107741,
2006
+ "grad_norm": 6.372051239013672,
2007
+ "learning_rate": 3.9442127466938925e-06,
2008
+ "loss": 0.1261,
2009
+ "step": 139000
2010
+ },
2011
+ {
2012
+ "epoch": 6.315074694431869,
2013
+ "grad_norm": 0.10187414288520813,
2014
+ "learning_rate": 3.920133303795884e-06,
2015
+ "loss": 0.1207,
2016
+ "step": 139500
2017
+ },
2018
+ {
2019
+ "epoch": 6.337709370755999,
2020
+ "grad_norm": 0.5153215527534485,
2021
+ "learning_rate": 3.896053860897875e-06,
2022
+ "loss": 0.1101,
2023
+ "step": 140000
2024
+ },
2025
+ {
2026
+ "epoch": 6.360344047080127,
2027
+ "grad_norm": 0.06458627432584763,
2028
+ "learning_rate": 3.871974417999865e-06,
2029
+ "loss": 0.1293,
2030
+ "step": 140500
2031
+ },
2032
+ {
2033
+ "epoch": 6.382978723404255,
2034
+ "grad_norm": 28.89288902282715,
2035
+ "learning_rate": 3.847894975101857e-06,
2036
+ "loss": 0.1217,
2037
+ "step": 141000
2038
+ },
2039
+ {
2040
+ "epoch": 6.4056133997283835,
2041
+ "grad_norm": 14.432846069335938,
2042
+ "learning_rate": 3.8238155322038475e-06,
2043
+ "loss": 0.1259,
2044
+ "step": 141500
2045
+ },
2046
+ {
2047
+ "epoch": 6.428248076052513,
2048
+ "grad_norm": 16.554767608642578,
2049
+ "learning_rate": 3.7997360893058384e-06,
2050
+ "loss": 0.123,
2051
+ "step": 142000
2052
+ },
2053
+ {
2054
+ "epoch": 6.450882752376641,
2055
+ "grad_norm": 0.031149500980973244,
2056
+ "learning_rate": 3.775656646407829e-06,
2057
+ "loss": 0.1262,
2058
+ "step": 142500
2059
+ },
2060
+ {
2061
+ "epoch": 6.473517428700769,
2062
+ "grad_norm": 4.206363677978516,
2063
+ "learning_rate": 3.7515772035098196e-06,
2064
+ "loss": 0.1185,
2065
+ "step": 143000
2066
+ },
2067
+ {
2068
+ "epoch": 6.4961521050248985,
2069
+ "grad_norm": 75.40314483642578,
2070
+ "learning_rate": 3.727497760611811e-06,
2071
+ "loss": 0.1296,
2072
+ "step": 143500
2073
+ },
2074
+ {
2075
+ "epoch": 6.518786781349027,
2076
+ "grad_norm": 59.042388916015625,
2077
+ "learning_rate": 3.7034183177138017e-06,
2078
+ "loss": 0.1264,
2079
+ "step": 144000
2080
+ },
2081
+ {
2082
+ "epoch": 6.541421457673155,
2083
+ "grad_norm": 0.23703084886074066,
2084
+ "learning_rate": 3.6793388748157925e-06,
2085
+ "loss": 0.1243,
2086
+ "step": 144500
2087
+ },
2088
+ {
2089
+ "epoch": 6.564056133997283,
2090
+ "grad_norm": 1.2637842893600464,
2091
+ "learning_rate": 3.6552594319177838e-06,
2092
+ "loss": 0.1207,
2093
+ "step": 145000
2094
+ },
2095
+ {
2096
+ "epoch": 6.586690810321413,
2097
+ "grad_norm": 0.08345487713813782,
2098
+ "learning_rate": 3.631179989019774e-06,
2099
+ "loss": 0.1439,
2100
+ "step": 145500
2101
+ },
2102
+ {
2103
+ "epoch": 6.609325486645541,
2104
+ "grad_norm": 1.2345664501190186,
2105
+ "learning_rate": 3.607100546121765e-06,
2106
+ "loss": 0.119,
2107
+ "step": 146000
2108
+ },
2109
+ {
2110
+ "epoch": 6.631960162969669,
2111
+ "grad_norm": 3.166825771331787,
2112
+ "learning_rate": 3.583021103223756e-06,
2113
+ "loss": 0.118,
2114
+ "step": 146500
2115
+ },
2116
+ {
2117
+ "epoch": 6.654594839293798,
2118
+ "grad_norm": 45.547122955322266,
2119
+ "learning_rate": 3.558941660325747e-06,
2120
+ "loss": 0.1302,
2121
+ "step": 147000
2122
+ },
2123
+ {
2124
+ "epoch": 6.677229515617927,
2125
+ "grad_norm": 0.10651729255914688,
2126
+ "learning_rate": 3.534862217427738e-06,
2127
+ "loss": 0.1293,
2128
+ "step": 147500
2129
+ },
2130
+ {
2131
+ "epoch": 6.699864191942055,
2132
+ "grad_norm": 40.16682815551758,
2133
+ "learning_rate": 3.5107827745297292e-06,
2134
+ "loss": 0.1274,
2135
+ "step": 148000
2136
+ },
2137
+ {
2138
+ "epoch": 6.722498868266184,
2139
+ "grad_norm": 0.051575180143117905,
2140
+ "learning_rate": 3.4867033316317196e-06,
2141
+ "loss": 0.1122,
2142
+ "step": 148500
2143
+ },
2144
+ {
2145
+ "epoch": 6.7451335445903124,
2146
+ "grad_norm": 0.2128569483757019,
2147
+ "learning_rate": 3.4626238887337105e-06,
2148
+ "loss": 0.1237,
2149
+ "step": 149000
2150
+ },
2151
+ {
2152
+ "epoch": 6.767768220914441,
2153
+ "grad_norm": 40.65754318237305,
2154
+ "learning_rate": 3.4385444458357013e-06,
2155
+ "loss": 0.1126,
2156
+ "step": 149500
2157
+ },
2158
+ {
2159
+ "epoch": 6.79040289723857,
2160
+ "grad_norm": 0.2590946853160858,
2161
+ "learning_rate": 3.4144650029376926e-06,
2162
+ "loss": 0.1222,
2163
+ "step": 150000
2164
+ },
2165
+ {
2166
+ "epoch": 6.813037573562698,
2167
+ "grad_norm": 36.63773727416992,
2168
+ "learning_rate": 3.3903855600396834e-06,
2169
+ "loss": 0.126,
2170
+ "step": 150500
2171
+ },
2172
+ {
2173
+ "epoch": 6.8356722498868265,
2174
+ "grad_norm": 0.037315912544727325,
2175
+ "learning_rate": 3.366306117141674e-06,
2176
+ "loss": 0.1098,
2177
+ "step": 151000
2178
+ },
2179
+ {
2180
+ "epoch": 6.858306926210955,
2181
+ "grad_norm": 0.37742793560028076,
2182
+ "learning_rate": 3.342226674243665e-06,
2183
+ "loss": 0.1172,
2184
+ "step": 151500
2185
+ },
2186
+ {
2187
+ "epoch": 6.880941602535084,
2188
+ "grad_norm": 0.050493206828832626,
2189
+ "learning_rate": 3.318147231345656e-06,
2190
+ "loss": 0.1261,
2191
+ "step": 152000
2192
+ },
2193
+ {
2194
+ "epoch": 6.903576278859212,
2195
+ "grad_norm": 48.29159164428711,
2196
+ "learning_rate": 3.2940677884476467e-06,
2197
+ "loss": 0.124,
2198
+ "step": 152500
2199
+ },
2200
+ {
2201
+ "epoch": 6.926210955183341,
2202
+ "grad_norm": 51.89788055419922,
2203
+ "learning_rate": 3.269988345549638e-06,
2204
+ "loss": 0.1253,
2205
+ "step": 153000
2206
+ },
2207
+ {
2208
+ "epoch": 6.94884563150747,
2209
+ "grad_norm": 0.9628204107284546,
2210
+ "learning_rate": 3.2459089026516284e-06,
2211
+ "loss": 0.1234,
2212
+ "step": 153500
2213
+ },
2214
+ {
2215
+ "epoch": 6.971480307831598,
2216
+ "grad_norm": 65.29854583740234,
2217
+ "learning_rate": 3.2218294597536192e-06,
2218
+ "loss": 0.1206,
2219
+ "step": 154000
2220
+ },
2221
+ {
2222
+ "epoch": 6.994114984155726,
2223
+ "grad_norm": 0.5797120332717896,
2224
+ "learning_rate": 3.19775001685561e-06,
2225
+ "loss": 0.1322,
2226
+ "step": 154500
2227
+ },
2228
+ {
2229
+ "epoch": 7.0,
2230
+ "eval_accuracy": 0.8291105395839169,
2231
+ "eval_loss": 0.9395027160644531,
2232
+ "eval_runtime": 25.1555,
2233
+ "eval_samples_per_second": 1561.129,
2234
+ "eval_steps_per_second": 97.593,
2235
+ "step": 154630
2236
+ },
2237
+ {
2238
+ "epoch": 7.016749660479855,
2239
+ "grad_norm": 61.21876525878906,
2240
+ "learning_rate": 3.1736705739576013e-06,
2241
+ "loss": 0.117,
2242
+ "step": 155000
2243
+ },
2244
+ {
2245
+ "epoch": 7.039384336803984,
2246
+ "grad_norm": 67.23644256591797,
2247
+ "learning_rate": 3.149591131059592e-06,
2248
+ "loss": 0.0929,
2249
+ "step": 155500
2250
+ },
2251
+ {
2252
+ "epoch": 7.062019013128112,
2253
+ "grad_norm": 0.02777719311416149,
2254
+ "learning_rate": 3.1255116881615826e-06,
2255
+ "loss": 0.092,
2256
+ "step": 156000
2257
+ },
2258
+ {
2259
+ "epoch": 7.0846536894522405,
2260
+ "grad_norm": 17.658966064453125,
2261
+ "learning_rate": 3.101432245263574e-06,
2262
+ "loss": 0.0928,
2263
+ "step": 156500
2264
+ },
2265
+ {
2266
+ "epoch": 7.10728836577637,
2267
+ "grad_norm": 0.0420079231262207,
2268
+ "learning_rate": 3.0773528023655647e-06,
2269
+ "loss": 0.0865,
2270
+ "step": 157000
2271
+ },
2272
+ {
2273
+ "epoch": 7.129923042100498,
2274
+ "grad_norm": 0.05451720952987671,
2275
+ "learning_rate": 3.0532733594675555e-06,
2276
+ "loss": 0.1069,
2277
+ "step": 157500
2278
+ },
2279
+ {
2280
+ "epoch": 7.152557718424626,
2281
+ "grad_norm": 32.596221923828125,
2282
+ "learning_rate": 3.0291939165695468e-06,
2283
+ "loss": 0.0961,
2284
+ "step": 158000
2285
+ },
2286
+ {
2287
+ "epoch": 7.1751923947487555,
2288
+ "grad_norm": 1.718901515007019,
2289
+ "learning_rate": 3.0051144736715376e-06,
2290
+ "loss": 0.1037,
2291
+ "step": 158500
2292
+ },
2293
+ {
2294
+ "epoch": 7.197827071072884,
2295
+ "grad_norm": 0.0346030667424202,
2296
+ "learning_rate": 2.981035030773528e-06,
2297
+ "loss": 0.0919,
2298
+ "step": 159000
2299
+ },
2300
+ {
2301
+ "epoch": 7.220461747397012,
2302
+ "grad_norm": 32.53134536743164,
2303
+ "learning_rate": 2.9569555878755193e-06,
2304
+ "loss": 0.1035,
2305
+ "step": 159500
2306
+ },
2307
+ {
2308
+ "epoch": 7.24309642372114,
2309
+ "grad_norm": 0.03724272549152374,
2310
+ "learning_rate": 2.93287614497751e-06,
2311
+ "loss": 0.1039,
2312
+ "step": 160000
2313
+ },
2314
+ {
2315
+ "epoch": 7.26573110004527,
2316
+ "grad_norm": 0.04580220580101013,
2317
+ "learning_rate": 2.908796702079501e-06,
2318
+ "loss": 0.0812,
2319
+ "step": 160500
2320
+ },
2321
+ {
2322
+ "epoch": 7.288365776369398,
2323
+ "grad_norm": 0.06553350389003754,
2324
+ "learning_rate": 2.884717259181492e-06,
2325
+ "loss": 0.0941,
2326
+ "step": 161000
2327
+ },
2328
+ {
2329
+ "epoch": 7.311000452693526,
2330
+ "grad_norm": 0.12175572663545609,
2331
+ "learning_rate": 2.8606378162834826e-06,
2332
+ "loss": 0.1116,
2333
+ "step": 161500
2334
+ },
2335
+ {
2336
+ "epoch": 7.333635129017655,
2337
+ "grad_norm": 51.450286865234375,
2338
+ "learning_rate": 2.8365583733854734e-06,
2339
+ "loss": 0.1057,
2340
+ "step": 162000
2341
+ },
2342
+ {
2343
+ "epoch": 7.356269805341784,
2344
+ "grad_norm": 0.1844579130411148,
2345
+ "learning_rate": 2.8124789304874643e-06,
2346
+ "loss": 0.0984,
2347
+ "step": 162500
2348
+ },
2349
+ {
2350
+ "epoch": 7.378904481665912,
2351
+ "grad_norm": 0.030503317713737488,
2352
+ "learning_rate": 2.7883994875894555e-06,
2353
+ "loss": 0.09,
2354
+ "step": 163000
2355
+ },
2356
+ {
2357
+ "epoch": 7.401539157990041,
2358
+ "grad_norm": 0.0265294648706913,
2359
+ "learning_rate": 2.7643200446914464e-06,
2360
+ "loss": 0.0896,
2361
+ "step": 163500
2362
+ },
2363
+ {
2364
+ "epoch": 7.424173834314169,
2365
+ "grad_norm": 5.12315559387207,
2366
+ "learning_rate": 2.7402406017934368e-06,
2367
+ "loss": 0.109,
2368
+ "step": 164000
2369
+ },
2370
+ {
2371
+ "epoch": 7.446808510638298,
2372
+ "grad_norm": 6.6580891609191895,
2373
+ "learning_rate": 2.716161158895428e-06,
2374
+ "loss": 0.1019,
2375
+ "step": 164500
2376
+ },
2377
+ {
2378
+ "epoch": 7.469443186962426,
2379
+ "grad_norm": 68.3709716796875,
2380
+ "learning_rate": 2.692081715997419e-06,
2381
+ "loss": 0.0963,
2382
+ "step": 165000
2383
+ },
2384
+ {
2385
+ "epoch": 7.492077863286555,
2386
+ "grad_norm": 9.346389770507812,
2387
+ "learning_rate": 2.6680022730994097e-06,
2388
+ "loss": 0.0944,
2389
+ "step": 165500
2390
+ },
2391
+ {
2392
+ "epoch": 7.5147125396106835,
2393
+ "grad_norm": 1.0322046279907227,
2394
+ "learning_rate": 2.643922830201401e-06,
2395
+ "loss": 0.0972,
2396
+ "step": 166000
2397
+ },
2398
+ {
2399
+ "epoch": 7.537347215934812,
2400
+ "grad_norm": 0.0681796446442604,
2401
+ "learning_rate": 2.6198433873033918e-06,
2402
+ "loss": 0.0937,
2403
+ "step": 166500
2404
+ },
2405
+ {
2406
+ "epoch": 7.559981892258941,
2407
+ "grad_norm": 7.419528484344482,
2408
+ "learning_rate": 2.595763944405382e-06,
2409
+ "loss": 0.0991,
2410
+ "step": 167000
2411
+ },
2412
+ {
2413
+ "epoch": 7.582616568583069,
2414
+ "grad_norm": 0.04105505347251892,
2415
+ "learning_rate": 2.5716845015073735e-06,
2416
+ "loss": 0.101,
2417
+ "step": 167500
2418
+ },
2419
+ {
2420
+ "epoch": 7.605251244907198,
2421
+ "grad_norm": 107.24671173095703,
2422
+ "learning_rate": 2.5476050586093643e-06,
2423
+ "loss": 0.1087,
2424
+ "step": 168000
2425
+ },
2426
+ {
2427
+ "epoch": 7.627885921231327,
2428
+ "grad_norm": 0.025944950059056282,
2429
+ "learning_rate": 2.523525615711355e-06,
2430
+ "loss": 0.092,
2431
+ "step": 168500
2432
+ },
2433
+ {
2434
+ "epoch": 7.650520597555455,
2435
+ "grad_norm": 0.029952147975564003,
2436
+ "learning_rate": 2.499446172813346e-06,
2437
+ "loss": 0.0984,
2438
+ "step": 169000
2439
+ },
2440
+ {
2441
+ "epoch": 7.673155273879583,
2442
+ "grad_norm": 22.573421478271484,
2443
+ "learning_rate": 2.475366729915337e-06,
2444
+ "loss": 0.1192,
2445
+ "step": 169500
2446
+ },
2447
+ {
2448
+ "epoch": 7.695789950203712,
2449
+ "grad_norm": 0.019312582910060883,
2450
+ "learning_rate": 2.4512872870173276e-06,
2451
+ "loss": 0.0935,
2452
+ "step": 170000
2453
+ },
2454
+ {
2455
+ "epoch": 7.718424626527841,
2456
+ "grad_norm": 0.07312732934951782,
2457
+ "learning_rate": 2.4272078441193185e-06,
2458
+ "loss": 0.0914,
2459
+ "step": 170500
2460
+ },
2461
+ {
2462
+ "epoch": 7.741059302851969,
2463
+ "grad_norm": 1.7685819864273071,
2464
+ "learning_rate": 2.4031284012213097e-06,
2465
+ "loss": 0.1105,
2466
+ "step": 171000
2467
+ },
2468
+ {
2469
+ "epoch": 7.7636939791760975,
2470
+ "grad_norm": 22.89219856262207,
2471
+ "learning_rate": 2.3790489583233006e-06,
2472
+ "loss": 0.0893,
2473
+ "step": 171500
2474
+ },
2475
+ {
2476
+ "epoch": 7.786328655500227,
2477
+ "grad_norm": 0.03957263007760048,
2478
+ "learning_rate": 2.3549695154252914e-06,
2479
+ "loss": 0.0925,
2480
+ "step": 172000
2481
+ },
2482
+ {
2483
+ "epoch": 7.808963331824355,
2484
+ "grad_norm": 102.64043426513672,
2485
+ "learning_rate": 2.3308900725272822e-06,
2486
+ "loss": 0.1009,
2487
+ "step": 172500
2488
+ },
2489
+ {
2490
+ "epoch": 7.831598008148483,
2491
+ "grad_norm": 0.41924870014190674,
2492
+ "learning_rate": 2.306810629629273e-06,
2493
+ "loss": 0.0946,
2494
+ "step": 173000
2495
+ },
2496
+ {
2497
+ "epoch": 7.854232684472612,
2498
+ "grad_norm": 48.82762145996094,
2499
+ "learning_rate": 2.282731186731264e-06,
2500
+ "loss": 0.1056,
2501
+ "step": 173500
2502
+ },
2503
+ {
2504
+ "epoch": 7.876867360796741,
2505
+ "grad_norm": 4.439919471740723,
2506
+ "learning_rate": 2.2586517438332547e-06,
2507
+ "loss": 0.1042,
2508
+ "step": 174000
2509
+ },
2510
+ {
2511
+ "epoch": 7.899502037120869,
2512
+ "grad_norm": 0.15483979880809784,
2513
+ "learning_rate": 2.2345723009352456e-06,
2514
+ "loss": 0.1082,
2515
+ "step": 174500
2516
+ },
2517
+ {
2518
+ "epoch": 7.922136713444997,
2519
+ "grad_norm": 0.009561842307448387,
2520
+ "learning_rate": 2.210492858037237e-06,
2521
+ "loss": 0.082,
2522
+ "step": 175000
2523
+ },
2524
+ {
2525
+ "epoch": 7.944771389769127,
2526
+ "grad_norm": 0.04719580337405205,
2527
+ "learning_rate": 2.1864134151392277e-06,
2528
+ "loss": 0.0985,
2529
+ "step": 175500
2530
+ },
2531
+ {
2532
+ "epoch": 7.967406066093255,
2533
+ "grad_norm": 0.052117399871349335,
2534
+ "learning_rate": 2.1623339722412185e-06,
2535
+ "loss": 0.099,
2536
+ "step": 176000
2537
+ },
2538
+ {
2539
+ "epoch": 7.990040742417383,
2540
+ "grad_norm": 1.7922203540802002,
2541
+ "learning_rate": 2.1382545293432093e-06,
2542
+ "loss": 0.0947,
2543
+ "step": 176500
2544
+ },
2545
+ {
2546
+ "epoch": 8.0,
2547
+ "eval_accuracy": 0.8305110641440249,
2548
+ "eval_loss": 1.0375256538391113,
2549
+ "eval_runtime": 25.1873,
2550
+ "eval_samples_per_second": 1559.156,
2551
+ "eval_steps_per_second": 97.47,
2552
+ "step": 176720
2553
+ },
2554
+ {
2555
+ "epoch": 8.012675418741512,
2556
+ "grad_norm": 58.80795669555664,
2557
+ "learning_rate": 2.1141750864452e-06,
2558
+ "loss": 0.0821,
2559
+ "step": 177000
2560
+ },
2561
+ {
2562
+ "epoch": 8.03531009506564,
2563
+ "grad_norm": 2.5211691856384277,
2564
+ "learning_rate": 2.090095643547191e-06,
2565
+ "loss": 0.0762,
2566
+ "step": 177500
2567
+ },
2568
+ {
2569
+ "epoch": 8.057944771389769,
2570
+ "grad_norm": 0.0988275557756424,
2571
+ "learning_rate": 2.066016200649182e-06,
2572
+ "loss": 0.0768,
2573
+ "step": 178000
2574
+ },
2575
+ {
2576
+ "epoch": 8.080579447713898,
2577
+ "grad_norm": 63.19043731689453,
2578
+ "learning_rate": 2.0419367577511727e-06,
2579
+ "loss": 0.0705,
2580
+ "step": 178500
2581
+ },
2582
+ {
2583
+ "epoch": 8.103214124038026,
2584
+ "grad_norm": 58.85744094848633,
2585
+ "learning_rate": 2.017857314853164e-06,
2586
+ "loss": 0.0772,
2587
+ "step": 179000
2588
+ },
2589
+ {
2590
+ "epoch": 8.125848800362155,
2591
+ "grad_norm": 9.104388236999512,
2592
+ "learning_rate": 1.9937778719551548e-06,
2593
+ "loss": 0.0782,
2594
+ "step": 179500
2595
+ },
2596
+ {
2597
+ "epoch": 8.148483476686284,
2598
+ "grad_norm": 0.023754891008138657,
2599
+ "learning_rate": 1.9696984290571456e-06,
2600
+ "loss": 0.077,
2601
+ "step": 180000
2602
+ },
2603
+ {
2604
+ "epoch": 8.171118153010411,
2605
+ "grad_norm": 8.145458221435547,
2606
+ "learning_rate": 1.9456189861591364e-06,
2607
+ "loss": 0.0785,
2608
+ "step": 180500
2609
+ },
2610
+ {
2611
+ "epoch": 8.19375282933454,
2612
+ "grad_norm": 0.15183167159557343,
2613
+ "learning_rate": 1.9215395432611273e-06,
2614
+ "loss": 0.0817,
2615
+ "step": 181000
2616
+ },
2617
+ {
2618
+ "epoch": 8.21638750565867,
2619
+ "grad_norm": 10.982128143310547,
2620
+ "learning_rate": 1.8974601003631183e-06,
2621
+ "loss": 0.0791,
2622
+ "step": 181500
2623
+ },
2624
+ {
2625
+ "epoch": 8.239022181982797,
2626
+ "grad_norm": 0.029009494930505753,
2627
+ "learning_rate": 1.873380657465109e-06,
2628
+ "loss": 0.0743,
2629
+ "step": 182000
2630
+ },
2631
+ {
2632
+ "epoch": 8.261656858306926,
2633
+ "grad_norm": 0.13061901926994324,
2634
+ "learning_rate": 1.8493012145671e-06,
2635
+ "loss": 0.0883,
2636
+ "step": 182500
2637
+ },
2638
+ {
2639
+ "epoch": 8.284291534631055,
2640
+ "grad_norm": 0.09501996636390686,
2641
+ "learning_rate": 1.825221771669091e-06,
2642
+ "loss": 0.0819,
2643
+ "step": 183000
2644
+ },
2645
+ {
2646
+ "epoch": 8.306926210955183,
2647
+ "grad_norm": 0.05193400755524635,
2648
+ "learning_rate": 1.8011423287710816e-06,
2649
+ "loss": 0.0848,
2650
+ "step": 183500
2651
+ },
2652
+ {
2653
+ "epoch": 8.329560887279312,
2654
+ "grad_norm": 0.11178288608789444,
2655
+ "learning_rate": 1.7770628858730727e-06,
2656
+ "loss": 0.067,
2657
+ "step": 184000
2658
+ },
2659
+ {
2660
+ "epoch": 8.352195563603441,
2661
+ "grad_norm": 36.04344940185547,
2662
+ "learning_rate": 1.7529834429750633e-06,
2663
+ "loss": 0.0813,
2664
+ "step": 184500
2665
+ },
2666
+ {
2667
+ "epoch": 8.374830239927569,
2668
+ "grad_norm": 2.5800764560699463,
2669
+ "learning_rate": 1.7289040000770544e-06,
2670
+ "loss": 0.073,
2671
+ "step": 185000
2672
+ },
2673
+ {
2674
+ "epoch": 8.397464916251698,
2675
+ "grad_norm": 0.05370645597577095,
2676
+ "learning_rate": 1.7048245571790454e-06,
2677
+ "loss": 0.0708,
2678
+ "step": 185500
2679
+ },
2680
+ {
2681
+ "epoch": 8.420099592575827,
2682
+ "grad_norm": 0.02777365781366825,
2683
+ "learning_rate": 1.680745114281036e-06,
2684
+ "loss": 0.0789,
2685
+ "step": 186000
2686
+ },
2687
+ {
2688
+ "epoch": 8.442734268899954,
2689
+ "grad_norm": 0.1498650312423706,
2690
+ "learning_rate": 1.656665671383027e-06,
2691
+ "loss": 0.0722,
2692
+ "step": 186500
2693
+ },
2694
+ {
2695
+ "epoch": 8.465368945224084,
2696
+ "grad_norm": 0.03529064729809761,
2697
+ "learning_rate": 1.6325862284850181e-06,
2698
+ "loss": 0.0801,
2699
+ "step": 187000
2700
+ },
2701
+ {
2702
+ "epoch": 8.488003621548211,
2703
+ "grad_norm": 0.09018663316965103,
2704
+ "learning_rate": 1.6085067855870087e-06,
2705
+ "loss": 0.0763,
2706
+ "step": 187500
2707
+ },
2708
+ {
2709
+ "epoch": 8.51063829787234,
2710
+ "grad_norm": 0.02561176009476185,
2711
+ "learning_rate": 1.5844273426889998e-06,
2712
+ "loss": 0.0776,
2713
+ "step": 188000
2714
+ },
2715
+ {
2716
+ "epoch": 8.53327297419647,
2717
+ "grad_norm": 47.35366439819336,
2718
+ "learning_rate": 1.5603478997909904e-06,
2719
+ "loss": 0.0729,
2720
+ "step": 188500
2721
+ },
2722
+ {
2723
+ "epoch": 8.555907650520597,
2724
+ "grad_norm": 0.06393767893314362,
2725
+ "learning_rate": 1.5362684568929815e-06,
2726
+ "loss": 0.0685,
2727
+ "step": 189000
2728
+ },
2729
+ {
2730
+ "epoch": 8.578542326844726,
2731
+ "grad_norm": 0.017419660463929176,
2732
+ "learning_rate": 1.5121890139949725e-06,
2733
+ "loss": 0.0838,
2734
+ "step": 189500
2735
+ },
2736
+ {
2737
+ "epoch": 8.601177003168855,
2738
+ "grad_norm": 0.030293475836515427,
2739
+ "learning_rate": 1.4881095710969631e-06,
2740
+ "loss": 0.0849,
2741
+ "step": 190000
2742
+ },
2743
+ {
2744
+ "epoch": 8.623811679492983,
2745
+ "grad_norm": 114.80469512939453,
2746
+ "learning_rate": 1.4640301281989542e-06,
2747
+ "loss": 0.0755,
2748
+ "step": 190500
2749
+ },
2750
+ {
2751
+ "epoch": 8.646446355817112,
2752
+ "grad_norm": 5.5691118240356445,
2753
+ "learning_rate": 1.4399506853009452e-06,
2754
+ "loss": 0.0719,
2755
+ "step": 191000
2756
+ },
2757
+ {
2758
+ "epoch": 8.669081032141241,
2759
+ "grad_norm": 0.02766902558505535,
2760
+ "learning_rate": 1.4158712424029358e-06,
2761
+ "loss": 0.0706,
2762
+ "step": 191500
2763
+ },
2764
+ {
2765
+ "epoch": 8.691715708465368,
2766
+ "grad_norm": 106.3794174194336,
2767
+ "learning_rate": 1.3917917995049269e-06,
2768
+ "loss": 0.0879,
2769
+ "step": 192000
2770
+ },
2771
+ {
2772
+ "epoch": 8.714350384789498,
2773
+ "grad_norm": 0.12758715450763702,
2774
+ "learning_rate": 1.3677123566069175e-06,
2775
+ "loss": 0.0909,
2776
+ "step": 192500
2777
+ },
2778
+ {
2779
+ "epoch": 8.736985061113627,
2780
+ "grad_norm": 0.020692672580480576,
2781
+ "learning_rate": 1.3436329137089086e-06,
2782
+ "loss": 0.0751,
2783
+ "step": 193000
2784
+ },
2785
+ {
2786
+ "epoch": 8.759619737437754,
2787
+ "grad_norm": 0.05015513673424721,
2788
+ "learning_rate": 1.3195534708108996e-06,
2789
+ "loss": 0.0885,
2790
+ "step": 193500
2791
+ },
2792
+ {
2793
+ "epoch": 8.782254413761883,
2794
+ "grad_norm": 0.04662444815039635,
2795
+ "learning_rate": 1.2954740279128902e-06,
2796
+ "loss": 0.0835,
2797
+ "step": 194000
2798
+ },
2799
+ {
2800
+ "epoch": 8.80488909008601,
2801
+ "grad_norm": 0.04807688295841217,
2802
+ "learning_rate": 1.2713945850148813e-06,
2803
+ "loss": 0.0796,
2804
+ "step": 194500
2805
+ },
2806
+ {
2807
+ "epoch": 8.82752376641014,
2808
+ "grad_norm": 2.124725103378296,
2809
+ "learning_rate": 1.247315142116872e-06,
2810
+ "loss": 0.0713,
2811
+ "step": 195000
2812
+ },
2813
+ {
2814
+ "epoch": 8.85015844273427,
2815
+ "grad_norm": 32.81934356689453,
2816
+ "learning_rate": 1.223235699218863e-06,
2817
+ "loss": 0.0762,
2818
+ "step": 195500
2819
+ },
2820
+ {
2821
+ "epoch": 8.872793119058397,
2822
+ "grad_norm": 0.08307647705078125,
2823
+ "learning_rate": 1.199156256320854e-06,
2824
+ "loss": 0.0869,
2825
+ "step": 196000
2826
+ },
2827
+ {
2828
+ "epoch": 8.895427795382526,
2829
+ "grad_norm": 0.09742166846990585,
2830
+ "learning_rate": 1.1750768134228448e-06,
2831
+ "loss": 0.085,
2832
+ "step": 196500
2833
+ },
2834
+ {
2835
+ "epoch": 8.918062471706655,
2836
+ "grad_norm": 0.04141981527209282,
2837
+ "learning_rate": 1.1509973705248357e-06,
2838
+ "loss": 0.0728,
2839
+ "step": 197000
2840
+ },
2841
+ {
2842
+ "epoch": 8.940697148030782,
2843
+ "grad_norm": 6.632791996002197,
2844
+ "learning_rate": 1.1269179276268265e-06,
2845
+ "loss": 0.0843,
2846
+ "step": 197500
2847
+ },
2848
+ {
2849
+ "epoch": 8.963331824354912,
2850
+ "grad_norm": 73.18407440185547,
2851
+ "learning_rate": 1.1028384847288175e-06,
2852
+ "loss": 0.076,
2853
+ "step": 198000
2854
+ },
2855
+ {
2856
+ "epoch": 8.98596650067904,
2857
+ "grad_norm": 0.08090908825397491,
2858
+ "learning_rate": 1.0787590418308084e-06,
2859
+ "loss": 0.0866,
2860
+ "step": 198500
2861
+ },
2862
+ {
2863
+ "epoch": 9.0,
2864
+ "eval_accuracy": 0.8302564233149143,
2865
+ "eval_loss": 1.1189287900924683,
2866
+ "eval_runtime": 25.1471,
2867
+ "eval_samples_per_second": 1561.652,
2868
+ "eval_steps_per_second": 97.626,
2869
+ "step": 198810
2870
+ },
2871
+ {
2872
+ "epoch": 9.008601177003168,
2873
+ "grad_norm": 0.05791415274143219,
2874
+ "learning_rate": 1.0546795989327992e-06,
2875
+ "loss": 0.0801,
2876
+ "step": 199000
2877
+ },
2878
+ {
2879
+ "epoch": 9.031235853327297,
2880
+ "grad_norm": 8.80756950378418,
2881
+ "learning_rate": 1.03060015603479e-06,
2882
+ "loss": 0.0634,
2883
+ "step": 199500
2884
+ },
2885
+ {
2886
+ "epoch": 9.053870529651427,
2887
+ "grad_norm": 2.5306966304779053,
2888
+ "learning_rate": 1.006520713136781e-06,
2889
+ "loss": 0.0585,
2890
+ "step": 200000
2891
+ },
2892
+ {
2893
+ "epoch": 9.076505205975554,
2894
+ "grad_norm": 0.22930902242660522,
2895
+ "learning_rate": 9.82441270238772e-07,
2896
+ "loss": 0.0678,
2897
+ "step": 200500
2898
+ },
2899
+ {
2900
+ "epoch": 9.099139882299683,
2901
+ "grad_norm": 36.6495246887207,
2902
+ "learning_rate": 9.583618273407628e-07,
2903
+ "loss": 0.067,
2904
+ "step": 201000
2905
+ },
2906
+ {
2907
+ "epoch": 9.121774558623812,
2908
+ "grad_norm": 5.1042327880859375,
2909
+ "learning_rate": 9.342823844427536e-07,
2910
+ "loss": 0.0527,
2911
+ "step": 201500
2912
+ },
2913
+ {
2914
+ "epoch": 9.14440923494794,
2915
+ "grad_norm": 5.1377339363098145,
2916
+ "learning_rate": 9.102029415447445e-07,
2917
+ "loss": 0.0605,
2918
+ "step": 202000
2919
+ },
2920
+ {
2921
+ "epoch": 9.167043911272069,
2922
+ "grad_norm": 0.8074043393135071,
2923
+ "learning_rate": 8.861234986467354e-07,
2924
+ "loss": 0.0644,
2925
+ "step": 202500
2926
+ },
2927
+ {
2928
+ "epoch": 9.189678587596198,
2929
+ "grad_norm": 26.022066116333008,
2930
+ "learning_rate": 8.620440557487263e-07,
2931
+ "loss": 0.0715,
2932
+ "step": 203000
2933
+ },
2934
+ {
2935
+ "epoch": 9.212313263920326,
2936
+ "grad_norm": 24.25147819519043,
2937
+ "learning_rate": 8.379646128507171e-07,
2938
+ "loss": 0.0671,
2939
+ "step": 203500
2940
+ },
2941
+ {
2942
+ "epoch": 9.234947940244455,
2943
+ "grad_norm": 0.5930929780006409,
2944
+ "learning_rate": 8.138851699527081e-07,
2945
+ "loss": 0.0664,
2946
+ "step": 204000
2947
+ },
2948
+ {
2949
+ "epoch": 9.257582616568584,
2950
+ "grad_norm": 5.489414215087891,
2951
+ "learning_rate": 7.898057270546989e-07,
2952
+ "loss": 0.0595,
2953
+ "step": 204500
2954
+ },
2955
+ {
2956
+ "epoch": 9.280217292892711,
2957
+ "grad_norm": 47.08729934692383,
2958
+ "learning_rate": 7.657262841566899e-07,
2959
+ "loss": 0.0645,
2960
+ "step": 205000
2961
+ },
2962
+ {
2963
+ "epoch": 9.30285196921684,
2964
+ "grad_norm": 0.41389790177345276,
2965
+ "learning_rate": 7.416468412586807e-07,
2966
+ "loss": 0.0643,
2967
+ "step": 205500
2968
+ },
2969
+ {
2970
+ "epoch": 9.325486645540968,
2971
+ "grad_norm": 0.016123546287417412,
2972
+ "learning_rate": 7.175673983606715e-07,
2973
+ "loss": 0.056,
2974
+ "step": 206000
2975
+ },
2976
+ {
2977
+ "epoch": 9.348121321865097,
2978
+ "grad_norm": 1.958616018295288,
2979
+ "learning_rate": 6.934879554626625e-07,
2980
+ "loss": 0.0646,
2981
+ "step": 206500
2982
+ },
2983
+ {
2984
+ "epoch": 9.370755998189226,
2985
+ "grad_norm": 0.15595561265945435,
2986
+ "learning_rate": 6.694085125646534e-07,
2987
+ "loss": 0.0521,
2988
+ "step": 207000
2989
+ },
2990
+ {
2991
+ "epoch": 9.393390674513354,
2992
+ "grad_norm": 0.10638213902711868,
2993
+ "learning_rate": 6.453290696666442e-07,
2994
+ "loss": 0.0629,
2995
+ "step": 207500
2996
+ },
2997
+ {
2998
+ "epoch": 9.416025350837483,
2999
+ "grad_norm": 23.703229904174805,
3000
+ "learning_rate": 6.212496267686352e-07,
3001
+ "loss": 0.066,
3002
+ "step": 208000
3003
+ },
3004
+ {
3005
+ "epoch": 9.438660027161612,
3006
+ "grad_norm": 0.07439934462308884,
3007
+ "learning_rate": 5.97170183870626e-07,
3008
+ "loss": 0.0575,
3009
+ "step": 208500
3010
+ },
3011
+ {
3012
+ "epoch": 9.46129470348574,
3013
+ "grad_norm": 0.5851670503616333,
3014
+ "learning_rate": 5.73090740972617e-07,
3015
+ "loss": 0.0545,
3016
+ "step": 209000
3017
+ },
3018
+ {
3019
+ "epoch": 9.483929379809869,
3020
+ "grad_norm": 8.999300956726074,
3021
+ "learning_rate": 5.490112980746078e-07,
3022
+ "loss": 0.0681,
3023
+ "step": 209500
3024
+ },
3025
+ {
3026
+ "epoch": 9.506564056133998,
3027
+ "grad_norm": 0.02180316112935543,
3028
+ "learning_rate": 5.249318551765987e-07,
3029
+ "loss": 0.0604,
3030
+ "step": 210000
3031
+ },
3032
+ {
3033
+ "epoch": 9.529198732458125,
3034
+ "grad_norm": 0.061831023544073105,
3035
+ "learning_rate": 5.008524122785896e-07,
3036
+ "loss": 0.0602,
3037
+ "step": 210500
3038
+ },
3039
+ {
3040
+ "epoch": 9.551833408782255,
3041
+ "grad_norm": 0.13079918920993805,
3042
+ "learning_rate": 4.7677296938058045e-07,
3043
+ "loss": 0.0548,
3044
+ "step": 211000
3045
+ },
3046
+ {
3047
+ "epoch": 9.574468085106384,
3048
+ "grad_norm": 26.961414337158203,
3049
+ "learning_rate": 4.526935264825713e-07,
3050
+ "loss": 0.057,
3051
+ "step": 211500
3052
+ },
3053
+ {
3054
+ "epoch": 9.597102761430511,
3055
+ "grad_norm": 0.019862385466694832,
3056
+ "learning_rate": 4.286140835845622e-07,
3057
+ "loss": 0.0577,
3058
+ "step": 212000
3059
+ },
3060
+ {
3061
+ "epoch": 9.61973743775464,
3062
+ "grad_norm": 0.6857370138168335,
3063
+ "learning_rate": 4.0453464068655306e-07,
3064
+ "loss": 0.0744,
3065
+ "step": 212500
3066
+ },
3067
+ {
3068
+ "epoch": 9.64237211407877,
3069
+ "grad_norm": 0.03343261405825615,
3070
+ "learning_rate": 3.80455197788544e-07,
3071
+ "loss": 0.067,
3072
+ "step": 213000
3073
+ },
3074
+ {
3075
+ "epoch": 9.665006790402897,
3076
+ "grad_norm": 21.0791072845459,
3077
+ "learning_rate": 3.5637575489053483e-07,
3078
+ "loss": 0.054,
3079
+ "step": 213500
3080
+ },
3081
+ {
3082
+ "epoch": 9.687641466727026,
3083
+ "grad_norm": 0.22458316385746002,
3084
+ "learning_rate": 3.322963119925258e-07,
3085
+ "loss": 0.0701,
3086
+ "step": 214000
3087
+ },
3088
+ {
3089
+ "epoch": 9.710276143051153,
3090
+ "grad_norm": 0.008931541815400124,
3091
+ "learning_rate": 3.0821686909451666e-07,
3092
+ "loss": 0.0619,
3093
+ "step": 214500
3094
+ },
3095
+ {
3096
+ "epoch": 9.732910819375283,
3097
+ "grad_norm": 0.052219439297914505,
3098
+ "learning_rate": 2.8413742619650755e-07,
3099
+ "loss": 0.0531,
3100
+ "step": 215000
3101
+ },
3102
+ {
3103
+ "epoch": 9.755545495699412,
3104
+ "grad_norm": 0.5297519564628601,
3105
+ "learning_rate": 2.6005798329849844e-07,
3106
+ "loss": 0.0684,
3107
+ "step": 215500
3108
+ },
3109
+ {
3110
+ "epoch": 9.77818017202354,
3111
+ "grad_norm": 0.8211806416511536,
3112
+ "learning_rate": 2.3597854040048932e-07,
3113
+ "loss": 0.066,
3114
+ "step": 216000
3115
+ },
3116
+ {
3117
+ "epoch": 9.800814848347668,
3118
+ "grad_norm": 52.57466125488281,
3119
+ "learning_rate": 2.118990975024802e-07,
3120
+ "loss": 0.0697,
3121
+ "step": 216500
3122
+ },
3123
+ {
3124
+ "epoch": 9.823449524671798,
3125
+ "grad_norm": 8.066961288452148,
3126
+ "learning_rate": 1.878196546044711e-07,
3127
+ "loss": 0.0592,
3128
+ "step": 217000
3129
+ },
3130
+ {
3131
+ "epoch": 9.846084200995925,
3132
+ "grad_norm": 1.6638778448104858,
3133
+ "learning_rate": 1.6374021170646199e-07,
3134
+ "loss": 0.0608,
3135
+ "step": 217500
3136
+ },
3137
+ {
3138
+ "epoch": 9.868718877320054,
3139
+ "grad_norm": 0.019856590777635574,
3140
+ "learning_rate": 1.3966076880845285e-07,
3141
+ "loss": 0.0746,
3142
+ "step": 218000
3143
+ },
3144
+ {
3145
+ "epoch": 9.891353553644183,
3146
+ "grad_norm": 0.0204610638320446,
3147
+ "learning_rate": 1.1558132591044375e-07,
3148
+ "loss": 0.0709,
3149
+ "step": 218500
3150
+ },
3151
+ {
3152
+ "epoch": 9.91398822996831,
3153
+ "grad_norm": 0.011149962432682514,
3154
+ "learning_rate": 9.150188301243464e-08,
3155
+ "loss": 0.0546,
3156
+ "step": 219000
3157
+ },
3158
+ {
3159
+ "epoch": 9.93662290629244,
3160
+ "grad_norm": 0.0041813417337834835,
3161
+ "learning_rate": 6.742244011442552e-08,
3162
+ "loss": 0.0659,
3163
+ "step": 219500
3164
+ },
3165
+ {
3166
+ "epoch": 9.95925758261657,
3167
+ "grad_norm": 0.022371483966708183,
3168
+ "learning_rate": 4.3342997216416404e-08,
3169
+ "loss": 0.0738,
3170
+ "step": 220000
3171
+ },
3172
+ {
3173
+ "epoch": 9.981892258940697,
3174
+ "grad_norm": 4.121944904327393,
3175
+ "learning_rate": 1.926355431840729e-08,
3176
+ "loss": 0.0548,
3177
+ "step": 220500
3178
+ },
3179
+ {
3180
+ "epoch": 10.0,
3181
+ "eval_accuracy": 0.8308930253876906,
3182
+ "eval_loss": 1.1927790641784668,
3183
+ "eval_runtime": 25.1954,
3184
+ "eval_samples_per_second": 1558.657,
3185
+ "eval_steps_per_second": 97.438,
3186
+ "step": 220900
3187
+ },
3188
+ {
3189
+ "epoch": 10.0,
3190
+ "step": 220900,
3191
+ "total_flos": 1.4402219214422795e+17,
3192
+ "train_loss": 0.2300453716190005,
3193
+ "train_runtime": 11223.2851,
3194
+ "train_samples_per_second": 314.909,
3195
+ "train_steps_per_second": 19.682
3196
+ }
3197
+ ],
3198
+ "logging_steps": 500,
3199
+ "max_steps": 220900,
3200
+ "num_input_tokens_seen": 0,
3201
+ "num_train_epochs": 10,
3202
+ "save_steps": 500,
3203
+ "stateful_callbacks": {
3204
+ "TrainerControl": {
3205
+ "args": {
3206
+ "should_epoch_stop": false,
3207
+ "should_evaluate": false,
3208
+ "should_log": false,
3209
+ "should_save": true,
3210
+ "should_training_stop": true
3211
+ },
3212
+ "attributes": {}
3213
+ }
3214
+ },
3215
+ "total_flos": 1.4402219214422795e+17,
3216
+ "train_batch_size": 16,
3217
+ "trial_name": null,
3218
+ "trial_params": null
3219
+ }
mnli/bert-base-uncased_lr1e-05/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5bff93bf75e45b54a38a2da67a2b5e470b1652b7fc4411d1e6c8918854be1ff
3
+ size 5240
mnli/bert-base-uncased_lr1e-05/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
mnli/roberta-base_lr1e-05/classifier_head.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eba0f417795a7bd2b56d26638e00725c956a9dbbc1d9bc00dfd3243f0536d85b
3
+ size 2374992
mnli/roberta-base_lr1e-05/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/shikexuan/nlu_model/roberta-base",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "LABEL_0",
15
+ "1": "LABEL_1",
16
+ "2": "LABEL_2"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "label2id": {
21
+ "LABEL_0": 0,
22
+ "LABEL_1": 1,
23
+ "LABEL_2": 2
24
+ },
25
+ "layer_norm_eps": 1e-05,
26
+ "max_position_embeddings": 514,
27
+ "model_type": "roberta",
28
+ "num_attention_heads": 12,
29
+ "num_hidden_layers": 12,
30
+ "pad_token_id": 1,
31
+ "position_embedding_type": "absolute",
32
+ "torch_dtype": "float32",
33
+ "transformers_version": "4.45.2",
34
+ "type_vocab_size": 1,
35
+ "use_cache": true,
36
+ "vocab_size": 50265
37
+ }
mnli/roberta-base_lr1e-05/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
mnli/roberta-base_lr1e-05/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d39f40fc76cca73abb83fe0ca29467979c6eeca8c092759b49c610e98e2a9470
3
+ size 498615900
mnli/roberta-base_lr1e-05/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
mnli/roberta-base_lr1e-05/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
mnli/roberta-base_lr1e-05/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
+ "model_max_length": 512,
52
+ "pad_token": "<pad>",
53
+ "sep_token": "</s>",
54
+ "tokenizer_class": "RobertaTokenizer",
55
+ "trim_offsets": true,
56
+ "unk_token": "<unk>"
57
+ }
mnli/roberta-base_lr1e-05/trainer_state.json ADDED
@@ -0,0 +1,3219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8667464541264547,
3
+ "best_model_checkpoint": "./nlu_finetuned_models/mnli/roberta-base_lr1e-05/checkpoint-110450",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 220900,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.022634676324128564,
13
+ "grad_norm": 1.6427655220031738,
14
+ "learning_rate": 3.772446054021428e-07,
15
+ "loss": 1.0998,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.04526935264825713,
20
+ "grad_norm": 3.6388282775878906,
21
+ "learning_rate": 7.544892108042856e-07,
22
+ "loss": 1.0989,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.06790402897238569,
27
+ "grad_norm": 5.199594974517822,
28
+ "learning_rate": 1.1317338162064282e-06,
29
+ "loss": 1.0951,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.09053870529651425,
34
+ "grad_norm": 16.297880172729492,
35
+ "learning_rate": 1.5089784216085712e-06,
36
+ "loss": 0.9832,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.11317338162064282,
41
+ "grad_norm": 30.214738845825195,
42
+ "learning_rate": 1.886223027010714e-06,
43
+ "loss": 0.7491,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.13580805794477138,
48
+ "grad_norm": 27.916301727294922,
49
+ "learning_rate": 2.2634676324128565e-06,
50
+ "loss": 0.6467,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.15844273426889996,
55
+ "grad_norm": 42.154232025146484,
56
+ "learning_rate": 2.6407122378149996e-06,
57
+ "loss": 0.6092,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.1810774105930285,
62
+ "grad_norm": 29.020992279052734,
63
+ "learning_rate": 3.0179568432171424e-06,
64
+ "loss": 0.5782,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.2037120869171571,
69
+ "grad_norm": 31.89137840270996,
70
+ "learning_rate": 3.395201448619285e-06,
71
+ "loss": 0.5567,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.22634676324128564,
76
+ "grad_norm": 46.158416748046875,
77
+ "learning_rate": 3.772446054021428e-06,
78
+ "loss": 0.5382,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.24898143956541421,
83
+ "grad_norm": 16.78737449645996,
84
+ "learning_rate": 4.149690659423571e-06,
85
+ "loss": 0.5278,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 0.27161611588954276,
90
+ "grad_norm": 13.616703033447266,
91
+ "learning_rate": 4.526935264825713e-06,
92
+ "loss": 0.5233,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 0.29425079221367134,
97
+ "grad_norm": 21.825986862182617,
98
+ "learning_rate": 4.904179870227856e-06,
99
+ "loss": 0.5042,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 0.3168854685377999,
104
+ "grad_norm": 24.68229103088379,
105
+ "learning_rate": 5.281424475629999e-06,
106
+ "loss": 0.4967,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 0.3395201448619285,
111
+ "grad_norm": 17.55514907836914,
112
+ "learning_rate": 5.658669081032142e-06,
113
+ "loss": 0.4979,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 0.362154821186057,
118
+ "grad_norm": 20.74059295654297,
119
+ "learning_rate": 6.035913686434285e-06,
120
+ "loss": 0.4983,
121
+ "step": 8000
122
+ },
123
+ {
124
+ "epoch": 0.3847894975101856,
125
+ "grad_norm": 12.430363655090332,
126
+ "learning_rate": 6.4131582918364275e-06,
127
+ "loss": 0.4773,
128
+ "step": 8500
129
+ },
130
+ {
131
+ "epoch": 0.4074241738343142,
132
+ "grad_norm": 30.890859603881836,
133
+ "learning_rate": 6.79040289723857e-06,
134
+ "loss": 0.4802,
135
+ "step": 9000
136
+ },
137
+ {
138
+ "epoch": 0.43005885015844275,
139
+ "grad_norm": 27.11601448059082,
140
+ "learning_rate": 7.167647502640713e-06,
141
+ "loss": 0.4631,
142
+ "step": 9500
143
+ },
144
+ {
145
+ "epoch": 0.4526935264825713,
146
+ "grad_norm": 22.201452255249023,
147
+ "learning_rate": 7.544892108042856e-06,
148
+ "loss": 0.4575,
149
+ "step": 10000
150
+ },
151
+ {
152
+ "epoch": 0.47532820280669985,
153
+ "grad_norm": 22.29547119140625,
154
+ "learning_rate": 7.922136713445e-06,
155
+ "loss": 0.4643,
156
+ "step": 10500
157
+ },
158
+ {
159
+ "epoch": 0.49796287913082843,
160
+ "grad_norm": 15.072667121887207,
161
+ "learning_rate": 8.299381318847142e-06,
162
+ "loss": 0.466,
163
+ "step": 11000
164
+ },
165
+ {
166
+ "epoch": 0.520597555454957,
167
+ "grad_norm": 28.1677188873291,
168
+ "learning_rate": 8.676625924249283e-06,
169
+ "loss": 0.4619,
170
+ "step": 11500
171
+ },
172
+ {
173
+ "epoch": 0.5432322317790855,
174
+ "grad_norm": 17.43030548095703,
175
+ "learning_rate": 9.053870529651426e-06,
176
+ "loss": 0.4542,
177
+ "step": 12000
178
+ },
179
+ {
180
+ "epoch": 0.5658669081032142,
181
+ "grad_norm": 23.5450382232666,
182
+ "learning_rate": 9.431115135053569e-06,
183
+ "loss": 0.449,
184
+ "step": 12500
185
+ },
186
+ {
187
+ "epoch": 0.5885015844273427,
188
+ "grad_norm": 21.659141540527344,
189
+ "learning_rate": 9.808359740455711e-06,
190
+ "loss": 0.4434,
191
+ "step": 13000
192
+ },
193
+ {
194
+ "epoch": 0.6111362607514712,
195
+ "grad_norm": 19.54996681213379,
196
+ "learning_rate": 9.98815291409418e-06,
197
+ "loss": 0.4555,
198
+ "step": 13500
199
+ },
200
+ {
201
+ "epoch": 0.6337709370755998,
202
+ "grad_norm": 11.85847282409668,
203
+ "learning_rate": 9.964073471196171e-06,
204
+ "loss": 0.4511,
205
+ "step": 14000
206
+ },
207
+ {
208
+ "epoch": 0.6564056133997284,
209
+ "grad_norm": 26.537872314453125,
210
+ "learning_rate": 9.939994028298163e-06,
211
+ "loss": 0.4424,
212
+ "step": 14500
213
+ },
214
+ {
215
+ "epoch": 0.679040289723857,
216
+ "grad_norm": 16.38085174560547,
217
+ "learning_rate": 9.915914585400153e-06,
218
+ "loss": 0.4488,
219
+ "step": 15000
220
+ },
221
+ {
222
+ "epoch": 0.7016749660479855,
223
+ "grad_norm": 20.928190231323242,
224
+ "learning_rate": 9.891835142502145e-06,
225
+ "loss": 0.4368,
226
+ "step": 15500
227
+ },
228
+ {
229
+ "epoch": 0.724309642372114,
230
+ "grad_norm": 16.260562896728516,
231
+ "learning_rate": 9.867755699604135e-06,
232
+ "loss": 0.4481,
233
+ "step": 16000
234
+ },
235
+ {
236
+ "epoch": 0.7469443186962427,
237
+ "grad_norm": 9.784706115722656,
238
+ "learning_rate": 9.843676256706126e-06,
239
+ "loss": 0.4344,
240
+ "step": 16500
241
+ },
242
+ {
243
+ "epoch": 0.7695789950203712,
244
+ "grad_norm": 6.384799480438232,
245
+ "learning_rate": 9.819596813808116e-06,
246
+ "loss": 0.4416,
247
+ "step": 17000
248
+ },
249
+ {
250
+ "epoch": 0.7922136713444998,
251
+ "grad_norm": 14.344380378723145,
252
+ "learning_rate": 9.795517370910108e-06,
253
+ "loss": 0.429,
254
+ "step": 17500
255
+ },
256
+ {
257
+ "epoch": 0.8148483476686283,
258
+ "grad_norm": 33.43516159057617,
259
+ "learning_rate": 9.771437928012098e-06,
260
+ "loss": 0.4271,
261
+ "step": 18000
262
+ },
263
+ {
264
+ "epoch": 0.8374830239927569,
265
+ "grad_norm": 23.591716766357422,
266
+ "learning_rate": 9.74735848511409e-06,
267
+ "loss": 0.44,
268
+ "step": 18500
269
+ },
270
+ {
271
+ "epoch": 0.8601177003168855,
272
+ "grad_norm": 20.75270652770996,
273
+ "learning_rate": 9.723279042216081e-06,
274
+ "loss": 0.423,
275
+ "step": 19000
276
+ },
277
+ {
278
+ "epoch": 0.882752376641014,
279
+ "grad_norm": 24.833736419677734,
280
+ "learning_rate": 9.699199599318071e-06,
281
+ "loss": 0.4212,
282
+ "step": 19500
283
+ },
284
+ {
285
+ "epoch": 0.9053870529651425,
286
+ "grad_norm": 18.03992462158203,
287
+ "learning_rate": 9.675120156420061e-06,
288
+ "loss": 0.4164,
289
+ "step": 20000
290
+ },
291
+ {
292
+ "epoch": 0.9280217292892712,
293
+ "grad_norm": 12.502860069274902,
294
+ "learning_rate": 9.651040713522053e-06,
295
+ "loss": 0.4213,
296
+ "step": 20500
297
+ },
298
+ {
299
+ "epoch": 0.9506564056133997,
300
+ "grad_norm": 13.808119773864746,
301
+ "learning_rate": 9.626961270624043e-06,
302
+ "loss": 0.4197,
303
+ "step": 21000
304
+ },
305
+ {
306
+ "epoch": 0.9732910819375283,
307
+ "grad_norm": 15.191283226013184,
308
+ "learning_rate": 9.602881827726035e-06,
309
+ "loss": 0.4219,
310
+ "step": 21500
311
+ },
312
+ {
313
+ "epoch": 0.9959257582616569,
314
+ "grad_norm": 27.065793991088867,
315
+ "learning_rate": 9.578802384828026e-06,
316
+ "loss": 0.3965,
317
+ "step": 22000
318
+ },
319
+ {
320
+ "epoch": 1.0,
321
+ "eval_accuracy": 0.8536579155101729,
322
+ "eval_loss": 0.4072297513484955,
323
+ "eval_runtime": 25.9705,
324
+ "eval_samples_per_second": 1512.137,
325
+ "eval_steps_per_second": 94.53,
326
+ "step": 22090
327
+ },
328
+ {
329
+ "epoch": 1.0185604345857855,
330
+ "grad_norm": 40.00636672973633,
331
+ "learning_rate": 9.554722941930016e-06,
332
+ "loss": 0.371,
333
+ "step": 22500
334
+ },
335
+ {
336
+ "epoch": 1.041195110909914,
337
+ "grad_norm": 14.272022247314453,
338
+ "learning_rate": 9.530643499032008e-06,
339
+ "loss": 0.357,
340
+ "step": 23000
341
+ },
342
+ {
343
+ "epoch": 1.0638297872340425,
344
+ "grad_norm": 14.10964298248291,
345
+ "learning_rate": 9.506564056133998e-06,
346
+ "loss": 0.3584,
347
+ "step": 23500
348
+ },
349
+ {
350
+ "epoch": 1.086464463558171,
351
+ "grad_norm": 49.96983337402344,
352
+ "learning_rate": 9.48248461323599e-06,
353
+ "loss": 0.3567,
354
+ "step": 24000
355
+ },
356
+ {
357
+ "epoch": 1.1090991398822996,
358
+ "grad_norm": 15.82590389251709,
359
+ "learning_rate": 9.45840517033798e-06,
360
+ "loss": 0.3682,
361
+ "step": 24500
362
+ },
363
+ {
364
+ "epoch": 1.1317338162064283,
365
+ "grad_norm": 17.939584732055664,
366
+ "learning_rate": 9.43432572743997e-06,
367
+ "loss": 0.3589,
368
+ "step": 25000
369
+ },
370
+ {
371
+ "epoch": 1.1543684925305568,
372
+ "grad_norm": 9.916324615478516,
373
+ "learning_rate": 9.410246284541961e-06,
374
+ "loss": 0.3654,
375
+ "step": 25500
376
+ },
377
+ {
378
+ "epoch": 1.1770031688546854,
379
+ "grad_norm": 13.99868392944336,
380
+ "learning_rate": 9.386166841643953e-06,
381
+ "loss": 0.3609,
382
+ "step": 26000
383
+ },
384
+ {
385
+ "epoch": 1.1996378451788139,
386
+ "grad_norm": 19.407920837402344,
387
+ "learning_rate": 9.362087398745945e-06,
388
+ "loss": 0.3615,
389
+ "step": 26500
390
+ },
391
+ {
392
+ "epoch": 1.2222725215029424,
393
+ "grad_norm": 22.240596771240234,
394
+ "learning_rate": 9.338007955847935e-06,
395
+ "loss": 0.3628,
396
+ "step": 27000
397
+ },
398
+ {
399
+ "epoch": 1.2449071978270712,
400
+ "grad_norm": 27.17753028869629,
401
+ "learning_rate": 9.313928512949925e-06,
402
+ "loss": 0.3629,
403
+ "step": 27500
404
+ },
405
+ {
406
+ "epoch": 1.2675418741511997,
407
+ "grad_norm": 16.045578002929688,
408
+ "learning_rate": 9.289849070051916e-06,
409
+ "loss": 0.3735,
410
+ "step": 28000
411
+ },
412
+ {
413
+ "epoch": 1.2901765504753282,
414
+ "grad_norm": 16.507614135742188,
415
+ "learning_rate": 9.265769627153906e-06,
416
+ "loss": 0.3646,
417
+ "step": 28500
418
+ },
419
+ {
420
+ "epoch": 1.3128112267994567,
421
+ "grad_norm": 16.56389808654785,
422
+ "learning_rate": 9.241690184255898e-06,
423
+ "loss": 0.3459,
424
+ "step": 29000
425
+ },
426
+ {
427
+ "epoch": 1.3354459031235852,
428
+ "grad_norm": 21.800291061401367,
429
+ "learning_rate": 9.217610741357888e-06,
430
+ "loss": 0.3552,
431
+ "step": 29500
432
+ },
433
+ {
434
+ "epoch": 1.358080579447714,
435
+ "grad_norm": 19.036035537719727,
436
+ "learning_rate": 9.19353129845988e-06,
437
+ "loss": 0.3569,
438
+ "step": 30000
439
+ },
440
+ {
441
+ "epoch": 1.3807152557718425,
442
+ "grad_norm": 20.2823543548584,
443
+ "learning_rate": 9.169451855561871e-06,
444
+ "loss": 0.3635,
445
+ "step": 30500
446
+ },
447
+ {
448
+ "epoch": 1.403349932095971,
449
+ "grad_norm": 9.948273658752441,
450
+ "learning_rate": 9.145372412663861e-06,
451
+ "loss": 0.3626,
452
+ "step": 31000
453
+ },
454
+ {
455
+ "epoch": 1.4259846084200996,
456
+ "grad_norm": 24.87046241760254,
457
+ "learning_rate": 9.121292969765853e-06,
458
+ "loss": 0.3625,
459
+ "step": 31500
460
+ },
461
+ {
462
+ "epoch": 1.448619284744228,
463
+ "grad_norm": 7.488998889923096,
464
+ "learning_rate": 9.097213526867843e-06,
465
+ "loss": 0.3598,
466
+ "step": 32000
467
+ },
468
+ {
469
+ "epoch": 1.4712539610683568,
470
+ "grad_norm": 22.464574813842773,
471
+ "learning_rate": 9.073134083969835e-06,
472
+ "loss": 0.354,
473
+ "step": 32500
474
+ },
475
+ {
476
+ "epoch": 1.4938886373924853,
477
+ "grad_norm": 18.735248565673828,
478
+ "learning_rate": 9.049054641071825e-06,
479
+ "loss": 0.3559,
480
+ "step": 33000
481
+ },
482
+ {
483
+ "epoch": 1.5165233137166139,
484
+ "grad_norm": 17.83134651184082,
485
+ "learning_rate": 9.024975198173815e-06,
486
+ "loss": 0.3458,
487
+ "step": 33500
488
+ },
489
+ {
490
+ "epoch": 1.5391579900407424,
491
+ "grad_norm": 22.070232391357422,
492
+ "learning_rate": 9.000895755275806e-06,
493
+ "loss": 0.3665,
494
+ "step": 34000
495
+ },
496
+ {
497
+ "epoch": 1.561792666364871,
498
+ "grad_norm": 6.331955432891846,
499
+ "learning_rate": 8.976816312377798e-06,
500
+ "loss": 0.3607,
501
+ "step": 34500
502
+ },
503
+ {
504
+ "epoch": 1.5844273426889997,
505
+ "grad_norm": 9.2369966506958,
506
+ "learning_rate": 8.95273686947979e-06,
507
+ "loss": 0.3549,
508
+ "step": 35000
509
+ },
510
+ {
511
+ "epoch": 1.607062019013128,
512
+ "grad_norm": 14.87072467803955,
513
+ "learning_rate": 8.92865742658178e-06,
514
+ "loss": 0.3577,
515
+ "step": 35500
516
+ },
517
+ {
518
+ "epoch": 1.6296966953372567,
519
+ "grad_norm": 21.532787322998047,
520
+ "learning_rate": 8.90457798368377e-06,
521
+ "loss": 0.353,
522
+ "step": 36000
523
+ },
524
+ {
525
+ "epoch": 1.6523313716613852,
526
+ "grad_norm": 21.97151756286621,
527
+ "learning_rate": 8.880498540785761e-06,
528
+ "loss": 0.3706,
529
+ "step": 36500
530
+ },
531
+ {
532
+ "epoch": 1.6749660479855137,
533
+ "grad_norm": 17.71976661682129,
534
+ "learning_rate": 8.856419097887751e-06,
535
+ "loss": 0.3648,
536
+ "step": 37000
537
+ },
538
+ {
539
+ "epoch": 1.6976007243096425,
540
+ "grad_norm": 21.98705291748047,
541
+ "learning_rate": 8.832339654989743e-06,
542
+ "loss": 0.3591,
543
+ "step": 37500
544
+ },
545
+ {
546
+ "epoch": 1.7202354006337708,
547
+ "grad_norm": 26.871360778808594,
548
+ "learning_rate": 8.808260212091733e-06,
549
+ "loss": 0.3454,
550
+ "step": 38000
551
+ },
552
+ {
553
+ "epoch": 1.7428700769578995,
554
+ "grad_norm": 12.331396102905273,
555
+ "learning_rate": 8.784180769193725e-06,
556
+ "loss": 0.3418,
557
+ "step": 38500
558
+ },
559
+ {
560
+ "epoch": 1.765504753282028,
561
+ "grad_norm": 7.1670756340026855,
562
+ "learning_rate": 8.760101326295716e-06,
563
+ "loss": 0.3454,
564
+ "step": 39000
565
+ },
566
+ {
567
+ "epoch": 1.7881394296061566,
568
+ "grad_norm": 18.57856559753418,
569
+ "learning_rate": 8.736021883397706e-06,
570
+ "loss": 0.345,
571
+ "step": 39500
572
+ },
573
+ {
574
+ "epoch": 1.8107741059302853,
575
+ "grad_norm": 36.80974578857422,
576
+ "learning_rate": 8.711942440499698e-06,
577
+ "loss": 0.3475,
578
+ "step": 40000
579
+ },
580
+ {
581
+ "epoch": 1.8334087822544136,
582
+ "grad_norm": 19.742300033569336,
583
+ "learning_rate": 8.687862997601688e-06,
584
+ "loss": 0.3499,
585
+ "step": 40500
586
+ },
587
+ {
588
+ "epoch": 1.8560434585785424,
589
+ "grad_norm": 13.53554630279541,
590
+ "learning_rate": 8.663783554703678e-06,
591
+ "loss": 0.3625,
592
+ "step": 41000
593
+ },
594
+ {
595
+ "epoch": 1.8786781349026709,
596
+ "grad_norm": 12.660998344421387,
597
+ "learning_rate": 8.63970411180567e-06,
598
+ "loss": 0.355,
599
+ "step": 41500
600
+ },
601
+ {
602
+ "epoch": 1.9013128112267994,
603
+ "grad_norm": 10.94740104675293,
604
+ "learning_rate": 8.615624668907661e-06,
605
+ "loss": 0.3495,
606
+ "step": 42000
607
+ },
608
+ {
609
+ "epoch": 1.9239474875509281,
610
+ "grad_norm": 39.87784957885742,
611
+ "learning_rate": 8.591545226009653e-06,
612
+ "loss": 0.3484,
613
+ "step": 42500
614
+ },
615
+ {
616
+ "epoch": 1.9465821638750564,
617
+ "grad_norm": 15.916511535644531,
618
+ "learning_rate": 8.567465783111643e-06,
619
+ "loss": 0.3511,
620
+ "step": 43000
621
+ },
622
+ {
623
+ "epoch": 1.9692168401991852,
624
+ "grad_norm": 19.070566177368164,
625
+ "learning_rate": 8.543386340213633e-06,
626
+ "loss": 0.334,
627
+ "step": 43500
628
+ },
629
+ {
630
+ "epoch": 1.9918515165233137,
631
+ "grad_norm": 21.015113830566406,
632
+ "learning_rate": 8.519306897315625e-06,
633
+ "loss": 0.3463,
634
+ "step": 44000
635
+ },
636
+ {
637
+ "epoch": 2.0,
638
+ "eval_accuracy": 0.8632324106847292,
639
+ "eval_loss": 0.37872129678726196,
640
+ "eval_runtime": 26.0123,
641
+ "eval_samples_per_second": 1509.711,
642
+ "eval_steps_per_second": 94.379,
643
+ "step": 44180
644
+ },
645
+ {
646
+ "epoch": 2.0144861928474422,
647
+ "grad_norm": 12.006568908691406,
648
+ "learning_rate": 8.495227454417615e-06,
649
+ "loss": 0.2925,
650
+ "step": 44500
651
+ },
652
+ {
653
+ "epoch": 2.037120869171571,
654
+ "grad_norm": 16.82537078857422,
655
+ "learning_rate": 8.471148011519606e-06,
656
+ "loss": 0.2821,
657
+ "step": 45000
658
+ },
659
+ {
660
+ "epoch": 2.0597555454956993,
661
+ "grad_norm": 14.019499778747559,
662
+ "learning_rate": 8.447068568621596e-06,
663
+ "loss": 0.2759,
664
+ "step": 45500
665
+ },
666
+ {
667
+ "epoch": 2.082390221819828,
668
+ "grad_norm": 11.033167839050293,
669
+ "learning_rate": 8.422989125723588e-06,
670
+ "loss": 0.2778,
671
+ "step": 46000
672
+ },
673
+ {
674
+ "epoch": 2.1050248981439568,
675
+ "grad_norm": 55.102169036865234,
676
+ "learning_rate": 8.39890968282558e-06,
677
+ "loss": 0.2759,
678
+ "step": 46500
679
+ },
680
+ {
681
+ "epoch": 2.127659574468085,
682
+ "grad_norm": 22.346426010131836,
683
+ "learning_rate": 8.37483023992757e-06,
684
+ "loss": 0.2793,
685
+ "step": 47000
686
+ },
687
+ {
688
+ "epoch": 2.150294250792214,
689
+ "grad_norm": 9.00412368774414,
690
+ "learning_rate": 8.350750797029561e-06,
691
+ "loss": 0.2696,
692
+ "step": 47500
693
+ },
694
+ {
695
+ "epoch": 2.172928927116342,
696
+ "grad_norm": 13.098092079162598,
697
+ "learning_rate": 8.326671354131551e-06,
698
+ "loss": 0.2882,
699
+ "step": 48000
700
+ },
701
+ {
702
+ "epoch": 2.195563603440471,
703
+ "grad_norm": 16.290449142456055,
704
+ "learning_rate": 8.302591911233543e-06,
705
+ "loss": 0.2807,
706
+ "step": 48500
707
+ },
708
+ {
709
+ "epoch": 2.218198279764599,
710
+ "grad_norm": 36.5540771484375,
711
+ "learning_rate": 8.278512468335533e-06,
712
+ "loss": 0.2761,
713
+ "step": 49000
714
+ },
715
+ {
716
+ "epoch": 2.240832956088728,
717
+ "grad_norm": 6.7274065017700195,
718
+ "learning_rate": 8.254433025437523e-06,
719
+ "loss": 0.2727,
720
+ "step": 49500
721
+ },
722
+ {
723
+ "epoch": 2.2634676324128566,
724
+ "grad_norm": 10.446264266967773,
725
+ "learning_rate": 8.230353582539515e-06,
726
+ "loss": 0.2798,
727
+ "step": 50000
728
+ },
729
+ {
730
+ "epoch": 2.286102308736985,
731
+ "grad_norm": 14.677602767944336,
732
+ "learning_rate": 8.206274139641506e-06,
733
+ "loss": 0.2677,
734
+ "step": 50500
735
+ },
736
+ {
737
+ "epoch": 2.3087369850611137,
738
+ "grad_norm": 23.758255004882812,
739
+ "learning_rate": 8.182194696743498e-06,
740
+ "loss": 0.2917,
741
+ "step": 51000
742
+ },
743
+ {
744
+ "epoch": 2.3313716613852424,
745
+ "grad_norm": 15.98766803741455,
746
+ "learning_rate": 8.158115253845488e-06,
747
+ "loss": 0.2918,
748
+ "step": 51500
749
+ },
750
+ {
751
+ "epoch": 2.3540063377093707,
752
+ "grad_norm": 2.399088144302368,
753
+ "learning_rate": 8.134035810947478e-06,
754
+ "loss": 0.28,
755
+ "step": 52000
756
+ },
757
+ {
758
+ "epoch": 2.3766410140334995,
759
+ "grad_norm": 15.759695053100586,
760
+ "learning_rate": 8.10995636804947e-06,
761
+ "loss": 0.2775,
762
+ "step": 52500
763
+ },
764
+ {
765
+ "epoch": 2.3992756903576278,
766
+ "grad_norm": 46.171875,
767
+ "learning_rate": 8.08587692515146e-06,
768
+ "loss": 0.2799,
769
+ "step": 53000
770
+ },
771
+ {
772
+ "epoch": 2.4219103666817565,
773
+ "grad_norm": 41.83917236328125,
774
+ "learning_rate": 8.061797482253451e-06,
775
+ "loss": 0.2905,
776
+ "step": 53500
777
+ },
778
+ {
779
+ "epoch": 2.444545043005885,
780
+ "grad_norm": 28.17142677307129,
781
+ "learning_rate": 8.037718039355441e-06,
782
+ "loss": 0.2897,
783
+ "step": 54000
784
+ },
785
+ {
786
+ "epoch": 2.4671797193300136,
787
+ "grad_norm": 30.47774314880371,
788
+ "learning_rate": 8.013638596457433e-06,
789
+ "loss": 0.2687,
790
+ "step": 54500
791
+ },
792
+ {
793
+ "epoch": 2.4898143956541423,
794
+ "grad_norm": 24.639873504638672,
795
+ "learning_rate": 7.989559153559425e-06,
796
+ "loss": 0.2965,
797
+ "step": 55000
798
+ },
799
+ {
800
+ "epoch": 2.5124490719782706,
801
+ "grad_norm": 24.947662353515625,
802
+ "learning_rate": 7.965479710661415e-06,
803
+ "loss": 0.2893,
804
+ "step": 55500
805
+ },
806
+ {
807
+ "epoch": 2.5350837483023994,
808
+ "grad_norm": 28.483293533325195,
809
+ "learning_rate": 7.941400267763406e-06,
810
+ "loss": 0.2757,
811
+ "step": 56000
812
+ },
813
+ {
814
+ "epoch": 2.557718424626528,
815
+ "grad_norm": 45.0990104675293,
816
+ "learning_rate": 7.917320824865396e-06,
817
+ "loss": 0.2682,
818
+ "step": 56500
819
+ },
820
+ {
821
+ "epoch": 2.5803531009506564,
822
+ "grad_norm": 22.60608673095703,
823
+ "learning_rate": 7.893241381967386e-06,
824
+ "loss": 0.2779,
825
+ "step": 57000
826
+ },
827
+ {
828
+ "epoch": 2.6029877772747847,
829
+ "grad_norm": 31.8905029296875,
830
+ "learning_rate": 7.869161939069378e-06,
831
+ "loss": 0.2857,
832
+ "step": 57500
833
+ },
834
+ {
835
+ "epoch": 2.6256224535989134,
836
+ "grad_norm": 10.05256175994873,
837
+ "learning_rate": 7.84508249617137e-06,
838
+ "loss": 0.2765,
839
+ "step": 58000
840
+ },
841
+ {
842
+ "epoch": 2.648257129923042,
843
+ "grad_norm": 18.466182708740234,
844
+ "learning_rate": 7.82100305327336e-06,
845
+ "loss": 0.2743,
846
+ "step": 58500
847
+ },
848
+ {
849
+ "epoch": 2.6708918062471705,
850
+ "grad_norm": 22.704708099365234,
851
+ "learning_rate": 7.796923610375351e-06,
852
+ "loss": 0.2718,
853
+ "step": 59000
854
+ },
855
+ {
856
+ "epoch": 2.6935264825712992,
857
+ "grad_norm": 38.51487731933594,
858
+ "learning_rate": 7.772844167477341e-06,
859
+ "loss": 0.2733,
860
+ "step": 59500
861
+ },
862
+ {
863
+ "epoch": 2.716161158895428,
864
+ "grad_norm": 25.67682647705078,
865
+ "learning_rate": 7.748764724579333e-06,
866
+ "loss": 0.2803,
867
+ "step": 60000
868
+ },
869
+ {
870
+ "epoch": 2.7387958352195563,
871
+ "grad_norm": 39.251068115234375,
872
+ "learning_rate": 7.724685281681323e-06,
873
+ "loss": 0.2757,
874
+ "step": 60500
875
+ },
876
+ {
877
+ "epoch": 2.761430511543685,
878
+ "grad_norm": 20.692581176757812,
879
+ "learning_rate": 7.700605838783315e-06,
880
+ "loss": 0.2853,
881
+ "step": 61000
882
+ },
883
+ {
884
+ "epoch": 2.7840651878678138,
885
+ "grad_norm": 22.915571212768555,
886
+ "learning_rate": 7.676526395885305e-06,
887
+ "loss": 0.2899,
888
+ "step": 61500
889
+ },
890
+ {
891
+ "epoch": 2.806699864191942,
892
+ "grad_norm": 20.167299270629883,
893
+ "learning_rate": 7.652446952987296e-06,
894
+ "loss": 0.2802,
895
+ "step": 62000
896
+ },
897
+ {
898
+ "epoch": 2.8293345405160704,
899
+ "grad_norm": 31.358797073364258,
900
+ "learning_rate": 7.628367510089287e-06,
901
+ "loss": 0.2775,
902
+ "step": 62500
903
+ },
904
+ {
905
+ "epoch": 2.851969216840199,
906
+ "grad_norm": 16.160572052001953,
907
+ "learning_rate": 7.604288067191278e-06,
908
+ "loss": 0.2755,
909
+ "step": 63000
910
+ },
911
+ {
912
+ "epoch": 2.874603893164328,
913
+ "grad_norm": 24.560882568359375,
914
+ "learning_rate": 7.58020862429327e-06,
915
+ "loss": 0.3024,
916
+ "step": 63500
917
+ },
918
+ {
919
+ "epoch": 2.897238569488456,
920
+ "grad_norm": 30.818029403686523,
921
+ "learning_rate": 7.55612918139526e-06,
922
+ "loss": 0.2878,
923
+ "step": 64000
924
+ },
925
+ {
926
+ "epoch": 2.919873245812585,
927
+ "grad_norm": 31.68956756591797,
928
+ "learning_rate": 7.532049738497251e-06,
929
+ "loss": 0.2793,
930
+ "step": 64500
931
+ },
932
+ {
933
+ "epoch": 2.9425079221367136,
934
+ "grad_norm": 17.775924682617188,
935
+ "learning_rate": 7.507970295599241e-06,
936
+ "loss": 0.2824,
937
+ "step": 65000
938
+ },
939
+ {
940
+ "epoch": 2.965142598460842,
941
+ "grad_norm": 26.35023307800293,
942
+ "learning_rate": 7.483890852701232e-06,
943
+ "loss": 0.275,
944
+ "step": 65500
945
+ },
946
+ {
947
+ "epoch": 2.9877772747849707,
948
+ "grad_norm": 13.411957740783691,
949
+ "learning_rate": 7.459811409803224e-06,
950
+ "loss": 0.2867,
951
+ "step": 66000
952
+ },
953
+ {
954
+ "epoch": 3.0,
955
+ "eval_accuracy": 0.8651422169030583,
956
+ "eval_loss": 0.39841848611831665,
957
+ "eval_runtime": 26.0093,
958
+ "eval_samples_per_second": 1509.882,
959
+ "eval_steps_per_second": 94.389,
960
+ "step": 66270
961
+ },
962
+ {
963
+ "epoch": 3.010411951109099,
964
+ "grad_norm": 31.959758758544922,
965
+ "learning_rate": 7.435731966905214e-06,
966
+ "loss": 0.2628,
967
+ "step": 66500
968
+ },
969
+ {
970
+ "epoch": 3.0330466274332277,
971
+ "grad_norm": 9.174257278442383,
972
+ "learning_rate": 7.4116525240072056e-06,
973
+ "loss": 0.2354,
974
+ "step": 67000
975
+ },
976
+ {
977
+ "epoch": 3.0556813037573565,
978
+ "grad_norm": 41.27067565917969,
979
+ "learning_rate": 7.387573081109196e-06,
980
+ "loss": 0.231,
981
+ "step": 67500
982
+ },
983
+ {
984
+ "epoch": 3.0783159800814848,
985
+ "grad_norm": 37.20170211791992,
986
+ "learning_rate": 7.363493638211186e-06,
987
+ "loss": 0.2168,
988
+ "step": 68000
989
+ },
990
+ {
991
+ "epoch": 3.1009506564056135,
992
+ "grad_norm": 37.324825286865234,
993
+ "learning_rate": 7.339414195313178e-06,
994
+ "loss": 0.2244,
995
+ "step": 68500
996
+ },
997
+ {
998
+ "epoch": 3.123585332729742,
999
+ "grad_norm": 39.479610443115234,
1000
+ "learning_rate": 7.315334752415169e-06,
1001
+ "loss": 0.2199,
1002
+ "step": 69000
1003
+ },
1004
+ {
1005
+ "epoch": 3.1462200090538706,
1006
+ "grad_norm": 38.33029556274414,
1007
+ "learning_rate": 7.29125530951716e-06,
1008
+ "loss": 0.2285,
1009
+ "step": 69500
1010
+ },
1011
+ {
1012
+ "epoch": 3.1688546853779993,
1013
+ "grad_norm": 34.06528091430664,
1014
+ "learning_rate": 7.2671758666191506e-06,
1015
+ "loss": 0.224,
1016
+ "step": 70000
1017
+ },
1018
+ {
1019
+ "epoch": 3.1914893617021276,
1020
+ "grad_norm": 36.66078186035156,
1021
+ "learning_rate": 7.2430964237211406e-06,
1022
+ "loss": 0.2224,
1023
+ "step": 70500
1024
+ },
1025
+ {
1026
+ "epoch": 3.2141240380262563,
1027
+ "grad_norm": 13.858600616455078,
1028
+ "learning_rate": 7.219016980823132e-06,
1029
+ "loss": 0.2163,
1030
+ "step": 71000
1031
+ },
1032
+ {
1033
+ "epoch": 3.2367587143503846,
1034
+ "grad_norm": 2.7164244651794434,
1035
+ "learning_rate": 7.194937537925123e-06,
1036
+ "loss": 0.2356,
1037
+ "step": 71500
1038
+ },
1039
+ {
1040
+ "epoch": 3.2593933906745134,
1041
+ "grad_norm": 57.87660598754883,
1042
+ "learning_rate": 7.170858095027115e-06,
1043
+ "loss": 0.2271,
1044
+ "step": 72000
1045
+ },
1046
+ {
1047
+ "epoch": 3.2820280669986417,
1048
+ "grad_norm": 90.21813201904297,
1049
+ "learning_rate": 7.146778652129105e-06,
1050
+ "loss": 0.2177,
1051
+ "step": 72500
1052
+ },
1053
+ {
1054
+ "epoch": 3.3046627433227704,
1055
+ "grad_norm": 14.034249305725098,
1056
+ "learning_rate": 7.1226992092310956e-06,
1057
+ "loss": 0.2225,
1058
+ "step": 73000
1059
+ },
1060
+ {
1061
+ "epoch": 3.327297419646899,
1062
+ "grad_norm": 46.05585861206055,
1063
+ "learning_rate": 7.098619766333087e-06,
1064
+ "loss": 0.2242,
1065
+ "step": 73500
1066
+ },
1067
+ {
1068
+ "epoch": 3.3499320959710275,
1069
+ "grad_norm": 37.766517639160156,
1070
+ "learning_rate": 7.074540323435077e-06,
1071
+ "loss": 0.2384,
1072
+ "step": 74000
1073
+ },
1074
+ {
1075
+ "epoch": 3.3725667722951562,
1076
+ "grad_norm": 9.106913566589355,
1077
+ "learning_rate": 7.050460880537069e-06,
1078
+ "loss": 0.2485,
1079
+ "step": 74500
1080
+ },
1081
+ {
1082
+ "epoch": 3.395201448619285,
1083
+ "grad_norm": 15.67898178100586,
1084
+ "learning_rate": 7.026381437639059e-06,
1085
+ "loss": 0.2373,
1086
+ "step": 75000
1087
+ },
1088
+ {
1089
+ "epoch": 3.4178361249434133,
1090
+ "grad_norm": 26.127885818481445,
1091
+ "learning_rate": 7.00230199474105e-06,
1092
+ "loss": 0.2333,
1093
+ "step": 75500
1094
+ },
1095
+ {
1096
+ "epoch": 3.440470801267542,
1097
+ "grad_norm": 14.250904083251953,
1098
+ "learning_rate": 6.9782225518430414e-06,
1099
+ "loss": 0.2189,
1100
+ "step": 76000
1101
+ },
1102
+ {
1103
+ "epoch": 3.4631054775916703,
1104
+ "grad_norm": 67.55126190185547,
1105
+ "learning_rate": 6.954143108945031e-06,
1106
+ "loss": 0.2378,
1107
+ "step": 76500
1108
+ },
1109
+ {
1110
+ "epoch": 3.485740153915799,
1111
+ "grad_norm": 12.584871292114258,
1112
+ "learning_rate": 6.930063666047023e-06,
1113
+ "loss": 0.2448,
1114
+ "step": 77000
1115
+ },
1116
+ {
1117
+ "epoch": 3.5083748302399274,
1118
+ "grad_norm": 26.13035011291504,
1119
+ "learning_rate": 6.905984223149014e-06,
1120
+ "loss": 0.2302,
1121
+ "step": 77500
1122
+ },
1123
+ {
1124
+ "epoch": 3.531009506564056,
1125
+ "grad_norm": 19.708215713500977,
1126
+ "learning_rate": 6.881904780251004e-06,
1127
+ "loss": 0.2239,
1128
+ "step": 78000
1129
+ },
1130
+ {
1131
+ "epoch": 3.553644182888185,
1132
+ "grad_norm": 9.146390914916992,
1133
+ "learning_rate": 6.857825337352996e-06,
1134
+ "loss": 0.2303,
1135
+ "step": 78500
1136
+ },
1137
+ {
1138
+ "epoch": 3.576278859212313,
1139
+ "grad_norm": 69.38152313232422,
1140
+ "learning_rate": 6.8337458944549864e-06,
1141
+ "loss": 0.2366,
1142
+ "step": 79000
1143
+ },
1144
+ {
1145
+ "epoch": 3.598913535536442,
1146
+ "grad_norm": 43.6939811706543,
1147
+ "learning_rate": 6.809666451556978e-06,
1148
+ "loss": 0.2263,
1149
+ "step": 79500
1150
+ },
1151
+ {
1152
+ "epoch": 3.6215482118605706,
1153
+ "grad_norm": 12.009560585021973,
1154
+ "learning_rate": 6.785587008658968e-06,
1155
+ "loss": 0.2251,
1156
+ "step": 80000
1157
+ },
1158
+ {
1159
+ "epoch": 3.644182888184699,
1160
+ "grad_norm": 21.58733558654785,
1161
+ "learning_rate": 6.761507565760959e-06,
1162
+ "loss": 0.2447,
1163
+ "step": 80500
1164
+ },
1165
+ {
1166
+ "epoch": 3.6668175645088277,
1167
+ "grad_norm": 15.762284278869629,
1168
+ "learning_rate": 6.73742812286295e-06,
1169
+ "loss": 0.2349,
1170
+ "step": 81000
1171
+ },
1172
+ {
1173
+ "epoch": 3.689452240832956,
1174
+ "grad_norm": 32.47639083862305,
1175
+ "learning_rate": 6.713348679964941e-06,
1176
+ "loss": 0.2413,
1177
+ "step": 81500
1178
+ },
1179
+ {
1180
+ "epoch": 3.7120869171570847,
1181
+ "grad_norm": 36.506526947021484,
1182
+ "learning_rate": 6.689269237066932e-06,
1183
+ "loss": 0.2311,
1184
+ "step": 82000
1185
+ },
1186
+ {
1187
+ "epoch": 3.734721593481213,
1188
+ "grad_norm": 30.49101448059082,
1189
+ "learning_rate": 6.665189794168922e-06,
1190
+ "loss": 0.2399,
1191
+ "step": 82500
1192
+ },
1193
+ {
1194
+ "epoch": 3.7573562698053418,
1195
+ "grad_norm": 10.327536582946777,
1196
+ "learning_rate": 6.641110351270914e-06,
1197
+ "loss": 0.2332,
1198
+ "step": 83000
1199
+ },
1200
+ {
1201
+ "epoch": 3.7799909461294705,
1202
+ "grad_norm": 41.280303955078125,
1203
+ "learning_rate": 6.617030908372905e-06,
1204
+ "loss": 0.2274,
1205
+ "step": 83500
1206
+ },
1207
+ {
1208
+ "epoch": 3.802625622453599,
1209
+ "grad_norm": 35.73218536376953,
1210
+ "learning_rate": 6.592951465474895e-06,
1211
+ "loss": 0.2363,
1212
+ "step": 84000
1213
+ },
1214
+ {
1215
+ "epoch": 3.8252602987777276,
1216
+ "grad_norm": 19.240692138671875,
1217
+ "learning_rate": 6.5688720225768865e-06,
1218
+ "loss": 0.2408,
1219
+ "step": 84500
1220
+ },
1221
+ {
1222
+ "epoch": 3.8478949751018563,
1223
+ "grad_norm": 55.575645446777344,
1224
+ "learning_rate": 6.544792579678877e-06,
1225
+ "loss": 0.2281,
1226
+ "step": 85000
1227
+ },
1228
+ {
1229
+ "epoch": 3.8705296514259846,
1230
+ "grad_norm": 5.29152250289917,
1231
+ "learning_rate": 6.520713136780868e-06,
1232
+ "loss": 0.2316,
1233
+ "step": 85500
1234
+ },
1235
+ {
1236
+ "epoch": 3.893164327750113,
1237
+ "grad_norm": 74.08470153808594,
1238
+ "learning_rate": 6.496633693882859e-06,
1239
+ "loss": 0.2361,
1240
+ "step": 86000
1241
+ },
1242
+ {
1243
+ "epoch": 3.9157990040742416,
1244
+ "grad_norm": 23.3903751373291,
1245
+ "learning_rate": 6.472554250984849e-06,
1246
+ "loss": 0.2425,
1247
+ "step": 86500
1248
+ },
1249
+ {
1250
+ "epoch": 3.9384336803983704,
1251
+ "grad_norm": 37.02583694458008,
1252
+ "learning_rate": 6.448474808086841e-06,
1253
+ "loss": 0.2377,
1254
+ "step": 87000
1255
+ },
1256
+ {
1257
+ "epoch": 3.9610683567224987,
1258
+ "grad_norm": 13.134513854980469,
1259
+ "learning_rate": 6.4243953651888315e-06,
1260
+ "loss": 0.2424,
1261
+ "step": 87500
1262
+ },
1263
+ {
1264
+ "epoch": 3.9837030330466274,
1265
+ "grad_norm": 10.808335304260254,
1266
+ "learning_rate": 6.400315922290823e-06,
1267
+ "loss": 0.2339,
1268
+ "step": 88000
1269
+ },
1270
+ {
1271
+ "epoch": 4.0,
1272
+ "eval_accuracy": 0.8661353161365893,
1273
+ "eval_loss": 0.4954204857349396,
1274
+ "eval_runtime": 26.0621,
1275
+ "eval_samples_per_second": 1506.824,
1276
+ "eval_steps_per_second": 94.198,
1277
+ "step": 88360
1278
+ },
1279
+ {
1280
+ "epoch": 4.006337709370756,
1281
+ "grad_norm": 9.569930076599121,
1282
+ "learning_rate": 6.376236479392813e-06,
1283
+ "loss": 0.2142,
1284
+ "step": 88500
1285
+ },
1286
+ {
1287
+ "epoch": 4.0289723856948845,
1288
+ "grad_norm": 25.4268856048584,
1289
+ "learning_rate": 6.352157036494804e-06,
1290
+ "loss": 0.1827,
1291
+ "step": 89000
1292
+ },
1293
+ {
1294
+ "epoch": 4.051607062019013,
1295
+ "grad_norm": 60.39373779296875,
1296
+ "learning_rate": 6.328077593596796e-06,
1297
+ "loss": 0.197,
1298
+ "step": 89500
1299
+ },
1300
+ {
1301
+ "epoch": 4.074241738343142,
1302
+ "grad_norm": 63.3898811340332,
1303
+ "learning_rate": 6.303998150698786e-06,
1304
+ "loss": 0.196,
1305
+ "step": 90000
1306
+ },
1307
+ {
1308
+ "epoch": 4.09687641466727,
1309
+ "grad_norm": 61.60245132446289,
1310
+ "learning_rate": 6.279918707800777e-06,
1311
+ "loss": 0.1904,
1312
+ "step": 90500
1313
+ },
1314
+ {
1315
+ "epoch": 4.119511090991399,
1316
+ "grad_norm": 12.662140846252441,
1317
+ "learning_rate": 6.255839264902767e-06,
1318
+ "loss": 0.1933,
1319
+ "step": 91000
1320
+ },
1321
+ {
1322
+ "epoch": 4.142145767315528,
1323
+ "grad_norm": 15.43615436553955,
1324
+ "learning_rate": 6.231759822004758e-06,
1325
+ "loss": 0.1922,
1326
+ "step": 91500
1327
+ },
1328
+ {
1329
+ "epoch": 4.164780443639656,
1330
+ "grad_norm": 64.14022064208984,
1331
+ "learning_rate": 6.20768037910675e-06,
1332
+ "loss": 0.1969,
1333
+ "step": 92000
1334
+ },
1335
+ {
1336
+ "epoch": 4.187415119963784,
1337
+ "grad_norm": 46.96083068847656,
1338
+ "learning_rate": 6.18360093620874e-06,
1339
+ "loss": 0.2086,
1340
+ "step": 92500
1341
+ },
1342
+ {
1343
+ "epoch": 4.2100497962879135,
1344
+ "grad_norm": 28.856672286987305,
1345
+ "learning_rate": 6.1595214933107315e-06,
1346
+ "loss": 0.2022,
1347
+ "step": 93000
1348
+ },
1349
+ {
1350
+ "epoch": 4.232684472612042,
1351
+ "grad_norm": 0.08567750453948975,
1352
+ "learning_rate": 6.135442050412722e-06,
1353
+ "loss": 0.1966,
1354
+ "step": 93500
1355
+ },
1356
+ {
1357
+ "epoch": 4.25531914893617,
1358
+ "grad_norm": 23.097795486450195,
1359
+ "learning_rate": 6.111362607514712e-06,
1360
+ "loss": 0.1994,
1361
+ "step": 94000
1362
+ },
1363
+ {
1364
+ "epoch": 4.277953825260298,
1365
+ "grad_norm": 124.21513366699219,
1366
+ "learning_rate": 6.087283164616704e-06,
1367
+ "loss": 0.2014,
1368
+ "step": 94500
1369
+ },
1370
+ {
1371
+ "epoch": 4.300588501584428,
1372
+ "grad_norm": 106.59521484375,
1373
+ "learning_rate": 6.063203721718695e-06,
1374
+ "loss": 0.1999,
1375
+ "step": 95000
1376
+ },
1377
+ {
1378
+ "epoch": 4.323223177908556,
1379
+ "grad_norm": 9.190028190612793,
1380
+ "learning_rate": 6.039124278820686e-06,
1381
+ "loss": 0.1989,
1382
+ "step": 95500
1383
+ },
1384
+ {
1385
+ "epoch": 4.345857854232684,
1386
+ "grad_norm": 0.6034038066864014,
1387
+ "learning_rate": 6.0150448359226765e-06,
1388
+ "loss": 0.2003,
1389
+ "step": 96000
1390
+ },
1391
+ {
1392
+ "epoch": 4.368492530556813,
1393
+ "grad_norm": 28.348718643188477,
1394
+ "learning_rate": 5.990965393024667e-06,
1395
+ "loss": 0.1944,
1396
+ "step": 96500
1397
+ },
1398
+ {
1399
+ "epoch": 4.391127206880942,
1400
+ "grad_norm": 34.211814880371094,
1401
+ "learning_rate": 5.966885950126658e-06,
1402
+ "loss": 0.214,
1403
+ "step": 97000
1404
+ },
1405
+ {
1406
+ "epoch": 4.41376188320507,
1407
+ "grad_norm": 78.27364349365234,
1408
+ "learning_rate": 5.942806507228649e-06,
1409
+ "loss": 0.1948,
1410
+ "step": 97500
1411
+ },
1412
+ {
1413
+ "epoch": 4.436396559529198,
1414
+ "grad_norm": 39.985931396484375,
1415
+ "learning_rate": 5.918727064330641e-06,
1416
+ "loss": 0.1979,
1417
+ "step": 98000
1418
+ },
1419
+ {
1420
+ "epoch": 4.4590312358533275,
1421
+ "grad_norm": 125.82061004638672,
1422
+ "learning_rate": 5.894647621432631e-06,
1423
+ "loss": 0.2057,
1424
+ "step": 98500
1425
+ },
1426
+ {
1427
+ "epoch": 4.481665912177456,
1428
+ "grad_norm": 24.297237396240234,
1429
+ "learning_rate": 5.8705681785346215e-06,
1430
+ "loss": 0.1873,
1431
+ "step": 99000
1432
+ },
1433
+ {
1434
+ "epoch": 4.504300588501584,
1435
+ "grad_norm": 14.736886024475098,
1436
+ "learning_rate": 5.846488735636613e-06,
1437
+ "loss": 0.2043,
1438
+ "step": 99500
1439
+ },
1440
+ {
1441
+ "epoch": 4.526935264825713,
1442
+ "grad_norm": 47.609375,
1443
+ "learning_rate": 5.822409292738603e-06,
1444
+ "loss": 0.2043,
1445
+ "step": 100000
1446
+ },
1447
+ {
1448
+ "epoch": 4.549569941149842,
1449
+ "grad_norm": 32.71791076660156,
1450
+ "learning_rate": 5.798329849840595e-06,
1451
+ "loss": 0.1981,
1452
+ "step": 100500
1453
+ },
1454
+ {
1455
+ "epoch": 4.57220461747397,
1456
+ "grad_norm": 32.31149673461914,
1457
+ "learning_rate": 5.774250406942586e-06,
1458
+ "loss": 0.1904,
1459
+ "step": 101000
1460
+ },
1461
+ {
1462
+ "epoch": 4.594839293798099,
1463
+ "grad_norm": 15.635351181030273,
1464
+ "learning_rate": 5.7501709640445765e-06,
1465
+ "loss": 0.2027,
1466
+ "step": 101500
1467
+ },
1468
+ {
1469
+ "epoch": 4.617473970122227,
1470
+ "grad_norm": 0.519290566444397,
1471
+ "learning_rate": 5.726091521146567e-06,
1472
+ "loss": 0.2102,
1473
+ "step": 102000
1474
+ },
1475
+ {
1476
+ "epoch": 4.640108646446356,
1477
+ "grad_norm": 54.78620910644531,
1478
+ "learning_rate": 5.702012078248557e-06,
1479
+ "loss": 0.2028,
1480
+ "step": 102500
1481
+ },
1482
+ {
1483
+ "epoch": 4.662743322770485,
1484
+ "grad_norm": 15.834295272827148,
1485
+ "learning_rate": 5.677932635350549e-06,
1486
+ "loss": 0.2068,
1487
+ "step": 103000
1488
+ },
1489
+ {
1490
+ "epoch": 4.685377999094613,
1491
+ "grad_norm": 18.294235229492188,
1492
+ "learning_rate": 5.65385319245254e-06,
1493
+ "loss": 0.2194,
1494
+ "step": 103500
1495
+ },
1496
+ {
1497
+ "epoch": 4.7080126754187415,
1498
+ "grad_norm": 9.605391502380371,
1499
+ "learning_rate": 5.6297737495545315e-06,
1500
+ "loss": 0.1946,
1501
+ "step": 104000
1502
+ },
1503
+ {
1504
+ "epoch": 4.73064735174287,
1505
+ "grad_norm": 50.544219970703125,
1506
+ "learning_rate": 5.6056943066565215e-06,
1507
+ "loss": 0.2007,
1508
+ "step": 104500
1509
+ },
1510
+ {
1511
+ "epoch": 4.753282028066999,
1512
+ "grad_norm": 19.007843017578125,
1513
+ "learning_rate": 5.581614863758512e-06,
1514
+ "loss": 0.2192,
1515
+ "step": 105000
1516
+ },
1517
+ {
1518
+ "epoch": 4.775916704391127,
1519
+ "grad_norm": 1.8254756927490234,
1520
+ "learning_rate": 5.557535420860504e-06,
1521
+ "loss": 0.1972,
1522
+ "step": 105500
1523
+ },
1524
+ {
1525
+ "epoch": 4.7985513807152556,
1526
+ "grad_norm": 32.10722732543945,
1527
+ "learning_rate": 5.533455977962494e-06,
1528
+ "loss": 0.2088,
1529
+ "step": 106000
1530
+ },
1531
+ {
1532
+ "epoch": 4.821186057039384,
1533
+ "grad_norm": 54.646392822265625,
1534
+ "learning_rate": 5.509376535064486e-06,
1535
+ "loss": 0.2111,
1536
+ "step": 106500
1537
+ },
1538
+ {
1539
+ "epoch": 4.843820733363513,
1540
+ "grad_norm": 0.40658873319625854,
1541
+ "learning_rate": 5.485297092166476e-06,
1542
+ "loss": 0.2114,
1543
+ "step": 107000
1544
+ },
1545
+ {
1546
+ "epoch": 4.866455409687641,
1547
+ "grad_norm": 12.083222389221191,
1548
+ "learning_rate": 5.4612176492684665e-06,
1549
+ "loss": 0.1959,
1550
+ "step": 107500
1551
+ },
1552
+ {
1553
+ "epoch": 4.88909008601177,
1554
+ "grad_norm": 0.27834174036979675,
1555
+ "learning_rate": 5.437138206370458e-06,
1556
+ "loss": 0.1994,
1557
+ "step": 108000
1558
+ },
1559
+ {
1560
+ "epoch": 4.911724762335899,
1561
+ "grad_norm": 7.066097259521484,
1562
+ "learning_rate": 5.413058763472448e-06,
1563
+ "loss": 0.2121,
1564
+ "step": 108500
1565
+ },
1566
+ {
1567
+ "epoch": 4.934359438660027,
1568
+ "grad_norm": 39.164085388183594,
1569
+ "learning_rate": 5.38897932057444e-06,
1570
+ "loss": 0.1952,
1571
+ "step": 109000
1572
+ },
1573
+ {
1574
+ "epoch": 4.956994114984155,
1575
+ "grad_norm": 27.279882431030273,
1576
+ "learning_rate": 5.364899877676431e-06,
1577
+ "loss": 0.1997,
1578
+ "step": 109500
1579
+ },
1580
+ {
1581
+ "epoch": 4.979628791308285,
1582
+ "grad_norm": 54.53019332885742,
1583
+ "learning_rate": 5.340820434778421e-06,
1584
+ "loss": 0.2037,
1585
+ "step": 110000
1586
+ },
1587
+ {
1588
+ "epoch": 5.0,
1589
+ "eval_accuracy": 0.8667464541264547,
1590
+ "eval_loss": 0.6144042611122131,
1591
+ "eval_runtime": 26.0348,
1592
+ "eval_samples_per_second": 1508.405,
1593
+ "eval_steps_per_second": 94.297,
1594
+ "step": 110450
1595
+ },
1596
+ {
1597
+ "epoch": 5.002263467632413,
1598
+ "grad_norm": 24.079191207885742,
1599
+ "learning_rate": 5.316740991880412e-06,
1600
+ "loss": 0.197,
1601
+ "step": 110500
1602
+ },
1603
+ {
1604
+ "epoch": 5.024898143956541,
1605
+ "grad_norm": 0.3425958752632141,
1606
+ "learning_rate": 5.292661548982403e-06,
1607
+ "loss": 0.1472,
1608
+ "step": 111000
1609
+ },
1610
+ {
1611
+ "epoch": 5.04753282028067,
1612
+ "grad_norm": 0.11918644607067108,
1613
+ "learning_rate": 5.268582106084394e-06,
1614
+ "loss": 0.1614,
1615
+ "step": 111500
1616
+ },
1617
+ {
1618
+ "epoch": 5.070167496604799,
1619
+ "grad_norm": 0.0681818500161171,
1620
+ "learning_rate": 5.244502663186385e-06,
1621
+ "loss": 0.1781,
1622
+ "step": 112000
1623
+ },
1624
+ {
1625
+ "epoch": 5.092802172928927,
1626
+ "grad_norm": 62.686737060546875,
1627
+ "learning_rate": 5.220423220288376e-06,
1628
+ "loss": 0.1781,
1629
+ "step": 112500
1630
+ },
1631
+ {
1632
+ "epoch": 5.115436849253055,
1633
+ "grad_norm": 73.86345672607422,
1634
+ "learning_rate": 5.1963437773903666e-06,
1635
+ "loss": 0.1679,
1636
+ "step": 113000
1637
+ },
1638
+ {
1639
+ "epoch": 5.1380715255771845,
1640
+ "grad_norm": 0.13405387103557587,
1641
+ "learning_rate": 5.172264334492357e-06,
1642
+ "loss": 0.184,
1643
+ "step": 113500
1644
+ },
1645
+ {
1646
+ "epoch": 5.160706201901313,
1647
+ "grad_norm": 38.33438491821289,
1648
+ "learning_rate": 5.148184891594349e-06,
1649
+ "loss": 0.1679,
1650
+ "step": 114000
1651
+ },
1652
+ {
1653
+ "epoch": 5.183340878225441,
1654
+ "grad_norm": 0.36470118165016174,
1655
+ "learning_rate": 5.124105448696339e-06,
1656
+ "loss": 0.154,
1657
+ "step": 114500
1658
+ },
1659
+ {
1660
+ "epoch": 5.20597555454957,
1661
+ "grad_norm": 31.240108489990234,
1662
+ "learning_rate": 5.10002600579833e-06,
1663
+ "loss": 0.1718,
1664
+ "step": 115000
1665
+ },
1666
+ {
1667
+ "epoch": 5.228610230873699,
1668
+ "grad_norm": 0.2006056010723114,
1669
+ "learning_rate": 5.075946562900322e-06,
1670
+ "loss": 0.1624,
1671
+ "step": 115500
1672
+ },
1673
+ {
1674
+ "epoch": 5.251244907197827,
1675
+ "grad_norm": 80.91893768310547,
1676
+ "learning_rate": 5.0518671200023116e-06,
1677
+ "loss": 0.1751,
1678
+ "step": 116000
1679
+ },
1680
+ {
1681
+ "epoch": 5.273879583521955,
1682
+ "grad_norm": 39.18518829345703,
1683
+ "learning_rate": 5.027787677104303e-06,
1684
+ "loss": 0.1768,
1685
+ "step": 116500
1686
+ },
1687
+ {
1688
+ "epoch": 5.296514259846084,
1689
+ "grad_norm": 10.39631175994873,
1690
+ "learning_rate": 5.003708234206294e-06,
1691
+ "loss": 0.186,
1692
+ "step": 117000
1693
+ },
1694
+ {
1695
+ "epoch": 5.319148936170213,
1696
+ "grad_norm": 2.1696979999542236,
1697
+ "learning_rate": 4.979628791308285e-06,
1698
+ "loss": 0.175,
1699
+ "step": 117500
1700
+ },
1701
+ {
1702
+ "epoch": 5.341783612494341,
1703
+ "grad_norm": 21.064584732055664,
1704
+ "learning_rate": 4.955549348410276e-06,
1705
+ "loss": 0.1625,
1706
+ "step": 118000
1707
+ },
1708
+ {
1709
+ "epoch": 5.36441828881847,
1710
+ "grad_norm": 0.3759268522262573,
1711
+ "learning_rate": 4.931469905512267e-06,
1712
+ "loss": 0.1834,
1713
+ "step": 118500
1714
+ },
1715
+ {
1716
+ "epoch": 5.3870529651425985,
1717
+ "grad_norm": 35.439117431640625,
1718
+ "learning_rate": 4.9073904626142574e-06,
1719
+ "loss": 0.1859,
1720
+ "step": 119000
1721
+ },
1722
+ {
1723
+ "epoch": 5.409687641466727,
1724
+ "grad_norm": 63.224666595458984,
1725
+ "learning_rate": 4.883311019716248e-06,
1726
+ "loss": 0.1722,
1727
+ "step": 119500
1728
+ },
1729
+ {
1730
+ "epoch": 5.432322317790856,
1731
+ "grad_norm": 2.553009033203125,
1732
+ "learning_rate": 4.859231576818239e-06,
1733
+ "loss": 0.173,
1734
+ "step": 120000
1735
+ },
1736
+ {
1737
+ "epoch": 5.454956994114984,
1738
+ "grad_norm": 15.771255493164062,
1739
+ "learning_rate": 4.83515213392023e-06,
1740
+ "loss": 0.1922,
1741
+ "step": 120500
1742
+ },
1743
+ {
1744
+ "epoch": 5.4775916704391125,
1745
+ "grad_norm": 108.56519317626953,
1746
+ "learning_rate": 4.811072691022221e-06,
1747
+ "loss": 0.1717,
1748
+ "step": 121000
1749
+ },
1750
+ {
1751
+ "epoch": 5.500226346763242,
1752
+ "grad_norm": 78.23528289794922,
1753
+ "learning_rate": 4.7869932481242124e-06,
1754
+ "loss": 0.1822,
1755
+ "step": 121500
1756
+ },
1757
+ {
1758
+ "epoch": 5.52286102308737,
1759
+ "grad_norm": 62.75898742675781,
1760
+ "learning_rate": 4.7629138052262024e-06,
1761
+ "loss": 0.1697,
1762
+ "step": 122000
1763
+ },
1764
+ {
1765
+ "epoch": 5.545495699411498,
1766
+ "grad_norm": 136.13113403320312,
1767
+ "learning_rate": 4.738834362328193e-06,
1768
+ "loss": 0.1807,
1769
+ "step": 122500
1770
+ },
1771
+ {
1772
+ "epoch": 5.568130375735627,
1773
+ "grad_norm": 52.2840461730957,
1774
+ "learning_rate": 4.714754919430184e-06,
1775
+ "loss": 0.1762,
1776
+ "step": 123000
1777
+ },
1778
+ {
1779
+ "epoch": 5.590765052059756,
1780
+ "grad_norm": 4.957085609436035,
1781
+ "learning_rate": 4.690675476532176e-06,
1782
+ "loss": 0.1875,
1783
+ "step": 123500
1784
+ },
1785
+ {
1786
+ "epoch": 5.613399728383884,
1787
+ "grad_norm": 39.328086853027344,
1788
+ "learning_rate": 4.666596033634167e-06,
1789
+ "loss": 0.1732,
1790
+ "step": 124000
1791
+ },
1792
+ {
1793
+ "epoch": 5.636034404708012,
1794
+ "grad_norm": 3.6397218704223633,
1795
+ "learning_rate": 4.642516590736157e-06,
1796
+ "loss": 0.175,
1797
+ "step": 124500
1798
+ },
1799
+ {
1800
+ "epoch": 5.658669081032142,
1801
+ "grad_norm": 32.74443435668945,
1802
+ "learning_rate": 4.618437147838148e-06,
1803
+ "loss": 0.1667,
1804
+ "step": 125000
1805
+ },
1806
+ {
1807
+ "epoch": 5.68130375735627,
1808
+ "grad_norm": 32.64069366455078,
1809
+ "learning_rate": 4.594357704940139e-06,
1810
+ "loss": 0.1691,
1811
+ "step": 125500
1812
+ },
1813
+ {
1814
+ "epoch": 5.703938433680398,
1815
+ "grad_norm": 21.668283462524414,
1816
+ "learning_rate": 4.57027826204213e-06,
1817
+ "loss": 0.1785,
1818
+ "step": 126000
1819
+ },
1820
+ {
1821
+ "epoch": 5.7265731100045265,
1822
+ "grad_norm": 8.382264137268066,
1823
+ "learning_rate": 4.546198819144121e-06,
1824
+ "loss": 0.1829,
1825
+ "step": 126500
1826
+ },
1827
+ {
1828
+ "epoch": 5.749207786328656,
1829
+ "grad_norm": 24.240978240966797,
1830
+ "learning_rate": 4.522119376246112e-06,
1831
+ "loss": 0.1573,
1832
+ "step": 127000
1833
+ },
1834
+ {
1835
+ "epoch": 5.771842462652784,
1836
+ "grad_norm": 98.30403900146484,
1837
+ "learning_rate": 4.4980399333481025e-06,
1838
+ "loss": 0.1962,
1839
+ "step": 127500
1840
+ },
1841
+ {
1842
+ "epoch": 5.794477138976912,
1843
+ "grad_norm": 0.6671485304832458,
1844
+ "learning_rate": 4.473960490450093e-06,
1845
+ "loss": 0.1813,
1846
+ "step": 128000
1847
+ },
1848
+ {
1849
+ "epoch": 5.8171118153010415,
1850
+ "grad_norm": 71.27288055419922,
1851
+ "learning_rate": 4.449881047552084e-06,
1852
+ "loss": 0.1747,
1853
+ "step": 128500
1854
+ },
1855
+ {
1856
+ "epoch": 5.83974649162517,
1857
+ "grad_norm": 148.5382537841797,
1858
+ "learning_rate": 4.425801604654075e-06,
1859
+ "loss": 0.1676,
1860
+ "step": 129000
1861
+ },
1862
+ {
1863
+ "epoch": 5.862381167949298,
1864
+ "grad_norm": 0.19661898910999298,
1865
+ "learning_rate": 4.401722161756066e-06,
1866
+ "loss": 0.1771,
1867
+ "step": 129500
1868
+ },
1869
+ {
1870
+ "epoch": 5.885015844273427,
1871
+ "grad_norm": 16.245052337646484,
1872
+ "learning_rate": 4.377642718858057e-06,
1873
+ "loss": 0.1864,
1874
+ "step": 130000
1875
+ },
1876
+ {
1877
+ "epoch": 5.907650520597556,
1878
+ "grad_norm": 0.5395733118057251,
1879
+ "learning_rate": 4.3535632759600475e-06,
1880
+ "loss": 0.1775,
1881
+ "step": 130500
1882
+ },
1883
+ {
1884
+ "epoch": 5.930285196921684,
1885
+ "grad_norm": 13.2942533493042,
1886
+ "learning_rate": 4.329483833062038e-06,
1887
+ "loss": 0.1669,
1888
+ "step": 131000
1889
+ },
1890
+ {
1891
+ "epoch": 5.952919873245813,
1892
+ "grad_norm": 12.363393783569336,
1893
+ "learning_rate": 4.30540439016403e-06,
1894
+ "loss": 0.1747,
1895
+ "step": 131500
1896
+ },
1897
+ {
1898
+ "epoch": 5.975554549569941,
1899
+ "grad_norm": 0.334881067276001,
1900
+ "learning_rate": 4.281324947266021e-06,
1901
+ "loss": 0.1941,
1902
+ "step": 132000
1903
+ },
1904
+ {
1905
+ "epoch": 5.99818922589407,
1906
+ "grad_norm": 9.071168899536133,
1907
+ "learning_rate": 4.257245504368011e-06,
1908
+ "loss": 0.1745,
1909
+ "step": 132500
1910
+ },
1911
+ {
1912
+ "epoch": 6.0,
1913
+ "eval_accuracy": 0.8641491176695272,
1914
+ "eval_loss": 0.6998937726020813,
1915
+ "eval_runtime": 26.0201,
1916
+ "eval_samples_per_second": 1509.256,
1917
+ "eval_steps_per_second": 94.35,
1918
+ "step": 132540
1919
+ },
1920
+ {
1921
+ "epoch": 6.020823902218198,
1922
+ "grad_norm": 79.37480163574219,
1923
+ "learning_rate": 4.2331660614700025e-06,
1924
+ "loss": 0.1323,
1925
+ "step": 133000
1926
+ },
1927
+ {
1928
+ "epoch": 6.043458578542327,
1929
+ "grad_norm": 97.10159301757812,
1930
+ "learning_rate": 4.209086618571993e-06,
1931
+ "loss": 0.1373,
1932
+ "step": 133500
1933
+ },
1934
+ {
1935
+ "epoch": 6.0660932548664555,
1936
+ "grad_norm": 9.43271541595459,
1937
+ "learning_rate": 4.185007175673984e-06,
1938
+ "loss": 0.1422,
1939
+ "step": 134000
1940
+ },
1941
+ {
1942
+ "epoch": 6.088727931190584,
1943
+ "grad_norm": 0.20963682234287262,
1944
+ "learning_rate": 4.160927732775975e-06,
1945
+ "loss": 0.1457,
1946
+ "step": 134500
1947
+ },
1948
+ {
1949
+ "epoch": 6.111362607514713,
1950
+ "grad_norm": 55.66864776611328,
1951
+ "learning_rate": 4.136848289877966e-06,
1952
+ "loss": 0.1508,
1953
+ "step": 135000
1954
+ },
1955
+ {
1956
+ "epoch": 6.133997283838841,
1957
+ "grad_norm": 84.8683090209961,
1958
+ "learning_rate": 4.112768846979957e-06,
1959
+ "loss": 0.1347,
1960
+ "step": 135500
1961
+ },
1962
+ {
1963
+ "epoch": 6.1566319601629695,
1964
+ "grad_norm": 77.05133819580078,
1965
+ "learning_rate": 4.0886894040819475e-06,
1966
+ "loss": 0.1424,
1967
+ "step": 136000
1968
+ },
1969
+ {
1970
+ "epoch": 6.179266636487098,
1971
+ "grad_norm": 0.16760210692882538,
1972
+ "learning_rate": 4.064609961183938e-06,
1973
+ "loss": 0.1401,
1974
+ "step": 136500
1975
+ },
1976
+ {
1977
+ "epoch": 6.201901312811227,
1978
+ "grad_norm": 6.680587291717529,
1979
+ "learning_rate": 4.040530518285929e-06,
1980
+ "loss": 0.1579,
1981
+ "step": 137000
1982
+ },
1983
+ {
1984
+ "epoch": 6.224535989135355,
1985
+ "grad_norm": 10.419951438903809,
1986
+ "learning_rate": 4.01645107538792e-06,
1987
+ "loss": 0.1441,
1988
+ "step": 137500
1989
+ },
1990
+ {
1991
+ "epoch": 6.247170665459484,
1992
+ "grad_norm": 75.0548095703125,
1993
+ "learning_rate": 3.992371632489911e-06,
1994
+ "loss": 0.1489,
1995
+ "step": 138000
1996
+ },
1997
+ {
1998
+ "epoch": 6.269805341783613,
1999
+ "grad_norm": 1.2689846754074097,
2000
+ "learning_rate": 3.968292189591902e-06,
2001
+ "loss": 0.1527,
2002
+ "step": 138500
2003
+ },
2004
+ {
2005
+ "epoch": 6.292440018107741,
2006
+ "grad_norm": 23.291440963745117,
2007
+ "learning_rate": 3.9442127466938925e-06,
2008
+ "loss": 0.1508,
2009
+ "step": 139000
2010
+ },
2011
+ {
2012
+ "epoch": 6.315074694431869,
2013
+ "grad_norm": 99.6236343383789,
2014
+ "learning_rate": 3.920133303795884e-06,
2015
+ "loss": 0.1561,
2016
+ "step": 139500
2017
+ },
2018
+ {
2019
+ "epoch": 6.337709370755999,
2020
+ "grad_norm": 62.657745361328125,
2021
+ "learning_rate": 3.896053860897875e-06,
2022
+ "loss": 0.1316,
2023
+ "step": 140000
2024
+ },
2025
+ {
2026
+ "epoch": 6.360344047080127,
2027
+ "grad_norm": 1.494821310043335,
2028
+ "learning_rate": 3.871974417999865e-06,
2029
+ "loss": 0.1371,
2030
+ "step": 140500
2031
+ },
2032
+ {
2033
+ "epoch": 6.382978723404255,
2034
+ "grad_norm": 54.55570602416992,
2035
+ "learning_rate": 3.847894975101857e-06,
2036
+ "loss": 0.1281,
2037
+ "step": 141000
2038
+ },
2039
+ {
2040
+ "epoch": 6.4056133997283835,
2041
+ "grad_norm": 39.3503303527832,
2042
+ "learning_rate": 3.8238155322038475e-06,
2043
+ "loss": 0.153,
2044
+ "step": 141500
2045
+ },
2046
+ {
2047
+ "epoch": 6.428248076052513,
2048
+ "grad_norm": 211.0976104736328,
2049
+ "learning_rate": 3.7997360893058384e-06,
2050
+ "loss": 0.1529,
2051
+ "step": 142000
2052
+ },
2053
+ {
2054
+ "epoch": 6.450882752376641,
2055
+ "grad_norm": 36.55986404418945,
2056
+ "learning_rate": 3.775656646407829e-06,
2057
+ "loss": 0.1446,
2058
+ "step": 142500
2059
+ },
2060
+ {
2061
+ "epoch": 6.473517428700769,
2062
+ "grad_norm": 14.937396049499512,
2063
+ "learning_rate": 3.7515772035098196e-06,
2064
+ "loss": 0.157,
2065
+ "step": 143000
2066
+ },
2067
+ {
2068
+ "epoch": 6.4961521050248985,
2069
+ "grad_norm": 0.12284702807664871,
2070
+ "learning_rate": 3.727497760611811e-06,
2071
+ "loss": 0.1576,
2072
+ "step": 143500
2073
+ },
2074
+ {
2075
+ "epoch": 6.518786781349027,
2076
+ "grad_norm": 169.0521697998047,
2077
+ "learning_rate": 3.7034183177138017e-06,
2078
+ "loss": 0.1516,
2079
+ "step": 144000
2080
+ },
2081
+ {
2082
+ "epoch": 6.541421457673155,
2083
+ "grad_norm": 119.7757339477539,
2084
+ "learning_rate": 3.6793388748157925e-06,
2085
+ "loss": 0.142,
2086
+ "step": 144500
2087
+ },
2088
+ {
2089
+ "epoch": 6.564056133997283,
2090
+ "grad_norm": 0.5702412128448486,
2091
+ "learning_rate": 3.6552594319177838e-06,
2092
+ "loss": 0.1519,
2093
+ "step": 145000
2094
+ },
2095
+ {
2096
+ "epoch": 6.586690810321413,
2097
+ "grad_norm": 0.6575600504875183,
2098
+ "learning_rate": 3.631179989019774e-06,
2099
+ "loss": 0.1494,
2100
+ "step": 145500
2101
+ },
2102
+ {
2103
+ "epoch": 6.609325486645541,
2104
+ "grad_norm": 104.18098449707031,
2105
+ "learning_rate": 3.607100546121765e-06,
2106
+ "loss": 0.1431,
2107
+ "step": 146000
2108
+ },
2109
+ {
2110
+ "epoch": 6.631960162969669,
2111
+ "grad_norm": 0.18219584226608276,
2112
+ "learning_rate": 3.583021103223756e-06,
2113
+ "loss": 0.1397,
2114
+ "step": 146500
2115
+ },
2116
+ {
2117
+ "epoch": 6.654594839293798,
2118
+ "grad_norm": 39.80546569824219,
2119
+ "learning_rate": 3.558941660325747e-06,
2120
+ "loss": 0.1384,
2121
+ "step": 147000
2122
+ },
2123
+ {
2124
+ "epoch": 6.677229515617927,
2125
+ "grad_norm": 70.61176300048828,
2126
+ "learning_rate": 3.534862217427738e-06,
2127
+ "loss": 0.1452,
2128
+ "step": 147500
2129
+ },
2130
+ {
2131
+ "epoch": 6.699864191942055,
2132
+ "grad_norm": 0.11137774586677551,
2133
+ "learning_rate": 3.5107827745297292e-06,
2134
+ "loss": 0.1649,
2135
+ "step": 148000
2136
+ },
2137
+ {
2138
+ "epoch": 6.722498868266184,
2139
+ "grad_norm": 1.3033461570739746,
2140
+ "learning_rate": 3.4867033316317196e-06,
2141
+ "loss": 0.1468,
2142
+ "step": 148500
2143
+ },
2144
+ {
2145
+ "epoch": 6.7451335445903124,
2146
+ "grad_norm": 188.11358642578125,
2147
+ "learning_rate": 3.4626238887337105e-06,
2148
+ "loss": 0.1396,
2149
+ "step": 149000
2150
+ },
2151
+ {
2152
+ "epoch": 6.767768220914441,
2153
+ "grad_norm": 186.4955596923828,
2154
+ "learning_rate": 3.4385444458357013e-06,
2155
+ "loss": 0.1415,
2156
+ "step": 149500
2157
+ },
2158
+ {
2159
+ "epoch": 6.79040289723857,
2160
+ "grad_norm": 10.157150268554688,
2161
+ "learning_rate": 3.4144650029376926e-06,
2162
+ "loss": 0.1446,
2163
+ "step": 150000
2164
+ },
2165
+ {
2166
+ "epoch": 6.813037573562698,
2167
+ "grad_norm": 14.647910118103027,
2168
+ "learning_rate": 3.3903855600396834e-06,
2169
+ "loss": 0.1363,
2170
+ "step": 150500
2171
+ },
2172
+ {
2173
+ "epoch": 6.8356722498868265,
2174
+ "grad_norm": 0.07332862168550491,
2175
+ "learning_rate": 3.366306117141674e-06,
2176
+ "loss": 0.1622,
2177
+ "step": 151000
2178
+ },
2179
+ {
2180
+ "epoch": 6.858306926210955,
2181
+ "grad_norm": 125.91682434082031,
2182
+ "learning_rate": 3.342226674243665e-06,
2183
+ "loss": 0.1408,
2184
+ "step": 151500
2185
+ },
2186
+ {
2187
+ "epoch": 6.880941602535084,
2188
+ "grad_norm": 0.21492162346839905,
2189
+ "learning_rate": 3.318147231345656e-06,
2190
+ "loss": 0.1423,
2191
+ "step": 152000
2192
+ },
2193
+ {
2194
+ "epoch": 6.903576278859212,
2195
+ "grad_norm": 27.321796417236328,
2196
+ "learning_rate": 3.2940677884476467e-06,
2197
+ "loss": 0.1562,
2198
+ "step": 152500
2199
+ },
2200
+ {
2201
+ "epoch": 6.926210955183341,
2202
+ "grad_norm": 0.10927353799343109,
2203
+ "learning_rate": 3.269988345549638e-06,
2204
+ "loss": 0.1475,
2205
+ "step": 153000
2206
+ },
2207
+ {
2208
+ "epoch": 6.94884563150747,
2209
+ "grad_norm": 97.3139877319336,
2210
+ "learning_rate": 3.2459089026516284e-06,
2211
+ "loss": 0.1438,
2212
+ "step": 153500
2213
+ },
2214
+ {
2215
+ "epoch": 6.971480307831598,
2216
+ "grad_norm": 159.5480499267578,
2217
+ "learning_rate": 3.2218294597536192e-06,
2218
+ "loss": 0.1481,
2219
+ "step": 154000
2220
+ },
2221
+ {
2222
+ "epoch": 6.994114984155726,
2223
+ "grad_norm": 0.06921840459108353,
2224
+ "learning_rate": 3.19775001685561e-06,
2225
+ "loss": 0.1671,
2226
+ "step": 154500
2227
+ },
2228
+ {
2229
+ "epoch": 7.0,
2230
+ "eval_accuracy": 0.8639454050062387,
2231
+ "eval_loss": 0.7750576734542847,
2232
+ "eval_runtime": 26.0647,
2233
+ "eval_samples_per_second": 1506.673,
2234
+ "eval_steps_per_second": 94.189,
2235
+ "step": 154630
2236
+ },
2237
+ {
2238
+ "epoch": 7.016749660479855,
2239
+ "grad_norm": 1.394852638244629,
2240
+ "learning_rate": 3.1736705739576013e-06,
2241
+ "loss": 0.1232,
2242
+ "step": 155000
2243
+ },
2244
+ {
2245
+ "epoch": 7.039384336803984,
2246
+ "grad_norm": 0.33521416783332825,
2247
+ "learning_rate": 3.149591131059592e-06,
2248
+ "loss": 0.108,
2249
+ "step": 155500
2250
+ },
2251
+ {
2252
+ "epoch": 7.062019013128112,
2253
+ "grad_norm": 0.06891336292028427,
2254
+ "learning_rate": 3.1255116881615826e-06,
2255
+ "loss": 0.1155,
2256
+ "step": 156000
2257
+ },
2258
+ {
2259
+ "epoch": 7.0846536894522405,
2260
+ "grad_norm": 15.24691390991211,
2261
+ "learning_rate": 3.101432245263574e-06,
2262
+ "loss": 0.1266,
2263
+ "step": 156500
2264
+ },
2265
+ {
2266
+ "epoch": 7.10728836577637,
2267
+ "grad_norm": 0.06932001560926437,
2268
+ "learning_rate": 3.0773528023655647e-06,
2269
+ "loss": 0.1114,
2270
+ "step": 157000
2271
+ },
2272
+ {
2273
+ "epoch": 7.129923042100498,
2274
+ "grad_norm": 0.0540509857237339,
2275
+ "learning_rate": 3.0532733594675555e-06,
2276
+ "loss": 0.1279,
2277
+ "step": 157500
2278
+ },
2279
+ {
2280
+ "epoch": 7.152557718424626,
2281
+ "grad_norm": 29.716217041015625,
2282
+ "learning_rate": 3.0291939165695468e-06,
2283
+ "loss": 0.115,
2284
+ "step": 158000
2285
+ },
2286
+ {
2287
+ "epoch": 7.1751923947487555,
2288
+ "grad_norm": 0.0442744679749012,
2289
+ "learning_rate": 3.0051144736715376e-06,
2290
+ "loss": 0.1252,
2291
+ "step": 158500
2292
+ },
2293
+ {
2294
+ "epoch": 7.197827071072884,
2295
+ "grad_norm": 7.542829513549805,
2296
+ "learning_rate": 2.981035030773528e-06,
2297
+ "loss": 0.1156,
2298
+ "step": 159000
2299
+ },
2300
+ {
2301
+ "epoch": 7.220461747397012,
2302
+ "grad_norm": 11.190882682800293,
2303
+ "learning_rate": 2.9569555878755193e-06,
2304
+ "loss": 0.1065,
2305
+ "step": 159500
2306
+ },
2307
+ {
2308
+ "epoch": 7.24309642372114,
2309
+ "grad_norm": 0.10643190145492554,
2310
+ "learning_rate": 2.93287614497751e-06,
2311
+ "loss": 0.1316,
2312
+ "step": 160000
2313
+ },
2314
+ {
2315
+ "epoch": 7.26573110004527,
2316
+ "grad_norm": 0.16745133697986603,
2317
+ "learning_rate": 2.908796702079501e-06,
2318
+ "loss": 0.1101,
2319
+ "step": 160500
2320
+ },
2321
+ {
2322
+ "epoch": 7.288365776369398,
2323
+ "grad_norm": 0.16076330840587616,
2324
+ "learning_rate": 2.884717259181492e-06,
2325
+ "loss": 0.1314,
2326
+ "step": 161000
2327
+ },
2328
+ {
2329
+ "epoch": 7.311000452693526,
2330
+ "grad_norm": 0.10047034919261932,
2331
+ "learning_rate": 2.8606378162834826e-06,
2332
+ "loss": 0.1181,
2333
+ "step": 161500
2334
+ },
2335
+ {
2336
+ "epoch": 7.333635129017655,
2337
+ "grad_norm": 0.331920862197876,
2338
+ "learning_rate": 2.8365583733854734e-06,
2339
+ "loss": 0.1259,
2340
+ "step": 162000
2341
+ },
2342
+ {
2343
+ "epoch": 7.356269805341784,
2344
+ "grad_norm": 0.1453462541103363,
2345
+ "learning_rate": 2.8124789304874643e-06,
2346
+ "loss": 0.1249,
2347
+ "step": 162500
2348
+ },
2349
+ {
2350
+ "epoch": 7.378904481665912,
2351
+ "grad_norm": 0.8490937948226929,
2352
+ "learning_rate": 2.7883994875894555e-06,
2353
+ "loss": 0.1209,
2354
+ "step": 163000
2355
+ },
2356
+ {
2357
+ "epoch": 7.401539157990041,
2358
+ "grad_norm": 0.22700923681259155,
2359
+ "learning_rate": 2.7643200446914464e-06,
2360
+ "loss": 0.1244,
2361
+ "step": 163500
2362
+ },
2363
+ {
2364
+ "epoch": 7.424173834314169,
2365
+ "grad_norm": 0.19261109828948975,
2366
+ "learning_rate": 2.7402406017934368e-06,
2367
+ "loss": 0.135,
2368
+ "step": 164000
2369
+ },
2370
+ {
2371
+ "epoch": 7.446808510638298,
2372
+ "grad_norm": 75.31595611572266,
2373
+ "learning_rate": 2.716161158895428e-06,
2374
+ "loss": 0.1262,
2375
+ "step": 164500
2376
+ },
2377
+ {
2378
+ "epoch": 7.469443186962426,
2379
+ "grad_norm": 65.7965087890625,
2380
+ "learning_rate": 2.692081715997419e-06,
2381
+ "loss": 0.1139,
2382
+ "step": 165000
2383
+ },
2384
+ {
2385
+ "epoch": 7.492077863286555,
2386
+ "grad_norm": 114.45712280273438,
2387
+ "learning_rate": 2.6680022730994097e-06,
2388
+ "loss": 0.1335,
2389
+ "step": 165500
2390
+ },
2391
+ {
2392
+ "epoch": 7.5147125396106835,
2393
+ "grad_norm": 0.08683761209249496,
2394
+ "learning_rate": 2.643922830201401e-06,
2395
+ "loss": 0.1257,
2396
+ "step": 166000
2397
+ },
2398
+ {
2399
+ "epoch": 7.537347215934812,
2400
+ "grad_norm": 91.00257873535156,
2401
+ "learning_rate": 2.6198433873033918e-06,
2402
+ "loss": 0.1307,
2403
+ "step": 166500
2404
+ },
2405
+ {
2406
+ "epoch": 7.559981892258941,
2407
+ "grad_norm": 0.0967201367020607,
2408
+ "learning_rate": 2.595763944405382e-06,
2409
+ "loss": 0.1163,
2410
+ "step": 167000
2411
+ },
2412
+ {
2413
+ "epoch": 7.582616568583069,
2414
+ "grad_norm": 179.25857543945312,
2415
+ "learning_rate": 2.5716845015073735e-06,
2416
+ "loss": 0.1207,
2417
+ "step": 167500
2418
+ },
2419
+ {
2420
+ "epoch": 7.605251244907198,
2421
+ "grad_norm": 0.09339158982038498,
2422
+ "learning_rate": 2.5476050586093643e-06,
2423
+ "loss": 0.1284,
2424
+ "step": 168000
2425
+ },
2426
+ {
2427
+ "epoch": 7.627885921231327,
2428
+ "grad_norm": 0.0970580130815506,
2429
+ "learning_rate": 2.523525615711355e-06,
2430
+ "loss": 0.1222,
2431
+ "step": 168500
2432
+ },
2433
+ {
2434
+ "epoch": 7.650520597555455,
2435
+ "grad_norm": 0.26078376173973083,
2436
+ "learning_rate": 2.499446172813346e-06,
2437
+ "loss": 0.1225,
2438
+ "step": 169000
2439
+ },
2440
+ {
2441
+ "epoch": 7.673155273879583,
2442
+ "grad_norm": 29.06781005859375,
2443
+ "learning_rate": 2.475366729915337e-06,
2444
+ "loss": 0.1238,
2445
+ "step": 169500
2446
+ },
2447
+ {
2448
+ "epoch": 7.695789950203712,
2449
+ "grad_norm": 4.162774085998535,
2450
+ "learning_rate": 2.4512872870173276e-06,
2451
+ "loss": 0.1323,
2452
+ "step": 170000
2453
+ },
2454
+ {
2455
+ "epoch": 7.718424626527841,
2456
+ "grad_norm": 0.9394495487213135,
2457
+ "learning_rate": 2.4272078441193185e-06,
2458
+ "loss": 0.1173,
2459
+ "step": 170500
2460
+ },
2461
+ {
2462
+ "epoch": 7.741059302851969,
2463
+ "grad_norm": 0.06645090132951736,
2464
+ "learning_rate": 2.4031284012213097e-06,
2465
+ "loss": 0.1275,
2466
+ "step": 171000
2467
+ },
2468
+ {
2469
+ "epoch": 7.7636939791760975,
2470
+ "grad_norm": 0.12068886309862137,
2471
+ "learning_rate": 2.3790489583233006e-06,
2472
+ "loss": 0.1103,
2473
+ "step": 171500
2474
+ },
2475
+ {
2476
+ "epoch": 7.786328655500227,
2477
+ "grad_norm": 0.18481621146202087,
2478
+ "learning_rate": 2.3549695154252914e-06,
2479
+ "loss": 0.1225,
2480
+ "step": 172000
2481
+ },
2482
+ {
2483
+ "epoch": 7.808963331824355,
2484
+ "grad_norm": 0.0315103605389595,
2485
+ "learning_rate": 2.3308900725272822e-06,
2486
+ "loss": 0.1061,
2487
+ "step": 172500
2488
+ },
2489
+ {
2490
+ "epoch": 7.831598008148483,
2491
+ "grad_norm": 2.336836814880371,
2492
+ "learning_rate": 2.306810629629273e-06,
2493
+ "loss": 0.1226,
2494
+ "step": 173000
2495
+ },
2496
+ {
2497
+ "epoch": 7.854232684472612,
2498
+ "grad_norm": 32.957130432128906,
2499
+ "learning_rate": 2.282731186731264e-06,
2500
+ "loss": 0.1228,
2501
+ "step": 173500
2502
+ },
2503
+ {
2504
+ "epoch": 7.876867360796741,
2505
+ "grad_norm": 0.14461065828800201,
2506
+ "learning_rate": 2.2586517438332547e-06,
2507
+ "loss": 0.1172,
2508
+ "step": 174000
2509
+ },
2510
+ {
2511
+ "epoch": 7.899502037120869,
2512
+ "grad_norm": 0.13647380471229553,
2513
+ "learning_rate": 2.2345723009352456e-06,
2514
+ "loss": 0.1242,
2515
+ "step": 174500
2516
+ },
2517
+ {
2518
+ "epoch": 7.922136713444997,
2519
+ "grad_norm": 0.14755909144878387,
2520
+ "learning_rate": 2.210492858037237e-06,
2521
+ "loss": 0.1167,
2522
+ "step": 175000
2523
+ },
2524
+ {
2525
+ "epoch": 7.944771389769127,
2526
+ "grad_norm": 0.16207629442214966,
2527
+ "learning_rate": 2.1864134151392277e-06,
2528
+ "loss": 0.127,
2529
+ "step": 175500
2530
+ },
2531
+ {
2532
+ "epoch": 7.967406066093255,
2533
+ "grad_norm": 15.389420509338379,
2534
+ "learning_rate": 2.1623339722412185e-06,
2535
+ "loss": 0.1163,
2536
+ "step": 176000
2537
+ },
2538
+ {
2539
+ "epoch": 7.990040742417383,
2540
+ "grad_norm": 202.52561950683594,
2541
+ "learning_rate": 2.1382545293432093e-06,
2542
+ "loss": 0.121,
2543
+ "step": 176500
2544
+ },
2545
+ {
2546
+ "epoch": 8.0,
2547
+ "eval_accuracy": 0.8648621119910367,
2548
+ "eval_loss": 0.8655109405517578,
2549
+ "eval_runtime": 26.0286,
2550
+ "eval_samples_per_second": 1508.765,
2551
+ "eval_steps_per_second": 94.319,
2552
+ "step": 176720
2553
+ },
2554
+ {
2555
+ "epoch": 8.012675418741512,
2556
+ "grad_norm": 1.5518616437911987,
2557
+ "learning_rate": 2.1141750864452e-06,
2558
+ "loss": 0.1058,
2559
+ "step": 177000
2560
+ },
2561
+ {
2562
+ "epoch": 8.03531009506564,
2563
+ "grad_norm": 0.015344664454460144,
2564
+ "learning_rate": 2.090095643547191e-06,
2565
+ "loss": 0.0809,
2566
+ "step": 177500
2567
+ },
2568
+ {
2569
+ "epoch": 8.057944771389769,
2570
+ "grad_norm": 100.70498657226562,
2571
+ "learning_rate": 2.066016200649182e-06,
2572
+ "loss": 0.094,
2573
+ "step": 178000
2574
+ },
2575
+ {
2576
+ "epoch": 8.080579447713898,
2577
+ "grad_norm": 0.47632962465286255,
2578
+ "learning_rate": 2.0419367577511727e-06,
2579
+ "loss": 0.1115,
2580
+ "step": 178500
2581
+ },
2582
+ {
2583
+ "epoch": 8.103214124038026,
2584
+ "grad_norm": 6.641209125518799,
2585
+ "learning_rate": 2.017857314853164e-06,
2586
+ "loss": 0.0856,
2587
+ "step": 179000
2588
+ },
2589
+ {
2590
+ "epoch": 8.125848800362155,
2591
+ "grad_norm": 27.82591438293457,
2592
+ "learning_rate": 1.9937778719551548e-06,
2593
+ "loss": 0.0983,
2594
+ "step": 179500
2595
+ },
2596
+ {
2597
+ "epoch": 8.148483476686284,
2598
+ "grad_norm": 314.1797790527344,
2599
+ "learning_rate": 1.9696984290571456e-06,
2600
+ "loss": 0.1075,
2601
+ "step": 180000
2602
+ },
2603
+ {
2604
+ "epoch": 8.171118153010411,
2605
+ "grad_norm": 254.616455078125,
2606
+ "learning_rate": 1.9456189861591364e-06,
2607
+ "loss": 0.1016,
2608
+ "step": 180500
2609
+ },
2610
+ {
2611
+ "epoch": 8.19375282933454,
2612
+ "grad_norm": 0.07589972764253616,
2613
+ "learning_rate": 1.9215395432611273e-06,
2614
+ "loss": 0.0924,
2615
+ "step": 181000
2616
+ },
2617
+ {
2618
+ "epoch": 8.21638750565867,
2619
+ "grad_norm": 77.42697143554688,
2620
+ "learning_rate": 1.8974601003631183e-06,
2621
+ "loss": 0.0965,
2622
+ "step": 181500
2623
+ },
2624
+ {
2625
+ "epoch": 8.239022181982797,
2626
+ "grad_norm": 0.12723857164382935,
2627
+ "learning_rate": 1.873380657465109e-06,
2628
+ "loss": 0.1024,
2629
+ "step": 182000
2630
+ },
2631
+ {
2632
+ "epoch": 8.261656858306926,
2633
+ "grad_norm": 7.569960594177246,
2634
+ "learning_rate": 1.8493012145671e-06,
2635
+ "loss": 0.1201,
2636
+ "step": 182500
2637
+ },
2638
+ {
2639
+ "epoch": 8.284291534631055,
2640
+ "grad_norm": 92.7170181274414,
2641
+ "learning_rate": 1.825221771669091e-06,
2642
+ "loss": 0.0976,
2643
+ "step": 183000
2644
+ },
2645
+ {
2646
+ "epoch": 8.306926210955183,
2647
+ "grad_norm": 0.009992193430662155,
2648
+ "learning_rate": 1.8011423287710816e-06,
2649
+ "loss": 0.1025,
2650
+ "step": 183500
2651
+ },
2652
+ {
2653
+ "epoch": 8.329560887279312,
2654
+ "grad_norm": 0.11167449504137039,
2655
+ "learning_rate": 1.7770628858730727e-06,
2656
+ "loss": 0.09,
2657
+ "step": 184000
2658
+ },
2659
+ {
2660
+ "epoch": 8.352195563603441,
2661
+ "grad_norm": 54.87889099121094,
2662
+ "learning_rate": 1.7529834429750633e-06,
2663
+ "loss": 0.0993,
2664
+ "step": 184500
2665
+ },
2666
+ {
2667
+ "epoch": 8.374830239927569,
2668
+ "grad_norm": 0.07053136825561523,
2669
+ "learning_rate": 1.7289040000770544e-06,
2670
+ "loss": 0.0987,
2671
+ "step": 185000
2672
+ },
2673
+ {
2674
+ "epoch": 8.397464916251698,
2675
+ "grad_norm": 4.904270648956299,
2676
+ "learning_rate": 1.7048245571790454e-06,
2677
+ "loss": 0.105,
2678
+ "step": 185500
2679
+ },
2680
+ {
2681
+ "epoch": 8.420099592575827,
2682
+ "grad_norm": 0.10817304253578186,
2683
+ "learning_rate": 1.680745114281036e-06,
2684
+ "loss": 0.0977,
2685
+ "step": 186000
2686
+ },
2687
+ {
2688
+ "epoch": 8.442734268899954,
2689
+ "grad_norm": 0.03593330830335617,
2690
+ "learning_rate": 1.656665671383027e-06,
2691
+ "loss": 0.0954,
2692
+ "step": 186500
2693
+ },
2694
+ {
2695
+ "epoch": 8.465368945224084,
2696
+ "grad_norm": 105.52520751953125,
2697
+ "learning_rate": 1.6325862284850181e-06,
2698
+ "loss": 0.1065,
2699
+ "step": 187000
2700
+ },
2701
+ {
2702
+ "epoch": 8.488003621548211,
2703
+ "grad_norm": 0.19925498962402344,
2704
+ "learning_rate": 1.6085067855870087e-06,
2705
+ "loss": 0.0983,
2706
+ "step": 187500
2707
+ },
2708
+ {
2709
+ "epoch": 8.51063829787234,
2710
+ "grad_norm": 0.29446855187416077,
2711
+ "learning_rate": 1.5844273426889998e-06,
2712
+ "loss": 0.1015,
2713
+ "step": 188000
2714
+ },
2715
+ {
2716
+ "epoch": 8.53327297419647,
2717
+ "grad_norm": 13.635686874389648,
2718
+ "learning_rate": 1.5603478997909904e-06,
2719
+ "loss": 0.0902,
2720
+ "step": 188500
2721
+ },
2722
+ {
2723
+ "epoch": 8.555907650520597,
2724
+ "grad_norm": 0.05709734186530113,
2725
+ "learning_rate": 1.5362684568929815e-06,
2726
+ "loss": 0.0972,
2727
+ "step": 189000
2728
+ },
2729
+ {
2730
+ "epoch": 8.578542326844726,
2731
+ "grad_norm": 0.023464586585760117,
2732
+ "learning_rate": 1.5121890139949725e-06,
2733
+ "loss": 0.1121,
2734
+ "step": 189500
2735
+ },
2736
+ {
2737
+ "epoch": 8.601177003168855,
2738
+ "grad_norm": 0.10528367012739182,
2739
+ "learning_rate": 1.4881095710969631e-06,
2740
+ "loss": 0.1001,
2741
+ "step": 190000
2742
+ },
2743
+ {
2744
+ "epoch": 8.623811679492983,
2745
+ "grad_norm": 0.4939417243003845,
2746
+ "learning_rate": 1.4640301281989542e-06,
2747
+ "loss": 0.098,
2748
+ "step": 190500
2749
+ },
2750
+ {
2751
+ "epoch": 8.646446355817112,
2752
+ "grad_norm": 12.716562271118164,
2753
+ "learning_rate": 1.4399506853009452e-06,
2754
+ "loss": 0.0945,
2755
+ "step": 191000
2756
+ },
2757
+ {
2758
+ "epoch": 8.669081032141241,
2759
+ "grad_norm": 145.86587524414062,
2760
+ "learning_rate": 1.4158712424029358e-06,
2761
+ "loss": 0.0888,
2762
+ "step": 191500
2763
+ },
2764
+ {
2765
+ "epoch": 8.691715708465368,
2766
+ "grad_norm": 0.03197444975376129,
2767
+ "learning_rate": 1.3917917995049269e-06,
2768
+ "loss": 0.0858,
2769
+ "step": 192000
2770
+ },
2771
+ {
2772
+ "epoch": 8.714350384789498,
2773
+ "grad_norm": 0.11782459169626236,
2774
+ "learning_rate": 1.3677123566069175e-06,
2775
+ "loss": 0.1027,
2776
+ "step": 192500
2777
+ },
2778
+ {
2779
+ "epoch": 8.736985061113627,
2780
+ "grad_norm": 0.1515623927116394,
2781
+ "learning_rate": 1.3436329137089086e-06,
2782
+ "loss": 0.1003,
2783
+ "step": 193000
2784
+ },
2785
+ {
2786
+ "epoch": 8.759619737437754,
2787
+ "grad_norm": 0.04349144920706749,
2788
+ "learning_rate": 1.3195534708108996e-06,
2789
+ "loss": 0.0926,
2790
+ "step": 193500
2791
+ },
2792
+ {
2793
+ "epoch": 8.782254413761883,
2794
+ "grad_norm": 14.532307624816895,
2795
+ "learning_rate": 1.2954740279128902e-06,
2796
+ "loss": 0.0935,
2797
+ "step": 194000
2798
+ },
2799
+ {
2800
+ "epoch": 8.80488909008601,
2801
+ "grad_norm": 0.05790287256240845,
2802
+ "learning_rate": 1.2713945850148813e-06,
2803
+ "loss": 0.0923,
2804
+ "step": 194500
2805
+ },
2806
+ {
2807
+ "epoch": 8.82752376641014,
2808
+ "grad_norm": 81.71268463134766,
2809
+ "learning_rate": 1.247315142116872e-06,
2810
+ "loss": 0.1172,
2811
+ "step": 195000
2812
+ },
2813
+ {
2814
+ "epoch": 8.85015844273427,
2815
+ "grad_norm": 0.03535538911819458,
2816
+ "learning_rate": 1.223235699218863e-06,
2817
+ "loss": 0.0985,
2818
+ "step": 195500
2819
+ },
2820
+ {
2821
+ "epoch": 8.872793119058397,
2822
+ "grad_norm": 0.05989941582083702,
2823
+ "learning_rate": 1.199156256320854e-06,
2824
+ "loss": 0.1033,
2825
+ "step": 196000
2826
+ },
2827
+ {
2828
+ "epoch": 8.895427795382526,
2829
+ "grad_norm": 0.18569760024547577,
2830
+ "learning_rate": 1.1750768134228448e-06,
2831
+ "loss": 0.1178,
2832
+ "step": 196500
2833
+ },
2834
+ {
2835
+ "epoch": 8.918062471706655,
2836
+ "grad_norm": 0.02892606146633625,
2837
+ "learning_rate": 1.1509973705248357e-06,
2838
+ "loss": 0.0964,
2839
+ "step": 197000
2840
+ },
2841
+ {
2842
+ "epoch": 8.940697148030782,
2843
+ "grad_norm": 181.0758819580078,
2844
+ "learning_rate": 1.1269179276268265e-06,
2845
+ "loss": 0.1056,
2846
+ "step": 197500
2847
+ },
2848
+ {
2849
+ "epoch": 8.963331824354912,
2850
+ "grad_norm": 28.286996841430664,
2851
+ "learning_rate": 1.1028384847288175e-06,
2852
+ "loss": 0.1055,
2853
+ "step": 198000
2854
+ },
2855
+ {
2856
+ "epoch": 8.98596650067904,
2857
+ "grad_norm": 0.08270686864852905,
2858
+ "learning_rate": 1.0787590418308084e-06,
2859
+ "loss": 0.1083,
2860
+ "step": 198500
2861
+ },
2862
+ {
2863
+ "epoch": 9.0,
2864
+ "eval_accuracy": 0.8651931450688803,
2865
+ "eval_loss": 0.9116848111152649,
2866
+ "eval_runtime": 25.9694,
2867
+ "eval_samples_per_second": 1512.2,
2868
+ "eval_steps_per_second": 94.534,
2869
+ "step": 198810
2870
+ },
2871
+ {
2872
+ "epoch": 9.008601177003168,
2873
+ "grad_norm": 0.3910556733608246,
2874
+ "learning_rate": 1.0546795989327992e-06,
2875
+ "loss": 0.0919,
2876
+ "step": 199000
2877
+ },
2878
+ {
2879
+ "epoch": 9.031235853327297,
2880
+ "grad_norm": 39.47013854980469,
2881
+ "learning_rate": 1.03060015603479e-06,
2882
+ "loss": 0.0828,
2883
+ "step": 199500
2884
+ },
2885
+ {
2886
+ "epoch": 9.053870529651427,
2887
+ "grad_norm": 0.02229388989508152,
2888
+ "learning_rate": 1.006520713136781e-06,
2889
+ "loss": 0.0814,
2890
+ "step": 200000
2891
+ },
2892
+ {
2893
+ "epoch": 9.076505205975554,
2894
+ "grad_norm": 0.028238942846655846,
2895
+ "learning_rate": 9.82441270238772e-07,
2896
+ "loss": 0.0835,
2897
+ "step": 200500
2898
+ },
2899
+ {
2900
+ "epoch": 9.099139882299683,
2901
+ "grad_norm": 10.808701515197754,
2902
+ "learning_rate": 9.583618273407628e-07,
2903
+ "loss": 0.0923,
2904
+ "step": 201000
2905
+ },
2906
+ {
2907
+ "epoch": 9.121774558623812,
2908
+ "grad_norm": 221.81275939941406,
2909
+ "learning_rate": 9.342823844427536e-07,
2910
+ "loss": 0.0665,
2911
+ "step": 201500
2912
+ },
2913
+ {
2914
+ "epoch": 9.14440923494794,
2915
+ "grad_norm": 0.1080513447523117,
2916
+ "learning_rate": 9.102029415447445e-07,
2917
+ "loss": 0.0876,
2918
+ "step": 202000
2919
+ },
2920
+ {
2921
+ "epoch": 9.167043911272069,
2922
+ "grad_norm": 232.07345581054688,
2923
+ "learning_rate": 8.861234986467354e-07,
2924
+ "loss": 0.0863,
2925
+ "step": 202500
2926
+ },
2927
+ {
2928
+ "epoch": 9.189678587596198,
2929
+ "grad_norm": 0.04613710194826126,
2930
+ "learning_rate": 8.620440557487263e-07,
2931
+ "loss": 0.0938,
2932
+ "step": 203000
2933
+ },
2934
+ {
2935
+ "epoch": 9.212313263920326,
2936
+ "grad_norm": 0.022649744525551796,
2937
+ "learning_rate": 8.379646128507171e-07,
2938
+ "loss": 0.0815,
2939
+ "step": 203500
2940
+ },
2941
+ {
2942
+ "epoch": 9.234947940244455,
2943
+ "grad_norm": 0.0704297199845314,
2944
+ "learning_rate": 8.138851699527081e-07,
2945
+ "loss": 0.0793,
2946
+ "step": 204000
2947
+ },
2948
+ {
2949
+ "epoch": 9.257582616568584,
2950
+ "grad_norm": 169.63650512695312,
2951
+ "learning_rate": 7.898057270546989e-07,
2952
+ "loss": 0.0757,
2953
+ "step": 204500
2954
+ },
2955
+ {
2956
+ "epoch": 9.280217292892711,
2957
+ "grad_norm": 145.97549438476562,
2958
+ "learning_rate": 7.657262841566899e-07,
2959
+ "loss": 0.0818,
2960
+ "step": 205000
2961
+ },
2962
+ {
2963
+ "epoch": 9.30285196921684,
2964
+ "grad_norm": 0.3320428431034088,
2965
+ "learning_rate": 7.416468412586807e-07,
2966
+ "loss": 0.0996,
2967
+ "step": 205500
2968
+ },
2969
+ {
2970
+ "epoch": 9.325486645540968,
2971
+ "grad_norm": 2.268958806991577,
2972
+ "learning_rate": 7.175673983606715e-07,
2973
+ "loss": 0.0835,
2974
+ "step": 206000
2975
+ },
2976
+ {
2977
+ "epoch": 9.348121321865097,
2978
+ "grad_norm": 0.038560718297958374,
2979
+ "learning_rate": 6.934879554626625e-07,
2980
+ "loss": 0.0822,
2981
+ "step": 206500
2982
+ },
2983
+ {
2984
+ "epoch": 9.370755998189226,
2985
+ "grad_norm": 92.30923461914062,
2986
+ "learning_rate": 6.694085125646534e-07,
2987
+ "loss": 0.0854,
2988
+ "step": 207000
2989
+ },
2990
+ {
2991
+ "epoch": 9.393390674513354,
2992
+ "grad_norm": 0.2284342646598816,
2993
+ "learning_rate": 6.453290696666442e-07,
2994
+ "loss": 0.0807,
2995
+ "step": 207500
2996
+ },
2997
+ {
2998
+ "epoch": 9.416025350837483,
2999
+ "grad_norm": 0.8369685411453247,
3000
+ "learning_rate": 6.212496267686352e-07,
3001
+ "loss": 0.0744,
3002
+ "step": 208000
3003
+ },
3004
+ {
3005
+ "epoch": 9.438660027161612,
3006
+ "grad_norm": 0.49937498569488525,
3007
+ "learning_rate": 5.97170183870626e-07,
3008
+ "loss": 0.0942,
3009
+ "step": 208500
3010
+ },
3011
+ {
3012
+ "epoch": 9.46129470348574,
3013
+ "grad_norm": 0.020057352259755135,
3014
+ "learning_rate": 5.73090740972617e-07,
3015
+ "loss": 0.0748,
3016
+ "step": 209000
3017
+ },
3018
+ {
3019
+ "epoch": 9.483929379809869,
3020
+ "grad_norm": 196.6532745361328,
3021
+ "learning_rate": 5.490112980746078e-07,
3022
+ "loss": 0.0964,
3023
+ "step": 209500
3024
+ },
3025
+ {
3026
+ "epoch": 9.506564056133998,
3027
+ "grad_norm": 0.5909414887428284,
3028
+ "learning_rate": 5.249318551765987e-07,
3029
+ "loss": 0.0759,
3030
+ "step": 210000
3031
+ },
3032
+ {
3033
+ "epoch": 9.529198732458125,
3034
+ "grad_norm": 135.35101318359375,
3035
+ "learning_rate": 5.008524122785896e-07,
3036
+ "loss": 0.0804,
3037
+ "step": 210500
3038
+ },
3039
+ {
3040
+ "epoch": 9.551833408782255,
3041
+ "grad_norm": 0.0622185617685318,
3042
+ "learning_rate": 4.7677296938058045e-07,
3043
+ "loss": 0.0709,
3044
+ "step": 211000
3045
+ },
3046
+ {
3047
+ "epoch": 9.574468085106384,
3048
+ "grad_norm": 0.05341747775673866,
3049
+ "learning_rate": 4.526935264825713e-07,
3050
+ "loss": 0.0863,
3051
+ "step": 211500
3052
+ },
3053
+ {
3054
+ "epoch": 9.597102761430511,
3055
+ "grad_norm": 55.312278747558594,
3056
+ "learning_rate": 4.286140835845622e-07,
3057
+ "loss": 0.0782,
3058
+ "step": 212000
3059
+ },
3060
+ {
3061
+ "epoch": 9.61973743775464,
3062
+ "grad_norm": 0.06182483211159706,
3063
+ "learning_rate": 4.0453464068655306e-07,
3064
+ "loss": 0.0881,
3065
+ "step": 212500
3066
+ },
3067
+ {
3068
+ "epoch": 9.64237211407877,
3069
+ "grad_norm": 0.06101556122303009,
3070
+ "learning_rate": 3.80455197788544e-07,
3071
+ "loss": 0.0834,
3072
+ "step": 213000
3073
+ },
3074
+ {
3075
+ "epoch": 9.665006790402897,
3076
+ "grad_norm": 0.35538122057914734,
3077
+ "learning_rate": 3.5637575489053483e-07,
3078
+ "loss": 0.0776,
3079
+ "step": 213500
3080
+ },
3081
+ {
3082
+ "epoch": 9.687641466727026,
3083
+ "grad_norm": 0.08858389407396317,
3084
+ "learning_rate": 3.322963119925258e-07,
3085
+ "loss": 0.0832,
3086
+ "step": 214000
3087
+ },
3088
+ {
3089
+ "epoch": 9.710276143051153,
3090
+ "grad_norm": 0.07368449866771698,
3091
+ "learning_rate": 3.0821686909451666e-07,
3092
+ "loss": 0.0921,
3093
+ "step": 214500
3094
+ },
3095
+ {
3096
+ "epoch": 9.732910819375283,
3097
+ "grad_norm": 0.4127441644668579,
3098
+ "learning_rate": 2.8413742619650755e-07,
3099
+ "loss": 0.0846,
3100
+ "step": 215000
3101
+ },
3102
+ {
3103
+ "epoch": 9.755545495699412,
3104
+ "grad_norm": 0.06469714641571045,
3105
+ "learning_rate": 2.6005798329849844e-07,
3106
+ "loss": 0.0928,
3107
+ "step": 215500
3108
+ },
3109
+ {
3110
+ "epoch": 9.77818017202354,
3111
+ "grad_norm": 0.28226494789123535,
3112
+ "learning_rate": 2.3597854040048932e-07,
3113
+ "loss": 0.0796,
3114
+ "step": 216000
3115
+ },
3116
+ {
3117
+ "epoch": 9.800814848347668,
3118
+ "grad_norm": 250.34405517578125,
3119
+ "learning_rate": 2.118990975024802e-07,
3120
+ "loss": 0.0859,
3121
+ "step": 216500
3122
+ },
3123
+ {
3124
+ "epoch": 9.823449524671798,
3125
+ "grad_norm": 22.07843589782715,
3126
+ "learning_rate": 1.878196546044711e-07,
3127
+ "loss": 0.0846,
3128
+ "step": 217000
3129
+ },
3130
+ {
3131
+ "epoch": 9.846084200995925,
3132
+ "grad_norm": 14.63901138305664,
3133
+ "learning_rate": 1.6374021170646199e-07,
3134
+ "loss": 0.0788,
3135
+ "step": 217500
3136
+ },
3137
+ {
3138
+ "epoch": 9.868718877320054,
3139
+ "grad_norm": 80.16069793701172,
3140
+ "learning_rate": 1.3966076880845285e-07,
3141
+ "loss": 0.095,
3142
+ "step": 218000
3143
+ },
3144
+ {
3145
+ "epoch": 9.891353553644183,
3146
+ "grad_norm": 0.07123162597417831,
3147
+ "learning_rate": 1.1558132591044375e-07,
3148
+ "loss": 0.0935,
3149
+ "step": 218500
3150
+ },
3151
+ {
3152
+ "epoch": 9.91398822996831,
3153
+ "grad_norm": 17.373336791992188,
3154
+ "learning_rate": 9.150188301243464e-08,
3155
+ "loss": 0.0839,
3156
+ "step": 219000
3157
+ },
3158
+ {
3159
+ "epoch": 9.93662290629244,
3160
+ "grad_norm": 0.0160963274538517,
3161
+ "learning_rate": 6.742244011442552e-08,
3162
+ "loss": 0.0746,
3163
+ "step": 219500
3164
+ },
3165
+ {
3166
+ "epoch": 9.95925758261657,
3167
+ "grad_norm": 0.07339876890182495,
3168
+ "learning_rate": 4.3342997216416404e-08,
3169
+ "loss": 0.0755,
3170
+ "step": 220000
3171
+ },
3172
+ {
3173
+ "epoch": 9.981892258940697,
3174
+ "grad_norm": 0.06764261424541473,
3175
+ "learning_rate": 1.926355431840729e-08,
3176
+ "loss": 0.0835,
3177
+ "step": 220500
3178
+ },
3179
+ {
3180
+ "epoch": 10.0,
3181
+ "eval_accuracy": 0.8659570675562119,
3182
+ "eval_loss": 0.9559618830680847,
3183
+ "eval_runtime": 26.0025,
3184
+ "eval_samples_per_second": 1510.279,
3185
+ "eval_steps_per_second": 94.414,
3186
+ "step": 220900
3187
+ },
3188
+ {
3189
+ "epoch": 10.0,
3190
+ "step": 220900,
3191
+ "total_flos": 1.4387669402147813e+17,
3192
+ "train_loss": 0.2225576283846467,
3193
+ "train_runtime": 11995.4266,
3194
+ "train_samples_per_second": 294.638,
3195
+ "train_steps_per_second": 18.415
3196
+ }
3197
+ ],
3198
+ "logging_steps": 500,
3199
+ "max_steps": 220900,
3200
+ "num_input_tokens_seen": 0,
3201
+ "num_train_epochs": 10,
3202
+ "save_steps": 500,
3203
+ "stateful_callbacks": {
3204
+ "TrainerControl": {
3205
+ "args": {
3206
+ "should_epoch_stop": false,
3207
+ "should_evaluate": false,
3208
+ "should_log": false,
3209
+ "should_save": true,
3210
+ "should_training_stop": true
3211
+ },
3212
+ "attributes": {}
3213
+ }
3214
+ },
3215
+ "total_flos": 1.4387669402147813e+17,
3216
+ "train_batch_size": 16,
3217
+ "trial_name": null,
3218
+ "trial_params": null
3219
+ }
mnli/roberta-base_lr1e-05/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abdd2a615dd69580df71dd30f22047bafd974f7823356358b69b681944926bee
3
+ size 5240
mnli/roberta-base_lr1e-05/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
mnli/roberta-large_lr1e-05/classifier_head.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f3ddd1778ad8ee06716219d88efb6bb5ea437e85db0a59496999b238cafc38a
3
+ size 4214096
mnli/roberta-large_lr1e-05/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/projects/shikexuan/nlu_model/roberta-large",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 1024,
13
+ "id2label": {
14
+ "0": "LABEL_0",
15
+ "1": "LABEL_1",
16
+ "2": "LABEL_2"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 4096,
20
+ "label2id": {
21
+ "LABEL_0": 0,
22
+ "LABEL_1": 1,
23
+ "LABEL_2": 2
24
+ },
25
+ "layer_norm_eps": 1e-05,
26
+ "max_position_embeddings": 514,
27
+ "model_type": "roberta",
28
+ "num_attention_heads": 16,
29
+ "num_hidden_layers": 24,
30
+ "pad_token_id": 1,
31
+ "position_embedding_type": "absolute",
32
+ "torch_dtype": "float32",
33
+ "transformers_version": "4.44.1",
34
+ "type_vocab_size": 1,
35
+ "use_cache": true,
36
+ "vocab_size": 50265
37
+ }
mnli/roberta-large_lr1e-05/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
mnli/roberta-large_lr1e-05/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0894795ad4cd58b0610ea0779a1e4fb1548cd7f9027bf37b5f7b4f91a9bd76f
3
+ size 1421499516