Kendamarron commited on
Commit
1fd40ad
1 Parent(s): 1103bb1

Upload folder using huggingface_hub

Browse files
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<ent2>": 32771,
3
+ "<ent>": 32770
4
+ }
config.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "pkshatech/GLuCoSE-base-ja",
3
+ "architectures": [
4
+ "LukeForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bert_model_name": "models/luke-japanese/hf_xlm_roberta",
8
+ "bos_token_id": 0,
9
+ "classifier_dropout": 0.0,
10
+ "cls_entity_prediction": false,
11
+ "entity_emb_size": 256,
12
+ "entity_vocab_size": 4,
13
+ "eos_token_id": 2,
14
+ "hidden_act": "gelu",
15
+ "hidden_dropout_prob": 0.0,
16
+ "hidden_size": 768,
17
+ "id2label": {
18
+ "0": "LABEL_0",
19
+ "1": "LABEL_1",
20
+ "2": "LABEL_2",
21
+ "3": "LABEL_3",
22
+ "4": "LABEL_4",
23
+ "5": "LABEL_5"
24
+ },
25
+ "initializer_range": 0.02,
26
+ "intermediate_size": 3072,
27
+ "label2id": {
28
+ "LABEL_0": 0,
29
+ "LABEL_1": 1,
30
+ "LABEL_2": 2,
31
+ "LABEL_3": 3,
32
+ "LABEL_4": 4,
33
+ "LABEL_5": 5
34
+ },
35
+ "layer_norm_eps": 1e-05,
36
+ "max_position_embeddings": 514,
37
+ "model_type": "luke",
38
+ "num_attention_heads": 12,
39
+ "num_hidden_layers": 12,
40
+ "pad_token_id": 1,
41
+ "position_embedding_type": "absolute",
42
+ "problem_type": "single_label_classification",
43
+ "torch_dtype": "float32",
44
+ "transformers_version": "4.41.2",
45
+ "type_vocab_size": 1,
46
+ "use_cache": true,
47
+ "use_entity_aware_attention": true,
48
+ "vocab_size": 32772
49
+ }
entity_vocab.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "[MASK2]": 3,
3
+ "[MASK]": 0,
4
+ "[PAD]": 2,
5
+ "[UNK]": 1
6
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b6e9b71ee20dcc0ec1db6159545531c1ad3f38a7e00d5dbd981229b1a5cbc25
3
+ size 532319592
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a72060e26d7823056951f42a70e767ec5feda794ef5234ae8eb5a554f2803e50
3
+ size 889893306
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa5dea273dfc73f9ac24831489f71d14f48a6bfc83548f75aa510b4bfdea1bd2
3
+ size 14180
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:637320bd09bc2c277af9b239bbf16d21d18021f93eda798db5a6718d046b6b91
3
+ size 1064
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8b73a5e054936c920cf5b7d1ec21ce9c281977078269963beb821c6c86fbff7
3
+ size 841889
special_tokens_map.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<ent>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<ent2>",
12
+ "lstrip": false,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<ent>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "<ent2>",
26
+ "lstrip": false,
27
+ "normalized": true,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "<ent>",
33
+ "lstrip": false,
34
+ "normalized": true,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ {
39
+ "content": "<ent2>",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ {
46
+ "content": "<ent>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false
51
+ },
52
+ {
53
+ "content": "<ent2>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false
58
+ },
59
+ {
60
+ "content": "<ent>",
61
+ "lstrip": false,
62
+ "normalized": true,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ },
66
+ {
67
+ "content": "<ent2>",
68
+ "lstrip": false,
69
+ "normalized": true,
70
+ "rstrip": false,
71
+ "single_word": false
72
+ }
73
+ ],
74
+ "bos_token": "<s>",
75
+ "cls_token": "<s>",
76
+ "eos_token": "</s>",
77
+ "mask_token": {
78
+ "content": "<mask>",
79
+ "lstrip": true,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false
83
+ },
84
+ "pad_token": "<pad>",
85
+ "sep_token": "</s>",
86
+ "unk_token": "<unk>"
87
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "32769": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "32770": {
44
+ "content": "<ent>",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "32771": {
52
+ "content": "<ent2>",
53
+ "lstrip": false,
54
+ "normalized": true,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ }
59
+ },
60
+ "additional_special_tokens": [
61
+ "<ent>",
62
+ "<ent2>",
63
+ "<ent>",
64
+ "<ent2>",
65
+ "<ent>",
66
+ "<ent2>",
67
+ "<ent>",
68
+ "<ent2>",
69
+ "<ent>",
70
+ "<ent2>"
71
+ ],
72
+ "bos_token": "<s>",
73
+ "clean_up_tokenization_spaces": true,
74
+ "cls_token": "<s>",
75
+ "entity_mask2_token": "[MASK2]",
76
+ "entity_mask_token": "[MASK]",
77
+ "entity_pad_token": "[PAD]",
78
+ "entity_token_1": {
79
+ "__type": "AddedToken",
80
+ "content": "<ent>",
81
+ "lstrip": false,
82
+ "normalized": true,
83
+ "rstrip": false,
84
+ "single_word": false,
85
+ "special": false
86
+ },
87
+ "entity_token_2": {
88
+ "__type": "AddedToken",
89
+ "content": "<ent2>",
90
+ "lstrip": false,
91
+ "normalized": true,
92
+ "rstrip": false,
93
+ "single_word": false,
94
+ "special": false
95
+ },
96
+ "entity_unk_token": "[UNK]",
97
+ "eos_token": "</s>",
98
+ "mask_token": "<mask>",
99
+ "max_entity_length": 32,
100
+ "max_mention_length": 30,
101
+ "model_max_length": 512,
102
+ "pad_token": "<pad>",
103
+ "sep_token": "</s>",
104
+ "sp_model_kwargs": {},
105
+ "task": null,
106
+ "tokenizer_class": "MLukeTokenizer",
107
+ "unk_token": "<unk>"
108
+ }
trainer_state.json ADDED
@@ -0,0 +1,3264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.3032359897985208,
3
+ "best_model_checkpoint": "outputs/checkpoint-400",
4
+ "epoch": 4.25531914893617,
5
+ "eval_steps": 50,
6
+ "global_step": 450,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009456264775413711,
13
+ "grad_norm": 6.37161922454834,
14
+ "learning_rate": 9.995238095238095e-06,
15
+ "loss": 1.7524,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.018912529550827423,
20
+ "grad_norm": 4.851662635803223,
21
+ "learning_rate": 9.990476190476191e-06,
22
+ "loss": 1.6245,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.028368794326241134,
27
+ "grad_norm": 4.380736827850342,
28
+ "learning_rate": 9.985714285714286e-06,
29
+ "loss": 1.4964,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.037825059101654845,
34
+ "grad_norm": 3.163944959640503,
35
+ "learning_rate": 9.980952380952382e-06,
36
+ "loss": 1.4517,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.04728132387706856,
41
+ "grad_norm": 2.9784162044525146,
42
+ "learning_rate": 9.976190476190477e-06,
43
+ "loss": 1.3719,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.05673758865248227,
48
+ "grad_norm": 2.3548669815063477,
49
+ "learning_rate": 9.971428571428571e-06,
50
+ "loss": 1.3009,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.06619385342789598,
55
+ "grad_norm": 2.4280319213867188,
56
+ "learning_rate": 9.966666666666667e-06,
57
+ "loss": 1.2607,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.07565011820330969,
62
+ "grad_norm": 1.8799192905426025,
63
+ "learning_rate": 9.961904761904763e-06,
64
+ "loss": 1.2404,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.0851063829787234,
69
+ "grad_norm": 1.855432152748108,
70
+ "learning_rate": 9.957142857142858e-06,
71
+ "loss": 1.2647,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.09456264775413711,
76
+ "grad_norm": 2.1619343757629395,
77
+ "learning_rate": 9.952380952380954e-06,
78
+ "loss": 1.2139,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.10401891252955082,
83
+ "grad_norm": 1.4054124355316162,
84
+ "learning_rate": 9.947619047619049e-06,
85
+ "loss": 1.2353,
86
+ "step": 11
87
+ },
88
+ {
89
+ "epoch": 0.11347517730496454,
90
+ "grad_norm": 1.2696382999420166,
91
+ "learning_rate": 9.942857142857145e-06,
92
+ "loss": 1.2791,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 0.12293144208037825,
97
+ "grad_norm": 1.0346633195877075,
98
+ "learning_rate": 9.93809523809524e-06,
99
+ "loss": 1.2959,
100
+ "step": 13
101
+ },
102
+ {
103
+ "epoch": 0.13238770685579196,
104
+ "grad_norm": 0.8732487559318542,
105
+ "learning_rate": 9.933333333333334e-06,
106
+ "loss": 1.223,
107
+ "step": 14
108
+ },
109
+ {
110
+ "epoch": 0.14184397163120568,
111
+ "grad_norm": 0.7110472917556763,
112
+ "learning_rate": 9.92857142857143e-06,
113
+ "loss": 1.2079,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.15130023640661938,
118
+ "grad_norm": 1.2797213792800903,
119
+ "learning_rate": 9.923809523809524e-06,
120
+ "loss": 1.1188,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.1607565011820331,
125
+ "grad_norm": 0.8014526963233948,
126
+ "learning_rate": 9.91904761904762e-06,
127
+ "loss": 1.1624,
128
+ "step": 17
129
+ },
130
+ {
131
+ "epoch": 0.1702127659574468,
132
+ "grad_norm": 2.1425223350524902,
133
+ "learning_rate": 9.914285714285715e-06,
134
+ "loss": 1.1996,
135
+ "step": 18
136
+ },
137
+ {
138
+ "epoch": 0.17966903073286053,
139
+ "grad_norm": 2.180694818496704,
140
+ "learning_rate": 9.90952380952381e-06,
141
+ "loss": 1.094,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 0.18912529550827423,
146
+ "grad_norm": 1.3586962223052979,
147
+ "learning_rate": 9.904761904761906e-06,
148
+ "loss": 1.1439,
149
+ "step": 20
150
+ },
151
+ {
152
+ "epoch": 0.19858156028368795,
153
+ "grad_norm": 1.3756808042526245,
154
+ "learning_rate": 9.9e-06,
155
+ "loss": 1.1987,
156
+ "step": 21
157
+ },
158
+ {
159
+ "epoch": 0.20803782505910165,
160
+ "grad_norm": 1.4752682447433472,
161
+ "learning_rate": 9.895238095238096e-06,
162
+ "loss": 1.2424,
163
+ "step": 22
164
+ },
165
+ {
166
+ "epoch": 0.21749408983451538,
167
+ "grad_norm": 1.1806340217590332,
168
+ "learning_rate": 9.89047619047619e-06,
169
+ "loss": 1.1481,
170
+ "step": 23
171
+ },
172
+ {
173
+ "epoch": 0.22695035460992907,
174
+ "grad_norm": 1.5014915466308594,
175
+ "learning_rate": 9.885714285714287e-06,
176
+ "loss": 1.1306,
177
+ "step": 24
178
+ },
179
+ {
180
+ "epoch": 0.2364066193853428,
181
+ "grad_norm": 2.27945613861084,
182
+ "learning_rate": 9.880952380952381e-06,
183
+ "loss": 1.2185,
184
+ "step": 25
185
+ },
186
+ {
187
+ "epoch": 0.2458628841607565,
188
+ "grad_norm": 2.4887917041778564,
189
+ "learning_rate": 9.876190476190478e-06,
190
+ "loss": 1.1919,
191
+ "step": 26
192
+ },
193
+ {
194
+ "epoch": 0.2553191489361702,
195
+ "grad_norm": 1.334818959236145,
196
+ "learning_rate": 9.871428571428572e-06,
197
+ "loss": 1.1375,
198
+ "step": 27
199
+ },
200
+ {
201
+ "epoch": 0.2647754137115839,
202
+ "grad_norm": 2.0502264499664307,
203
+ "learning_rate": 9.866666666666668e-06,
204
+ "loss": 1.167,
205
+ "step": 28
206
+ },
207
+ {
208
+ "epoch": 0.27423167848699764,
209
+ "grad_norm": 2.03678822517395,
210
+ "learning_rate": 9.861904761904763e-06,
211
+ "loss": 1.0997,
212
+ "step": 29
213
+ },
214
+ {
215
+ "epoch": 0.28368794326241137,
216
+ "grad_norm": 1.9366700649261475,
217
+ "learning_rate": 9.857142857142859e-06,
218
+ "loss": 1.0612,
219
+ "step": 30
220
+ },
221
+ {
222
+ "epoch": 0.29314420803782504,
223
+ "grad_norm": 1.8982603549957275,
224
+ "learning_rate": 9.852380952380953e-06,
225
+ "loss": 1.0902,
226
+ "step": 31
227
+ },
228
+ {
229
+ "epoch": 0.30260047281323876,
230
+ "grad_norm": 1.7486457824707031,
231
+ "learning_rate": 9.847619047619048e-06,
232
+ "loss": 1.149,
233
+ "step": 32
234
+ },
235
+ {
236
+ "epoch": 0.3120567375886525,
237
+ "grad_norm": 1.8188557624816895,
238
+ "learning_rate": 9.842857142857144e-06,
239
+ "loss": 1.0975,
240
+ "step": 33
241
+ },
242
+ {
243
+ "epoch": 0.3215130023640662,
244
+ "grad_norm": 3.800814151763916,
245
+ "learning_rate": 9.838095238095238e-06,
246
+ "loss": 1.1611,
247
+ "step": 34
248
+ },
249
+ {
250
+ "epoch": 0.3309692671394799,
251
+ "grad_norm": 2.2901618480682373,
252
+ "learning_rate": 9.833333333333333e-06,
253
+ "loss": 1.0969,
254
+ "step": 35
255
+ },
256
+ {
257
+ "epoch": 0.3404255319148936,
258
+ "grad_norm": 2.420259475708008,
259
+ "learning_rate": 9.828571428571429e-06,
260
+ "loss": 1.0287,
261
+ "step": 36
262
+ },
263
+ {
264
+ "epoch": 0.34988179669030733,
265
+ "grad_norm": 2.751361846923828,
266
+ "learning_rate": 9.823809523809524e-06,
267
+ "loss": 1.0103,
268
+ "step": 37
269
+ },
270
+ {
271
+ "epoch": 0.35933806146572106,
272
+ "grad_norm": 2.2316462993621826,
273
+ "learning_rate": 9.81904761904762e-06,
274
+ "loss": 1.0957,
275
+ "step": 38
276
+ },
277
+ {
278
+ "epoch": 0.36879432624113473,
279
+ "grad_norm": 0.9498681426048279,
280
+ "learning_rate": 9.814285714285716e-06,
281
+ "loss": 1.0657,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 0.37825059101654845,
286
+ "grad_norm": 1.658214807510376,
287
+ "learning_rate": 9.80952380952381e-06,
288
+ "loss": 1.1366,
289
+ "step": 40
290
+ },
291
+ {
292
+ "epoch": 0.3877068557919622,
293
+ "grad_norm": 5.218255043029785,
294
+ "learning_rate": 9.804761904761907e-06,
295
+ "loss": 1.1998,
296
+ "step": 41
297
+ },
298
+ {
299
+ "epoch": 0.3971631205673759,
300
+ "grad_norm": 3.6433420181274414,
301
+ "learning_rate": 9.800000000000001e-06,
302
+ "loss": 1.165,
303
+ "step": 42
304
+ },
305
+ {
306
+ "epoch": 0.4066193853427896,
307
+ "grad_norm": 2.1015524864196777,
308
+ "learning_rate": 9.795238095238097e-06,
309
+ "loss": 1.0357,
310
+ "step": 43
311
+ },
312
+ {
313
+ "epoch": 0.4160756501182033,
314
+ "grad_norm": 1.6407890319824219,
315
+ "learning_rate": 9.790476190476192e-06,
316
+ "loss": 1.1007,
317
+ "step": 44
318
+ },
319
+ {
320
+ "epoch": 0.425531914893617,
321
+ "grad_norm": 1.929078459739685,
322
+ "learning_rate": 9.785714285714286e-06,
323
+ "loss": 0.9683,
324
+ "step": 45
325
+ },
326
+ {
327
+ "epoch": 0.43498817966903075,
328
+ "grad_norm": 1.3185120820999146,
329
+ "learning_rate": 9.780952380952382e-06,
330
+ "loss": 1.0848,
331
+ "step": 46
332
+ },
333
+ {
334
+ "epoch": 0.4444444444444444,
335
+ "grad_norm": 3.1991031169891357,
336
+ "learning_rate": 9.776190476190477e-06,
337
+ "loss": 1.1025,
338
+ "step": 47
339
+ },
340
+ {
341
+ "epoch": 0.45390070921985815,
342
+ "grad_norm": 4.7339863777160645,
343
+ "learning_rate": 9.771428571428571e-06,
344
+ "loss": 1.0889,
345
+ "step": 48
346
+ },
347
+ {
348
+ "epoch": 0.46335697399527187,
349
+ "grad_norm": 2.6043481826782227,
350
+ "learning_rate": 9.766666666666667e-06,
351
+ "loss": 1.1063,
352
+ "step": 49
353
+ },
354
+ {
355
+ "epoch": 0.4728132387706856,
356
+ "grad_norm": 3.4060988426208496,
357
+ "learning_rate": 9.761904761904762e-06,
358
+ "loss": 1.0141,
359
+ "step": 50
360
+ },
361
+ {
362
+ "epoch": 0.4728132387706856,
363
+ "eval_f1_macro": 0.21460474338825342,
364
+ "eval_loss": 1.0437264442443848,
365
+ "eval_runtime": 14.0189,
366
+ "eval_samples_per_second": 120.623,
367
+ "eval_steps_per_second": 7.561,
368
+ "step": 50
369
+ },
370
+ {
371
+ "epoch": 0.48226950354609927,
372
+ "grad_norm": 1.8903930187225342,
373
+ "learning_rate": 9.757142857142858e-06,
374
+ "loss": 1.0172,
375
+ "step": 51
376
+ },
377
+ {
378
+ "epoch": 0.491725768321513,
379
+ "grad_norm": 1.3195421695709229,
380
+ "learning_rate": 9.752380952380953e-06,
381
+ "loss": 1.0802,
382
+ "step": 52
383
+ },
384
+ {
385
+ "epoch": 0.5011820330969267,
386
+ "grad_norm": 2.4457075595855713,
387
+ "learning_rate": 9.747619047619049e-06,
388
+ "loss": 1.1387,
389
+ "step": 53
390
+ },
391
+ {
392
+ "epoch": 0.5106382978723404,
393
+ "grad_norm": 1.3236303329467773,
394
+ "learning_rate": 9.742857142857143e-06,
395
+ "loss": 0.9655,
396
+ "step": 54
397
+ },
398
+ {
399
+ "epoch": 0.5200945626477541,
400
+ "grad_norm": 2.286327600479126,
401
+ "learning_rate": 9.73809523809524e-06,
402
+ "loss": 1.0168,
403
+ "step": 55
404
+ },
405
+ {
406
+ "epoch": 0.5295508274231678,
407
+ "grad_norm": 1.6412752866744995,
408
+ "learning_rate": 9.733333333333334e-06,
409
+ "loss": 1.052,
410
+ "step": 56
411
+ },
412
+ {
413
+ "epoch": 0.5390070921985816,
414
+ "grad_norm": 2.345207452774048,
415
+ "learning_rate": 9.72857142857143e-06,
416
+ "loss": 1.0871,
417
+ "step": 57
418
+ },
419
+ {
420
+ "epoch": 0.5484633569739953,
421
+ "grad_norm": 1.5813645124435425,
422
+ "learning_rate": 9.723809523809525e-06,
423
+ "loss": 1.1483,
424
+ "step": 58
425
+ },
426
+ {
427
+ "epoch": 0.557919621749409,
428
+ "grad_norm": 2.1535708904266357,
429
+ "learning_rate": 9.71904761904762e-06,
430
+ "loss": 1.014,
431
+ "step": 59
432
+ },
433
+ {
434
+ "epoch": 0.5673758865248227,
435
+ "grad_norm": 1.5568801164627075,
436
+ "learning_rate": 9.714285714285715e-06,
437
+ "loss": 0.9503,
438
+ "step": 60
439
+ },
440
+ {
441
+ "epoch": 0.5768321513002365,
442
+ "grad_norm": 2.258251428604126,
443
+ "learning_rate": 9.70952380952381e-06,
444
+ "loss": 0.8972,
445
+ "step": 61
446
+ },
447
+ {
448
+ "epoch": 0.5862884160756501,
449
+ "grad_norm": 1.8242671489715576,
450
+ "learning_rate": 9.704761904761906e-06,
451
+ "loss": 1.0092,
452
+ "step": 62
453
+ },
454
+ {
455
+ "epoch": 0.5957446808510638,
456
+ "grad_norm": 4.071340084075928,
457
+ "learning_rate": 9.7e-06,
458
+ "loss": 1.1998,
459
+ "step": 63
460
+ },
461
+ {
462
+ "epoch": 0.6052009456264775,
463
+ "grad_norm": 2.4138002395629883,
464
+ "learning_rate": 9.695238095238096e-06,
465
+ "loss": 1.1186,
466
+ "step": 64
467
+ },
468
+ {
469
+ "epoch": 0.6146572104018913,
470
+ "grad_norm": 2.901859760284424,
471
+ "learning_rate": 9.690476190476191e-06,
472
+ "loss": 1.104,
473
+ "step": 65
474
+ },
475
+ {
476
+ "epoch": 0.624113475177305,
477
+ "grad_norm": 1.9879859685897827,
478
+ "learning_rate": 9.685714285714285e-06,
479
+ "loss": 1.022,
480
+ "step": 66
481
+ },
482
+ {
483
+ "epoch": 0.6335697399527187,
484
+ "grad_norm": 1.620570182800293,
485
+ "learning_rate": 9.680952380952382e-06,
486
+ "loss": 1.109,
487
+ "step": 67
488
+ },
489
+ {
490
+ "epoch": 0.6430260047281324,
491
+ "grad_norm": 2.608358144760132,
492
+ "learning_rate": 9.676190476190476e-06,
493
+ "loss": 1.1036,
494
+ "step": 68
495
+ },
496
+ {
497
+ "epoch": 0.6524822695035462,
498
+ "grad_norm": 1.9834874868392944,
499
+ "learning_rate": 9.671428571428572e-06,
500
+ "loss": 0.9812,
501
+ "step": 69
502
+ },
503
+ {
504
+ "epoch": 0.6619385342789598,
505
+ "grad_norm": 1.2722113132476807,
506
+ "learning_rate": 9.666666666666667e-06,
507
+ "loss": 1.044,
508
+ "step": 70
509
+ },
510
+ {
511
+ "epoch": 0.6713947990543735,
512
+ "grad_norm": 4.8911333084106445,
513
+ "learning_rate": 9.661904761904763e-06,
514
+ "loss": 0.9368,
515
+ "step": 71
516
+ },
517
+ {
518
+ "epoch": 0.6808510638297872,
519
+ "grad_norm": 5.012025356292725,
520
+ "learning_rate": 9.657142857142859e-06,
521
+ "loss": 1.0675,
522
+ "step": 72
523
+ },
524
+ {
525
+ "epoch": 0.6903073286052009,
526
+ "grad_norm": 5.291039943695068,
527
+ "learning_rate": 9.652380952380954e-06,
528
+ "loss": 1.128,
529
+ "step": 73
530
+ },
531
+ {
532
+ "epoch": 0.6997635933806147,
533
+ "grad_norm": 4.000207901000977,
534
+ "learning_rate": 9.647619047619048e-06,
535
+ "loss": 1.0296,
536
+ "step": 74
537
+ },
538
+ {
539
+ "epoch": 0.7092198581560284,
540
+ "grad_norm": 2.474881410598755,
541
+ "learning_rate": 9.642857142857144e-06,
542
+ "loss": 0.9613,
543
+ "step": 75
544
+ },
545
+ {
546
+ "epoch": 0.7186761229314421,
547
+ "grad_norm": 2.7820417881011963,
548
+ "learning_rate": 9.638095238095239e-06,
549
+ "loss": 1.0414,
550
+ "step": 76
551
+ },
552
+ {
553
+ "epoch": 0.7281323877068558,
554
+ "grad_norm": 3.6213011741638184,
555
+ "learning_rate": 9.633333333333335e-06,
556
+ "loss": 1.0961,
557
+ "step": 77
558
+ },
559
+ {
560
+ "epoch": 0.7375886524822695,
561
+ "grad_norm": 2.0980260372161865,
562
+ "learning_rate": 9.62857142857143e-06,
563
+ "loss": 0.9454,
564
+ "step": 78
565
+ },
566
+ {
567
+ "epoch": 0.7470449172576832,
568
+ "grad_norm": 1.899154782295227,
569
+ "learning_rate": 9.623809523809524e-06,
570
+ "loss": 0.9763,
571
+ "step": 79
572
+ },
573
+ {
574
+ "epoch": 0.7565011820330969,
575
+ "grad_norm": 2.9085607528686523,
576
+ "learning_rate": 9.61904761904762e-06,
577
+ "loss": 0.9829,
578
+ "step": 80
579
+ },
580
+ {
581
+ "epoch": 0.7659574468085106,
582
+ "grad_norm": 2.3891396522521973,
583
+ "learning_rate": 9.614285714285714e-06,
584
+ "loss": 1.0338,
585
+ "step": 81
586
+ },
587
+ {
588
+ "epoch": 0.7754137115839244,
589
+ "grad_norm": 2.1058549880981445,
590
+ "learning_rate": 9.60952380952381e-06,
591
+ "loss": 1.095,
592
+ "step": 82
593
+ },
594
+ {
595
+ "epoch": 0.7848699763593381,
596
+ "grad_norm": 2.2750802040100098,
597
+ "learning_rate": 9.604761904761905e-06,
598
+ "loss": 0.8883,
599
+ "step": 83
600
+ },
601
+ {
602
+ "epoch": 0.7943262411347518,
603
+ "grad_norm": 2.928985834121704,
604
+ "learning_rate": 9.600000000000001e-06,
605
+ "loss": 1.0404,
606
+ "step": 84
607
+ },
608
+ {
609
+ "epoch": 0.8037825059101655,
610
+ "grad_norm": 3.526047706604004,
611
+ "learning_rate": 9.595238095238096e-06,
612
+ "loss": 0.9821,
613
+ "step": 85
614
+ },
615
+ {
616
+ "epoch": 0.8132387706855791,
617
+ "grad_norm": 1.2852156162261963,
618
+ "learning_rate": 9.590476190476192e-06,
619
+ "loss": 0.9858,
620
+ "step": 86
621
+ },
622
+ {
623
+ "epoch": 0.8226950354609929,
624
+ "grad_norm": 1.357054591178894,
625
+ "learning_rate": 9.585714285714286e-06,
626
+ "loss": 0.9973,
627
+ "step": 87
628
+ },
629
+ {
630
+ "epoch": 0.8321513002364066,
631
+ "grad_norm": 1.4836820363998413,
632
+ "learning_rate": 9.580952380952383e-06,
633
+ "loss": 1.0258,
634
+ "step": 88
635
+ },
636
+ {
637
+ "epoch": 0.8416075650118203,
638
+ "grad_norm": 1.3577030897140503,
639
+ "learning_rate": 9.576190476190477e-06,
640
+ "loss": 0.908,
641
+ "step": 89
642
+ },
643
+ {
644
+ "epoch": 0.851063829787234,
645
+ "grad_norm": 3.018012285232544,
646
+ "learning_rate": 9.571428571428573e-06,
647
+ "loss": 0.9476,
648
+ "step": 90
649
+ },
650
+ {
651
+ "epoch": 0.8605200945626478,
652
+ "grad_norm": 2.8617868423461914,
653
+ "learning_rate": 9.566666666666668e-06,
654
+ "loss": 1.0086,
655
+ "step": 91
656
+ },
657
+ {
658
+ "epoch": 0.8699763593380615,
659
+ "grad_norm": 2.0398824214935303,
660
+ "learning_rate": 9.561904761904762e-06,
661
+ "loss": 1.0729,
662
+ "step": 92
663
+ },
664
+ {
665
+ "epoch": 0.8794326241134752,
666
+ "grad_norm": 4.715724468231201,
667
+ "learning_rate": 9.557142857142858e-06,
668
+ "loss": 0.9843,
669
+ "step": 93
670
+ },
671
+ {
672
+ "epoch": 0.8888888888888888,
673
+ "grad_norm": 1.1730473041534424,
674
+ "learning_rate": 9.552380952380953e-06,
675
+ "loss": 0.985,
676
+ "step": 94
677
+ },
678
+ {
679
+ "epoch": 0.8983451536643026,
680
+ "grad_norm": 4.119668483734131,
681
+ "learning_rate": 9.547619047619049e-06,
682
+ "loss": 1.0455,
683
+ "step": 95
684
+ },
685
+ {
686
+ "epoch": 0.9078014184397163,
687
+ "grad_norm": 3.797478199005127,
688
+ "learning_rate": 9.542857142857143e-06,
689
+ "loss": 0.9879,
690
+ "step": 96
691
+ },
692
+ {
693
+ "epoch": 0.91725768321513,
694
+ "grad_norm": 5.911831378936768,
695
+ "learning_rate": 9.538095238095238e-06,
696
+ "loss": 1.1269,
697
+ "step": 97
698
+ },
699
+ {
700
+ "epoch": 0.9267139479905437,
701
+ "grad_norm": 5.677768230438232,
702
+ "learning_rate": 9.533333333333334e-06,
703
+ "loss": 1.0706,
704
+ "step": 98
705
+ },
706
+ {
707
+ "epoch": 0.9361702127659575,
708
+ "grad_norm": 6.479707717895508,
709
+ "learning_rate": 9.528571428571429e-06,
710
+ "loss": 1.0433,
711
+ "step": 99
712
+ },
713
+ {
714
+ "epoch": 0.9456264775413712,
715
+ "grad_norm": 3.5368762016296387,
716
+ "learning_rate": 9.523809523809525e-06,
717
+ "loss": 0.8951,
718
+ "step": 100
719
+ },
720
+ {
721
+ "epoch": 0.9456264775413712,
722
+ "eval_f1_macro": 0.280761247086368,
723
+ "eval_loss": 0.9932119846343994,
724
+ "eval_runtime": 13.9437,
725
+ "eval_samples_per_second": 121.274,
726
+ "eval_steps_per_second": 7.602,
727
+ "step": 100
728
+ },
729
+ {
730
+ "epoch": 0.9550827423167849,
731
+ "grad_norm": 5.754081726074219,
732
+ "learning_rate": 9.51904761904762e-06,
733
+ "loss": 1.0286,
734
+ "step": 101
735
+ },
736
+ {
737
+ "epoch": 0.9645390070921985,
738
+ "grad_norm": 3.2837677001953125,
739
+ "learning_rate": 9.514285714285715e-06,
740
+ "loss": 0.9234,
741
+ "step": 102
742
+ },
743
+ {
744
+ "epoch": 0.9739952718676123,
745
+ "grad_norm": 3.854726791381836,
746
+ "learning_rate": 9.50952380952381e-06,
747
+ "loss": 0.907,
748
+ "step": 103
749
+ },
750
+ {
751
+ "epoch": 0.983451536643026,
752
+ "grad_norm": 6.779870510101318,
753
+ "learning_rate": 9.504761904761906e-06,
754
+ "loss": 1.0186,
755
+ "step": 104
756
+ },
757
+ {
758
+ "epoch": 0.9929078014184397,
759
+ "grad_norm": 4.993762493133545,
760
+ "learning_rate": 9.5e-06,
761
+ "loss": 1.0855,
762
+ "step": 105
763
+ },
764
+ {
765
+ "epoch": 1.0023640661938533,
766
+ "grad_norm": 5.992312908172607,
767
+ "learning_rate": 9.495238095238097e-06,
768
+ "loss": 1.0172,
769
+ "step": 106
770
+ },
771
+ {
772
+ "epoch": 1.011820330969267,
773
+ "grad_norm": 4.238219261169434,
774
+ "learning_rate": 9.490476190476191e-06,
775
+ "loss": 1.0305,
776
+ "step": 107
777
+ },
778
+ {
779
+ "epoch": 1.0212765957446808,
780
+ "grad_norm": 2.7353601455688477,
781
+ "learning_rate": 9.485714285714287e-06,
782
+ "loss": 0.973,
783
+ "step": 108
784
+ },
785
+ {
786
+ "epoch": 1.0307328605200945,
787
+ "grad_norm": 2.2366888523101807,
788
+ "learning_rate": 9.480952380952382e-06,
789
+ "loss": 1.002,
790
+ "step": 109
791
+ },
792
+ {
793
+ "epoch": 1.0401891252955082,
794
+ "grad_norm": 2.0861849784851074,
795
+ "learning_rate": 9.476190476190476e-06,
796
+ "loss": 1.0294,
797
+ "step": 110
798
+ },
799
+ {
800
+ "epoch": 1.049645390070922,
801
+ "grad_norm": 5.589237213134766,
802
+ "learning_rate": 9.471428571428572e-06,
803
+ "loss": 0.9939,
804
+ "step": 111
805
+ },
806
+ {
807
+ "epoch": 1.0591016548463357,
808
+ "grad_norm": 9.459518432617188,
809
+ "learning_rate": 9.466666666666667e-06,
810
+ "loss": 1.045,
811
+ "step": 112
812
+ },
813
+ {
814
+ "epoch": 1.0685579196217494,
815
+ "grad_norm": 6.007238388061523,
816
+ "learning_rate": 9.461904761904761e-06,
817
+ "loss": 0.9692,
818
+ "step": 113
819
+ },
820
+ {
821
+ "epoch": 1.0780141843971631,
822
+ "grad_norm": 4.863658428192139,
823
+ "learning_rate": 9.457142857142858e-06,
824
+ "loss": 1.2279,
825
+ "step": 114
826
+ },
827
+ {
828
+ "epoch": 1.0874704491725768,
829
+ "grad_norm": 1.2910066843032837,
830
+ "learning_rate": 9.452380952380952e-06,
831
+ "loss": 1.0343,
832
+ "step": 115
833
+ },
834
+ {
835
+ "epoch": 1.0969267139479906,
836
+ "grad_norm": 2.361463785171509,
837
+ "learning_rate": 9.447619047619048e-06,
838
+ "loss": 0.936,
839
+ "step": 116
840
+ },
841
+ {
842
+ "epoch": 1.1063829787234043,
843
+ "grad_norm": 1.5398199558258057,
844
+ "learning_rate": 9.442857142857144e-06,
845
+ "loss": 0.98,
846
+ "step": 117
847
+ },
848
+ {
849
+ "epoch": 1.115839243498818,
850
+ "grad_norm": 5.576039791107178,
851
+ "learning_rate": 9.438095238095239e-06,
852
+ "loss": 0.9781,
853
+ "step": 118
854
+ },
855
+ {
856
+ "epoch": 1.1252955082742317,
857
+ "grad_norm": 7.037659645080566,
858
+ "learning_rate": 9.433333333333335e-06,
859
+ "loss": 0.9037,
860
+ "step": 119
861
+ },
862
+ {
863
+ "epoch": 1.1347517730496455,
864
+ "grad_norm": 3.028674364089966,
865
+ "learning_rate": 9.42857142857143e-06,
866
+ "loss": 0.9853,
867
+ "step": 120
868
+ },
869
+ {
870
+ "epoch": 1.1442080378250592,
871
+ "grad_norm": 2.2026021480560303,
872
+ "learning_rate": 9.423809523809526e-06,
873
+ "loss": 0.903,
874
+ "step": 121
875
+ },
876
+ {
877
+ "epoch": 1.1536643026004727,
878
+ "grad_norm": 2.6655337810516357,
879
+ "learning_rate": 9.41904761904762e-06,
880
+ "loss": 0.915,
881
+ "step": 122
882
+ },
883
+ {
884
+ "epoch": 1.1631205673758864,
885
+ "grad_norm": 2.7415759563446045,
886
+ "learning_rate": 9.414285714285715e-06,
887
+ "loss": 1.0771,
888
+ "step": 123
889
+ },
890
+ {
891
+ "epoch": 1.1725768321513002,
892
+ "grad_norm": 2.241513252258301,
893
+ "learning_rate": 9.40952380952381e-06,
894
+ "loss": 0.9635,
895
+ "step": 124
896
+ },
897
+ {
898
+ "epoch": 1.1820330969267139,
899
+ "grad_norm": 1.8755624294281006,
900
+ "learning_rate": 9.404761904761905e-06,
901
+ "loss": 1.0371,
902
+ "step": 125
903
+ },
904
+ {
905
+ "epoch": 1.1914893617021276,
906
+ "grad_norm": 7.259213924407959,
907
+ "learning_rate": 9.4e-06,
908
+ "loss": 0.9636,
909
+ "step": 126
910
+ },
911
+ {
912
+ "epoch": 1.2009456264775413,
913
+ "grad_norm": 1.6004455089569092,
914
+ "learning_rate": 9.395238095238096e-06,
915
+ "loss": 0.9372,
916
+ "step": 127
917
+ },
918
+ {
919
+ "epoch": 1.210401891252955,
920
+ "grad_norm": 1.555505394935608,
921
+ "learning_rate": 9.39047619047619e-06,
922
+ "loss": 0.9623,
923
+ "step": 128
924
+ },
925
+ {
926
+ "epoch": 1.2198581560283688,
927
+ "grad_norm": 3.08105731010437,
928
+ "learning_rate": 9.385714285714287e-06,
929
+ "loss": 1.0623,
930
+ "step": 129
931
+ },
932
+ {
933
+ "epoch": 1.2293144208037825,
934
+ "grad_norm": 1.6849639415740967,
935
+ "learning_rate": 9.380952380952381e-06,
936
+ "loss": 0.9662,
937
+ "step": 130
938
+ },
939
+ {
940
+ "epoch": 1.2387706855791962,
941
+ "grad_norm": 1.9433168172836304,
942
+ "learning_rate": 9.376190476190477e-06,
943
+ "loss": 1.0246,
944
+ "step": 131
945
+ },
946
+ {
947
+ "epoch": 1.24822695035461,
948
+ "grad_norm": 1.203316569328308,
949
+ "learning_rate": 9.371428571428572e-06,
950
+ "loss": 0.9972,
951
+ "step": 132
952
+ },
953
+ {
954
+ "epoch": 1.2576832151300237,
955
+ "grad_norm": 3.589102029800415,
956
+ "learning_rate": 9.366666666666668e-06,
957
+ "loss": 1.0108,
958
+ "step": 133
959
+ },
960
+ {
961
+ "epoch": 1.2671394799054374,
962
+ "grad_norm": 3.4743452072143555,
963
+ "learning_rate": 9.361904761904762e-06,
964
+ "loss": 0.8865,
965
+ "step": 134
966
+ },
967
+ {
968
+ "epoch": 1.2765957446808511,
969
+ "grad_norm": 4.537514686584473,
970
+ "learning_rate": 9.357142857142859e-06,
971
+ "loss": 0.9915,
972
+ "step": 135
973
+ },
974
+ {
975
+ "epoch": 1.2860520094562649,
976
+ "grad_norm": 2.8337080478668213,
977
+ "learning_rate": 9.352380952380953e-06,
978
+ "loss": 1.1094,
979
+ "step": 136
980
+ },
981
+ {
982
+ "epoch": 1.2955082742316786,
983
+ "grad_norm": 3.608424663543701,
984
+ "learning_rate": 9.34761904761905e-06,
985
+ "loss": 0.9164,
986
+ "step": 137
987
+ },
988
+ {
989
+ "epoch": 1.3049645390070923,
990
+ "grad_norm": 1.528435468673706,
991
+ "learning_rate": 9.342857142857144e-06,
992
+ "loss": 1.0314,
993
+ "step": 138
994
+ },
995
+ {
996
+ "epoch": 1.314420803782506,
997
+ "grad_norm": 2.550269842147827,
998
+ "learning_rate": 9.338095238095238e-06,
999
+ "loss": 0.967,
1000
+ "step": 139
1001
+ },
1002
+ {
1003
+ "epoch": 1.3238770685579198,
1004
+ "grad_norm": 2.2891745567321777,
1005
+ "learning_rate": 9.333333333333334e-06,
1006
+ "loss": 0.9615,
1007
+ "step": 140
1008
+ },
1009
+ {
1010
+ "epoch": 1.3333333333333333,
1011
+ "grad_norm": 1.508060097694397,
1012
+ "learning_rate": 9.328571428571429e-06,
1013
+ "loss": 0.8764,
1014
+ "step": 141
1015
+ },
1016
+ {
1017
+ "epoch": 1.342789598108747,
1018
+ "grad_norm": 2.548435688018799,
1019
+ "learning_rate": 9.323809523809525e-06,
1020
+ "loss": 0.9724,
1021
+ "step": 142
1022
+ },
1023
+ {
1024
+ "epoch": 1.3522458628841607,
1025
+ "grad_norm": 1.7150108814239502,
1026
+ "learning_rate": 9.31904761904762e-06,
1027
+ "loss": 0.9821,
1028
+ "step": 143
1029
+ },
1030
+ {
1031
+ "epoch": 1.3617021276595744,
1032
+ "grad_norm": 3.745039224624634,
1033
+ "learning_rate": 9.314285714285714e-06,
1034
+ "loss": 0.9439,
1035
+ "step": 144
1036
+ },
1037
+ {
1038
+ "epoch": 1.3711583924349882,
1039
+ "grad_norm": 2.843501329421997,
1040
+ "learning_rate": 9.30952380952381e-06,
1041
+ "loss": 1.0294,
1042
+ "step": 145
1043
+ },
1044
+ {
1045
+ "epoch": 1.3806146572104019,
1046
+ "grad_norm": 1.8481038808822632,
1047
+ "learning_rate": 9.304761904761905e-06,
1048
+ "loss": 1.0332,
1049
+ "step": 146
1050
+ },
1051
+ {
1052
+ "epoch": 1.3900709219858156,
1053
+ "grad_norm": 2.5383450984954834,
1054
+ "learning_rate": 9.3e-06,
1055
+ "loss": 1.0577,
1056
+ "step": 147
1057
+ },
1058
+ {
1059
+ "epoch": 1.3995271867612293,
1060
+ "grad_norm": 2.205739736557007,
1061
+ "learning_rate": 9.295238095238095e-06,
1062
+ "loss": 1.0294,
1063
+ "step": 148
1064
+ },
1065
+ {
1066
+ "epoch": 1.408983451536643,
1067
+ "grad_norm": 3.889091968536377,
1068
+ "learning_rate": 9.290476190476191e-06,
1069
+ "loss": 0.8781,
1070
+ "step": 149
1071
+ },
1072
+ {
1073
+ "epoch": 1.4184397163120568,
1074
+ "grad_norm": 3.154792308807373,
1075
+ "learning_rate": 9.285714285714288e-06,
1076
+ "loss": 0.8584,
1077
+ "step": 150
1078
+ },
1079
+ {
1080
+ "epoch": 1.4184397163120568,
1081
+ "eval_f1_macro": 0.283063644436965,
1082
+ "eval_loss": 0.9654386043548584,
1083
+ "eval_runtime": 13.9617,
1084
+ "eval_samples_per_second": 121.117,
1085
+ "eval_steps_per_second": 7.592,
1086
+ "step": 150
1087
+ },
1088
+ {
1089
+ "epoch": 1.4278959810874705,
1090
+ "grad_norm": 3.941533088684082,
1091
+ "learning_rate": 9.280952380952382e-06,
1092
+ "loss": 0.9382,
1093
+ "step": 151
1094
+ },
1095
+ {
1096
+ "epoch": 1.4373522458628842,
1097
+ "grad_norm": 1.6653056144714355,
1098
+ "learning_rate": 9.276190476190477e-06,
1099
+ "loss": 0.865,
1100
+ "step": 152
1101
+ },
1102
+ {
1103
+ "epoch": 1.4468085106382977,
1104
+ "grad_norm": 1.8307316303253174,
1105
+ "learning_rate": 9.271428571428573e-06,
1106
+ "loss": 0.9168,
1107
+ "step": 153
1108
+ },
1109
+ {
1110
+ "epoch": 1.4562647754137115,
1111
+ "grad_norm": 5.202136993408203,
1112
+ "learning_rate": 9.266666666666667e-06,
1113
+ "loss": 0.9884,
1114
+ "step": 154
1115
+ },
1116
+ {
1117
+ "epoch": 1.4657210401891252,
1118
+ "grad_norm": 2.868016481399536,
1119
+ "learning_rate": 9.261904761904763e-06,
1120
+ "loss": 0.9221,
1121
+ "step": 155
1122
+ },
1123
+ {
1124
+ "epoch": 1.475177304964539,
1125
+ "grad_norm": 3.4823408126831055,
1126
+ "learning_rate": 9.257142857142858e-06,
1127
+ "loss": 0.9304,
1128
+ "step": 156
1129
+ },
1130
+ {
1131
+ "epoch": 1.4846335697399526,
1132
+ "grad_norm": 2.435922861099243,
1133
+ "learning_rate": 9.252380952380952e-06,
1134
+ "loss": 0.8973,
1135
+ "step": 157
1136
+ },
1137
+ {
1138
+ "epoch": 1.4940898345153664,
1139
+ "grad_norm": 1.9051096439361572,
1140
+ "learning_rate": 9.247619047619048e-06,
1141
+ "loss": 0.8849,
1142
+ "step": 158
1143
+ },
1144
+ {
1145
+ "epoch": 1.50354609929078,
1146
+ "grad_norm": 1.81594717502594,
1147
+ "learning_rate": 9.242857142857143e-06,
1148
+ "loss": 0.9916,
1149
+ "step": 159
1150
+ },
1151
+ {
1152
+ "epoch": 1.5130023640661938,
1153
+ "grad_norm": 4.773565769195557,
1154
+ "learning_rate": 9.238095238095239e-06,
1155
+ "loss": 0.9752,
1156
+ "step": 160
1157
+ },
1158
+ {
1159
+ "epoch": 1.5224586288416075,
1160
+ "grad_norm": 4.862892150878906,
1161
+ "learning_rate": 9.233333333333334e-06,
1162
+ "loss": 0.9181,
1163
+ "step": 161
1164
+ },
1165
+ {
1166
+ "epoch": 1.5319148936170213,
1167
+ "grad_norm": 2.8068463802337646,
1168
+ "learning_rate": 9.22857142857143e-06,
1169
+ "loss": 1.0103,
1170
+ "step": 162
1171
+ },
1172
+ {
1173
+ "epoch": 1.541371158392435,
1174
+ "grad_norm": 5.121504783630371,
1175
+ "learning_rate": 9.223809523809524e-06,
1176
+ "loss": 1.0227,
1177
+ "step": 163
1178
+ },
1179
+ {
1180
+ "epoch": 1.5508274231678487,
1181
+ "grad_norm": 2.266847610473633,
1182
+ "learning_rate": 9.21904761904762e-06,
1183
+ "loss": 0.8633,
1184
+ "step": 164
1185
+ },
1186
+ {
1187
+ "epoch": 1.5602836879432624,
1188
+ "grad_norm": 2.262877941131592,
1189
+ "learning_rate": 9.214285714285715e-06,
1190
+ "loss": 0.8789,
1191
+ "step": 165
1192
+ },
1193
+ {
1194
+ "epoch": 1.5697399527186762,
1195
+ "grad_norm": 4.233051300048828,
1196
+ "learning_rate": 9.209523809523811e-06,
1197
+ "loss": 0.9182,
1198
+ "step": 166
1199
+ },
1200
+ {
1201
+ "epoch": 1.57919621749409,
1202
+ "grad_norm": 2.3418633937835693,
1203
+ "learning_rate": 9.204761904761906e-06,
1204
+ "loss": 1.0088,
1205
+ "step": 167
1206
+ },
1207
+ {
1208
+ "epoch": 1.5886524822695036,
1209
+ "grad_norm": 4.228636741638184,
1210
+ "learning_rate": 9.200000000000002e-06,
1211
+ "loss": 1.0307,
1212
+ "step": 168
1213
+ },
1214
+ {
1215
+ "epoch": 1.5981087470449173,
1216
+ "grad_norm": 5.86262845993042,
1217
+ "learning_rate": 9.195238095238096e-06,
1218
+ "loss": 0.9996,
1219
+ "step": 169
1220
+ },
1221
+ {
1222
+ "epoch": 1.607565011820331,
1223
+ "grad_norm": 2.881577730178833,
1224
+ "learning_rate": 9.19047619047619e-06,
1225
+ "loss": 0.9544,
1226
+ "step": 170
1227
+ },
1228
+ {
1229
+ "epoch": 1.6170212765957448,
1230
+ "grad_norm": 1.8167017698287964,
1231
+ "learning_rate": 9.185714285714287e-06,
1232
+ "loss": 0.8761,
1233
+ "step": 171
1234
+ },
1235
+ {
1236
+ "epoch": 1.6264775413711585,
1237
+ "grad_norm": 4.837042331695557,
1238
+ "learning_rate": 9.180952380952381e-06,
1239
+ "loss": 0.9079,
1240
+ "step": 172
1241
+ },
1242
+ {
1243
+ "epoch": 1.6359338061465722,
1244
+ "grad_norm": 1.9222673177719116,
1245
+ "learning_rate": 9.176190476190477e-06,
1246
+ "loss": 1.0179,
1247
+ "step": 173
1248
+ },
1249
+ {
1250
+ "epoch": 1.645390070921986,
1251
+ "grad_norm": 3.050659418106079,
1252
+ "learning_rate": 9.171428571428572e-06,
1253
+ "loss": 0.8789,
1254
+ "step": 174
1255
+ },
1256
+ {
1257
+ "epoch": 1.6548463356973995,
1258
+ "grad_norm": 1.841301679611206,
1259
+ "learning_rate": 9.166666666666666e-06,
1260
+ "loss": 0.9302,
1261
+ "step": 175
1262
+ },
1263
+ {
1264
+ "epoch": 1.6643026004728132,
1265
+ "grad_norm": 5.169556140899658,
1266
+ "learning_rate": 9.161904761904763e-06,
1267
+ "loss": 0.9374,
1268
+ "step": 176
1269
+ },
1270
+ {
1271
+ "epoch": 1.673758865248227,
1272
+ "grad_norm": 3.1582770347595215,
1273
+ "learning_rate": 9.157142857142857e-06,
1274
+ "loss": 0.9497,
1275
+ "step": 177
1276
+ },
1277
+ {
1278
+ "epoch": 1.6832151300236406,
1279
+ "grad_norm": 2.368600606918335,
1280
+ "learning_rate": 9.152380952380953e-06,
1281
+ "loss": 0.9879,
1282
+ "step": 178
1283
+ },
1284
+ {
1285
+ "epoch": 1.6926713947990544,
1286
+ "grad_norm": 4.020886421203613,
1287
+ "learning_rate": 9.147619047619048e-06,
1288
+ "loss": 0.9077,
1289
+ "step": 179
1290
+ },
1291
+ {
1292
+ "epoch": 1.702127659574468,
1293
+ "grad_norm": 4.776878356933594,
1294
+ "learning_rate": 9.142857142857144e-06,
1295
+ "loss": 0.9397,
1296
+ "step": 180
1297
+ },
1298
+ {
1299
+ "epoch": 1.7115839243498818,
1300
+ "grad_norm": 1.6028152704238892,
1301
+ "learning_rate": 9.13809523809524e-06,
1302
+ "loss": 0.8059,
1303
+ "step": 181
1304
+ },
1305
+ {
1306
+ "epoch": 1.7210401891252955,
1307
+ "grad_norm": 1.9851771593093872,
1308
+ "learning_rate": 9.133333333333335e-06,
1309
+ "loss": 0.8532,
1310
+ "step": 182
1311
+ },
1312
+ {
1313
+ "epoch": 1.7304964539007093,
1314
+ "grad_norm": 1.7983887195587158,
1315
+ "learning_rate": 9.128571428571429e-06,
1316
+ "loss": 0.9259,
1317
+ "step": 183
1318
+ },
1319
+ {
1320
+ "epoch": 1.7399527186761228,
1321
+ "grad_norm": 3.93186616897583,
1322
+ "learning_rate": 9.123809523809525e-06,
1323
+ "loss": 0.8489,
1324
+ "step": 184
1325
+ },
1326
+ {
1327
+ "epoch": 1.7494089834515365,
1328
+ "grad_norm": 4.8172478675842285,
1329
+ "learning_rate": 9.11904761904762e-06,
1330
+ "loss": 0.978,
1331
+ "step": 185
1332
+ },
1333
+ {
1334
+ "epoch": 1.7588652482269502,
1335
+ "grad_norm": 2.9614696502685547,
1336
+ "learning_rate": 9.114285714285716e-06,
1337
+ "loss": 0.8911,
1338
+ "step": 186
1339
+ },
1340
+ {
1341
+ "epoch": 1.768321513002364,
1342
+ "grad_norm": 2.0468883514404297,
1343
+ "learning_rate": 9.10952380952381e-06,
1344
+ "loss": 0.8729,
1345
+ "step": 187
1346
+ },
1347
+ {
1348
+ "epoch": 1.7777777777777777,
1349
+ "grad_norm": 2.044738531112671,
1350
+ "learning_rate": 9.104761904761905e-06,
1351
+ "loss": 0.9597,
1352
+ "step": 188
1353
+ },
1354
+ {
1355
+ "epoch": 1.7872340425531914,
1356
+ "grad_norm": 2.1686525344848633,
1357
+ "learning_rate": 9.100000000000001e-06,
1358
+ "loss": 0.9128,
1359
+ "step": 189
1360
+ },
1361
+ {
1362
+ "epoch": 1.7966903073286051,
1363
+ "grad_norm": 2.1187691688537598,
1364
+ "learning_rate": 9.095238095238095e-06,
1365
+ "loss": 0.8705,
1366
+ "step": 190
1367
+ },
1368
+ {
1369
+ "epoch": 1.8061465721040189,
1370
+ "grad_norm": 3.233659029006958,
1371
+ "learning_rate": 9.09047619047619e-06,
1372
+ "loss": 0.9908,
1373
+ "step": 191
1374
+ },
1375
+ {
1376
+ "epoch": 1.8156028368794326,
1377
+ "grad_norm": 2.1533398628234863,
1378
+ "learning_rate": 9.085714285714286e-06,
1379
+ "loss": 0.9562,
1380
+ "step": 192
1381
+ },
1382
+ {
1383
+ "epoch": 1.8250591016548463,
1384
+ "grad_norm": 3.7706165313720703,
1385
+ "learning_rate": 9.08095238095238e-06,
1386
+ "loss": 1.0129,
1387
+ "step": 193
1388
+ },
1389
+ {
1390
+ "epoch": 1.83451536643026,
1391
+ "grad_norm": 1.9210500717163086,
1392
+ "learning_rate": 9.076190476190477e-06,
1393
+ "loss": 1.0186,
1394
+ "step": 194
1395
+ },
1396
+ {
1397
+ "epoch": 1.8439716312056738,
1398
+ "grad_norm": 3.554277181625366,
1399
+ "learning_rate": 9.071428571428573e-06,
1400
+ "loss": 0.9796,
1401
+ "step": 195
1402
+ },
1403
+ {
1404
+ "epoch": 1.8534278959810875,
1405
+ "grad_norm": 2.7373600006103516,
1406
+ "learning_rate": 9.066666666666667e-06,
1407
+ "loss": 0.9081,
1408
+ "step": 196
1409
+ },
1410
+ {
1411
+ "epoch": 1.8628841607565012,
1412
+ "grad_norm": 2.0324931144714355,
1413
+ "learning_rate": 9.061904761904764e-06,
1414
+ "loss": 0.8876,
1415
+ "step": 197
1416
+ },
1417
+ {
1418
+ "epoch": 1.872340425531915,
1419
+ "grad_norm": 4.527245998382568,
1420
+ "learning_rate": 9.057142857142858e-06,
1421
+ "loss": 1.0367,
1422
+ "step": 198
1423
+ },
1424
+ {
1425
+ "epoch": 1.8817966903073287,
1426
+ "grad_norm": 2.6082801818847656,
1427
+ "learning_rate": 9.052380952380954e-06,
1428
+ "loss": 0.9492,
1429
+ "step": 199
1430
+ },
1431
+ {
1432
+ "epoch": 1.8912529550827424,
1433
+ "grad_norm": 2.471220016479492,
1434
+ "learning_rate": 9.047619047619049e-06,
1435
+ "loss": 0.9094,
1436
+ "step": 200
1437
+ },
1438
+ {
1439
+ "epoch": 1.8912529550827424,
1440
+ "eval_f1_macro": 0.2837259159576206,
1441
+ "eval_loss": 0.9514940977096558,
1442
+ "eval_runtime": 14.0101,
1443
+ "eval_samples_per_second": 120.699,
1444
+ "eval_steps_per_second": 7.566,
1445
+ "step": 200
1446
+ },
1447
+ {
1448
+ "epoch": 1.900709219858156,
1449
+ "grad_norm": 1.804741382598877,
1450
+ "learning_rate": 9.042857142857143e-06,
1451
+ "loss": 0.9649,
1452
+ "step": 201
1453
+ },
1454
+ {
1455
+ "epoch": 1.9101654846335698,
1456
+ "grad_norm": 2.066596508026123,
1457
+ "learning_rate": 9.03809523809524e-06,
1458
+ "loss": 0.8746,
1459
+ "step": 202
1460
+ },
1461
+ {
1462
+ "epoch": 1.9196217494089836,
1463
+ "grad_norm": 1.635953664779663,
1464
+ "learning_rate": 9.033333333333334e-06,
1465
+ "loss": 0.9072,
1466
+ "step": 203
1467
+ },
1468
+ {
1469
+ "epoch": 1.9290780141843973,
1470
+ "grad_norm": 5.4566850662231445,
1471
+ "learning_rate": 9.028571428571428e-06,
1472
+ "loss": 0.9648,
1473
+ "step": 204
1474
+ },
1475
+ {
1476
+ "epoch": 1.938534278959811,
1477
+ "grad_norm": 2.198424816131592,
1478
+ "learning_rate": 9.023809523809524e-06,
1479
+ "loss": 0.8508,
1480
+ "step": 205
1481
+ },
1482
+ {
1483
+ "epoch": 1.9479905437352247,
1484
+ "grad_norm": 4.491063594818115,
1485
+ "learning_rate": 9.019047619047619e-06,
1486
+ "loss": 1.0851,
1487
+ "step": 206
1488
+ },
1489
+ {
1490
+ "epoch": 1.9574468085106385,
1491
+ "grad_norm": 2.8760032653808594,
1492
+ "learning_rate": 9.014285714285715e-06,
1493
+ "loss": 0.8874,
1494
+ "step": 207
1495
+ },
1496
+ {
1497
+ "epoch": 1.966903073286052,
1498
+ "grad_norm": 7.4303765296936035,
1499
+ "learning_rate": 9.00952380952381e-06,
1500
+ "loss": 0.9912,
1501
+ "step": 208
1502
+ },
1503
+ {
1504
+ "epoch": 1.9763593380614657,
1505
+ "grad_norm": 3.4187076091766357,
1506
+ "learning_rate": 9.004761904761906e-06,
1507
+ "loss": 0.8805,
1508
+ "step": 209
1509
+ },
1510
+ {
1511
+ "epoch": 1.9858156028368794,
1512
+ "grad_norm": 4.913360118865967,
1513
+ "learning_rate": 9e-06,
1514
+ "loss": 0.8825,
1515
+ "step": 210
1516
+ },
1517
+ {
1518
+ "epoch": 1.9952718676122931,
1519
+ "grad_norm": 3.889186143875122,
1520
+ "learning_rate": 8.995238095238096e-06,
1521
+ "loss": 1.0933,
1522
+ "step": 211
1523
+ },
1524
+ {
1525
+ "epoch": 2.0047281323877066,
1526
+ "grad_norm": 3.284080743789673,
1527
+ "learning_rate": 8.990476190476191e-06,
1528
+ "loss": 0.9464,
1529
+ "step": 212
1530
+ },
1531
+ {
1532
+ "epoch": 2.0141843971631204,
1533
+ "grad_norm": 3.6579349040985107,
1534
+ "learning_rate": 8.985714285714287e-06,
1535
+ "loss": 0.8692,
1536
+ "step": 213
1537
+ },
1538
+ {
1539
+ "epoch": 2.023640661938534,
1540
+ "grad_norm": 2.684208631515503,
1541
+ "learning_rate": 8.980952380952382e-06,
1542
+ "loss": 0.964,
1543
+ "step": 214
1544
+ },
1545
+ {
1546
+ "epoch": 2.033096926713948,
1547
+ "grad_norm": 2.114290475845337,
1548
+ "learning_rate": 8.976190476190478e-06,
1549
+ "loss": 0.8868,
1550
+ "step": 215
1551
+ },
1552
+ {
1553
+ "epoch": 2.0425531914893615,
1554
+ "grad_norm": 3.2135798931121826,
1555
+ "learning_rate": 8.971428571428572e-06,
1556
+ "loss": 0.8269,
1557
+ "step": 216
1558
+ },
1559
+ {
1560
+ "epoch": 2.0520094562647753,
1561
+ "grad_norm": 2.392505168914795,
1562
+ "learning_rate": 8.966666666666667e-06,
1563
+ "loss": 0.8804,
1564
+ "step": 217
1565
+ },
1566
+ {
1567
+ "epoch": 2.061465721040189,
1568
+ "grad_norm": 3.2920775413513184,
1569
+ "learning_rate": 8.961904761904763e-06,
1570
+ "loss": 0.9756,
1571
+ "step": 218
1572
+ },
1573
+ {
1574
+ "epoch": 2.0709219858156027,
1575
+ "grad_norm": 2.386057138442993,
1576
+ "learning_rate": 8.957142857142857e-06,
1577
+ "loss": 0.9884,
1578
+ "step": 219
1579
+ },
1580
+ {
1581
+ "epoch": 2.0803782505910164,
1582
+ "grad_norm": 3.295609712600708,
1583
+ "learning_rate": 8.952380952380953e-06,
1584
+ "loss": 0.8619,
1585
+ "step": 220
1586
+ },
1587
+ {
1588
+ "epoch": 2.08983451536643,
1589
+ "grad_norm": 4.821883201599121,
1590
+ "learning_rate": 8.947619047619048e-06,
1591
+ "loss": 0.9719,
1592
+ "step": 221
1593
+ },
1594
+ {
1595
+ "epoch": 2.099290780141844,
1596
+ "grad_norm": 3.452568292617798,
1597
+ "learning_rate": 8.942857142857142e-06,
1598
+ "loss": 1.017,
1599
+ "step": 222
1600
+ },
1601
+ {
1602
+ "epoch": 2.1087470449172576,
1603
+ "grad_norm": 3.7388017177581787,
1604
+ "learning_rate": 8.938095238095239e-06,
1605
+ "loss": 0.9183,
1606
+ "step": 223
1607
+ },
1608
+ {
1609
+ "epoch": 2.1182033096926713,
1610
+ "grad_norm": 2.8859570026397705,
1611
+ "learning_rate": 8.933333333333333e-06,
1612
+ "loss": 0.9246,
1613
+ "step": 224
1614
+ },
1615
+ {
1616
+ "epoch": 2.127659574468085,
1617
+ "grad_norm": 6.264657974243164,
1618
+ "learning_rate": 8.92857142857143e-06,
1619
+ "loss": 0.9333,
1620
+ "step": 225
1621
+ },
1622
+ {
1623
+ "epoch": 2.137115839243499,
1624
+ "grad_norm": 2.137022018432617,
1625
+ "learning_rate": 8.923809523809525e-06,
1626
+ "loss": 0.8482,
1627
+ "step": 226
1628
+ },
1629
+ {
1630
+ "epoch": 2.1465721040189125,
1631
+ "grad_norm": 3.879429817199707,
1632
+ "learning_rate": 8.91904761904762e-06,
1633
+ "loss": 0.8743,
1634
+ "step": 227
1635
+ },
1636
+ {
1637
+ "epoch": 2.1560283687943262,
1638
+ "grad_norm": 2.706341505050659,
1639
+ "learning_rate": 8.914285714285716e-06,
1640
+ "loss": 0.8389,
1641
+ "step": 228
1642
+ },
1643
+ {
1644
+ "epoch": 2.16548463356974,
1645
+ "grad_norm": 2.056065320968628,
1646
+ "learning_rate": 8.90952380952381e-06,
1647
+ "loss": 0.8362,
1648
+ "step": 229
1649
+ },
1650
+ {
1651
+ "epoch": 2.1749408983451537,
1652
+ "grad_norm": 3.5867486000061035,
1653
+ "learning_rate": 8.904761904761905e-06,
1654
+ "loss": 0.8927,
1655
+ "step": 230
1656
+ },
1657
+ {
1658
+ "epoch": 2.1843971631205674,
1659
+ "grad_norm": 2.523615598678589,
1660
+ "learning_rate": 8.900000000000001e-06,
1661
+ "loss": 0.9123,
1662
+ "step": 231
1663
+ },
1664
+ {
1665
+ "epoch": 2.193853427895981,
1666
+ "grad_norm": 3.221954345703125,
1667
+ "learning_rate": 8.895238095238096e-06,
1668
+ "loss": 0.8026,
1669
+ "step": 232
1670
+ },
1671
+ {
1672
+ "epoch": 2.203309692671395,
1673
+ "grad_norm": 5.576043128967285,
1674
+ "learning_rate": 8.890476190476192e-06,
1675
+ "loss": 1.0363,
1676
+ "step": 233
1677
+ },
1678
+ {
1679
+ "epoch": 2.2127659574468086,
1680
+ "grad_norm": 3.328744411468506,
1681
+ "learning_rate": 8.885714285714286e-06,
1682
+ "loss": 0.9239,
1683
+ "step": 234
1684
+ },
1685
+ {
1686
+ "epoch": 2.2222222222222223,
1687
+ "grad_norm": 4.1029372215271,
1688
+ "learning_rate": 8.88095238095238e-06,
1689
+ "loss": 0.9743,
1690
+ "step": 235
1691
+ },
1692
+ {
1693
+ "epoch": 2.231678486997636,
1694
+ "grad_norm": 7.877633571624756,
1695
+ "learning_rate": 8.876190476190477e-06,
1696
+ "loss": 0.9407,
1697
+ "step": 236
1698
+ },
1699
+ {
1700
+ "epoch": 2.2411347517730498,
1701
+ "grad_norm": 6.3463311195373535,
1702
+ "learning_rate": 8.871428571428571e-06,
1703
+ "loss": 0.8537,
1704
+ "step": 237
1705
+ },
1706
+ {
1707
+ "epoch": 2.2505910165484635,
1708
+ "grad_norm": 9.1329345703125,
1709
+ "learning_rate": 8.866666666666668e-06,
1710
+ "loss": 0.8501,
1711
+ "step": 238
1712
+ },
1713
+ {
1714
+ "epoch": 2.260047281323877,
1715
+ "grad_norm": 6.020025253295898,
1716
+ "learning_rate": 8.861904761904762e-06,
1717
+ "loss": 0.876,
1718
+ "step": 239
1719
+ },
1720
+ {
1721
+ "epoch": 2.269503546099291,
1722
+ "grad_norm": 2.44421124458313,
1723
+ "learning_rate": 8.857142857142858e-06,
1724
+ "loss": 0.9192,
1725
+ "step": 240
1726
+ },
1727
+ {
1728
+ "epoch": 2.2789598108747047,
1729
+ "grad_norm": 6.033252239227295,
1730
+ "learning_rate": 8.852380952380953e-06,
1731
+ "loss": 0.9477,
1732
+ "step": 241
1733
+ },
1734
+ {
1735
+ "epoch": 2.2884160756501184,
1736
+ "grad_norm": 10.110373497009277,
1737
+ "learning_rate": 8.847619047619049e-06,
1738
+ "loss": 1.0997,
1739
+ "step": 242
1740
+ },
1741
+ {
1742
+ "epoch": 2.297872340425532,
1743
+ "grad_norm": 7.680680274963379,
1744
+ "learning_rate": 8.842857142857143e-06,
1745
+ "loss": 0.8948,
1746
+ "step": 243
1747
+ },
1748
+ {
1749
+ "epoch": 2.3073286052009454,
1750
+ "grad_norm": 9.439537048339844,
1751
+ "learning_rate": 8.83809523809524e-06,
1752
+ "loss": 0.9058,
1753
+ "step": 244
1754
+ },
1755
+ {
1756
+ "epoch": 2.3167848699763596,
1757
+ "grad_norm": 8.714299201965332,
1758
+ "learning_rate": 8.833333333333334e-06,
1759
+ "loss": 0.8828,
1760
+ "step": 245
1761
+ },
1762
+ {
1763
+ "epoch": 2.326241134751773,
1764
+ "grad_norm": 2.600574254989624,
1765
+ "learning_rate": 8.82857142857143e-06,
1766
+ "loss": 0.9187,
1767
+ "step": 246
1768
+ },
1769
+ {
1770
+ "epoch": 2.3356973995271866,
1771
+ "grad_norm": 4.401780605316162,
1772
+ "learning_rate": 8.823809523809525e-06,
1773
+ "loss": 0.8681,
1774
+ "step": 247
1775
+ },
1776
+ {
1777
+ "epoch": 2.3451536643026003,
1778
+ "grad_norm": 2.297525405883789,
1779
+ "learning_rate": 8.819047619047619e-06,
1780
+ "loss": 0.8172,
1781
+ "step": 248
1782
+ },
1783
+ {
1784
+ "epoch": 2.354609929078014,
1785
+ "grad_norm": 4.044796943664551,
1786
+ "learning_rate": 8.814285714285715e-06,
1787
+ "loss": 0.8994,
1788
+ "step": 249
1789
+ },
1790
+ {
1791
+ "epoch": 2.3640661938534278,
1792
+ "grad_norm": 8.97205924987793,
1793
+ "learning_rate": 8.80952380952381e-06,
1794
+ "loss": 0.9875,
1795
+ "step": 250
1796
+ },
1797
+ {
1798
+ "epoch": 2.3640661938534278,
1799
+ "eval_f1_macro": 0.2946908943355175,
1800
+ "eval_loss": 0.9593666791915894,
1801
+ "eval_runtime": 13.9875,
1802
+ "eval_samples_per_second": 120.893,
1803
+ "eval_steps_per_second": 7.578,
1804
+ "step": 250
1805
+ },
1806
+ {
1807
+ "epoch": 2.3735224586288415,
1808
+ "grad_norm": 2.208160161972046,
1809
+ "learning_rate": 8.804761904761906e-06,
1810
+ "loss": 0.8831,
1811
+ "step": 251
1812
+ },
1813
+ {
1814
+ "epoch": 2.382978723404255,
1815
+ "grad_norm": 5.02908182144165,
1816
+ "learning_rate": 8.8e-06,
1817
+ "loss": 0.9052,
1818
+ "step": 252
1819
+ },
1820
+ {
1821
+ "epoch": 2.392434988179669,
1822
+ "grad_norm": 3.6499717235565186,
1823
+ "learning_rate": 8.795238095238095e-06,
1824
+ "loss": 0.8577,
1825
+ "step": 253
1826
+ },
1827
+ {
1828
+ "epoch": 2.4018912529550827,
1829
+ "grad_norm": 5.426293849945068,
1830
+ "learning_rate": 8.790476190476191e-06,
1831
+ "loss": 0.9268,
1832
+ "step": 254
1833
+ },
1834
+ {
1835
+ "epoch": 2.4113475177304964,
1836
+ "grad_norm": 8.641587257385254,
1837
+ "learning_rate": 8.785714285714286e-06,
1838
+ "loss": 0.9257,
1839
+ "step": 255
1840
+ },
1841
+ {
1842
+ "epoch": 2.42080378250591,
1843
+ "grad_norm": 11.271356582641602,
1844
+ "learning_rate": 8.780952380952382e-06,
1845
+ "loss": 0.9812,
1846
+ "step": 256
1847
+ },
1848
+ {
1849
+ "epoch": 2.430260047281324,
1850
+ "grad_norm": 8.257451057434082,
1851
+ "learning_rate": 8.776190476190476e-06,
1852
+ "loss": 0.9077,
1853
+ "step": 257
1854
+ },
1855
+ {
1856
+ "epoch": 2.4397163120567376,
1857
+ "grad_norm": 8.181621551513672,
1858
+ "learning_rate": 8.771428571428572e-06,
1859
+ "loss": 1.0347,
1860
+ "step": 258
1861
+ },
1862
+ {
1863
+ "epoch": 2.4491725768321513,
1864
+ "grad_norm": 2.263065814971924,
1865
+ "learning_rate": 8.766666666666669e-06,
1866
+ "loss": 0.8987,
1867
+ "step": 259
1868
+ },
1869
+ {
1870
+ "epoch": 2.458628841607565,
1871
+ "grad_norm": 4.538362979888916,
1872
+ "learning_rate": 8.761904761904763e-06,
1873
+ "loss": 0.9275,
1874
+ "step": 260
1875
+ },
1876
+ {
1877
+ "epoch": 2.4680851063829787,
1878
+ "grad_norm": 10.531721115112305,
1879
+ "learning_rate": 8.757142857142858e-06,
1880
+ "loss": 0.8979,
1881
+ "step": 261
1882
+ },
1883
+ {
1884
+ "epoch": 2.4775413711583925,
1885
+ "grad_norm": 4.876132011413574,
1886
+ "learning_rate": 8.752380952380954e-06,
1887
+ "loss": 0.8281,
1888
+ "step": 262
1889
+ },
1890
+ {
1891
+ "epoch": 2.486997635933806,
1892
+ "grad_norm": 2.9724793434143066,
1893
+ "learning_rate": 8.747619047619048e-06,
1894
+ "loss": 0.7766,
1895
+ "step": 263
1896
+ },
1897
+ {
1898
+ "epoch": 2.49645390070922,
1899
+ "grad_norm": 3.8613243103027344,
1900
+ "learning_rate": 8.742857142857144e-06,
1901
+ "loss": 0.939,
1902
+ "step": 264
1903
+ },
1904
+ {
1905
+ "epoch": 2.5059101654846336,
1906
+ "grad_norm": 2.186553478240967,
1907
+ "learning_rate": 8.738095238095239e-06,
1908
+ "loss": 0.8761,
1909
+ "step": 265
1910
+ },
1911
+ {
1912
+ "epoch": 2.5153664302600474,
1913
+ "grad_norm": 3.3984861373901367,
1914
+ "learning_rate": 8.733333333333333e-06,
1915
+ "loss": 0.8638,
1916
+ "step": 266
1917
+ },
1918
+ {
1919
+ "epoch": 2.524822695035461,
1920
+ "grad_norm": 6.405703067779541,
1921
+ "learning_rate": 8.72857142857143e-06,
1922
+ "loss": 0.8797,
1923
+ "step": 267
1924
+ },
1925
+ {
1926
+ "epoch": 2.534278959810875,
1927
+ "grad_norm": 5.020063400268555,
1928
+ "learning_rate": 8.723809523809524e-06,
1929
+ "loss": 0.827,
1930
+ "step": 268
1931
+ },
1932
+ {
1933
+ "epoch": 2.5437352245862885,
1934
+ "grad_norm": 2.5324349403381348,
1935
+ "learning_rate": 8.71904761904762e-06,
1936
+ "loss": 0.9493,
1937
+ "step": 269
1938
+ },
1939
+ {
1940
+ "epoch": 2.5531914893617023,
1941
+ "grad_norm": 2.4035396575927734,
1942
+ "learning_rate": 8.714285714285715e-06,
1943
+ "loss": 0.929,
1944
+ "step": 270
1945
+ },
1946
+ {
1947
+ "epoch": 2.562647754137116,
1948
+ "grad_norm": 3.447455644607544,
1949
+ "learning_rate": 8.70952380952381e-06,
1950
+ "loss": 0.9404,
1951
+ "step": 271
1952
+ },
1953
+ {
1954
+ "epoch": 2.5721040189125297,
1955
+ "grad_norm": 2.383104085922241,
1956
+ "learning_rate": 8.704761904761905e-06,
1957
+ "loss": 0.8185,
1958
+ "step": 272
1959
+ },
1960
+ {
1961
+ "epoch": 2.581560283687943,
1962
+ "grad_norm": 2.3809287548065186,
1963
+ "learning_rate": 8.700000000000001e-06,
1964
+ "loss": 0.9141,
1965
+ "step": 273
1966
+ },
1967
+ {
1968
+ "epoch": 2.591016548463357,
1969
+ "grad_norm": 2.1600184440612793,
1970
+ "learning_rate": 8.695238095238096e-06,
1971
+ "loss": 0.8376,
1972
+ "step": 274
1973
+ },
1974
+ {
1975
+ "epoch": 2.6004728132387704,
1976
+ "grad_norm": 3.5827438831329346,
1977
+ "learning_rate": 8.690476190476192e-06,
1978
+ "loss": 0.9028,
1979
+ "step": 275
1980
+ },
1981
+ {
1982
+ "epoch": 2.6099290780141846,
1983
+ "grad_norm": 2.5485596656799316,
1984
+ "learning_rate": 8.685714285714287e-06,
1985
+ "loss": 0.8242,
1986
+ "step": 276
1987
+ },
1988
+ {
1989
+ "epoch": 2.619385342789598,
1990
+ "grad_norm": 5.618887901306152,
1991
+ "learning_rate": 8.680952380952383e-06,
1992
+ "loss": 0.9673,
1993
+ "step": 277
1994
+ },
1995
+ {
1996
+ "epoch": 2.628841607565012,
1997
+ "grad_norm": 4.998472690582275,
1998
+ "learning_rate": 8.676190476190477e-06,
1999
+ "loss": 0.9069,
2000
+ "step": 278
2001
+ },
2002
+ {
2003
+ "epoch": 2.6382978723404253,
2004
+ "grad_norm": 5.121683597564697,
2005
+ "learning_rate": 8.671428571428572e-06,
2006
+ "loss": 0.9957,
2007
+ "step": 279
2008
+ },
2009
+ {
2010
+ "epoch": 2.6477541371158395,
2011
+ "grad_norm": 3.0463051795959473,
2012
+ "learning_rate": 8.666666666666668e-06,
2013
+ "loss": 0.772,
2014
+ "step": 280
2015
+ },
2016
+ {
2017
+ "epoch": 2.657210401891253,
2018
+ "grad_norm": 2.2069895267486572,
2019
+ "learning_rate": 8.661904761904762e-06,
2020
+ "loss": 0.8402,
2021
+ "step": 281
2022
+ },
2023
+ {
2024
+ "epoch": 2.6666666666666665,
2025
+ "grad_norm": 3.6136813163757324,
2026
+ "learning_rate": 8.657142857142858e-06,
2027
+ "loss": 0.9181,
2028
+ "step": 282
2029
+ },
2030
+ {
2031
+ "epoch": 2.6761229314420802,
2032
+ "grad_norm": 3.3590173721313477,
2033
+ "learning_rate": 8.652380952380953e-06,
2034
+ "loss": 0.9341,
2035
+ "step": 283
2036
+ },
2037
+ {
2038
+ "epoch": 2.685579196217494,
2039
+ "grad_norm": 7.388670444488525,
2040
+ "learning_rate": 8.647619047619047e-06,
2041
+ "loss": 0.8949,
2042
+ "step": 284
2043
+ },
2044
+ {
2045
+ "epoch": 2.6950354609929077,
2046
+ "grad_norm": 4.200950622558594,
2047
+ "learning_rate": 8.642857142857144e-06,
2048
+ "loss": 0.9149,
2049
+ "step": 285
2050
+ },
2051
+ {
2052
+ "epoch": 2.7044917257683214,
2053
+ "grad_norm": 4.610495090484619,
2054
+ "learning_rate": 8.638095238095238e-06,
2055
+ "loss": 0.918,
2056
+ "step": 286
2057
+ },
2058
+ {
2059
+ "epoch": 2.713947990543735,
2060
+ "grad_norm": 3.4695537090301514,
2061
+ "learning_rate": 8.633333333333334e-06,
2062
+ "loss": 0.9051,
2063
+ "step": 287
2064
+ },
2065
+ {
2066
+ "epoch": 2.723404255319149,
2067
+ "grad_norm": 5.142923355102539,
2068
+ "learning_rate": 8.628571428571429e-06,
2069
+ "loss": 0.9038,
2070
+ "step": 288
2071
+ },
2072
+ {
2073
+ "epoch": 2.7328605200945626,
2074
+ "grad_norm": 3.3170006275177,
2075
+ "learning_rate": 8.623809523809525e-06,
2076
+ "loss": 0.906,
2077
+ "step": 289
2078
+ },
2079
+ {
2080
+ "epoch": 2.7423167848699763,
2081
+ "grad_norm": 2.6813061237335205,
2082
+ "learning_rate": 8.61904761904762e-06,
2083
+ "loss": 0.8809,
2084
+ "step": 290
2085
+ },
2086
+ {
2087
+ "epoch": 2.75177304964539,
2088
+ "grad_norm": 3.356801748275757,
2089
+ "learning_rate": 8.614285714285716e-06,
2090
+ "loss": 1.1035,
2091
+ "step": 291
2092
+ },
2093
+ {
2094
+ "epoch": 2.7612293144208038,
2095
+ "grad_norm": 7.907340049743652,
2096
+ "learning_rate": 8.60952380952381e-06,
2097
+ "loss": 0.9155,
2098
+ "step": 292
2099
+ },
2100
+ {
2101
+ "epoch": 2.7706855791962175,
2102
+ "grad_norm": 2.931204080581665,
2103
+ "learning_rate": 8.604761904761906e-06,
2104
+ "loss": 0.9312,
2105
+ "step": 293
2106
+ },
2107
+ {
2108
+ "epoch": 2.780141843971631,
2109
+ "grad_norm": 4.311042308807373,
2110
+ "learning_rate": 8.6e-06,
2111
+ "loss": 1.0181,
2112
+ "step": 294
2113
+ },
2114
+ {
2115
+ "epoch": 2.789598108747045,
2116
+ "grad_norm": 5.058907508850098,
2117
+ "learning_rate": 8.595238095238097e-06,
2118
+ "loss": 0.7962,
2119
+ "step": 295
2120
+ },
2121
+ {
2122
+ "epoch": 2.7990543735224587,
2123
+ "grad_norm": 3.870861053466797,
2124
+ "learning_rate": 8.590476190476191e-06,
2125
+ "loss": 0.9256,
2126
+ "step": 296
2127
+ },
2128
+ {
2129
+ "epoch": 2.8085106382978724,
2130
+ "grad_norm": 6.868628025054932,
2131
+ "learning_rate": 8.585714285714286e-06,
2132
+ "loss": 0.9178,
2133
+ "step": 297
2134
+ },
2135
+ {
2136
+ "epoch": 2.817966903073286,
2137
+ "grad_norm": 3.905592679977417,
2138
+ "learning_rate": 8.580952380952382e-06,
2139
+ "loss": 0.9549,
2140
+ "step": 298
2141
+ },
2142
+ {
2143
+ "epoch": 2.8274231678487,
2144
+ "grad_norm": 5.346381664276123,
2145
+ "learning_rate": 8.576190476190476e-06,
2146
+ "loss": 0.9538,
2147
+ "step": 299
2148
+ },
2149
+ {
2150
+ "epoch": 2.8368794326241136,
2151
+ "grad_norm": 4.774893283843994,
2152
+ "learning_rate": 8.571428571428571e-06,
2153
+ "loss": 0.972,
2154
+ "step": 300
2155
+ },
2156
+ {
2157
+ "epoch": 2.8368794326241136,
2158
+ "eval_f1_macro": 0.2760298327473914,
2159
+ "eval_loss": 0.9529216885566711,
2160
+ "eval_runtime": 13.9542,
2161
+ "eval_samples_per_second": 121.182,
2162
+ "eval_steps_per_second": 7.596,
2163
+ "step": 300
2164
+ },
2165
+ {
2166
+ "epoch": 2.8463356973995273,
2167
+ "grad_norm": 5.011666297912598,
2168
+ "learning_rate": 8.566666666666667e-06,
2169
+ "loss": 0.8952,
2170
+ "step": 301
2171
+ },
2172
+ {
2173
+ "epoch": 2.855791962174941,
2174
+ "grad_norm": 4.010011196136475,
2175
+ "learning_rate": 8.561904761904762e-06,
2176
+ "loss": 0.8803,
2177
+ "step": 302
2178
+ },
2179
+ {
2180
+ "epoch": 2.8652482269503547,
2181
+ "grad_norm": 3.0806877613067627,
2182
+ "learning_rate": 8.557142857142858e-06,
2183
+ "loss": 0.8768,
2184
+ "step": 303
2185
+ },
2186
+ {
2187
+ "epoch": 2.8747044917257685,
2188
+ "grad_norm": 2.15812349319458,
2189
+ "learning_rate": 8.552380952380954e-06,
2190
+ "loss": 0.9884,
2191
+ "step": 304
2192
+ },
2193
+ {
2194
+ "epoch": 2.884160756501182,
2195
+ "grad_norm": 4.941656112670898,
2196
+ "learning_rate": 8.547619047619048e-06,
2197
+ "loss": 0.9848,
2198
+ "step": 305
2199
+ },
2200
+ {
2201
+ "epoch": 2.8936170212765955,
2202
+ "grad_norm": 7.0383100509643555,
2203
+ "learning_rate": 8.542857142857145e-06,
2204
+ "loss": 0.9367,
2205
+ "step": 306
2206
+ },
2207
+ {
2208
+ "epoch": 2.9030732860520096,
2209
+ "grad_norm": 4.573930263519287,
2210
+ "learning_rate": 8.538095238095239e-06,
2211
+ "loss": 0.882,
2212
+ "step": 307
2213
+ },
2214
+ {
2215
+ "epoch": 2.912529550827423,
2216
+ "grad_norm": 2.9290146827697754,
2217
+ "learning_rate": 8.533333333333335e-06,
2218
+ "loss": 0.9076,
2219
+ "step": 308
2220
+ },
2221
+ {
2222
+ "epoch": 2.921985815602837,
2223
+ "grad_norm": 2.3926682472229004,
2224
+ "learning_rate": 8.52857142857143e-06,
2225
+ "loss": 0.9358,
2226
+ "step": 309
2227
+ },
2228
+ {
2229
+ "epoch": 2.9314420803782504,
2230
+ "grad_norm": 2.4800093173980713,
2231
+ "learning_rate": 8.523809523809524e-06,
2232
+ "loss": 0.8903,
2233
+ "step": 310
2234
+ },
2235
+ {
2236
+ "epoch": 2.9408983451536646,
2237
+ "grad_norm": 4.937615394592285,
2238
+ "learning_rate": 8.51904761904762e-06,
2239
+ "loss": 0.9122,
2240
+ "step": 311
2241
+ },
2242
+ {
2243
+ "epoch": 2.950354609929078,
2244
+ "grad_norm": 3.547682285308838,
2245
+ "learning_rate": 8.514285714285715e-06,
2246
+ "loss": 0.8261,
2247
+ "step": 312
2248
+ },
2249
+ {
2250
+ "epoch": 2.959810874704492,
2251
+ "grad_norm": 2.7049429416656494,
2252
+ "learning_rate": 8.50952380952381e-06,
2253
+ "loss": 0.9048,
2254
+ "step": 313
2255
+ },
2256
+ {
2257
+ "epoch": 2.9692671394799053,
2258
+ "grad_norm": 3.8899621963500977,
2259
+ "learning_rate": 8.504761904761905e-06,
2260
+ "loss": 0.9234,
2261
+ "step": 314
2262
+ },
2263
+ {
2264
+ "epoch": 2.978723404255319,
2265
+ "grad_norm": 6.804818153381348,
2266
+ "learning_rate": 8.5e-06,
2267
+ "loss": 0.9218,
2268
+ "step": 315
2269
+ },
2270
+ {
2271
+ "epoch": 2.9881796690307327,
2272
+ "grad_norm": 5.924713611602783,
2273
+ "learning_rate": 8.495238095238096e-06,
2274
+ "loss": 0.8945,
2275
+ "step": 316
2276
+ },
2277
+ {
2278
+ "epoch": 2.9976359338061465,
2279
+ "grad_norm": 6.0556254386901855,
2280
+ "learning_rate": 8.49047619047619e-06,
2281
+ "loss": 0.783,
2282
+ "step": 317
2283
+ },
2284
+ {
2285
+ "epoch": 3.00709219858156,
2286
+ "grad_norm": 3.446620225906372,
2287
+ "learning_rate": 8.485714285714287e-06,
2288
+ "loss": 0.9164,
2289
+ "step": 318
2290
+ },
2291
+ {
2292
+ "epoch": 3.016548463356974,
2293
+ "grad_norm": 2.734196901321411,
2294
+ "learning_rate": 8.480952380952381e-06,
2295
+ "loss": 0.8043,
2296
+ "step": 319
2297
+ },
2298
+ {
2299
+ "epoch": 3.0260047281323876,
2300
+ "grad_norm": 3.3837802410125732,
2301
+ "learning_rate": 8.476190476190477e-06,
2302
+ "loss": 0.7815,
2303
+ "step": 320
2304
+ },
2305
+ {
2306
+ "epoch": 3.0354609929078014,
2307
+ "grad_norm": 2.1560587882995605,
2308
+ "learning_rate": 8.471428571428572e-06,
2309
+ "loss": 0.7651,
2310
+ "step": 321
2311
+ },
2312
+ {
2313
+ "epoch": 3.044917257683215,
2314
+ "grad_norm": 5.292918682098389,
2315
+ "learning_rate": 8.466666666666668e-06,
2316
+ "loss": 0.8416,
2317
+ "step": 322
2318
+ },
2319
+ {
2320
+ "epoch": 3.054373522458629,
2321
+ "grad_norm": 3.834503412246704,
2322
+ "learning_rate": 8.461904761904763e-06,
2323
+ "loss": 0.8658,
2324
+ "step": 323
2325
+ },
2326
+ {
2327
+ "epoch": 3.0638297872340425,
2328
+ "grad_norm": 5.551731109619141,
2329
+ "learning_rate": 8.457142857142859e-06,
2330
+ "loss": 0.8834,
2331
+ "step": 324
2332
+ },
2333
+ {
2334
+ "epoch": 3.0732860520094563,
2335
+ "grad_norm": 8.869498252868652,
2336
+ "learning_rate": 8.452380952380953e-06,
2337
+ "loss": 0.9549,
2338
+ "step": 325
2339
+ },
2340
+ {
2341
+ "epoch": 3.08274231678487,
2342
+ "grad_norm": 3.6207432746887207,
2343
+ "learning_rate": 8.447619047619048e-06,
2344
+ "loss": 0.8453,
2345
+ "step": 326
2346
+ },
2347
+ {
2348
+ "epoch": 3.0921985815602837,
2349
+ "grad_norm": 5.913300037384033,
2350
+ "learning_rate": 8.442857142857144e-06,
2351
+ "loss": 0.973,
2352
+ "step": 327
2353
+ },
2354
+ {
2355
+ "epoch": 3.1016548463356974,
2356
+ "grad_norm": 6.6758623123168945,
2357
+ "learning_rate": 8.438095238095238e-06,
2358
+ "loss": 0.9095,
2359
+ "step": 328
2360
+ },
2361
+ {
2362
+ "epoch": 3.111111111111111,
2363
+ "grad_norm": 3.1101748943328857,
2364
+ "learning_rate": 8.433333333333334e-06,
2365
+ "loss": 0.8741,
2366
+ "step": 329
2367
+ },
2368
+ {
2369
+ "epoch": 3.120567375886525,
2370
+ "grad_norm": 2.8428375720977783,
2371
+ "learning_rate": 8.428571428571429e-06,
2372
+ "loss": 0.8203,
2373
+ "step": 330
2374
+ },
2375
+ {
2376
+ "epoch": 3.1300236406619386,
2377
+ "grad_norm": 3.4043846130371094,
2378
+ "learning_rate": 8.423809523809523e-06,
2379
+ "loss": 0.9088,
2380
+ "step": 331
2381
+ },
2382
+ {
2383
+ "epoch": 3.1394799054373523,
2384
+ "grad_norm": 3.4590537548065186,
2385
+ "learning_rate": 8.41904761904762e-06,
2386
+ "loss": 0.8837,
2387
+ "step": 332
2388
+ },
2389
+ {
2390
+ "epoch": 3.148936170212766,
2391
+ "grad_norm": 3.684206485748291,
2392
+ "learning_rate": 8.414285714285714e-06,
2393
+ "loss": 0.9075,
2394
+ "step": 333
2395
+ },
2396
+ {
2397
+ "epoch": 3.15839243498818,
2398
+ "grad_norm": 6.9007720947265625,
2399
+ "learning_rate": 8.40952380952381e-06,
2400
+ "loss": 0.8656,
2401
+ "step": 334
2402
+ },
2403
+ {
2404
+ "epoch": 3.1678486997635935,
2405
+ "grad_norm": 4.262526988983154,
2406
+ "learning_rate": 8.404761904761905e-06,
2407
+ "loss": 0.959,
2408
+ "step": 335
2409
+ },
2410
+ {
2411
+ "epoch": 3.1773049645390072,
2412
+ "grad_norm": 6.345574378967285,
2413
+ "learning_rate": 8.400000000000001e-06,
2414
+ "loss": 0.8998,
2415
+ "step": 336
2416
+ },
2417
+ {
2418
+ "epoch": 3.186761229314421,
2419
+ "grad_norm": 4.99959659576416,
2420
+ "learning_rate": 8.395238095238097e-06,
2421
+ "loss": 0.8619,
2422
+ "step": 337
2423
+ },
2424
+ {
2425
+ "epoch": 3.1962174940898347,
2426
+ "grad_norm": 4.0740437507629395,
2427
+ "learning_rate": 8.390476190476192e-06,
2428
+ "loss": 0.7907,
2429
+ "step": 338
2430
+ },
2431
+ {
2432
+ "epoch": 3.2056737588652484,
2433
+ "grad_norm": 3.4886386394500732,
2434
+ "learning_rate": 8.385714285714286e-06,
2435
+ "loss": 0.9112,
2436
+ "step": 339
2437
+ },
2438
+ {
2439
+ "epoch": 3.215130023640662,
2440
+ "grad_norm": 3.8843610286712646,
2441
+ "learning_rate": 8.380952380952382e-06,
2442
+ "loss": 0.8705,
2443
+ "step": 340
2444
+ },
2445
+ {
2446
+ "epoch": 3.2245862884160754,
2447
+ "grad_norm": 5.011359691619873,
2448
+ "learning_rate": 8.376190476190477e-06,
2449
+ "loss": 0.874,
2450
+ "step": 341
2451
+ },
2452
+ {
2453
+ "epoch": 3.2340425531914896,
2454
+ "grad_norm": 4.989591121673584,
2455
+ "learning_rate": 8.371428571428573e-06,
2456
+ "loss": 0.9204,
2457
+ "step": 342
2458
+ },
2459
+ {
2460
+ "epoch": 3.243498817966903,
2461
+ "grad_norm": 3.6091649532318115,
2462
+ "learning_rate": 8.366666666666667e-06,
2463
+ "loss": 0.8482,
2464
+ "step": 343
2465
+ },
2466
+ {
2467
+ "epoch": 3.2529550827423166,
2468
+ "grad_norm": 2.7679734230041504,
2469
+ "learning_rate": 8.361904761904762e-06,
2470
+ "loss": 0.8709,
2471
+ "step": 344
2472
+ },
2473
+ {
2474
+ "epoch": 3.2624113475177303,
2475
+ "grad_norm": 3.253309726715088,
2476
+ "learning_rate": 8.357142857142858e-06,
2477
+ "loss": 0.8301,
2478
+ "step": 345
2479
+ },
2480
+ {
2481
+ "epoch": 3.271867612293144,
2482
+ "grad_norm": 3.695540189743042,
2483
+ "learning_rate": 8.352380952380952e-06,
2484
+ "loss": 0.7988,
2485
+ "step": 346
2486
+ },
2487
+ {
2488
+ "epoch": 3.2813238770685578,
2489
+ "grad_norm": 6.305734157562256,
2490
+ "learning_rate": 8.347619047619049e-06,
2491
+ "loss": 0.8561,
2492
+ "step": 347
2493
+ },
2494
+ {
2495
+ "epoch": 3.2907801418439715,
2496
+ "grad_norm": 4.090425491333008,
2497
+ "learning_rate": 8.342857142857143e-06,
2498
+ "loss": 0.8819,
2499
+ "step": 348
2500
+ },
2501
+ {
2502
+ "epoch": 3.300236406619385,
2503
+ "grad_norm": 8.310733795166016,
2504
+ "learning_rate": 8.33809523809524e-06,
2505
+ "loss": 0.8018,
2506
+ "step": 349
2507
+ },
2508
+ {
2509
+ "epoch": 3.309692671394799,
2510
+ "grad_norm": 7.966689109802246,
2511
+ "learning_rate": 8.333333333333334e-06,
2512
+ "loss": 0.9757,
2513
+ "step": 350
2514
+ },
2515
+ {
2516
+ "epoch": 3.309692671394799,
2517
+ "eval_f1_macro": 0.2898114936552929,
2518
+ "eval_loss": 0.9974462985992432,
2519
+ "eval_runtime": 13.9296,
2520
+ "eval_samples_per_second": 121.396,
2521
+ "eval_steps_per_second": 7.61,
2522
+ "step": 350
2523
+ },
2524
+ {
2525
+ "epoch": 3.3191489361702127,
2526
+ "grad_norm": 5.680312156677246,
2527
+ "learning_rate": 8.32857142857143e-06,
2528
+ "loss": 0.9264,
2529
+ "step": 351
2530
+ },
2531
+ {
2532
+ "epoch": 3.3286052009456264,
2533
+ "grad_norm": 4.721024513244629,
2534
+ "learning_rate": 8.323809523809524e-06,
2535
+ "loss": 0.8284,
2536
+ "step": 352
2537
+ },
2538
+ {
2539
+ "epoch": 3.33806146572104,
2540
+ "grad_norm": 3.5236101150512695,
2541
+ "learning_rate": 8.31904761904762e-06,
2542
+ "loss": 0.823,
2543
+ "step": 353
2544
+ },
2545
+ {
2546
+ "epoch": 3.347517730496454,
2547
+ "grad_norm": 4.502352237701416,
2548
+ "learning_rate": 8.314285714285715e-06,
2549
+ "loss": 0.8426,
2550
+ "step": 354
2551
+ },
2552
+ {
2553
+ "epoch": 3.3569739952718676,
2554
+ "grad_norm": 3.1332571506500244,
2555
+ "learning_rate": 8.309523809523811e-06,
2556
+ "loss": 0.7419,
2557
+ "step": 355
2558
+ },
2559
+ {
2560
+ "epoch": 3.3664302600472813,
2561
+ "grad_norm": 11.218631744384766,
2562
+ "learning_rate": 8.304761904761906e-06,
2563
+ "loss": 0.8824,
2564
+ "step": 356
2565
+ },
2566
+ {
2567
+ "epoch": 3.375886524822695,
2568
+ "grad_norm": 14.600545883178711,
2569
+ "learning_rate": 8.3e-06,
2570
+ "loss": 0.8601,
2571
+ "step": 357
2572
+ },
2573
+ {
2574
+ "epoch": 3.3853427895981087,
2575
+ "grad_norm": 12.865662574768066,
2576
+ "learning_rate": 8.295238095238096e-06,
2577
+ "loss": 0.8797,
2578
+ "step": 358
2579
+ },
2580
+ {
2581
+ "epoch": 3.3947990543735225,
2582
+ "grad_norm": 11.967954635620117,
2583
+ "learning_rate": 8.29047619047619e-06,
2584
+ "loss": 1.0035,
2585
+ "step": 359
2586
+ },
2587
+ {
2588
+ "epoch": 3.404255319148936,
2589
+ "grad_norm": 4.384112358093262,
2590
+ "learning_rate": 8.285714285714287e-06,
2591
+ "loss": 0.9454,
2592
+ "step": 360
2593
+ },
2594
+ {
2595
+ "epoch": 3.41371158392435,
2596
+ "grad_norm": 4.802806854248047,
2597
+ "learning_rate": 8.280952380952381e-06,
2598
+ "loss": 0.804,
2599
+ "step": 361
2600
+ },
2601
+ {
2602
+ "epoch": 3.4231678486997636,
2603
+ "grad_norm": 5.697473526000977,
2604
+ "learning_rate": 8.276190476190476e-06,
2605
+ "loss": 0.936,
2606
+ "step": 362
2607
+ },
2608
+ {
2609
+ "epoch": 3.4326241134751774,
2610
+ "grad_norm": 5.372440814971924,
2611
+ "learning_rate": 8.271428571428572e-06,
2612
+ "loss": 0.8373,
2613
+ "step": 363
2614
+ },
2615
+ {
2616
+ "epoch": 3.442080378250591,
2617
+ "grad_norm": 6.925962924957275,
2618
+ "learning_rate": 8.266666666666667e-06,
2619
+ "loss": 0.8763,
2620
+ "step": 364
2621
+ },
2622
+ {
2623
+ "epoch": 3.451536643026005,
2624
+ "grad_norm": 6.610288143157959,
2625
+ "learning_rate": 8.261904761904763e-06,
2626
+ "loss": 0.8084,
2627
+ "step": 365
2628
+ },
2629
+ {
2630
+ "epoch": 3.4609929078014185,
2631
+ "grad_norm": 3.974923849105835,
2632
+ "learning_rate": 8.257142857142857e-06,
2633
+ "loss": 0.9025,
2634
+ "step": 366
2635
+ },
2636
+ {
2637
+ "epoch": 3.4704491725768323,
2638
+ "grad_norm": 8.421207427978516,
2639
+ "learning_rate": 8.252380952380953e-06,
2640
+ "loss": 0.8086,
2641
+ "step": 367
2642
+ },
2643
+ {
2644
+ "epoch": 3.479905437352246,
2645
+ "grad_norm": 11.004073143005371,
2646
+ "learning_rate": 8.24761904761905e-06,
2647
+ "loss": 0.9149,
2648
+ "step": 368
2649
+ },
2650
+ {
2651
+ "epoch": 3.4893617021276597,
2652
+ "grad_norm": 6.909345626831055,
2653
+ "learning_rate": 8.242857142857144e-06,
2654
+ "loss": 0.7801,
2655
+ "step": 369
2656
+ },
2657
+ {
2658
+ "epoch": 3.4988179669030735,
2659
+ "grad_norm": 4.8476386070251465,
2660
+ "learning_rate": 8.238095238095239e-06,
2661
+ "loss": 0.851,
2662
+ "step": 370
2663
+ },
2664
+ {
2665
+ "epoch": 3.508274231678487,
2666
+ "grad_norm": 3.323458433151245,
2667
+ "learning_rate": 8.233333333333335e-06,
2668
+ "loss": 0.8203,
2669
+ "step": 371
2670
+ },
2671
+ {
2672
+ "epoch": 3.5177304964539005,
2673
+ "grad_norm": 3.748945951461792,
2674
+ "learning_rate": 8.22857142857143e-06,
2675
+ "loss": 0.8316,
2676
+ "step": 372
2677
+ },
2678
+ {
2679
+ "epoch": 3.5271867612293146,
2680
+ "grad_norm": 4.654240608215332,
2681
+ "learning_rate": 8.223809523809525e-06,
2682
+ "loss": 0.8346,
2683
+ "step": 373
2684
+ },
2685
+ {
2686
+ "epoch": 3.536643026004728,
2687
+ "grad_norm": 2.6797022819519043,
2688
+ "learning_rate": 8.21904761904762e-06,
2689
+ "loss": 0.8317,
2690
+ "step": 374
2691
+ },
2692
+ {
2693
+ "epoch": 3.546099290780142,
2694
+ "grad_norm": 7.341446876525879,
2695
+ "learning_rate": 8.214285714285714e-06,
2696
+ "loss": 0.7814,
2697
+ "step": 375
2698
+ },
2699
+ {
2700
+ "epoch": 3.5555555555555554,
2701
+ "grad_norm": 14.088275909423828,
2702
+ "learning_rate": 8.20952380952381e-06,
2703
+ "loss": 0.8655,
2704
+ "step": 376
2705
+ },
2706
+ {
2707
+ "epoch": 3.5650118203309695,
2708
+ "grad_norm": 4.27020263671875,
2709
+ "learning_rate": 8.204761904761905e-06,
2710
+ "loss": 0.9051,
2711
+ "step": 377
2712
+ },
2713
+ {
2714
+ "epoch": 3.574468085106383,
2715
+ "grad_norm": 4.922997951507568,
2716
+ "learning_rate": 8.2e-06,
2717
+ "loss": 0.7647,
2718
+ "step": 378
2719
+ },
2720
+ {
2721
+ "epoch": 3.5839243498817965,
2722
+ "grad_norm": 3.8162479400634766,
2723
+ "learning_rate": 8.195238095238096e-06,
2724
+ "loss": 0.8543,
2725
+ "step": 379
2726
+ },
2727
+ {
2728
+ "epoch": 3.5933806146572103,
2729
+ "grad_norm": 3.899261713027954,
2730
+ "learning_rate": 8.190476190476192e-06,
2731
+ "loss": 0.8353,
2732
+ "step": 380
2733
+ },
2734
+ {
2735
+ "epoch": 3.602836879432624,
2736
+ "grad_norm": 8.860682487487793,
2737
+ "learning_rate": 8.185714285714286e-06,
2738
+ "loss": 0.8721,
2739
+ "step": 381
2740
+ },
2741
+ {
2742
+ "epoch": 3.6122931442080377,
2743
+ "grad_norm": 6.590539455413818,
2744
+ "learning_rate": 8.180952380952382e-06,
2745
+ "loss": 0.8556,
2746
+ "step": 382
2747
+ },
2748
+ {
2749
+ "epoch": 3.6217494089834514,
2750
+ "grad_norm": 8.594223976135254,
2751
+ "learning_rate": 8.176190476190477e-06,
2752
+ "loss": 0.8506,
2753
+ "step": 383
2754
+ },
2755
+ {
2756
+ "epoch": 3.631205673758865,
2757
+ "grad_norm": 5.876694679260254,
2758
+ "learning_rate": 8.171428571428573e-06,
2759
+ "loss": 0.6882,
2760
+ "step": 384
2761
+ },
2762
+ {
2763
+ "epoch": 3.640661938534279,
2764
+ "grad_norm": 10.495857238769531,
2765
+ "learning_rate": 8.166666666666668e-06,
2766
+ "loss": 0.8772,
2767
+ "step": 385
2768
+ },
2769
+ {
2770
+ "epoch": 3.6501182033096926,
2771
+ "grad_norm": 10.53131103515625,
2772
+ "learning_rate": 8.161904761904764e-06,
2773
+ "loss": 0.8082,
2774
+ "step": 386
2775
+ },
2776
+ {
2777
+ "epoch": 3.6595744680851063,
2778
+ "grad_norm": 9.257997512817383,
2779
+ "learning_rate": 8.157142857142858e-06,
2780
+ "loss": 0.9248,
2781
+ "step": 387
2782
+ },
2783
+ {
2784
+ "epoch": 3.66903073286052,
2785
+ "grad_norm": 14.692602157592773,
2786
+ "learning_rate": 8.152380952380953e-06,
2787
+ "loss": 0.8357,
2788
+ "step": 388
2789
+ },
2790
+ {
2791
+ "epoch": 3.678486997635934,
2792
+ "grad_norm": 7.023383617401123,
2793
+ "learning_rate": 8.147619047619049e-06,
2794
+ "loss": 0.83,
2795
+ "step": 389
2796
+ },
2797
+ {
2798
+ "epoch": 3.6879432624113475,
2799
+ "grad_norm": 4.340334415435791,
2800
+ "learning_rate": 8.142857142857143e-06,
2801
+ "loss": 0.849,
2802
+ "step": 390
2803
+ },
2804
+ {
2805
+ "epoch": 3.6973995271867612,
2806
+ "grad_norm": 5.631707191467285,
2807
+ "learning_rate": 8.138095238095238e-06,
2808
+ "loss": 0.8456,
2809
+ "step": 391
2810
+ },
2811
+ {
2812
+ "epoch": 3.706855791962175,
2813
+ "grad_norm": 7.735780239105225,
2814
+ "learning_rate": 8.133333333333334e-06,
2815
+ "loss": 0.9483,
2816
+ "step": 392
2817
+ },
2818
+ {
2819
+ "epoch": 3.7163120567375887,
2820
+ "grad_norm": 6.3525519371032715,
2821
+ "learning_rate": 8.128571428571428e-06,
2822
+ "loss": 0.8846,
2823
+ "step": 393
2824
+ },
2825
+ {
2826
+ "epoch": 3.7257683215130024,
2827
+ "grad_norm": 9.425445556640625,
2828
+ "learning_rate": 8.123809523809525e-06,
2829
+ "loss": 0.8566,
2830
+ "step": 394
2831
+ },
2832
+ {
2833
+ "epoch": 3.735224586288416,
2834
+ "grad_norm": 5.754733085632324,
2835
+ "learning_rate": 8.119047619047619e-06,
2836
+ "loss": 0.8363,
2837
+ "step": 395
2838
+ },
2839
+ {
2840
+ "epoch": 3.74468085106383,
2841
+ "grad_norm": 3.9036519527435303,
2842
+ "learning_rate": 8.114285714285715e-06,
2843
+ "loss": 1.0731,
2844
+ "step": 396
2845
+ },
2846
+ {
2847
+ "epoch": 3.7541371158392436,
2848
+ "grad_norm": 3.9182815551757812,
2849
+ "learning_rate": 8.10952380952381e-06,
2850
+ "loss": 1.0062,
2851
+ "step": 397
2852
+ },
2853
+ {
2854
+ "epoch": 3.7635933806146573,
2855
+ "grad_norm": 10.651973724365234,
2856
+ "learning_rate": 8.104761904761906e-06,
2857
+ "loss": 0.8626,
2858
+ "step": 398
2859
+ },
2860
+ {
2861
+ "epoch": 3.773049645390071,
2862
+ "grad_norm": 10.012125968933105,
2863
+ "learning_rate": 8.1e-06,
2864
+ "loss": 0.9117,
2865
+ "step": 399
2866
+ },
2867
+ {
2868
+ "epoch": 3.7825059101654848,
2869
+ "grad_norm": 9.443008422851562,
2870
+ "learning_rate": 8.095238095238097e-06,
2871
+ "loss": 0.8736,
2872
+ "step": 400
2873
+ },
2874
+ {
2875
+ "epoch": 3.7825059101654848,
2876
+ "eval_f1_macro": 0.3032359897985208,
2877
+ "eval_loss": 0.9756927490234375,
2878
+ "eval_runtime": 13.9326,
2879
+ "eval_samples_per_second": 121.37,
2880
+ "eval_steps_per_second": 7.608,
2881
+ "step": 400
2882
+ },
2883
+ {
2884
+ "epoch": 3.7919621749408985,
2885
+ "grad_norm": 6.038490295410156,
2886
+ "learning_rate": 8.090476190476191e-06,
2887
+ "loss": 0.8401,
2888
+ "step": 401
2889
+ },
2890
+ {
2891
+ "epoch": 3.801418439716312,
2892
+ "grad_norm": 4.123776912689209,
2893
+ "learning_rate": 8.085714285714287e-06,
2894
+ "loss": 0.8569,
2895
+ "step": 402
2896
+ },
2897
+ {
2898
+ "epoch": 3.8108747044917255,
2899
+ "grad_norm": 3.2641398906707764,
2900
+ "learning_rate": 8.080952380952382e-06,
2901
+ "loss": 0.7882,
2902
+ "step": 403
2903
+ },
2904
+ {
2905
+ "epoch": 3.8203309692671397,
2906
+ "grad_norm": 7.038326263427734,
2907
+ "learning_rate": 8.076190476190476e-06,
2908
+ "loss": 0.7609,
2909
+ "step": 404
2910
+ },
2911
+ {
2912
+ "epoch": 3.829787234042553,
2913
+ "grad_norm": 10.111557960510254,
2914
+ "learning_rate": 8.071428571428572e-06,
2915
+ "loss": 0.853,
2916
+ "step": 405
2917
+ },
2918
+ {
2919
+ "epoch": 3.839243498817967,
2920
+ "grad_norm": 8.650494575500488,
2921
+ "learning_rate": 8.066666666666667e-06,
2922
+ "loss": 0.8044,
2923
+ "step": 406
2924
+ },
2925
+ {
2926
+ "epoch": 3.8486997635933804,
2927
+ "grad_norm": 10.700185775756836,
2928
+ "learning_rate": 8.061904761904763e-06,
2929
+ "loss": 0.9259,
2930
+ "step": 407
2931
+ },
2932
+ {
2933
+ "epoch": 3.8581560283687946,
2934
+ "grad_norm": 3.3702189922332764,
2935
+ "learning_rate": 8.057142857142857e-06,
2936
+ "loss": 0.7111,
2937
+ "step": 408
2938
+ },
2939
+ {
2940
+ "epoch": 3.867612293144208,
2941
+ "grad_norm": 4.435060024261475,
2942
+ "learning_rate": 8.052380952380952e-06,
2943
+ "loss": 0.7695,
2944
+ "step": 409
2945
+ },
2946
+ {
2947
+ "epoch": 3.877068557919622,
2948
+ "grad_norm": 8.08366870880127,
2949
+ "learning_rate": 8.047619047619048e-06,
2950
+ "loss": 0.843,
2951
+ "step": 410
2952
+ },
2953
+ {
2954
+ "epoch": 3.8865248226950353,
2955
+ "grad_norm": 6.264552116394043,
2956
+ "learning_rate": 8.042857142857143e-06,
2957
+ "loss": 0.8128,
2958
+ "step": 411
2959
+ },
2960
+ {
2961
+ "epoch": 3.895981087470449,
2962
+ "grad_norm": 4.465488433837891,
2963
+ "learning_rate": 8.038095238095239e-06,
2964
+ "loss": 0.7762,
2965
+ "step": 412
2966
+ },
2967
+ {
2968
+ "epoch": 3.9054373522458627,
2969
+ "grad_norm": 4.775088310241699,
2970
+ "learning_rate": 8.033333333333335e-06,
2971
+ "loss": 0.9124,
2972
+ "step": 413
2973
+ },
2974
+ {
2975
+ "epoch": 3.9148936170212765,
2976
+ "grad_norm": 3.0204665660858154,
2977
+ "learning_rate": 8.02857142857143e-06,
2978
+ "loss": 0.7446,
2979
+ "step": 414
2980
+ },
2981
+ {
2982
+ "epoch": 3.92434988179669,
2983
+ "grad_norm": 7.028670787811279,
2984
+ "learning_rate": 8.023809523809526e-06,
2985
+ "loss": 0.8979,
2986
+ "step": 415
2987
+ },
2988
+ {
2989
+ "epoch": 3.933806146572104,
2990
+ "grad_norm": 7.230231285095215,
2991
+ "learning_rate": 8.01904761904762e-06,
2992
+ "loss": 0.8861,
2993
+ "step": 416
2994
+ },
2995
+ {
2996
+ "epoch": 3.9432624113475176,
2997
+ "grad_norm": 6.362856388092041,
2998
+ "learning_rate": 8.014285714285715e-06,
2999
+ "loss": 0.8104,
3000
+ "step": 417
3001
+ },
3002
+ {
3003
+ "epoch": 3.9527186761229314,
3004
+ "grad_norm": 4.598600387573242,
3005
+ "learning_rate": 8.00952380952381e-06,
3006
+ "loss": 0.6737,
3007
+ "step": 418
3008
+ },
3009
+ {
3010
+ "epoch": 3.962174940898345,
3011
+ "grad_norm": 5.961977481842041,
3012
+ "learning_rate": 8.004761904761905e-06,
3013
+ "loss": 0.8154,
3014
+ "step": 419
3015
+ },
3016
+ {
3017
+ "epoch": 3.971631205673759,
3018
+ "grad_norm": 4.175229072570801,
3019
+ "learning_rate": 8.000000000000001e-06,
3020
+ "loss": 0.7205,
3021
+ "step": 420
3022
+ },
3023
+ {
3024
+ "epoch": 3.9810874704491725,
3025
+ "grad_norm": 6.275330543518066,
3026
+ "learning_rate": 7.995238095238096e-06,
3027
+ "loss": 0.8317,
3028
+ "step": 421
3029
+ },
3030
+ {
3031
+ "epoch": 3.9905437352245863,
3032
+ "grad_norm": 9.173235893249512,
3033
+ "learning_rate": 7.99047619047619e-06,
3034
+ "loss": 0.76,
3035
+ "step": 422
3036
+ },
3037
+ {
3038
+ "epoch": 4.0,
3039
+ "grad_norm": 8.785587310791016,
3040
+ "learning_rate": 7.985714285714286e-06,
3041
+ "loss": 0.9516,
3042
+ "step": 423
3043
+ },
3044
+ {
3045
+ "epoch": 4.009456264775413,
3046
+ "grad_norm": 7.507620334625244,
3047
+ "learning_rate": 7.980952380952381e-06,
3048
+ "loss": 0.7683,
3049
+ "step": 424
3050
+ },
3051
+ {
3052
+ "epoch": 4.0189125295508275,
3053
+ "grad_norm": 5.261313438415527,
3054
+ "learning_rate": 7.976190476190477e-06,
3055
+ "loss": 0.7983,
3056
+ "step": 425
3057
+ },
3058
+ {
3059
+ "epoch": 4.028368794326241,
3060
+ "grad_norm": 6.864541053771973,
3061
+ "learning_rate": 7.971428571428572e-06,
3062
+ "loss": 0.9051,
3063
+ "step": 426
3064
+ },
3065
+ {
3066
+ "epoch": 4.037825059101655,
3067
+ "grad_norm": 5.394888401031494,
3068
+ "learning_rate": 7.966666666666668e-06,
3069
+ "loss": 0.6639,
3070
+ "step": 427
3071
+ },
3072
+ {
3073
+ "epoch": 4.047281323877068,
3074
+ "grad_norm": 7.618609428405762,
3075
+ "learning_rate": 7.961904761904762e-06,
3076
+ "loss": 0.8791,
3077
+ "step": 428
3078
+ },
3079
+ {
3080
+ "epoch": 4.056737588652482,
3081
+ "grad_norm": 7.359228134155273,
3082
+ "learning_rate": 7.957142857142858e-06,
3083
+ "loss": 0.9106,
3084
+ "step": 429
3085
+ },
3086
+ {
3087
+ "epoch": 4.066193853427896,
3088
+ "grad_norm": 3.4456787109375,
3089
+ "learning_rate": 7.952380952380953e-06,
3090
+ "loss": 0.7331,
3091
+ "step": 430
3092
+ },
3093
+ {
3094
+ "epoch": 4.07565011820331,
3095
+ "grad_norm": 4.019674777984619,
3096
+ "learning_rate": 7.947619047619049e-06,
3097
+ "loss": 0.7135,
3098
+ "step": 431
3099
+ },
3100
+ {
3101
+ "epoch": 4.085106382978723,
3102
+ "grad_norm": 6.423426151275635,
3103
+ "learning_rate": 7.942857142857144e-06,
3104
+ "loss": 0.7535,
3105
+ "step": 432
3106
+ },
3107
+ {
3108
+ "epoch": 4.094562647754137,
3109
+ "grad_norm": 6.142768383026123,
3110
+ "learning_rate": 7.93809523809524e-06,
3111
+ "loss": 0.7536,
3112
+ "step": 433
3113
+ },
3114
+ {
3115
+ "epoch": 4.1040189125295505,
3116
+ "grad_norm": 5.819328784942627,
3117
+ "learning_rate": 7.933333333333334e-06,
3118
+ "loss": 0.694,
3119
+ "step": 434
3120
+ },
3121
+ {
3122
+ "epoch": 4.113475177304965,
3123
+ "grad_norm": 8.63968276977539,
3124
+ "learning_rate": 7.928571428571429e-06,
3125
+ "loss": 0.6545,
3126
+ "step": 435
3127
+ },
3128
+ {
3129
+ "epoch": 4.122931442080378,
3130
+ "grad_norm": 3.8477113246917725,
3131
+ "learning_rate": 7.923809523809525e-06,
3132
+ "loss": 0.7708,
3133
+ "step": 436
3134
+ },
3135
+ {
3136
+ "epoch": 4.132387706855792,
3137
+ "grad_norm": 7.744399070739746,
3138
+ "learning_rate": 7.91904761904762e-06,
3139
+ "loss": 0.6903,
3140
+ "step": 437
3141
+ },
3142
+ {
3143
+ "epoch": 4.141843971631205,
3144
+ "grad_norm": 4.299695014953613,
3145
+ "learning_rate": 7.914285714285715e-06,
3146
+ "loss": 0.7932,
3147
+ "step": 438
3148
+ },
3149
+ {
3150
+ "epoch": 4.15130023640662,
3151
+ "grad_norm": 4.58480978012085,
3152
+ "learning_rate": 7.90952380952381e-06,
3153
+ "loss": 0.7599,
3154
+ "step": 439
3155
+ },
3156
+ {
3157
+ "epoch": 4.160756501182033,
3158
+ "grad_norm": 4.495441436767578,
3159
+ "learning_rate": 7.904761904761904e-06,
3160
+ "loss": 0.8248,
3161
+ "step": 440
3162
+ },
3163
+ {
3164
+ "epoch": 4.170212765957447,
3165
+ "grad_norm": 9.265257835388184,
3166
+ "learning_rate": 7.9e-06,
3167
+ "loss": 0.8237,
3168
+ "step": 441
3169
+ },
3170
+ {
3171
+ "epoch": 4.17966903073286,
3172
+ "grad_norm": 15.275799751281738,
3173
+ "learning_rate": 7.895238095238095e-06,
3174
+ "loss": 0.7988,
3175
+ "step": 442
3176
+ },
3177
+ {
3178
+ "epoch": 4.1891252955082745,
3179
+ "grad_norm": 13.653985023498535,
3180
+ "learning_rate": 7.890476190476191e-06,
3181
+ "loss": 0.6772,
3182
+ "step": 443
3183
+ },
3184
+ {
3185
+ "epoch": 4.198581560283688,
3186
+ "grad_norm": 7.599392890930176,
3187
+ "learning_rate": 7.885714285714286e-06,
3188
+ "loss": 0.8561,
3189
+ "step": 444
3190
+ },
3191
+ {
3192
+ "epoch": 4.208037825059102,
3193
+ "grad_norm": 4.898796081542969,
3194
+ "learning_rate": 7.880952380952382e-06,
3195
+ "loss": 0.7731,
3196
+ "step": 445
3197
+ },
3198
+ {
3199
+ "epoch": 4.217494089834515,
3200
+ "grad_norm": 10.565406799316406,
3201
+ "learning_rate": 7.876190476190478e-06,
3202
+ "loss": 0.7069,
3203
+ "step": 446
3204
+ },
3205
+ {
3206
+ "epoch": 4.226950354609929,
3207
+ "grad_norm": 14.41064167022705,
3208
+ "learning_rate": 7.871428571428573e-06,
3209
+ "loss": 0.9041,
3210
+ "step": 447
3211
+ },
3212
+ {
3213
+ "epoch": 4.236406619385343,
3214
+ "grad_norm": 10.0387544631958,
3215
+ "learning_rate": 7.866666666666667e-06,
3216
+ "loss": 0.7182,
3217
+ "step": 448
3218
+ },
3219
+ {
3220
+ "epoch": 4.245862884160757,
3221
+ "grad_norm": 8.706192016601562,
3222
+ "learning_rate": 7.861904761904763e-06,
3223
+ "loss": 0.7909,
3224
+ "step": 449
3225
+ },
3226
+ {
3227
+ "epoch": 4.25531914893617,
3228
+ "grad_norm": 4.160794734954834,
3229
+ "learning_rate": 7.857142857142858e-06,
3230
+ "loss": 0.7255,
3231
+ "step": 450
3232
+ },
3233
+ {
3234
+ "epoch": 4.25531914893617,
3235
+ "eval_f1_macro": 0.2963613043601986,
3236
+ "eval_loss": 1.0044952630996704,
3237
+ "eval_runtime": 13.9352,
3238
+ "eval_samples_per_second": 121.347,
3239
+ "eval_steps_per_second": 7.607,
3240
+ "step": 450
3241
+ }
3242
+ ],
3243
+ "logging_steps": 1,
3244
+ "max_steps": 2100,
3245
+ "num_input_tokens_seen": 0,
3246
+ "num_train_epochs": 20,
3247
+ "save_steps": 50,
3248
+ "stateful_callbacks": {
3249
+ "TrainerControl": {
3250
+ "args": {
3251
+ "should_epoch_stop": false,
3252
+ "should_evaluate": false,
3253
+ "should_log": false,
3254
+ "should_save": true,
3255
+ "should_training_stop": false
3256
+ },
3257
+ "attributes": {}
3258
+ }
3259
+ },
3260
+ "total_flos": 1.8945034676158464e+16,
3261
+ "train_batch_size": 32,
3262
+ "trial_name": null,
3263
+ "trial_params": null
3264
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a4b16aac786647762b25afc3632de284987370ea7a8627dc15ff4a46a03a4e6
3
+ size 5112