DewiBrynJones committed on
Commit 7578b01
1 Parent(s): 6d7256e

hyfforddi gyda rhagor o ddata / train with more data

.gitattributes CHANGED
@@ -31,3 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -83,15 +83,15 @@ The model achieves the following F1 scores for the different punctuation markers
 
  | Label | Precision | Recall | f1-score | Support |
  | ------------- | ----- | ----- | ----- | ----- |
- | 0 | 0.99 | 0.99 | 0.99 | 5053572 |
- | . | 0.89 | 0.88 | 0.88 | 224920 |
- | , | 0.83 | 0.82 | 0.82 | 363886 |
- | ? | 0.91 | 0.87 | 0.89 | 20762 |
- | - | 0.95 | 0.94 | 0.94 | 13161 |
- | : | 0.92 | 0.89 | 0.90 | 5274 |
- | | | | | |
- | accuracy | | | 0.98 | 11012581 |
- | macro average | 0.92 | 0.90 | 0.91 | 11012581 |
- | weighted average | 0.98 | 0.98 | 0.98 | 11012581 |
+ | 0 | 0.99 | 0.99 | 0.99 | 12124280 |
+ | . | 0.88 | 0.89 | 0.88 | 455896 |
+ | , | 0.84 | 0.82 | 0.83 | 771813 |
+ | ? | 0.92 | 0.88 | 0.90 | 54878 |
+ | - | 0.95 | 0.94 | 0.95 | 31545 |
+ | : | 0.91 | 0.87 | 0.89 | 39618 |
+ | | | | | |
+ | accuracy | | | 0.98 | 13478030 |
+ | macro avg | 0.91 | 0.90 | 0.91 | 13478030 |
+ | weighted avg | 0.97 | 0.98 | 0.97 | 13478030 |
 
  ##
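
Note (not part of the commit): the table above has the shape of a standard per-label precision/recall/F1 report over the punctuation labels, with "0" meaning "no punctuation after the token". A minimal sketch of how such a report is typically produced with scikit-learn follows; the flattened `y_true`/`y_pred` sequences are hypothetical placeholders, not data from this repository.

```python
# Minimal sketch: producing a per-label precision/recall/F1 table like the
# one in the README. The label strings mirror the table; the example
# sequences are placeholders, not the model's actual predictions.
from sklearn.metrics import classification_report

labels = ["0", ".", ",", "?", "-", ":"]   # "0" = no punctuation mark

y_true = ["0", "0", ",", "0", "."]        # placeholder reference labels
y_pred = ["0", "0", ",", "0", "?"]        # placeholder predicted labels

print(classification_report(y_true, y_pred, labels=labels, digits=2))
```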
config.json CHANGED
@@ -37,7 +37,7 @@
   "pad_token_id": 1,
   "position_embedding_type": "absolute",
   "torch_dtype": "float32",
- "transformers_version": "4.23.1",
+ "transformers_version": "4.38.2",
   "type_vocab_size": 1,
   "use_cache": true,
   "vocab_size": 250002
pytorch_model.bin → model.safetensors RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e822b3fca11cbb329f3dd7714e9112e5b0f0c81c5e9a9822e2b1723e773acc28
- size 2235524465
+ oid sha256:8451f7541aa39231946b6b295e7664c5f79779915d9a75c607fbb9f69739a8c7
+ size 2235436456
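
Aside (illustrative only): the LFS pointer change reflects the weights being re-serialized from the pickle-based pytorch_model.bin to model.safetensors, which is why the SHA-256 and the size change slightly while the parameters stay the same. A minimal sketch of one common way such a conversion is done, assuming a token-classification head and local paths; neither is taken from the commit itself:

```python
# Sketch: re-saving a pytorch_model.bin checkpoint in safetensors format.
# Paths and the task head are assumptions for illustration.
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained("./old_checkpoint")
model.save_pretrained("./converted_checkpoint", safe_serialization=True)
# Recent transformers releases (including 4.38.x) default to
# safe_serialization=True, consistent with the version bump in config.json.
```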
model_final_suite_results_task2.json ADDED
@@ -0,0 +1,2370 @@
1
+ {
2
+ "tests": {
3
+ "22": {
4
+ "id": 22,
5
+ "task": 2,
6
+ "model": "xlm-roberta-large",
7
+ "languages": [
8
+ "cy"
9
+ ],
10
+ "augmentation": [
11
+ ""
12
+ ],
13
+ "data_percentage": 1,
14
+ "use_token_type_ids": false,
15
+ "tokenizer_config": {
16
+ "strip_accent": false,
17
+ "add_prefix_space": true
18
+ },
19
+ "opimizer_config": {
20
+ "adafactor": true,
21
+ "num_train_epochs": 2
22
+ },
23
+ "result": [
24
+ {
25
+ "loss": 1.882,
26
+ "grad_norm": Infinity,
27
+ "learning_rate": 0.0,
28
+ "epoch": 0.0,
29
+ "step": 1
30
+ },
31
+ {
32
+ "loss": 0.5285,
33
+ "grad_norm": 2.4953722953796387,
34
+ "learning_rate": 3.99453087019932e-05,
35
+ "epoch": 0.01,
36
+ "step": 100
37
+ },
38
+ {
39
+ "loss": 0.1702,
40
+ "grad_norm": 1.8322360515594482,
41
+ "learning_rate": 3.982377248420029e-05,
42
+ "epoch": 0.01,
43
+ "step": 200
44
+ },
45
+ {
46
+ "loss": 0.1472,
47
+ "grad_norm": 1.7121275663375854,
48
+ "learning_rate": 3.970223626640739e-05,
49
+ "epoch": 0.02,
50
+ "step": 300
51
+ },
52
+ {
53
+ "loss": 0.1342,
54
+ "grad_norm": 1.7097556591033936,
55
+ "learning_rate": 3.958070004861449e-05,
56
+ "epoch": 0.02,
57
+ "step": 400
58
+ },
59
+ {
60
+ "loss": 0.1288,
61
+ "grad_norm": 1.57424795627594,
62
+ "learning_rate": 3.9459163830821586e-05,
63
+ "epoch": 0.03,
64
+ "step": 500
65
+ },
66
+ {
67
+ "loss": 0.1247,
68
+ "grad_norm": 1.7552311420440674,
69
+ "learning_rate": 3.9337627613028686e-05,
70
+ "epoch": 0.04,
71
+ "step": 600
72
+ },
73
+ {
74
+ "loss": 0.1218,
75
+ "grad_norm": 1.6224812269210815,
76
+ "learning_rate": 3.9216091395235786e-05,
77
+ "epoch": 0.04,
78
+ "step": 700
79
+ },
80
+ {
81
+ "loss": 0.1176,
82
+ "grad_norm": 1.8368713855743408,
83
+ "learning_rate": 3.909455517744288e-05,
84
+ "epoch": 0.05,
85
+ "step": 800
86
+ },
87
+ {
88
+ "loss": 0.1119,
89
+ "grad_norm": 1.4631482362747192,
90
+ "learning_rate": 3.897301895964998e-05,
91
+ "epoch": 0.05,
92
+ "step": 900
93
+ },
94
+ {
95
+ "loss": 0.1098,
96
+ "grad_norm": 1.2774118185043335,
97
+ "learning_rate": 3.885148274185708e-05,
98
+ "epoch": 0.06,
99
+ "step": 1000
100
+ },
101
+ {
102
+ "loss": 0.1083,
103
+ "grad_norm": 1.187245488166809,
104
+ "learning_rate": 3.872994652406417e-05,
105
+ "epoch": 0.07,
106
+ "step": 1100
107
+ },
108
+ {
109
+ "loss": 0.1075,
110
+ "grad_norm": 1.6492900848388672,
111
+ "learning_rate": 3.860841030627127e-05,
112
+ "epoch": 0.07,
113
+ "step": 1200
114
+ },
115
+ {
116
+ "loss": 0.107,
117
+ "grad_norm": 1.4514034986495972,
118
+ "learning_rate": 3.8486874088478366e-05,
119
+ "epoch": 0.08,
120
+ "step": 1300
121
+ },
122
+ {
123
+ "loss": 0.1036,
124
+ "grad_norm": 1.0488823652267456,
125
+ "learning_rate": 3.8365337870685466e-05,
126
+ "epoch": 0.08,
127
+ "step": 1400
128
+ },
129
+ {
130
+ "loss": 0.1021,
131
+ "grad_norm": 1.5489355325698853,
132
+ "learning_rate": 3.8243801652892566e-05,
133
+ "epoch": 0.09,
134
+ "step": 1500
135
+ },
136
+ {
137
+ "loss": 0.1008,
138
+ "grad_norm": 1.2730894088745117,
139
+ "learning_rate": 3.812226543509966e-05,
140
+ "epoch": 0.1,
141
+ "step": 1600
142
+ },
143
+ {
144
+ "loss": 0.1004,
145
+ "grad_norm": 1.6920459270477295,
146
+ "learning_rate": 3.800072921730676e-05,
147
+ "epoch": 0.1,
148
+ "step": 1700
149
+ },
150
+ {
151
+ "loss": 0.1006,
152
+ "grad_norm": 0.9863981008529663,
153
+ "learning_rate": 3.787919299951386e-05,
154
+ "epoch": 0.11,
155
+ "step": 1800
156
+ },
157
+ {
158
+ "loss": 0.0982,
159
+ "grad_norm": 0.9981995820999146,
160
+ "learning_rate": 3.775765678172095e-05,
161
+ "epoch": 0.12,
162
+ "step": 1900
163
+ },
164
+ {
165
+ "loss": 0.0975,
166
+ "grad_norm": 1.021620273590088,
167
+ "learning_rate": 3.763612056392805e-05,
168
+ "epoch": 0.12,
169
+ "step": 2000
170
+ },
171
+ {
172
+ "loss": 0.0989,
173
+ "grad_norm": 1.2811397314071655,
174
+ "learning_rate": 3.751458434613515e-05,
175
+ "epoch": 0.13,
176
+ "step": 2100
177
+ },
178
+ {
179
+ "loss": 0.0959,
180
+ "grad_norm": 1.5976190567016602,
181
+ "learning_rate": 3.7393048128342246e-05,
182
+ "epoch": 0.13,
183
+ "step": 2200
184
+ },
185
+ {
186
+ "loss": 0.0961,
187
+ "grad_norm": 0.9754481911659241,
188
+ "learning_rate": 3.7271511910549346e-05,
189
+ "epoch": 0.14,
190
+ "step": 2300
191
+ },
192
+ {
193
+ "loss": 0.0956,
194
+ "grad_norm": 0.9418678283691406,
195
+ "learning_rate": 3.7149975692756447e-05,
196
+ "epoch": 0.15,
197
+ "step": 2400
198
+ },
199
+ {
200
+ "loss": 0.0954,
201
+ "grad_norm": 1.294745922088623,
202
+ "learning_rate": 3.702843947496354e-05,
203
+ "epoch": 0.15,
204
+ "step": 2500
205
+ },
206
+ {
207
+ "loss": 0.0943,
208
+ "grad_norm": 1.3049461841583252,
209
+ "learning_rate": 3.690690325717064e-05,
210
+ "epoch": 0.16,
211
+ "step": 2600
212
+ },
213
+ {
214
+ "loss": 0.0936,
215
+ "grad_norm": 1.1144427061080933,
216
+ "learning_rate": 3.678536703937774e-05,
217
+ "epoch": 0.16,
218
+ "step": 2700
219
+ },
220
+ {
221
+ "loss": 0.0939,
222
+ "grad_norm": 1.3424856662750244,
223
+ "learning_rate": 3.666383082158483e-05,
224
+ "epoch": 0.17,
225
+ "step": 2800
226
+ },
227
+ {
228
+ "loss": 0.0947,
229
+ "grad_norm": 1.123299241065979,
230
+ "learning_rate": 3.6542294603791933e-05,
231
+ "epoch": 0.18,
232
+ "step": 2900
233
+ },
234
+ {
235
+ "loss": 0.0932,
236
+ "grad_norm": 1.456009864807129,
237
+ "learning_rate": 3.642075838599903e-05,
238
+ "epoch": 0.18,
239
+ "step": 3000
240
+ },
241
+ {
242
+ "loss": 0.0927,
243
+ "grad_norm": 1.4363266229629517,
244
+ "learning_rate": 3.629922216820613e-05,
245
+ "epoch": 0.19,
246
+ "step": 3100
247
+ },
248
+ {
249
+ "loss": 0.0907,
250
+ "grad_norm": 0.7776892185211182,
251
+ "learning_rate": 3.617768595041323e-05,
252
+ "epoch": 0.19,
253
+ "step": 3200
254
+ },
255
+ {
256
+ "loss": 0.092,
257
+ "grad_norm": 25.731966018676758,
258
+ "learning_rate": 3.605614973262032e-05,
259
+ "epoch": 0.2,
260
+ "step": 3300
261
+ },
262
+ {
263
+ "loss": 0.091,
264
+ "grad_norm": 0.9259088039398193,
265
+ "learning_rate": 3.593461351482742e-05,
266
+ "epoch": 0.21,
267
+ "step": 3400
268
+ },
269
+ {
270
+ "loss": 0.0915,
271
+ "grad_norm": 0.851094663143158,
272
+ "learning_rate": 3.581307729703452e-05,
273
+ "epoch": 0.21,
274
+ "step": 3500
275
+ },
276
+ {
277
+ "loss": 0.0902,
278
+ "grad_norm": 1.5700650215148926,
279
+ "learning_rate": 3.5691541079241614e-05,
280
+ "epoch": 0.22,
281
+ "step": 3600
282
+ },
283
+ {
284
+ "loss": 0.0888,
285
+ "grad_norm": 1.13387930393219,
286
+ "learning_rate": 3.5570004861448714e-05,
287
+ "epoch": 0.22,
288
+ "step": 3700
289
+ },
290
+ {
291
+ "loss": 0.089,
292
+ "grad_norm": 1.2357937097549438,
293
+ "learning_rate": 3.5448468643655814e-05,
294
+ "epoch": 0.23,
295
+ "step": 3800
296
+ },
297
+ {
298
+ "loss": 0.0898,
299
+ "grad_norm": 0.9063655734062195,
300
+ "learning_rate": 3.532693242586291e-05,
301
+ "epoch": 0.24,
302
+ "step": 3900
303
+ },
304
+ {
305
+ "loss": 0.0893,
306
+ "grad_norm": 1.1259723901748657,
307
+ "learning_rate": 3.520539620807001e-05,
308
+ "epoch": 0.24,
309
+ "step": 4000
310
+ },
311
+ {
312
+ "loss": 0.0889,
313
+ "grad_norm": 0.8327601552009583,
314
+ "learning_rate": 3.508385999027711e-05,
315
+ "epoch": 0.25,
316
+ "step": 4100
317
+ },
318
+ {
319
+ "loss": 0.0862,
320
+ "grad_norm": 1.2368316650390625,
321
+ "learning_rate": 3.49623237724842e-05,
322
+ "epoch": 0.25,
323
+ "step": 4200
324
+ },
325
+ {
326
+ "loss": 0.0867,
327
+ "grad_norm": 1.1474043130874634,
328
+ "learning_rate": 3.48407875546913e-05,
329
+ "epoch": 0.26,
330
+ "step": 4300
331
+ },
332
+ {
333
+ "loss": 0.0858,
334
+ "grad_norm": 0.6887868046760559,
335
+ "learning_rate": 3.47192513368984e-05,
336
+ "epoch": 0.27,
337
+ "step": 4400
338
+ },
339
+ {
340
+ "loss": 0.0877,
341
+ "grad_norm": 0.8170347809791565,
342
+ "learning_rate": 3.4597715119105494e-05,
343
+ "epoch": 0.27,
344
+ "step": 4500
345
+ },
346
+ {
347
+ "loss": 0.0871,
348
+ "grad_norm": 0.7361243367195129,
349
+ "learning_rate": 3.4476178901312594e-05,
350
+ "epoch": 0.28,
351
+ "step": 4600
352
+ },
353
+ {
354
+ "loss": 0.0878,
355
+ "grad_norm": 1.0975162982940674,
356
+ "learning_rate": 3.435464268351969e-05,
357
+ "epoch": 0.29,
358
+ "step": 4700
359
+ },
360
+ {
361
+ "loss": 0.0863,
362
+ "grad_norm": 0.931176483631134,
363
+ "learning_rate": 3.4233106465726794e-05,
364
+ "epoch": 0.29,
365
+ "step": 4800
366
+ },
367
+ {
368
+ "loss": 0.0853,
369
+ "grad_norm": 1.0259523391723633,
370
+ "learning_rate": 3.411157024793389e-05,
371
+ "epoch": 0.3,
372
+ "step": 4900
373
+ },
374
+ {
375
+ "loss": 0.0876,
376
+ "grad_norm": 1.1680504083633423,
377
+ "learning_rate": 3.399003403014098e-05,
378
+ "epoch": 0.3,
379
+ "step": 5000
380
+ },
381
+ {
382
+ "loss": 0.0855,
383
+ "grad_norm": 1.2358198165893555,
384
+ "learning_rate": 3.386849781234809e-05,
385
+ "epoch": 0.31,
386
+ "step": 5100
387
+ },
388
+ {
389
+ "loss": 0.085,
390
+ "grad_norm": 0.8484376668930054,
391
+ "learning_rate": 3.374696159455518e-05,
392
+ "epoch": 0.32,
393
+ "step": 5200
394
+ },
395
+ {
396
+ "loss": 0.085,
397
+ "grad_norm": 1.5419291257858276,
398
+ "learning_rate": 3.3625425376762274e-05,
399
+ "epoch": 0.32,
400
+ "step": 5300
401
+ },
402
+ {
403
+ "loss": 0.0849,
404
+ "grad_norm": 1.0334900617599487,
405
+ "learning_rate": 3.3503889158969374e-05,
406
+ "epoch": 0.33,
407
+ "step": 5400
408
+ },
409
+ {
410
+ "loss": 0.0854,
411
+ "grad_norm": 1.0367408990859985,
412
+ "learning_rate": 3.3382352941176474e-05,
413
+ "epoch": 0.33,
414
+ "step": 5500
415
+ },
416
+ {
417
+ "loss": 0.0853,
418
+ "grad_norm": 0.8429509401321411,
419
+ "learning_rate": 3.326081672338357e-05,
420
+ "epoch": 0.34,
421
+ "step": 5600
422
+ },
423
+ {
424
+ "loss": 0.086,
425
+ "grad_norm": 0.9059005379676819,
426
+ "learning_rate": 3.313928050559067e-05,
427
+ "epoch": 0.35,
428
+ "step": 5700
429
+ },
430
+ {
431
+ "loss": 0.0846,
432
+ "grad_norm": 1.1803362369537354,
433
+ "learning_rate": 3.301774428779777e-05,
434
+ "epoch": 0.35,
435
+ "step": 5800
436
+ },
437
+ {
438
+ "loss": 0.0817,
439
+ "grad_norm": 0.7263641357421875,
440
+ "learning_rate": 3.289620807000487e-05,
441
+ "epoch": 0.36,
442
+ "step": 5900
443
+ },
444
+ {
445
+ "loss": 0.0831,
446
+ "grad_norm": 0.8227238655090332,
447
+ "learning_rate": 3.277467185221196e-05,
448
+ "epoch": 0.36,
449
+ "step": 6000
450
+ },
451
+ {
452
+ "loss": 0.0839,
453
+ "grad_norm": 1.0349544286727905,
454
+ "learning_rate": 3.2653135634419055e-05,
455
+ "epoch": 0.37,
456
+ "step": 6100
457
+ },
458
+ {
459
+ "loss": 0.0827,
460
+ "grad_norm": 0.8446714282035828,
461
+ "learning_rate": 3.253159941662616e-05,
462
+ "epoch": 0.38,
463
+ "step": 6200
464
+ },
465
+ {
466
+ "loss": 0.082,
467
+ "grad_norm": 1.1419836282730103,
468
+ "learning_rate": 3.2410063198833255e-05,
469
+ "epoch": 0.38,
470
+ "step": 6300
471
+ },
472
+ {
473
+ "loss": 0.0812,
474
+ "grad_norm": 0.9505990147590637,
475
+ "learning_rate": 3.228852698104035e-05,
476
+ "epoch": 0.39,
477
+ "step": 6400
478
+ },
479
+ {
480
+ "loss": 0.0806,
481
+ "grad_norm": 1.0036993026733398,
482
+ "learning_rate": 3.2166990763247455e-05,
483
+ "epoch": 0.39,
484
+ "step": 6500
485
+ },
486
+ {
487
+ "loss": 0.0819,
488
+ "grad_norm": 0.7694116234779358,
489
+ "learning_rate": 3.204545454545455e-05,
490
+ "epoch": 0.4,
491
+ "step": 6600
492
+ },
493
+ {
494
+ "loss": 0.0818,
495
+ "grad_norm": 0.7389699220657349,
496
+ "learning_rate": 3.192391832766165e-05,
497
+ "epoch": 0.41,
498
+ "step": 6700
499
+ },
500
+ {
501
+ "loss": 0.0829,
502
+ "grad_norm": 0.8264873623847961,
503
+ "learning_rate": 3.180238210986874e-05,
504
+ "epoch": 0.41,
505
+ "step": 6800
506
+ },
507
+ {
508
+ "loss": 0.0849,
509
+ "grad_norm": 0.8844084143638611,
510
+ "learning_rate": 3.168084589207584e-05,
511
+ "epoch": 0.42,
512
+ "step": 6900
513
+ },
514
+ {
515
+ "loss": 0.0816,
516
+ "grad_norm": 0.8728023171424866,
517
+ "learning_rate": 3.155930967428294e-05,
518
+ "epoch": 0.42,
519
+ "step": 7000
520
+ },
521
+ {
522
+ "loss": 0.0799,
523
+ "grad_norm": 1.218404769897461,
524
+ "learning_rate": 3.1437773456490035e-05,
525
+ "epoch": 0.43,
526
+ "step": 7100
527
+ },
528
+ {
529
+ "loss": 0.0797,
530
+ "grad_norm": 0.7085688710212708,
531
+ "learning_rate": 3.1316237238697135e-05,
532
+ "epoch": 0.44,
533
+ "step": 7200
534
+ },
535
+ {
536
+ "loss": 0.0795,
537
+ "grad_norm": 0.8446517586708069,
538
+ "learning_rate": 3.1194701020904235e-05,
539
+ "epoch": 0.44,
540
+ "step": 7300
541
+ },
542
+ {
543
+ "loss": 0.0817,
544
+ "grad_norm": 1.3226453065872192,
545
+ "learning_rate": 3.107316480311133e-05,
546
+ "epoch": 0.45,
547
+ "step": 7400
548
+ },
549
+ {
550
+ "loss": 0.0816,
551
+ "grad_norm": 0.7685155868530273,
552
+ "learning_rate": 3.095162858531843e-05,
553
+ "epoch": 0.46,
554
+ "step": 7500
555
+ },
556
+ {
557
+ "loss": 0.0806,
558
+ "grad_norm": 0.7135798335075378,
559
+ "learning_rate": 3.083009236752553e-05,
560
+ "epoch": 0.46,
561
+ "step": 7600
562
+ },
563
+ {
564
+ "loss": 0.0795,
565
+ "grad_norm": 1.0276037454605103,
566
+ "learning_rate": 3.070855614973262e-05,
567
+ "epoch": 0.47,
568
+ "step": 7700
569
+ },
570
+ {
571
+ "loss": 0.081,
572
+ "grad_norm": 1.1788092851638794,
573
+ "learning_rate": 3.058701993193972e-05,
574
+ "epoch": 0.47,
575
+ "step": 7800
576
+ },
577
+ {
578
+ "loss": 0.0791,
579
+ "grad_norm": 1.0305782556533813,
580
+ "learning_rate": 3.046548371414682e-05,
581
+ "epoch": 0.48,
582
+ "step": 7900
583
+ },
584
+ {
585
+ "loss": 0.0805,
586
+ "grad_norm": 1.4414223432540894,
587
+ "learning_rate": 3.0343947496353915e-05,
588
+ "epoch": 0.49,
589
+ "step": 8000
590
+ },
591
+ {
592
+ "loss": 0.0799,
593
+ "grad_norm": 0.8137165904045105,
594
+ "learning_rate": 3.0222411278561012e-05,
595
+ "epoch": 0.49,
596
+ "step": 8100
597
+ },
598
+ {
599
+ "loss": 0.08,
600
+ "grad_norm": 1.1238079071044922,
601
+ "learning_rate": 3.0100875060768112e-05,
602
+ "epoch": 0.5,
603
+ "step": 8200
604
+ },
605
+ {
606
+ "loss": 0.0792,
607
+ "grad_norm": 0.9724037647247314,
608
+ "learning_rate": 2.997933884297521e-05,
609
+ "epoch": 0.5,
610
+ "step": 8300
611
+ },
612
+ {
613
+ "loss": 0.0793,
614
+ "grad_norm": 1.0247116088867188,
615
+ "learning_rate": 2.9857802625182306e-05,
616
+ "epoch": 0.51,
617
+ "step": 8400
618
+ },
619
+ {
620
+ "loss": 0.0783,
621
+ "grad_norm": 1.454062581062317,
622
+ "learning_rate": 2.9737481769567335e-05,
623
+ "epoch": 0.52,
624
+ "step": 8500
625
+ },
626
+ {
627
+ "loss": 0.0788,
628
+ "grad_norm": 0.7570217251777649,
629
+ "learning_rate": 2.961594555177443e-05,
630
+ "epoch": 0.52,
631
+ "step": 8600
632
+ },
633
+ {
634
+ "loss": 0.0768,
635
+ "grad_norm": 1.1738083362579346,
636
+ "learning_rate": 2.9494409333981528e-05,
637
+ "epoch": 0.53,
638
+ "step": 8700
639
+ },
640
+ {
641
+ "loss": 0.0778,
642
+ "grad_norm": 0.7776427268981934,
643
+ "learning_rate": 2.9372873116188625e-05,
644
+ "epoch": 0.53,
645
+ "step": 8800
646
+ },
647
+ {
648
+ "loss": 0.0763,
649
+ "grad_norm": 1.226198673248291,
650
+ "learning_rate": 2.9251336898395725e-05,
651
+ "epoch": 0.54,
652
+ "step": 8900
653
+ },
654
+ {
655
+ "loss": 0.0761,
656
+ "grad_norm": 0.8859773874282837,
657
+ "learning_rate": 2.912980068060282e-05,
658
+ "epoch": 0.55,
659
+ "step": 9000
660
+ },
661
+ {
662
+ "loss": 0.0765,
663
+ "grad_norm": 1.0220259428024292,
664
+ "learning_rate": 2.9008264462809918e-05,
665
+ "epoch": 0.55,
666
+ "step": 9100
667
+ },
668
+ {
669
+ "loss": 0.0777,
670
+ "grad_norm": 1.0430243015289307,
671
+ "learning_rate": 2.888672824501702e-05,
672
+ "epoch": 0.56,
673
+ "step": 9200
674
+ },
675
+ {
676
+ "loss": 0.0775,
677
+ "grad_norm": 1.1380356550216675,
678
+ "learning_rate": 2.8765192027224115e-05,
679
+ "epoch": 0.56,
680
+ "step": 9300
681
+ },
682
+ {
683
+ "loss": 0.0775,
684
+ "grad_norm": 0.6778531670570374,
685
+ "learning_rate": 2.8643655809431212e-05,
686
+ "epoch": 0.57,
687
+ "step": 9400
688
+ },
689
+ {
690
+ "loss": 0.0782,
691
+ "grad_norm": 1.0413175821304321,
692
+ "learning_rate": 2.852211959163831e-05,
693
+ "epoch": 0.58,
694
+ "step": 9500
695
+ },
696
+ {
697
+ "loss": 0.0791,
698
+ "grad_norm": 1.1399835348129272,
699
+ "learning_rate": 2.840058337384541e-05,
700
+ "epoch": 0.58,
701
+ "step": 9600
702
+ },
703
+ {
704
+ "loss": 0.0763,
705
+ "grad_norm": 0.968399703502655,
706
+ "learning_rate": 2.8279047156052505e-05,
707
+ "epoch": 0.59,
708
+ "step": 9700
709
+ },
710
+ {
711
+ "loss": 0.0763,
712
+ "grad_norm": 1.0254497528076172,
713
+ "learning_rate": 2.8157510938259602e-05,
714
+ "epoch": 0.59,
715
+ "step": 9800
716
+ },
717
+ {
718
+ "loss": 0.0771,
719
+ "grad_norm": 0.8642473220825195,
720
+ "learning_rate": 2.8035974720466702e-05,
721
+ "epoch": 0.6,
722
+ "step": 9900
723
+ },
724
+ {
725
+ "loss": 0.0772,
726
+ "grad_norm": 1.1130231618881226,
727
+ "learning_rate": 2.79144385026738e-05,
728
+ "epoch": 0.61,
729
+ "step": 10000
730
+ },
731
+ {
732
+ "loss": 0.0793,
733
+ "grad_norm": 1.4455962181091309,
734
+ "learning_rate": 2.7792902284880895e-05,
735
+ "epoch": 0.61,
736
+ "step": 10100
737
+ },
738
+ {
739
+ "loss": 0.077,
740
+ "grad_norm": 0.9273576736450195,
741
+ "learning_rate": 2.7671366067087992e-05,
742
+ "epoch": 0.62,
743
+ "step": 10200
744
+ },
745
+ {
746
+ "loss": 0.0766,
747
+ "grad_norm": 0.8223456740379333,
748
+ "learning_rate": 2.7549829849295092e-05,
749
+ "epoch": 0.62,
750
+ "step": 10300
751
+ },
752
+ {
753
+ "loss": 0.0765,
754
+ "grad_norm": 1.1068949699401855,
755
+ "learning_rate": 2.742829363150219e-05,
756
+ "epoch": 0.63,
757
+ "step": 10400
758
+ },
759
+ {
760
+ "loss": 0.0762,
761
+ "grad_norm": 1.0787135362625122,
762
+ "learning_rate": 2.7306757413709285e-05,
763
+ "epoch": 0.64,
764
+ "step": 10500
765
+ },
766
+ {
767
+ "loss": 0.0765,
768
+ "grad_norm": 0.6019480228424072,
769
+ "learning_rate": 2.7185221195916386e-05,
770
+ "epoch": 0.64,
771
+ "step": 10600
772
+ },
773
+ {
774
+ "loss": 0.0756,
775
+ "grad_norm": 0.7752580046653748,
776
+ "learning_rate": 2.7063684978123482e-05,
777
+ "epoch": 0.65,
778
+ "step": 10700
779
+ },
780
+ {
781
+ "loss": 0.0762,
782
+ "grad_norm": 0.9023341536521912,
783
+ "learning_rate": 2.6943364122508508e-05,
784
+ "epoch": 0.66,
785
+ "step": 10800
786
+ },
787
+ {
788
+ "loss": 0.0759,
789
+ "grad_norm": 1.1154266595840454,
790
+ "learning_rate": 2.6821827904715608e-05,
791
+ "epoch": 0.66,
792
+ "step": 10900
793
+ },
794
+ {
795
+ "loss": 0.0752,
796
+ "grad_norm": 1.5197564363479614,
797
+ "learning_rate": 2.6700291686922705e-05,
798
+ "epoch": 0.67,
799
+ "step": 11000
800
+ },
801
+ {
802
+ "loss": 0.0757,
803
+ "grad_norm": 0.8111494183540344,
804
+ "learning_rate": 2.65787554691298e-05,
805
+ "epoch": 0.67,
806
+ "step": 11100
807
+ },
808
+ {
809
+ "loss": 0.0749,
810
+ "grad_norm": 0.6413083076477051,
811
+ "learning_rate": 2.6457219251336898e-05,
812
+ "epoch": 0.68,
813
+ "step": 11200
814
+ },
815
+ {
816
+ "loss": 0.0754,
817
+ "grad_norm": 0.8996323943138123,
818
+ "learning_rate": 2.6335683033544e-05,
819
+ "epoch": 0.69,
820
+ "step": 11300
821
+ },
822
+ {
823
+ "loss": 0.0744,
824
+ "grad_norm": 0.7931196093559265,
825
+ "learning_rate": 2.6214146815751095e-05,
826
+ "epoch": 0.69,
827
+ "step": 11400
828
+ },
829
+ {
830
+ "loss": 0.0742,
831
+ "grad_norm": 1.0821586847305298,
832
+ "learning_rate": 2.609261059795819e-05,
833
+ "epoch": 0.7,
834
+ "step": 11500
835
+ },
836
+ {
837
+ "loss": 0.0722,
838
+ "grad_norm": 0.9964590072631836,
839
+ "learning_rate": 2.5971074380165292e-05,
840
+ "epoch": 0.7,
841
+ "step": 11600
842
+ },
843
+ {
844
+ "loss": 0.0752,
845
+ "grad_norm": 0.7918893694877625,
846
+ "learning_rate": 2.584953816237239e-05,
847
+ "epoch": 0.71,
848
+ "step": 11700
849
+ },
850
+ {
851
+ "loss": 0.0734,
852
+ "grad_norm": 0.6565855145454407,
853
+ "learning_rate": 2.5728001944579485e-05,
854
+ "epoch": 0.72,
855
+ "step": 11800
856
+ },
857
+ {
858
+ "loss": 0.0717,
859
+ "grad_norm": 1.9885566234588623,
860
+ "learning_rate": 2.5606465726786582e-05,
861
+ "epoch": 0.72,
862
+ "step": 11900
863
+ },
864
+ {
865
+ "loss": 0.0747,
866
+ "grad_norm": 0.6101750135421753,
867
+ "learning_rate": 2.5484929508993682e-05,
868
+ "epoch": 0.73,
869
+ "step": 12000
870
+ },
871
+ {
872
+ "loss": 0.073,
873
+ "grad_norm": 1.001930594444275,
874
+ "learning_rate": 2.536339329120078e-05,
875
+ "epoch": 0.73,
876
+ "step": 12100
877
+ },
878
+ {
879
+ "loss": 0.074,
880
+ "grad_norm": 0.880673348903656,
881
+ "learning_rate": 2.5241857073407875e-05,
882
+ "epoch": 0.74,
883
+ "step": 12200
884
+ },
885
+ {
886
+ "loss": 0.0738,
887
+ "grad_norm": 0.7980429530143738,
888
+ "learning_rate": 2.5120320855614975e-05,
889
+ "epoch": 0.75,
890
+ "step": 12300
891
+ },
892
+ {
893
+ "loss": 0.0758,
894
+ "grad_norm": 1.0153135061264038,
895
+ "learning_rate": 2.4998784637822072e-05,
896
+ "epoch": 0.75,
897
+ "step": 12400
898
+ },
899
+ {
900
+ "loss": 0.0742,
901
+ "grad_norm": 0.8344822525978088,
902
+ "learning_rate": 2.487724842002917e-05,
903
+ "epoch": 0.76,
904
+ "step": 12500
905
+ },
906
+ {
907
+ "loss": 0.0738,
908
+ "grad_norm": 0.6752304434776306,
909
+ "learning_rate": 2.4755712202236272e-05,
910
+ "epoch": 0.76,
911
+ "step": 12600
912
+ },
913
+ {
914
+ "loss": 0.0732,
915
+ "grad_norm": 1.1106210947036743,
916
+ "learning_rate": 2.4634175984443366e-05,
917
+ "epoch": 0.77,
918
+ "step": 12700
919
+ },
920
+ {
921
+ "loss": 0.0754,
922
+ "grad_norm": 0.8022058606147766,
923
+ "learning_rate": 2.4512639766650462e-05,
924
+ "epoch": 0.78,
925
+ "step": 12800
926
+ },
927
+ {
928
+ "loss": 0.0735,
929
+ "grad_norm": 0.737308144569397,
930
+ "learning_rate": 2.439110354885756e-05,
931
+ "epoch": 0.78,
932
+ "step": 12900
933
+ },
934
+ {
935
+ "loss": 0.0738,
936
+ "grad_norm": 2.094043493270874,
937
+ "learning_rate": 2.4269567331064662e-05,
938
+ "epoch": 0.79,
939
+ "step": 13000
940
+ },
941
+ {
942
+ "loss": 0.072,
943
+ "grad_norm": 1.1105279922485352,
944
+ "learning_rate": 2.4148031113271756e-05,
945
+ "epoch": 0.79,
946
+ "step": 13100
947
+ },
948
+ {
949
+ "loss": 0.0716,
950
+ "grad_norm": 1.2243571281433105,
951
+ "learning_rate": 2.4026494895478852e-05,
952
+ "epoch": 0.8,
953
+ "step": 13200
954
+ },
955
+ {
956
+ "loss": 0.0718,
957
+ "grad_norm": 1.0883300304412842,
958
+ "learning_rate": 2.3904958677685956e-05,
959
+ "epoch": 0.81,
960
+ "step": 13300
961
+ },
962
+ {
963
+ "loss": 0.0727,
964
+ "grad_norm": 0.9934273362159729,
965
+ "learning_rate": 2.378342245989305e-05,
966
+ "epoch": 0.81,
967
+ "step": 13400
968
+ },
969
+ {
970
+ "loss": 0.0721,
971
+ "grad_norm": 0.7145100831985474,
972
+ "learning_rate": 2.3661886242100146e-05,
973
+ "epoch": 0.82,
974
+ "step": 13500
975
+ },
976
+ {
977
+ "loss": 0.0721,
978
+ "grad_norm": 0.8873516321182251,
979
+ "learning_rate": 2.3540350024307243e-05,
980
+ "epoch": 0.83,
981
+ "step": 13600
982
+ },
983
+ {
984
+ "loss": 0.0723,
985
+ "grad_norm": 0.7798359990119934,
986
+ "learning_rate": 2.3418813806514346e-05,
987
+ "epoch": 0.83,
988
+ "step": 13700
989
+ },
990
+ {
991
+ "loss": 0.0726,
992
+ "grad_norm": 0.9411553740501404,
993
+ "learning_rate": 2.329727758872144e-05,
994
+ "epoch": 0.84,
995
+ "step": 13800
996
+ },
997
+ {
998
+ "loss": 0.0715,
999
+ "grad_norm": 0.7994709610939026,
1000
+ "learning_rate": 2.3175741370928536e-05,
1001
+ "epoch": 0.84,
1002
+ "step": 13900
1003
+ },
1004
+ {
1005
+ "loss": 0.0732,
1006
+ "grad_norm": 0.5489715337753296,
1007
+ "learning_rate": 2.305420515313564e-05,
1008
+ "epoch": 0.85,
1009
+ "step": 14000
1010
+ },
1011
+ {
1012
+ "loss": 0.0699,
1013
+ "grad_norm": 0.5710996389389038,
1014
+ "learning_rate": 2.2932668935342736e-05,
1015
+ "epoch": 0.86,
1016
+ "step": 14100
1017
+ },
1018
+ {
1019
+ "loss": 0.073,
1020
+ "grad_norm": 0.7003745436668396,
1021
+ "learning_rate": 2.281113271754983e-05,
1022
+ "epoch": 0.86,
1023
+ "step": 14200
1024
+ },
1025
+ {
1026
+ "loss": 0.0722,
1027
+ "grad_norm": 0.6743086576461792,
1028
+ "learning_rate": 2.2689596499756926e-05,
1029
+ "epoch": 0.87,
1030
+ "step": 14300
1031
+ },
1032
+ {
1033
+ "loss": 0.0699,
1034
+ "grad_norm": 0.6730968356132507,
1035
+ "learning_rate": 2.256806028196403e-05,
1036
+ "epoch": 0.87,
1037
+ "step": 14400
1038
+ },
1039
+ {
1040
+ "loss": 0.0719,
1041
+ "grad_norm": 0.7155641913414001,
1042
+ "learning_rate": 2.2446524064171126e-05,
1043
+ "epoch": 0.88,
1044
+ "step": 14500
1045
+ },
1046
+ {
1047
+ "loss": 0.0708,
1048
+ "grad_norm": 0.8122462630271912,
1049
+ "learning_rate": 2.232498784637822e-05,
1050
+ "epoch": 0.89,
1051
+ "step": 14600
1052
+ },
1053
+ {
1054
+ "loss": 0.0718,
1055
+ "grad_norm": 0.8022533655166626,
1056
+ "learning_rate": 2.2203451628585323e-05,
1057
+ "epoch": 0.89,
1058
+ "step": 14700
1059
+ },
1060
+ {
1061
+ "loss": 0.0712,
1062
+ "grad_norm": 0.545359194278717,
1063
+ "learning_rate": 2.208191541079242e-05,
1064
+ "epoch": 0.9,
1065
+ "step": 14800
1066
+ },
1067
+ {
1068
+ "loss": 0.0711,
1069
+ "grad_norm": 0.8318025469779968,
1070
+ "learning_rate": 2.1960379192999513e-05,
1071
+ "epoch": 0.9,
1072
+ "step": 14900
1073
+ },
1074
+ {
1075
+ "loss": 0.0706,
1076
+ "grad_norm": 0.9334779381752014,
1077
+ "learning_rate": 2.1838842975206616e-05,
1078
+ "epoch": 0.91,
1079
+ "step": 15000
1080
+ },
1081
+ {
1082
+ "loss": 0.0701,
1083
+ "grad_norm": 0.8202875256538391,
1084
+ "learning_rate": 2.1717306757413713e-05,
1085
+ "epoch": 0.92,
1086
+ "step": 15100
1087
+ },
1088
+ {
1089
+ "loss": 0.07,
1090
+ "grad_norm": 0.8788963556289673,
1091
+ "learning_rate": 2.159577053962081e-05,
1092
+ "epoch": 0.92,
1093
+ "step": 15200
1094
+ },
1095
+ {
1096
+ "loss": 0.0713,
1097
+ "grad_norm": 1.023823618888855,
1098
+ "learning_rate": 2.1474234321827903e-05,
1099
+ "epoch": 0.93,
1100
+ "step": 15300
1101
+ },
1102
+ {
1103
+ "loss": 0.0697,
1104
+ "grad_norm": 0.8784018158912659,
1105
+ "learning_rate": 2.1353913466212936e-05,
1106
+ "epoch": 0.93,
1107
+ "step": 15400
1108
+ },
1109
+ {
1110
+ "loss": 0.0695,
1111
+ "grad_norm": 1.1254814863204956,
1112
+ "learning_rate": 2.1232377248420032e-05,
1113
+ "epoch": 0.94,
1114
+ "step": 15500
1115
+ },
1116
+ {
1117
+ "loss": 0.0697,
1118
+ "grad_norm": 0.9760749340057373,
1119
+ "learning_rate": 2.1110841030627126e-05,
1120
+ "epoch": 0.95,
1121
+ "step": 15600
1122
+ },
1123
+ {
1124
+ "loss": 0.0709,
1125
+ "grad_norm": 1.0121357440948486,
1126
+ "learning_rate": 2.098930481283423e-05,
1127
+ "epoch": 0.95,
1128
+ "step": 15700
1129
+ },
1130
+ {
1131
+ "loss": 0.0717,
1132
+ "grad_norm": 0.7810111045837402,
1133
+ "learning_rate": 2.0867768595041326e-05,
1134
+ "epoch": 0.96,
1135
+ "step": 15800
1136
+ },
1137
+ {
1138
+ "loss": 0.0692,
1139
+ "grad_norm": 0.6813214421272278,
1140
+ "learning_rate": 2.074623237724842e-05,
1141
+ "epoch": 0.96,
1142
+ "step": 15900
1143
+ },
1144
+ {
1145
+ "loss": 0.0696,
1146
+ "grad_norm": 0.7685451507568359,
1147
+ "learning_rate": 2.0624696159455516e-05,
1148
+ "epoch": 0.97,
1149
+ "step": 16000
1150
+ },
1151
+ {
1152
+ "loss": 0.0702,
1153
+ "grad_norm": 3.3225691318511963,
1154
+ "learning_rate": 2.050315994166262e-05,
1155
+ "epoch": 0.98,
1156
+ "step": 16100
1157
+ },
1158
+ {
1159
+ "loss": 0.0702,
1160
+ "grad_norm": 0.7979671955108643,
1161
+ "learning_rate": 2.0381623723869716e-05,
1162
+ "epoch": 0.98,
1163
+ "step": 16200
1164
+ },
1165
+ {
1166
+ "loss": 0.0691,
1167
+ "grad_norm": 3.4929583072662354,
1168
+ "learning_rate": 2.026008750607681e-05,
1169
+ "epoch": 0.99,
1170
+ "step": 16300
1171
+ },
1172
+ {
1173
+ "loss": 0.0703,
1174
+ "grad_norm": 0.7738245725631714,
1175
+ "learning_rate": 2.0138551288283913e-05,
1176
+ "epoch": 1.0,
1177
+ "step": 16400
1178
+ },
1179
+ {
1180
+ "eval_loss": 0.06881729513406754,
1181
+ "eval_f1": 0.8973916467400326,
1182
+ "eval_precision": 0.9049522471305407,
1183
+ "eval_recall": 0.8906029559155776,
1184
+ "eval_accuracy": 0.9730252863363563,
1185
+ "eval_runtime": 304.4852,
1186
+ "eval_samples_per_second": 86.796,
1187
+ "eval_steps_per_second": 10.851,
1188
+ "epoch": 1.0,
1189
+ "step": 16481
1190
+ },
1191
+ {
1192
+ "loss": 0.0684,
1193
+ "grad_norm": 0.891858696937561,
1194
+ "learning_rate": 2.001701507049101e-05,
1195
+ "epoch": 1.0,
1196
+ "step": 16500
1197
+ },
1198
+ {
1199
+ "loss": 0.0619,
1200
+ "grad_norm": 0.6408938765525818,
1201
+ "learning_rate": 1.9895478852698106e-05,
1202
+ "epoch": 1.01,
1203
+ "step": 16600
1204
+ },
1205
+ {
1206
+ "loss": 0.0629,
1207
+ "grad_norm": 0.7390792965888977,
1208
+ "learning_rate": 1.9773942634905203e-05,
1209
+ "epoch": 1.01,
1210
+ "step": 16700
1211
+ },
1212
+ {
1213
+ "loss": 0.0604,
1214
+ "grad_norm": 0.5206795930862427,
1215
+ "learning_rate": 1.9652406417112303e-05,
1216
+ "epoch": 1.02,
1217
+ "step": 16800
1218
+ },
1219
+ {
1220
+ "loss": 0.0613,
1221
+ "grad_norm": 0.909116268157959,
1222
+ "learning_rate": 1.95308701993194e-05,
1223
+ "epoch": 1.03,
1224
+ "step": 16900
1225
+ },
1226
+ {
1227
+ "loss": 0.0616,
1228
+ "grad_norm": 0.8701964020729065,
1229
+ "learning_rate": 1.9409333981526496e-05,
1230
+ "epoch": 1.03,
1231
+ "step": 17000
1232
+ },
1233
+ {
1234
+ "loss": 0.0625,
1235
+ "grad_norm": 1.0762407779693604,
1236
+ "learning_rate": 1.9287797763733593e-05,
1237
+ "epoch": 1.04,
1238
+ "step": 17100
1239
+ },
1240
+ {
1241
+ "loss": 0.0615,
1242
+ "grad_norm": 0.7816362380981445,
1243
+ "learning_rate": 1.9166261545940693e-05,
1244
+ "epoch": 1.04,
1245
+ "step": 17200
1246
+ },
1247
+ {
1248
+ "loss": 0.0626,
1249
+ "grad_norm": 0.6983965039253235,
1250
+ "learning_rate": 1.904594069032572e-05,
1251
+ "epoch": 1.05,
1252
+ "step": 17300
1253
+ },
1254
+ {
1255
+ "loss": 0.0621,
1256
+ "grad_norm": 0.910698413848877,
1257
+ "learning_rate": 1.8924404472532816e-05,
1258
+ "epoch": 1.06,
1259
+ "step": 17400
1260
+ },
1261
+ {
1262
+ "loss": 0.0631,
1263
+ "grad_norm": 0.8654133677482605,
1264
+ "learning_rate": 1.8802868254739916e-05,
1265
+ "epoch": 1.06,
1266
+ "step": 17500
1267
+ },
1268
+ {
1269
+ "loss": 0.062,
1270
+ "grad_norm": 0.8351789712905884,
1271
+ "learning_rate": 1.8681332036947012e-05,
1272
+ "epoch": 1.07,
1273
+ "step": 17600
1274
+ },
1275
+ {
1276
+ "loss": 0.0604,
1277
+ "grad_norm": 0.7861587405204773,
1278
+ "learning_rate": 1.855979581915411e-05,
1279
+ "epoch": 1.07,
1280
+ "step": 17700
1281
+ },
1282
+ {
1283
+ "loss": 0.0609,
1284
+ "grad_norm": 0.7295276522636414,
1285
+ "learning_rate": 1.843825960136121e-05,
1286
+ "epoch": 1.08,
1287
+ "step": 17800
1288
+ },
1289
+ {
1290
+ "loss": 0.0616,
1291
+ "grad_norm": 1.0210868120193481,
1292
+ "learning_rate": 1.8316723383568306e-05,
1293
+ "epoch": 1.09,
1294
+ "step": 17900
1295
+ },
1296
+ {
1297
+ "loss": 0.0616,
1298
+ "grad_norm": 0.8220874071121216,
1299
+ "learning_rate": 1.8195187165775403e-05,
1300
+ "epoch": 1.09,
1301
+ "step": 18000
1302
+ },
1303
+ {
1304
+ "loss": 0.0607,
1305
+ "grad_norm": 0.7961727380752563,
1306
+ "learning_rate": 1.80736509479825e-05,
1307
+ "epoch": 1.1,
1308
+ "step": 18100
1309
+ },
1310
+ {
1311
+ "loss": 0.0614,
1312
+ "grad_norm": 1.0390113592147827,
1313
+ "learning_rate": 1.79521147301896e-05,
1314
+ "epoch": 1.1,
1315
+ "step": 18200
1316
+ },
1317
+ {
1318
+ "loss": 0.0625,
1319
+ "grad_norm": 0.8423497080802917,
1320
+ "learning_rate": 1.7830578512396696e-05,
1321
+ "epoch": 1.11,
1322
+ "step": 18300
1323
+ },
1324
+ {
1325
+ "loss": 0.0618,
1326
+ "grad_norm": 0.7576957941055298,
1327
+ "learning_rate": 1.7709042294603793e-05,
1328
+ "epoch": 1.12,
1329
+ "step": 18400
1330
+ },
1331
+ {
1332
+ "loss": 0.061,
1333
+ "grad_norm": 0.7174555659294128,
1334
+ "learning_rate": 1.7587506076810893e-05,
1335
+ "epoch": 1.12,
1336
+ "step": 18500
1337
+ },
1338
+ {
1339
+ "loss": 0.0602,
1340
+ "grad_norm": 0.7977816462516785,
1341
+ "learning_rate": 1.746596985901799e-05,
1342
+ "epoch": 1.13,
1343
+ "step": 18600
1344
+ },
1345
+ {
1346
+ "loss": 0.0617,
1347
+ "grad_norm": 0.8125550150871277,
1348
+ "learning_rate": 1.7344433641225086e-05,
1349
+ "epoch": 1.13,
1350
+ "step": 18700
1351
+ },
1352
+ {
1353
+ "loss": 0.0605,
1354
+ "grad_norm": 1.3914258480072021,
1355
+ "learning_rate": 1.7222897423432183e-05,
1356
+ "epoch": 1.14,
1357
+ "step": 18800
1358
+ },
1359
+ {
1360
+ "loss": 0.0614,
1361
+ "grad_norm": 0.8273860812187195,
1362
+ "learning_rate": 1.7101361205639283e-05,
1363
+ "epoch": 1.15,
1364
+ "step": 18900
1365
+ },
1366
+ {
1367
+ "loss": 0.0606,
1368
+ "grad_norm": 0.7267687916755676,
1369
+ "learning_rate": 1.697982498784638e-05,
1370
+ "epoch": 1.15,
1371
+ "step": 19000
1372
+ },
1373
+ {
1374
+ "loss": 0.0624,
1375
+ "grad_norm": 1.075861930847168,
1376
+ "learning_rate": 1.6858288770053476e-05,
1377
+ "epoch": 1.16,
1378
+ "step": 19100
1379
+ },
1380
+ {
1381
+ "loss": 0.062,
1382
+ "grad_norm": 0.867139995098114,
1383
+ "learning_rate": 1.6736752552260576e-05,
1384
+ "epoch": 1.16,
1385
+ "step": 19200
1386
+ },
1387
+ {
1388
+ "loss": 0.0595,
1389
+ "grad_norm": 0.6730388402938843,
1390
+ "learning_rate": 1.6615216334467673e-05,
1391
+ "epoch": 1.17,
1392
+ "step": 19300
1393
+ },
1394
+ {
1395
+ "loss": 0.0603,
1396
+ "grad_norm": 0.7329290509223938,
1397
+ "learning_rate": 1.649368011667477e-05,
1398
+ "epoch": 1.18,
1399
+ "step": 19400
1400
+ },
1401
+ {
1402
+ "loss": 0.0605,
1403
+ "grad_norm": 1.0000228881835938,
1404
+ "learning_rate": 1.6372143898881866e-05,
1405
+ "epoch": 1.18,
1406
+ "step": 19500
1407
+ },
1408
+ {
1409
+ "loss": 0.0599,
1410
+ "grad_norm": 1.0037493705749512,
1411
+ "learning_rate": 1.6250607681088967e-05,
1412
+ "epoch": 1.19,
1413
+ "step": 19600
1414
+ },
1415
+ {
1416
+ "loss": 0.0616,
1417
+ "grad_norm": 0.7647894024848938,
1418
+ "learning_rate": 1.6129071463296063e-05,
1419
+ "epoch": 1.2,
1420
+ "step": 19700
1421
+ },
1422
+ {
1423
+ "loss": 0.0604,
1424
+ "grad_norm": 0.78948575258255,
1425
+ "learning_rate": 1.600753524550316e-05,
1426
+ "epoch": 1.2,
1427
+ "step": 19800
1428
+ },
1429
+ {
1430
+ "loss": 0.0609,
1431
+ "grad_norm": 0.8443770408630371,
1432
+ "learning_rate": 1.588599902771026e-05,
1433
+ "epoch": 1.21,
1434
+ "step": 19900
1435
+ },
1436
+ {
1437
+ "loss": 0.0599,
1438
+ "grad_norm": 1.1531789302825928,
1439
+ "learning_rate": 1.5764462809917357e-05,
1440
+ "epoch": 1.21,
1441
+ "step": 20000
1442
+ },
1443
+ {
1444
+ "loss": 0.0605,
1445
+ "grad_norm": 0.7325319647789001,
1446
+ "learning_rate": 1.5642926592124453e-05,
1447
+ "epoch": 1.22,
1448
+ "step": 20100
1449
+ },
1450
+ {
1451
+ "loss": 0.0606,
1452
+ "grad_norm": 0.8585038185119629,
1453
+ "learning_rate": 1.5521390374331553e-05,
1454
+ "epoch": 1.23,
1455
+ "step": 20200
1456
+ },
1457
+ {
1458
+ "loss": 0.0602,
1459
+ "grad_norm": 0.6652311086654663,
1460
+ "learning_rate": 1.539985415653865e-05,
1461
+ "epoch": 1.23,
1462
+ "step": 20300
1463
+ },
1464
+ {
1465
+ "loss": 0.0605,
1466
+ "grad_norm": 0.9240396618843079,
1467
+ "learning_rate": 1.5278317938745747e-05,
1468
+ "epoch": 1.24,
1469
+ "step": 20400
1470
+ },
1471
+ {
1472
+ "loss": 0.0609,
1473
+ "grad_norm": 0.9992942214012146,
1474
+ "learning_rate": 1.5156781720952845e-05,
1475
+ "epoch": 1.24,
1476
+ "step": 20500
1477
+ },
1478
+ {
1479
+ "loss": 0.0604,
1480
+ "grad_norm": 0.7454150915145874,
1481
+ "learning_rate": 1.5035245503159944e-05,
1482
+ "epoch": 1.25,
1483
+ "step": 20600
1484
+ },
1485
+ {
1486
+ "loss": 0.0598,
1487
+ "grad_norm": 0.8551883101463318,
1488
+ "learning_rate": 1.491370928536704e-05,
1489
+ "epoch": 1.26,
1490
+ "step": 20700
1491
+ },
1492
+ {
1493
+ "loss": 0.061,
1494
+ "grad_norm": 0.8273564577102661,
1495
+ "learning_rate": 1.4792173067574139e-05,
1496
+ "epoch": 1.26,
1497
+ "step": 20800
1498
+ },
1499
+ {
1500
+ "loss": 0.06,
1501
+ "grad_norm": 0.925244927406311,
1502
+ "learning_rate": 1.4671852211959166e-05,
1503
+ "epoch": 1.27,
1504
+ "step": 20900
1505
+ },
1506
+ {
1507
+ "loss": 0.0587,
1508
+ "grad_norm": 0.5892955660820007,
1509
+ "learning_rate": 1.4550315994166261e-05,
1510
+ "epoch": 1.27,
1511
+ "step": 21000
1512
+ },
1513
+ {
1514
+ "loss": 0.0602,
1515
+ "grad_norm": 0.7904210090637207,
1516
+ "learning_rate": 1.4428779776373361e-05,
1517
+ "epoch": 1.28,
1518
+ "step": 21100
1519
+ },
1520
+ {
1521
+ "loss": 0.0625,
1522
+ "grad_norm": 1.2804646492004395,
1523
+ "learning_rate": 1.430724355858046e-05,
1524
+ "epoch": 1.29,
1525
+ "step": 21200
1526
+ },
1527
+ {
1528
+ "loss": 0.0607,
1529
+ "grad_norm": 0.9952909350395203,
1530
+ "learning_rate": 1.4185707340787556e-05,
1531
+ "epoch": 1.29,
1532
+ "step": 21300
1533
+ },
1534
+ {
1535
+ "loss": 0.0602,
1536
+ "grad_norm": 0.9036094546318054,
1537
+ "learning_rate": 1.4064171122994655e-05,
1538
+ "epoch": 1.3,
1539
+ "step": 21400
1540
+ },
1541
+ {
1542
+ "loss": 0.0594,
1543
+ "grad_norm": 0.8128438591957092,
1544
+ "learning_rate": 1.3942634905201751e-05,
1545
+ "epoch": 1.3,
1546
+ "step": 21500
1547
+ },
1548
+ {
1549
+ "loss": 0.0593,
1550
+ "grad_norm": 0.786703884601593,
1551
+ "learning_rate": 1.382109868740885e-05,
1552
+ "epoch": 1.31,
1553
+ "step": 21600
1554
+ },
1555
+ {
1556
+ "loss": 0.0604,
1557
+ "grad_norm": 1.107258677482605,
1558
+ "learning_rate": 1.3699562469615946e-05,
1559
+ "epoch": 1.32,
1560
+ "step": 21700
1561
+ },
1562
+ {
1563
+ "loss": 0.0596,
1564
+ "grad_norm": 1.0990906953811646,
1565
+ "learning_rate": 1.3578026251823045e-05,
1566
+ "epoch": 1.32,
1567
+ "step": 21800
1568
+ },
1569
+ {
1570
+ "loss": 0.0611,
1571
+ "grad_norm": 0.7040949463844299,
1572
+ "learning_rate": 1.3456490034030143e-05,
1573
+ "epoch": 1.33,
1574
+ "step": 21900
1575
+ },
1576
+ {
1577
+ "loss": 0.0582,
1578
+ "grad_norm": 0.7568740248680115,
1579
+ "learning_rate": 1.333495381623724e-05,
1580
+ "epoch": 1.33,
1581
+ "step": 22000
1582
+ },
1583
+ {
1584
+ "loss": 0.0595,
1585
+ "grad_norm": 0.6342681646347046,
1586
+ "learning_rate": 1.3213417598444338e-05,
1587
+ "epoch": 1.34,
1588
+ "step": 22100
1589
+ },
1590
+ {
1591
+ "loss": 0.0597,
1592
+ "grad_norm": 0.7555422186851501,
1593
+ "learning_rate": 1.3091881380651435e-05,
1594
+ "epoch": 1.35,
1595
+ "step": 22200
1596
+ },
1597
+ {
1598
+ "loss": 0.0587,
1599
+ "grad_norm": 0.8620259165763855,
1600
+ "learning_rate": 1.2970345162858533e-05,
1601
+ "epoch": 1.35,
1602
+ "step": 22300
1603
+ },
1604
+ {
1605
+ "loss": 0.0586,
1606
+ "grad_norm": 1.4132779836654663,
1607
+ "learning_rate": 1.2848808945065632e-05,
1608
+ "epoch": 1.36,
1609
+ "step": 22400
1610
+ },
1611
+ {
1612
+ "loss": 0.0594,
1613
+ "grad_norm": 0.9352446794509888,
1614
+ "learning_rate": 1.2727272727272728e-05,
1615
+ "epoch": 1.37,
1616
+ "step": 22500
1617
+ },
1618
+ {
1619
+ "loss": 0.0581,
1620
+ "grad_norm": 0.8808399438858032,
1621
+ "learning_rate": 1.2605736509479827e-05,
1622
+ "epoch": 1.37,
1623
+ "step": 22600
1624
+ },
1625
+ {
1626
+ "loss": 0.0603,
1627
+ "grad_norm": 0.8254494071006775,
1628
+ "learning_rate": 1.2484200291686924e-05,
1629
+ "epoch": 1.38,
1630
+ "step": 22700
1631
+ },
1632
+ {
1633
+ "loss": 0.0589,
1634
+ "grad_norm": 0.9145941138267517,
1635
+ "learning_rate": 1.2362664073894022e-05,
1636
+ "epoch": 1.38,
1637
+ "step": 22800
1638
+ },
1639
+ {
1640
+ "loss": 0.0594,
1641
+ "grad_norm": 1.267179012298584,
1642
+ "learning_rate": 1.2241127856101119e-05,
1643
+ "epoch": 1.39,
1644
+ "step": 22900
1645
+ },
1646
+ {
1647
+ "loss": 0.0585,
1648
+ "grad_norm": 0.9012957215309143,
1649
+ "learning_rate": 1.2119591638308217e-05,
1650
+ "epoch": 1.4,
1651
+ "step": 23000
1652
+ },
1653
+ {
1654
+ "loss": 0.0581,
1655
+ "grad_norm": 1.053276777267456,
1656
+ "learning_rate": 1.1998055420515315e-05,
1657
+ "epoch": 1.4,
1658
+ "step": 23100
1659
+ },
1660
+ {
1661
+ "loss": 0.0579,
1662
+ "grad_norm": 1.031724214553833,
1663
+ "learning_rate": 1.1876519202722412e-05,
1664
+ "epoch": 1.41,
1665
+ "step": 23200
1666
+ },
1667
+ {
1668
+ "loss": 0.0574,
1669
+ "grad_norm": 0.8730105757713318,
1670
+ "learning_rate": 1.175498298492951e-05,
1671
+ "epoch": 1.41,
1672
+ "step": 23300
1673
+ },
1674
+ {
1675
+ "loss": 0.0589,
1676
+ "grad_norm": 0.871724545955658,
1677
+ "learning_rate": 1.1633446767136607e-05,
1678
+ "epoch": 1.42,
1679
+ "step": 23400
1680
+ },
1681
+ {
1682
+ "loss": 0.0585,
1683
+ "grad_norm": 0.9031744599342346,
1684
+ "learning_rate": 1.1511910549343706e-05,
1685
+ "epoch": 1.43,
1686
+ "step": 23500
1687
+ },
1688
+ {
1689
+ "loss": 0.0586,
1690
+ "grad_norm": 0.5891318917274475,
1691
+ "learning_rate": 1.1390374331550802e-05,
1692
+ "epoch": 1.43,
1693
+ "step": 23600
1694
+ },
1695
+ {
1696
+ "loss": 0.0584,
1697
+ "grad_norm": 0.7399836182594299,
1698
+ "learning_rate": 1.12688381137579e-05,
1699
+ "epoch": 1.44,
1700
+ "step": 23700
1701
+ },
1702
+ {
1703
+ "loss": 0.0596,
1704
+ "grad_norm": 0.47165361046791077,
1705
+ "learning_rate": 1.1147301895964999e-05,
1706
+ "epoch": 1.44,
1707
+ "step": 23800
1708
+ },
1709
+ {
1710
+ "loss": 0.0588,
1711
+ "grad_norm": 0.8805158734321594,
1712
+ "learning_rate": 1.1025765678172096e-05,
1713
+ "epoch": 1.45,
1714
+ "step": 23900
1715
+ },
1716
+ {
1717
+ "loss": 0.0587,
1718
+ "grad_norm": 0.6524300575256348,
1719
+ "learning_rate": 1.0904229460379194e-05,
1720
+ "epoch": 1.46,
1721
+ "step": 24000
1722
+ },
1723
+ {
1724
+ "loss": 0.0599,
1725
+ "grad_norm": 0.7314462661743164,
1726
+ "learning_rate": 1.078269324258629e-05,
1727
+ "epoch": 1.46,
1728
+ "step": 24100
1729
+ },
1730
+ {
1731
+ "loss": 0.0587,
1732
+ "grad_norm": 0.7969116568565369,
1733
+ "learning_rate": 1.0661157024793389e-05,
1734
+ "epoch": 1.47,
1735
+ "step": 24200
1736
+ },
1737
+ {
1738
+ "loss": 0.0574,
1739
+ "grad_norm": 0.6548510193824768,
1740
+ "learning_rate": 1.0539620807000488e-05,
1741
+ "epoch": 1.47,
1742
+ "step": 24300
1743
+ },
1744
+ {
1745
+ "loss": 0.0601,
1746
+ "grad_norm": 0.6944112181663513,
1747
+ "learning_rate": 1.0418084589207584e-05,
1748
+ "epoch": 1.48,
1749
+ "step": 24400
1750
+ },
1751
+ {
1752
+ "loss": 0.0595,
1753
+ "grad_norm": 1.0091618299484253,
1754
+ "learning_rate": 1.0296548371414683e-05,
1755
+ "epoch": 1.49,
1756
+ "step": 24500
1757
+ },
1758
+ {
1759
+ "loss": 0.0567,
1760
+ "grad_norm": 0.7692497372627258,
1761
+ "learning_rate": 1.017501215362178e-05,
1762
+ "epoch": 1.49,
1763
+ "step": 24600
1764
+ },
1765
+ {
1766
+ "loss": 0.0567,
1767
+ "grad_norm": 1.2263282537460327,
1768
+ "learning_rate": 1.0053475935828878e-05,
1769
+ "epoch": 1.5,
1770
+ "step": 24700
1771
+ },
1772
+ {
1773
+ "loss": 0.058,
1774
+ "grad_norm": 1.412335753440857,
1775
+ "learning_rate": 9.931939718035976e-06,
1776
+ "epoch": 1.5,
1777
+ "step": 24800
1778
+ },
1779
+ {
1780
+ "loss": 0.0584,
1781
+ "grad_norm": 0.9114163517951965,
1782
+ "learning_rate": 9.810403500243073e-06,
1783
+ "epoch": 1.51,
1784
+ "step": 24900
1785
+ },
1786
+ {
1787
+ "loss": 0.0579,
1788
+ "grad_norm": 0.8343012928962708,
1789
+ "learning_rate": 9.688867282450171e-06,
1790
+ "epoch": 1.52,
1791
+ "step": 25000
1792
+ },
1793
+ {
1794
+ "loss": 0.0581,
1795
+ "grad_norm": 0.7137165665626526,
1796
+ "learning_rate": 9.567331064657268e-06,
1797
+ "epoch": 1.52,
1798
+ "step": 25100
1799
+ },
1800
+ {
1801
+ "loss": 0.0572,
1802
+ "grad_norm": 0.8871126174926758,
1803
+ "learning_rate": 9.445794846864366e-06,
1804
+ "epoch": 1.53,
1805
+ "step": 25200
1806
+ },
1807
+ {
1808
+ "loss": 0.0588,
1809
+ "grad_norm": 1.9913699626922607,
1810
+ "learning_rate": 9.324258629071465e-06,
1811
+ "epoch": 1.54,
1812
+ "step": 25300
1813
+ },
1814
+ {
1815
+ "loss": 0.0586,
1816
+ "grad_norm": 0.702129065990448,
1817
+ "learning_rate": 9.202722411278561e-06,
1818
+ "epoch": 1.54,
1819
+ "step": 25400
1820
+ },
1821
+ {
1822
+ "loss": 0.0589,
1823
+ "grad_norm": 0.759503960609436,
1824
+ "learning_rate": 9.08118619348566e-06,
1825
+ "epoch": 1.55,
1826
+ "step": 25500
1827
+ },
1828
+ {
1829
+ "loss": 0.0598,
1830
+ "grad_norm": 0.7731884717941284,
1831
+ "learning_rate": 8.959649975692756e-06,
1832
+ "epoch": 1.55,
1833
+ "step": 25600
1834
+ },
1835
+ {
1836
+ "loss": 0.0574,
1837
+ "grad_norm": 0.830560028553009,
1838
+ "learning_rate": 8.838113757899855e-06,
1839
+ "epoch": 1.56,
1840
+ "step": 25700
1841
+ },
1842
+ {
1843
+ "loss": 0.0561,
1844
+ "grad_norm": 0.612714946269989,
1845
+ "learning_rate": 8.716577540106953e-06,
1846
+ "epoch": 1.57,
1847
+ "step": 25800
1848
+ },
1849
+ {
1850
+ "loss": 0.0583,
1851
+ "grad_norm": 0.6476453542709351,
1852
+ "learning_rate": 8.59504132231405e-06,
1853
+ "epoch": 1.57,
1854
+ "step": 25900
1855
+ },
1856
+ {
1857
+ "loss": 0.0567,
1858
+ "grad_norm": 0.6660561561584473,
1859
+ "learning_rate": 8.473505104521148e-06,
1860
+ "epoch": 1.58,
1861
+ "step": 26000
1862
+ },
1863
+ {
1864
+ "loss": 0.0575,
1865
+ "grad_norm": 0.6638226509094238,
1866
+ "learning_rate": 8.351968886728245e-06,
1867
+ "epoch": 1.58,
1868
+ "step": 26100
1869
+ },
1870
+ {
1871
+ "loss": 0.0567,
1872
+ "grad_norm": 0.6452857255935669,
1873
+ "learning_rate": 8.231648031113272e-06,
1874
+ "epoch": 1.59,
1875
+ "step": 26200
1876
+ },
1877
+ {
1878
+ "loss": 0.0567,
1879
+ "grad_norm": 0.819333016872406,
1880
+ "learning_rate": 8.11011181332037e-06,
1881
+ "epoch": 1.6,
1882
+ "step": 26300
1883
+ },
1884
+ {
1885
+ "loss": 0.0571,
1886
+ "grad_norm": 1.2114768028259277,
1887
+ "learning_rate": 7.988575595527467e-06,
1888
+ "epoch": 1.6,
1889
+ "step": 26400
1890
+ },
1891
+ {
1892
+ "loss": 0.0577,
1893
+ "grad_norm": 0.7581117153167725,
1894
+ "learning_rate": 7.867039377734566e-06,
1895
+ "epoch": 1.61,
1896
+ "step": 26500
1897
+ },
1898
+ {
1899
+ "loss": 0.0575,
1900
+ "grad_norm": 0.5861278772354126,
1901
+ "learning_rate": 7.745503159941663e-06,
1902
+ "epoch": 1.61,
1903
+ "step": 26600
1904
+ },
1905
+ {
1906
+ "loss": 0.0567,
1907
+ "grad_norm": 0.7154746055603027,
1908
+ "learning_rate": 7.623966942148761e-06,
1909
+ "epoch": 1.62,
1910
+ "step": 26700
1911
+ },
1912
+ {
1913
+ "loss": 0.0574,
1914
+ "grad_norm": 1.072407841682434,
1915
+ "learning_rate": 7.502430724355859e-06,
1916
+ "epoch": 1.63,
1917
+ "step": 26800
1918
+ },
1919
+ {
1920
+ "loss": 0.0572,
1921
+ "grad_norm": 0.8198044896125793,
1922
+ "learning_rate": 7.380894506562957e-06,
1923
+ "epoch": 1.63,
1924
+ "step": 26900
1925
+ },
1926
+ {
1927
+ "loss": 0.0562,
1928
+ "grad_norm": 0.7912253141403198,
1929
+ "learning_rate": 7.259358288770054e-06,
1930
+ "epoch": 1.64,
1931
+ "step": 27000
1932
+ },
1933
+ {
1934
+ "loss": 0.0567,
1935
+ "grad_norm": 0.9015645980834961,
1936
+ "learning_rate": 7.137822070977152e-06,
1937
+ "epoch": 1.64,
1938
+ "step": 27100
1939
+ },
1940
+ {
1941
+ "loss": 0.0551,
1942
+ "grad_norm": 0.6205886602401733,
1943
+ "learning_rate": 7.0162858531842495e-06,
1944
+ "epoch": 1.65,
1945
+ "step": 27200
1946
+ },
1947
+ {
1948
+ "loss": 0.0581,
1949
+ "grad_norm": 0.8834924697875977,
1950
+ "learning_rate": 6.894749635391347e-06,
1951
+ "epoch": 1.66,
1952
+ "step": 27300
1953
+ },
1954
+ {
1955
+ "loss": 0.0565,
1956
+ "grad_norm": 0.7698688507080078,
1957
+ "learning_rate": 6.773213417598445e-06,
1958
+ "epoch": 1.66,
1959
+ "step": 27400
1960
+ },
1961
+ {
1962
+ "loss": 0.0575,
1963
+ "grad_norm": 0.8447450399398804,
1964
+ "learning_rate": 6.651677199805543e-06,
1965
+ "epoch": 1.67,
1966
+ "step": 27500
1967
+ },
1968
+ {
1969
+ "loss": 0.057,
1970
+ "grad_norm": 1.6002224683761597,
1971
+ "learning_rate": 6.5301409820126404e-06,
1972
+ "epoch": 1.67,
1973
+ "step": 27600
1974
+ },
1975
+ {
1976
+ "loss": 0.0558,
1977
+ "grad_norm": 0.8625892996788025,
1978
+ "learning_rate": 6.408604764219738e-06,
1979
+ "epoch": 1.68,
1980
+ "step": 27700
1981
+ },
1982
+ {
1983
+ "loss": 0.0566,
1984
+ "grad_norm": 0.7483322024345398,
1985
+ "learning_rate": 6.2870685464268355e-06,
1986
+ "epoch": 1.69,
1987
+ "step": 27800
1988
+ },
1989
+ {
1990
+ "loss": 0.0571,
1991
+ "grad_norm": 0.781535804271698,
1992
+ "learning_rate": 6.165532328633933e-06,
1993
+ "epoch": 1.69,
1994
+ "step": 27900
1995
+ },
1996
+ {
1997
+ "loss": 0.0563,
1998
+ "grad_norm": 0.8761783838272095,
1999
+ "learning_rate": 6.0439961108410314e-06,
2000
+ "epoch": 1.7,
2001
+ "step": 28000
2002
+ },
2003
+ {
2004
+ "loss": 0.0565,
2005
+ "grad_norm": 0.5183244943618774,
2006
+ "learning_rate": 5.922459893048129e-06,
2007
+ "epoch": 1.7,
2008
+ "step": 28100
2009
+ },
2010
+ {
2011
+ "loss": 0.0564,
2012
+ "grad_norm": 0.7939796447753906,
2013
+ "learning_rate": 5.8009236752552265e-06,
2014
+ "epoch": 1.71,
2015
+ "step": 28200
2016
+ },
2017
+ {
2018
+ "loss": 0.0576,
2019
+ "grad_norm": 0.7260966300964355,
2020
+ "learning_rate": 5.679387457462324e-06,
2021
+ "epoch": 1.72,
2022
+ "step": 28300
2023
+ },
2024
+ {
2025
+ "loss": 0.0569,
2026
+ "grad_norm": 0.9087544083595276,
2027
+ "learning_rate": 5.557851239669422e-06,
2028
+ "epoch": 1.72,
2029
+ "step": 28400
2030
+ },
2031
+ {
2032
+ "loss": 0.056,
2033
+ "grad_norm": 0.7275218367576599,
2034
+ "learning_rate": 5.436315021876519e-06,
2035
+ "epoch": 1.73,
2036
+ "step": 28500
2037
+ },
2038
+ {
2039
+ "loss": 0.0563,
2040
+ "grad_norm": 0.5983753800392151,
2041
+ "learning_rate": 5.315994166261547e-06,
2042
+ "epoch": 1.74,
2043
+ "step": 28600
2044
+ },
2045
+ {
2046
+ "loss": 0.0564,
2047
+ "grad_norm": 0.912756085395813,
2048
+ "learning_rate": 5.194457948468644e-06,
2049
+ "epoch": 1.74,
2050
+ "step": 28700
2051
+ },
2052
+ {
2053
+ "loss": 0.0555,
2054
+ "grad_norm": 0.6085710525512695,
2055
+ "learning_rate": 5.072921730675742e-06,
2056
+ "epoch": 1.75,
2057
+ "step": 28800
2058
+ },
2059
+ {
2060
+ "loss": 0.0571,
2061
+ "grad_norm": 0.6775307655334473,
2062
+ "learning_rate": 4.95138551288284e-06,
2063
+ "epoch": 1.75,
2064
+ "step": 28900
2065
+ },
2066
+ {
2067
+ "loss": 0.0543,
2068
+ "grad_norm": 0.7438898682594299,
2069
+ "learning_rate": 4.829849295089938e-06,
2070
+ "epoch": 1.76,
2071
+ "step": 29000
2072
+ },
2073
+ {
2074
+ "loss": 0.0567,
2075
+ "grad_norm": 0.719668984413147,
2076
+ "learning_rate": 4.708313077297035e-06,
2077
+ "epoch": 1.77,
2078
+ "step": 29100
2079
+ },
2080
+ {
2081
+ "loss": 0.0565,
2082
+ "grad_norm": 0.8647979497909546,
2083
+ "learning_rate": 4.586776859504133e-06,
2084
+ "epoch": 1.77,
2085
+ "step": 29200
2086
+ },
2087
+ {
2088
+ "loss": 0.057,
2089
+ "grad_norm": 0.8238335847854614,
2090
+ "learning_rate": 4.46524064171123e-06,
2091
+ "epoch": 1.78,
2092
+ "step": 29300
2093
+ },
2094
+ {
2095
+ "loss": 0.0563,
2096
+ "grad_norm": 3.2504589557647705,
2097
+ "learning_rate": 4.343704423918328e-06,
2098
+ "epoch": 1.78,
2099
+ "step": 29400
2100
+ },
2101
+ {
2102
+ "loss": 0.0536,
2103
+ "grad_norm": 0.7106683850288391,
2104
+ "learning_rate": 4.222168206125426e-06,
2105
+ "epoch": 1.79,
2106
+ "step": 29500
2107
+ },
2108
+ {
2109
+ "loss": 0.056,
2110
+ "grad_norm": 0.9477577209472656,
2111
+ "learning_rate": 4.100631988332524e-06,
2112
+ "epoch": 1.8,
2113
+ "step": 29600
2114
+ },
2115
+ {
2116
+ "loss": 0.0562,
2117
+ "grad_norm": 0.8888897895812988,
2118
+ "learning_rate": 3.979095770539621e-06,
2119
+ "epoch": 1.8,
2120
+ "step": 29700
2121
+ },
2122
+ {
2123
+ "loss": 0.0562,
2124
+ "grad_norm": 0.7125309705734253,
2125
+ "learning_rate": 3.857559552746719e-06,
2126
+ "epoch": 1.81,
2127
+ "step": 29800
2128
+ },
2129
+ {
2130
+ "loss": 0.0552,
2131
+ "grad_norm": 0.7241693139076233,
2132
+ "learning_rate": 3.7360233349538167e-06,
2133
+ "epoch": 1.81,
2134
+ "step": 29900
2135
+ },
2136
+ {
2137
+ "loss": 0.0556,
2138
+ "grad_norm": 0.9381842613220215,
2139
+ "learning_rate": 3.6144871171609143e-06,
2140
+ "epoch": 1.82,
2141
+ "step": 30000
2142
+ },
2143
+ {
2144
+ "loss": 0.0551,
2145
+ "grad_norm": 0.6808192133903503,
2146
+ "learning_rate": 3.492950899368012e-06,
2147
+ "epoch": 1.83,
2148
+ "step": 30100
2149
+ },
2150
+ {
2151
+ "loss": 0.0561,
2152
+ "grad_norm": 0.6042631268501282,
2153
+ "learning_rate": 3.3714146815751098e-06,
2154
+ "epoch": 1.83,
2155
+ "step": 30200
2156
+ },
2157
+ {
2158
+ "loss": 0.0553,
2159
+ "grad_norm": 0.5585273504257202,
2160
+ "learning_rate": 3.2498784637822073e-06,
2161
+ "epoch": 1.84,
2162
+ "step": 30300
2163
+ },
2164
+ {
2165
+ "loss": 0.0545,
2166
+ "grad_norm": 0.9048868417739868,
2167
+ "learning_rate": 3.128342245989305e-06,
2168
+ "epoch": 1.84,
2169
+ "step": 30400
2170
+ },
2171
+ {
2172
+ "loss": 0.0557,
2173
+ "grad_norm": 0.8429957628250122,
2174
+ "learning_rate": 3.006806028196403e-06,
2175
+ "epoch": 1.85,
2176
+ "step": 30500
2177
+ },
2178
+ {
2179
+ "loss": 0.0563,
2180
+ "grad_norm": 0.7962875962257385,
2181
+ "learning_rate": 2.8852698104035003e-06,
2182
+ "epoch": 1.86,
2183
+ "step": 30600
2184
+ },
2185
+ {
2186
+ "loss": 0.0559,
2187
+ "grad_norm": 0.7854676246643066,
2188
+ "learning_rate": 2.763733592610598e-06,
2189
+ "epoch": 1.86,
2190
+ "step": 30700
2191
+ },
2192
+ {
2193
+ "loss": 0.0561,
2194
+ "grad_norm": 1.694869041442871,
2195
+ "learning_rate": 2.642197374817696e-06,
2196
+ "epoch": 1.87,
2197
+ "step": 30800
2198
+ },
2199
+ {
2200
+ "loss": 0.0568,
2201
+ "grad_norm": 0.6683087944984436,
2202
+ "learning_rate": 2.5206611570247934e-06,
2203
+ "epoch": 1.87,
2204
+ "step": 30900
2205
+ },
2206
+ {
2207
+ "loss": 0.0548,
2208
+ "grad_norm": 0.5675504803657532,
2209
+ "learning_rate": 2.3991249392318913e-06,
2210
+ "epoch": 1.88,
2211
+ "step": 31000
2212
+ },
2213
+ {
2214
+ "loss": 0.0552,
2215
+ "grad_norm": 0.9730797410011292,
2216
+ "learning_rate": 2.2775887214389893e-06,
2217
+ "epoch": 1.89,
2218
+ "step": 31100
2219
+ },
2220
+ {
2221
+ "loss": 0.0568,
2222
+ "grad_norm": 0.8015105128288269,
2223
+ "learning_rate": 2.156052503646087e-06,
2224
+ "epoch": 1.89,
2225
+ "step": 31200
2226
+ },
2227
+ {
2228
+ "loss": 0.0552,
2229
+ "grad_norm": 0.5437925457954407,
2230
+ "learning_rate": 2.0345162858531844e-06,
2231
+ "epoch": 1.9,
2232
+ "step": 31300
2233
+ },
2234
+ {
2235
+ "loss": 0.0558,
2236
+ "grad_norm": 0.8105918765068054,
2237
+ "learning_rate": 1.9129800680602823e-06,
2238
+ "epoch": 1.91,
2239
+ "step": 31400
2240
+ },
2241
+ {
2242
+ "loss": 0.0567,
2243
+ "grad_norm": 0.8699814677238464,
2244
+ "learning_rate": 1.7914438502673799e-06,
2245
+ "epoch": 1.91,
2246
+ "step": 31500
2247
+ },
2248
+ {
2249
+ "loss": 0.0556,
2250
+ "grad_norm": 0.542261004447937,
2251
+ "learning_rate": 1.6699076324744776e-06,
2252
+ "epoch": 1.92,
2253
+ "step": 31600
2254
+ },
2255
+ {
2256
+ "loss": 0.0553,
2257
+ "grad_norm": 0.6852170825004578,
2258
+ "learning_rate": 1.5483714146815754e-06,
2259
+ "epoch": 1.92,
2260
+ "step": 31700
2261
+ },
2262
+ {
2263
+ "loss": 0.0559,
2264
+ "grad_norm": 0.8324136137962341,
2265
+ "learning_rate": 1.426835196888673e-06,
2266
+ "epoch": 1.93,
2267
+ "step": 31800
2268
+ },
2269
+ {
2270
+ "loss": 0.0539,
2271
+ "grad_norm": 0.5395376086235046,
2272
+ "learning_rate": 1.3052989790957707e-06,
2273
+ "epoch": 1.94,
2274
+ "step": 31900
2275
+ },
2276
+ {
2277
+ "loss": 0.0557,
2278
+ "grad_norm": 1.0665556192398071,
2279
+ "learning_rate": 1.1837627613028684e-06,
2280
+ "epoch": 1.94,
2281
+ "step": 32000
2282
+ },
2283
+ {
2284
+ "loss": 0.0556,
2285
+ "grad_norm": 0.5730076432228088,
2286
+ "learning_rate": 1.062226543509966e-06,
2287
+ "epoch": 1.95,
2288
+ "step": 32100
2289
+ },
2290
+ {
2291
+ "loss": 0.0566,
2292
+ "grad_norm": 0.8526155352592468,
2293
+ "learning_rate": 9.406903257170638e-07,
2294
+ "epoch": 1.95,
2295
+ "step": 32200
2296
+ },
2297
+ {
2298
+ "loss": 0.0554,
2299
+ "grad_norm": 0.47227638959884644,
2300
+ "learning_rate": 8.191541079241614e-07,
2301
+ "epoch": 1.96,
2302
+ "step": 32300
2303
+ },
2304
+ {
2305
+ "loss": 0.0559,
2306
+ "grad_norm": 0.5771980881690979,
2307
+ "learning_rate": 6.976178901312592e-07,
2308
+ "epoch": 1.97,
2309
+ "step": 32400
2310
+ },
2311
+ {
2312
+ "loss": 0.0553,
2313
+ "grad_norm": 0.7183811068534851,
2314
+ "learning_rate": 5.772970345162859e-07,
2315
+ "epoch": 1.97,
2316
+ "step": 32500
2317
+ },
2318
+ {
2319
+ "loss": 0.0556,
2320
+ "grad_norm": 0.7808952927589417,
2321
+ "learning_rate": 4.557608167233836e-07,
2322
+ "epoch": 1.98,
2323
+ "step": 32600
2324
+ },
2325
+ {
2326
+ "loss": 0.0549,
2327
+ "grad_norm": 0.7201197743415833,
2328
+ "learning_rate": 3.3422459893048135e-07,
2329
+ "epoch": 1.98,
2330
+ "step": 32700
2331
+ },
2332
+ {
2333
+ "loss": 0.0546,
2334
+ "grad_norm": 0.822515606880188,
2335
+ "learning_rate": 2.1268838113757902e-07,
2336
+ "epoch": 1.99,
2337
+ "step": 32800
2338
+ },
2339
+ {
2340
+ "loss": 0.0556,
2341
+ "grad_norm": 0.6968460083007812,
2342
+ "learning_rate": 9.115216334467672e-08,
2343
+ "epoch": 2.0,
2344
+ "step": 32900
2345
+ },
2346
+ {
2347
+ "eval_loss": 0.06514331698417664,
2348
+ "eval_f1": 0.9055283859012663,
2349
+ "eval_precision": 0.9128121708644065,
2350
+ "eval_recall": 0.898553824781504,
2351
+ "eval_accuracy": 0.9750088848296079,
2352
+ "eval_runtime": 304.326,
2353
+ "eval_samples_per_second": 86.841,
2354
+ "eval_steps_per_second": 10.857,
2355
+ "epoch": 2.0,
2356
+ "step": 32962
2357
+ },
2358
+ {
2359
+ "train_runtime": 12949.9436,
2360
+ "train_samples_per_second": 20.363,
2361
+ "train_steps_per_second": 2.545,
2362
+ "total_flos": 2.448996403000443e+17,
2363
+ "train_loss": 0.07225221031304233,
2364
+ "epoch": 2.0,
2365
+ "step": 32962
2366
+ }
2367
+ ]
2368
+ }
2369
+ }
2370
+ }
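The trainer log above records the loss, gradient norm and a linearly decaying learning rate every 100 steps, ending with an evaluation block (eval_f1 ≈ 0.906, eval_accuracy ≈ 0.975 at step 32962) and an overall training summary; the throughput figures in that summary imply an effective batch size of roughly 8 (20.363 samples/s ÷ 2.545 steps/s). As a rough consistency check, the harmonic mean of the reported precision and recall, 2PR/(P+R) ≈ 2·0.9128·0.8986/(0.9128+0.8986) ≈ 0.906, is in line with the stored eval_f1 (exact agreement is not expected if F1 is averaged per label). The sketch below is one way to inspect such a log locally; it assumes the file has been downloaded next to the script, and it walks the JSON generically rather than assuming a fixed top-level layout.

```python
import json

def iter_log_entries(node):
    """Recursively yield dicts that look like per-step training log entries."""
    if isinstance(node, dict):
        if "loss" in node and "step" in node:
            yield node
        for value in node.values():
            yield from iter_log_entries(value)
    elif isinstance(node, list):
        for item in node:
            yield from iter_log_entries(item)

# Assumes the JSON file added in this commit is present in the working directory.
with open("model_final_suite_results_task2.json", encoding="utf-8") as f:
    state = json.load(f)

entries = list(iter_log_entries(state))
if entries:
    last = entries[-1]
    print(f"{len(entries)} logged steps, last at step {last['step']} with loss {last['loss']}")

# Consistency check: harmonic mean of the reported eval precision/recall.
p, r = 0.9128121708644065, 0.898553824781504
print("2PR/(P+R) =", 2 * p * r / (p + r))  # ~0.9056, close to the stored eval_f1 of ~0.9055
```

Run against the checked-in file, this should report step 32900 as the last per-100-step entry and a harmonic mean close to the logged eval_f1.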
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2c509a525eb51aebb33fb59c24ee923c1d4c1db23c3ae81fe05ccf354084f7b
3
- size 17082758
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a4661b2cb6b8a1007906509fe18cbfbc03062a086102bf7b80cfedb80f16c37
3
+ size 17082854
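The tokenizer_config.json diff below drops the expanded AddedToken definition of mask_token together with the name_or_path and special_tokens_map_file fields, and instead declares every special token in an added_tokens_decoder map keyed by token id (0–3 for <s>, <pad>, </s>, <unk>, and 250001 for <mask>), alongside a new clean_up_tokenization_spaces flag. A minimal sketch for loading the updated tokenizer and confirming that the special tokens resolve as expected is shown here; the local path is a placeholder, and it assumes a transformers release recent enough to understand added_tokens_decoder.

```python
from transformers import AutoTokenizer

# Placeholder path: point this at a local clone of this repository.
tokenizer = AutoTokenizer.from_pretrained("./path/to/local/clone")

print(type(tokenizer).__name__)    # an XLM-RoBERTa tokenizer class
print(tokenizer.mask_token)        # <mask>
print(tokenizer.mask_token_id)     # expected 250001, per added_tokens_decoder
print(tokenizer.model_max_length)  # 512

# Special tokens declared in added_tokens_decoder, keyed by token id.
for token_id, added_token in sorted(tokenizer.added_tokens_decoder.items()):
    print(token_id, repr(added_token))
```

The same call works with the repository's hub identifier in place of a local path.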
tokenizer_config.json CHANGED
@@ -1,21 +1,55 @@
1
  {
2
  "add_prefix_space": true,
3
  "bos_token": "<s>",
 
4
  "cls_token": "<s>",
5
  "eos_token": "</s>",
6
- "mask_token": {
7
- "__type": "AddedToken",
8
- "content": "<mask>",
9
- "lstrip": true,
10
- "normalized": true,
11
- "rstrip": false,
12
- "single_word": false
13
- },
14
  "model_max_length": 512,
15
- "name_or_path": "xlm-roberta-large",
16
  "pad_token": "<pad>",
17
  "sep_token": "</s>",
18
- "special_tokens_map_file": null,
19
  "strip_accent": false,
20
  "tokenizer_class": "XLMRobertaTokenizer",
21
  "unk_token": "<unk>"
 
1
  {
2
  "add_prefix_space": true,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "250001": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
  "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": true,
47
  "cls_token": "<s>",
48
  "eos_token": "</s>",
49
+ "mask_token": "<mask>",
50
  "model_max_length": 512,
 
51
  "pad_token": "<pad>",
52
  "sep_token": "</s>",
 
53
  "strip_accent": false,
54
  "tokenizer_class": "XLMRobertaTokenizer",
55
  "unk_token": "<unk>"
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bb87f568d0ad43eded222ce946cb43af506e51d22b520c14b13a681be169babd
3
- size 3439
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:296a9074f10521f17345eee926fdaec197e411506a306ca2492cb9e8ac592ccd
3
+ size 4920
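Both tokenizer.json and training_args.bin are tracked with Git LFS, so the commit only updates their pointer files: a sha256 oid plus a byte size. Assuming the real artifacts have been fetched (for example with git lfs pull), a short sketch like the following can confirm that the downloaded files match the pointers recorded above.

```python
import hashlib
import os

def check_lfs_pointer(path: str, expected_oid: str, expected_size: int) -> bool:
    """Compare a local file against the sha256 oid and size from its LFS pointer."""
    sha = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)
    ok = sha.hexdigest() == expected_oid and os.path.getsize(path) == expected_size
    print(f"{path}: match={ok}")
    return ok

# oid and size values copied from the pointer files in this commit.
check_lfs_pointer("tokenizer.json",
                  "9a4661b2cb6b8a1007906509fe18cbfbc03062a086102bf7b80cfedb80f16c37",
                  17082854)
check_lfs_pointer("training_args.bin",
                  "296a9074f10521f17345eee926fdaec197e411506a306ca2492cb9e8ac592ccd",
                  4920)
```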