BramVanroy committed on
Commit
74f3fb9
1 Parent(s): 39daf5f

Model save

Browse files
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ base_model: BramVanroy/fietje-2b
4
+ tags:
5
+ - trl
6
+ - sft
7
+ - generated_from_trainer
8
+ datasets:
9
+ - generator
10
+ model-index:
11
+ - name: fietje-2b-sft
12
+ results: []
13
+ ---
14
+
15
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
16
+ should probably proofread and complete it, then remove this comment. -->
17
+
18
+ # fietje-2b-sft
19
+
20
+ This model is a fine-tuned version of [BramVanroy/fietje-2b](https://huggingface.co/BramVanroy/fietje-2b) on the generator dataset.
21
+ It achieves the following results on the evaluation set:
22
+ - Loss: 0.8818
23
+
24
+ ## Model description
25
+
26
+ More information needed
27
+
28
+ ## Intended uses & limitations
29
+
30
+ More information needed
31
+
32
+ ## Training and evaluation data
33
+
34
+ More information needed
35
+
36
+ ## Training procedure
37
+
38
+ ### Training hyperparameters
39
+
40
+ The following hyperparameters were used during training:
41
+ - learning_rate: 6e-05
42
+ - train_batch_size: 42
43
+ - eval_batch_size: 42
44
+ - seed: 42
45
+ - distributed_type: multi-GPU
46
+ - num_devices: 16
47
+ - total_train_batch_size: 672
48
+ - total_eval_batch_size: 672
49
+ - optimizer: Adam with betas=(0.9,0.98) and epsilon=1e-07
50
+ - lr_scheduler_type: cosine
51
+ - lr_scheduler_warmup_ratio: 0.1
52
+ - num_epochs: 3.0
53
+
54
+ ### Training results
55
+
56
+ | Training Loss | Epoch | Step | Validation Loss |
57
+ |:-------------:|:-----:|:----:|:---------------:|
58
+ | 0.9325 | 1.0 | 178 | 0.9060 |
59
+ | 0.8687 | 2.0 | 356 | 0.8850 |
60
+ | 0.8385 | 3.0 | 534 | 0.8818 |
61
+
62
+
63
+ ### Framework versions
64
+
65
+ - Transformers 4.39.1
66
+ - Pytorch 2.1.2+cu121
67
+ - Datasets 2.18.0
68
+ - Tokenizers 0.15.2
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "train_loss": 0.9222131907270196,
4
+ "train_runtime": 33956.1835,
5
+ "train_samples": 201571,
6
+ "train_samples_per_second": 10.545,
7
+ "train_steps_per_second": 0.016
8
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50295,
4
+ "eos_token_id": 50296,
5
+ "pad_token_id": 50296,
6
+ "transformers_version": "4.39.1"
7
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa5fdc0699ada9a30d5743b6cd7fe8fb8d47fa11d8aab9ad200274f4506d5377
3
+ size 4990961488
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d51beda028780b6ce995b6aaf725a35a176df8cc2acec3649e095dfb5f5fc660
3
+ size 559207842
model.safetensors.index.json ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 5550119154
4
+ },
5
+ "weight_map": {
6
+ "lm_head.bias": "model-00002-of-00002.safetensors",
7
+ "lm_head.weight": "model-00002-of-00002.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
9
+ "model.final_layernorm.bias": "model-00002-of-00002.safetensors",
10
+ "model.final_layernorm.weight": "model-00002-of-00002.safetensors",
11
+ "model.layers.0.input_layernorm.bias": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.0.self_attn.dense.bias": "model-00001-of-00002.safetensors",
18
+ "model.layers.0.self_attn.dense.weight": "model-00001-of-00002.safetensors",
19
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
20
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
22
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
24
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.input_layernorm.bias": "model-00001-of-00002.safetensors",
26
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors",
28
+ "model.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors",
30
+ "model.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.1.self_attn.dense.bias": "model-00001-of-00002.safetensors",
32
+ "model.layers.1.self_attn.dense.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
34
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
36
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
37
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
38
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.10.input_layernorm.bias": "model-00001-of-00002.safetensors",
40
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
41
+ "model.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors",
42
+ "model.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
43
+ "model.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors",
44
+ "model.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
45
+ "model.layers.10.self_attn.dense.bias": "model-00001-of-00002.safetensors",
46
+ "model.layers.10.self_attn.dense.weight": "model-00001-of-00002.safetensors",
47
+ "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
48
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
49
+ "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
50
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
51
+ "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
52
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
53
+ "model.layers.11.input_layernorm.bias": "model-00001-of-00002.safetensors",
54
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
55
+ "model.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors",
56
+ "model.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
57
+ "model.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors",
58
+ "model.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
59
+ "model.layers.11.self_attn.dense.bias": "model-00001-of-00002.safetensors",
60
+ "model.layers.11.self_attn.dense.weight": "model-00001-of-00002.safetensors",
61
+ "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
62
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
63
+ "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
64
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
65
+ "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
66
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
67
+ "model.layers.12.input_layernorm.bias": "model-00001-of-00002.safetensors",
68
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
69
+ "model.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors",
70
+ "model.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
71
+ "model.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors",
72
+ "model.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
73
+ "model.layers.12.self_attn.dense.bias": "model-00001-of-00002.safetensors",
74
+ "model.layers.12.self_attn.dense.weight": "model-00001-of-00002.safetensors",
75
+ "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
76
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
77
+ "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
78
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
79
+ "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
80
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
81
+ "model.layers.13.input_layernorm.bias": "model-00001-of-00002.safetensors",
82
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
83
+ "model.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors",
84
+ "model.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors",
85
+ "model.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors",
86
+ "model.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
87
+ "model.layers.13.self_attn.dense.bias": "model-00001-of-00002.safetensors",
88
+ "model.layers.13.self_attn.dense.weight": "model-00001-of-00002.safetensors",
89
+ "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
90
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
91
+ "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
92
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
93
+ "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
94
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
95
+ "model.layers.14.input_layernorm.bias": "model-00001-of-00002.safetensors",
96
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
97
+ "model.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors",
98
+ "model.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
99
+ "model.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors",
100
+ "model.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
101
+ "model.layers.14.self_attn.dense.bias": "model-00001-of-00002.safetensors",
102
+ "model.layers.14.self_attn.dense.weight": "model-00001-of-00002.safetensors",
103
+ "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
104
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
105
+ "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
106
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
107
+ "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
108
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
109
+ "model.layers.15.input_layernorm.bias": "model-00001-of-00002.safetensors",
110
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
111
+ "model.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors",
112
+ "model.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
113
+ "model.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors",
114
+ "model.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
115
+ "model.layers.15.self_attn.dense.bias": "model-00001-of-00002.safetensors",
116
+ "model.layers.15.self_attn.dense.weight": "model-00001-of-00002.safetensors",
117
+ "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
118
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
119
+ "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
120
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
121
+ "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
122
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
123
+ "model.layers.16.input_layernorm.bias": "model-00001-of-00002.safetensors",
124
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
125
+ "model.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors",
126
+ "model.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
127
+ "model.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors",
128
+ "model.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
129
+ "model.layers.16.self_attn.dense.bias": "model-00001-of-00002.safetensors",
130
+ "model.layers.16.self_attn.dense.weight": "model-00001-of-00002.safetensors",
131
+ "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
132
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
133
+ "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
134
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
135
+ "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
136
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
137
+ "model.layers.17.input_layernorm.bias": "model-00001-of-00002.safetensors",
138
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
139
+ "model.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors",
140
+ "model.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
141
+ "model.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors",
142
+ "model.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
143
+ "model.layers.17.self_attn.dense.bias": "model-00001-of-00002.safetensors",
144
+ "model.layers.17.self_attn.dense.weight": "model-00001-of-00002.safetensors",
145
+ "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
146
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
147
+ "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
148
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
149
+ "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
150
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
151
+ "model.layers.18.input_layernorm.bias": "model-00001-of-00002.safetensors",
152
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
153
+ "model.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors",
154
+ "model.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
155
+ "model.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors",
156
+ "model.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
157
+ "model.layers.18.self_attn.dense.bias": "model-00001-of-00002.safetensors",
158
+ "model.layers.18.self_attn.dense.weight": "model-00001-of-00002.safetensors",
159
+ "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
160
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
161
+ "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
162
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
163
+ "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
164
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
165
+ "model.layers.19.input_layernorm.bias": "model-00001-of-00002.safetensors",
166
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
167
+ "model.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors",
168
+ "model.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors",
169
+ "model.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors",
170
+ "model.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
171
+ "model.layers.19.self_attn.dense.bias": "model-00001-of-00002.safetensors",
172
+ "model.layers.19.self_attn.dense.weight": "model-00001-of-00002.safetensors",
173
+ "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
174
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
175
+ "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
176
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
177
+ "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
178
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
179
+ "model.layers.2.input_layernorm.bias": "model-00001-of-00002.safetensors",
180
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
181
+ "model.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors",
182
+ "model.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
183
+ "model.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors",
184
+ "model.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
185
+ "model.layers.2.self_attn.dense.bias": "model-00001-of-00002.safetensors",
186
+ "model.layers.2.self_attn.dense.weight": "model-00001-of-00002.safetensors",
187
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
188
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
189
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
190
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
191
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
192
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
193
+ "model.layers.20.input_layernorm.bias": "model-00001-of-00002.safetensors",
194
+ "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
195
+ "model.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors",
196
+ "model.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
197
+ "model.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors",
198
+ "model.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
199
+ "model.layers.20.self_attn.dense.bias": "model-00001-of-00002.safetensors",
200
+ "model.layers.20.self_attn.dense.weight": "model-00001-of-00002.safetensors",
201
+ "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
202
+ "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
203
+ "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
204
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
205
+ "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
206
+ "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
207
+ "model.layers.21.input_layernorm.bias": "model-00001-of-00002.safetensors",
208
+ "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
209
+ "model.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors",
210
+ "model.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
211
+ "model.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors",
212
+ "model.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
213
+ "model.layers.21.self_attn.dense.bias": "model-00001-of-00002.safetensors",
214
+ "model.layers.21.self_attn.dense.weight": "model-00001-of-00002.safetensors",
215
+ "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
216
+ "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
217
+ "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
218
+ "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
219
+ "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
220
+ "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
221
+ "model.layers.22.input_layernorm.bias": "model-00001-of-00002.safetensors",
222
+ "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
223
+ "model.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors",
224
+ "model.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors",
225
+ "model.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors",
226
+ "model.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors",
227
+ "model.layers.22.self_attn.dense.bias": "model-00001-of-00002.safetensors",
228
+ "model.layers.22.self_attn.dense.weight": "model-00001-of-00002.safetensors",
229
+ "model.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
230
+ "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
231
+ "model.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
232
+ "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
233
+ "model.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
234
+ "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
235
+ "model.layers.23.input_layernorm.bias": "model-00001-of-00002.safetensors",
236
+ "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
237
+ "model.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors",
238
+ "model.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors",
239
+ "model.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors",
240
+ "model.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors",
241
+ "model.layers.23.self_attn.dense.bias": "model-00001-of-00002.safetensors",
242
+ "model.layers.23.self_attn.dense.weight": "model-00001-of-00002.safetensors",
243
+ "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
244
+ "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
245
+ "model.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
246
+ "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
247
+ "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
248
+ "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
249
+ "model.layers.24.input_layernorm.bias": "model-00001-of-00002.safetensors",
250
+ "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
251
+ "model.layers.24.mlp.fc1.bias": "model-00001-of-00002.safetensors",
252
+ "model.layers.24.mlp.fc1.weight": "model-00001-of-00002.safetensors",
253
+ "model.layers.24.mlp.fc2.bias": "model-00001-of-00002.safetensors",
254
+ "model.layers.24.mlp.fc2.weight": "model-00001-of-00002.safetensors",
255
+ "model.layers.24.self_attn.dense.bias": "model-00001-of-00002.safetensors",
256
+ "model.layers.24.self_attn.dense.weight": "model-00001-of-00002.safetensors",
257
+ "model.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
258
+ "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
259
+ "model.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
260
+ "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
261
+ "model.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
262
+ "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
263
+ "model.layers.25.input_layernorm.bias": "model-00001-of-00002.safetensors",
264
+ "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
265
+ "model.layers.25.mlp.fc1.bias": "model-00001-of-00002.safetensors",
266
+ "model.layers.25.mlp.fc1.weight": "model-00001-of-00002.safetensors",
267
+ "model.layers.25.mlp.fc2.bias": "model-00001-of-00002.safetensors",
268
+ "model.layers.25.mlp.fc2.weight": "model-00001-of-00002.safetensors",
269
+ "model.layers.25.self_attn.dense.bias": "model-00001-of-00002.safetensors",
270
+ "model.layers.25.self_attn.dense.weight": "model-00001-of-00002.safetensors",
271
+ "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
272
+ "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
273
+ "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
274
+ "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
275
+ "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
276
+ "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
277
+ "model.layers.26.input_layernorm.bias": "model-00001-of-00002.safetensors",
278
+ "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
279
+ "model.layers.26.mlp.fc1.bias": "model-00001-of-00002.safetensors",
280
+ "model.layers.26.mlp.fc1.weight": "model-00001-of-00002.safetensors",
281
+ "model.layers.26.mlp.fc2.bias": "model-00001-of-00002.safetensors",
282
+ "model.layers.26.mlp.fc2.weight": "model-00001-of-00002.safetensors",
283
+ "model.layers.26.self_attn.dense.bias": "model-00001-of-00002.safetensors",
284
+ "model.layers.26.self_attn.dense.weight": "model-00001-of-00002.safetensors",
285
+ "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
286
+ "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
287
+ "model.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
288
+ "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
289
+ "model.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
290
+ "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
291
+ "model.layers.27.input_layernorm.bias": "model-00001-of-00002.safetensors",
292
+ "model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
293
+ "model.layers.27.mlp.fc1.bias": "model-00001-of-00002.safetensors",
294
+ "model.layers.27.mlp.fc1.weight": "model-00001-of-00002.safetensors",
295
+ "model.layers.27.mlp.fc2.bias": "model-00001-of-00002.safetensors",
296
+ "model.layers.27.mlp.fc2.weight": "model-00001-of-00002.safetensors",
297
+ "model.layers.27.self_attn.dense.bias": "model-00001-of-00002.safetensors",
298
+ "model.layers.27.self_attn.dense.weight": "model-00001-of-00002.safetensors",
299
+ "model.layers.27.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
300
+ "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
301
+ "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
302
+ "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
303
+ "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
304
+ "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
305
+ "model.layers.28.input_layernorm.bias": "model-00001-of-00002.safetensors",
306
+ "model.layers.28.input_layernorm.weight": "model-00001-of-00002.safetensors",
307
+ "model.layers.28.mlp.fc1.bias": "model-00001-of-00002.safetensors",
308
+ "model.layers.28.mlp.fc1.weight": "model-00001-of-00002.safetensors",
309
+ "model.layers.28.mlp.fc2.bias": "model-00001-of-00002.safetensors",
310
+ "model.layers.28.mlp.fc2.weight": "model-00001-of-00002.safetensors",
311
+ "model.layers.28.self_attn.dense.bias": "model-00001-of-00002.safetensors",
312
+ "model.layers.28.self_attn.dense.weight": "model-00001-of-00002.safetensors",
313
+ "model.layers.28.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
314
+ "model.layers.28.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
315
+ "model.layers.28.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
316
+ "model.layers.28.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
317
+ "model.layers.28.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
318
+ "model.layers.28.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
319
+ "model.layers.29.input_layernorm.bias": "model-00001-of-00002.safetensors",
320
+ "model.layers.29.input_layernorm.weight": "model-00001-of-00002.safetensors",
321
+ "model.layers.29.mlp.fc1.bias": "model-00001-of-00002.safetensors",
322
+ "model.layers.29.mlp.fc1.weight": "model-00001-of-00002.safetensors",
323
+ "model.layers.29.mlp.fc2.bias": "model-00001-of-00002.safetensors",
324
+ "model.layers.29.mlp.fc2.weight": "model-00001-of-00002.safetensors",
325
+ "model.layers.29.self_attn.dense.bias": "model-00001-of-00002.safetensors",
326
+ "model.layers.29.self_attn.dense.weight": "model-00001-of-00002.safetensors",
327
+ "model.layers.29.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
328
+ "model.layers.29.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
329
+ "model.layers.29.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
330
+ "model.layers.29.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
331
+ "model.layers.29.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
332
+ "model.layers.29.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
333
+ "model.layers.3.input_layernorm.bias": "model-00001-of-00002.safetensors",
334
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
335
+ "model.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors",
336
+ "model.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors",
337
+ "model.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors",
338
+ "model.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
339
+ "model.layers.3.self_attn.dense.bias": "model-00001-of-00002.safetensors",
340
+ "model.layers.3.self_attn.dense.weight": "model-00001-of-00002.safetensors",
341
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
342
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
343
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
344
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
345
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
346
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
347
+ "model.layers.30.input_layernorm.bias": "model-00002-of-00002.safetensors",
348
+ "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
349
+ "model.layers.30.mlp.fc1.bias": "model-00002-of-00002.safetensors",
350
+ "model.layers.30.mlp.fc1.weight": "model-00002-of-00002.safetensors",
351
+ "model.layers.30.mlp.fc2.bias": "model-00002-of-00002.safetensors",
352
+ "model.layers.30.mlp.fc2.weight": "model-00002-of-00002.safetensors",
353
+ "model.layers.30.self_attn.dense.bias": "model-00002-of-00002.safetensors",
354
+ "model.layers.30.self_attn.dense.weight": "model-00002-of-00002.safetensors",
355
+ "model.layers.30.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
356
+ "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
357
+ "model.layers.30.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
358
+ "model.layers.30.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
359
+ "model.layers.30.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
360
+ "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
361
+ "model.layers.31.input_layernorm.bias": "model-00002-of-00002.safetensors",
362
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
363
+ "model.layers.31.mlp.fc1.bias": "model-00002-of-00002.safetensors",
364
+ "model.layers.31.mlp.fc1.weight": "model-00002-of-00002.safetensors",
365
+ "model.layers.31.mlp.fc2.bias": "model-00002-of-00002.safetensors",
366
+ "model.layers.31.mlp.fc2.weight": "model-00002-of-00002.safetensors",
367
+ "model.layers.31.self_attn.dense.bias": "model-00002-of-00002.safetensors",
368
+ "model.layers.31.self_attn.dense.weight": "model-00002-of-00002.safetensors",
369
+ "model.layers.31.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
370
+ "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
371
+ "model.layers.31.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
372
+ "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
373
+ "model.layers.31.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
374
+ "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
375
+ "model.layers.4.input_layernorm.bias": "model-00001-of-00002.safetensors",
376
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
377
+ "model.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors",
378
+ "model.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
379
+ "model.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors",
380
+ "model.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
381
+ "model.layers.4.self_attn.dense.bias": "model-00001-of-00002.safetensors",
382
+ "model.layers.4.self_attn.dense.weight": "model-00001-of-00002.safetensors",
383
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
384
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
385
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
386
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
387
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
388
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
389
+ "model.layers.5.input_layernorm.bias": "model-00001-of-00002.safetensors",
390
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
391
+ "model.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors",
392
+ "model.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
393
+ "model.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors",
394
+ "model.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
395
+ "model.layers.5.self_attn.dense.bias": "model-00001-of-00002.safetensors",
396
+ "model.layers.5.self_attn.dense.weight": "model-00001-of-00002.safetensors",
397
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
398
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
399
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
400
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
401
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
402
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
403
+ "model.layers.6.input_layernorm.bias": "model-00001-of-00002.safetensors",
404
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
405
+ "model.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors",
406
+ "model.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
407
+ "model.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors",
408
+ "model.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
409
+ "model.layers.6.self_attn.dense.bias": "model-00001-of-00002.safetensors",
410
+ "model.layers.6.self_attn.dense.weight": "model-00001-of-00002.safetensors",
411
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
412
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
413
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
414
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
415
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
416
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
417
+ "model.layers.7.input_layernorm.bias": "model-00001-of-00002.safetensors",
418
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
419
+ "model.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors",
420
+ "model.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
421
+ "model.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors",
422
+ "model.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
423
+ "model.layers.7.self_attn.dense.bias": "model-00001-of-00002.safetensors",
424
+ "model.layers.7.self_attn.dense.weight": "model-00001-of-00002.safetensors",
425
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
426
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
427
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
428
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
429
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
430
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
431
+ "model.layers.8.input_layernorm.bias": "model-00001-of-00002.safetensors",
432
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
433
+ "model.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors",
434
+ "model.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
435
+ "model.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors",
436
+ "model.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
437
+ "model.layers.8.self_attn.dense.bias": "model-00001-of-00002.safetensors",
438
+ "model.layers.8.self_attn.dense.weight": "model-00001-of-00002.safetensors",
439
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
440
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
441
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
442
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
443
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
444
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
445
+ "model.layers.9.input_layernorm.bias": "model-00001-of-00002.safetensors",
446
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
447
+ "model.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors",
448
+ "model.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
449
+ "model.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors",
450
+ "model.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
451
+ "model.layers.9.self_attn.dense.bias": "model-00001-of-00002.safetensors",
452
+ "model.layers.9.self_attn.dense.weight": "model-00001-of-00002.safetensors",
453
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
454
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
455
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
456
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
457
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
458
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors"
459
+ }
460
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "train_loss": 0.9222131907270196,
4
+ "train_runtime": 33956.1835,
5
+ "train_samples": 201571,
6
+ "train_samples_per_second": 10.545,
7
+ "train_steps_per_second": 0.016
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,3792 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 534,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01,
13
+ "grad_norm": 307.2925252001398,
14
+ "learning_rate": 1.111111111111111e-06,
15
+ "loss": 2.14,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.01,
20
+ "grad_norm": 348.5579299962218,
21
+ "learning_rate": 2.222222222222222e-06,
22
+ "loss": 2.1592,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.02,
27
+ "grad_norm": 295.8217735171242,
28
+ "learning_rate": 3.3333333333333333e-06,
29
+ "loss": 2.1112,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.02,
34
+ "grad_norm": 320.95278420676283,
35
+ "learning_rate": 4.444444444444444e-06,
36
+ "loss": 2.0727,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.03,
41
+ "grad_norm": 273.6994356101882,
42
+ "learning_rate": 5.555555555555555e-06,
43
+ "loss": 1.8522,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.03,
48
+ "grad_norm": 137.18652764811333,
49
+ "learning_rate": 6.666666666666667e-06,
50
+ "loss": 1.5844,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.04,
55
+ "grad_norm": 8.105539597979915,
56
+ "learning_rate": 7.777777777777777e-06,
57
+ "loss": 1.4424,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.04,
62
+ "grad_norm": 4.0951798632526275,
63
+ "learning_rate": 8.888888888888888e-06,
64
+ "loss": 1.4244,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.05,
69
+ "grad_norm": 2.6078027918916717,
70
+ "learning_rate": 9.999999999999999e-06,
71
+ "loss": 1.3763,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.06,
76
+ "grad_norm": 1.9798839986563381,
77
+ "learning_rate": 1.111111111111111e-05,
78
+ "loss": 1.3583,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.06,
83
+ "grad_norm": 2.065159602417454,
84
+ "learning_rate": 1.2222222222222222e-05,
85
+ "loss": 1.3335,
86
+ "step": 11
87
+ },
88
+ {
89
+ "epoch": 0.07,
90
+ "grad_norm": 1.3002391180584008,
91
+ "learning_rate": 1.3333333333333333e-05,
92
+ "loss": 1.309,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 0.07,
97
+ "grad_norm": 2.2825778137111357,
98
+ "learning_rate": 1.4444444444444444e-05,
99
+ "loss": 1.2932,
100
+ "step": 13
101
+ },
102
+ {
103
+ "epoch": 0.08,
104
+ "grad_norm": 1.0205928394685457,
105
+ "learning_rate": 1.5555555555555555e-05,
106
+ "loss": 1.2689,
107
+ "step": 14
108
+ },
109
+ {
110
+ "epoch": 0.08,
111
+ "grad_norm": 1.3373431587246087,
112
+ "learning_rate": 1.6666666666666667e-05,
113
+ "loss": 1.2489,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.09,
118
+ "grad_norm": 0.8043774075647928,
119
+ "learning_rate": 1.7777777777777777e-05,
120
+ "loss": 1.2302,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.1,
125
+ "grad_norm": 0.8741531962350515,
126
+ "learning_rate": 1.888888888888889e-05,
127
+ "loss": 1.2195,
128
+ "step": 17
129
+ },
130
+ {
131
+ "epoch": 0.1,
132
+ "grad_norm": 0.7028376256413013,
133
+ "learning_rate": 1.9999999999999998e-05,
134
+ "loss": 1.2027,
135
+ "step": 18
136
+ },
137
+ {
138
+ "epoch": 0.11,
139
+ "grad_norm": 0.6648050831010063,
140
+ "learning_rate": 2.111111111111111e-05,
141
+ "loss": 1.1887,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 0.11,
146
+ "grad_norm": 0.7354468182477213,
147
+ "learning_rate": 2.222222222222222e-05,
148
+ "loss": 1.1704,
149
+ "step": 20
150
+ },
151
+ {
152
+ "epoch": 0.12,
153
+ "grad_norm": 0.7839624362161648,
154
+ "learning_rate": 2.3333333333333336e-05,
155
+ "loss": 1.1654,
156
+ "step": 21
157
+ },
158
+ {
159
+ "epoch": 0.12,
160
+ "grad_norm": 0.7752033041911668,
161
+ "learning_rate": 2.4444444444444445e-05,
162
+ "loss": 1.1544,
163
+ "step": 22
164
+ },
165
+ {
166
+ "epoch": 0.13,
167
+ "grad_norm": 0.7396236066452132,
168
+ "learning_rate": 2.5555555555555557e-05,
169
+ "loss": 1.138,
170
+ "step": 23
171
+ },
172
+ {
173
+ "epoch": 0.13,
174
+ "grad_norm": 0.748517600591021,
175
+ "learning_rate": 2.6666666666666667e-05,
176
+ "loss": 1.133,
177
+ "step": 24
178
+ },
179
+ {
180
+ "epoch": 0.14,
181
+ "grad_norm": 0.7065074963471533,
182
+ "learning_rate": 2.777777777777778e-05,
183
+ "loss": 1.1119,
184
+ "step": 25
185
+ },
186
+ {
187
+ "epoch": 0.15,
188
+ "grad_norm": 0.7289352652301433,
189
+ "learning_rate": 2.8888888888888888e-05,
190
+ "loss": 1.1132,
191
+ "step": 26
192
+ },
193
+ {
194
+ "epoch": 0.15,
195
+ "grad_norm": 0.7324972388581718,
196
+ "learning_rate": 3e-05,
197
+ "loss": 1.1008,
198
+ "step": 27
199
+ },
200
+ {
201
+ "epoch": 0.16,
202
+ "grad_norm": 0.7020306466156484,
203
+ "learning_rate": 3.111111111111111e-05,
204
+ "loss": 1.1067,
205
+ "step": 28
206
+ },
207
+ {
208
+ "epoch": 0.16,
209
+ "grad_norm": 0.7080095311543003,
210
+ "learning_rate": 3.222222222222223e-05,
211
+ "loss": 1.1028,
212
+ "step": 29
213
+ },
214
+ {
215
+ "epoch": 0.17,
216
+ "grad_norm": 0.7149252394051553,
217
+ "learning_rate": 3.3333333333333335e-05,
218
+ "loss": 1.0953,
219
+ "step": 30
220
+ },
221
+ {
222
+ "epoch": 0.17,
223
+ "grad_norm": 0.7285982439983161,
224
+ "learning_rate": 3.444444444444445e-05,
225
+ "loss": 1.0859,
226
+ "step": 31
227
+ },
228
+ {
229
+ "epoch": 0.18,
230
+ "grad_norm": 0.7057181936211316,
231
+ "learning_rate": 3.555555555555555e-05,
232
+ "loss": 1.0829,
233
+ "step": 32
234
+ },
235
+ {
236
+ "epoch": 0.19,
237
+ "grad_norm": 0.718290068719298,
238
+ "learning_rate": 3.666666666666667e-05,
239
+ "loss": 1.0658,
240
+ "step": 33
241
+ },
242
+ {
243
+ "epoch": 0.19,
244
+ "grad_norm": 0.6998775916382852,
245
+ "learning_rate": 3.777777777777778e-05,
246
+ "loss": 1.0687,
247
+ "step": 34
248
+ },
249
+ {
250
+ "epoch": 0.2,
251
+ "grad_norm": 0.6851057008807085,
252
+ "learning_rate": 3.888888888888889e-05,
253
+ "loss": 1.0701,
254
+ "step": 35
255
+ },
256
+ {
257
+ "epoch": 0.2,
258
+ "grad_norm": 0.6724489509428345,
259
+ "learning_rate": 3.9999999999999996e-05,
260
+ "loss": 1.0675,
261
+ "step": 36
262
+ },
263
+ {
264
+ "epoch": 0.21,
265
+ "grad_norm": 0.6480552435697329,
266
+ "learning_rate": 4.1111111111111116e-05,
267
+ "loss": 1.0537,
268
+ "step": 37
269
+ },
270
+ {
271
+ "epoch": 0.21,
272
+ "grad_norm": 0.6383559954105056,
273
+ "learning_rate": 4.222222222222222e-05,
274
+ "loss": 1.0463,
275
+ "step": 38
276
+ },
277
+ {
278
+ "epoch": 0.22,
279
+ "grad_norm": 0.6029067517028285,
280
+ "learning_rate": 4.3333333333333334e-05,
281
+ "loss": 1.0422,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 0.22,
286
+ "grad_norm": 0.5709974683191396,
287
+ "learning_rate": 4.444444444444444e-05,
288
+ "loss": 1.037,
289
+ "step": 40
290
+ },
291
+ {
292
+ "epoch": 0.23,
293
+ "grad_norm": 0.5363586512263623,
294
+ "learning_rate": 4.555555555555556e-05,
295
+ "loss": 1.0249,
296
+ "step": 41
297
+ },
298
+ {
299
+ "epoch": 0.24,
300
+ "grad_norm": 0.5064701225673895,
301
+ "learning_rate": 4.666666666666667e-05,
302
+ "loss": 1.0269,
303
+ "step": 42
304
+ },
305
+ {
306
+ "epoch": 0.24,
307
+ "grad_norm": 0.4492710010511058,
308
+ "learning_rate": 4.777777777777778e-05,
309
+ "loss": 1.014,
310
+ "step": 43
311
+ },
312
+ {
313
+ "epoch": 0.25,
314
+ "grad_norm": 0.4154607764986574,
315
+ "learning_rate": 4.888888888888889e-05,
316
+ "loss": 1.0209,
317
+ "step": 44
318
+ },
319
+ {
320
+ "epoch": 0.25,
321
+ "grad_norm": 0.3647804068160132,
322
+ "learning_rate": 5e-05,
323
+ "loss": 1.019,
324
+ "step": 45
325
+ },
326
+ {
327
+ "epoch": 0.26,
328
+ "grad_norm": 0.3247880218369176,
329
+ "learning_rate": 5.1111111111111115e-05,
330
+ "loss": 1.0142,
331
+ "step": 46
332
+ },
333
+ {
334
+ "epoch": 0.26,
335
+ "grad_norm": 0.3054048294238843,
336
+ "learning_rate": 5.222222222222222e-05,
337
+ "loss": 1.005,
338
+ "step": 47
339
+ },
340
+ {
341
+ "epoch": 0.27,
342
+ "grad_norm": 0.2911566588563454,
343
+ "learning_rate": 5.333333333333333e-05,
344
+ "loss": 1.0182,
345
+ "step": 48
346
+ },
347
+ {
348
+ "epoch": 0.28,
349
+ "grad_norm": 0.30175247842711633,
350
+ "learning_rate": 5.4444444444444446e-05,
351
+ "loss": 1.003,
352
+ "step": 49
353
+ },
354
+ {
355
+ "epoch": 0.28,
356
+ "grad_norm": 0.2957438299690764,
357
+ "learning_rate": 5.555555555555556e-05,
358
+ "loss": 1.0018,
359
+ "step": 50
360
+ },
361
+ {
362
+ "epoch": 0.29,
363
+ "grad_norm": 0.222811420448816,
364
+ "learning_rate": 5.6666666666666664e-05,
365
+ "loss": 0.9982,
366
+ "step": 51
367
+ },
368
+ {
369
+ "epoch": 0.29,
370
+ "grad_norm": 0.19282721337866324,
371
+ "learning_rate": 5.7777777777777776e-05,
372
+ "loss": 1.003,
373
+ "step": 52
374
+ },
375
+ {
376
+ "epoch": 0.3,
377
+ "grad_norm": 0.2271139316205582,
378
+ "learning_rate": 5.888888888888889e-05,
379
+ "loss": 0.9884,
380
+ "step": 53
381
+ },
382
+ {
383
+ "epoch": 0.3,
384
+ "grad_norm": 0.22400303611020278,
385
+ "learning_rate": 6e-05,
386
+ "loss": 0.9933,
387
+ "step": 54
388
+ },
389
+ {
390
+ "epoch": 0.31,
391
+ "grad_norm": 0.2640000653332148,
392
+ "learning_rate": 5.999935744992388e-05,
393
+ "loss": 0.9898,
394
+ "step": 55
395
+ },
396
+ {
397
+ "epoch": 0.31,
398
+ "grad_norm": 0.29683962789566454,
399
+ "learning_rate": 5.999742982722021e-05,
400
+ "loss": 0.9894,
401
+ "step": 56
402
+ },
403
+ {
404
+ "epoch": 0.32,
405
+ "grad_norm": 0.23953312072588218,
406
+ "learning_rate": 5.999421721446195e-05,
407
+ "loss": 0.9891,
408
+ "step": 57
409
+ },
410
+ {
411
+ "epoch": 0.33,
412
+ "grad_norm": 0.22760431232110737,
413
+ "learning_rate": 5.9989719749266715e-05,
414
+ "loss": 0.9794,
415
+ "step": 58
416
+ },
417
+ {
418
+ "epoch": 0.33,
419
+ "grad_norm": 0.28069530624064654,
420
+ "learning_rate": 5.998393762429097e-05,
421
+ "loss": 0.9827,
422
+ "step": 59
423
+ },
424
+ {
425
+ "epoch": 0.34,
426
+ "grad_norm": 0.33248489828390104,
427
+ "learning_rate": 5.997687108722169e-05,
428
+ "loss": 0.9829,
429
+ "step": 60
430
+ },
431
+ {
432
+ "epoch": 0.34,
433
+ "grad_norm": 0.37460157299926583,
434
+ "learning_rate": 5.9968520440765807e-05,
435
+ "loss": 0.9865,
436
+ "step": 61
437
+ },
438
+ {
439
+ "epoch": 0.35,
440
+ "grad_norm": 0.40204606775328766,
441
+ "learning_rate": 5.9958886042637214e-05,
442
+ "loss": 0.9872,
443
+ "step": 62
444
+ },
445
+ {
446
+ "epoch": 0.35,
447
+ "grad_norm": 0.2760468657673376,
448
+ "learning_rate": 5.994796830554148e-05,
449
+ "loss": 0.9825,
450
+ "step": 63
451
+ },
452
+ {
453
+ "epoch": 0.36,
454
+ "grad_norm": 0.25200297636338487,
455
+ "learning_rate": 5.9935767697158103e-05,
456
+ "loss": 0.9761,
457
+ "step": 64
458
+ },
459
+ {
460
+ "epoch": 0.37,
461
+ "grad_norm": 0.3387334098621161,
462
+ "learning_rate": 5.992228474012056e-05,
463
+ "loss": 0.9724,
464
+ "step": 65
465
+ },
466
+ {
467
+ "epoch": 0.37,
468
+ "grad_norm": 0.3352356569193807,
469
+ "learning_rate": 5.990752001199384e-05,
470
+ "loss": 0.9694,
471
+ "step": 66
472
+ },
473
+ {
474
+ "epoch": 0.38,
475
+ "grad_norm": 0.25307126239627026,
476
+ "learning_rate": 5.989147414524976e-05,
477
+ "loss": 0.9751,
478
+ "step": 67
479
+ },
480
+ {
481
+ "epoch": 0.38,
482
+ "grad_norm": 0.28603448445681706,
483
+ "learning_rate": 5.987414782723985e-05,
484
+ "loss": 0.9675,
485
+ "step": 68
486
+ },
487
+ {
488
+ "epoch": 0.39,
489
+ "grad_norm": 0.2512510471162242,
490
+ "learning_rate": 5.985554180016591e-05,
491
+ "loss": 0.9713,
492
+ "step": 69
493
+ },
494
+ {
495
+ "epoch": 0.39,
496
+ "grad_norm": 0.23308838498745504,
497
+ "learning_rate": 5.98356568610482e-05,
498
+ "loss": 0.9675,
499
+ "step": 70
500
+ },
501
+ {
502
+ "epoch": 0.4,
503
+ "grad_norm": 0.2507676754613168,
504
+ "learning_rate": 5.981449386169134e-05,
505
+ "loss": 0.9768,
506
+ "step": 71
507
+ },
508
+ {
509
+ "epoch": 0.4,
510
+ "grad_norm": 0.17834215895890057,
511
+ "learning_rate": 5.979205370864779e-05,
512
+ "loss": 0.9736,
513
+ "step": 72
514
+ },
515
+ {
516
+ "epoch": 0.41,
517
+ "grad_norm": 0.22157097251518248,
518
+ "learning_rate": 5.976833736317901e-05,
519
+ "loss": 0.9761,
520
+ "step": 73
521
+ },
522
+ {
523
+ "epoch": 0.42,
524
+ "grad_norm": 0.19478083716793773,
525
+ "learning_rate": 5.9743345841214316e-05,
526
+ "loss": 0.9578,
527
+ "step": 74
528
+ },
529
+ {
530
+ "epoch": 0.42,
531
+ "grad_norm": 0.23929984172471444,
532
+ "learning_rate": 5.9717080213307314e-05,
533
+ "loss": 0.9637,
534
+ "step": 75
535
+ },
536
+ {
537
+ "epoch": 0.43,
538
+ "grad_norm": 0.1792504123152509,
539
+ "learning_rate": 5.968954160459011e-05,
540
+ "loss": 0.9694,
541
+ "step": 76
542
+ },
543
+ {
544
+ "epoch": 0.43,
545
+ "grad_norm": 0.1942820411854134,
546
+ "learning_rate": 5.966073119472502e-05,
547
+ "loss": 0.9654,
548
+ "step": 77
549
+ },
550
+ {
551
+ "epoch": 0.44,
552
+ "grad_norm": 0.20225817833116058,
553
+ "learning_rate": 5.963065021785414e-05,
554
+ "loss": 0.9568,
555
+ "step": 78
556
+ },
557
+ {
558
+ "epoch": 0.44,
559
+ "grad_norm": 0.17871245685588816,
560
+ "learning_rate": 5.9599299962546375e-05,
561
+ "loss": 0.9672,
562
+ "step": 79
563
+ },
564
+ {
565
+ "epoch": 0.45,
566
+ "grad_norm": 0.19357967707631799,
567
+ "learning_rate": 5.956668177174234e-05,
568
+ "loss": 0.9581,
569
+ "step": 80
570
+ },
571
+ {
572
+ "epoch": 0.46,
573
+ "grad_norm": 0.17703904963665285,
574
+ "learning_rate": 5.953279704269675e-05,
575
+ "loss": 0.9399,
576
+ "step": 81
577
+ },
578
+ {
579
+ "epoch": 0.46,
580
+ "grad_norm": 0.18397738069475075,
581
+ "learning_rate": 5.949764722691864e-05,
582
+ "loss": 0.9582,
583
+ "step": 82
584
+ },
585
+ {
586
+ "epoch": 0.47,
587
+ "grad_norm": 0.1667458999382299,
588
+ "learning_rate": 5.9461233830109117e-05,
589
+ "loss": 0.9574,
590
+ "step": 83
591
+ },
592
+ {
593
+ "epoch": 0.47,
594
+ "grad_norm": 0.15246629979305598,
595
+ "learning_rate": 5.9423558412096914e-05,
596
+ "loss": 0.9624,
597
+ "step": 84
598
+ },
599
+ {
600
+ "epoch": 0.48,
601
+ "grad_norm": 0.147474418280348,
602
+ "learning_rate": 5.938462258677154e-05,
603
+ "loss": 0.9574,
604
+ "step": 85
605
+ },
606
+ {
607
+ "epoch": 0.48,
608
+ "grad_norm": 0.18908685471550463,
609
+ "learning_rate": 5.934442802201417e-05,
610
+ "loss": 0.9559,
611
+ "step": 86
612
+ },
613
+ {
614
+ "epoch": 0.49,
615
+ "grad_norm": 0.15962484297812873,
616
+ "learning_rate": 5.930297643962617e-05,
617
+ "loss": 0.9565,
618
+ "step": 87
619
+ },
620
+ {
621
+ "epoch": 0.49,
622
+ "grad_norm": 0.15626218745541218,
623
+ "learning_rate": 5.926026961525538e-05,
624
+ "loss": 0.9669,
625
+ "step": 88
626
+ },
627
+ {
628
+ "epoch": 0.5,
629
+ "grad_norm": 0.16363056001381104,
630
+ "learning_rate": 5.921630937832001e-05,
631
+ "loss": 0.9575,
632
+ "step": 89
633
+ },
634
+ {
635
+ "epoch": 0.51,
636
+ "grad_norm": 0.17266359267085402,
637
+ "learning_rate": 5.91710976119303e-05,
638
+ "loss": 0.9482,
639
+ "step": 90
640
+ },
641
+ {
642
+ "epoch": 0.51,
643
+ "grad_norm": 0.16154285700465842,
644
+ "learning_rate": 5.9124636252807844e-05,
645
+ "loss": 0.9486,
646
+ "step": 91
647
+ },
648
+ {
649
+ "epoch": 0.52,
650
+ "grad_norm": 0.1505238744935054,
651
+ "learning_rate": 5.907692729120263e-05,
652
+ "loss": 0.9465,
653
+ "step": 92
654
+ },
655
+ {
656
+ "epoch": 0.52,
657
+ "grad_norm": 0.1665515957602505,
658
+ "learning_rate": 5.9027972770807796e-05,
659
+ "loss": 0.9458,
660
+ "step": 93
661
+ },
662
+ {
663
+ "epoch": 0.53,
664
+ "grad_norm": 0.15288744931626277,
665
+ "learning_rate": 5.897777478867205e-05,
666
+ "loss": 0.9513,
667
+ "step": 94
668
+ },
669
+ {
670
+ "epoch": 0.53,
671
+ "grad_norm": 0.13893588088386338,
672
+ "learning_rate": 5.892633549510988e-05,
673
+ "loss": 0.9517,
674
+ "step": 95
675
+ },
676
+ {
677
+ "epoch": 0.54,
678
+ "grad_norm": 0.15518080707211024,
679
+ "learning_rate": 5.887365709360941e-05,
680
+ "loss": 0.956,
681
+ "step": 96
682
+ },
683
+ {
684
+ "epoch": 0.54,
685
+ "grad_norm": 0.16735904200015367,
686
+ "learning_rate": 5.881974184073806e-05,
687
+ "loss": 0.9644,
688
+ "step": 97
689
+ },
690
+ {
691
+ "epoch": 0.55,
692
+ "grad_norm": 0.16211582659946666,
693
+ "learning_rate": 5.876459204604579e-05,
694
+ "loss": 0.947,
695
+ "step": 98
696
+ },
697
+ {
698
+ "epoch": 0.56,
699
+ "grad_norm": 0.16067404496452142,
700
+ "learning_rate": 5.8708210071966266e-05,
701
+ "loss": 0.9493,
702
+ "step": 99
703
+ },
704
+ {
705
+ "epoch": 0.56,
706
+ "grad_norm": 0.16466229984590008,
707
+ "learning_rate": 5.8650598333715604e-05,
708
+ "loss": 0.9525,
709
+ "step": 100
710
+ },
711
+ {
712
+ "epoch": 0.57,
713
+ "grad_norm": 0.15635651077742227,
714
+ "learning_rate": 5.8591759299188915e-05,
715
+ "loss": 0.9462,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 0.57,
720
+ "grad_norm": 0.1445739463712597,
721
+ "learning_rate": 5.853169548885461e-05,
722
+ "loss": 0.9557,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 0.58,
727
+ "grad_norm": 0.16706069442335733,
728
+ "learning_rate": 5.847040947564642e-05,
729
+ "loss": 0.9571,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 0.58,
734
+ "grad_norm": 0.14383289999902457,
735
+ "learning_rate": 5.8407903884853173e-05,
736
+ "loss": 0.9452,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 0.59,
741
+ "grad_norm": 0.1409356084289515,
742
+ "learning_rate": 5.8344181394006345e-05,
743
+ "loss": 0.9452,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 0.6,
748
+ "grad_norm": 0.1697312533943109,
749
+ "learning_rate": 5.827924473276536e-05,
750
+ "loss": 0.9567,
751
+ "step": 106
752
+ },
753
+ {
754
+ "epoch": 0.6,
755
+ "grad_norm": 0.17531082504359882,
756
+ "learning_rate": 5.821309668280065e-05,
757
+ "loss": 0.9462,
758
+ "step": 107
759
+ },
760
+ {
761
+ "epoch": 0.61,
762
+ "grad_norm": 0.1639474813042821,
763
+ "learning_rate": 5.814574007767453e-05,
764
+ "loss": 0.9485,
765
+ "step": 108
766
+ },
767
+ {
768
+ "epoch": 0.61,
769
+ "grad_norm": 0.17850681384081132,
770
+ "learning_rate": 5.807717780271977e-05,
771
+ "loss": 0.9366,
772
+ "step": 109
773
+ },
774
+ {
775
+ "epoch": 0.62,
776
+ "grad_norm": 0.16125613154173873,
777
+ "learning_rate": 5.800741279491605e-05,
778
+ "loss": 0.9451,
779
+ "step": 110
780
+ },
781
+ {
782
+ "epoch": 0.62,
783
+ "grad_norm": 0.197900179672024,
784
+ "learning_rate": 5.7936448042764106e-05,
785
+ "loss": 0.9495,
786
+ "step": 111
787
+ },
788
+ {
789
+ "epoch": 0.63,
790
+ "grad_norm": 0.21076815721637063,
791
+ "learning_rate": 5.7864286586157726e-05,
792
+ "loss": 0.9435,
793
+ "step": 112
794
+ },
795
+ {
796
+ "epoch": 0.63,
797
+ "grad_norm": 0.19782059920889028,
798
+ "learning_rate": 5.7790931516253545e-05,
799
+ "loss": 0.9416,
800
+ "step": 113
801
+ },
802
+ {
803
+ "epoch": 0.64,
804
+ "grad_norm": 0.1836244550876009,
805
+ "learning_rate": 5.7716385975338605e-05,
806
+ "loss": 0.9466,
807
+ "step": 114
808
+ },
809
+ {
810
+ "epoch": 0.65,
811
+ "grad_norm": 0.1613256822257701,
812
+ "learning_rate": 5.764065315669578e-05,
813
+ "loss": 0.9513,
814
+ "step": 115
815
+ },
816
+ {
817
+ "epoch": 0.65,
818
+ "grad_norm": 0.16741350820768655,
819
+ "learning_rate": 5.756373630446695e-05,
820
+ "loss": 0.9418,
821
+ "step": 116
822
+ },
823
+ {
824
+ "epoch": 0.66,
825
+ "grad_norm": 0.18843157688751017,
826
+ "learning_rate": 5.748563871351408e-05,
827
+ "loss": 0.945,
828
+ "step": 117
829
+ },
830
+ {
831
+ "epoch": 0.66,
832
+ "grad_norm": 0.15069085597597218,
833
+ "learning_rate": 5.7406363729278026e-05,
834
+ "loss": 0.9466,
835
+ "step": 118
836
+ },
837
+ {
838
+ "epoch": 0.67,
839
+ "grad_norm": 0.16662142102625724,
840
+ "learning_rate": 5.7325914747635275e-05,
841
+ "loss": 0.9486,
842
+ "step": 119
843
+ },
844
+ {
845
+ "epoch": 0.67,
846
+ "grad_norm": 0.19590155354881233,
847
+ "learning_rate": 5.724429521475244e-05,
848
+ "loss": 0.9435,
849
+ "step": 120
850
+ },
851
+ {
852
+ "epoch": 0.68,
853
+ "grad_norm": 0.15881882311241918,
854
+ "learning_rate": 5.716150862693866e-05,
855
+ "loss": 0.9466,
856
+ "step": 121
857
+ },
858
+ {
859
+ "epoch": 0.69,
860
+ "grad_norm": 0.14283264252478434,
861
+ "learning_rate": 5.707755853049582e-05,
862
+ "loss": 0.9412,
863
+ "step": 122
864
+ },
865
+ {
866
+ "epoch": 0.69,
867
+ "grad_norm": 0.16782472517291772,
868
+ "learning_rate": 5.699244852156665e-05,
869
+ "loss": 0.9382,
870
+ "step": 123
871
+ },
872
+ {
873
+ "epoch": 0.7,
874
+ "grad_norm": 0.16548141172284359,
875
+ "learning_rate": 5.690618224598065e-05,
876
+ "loss": 0.9479,
877
+ "step": 124
878
+ },
879
+ {
880
+ "epoch": 0.7,
881
+ "grad_norm": 0.13809707490800946,
882
+ "learning_rate": 5.681876339909797e-05,
883
+ "loss": 0.9429,
884
+ "step": 125
885
+ },
886
+ {
887
+ "epoch": 0.71,
888
+ "grad_norm": 0.18656335425800988,
889
+ "learning_rate": 5.673019572565103e-05,
890
+ "loss": 0.9381,
891
+ "step": 126
892
+ },
893
+ {
894
+ "epoch": 0.71,
895
+ "grad_norm": 0.1732296780937527,
896
+ "learning_rate": 5.664048301958422e-05,
897
+ "loss": 0.9431,
898
+ "step": 127
899
+ },
900
+ {
901
+ "epoch": 0.72,
902
+ "grad_norm": 0.14221356678314365,
903
+ "learning_rate": 5.654962912389126e-05,
904
+ "loss": 0.9523,
905
+ "step": 128
906
+ },
907
+ {
908
+ "epoch": 0.72,
909
+ "grad_norm": 0.15119656927035072,
910
+ "learning_rate": 5.645763793045065e-05,
911
+ "loss": 0.9392,
912
+ "step": 129
913
+ },
914
+ {
915
+ "epoch": 0.73,
916
+ "grad_norm": 0.14464956279442504,
917
+ "learning_rate": 5.636451337985896e-05,
918
+ "loss": 0.9384,
919
+ "step": 130
920
+ },
921
+ {
922
+ "epoch": 0.74,
923
+ "grad_norm": 0.13140580588464784,
924
+ "learning_rate": 5.627025946126199e-05,
925
+ "loss": 0.9372,
926
+ "step": 131
927
+ },
928
+ {
929
+ "epoch": 0.74,
930
+ "grad_norm": 0.14493562139149813,
931
+ "learning_rate": 5.617488021218392e-05,
932
+ "loss": 0.9358,
933
+ "step": 132
934
+ },
935
+ {
936
+ "epoch": 0.75,
937
+ "grad_norm": 0.14893216335321577,
938
+ "learning_rate": 5.6078379718354315e-05,
939
+ "loss": 0.9419,
940
+ "step": 133
941
+ },
942
+ {
943
+ "epoch": 0.75,
944
+ "grad_norm": 0.14722396099871643,
945
+ "learning_rate": 5.5980762113533166e-05,
946
+ "loss": 0.944,
947
+ "step": 134
948
+ },
949
+ {
950
+ "epoch": 0.76,
951
+ "grad_norm": 0.1484388680331261,
952
+ "learning_rate": 5.588203157933376e-05,
953
+ "loss": 0.946,
954
+ "step": 135
955
+ },
956
+ {
957
+ "epoch": 0.76,
958
+ "grad_norm": 0.14453486799975523,
959
+ "learning_rate": 5.578219234504359e-05,
960
+ "loss": 0.9502,
961
+ "step": 136
962
+ },
963
+ {
964
+ "epoch": 0.77,
965
+ "grad_norm": 0.13849264435281988,
966
+ "learning_rate": 5.568124868744315e-05,
967
+ "loss": 0.9339,
968
+ "step": 137
969
+ },
970
+ {
971
+ "epoch": 0.78,
972
+ "grad_norm": 0.16199659629172095,
973
+ "learning_rate": 5.557920493062277e-05,
974
+ "loss": 0.9238,
975
+ "step": 138
976
+ },
977
+ {
978
+ "epoch": 0.78,
979
+ "grad_norm": 0.16192446153254933,
980
+ "learning_rate": 5.547606544579737e-05,
981
+ "loss": 0.9336,
982
+ "step": 139
983
+ },
984
+ {
985
+ "epoch": 0.79,
986
+ "grad_norm": 0.14648464155352814,
987
+ "learning_rate": 5.5371834651119204e-05,
988
+ "loss": 0.9305,
989
+ "step": 140
990
+ },
991
+ {
992
+ "epoch": 0.79,
993
+ "grad_norm": 0.1383162638141149,
994
+ "learning_rate": 5.5266517011488596e-05,
995
+ "loss": 0.9391,
996
+ "step": 141
997
+ },
998
+ {
999
+ "epoch": 0.8,
1000
+ "grad_norm": 0.1543998232523172,
1001
+ "learning_rate": 5.5160117038362726e-05,
1002
+ "loss": 0.9366,
1003
+ "step": 142
1004
+ },
1005
+ {
1006
+ "epoch": 0.8,
1007
+ "grad_norm": 0.15730672921508043,
1008
+ "learning_rate": 5.5052639289562294e-05,
1009
+ "loss": 0.9346,
1010
+ "step": 143
1011
+ },
1012
+ {
1013
+ "epoch": 0.81,
1014
+ "grad_norm": 0.13879299352222021,
1015
+ "learning_rate": 5.494408836907636e-05,
1016
+ "loss": 0.9364,
1017
+ "step": 144
1018
+ },
1019
+ {
1020
+ "epoch": 0.81,
1021
+ "grad_norm": 0.15487093688296455,
1022
+ "learning_rate": 5.483446892686507e-05,
1023
+ "loss": 0.9246,
1024
+ "step": 145
1025
+ },
1026
+ {
1027
+ "epoch": 0.82,
1028
+ "grad_norm": 0.14401066674945762,
1029
+ "learning_rate": 5.472378565866047e-05,
1030
+ "loss": 0.9361,
1031
+ "step": 146
1032
+ },
1033
+ {
1034
+ "epoch": 0.83,
1035
+ "grad_norm": 0.14277478926628612,
1036
+ "learning_rate": 5.461204330576541e-05,
1037
+ "loss": 0.9389,
1038
+ "step": 147
1039
+ },
1040
+ {
1041
+ "epoch": 0.83,
1042
+ "grad_norm": 0.16131149816932222,
1043
+ "learning_rate": 5.4499246654850374e-05,
1044
+ "loss": 0.9371,
1045
+ "step": 148
1046
+ },
1047
+ {
1048
+ "epoch": 0.84,
1049
+ "grad_norm": 0.17407874387998404,
1050
+ "learning_rate": 5.4385400537748465e-05,
1051
+ "loss": 0.9372,
1052
+ "step": 149
1053
+ },
1054
+ {
1055
+ "epoch": 0.84,
1056
+ "grad_norm": 0.17636478629943686,
1057
+ "learning_rate": 5.427050983124843e-05,
1058
+ "loss": 0.9343,
1059
+ "step": 150
1060
+ },
1061
+ {
1062
+ "epoch": 0.85,
1063
+ "grad_norm": 0.16977563161518314,
1064
+ "learning_rate": 5.4154579456885744e-05,
1065
+ "loss": 0.9281,
1066
+ "step": 151
1067
+ },
1068
+ {
1069
+ "epoch": 0.85,
1070
+ "grad_norm": 0.14318477872489785,
1071
+ "learning_rate": 5.403761438073182e-05,
1072
+ "loss": 0.9365,
1073
+ "step": 152
1074
+ },
1075
+ {
1076
+ "epoch": 0.86,
1077
+ "grad_norm": 0.1397942734256781,
1078
+ "learning_rate": 5.3919619613181215e-05,
1079
+ "loss": 0.9469,
1080
+ "step": 153
1081
+ },
1082
+ {
1083
+ "epoch": 0.87,
1084
+ "grad_norm": 0.137963649770748,
1085
+ "learning_rate": 5.3800600208737054e-05,
1086
+ "loss": 0.9359,
1087
+ "step": 154
1088
+ },
1089
+ {
1090
+ "epoch": 0.87,
1091
+ "grad_norm": 0.15171364959388206,
1092
+ "learning_rate": 5.3680561265794496e-05,
1093
+ "loss": 0.9269,
1094
+ "step": 155
1095
+ },
1096
+ {
1097
+ "epoch": 0.88,
1098
+ "grad_norm": 0.14709787438935637,
1099
+ "learning_rate": 5.3559507926422344e-05,
1100
+ "loss": 0.9383,
1101
+ "step": 156
1102
+ },
1103
+ {
1104
+ "epoch": 0.88,
1105
+ "grad_norm": 0.12890500807018168,
1106
+ "learning_rate": 5.343744537614276e-05,
1107
+ "loss": 0.924,
1108
+ "step": 157
1109
+ },
1110
+ {
1111
+ "epoch": 0.89,
1112
+ "grad_norm": 0.11549683107184264,
1113
+ "learning_rate": 5.331437884370913e-05,
1114
+ "loss": 0.9283,
1115
+ "step": 158
1116
+ },
1117
+ {
1118
+ "epoch": 0.89,
1119
+ "grad_norm": 0.15022761689747757,
1120
+ "learning_rate": 5.319031360088211e-05,
1121
+ "loss": 0.9307,
1122
+ "step": 159
1123
+ },
1124
+ {
1125
+ "epoch": 0.9,
1126
+ "grad_norm": 0.15714839886122223,
1127
+ "learning_rate": 5.306525496220379e-05,
1128
+ "loss": 0.935,
1129
+ "step": 160
1130
+ },
1131
+ {
1132
+ "epoch": 0.9,
1133
+ "grad_norm": 0.16859132275494298,
1134
+ "learning_rate": 5.293920828477001e-05,
1135
+ "loss": 0.9239,
1136
+ "step": 161
1137
+ },
1138
+ {
1139
+ "epoch": 0.91,
1140
+ "grad_norm": 0.13633581062447336,
1141
+ "learning_rate": 5.281217896800093e-05,
1142
+ "loss": 0.9414,
1143
+ "step": 162
1144
+ },
1145
+ {
1146
+ "epoch": 0.92,
1147
+ "grad_norm": 0.1513556776118746,
1148
+ "learning_rate": 5.268417245340968e-05,
1149
+ "loss": 0.9338,
1150
+ "step": 163
1151
+ },
1152
+ {
1153
+ "epoch": 0.92,
1154
+ "grad_norm": 0.1757338104436492,
1155
+ "learning_rate": 5.255519422436932e-05,
1156
+ "loss": 0.9351,
1157
+ "step": 164
1158
+ },
1159
+ {
1160
+ "epoch": 0.93,
1161
+ "grad_norm": 0.19112425290854476,
1162
+ "learning_rate": 5.242524980587791e-05,
1163
+ "loss": 0.9333,
1164
+ "step": 165
1165
+ },
1166
+ {
1167
+ "epoch": 0.93,
1168
+ "grad_norm": 0.17979027309488244,
1169
+ "learning_rate": 5.2294344764321825e-05,
1170
+ "loss": 0.9179,
1171
+ "step": 166
1172
+ },
1173
+ {
1174
+ "epoch": 0.94,
1175
+ "grad_norm": 0.16529910589891425,
1176
+ "learning_rate": 5.2162484707237387e-05,
1177
+ "loss": 0.9356,
1178
+ "step": 167
1179
+ },
1180
+ {
1181
+ "epoch": 0.94,
1182
+ "grad_norm": 0.20692577816446356,
1183
+ "learning_rate": 5.202967528307057e-05,
1184
+ "loss": 0.9276,
1185
+ "step": 168
1186
+ },
1187
+ {
1188
+ "epoch": 0.95,
1189
+ "grad_norm": 0.197154695709334,
1190
+ "learning_rate": 5.1895922180935066e-05,
1191
+ "loss": 0.9303,
1192
+ "step": 169
1193
+ },
1194
+ {
1195
+ "epoch": 0.96,
1196
+ "grad_norm": 0.14150974456692325,
1197
+ "learning_rate": 5.176123113036863e-05,
1198
+ "loss": 0.9364,
1199
+ "step": 170
1200
+ },
1201
+ {
1202
+ "epoch": 0.96,
1203
+ "grad_norm": 0.15101923044037124,
1204
+ "learning_rate": 5.162560790108756e-05,
1205
+ "loss": 0.9219,
1206
+ "step": 171
1207
+ },
1208
+ {
1209
+ "epoch": 0.97,
1210
+ "grad_norm": 0.19585100899813354,
1211
+ "learning_rate": 5.148905830273964e-05,
1212
+ "loss": 0.9282,
1213
+ "step": 172
1214
+ },
1215
+ {
1216
+ "epoch": 0.97,
1217
+ "grad_norm": 0.16466476745138203,
1218
+ "learning_rate": 5.135158818465514e-05,
1219
+ "loss": 0.9267,
1220
+ "step": 173
1221
+ },
1222
+ {
1223
+ "epoch": 0.98,
1224
+ "grad_norm": 0.13928826936678446,
1225
+ "learning_rate": 5.1213203435596425e-05,
1226
+ "loss": 0.9204,
1227
+ "step": 174
1228
+ },
1229
+ {
1230
+ "epoch": 0.98,
1231
+ "grad_norm": 0.16777686357665667,
1232
+ "learning_rate": 5.107390998350555e-05,
1233
+ "loss": 0.9209,
1234
+ "step": 175
1235
+ },
1236
+ {
1237
+ "epoch": 0.99,
1238
+ "grad_norm": 0.13941558577566035,
1239
+ "learning_rate": 5.093371379525041e-05,
1240
+ "loss": 0.933,
1241
+ "step": 176
1242
+ },
1243
+ {
1244
+ "epoch": 0.99,
1245
+ "grad_norm": 0.1312376870568112,
1246
+ "learning_rate": 5.079262087636908e-05,
1247
+ "loss": 0.9273,
1248
+ "step": 177
1249
+ },
1250
+ {
1251
+ "epoch": 1.0,
1252
+ "grad_norm": 0.13304976321730042,
1253
+ "learning_rate": 5.0650637270812615e-05,
1254
+ "loss": 0.9325,
1255
+ "step": 178
1256
+ },
1257
+ {
1258
+ "epoch": 1.0,
1259
+ "eval_loss": 0.9060415029525757,
1260
+ "eval_runtime": 367.2491,
1261
+ "eval_samples_per_second": 35.657,
1262
+ "eval_steps_per_second": 0.054,
1263
+ "step": 178
1264
+ },
1265
+ {
1266
+ "epoch": 1.01,
1267
+ "grad_norm": 0.14350856865519188,
1268
+ "learning_rate": 5.0507769060686136e-05,
1269
+ "loss": 0.8991,
1270
+ "step": 179
1271
+ },
1272
+ {
1273
+ "epoch": 1.01,
1274
+ "grad_norm": 0.14242795375317008,
1275
+ "learning_rate": 5.036402236598826e-05,
1276
+ "loss": 0.8819,
1277
+ "step": 180
1278
+ },
1279
+ {
1280
+ "epoch": 1.02,
1281
+ "grad_norm": 0.1481380625280409,
1282
+ "learning_rate": 5.021940334434894e-05,
1283
+ "loss": 0.9013,
1284
+ "step": 181
1285
+ },
1286
+ {
1287
+ "epoch": 1.02,
1288
+ "grad_norm": 0.14135102829732513,
1289
+ "learning_rate": 5.007391819076575e-05,
1290
+ "loss": 0.8876,
1291
+ "step": 182
1292
+ },
1293
+ {
1294
+ "epoch": 1.03,
1295
+ "grad_norm": 0.14410031273084303,
1296
+ "learning_rate": 4.9927573137338456e-05,
1297
+ "loss": 0.8962,
1298
+ "step": 183
1299
+ },
1300
+ {
1301
+ "epoch": 1.03,
1302
+ "grad_norm": 0.1356425817573695,
1303
+ "learning_rate": 4.978037445300207e-05,
1304
+ "loss": 0.8984,
1305
+ "step": 184
1306
+ },
1307
+ {
1308
+ "epoch": 1.04,
1309
+ "grad_norm": 0.15138222667915374,
1310
+ "learning_rate": 4.963232844325832e-05,
1311
+ "loss": 0.8934,
1312
+ "step": 185
1313
+ },
1314
+ {
1315
+ "epoch": 1.04,
1316
+ "grad_norm": 0.1476182931802454,
1317
+ "learning_rate": 4.948344144990551e-05,
1318
+ "loss": 0.8875,
1319
+ "step": 186
1320
+ },
1321
+ {
1322
+ "epoch": 1.05,
1323
+ "grad_norm": 0.1542666076621437,
1324
+ "learning_rate": 4.933371985076692e-05,
1325
+ "loss": 0.8916,
1326
+ "step": 187
1327
+ },
1328
+ {
1329
+ "epoch": 1.06,
1330
+ "grad_norm": 0.1522457621047058,
1331
+ "learning_rate": 4.9183170059417543e-05,
1332
+ "loss": 0.8924,
1333
+ "step": 188
1334
+ },
1335
+ {
1336
+ "epoch": 1.06,
1337
+ "grad_norm": 0.1401929471695145,
1338
+ "learning_rate": 4.903179852490937e-05,
1339
+ "loss": 0.8961,
1340
+ "step": 189
1341
+ },
1342
+ {
1343
+ "epoch": 1.07,
1344
+ "grad_norm": 0.1397403483022461,
1345
+ "learning_rate": 4.887961173149513e-05,
1346
+ "loss": 0.8841,
1347
+ "step": 190
1348
+ },
1349
+ {
1350
+ "epoch": 1.07,
1351
+ "grad_norm": 0.16199935662086337,
1352
+ "learning_rate": 4.872661619835054e-05,
1353
+ "loss": 0.8934,
1354
+ "step": 191
1355
+ },
1356
+ {
1357
+ "epoch": 1.08,
1358
+ "grad_norm": 0.17579997757407093,
1359
+ "learning_rate": 4.857281847929503e-05,
1360
+ "loss": 0.8912,
1361
+ "step": 192
1362
+ },
1363
+ {
1364
+ "epoch": 1.08,
1365
+ "grad_norm": 0.12918417902945037,
1366
+ "learning_rate": 4.8418225162510994e-05,
1367
+ "loss": 0.8955,
1368
+ "step": 193
1369
+ },
1370
+ {
1371
+ "epoch": 1.09,
1372
+ "grad_norm": 0.1357691544652963,
1373
+ "learning_rate": 4.826284287026162e-05,
1374
+ "loss": 0.8876,
1375
+ "step": 194
1376
+ },
1377
+ {
1378
+ "epoch": 1.1,
1379
+ "grad_norm": 0.13562412043146593,
1380
+ "learning_rate": 4.8106678258607146e-05,
1381
+ "loss": 0.8925,
1382
+ "step": 195
1383
+ },
1384
+ {
1385
+ "epoch": 1.1,
1386
+ "grad_norm": 0.14459127567703792,
1387
+ "learning_rate": 4.794973801711977e-05,
1388
+ "loss": 0.9046,
1389
+ "step": 196
1390
+ },
1391
+ {
1392
+ "epoch": 1.11,
1393
+ "grad_norm": 0.13612859027230298,
1394
+ "learning_rate": 4.7792028868597114e-05,
1395
+ "loss": 0.9008,
1396
+ "step": 197
1397
+ },
1398
+ {
1399
+ "epoch": 1.11,
1400
+ "grad_norm": 0.13139758263957305,
1401
+ "learning_rate": 4.7633557568774194e-05,
1402
+ "loss": 0.8883,
1403
+ "step": 198
1404
+ },
1405
+ {
1406
+ "epoch": 1.12,
1407
+ "grad_norm": 0.1332017950455885,
1408
+ "learning_rate": 4.7474330906034067e-05,
1409
+ "loss": 0.8872,
1410
+ "step": 199
1411
+ },
1412
+ {
1413
+ "epoch": 1.12,
1414
+ "grad_norm": 0.14003547669435232,
1415
+ "learning_rate": 4.731435570111701e-05,
1416
+ "loss": 0.8852,
1417
+ "step": 200
1418
+ },
1419
+ {
1420
+ "epoch": 1.13,
1421
+ "grad_norm": 0.12548622314931585,
1422
+ "learning_rate": 4.7153638806828365e-05,
1423
+ "loss": 0.8918,
1424
+ "step": 201
1425
+ },
1426
+ {
1427
+ "epoch": 1.13,
1428
+ "grad_norm": 0.1316324333822756,
1429
+ "learning_rate": 4.699218710774499e-05,
1430
+ "loss": 0.8911,
1431
+ "step": 202
1432
+ },
1433
+ {
1434
+ "epoch": 1.14,
1435
+ "grad_norm": 0.14152097644548858,
1436
+ "learning_rate": 4.68300075199203e-05,
1437
+ "loss": 0.8989,
1438
+ "step": 203
1439
+ },
1440
+ {
1441
+ "epoch": 1.15,
1442
+ "grad_norm": 0.1270200473456502,
1443
+ "learning_rate": 4.6667106990588066e-05,
1444
+ "loss": 0.8855,
1445
+ "step": 204
1446
+ },
1447
+ {
1448
+ "epoch": 1.15,
1449
+ "grad_norm": 0.1498866663693573,
1450
+ "learning_rate": 4.650349249786481e-05,
1451
+ "loss": 0.8915,
1452
+ "step": 205
1453
+ },
1454
+ {
1455
+ "epoch": 1.16,
1456
+ "grad_norm": 0.13575618100316017,
1457
+ "learning_rate": 4.633917105045082e-05,
1458
+ "loss": 0.8868,
1459
+ "step": 206
1460
+ },
1461
+ {
1462
+ "epoch": 1.16,
1463
+ "grad_norm": 0.12252093912010241,
1464
+ "learning_rate": 4.617414968733002e-05,
1465
+ "loss": 0.8993,
1466
+ "step": 207
1467
+ },
1468
+ {
1469
+ "epoch": 1.17,
1470
+ "grad_norm": 0.17129363052080313,
1471
+ "learning_rate": 4.6008435477468346e-05,
1472
+ "loss": 0.8979,
1473
+ "step": 208
1474
+ },
1475
+ {
1476
+ "epoch": 1.17,
1477
+ "grad_norm": 0.13086986947624585,
1478
+ "learning_rate": 4.584203551951104e-05,
1479
+ "loss": 0.9042,
1480
+ "step": 209
1481
+ },
1482
+ {
1483
+ "epoch": 1.18,
1484
+ "grad_norm": 0.1289023381093015,
1485
+ "learning_rate": 4.567495694147847e-05,
1486
+ "loss": 0.9079,
1487
+ "step": 210
1488
+ },
1489
+ {
1490
+ "epoch": 1.19,
1491
+ "grad_norm": 0.12646977417389443,
1492
+ "learning_rate": 4.5507206900460824e-05,
1493
+ "loss": 0.8805,
1494
+ "step": 211
1495
+ },
1496
+ {
1497
+ "epoch": 1.19,
1498
+ "grad_norm": 0.135408603034375,
1499
+ "learning_rate": 4.533879258231156e-05,
1500
+ "loss": 0.9036,
1501
+ "step": 212
1502
+ },
1503
+ {
1504
+ "epoch": 1.2,
1505
+ "grad_norm": 0.13151933660920634,
1506
+ "learning_rate": 4.516972120133954e-05,
1507
+ "loss": 0.8963,
1508
+ "step": 213
1509
+ },
1510
+ {
1511
+ "epoch": 1.2,
1512
+ "grad_norm": 0.14642201202966043,
1513
+ "learning_rate": 4.5e-05,
1514
+ "loss": 0.8884,
1515
+ "step": 214
1516
+ },
1517
+ {
1518
+ "epoch": 1.21,
1519
+ "grad_norm": 0.13215103394112293,
1520
+ "learning_rate": 4.4829636248584336e-05,
1521
+ "loss": 0.8954,
1522
+ "step": 215
1523
+ },
1524
+ {
1525
+ "epoch": 1.21,
1526
+ "grad_norm": 0.14081499259661354,
1527
+ "learning_rate": 4.4658637244908654e-05,
1528
+ "loss": 0.9078,
1529
+ "step": 216
1530
+ },
1531
+ {
1532
+ "epoch": 1.22,
1533
+ "grad_norm": 0.13325013129319507,
1534
+ "learning_rate": 4.448701031400112e-05,
1535
+ "loss": 0.889,
1536
+ "step": 217
1537
+ },
1538
+ {
1539
+ "epoch": 1.22,
1540
+ "grad_norm": 0.14567267671389802,
1541
+ "learning_rate": 4.431476280778825e-05,
1542
+ "loss": 0.8896,
1543
+ "step": 218
1544
+ },
1545
+ {
1546
+ "epoch": 1.23,
1547
+ "grad_norm": 0.1530402998595256,
1548
+ "learning_rate": 4.414190210477994e-05,
1549
+ "loss": 0.8863,
1550
+ "step": 219
1551
+ },
1552
+ {
1553
+ "epoch": 1.24,
1554
+ "grad_norm": 0.11883142231327087,
1555
+ "learning_rate": 4.396843560975334e-05,
1556
+ "loss": 0.8956,
1557
+ "step": 220
1558
+ },
1559
+ {
1560
+ "epoch": 1.24,
1561
+ "grad_norm": 0.1441333083283775,
1562
+ "learning_rate": 4.37943707534358e-05,
1563
+ "loss": 0.8886,
1564
+ "step": 221
1565
+ },
1566
+ {
1567
+ "epoch": 1.25,
1568
+ "grad_norm": 0.14323097711463215,
1569
+ "learning_rate": 4.3619714992186405e-05,
1570
+ "loss": 0.8934,
1571
+ "step": 222
1572
+ },
1573
+ {
1574
+ "epoch": 1.25,
1575
+ "grad_norm": 0.14847369219388526,
1576
+ "learning_rate": 4.344447580767668e-05,
1577
+ "loss": 0.8993,
1578
+ "step": 223
1579
+ },
1580
+ {
1581
+ "epoch": 1.26,
1582
+ "grad_norm": 0.12999113661604003,
1583
+ "learning_rate": 4.326866070657004e-05,
1584
+ "loss": 0.8837,
1585
+ "step": 224
1586
+ },
1587
+ {
1588
+ "epoch": 1.26,
1589
+ "grad_norm": 0.1590097564167615,
1590
+ "learning_rate": 4.309227722020026e-05,
1591
+ "loss": 0.893,
1592
+ "step": 225
1593
+ },
1594
+ {
1595
+ "epoch": 1.27,
1596
+ "grad_norm": 0.13787186379941546,
1597
+ "learning_rate": 4.291533290424886e-05,
1598
+ "loss": 0.8873,
1599
+ "step": 226
1600
+ },
1601
+ {
1602
+ "epoch": 1.28,
1603
+ "grad_norm": 0.13561372365659452,
1604
+ "learning_rate": 4.27378353384214e-05,
1605
+ "loss": 0.8909,
1606
+ "step": 227
1607
+ },
1608
+ {
1609
+ "epoch": 1.28,
1610
+ "grad_norm": 0.12883018074018207,
1611
+ "learning_rate": 4.2559792126122843e-05,
1612
+ "loss": 0.8859,
1613
+ "step": 228
1614
+ },
1615
+ {
1616
+ "epoch": 1.29,
1617
+ "grad_norm": 0.13365029333191705,
1618
+ "learning_rate": 4.238121089413184e-05,
1619
+ "loss": 0.9062,
1620
+ "step": 229
1621
+ },
1622
+ {
1623
+ "epoch": 1.29,
1624
+ "grad_norm": 0.14359777080953676,
1625
+ "learning_rate": 4.2202099292274015e-05,
1626
+ "loss": 0.8991,
1627
+ "step": 230
1628
+ },
1629
+ {
1630
+ "epoch": 1.3,
1631
+ "grad_norm": 0.127263628452024,
1632
+ "learning_rate": 4.2022464993094226e-05,
1633
+ "loss": 0.8945,
1634
+ "step": 231
1635
+ },
1636
+ {
1637
+ "epoch": 1.3,
1638
+ "grad_norm": 0.15508442928638333,
1639
+ "learning_rate": 4.184231569152802e-05,
1640
+ "loss": 0.8971,
1641
+ "step": 232
1642
+ },
1643
+ {
1644
+ "epoch": 1.31,
1645
+ "grad_norm": 0.1498552324218163,
1646
+ "learning_rate": 4.166165910457187e-05,
1647
+ "loss": 0.891,
1648
+ "step": 233
1649
+ },
1650
+ {
1651
+ "epoch": 1.31,
1652
+ "grad_norm": 0.13461134175675027,
1653
+ "learning_rate": 4.14805029709527e-05,
1654
+ "loss": 0.8877,
1655
+ "step": 234
1656
+ },
1657
+ {
1658
+ "epoch": 1.32,
1659
+ "grad_norm": 0.14946029731016597,
1660
+ "learning_rate": 4.1298855050796324e-05,
1661
+ "loss": 0.8903,
1662
+ "step": 235
1663
+ },
1664
+ {
1665
+ "epoch": 1.33,
1666
+ "grad_norm": 0.13552355920453768,
1667
+ "learning_rate": 4.1116723125295094e-05,
1668
+ "loss": 0.8973,
1669
+ "step": 236
1670
+ },
1671
+ {
1672
+ "epoch": 1.33,
1673
+ "grad_norm": 0.14705105872966423,
1674
+ "learning_rate": 4.09341149963745e-05,
1675
+ "loss": 0.8982,
1676
+ "step": 237
1677
+ },
1678
+ {
1679
+ "epoch": 1.34,
1680
+ "grad_norm": 0.16411157831669457,
1681
+ "learning_rate": 4.0751038486359e-05,
1682
+ "loss": 0.8982,
1683
+ "step": 238
1684
+ },
1685
+ {
1686
+ "epoch": 1.34,
1687
+ "grad_norm": 0.13269150518487338,
1688
+ "learning_rate": 4.056750143763701e-05,
1689
+ "loss": 0.8895,
1690
+ "step": 239
1691
+ },
1692
+ {
1693
+ "epoch": 1.35,
1694
+ "grad_norm": 0.14962049546760073,
1695
+ "learning_rate": 4.038351171232479e-05,
1696
+ "loss": 0.8854,
1697
+ "step": 240
1698
+ },
1699
+ {
1700
+ "epoch": 1.35,
1701
+ "grad_norm": 0.16086784383385563,
1702
+ "learning_rate": 4.019907719192982e-05,
1703
+ "loss": 0.8964,
1704
+ "step": 241
1705
+ },
1706
+ {
1707
+ "epoch": 1.36,
1708
+ "grad_norm": 0.14444863678847175,
1709
+ "learning_rate": 4.0014205777013125e-05,
1710
+ "loss": 0.8847,
1711
+ "step": 242
1712
+ },
1713
+ {
1714
+ "epoch": 1.37,
1715
+ "grad_norm": 0.12452999678280204,
1716
+ "learning_rate": 3.982890538685081e-05,
1717
+ "loss": 0.8896,
1718
+ "step": 243
1719
+ },
1720
+ {
1721
+ "epoch": 1.37,
1722
+ "grad_norm": 0.15069697551107591,
1723
+ "learning_rate": 3.964318395909485e-05,
1724
+ "loss": 0.8986,
1725
+ "step": 244
1726
+ },
1727
+ {
1728
+ "epoch": 1.38,
1729
+ "grad_norm": 0.12577792786818992,
1730
+ "learning_rate": 3.945704944943309e-05,
1731
+ "loss": 0.884,
1732
+ "step": 245
1733
+ },
1734
+ {
1735
+ "epoch": 1.38,
1736
+ "grad_norm": 0.12286285953881854,
1737
+ "learning_rate": 3.927050983124842e-05,
1738
+ "loss": 0.8818,
1739
+ "step": 246
1740
+ },
1741
+ {
1742
+ "epoch": 1.39,
1743
+ "grad_norm": 0.13929979955363045,
1744
+ "learning_rate": 3.908357309527724e-05,
1745
+ "loss": 0.8866,
1746
+ "step": 247
1747
+ },
1748
+ {
1749
+ "epoch": 1.39,
1750
+ "grad_norm": 0.12285830808074812,
1751
+ "learning_rate": 3.889624724926713e-05,
1752
+ "loss": 0.8812,
1753
+ "step": 248
1754
+ },
1755
+ {
1756
+ "epoch": 1.4,
1757
+ "grad_norm": 0.12424668641448619,
1758
+ "learning_rate": 3.870854031763387e-05,
1759
+ "loss": 0.8928,
1760
+ "step": 249
1761
+ },
1762
+ {
1763
+ "epoch": 1.4,
1764
+ "grad_norm": 0.12954214774856612,
1765
+ "learning_rate": 3.852046034111769e-05,
1766
+ "loss": 0.8853,
1767
+ "step": 250
1768
+ },
1769
+ {
1770
+ "epoch": 1.41,
1771
+ "grad_norm": 0.1266013073117186,
1772
+ "learning_rate": 3.8332015376438775e-05,
1773
+ "loss": 0.8844,
1774
+ "step": 251
1775
+ },
1776
+ {
1777
+ "epoch": 1.42,
1778
+ "grad_norm": 0.12067596509699888,
1779
+ "learning_rate": 3.8143213495952224e-05,
1780
+ "loss": 0.8916,
1781
+ "step": 252
1782
+ },
1783
+ {
1784
+ "epoch": 1.42,
1785
+ "grad_norm": 0.11623622829833351,
1786
+ "learning_rate": 3.795406278730224e-05,
1787
+ "loss": 0.8859,
1788
+ "step": 253
1789
+ },
1790
+ {
1791
+ "epoch": 1.43,
1792
+ "grad_norm": 0.11551209984961364,
1793
+ "learning_rate": 3.776457135307562e-05,
1794
+ "loss": 0.8868,
1795
+ "step": 254
1796
+ },
1797
+ {
1798
+ "epoch": 1.43,
1799
+ "grad_norm": 0.11536856636572211,
1800
+ "learning_rate": 3.757474731045474e-05,
1801
+ "loss": 0.8828,
1802
+ "step": 255
1803
+ },
1804
+ {
1805
+ "epoch": 1.44,
1806
+ "grad_norm": 0.12375743852295125,
1807
+ "learning_rate": 3.738459879086979e-05,
1808
+ "loss": 0.8902,
1809
+ "step": 256
1810
+ },
1811
+ {
1812
+ "epoch": 1.44,
1813
+ "grad_norm": 0.12571384064192775,
1814
+ "learning_rate": 3.71941339396505e-05,
1815
+ "loss": 0.8885,
1816
+ "step": 257
1817
+ },
1818
+ {
1819
+ "epoch": 1.45,
1820
+ "grad_norm": 0.13081609687043957,
1821
+ "learning_rate": 3.7003360915677164e-05,
1822
+ "loss": 0.8954,
1823
+ "step": 258
1824
+ },
1825
+ {
1826
+ "epoch": 1.46,
1827
+ "grad_norm": 0.12420083605022122,
1828
+ "learning_rate": 3.68122878910312e-05,
1829
+ "loss": 0.8914,
1830
+ "step": 259
1831
+ },
1832
+ {
1833
+ "epoch": 1.46,
1834
+ "grad_norm": 0.13533339586556395,
1835
+ "learning_rate": 3.6620923050645045e-05,
1836
+ "loss": 0.8901,
1837
+ "step": 260
1838
+ },
1839
+ {
1840
+ "epoch": 1.47,
1841
+ "grad_norm": 0.11714010649732222,
1842
+ "learning_rate": 3.6429274591951526e-05,
1843
+ "loss": 0.899,
1844
+ "step": 261
1845
+ },
1846
+ {
1847
+ "epoch": 1.47,
1848
+ "grad_norm": 0.13060719113804842,
1849
+ "learning_rate": 3.6237350724532775e-05,
1850
+ "loss": 0.8987,
1851
+ "step": 262
1852
+ },
1853
+ {
1854
+ "epoch": 1.48,
1855
+ "grad_norm": 0.11664362421142428,
1856
+ "learning_rate": 3.6045159669768514e-05,
1857
+ "loss": 0.8835,
1858
+ "step": 263
1859
+ },
1860
+ {
1861
+ "epoch": 1.48,
1862
+ "grad_norm": 0.12473979951909206,
1863
+ "learning_rate": 3.5852709660483855e-05,
1864
+ "loss": 0.8824,
1865
+ "step": 264
1866
+ },
1867
+ {
1868
+ "epoch": 1.49,
1869
+ "grad_norm": 0.12297968050330141,
1870
+ "learning_rate": 3.566000894059666e-05,
1871
+ "loss": 0.8863,
1872
+ "step": 265
1873
+ },
1874
+ {
1875
+ "epoch": 1.49,
1876
+ "grad_norm": 0.13406322104382662,
1877
+ "learning_rate": 3.5467065764764434e-05,
1878
+ "loss": 0.8884,
1879
+ "step": 266
1880
+ },
1881
+ {
1882
+ "epoch": 1.5,
1883
+ "grad_norm": 0.12098972585519244,
1884
+ "learning_rate": 3.527388839803064e-05,
1885
+ "loss": 0.8857,
1886
+ "step": 267
1887
+ },
1888
+ {
1889
+ "epoch": 1.51,
1890
+ "grad_norm": 0.12310957119462022,
1891
+ "learning_rate": 3.508048511547073e-05,
1892
+ "loss": 0.8835,
1893
+ "step": 268
1894
+ },
1895
+ {
1896
+ "epoch": 1.51,
1897
+ "grad_norm": 0.11925523073706316,
1898
+ "learning_rate": 3.4886864201837666e-05,
1899
+ "loss": 0.8814,
1900
+ "step": 269
1901
+ },
1902
+ {
1903
+ "epoch": 1.52,
1904
+ "grad_norm": 0.13519867806471875,
1905
+ "learning_rate": 3.469303395120693e-05,
1906
+ "loss": 0.8783,
1907
+ "step": 270
1908
+ },
1909
+ {
1910
+ "epoch": 1.52,
1911
+ "grad_norm": 0.13530142559876104,
1912
+ "learning_rate": 3.449900266662135e-05,
1913
+ "loss": 0.8914,
1914
+ "step": 271
1915
+ },
1916
+ {
1917
+ "epoch": 1.53,
1918
+ "grad_norm": 0.14035599743211918,
1919
+ "learning_rate": 3.430477865973538e-05,
1920
+ "loss": 0.8884,
1921
+ "step": 272
1922
+ },
1923
+ {
1924
+ "epoch": 1.53,
1925
+ "grad_norm": 0.1405423762523139,
1926
+ "learning_rate": 3.4110370250459046e-05,
1927
+ "loss": 0.8803,
1928
+ "step": 273
1929
+ },
1930
+ {
1931
+ "epoch": 1.54,
1932
+ "grad_norm": 0.13704705253451768,
1933
+ "learning_rate": 3.3915785766601555e-05,
1934
+ "loss": 0.8773,
1935
+ "step": 274
1936
+ },
1937
+ {
1938
+ "epoch": 1.54,
1939
+ "grad_norm": 0.13457615586156477,
1940
+ "learning_rate": 3.372103354351456e-05,
1941
+ "loss": 0.8749,
1942
+ "step": 275
1943
+ },
1944
+ {
1945
+ "epoch": 1.55,
1946
+ "grad_norm": 0.11095858575701167,
1947
+ "learning_rate": 3.3526121923735136e-05,
1948
+ "loss": 0.8845,
1949
+ "step": 276
1950
+ },
1951
+ {
1952
+ "epoch": 1.56,
1953
+ "grad_norm": 0.13597531439993377,
1954
+ "learning_rate": 3.333105925662833e-05,
1955
+ "loss": 0.8928,
1956
+ "step": 277
1957
+ },
1958
+ {
1959
+ "epoch": 1.56,
1960
+ "grad_norm": 0.10509488873902952,
1961
+ "learning_rate": 3.313585389802961e-05,
1962
+ "loss": 0.8949,
1963
+ "step": 278
1964
+ },
1965
+ {
1966
+ "epoch": 1.57,
1967
+ "grad_norm": 0.13788943155574296,
1968
+ "learning_rate": 3.294051420988683e-05,
1969
+ "loss": 0.8848,
1970
+ "step": 279
1971
+ },
1972
+ {
1973
+ "epoch": 1.57,
1974
+ "grad_norm": 0.1128347052438993,
1975
+ "learning_rate": 3.274504855990208e-05,
1976
+ "loss": 0.89,
1977
+ "step": 280
1978
+ },
1979
+ {
1980
+ "epoch": 1.58,
1981
+ "grad_norm": 0.129080918181206,
1982
+ "learning_rate": 3.254946532117325e-05,
1983
+ "loss": 0.889,
1984
+ "step": 281
1985
+ },
1986
+ {
1987
+ "epoch": 1.58,
1988
+ "grad_norm": 0.13286996791565725,
1989
+ "learning_rate": 3.235377287183535e-05,
1990
+ "loss": 0.8852,
1991
+ "step": 282
1992
+ },
1993
+ {
1994
+ "epoch": 1.59,
1995
+ "grad_norm": 0.13354457708434664,
1996
+ "learning_rate": 3.2157979594701584e-05,
1997
+ "loss": 0.8788,
1998
+ "step": 283
1999
+ },
2000
+ {
2001
+ "epoch": 1.6,
2002
+ "grad_norm": 0.1336273953738305,
2003
+ "learning_rate": 3.1962093876904294e-05,
2004
+ "loss": 0.8878,
2005
+ "step": 284
2006
+ },
2007
+ {
2008
+ "epoch": 1.6,
2009
+ "grad_norm": 0.13151737304049846,
2010
+ "learning_rate": 3.176612410953567e-05,
2011
+ "loss": 0.8844,
2012
+ "step": 285
2013
+ },
2014
+ {
2015
+ "epoch": 1.61,
2016
+ "grad_norm": 0.1313340614662638,
2017
+ "learning_rate": 3.157007868728832e-05,
2018
+ "loss": 0.8882,
2019
+ "step": 286
2020
+ },
2021
+ {
2022
+ "epoch": 1.61,
2023
+ "grad_norm": 0.12675708891920084,
2024
+ "learning_rate": 3.1373966008095624e-05,
2025
+ "loss": 0.876,
2026
+ "step": 287
2027
+ },
2028
+ {
2029
+ "epoch": 1.62,
2030
+ "grad_norm": 0.13203447973990534,
2031
+ "learning_rate": 3.117779447277206e-05,
2032
+ "loss": 0.8866,
2033
+ "step": 288
2034
+ },
2035
+ {
2036
+ "epoch": 1.62,
2037
+ "grad_norm": 0.11627365297753998,
2038
+ "learning_rate": 3.098157248465329e-05,
2039
+ "loss": 0.8797,
2040
+ "step": 289
2041
+ },
2042
+ {
2043
+ "epoch": 1.63,
2044
+ "grad_norm": 0.12182727484026588,
2045
+ "learning_rate": 3.07853084492362e-05,
2046
+ "loss": 0.8867,
2047
+ "step": 290
2048
+ },
2049
+ {
2050
+ "epoch": 1.63,
2051
+ "grad_norm": 0.11050534350252515,
2052
+ "learning_rate": 3.0589010773818843e-05,
2053
+ "loss": 0.8714,
2054
+ "step": 291
2055
+ },
2056
+ {
2057
+ "epoch": 1.64,
2058
+ "grad_norm": 0.11908194959446168,
2059
+ "learning_rate": 3.0392687867140333e-05,
2060
+ "loss": 0.8805,
2061
+ "step": 292
2062
+ },
2063
+ {
2064
+ "epoch": 1.65,
2065
+ "grad_norm": 0.13824206533219605,
2066
+ "learning_rate": 3.019634813902056e-05,
2067
+ "loss": 0.8919,
2068
+ "step": 293
2069
+ },
2070
+ {
2071
+ "epoch": 1.65,
2072
+ "grad_norm": 0.10624499343917188,
2073
+ "learning_rate": 3e-05,
2074
+ "loss": 0.8872,
2075
+ "step": 294
2076
+ },
2077
+ {
2078
+ "epoch": 1.66,
2079
+ "grad_norm": 0.13416938377497178,
2080
+ "learning_rate": 2.9803651860979446e-05,
2081
+ "loss": 0.8882,
2082
+ "step": 295
2083
+ },
2084
+ {
2085
+ "epoch": 1.66,
2086
+ "grad_norm": 0.10669414306939697,
2087
+ "learning_rate": 2.9607312132859672e-05,
2088
+ "loss": 0.8886,
2089
+ "step": 296
2090
+ },
2091
+ {
2092
+ "epoch": 1.67,
2093
+ "grad_norm": 0.13304203015528449,
2094
+ "learning_rate": 2.9410989226181155e-05,
2095
+ "loss": 0.8858,
2096
+ "step": 297
2097
+ },
2098
+ {
2099
+ "epoch": 1.67,
2100
+ "grad_norm": 0.10943791939152962,
2101
+ "learning_rate": 2.9214691550763813e-05,
2102
+ "loss": 0.8862,
2103
+ "step": 298
2104
+ },
2105
+ {
2106
+ "epoch": 1.68,
2107
+ "grad_norm": 0.12414652355477009,
2108
+ "learning_rate": 2.901842751534672e-05,
2109
+ "loss": 0.8771,
2110
+ "step": 299
2111
+ },
2112
+ {
2113
+ "epoch": 1.69,
2114
+ "grad_norm": 0.12513686726484888,
2115
+ "learning_rate": 2.882220552722795e-05,
2116
+ "loss": 0.8884,
2117
+ "step": 300
2118
+ },
2119
+ {
2120
+ "epoch": 1.69,
2121
+ "grad_norm": 0.12991814913620842,
2122
+ "learning_rate": 2.8626033991904384e-05,
2123
+ "loss": 0.8912,
2124
+ "step": 301
2125
+ },
2126
+ {
2127
+ "epoch": 1.7,
2128
+ "grad_norm": 0.12469358939363877,
2129
+ "learning_rate": 2.8429921312711687e-05,
2130
+ "loss": 0.8755,
2131
+ "step": 302
2132
+ },
2133
+ {
2134
+ "epoch": 1.7,
2135
+ "grad_norm": 0.12431368690357643,
2136
+ "learning_rate": 2.8233875890464327e-05,
2137
+ "loss": 0.8758,
2138
+ "step": 303
2139
+ },
2140
+ {
2141
+ "epoch": 1.71,
2142
+ "grad_norm": 0.13987522452787626,
2143
+ "learning_rate": 2.8037906123095708e-05,
2144
+ "loss": 0.8869,
2145
+ "step": 304
2146
+ },
2147
+ {
2148
+ "epoch": 1.71,
2149
+ "grad_norm": 0.1238266615524893,
2150
+ "learning_rate": 2.7842020405298415e-05,
2151
+ "loss": 0.8783,
2152
+ "step": 305
2153
+ },
2154
+ {
2155
+ "epoch": 1.72,
2156
+ "grad_norm": 0.13322189651231403,
2157
+ "learning_rate": 2.7646227128164657e-05,
2158
+ "loss": 0.885,
2159
+ "step": 306
2160
+ },
2161
+ {
2162
+ "epoch": 1.72,
2163
+ "grad_norm": 0.11424790429145636,
2164
+ "learning_rate": 2.7450534678826753e-05,
2165
+ "loss": 0.8763,
2166
+ "step": 307
2167
+ },
2168
+ {
2169
+ "epoch": 1.73,
2170
+ "grad_norm": 0.13119287845211466,
2171
+ "learning_rate": 2.725495144009793e-05,
2172
+ "loss": 0.8767,
2173
+ "step": 308
2174
+ },
2175
+ {
2176
+ "epoch": 1.74,
2177
+ "grad_norm": 0.11575683203124575,
2178
+ "learning_rate": 2.705948579011318e-05,
2179
+ "loss": 0.8729,
2180
+ "step": 309
2181
+ },
2182
+ {
2183
+ "epoch": 1.74,
2184
+ "grad_norm": 0.14050128592489014,
2185
+ "learning_rate": 2.6864146101970402e-05,
2186
+ "loss": 0.8798,
2187
+ "step": 310
2188
+ },
2189
+ {
2190
+ "epoch": 1.75,
2191
+ "grad_norm": 0.11824436396680635,
2192
+ "learning_rate": 2.6668940743371674e-05,
2193
+ "loss": 0.8835,
2194
+ "step": 311
2195
+ },
2196
+ {
2197
+ "epoch": 1.75,
2198
+ "grad_norm": 0.12959459755274583,
2199
+ "learning_rate": 2.6473878076264875e-05,
2200
+ "loss": 0.8751,
2201
+ "step": 312
2202
+ },
2203
+ {
2204
+ "epoch": 1.76,
2205
+ "grad_norm": 0.11622350222131687,
2206
+ "learning_rate": 2.627896645648545e-05,
2207
+ "loss": 0.8931,
2208
+ "step": 313
2209
+ },
2210
+ {
2211
+ "epoch": 1.76,
2212
+ "grad_norm": 0.12314437146185572,
2213
+ "learning_rate": 2.608421423339846e-05,
2214
+ "loss": 0.8865,
2215
+ "step": 314
2216
+ },
2217
+ {
2218
+ "epoch": 1.77,
2219
+ "grad_norm": 0.11421490206501421,
2220
+ "learning_rate": 2.5889629749540966e-05,
2221
+ "loss": 0.8824,
2222
+ "step": 315
2223
+ },
2224
+ {
2225
+ "epoch": 1.78,
2226
+ "grad_norm": 0.1116631859004068,
2227
+ "learning_rate": 2.5695221340264626e-05,
2228
+ "loss": 0.8803,
2229
+ "step": 316
2230
+ },
2231
+ {
2232
+ "epoch": 1.78,
2233
+ "grad_norm": 0.10817788865943528,
2234
+ "learning_rate": 2.5500997333378646e-05,
2235
+ "loss": 0.8794,
2236
+ "step": 317
2237
+ },
2238
+ {
2239
+ "epoch": 1.79,
2240
+ "grad_norm": 0.11523773392162422,
2241
+ "learning_rate": 2.530696604879307e-05,
2242
+ "loss": 0.8832,
2243
+ "step": 318
2244
+ },
2245
+ {
2246
+ "epoch": 1.79,
2247
+ "grad_norm": 0.10502311886881133,
2248
+ "learning_rate": 2.5113135798162342e-05,
2249
+ "loss": 0.8816,
2250
+ "step": 319
2251
+ },
2252
+ {
2253
+ "epoch": 1.8,
2254
+ "grad_norm": 0.11059154509363399,
2255
+ "learning_rate": 2.4919514884529262e-05,
2256
+ "loss": 0.8787,
2257
+ "step": 320
2258
+ },
2259
+ {
2260
+ "epoch": 1.8,
2261
+ "grad_norm": 0.10388990188288608,
2262
+ "learning_rate": 2.4726111601969365e-05,
2263
+ "loss": 0.8824,
2264
+ "step": 321
2265
+ },
2266
+ {
2267
+ "epoch": 1.81,
2268
+ "grad_norm": 0.112930784152048,
2269
+ "learning_rate": 2.4532934235235574e-05,
2270
+ "loss": 0.8744,
2271
+ "step": 322
2272
+ },
2273
+ {
2274
+ "epoch": 1.81,
2275
+ "grad_norm": 0.11158721627694267,
2276
+ "learning_rate": 2.433999105940335e-05,
2277
+ "loss": 0.8847,
2278
+ "step": 323
2279
+ },
2280
+ {
2281
+ "epoch": 1.82,
2282
+ "grad_norm": 0.10338686368057544,
2283
+ "learning_rate": 2.4147290339516156e-05,
2284
+ "loss": 0.877,
2285
+ "step": 324
2286
+ },
2287
+ {
2288
+ "epoch": 1.83,
2289
+ "grad_norm": 0.1071678174366947,
2290
+ "learning_rate": 2.3954840330231487e-05,
2291
+ "loss": 0.884,
2292
+ "step": 325
2293
+ },
2294
+ {
2295
+ "epoch": 1.83,
2296
+ "grad_norm": 0.11226132071651221,
2297
+ "learning_rate": 2.3762649275467226e-05,
2298
+ "loss": 0.8747,
2299
+ "step": 326
2300
+ },
2301
+ {
2302
+ "epoch": 1.84,
2303
+ "grad_norm": 0.10930430786464529,
2304
+ "learning_rate": 2.3570725408048483e-05,
2305
+ "loss": 0.883,
2306
+ "step": 327
2307
+ },
2308
+ {
2309
+ "epoch": 1.84,
2310
+ "grad_norm": 0.10447714161398361,
2311
+ "learning_rate": 2.337907694935497e-05,
2312
+ "loss": 0.8797,
2313
+ "step": 328
2314
+ },
2315
+ {
2316
+ "epoch": 1.85,
2317
+ "grad_norm": 0.10888020274939628,
2318
+ "learning_rate": 2.3187712108968808e-05,
2319
+ "loss": 0.8779,
2320
+ "step": 329
2321
+ },
2322
+ {
2323
+ "epoch": 1.85,
2324
+ "grad_norm": 0.10873276266965934,
2325
+ "learning_rate": 2.2996639084322848e-05,
2326
+ "loss": 0.8716,
2327
+ "step": 330
2328
+ },
2329
+ {
2330
+ "epoch": 1.86,
2331
+ "grad_norm": 0.1051804371547165,
2332
+ "learning_rate": 2.2805866060349513e-05,
2333
+ "loss": 0.8925,
2334
+ "step": 331
2335
+ },
2336
+ {
2337
+ "epoch": 1.87,
2338
+ "grad_norm": 0.1073547759858754,
2339
+ "learning_rate": 2.261540120913021e-05,
2340
+ "loss": 0.8763,
2341
+ "step": 332
2342
+ },
2343
+ {
2344
+ "epoch": 1.87,
2345
+ "grad_norm": 0.10582013545010713,
2346
+ "learning_rate": 2.242525268954526e-05,
2347
+ "loss": 0.8801,
2348
+ "step": 333
2349
+ },
2350
+ {
2351
+ "epoch": 1.88,
2352
+ "grad_norm": 0.11360898822883193,
2353
+ "learning_rate": 2.2235428646924375e-05,
2354
+ "loss": 0.8861,
2355
+ "step": 334
2356
+ },
2357
+ {
2358
+ "epoch": 1.88,
2359
+ "grad_norm": 0.10517104031304181,
2360
+ "learning_rate": 2.2045937212697755e-05,
2361
+ "loss": 0.875,
2362
+ "step": 335
2363
+ },
2364
+ {
2365
+ "epoch": 1.89,
2366
+ "grad_norm": 0.11733374572446251,
2367
+ "learning_rate": 2.1856786504047774e-05,
2368
+ "loss": 0.8771,
2369
+ "step": 336
2370
+ },
2371
+ {
2372
+ "epoch": 1.89,
2373
+ "grad_norm": 0.10165485322637723,
2374
+ "learning_rate": 2.1667984623561237e-05,
2375
+ "loss": 0.8772,
2376
+ "step": 337
2377
+ },
2378
+ {
2379
+ "epoch": 1.9,
2380
+ "grad_norm": 0.12525581851096768,
2381
+ "learning_rate": 2.147953965888232e-05,
2382
+ "loss": 0.8893,
2383
+ "step": 338
2384
+ },
2385
+ {
2386
+ "epoch": 1.9,
2387
+ "grad_norm": 0.0991811666289482,
2388
+ "learning_rate": 2.1291459682366136e-05,
2389
+ "loss": 0.8777,
2390
+ "step": 339
2391
+ },
2392
+ {
2393
+ "epoch": 1.91,
2394
+ "grad_norm": 0.10828713812832842,
2395
+ "learning_rate": 2.1103752750732875e-05,
2396
+ "loss": 0.8834,
2397
+ "step": 340
2398
+ },
2399
+ {
2400
+ "epoch": 1.92,
2401
+ "grad_norm": 0.11106845149741414,
2402
+ "learning_rate": 2.091642690472277e-05,
2403
+ "loss": 0.874,
2404
+ "step": 341
2405
+ },
2406
+ {
2407
+ "epoch": 1.92,
2408
+ "grad_norm": 0.10116959541779709,
2409
+ "learning_rate": 2.072949016875158e-05,
2410
+ "loss": 0.8677,
2411
+ "step": 342
2412
+ },
2413
+ {
2414
+ "epoch": 1.93,
2415
+ "grad_norm": 0.11266898014907221,
2416
+ "learning_rate": 2.054295055056692e-05,
2417
+ "loss": 0.8926,
2418
+ "step": 343
2419
+ },
2420
+ {
2421
+ "epoch": 1.93,
2422
+ "grad_norm": 0.10220798261268464,
2423
+ "learning_rate": 2.035681604090516e-05,
2424
+ "loss": 0.8645,
2425
+ "step": 344
2426
+ },
2427
+ {
2428
+ "epoch": 1.94,
2429
+ "grad_norm": 0.1060144270004459,
2430
+ "learning_rate": 2.0171094613149198e-05,
2431
+ "loss": 0.8799,
2432
+ "step": 345
2433
+ },
2434
+ {
2435
+ "epoch": 1.94,
2436
+ "grad_norm": 0.10968581060830482,
2437
+ "learning_rate": 1.9985794222986876e-05,
2438
+ "loss": 0.8763,
2439
+ "step": 346
2440
+ },
2441
+ {
2442
+ "epoch": 1.95,
2443
+ "grad_norm": 0.11650138283261792,
2444
+ "learning_rate": 1.980092280807017e-05,
2445
+ "loss": 0.884,
2446
+ "step": 347
2447
+ },
2448
+ {
2449
+ "epoch": 1.96,
2450
+ "grad_norm": 0.10120824187747386,
2451
+ "learning_rate": 1.9616488287675206e-05,
2452
+ "loss": 0.8749,
2453
+ "step": 348
2454
+ },
2455
+ {
2456
+ "epoch": 1.96,
2457
+ "grad_norm": 0.11604072229024492,
2458
+ "learning_rate": 1.9432498562362997e-05,
2459
+ "loss": 0.8796,
2460
+ "step": 349
2461
+ },
2462
+ {
2463
+ "epoch": 1.97,
2464
+ "grad_norm": 0.10278157857112963,
2465
+ "learning_rate": 1.924896151364099e-05,
2466
+ "loss": 0.8793,
2467
+ "step": 350
2468
+ },
2469
+ {
2470
+ "epoch": 1.97,
2471
+ "grad_norm": 0.10712761878171682,
2472
+ "learning_rate": 1.906588500362551e-05,
2473
+ "loss": 0.8801,
2474
+ "step": 351
2475
+ },
2476
+ {
2477
+ "epoch": 1.98,
2478
+ "grad_norm": 0.10593823683108748,
2479
+ "learning_rate": 1.888327687470491e-05,
2480
+ "loss": 0.8783,
2481
+ "step": 352
2482
+ },
2483
+ {
2484
+ "epoch": 1.98,
2485
+ "grad_norm": 0.10348796531289477,
2486
+ "learning_rate": 1.8701144949203677e-05,
2487
+ "loss": 0.8786,
2488
+ "step": 353
2489
+ },
2490
+ {
2491
+ "epoch": 1.99,
2492
+ "grad_norm": 0.1074116166391481,
2493
+ "learning_rate": 1.8519497029047307e-05,
2494
+ "loss": 0.8778,
2495
+ "step": 354
2496
+ },
2497
+ {
2498
+ "epoch": 1.99,
2499
+ "grad_norm": 0.11058482340970019,
2500
+ "learning_rate": 1.833834089542813e-05,
2501
+ "loss": 0.8713,
2502
+ "step": 355
2503
+ },
2504
+ {
2505
+ "epoch": 2.0,
2506
+ "grad_norm": 0.11020651242885636,
2507
+ "learning_rate": 1.8157684308471988e-05,
2508
+ "loss": 0.8687,
2509
+ "step": 356
2510
+ },
2511
+ {
2512
+ "epoch": 2.0,
2513
+ "eval_loss": 0.8850164413452148,
2514
+ "eval_runtime": 325.5979,
2515
+ "eval_samples_per_second": 40.218,
2516
+ "eval_steps_per_second": 0.061,
2517
+ "step": 356
2518
+ },
2519
+ {
2520
+ "epoch": 2.01,
2521
+ "grad_norm": 0.12003832997057942,
2522
+ "learning_rate": 1.7977535006905776e-05,
2523
+ "loss": 0.8401,
2524
+ "step": 357
2525
+ },
2526
+ {
2527
+ "epoch": 2.01,
2528
+ "grad_norm": 0.10675891351152236,
2529
+ "learning_rate": 1.7797900707726e-05,
2530
+ "loss": 0.8486,
2531
+ "step": 358
2532
+ },
2533
+ {
2534
+ "epoch": 2.02,
2535
+ "grad_norm": 0.13479155868015855,
2536
+ "learning_rate": 1.761878910586816e-05,
2537
+ "loss": 0.8508,
2538
+ "step": 359
2539
+ },
2540
+ {
2541
+ "epoch": 2.02,
2542
+ "grad_norm": 0.11372338660989749,
2543
+ "learning_rate": 1.7440207873877165e-05,
2544
+ "loss": 0.8415,
2545
+ "step": 360
2546
+ },
2547
+ {
2548
+ "epoch": 2.03,
2549
+ "grad_norm": 0.11106742867861737,
2550
+ "learning_rate": 1.7262164661578614e-05,
2551
+ "loss": 0.8546,
2552
+ "step": 361
2553
+ },
2554
+ {
2555
+ "epoch": 2.03,
2556
+ "grad_norm": 0.10946128209473692,
2557
+ "learning_rate": 1.708466709575114e-05,
2558
+ "loss": 0.8477,
2559
+ "step": 362
2560
+ },
2561
+ {
2562
+ "epoch": 2.04,
2563
+ "grad_norm": 0.10202238774555161,
2564
+ "learning_rate": 1.6907722779799732e-05,
2565
+ "loss": 0.8498,
2566
+ "step": 363
2567
+ },
2568
+ {
2569
+ "epoch": 2.04,
2570
+ "grad_norm": 0.11732343725531236,
2571
+ "learning_rate": 1.6731339293429967e-05,
2572
+ "loss": 0.8462,
2573
+ "step": 364
2574
+ },
2575
+ {
2576
+ "epoch": 2.05,
2577
+ "grad_norm": 0.10655979023984713,
2578
+ "learning_rate": 1.6555524192323327e-05,
2579
+ "loss": 0.8497,
2580
+ "step": 365
2581
+ },
2582
+ {
2583
+ "epoch": 2.06,
2584
+ "grad_norm": 0.1124168518526382,
2585
+ "learning_rate": 1.6380285007813596e-05,
2586
+ "loss": 0.8479,
2587
+ "step": 366
2588
+ },
2589
+ {
2590
+ "epoch": 2.06,
2591
+ "grad_norm": 0.10356449268514911,
2592
+ "learning_rate": 1.6205629246564205e-05,
2593
+ "loss": 0.852,
2594
+ "step": 367
2595
+ },
2596
+ {
2597
+ "epoch": 2.07,
2598
+ "grad_norm": 0.11466802705572018,
2599
+ "learning_rate": 1.6031564390246658e-05,
2600
+ "loss": 0.8469,
2601
+ "step": 368
2602
+ },
2603
+ {
2604
+ "epoch": 2.07,
2605
+ "grad_norm": 0.10022099638563725,
2606
+ "learning_rate": 1.585809789522007e-05,
2607
+ "loss": 0.8553,
2608
+ "step": 369
2609
+ },
2610
+ {
2611
+ "epoch": 2.08,
2612
+ "grad_norm": 0.11768853322439309,
2613
+ "learning_rate": 1.5685237192211747e-05,
2614
+ "loss": 0.8587,
2615
+ "step": 370
2616
+ },
2617
+ {
2618
+ "epoch": 2.08,
2619
+ "grad_norm": 0.09746799497107858,
2620
+ "learning_rate": 1.551298968599889e-05,
2621
+ "loss": 0.8461,
2622
+ "step": 371
2623
+ },
2624
+ {
2625
+ "epoch": 2.09,
2626
+ "grad_norm": 0.10423547308687264,
2627
+ "learning_rate": 1.534136275509136e-05,
2628
+ "loss": 0.8485,
2629
+ "step": 372
2630
+ },
2631
+ {
2632
+ "epoch": 2.1,
2633
+ "grad_norm": 0.1064669106822173,
2634
+ "learning_rate": 1.517036375141567e-05,
2635
+ "loss": 0.8495,
2636
+ "step": 373
2637
+ },
2638
+ {
2639
+ "epoch": 2.1,
2640
+ "grad_norm": 0.09616752021785893,
2641
+ "learning_rate": 1.5000000000000007e-05,
2642
+ "loss": 0.8438,
2643
+ "step": 374
2644
+ },
2645
+ {
2646
+ "epoch": 2.11,
2647
+ "grad_norm": 0.11068107117718663,
2648
+ "learning_rate": 1.4830278798660467e-05,
2649
+ "loss": 0.8563,
2650
+ "step": 375
2651
+ },
2652
+ {
2653
+ "epoch": 2.11,
2654
+ "grad_norm": 0.11633212251432552,
2655
+ "learning_rate": 1.4661207417688442e-05,
2656
+ "loss": 0.8541,
2657
+ "step": 376
2658
+ },
2659
+ {
2660
+ "epoch": 2.12,
2661
+ "grad_norm": 0.09985284219816701,
2662
+ "learning_rate": 1.4492793099539175e-05,
2663
+ "loss": 0.855,
2664
+ "step": 377
2665
+ },
2666
+ {
2667
+ "epoch": 2.12,
2668
+ "grad_norm": 0.11580112488775175,
2669
+ "learning_rate": 1.4325043058521537e-05,
2670
+ "loss": 0.8563,
2671
+ "step": 378
2672
+ },
2673
+ {
2674
+ "epoch": 2.13,
2675
+ "grad_norm": 0.0968466939956641,
2676
+ "learning_rate": 1.415796448048896e-05,
2677
+ "loss": 0.8526,
2678
+ "step": 379
2679
+ },
2680
+ {
2681
+ "epoch": 2.13,
2682
+ "grad_norm": 0.10786154660093858,
2683
+ "learning_rate": 1.3991564522531655e-05,
2684
+ "loss": 0.8485,
2685
+ "step": 380
2686
+ },
2687
+ {
2688
+ "epoch": 2.14,
2689
+ "grad_norm": 0.09923333146234646,
2690
+ "learning_rate": 1.3825850312669992e-05,
2691
+ "loss": 0.8513,
2692
+ "step": 381
2693
+ },
2694
+ {
2695
+ "epoch": 2.15,
2696
+ "grad_norm": 0.10427648708884239,
2697
+ "learning_rate": 1.3660828949549189e-05,
2698
+ "loss": 0.8486,
2699
+ "step": 382
2700
+ },
2701
+ {
2702
+ "epoch": 2.15,
2703
+ "grad_norm": 0.12292462282144442,
2704
+ "learning_rate": 1.34965075021352e-05,
2705
+ "loss": 0.8481,
2706
+ "step": 383
2707
+ },
2708
+ {
2709
+ "epoch": 2.16,
2710
+ "grad_norm": 0.1028967402644067,
2711
+ "learning_rate": 1.3332893009411942e-05,
2712
+ "loss": 0.8505,
2713
+ "step": 384
2714
+ },
2715
+ {
2716
+ "epoch": 2.16,
2717
+ "grad_norm": 0.10410540672946561,
2718
+ "learning_rate": 1.3169992480079712e-05,
2719
+ "loss": 0.8513,
2720
+ "step": 385
2721
+ },
2722
+ {
2723
+ "epoch": 2.17,
2724
+ "grad_norm": 0.09846244373078215,
2725
+ "learning_rate": 1.3007812892255022e-05,
2726
+ "loss": 0.8474,
2727
+ "step": 386
2728
+ },
2729
+ {
2730
+ "epoch": 2.17,
2731
+ "grad_norm": 0.1030135486730131,
2732
+ "learning_rate": 1.2846361193171636e-05,
2733
+ "loss": 0.8539,
2734
+ "step": 387
2735
+ },
2736
+ {
2737
+ "epoch": 2.18,
2738
+ "grad_norm": 0.09722668637542492,
2739
+ "learning_rate": 1.2685644298882995e-05,
2740
+ "loss": 0.8469,
2741
+ "step": 388
2742
+ },
2743
+ {
2744
+ "epoch": 2.19,
2745
+ "grad_norm": 0.10002106907707546,
2746
+ "learning_rate": 1.2525669093965938e-05,
2747
+ "loss": 0.8538,
2748
+ "step": 389
2749
+ },
2750
+ {
2751
+ "epoch": 2.19,
2752
+ "grad_norm": 0.1013552416026544,
2753
+ "learning_rate": 1.2366442431225809e-05,
2754
+ "loss": 0.8402,
2755
+ "step": 390
2756
+ },
2757
+ {
2758
+ "epoch": 2.2,
2759
+ "grad_norm": 0.10045607980672773,
2760
+ "learning_rate": 1.2207971131402889e-05,
2761
+ "loss": 0.8538,
2762
+ "step": 391
2763
+ },
2764
+ {
2765
+ "epoch": 2.2,
2766
+ "grad_norm": 0.10092477315351785,
2767
+ "learning_rate": 1.2050261982880229e-05,
2768
+ "loss": 0.8493,
2769
+ "step": 392
2770
+ },
2771
+ {
2772
+ "epoch": 2.21,
2773
+ "grad_norm": 0.09631205966968917,
2774
+ "learning_rate": 1.1893321741392857e-05,
2775
+ "loss": 0.844,
2776
+ "step": 393
2777
+ },
2778
+ {
2779
+ "epoch": 2.21,
2780
+ "grad_norm": 0.09480564837357011,
2781
+ "learning_rate": 1.173715712973838e-05,
2782
+ "loss": 0.8516,
2783
+ "step": 394
2784
+ },
2785
+ {
2786
+ "epoch": 2.22,
2787
+ "grad_norm": 0.10519104968879345,
2788
+ "learning_rate": 1.1581774837489004e-05,
2789
+ "loss": 0.8489,
2790
+ "step": 395
2791
+ },
2792
+ {
2793
+ "epoch": 2.22,
2794
+ "grad_norm": 0.09383687719586851,
2795
+ "learning_rate": 1.1427181520704977e-05,
2796
+ "loss": 0.8423,
2797
+ "step": 396
2798
+ },
2799
+ {
2800
+ "epoch": 2.23,
2801
+ "grad_norm": 0.09314803679639443,
2802
+ "learning_rate": 1.1273383801649465e-05,
2803
+ "loss": 0.855,
2804
+ "step": 397
2805
+ },
2806
+ {
2807
+ "epoch": 2.24,
2808
+ "grad_norm": 0.09785284085871111,
2809
+ "learning_rate": 1.1120388268504882e-05,
2810
+ "loss": 0.8592,
2811
+ "step": 398
2812
+ },
2813
+ {
2814
+ "epoch": 2.24,
2815
+ "grad_norm": 0.10014383710533643,
2816
+ "learning_rate": 1.0968201475090638e-05,
2817
+ "loss": 0.8527,
2818
+ "step": 399
2819
+ },
2820
+ {
2821
+ "epoch": 2.25,
2822
+ "grad_norm": 0.09342386714963961,
2823
+ "learning_rate": 1.081682994058246e-05,
2824
+ "loss": 0.8518,
2825
+ "step": 400
2826
+ },
2827
+ {
2828
+ "epoch": 2.25,
2829
+ "grad_norm": 0.09637941328010605,
2830
+ "learning_rate": 1.0666280149233084e-05,
2831
+ "loss": 0.8611,
2832
+ "step": 401
2833
+ },
2834
+ {
2835
+ "epoch": 2.26,
2836
+ "grad_norm": 0.09382962878187683,
2837
+ "learning_rate": 1.0516558550094494e-05,
2838
+ "loss": 0.8534,
2839
+ "step": 402
2840
+ },
2841
+ {
2842
+ "epoch": 2.26,
2843
+ "grad_norm": 0.09961841204532747,
2844
+ "learning_rate": 1.036767155674169e-05,
2845
+ "loss": 0.8616,
2846
+ "step": 403
2847
+ },
2848
+ {
2849
+ "epoch": 2.27,
2850
+ "grad_norm": 0.09436634787644145,
2851
+ "learning_rate": 1.0219625546997936e-05,
2852
+ "loss": 0.8484,
2853
+ "step": 404
2854
+ },
2855
+ {
2856
+ "epoch": 2.28,
2857
+ "grad_norm": 0.09434782225624004,
2858
+ "learning_rate": 1.0072426862661559e-05,
2859
+ "loss": 0.8543,
2860
+ "step": 405
2861
+ },
2862
+ {
2863
+ "epoch": 2.28,
2864
+ "grad_norm": 0.09240933321648483,
2865
+ "learning_rate": 9.926081809234262e-06,
2866
+ "loss": 0.8521,
2867
+ "step": 406
2868
+ },
2869
+ {
2870
+ "epoch": 2.29,
2871
+ "grad_norm": 0.09529963951322644,
2872
+ "learning_rate": 9.780596655651062e-06,
2873
+ "loss": 0.8502,
2874
+ "step": 407
2875
+ },
2876
+ {
2877
+ "epoch": 2.29,
2878
+ "grad_norm": 0.09779115542409987,
2879
+ "learning_rate": 9.635977634011746e-06,
2880
+ "loss": 0.8538,
2881
+ "step": 408
2882
+ },
2883
+ {
2884
+ "epoch": 2.3,
2885
+ "grad_norm": 0.0953318034742421,
2886
+ "learning_rate": 9.492230939313859e-06,
2887
+ "loss": 0.8462,
2888
+ "step": 409
2889
+ },
2890
+ {
2891
+ "epoch": 2.3,
2892
+ "grad_norm": 0.09533554193414427,
2893
+ "learning_rate": 9.349362729187376e-06,
2894
+ "loss": 0.8505,
2895
+ "step": 410
2896
+ },
2897
+ {
2898
+ "epoch": 2.31,
2899
+ "grad_norm": 0.09108747344016899,
2900
+ "learning_rate": 9.207379123630928e-06,
2901
+ "loss": 0.8364,
2902
+ "step": 411
2903
+ },
2904
+ {
2905
+ "epoch": 2.31,
2906
+ "grad_norm": 0.09157394780369447,
2907
+ "learning_rate": 9.066286204749602e-06,
2908
+ "loss": 0.8542,
2909
+ "step": 412
2910
+ },
2911
+ {
2912
+ "epoch": 2.32,
2913
+ "grad_norm": 0.09626988034418861,
2914
+ "learning_rate": 8.926090016494452e-06,
2915
+ "loss": 0.8395,
2916
+ "step": 413
2917
+ },
2918
+ {
2919
+ "epoch": 2.33,
2920
+ "grad_norm": 0.0932671686945301,
2921
+ "learning_rate": 8.786796564403577e-06,
2922
+ "loss": 0.8545,
2923
+ "step": 414
2924
+ },
2925
+ {
2926
+ "epoch": 2.33,
2927
+ "grad_norm": 0.09177787384063951,
2928
+ "learning_rate": 8.648411815344862e-06,
2929
+ "loss": 0.8478,
2930
+ "step": 415
2931
+ },
2932
+ {
2933
+ "epoch": 2.34,
2934
+ "grad_norm": 0.09283856078392008,
2935
+ "learning_rate": 8.510941697260372e-06,
2936
+ "loss": 0.8482,
2937
+ "step": 416
2938
+ },
2939
+ {
2940
+ "epoch": 2.34,
2941
+ "grad_norm": 0.09505027194291998,
2942
+ "learning_rate": 8.374392098912435e-06,
2943
+ "loss": 0.8515,
2944
+ "step": 417
2945
+ },
2946
+ {
2947
+ "epoch": 2.35,
2948
+ "grad_norm": 0.09061937534466419,
2949
+ "learning_rate": 8.238768869631379e-06,
2950
+ "loss": 0.8419,
2951
+ "step": 418
2952
+ },
2953
+ {
2954
+ "epoch": 2.35,
2955
+ "grad_norm": 0.09687410932411479,
2956
+ "learning_rate": 8.104077819064939e-06,
2957
+ "loss": 0.8455,
2958
+ "step": 419
2959
+ },
2960
+ {
2961
+ "epoch": 2.36,
2962
+ "grad_norm": 0.0919488286995836,
2963
+ "learning_rate": 7.97032471692944e-06,
2964
+ "loss": 0.8543,
2965
+ "step": 420
2966
+ },
2967
+ {
2968
+ "epoch": 2.37,
2969
+ "grad_norm": 0.0917348716297872,
2970
+ "learning_rate": 7.837515292762618e-06,
2971
+ "loss": 0.84,
2972
+ "step": 421
2973
+ },
2974
+ {
2975
+ "epoch": 2.37,
2976
+ "grad_norm": 0.0970295444546269,
2977
+ "learning_rate": 7.70565523567817e-06,
2978
+ "loss": 0.8483,
2979
+ "step": 422
2980
+ },
2981
+ {
2982
+ "epoch": 2.38,
2983
+ "grad_norm": 0.09120283035787119,
2984
+ "learning_rate": 7.5747501941220924e-06,
2985
+ "loss": 0.8495,
2986
+ "step": 423
2987
+ },
2988
+ {
2989
+ "epoch": 2.38,
2990
+ "grad_norm": 0.09237994609930707,
2991
+ "learning_rate": 7.444805775630682e-06,
2992
+ "loss": 0.8426,
2993
+ "step": 424
2994
+ },
2995
+ {
2996
+ "epoch": 2.39,
2997
+ "grad_norm": 0.0913397657548416,
2998
+ "learning_rate": 7.315827546590318e-06,
2999
+ "loss": 0.8512,
3000
+ "step": 425
3001
+ },
3002
+ {
3003
+ "epoch": 2.39,
3004
+ "grad_norm": 0.09498181364054688,
3005
+ "learning_rate": 7.187821031999073e-06,
3006
+ "loss": 0.8481,
3007
+ "step": 426
3008
+ },
3009
+ {
3010
+ "epoch": 2.4,
3011
+ "grad_norm": 0.10078078724917741,
3012
+ "learning_rate": 7.0607917152299905e-06,
3013
+ "loss": 0.8455,
3014
+ "step": 427
3015
+ },
3016
+ {
3017
+ "epoch": 2.4,
3018
+ "grad_norm": 0.09145562291129593,
3019
+ "learning_rate": 6.9347450377962165e-06,
3020
+ "loss": 0.8491,
3021
+ "step": 428
3022
+ },
3023
+ {
3024
+ "epoch": 2.41,
3025
+ "grad_norm": 0.09158338626328179,
3026
+ "learning_rate": 6.8096863991178906e-06,
3027
+ "loss": 0.8561,
3028
+ "step": 429
3029
+ },
3030
+ {
3031
+ "epoch": 2.42,
3032
+ "grad_norm": 0.09207229087759045,
3033
+ "learning_rate": 6.685621156290873e-06,
3034
+ "loss": 0.8467,
3035
+ "step": 430
3036
+ },
3037
+ {
3038
+ "epoch": 2.42,
3039
+ "grad_norm": 0.08985403499699009,
3040
+ "learning_rate": 6.562554623857251e-06,
3041
+ "loss": 0.8446,
3042
+ "step": 431
3043
+ },
3044
+ {
3045
+ "epoch": 2.43,
3046
+ "grad_norm": 0.09232970248222189,
3047
+ "learning_rate": 6.440492073577659e-06,
3048
+ "loss": 0.8412,
3049
+ "step": 432
3050
+ },
3051
+ {
3052
+ "epoch": 2.43,
3053
+ "grad_norm": 0.08869732511140344,
3054
+ "learning_rate": 6.319438734205503e-06,
3055
+ "loss": 0.8533,
3056
+ "step": 433
3057
+ },
3058
+ {
3059
+ "epoch": 2.44,
3060
+ "grad_norm": 0.09159355570132911,
3061
+ "learning_rate": 6.199399791262949e-06,
3062
+ "loss": 0.8426,
3063
+ "step": 434
3064
+ },
3065
+ {
3066
+ "epoch": 2.44,
3067
+ "grad_norm": 0.08776078153410961,
3068
+ "learning_rate": 6.08038038681879e-06,
3069
+ "loss": 0.8468,
3070
+ "step": 435
3071
+ },
3072
+ {
3073
+ "epoch": 2.45,
3074
+ "grad_norm": 0.08961006632692203,
3075
+ "learning_rate": 5.962385619268184e-06,
3076
+ "loss": 0.8443,
3077
+ "step": 436
3078
+ },
3079
+ {
3080
+ "epoch": 2.46,
3081
+ "grad_norm": 0.09327133481154146,
3082
+ "learning_rate": 5.845420543114255e-06,
3083
+ "loss": 0.8507,
3084
+ "step": 437
3085
+ },
3086
+ {
3087
+ "epoch": 2.46,
3088
+ "grad_norm": 0.08783358817348458,
3089
+ "learning_rate": 5.72949016875158e-06,
3090
+ "loss": 0.8444,
3091
+ "step": 438
3092
+ },
3093
+ {
3094
+ "epoch": 2.47,
3095
+ "grad_norm": 0.09080799921546491,
3096
+ "learning_rate": 5.614599462251546e-06,
3097
+ "loss": 0.8504,
3098
+ "step": 439
3099
+ },
3100
+ {
3101
+ "epoch": 2.47,
3102
+ "grad_norm": 0.09120455202673097,
3103
+ "learning_rate": 5.500753345149633e-06,
3104
+ "loss": 0.8511,
3105
+ "step": 440
3106
+ },
3107
+ {
3108
+ "epoch": 2.48,
3109
+ "grad_norm": 0.0883979066402917,
3110
+ "learning_rate": 5.387956694234592e-06,
3111
+ "loss": 0.8346,
3112
+ "step": 441
3113
+ },
3114
+ {
3115
+ "epoch": 2.48,
3116
+ "grad_norm": 0.08782824261068842,
3117
+ "learning_rate": 5.2762143413395296e-06,
3118
+ "loss": 0.8499,
3119
+ "step": 442
3120
+ },
3121
+ {
3122
+ "epoch": 2.49,
3123
+ "grad_norm": 0.09088620542561385,
3124
+ "learning_rate": 5.165531073134936e-06,
3125
+ "loss": 0.8496,
3126
+ "step": 443
3127
+ },
3128
+ {
3129
+ "epoch": 2.49,
3130
+ "grad_norm": 0.08784209719890995,
3131
+ "learning_rate": 5.05591163092364e-06,
3132
+ "loss": 0.8406,
3133
+ "step": 444
3134
+ },
3135
+ {
3136
+ "epoch": 2.5,
3137
+ "grad_norm": 0.09132953206887262,
3138
+ "learning_rate": 4.9473607104377105e-06,
3139
+ "loss": 0.8459,
3140
+ "step": 445
3141
+ },
3142
+ {
3143
+ "epoch": 2.51,
3144
+ "grad_norm": 0.08772978666632109,
3145
+ "learning_rate": 4.839882961637282e-06,
3146
+ "loss": 0.8505,
3147
+ "step": 446
3148
+ },
3149
+ {
3150
+ "epoch": 2.51,
3151
+ "grad_norm": 0.09985768462831238,
3152
+ "learning_rate": 4.733482988511407e-06,
3153
+ "loss": 0.8534,
3154
+ "step": 447
3155
+ },
3156
+ {
3157
+ "epoch": 2.52,
3158
+ "grad_norm": 0.08761275010174724,
3159
+ "learning_rate": 4.628165348880804e-06,
3160
+ "loss": 0.8524,
3161
+ "step": 448
3162
+ },
3163
+ {
3164
+ "epoch": 2.52,
3165
+ "grad_norm": 0.09041594806081453,
3166
+ "learning_rate": 4.523934554202636e-06,
3167
+ "loss": 0.8529,
3168
+ "step": 449
3169
+ },
3170
+ {
3171
+ "epoch": 2.53,
3172
+ "grad_norm": 0.08661829609254855,
3173
+ "learning_rate": 4.4207950693772345e-06,
3174
+ "loss": 0.8409,
3175
+ "step": 450
3176
+ },
3177
+ {
3178
+ "epoch": 2.53,
3179
+ "grad_norm": 0.0895302020625196,
3180
+ "learning_rate": 4.3187513125568586e-06,
3181
+ "loss": 0.8453,
3182
+ "step": 451
3183
+ },
3184
+ {
3185
+ "epoch": 2.54,
3186
+ "grad_norm": 0.08974158495627368,
3187
+ "learning_rate": 4.217807654956419e-06,
3188
+ "loss": 0.8427,
3189
+ "step": 452
3190
+ },
3191
+ {
3192
+ "epoch": 2.54,
3193
+ "grad_norm": 0.0881031632024463,
3194
+ "learning_rate": 4.117968420666245e-06,
3195
+ "loss": 0.8518,
3196
+ "step": 453
3197
+ },
3198
+ {
3199
+ "epoch": 2.55,
3200
+ "grad_norm": 0.08911231289710217,
3201
+ "learning_rate": 4.019237886466839e-06,
3202
+ "loss": 0.8479,
3203
+ "step": 454
3204
+ },
3205
+ {
3206
+ "epoch": 2.56,
3207
+ "grad_norm": 0.08758751320687048,
3208
+ "learning_rate": 3.921620281645688e-06,
3209
+ "loss": 0.8428,
3210
+ "step": 455
3211
+ },
3212
+ {
3213
+ "epoch": 2.56,
3214
+ "grad_norm": 0.08762936227381564,
3215
+ "learning_rate": 3.825119787816085e-06,
3216
+ "loss": 0.8541,
3217
+ "step": 456
3218
+ },
3219
+ {
3220
+ "epoch": 2.57,
3221
+ "grad_norm": 0.09070004531926247,
3222
+ "learning_rate": 3.7297405387380066e-06,
3223
+ "loss": 0.8606,
3224
+ "step": 457
3225
+ },
3226
+ {
3227
+ "epoch": 2.57,
3228
+ "grad_norm": 0.0901394728812162,
3229
+ "learning_rate": 3.635486620141042e-06,
3230
+ "loss": 0.8439,
3231
+ "step": 458
3232
+ },
3233
+ {
3234
+ "epoch": 2.58,
3235
+ "grad_norm": 0.09035508650424855,
3236
+ "learning_rate": 3.542362069549352e-06,
3237
+ "loss": 0.8597,
3238
+ "step": 459
3239
+ },
3240
+ {
3241
+ "epoch": 2.58,
3242
+ "grad_norm": 0.08987208626466826,
3243
+ "learning_rate": 3.450370876108747e-06,
3244
+ "loss": 0.8549,
3245
+ "step": 460
3246
+ },
3247
+ {
3248
+ "epoch": 2.59,
3249
+ "grad_norm": 0.08584439292379578,
3250
+ "learning_rate": 3.3595169804157834e-06,
3251
+ "loss": 0.8511,
3252
+ "step": 461
3253
+ },
3254
+ {
3255
+ "epoch": 2.6,
3256
+ "grad_norm": 0.0885850408197027,
3257
+ "learning_rate": 3.2698042743489666e-06,
3258
+ "loss": 0.8538,
3259
+ "step": 462
3260
+ },
3261
+ {
3262
+ "epoch": 2.6,
3263
+ "grad_norm": 0.09187965878611364,
3264
+ "learning_rate": 3.1812366009020366e-06,
3265
+ "loss": 0.8509,
3266
+ "step": 463
3267
+ },
3268
+ {
3269
+ "epoch": 2.61,
3270
+ "grad_norm": 0.08932729014366118,
3271
+ "learning_rate": 3.0938177540193523e-06,
3272
+ "loss": 0.8422,
3273
+ "step": 464
3274
+ },
3275
+ {
3276
+ "epoch": 2.61,
3277
+ "grad_norm": 0.0850824069799195,
3278
+ "learning_rate": 3.0075514784333613e-06,
3279
+ "loss": 0.8388,
3280
+ "step": 465
3281
+ },
3282
+ {
3283
+ "epoch": 2.62,
3284
+ "grad_norm": 0.08738926155401434,
3285
+ "learning_rate": 2.922441469504188e-06,
3286
+ "loss": 0.8561,
3287
+ "step": 466
3288
+ },
3289
+ {
3290
+ "epoch": 2.62,
3291
+ "grad_norm": 0.08920484001133727,
3292
+ "learning_rate": 2.8384913730613404e-06,
3293
+ "loss": 0.8557,
3294
+ "step": 467
3295
+ },
3296
+ {
3297
+ "epoch": 2.63,
3298
+ "grad_norm": 0.08819170118822395,
3299
+ "learning_rate": 2.7557047852475594e-06,
3300
+ "loss": 0.858,
3301
+ "step": 468
3302
+ },
3303
+ {
3304
+ "epoch": 2.63,
3305
+ "grad_norm": 0.08869175897009467,
3306
+ "learning_rate": 2.674085252364723e-06,
3307
+ "loss": 0.8536,
3308
+ "step": 469
3309
+ },
3310
+ {
3311
+ "epoch": 2.64,
3312
+ "grad_norm": 0.09053609127800862,
3313
+ "learning_rate": 2.5936362707219708e-06,
3314
+ "loss": 0.8483,
3315
+ "step": 470
3316
+ },
3317
+ {
3318
+ "epoch": 2.65,
3319
+ "grad_norm": 0.08867818730722615,
3320
+ "learning_rate": 2.5143612864859246e-06,
3321
+ "loss": 0.8532,
3322
+ "step": 471
3323
+ },
3324
+ {
3325
+ "epoch": 2.65,
3326
+ "grad_norm": 0.09139340482992017,
3327
+ "learning_rate": 2.4362636955330543e-06,
3328
+ "loss": 0.8498,
3329
+ "step": 472
3330
+ },
3331
+ {
3332
+ "epoch": 2.66,
3333
+ "grad_norm": 0.09525892817222975,
3334
+ "learning_rate": 2.3593468433042278e-06,
3335
+ "loss": 0.8529,
3336
+ "step": 473
3337
+ },
3338
+ {
3339
+ "epoch": 2.66,
3340
+ "grad_norm": 0.08789827680089286,
3341
+ "learning_rate": 2.2836140246613977e-06,
3342
+ "loss": 0.8521,
3343
+ "step": 474
3344
+ },
3345
+ {
3346
+ "epoch": 2.67,
3347
+ "grad_norm": 0.08648818427629963,
3348
+ "learning_rate": 2.209068483746457e-06,
3349
+ "loss": 0.8422,
3350
+ "step": 475
3351
+ },
3352
+ {
3353
+ "epoch": 2.67,
3354
+ "grad_norm": 0.08960144249428664,
3355
+ "learning_rate": 2.135713413842273e-06,
3356
+ "loss": 0.8505,
3357
+ "step": 476
3358
+ },
3359
+ {
3360
+ "epoch": 2.68,
3361
+ "grad_norm": 0.08754870519933383,
3362
+ "learning_rate": 2.063551957235893e-06,
3363
+ "loss": 0.8402,
3364
+ "step": 477
3365
+ },
3366
+ {
3367
+ "epoch": 2.69,
3368
+ "grad_norm": 0.08550557755721046,
3369
+ "learning_rate": 1.992587205083951e-06,
3370
+ "loss": 0.8575,
3371
+ "step": 478
3372
+ },
3373
+ {
3374
+ "epoch": 2.69,
3375
+ "grad_norm": 0.08826896212264684,
3376
+ "learning_rate": 1.922822197280234e-06,
3377
+ "loss": 0.8541,
3378
+ "step": 479
3379
+ },
3380
+ {
3381
+ "epoch": 2.7,
3382
+ "grad_norm": 0.08668882825496892,
3383
+ "learning_rate": 1.8542599223254786e-06,
3384
+ "loss": 0.846,
3385
+ "step": 480
3386
+ },
3387
+ {
3388
+ "epoch": 2.7,
3389
+ "grad_norm": 0.08475522972496816,
3390
+ "learning_rate": 1.7869033171993575e-06,
3391
+ "loss": 0.8407,
3392
+ "step": 481
3393
+ },
3394
+ {
3395
+ "epoch": 2.71,
3396
+ "grad_norm": 0.08771710800870834,
3397
+ "learning_rate": 1.7207552672346471e-06,
3398
+ "loss": 0.8503,
3399
+ "step": 482
3400
+ },
3401
+ {
3402
+ "epoch": 2.71,
3403
+ "grad_norm": 0.09111144297342101,
3404
+ "learning_rate": 1.6558186059936587e-06,
3405
+ "loss": 0.8479,
3406
+ "step": 483
3407
+ },
3408
+ {
3409
+ "epoch": 2.72,
3410
+ "grad_norm": 0.08620883215580287,
3411
+ "learning_rate": 1.5920961151468327e-06,
3412
+ "loss": 0.8458,
3413
+ "step": 484
3414
+ },
3415
+ {
3416
+ "epoch": 2.72,
3417
+ "grad_norm": 0.08471255274938502,
3418
+ "learning_rate": 1.5295905243535847e-06,
3419
+ "loss": 0.8537,
3420
+ "step": 485
3421
+ },
3422
+ {
3423
+ "epoch": 2.73,
3424
+ "grad_norm": 0.08449994501906344,
3425
+ "learning_rate": 1.4683045111453942e-06,
3426
+ "loss": 0.8492,
3427
+ "step": 486
3428
+ },
3429
+ {
3430
+ "epoch": 2.74,
3431
+ "grad_norm": 0.08739419896581681,
3432
+ "learning_rate": 1.408240700811091e-06,
3433
+ "loss": 0.8524,
3434
+ "step": 487
3435
+ },
3436
+ {
3437
+ "epoch": 2.74,
3438
+ "grad_norm": 0.08352102275911637,
3439
+ "learning_rate": 1.3494016662844011e-06,
3440
+ "loss": 0.8476,
3441
+ "step": 488
3442
+ },
3443
+ {
3444
+ "epoch": 2.75,
3445
+ "grad_norm": 0.08352827574514726,
3446
+ "learning_rate": 1.2917899280337354e-06,
3447
+ "loss": 0.8551,
3448
+ "step": 489
3449
+ },
3450
+ {
3451
+ "epoch": 2.75,
3452
+ "grad_norm": 0.08439329456211196,
3453
+ "learning_rate": 1.2354079539542085e-06,
3454
+ "loss": 0.8416,
3455
+ "step": 490
3456
+ },
3457
+ {
3458
+ "epoch": 2.76,
3459
+ "grad_norm": 0.08562104303556034,
3460
+ "learning_rate": 1.1802581592619444e-06,
3461
+ "loss": 0.8501,
3462
+ "step": 491
3463
+ },
3464
+ {
3465
+ "epoch": 2.76,
3466
+ "grad_norm": 0.0896431543921245,
3467
+ "learning_rate": 1.126342906390585e-06,
3468
+ "loss": 0.8461,
3469
+ "step": 492
3470
+ },
3471
+ {
3472
+ "epoch": 2.77,
3473
+ "grad_norm": 0.08543853113029239,
3474
+ "learning_rate": 1.0736645048901217e-06,
3475
+ "loss": 0.8586,
3476
+ "step": 493
3477
+ },
3478
+ {
3479
+ "epoch": 2.78,
3480
+ "grad_norm": 0.08898225127672299,
3481
+ "learning_rate": 1.022225211327954e-06,
3482
+ "loss": 0.846,
3483
+ "step": 494
3484
+ },
3485
+ {
3486
+ "epoch": 2.78,
3487
+ "grad_norm": 0.08623358584808383,
3488
+ "learning_rate": 9.720272291922072e-07,
3489
+ "loss": 0.8406,
3490
+ "step": 495
3491
+ },
3492
+ {
3493
+ "epoch": 2.79,
3494
+ "grad_norm": 0.08391526822617813,
3495
+ "learning_rate": 9.230727087973712e-07,
3496
+ "loss": 0.8451,
3497
+ "step": 496
3498
+ },
3499
+ {
3500
+ "epoch": 2.79,
3501
+ "grad_norm": 0.08567392350575599,
3502
+ "learning_rate": 8.753637471921572e-07,
3503
+ "loss": 0.8408,
3504
+ "step": 497
3505
+ },
3506
+ {
3507
+ "epoch": 2.8,
3508
+ "grad_norm": 0.08908923286438868,
3509
+ "learning_rate": 8.289023880697033e-07,
3510
+ "loss": 0.8418,
3511
+ "step": 498
3512
+ },
3513
+ {
3514
+ "epoch": 2.8,
3515
+ "grad_norm": 0.08472167209293709,
3516
+ "learning_rate": 7.83690621679991e-07,
3517
+ "loss": 0.8448,
3518
+ "step": 499
3519
+ },
3520
+ {
3521
+ "epoch": 2.81,
3522
+ "grad_norm": 0.0841111177111636,
3523
+ "learning_rate": 7.397303847446202e-07,
3524
+ "loss": 0.8482,
3525
+ "step": 500
3526
+ },
3527
+ {
3528
+ "epoch": 2.81,
3529
+ "grad_norm": 0.09207138444341484,
3530
+ "learning_rate": 6.970235603738284e-07,
3531
+ "loss": 0.8455,
3532
+ "step": 501
3533
+ },
3534
+ {
3535
+ "epoch": 2.82,
3536
+ "grad_norm": 0.08300698846765277,
3537
+ "learning_rate": 6.555719779858294e-07,
3538
+ "loss": 0.8393,
3539
+ "step": 502
3540
+ },
3541
+ {
3542
+ "epoch": 2.83,
3543
+ "grad_norm": 0.08410038120955553,
3544
+ "learning_rate": 6.153774132284584e-07,
3545
+ "loss": 0.8426,
3546
+ "step": 503
3547
+ },
3548
+ {
3549
+ "epoch": 2.83,
3550
+ "grad_norm": 0.08542102355669302,
3551
+ "learning_rate": 5.764415879030871e-07,
3552
+ "loss": 0.8538,
3553
+ "step": 504
3554
+ },
3555
+ {
3556
+ "epoch": 2.84,
3557
+ "grad_norm": 0.08431102358812953,
3558
+ "learning_rate": 5.387661698908852e-07,
3559
+ "loss": 0.8454,
3560
+ "step": 505
3561
+ },
3562
+ {
3563
+ "epoch": 2.84,
3564
+ "grad_norm": 0.08201034861685735,
3565
+ "learning_rate": 5.023527730813649e-07,
3566
+ "loss": 0.8492,
3567
+ "step": 506
3568
+ },
3569
+ {
3570
+ "epoch": 2.85,
3571
+ "grad_norm": 0.08426514879217514,
3572
+ "learning_rate": 4.672029573032521e-07,
3573
+ "loss": 0.8539,
3574
+ "step": 507
3575
+ },
3576
+ {
3577
+ "epoch": 2.85,
3578
+ "grad_norm": 0.08363012777621809,
3579
+ "learning_rate": 4.333182282576675e-07,
3580
+ "loss": 0.8507,
3581
+ "step": 508
3582
+ },
3583
+ {
3584
+ "epoch": 2.86,
3585
+ "grad_norm": 0.08403404035574012,
3586
+ "learning_rate": 4.0070003745363073e-07,
3587
+ "loss": 0.8417,
3588
+ "step": 509
3589
+ },
3590
+ {
3591
+ "epoch": 2.87,
3592
+ "grad_norm": 0.08556297885057308,
3593
+ "learning_rate": 3.6934978214587026e-07,
3594
+ "loss": 0.8542,
3595
+ "step": 510
3596
+ },
3597
+ {
3598
+ "epoch": 2.87,
3599
+ "grad_norm": 0.0830271098839072,
3600
+ "learning_rate": 3.392688052749782e-07,
3601
+ "loss": 0.8384,
3602
+ "step": 511
3603
+ },
3604
+ {
3605
+ "epoch": 2.88,
3606
+ "grad_norm": 0.08332845498822317,
3607
+ "learning_rate": 3.1045839540989273e-07,
3608
+ "loss": 0.8497,
3609
+ "step": 512
3610
+ },
3611
+ {
3612
+ "epoch": 2.88,
3613
+ "grad_norm": 0.08163756862158367,
3614
+ "learning_rate": 2.829197866926825e-07,
3615
+ "loss": 0.8436,
3616
+ "step": 513
3617
+ },
3618
+ {
3619
+ "epoch": 2.89,
3620
+ "grad_norm": 0.08239779756966806,
3621
+ "learning_rate": 2.5665415878568855e-07,
3622
+ "loss": 0.8419,
3623
+ "step": 514
3624
+ },
3625
+ {
3626
+ "epoch": 2.89,
3627
+ "grad_norm": 0.0846233275518859,
3628
+ "learning_rate": 2.3166263682098844e-07,
3629
+ "loss": 0.8569,
3630
+ "step": 515
3631
+ },
3632
+ {
3633
+ "epoch": 2.9,
3634
+ "grad_norm": 0.08295765011201144,
3635
+ "learning_rate": 2.0794629135221123e-07,
3636
+ "loss": 0.852,
3637
+ "step": 516
3638
+ },
3639
+ {
3640
+ "epoch": 2.9,
3641
+ "grad_norm": 0.08898478620134848,
3642
+ "learning_rate": 1.8550613830865758e-07,
3643
+ "loss": 0.8524,
3644
+ "step": 517
3645
+ },
3646
+ {
3647
+ "epoch": 2.91,
3648
+ "grad_norm": 0.08253980942709409,
3649
+ "learning_rate": 1.6434313895180132e-07,
3650
+ "loss": 0.8439,
3651
+ "step": 518
3652
+ },
3653
+ {
3654
+ "epoch": 2.92,
3655
+ "grad_norm": 0.08658080433487944,
3656
+ "learning_rate": 1.4445819983409546e-07,
3657
+ "loss": 0.8478,
3658
+ "step": 519
3659
+ },
3660
+ {
3661
+ "epoch": 2.92,
3662
+ "grad_norm": 0.085494836912567,
3663
+ "learning_rate": 1.2585217276015026e-07,
3664
+ "loss": 0.854,
3665
+ "step": 520
3666
+ },
3667
+ {
3668
+ "epoch": 2.93,
3669
+ "grad_norm": 0.08520015820412867,
3670
+ "learning_rate": 1.085258547502388e-07,
3671
+ "loss": 0.8527,
3672
+ "step": 521
3673
+ },
3674
+ {
3675
+ "epoch": 2.93,
3676
+ "grad_norm": 0.0835695495145572,
3677
+ "learning_rate": 9.247998800616108e-08,
3678
+ "loss": 0.8514,
3679
+ "step": 522
3680
+ },
3681
+ {
3682
+ "epoch": 2.94,
3683
+ "grad_norm": 0.08360415935414345,
3684
+ "learning_rate": 7.771525987944284e-08,
3685
+ "loss": 0.8497,
3686
+ "step": 523
3687
+ },
3688
+ {
3689
+ "epoch": 2.94,
3690
+ "grad_norm": 0.0828425664078141,
3691
+ "learning_rate": 6.423230284189563e-08,
3692
+ "loss": 0.8495,
3693
+ "step": 524
3694
+ },
3695
+ {
3696
+ "epoch": 2.95,
3697
+ "grad_norm": 0.08252226826743013,
3698
+ "learning_rate": 5.203169445852529e-08,
3699
+ "loss": 0.8412,
3700
+ "step": 525
3701
+ },
3702
+ {
3703
+ "epoch": 2.96,
3704
+ "grad_norm": 0.08292638372084292,
3705
+ "learning_rate": 4.1113957362785e-08,
3706
+ "loss": 0.8575,
3707
+ "step": 526
3708
+ },
3709
+ {
3710
+ "epoch": 2.96,
3711
+ "grad_norm": 0.08188311347471468,
3712
+ "learning_rate": 3.147955923419654e-08,
3713
+ "loss": 0.841,
3714
+ "step": 527
3715
+ },
3716
+ {
3717
+ "epoch": 2.97,
3718
+ "grad_norm": 0.0836376120816663,
3719
+ "learning_rate": 2.3128912778312972e-08,
3720
+ "loss": 0.8484,
3721
+ "step": 528
3722
+ },
3723
+ {
3724
+ "epoch": 2.97,
3725
+ "grad_norm": 0.08186296368729573,
3726
+ "learning_rate": 1.6062375709029465e-08,
3727
+ "loss": 0.8544,
3728
+ "step": 529
3729
+ },
3730
+ {
3731
+ "epoch": 2.98,
3732
+ "grad_norm": 0.08278337963328566,
3733
+ "learning_rate": 1.0280250733282203e-08,
3734
+ "loss": 0.8478,
3735
+ "step": 530
3736
+ },
3737
+ {
3738
+ "epoch": 2.98,
3739
+ "grad_norm": 0.08279493662178963,
3740
+ "learning_rate": 5.7827855380554465e-09,
3741
+ "loss": 0.8465,
3742
+ "step": 531
3743
+ },
3744
+ {
3745
+ "epoch": 2.99,
3746
+ "grad_norm": 0.09221939825908573,
3747
+ "learning_rate": 2.570172779789992e-09,
3748
+ "loss": 0.8473,
3749
+ "step": 532
3750
+ },
3751
+ {
3752
+ "epoch": 2.99,
3753
+ "grad_norm": 0.09063451115383812,
3754
+ "learning_rate": 6.425500761231274e-10,
3755
+ "loss": 0.8581,
3756
+ "step": 533
3757
+ },
3758
+ {
3759
+ "epoch": 3.0,
3760
+ "grad_norm": 0.08387090563875321,
3761
+ "learning_rate": 0.0,
3762
+ "loss": 0.8385,
3763
+ "step": 534
3764
+ },
3765
+ {
3766
+ "epoch": 3.0,
3767
+ "eval_loss": 0.8818458318710327,
3768
+ "eval_runtime": 352.6305,
3769
+ "eval_samples_per_second": 37.135,
3770
+ "eval_steps_per_second": 0.057,
3771
+ "step": 534
3772
+ },
3773
+ {
3774
+ "epoch": 3.0,
3775
+ "step": 534,
3776
+ "total_flos": 4217873142644736.0,
3777
+ "train_loss": 0.9222131907270196,
3778
+ "train_runtime": 33956.1835,
3779
+ "train_samples_per_second": 10.545,
3780
+ "train_steps_per_second": 0.016
3781
+ }
3782
+ ],
3783
+ "logging_steps": 1,
3784
+ "max_steps": 534,
3785
+ "num_input_tokens_seen": 0,
3786
+ "num_train_epochs": 3,
3787
+ "save_steps": 500,
3788
+ "total_flos": 4217873142644736.0,
3789
+ "train_batch_size": 42,
3790
+ "trial_name": null,
3791
+ "trial_params": null
3792
+ }