JaepaX committed on
Commit 4cba801
1 Parent(s): f79a80c

whisper-tiny-french-best

config.json CHANGED
@@ -1,5 +1,5 @@
  {
- "_name_or_path": "openai/whisper-tiny",
+ "_name_or_path": "/home/david/whisper_transferlearning/whisper-tiny-fr-micro-train/checkpoint-4011",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "apply_spec_augment": false,
@@ -34,7 +34,7 @@
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_prob": 0.05,
- "max_length": 448,
+ "max_length": 225,
  "max_source_positions": 1500,
  "max_target_positions": 448,
  "median_filter_width": 7,
generation_config.json ADDED
@@ -0,0 +1,248 @@
+ {
+   "alignment_heads": [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]],
+   "begin_suppress_tokens": [220, 50257],
+   "bos_token_id": 50257,
+   "decoder_start_token_id": 50258,
+   "eos_token_id": 50257,
+   "forced_decoder_ids": [[1, null], [2, 50359]],
+   "is_multilingual": true,
+   "lang_to_id": {
+     "<|af|>": 50327, "<|am|>": 50334, "<|ar|>": 50272, "<|as|>": 50350, "<|az|>": 50304,
+     "<|ba|>": 50355, "<|be|>": 50330, "<|bg|>": 50292, "<|bn|>": 50302, "<|bo|>": 50347,
+     "<|br|>": 50309, "<|bs|>": 50315, "<|ca|>": 50270, "<|cs|>": 50283, "<|cy|>": 50297,
+     "<|da|>": 50285, "<|de|>": 50261, "<|el|>": 50281, "<|en|>": 50259, "<|es|>": 50262,
+     "<|et|>": 50307, "<|eu|>": 50310, "<|fa|>": 50300, "<|fi|>": 50277, "<|fo|>": 50338,
+     "<|fr|>": 50265, "<|gl|>": 50319, "<|gu|>": 50333, "<|haw|>": 50352, "<|ha|>": 50354,
+     "<|he|>": 50279, "<|hi|>": 50276, "<|hr|>": 50291, "<|ht|>": 50339, "<|hu|>": 50286,
+     "<|hy|>": 50312, "<|id|>": 50275, "<|is|>": 50311, "<|it|>": 50274, "<|ja|>": 50266,
+     "<|jw|>": 50356, "<|ka|>": 50329, "<|kk|>": 50316, "<|km|>": 50323, "<|kn|>": 50306,
+     "<|ko|>": 50264, "<|la|>": 50294, "<|lb|>": 50345, "<|ln|>": 50353, "<|lo|>": 50336,
+     "<|lt|>": 50293, "<|lv|>": 50301, "<|mg|>": 50349, "<|mi|>": 50295, "<|mk|>": 50308,
+     "<|ml|>": 50296, "<|mn|>": 50314, "<|mr|>": 50320, "<|ms|>": 50282, "<|mt|>": 50343,
+     "<|my|>": 50346, "<|ne|>": 50313, "<|nl|>": 50271, "<|nn|>": 50342, "<|no|>": 50288,
+     "<|oc|>": 50328, "<|pa|>": 50321, "<|pl|>": 50269, "<|ps|>": 50340, "<|pt|>": 50267,
+     "<|ro|>": 50284, "<|ru|>": 50263, "<|sa|>": 50344, "<|sd|>": 50332, "<|si|>": 50322,
+     "<|sk|>": 50298, "<|sl|>": 50305, "<|sn|>": 50324, "<|so|>": 50326, "<|sq|>": 50317,
+     "<|sr|>": 50303, "<|su|>": 50357, "<|sv|>": 50273, "<|sw|>": 50318, "<|ta|>": 50287,
+     "<|te|>": 50299, "<|tg|>": 50331, "<|th|>": 50289, "<|tk|>": 50341, "<|tl|>": 50348,
+     "<|tr|>": 50268, "<|tt|>": 50351, "<|uk|>": 50280, "<|ur|>": 50290, "<|uz|>": 50337,
+     "<|vi|>": 50278, "<|yi|>": 50335, "<|yo|>": 50325, "<|zh|>": 50260
+   },
+   "max_initial_timestamp_index": 50,
+   "max_length": 448,
+   "no_timestamps_token_id": 50363,
+   "pad_token_id": 50257,
+   "prev_sot_token_id": 50361,
+   "return_timestamps": false,
+   "suppress_tokens": [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91,
+     92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246,
+     3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938,
+     12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075,
+     21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425,
+     49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362],
+   "task_to_id": {"transcribe": 50359, "translate": 50358},
+   "transformers_version": "4.39.0.dev0"
+ }
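
A minimal sketch, assuming transformers >= 4.39 (the version recorded in the file above): how the lang_to_id / task_to_id entries turn into the decoder prompt used for French transcription. The model_dir path is a placeholder, not a real repo id.

from transformers import WhisperProcessor, WhisperForConditionalGeneration

model_dir = "path/to/this/model"  # placeholder
processor = WhisperProcessor.from_pretrained(model_dir)
model = WhisperForConditionalGeneration.from_pretrained(model_dir)

# "<|fr|>" -> 50265 and "transcribe" -> 50359 come straight from lang_to_id / task_to_id above.
prompt_ids = processor.get_decoder_prompt_ids(language="french", task="transcribe")
print(prompt_ids)  # typically [(1, 50265), (2, 50359), (3, 50363)] with the default no_timestamps=True

# At inference the same tokens can be requested directly from generate:
# predicted_ids = model.generate(input_features, language="fr", task="transcribe")
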
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d85a1141a4d7adcf43748aa52a77c71482fedd73209e63b8be5e614fd7fab9b8
+ oid sha256:e8a01f1d65b730056fb17ec3c47d6c8a516514af823e90762c0ea776681317ca
  size 151061672
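
The binary entries in this commit are Git LFS pointer files, so only the sha256/size metadata lives in the repo and the actual files are fetched separately (for example with `git lfs pull`). A minimal sketch, not part of the commit, that checks a downloaded model.safetensors against the new pointer above:

import hashlib
import os

path = "model.safetensors"  # local copy after `git lfs pull` or a hub download
expected_oid = "e8a01f1d65b730056fb17ec3c47d6c8a516514af823e90762c0ea776681317ca"
expected_size = 151061672

sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha256.update(chunk)

assert os.path.getsize(path) == expected_size, "size does not match the LFS pointer"
assert sha256.hexdigest() == expected_oid, "sha256 does not match the LFS pointer"
print("model.safetensors matches the LFS pointer")
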
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8443b77c4337bbc7185cb4261cfa2f54a84c28a96bf7acb29ee096032eeffd21
+ size 297616186
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc8c23bb98182f0bef92f94a61119e975ef413899af692a02bd4c55dd77c668a
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e18199a3aebbe1a403fa2f78f07d52ae9c9a98107a8fe4f7708b609b11090705
+ size 1064
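
optimizer.pt, scheduler.pt, rng_state.pth and the trainer_state.json below are the artifacts the transformers Trainer reads back when a run is resumed rather than restarted. A minimal sketch under that assumption; this is not the repo's actual training script:

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments  # assuming a seq2seq Trainer setup for Whisper

args = Seq2SeqTrainingArguments(output_dir="./whisper-tiny-fr-micro-train")  # matches the checkpoint dir name in this commit

# trainer = Seq2SeqTrainer(model=model, args=args, train_dataset=..., eval_dataset=..., data_collator=...)
# trainer.train(resume_from_checkpoint=True)  # reloads optimizer, LR scheduler and RNG state from the latest checkpoint-*
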
trainer_state.json ADDED
@@ -0,0 +1,2335 @@
+ {
+   "best_metric": 37.65067359962184,
+   "best_model_checkpoint": "./whisper-tiny-fr-micro-train/checkpoint-8222",
+   "epoch": 0.08564583333333334,
+   "eval_steps": 4111,
+   "global_step": 8222,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "grad_norm": 19.87736701965332,
14
+ "learning_rate": 5e-09,
15
+ "loss": 0.9486,
16
+ "step": 25
17
+ },
18
+ {
19
+ "epoch": 0.0,
20
+ "grad_norm": 18.398113250732422,
21
+ "learning_rate": 1e-08,
22
+ "loss": 0.9083,
23
+ "step": 50
24
+ },
25
+ {
26
+ "epoch": 0.0,
27
+ "grad_norm": 16.93355941772461,
28
+ "learning_rate": 1.5e-08,
29
+ "loss": 0.8204,
30
+ "step": 75
31
+ },
32
+ {
33
+ "epoch": 0.0,
34
+ "grad_norm": 11.64875602722168,
35
+ "learning_rate": 2e-08,
36
+ "loss": 0.7006,
37
+ "step": 100
38
+ },
39
+ {
40
+ "epoch": 0.0,
41
+ "grad_norm": 8.734498977661133,
42
+ "learning_rate": 2.5e-08,
43
+ "loss": 0.648,
44
+ "step": 125
45
+ },
46
+ {
47
+ "epoch": 0.0,
48
+ "grad_norm": 7.067263603210449,
49
+ "learning_rate": 3e-08,
50
+ "loss": 0.537,
51
+ "step": 150
52
+ },
53
+ {
54
+ "epoch": 0.0,
55
+ "grad_norm": 7.97986364364624,
56
+ "learning_rate": 3.4999999999999996e-08,
57
+ "loss": 0.5752,
58
+ "step": 175
59
+ },
60
+ {
61
+ "epoch": 0.0,
62
+ "grad_norm": 7.728494167327881,
63
+ "learning_rate": 4e-08,
64
+ "loss": 0.5863,
65
+ "step": 200
66
+ },
67
+ {
68
+ "epoch": 0.0,
69
+ "grad_norm": 9.38297176361084,
70
+ "learning_rate": 4.5e-08,
71
+ "loss": 0.5172,
72
+ "step": 225
73
+ },
74
+ {
75
+ "epoch": 0.0,
76
+ "grad_norm": 7.568984508514404,
77
+ "learning_rate": 5e-08,
78
+ "loss": 0.4682,
79
+ "step": 250
80
+ },
81
+ {
82
+ "epoch": 0.0,
83
+ "grad_norm": 8.042702674865723,
84
+ "learning_rate": 5.5e-08,
85
+ "loss": 0.4214,
86
+ "step": 275
87
+ },
88
+ {
89
+ "epoch": 0.0,
90
+ "grad_norm": 7.661986827850342,
91
+ "learning_rate": 6e-08,
92
+ "loss": 0.4431,
93
+ "step": 300
94
+ },
95
+ {
96
+ "epoch": 0.0,
97
+ "grad_norm": 9.154074668884277,
98
+ "learning_rate": 6.5e-08,
99
+ "loss": 0.4056,
100
+ "step": 325
101
+ },
102
+ {
103
+ "epoch": 0.0,
104
+ "grad_norm": 8.346107482910156,
105
+ "learning_rate": 6.999999999999999e-08,
106
+ "loss": 0.4079,
107
+ "step": 350
108
+ },
109
+ {
110
+ "epoch": 0.0,
111
+ "grad_norm": 6.246629238128662,
112
+ "learning_rate": 7.5e-08,
113
+ "loss": 0.3897,
114
+ "step": 375
115
+ },
116
+ {
117
+ "epoch": 0.0,
118
+ "grad_norm": 7.129103183746338,
119
+ "learning_rate": 8e-08,
120
+ "loss": 0.3536,
121
+ "step": 400
122
+ },
123
+ {
124
+ "epoch": 0.0,
125
+ "grad_norm": 6.834921836853027,
126
+ "learning_rate": 8.5e-08,
127
+ "loss": 0.3401,
128
+ "step": 425
129
+ },
130
+ {
131
+ "epoch": 0.0,
132
+ "grad_norm": 8.863313674926758,
133
+ "learning_rate": 9e-08,
134
+ "loss": 0.3627,
135
+ "step": 450
136
+ },
137
+ {
138
+ "epoch": 0.0,
139
+ "grad_norm": 7.284473896026611,
140
+ "learning_rate": 9.499999999999999e-08,
141
+ "loss": 0.356,
142
+ "step": 475
143
+ },
144
+ {
145
+ "epoch": 0.01,
146
+ "grad_norm": 5.816940784454346,
147
+ "learning_rate": 1e-07,
148
+ "loss": 0.3539,
149
+ "step": 500
150
+ },
151
+ {
152
+ "epoch": 0.01,
153
+ "grad_norm": 8.022565841674805,
154
+ "learning_rate": 9.997382198952879e-08,
155
+ "loss": 0.3577,
156
+ "step": 525
157
+ },
158
+ {
159
+ "epoch": 0.01,
160
+ "grad_norm": 7.51448917388916,
161
+ "learning_rate": 9.994764397905758e-08,
162
+ "loss": 0.3513,
163
+ "step": 550
164
+ },
165
+ {
166
+ "epoch": 0.01,
167
+ "grad_norm": 7.016752243041992,
168
+ "learning_rate": 9.992146596858639e-08,
169
+ "loss": 0.3687,
170
+ "step": 575
171
+ },
172
+ {
173
+ "epoch": 0.01,
174
+ "grad_norm": 6.761058330535889,
175
+ "learning_rate": 9.989528795811518e-08,
176
+ "loss": 0.3495,
177
+ "step": 600
178
+ },
179
+ {
180
+ "epoch": 0.01,
181
+ "grad_norm": 6.693453311920166,
182
+ "learning_rate": 9.986910994764397e-08,
183
+ "loss": 0.3325,
184
+ "step": 625
185
+ },
186
+ {
187
+ "epoch": 0.01,
188
+ "grad_norm": 6.049990653991699,
189
+ "learning_rate": 9.984293193717277e-08,
190
+ "loss": 0.3609,
191
+ "step": 650
192
+ },
193
+ {
194
+ "epoch": 0.01,
195
+ "grad_norm": 6.6787109375,
196
+ "learning_rate": 9.981675392670157e-08,
197
+ "loss": 0.3317,
198
+ "step": 675
199
+ },
200
+ {
201
+ "epoch": 0.01,
202
+ "grad_norm": 6.8440937995910645,
203
+ "learning_rate": 9.979057591623035e-08,
204
+ "loss": 0.332,
205
+ "step": 700
206
+ },
207
+ {
208
+ "epoch": 0.01,
209
+ "grad_norm": 6.892059326171875,
210
+ "learning_rate": 9.976439790575916e-08,
211
+ "loss": 0.3379,
212
+ "step": 725
213
+ },
214
+ {
215
+ "epoch": 0.01,
216
+ "grad_norm": 8.142931938171387,
217
+ "learning_rate": 9.973821989528795e-08,
218
+ "loss": 0.3308,
219
+ "step": 750
220
+ },
221
+ {
222
+ "epoch": 0.01,
223
+ "grad_norm": 7.152769565582275,
224
+ "learning_rate": 9.971204188481675e-08,
225
+ "loss": 0.3352,
226
+ "step": 775
227
+ },
228
+ {
229
+ "epoch": 0.01,
230
+ "grad_norm": 6.680529594421387,
231
+ "learning_rate": 9.968586387434554e-08,
232
+ "loss": 0.3343,
233
+ "step": 800
234
+ },
235
+ {
236
+ "epoch": 0.01,
237
+ "grad_norm": 6.2912445068359375,
238
+ "learning_rate": 9.965968586387435e-08,
239
+ "loss": 0.3233,
240
+ "step": 825
241
+ },
242
+ {
243
+ "epoch": 0.01,
244
+ "grad_norm": 6.337522983551025,
245
+ "learning_rate": 9.963350785340313e-08,
246
+ "loss": 0.3147,
247
+ "step": 850
248
+ },
249
+ {
250
+ "epoch": 0.01,
251
+ "grad_norm": 7.505101203918457,
252
+ "learning_rate": 9.960732984293193e-08,
253
+ "loss": 0.3384,
254
+ "step": 875
255
+ },
256
+ {
257
+ "epoch": 0.01,
258
+ "grad_norm": 6.080435276031494,
259
+ "learning_rate": 9.958115183246073e-08,
260
+ "loss": 0.3397,
261
+ "step": 900
262
+ },
263
+ {
264
+ "epoch": 0.01,
265
+ "grad_norm": 7.185611248016357,
266
+ "learning_rate": 9.955497382198953e-08,
267
+ "loss": 0.3448,
268
+ "step": 925
269
+ },
270
+ {
271
+ "epoch": 0.01,
272
+ "grad_norm": 6.759621620178223,
273
+ "learning_rate": 9.952879581151831e-08,
274
+ "loss": 0.3193,
275
+ "step": 950
276
+ },
277
+ {
278
+ "epoch": 0.01,
279
+ "grad_norm": 5.393013000488281,
280
+ "learning_rate": 9.950261780104712e-08,
281
+ "loss": 0.3134,
282
+ "step": 975
283
+ },
284
+ {
285
+ "epoch": 0.01,
286
+ "grad_norm": 6.275601387023926,
287
+ "learning_rate": 9.947643979057591e-08,
288
+ "loss": 0.3227,
289
+ "step": 1000
290
+ },
291
+ {
292
+ "epoch": 0.01,
293
+ "grad_norm": 8.202522277832031,
294
+ "learning_rate": 9.94502617801047e-08,
295
+ "loss": 0.3366,
296
+ "step": 1025
297
+ },
298
+ {
299
+ "epoch": 0.01,
300
+ "grad_norm": 6.580430507659912,
301
+ "learning_rate": 9.94240837696335e-08,
302
+ "loss": 0.4026,
303
+ "step": 1050
304
+ },
305
+ {
306
+ "epoch": 0.01,
307
+ "grad_norm": 8.456107139587402,
308
+ "learning_rate": 9.93979057591623e-08,
309
+ "loss": 0.3868,
310
+ "step": 1075
311
+ },
312
+ {
313
+ "epoch": 0.01,
314
+ "grad_norm": 8.176070213317871,
315
+ "learning_rate": 9.937172774869109e-08,
316
+ "loss": 0.4297,
317
+ "step": 1100
318
+ },
319
+ {
320
+ "epoch": 0.01,
321
+ "grad_norm": 9.07604694366455,
322
+ "learning_rate": 9.934554973821989e-08,
323
+ "loss": 0.4861,
324
+ "step": 1125
325
+ },
326
+ {
327
+ "epoch": 0.01,
328
+ "grad_norm": 8.36545181274414,
329
+ "learning_rate": 9.931937172774869e-08,
330
+ "loss": 0.4939,
331
+ "step": 1150
332
+ },
333
+ {
334
+ "epoch": 0.01,
335
+ "grad_norm": 9.944046974182129,
336
+ "learning_rate": 9.929319371727748e-08,
337
+ "loss": 0.5198,
338
+ "step": 1175
339
+ },
340
+ {
341
+ "epoch": 0.01,
342
+ "grad_norm": 12.03496265411377,
343
+ "learning_rate": 9.926701570680629e-08,
344
+ "loss": 0.5353,
345
+ "step": 1200
346
+ },
347
+ {
348
+ "epoch": 0.01,
349
+ "grad_norm": 14.10308837890625,
350
+ "learning_rate": 9.924083769633508e-08,
351
+ "loss": 0.5005,
352
+ "step": 1225
353
+ },
354
+ {
355
+ "epoch": 0.01,
356
+ "grad_norm": 12.214973449707031,
357
+ "learning_rate": 9.921465968586387e-08,
358
+ "loss": 0.5879,
359
+ "step": 1250
360
+ },
361
+ {
362
+ "epoch": 0.01,
363
+ "grad_norm": 11.323634147644043,
364
+ "learning_rate": 9.918848167539266e-08,
365
+ "loss": 0.5031,
366
+ "step": 1275
367
+ },
368
+ {
369
+ "epoch": 0.01,
370
+ "grad_norm": 10.742391586303711,
371
+ "learning_rate": 9.916230366492147e-08,
372
+ "loss": 0.5495,
373
+ "step": 1300
374
+ },
375
+ {
376
+ "epoch": 0.01,
377
+ "grad_norm": 14.457928657531738,
378
+ "learning_rate": 9.913612565445025e-08,
379
+ "loss": 0.5263,
380
+ "step": 1325
381
+ },
382
+ {
383
+ "epoch": 0.01,
384
+ "grad_norm": 11.978686332702637,
385
+ "learning_rate": 9.910994764397906e-08,
386
+ "loss": 0.5477,
387
+ "step": 1350
388
+ },
389
+ {
390
+ "epoch": 0.01,
391
+ "grad_norm": 11.699676513671875,
392
+ "learning_rate": 9.908376963350785e-08,
393
+ "loss": 0.5293,
394
+ "step": 1375
395
+ },
396
+ {
397
+ "epoch": 0.01,
398
+ "grad_norm": 11.737068176269531,
399
+ "learning_rate": 9.905759162303664e-08,
400
+ "loss": 0.5622,
401
+ "step": 1400
402
+ },
403
+ {
404
+ "epoch": 0.01,
405
+ "grad_norm": 10.408597946166992,
406
+ "learning_rate": 9.903141361256544e-08,
407
+ "loss": 0.5639,
408
+ "step": 1425
409
+ },
410
+ {
411
+ "epoch": 0.02,
412
+ "grad_norm": 11.709553718566895,
413
+ "learning_rate": 9.900523560209424e-08,
414
+ "loss": 0.5421,
415
+ "step": 1450
416
+ },
417
+ {
418
+ "epoch": 0.02,
419
+ "grad_norm": 10.107832908630371,
420
+ "learning_rate": 9.897905759162302e-08,
421
+ "loss": 0.5291,
422
+ "step": 1475
423
+ },
424
+ {
425
+ "epoch": 0.02,
426
+ "grad_norm": 11.955233573913574,
427
+ "learning_rate": 9.895287958115183e-08,
428
+ "loss": 0.5508,
429
+ "step": 1500
430
+ },
431
+ {
432
+ "epoch": 0.02,
433
+ "grad_norm": 12.00100040435791,
434
+ "learning_rate": 9.892670157068062e-08,
435
+ "loss": 0.5271,
436
+ "step": 1525
437
+ },
438
+ {
439
+ "epoch": 0.02,
440
+ "grad_norm": 11.11552619934082,
441
+ "learning_rate": 9.890052356020942e-08,
442
+ "loss": 0.5338,
443
+ "step": 1550
444
+ },
445
+ {
446
+ "epoch": 0.02,
447
+ "grad_norm": 8.751993179321289,
448
+ "learning_rate": 9.887434554973821e-08,
449
+ "loss": 0.5178,
450
+ "step": 1575
451
+ },
452
+ {
453
+ "epoch": 0.02,
454
+ "grad_norm": 10.523124694824219,
455
+ "learning_rate": 9.884816753926702e-08,
456
+ "loss": 0.5574,
457
+ "step": 1600
458
+ },
459
+ {
460
+ "epoch": 0.02,
461
+ "grad_norm": 14.987282752990723,
462
+ "learning_rate": 9.88219895287958e-08,
463
+ "loss": 0.5406,
464
+ "step": 1625
465
+ },
466
+ {
467
+ "epoch": 0.02,
468
+ "grad_norm": 12.370256423950195,
469
+ "learning_rate": 9.87958115183246e-08,
470
+ "loss": 0.4805,
471
+ "step": 1650
472
+ },
473
+ {
474
+ "epoch": 0.02,
475
+ "grad_norm": 9.747725486755371,
476
+ "learning_rate": 9.87696335078534e-08,
477
+ "loss": 0.5259,
478
+ "step": 1675
479
+ },
480
+ {
481
+ "epoch": 0.02,
482
+ "grad_norm": 12.413991928100586,
483
+ "learning_rate": 9.87434554973822e-08,
484
+ "loss": 0.5225,
485
+ "step": 1700
486
+ },
487
+ {
488
+ "epoch": 0.02,
489
+ "grad_norm": 11.440505981445312,
490
+ "learning_rate": 9.871727748691098e-08,
491
+ "loss": 0.5511,
492
+ "step": 1725
493
+ },
494
+ {
495
+ "epoch": 0.02,
496
+ "grad_norm": 11.07944107055664,
497
+ "learning_rate": 9.869109947643979e-08,
498
+ "loss": 0.4913,
499
+ "step": 1750
500
+ },
501
+ {
502
+ "epoch": 0.02,
503
+ "grad_norm": 12.481764793395996,
504
+ "learning_rate": 9.866492146596858e-08,
505
+ "loss": 0.5618,
506
+ "step": 1775
507
+ },
508
+ {
509
+ "epoch": 0.02,
510
+ "grad_norm": 10.33045768737793,
511
+ "learning_rate": 9.863874345549738e-08,
512
+ "loss": 0.5099,
513
+ "step": 1800
514
+ },
515
+ {
516
+ "epoch": 0.02,
517
+ "grad_norm": 11.342964172363281,
518
+ "learning_rate": 9.861256544502617e-08,
519
+ "loss": 0.5454,
520
+ "step": 1825
521
+ },
522
+ {
523
+ "epoch": 0.02,
524
+ "grad_norm": 10.811851501464844,
525
+ "learning_rate": 9.858638743455498e-08,
526
+ "loss": 0.5118,
527
+ "step": 1850
528
+ },
529
+ {
530
+ "epoch": 0.02,
531
+ "grad_norm": 12.243831634521484,
532
+ "learning_rate": 9.856020942408377e-08,
533
+ "loss": 0.5191,
534
+ "step": 1875
535
+ },
536
+ {
537
+ "epoch": 0.02,
538
+ "grad_norm": 10.51577377319336,
539
+ "learning_rate": 9.853403141361256e-08,
540
+ "loss": 0.4916,
541
+ "step": 1900
542
+ },
543
+ {
544
+ "epoch": 0.02,
545
+ "grad_norm": 9.325318336486816,
546
+ "learning_rate": 9.850785340314135e-08,
547
+ "loss": 0.5178,
548
+ "step": 1925
549
+ },
550
+ {
551
+ "epoch": 0.02,
552
+ "grad_norm": 9.348186492919922,
553
+ "learning_rate": 9.848167539267015e-08,
554
+ "loss": 0.5066,
555
+ "step": 1950
556
+ },
557
+ {
558
+ "epoch": 0.02,
559
+ "grad_norm": 11.930258750915527,
560
+ "learning_rate": 9.845549738219895e-08,
561
+ "loss": 0.4747,
562
+ "step": 1975
563
+ },
564
+ {
565
+ "epoch": 0.02,
566
+ "grad_norm": 11.170626640319824,
567
+ "learning_rate": 9.842931937172775e-08,
568
+ "loss": 0.5285,
569
+ "step": 2000
570
+ },
571
+ {
572
+ "epoch": 0.02,
573
+ "grad_norm": 10.741945266723633,
574
+ "learning_rate": 9.840314136125654e-08,
575
+ "loss": 0.5686,
576
+ "step": 2025
577
+ },
578
+ {
579
+ "epoch": 0.02,
580
+ "grad_norm": 10.902135848999023,
581
+ "learning_rate": 9.837696335078533e-08,
582
+ "loss": 0.5909,
583
+ "step": 2050
584
+ },
585
+ {
586
+ "epoch": 0.02,
587
+ "grad_norm": 8.929906845092773,
588
+ "learning_rate": 9.835078534031414e-08,
589
+ "loss": 0.578,
590
+ "step": 2075
591
+ },
592
+ {
593
+ "epoch": 0.02,
594
+ "grad_norm": 11.585110664367676,
595
+ "learning_rate": 9.832460732984292e-08,
596
+ "loss": 0.8891,
597
+ "step": 2100
598
+ },
599
+ {
600
+ "epoch": 0.02,
601
+ "grad_norm": 11.564123153686523,
602
+ "learning_rate": 9.829842931937173e-08,
603
+ "loss": 1.0214,
604
+ "step": 2125
605
+ },
606
+ {
607
+ "epoch": 0.02,
608
+ "grad_norm": 10.137656211853027,
609
+ "learning_rate": 9.827225130890052e-08,
610
+ "loss": 0.7967,
611
+ "step": 2150
612
+ },
613
+ {
614
+ "epoch": 0.02,
615
+ "grad_norm": 9.988815307617188,
616
+ "learning_rate": 9.824607329842931e-08,
617
+ "loss": 0.7586,
618
+ "step": 2175
619
+ },
620
+ {
621
+ "epoch": 0.02,
622
+ "grad_norm": 8.484098434448242,
623
+ "learning_rate": 9.82198952879581e-08,
624
+ "loss": 0.6455,
625
+ "step": 2200
626
+ },
627
+ {
628
+ "epoch": 0.02,
629
+ "grad_norm": 8.596495628356934,
630
+ "learning_rate": 9.819371727748691e-08,
631
+ "loss": 0.6966,
632
+ "step": 2225
633
+ },
634
+ {
635
+ "epoch": 0.02,
636
+ "grad_norm": 8.861817359924316,
637
+ "learning_rate": 9.816753926701569e-08,
638
+ "loss": 0.8732,
639
+ "step": 2250
640
+ },
641
+ {
642
+ "epoch": 0.02,
643
+ "grad_norm": 9.609010696411133,
644
+ "learning_rate": 9.81413612565445e-08,
645
+ "loss": 0.803,
646
+ "step": 2275
647
+ },
648
+ {
649
+ "epoch": 0.02,
650
+ "grad_norm": 8.113046646118164,
651
+ "learning_rate": 9.811518324607329e-08,
652
+ "loss": 0.8018,
653
+ "step": 2300
654
+ },
655
+ {
656
+ "epoch": 0.02,
657
+ "grad_norm": 7.831557750701904,
658
+ "learning_rate": 9.808900523560209e-08,
659
+ "loss": 0.7681,
660
+ "step": 2325
661
+ },
662
+ {
663
+ "epoch": 0.02,
664
+ "grad_norm": 9.451202392578125,
665
+ "learning_rate": 9.806282722513088e-08,
666
+ "loss": 0.6863,
667
+ "step": 2350
668
+ },
669
+ {
670
+ "epoch": 0.02,
671
+ "grad_norm": 6.153475284576416,
672
+ "learning_rate": 9.803664921465969e-08,
673
+ "loss": 0.5344,
674
+ "step": 2375
675
+ },
676
+ {
677
+ "epoch": 0.03,
678
+ "grad_norm": 6.556187152862549,
679
+ "learning_rate": 9.801047120418847e-08,
680
+ "loss": 0.5072,
681
+ "step": 2400
682
+ },
683
+ {
684
+ "epoch": 0.03,
685
+ "grad_norm": 6.695789337158203,
686
+ "learning_rate": 9.798429319371727e-08,
687
+ "loss": 0.4882,
688
+ "step": 2425
689
+ },
690
+ {
691
+ "epoch": 0.03,
692
+ "grad_norm": 5.124952793121338,
693
+ "learning_rate": 9.795811518324607e-08,
694
+ "loss": 0.4023,
695
+ "step": 2450
696
+ },
697
+ {
698
+ "epoch": 0.03,
699
+ "grad_norm": 5.724789142608643,
700
+ "learning_rate": 9.793193717277487e-08,
701
+ "loss": 0.436,
702
+ "step": 2475
703
+ },
704
+ {
705
+ "epoch": 0.03,
706
+ "grad_norm": 6.060319423675537,
707
+ "learning_rate": 9.790575916230365e-08,
708
+ "loss": 0.3939,
709
+ "step": 2500
710
+ },
711
+ {
712
+ "epoch": 0.03,
713
+ "grad_norm": 5.216397285461426,
714
+ "learning_rate": 9.787958115183246e-08,
715
+ "loss": 0.3305,
716
+ "step": 2525
717
+ },
718
+ {
719
+ "epoch": 0.03,
720
+ "grad_norm": 5.101900577545166,
721
+ "learning_rate": 9.785340314136125e-08,
722
+ "loss": 0.3212,
723
+ "step": 2550
724
+ },
725
+ {
726
+ "epoch": 0.03,
727
+ "grad_norm": 4.3815484046936035,
728
+ "learning_rate": 9.782722513089004e-08,
729
+ "loss": 0.3153,
730
+ "step": 2575
731
+ },
732
+ {
733
+ "epoch": 0.03,
734
+ "grad_norm": 7.231525897979736,
735
+ "learning_rate": 9.780104712041885e-08,
736
+ "loss": 0.4177,
737
+ "step": 2600
738
+ },
739
+ {
740
+ "epoch": 0.03,
741
+ "grad_norm": 5.76875638961792,
742
+ "learning_rate": 9.777486910994764e-08,
743
+ "loss": 0.5512,
744
+ "step": 2625
745
+ },
746
+ {
747
+ "epoch": 0.03,
748
+ "grad_norm": 5.580086708068848,
749
+ "learning_rate": 9.774869109947644e-08,
750
+ "loss": 0.5036,
751
+ "step": 2650
752
+ },
753
+ {
754
+ "epoch": 0.03,
755
+ "grad_norm": 6.0146894454956055,
756
+ "learning_rate": 9.772251308900523e-08,
757
+ "loss": 0.5229,
758
+ "step": 2675
759
+ },
760
+ {
761
+ "epoch": 0.03,
762
+ "grad_norm": 5.919321060180664,
763
+ "learning_rate": 9.769633507853404e-08,
764
+ "loss": 0.6031,
765
+ "step": 2700
766
+ },
767
+ {
768
+ "epoch": 0.03,
769
+ "grad_norm": 7.249564170837402,
770
+ "learning_rate": 9.767015706806282e-08,
771
+ "loss": 0.6529,
772
+ "step": 2725
773
+ },
774
+ {
775
+ "epoch": 0.03,
776
+ "grad_norm": 5.037733554840088,
777
+ "learning_rate": 9.764397905759162e-08,
778
+ "loss": 0.5845,
779
+ "step": 2750
780
+ },
781
+ {
782
+ "epoch": 0.03,
783
+ "grad_norm": 5.596995830535889,
784
+ "learning_rate": 9.761780104712042e-08,
785
+ "loss": 0.4844,
786
+ "step": 2775
787
+ },
788
+ {
789
+ "epoch": 0.03,
790
+ "grad_norm": 6.465144157409668,
791
+ "learning_rate": 9.759162303664921e-08,
792
+ "loss": 0.7091,
793
+ "step": 2800
794
+ },
795
+ {
796
+ "epoch": 0.03,
797
+ "grad_norm": 8.079314231872559,
798
+ "learning_rate": 9.7565445026178e-08,
799
+ "loss": 1.0112,
800
+ "step": 2825
801
+ },
802
+ {
803
+ "epoch": 0.03,
804
+ "grad_norm": 6.964105606079102,
805
+ "learning_rate": 9.753926701570681e-08,
806
+ "loss": 0.8826,
807
+ "step": 2850
808
+ },
809
+ {
810
+ "epoch": 0.03,
811
+ "grad_norm": 5.710263252258301,
812
+ "learning_rate": 9.751308900523559e-08,
813
+ "loss": 0.7581,
814
+ "step": 2875
815
+ },
816
+ {
817
+ "epoch": 0.03,
818
+ "grad_norm": 4.680161476135254,
819
+ "learning_rate": 9.74869109947644e-08,
820
+ "loss": 0.6771,
821
+ "step": 2900
822
+ },
823
+ {
824
+ "epoch": 0.03,
825
+ "grad_norm": 6.1621198654174805,
826
+ "learning_rate": 9.746073298429319e-08,
827
+ "loss": 0.5596,
828
+ "step": 2925
829
+ },
830
+ {
831
+ "epoch": 0.03,
832
+ "grad_norm": 5.297918796539307,
833
+ "learning_rate": 9.743455497382198e-08,
834
+ "loss": 0.4256,
835
+ "step": 2950
836
+ },
837
+ {
838
+ "epoch": 0.03,
839
+ "grad_norm": 5.257850646972656,
840
+ "learning_rate": 9.740837696335078e-08,
841
+ "loss": 0.3977,
842
+ "step": 2975
843
+ },
844
+ {
845
+ "epoch": 0.03,
846
+ "grad_norm": 5.193603992462158,
847
+ "learning_rate": 9.738219895287958e-08,
848
+ "loss": 0.3957,
849
+ "step": 3000
850
+ },
851
+ {
852
+ "epoch": 0.03,
853
+ "grad_norm": 4.629751682281494,
854
+ "learning_rate": 9.735602094240836e-08,
855
+ "loss": 0.3431,
856
+ "step": 3025
857
+ },
858
+ {
859
+ "epoch": 0.03,
860
+ "grad_norm": 4.40083122253418,
861
+ "learning_rate": 9.732984293193717e-08,
862
+ "loss": 0.3242,
863
+ "step": 3050
864
+ },
865
+ {
866
+ "epoch": 0.03,
867
+ "grad_norm": 4.38535737991333,
868
+ "learning_rate": 9.730366492146596e-08,
869
+ "loss": 0.3124,
870
+ "step": 3075
871
+ },
872
+ {
873
+ "epoch": 0.03,
874
+ "grad_norm": 4.32893180847168,
875
+ "learning_rate": 9.727748691099476e-08,
876
+ "loss": 0.2964,
877
+ "step": 3100
878
+ },
879
+ {
880
+ "epoch": 0.03,
881
+ "grad_norm": 4.1819586753845215,
882
+ "learning_rate": 9.725130890052355e-08,
883
+ "loss": 0.2668,
884
+ "step": 3125
885
+ },
886
+ {
887
+ "epoch": 0.03,
888
+ "grad_norm": 4.960391521453857,
889
+ "learning_rate": 9.722513089005235e-08,
890
+ "loss": 0.2789,
891
+ "step": 3150
892
+ },
893
+ {
894
+ "epoch": 0.03,
895
+ "grad_norm": 4.490744113922119,
896
+ "learning_rate": 9.719895287958115e-08,
897
+ "loss": 0.2566,
898
+ "step": 3175
899
+ },
900
+ {
901
+ "epoch": 0.03,
902
+ "grad_norm": 4.861118316650391,
903
+ "learning_rate": 9.717277486910994e-08,
904
+ "loss": 0.279,
905
+ "step": 3200
906
+ },
907
+ {
908
+ "epoch": 0.03,
909
+ "grad_norm": 5.386078357696533,
910
+ "learning_rate": 9.714659685863873e-08,
911
+ "loss": 0.2882,
912
+ "step": 3225
913
+ },
914
+ {
915
+ "epoch": 0.03,
916
+ "grad_norm": 6.265291213989258,
917
+ "learning_rate": 9.712041884816754e-08,
918
+ "loss": 0.3766,
919
+ "step": 3250
920
+ },
921
+ {
922
+ "epoch": 0.03,
923
+ "grad_norm": 6.723097801208496,
924
+ "learning_rate": 9.709424083769633e-08,
925
+ "loss": 0.5386,
926
+ "step": 3275
927
+ },
928
+ {
929
+ "epoch": 0.03,
930
+ "grad_norm": 5.949530601501465,
931
+ "learning_rate": 9.706806282722513e-08,
932
+ "loss": 0.4818,
933
+ "step": 3300
934
+ },
935
+ {
936
+ "epoch": 0.03,
937
+ "grad_norm": 5.125253200531006,
938
+ "learning_rate": 9.704188481675392e-08,
939
+ "loss": 0.4862,
940
+ "step": 3325
941
+ },
942
+ {
943
+ "epoch": 0.03,
944
+ "grad_norm": 5.845962047576904,
945
+ "learning_rate": 9.701570680628271e-08,
946
+ "loss": 0.4502,
947
+ "step": 3350
948
+ },
949
+ {
950
+ "epoch": 0.04,
951
+ "grad_norm": 5.178995609283447,
952
+ "learning_rate": 9.698952879581152e-08,
953
+ "loss": 0.4086,
954
+ "step": 3375
955
+ },
956
+ {
957
+ "epoch": 0.04,
958
+ "grad_norm": 4.950035095214844,
959
+ "learning_rate": 9.696335078534031e-08,
960
+ "loss": 0.4167,
961
+ "step": 3400
962
+ },
963
+ {
964
+ "epoch": 0.04,
965
+ "grad_norm": 7.225176811218262,
966
+ "learning_rate": 9.693717277486911e-08,
967
+ "loss": 0.3945,
968
+ "step": 3425
969
+ },
970
+ {
971
+ "epoch": 0.04,
972
+ "grad_norm": 6.313861846923828,
973
+ "learning_rate": 9.69109947643979e-08,
974
+ "loss": 0.4021,
975
+ "step": 3450
976
+ },
977
+ {
978
+ "epoch": 0.04,
979
+ "grad_norm": 5.976010799407959,
980
+ "learning_rate": 9.68848167539267e-08,
981
+ "loss": 0.3821,
982
+ "step": 3475
983
+ },
984
+ {
985
+ "epoch": 0.04,
986
+ "grad_norm": 6.867140769958496,
987
+ "learning_rate": 9.685863874345549e-08,
988
+ "loss": 0.3901,
989
+ "step": 3500
990
+ },
991
+ {
992
+ "epoch": 0.04,
993
+ "grad_norm": 5.82126522064209,
994
+ "learning_rate": 9.683246073298429e-08,
995
+ "loss": 0.3798,
996
+ "step": 3525
997
+ },
998
+ {
999
+ "epoch": 0.04,
1000
+ "grad_norm": 5.741916656494141,
1001
+ "learning_rate": 9.680628272251309e-08,
1002
+ "loss": 0.3842,
1003
+ "step": 3550
1004
+ },
1005
+ {
1006
+ "epoch": 0.04,
1007
+ "grad_norm": 5.43148946762085,
1008
+ "learning_rate": 9.678010471204188e-08,
1009
+ "loss": 0.3762,
1010
+ "step": 3575
1011
+ },
1012
+ {
1013
+ "epoch": 0.04,
1014
+ "grad_norm": 4.983076095581055,
1015
+ "learning_rate": 9.675392670157067e-08,
1016
+ "loss": 0.3496,
1017
+ "step": 3600
1018
+ },
1019
+ {
1020
+ "epoch": 0.04,
1021
+ "grad_norm": 5.233561992645264,
1022
+ "learning_rate": 9.672774869109948e-08,
1023
+ "loss": 0.3225,
1024
+ "step": 3625
1025
+ },
1026
+ {
1027
+ "epoch": 0.04,
1028
+ "grad_norm": 4.534473419189453,
1029
+ "learning_rate": 9.670157068062826e-08,
1030
+ "loss": 0.3009,
1031
+ "step": 3650
1032
+ },
1033
+ {
1034
+ "epoch": 0.04,
1035
+ "grad_norm": 5.9857306480407715,
1036
+ "learning_rate": 9.667539267015707e-08,
1037
+ "loss": 0.4075,
1038
+ "step": 3675
1039
+ },
1040
+ {
1041
+ "epoch": 0.04,
1042
+ "grad_norm": 7.715012073516846,
1043
+ "learning_rate": 9.664921465968586e-08,
1044
+ "loss": 0.5876,
1045
+ "step": 3700
1046
+ },
1047
+ {
1048
+ "epoch": 0.04,
1049
+ "grad_norm": 7.0109405517578125,
1050
+ "learning_rate": 9.662303664921465e-08,
1051
+ "loss": 0.6805,
1052
+ "step": 3725
1053
+ },
1054
+ {
1055
+ "epoch": 0.04,
1056
+ "grad_norm": 6.128580093383789,
1057
+ "learning_rate": 9.659685863874345e-08,
1058
+ "loss": 0.5924,
1059
+ "step": 3750
1060
+ },
1061
+ {
1062
+ "epoch": 0.04,
1063
+ "grad_norm": 5.6104865074157715,
1064
+ "learning_rate": 9.657068062827225e-08,
1065
+ "loss": 0.4888,
1066
+ "step": 3775
1067
+ },
1068
+ {
1069
+ "epoch": 0.04,
1070
+ "grad_norm": 5.251614093780518,
1071
+ "learning_rate": 9.654450261780103e-08,
1072
+ "loss": 0.4286,
1073
+ "step": 3800
1074
+ },
1075
+ {
1076
+ "epoch": 0.04,
1077
+ "grad_norm": 5.3208184242248535,
1078
+ "learning_rate": 9.651832460732984e-08,
1079
+ "loss": 0.3889,
1080
+ "step": 3825
1081
+ },
1082
+ {
1083
+ "epoch": 0.04,
1084
+ "grad_norm": 5.454063892364502,
1085
+ "learning_rate": 9.649214659685863e-08,
1086
+ "loss": 0.4181,
1087
+ "step": 3850
1088
+ },
1089
+ {
1090
+ "epoch": 0.04,
1091
+ "grad_norm": 5.304567813873291,
1092
+ "learning_rate": 9.646596858638742e-08,
1093
+ "loss": 0.3816,
1094
+ "step": 3875
1095
+ },
1096
+ {
1097
+ "epoch": 0.04,
1098
+ "grad_norm": 4.866218566894531,
1099
+ "learning_rate": 9.643979057591623e-08,
1100
+ "loss": 0.3475,
1101
+ "step": 3900
1102
+ },
1103
+ {
1104
+ "epoch": 0.04,
1105
+ "grad_norm": 4.873610019683838,
1106
+ "learning_rate": 9.641361256544502e-08,
1107
+ "loss": 0.3369,
1108
+ "step": 3925
1109
+ },
1110
+ {
1111
+ "epoch": 0.04,
1112
+ "grad_norm": 4.69268798828125,
1113
+ "learning_rate": 9.638743455497382e-08,
1114
+ "loss": 0.3515,
1115
+ "step": 3950
1116
+ },
1117
+ {
1118
+ "epoch": 0.04,
1119
+ "grad_norm": 5.367419719696045,
1120
+ "learning_rate": 9.636125654450261e-08,
1121
+ "loss": 0.3328,
1122
+ "step": 3975
1123
+ },
1124
+ {
1125
+ "epoch": 0.04,
1126
+ "grad_norm": 4.6179728507995605,
1127
+ "learning_rate": 9.633507853403142e-08,
1128
+ "loss": 0.3536,
1129
+ "step": 4000
1130
+ },
1131
+ {
1132
+ "epoch": 0.04,
1133
+ "grad_norm": 4.710158348083496,
1134
+ "learning_rate": 9.630890052356021e-08,
1135
+ "loss": 0.3446,
1136
+ "step": 4025
1137
+ },
1138
+ {
1139
+ "epoch": 0.04,
1140
+ "grad_norm": 4.824263095855713,
1141
+ "learning_rate": 9.6282722513089e-08,
1142
+ "loss": 0.3488,
1143
+ "step": 4050
1144
+ },
1145
+ {
1146
+ "epoch": 0.04,
1147
+ "grad_norm": 5.474529266357422,
1148
+ "learning_rate": 9.62565445026178e-08,
1149
+ "loss": 0.3648,
1150
+ "step": 4075
1151
+ },
1152
+ {
1153
+ "epoch": 0.04,
1154
+ "grad_norm": 5.825191497802734,
1155
+ "learning_rate": 9.62303664921466e-08,
1156
+ "loss": 0.3782,
1157
+ "step": 4100
1158
+ },
1159
+ {
1160
+ "epoch": 0.04,
1161
+ "eval_loss": 0.7290233969688416,
1162
+ "eval_runtime": 275.8897,
1163
+ "eval_samples_per_second": 9.801,
1164
+ "eval_steps_per_second": 1.225,
1165
+ "eval_wer": 39.41739541479556,
1166
+ "step": 4111
1167
+ },
1168
+ {
1169
+ "epoch": 0.04,
1170
+ "grad_norm": 4.9900221824646,
1171
+ "learning_rate": 9.620418848167538e-08,
1172
+ "loss": 0.3366,
1173
+ "step": 4125
1174
+ },
1175
+ {
1176
+ "epoch": 0.04,
1177
+ "grad_norm": 6.866960525512695,
1178
+ "learning_rate": 9.617801047120419e-08,
1179
+ "loss": 0.384,
1180
+ "step": 4150
1181
+ },
1182
+ {
1183
+ "epoch": 0.04,
1184
+ "grad_norm": 6.261806011199951,
1185
+ "learning_rate": 9.615183246073298e-08,
1186
+ "loss": 0.4945,
1187
+ "step": 4175
1188
+ },
1189
+ {
1190
+ "epoch": 0.04,
1191
+ "grad_norm": 6.251190185546875,
1192
+ "learning_rate": 9.612565445026178e-08,
1193
+ "loss": 0.5011,
1194
+ "step": 4200
1195
+ },
1196
+ {
1197
+ "epoch": 0.04,
1198
+ "grad_norm": 7.063992023468018,
1199
+ "learning_rate": 9.609947643979057e-08,
1200
+ "loss": 0.4765,
1201
+ "step": 4225
1202
+ },
1203
+ {
1204
+ "epoch": 0.04,
1205
+ "grad_norm": 6.903201103210449,
1206
+ "learning_rate": 9.607329842931938e-08,
1207
+ "loss": 0.4501,
1208
+ "step": 4250
1209
+ },
1210
+ {
1211
+ "epoch": 0.04,
1212
+ "grad_norm": 6.0563788414001465,
1213
+ "learning_rate": 9.604712041884816e-08,
1214
+ "loss": 0.459,
1215
+ "step": 4275
1216
+ },
1217
+ {
1218
+ "epoch": 0.04,
1219
+ "grad_norm": 6.9955363273620605,
1220
+ "learning_rate": 9.602094240837696e-08,
1221
+ "loss": 0.4237,
1222
+ "step": 4300
1223
+ },
1224
+ {
1225
+ "epoch": 0.05,
1226
+ "grad_norm": 6.026924133300781,
1227
+ "learning_rate": 9.599476439790576e-08,
1228
+ "loss": 0.3898,
1229
+ "step": 4325
1230
+ },
1231
+ {
1232
+ "epoch": 0.05,
1233
+ "grad_norm": 5.720476150512695,
1234
+ "learning_rate": 9.596858638743455e-08,
1235
+ "loss": 0.4239,
1236
+ "step": 4350
1237
+ },
1238
+ {
1239
+ "epoch": 0.05,
1240
+ "grad_norm": 6.680058479309082,
1241
+ "learning_rate": 9.594240837696334e-08,
1242
+ "loss": 0.4321,
1243
+ "step": 4375
1244
+ },
1245
+ {
1246
+ "epoch": 0.05,
1247
+ "grad_norm": 8.304168701171875,
1248
+ "learning_rate": 9.591623036649215e-08,
1249
+ "loss": 0.5794,
1250
+ "step": 4400
1251
+ },
1252
+ {
1253
+ "epoch": 0.05,
1254
+ "grad_norm": 8.107504844665527,
1255
+ "learning_rate": 9.589005235602093e-08,
1256
+ "loss": 0.8686,
1257
+ "step": 4425
1258
+ },
1259
+ {
1260
+ "epoch": 0.05,
1261
+ "grad_norm": 8.911792755126953,
1262
+ "learning_rate": 9.586387434554973e-08,
1263
+ "loss": 0.9618,
1264
+ "step": 4450
1265
+ },
1266
+ {
1267
+ "epoch": 0.05,
1268
+ "grad_norm": 6.706320285797119,
1269
+ "learning_rate": 9.583769633507853e-08,
1270
+ "loss": 0.8042,
1271
+ "step": 4475
1272
+ },
1273
+ {
1274
+ "epoch": 0.05,
1275
+ "grad_norm": 6.71433687210083,
1276
+ "learning_rate": 9.581151832460732e-08,
1277
+ "loss": 0.6088,
1278
+ "step": 4500
1279
+ },
1280
+ {
1281
+ "epoch": 0.05,
1282
+ "grad_norm": 6.675333023071289,
1283
+ "learning_rate": 9.578534031413611e-08,
1284
+ "loss": 0.551,
1285
+ "step": 4525
1286
+ },
1287
+ {
1288
+ "epoch": 0.05,
1289
+ "grad_norm": 6.766154766082764,
1290
+ "learning_rate": 9.575916230366492e-08,
1291
+ "loss": 0.5113,
1292
+ "step": 4550
1293
+ },
1294
+ {
1295
+ "epoch": 0.05,
1296
+ "grad_norm": 6.36196231842041,
1297
+ "learning_rate": 9.573298429319371e-08,
1298
+ "loss": 0.4785,
1299
+ "step": 4575
1300
+ },
1301
+ {
1302
+ "epoch": 0.05,
1303
+ "grad_norm": 6.1699395179748535,
1304
+ "learning_rate": 9.570680628272251e-08,
1305
+ "loss": 0.4663,
1306
+ "step": 4600
1307
+ },
1308
+ {
1309
+ "epoch": 0.05,
1310
+ "grad_norm": 6.362920761108398,
1311
+ "learning_rate": 9.56806282722513e-08,
1312
+ "loss": 0.4394,
1313
+ "step": 4625
1314
+ },
1315
+ {
1316
+ "epoch": 0.05,
1317
+ "grad_norm": 6.1348347663879395,
1318
+ "learning_rate": 9.56544502617801e-08,
1319
+ "loss": 0.4343,
1320
+ "step": 4650
1321
+ },
1322
+ {
1323
+ "epoch": 0.05,
1324
+ "grad_norm": 6.3059563636779785,
1325
+ "learning_rate": 9.56282722513089e-08,
1326
+ "loss": 0.4535,
1327
+ "step": 4675
1328
+ },
1329
+ {
1330
+ "epoch": 0.05,
1331
+ "grad_norm": 7.463464260101318,
1332
+ "learning_rate": 9.560209424083769e-08,
1333
+ "loss": 0.7034,
1334
+ "step": 4700
1335
+ },
1336
+ {
1337
+ "epoch": 0.05,
1338
+ "grad_norm": 7.093417644500732,
1339
+ "learning_rate": 9.557591623036649e-08,
1340
+ "loss": 0.8337,
1341
+ "step": 4725
1342
+ },
1343
+ {
1344
+ "epoch": 0.05,
1345
+ "grad_norm": 6.7604193687438965,
1346
+ "learning_rate": 9.554973821989528e-08,
1347
+ "loss": 0.7934,
1348
+ "step": 4750
1349
+ },
1350
+ {
1351
+ "epoch": 0.05,
1352
+ "grad_norm": 6.093296051025391,
1353
+ "learning_rate": 9.552356020942409e-08,
1354
+ "loss": 0.7055,
1355
+ "step": 4775
1356
+ },
1357
+ {
1358
+ "epoch": 0.05,
1359
+ "grad_norm": 6.788339138031006,
1360
+ "learning_rate": 9.549738219895288e-08,
1361
+ "loss": 0.6884,
1362
+ "step": 4800
1363
+ },
1364
+ {
1365
+ "epoch": 0.05,
1366
+ "grad_norm": 6.2128496170043945,
1367
+ "learning_rate": 9.547120418848167e-08,
1368
+ "loss": 0.5722,
1369
+ "step": 4825
1370
+ },
1371
+ {
1372
+ "epoch": 0.05,
1373
+ "grad_norm": 6.026149272918701,
1374
+ "learning_rate": 9.544502617801047e-08,
1375
+ "loss": 0.5802,
1376
+ "step": 4850
1377
+ },
1378
+ {
1379
+ "epoch": 0.05,
1380
+ "grad_norm": 6.711429119110107,
1381
+ "learning_rate": 9.541884816753927e-08,
1382
+ "loss": 0.5129,
1383
+ "step": 4875
1384
+ },
1385
+ {
1386
+ "epoch": 0.05,
1387
+ "grad_norm": 6.273972988128662,
1388
+ "learning_rate": 9.539267015706805e-08,
1389
+ "loss": 0.4283,
1390
+ "step": 4900
1391
+ },
1392
+ {
1393
+ "epoch": 0.05,
1394
+ "grad_norm": 5.497582912445068,
1395
+ "learning_rate": 9.536649214659686e-08,
1396
+ "loss": 0.4075,
1397
+ "step": 4925
1398
+ },
1399
+ {
1400
+ "epoch": 0.05,
1401
+ "grad_norm": 5.759308815002441,
1402
+ "learning_rate": 9.534031413612565e-08,
1403
+ "loss": 0.4438,
1404
+ "step": 4950
1405
+ },
1406
+ {
1407
+ "epoch": 0.05,
1408
+ "grad_norm": 6.2068305015563965,
1409
+ "learning_rate": 9.531413612565445e-08,
1410
+ "loss": 0.4686,
1411
+ "step": 4975
1412
+ },
1413
+ {
1414
+ "epoch": 0.05,
1415
+ "grad_norm": 5.611216068267822,
1416
+ "learning_rate": 9.528795811518324e-08,
1417
+ "loss": 0.4714,
1418
+ "step": 5000
1419
+ },
1420
+ {
1421
+ "epoch": 0.05,
1422
+ "grad_norm": 5.2035040855407715,
1423
+ "learning_rate": 9.526178010471204e-08,
1424
+ "loss": 0.4933,
1425
+ "step": 5025
1426
+ },
1427
+ {
1428
+ "epoch": 0.05,
1429
+ "grad_norm": 6.796937942504883,
1430
+ "learning_rate": 9.523560209424082e-08,
1431
+ "loss": 0.518,
1432
+ "step": 5050
1433
+ },
1434
+ {
1435
+ "epoch": 0.05,
1436
+ "grad_norm": 5.768625259399414,
1437
+ "learning_rate": 9.520942408376963e-08,
1438
+ "loss": 0.5254,
1439
+ "step": 5075
1440
+ },
1441
+ {
1442
+ "epoch": 0.05,
1443
+ "grad_norm": 5.743659019470215,
1444
+ "learning_rate": 9.518324607329842e-08,
1445
+ "loss": 0.5098,
1446
+ "step": 5100
1447
+ },
1448
+ {
1449
+ "epoch": 0.05,
1450
+ "grad_norm": 6.624993801116943,
1451
+ "learning_rate": 9.515706806282722e-08,
1452
+ "loss": 0.4855,
1453
+ "step": 5125
1454
+ },
1455
+ {
1456
+ "epoch": 0.05,
1457
+ "grad_norm": 6.45778751373291,
1458
+ "learning_rate": 9.513089005235601e-08,
1459
+ "loss": 0.5223,
1460
+ "step": 5150
1461
+ },
1462
+ {
1463
+ "epoch": 0.05,
1464
+ "grad_norm": 5.325904369354248,
1465
+ "learning_rate": 9.510471204188482e-08,
1466
+ "loss": 0.5041,
1467
+ "step": 5175
1468
+ },
1469
+ {
1470
+ "epoch": 0.05,
1471
+ "grad_norm": 5.208452224731445,
1472
+ "learning_rate": 9.507853403141361e-08,
1473
+ "loss": 0.5157,
1474
+ "step": 5200
1475
+ },
1476
+ {
1477
+ "epoch": 0.05,
1478
+ "grad_norm": 5.317996501922607,
1479
+ "learning_rate": 9.50523560209424e-08,
1480
+ "loss": 0.5614,
1481
+ "step": 5225
1482
+ },
1483
+ {
1484
+ "epoch": 0.05,
1485
+ "grad_norm": 6.383024215698242,
1486
+ "learning_rate": 9.50261780104712e-08,
1487
+ "loss": 0.5186,
1488
+ "step": 5250
1489
+ },
1490
+ {
1491
+ "epoch": 0.05,
1492
+ "grad_norm": 4.965906620025635,
1493
+ "learning_rate": 9.499999999999999e-08,
1494
+ "loss": 0.4887,
1495
+ "step": 5275
1496
+ },
1497
+ {
1498
+ "epoch": 0.06,
1499
+ "grad_norm": 5.874698162078857,
1500
+ "learning_rate": 9.49738219895288e-08,
1501
+ "loss": 0.4882,
1502
+ "step": 5300
1503
+ },
1504
+ {
1505
+ "epoch": 0.06,
1506
+ "grad_norm": 5.321093559265137,
1507
+ "learning_rate": 9.494764397905759e-08,
1508
+ "loss": 0.4929,
1509
+ "step": 5325
1510
+ },
1511
+ {
1512
+ "epoch": 0.06,
1513
+ "grad_norm": 6.657257556915283,
1514
+ "learning_rate": 9.492146596858638e-08,
1515
+ "loss": 0.4467,
1516
+ "step": 5350
1517
+ },
1518
+ {
1519
+ "epoch": 0.06,
1520
+ "grad_norm": 5.798694133758545,
1521
+ "learning_rate": 9.489528795811518e-08,
1522
+ "loss": 0.5027,
1523
+ "step": 5375
1524
+ },
1525
+ {
1526
+ "epoch": 0.06,
1527
+ "grad_norm": 6.4486236572265625,
1528
+ "learning_rate": 9.486910994764398e-08,
1529
+ "loss": 0.5157,
1530
+ "step": 5400
1531
+ },
1532
+ {
1533
+ "epoch": 0.06,
1534
+ "grad_norm": 5.78603458404541,
1535
+ "learning_rate": 9.484293193717276e-08,
1536
+ "loss": 0.568,
1537
+ "step": 5425
1538
+ },
1539
+ {
1540
+ "epoch": 0.06,
1541
+ "grad_norm": 6.391395568847656,
1542
+ "learning_rate": 9.481675392670157e-08,
1543
+ "loss": 0.5768,
1544
+ "step": 5450
1545
+ },
1546
+ {
1547
+ "epoch": 0.06,
1548
+ "grad_norm": 6.703619003295898,
1549
+ "learning_rate": 9.479057591623036e-08,
1550
+ "loss": 0.5885,
1551
+ "step": 5475
1552
+ },
1553
+ {
1554
+ "epoch": 0.06,
1555
+ "grad_norm": 6.5529937744140625,
1556
+ "learning_rate": 9.476439790575916e-08,
1557
+ "loss": 0.5355,
1558
+ "step": 5500
1559
+ },
1560
+ {
1561
+ "epoch": 0.06,
1562
+ "grad_norm": 5.757615566253662,
1563
+ "learning_rate": 9.473821989528795e-08,
1564
+ "loss": 0.4787,
1565
+ "step": 5525
1566
+ },
1567
+ {
1568
+ "epoch": 0.06,
1569
+ "grad_norm": 5.5016703605651855,
1570
+ "learning_rate": 9.471204188481676e-08,
1571
+ "loss": 0.4435,
1572
+ "step": 5550
1573
+ },
1574
+ {
1575
+ "epoch": 0.06,
1576
+ "grad_norm": 6.2132368087768555,
1577
+ "learning_rate": 9.468586387434555e-08,
1578
+ "loss": 0.5157,
1579
+ "step": 5575
1580
+ },
1581
+ {
1582
+ "epoch": 0.06,
1583
+ "grad_norm": 5.654526710510254,
1584
+ "learning_rate": 9.465968586387434e-08,
1585
+ "loss": 0.5769,
1586
+ "step": 5600
1587
+ },
1588
+ {
1589
+ "epoch": 0.06,
1590
+ "grad_norm": 5.5138139724731445,
1591
+ "learning_rate": 9.463350785340314e-08,
1592
+ "loss": 0.5805,
1593
+ "step": 5625
1594
+ },
1595
+ {
1596
+ "epoch": 0.06,
1597
+ "grad_norm": 5.938875198364258,
1598
+ "learning_rate": 9.460732984293194e-08,
1599
+ "loss": 0.6114,
1600
+ "step": 5650
1601
+ },
1602
+ {
1603
+ "epoch": 0.06,
1604
+ "grad_norm": 4.941293239593506,
1605
+ "learning_rate": 9.458115183246072e-08,
1606
+ "loss": 0.5762,
1607
+ "step": 5675
1608
+ },
1609
+ {
1610
+ "epoch": 0.06,
1611
+ "grad_norm": 6.395961284637451,
1612
+ "learning_rate": 9.455497382198953e-08,
1613
+ "loss": 0.5745,
1614
+ "step": 5700
1615
+ },
1616
+ {
1617
+ "epoch": 0.06,
1618
+ "grad_norm": 5.585537910461426,
1619
+ "learning_rate": 9.452879581151832e-08,
1620
+ "loss": 0.5571,
1621
+ "step": 5725
1622
+ },
1623
+ {
1624
+ "epoch": 0.06,
1625
+ "grad_norm": 5.933156490325928,
1626
+ "learning_rate": 9.450261780104711e-08,
1627
+ "loss": 0.4811,
1628
+ "step": 5750
1629
+ },
1630
+ {
1631
+ "epoch": 0.06,
1632
+ "grad_norm": 7.242075443267822,
1633
+ "learning_rate": 9.447643979057591e-08,
1634
+ "loss": 0.464,
1635
+ "step": 5775
1636
+ },
1637
+ {
1638
+ "epoch": 0.06,
1639
+ "grad_norm": 5.613156318664551,
1640
+ "learning_rate": 9.445026178010471e-08,
1641
+ "loss": 0.5033,
1642
+ "step": 5800
1643
+ },
1644
+ {
1645
+ "epoch": 0.06,
1646
+ "grad_norm": 6.406403541564941,
1647
+ "learning_rate": 9.44240837696335e-08,
1648
+ "loss": 0.4344,
1649
+ "step": 5825
1650
+ },
1651
+ {
1652
+ "epoch": 0.06,
1653
+ "grad_norm": 4.892160415649414,
1654
+ "learning_rate": 9.43979057591623e-08,
1655
+ "loss": 0.4187,
1656
+ "step": 5850
1657
+ },
1658
+ {
1659
+ "epoch": 0.06,
1660
+ "grad_norm": 5.776142120361328,
1661
+ "learning_rate": 9.43717277486911e-08,
1662
+ "loss": 0.4124,
1663
+ "step": 5875
1664
+ },
1665
+ {
1666
+ "epoch": 0.06,
1667
+ "grad_norm": 5.972835063934326,
1668
+ "learning_rate": 9.434554973821989e-08,
1669
+ "loss": 0.4042,
1670
+ "step": 5900
1671
+ },
1672
+ {
1673
+ "epoch": 0.06,
1674
+ "grad_norm": 6.167374610900879,
1675
+ "learning_rate": 9.431937172774868e-08,
1676
+ "loss": 0.4208,
1677
+ "step": 5925
1678
+ },
1679
+ {
1680
+ "epoch": 0.06,
1681
+ "grad_norm": 6.850512504577637,
1682
+ "learning_rate": 9.429319371727749e-08,
1683
+ "loss": 0.3989,
1684
+ "step": 5950
1685
+ },
1686
+ {
1687
+ "epoch": 0.06,
1688
+ "grad_norm": 6.674014091491699,
1689
+ "learning_rate": 9.426701570680628e-08,
1690
+ "loss": 0.3725,
1691
+ "step": 5975
1692
+ },
1693
+ {
1694
+ "epoch": 0.06,
1695
+ "grad_norm": 7.24482536315918,
1696
+ "learning_rate": 9.424083769633507e-08,
1697
+ "loss": 0.376,
1698
+ "step": 6000
1699
+ },
1700
+ {
1701
+ "epoch": 0.06,
1702
+ "grad_norm": 6.7198710441589355,
1703
+ "learning_rate": 9.421465968586388e-08,
1704
+ "loss": 0.3846,
1705
+ "step": 6025
1706
+ },
1707
+ {
1708
+ "epoch": 0.06,
1709
+ "grad_norm": 6.8929829597473145,
1710
+ "learning_rate": 9.418848167539266e-08,
1711
+ "loss": 0.3857,
1712
+ "step": 6050
1713
+ },
1714
+ {
1715
+ "epoch": 0.06,
1716
+ "grad_norm": 8.093165397644043,
1717
+ "learning_rate": 9.416230366492147e-08,
1718
+ "loss": 0.3766,
1719
+ "step": 6075
1720
+ },
1721
+ {
1722
+ "epoch": 0.06,
1723
+ "grad_norm": 6.204592227935791,
1724
+ "learning_rate": 9.413612565445026e-08,
1725
+ "loss": 0.3779,
1726
+ "step": 6100
1727
+ },
1728
+ {
1729
+ "epoch": 0.06,
1730
+ "grad_norm": 5.946498870849609,
1731
+ "learning_rate": 9.410994764397905e-08,
1732
+ "loss": 0.3719,
1733
+ "step": 6125
1734
+ },
1735
+ {
1736
+ "epoch": 0.06,
1737
+ "grad_norm": 7.825682163238525,
1738
+ "learning_rate": 9.408376963350785e-08,
1739
+ "loss": 0.3891,
1740
+ "step": 6150
1741
+ },
1742
+ {
1743
+ "epoch": 0.06,
1744
+ "grad_norm": 7.207645416259766,
1745
+ "learning_rate": 9.405759162303665e-08,
1746
+ "loss": 0.3901,
1747
+ "step": 6175
1748
+ },
1749
+ {
1750
+ "epoch": 0.06,
1751
+ "grad_norm": 6.809023857116699,
1752
+ "learning_rate": 9.403141361256543e-08,
1753
+ "loss": 0.4059,
1754
+ "step": 6200
1755
+ },
1756
+ {
1757
+ "epoch": 0.06,
1758
+ "grad_norm": 6.104794979095459,
1759
+ "learning_rate": 9.400523560209424e-08,
1760
+ "loss": 0.4059,
1761
+ "step": 6225
1762
+ },
1763
+ {
1764
+ "epoch": 0.07,
1765
+ "grad_norm": 6.525493621826172,
1766
+ "learning_rate": 9.397905759162303e-08,
1767
+ "loss": 0.4047,
1768
+ "step": 6250
1769
+ },
1770
+ {
1771
+ "epoch": 0.07,
1772
+ "grad_norm": 6.874316215515137,
1773
+ "learning_rate": 9.395287958115183e-08,
1774
+ "loss": 0.514,
1775
+ "step": 6275
1776
+ },
1777
+ {
1778
+ "epoch": 0.07,
1779
+ "grad_norm": 5.96618127822876,
1780
+ "learning_rate": 9.392670157068062e-08,
1781
+ "loss": 0.4962,
1782
+ "step": 6300
1783
+ },
1784
+ {
1785
+ "epoch": 0.07,
1786
+ "grad_norm": 6.455708026885986,
1787
+ "learning_rate": 9.390052356020942e-08,
1788
+ "loss": 0.5045,
1789
+ "step": 6325
1790
+ },
1791
+ {
1792
+ "epoch": 0.07,
1793
+ "grad_norm": 6.469492435455322,
1794
+ "learning_rate": 9.387434554973822e-08,
1795
+ "loss": 0.8458,
1796
+ "step": 6350
1797
+ },
1798
+ {
1799
+ "epoch": 0.07,
1800
+ "grad_norm": 9.225332260131836,
1801
+ "learning_rate": 9.384816753926701e-08,
1802
+ "loss": 0.8835,
1803
+ "step": 6375
1804
+ },
1805
+ {
1806
+ "epoch": 0.07,
1807
+ "grad_norm": 6.529109954833984,
1808
+ "learning_rate": 9.38219895287958e-08,
1809
+ "loss": 0.7166,
1810
+ "step": 6400
1811
+ },
1812
+ {
1813
+ "epoch": 0.07,
1814
+ "grad_norm": 7.395893096923828,
1815
+ "learning_rate": 9.379581151832461e-08,
1816
+ "loss": 0.7075,
1817
+ "step": 6425
1818
+ },
1819
+ {
1820
+ "epoch": 0.07,
1821
+ "grad_norm": 8.16038990020752,
1822
+ "learning_rate": 9.376963350785339e-08,
1823
+ "loss": 0.9168,
1824
+ "step": 6450
1825
+ },
1826
+ {
1827
+ "epoch": 0.07,
1828
+ "grad_norm": 7.322926044464111,
1829
+ "learning_rate": 9.37434554973822e-08,
1830
+ "loss": 0.7444,
1831
+ "step": 6475
1832
+ },
1833
+ {
1834
+ "epoch": 0.07,
1835
+ "grad_norm": 7.18267297744751,
1836
+ "learning_rate": 9.371727748691099e-08,
1837
+ "loss": 0.6744,
1838
+ "step": 6500
1839
+ },
1840
+ {
1841
+ "epoch": 0.07,
1842
+ "grad_norm": 7.361169815063477,
1843
+ "learning_rate": 9.369109947643978e-08,
1844
+ "loss": 0.5583,
1845
+ "step": 6525
1846
+ },
1847
+ {
1848
+ "epoch": 0.07,
1849
+ "grad_norm": 8.085954666137695,
1850
+ "learning_rate": 9.366492146596858e-08,
1851
+ "loss": 0.5442,
1852
+ "step": 6550
1853
+ },
1854
+ {
1855
+ "epoch": 0.07,
1856
+ "grad_norm": 7.492279052734375,
1857
+ "learning_rate": 9.363874345549738e-08,
1858
+ "loss": 0.5684,
1859
+ "step": 6575
1860
+ },
1861
+ {
1862
+ "epoch": 0.07,
1863
+ "grad_norm": 6.951526641845703,
1864
+ "learning_rate": 9.361256544502618e-08,
1865
+ "loss": 0.5311,
1866
+ "step": 6600
1867
+ },
1868
+ {
1869
+ "epoch": 0.07,
1870
+ "grad_norm": 6.271228790283203,
1871
+ "learning_rate": 9.358638743455497e-08,
1872
+ "loss": 0.473,
1873
+ "step": 6625
1874
+ },
1875
+ {
1876
+ "epoch": 0.07,
1877
+ "grad_norm": 5.724484443664551,
1878
+ "learning_rate": 9.356020942408376e-08,
1879
+ "loss": 0.4471,
1880
+ "step": 6650
1881
+ },
1882
+ {
1883
+ "epoch": 0.07,
1884
+ "grad_norm": 5.2642669677734375,
1885
+ "learning_rate": 9.353403141361256e-08,
1886
+ "loss": 0.4008,
1887
+ "step": 6675
1888
+ },
1889
+ {
1890
+ "epoch": 0.07,
1891
+ "grad_norm": 5.970279216766357,
1892
+ "learning_rate": 9.350785340314136e-08,
1893
+ "loss": 0.3922,
1894
+ "step": 6700
1895
+ },
1896
+ {
1897
+ "epoch": 0.07,
1898
+ "grad_norm": 6.13707160949707,
1899
+ "learning_rate": 9.348167539267016e-08,
1900
+ "loss": 0.4149,
1901
+ "step": 6725
1902
+ },
1903
+ {
1904
+ "epoch": 0.07,
1905
+ "grad_norm": 5.1920061111450195,
1906
+ "learning_rate": 9.345549738219895e-08,
1907
+ "loss": 0.3732,
1908
+ "step": 6750
1909
+ },
1910
+ {
1911
+ "epoch": 0.07,
1912
+ "grad_norm": 6.059106349945068,
1913
+ "learning_rate": 9.342931937172774e-08,
1914
+ "loss": 0.3783,
1915
+ "step": 6775
1916
+ },
1917
+ {
1918
+ "epoch": 0.07,
1919
+ "grad_norm": 5.317996025085449,
1920
+ "learning_rate": 9.340314136125655e-08,
1921
+ "loss": 0.3701,
1922
+ "step": 6800
1923
+ },
1924
+ {
1925
+ "epoch": 0.07,
1926
+ "grad_norm": 5.347188472747803,
1927
+ "learning_rate": 9.337696335078533e-08,
1928
+ "loss": 0.3466,
1929
+ "step": 6825
1930
+ },
1931
+ {
1932
+ "epoch": 0.07,
1933
+ "grad_norm": 5.118027687072754,
1934
+ "learning_rate": 9.335078534031414e-08,
1935
+ "loss": 0.363,
1936
+ "step": 6850
1937
+ },
1938
+ {
1939
+ "epoch": 0.07,
1940
+ "grad_norm": 4.868067264556885,
1941
+ "learning_rate": 9.332460732984293e-08,
1942
+ "loss": 0.3696,
1943
+ "step": 6875
1944
+ },
1945
+ {
1946
+ "epoch": 0.07,
1947
+ "grad_norm": 5.714309215545654,
1948
+ "learning_rate": 9.329842931937172e-08,
1949
+ "loss": 0.3768,
1950
+ "step": 6900
1951
+ },
1952
+ {
1953
+ "epoch": 0.07,
1954
+ "grad_norm": 5.903509616851807,
1955
+ "learning_rate": 9.327225130890052e-08,
1956
+ "loss": 0.3625,
1957
+ "step": 6925
1958
+ },
1959
+ {
1960
+ "epoch": 0.07,
1961
+ "grad_norm": 5.700974941253662,
1962
+ "learning_rate": 9.324607329842932e-08,
1963
+ "loss": 0.3717,
1964
+ "step": 6950
1965
+ },
1966
+ {
1967
+ "epoch": 0.07,
1968
+ "grad_norm": 6.056822776794434,
1969
+ "learning_rate": 9.32198952879581e-08,
1970
+ "loss": 0.3601,
1971
+ "step": 6975
1972
+ },
1973
+ {
1974
+ "epoch": 0.07,
1975
+ "grad_norm": 6.140659809112549,
1976
+ "learning_rate": 9.319371727748691e-08,
1977
+ "loss": 0.3691,
1978
+ "step": 7000
1979
+ },
1980
+ {
1981
+ "epoch": 0.07,
1982
+ "grad_norm": 6.195953369140625,
1983
+ "learning_rate": 9.31675392670157e-08,
1984
+ "loss": 0.3632,
1985
+ "step": 7025
1986
+ },
1987
+ {
1988
+ "epoch": 0.07,
1989
+ "grad_norm": 4.96120023727417,
1990
+ "learning_rate": 9.314136125654451e-08,
1991
+ "loss": 0.3449,
1992
+ "step": 7050
1993
+ },
1994
+ {
1995
+ "epoch": 0.07,
1996
+ "grad_norm": 6.803286075592041,
1997
+ "learning_rate": 9.311518324607329e-08,
1998
+ "loss": 0.3601,
1999
+ "step": 7075
2000
+ },
2001
+ {
2002
+ "epoch": 0.07,
2003
+ "grad_norm": 5.16037654876709,
2004
+ "learning_rate": 9.30890052356021e-08,
2005
+ "loss": 0.3478,
2006
+ "step": 7100
2007
+ },
2008
+ {
2009
+ "epoch": 0.07,
2010
+ "grad_norm": 5.407104969024658,
2011
+ "learning_rate": 9.306282722513089e-08,
2012
+ "loss": 0.3498,
2013
+ "step": 7125
2014
+ },
2015
+ {
2016
+ "epoch": 0.07,
2017
+ "grad_norm": 5.451097011566162,
2018
+ "learning_rate": 9.303664921465968e-08,
2019
+ "loss": 0.3574,
2020
+ "step": 7150
2021
+ },
2022
+ {
2023
+ "epoch": 0.07,
2024
+ "grad_norm": 5.362937927246094,
2025
+ "learning_rate": 9.301047120418847e-08,
2026
+ "loss": 0.3477,
2027
+ "step": 7175
2028
+ },
2029
+ {
2030
+ "epoch": 0.07,
2031
+ "grad_norm": 5.407390117645264,
2032
+ "learning_rate": 9.298429319371728e-08,
2033
+ "loss": 0.3575,
2034
+ "step": 7200
2035
+ },
2036
+ {
2037
+ "epoch": 0.08,
2038
+ "grad_norm": 5.426994800567627,
2039
+ "learning_rate": 9.295811518324606e-08,
2040
+ "loss": 0.3454,
2041
+ "step": 7225
2042
+ },
2043
+ {
2044
+ "epoch": 0.08,
2045
+ "grad_norm": 6.192265510559082,
2046
+ "learning_rate": 9.293193717277487e-08,
2047
+ "loss": 0.36,
2048
+ "step": 7250
2049
+ },
2050
+ {
2051
+ "epoch": 0.08,
2052
+ "grad_norm": 5.969931125640869,
2053
+ "learning_rate": 9.290575916230366e-08,
2054
+ "loss": 0.3479,
2055
+ "step": 7275
2056
+ },
2057
+ {
2058
+ "epoch": 0.08,
2059
+ "grad_norm": 5.602126121520996,
2060
+ "learning_rate": 9.287958115183245e-08,
2061
+ "loss": 0.3527,
2062
+ "step": 7300
2063
+ },
2064
+ {
2065
+ "epoch": 0.08,
2066
+ "grad_norm": 6.191224575042725,
2067
+ "learning_rate": 9.285340314136125e-08,
2068
+ "loss": 0.3915,
2069
+ "step": 7325
2070
+ },
2071
+ {
2072
+ "epoch": 0.08,
2073
+ "grad_norm": 5.79760217666626,
2074
+ "learning_rate": 9.282722513089005e-08,
2075
+ "loss": 0.3922,
2076
+ "step": 7350
2077
+ },
2078
+ {
2079
+ "epoch": 0.08,
2080
+ "grad_norm": 8.519009590148926,
2081
+ "learning_rate": 9.280104712041885e-08,
2082
+ "loss": 0.4254,
2083
+ "step": 7375
2084
+ },
2085
+ {
2086
+ "epoch": 0.08,
2087
+ "grad_norm": 5.360806941986084,
2088
+ "learning_rate": 9.277486910994764e-08,
2089
+ "loss": 0.4391,
2090
+ "step": 7400
2091
+ },
2092
+ {
2093
+ "epoch": 0.08,
2094
+ "grad_norm": 5.539173603057861,
2095
+ "learning_rate": 9.274869109947645e-08,
2096
+ "loss": 0.3988,
2097
+ "step": 7425
2098
+ },
2099
+ {
2100
+ "epoch": 0.08,
2101
+ "grad_norm": 7.067492961883545,
2102
+ "learning_rate": 9.272251308900523e-08,
2103
+ "loss": 0.3779,
2104
+ "step": 7450
2105
+ },
2106
+ {
2107
+ "epoch": 0.08,
2108
+ "grad_norm": 5.135078430175781,
2109
+ "learning_rate": 9.269633507853403e-08,
2110
+ "loss": 0.3904,
2111
+ "step": 7475
2112
+ },
2113
+ {
2114
+ "epoch": 0.08,
2115
+ "grad_norm": 5.269252300262451,
2116
+ "learning_rate": 9.267015706806283e-08,
2117
+ "loss": 0.3597,
2118
+ "step": 7500
2119
+ },
2120
+ {
2121
+ "epoch": 0.08,
2122
+ "grad_norm": 7.094182014465332,
2123
+ "learning_rate": 9.264397905759162e-08,
2124
+ "loss": 0.3766,
2125
+ "step": 7525
2126
+ },
2127
+ {
2128
+ "epoch": 0.08,
2129
+ "grad_norm": 5.993140697479248,
2130
+ "learning_rate": 9.261780104712041e-08,
2131
+ "loss": 0.3377,
2132
+ "step": 7550
2133
+ },
2134
+ {
2135
+ "epoch": 0.08,
2136
+ "grad_norm": 6.09189510345459,
2137
+ "learning_rate": 9.259162303664922e-08,
2138
+ "loss": 0.3779,
2139
+ "step": 7575
2140
+ },
2141
+ {
2142
+ "epoch": 0.08,
2143
+ "grad_norm": 5.466849327087402,
2144
+ "learning_rate": 9.2565445026178e-08,
2145
+ "loss": 0.3602,
2146
+ "step": 7600
2147
+ },
2148
+ {
2149
+ "epoch": 0.08,
2150
+ "grad_norm": 5.297680854797363,
2151
+ "learning_rate": 9.25392670157068e-08,
2152
+ "loss": 0.3318,
2153
+ "step": 7625
2154
+ },
2155
+ {
2156
+ "epoch": 0.08,
2157
+ "grad_norm": 5.143691539764404,
2158
+ "learning_rate": 9.25130890052356e-08,
2159
+ "loss": 0.334,
2160
+ "step": 7650
2161
+ },
2162
+ {
2163
+ "epoch": 0.08,
2164
+ "grad_norm": 5.337982654571533,
2165
+ "learning_rate": 9.248691099476439e-08,
2166
+ "loss": 0.3343,
2167
+ "step": 7675
2168
+ },
2169
+ {
2170
+ "epoch": 0.08,
2171
+ "grad_norm": 5.539205551147461,
2172
+ "learning_rate": 9.246073298429318e-08,
2173
+ "loss": 0.3527,
2174
+ "step": 7700
2175
+ },
2176
+ {
2177
+ "epoch": 0.08,
2178
+ "grad_norm": 5.057958126068115,
2179
+ "learning_rate": 9.243455497382199e-08,
2180
+ "loss": 0.3441,
2181
+ "step": 7725
2182
+ },
2183
+ {
2184
+ "epoch": 0.08,
2185
+ "grad_norm": 5.447077751159668,
2186
+ "learning_rate": 9.240837696335077e-08,
2187
+ "loss": 0.3368,
2188
+ "step": 7750
2189
+ },
2190
+ {
2191
+ "epoch": 0.08,
2192
+ "grad_norm": 5.604344844818115,
2193
+ "learning_rate": 9.238219895287958e-08,
2194
+ "loss": 0.3357,
2195
+ "step": 7775
2196
+ },
2197
+ {
2198
+ "epoch": 0.08,
2199
+ "grad_norm": 6.193871021270752,
2200
+ "learning_rate": 9.235602094240837e-08,
2201
+ "loss": 0.3841,
2202
+ "step": 7800
2203
+ },
2204
+ {
2205
+ "epoch": 0.08,
2206
+ "grad_norm": 5.70228910446167,
2207
+ "learning_rate": 9.232984293193718e-08,
2208
+ "loss": 0.3991,
2209
+ "step": 7825
2210
+ },
2211
+ {
2212
+ "epoch": 0.08,
2213
+ "grad_norm": 6.8992743492126465,
2214
+ "learning_rate": 9.230366492146596e-08,
2215
+ "loss": 0.4435,
2216
+ "step": 7850
2217
+ },
2218
+ {
2219
+ "epoch": 0.08,
2220
+ "grad_norm": 7.393523693084717,
2221
+ "learning_rate": 9.227748691099476e-08,
2222
+ "loss": 0.4094,
2223
+ "step": 7875
2224
+ },
2225
+ {
2226
+ "epoch": 0.08,
2227
+ "grad_norm": 5.266127586364746,
2228
+ "learning_rate": 9.225130890052356e-08,
2229
+ "loss": 0.3806,
2230
+ "step": 7900
2231
+ },
2232
+ {
2233
+ "epoch": 0.08,
2234
+ "grad_norm": 5.960921287536621,
2235
+ "learning_rate": 9.222513089005235e-08,
2236
+ "loss": 0.3749,
2237
+ "step": 7925
2238
+ },
2239
+ {
2240
+ "epoch": 0.08,
2241
+ "grad_norm": 6.215056896209717,
2242
+ "learning_rate": 9.219895287958114e-08,
2243
+ "loss": 0.3956,
2244
+ "step": 7950
2245
+ },
2246
+ {
2247
+ "epoch": 0.08,
2248
+ "grad_norm": 4.992290019989014,
2249
+ "learning_rate": 9.217277486910995e-08,
2250
+ "loss": 0.414,
2251
+ "step": 7975
2252
+ },
2253
+ {
2254
+ "epoch": 0.08,
2255
+ "grad_norm": 5.627460479736328,
2256
+ "learning_rate": 9.214659685863874e-08,
2257
+ "loss": 0.4508,
2258
+ "step": 8000
2259
+ },
2260
+ {
2261
+ "epoch": 0.08,
2262
+ "grad_norm": 7.53002405166626,
2263
+ "learning_rate": 9.212041884816754e-08,
2264
+ "loss": 0.4771,
2265
+ "step": 8025
2266
+ },
2267
+ {
2268
+ "epoch": 0.08,
2269
+ "grad_norm": 6.5475172996521,
2270
+ "learning_rate": 9.209424083769633e-08,
2271
+ "loss": 0.4636,
2272
+ "step": 8050
2273
+ },
2274
+ {
2275
+ "epoch": 0.08,
2276
+ "grad_norm": 6.499009132385254,
2277
+ "learning_rate": 9.206806282722512e-08,
2278
+ "loss": 0.5024,
2279
+ "step": 8075
2280
+ },
2281
+ {
2282
+ "epoch": 0.08,
2283
+ "grad_norm": 5.928787708282471,
2284
+ "learning_rate": 9.204188481675393e-08,
2285
+ "loss": 0.482,
2286
+ "step": 8100
2287
+ },
2288
+ {
2289
+ "epoch": 0.08,
2290
+ "grad_norm": 6.647201061248779,
2291
+ "learning_rate": 9.201570680628272e-08,
2292
+ "loss": 0.4901,
2293
+ "step": 8125
2294
+ },
2295
+ {
2296
+ "epoch": 0.08,
2297
+ "grad_norm": 7.4282355308532715,
2298
+ "learning_rate": 9.198952879581152e-08,
2299
+ "loss": 0.509,
2300
+ "step": 8150
2301
+ },
2302
+ {
2303
+ "epoch": 0.09,
2304
+ "grad_norm": 8.04277229309082,
2305
+ "learning_rate": 9.196335078534031e-08,
2306
+ "loss": 0.5403,
2307
+ "step": 8175
2308
+ },
2309
+ {
2310
+ "epoch": 0.09,
2311
+ "grad_norm": 8.562540054321289,
2312
+ "learning_rate": 9.193717277486911e-08,
2313
+ "loss": 0.5798,
2314
+ "step": 8200
2315
+ },
2316
+ {
2317
+ "epoch": 0.09,
2318
+ "eval_loss": 0.7510205507278442,
2319
+ "eval_runtime": 275.1584,
2320
+ "eval_samples_per_second": 9.827,
2321
+ "eval_steps_per_second": 1.228,
2322
+ "eval_wer": 37.65067359962184,
2323
+ "step": 8222
2324
+ }
2325
+ ],
2326
+ "logging_steps": 25,
2327
+ "max_steps": 96000,
2328
+ "num_input_tokens_seen": 0,
2329
+ "num_train_epochs": 9223372036854775807,
2330
+ "save_steps": 4111,
2331
+ "total_flos": 3.23866357530624e+18,
2332
+ "train_batch_size": 16,
2333
+ "trial_name": null,
2334
+ "trial_params": null
2335
+ }
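The trainer_state.json hunk above is the Trainer's running log: a "loss"/"grad_norm"/"learning_rate" entry every 25 optimizer steps (matching "logging_steps": 25) and, at step 8222, the checkpoint evaluation (eval_loss ≈ 0.751, eval_wer ≈ 37.65). Below is a minimal sketch of how those numbers can be read back out of the file; it assumes a local copy of trainer_state.json and the usual 🤗 Trainer layout in which the entries shown above sit under a top-level "log_history" key — neither the path nor that key name appears in the diff itself.

# Sketch (not part of the commit): inspect the log recorded in trainer_state.json.
# Assumes the entries above live under the Trainer's standard "log_history" key.
import json

with open("trainer_state.json", encoding="utf-8") as f:
    state = json.load(f)

history = state["log_history"]
train_logs = [e for e in history if "loss" in e]      # per-25-step training entries
eval_logs = [e for e in history if "eval_wer" in e]   # evaluation entries

print(f"{len(train_logs)} training points, {len(eval_logs)} evaluation points")
for e in eval_logs:
    print(f"step {e['step']}: eval_loss={e['eval_loss']:.3f}, eval_wer={e['eval_wer']:.2f}")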
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6ebe3f622be8c665bd9dad6021d079995de752a1259b99e6f88264273cc97b0c
+ oid sha256:ef16ea603d5237e32e103e4c50b54a70b01579c70603e7736e8f7cb016d8ef09
  size 5048
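The training_args.bin change is just a Git LFS pointer swap: only the sha256 of the serialized TrainingArguments changes, while the size stays 5048 bytes. For reference, a hedged sketch of how such a file is commonly inspected locally — this assumes the file was written by the 🤗 Trainer via torch.save and that transformers is installed so the pickled class can be resolved; torch.load's weights_only flag is set explicitly because recent PyTorch versions reject arbitrary pickles by default, so only do this for files you trust.

# Sketch (not part of the commit): peek at the serialized TrainingArguments.
import torch
import transformers  # noqa: F401 -- the pickle references transformers classes

args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.per_device_train_batch_size, args.max_steps)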