wangrongsheng commited on
Commit
0f16f1e
·
1 Parent(s): 2792134
README.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: bitsandbytes
9
+ - load_in_8bit: False
10
+ - load_in_4bit: True
11
+ - llm_int8_threshold: 6.0
12
+ - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: False
14
+ - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: nf4
16
+ - bnb_4bit_use_double_quant: True
17
+ - bnb_4bit_compute_dtype: float16
18
+ ### Framework versions
19
+
20
+
21
+ - PEFT 0.4.0
adapter_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "./Qwen-14B-Chat",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 32.0,
11
+ "lora_dropout": 0.1,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 8,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "c_attn"
18
+ ],
19
+ "task_type": "CAUSAL_LM"
20
+ }
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f8bc51e048df5ba34d3ae72bebbaa86dd4da5a06de34a514834d9e5d6c6c10c
3
+ size 26242657
all_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "train_loss": 1.2461530792912217,
4
+ "train_runtime": 30560.3748,
5
+ "train_samples_per_second": 4.517,
6
+ "train_steps_per_second": 0.071
7
+ }
checkpoint-1000/README.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: bitsandbytes
9
+ - load_in_8bit: False
10
+ - load_in_4bit: True
11
+ - llm_int8_threshold: 6.0
12
+ - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: False
14
+ - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: nf4
16
+ - bnb_4bit_use_double_quant: True
17
+ - bnb_4bit_compute_dtype: float16
18
+ ### Framework versions
19
+
20
+
21
+ - PEFT 0.4.0
checkpoint-1000/adapter_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "./Qwen-14B-Chat",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 32.0,
11
+ "lora_dropout": 0.1,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 8,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "c_attn"
18
+ ],
19
+ "task_type": "CAUSAL_LM"
20
+ }
checkpoint-1000/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:829bd89f4173976589d1b69f5c878ab86c2d7b9e825d7f36acc3767fd7fbed3d
3
+ size 26242657
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2e1f2e79f1fd8938b95d2f5ac67a77bdef04659b59547d26bfaa6e5e0be354d
3
+ size 52496005
checkpoint-1000/qwen.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28242f711bbe2bade0d1b941200359466639a388ffaa5ae87f80e2b210be5ef4
3
+ size 18679
checkpoint-1000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53a4add8376116454f4f747521df1d370a365bad8ac691065ced3f2dc972dd8a
3
+ size 18679
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3cf3e0db7ad9e9ba5dd577b3481b4ddbb58c5600860f207a1a04a3a4ee97a42
3
+ size 627
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_end|>"
4
+ ],
5
+ "eos_token": "<|endoftext|>",
6
+ "pad_token": "<|endoftext|>"
7
+ }
checkpoint-1000/tokenization_qwen.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """Tokenization classes for QWen."""
7
+
8
+ import base64
9
+ import logging
10
+ import os
11
+ import unicodedata
12
+ from typing import Collection, Dict, List, Set, Tuple, Union
13
+
14
+ import tiktoken
15
+ from transformers import PreTrainedTokenizer, AddedToken
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
# Names of the vocabulary files bundled with the tokenizer.
VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}

# Pre-tokenization split pattern handed to tiktoken (uses \p{...} classes,
# which tiktoken's own regex engine understands).
PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""

# Surface forms of the control tokens.
ENDOFTEXT = "<|endoftext|>"
IMSTART = "<|im_start|>"
IMEND = "<|im_end|>"
# as the default behavior is changed to allow special tokens in
# regular texts, the surface forms of special tokens need to be
# as different as possible to minimize the impact
EXTRAS = tuple(f"<|extra_{i}|>" for i in range(205))
# ENDOFTEXT comes first, so it receives the lowest special-token id.
SPECIAL_TOKENS = (ENDOFTEXT, IMSTART, IMEND, *EXTRAS)
35
+
36
+
37
+ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
38
+ with open(tiktoken_bpe_file, "rb") as f:
39
+ contents = f.read()
40
+ return {
41
+ base64.b64decode(token): int(rank)
42
+ for token, rank in (line.split() for line in contents.splitlines() if line)
43
+ }
44
+
45
class QWenTokenizer(PreTrainedTokenizer):
    """QWen tokenizer backed by a tiktoken BPE encoding.

    The regular vocabulary is loaded from a base64-encoded rank file
    (``qwen.tiktoken``); the special tokens declared in ``SPECIAL_TOKENS``
    are appended directly after the regular BPE ranks, so every special-token
    id is >= ``eod_id`` (the id of ``<|endoftext|>``, which comes first).
    """

    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        errors="replace",
        **kwargs,
    ):
        """Build the tokenizer.

        Args:
            vocab_file: path to the tiktoken BPE rank file.
            errors: how byte-decoding errors are handled when converting
                token bytes back into text (passed to ``bytes.decode``).
        """
        super().__init__(**kwargs)

        self.errors = errors  # how to handle errors in decoding

        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
        # Special tokens get the ids immediately after the BPE ranks.
        self.special_tokens = {
            token: index
            for index, token in enumerate(
                SPECIAL_TOKENS, start=len(self.mergeable_ranks)
            )
        }

        enc = tiktoken.Encoding(
            "Qwen",
            pat_str=PAT_STR,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        # Sanity check: the encoding must cover exactly ranks + specials.
        assert (
            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"

        # id -> surface form (bytes for regular tokens, str for special ones).
        self.decoder = {
            v: k for k, v in self.mergeable_ranks.items()
        }  # type: dict[int, bytes|str]
        self.decoder.update({v: k for k, v in self.special_tokens.items()})

        self.tokenizer = enc  # type: tiktoken.Encoding

        self.eod_id = self.tokenizer.eot_token
        self.im_start_id = self.special_tokens[IMSTART]
        self.im_end_id = self.special_tokens[IMEND]

    def __getstate__(self):
        # The tiktoken Encoding is not picklable; drop it here and rebuild
        # it in __setstate__ from the ranks/specials that survive pickling.
        state = self.__dict__.copy()
        del state['tokenizer']
        return state

    def __setstate__(self, state):
        # tokenizer is not python native; don't pass it; rebuild it
        self.__dict__.update(state)
        enc = tiktoken.Encoding(
            "Qwen",
            pat_str=PAT_STR,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        self.tokenizer = enc


    def __len__(self) -> int:
        """Total vocabulary size, special tokens included."""
        return self.tokenizer.n_vocab

    def get_vocab(self) -> Dict[bytes, int]:
        # NOTE: returns only the regular BPE ranks; special tokens are not
        # included (they live in ``self.special_tokens``).
        return self.mergeable_ranks

    def convert_tokens_to_ids(
        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
    ) -> Union[int, List[int]]:
        """Map token surface form(s) to id(s).

        A single ``str``/``bytes`` input returns a single id (``None`` if the
        token is unknown); a list input returns a list of ids.
        """
        ids = []
        if isinstance(tokens, (str, bytes)):
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            else:
                return self.mergeable_ranks.get(tokens)
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.mergeable_ranks.get(token))
        return ids

    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        """Reject any token that is not one of the predefined special tokens.

        The tiktoken vocabulary is fixed, so nothing is ever actually added;
        known special tokens are accepted (as a no-op) and 0 is returned.
        """
        if not special_tokens and new_tokens:
            raise ValueError('Adding regular tokens is not supported')
        for token in new_tokens:
            surface_form = token.content if isinstance(token, AddedToken) else token
            if surface_form not in SPECIAL_TOKENS:
                raise ValueError('Adding unknown special tokens is not supported')
        return 0

    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
        """
        Save only the vocabulary of the tokenizer (the base64 BPE rank file).

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        file_path = os.path.join(save_directory, "qwen.tiktoken")
        with open(file_path, "w", encoding="utf8") as w:
            for k, v in self.mergeable_ranks.items():
                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
                w.write(line)
        return (file_path,)

    def tokenize(
        self,
        text: str,
        allowed_special: Union[Set, str] = "all",
        disallowed_special: Union[Collection, str] = (),
        **kwargs,
    ) -> List[Union[bytes, str]]:
        """
        Converts a string in a sequence of tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            allowed_special (`Literal["all"]` or `set`):
                The surface forms of the tokens to be encoded as special tokens in regular texts.
                Default to "all".
            disallowed_special (`Literal["all"]` or `Collection`):
                The surface forms of the tokens that should not be in regular texts and trigger errors.
                Default to an empty tuple.

            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific encode method.

        Returns:
            `List[bytes|str]`: The list of tokens.
        """
        tokens = []
        text = unicodedata.normalize("NFC", text)

        # this implementation takes a detour: text -> token id -> token surface forms
        for t in self.tokenizer.encode(
            text, allowed_special=allowed_special, disallowed_special=disallowed_special
        ):
            tokens.append(self.decoder[t])
        return tokens

    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
        """
        Converts a sequence of tokens in a single string.

        Consecutive ``bytes`` tokens are accumulated and decoded together so
        that multi-token UTF-8 sequences are reassembled correctly; ``str``
        tokens (special tokens) are appended as-is.
        """
        text = ""
        temp = b""
        for t in tokens:
            if isinstance(t, str):
                if temp:
                    text += temp.decode("utf-8", errors=self.errors)
                    temp = b""
                text += t
            elif isinstance(t, bytes):
                temp += t
            else:
                # BUGFIX: the message previously read "of type types or str".
                raise TypeError("token should only be of type bytes or str")
        if temp:
            text += temp.decode("utf-8", errors=self.errors)
        return text

    @property
    def vocab_size(self):
        """Total vocabulary size, special tokens included."""
        return self.tokenizer.n_vocab

    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
        """Converts an id to a token, special tokens included"""
        if index in self.decoder:
            return self.decoder[index]
        raise ValueError("unknown ids")

    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
        """Converts a token to an id using the vocab, special tokens included"""
        if token in self.special_tokens:
            return self.special_tokens[token]
        if token in self.mergeable_ranks:
            return self.mergeable_ranks[token]
        raise ValueError("unknown token")

    def _tokenize(self, text: str, **kwargs):
        """
        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

        Do NOT take care of added tokens.

        Unused here: ``tokenize`` goes straight through tiktoken instead.
        """
        raise NotImplementedError

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        errors: str = None,
        **kwargs,
    ) -> str:
        """Decode id(s) back to text.

        ``skip_special_tokens`` drops every id >= ``eod_id``, which covers all
        special tokens because they are numbered after the regular ranks.
        """
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [i for i in token_ids if i < self.eod_id]
        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenization_qwen.QWenTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "clean_up_tokenization_spaces": true,
9
+ "model_max_length": 8192,
10
+ "padding_side": "right",
11
+ "tokenizer_class": "QWenTokenizer"
12
+ }
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,619 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9272137227630969,
5
+ "eval_steps": 500,
6
+ "global_step": 1000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01,
13
+ "learning_rate": 4.999734597774032e-05,
14
+ "loss": 1.6199,
15
+ "step": 10
16
+ },
17
+ {
18
+ "epoch": 0.02,
19
+ "learning_rate": 4.998938447446803e-05,
20
+ "loss": 1.5062,
21
+ "step": 20
22
+ },
23
+ {
24
+ "epoch": 0.03,
25
+ "learning_rate": 4.997611718058365e-05,
26
+ "loss": 1.4138,
27
+ "step": 30
28
+ },
29
+ {
30
+ "epoch": 0.04,
31
+ "learning_rate": 4.9957546913022665e-05,
32
+ "loss": 1.369,
33
+ "step": 40
34
+ },
35
+ {
36
+ "epoch": 0.05,
37
+ "learning_rate": 4.993367761465736e-05,
38
+ "loss": 1.3408,
39
+ "step": 50
40
+ },
41
+ {
42
+ "epoch": 0.06,
43
+ "learning_rate": 4.9904514353459654e-05,
44
+ "loss": 1.3321,
45
+ "step": 60
46
+ },
47
+ {
48
+ "epoch": 0.06,
49
+ "learning_rate": 4.9870063321425105e-05,
50
+ "loss": 1.3251,
51
+ "step": 70
52
+ },
53
+ {
54
+ "epoch": 0.07,
55
+ "learning_rate": 4.983033183325818e-05,
56
+ "loss": 1.3228,
57
+ "step": 80
58
+ },
59
+ {
60
+ "epoch": 0.08,
61
+ "learning_rate": 4.97853283248192e-05,
62
+ "loss": 1.3111,
63
+ "step": 90
64
+ },
65
+ {
66
+ "epoch": 0.09,
67
+ "learning_rate": 4.973506235133323e-05,
68
+ "loss": 1.3013,
69
+ "step": 100
70
+ },
71
+ {
72
+ "epoch": 0.1,
73
+ "learning_rate": 4.967954458536126e-05,
74
+ "loss": 1.3004,
75
+ "step": 110
76
+ },
77
+ {
78
+ "epoch": 0.11,
79
+ "learning_rate": 4.9618786814534226e-05,
80
+ "loss": 1.2959,
81
+ "step": 120
82
+ },
83
+ {
84
+ "epoch": 0.12,
85
+ "learning_rate": 4.955280193905022e-05,
86
+ "loss": 1.2969,
87
+ "step": 130
88
+ },
89
+ {
90
+ "epoch": 0.13,
91
+ "learning_rate": 4.948160396893553e-05,
92
+ "loss": 1.2879,
93
+ "step": 140
94
+ },
95
+ {
96
+ "epoch": 0.14,
97
+ "learning_rate": 4.9405208021069946e-05,
98
+ "loss": 1.277,
99
+ "step": 150
100
+ },
101
+ {
102
+ "epoch": 0.15,
103
+ "learning_rate": 4.9323630315977156e-05,
104
+ "loss": 1.283,
105
+ "step": 160
106
+ },
107
+ {
108
+ "epoch": 0.16,
109
+ "learning_rate": 4.9236888174380784e-05,
110
+ "loss": 1.288,
111
+ "step": 170
112
+ },
113
+ {
114
+ "epoch": 0.17,
115
+ "learning_rate": 4.91450000135268e-05,
116
+ "loss": 1.287,
117
+ "step": 180
118
+ },
119
+ {
120
+ "epoch": 0.18,
121
+ "learning_rate": 4.9047985343273154e-05,
122
+ "loss": 1.2726,
123
+ "step": 190
124
+ },
125
+ {
126
+ "epoch": 0.19,
127
+ "learning_rate": 4.894586476194739e-05,
128
+ "loss": 1.2808,
129
+ "step": 200
130
+ },
131
+ {
132
+ "epoch": 0.19,
133
+ "learning_rate": 4.883865995197319e-05,
134
+ "loss": 1.2657,
135
+ "step": 210
136
+ },
137
+ {
138
+ "epoch": 0.2,
139
+ "learning_rate": 4.8726393675266716e-05,
140
+ "loss": 1.275,
141
+ "step": 220
142
+ },
143
+ {
144
+ "epoch": 0.21,
145
+ "learning_rate": 4.860908976840376e-05,
146
+ "loss": 1.2667,
147
+ "step": 230
148
+ },
149
+ {
150
+ "epoch": 0.22,
151
+ "learning_rate": 4.848677313755872e-05,
152
+ "loss": 1.2715,
153
+ "step": 240
154
+ },
155
+ {
156
+ "epoch": 0.23,
157
+ "learning_rate": 4.835946975321647e-05,
158
+ "loss": 1.273,
159
+ "step": 250
160
+ },
161
+ {
162
+ "epoch": 0.24,
163
+ "learning_rate": 4.822720664465827e-05,
164
+ "loss": 1.2605,
165
+ "step": 260
166
+ },
167
+ {
168
+ "epoch": 0.25,
169
+ "learning_rate": 4.809001189422287e-05,
170
+ "loss": 1.2766,
171
+ "step": 270
172
+ },
173
+ {
174
+ "epoch": 0.26,
175
+ "learning_rate": 4.794791463134399e-05,
176
+ "loss": 1.262,
177
+ "step": 280
178
+ },
179
+ {
180
+ "epoch": 0.27,
181
+ "learning_rate": 4.780094502636552e-05,
182
+ "loss": 1.255,
183
+ "step": 290
184
+ },
185
+ {
186
+ "epoch": 0.28,
187
+ "learning_rate": 4.764913428413572e-05,
188
+ "loss": 1.2652,
189
+ "step": 300
190
+ },
191
+ {
192
+ "epoch": 0.29,
193
+ "learning_rate": 4.7492514637381727e-05,
194
+ "loss": 1.2668,
195
+ "step": 310
196
+ },
197
+ {
198
+ "epoch": 0.3,
199
+ "learning_rate": 4.733111933986583e-05,
200
+ "loss": 1.2621,
201
+ "step": 320
202
+ },
203
+ {
204
+ "epoch": 0.31,
205
+ "learning_rate": 4.716498265932501e-05,
206
+ "loss": 1.257,
207
+ "step": 330
208
+ },
209
+ {
210
+ "epoch": 0.32,
211
+ "learning_rate": 4.699413987019512e-05,
212
+ "loss": 1.2789,
213
+ "step": 340
214
+ },
215
+ {
216
+ "epoch": 0.32,
217
+ "learning_rate": 4.681862724612141e-05,
218
+ "loss": 1.2634,
219
+ "step": 350
220
+ },
221
+ {
222
+ "epoch": 0.33,
223
+ "learning_rate": 4.663848205225674e-05,
224
+ "loss": 1.2594,
225
+ "step": 360
226
+ },
227
+ {
228
+ "epoch": 0.34,
229
+ "learning_rate": 4.645374253734949e-05,
230
+ "loss": 1.26,
231
+ "step": 370
232
+ },
233
+ {
234
+ "epoch": 0.35,
235
+ "learning_rate": 4.626444792562244e-05,
236
+ "loss": 1.2514,
237
+ "step": 380
238
+ },
239
+ {
240
+ "epoch": 0.36,
241
+ "learning_rate": 4.607063840844463e-05,
242
+ "loss": 1.2506,
243
+ "step": 390
244
+ },
245
+ {
246
+ "epoch": 0.37,
247
+ "learning_rate": 4.587235513579791e-05,
248
+ "loss": 1.2648,
249
+ "step": 400
250
+ },
251
+ {
252
+ "epoch": 0.38,
253
+ "learning_rate": 4.5669640207539786e-05,
254
+ "loss": 1.2511,
255
+ "step": 410
256
+ },
257
+ {
258
+ "epoch": 0.39,
259
+ "learning_rate": 4.546253666446484e-05,
260
+ "loss": 1.2594,
261
+ "step": 420
262
+ },
263
+ {
264
+ "epoch": 0.4,
265
+ "learning_rate": 4.525108847916614e-05,
266
+ "loss": 1.2608,
267
+ "step": 430
268
+ },
269
+ {
270
+ "epoch": 0.41,
271
+ "learning_rate": 4.503534054669892e-05,
272
+ "loss": 1.2597,
273
+ "step": 440
274
+ },
275
+ {
276
+ "epoch": 0.42,
277
+ "learning_rate": 4.481533867504841e-05,
278
+ "loss": 1.2609,
279
+ "step": 450
280
+ },
281
+ {
282
+ "epoch": 0.43,
283
+ "learning_rate": 4.4591129575403765e-05,
284
+ "loss": 1.2505,
285
+ "step": 460
286
+ },
287
+ {
288
+ "epoch": 0.44,
289
+ "learning_rate": 4.43627608522403e-05,
290
+ "loss": 1.2481,
291
+ "step": 470
292
+ },
293
+ {
294
+ "epoch": 0.45,
295
+ "learning_rate": 4.4130280993211974e-05,
296
+ "loss": 1.2612,
297
+ "step": 480
298
+ },
299
+ {
300
+ "epoch": 0.45,
301
+ "learning_rate": 4.389373935885646e-05,
302
+ "loss": 1.2504,
303
+ "step": 490
304
+ },
305
+ {
306
+ "epoch": 0.46,
307
+ "learning_rate": 4.365318617211479e-05,
308
+ "loss": 1.2518,
309
+ "step": 500
310
+ },
311
+ {
312
+ "epoch": 0.47,
313
+ "learning_rate": 4.340867250766794e-05,
314
+ "loss": 1.2458,
315
+ "step": 510
316
+ },
317
+ {
318
+ "epoch": 0.48,
319
+ "learning_rate": 4.316025028109258e-05,
320
+ "loss": 1.2345,
321
+ "step": 520
322
+ },
323
+ {
324
+ "epoch": 0.49,
325
+ "learning_rate": 4.2907972237838225e-05,
326
+ "loss": 1.2521,
327
+ "step": 530
328
+ },
329
+ {
330
+ "epoch": 0.5,
331
+ "learning_rate": 4.2651891942028274e-05,
332
+ "loss": 1.2528,
333
+ "step": 540
334
+ },
335
+ {
336
+ "epoch": 0.51,
337
+ "learning_rate": 4.239206376508717e-05,
338
+ "loss": 1.2462,
339
+ "step": 550
340
+ },
341
+ {
342
+ "epoch": 0.52,
343
+ "learning_rate": 4.212854287419611e-05,
344
+ "loss": 1.2401,
345
+ "step": 560
346
+ },
347
+ {
348
+ "epoch": 0.53,
349
+ "learning_rate": 4.1861385220579934e-05,
350
+ "loss": 1.2496,
351
+ "step": 570
352
+ },
353
+ {
354
+ "epoch": 0.54,
355
+ "learning_rate": 4.1590647527627404e-05,
356
+ "loss": 1.2522,
357
+ "step": 580
358
+ },
359
+ {
360
+ "epoch": 0.55,
361
+ "learning_rate": 4.131638727884762e-05,
362
+ "loss": 1.2377,
363
+ "step": 590
364
+ },
365
+ {
366
+ "epoch": 0.56,
367
+ "learning_rate": 4.103866270566498e-05,
368
+ "loss": 1.2467,
369
+ "step": 600
370
+ },
371
+ {
372
+ "epoch": 0.57,
373
+ "learning_rate": 4.075753277505544e-05,
374
+ "loss": 1.2421,
375
+ "step": 610
376
+ },
377
+ {
378
+ "epoch": 0.57,
379
+ "learning_rate": 4.0473057177026484e-05,
380
+ "loss": 1.2455,
381
+ "step": 620
382
+ },
383
+ {
384
+ "epoch": 0.58,
385
+ "learning_rate": 4.018529631194369e-05,
386
+ "loss": 1.2294,
387
+ "step": 630
388
+ },
389
+ {
390
+ "epoch": 0.59,
391
+ "learning_rate": 3.989431127770635e-05,
392
+ "loss": 1.2509,
393
+ "step": 640
394
+ },
395
+ {
396
+ "epoch": 0.6,
397
+ "learning_rate": 3.960016385677513e-05,
398
+ "loss": 1.2354,
399
+ "step": 650
400
+ },
401
+ {
402
+ "epoch": 0.61,
403
+ "learning_rate": 3.9302916503054246e-05,
404
+ "loss": 1.2318,
405
+ "step": 660
406
+ },
407
+ {
408
+ "epoch": 0.62,
409
+ "learning_rate": 3.9002632328631164e-05,
410
+ "loss": 1.2376,
411
+ "step": 670
412
+ },
413
+ {
414
+ "epoch": 0.63,
415
+ "learning_rate": 3.8699375090376534e-05,
416
+ "loss": 1.2412,
417
+ "step": 680
418
+ },
419
+ {
420
+ "epoch": 0.64,
421
+ "learning_rate": 3.8393209176407223e-05,
422
+ "loss": 1.2479,
423
+ "step": 690
424
+ },
425
+ {
426
+ "epoch": 0.65,
427
+ "learning_rate": 3.8084199592415305e-05,
428
+ "loss": 1.2431,
429
+ "step": 700
430
+ },
431
+ {
432
+ "epoch": 0.66,
433
+ "learning_rate": 3.777241194786591e-05,
434
+ "loss": 1.245,
435
+ "step": 710
436
+ },
437
+ {
438
+ "epoch": 0.67,
439
+ "learning_rate": 3.745791244206697e-05,
440
+ "loss": 1.2393,
441
+ "step": 720
442
+ },
443
+ {
444
+ "epoch": 0.68,
445
+ "learning_rate": 3.714076785011359e-05,
446
+ "loss": 1.2473,
447
+ "step": 730
448
+ },
449
+ {
450
+ "epoch": 0.69,
451
+ "learning_rate": 3.682104550871031e-05,
452
+ "loss": 1.2552,
453
+ "step": 740
454
+ },
455
+ {
456
+ "epoch": 0.7,
457
+ "learning_rate": 3.649881330187401e-05,
458
+ "loss": 1.2356,
459
+ "step": 750
460
+ },
461
+ {
462
+ "epoch": 0.7,
463
+ "learning_rate": 3.617413964652067e-05,
464
+ "loss": 1.2442,
465
+ "step": 760
466
+ },
467
+ {
468
+ "epoch": 0.71,
469
+ "learning_rate": 3.5847093477938956e-05,
470
+ "loss": 1.2605,
471
+ "step": 770
472
+ },
473
+ {
474
+ "epoch": 0.72,
475
+ "learning_rate": 3.551774423515378e-05,
476
+ "loss": 1.2364,
477
+ "step": 780
478
+ },
479
+ {
480
+ "epoch": 0.73,
481
+ "learning_rate": 3.518616184618288e-05,
482
+ "loss": 1.2482,
483
+ "step": 790
484
+ },
485
+ {
486
+ "epoch": 0.74,
487
+ "learning_rate": 3.4852416713189526e-05,
488
+ "loss": 1.2327,
489
+ "step": 800
490
+ },
491
+ {
492
+ "epoch": 0.75,
493
+ "learning_rate": 3.4516579697534705e-05,
494
+ "loss": 1.2548,
495
+ "step": 810
496
+ },
497
+ {
498
+ "epoch": 0.76,
499
+ "learning_rate": 3.417872210473162e-05,
500
+ "loss": 1.2469,
501
+ "step": 820
502
+ },
503
+ {
504
+ "epoch": 0.77,
505
+ "learning_rate": 3.3838915669306034e-05,
506
+ "loss": 1.2424,
507
+ "step": 830
508
+ },
509
+ {
510
+ "epoch": 0.78,
511
+ "learning_rate": 3.349723253956542e-05,
512
+ "loss": 1.2438,
513
+ "step": 840
514
+ },
515
+ {
516
+ "epoch": 0.79,
517
+ "learning_rate": 3.315374526228036e-05,
518
+ "loss": 1.243,
519
+ "step": 850
520
+ },
521
+ {
522
+ "epoch": 0.8,
523
+ "learning_rate": 3.2808526767281225e-05,
524
+ "loss": 1.2416,
525
+ "step": 860
526
+ },
527
+ {
528
+ "epoch": 0.81,
529
+ "learning_rate": 3.246165035197364e-05,
530
+ "loss": 1.2399,
531
+ "step": 870
532
+ },
533
+ {
534
+ "epoch": 0.82,
535
+ "learning_rate": 3.211318966577581e-05,
536
+ "loss": 1.239,
537
+ "step": 880
538
+ },
539
+ {
540
+ "epoch": 0.83,
541
+ "learning_rate": 3.176321869448116e-05,
542
+ "loss": 1.2498,
543
+ "step": 890
544
+ },
545
+ {
546
+ "epoch": 0.83,
547
+ "learning_rate": 3.1411811744549536e-05,
548
+ "loss": 1.2438,
549
+ "step": 900
550
+ },
551
+ {
552
+ "epoch": 0.84,
553
+ "learning_rate": 3.105904342733032e-05,
554
+ "loss": 1.233,
555
+ "step": 910
556
+ },
557
+ {
558
+ "epoch": 0.85,
559
+ "learning_rate": 3.070498864322081e-05,
560
+ "loss": 1.2315,
561
+ "step": 920
562
+ },
563
+ {
564
+ "epoch": 0.86,
565
+ "learning_rate": 3.034972256576328e-05,
566
+ "loss": 1.232,
567
+ "step": 930
568
+ },
569
+ {
570
+ "epoch": 0.87,
571
+ "learning_rate": 2.999332062568395e-05,
572
+ "loss": 1.2467,
573
+ "step": 940
574
+ },
575
+ {
576
+ "epoch": 0.88,
577
+ "learning_rate": 2.9635858494877384e-05,
578
+ "loss": 1.2483,
579
+ "step": 950
580
+ },
581
+ {
582
+ "epoch": 0.89,
583
+ "learning_rate": 2.9277412070339782e-05,
584
+ "loss": 1.2377,
585
+ "step": 960
586
+ },
587
+ {
588
+ "epoch": 0.9,
589
+ "learning_rate": 2.891805745805429e-05,
590
+ "loss": 1.2347,
591
+ "step": 970
592
+ },
593
+ {
594
+ "epoch": 0.91,
595
+ "learning_rate": 2.8557870956832132e-05,
596
+ "loss": 1.2428,
597
+ "step": 980
598
+ },
599
+ {
600
+ "epoch": 0.92,
601
+ "learning_rate": 2.8196929042112652e-05,
602
+ "loss": 1.244,
603
+ "step": 990
604
+ },
605
+ {
606
+ "epoch": 0.93,
607
+ "learning_rate": 2.783530834972594e-05,
608
+ "loss": 1.2317,
609
+ "step": 1000
610
+ }
611
+ ],
612
+ "logging_steps": 10,
613
+ "max_steps": 2156,
614
+ "num_train_epochs": 2,
615
+ "save_steps": 1000,
616
+ "total_flos": 1.5703812576473252e+18,
617
+ "trial_name": null,
618
+ "trial_params": null
619
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1ca0e4372bd1720a48da585ce6946399a06d0c984dcfcf5a2b7104827c4603f
3
+ size 4219
checkpoint-2000/README.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: bitsandbytes
9
+ - load_in_8bit: False
10
+ - load_in_4bit: True
11
+ - llm_int8_threshold: 6.0
12
+ - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: False
14
+ - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: nf4
16
+ - bnb_4bit_use_double_quant: True
17
+ - bnb_4bit_compute_dtype: float16
18
+ ### Framework versions
19
+
20
+
21
+ - PEFT 0.4.0
checkpoint-2000/adapter_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "./Qwen-14B-Chat",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 32.0,
11
+ "lora_dropout": 0.1,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 8,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "c_attn"
18
+ ],
19
+ "task_type": "CAUSAL_LM"
20
+ }
checkpoint-2000/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd24a9b391571bbdc9f88c1c9433cdd04de9e12bc47315124781f1fbb40b768f
3
+ size 26242657
checkpoint-2000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:310608e62ebf23c5f12f1b89ef6037683c0938b0c464e55f1186e182461eaff9
3
+ size 52496005
checkpoint-2000/qwen.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:402d9b1bf0a32ecfe8861b4f22edf733a41b828d3d32764e5ccad98767ebbf20
3
+ size 18679
checkpoint-2000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c1cc36814e5cb2cf8d7e0bc2c24d4362c7a781368af8673695b07b0575ebaa8
3
+ size 18679
checkpoint-2000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd0b604a79d16282fd4dcd9803f6bbe0802175560169f2fdb3fe364043fcb56a
3
+ size 627
checkpoint-2000/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_end|>"
4
+ ],
5
+ "eos_token": "<|endoftext|>",
6
+ "pad_token": "<|endoftext|>"
7
+ }
checkpoint-2000/tokenization_qwen.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """Tokenization classes for QWen."""
7
+
8
+ import base64
9
+ import logging
10
+ import os
11
+ import unicodedata
12
+ from typing import Collection, Dict, List, Set, Tuple, Union
13
+
14
+ import tiktoken
15
+ from transformers import PreTrainedTokenizer, AddedToken
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
21
+
22
+ PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
23
+ ENDOFTEXT = "<|endoftext|>"
24
+ IMSTART = "<|im_start|>"
25
+ IMEND = "<|im_end|>"
26
+ # as the default behavior is changed to allow special tokens in
27
+ # regular texts, the surface forms of special tokens need to be
28
+ # as different as possible to minimize the impact
29
+ EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
30
+ SPECIAL_TOKENS = (
31
+ ENDOFTEXT,
32
+ IMSTART,
33
+ IMEND,
34
+ ) + EXTRAS
35
+
36
+
37
+ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
38
+ with open(tiktoken_bpe_file, "rb") as f:
39
+ contents = f.read()
40
+ return {
41
+ base64.b64decode(token): int(rank)
42
+ for token, rank in (line.split() for line in contents.splitlines() if line)
43
+ }
44
+
45
+ class QWenTokenizer(PreTrainedTokenizer):
46
+ """QWen tokenizer."""
47
+
48
+ vocab_files_names = VOCAB_FILES_NAMES
49
+
50
+ def __init__(
51
+ self,
52
+ vocab_file,
53
+ errors="replace",
54
+ **kwargs,
55
+ ):
56
+ super().__init__(**kwargs)
57
+
58
+ self.errors = errors # how to handle errors in decoding
59
+
60
+ self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int]
61
+ self.special_tokens = {
62
+ token: index
63
+ for index, token in enumerate(
64
+ SPECIAL_TOKENS, start=len(self.mergeable_ranks)
65
+ )
66
+ }
67
+
68
+ enc = tiktoken.Encoding(
69
+ "Qwen",
70
+ pat_str=PAT_STR,
71
+ mergeable_ranks=self.mergeable_ranks,
72
+ special_tokens=self.special_tokens,
73
+ )
74
+ assert (
75
+ len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
76
+ ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
77
+
78
+ self.decoder = {
79
+ v: k for k, v in self.mergeable_ranks.items()
80
+ } # type: dict[int, bytes|str]
81
+ self.decoder.update({v: k for k, v in self.special_tokens.items()})
82
+
83
+ self.tokenizer = enc # type: tiktoken.Encoding
84
+
85
+ self.eod_id = self.tokenizer.eot_token
86
+ self.im_start_id = self.special_tokens[IMSTART]
87
+ self.im_end_id = self.special_tokens[IMEND]
88
+
89
+ def __getstate__(self):
90
+ # for pickle lovers
91
+ state = self.__dict__.copy()
92
+ del state['tokenizer']
93
+ return state
94
+
95
+ def __setstate__(self, state):
96
+ # tokenizer is not python native; don't pass it; rebuild it
97
+ self.__dict__.update(state)
98
+ enc = tiktoken.Encoding(
99
+ "Qwen",
100
+ pat_str=PAT_STR,
101
+ mergeable_ranks=self.mergeable_ranks,
102
+ special_tokens=self.special_tokens,
103
+ )
104
+ self.tokenizer = enc
105
+
106
+
107
+ def __len__(self) -> int:
108
+ return self.tokenizer.n_vocab
109
+
110
+ def get_vocab(self) -> Dict[bytes, int]:
111
+ return self.mergeable_ranks
112
+
113
+ def convert_tokens_to_ids(
114
+ self, tokens: Union[bytes, str, List[Union[bytes, str]]]
115
+ ) -> List[int]:
116
+ ids = []
117
+ if isinstance(tokens, (str, bytes)):
118
+ if tokens in self.special_tokens:
119
+ return self.special_tokens[tokens]
120
+ else:
121
+ return self.mergeable_ranks.get(tokens)
122
+ for token in tokens:
123
+ if token in self.special_tokens:
124
+ ids.append(self.special_tokens[token])
125
+ else:
126
+ ids.append(self.mergeable_ranks.get(token))
127
+ return ids
128
+
129
+ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
130
+ if not special_tokens and new_tokens:
131
+ raise ValueError('Adding regular tokens is not supported')
132
+ for token in new_tokens:
133
+ surface_form = token.content if isinstance(token, AddedToken) else token
134
+ if surface_form not in SPECIAL_TOKENS:
135
+ raise ValueError('Adding unknown special tokens is not supported')
136
+ return 0
137
+
138
+ def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
139
+ """
140
+ Save only the vocabulary of the tokenizer (vocabulary).
141
+
142
+ Returns:
143
+ `Tuple(str)`: Paths to the files saved.
144
+ """
145
+ file_path = os.path.join(save_directory, "qwen.tiktoken")
146
+ with open(file_path, "w", encoding="utf8") as w:
147
+ for k, v in self.mergeable_ranks.items():
148
+ line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
149
+ w.write(line)
150
+ return (file_path,)
151
+
152
+ def tokenize(
153
+ self,
154
+ text: str,
155
+ allowed_special: Union[Set, str] = "all",
156
+ disallowed_special: Union[Collection, str] = (),
157
+ **kwargs,
158
+ ) -> List[Union[bytes, str]]:
159
+ """
160
+ Converts a string in a sequence of tokens.
161
+
162
+ Args:
163
+ text (`str`):
164
+ The sequence to be encoded.
165
+ allowed_special (`Literal["all"]` or `set`):
166
+ The surface forms of the tokens to be encoded as special tokens in regular texts.
167
+ Default to "all".
168
+ disallowed_special (`Literal["all"]` or `Collection`):
169
+ The surface forms of the tokens that should not be in regular texts and trigger errors.
170
+ Default to an empty tuple.
171
+
172
+ kwargs (additional keyword arguments, *optional*):
173
+ Will be passed to the underlying model specific encode method.
174
+
175
+ Returns:
176
+ `List[bytes|str]`: The list of tokens.
177
+ """
178
+ tokens = []
179
+ text = unicodedata.normalize("NFC", text)
180
+
181
+ # this implementation takes a detour: text -> token id -> token surface forms
182
+ for t in self.tokenizer.encode(
183
+ text, allowed_special=allowed_special, disallowed_special=disallowed_special
184
+ ):
185
+ tokens.append(self.decoder[t])
186
+ return tokens
187
+
188
+ def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
189
+ """
190
+ Converts a sequence of tokens in a single string.
191
+ """
192
+ text = ""
193
+ temp = b""
194
+ for t in tokens:
195
+ if isinstance(t, str):
196
+ if temp:
197
+ text += temp.decode("utf-8", errors=self.errors)
198
+ temp = b""
199
+ text += t
200
+ elif isinstance(t, bytes):
201
+ temp += t
202
+ else:
203
+ raise TypeError("token should only be of type types or str")
204
+ if temp:
205
+ text += temp.decode("utf-8", errors=self.errors)
206
+ return text
207
+
208
+ @property
209
+ def vocab_size(self):
210
+ return self.tokenizer.n_vocab
211
+
212
+ def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
213
+ """Converts an id to a token, special tokens included"""
214
+ if index in self.decoder:
215
+ return self.decoder[index]
216
+ raise ValueError("unknown ids")
217
+
218
+ def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
219
+ """Converts a token to an id using the vocab, special tokens included"""
220
+ if token in self.special_tokens:
221
+ return self.special_tokens[token]
222
+ if token in self.mergeable_ranks:
223
+ return self.mergeable_ranks[token]
224
+ raise ValueError("unknown token")
225
+
226
+ def _tokenize(self, text: str, **kwargs):
227
+ """
228
+ Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
229
+ vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
230
+
231
+ Do NOT take care of added tokens.
232
+ """
233
+ raise NotImplementedError
234
+
235
+ def _decode(
236
+ self,
237
+ token_ids: Union[int, List[int]],
238
+ skip_special_tokens: bool = False,
239
+ errors: str = None,
240
+ **kwargs,
241
+ ) -> str:
242
+ if isinstance(token_ids, int):
243
+ token_ids = [token_ids]
244
+ if skip_special_tokens:
245
+ token_ids = [i for i in token_ids if i < self.eod_id]
246
+ return self.tokenizer.decode(token_ids, errors=errors or self.errors)
checkpoint-2000/tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenization_qwen.QWenTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "clean_up_tokenization_spaces": true,
9
+ "model_max_length": 8192,
10
+ "padding_side": "right",
11
+ "tokenizer_class": "QWenTokenizer"
12
+ }
checkpoint-2000/trainer_state.json ADDED
@@ -0,0 +1,1219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.8544274455261938,
5
+ "eval_steps": 500,
6
+ "global_step": 2000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01,
13
+ "learning_rate": 4.999734597774032e-05,
14
+ "loss": 1.6199,
15
+ "step": 10
16
+ },
17
+ {
18
+ "epoch": 0.02,
19
+ "learning_rate": 4.998938447446803e-05,
20
+ "loss": 1.5062,
21
+ "step": 20
22
+ },
23
+ {
24
+ "epoch": 0.03,
25
+ "learning_rate": 4.997611718058365e-05,
26
+ "loss": 1.4138,
27
+ "step": 30
28
+ },
29
+ {
30
+ "epoch": 0.04,
31
+ "learning_rate": 4.9957546913022665e-05,
32
+ "loss": 1.369,
33
+ "step": 40
34
+ },
35
+ {
36
+ "epoch": 0.05,
37
+ "learning_rate": 4.993367761465736e-05,
38
+ "loss": 1.3408,
39
+ "step": 50
40
+ },
41
+ {
42
+ "epoch": 0.06,
43
+ "learning_rate": 4.9904514353459654e-05,
44
+ "loss": 1.3321,
45
+ "step": 60
46
+ },
47
+ {
48
+ "epoch": 0.06,
49
+ "learning_rate": 4.9870063321425105e-05,
50
+ "loss": 1.3251,
51
+ "step": 70
52
+ },
53
+ {
54
+ "epoch": 0.07,
55
+ "learning_rate": 4.983033183325818e-05,
56
+ "loss": 1.3228,
57
+ "step": 80
58
+ },
59
+ {
60
+ "epoch": 0.08,
61
+ "learning_rate": 4.97853283248192e-05,
62
+ "loss": 1.3111,
63
+ "step": 90
64
+ },
65
+ {
66
+ "epoch": 0.09,
67
+ "learning_rate": 4.973506235133323e-05,
68
+ "loss": 1.3013,
69
+ "step": 100
70
+ },
71
+ {
72
+ "epoch": 0.1,
73
+ "learning_rate": 4.967954458536126e-05,
74
+ "loss": 1.3004,
75
+ "step": 110
76
+ },
77
+ {
78
+ "epoch": 0.11,
79
+ "learning_rate": 4.9618786814534226e-05,
80
+ "loss": 1.2959,
81
+ "step": 120
82
+ },
83
+ {
84
+ "epoch": 0.12,
85
+ "learning_rate": 4.955280193905022e-05,
86
+ "loss": 1.2969,
87
+ "step": 130
88
+ },
89
+ {
90
+ "epoch": 0.13,
91
+ "learning_rate": 4.948160396893553e-05,
92
+ "loss": 1.2879,
93
+ "step": 140
94
+ },
95
+ {
96
+ "epoch": 0.14,
97
+ "learning_rate": 4.9405208021069946e-05,
98
+ "loss": 1.277,
99
+ "step": 150
100
+ },
101
+ {
102
+ "epoch": 0.15,
103
+ "learning_rate": 4.9323630315977156e-05,
104
+ "loss": 1.283,
105
+ "step": 160
106
+ },
107
+ {
108
+ "epoch": 0.16,
109
+ "learning_rate": 4.9236888174380784e-05,
110
+ "loss": 1.288,
111
+ "step": 170
112
+ },
113
+ {
114
+ "epoch": 0.17,
115
+ "learning_rate": 4.91450000135268e-05,
116
+ "loss": 1.287,
117
+ "step": 180
118
+ },
119
+ {
120
+ "epoch": 0.18,
121
+ "learning_rate": 4.9047985343273154e-05,
122
+ "loss": 1.2726,
123
+ "step": 190
124
+ },
125
+ {
126
+ "epoch": 0.19,
127
+ "learning_rate": 4.894586476194739e-05,
128
+ "loss": 1.2808,
129
+ "step": 200
130
+ },
131
+ {
132
+ "epoch": 0.19,
133
+ "learning_rate": 4.883865995197319e-05,
134
+ "loss": 1.2657,
135
+ "step": 210
136
+ },
137
+ {
138
+ "epoch": 0.2,
139
+ "learning_rate": 4.8726393675266716e-05,
140
+ "loss": 1.275,
141
+ "step": 220
142
+ },
143
+ {
144
+ "epoch": 0.21,
145
+ "learning_rate": 4.860908976840376e-05,
146
+ "loss": 1.2667,
147
+ "step": 230
148
+ },
149
+ {
150
+ "epoch": 0.22,
151
+ "learning_rate": 4.848677313755872e-05,
152
+ "loss": 1.2715,
153
+ "step": 240
154
+ },
155
+ {
156
+ "epoch": 0.23,
157
+ "learning_rate": 4.835946975321647e-05,
158
+ "loss": 1.273,
159
+ "step": 250
160
+ },
161
+ {
162
+ "epoch": 0.24,
163
+ "learning_rate": 4.822720664465827e-05,
164
+ "loss": 1.2605,
165
+ "step": 260
166
+ },
167
+ {
168
+ "epoch": 0.25,
169
+ "learning_rate": 4.809001189422287e-05,
170
+ "loss": 1.2766,
171
+ "step": 270
172
+ },
173
+ {
174
+ "epoch": 0.26,
175
+ "learning_rate": 4.794791463134399e-05,
176
+ "loss": 1.262,
177
+ "step": 280
178
+ },
179
+ {
180
+ "epoch": 0.27,
181
+ "learning_rate": 4.780094502636552e-05,
182
+ "loss": 1.255,
183
+ "step": 290
184
+ },
185
+ {
186
+ "epoch": 0.28,
187
+ "learning_rate": 4.764913428413572e-05,
188
+ "loss": 1.2652,
189
+ "step": 300
190
+ },
191
+ {
192
+ "epoch": 0.29,
193
+ "learning_rate": 4.7492514637381727e-05,
194
+ "loss": 1.2668,
195
+ "step": 310
196
+ },
197
+ {
198
+ "epoch": 0.3,
199
+ "learning_rate": 4.733111933986583e-05,
200
+ "loss": 1.2621,
201
+ "step": 320
202
+ },
203
+ {
204
+ "epoch": 0.31,
205
+ "learning_rate": 4.716498265932501e-05,
206
+ "loss": 1.257,
207
+ "step": 330
208
+ },
209
+ {
210
+ "epoch": 0.32,
211
+ "learning_rate": 4.699413987019512e-05,
212
+ "loss": 1.2789,
213
+ "step": 340
214
+ },
215
+ {
216
+ "epoch": 0.32,
217
+ "learning_rate": 4.681862724612141e-05,
218
+ "loss": 1.2634,
219
+ "step": 350
220
+ },
221
+ {
222
+ "epoch": 0.33,
223
+ "learning_rate": 4.663848205225674e-05,
224
+ "loss": 1.2594,
225
+ "step": 360
226
+ },
227
+ {
228
+ "epoch": 0.34,
229
+ "learning_rate": 4.645374253734949e-05,
230
+ "loss": 1.26,
231
+ "step": 370
232
+ },
233
+ {
234
+ "epoch": 0.35,
235
+ "learning_rate": 4.626444792562244e-05,
236
+ "loss": 1.2514,
237
+ "step": 380
238
+ },
239
+ {
240
+ "epoch": 0.36,
241
+ "learning_rate": 4.607063840844463e-05,
242
+ "loss": 1.2506,
243
+ "step": 390
244
+ },
245
+ {
246
+ "epoch": 0.37,
247
+ "learning_rate": 4.587235513579791e-05,
248
+ "loss": 1.2648,
249
+ "step": 400
250
+ },
251
+ {
252
+ "epoch": 0.38,
253
+ "learning_rate": 4.5669640207539786e-05,
254
+ "loss": 1.2511,
255
+ "step": 410
256
+ },
257
+ {
258
+ "epoch": 0.39,
259
+ "learning_rate": 4.546253666446484e-05,
260
+ "loss": 1.2594,
261
+ "step": 420
262
+ },
263
+ {
264
+ "epoch": 0.4,
265
+ "learning_rate": 4.525108847916614e-05,
266
+ "loss": 1.2608,
267
+ "step": 430
268
+ },
269
+ {
270
+ "epoch": 0.41,
271
+ "learning_rate": 4.503534054669892e-05,
272
+ "loss": 1.2597,
273
+ "step": 440
274
+ },
275
+ {
276
+ "epoch": 0.42,
277
+ "learning_rate": 4.481533867504841e-05,
278
+ "loss": 1.2609,
279
+ "step": 450
280
+ },
281
+ {
282
+ "epoch": 0.43,
283
+ "learning_rate": 4.4591129575403765e-05,
284
+ "loss": 1.2505,
285
+ "step": 460
286
+ },
287
+ {
288
+ "epoch": 0.44,
289
+ "learning_rate": 4.43627608522403e-05,
290
+ "loss": 1.2481,
291
+ "step": 470
292
+ },
293
+ {
294
+ "epoch": 0.45,
295
+ "learning_rate": 4.4130280993211974e-05,
296
+ "loss": 1.2612,
297
+ "step": 480
298
+ },
299
+ {
300
+ "epoch": 0.45,
301
+ "learning_rate": 4.389373935885646e-05,
302
+ "loss": 1.2504,
303
+ "step": 490
304
+ },
305
+ {
306
+ "epoch": 0.46,
307
+ "learning_rate": 4.365318617211479e-05,
308
+ "loss": 1.2518,
309
+ "step": 500
310
+ },
311
+ {
312
+ "epoch": 0.47,
313
+ "learning_rate": 4.340867250766794e-05,
314
+ "loss": 1.2458,
315
+ "step": 510
316
+ },
317
+ {
318
+ "epoch": 0.48,
319
+ "learning_rate": 4.316025028109258e-05,
320
+ "loss": 1.2345,
321
+ "step": 520
322
+ },
323
+ {
324
+ "epoch": 0.49,
325
+ "learning_rate": 4.2907972237838225e-05,
326
+ "loss": 1.2521,
327
+ "step": 530
328
+ },
329
+ {
330
+ "epoch": 0.5,
331
+ "learning_rate": 4.2651891942028274e-05,
332
+ "loss": 1.2528,
333
+ "step": 540
334
+ },
335
+ {
336
+ "epoch": 0.51,
337
+ "learning_rate": 4.239206376508717e-05,
338
+ "loss": 1.2462,
339
+ "step": 550
340
+ },
341
+ {
342
+ "epoch": 0.52,
343
+ "learning_rate": 4.212854287419611e-05,
344
+ "loss": 1.2401,
345
+ "step": 560
346
+ },
347
+ {
348
+ "epoch": 0.53,
349
+ "learning_rate": 4.1861385220579934e-05,
350
+ "loss": 1.2496,
351
+ "step": 570
352
+ },
353
+ {
354
+ "epoch": 0.54,
355
+ "learning_rate": 4.1590647527627404e-05,
356
+ "loss": 1.2522,
357
+ "step": 580
358
+ },
359
+ {
360
+ "epoch": 0.55,
361
+ "learning_rate": 4.131638727884762e-05,
362
+ "loss": 1.2377,
363
+ "step": 590
364
+ },
365
+ {
366
+ "epoch": 0.56,
367
+ "learning_rate": 4.103866270566498e-05,
368
+ "loss": 1.2467,
369
+ "step": 600
370
+ },
371
+ {
372
+ "epoch": 0.57,
373
+ "learning_rate": 4.075753277505544e-05,
374
+ "loss": 1.2421,
375
+ "step": 610
376
+ },
377
+ {
378
+ "epoch": 0.57,
379
+ "learning_rate": 4.0473057177026484e-05,
380
+ "loss": 1.2455,
381
+ "step": 620
382
+ },
383
+ {
384
+ "epoch": 0.58,
385
+ "learning_rate": 4.018529631194369e-05,
386
+ "loss": 1.2294,
387
+ "step": 630
388
+ },
389
+ {
390
+ "epoch": 0.59,
391
+ "learning_rate": 3.989431127770635e-05,
392
+ "loss": 1.2509,
393
+ "step": 640
394
+ },
395
+ {
396
+ "epoch": 0.6,
397
+ "learning_rate": 3.960016385677513e-05,
398
+ "loss": 1.2354,
399
+ "step": 650
400
+ },
401
+ {
402
+ "epoch": 0.61,
403
+ "learning_rate": 3.9302916503054246e-05,
404
+ "loss": 1.2318,
405
+ "step": 660
406
+ },
407
+ {
408
+ "epoch": 0.62,
409
+ "learning_rate": 3.9002632328631164e-05,
410
+ "loss": 1.2376,
411
+ "step": 670
412
+ },
413
+ {
414
+ "epoch": 0.63,
415
+ "learning_rate": 3.8699375090376534e-05,
416
+ "loss": 1.2412,
417
+ "step": 680
418
+ },
419
+ {
420
+ "epoch": 0.64,
421
+ "learning_rate": 3.8393209176407223e-05,
422
+ "loss": 1.2479,
423
+ "step": 690
424
+ },
425
+ {
426
+ "epoch": 0.65,
427
+ "learning_rate": 3.8084199592415305e-05,
428
+ "loss": 1.2431,
429
+ "step": 700
430
+ },
431
+ {
432
+ "epoch": 0.66,
433
+ "learning_rate": 3.777241194786591e-05,
434
+ "loss": 1.245,
435
+ "step": 710
436
+ },
437
+ {
438
+ "epoch": 0.67,
439
+ "learning_rate": 3.745791244206697e-05,
440
+ "loss": 1.2393,
441
+ "step": 720
442
+ },
443
+ {
444
+ "epoch": 0.68,
445
+ "learning_rate": 3.714076785011359e-05,
446
+ "loss": 1.2473,
447
+ "step": 730
448
+ },
449
+ {
450
+ "epoch": 0.69,
451
+ "learning_rate": 3.682104550871031e-05,
452
+ "loss": 1.2552,
453
+ "step": 740
454
+ },
455
+ {
456
+ "epoch": 0.7,
457
+ "learning_rate": 3.649881330187401e-05,
458
+ "loss": 1.2356,
459
+ "step": 750
460
+ },
461
+ {
462
+ "epoch": 0.7,
463
+ "learning_rate": 3.617413964652067e-05,
464
+ "loss": 1.2442,
465
+ "step": 760
466
+ },
467
+ {
468
+ "epoch": 0.71,
469
+ "learning_rate": 3.5847093477938956e-05,
470
+ "loss": 1.2605,
471
+ "step": 770
472
+ },
473
+ {
474
+ "epoch": 0.72,
475
+ "learning_rate": 3.551774423515378e-05,
476
+ "loss": 1.2364,
477
+ "step": 780
478
+ },
479
+ {
480
+ "epoch": 0.73,
481
+ "learning_rate": 3.518616184618288e-05,
482
+ "loss": 1.2482,
483
+ "step": 790
484
+ },
485
+ {
486
+ "epoch": 0.74,
487
+ "learning_rate": 3.4852416713189526e-05,
488
+ "loss": 1.2327,
489
+ "step": 800
490
+ },
491
+ {
492
+ "epoch": 0.75,
493
+ "learning_rate": 3.4516579697534705e-05,
494
+ "loss": 1.2548,
495
+ "step": 810
496
+ },
497
+ {
498
+ "epoch": 0.76,
499
+ "learning_rate": 3.417872210473162e-05,
500
+ "loss": 1.2469,
501
+ "step": 820
502
+ },
503
+ {
504
+ "epoch": 0.77,
505
+ "learning_rate": 3.3838915669306034e-05,
506
+ "loss": 1.2424,
507
+ "step": 830
508
+ },
509
+ {
510
+ "epoch": 0.78,
511
+ "learning_rate": 3.349723253956542e-05,
512
+ "loss": 1.2438,
513
+ "step": 840
514
+ },
515
+ {
516
+ "epoch": 0.79,
517
+ "learning_rate": 3.315374526228036e-05,
518
+ "loss": 1.243,
519
+ "step": 850
520
+ },
521
+ {
522
+ "epoch": 0.8,
523
+ "learning_rate": 3.2808526767281225e-05,
524
+ "loss": 1.2416,
525
+ "step": 860
526
+ },
527
+ {
528
+ "epoch": 0.81,
529
+ "learning_rate": 3.246165035197364e-05,
530
+ "loss": 1.2399,
531
+ "step": 870
532
+ },
533
+ {
534
+ "epoch": 0.82,
535
+ "learning_rate": 3.211318966577581e-05,
536
+ "loss": 1.239,
537
+ "step": 880
538
+ },
539
+ {
540
+ "epoch": 0.83,
541
+ "learning_rate": 3.176321869448116e-05,
542
+ "loss": 1.2498,
543
+ "step": 890
544
+ },
545
+ {
546
+ "epoch": 0.83,
547
+ "learning_rate": 3.1411811744549536e-05,
548
+ "loss": 1.2438,
549
+ "step": 900
550
+ },
551
+ {
552
+ "epoch": 0.84,
553
+ "learning_rate": 3.105904342733032e-05,
554
+ "loss": 1.233,
555
+ "step": 910
556
+ },
557
+ {
558
+ "epoch": 0.85,
559
+ "learning_rate": 3.070498864322081e-05,
560
+ "loss": 1.2315,
561
+ "step": 920
562
+ },
563
+ {
564
+ "epoch": 0.86,
565
+ "learning_rate": 3.034972256576328e-05,
566
+ "loss": 1.232,
567
+ "step": 930
568
+ },
569
+ {
570
+ "epoch": 0.87,
571
+ "learning_rate": 2.999332062568395e-05,
572
+ "loss": 1.2467,
573
+ "step": 940
574
+ },
575
+ {
576
+ "epoch": 0.88,
577
+ "learning_rate": 2.9635858494877384e-05,
578
+ "loss": 1.2483,
579
+ "step": 950
580
+ },
581
+ {
582
+ "epoch": 0.89,
583
+ "learning_rate": 2.9277412070339782e-05,
584
+ "loss": 1.2377,
585
+ "step": 960
586
+ },
587
+ {
588
+ "epoch": 0.9,
589
+ "learning_rate": 2.891805745805429e-05,
590
+ "loss": 1.2347,
591
+ "step": 970
592
+ },
593
+ {
594
+ "epoch": 0.91,
595
+ "learning_rate": 2.8557870956832132e-05,
596
+ "loss": 1.2428,
597
+ "step": 980
598
+ },
599
+ {
600
+ "epoch": 0.92,
601
+ "learning_rate": 2.8196929042112652e-05,
602
+ "loss": 1.244,
603
+ "step": 990
604
+ },
605
+ {
606
+ "epoch": 0.93,
607
+ "learning_rate": 2.783530834972594e-05,
608
+ "loss": 1.2317,
609
+ "step": 1000
610
+ },
611
+ {
612
+ "epoch": 0.94,
613
+ "learning_rate": 2.7473085659621377e-05,
614
+ "loss": 1.2421,
615
+ "step": 1010
616
+ },
617
+ {
618
+ "epoch": 0.95,
619
+ "learning_rate": 2.711033787956555e-05,
620
+ "loss": 1.2446,
621
+ "step": 1020
622
+ },
623
+ {
624
+ "epoch": 0.96,
625
+ "learning_rate": 2.6747142028813105e-05,
626
+ "loss": 1.2471,
627
+ "step": 1030
628
+ },
629
+ {
630
+ "epoch": 0.96,
631
+ "learning_rate": 2.638357522175383e-05,
632
+ "loss": 1.2403,
633
+ "step": 1040
634
+ },
635
+ {
636
+ "epoch": 0.97,
637
+ "learning_rate": 2.6019714651539646e-05,
638
+ "loss": 1.2413,
639
+ "step": 1050
640
+ },
641
+ {
642
+ "epoch": 0.98,
643
+ "learning_rate": 2.565563757369475e-05,
644
+ "loss": 1.2341,
645
+ "step": 1060
646
+ },
647
+ {
648
+ "epoch": 0.99,
649
+ "learning_rate": 2.529142128971268e-05,
650
+ "loss": 1.236,
651
+ "step": 1070
652
+ },
653
+ {
654
+ "epoch": 1.0,
655
+ "learning_rate": 2.492714313064342e-05,
656
+ "loss": 1.2423,
657
+ "step": 1080
658
+ },
659
+ {
660
+ "epoch": 1.01,
661
+ "learning_rate": 2.45628804406744e-05,
662
+ "loss": 1.2253,
663
+ "step": 1090
664
+ },
665
+ {
666
+ "epoch": 1.02,
667
+ "learning_rate": 2.419871056070862e-05,
668
+ "loss": 1.2377,
669
+ "step": 1100
670
+ },
671
+ {
672
+ "epoch": 1.03,
673
+ "learning_rate": 2.3834710811943514e-05,
674
+ "loss": 1.2395,
675
+ "step": 1110
676
+ },
677
+ {
678
+ "epoch": 1.04,
679
+ "learning_rate": 2.3470958479453938e-05,
680
+ "loss": 1.2238,
681
+ "step": 1120
682
+ },
683
+ {
684
+ "epoch": 1.05,
685
+ "learning_rate": 2.3107530795782877e-05,
686
+ "loss": 1.2045,
687
+ "step": 1130
688
+ },
689
+ {
690
+ "epoch": 1.06,
691
+ "learning_rate": 2.2744504924543313e-05,
692
+ "loss": 1.2302,
693
+ "step": 1140
694
+ },
695
+ {
696
+ "epoch": 1.07,
697
+ "learning_rate": 2.23819579440347e-05,
698
+ "loss": 1.233,
699
+ "step": 1150
700
+ },
701
+ {
702
+ "epoch": 1.08,
703
+ "learning_rate": 2.2019966830877545e-05,
704
+ "loss": 1.2341,
705
+ "step": 1160
706
+ },
707
+ {
708
+ "epoch": 1.08,
709
+ "learning_rate": 2.1658608443669635e-05,
710
+ "loss": 1.2294,
711
+ "step": 1170
712
+ },
713
+ {
714
+ "epoch": 1.09,
715
+ "learning_rate": 2.1297959506667224e-05,
716
+ "loss": 1.2308,
717
+ "step": 1180
718
+ },
719
+ {
720
+ "epoch": 1.1,
721
+ "learning_rate": 2.0938096593494855e-05,
722
+ "loss": 1.2427,
723
+ "step": 1190
724
+ },
725
+ {
726
+ "epoch": 1.11,
727
+ "learning_rate": 2.057909611088709e-05,
728
+ "loss": 1.2404,
729
+ "step": 1200
730
+ },
731
+ {
732
+ "epoch": 1.12,
733
+ "learning_rate": 2.02210342824657e-05,
734
+ "loss": 1.2245,
735
+ "step": 1210
736
+ },
737
+ {
738
+ "epoch": 1.13,
739
+ "learning_rate": 1.9863987132555706e-05,
740
+ "loss": 1.2288,
741
+ "step": 1220
742
+ },
743
+ {
744
+ "epoch": 1.14,
745
+ "learning_rate": 1.9508030470043806e-05,
746
+ "loss": 1.2401,
747
+ "step": 1230
748
+ },
749
+ {
750
+ "epoch": 1.15,
751
+ "learning_rate": 1.915323987228247e-05,
752
+ "loss": 1.2275,
753
+ "step": 1240
754
+ },
755
+ {
756
+ "epoch": 1.16,
757
+ "learning_rate": 1.8799690669043212e-05,
758
+ "loss": 1.2251,
759
+ "step": 1250
760
+ },
761
+ {
762
+ "epoch": 1.17,
763
+ "learning_rate": 1.8447457926522454e-05,
764
+ "loss": 1.238,
765
+ "step": 1260
766
+ },
767
+ {
768
+ "epoch": 1.18,
769
+ "learning_rate": 1.8096616431403325e-05,
770
+ "loss": 1.2291,
771
+ "step": 1270
772
+ },
773
+ {
774
+ "epoch": 1.19,
775
+ "learning_rate": 1.7747240674976857e-05,
776
+ "loss": 1.2162,
777
+ "step": 1280
778
+ },
779
+ {
780
+ "epoch": 1.2,
781
+ "learning_rate": 1.7399404837325796e-05,
782
+ "loss": 1.2315,
783
+ "step": 1290
784
+ },
785
+ {
786
+ "epoch": 1.21,
787
+ "learning_rate": 1.7053182771574633e-05,
788
+ "loss": 1.2383,
789
+ "step": 1300
790
+ },
791
+ {
792
+ "epoch": 1.21,
793
+ "learning_rate": 1.6708647988208887e-05,
794
+ "loss": 1.2254,
795
+ "step": 1310
796
+ },
797
+ {
798
+ "epoch": 1.22,
799
+ "learning_rate": 1.6365873639467315e-05,
800
+ "loss": 1.2388,
801
+ "step": 1320
802
+ },
803
+ {
804
+ "epoch": 1.23,
805
+ "learning_rate": 1.602493250381003e-05,
806
+ "loss": 1.2213,
807
+ "step": 1330
808
+ },
809
+ {
810
+ "epoch": 1.24,
811
+ "learning_rate": 1.5685896970466123e-05,
812
+ "loss": 1.2366,
813
+ "step": 1340
814
+ },
815
+ {
816
+ "epoch": 1.25,
817
+ "learning_rate": 1.534883902406375e-05,
818
+ "loss": 1.2265,
819
+ "step": 1350
820
+ },
821
+ {
822
+ "epoch": 1.26,
823
+ "learning_rate": 1.5013830229346326e-05,
824
+ "loss": 1.2094,
825
+ "step": 1360
826
+ },
827
+ {
828
+ "epoch": 1.27,
829
+ "learning_rate": 1.4680941715977722e-05,
830
+ "loss": 1.2314,
831
+ "step": 1370
832
+ },
833
+ {
834
+ "epoch": 1.28,
835
+ "learning_rate": 1.4350244163439892e-05,
836
+ "loss": 1.2254,
837
+ "step": 1380
838
+ },
839
+ {
840
+ "epoch": 1.29,
841
+ "learning_rate": 1.4021807786026108e-05,
842
+ "loss": 1.2287,
843
+ "step": 1390
844
+ },
845
+ {
846
+ "epoch": 1.3,
847
+ "learning_rate": 1.3695702317932862e-05,
848
+ "loss": 1.2235,
849
+ "step": 1400
850
+ },
851
+ {
852
+ "epoch": 1.31,
853
+ "learning_rate": 1.337199699845387e-05,
854
+ "loss": 1.2209,
855
+ "step": 1410
856
+ },
857
+ {
858
+ "epoch": 1.32,
859
+ "learning_rate": 1.3050760557279023e-05,
860
+ "loss": 1.2296,
861
+ "step": 1420
862
+ },
863
+ {
864
+ "epoch": 1.33,
865
+ "learning_rate": 1.2732061199901562e-05,
866
+ "loss": 1.2363,
867
+ "step": 1430
868
+ },
869
+ {
870
+ "epoch": 1.34,
871
+ "learning_rate": 1.2415966593136547e-05,
872
+ "loss": 1.2116,
873
+ "step": 1440
874
+ },
875
+ {
876
+ "epoch": 1.34,
877
+ "learning_rate": 1.2102543850753808e-05,
878
+ "loss": 1.2299,
879
+ "step": 1450
880
+ },
881
+ {
882
+ "epoch": 1.35,
883
+ "learning_rate": 1.1791859519228138e-05,
884
+ "loss": 1.2204,
885
+ "step": 1460
886
+ },
887
+ {
888
+ "epoch": 1.36,
889
+ "learning_rate": 1.148397956361007e-05,
890
+ "loss": 1.2271,
891
+ "step": 1470
892
+ },
893
+ {
894
+ "epoch": 1.37,
895
+ "learning_rate": 1.1178969353520018e-05,
896
+ "loss": 1.2327,
897
+ "step": 1480
898
+ },
899
+ {
900
+ "epoch": 1.38,
901
+ "learning_rate": 1.087689364926897e-05,
902
+ "loss": 1.2252,
903
+ "step": 1490
904
+ },
905
+ {
906
+ "epoch": 1.39,
907
+ "learning_rate": 1.0577816588108378e-05,
908
+ "loss": 1.2305,
909
+ "step": 1500
910
+ },
911
+ {
912
+ "epoch": 1.4,
913
+ "learning_rate": 1.0281801670612449e-05,
914
+ "loss": 1.2179,
915
+ "step": 1510
916
+ },
917
+ {
918
+ "epoch": 1.41,
919
+ "learning_rate": 9.988911747195603e-06,
920
+ "loss": 1.229,
921
+ "step": 1520
922
+ },
923
+ {
924
+ "epoch": 1.42,
925
+ "learning_rate": 9.699209004767953e-06,
926
+ "loss": 1.2192,
927
+ "step": 1530
928
+ },
929
+ {
930
+ "epoch": 1.43,
931
+ "learning_rate": 9.412754953531663e-06,
932
+ "loss": 1.2322,
933
+ "step": 1540
934
+ },
935
+ {
936
+ "epoch": 1.44,
937
+ "learning_rate": 9.129610413921005e-06,
938
+ "loss": 1.2193,
939
+ "step": 1550
940
+ },
941
+ {
942
+ "epoch": 1.45,
943
+ "learning_rate": 8.849835503688847e-06,
944
+ "loss": 1.2149,
945
+ "step": 1560
946
+ },
947
+ {
948
+ "epoch": 1.46,
949
+ "learning_rate": 8.57348962514234e-06,
950
+ "loss": 1.217,
951
+ "step": 1570
952
+ },
953
+ {
954
+ "epoch": 1.46,
955
+ "learning_rate": 8.30063145253053e-06,
956
+ "loss": 1.2368,
957
+ "step": 1580
958
+ },
959
+ {
960
+ "epoch": 1.47,
961
+ "learning_rate": 8.031318919586523e-06,
962
+ "loss": 1.2264,
963
+ "step": 1590
964
+ },
965
+ {
966
+ "epoch": 1.48,
967
+ "learning_rate": 7.7656092072269e-06,
968
+ "loss": 1.2243,
969
+ "step": 1600
970
+ },
971
+ {
972
+ "epoch": 1.49,
973
+ "learning_rate": 7.503558731410959e-06,
974
+ "loss": 1.2427,
975
+ "step": 1610
976
+ },
977
+ {
978
+ "epoch": 1.5,
979
+ "learning_rate": 7.245223131162376e-06,
980
+ "loss": 1.2207,
981
+ "step": 1620
982
+ },
983
+ {
984
+ "epoch": 1.51,
985
+ "learning_rate": 6.9906572567558285e-06,
986
+ "loss": 1.2211,
987
+ "step": 1630
988
+ },
989
+ {
990
+ "epoch": 1.52,
991
+ "learning_rate": 6.739915158071106e-06,
992
+ "loss": 1.2343,
993
+ "step": 1640
994
+ },
995
+ {
996
+ "epoch": 1.53,
997
+ "learning_rate": 6.493050073117116e-06,
998
+ "loss": 1.2348,
999
+ "step": 1650
1000
+ },
1001
+ {
1002
+ "epoch": 1.54,
1003
+ "learning_rate": 6.250114416728298e-06,
1004
+ "loss": 1.2189,
1005
+ "step": 1660
1006
+ },
1007
+ {
1008
+ "epoch": 1.55,
1009
+ "learning_rate": 6.011159769435823e-06,
1010
+ "loss": 1.2207,
1011
+ "step": 1670
1012
+ },
1013
+ {
1014
+ "epoch": 1.56,
1015
+ "learning_rate": 5.776236866515947e-06,
1016
+ "loss": 1.2315,
1017
+ "step": 1680
1018
+ },
1019
+ {
1020
+ "epoch": 1.57,
1021
+ "learning_rate": 5.545395587217763e-06,
1022
+ "loss": 1.2334,
1023
+ "step": 1690
1024
+ },
1025
+ {
1026
+ "epoch": 1.58,
1027
+ "learning_rate": 5.318684944172752e-06,
1028
+ "loss": 1.232,
1029
+ "step": 1700
1030
+ },
1031
+ {
1032
+ "epoch": 1.59,
1033
+ "learning_rate": 5.096153072988366e-06,
1034
+ "loss": 1.2235,
1035
+ "step": 1710
1036
+ },
1037
+ {
1038
+ "epoch": 1.59,
1039
+ "learning_rate": 4.8778472220277874e-06,
1040
+ "loss": 1.2244,
1041
+ "step": 1720
1042
+ },
1043
+ {
1044
+ "epoch": 1.6,
1045
+ "learning_rate": 4.6638137423780395e-06,
1046
+ "loss": 1.2197,
1047
+ "step": 1730
1048
+ },
1049
+ {
1050
+ "epoch": 1.61,
1051
+ "learning_rate": 4.454098078008667e-06,
1052
+ "loss": 1.2242,
1053
+ "step": 1740
1054
+ },
1055
+ {
1056
+ "epoch": 1.62,
1057
+ "learning_rate": 4.248744756122986e-06,
1058
+ "loss": 1.2382,
1059
+ "step": 1750
1060
+ },
1061
+ {
1062
+ "epoch": 1.63,
1063
+ "learning_rate": 4.047797377703985e-06,
1064
+ "loss": 1.2265,
1065
+ "step": 1760
1066
+ },
1067
+ {
1068
+ "epoch": 1.64,
1069
+ "learning_rate": 3.851298608256892e-06,
1070
+ "loss": 1.2153,
1071
+ "step": 1770
1072
+ },
1073
+ {
1074
+ "epoch": 1.65,
1075
+ "learning_rate": 3.6592901687503566e-06,
1076
+ "loss": 1.2198,
1077
+ "step": 1780
1078
+ },
1079
+ {
1080
+ "epoch": 1.66,
1081
+ "learning_rate": 3.471812826758178e-06,
1082
+ "loss": 1.23,
1083
+ "step": 1790
1084
+ },
1085
+ {
1086
+ "epoch": 1.67,
1087
+ "learning_rate": 3.288906387803464e-06,
1088
+ "loss": 1.2268,
1089
+ "step": 1800
1090
+ },
1091
+ {
1092
+ "epoch": 1.68,
1093
+ "learning_rate": 3.1106096869070483e-06,
1094
+ "loss": 1.2228,
1095
+ "step": 1810
1096
+ },
1097
+ {
1098
+ "epoch": 1.69,
1099
+ "learning_rate": 2.9369605803419715e-06,
1100
+ "loss": 1.2288,
1101
+ "step": 1820
1102
+ },
1103
+ {
1104
+ "epoch": 1.7,
1105
+ "learning_rate": 2.767995937595766e-06,
1106
+ "loss": 1.228,
1107
+ "step": 1830
1108
+ },
1109
+ {
1110
+ "epoch": 1.71,
1111
+ "learning_rate": 2.6037516335422728e-06,
1112
+ "loss": 1.2178,
1113
+ "step": 1840
1114
+ },
1115
+ {
1116
+ "epoch": 1.72,
1117
+ "learning_rate": 2.4442625408246074e-06,
1118
+ "loss": 1.2386,
1119
+ "step": 1850
1120
+ },
1121
+ {
1122
+ "epoch": 1.72,
1123
+ "learning_rate": 2.289562522450947e-06,
1124
+ "loss": 1.2274,
1125
+ "step": 1860
1126
+ },
1127
+ {
1128
+ "epoch": 1.73,
1129
+ "learning_rate": 2.1396844246046903e-06,
1130
+ "loss": 1.2341,
1131
+ "step": 1870
1132
+ },
1133
+ {
1134
+ "epoch": 1.74,
1135
+ "learning_rate": 1.9946600696704592e-06,
1136
+ "loss": 1.2118,
1137
+ "step": 1880
1138
+ },
1139
+ {
1140
+ "epoch": 1.75,
1141
+ "learning_rate": 1.8545202494775509e-06,
1142
+ "loss": 1.2219,
1143
+ "step": 1890
1144
+ },
1145
+ {
1146
+ "epoch": 1.76,
1147
+ "learning_rate": 1.7192947187621434e-06,
1148
+ "loss": 1.2268,
1149
+ "step": 1900
1150
+ },
1151
+ {
1152
+ "epoch": 1.77,
1153
+ "learning_rate": 1.5890121888497366e-06,
1154
+ "loss": 1.2183,
1155
+ "step": 1910
1156
+ },
1157
+ {
1158
+ "epoch": 1.78,
1159
+ "learning_rate": 1.463700321559075e-06,
1160
+ "loss": 1.2253,
1161
+ "step": 1920
1162
+ },
1163
+ {
1164
+ "epoch": 1.79,
1165
+ "learning_rate": 1.3433857233289714e-06,
1166
+ "loss": 1.2262,
1167
+ "step": 1930
1168
+ },
1169
+ {
1170
+ "epoch": 1.8,
1171
+ "learning_rate": 1.2280939395691859e-06,
1172
+ "loss": 1.2232,
1173
+ "step": 1940
1174
+ },
1175
+ {
1176
+ "epoch": 1.81,
1177
+ "learning_rate": 1.1178494492365465e-06,
1178
+ "loss": 1.2376,
1179
+ "step": 1950
1180
+ },
1181
+ {
1182
+ "epoch": 1.82,
1183
+ "learning_rate": 1.0126756596375686e-06,
1184
+ "loss": 1.2287,
1185
+ "step": 1960
1186
+ },
1187
+ {
1188
+ "epoch": 1.83,
1189
+ "learning_rate": 9.125949014585383e-07,
1190
+ "loss": 1.2354,
1191
+ "step": 1970
1192
+ },
1193
+ {
1194
+ "epoch": 1.84,
1195
+ "learning_rate": 8.176284240242638e-07,
1196
+ "loss": 1.224,
1197
+ "step": 1980
1198
+ },
1199
+ {
1200
+ "epoch": 1.85,
1201
+ "learning_rate": 7.277963907863478e-07,
1202
+ "loss": 1.2214,
1203
+ "step": 1990
1204
+ },
1205
+ {
1206
+ "epoch": 1.85,
1207
+ "learning_rate": 6.431178750420513e-07,
1208
+ "loss": 1.2214,
1209
+ "step": 2000
1210
+ }
1211
+ ],
1212
+ "logging_steps": 10,
1213
+ "max_steps": 2156,
1214
+ "num_train_epochs": 2,
1215
+ "save_steps": 1000,
1216
+ "total_flos": 3.1404803520494305e+18,
1217
+ "trial_name": null,
1218
+ "trial_params": null
1219
+ }
checkpoint-2000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1ca0e4372bd1720a48da585ce6946399a06d0c984dcfcf5a2b7104827c4603f
3
+ size 4219
qwen.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_end|>"
4
+ ],
5
+ "eos_token": "<|endoftext|>",
6
+ "pad_token": "<|endoftext|>"
7
+ }
tokenization_qwen.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """Tokenization classes for QWen."""
7
+
8
+ import base64
9
+ import logging
10
+ import os
11
+ import unicodedata
12
+ from typing import Collection, Dict, List, Set, Tuple, Union
13
+
14
+ import tiktoken
15
+ from transformers import PreTrainedTokenizer, AddedToken
16
+
17
logger = logging.getLogger(__name__)


# Mapping used by PreTrainedTokenizer machinery to locate the vocab file.
VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}

# Pre-tokenization split pattern handed to tiktoken.Encoding below
# (contractions, letter runs, digit singles, punctuation runs, whitespace).
PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
# Surface forms of the control tokens.
ENDOFTEXT = "<|endoftext|>"
IMSTART = "<|im_start|>"
IMEND = "<|im_end|>"
# as the default behavior is changed to allow special tokens in
# regular texts, the surface forms of special tokens need to be
# as different as possible to minimize the impact
EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
# Order matters: ids are assigned by enumeration order after the BPE ranks.
SPECIAL_TOKENS = (
    ENDOFTEXT,
    IMSTART,
    IMEND,
) + EXTRAS
35
+
36
+
37
def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
    """Read a tiktoken BPE rank file into a ``{token_bytes: rank}`` map.

    Each non-empty line of the file holds a base64-encoded token and its
    integer merge rank, separated by whitespace.
    """
    with open(tiktoken_bpe_file, "rb") as handle:
        raw = handle.read()
    ranks: Dict[bytes, int] = {}
    for line in raw.splitlines():
        if not line:
            continue
        encoded_token, rank = line.split()
        ranks[base64.b64decode(encoded_token)] = int(rank)
    return ranks
44
+
45
class QWenTokenizer(PreTrainedTokenizer):
    """QWen tokenizer backed by a tiktoken BPE encoding.

    Regular tokens are loaded from a base64-encoded rank file
    (``qwen.tiktoken``); the special tokens (``<|endoftext|>``,
    ``<|im_start|>``, ``<|im_end|>`` and the ``<|extra_*|>`` range) are
    assigned the ids immediately following the mergeable ranks.
    """

    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        errors="replace",
        **kwargs,
    ):
        """Build the tokenizer.

        Args:
            vocab_file (`str`):
                Path to the tiktoken BPE rank file (``qwen.tiktoken``).
            errors (`str`):
                Error-handling scheme passed to ``bytes.decode`` when
                token bytes cannot be decoded as UTF-8.
        """
        super().__init__(**kwargs)

        self.errors = errors  # how to handle errors in decoding

        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
        # Special-token ids start right after the last mergeable rank.
        self.special_tokens = {
            token: index
            for index, token in enumerate(
                SPECIAL_TOKENS, start=len(self.mergeable_ranks)
            )
        }

        enc = tiktoken.Encoding(
            "Qwen",
            pat_str=PAT_STR,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        # Sanity check: the Encoding must cover exactly ranks + specials.
        assert (
            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"

        # id -> surface form; bytes for regular tokens, str for specials.
        self.decoder = {
            v: k for k, v in self.mergeable_ranks.items()
        }  # type: dict[int, bytes|str]
        self.decoder.update({v: k for k, v in self.special_tokens.items()})

        self.tokenizer = enc  # type: tiktoken.Encoding

        self.eod_id = self.tokenizer.eot_token
        self.im_start_id = self.special_tokens[IMSTART]
        self.im_end_id = self.special_tokens[IMEND]

    def __getstate__(self):
        # for pickle lovers: the tiktoken.Encoding is not picklable, so it
        # is dropped here and rebuilt in __setstate__.
        state = self.__dict__.copy()
        del state['tokenizer']
        return state

    def __setstate__(self, state):
        # tokenizer is not python native; don't pass it; rebuild it
        self.__dict__.update(state)
        enc = tiktoken.Encoding(
            "Qwen",
            pat_str=PAT_STR,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        self.tokenizer = enc


    def __len__(self) -> int:
        # Full vocabulary size, special tokens included.
        return self.tokenizer.n_vocab

    def get_vocab(self) -> Dict[bytes, int]:
        # NOTE(review): only the mergeable (regular) ranks are returned;
        # special tokens are not included in this mapping.
        return self.mergeable_ranks

    def convert_tokens_to_ids(
        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
    ) -> Union[int, List[int]]:
        """Convert token surface form(s) to id(s).

        A single ``str``/``bytes`` token yields a single id (or ``None``
        if unknown); a list of tokens yields a list of ids.
        """
        ids = []
        if isinstance(tokens, (str, bytes)):
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            else:
                return self.mergeable_ranks.get(tokens)
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.mergeable_ranks.get(token))
        return ids

    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        """Reject any token additions beyond the predefined special set.

        The tiktoken vocabulary is fixed, so only re-registering known
        ``SPECIAL_TOKENS`` surface forms is tolerated; nothing is ever
        actually added (always returns 0).
        """
        if not special_tokens and new_tokens:
            raise ValueError('Adding regular tokens is not supported')
        for token in new_tokens:
            surface_form = token.content if isinstance(token, AddedToken) else token
            if surface_form not in SPECIAL_TOKENS:
                raise ValueError('Adding unknown special tokens is not supported')
        return 0

    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
        """
        Save only the vocabulary of the tokenizer (vocabulary).

        Each line written is ``base64(token_bytes) + " " + rank``;
        special tokens are not written (they are reconstructed from code).

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        file_path = os.path.join(save_directory, "qwen.tiktoken")
        with open(file_path, "w", encoding="utf8") as w:
            for k, v in self.mergeable_ranks.items():
                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
                w.write(line)
        return (file_path,)

    def tokenize(
        self,
        text: str,
        allowed_special: Union[Set, str] = "all",
        disallowed_special: Union[Collection, str] = (),
        **kwargs,
    ) -> List[Union[bytes, str]]:
        """
        Converts a string in a sequence of tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            allowed_special (`Literal["all"]` or `set`):
                The surface forms of the tokens to be encoded as special tokens in regular texts.
                Default to "all".
            disallowed_special (`Literal["all"]` or `Collection`):
                The surface forms of the tokens that should not be in regular texts and trigger errors.
                Default to an empty tuple.

            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific encode method.

        Returns:
            `List[bytes|str]`: The list of tokens.
        """
        tokens = []
        # NFC-normalize first so visually identical text maps to one id sequence.
        text = unicodedata.normalize("NFC", text)

        # this implementation takes a detour: text -> token id -> token surface forms
        for t in self.tokenizer.encode(
            text, allowed_special=allowed_special, disallowed_special=disallowed_special
        ):
            tokens.append(self.decoder[t])
        return tokens

    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
        """
        Converts a sequence of tokens in a single string.

        Consecutive ``bytes`` tokens are buffered and decoded together so
        multi-token UTF-8 sequences are reassembled correctly.
        """
        text = ""
        temp = b""
        for t in tokens:
            if isinstance(t, str):
                if temp:
                    text += temp.decode("utf-8", errors=self.errors)
                    temp = b""
                text += t
            elif isinstance(t, bytes):
                temp += t
            else:
                # Fixed typo in the original message ("type types or str").
                raise TypeError("token should only be of type bytes or str")
        if temp:
            text += temp.decode("utf-8", errors=self.errors)
        return text

    @property
    def vocab_size(self):
        # Full vocabulary size, special tokens included.
        return self.tokenizer.n_vocab

    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
        """Converts an id to a token, special tokens included"""
        if index in self.decoder:
            return self.decoder[index]
        raise ValueError("unknown ids")

    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
        """Converts a token to an id using the vocab, special tokens included"""
        if token in self.special_tokens:
            return self.special_tokens[token]
        if token in self.mergeable_ranks:
            return self.mergeable_ranks[token]
        raise ValueError("unknown token")

    def _tokenize(self, text: str, **kwargs):
        """
        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

        Do NOT take care of added tokens.
        """
        raise NotImplementedError

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        errors: str = None,
        **kwargs,
    ) -> str:
        """Decode id(s) back to text.

        Args:
            token_ids: a single id or list of ids.
            skip_special_tokens: drop every id >= eod_id — since special
                ids all come after the BPE ranks, this removes all of them.
            errors: overrides the instance-level decode error handler.
        """
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [i for i in token_ids if i < self.eod_id]
        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenization_qwen.QWenTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "clean_up_tokenization_spaces": true,
9
+ "model_max_length": 8192,
10
+ "padding_side": "right",
11
+ "tokenizer_class": "QWenTokenizer"
12
+ }
train_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "train_loss": 1.2461530792912217,
4
+ "train_runtime": 30560.3748,
5
+ "train_samples_per_second": 4.517,
6
+ "train_steps_per_second": 0.071
7
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 2156, "loss": 1.6199, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.999734597774032e-05, "epoch": 0.01, "percentage": 0.46, "elapsed_time": "0:02:25", "remaining_time": "8:39:45"}
2
+ {"current_steps": 20, "total_steps": 2156, "loss": 1.5062, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.998938447446803e-05, "epoch": 0.02, "percentage": 0.93, "elapsed_time": "0:04:50", "remaining_time": "8:37:49"}
3
+ {"current_steps": 30, "total_steps": 2156, "loss": 1.4138, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.997611718058365e-05, "epoch": 0.03, "percentage": 1.39, "elapsed_time": "0:07:09", "remaining_time": "8:26:44"}
4
+ {"current_steps": 40, "total_steps": 2156, "loss": 1.369, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9957546913022665e-05, "epoch": 0.04, "percentage": 1.86, "elapsed_time": "0:09:34", "remaining_time": "8:26:05"}
5
+ {"current_steps": 50, "total_steps": 2156, "loss": 1.3408, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.993367761465736e-05, "epoch": 0.05, "percentage": 2.32, "elapsed_time": "0:11:55", "remaining_time": "8:21:57"}
6
+ {"current_steps": 60, "total_steps": 2156, "loss": 1.3321, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9904514353459654e-05, "epoch": 0.06, "percentage": 2.78, "elapsed_time": "0:14:18", "remaining_time": "8:20:02"}
7
+ {"current_steps": 70, "total_steps": 2156, "loss": 1.3251, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9870063321425105e-05, "epoch": 0.06, "percentage": 3.25, "elapsed_time": "0:16:39", "remaining_time": "8:16:26"}
8
+ {"current_steps": 80, "total_steps": 2156, "loss": 1.3228, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.983033183325818e-05, "epoch": 0.07, "percentage": 3.71, "elapsed_time": "0:18:55", "remaining_time": "8:11:08"}
9
+ {"current_steps": 90, "total_steps": 2156, "loss": 1.3111, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.97853283248192e-05, "epoch": 0.08, "percentage": 4.17, "elapsed_time": "0:21:14", "remaining_time": "8:07:25"}
10
+ {"current_steps": 100, "total_steps": 2156, "loss": 1.3013, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.973506235133323e-05, "epoch": 0.09, "percentage": 4.64, "elapsed_time": "0:23:37", "remaining_time": "8:05:34"}
11
+ {"current_steps": 110, "total_steps": 2156, "loss": 1.3004, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.967954458536126e-05, "epoch": 0.1, "percentage": 5.1, "elapsed_time": "0:26:01", "remaining_time": "8:03:58"}
12
+ {"current_steps": 120, "total_steps": 2156, "loss": 1.2959, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9618786814534226e-05, "epoch": 0.11, "percentage": 5.57, "elapsed_time": "0:28:23", "remaining_time": "8:01:40"}
13
+ {"current_steps": 130, "total_steps": 2156, "loss": 1.2969, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.955280193905022e-05, "epoch": 0.12, "percentage": 6.03, "elapsed_time": "0:30:48", "remaining_time": "8:00:05"}
14
+ {"current_steps": 140, "total_steps": 2156, "loss": 1.2879, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.948160396893553e-05, "epoch": 0.13, "percentage": 6.49, "elapsed_time": "0:33:11", "remaining_time": "7:57:58"}
15
+ {"current_steps": 150, "total_steps": 2156, "loss": 1.277, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9405208021069946e-05, "epoch": 0.14, "percentage": 6.96, "elapsed_time": "0:35:30", "remaining_time": "7:54:48"}
16
+ {"current_steps": 160, "total_steps": 2156, "loss": 1.283, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9323630315977156e-05, "epoch": 0.15, "percentage": 7.42, "elapsed_time": "0:37:55", "remaining_time": "7:53:08"}
17
+ {"current_steps": 170, "total_steps": 2156, "loss": 1.288, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9236888174380784e-05, "epoch": 0.16, "percentage": 7.88, "elapsed_time": "0:40:13", "remaining_time": "7:49:57"}
18
+ {"current_steps": 180, "total_steps": 2156, "loss": 1.287, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.91450000135268e-05, "epoch": 0.17, "percentage": 8.35, "elapsed_time": "0:42:34", "remaining_time": "7:47:18"}
19
+ {"current_steps": 190, "total_steps": 2156, "loss": 1.2726, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9047985343273154e-05, "epoch": 0.18, "percentage": 8.81, "elapsed_time": "0:44:50", "remaining_time": "7:43:57"}
20
+ {"current_steps": 200, "total_steps": 2156, "loss": 1.2808, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.894586476194739e-05, "epoch": 0.19, "percentage": 9.28, "elapsed_time": "0:47:07", "remaining_time": "7:40:53"}
21
+ {"current_steps": 210, "total_steps": 2156, "loss": 1.2657, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.883865995197319e-05, "epoch": 0.19, "percentage": 9.74, "elapsed_time": "0:49:29", "remaining_time": "7:38:40"}
22
+ {"current_steps": 220, "total_steps": 2156, "loss": 1.275, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8726393675266716e-05, "epoch": 0.2, "percentage": 10.2, "elapsed_time": "0:51:55", "remaining_time": "7:36:59"}
23
+ {"current_steps": 230, "total_steps": 2156, "loss": 1.2667, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.860908976840376e-05, "epoch": 0.21, "percentage": 10.67, "elapsed_time": "0:54:19", "remaining_time": "7:34:51"}
24
+ {"current_steps": 240, "total_steps": 2156, "loss": 1.2715, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.848677313755872e-05, "epoch": 0.22, "percentage": 11.13, "elapsed_time": "0:56:35", "remaining_time": "7:31:45"}
25
+ {"current_steps": 250, "total_steps": 2156, "loss": 1.273, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.835946975321647e-05, "epoch": 0.23, "percentage": 11.6, "elapsed_time": "0:58:54", "remaining_time": "7:29:08"}
26
+ {"current_steps": 260, "total_steps": 2156, "loss": 1.2605, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.822720664465827e-05, "epoch": 0.24, "percentage": 12.06, "elapsed_time": "1:01:17", "remaining_time": "7:26:57"}
27
+ {"current_steps": 270, "total_steps": 2156, "loss": 1.2766, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.809001189422287e-05, "epoch": 0.25, "percentage": 12.52, "elapsed_time": "1:03:39", "remaining_time": "7:24:39"}
28
+ {"current_steps": 280, "total_steps": 2156, "loss": 1.262, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.794791463134399e-05, "epoch": 0.26, "percentage": 12.99, "elapsed_time": "1:05:57", "remaining_time": "7:21:56"}
29
+ {"current_steps": 290, "total_steps": 2156, "loss": 1.255, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.780094502636552e-05, "epoch": 0.27, "percentage": 13.45, "elapsed_time": "1:08:16", "remaining_time": "7:19:18"}
30
+ {"current_steps": 300, "total_steps": 2156, "loss": 1.2652, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.764913428413572e-05, "epoch": 0.28, "percentage": 13.91, "elapsed_time": "1:10:42", "remaining_time": "7:17:25"}
31
+ {"current_steps": 310, "total_steps": 2156, "loss": 1.2668, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7492514637381727e-05, "epoch": 0.29, "percentage": 14.38, "elapsed_time": "1:13:01", "remaining_time": "7:14:52"}
32
+ {"current_steps": 320, "total_steps": 2156, "loss": 1.2621, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.733111933986583e-05, "epoch": 0.3, "percentage": 14.84, "elapsed_time": "1:15:23", "remaining_time": "7:12:34"}
33
+ {"current_steps": 330, "total_steps": 2156, "loss": 1.257, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.716498265932501e-05, "epoch": 0.31, "percentage": 15.31, "elapsed_time": "1:17:43", "remaining_time": "7:10:04"}
34
+ {"current_steps": 340, "total_steps": 2156, "loss": 1.2789, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.699413987019512e-05, "epoch": 0.32, "percentage": 15.77, "elapsed_time": "1:20:02", "remaining_time": "7:07:30"}
35
+ {"current_steps": 350, "total_steps": 2156, "loss": 1.2634, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.681862724612141e-05, "epoch": 0.32, "percentage": 16.23, "elapsed_time": "1:22:26", "remaining_time": "7:05:24"}
36
+ {"current_steps": 360, "total_steps": 2156, "loss": 1.2594, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.663848205225674e-05, "epoch": 0.33, "percentage": 16.7, "elapsed_time": "1:24:46", "remaining_time": "7:02:55"}
37
+ {"current_steps": 370, "total_steps": 2156, "loss": 1.26, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.645374253734949e-05, "epoch": 0.34, "percentage": 17.16, "elapsed_time": "1:27:06", "remaining_time": "7:00:28"}
38
+ {"current_steps": 380, "total_steps": 2156, "loss": 1.2514, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.626444792562244e-05, "epoch": 0.35, "percentage": 17.63, "elapsed_time": "1:29:26", "remaining_time": "6:58:02"}
39
+ {"current_steps": 390, "total_steps": 2156, "loss": 1.2506, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.607063840844463e-05, "epoch": 0.36, "percentage": 18.09, "elapsed_time": "1:31:45", "remaining_time": "6:55:30"}
40
+ {"current_steps": 400, "total_steps": 2156, "loss": 1.2648, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.587235513579791e-05, "epoch": 0.37, "percentage": 18.55, "elapsed_time": "1:34:04", "remaining_time": "6:53:00"}
41
+ {"current_steps": 410, "total_steps": 2156, "loss": 1.2511, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5669640207539786e-05, "epoch": 0.38, "percentage": 19.02, "elapsed_time": "1:36:25", "remaining_time": "6:50:36"}
42
+ {"current_steps": 420, "total_steps": 2156, "loss": 1.2594, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.546253666446484e-05, "epoch": 0.39, "percentage": 19.48, "elapsed_time": "1:38:47", "remaining_time": "6:48:20"}
43
+ {"current_steps": 430, "total_steps": 2156, "loss": 1.2608, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.525108847916614e-05, "epoch": 0.4, "percentage": 19.94, "elapsed_time": "1:41:06", "remaining_time": "6:45:52"}
44
+ {"current_steps": 440, "total_steps": 2156, "loss": 1.2597, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.503534054669892e-05, "epoch": 0.41, "percentage": 20.41, "elapsed_time": "1:43:28", "remaining_time": "6:43:33"}
45
+ {"current_steps": 450, "total_steps": 2156, "loss": 1.2609, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.481533867504841e-05, "epoch": 0.42, "percentage": 20.87, "elapsed_time": "1:45:51", "remaining_time": "6:41:19"}
46
+ {"current_steps": 460, "total_steps": 2156, "loss": 1.2505, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4591129575403765e-05, "epoch": 0.43, "percentage": 21.34, "elapsed_time": "1:48:11", "remaining_time": "6:38:53"}
47
+ {"current_steps": 470, "total_steps": 2156, "loss": 1.2481, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.43627608522403e-05, "epoch": 0.44, "percentage": 21.8, "elapsed_time": "1:50:39", "remaining_time": "6:36:56"}
48
+ {"current_steps": 480, "total_steps": 2156, "loss": 1.2612, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4130280993211974e-05, "epoch": 0.45, "percentage": 22.26, "elapsed_time": "1:53:04", "remaining_time": "6:34:50"}
49
+ {"current_steps": 490, "total_steps": 2156, "loss": 1.2504, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.389373935885646e-05, "epoch": 0.45, "percentage": 22.73, "elapsed_time": "1:55:26", "remaining_time": "6:32:29"}
50
+ {"current_steps": 500, "total_steps": 2156, "loss": 1.2518, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.365318617211479e-05, "epoch": 0.46, "percentage": 23.19, "elapsed_time": "1:57:51", "remaining_time": "6:30:21"}
51
+ {"current_steps": 510, "total_steps": 2156, "loss": 1.2458, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.340867250766794e-05, "epoch": 0.47, "percentage": 23.65, "elapsed_time": "2:00:13", "remaining_time": "6:28:02"}
52
+ {"current_steps": 520, "total_steps": 2156, "loss": 1.2345, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.316025028109258e-05, "epoch": 0.48, "percentage": 24.12, "elapsed_time": "2:02:34", "remaining_time": "6:25:37"}
53
+ {"current_steps": 530, "total_steps": 2156, "loss": 1.2521, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2907972237838225e-05, "epoch": 0.49, "percentage": 24.58, "elapsed_time": "2:04:58", "remaining_time": "6:23:23"}
54
+ {"current_steps": 540, "total_steps": 2156, "loss": 1.2528, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2651891942028274e-05, "epoch": 0.5, "percentage": 25.05, "elapsed_time": "2:07:19", "remaining_time": "6:21:02"}
55
+ {"current_steps": 550, "total_steps": 2156, "loss": 1.2462, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.239206376508717e-05, "epoch": 0.51, "percentage": 25.51, "elapsed_time": "2:09:41", "remaining_time": "6:18:42"}
56
+ {"current_steps": 560, "total_steps": 2156, "loss": 1.2401, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.212854287419611e-05, "epoch": 0.52, "percentage": 25.97, "elapsed_time": "2:12:06", "remaining_time": "6:16:31"}
57
+ {"current_steps": 570, "total_steps": 2156, "loss": 1.2496, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1861385220579934e-05, "epoch": 0.53, "percentage": 26.44, "elapsed_time": "2:14:29", "remaining_time": "6:14:13"}
58
+ {"current_steps": 580, "total_steps": 2156, "loss": 1.2522, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1590647527627404e-05, "epoch": 0.54, "percentage": 26.9, "elapsed_time": "2:16:49", "remaining_time": "6:11:48"}
59
+ {"current_steps": 590, "total_steps": 2156, "loss": 1.2377, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.131638727884762e-05, "epoch": 0.55, "percentage": 27.37, "elapsed_time": "2:19:12", "remaining_time": "6:09:29"}
60
+ {"current_steps": 600, "total_steps": 2156, "loss": 1.2467, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.103866270566498e-05, "epoch": 0.56, "percentage": 27.83, "elapsed_time": "2:21:36", "remaining_time": "6:07:13"}
61
+ {"current_steps": 610, "total_steps": 2156, "loss": 1.2421, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.075753277505544e-05, "epoch": 0.57, "percentage": 28.29, "elapsed_time": "2:23:53", "remaining_time": "6:04:41"}
62
+ {"current_steps": 620, "total_steps": 2156, "loss": 1.2455, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.0473057177026484e-05, "epoch": 0.57, "percentage": 28.76, "elapsed_time": "2:26:14", "remaining_time": "6:02:18"}
63
+ {"current_steps": 630, "total_steps": 2156, "loss": 1.2294, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.018529631194369e-05, "epoch": 0.58, "percentage": 29.22, "elapsed_time": "2:28:37", "remaining_time": "5:59:59"}
64
+ {"current_steps": 640, "total_steps": 2156, "loss": 1.2509, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.989431127770635e-05, "epoch": 0.59, "percentage": 29.68, "elapsed_time": "2:31:04", "remaining_time": "5:57:50"}
65
+ {"current_steps": 650, "total_steps": 2156, "loss": 1.2354, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.960016385677513e-05, "epoch": 0.6, "percentage": 30.15, "elapsed_time": "2:33:22", "remaining_time": "5:55:20"}
66
+ {"current_steps": 660, "total_steps": 2156, "loss": 1.2318, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.9302916503054246e-05, "epoch": 0.61, "percentage": 30.61, "elapsed_time": "2:35:43", "remaining_time": "5:52:59"}
67
+ {"current_steps": 670, "total_steps": 2156, "loss": 1.2376, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.9002632328631164e-05, "epoch": 0.62, "percentage": 31.08, "elapsed_time": "2:38:03", "remaining_time": "5:50:33"}
68
+ {"current_steps": 680, "total_steps": 2156, "loss": 1.2412, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.8699375090376534e-05, "epoch": 0.63, "percentage": 31.54, "elapsed_time": "2:40:26", "remaining_time": "5:48:14"}
69
+ {"current_steps": 690, "total_steps": 2156, "loss": 1.2479, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.8393209176407223e-05, "epoch": 0.64, "percentage": 32.0, "elapsed_time": "2:42:45", "remaining_time": "5:45:48"}
70
+ {"current_steps": 700, "total_steps": 2156, "loss": 1.2431, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.8084199592415305e-05, "epoch": 0.65, "percentage": 32.47, "elapsed_time": "2:45:05", "remaining_time": "5:43:23"}
71
+ {"current_steps": 710, "total_steps": 2156, "loss": 1.245, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.777241194786591e-05, "epoch": 0.66, "percentage": 32.93, "elapsed_time": "2:47:32", "remaining_time": "5:41:14"}
72
+ {"current_steps": 720, "total_steps": 2156, "loss": 1.2393, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.745791244206697e-05, "epoch": 0.67, "percentage": 33.4, "elapsed_time": "2:49:56", "remaining_time": "5:38:55"}
73
+ {"current_steps": 730, "total_steps": 2156, "loss": 1.2473, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.714076785011359e-05, "epoch": 0.68, "percentage": 33.86, "elapsed_time": "2:52:20", "remaining_time": "5:36:38"}
74
+ {"current_steps": 740, "total_steps": 2156, "loss": 1.2552, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.682104550871031e-05, "epoch": 0.69, "percentage": 34.32, "elapsed_time": "2:54:43", "remaining_time": "5:34:19"}
75
+ {"current_steps": 750, "total_steps": 2156, "loss": 1.2356, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.649881330187401e-05, "epoch": 0.7, "percentage": 34.79, "elapsed_time": "2:57:03", "remaining_time": "5:31:56"}
76
+ {"current_steps": 760, "total_steps": 2156, "loss": 1.2442, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.617413964652067e-05, "epoch": 0.7, "percentage": 35.25, "elapsed_time": "2:59:28", "remaining_time": "5:29:40"}
77
+ {"current_steps": 770, "total_steps": 2156, "loss": 1.2605, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.5847093477938956e-05, "epoch": 0.71, "percentage": 35.71, "elapsed_time": "3:01:49", "remaining_time": "5:27:16"}
78
+ {"current_steps": 780, "total_steps": 2156, "loss": 1.2364, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.551774423515378e-05, "epoch": 0.72, "percentage": 36.18, "elapsed_time": "3:04:10", "remaining_time": "5:24:55"}
79
+ {"current_steps": 790, "total_steps": 2156, "loss": 1.2482, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.518616184618288e-05, "epoch": 0.73, "percentage": 36.64, "elapsed_time": "3:06:29", "remaining_time": "5:22:27"}
80
+ {"current_steps": 800, "total_steps": 2156, "loss": 1.2327, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.4852416713189526e-05, "epoch": 0.74, "percentage": 37.11, "elapsed_time": "3:08:51", "remaining_time": "5:20:07"}
81
+ {"current_steps": 810, "total_steps": 2156, "loss": 1.2548, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.4516579697534705e-05, "epoch": 0.75, "percentage": 37.57, "elapsed_time": "3:11:15", "remaining_time": "5:17:48"}
82
+ {"current_steps": 820, "total_steps": 2156, "loss": 1.2469, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.417872210473162e-05, "epoch": 0.76, "percentage": 38.03, "elapsed_time": "3:13:34", "remaining_time": "5:15:22"}
83
+ {"current_steps": 830, "total_steps": 2156, "loss": 1.2424, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3838915669306034e-05, "epoch": 0.77, "percentage": 38.5, "elapsed_time": "3:15:54", "remaining_time": "5:12:59"}
84
+ {"current_steps": 840, "total_steps": 2156, "loss": 1.2438, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.349723253956542e-05, "epoch": 0.78, "percentage": 38.96, "elapsed_time": "3:18:16", "remaining_time": "5:10:38"}
85
+ {"current_steps": 850, "total_steps": 2156, "loss": 1.243, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.315374526228036e-05, "epoch": 0.79, "percentage": 39.42, "elapsed_time": "3:20:38", "remaining_time": "5:08:16"}
86
+ {"current_steps": 860, "total_steps": 2156, "loss": 1.2416, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.2808526767281225e-05, "epoch": 0.8, "percentage": 39.89, "elapsed_time": "3:22:59", "remaining_time": "5:05:54"}
87
+ {"current_steps": 870, "total_steps": 2156, "loss": 1.2399, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.246165035197364e-05, "epoch": 0.81, "percentage": 40.35, "elapsed_time": "3:25:22", "remaining_time": "5:03:34"}
88
+ {"current_steps": 880, "total_steps": 2156, "loss": 1.239, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.211318966577581e-05, "epoch": 0.82, "percentage": 40.82, "elapsed_time": "3:27:41", "remaining_time": "5:01:08"}
89
+ {"current_steps": 890, "total_steps": 2156, "loss": 1.2498, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.176321869448116e-05, "epoch": 0.83, "percentage": 41.28, "elapsed_time": "3:30:03", "remaining_time": "4:58:48"}
90
+ {"current_steps": 900, "total_steps": 2156, "loss": 1.2438, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.1411811744549536e-05, "epoch": 0.83, "percentage": 41.74, "elapsed_time": "3:32:27", "remaining_time": "4:56:29"}
91
+ {"current_steps": 910, "total_steps": 2156, "loss": 1.233, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.105904342733032e-05, "epoch": 0.84, "percentage": 42.21, "elapsed_time": "3:34:51", "remaining_time": "4:54:11"}
92
+ {"current_steps": 920, "total_steps": 2156, "loss": 1.2315, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.070498864322081e-05, "epoch": 0.85, "percentage": 42.67, "elapsed_time": "3:37:13", "remaining_time": "4:51:50"}
93
+ {"current_steps": 930, "total_steps": 2156, "loss": 1.232, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.034972256576328e-05, "epoch": 0.86, "percentage": 43.14, "elapsed_time": "3:39:35", "remaining_time": "4:49:28"}
94
+ {"current_steps": 940, "total_steps": 2156, "loss": 1.2467, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.999332062568395e-05, "epoch": 0.87, "percentage": 43.6, "elapsed_time": "3:42:00", "remaining_time": "4:47:11"}
95
+ {"current_steps": 950, "total_steps": 2156, "loss": 1.2483, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9635858494877384e-05, "epoch": 0.88, "percentage": 44.06, "elapsed_time": "3:44:20", "remaining_time": "4:44:47"}
96
+ {"current_steps": 960, "total_steps": 2156, "loss": 1.2377, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9277412070339782e-05, "epoch": 0.89, "percentage": 44.53, "elapsed_time": "3:46:43", "remaining_time": "4:42:27"}
97
+ {"current_steps": 970, "total_steps": 2156, "loss": 1.2347, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.891805745805429e-05, "epoch": 0.9, "percentage": 44.99, "elapsed_time": "3:49:02", "remaining_time": "4:40:03"}
98
+ {"current_steps": 980, "total_steps": 2156, "loss": 1.2428, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8557870956832132e-05, "epoch": 0.91, "percentage": 45.45, "elapsed_time": "3:51:26", "remaining_time": "4:37:44"}
99
+ {"current_steps": 990, "total_steps": 2156, "loss": 1.244, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8196929042112652e-05, "epoch": 0.92, "percentage": 45.92, "elapsed_time": "3:53:49", "remaining_time": "4:35:24"}
100
+ {"current_steps": 1000, "total_steps": 2156, "loss": 1.2317, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.783530834972594e-05, "epoch": 0.93, "percentage": 46.38, "elapsed_time": "3:56:15", "remaining_time": "4:33:06"}
101
+ {"current_steps": 1010, "total_steps": 2156, "loss": 1.2421, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7473085659621377e-05, "epoch": 0.94, "percentage": 46.85, "elapsed_time": "3:58:37", "remaining_time": "4:30:45"}
102
+ {"current_steps": 1020, "total_steps": 2156, "loss": 1.2446, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.711033787956555e-05, "epoch": 0.95, "percentage": 47.31, "elapsed_time": "4:01:01", "remaining_time": "4:28:26"}
103
+ {"current_steps": 1030, "total_steps": 2156, "loss": 1.2471, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.6747142028813105e-05, "epoch": 0.96, "percentage": 47.77, "elapsed_time": "4:03:24", "remaining_time": "4:26:05"}
104
+ {"current_steps": 1040, "total_steps": 2156, "loss": 1.2403, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.638357522175383e-05, "epoch": 0.96, "percentage": 48.24, "elapsed_time": "4:05:43", "remaining_time": "4:23:41"}
105
+ {"current_steps": 1050, "total_steps": 2156, "loss": 1.2413, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.6019714651539646e-05, "epoch": 0.97, "percentage": 48.7, "elapsed_time": "4:08:11", "remaining_time": "4:21:25"}
106
+ {"current_steps": 1060, "total_steps": 2156, "loss": 1.2341, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.565563757369475e-05, "epoch": 0.98, "percentage": 49.17, "elapsed_time": "4:10:31", "remaining_time": "4:19:02"}
107
+ {"current_steps": 1070, "total_steps": 2156, "loss": 1.236, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.529142128971268e-05, "epoch": 0.99, "percentage": 49.63, "elapsed_time": "4:12:53", "remaining_time": "4:16:39"}
108
+ {"current_steps": 1080, "total_steps": 2156, "loss": 1.2423, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.492714313064342e-05, "epoch": 1.0, "percentage": 50.09, "elapsed_time": "4:15:12", "remaining_time": "4:14:15"}
109
+ {"current_steps": 1090, "total_steps": 2156, "loss": 1.2253, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.45628804406744e-05, "epoch": 1.01, "percentage": 50.56, "elapsed_time": "4:17:33", "remaining_time": "4:11:53"}
110
+ {"current_steps": 1100, "total_steps": 2156, "loss": 1.2377, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.419871056070862e-05, "epoch": 1.02, "percentage": 51.02, "elapsed_time": "4:19:52", "remaining_time": "4:09:28"}
111
+ {"current_steps": 1110, "total_steps": 2156, "loss": 1.2395, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.3834710811943514e-05, "epoch": 1.03, "percentage": 51.48, "elapsed_time": "4:22:18", "remaining_time": "4:07:11"}
112
+ {"current_steps": 1120, "total_steps": 2156, "loss": 1.2238, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.3470958479453938e-05, "epoch": 1.04, "percentage": 51.95, "elapsed_time": "4:24:41", "remaining_time": "4:04:50"}
113
+ {"current_steps": 1130, "total_steps": 2156, "loss": 1.2045, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.3107530795782877e-05, "epoch": 1.05, "percentage": 52.41, "elapsed_time": "4:27:04", "remaining_time": "4:02:29"}
114
+ {"current_steps": 1140, "total_steps": 2156, "loss": 1.2302, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2744504924543313e-05, "epoch": 1.06, "percentage": 52.88, "elapsed_time": "4:29:25", "remaining_time": "4:00:07"}
115
+ {"current_steps": 1150, "total_steps": 2156, "loss": 1.233, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.23819579440347e-05, "epoch": 1.07, "percentage": 53.34, "elapsed_time": "4:31:52", "remaining_time": "3:57:49"}
116
+ {"current_steps": 1160, "total_steps": 2156, "loss": 1.2341, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2019966830877545e-05, "epoch": 1.08, "percentage": 53.8, "elapsed_time": "4:34:11", "remaining_time": "3:55:25"}
117
+ {"current_steps": 1170, "total_steps": 2156, "loss": 1.2294, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.1658608443669635e-05, "epoch": 1.08, "percentage": 54.27, "elapsed_time": "4:36:35", "remaining_time": "3:53:05"}
118
+ {"current_steps": 1180, "total_steps": 2156, "loss": 1.2308, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.1297959506667224e-05, "epoch": 1.09, "percentage": 54.73, "elapsed_time": "4:38:55", "remaining_time": "3:50:42"}
119
+ {"current_steps": 1190, "total_steps": 2156, "loss": 1.2427, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0938096593494855e-05, "epoch": 1.1, "percentage": 55.19, "elapsed_time": "4:41:12", "remaining_time": "3:48:16"}
120
+ {"current_steps": 1200, "total_steps": 2156, "loss": 1.2404, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.057909611088709e-05, "epoch": 1.11, "percentage": 55.66, "elapsed_time": "4:43:32", "remaining_time": "3:45:53"}
121
+ {"current_steps": 1210, "total_steps": 2156, "loss": 1.2245, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.02210342824657e-05, "epoch": 1.12, "percentage": 56.12, "elapsed_time": "4:45:58", "remaining_time": "3:43:35"}
122
+ {"current_steps": 1220, "total_steps": 2156, "loss": 1.2288, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9863987132555706e-05, "epoch": 1.13, "percentage": 56.59, "elapsed_time": "4:48:20", "remaining_time": "3:41:13"}
123
+ {"current_steps": 1230, "total_steps": 2156, "loss": 1.2401, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9508030470043806e-05, "epoch": 1.14, "percentage": 57.05, "elapsed_time": "4:50:46", "remaining_time": "3:38:54"}
124
+ {"current_steps": 1240, "total_steps": 2156, "loss": 1.2275, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.915323987228247e-05, "epoch": 1.15, "percentage": 57.51, "elapsed_time": "4:53:04", "remaining_time": "3:36:29"}
125
+ {"current_steps": 1250, "total_steps": 2156, "loss": 1.2251, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8799690669043212e-05, "epoch": 1.16, "percentage": 57.98, "elapsed_time": "4:55:27", "remaining_time": "3:34:08"}
126
+ {"current_steps": 1260, "total_steps": 2156, "loss": 1.238, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8447457926522454e-05, "epoch": 1.17, "percentage": 58.44, "elapsed_time": "4:57:48", "remaining_time": "3:31:46"}
127
+ {"current_steps": 1270, "total_steps": 2156, "loss": 1.2291, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8096616431403325e-05, "epoch": 1.18, "percentage": 58.91, "elapsed_time": "5:00:05", "remaining_time": "3:29:21"}
128
+ {"current_steps": 1280, "total_steps": 2156, "loss": 1.2162, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7747240674976857e-05, "epoch": 1.19, "percentage": 59.37, "elapsed_time": "5:02:27", "remaining_time": "3:26:59"}
129
+ {"current_steps": 1290, "total_steps": 2156, "loss": 1.2315, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7399404837325796e-05, "epoch": 1.2, "percentage": 59.83, "elapsed_time": "5:04:51", "remaining_time": "3:24:39"}
130
+ {"current_steps": 1300, "total_steps": 2156, "loss": 1.2383, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7053182771574633e-05, "epoch": 1.21, "percentage": 60.3, "elapsed_time": "5:07:10", "remaining_time": "3:22:15"}
131
+ {"current_steps": 1310, "total_steps": 2156, "loss": 1.2254, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6708647988208887e-05, "epoch": 1.21, "percentage": 60.76, "elapsed_time": "5:09:34", "remaining_time": "3:19:55"}
132
+ {"current_steps": 1320, "total_steps": 2156, "loss": 1.2388, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6365873639467315e-05, "epoch": 1.22, "percentage": 61.22, "elapsed_time": "5:11:56", "remaining_time": "3:17:34"}
133
+ {"current_steps": 1330, "total_steps": 2156, "loss": 1.2213, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.602493250381003e-05, "epoch": 1.23, "percentage": 61.69, "elapsed_time": "5:14:18", "remaining_time": "3:15:11"}
134
+ {"current_steps": 1340, "total_steps": 2156, "loss": 1.2366, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5685896970466123e-05, "epoch": 1.24, "percentage": 62.15, "elapsed_time": "5:16:36", "remaining_time": "3:12:47"}
135
+ {"current_steps": 1350, "total_steps": 2156, "loss": 1.2265, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.534883902406375e-05, "epoch": 1.25, "percentage": 62.62, "elapsed_time": "5:19:01", "remaining_time": "3:10:27"}
136
+ {"current_steps": 1360, "total_steps": 2156, "loss": 1.2094, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5013830229346326e-05, "epoch": 1.26, "percentage": 63.08, "elapsed_time": "5:21:24", "remaining_time": "3:08:07"}
137
+ {"current_steps": 1370, "total_steps": 2156, "loss": 1.2314, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4680941715977722e-05, "epoch": 1.27, "percentage": 63.54, "elapsed_time": "5:23:49", "remaining_time": "3:05:46"}
138
+ {"current_steps": 1380, "total_steps": 2156, "loss": 1.2254, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4350244163439892e-05, "epoch": 1.28, "percentage": 64.01, "elapsed_time": "5:26:07", "remaining_time": "3:03:23"}
139
+ {"current_steps": 1390, "total_steps": 2156, "loss": 1.2287, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4021807786026108e-05, "epoch": 1.29, "percentage": 64.47, "elapsed_time": "5:28:29", "remaining_time": "3:01:01"}
140
+ {"current_steps": 1400, "total_steps": 2156, "loss": 1.2235, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3695702317932862e-05, "epoch": 1.3, "percentage": 64.94, "elapsed_time": "5:30:55", "remaining_time": "2:58:42"}
141
+ {"current_steps": 1410, "total_steps": 2156, "loss": 1.2209, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.337199699845387e-05, "epoch": 1.31, "percentage": 65.4, "elapsed_time": "5:33:10", "remaining_time": "2:56:16"}
142
+ {"current_steps": 1420, "total_steps": 2156, "loss": 1.2296, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3050760557279023e-05, "epoch": 1.32, "percentage": 65.86, "elapsed_time": "5:35:29", "remaining_time": "2:53:53"}
143
+ {"current_steps": 1430, "total_steps": 2156, "loss": 1.2363, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2732061199901562e-05, "epoch": 1.33, "percentage": 66.33, "elapsed_time": "5:37:52", "remaining_time": "2:51:32"}
144
+ {"current_steps": 1440, "total_steps": 2156, "loss": 1.2116, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2415966593136547e-05, "epoch": 1.34, "percentage": 66.79, "elapsed_time": "5:40:16", "remaining_time": "2:49:11"}
145
+ {"current_steps": 1450, "total_steps": 2156, "loss": 1.2299, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2102543850753808e-05, "epoch": 1.34, "percentage": 67.25, "elapsed_time": "5:42:38", "remaining_time": "2:46:50"}
146
+ {"current_steps": 1460, "total_steps": 2156, "loss": 1.2204, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1791859519228138e-05, "epoch": 1.35, "percentage": 67.72, "elapsed_time": "5:45:03", "remaining_time": "2:44:29"}
147
+ {"current_steps": 1470, "total_steps": 2156, "loss": 1.2271, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.148397956361007e-05, "epoch": 1.36, "percentage": 68.18, "elapsed_time": "5:47:21", "remaining_time": "2:42:05"}
148
+ {"current_steps": 1480, "total_steps": 2156, "loss": 1.2327, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1178969353520018e-05, "epoch": 1.37, "percentage": 68.65, "elapsed_time": "5:49:44", "remaining_time": "2:39:44"}
149
+ {"current_steps": 1490, "total_steps": 2156, "loss": 1.2252, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.087689364926897e-05, "epoch": 1.38, "percentage": 69.11, "elapsed_time": "5:52:04", "remaining_time": "2:37:22"}
150
+ {"current_steps": 1500, "total_steps": 2156, "loss": 1.2305, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0577816588108378e-05, "epoch": 1.39, "percentage": 69.57, "elapsed_time": "5:54:28", "remaining_time": "2:35:01"}
151
+ {"current_steps": 1510, "total_steps": 2156, "loss": 1.2179, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0281801670612449e-05, "epoch": 1.4, "percentage": 70.04, "elapsed_time": "5:56:53", "remaining_time": "2:32:40"}
152
+ {"current_steps": 1520, "total_steps": 2156, "loss": 1.229, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.988911747195603e-06, "epoch": 1.41, "percentage": 70.5, "elapsed_time": "5:59:13", "remaining_time": "2:30:18"}
153
+ {"current_steps": 1530, "total_steps": 2156, "loss": 1.2192, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.699209004767953e-06, "epoch": 1.42, "percentage": 70.96, "elapsed_time": "6:01:36", "remaining_time": "2:27:56"}
154
+ {"current_steps": 1540, "total_steps": 2156, "loss": 1.2322, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.412754953531663e-06, "epoch": 1.43, "percentage": 71.43, "elapsed_time": "6:03:59", "remaining_time": "2:25:35"}
155
+ {"current_steps": 1550, "total_steps": 2156, "loss": 1.2193, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.129610413921005e-06, "epoch": 1.44, "percentage": 71.89, "elapsed_time": "6:06:22", "remaining_time": "2:23:14"}
156
+ {"current_steps": 1560, "total_steps": 2156, "loss": 1.2149, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.849835503688847e-06, "epoch": 1.45, "percentage": 72.36, "elapsed_time": "6:08:49", "remaining_time": "2:20:54"}
157
+ {"current_steps": 1570, "total_steps": 2156, "loss": 1.217, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.57348962514234e-06, "epoch": 1.46, "percentage": 72.82, "elapsed_time": "6:11:09", "remaining_time": "2:18:32"}
158
+ {"current_steps": 1580, "total_steps": 2156, "loss": 1.2368, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.30063145253053e-06, "epoch": 1.46, "percentage": 73.28, "elapsed_time": "6:13:27", "remaining_time": "2:16:08"}
159
+ {"current_steps": 1590, "total_steps": 2156, "loss": 1.2264, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.031318919586523e-06, "epoch": 1.47, "percentage": 73.75, "elapsed_time": "6:15:52", "remaining_time": "2:13:48"}
160
+ {"current_steps": 1600, "total_steps": 2156, "loss": 1.2243, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.7656092072269e-06, "epoch": 1.48, "percentage": 74.21, "elapsed_time": "6:18:11", "remaining_time": "2:11:25"}
161
+ {"current_steps": 1610, "total_steps": 2156, "loss": 1.2427, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.503558731410959e-06, "epoch": 1.49, "percentage": 74.68, "elapsed_time": "6:20:32", "remaining_time": "2:09:03"}
162
+ {"current_steps": 1620, "total_steps": 2156, "loss": 1.2207, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.245223131162376e-06, "epoch": 1.5, "percentage": 75.14, "elapsed_time": "6:22:56", "remaining_time": "2:06:42"}
163
+ {"current_steps": 1630, "total_steps": 2156, "loss": 1.2211, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.9906572567558285e-06, "epoch": 1.51, "percentage": 75.6, "elapsed_time": "6:25:19", "remaining_time": "2:04:20"}
164
+ {"current_steps": 1640, "total_steps": 2156, "loss": 1.2343, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.739915158071106e-06, "epoch": 1.52, "percentage": 76.07, "elapsed_time": "6:27:35", "remaining_time": "2:01:57"}
165
+ {"current_steps": 1650, "total_steps": 2156, "loss": 1.2348, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.493050073117116e-06, "epoch": 1.53, "percentage": 76.53, "elapsed_time": "6:29:57", "remaining_time": "1:59:35"}
166
+ {"current_steps": 1660, "total_steps": 2156, "loss": 1.2189, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.250114416728298e-06, "epoch": 1.54, "percentage": 76.99, "elapsed_time": "6:32:14", "remaining_time": "1:57:12"}
167
+ {"current_steps": 1670, "total_steps": 2156, "loss": 1.2207, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.011159769435823e-06, "epoch": 1.55, "percentage": 77.46, "elapsed_time": "6:34:30", "remaining_time": "1:54:48"}
168
+ {"current_steps": 1680, "total_steps": 2156, "loss": 1.2315, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.776236866515947e-06, "epoch": 1.56, "percentage": 77.92, "elapsed_time": "6:36:48", "remaining_time": "1:52:25"}
169
+ {"current_steps": 1690, "total_steps": 2156, "loss": 1.2334, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.545395587217763e-06, "epoch": 1.57, "percentage": 78.39, "elapsed_time": "6:39:05", "remaining_time": "1:50:02"}
170
+ {"current_steps": 1700, "total_steps": 2156, "loss": 1.232, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.318684944172752e-06, "epoch": 1.58, "percentage": 78.85, "elapsed_time": "6:41:25", "remaining_time": "1:47:40"}
171
+ {"current_steps": 1710, "total_steps": 2156, "loss": 1.2235, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.096153072988366e-06, "epoch": 1.59, "percentage": 79.31, "elapsed_time": "6:43:44", "remaining_time": "1:45:18"}
172
+ {"current_steps": 1720, "total_steps": 2156, "loss": 1.2244, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8778472220277874e-06, "epoch": 1.59, "percentage": 79.78, "elapsed_time": "6:46:02", "remaining_time": "1:42:55"}
173
+ {"current_steps": 1730, "total_steps": 2156, "loss": 1.2197, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6638137423780395e-06, "epoch": 1.6, "percentage": 80.24, "elapsed_time": "6:48:25", "remaining_time": "1:40:34"}
174
+ {"current_steps": 1740, "total_steps": 2156, "loss": 1.2242, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.454098078008667e-06, "epoch": 1.61, "percentage": 80.71, "elapsed_time": "6:50:51", "remaining_time": "1:38:13"}
175
+ {"current_steps": 1750, "total_steps": 2156, "loss": 1.2382, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.248744756122986e-06, "epoch": 1.62, "percentage": 81.17, "elapsed_time": "6:53:10", "remaining_time": "1:35:51"}
176
+ {"current_steps": 1760, "total_steps": 2156, "loss": 1.2265, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.047797377703985e-06, "epoch": 1.63, "percentage": 81.63, "elapsed_time": "6:55:29", "remaining_time": "1:33:29"}
177
+ {"current_steps": 1770, "total_steps": 2156, "loss": 1.2153, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.851298608256892e-06, "epoch": 1.64, "percentage": 82.1, "elapsed_time": "6:57:52", "remaining_time": "1:31:07"}
178
+ {"current_steps": 1780, "total_steps": 2156, "loss": 1.2198, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6592901687503566e-06, "epoch": 1.65, "percentage": 82.56, "elapsed_time": "7:00:12", "remaining_time": "1:28:45"}
179
+ {"current_steps": 1790, "total_steps": 2156, "loss": 1.23, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.471812826758178e-06, "epoch": 1.66, "percentage": 83.02, "elapsed_time": "7:02:31", "remaining_time": "1:26:23"}
180
+ {"current_steps": 1800, "total_steps": 2156, "loss": 1.2268, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.288906387803464e-06, "epoch": 1.67, "percentage": 83.49, "elapsed_time": "7:04:53", "remaining_time": "1:24:02"}
181
+ {"current_steps": 1810, "total_steps": 2156, "loss": 1.2228, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.1106096869070483e-06, "epoch": 1.68, "percentage": 83.95, "elapsed_time": "7:07:14", "remaining_time": "1:21:40"}
182
+ {"current_steps": 1820, "total_steps": 2156, "loss": 1.2288, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9369605803419715e-06, "epoch": 1.69, "percentage": 84.42, "elapsed_time": "7:09:30", "remaining_time": "1:19:17"}
183
+ {"current_steps": 1830, "total_steps": 2156, "loss": 1.228, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.767995937595766e-06, "epoch": 1.7, "percentage": 84.88, "elapsed_time": "7:11:49", "remaining_time": "1:16:55"}
184
+ {"current_steps": 1840, "total_steps": 2156, "loss": 1.2178, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.6037516335422728e-06, "epoch": 1.71, "percentage": 85.34, "elapsed_time": "7:14:09", "remaining_time": "1:14:33"}
185
+ {"current_steps": 1850, "total_steps": 2156, "loss": 1.2386, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.4442625408246074e-06, "epoch": 1.72, "percentage": 85.81, "elapsed_time": "7:16:28", "remaining_time": "1:12:11"}
186
+ {"current_steps": 1860, "total_steps": 2156, "loss": 1.2274, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.289562522450947e-06, "epoch": 1.72, "percentage": 86.27, "elapsed_time": "7:18:53", "remaining_time": "1:09:50"}
187
+ {"current_steps": 1870, "total_steps": 2156, "loss": 1.2341, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.1396844246046903e-06, "epoch": 1.73, "percentage": 86.73, "elapsed_time": "7:21:17", "remaining_time": "1:07:29"}
188
+ {"current_steps": 1880, "total_steps": 2156, "loss": 1.2118, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9946600696704592e-06, "epoch": 1.74, "percentage": 87.2, "elapsed_time": "7:23:46", "remaining_time": "1:05:08"}
189
+ {"current_steps": 1890, "total_steps": 2156, "loss": 1.2219, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8545202494775509e-06, "epoch": 1.75, "percentage": 87.66, "elapsed_time": "7:26:09", "remaining_time": "1:02:47"}
190
+ {"current_steps": 1900, "total_steps": 2156, "loss": 1.2268, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7192947187621434e-06, "epoch": 1.76, "percentage": 88.13, "elapsed_time": "7:28:32", "remaining_time": "1:00:26"}
191
+ {"current_steps": 1910, "total_steps": 2156, "loss": 1.2183, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5890121888497366e-06, "epoch": 1.77, "percentage": 88.59, "elapsed_time": "7:30:53", "remaining_time": "0:58:04"}
192
+ {"current_steps": 1920, "total_steps": 2156, "loss": 1.2253, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.463700321559075e-06, "epoch": 1.78, "percentage": 89.05, "elapsed_time": "7:33:19", "remaining_time": "0:55:43"}
193
+ {"current_steps": 1930, "total_steps": 2156, "loss": 1.2262, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3433857233289714e-06, "epoch": 1.79, "percentage": 89.52, "elapsed_time": "7:35:40", "remaining_time": "0:53:21"}
194
+ {"current_steps": 1940, "total_steps": 2156, "loss": 1.2232, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2280939395691859e-06, "epoch": 1.8, "percentage": 89.98, "elapsed_time": "7:37:59", "remaining_time": "0:50:59"}
195
+ {"current_steps": 1950, "total_steps": 2156, "loss": 1.2376, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1178494492365465e-06, "epoch": 1.81, "percentage": 90.45, "elapsed_time": "7:40:30", "remaining_time": "0:48:38"}
196
+ {"current_steps": 1960, "total_steps": 2156, "loss": 1.2287, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0126756596375686e-06, "epoch": 1.82, "percentage": 90.91, "elapsed_time": "7:42:48", "remaining_time": "0:46:16"}
197
+ {"current_steps": 1970, "total_steps": 2156, "loss": 1.2354, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.125949014585383e-07, "epoch": 1.83, "percentage": 91.37, "elapsed_time": "7:45:10", "remaining_time": "0:43:55"}
198
+ {"current_steps": 1980, "total_steps": 2156, "loss": 1.224, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.176284240242638e-07, "epoch": 1.84, "percentage": 91.84, "elapsed_time": "7:47:30", "remaining_time": "0:41:33"}
199
+ {"current_steps": 1990, "total_steps": 2156, "loss": 1.2214, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.277963907863478e-07, "epoch": 1.85, "percentage": 92.3, "elapsed_time": "7:49:52", "remaining_time": "0:39:11"}
200
+ {"current_steps": 2000, "total_steps": 2156, "loss": 1.2214, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.431178750420513e-07, "epoch": 1.85, "percentage": 92.76, "elapsed_time": "7:52:13", "remaining_time": "0:36:50"}
201
+ {"current_steps": 2010, "total_steps": 2156, "loss": 1.225, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.636108558846359e-07, "epoch": 1.86, "percentage": 93.23, "elapsed_time": "7:54:36", "remaining_time": "0:34:28"}
202
+ {"current_steps": 2020, "total_steps": 2156, "loss": 1.2373, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.892922143859918e-07, "epoch": 1.87, "percentage": 93.69, "elapsed_time": "7:56:57", "remaining_time": "0:32:06"}
203
+ {"current_steps": 2030, "total_steps": 2156, "loss": 1.2268, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.201777300124249e-07, "epoch": 1.88, "percentage": 94.16, "elapsed_time": "7:59:17", "remaining_time": "0:29:44"}
204
+ {"current_steps": 2040, "total_steps": 2156, "loss": 1.2209, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.562820772743392e-07, "epoch": 1.89, "percentage": 94.62, "elapsed_time": "8:01:43", "remaining_time": "0:27:23"}
205
+ {"current_steps": 2050, "total_steps": 2156, "loss": 1.2184, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9761882261050143e-07, "epoch": 1.9, "percentage": 95.08, "elapsed_time": "8:04:08", "remaining_time": "0:25:02"}
206
+ {"current_steps": 2060, "total_steps": 2156, "loss": 1.2265, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.4420042150761215e-07, "epoch": 1.91, "percentage": 95.55, "elapsed_time": "8:06:28", "remaining_time": "0:22:40"}
207
+ {"current_steps": 2070, "total_steps": 2156, "loss": 1.227, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9603821585572058e-07, "epoch": 1.92, "percentage": 96.01, "elapsed_time": "8:08:51", "remaining_time": "0:20:18"}
208
+ {"current_steps": 2080, "total_steps": 2156, "loss": 1.2172, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.531424315400931e-07, "epoch": 1.93, "percentage": 96.47, "elapsed_time": "8:11:20", "remaining_time": "0:17:57"}
209
+ {"current_steps": 2090, "total_steps": 2156, "loss": 1.2196, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1552217627004425e-07, "epoch": 1.94, "percentage": 96.94, "elapsed_time": "8:13:39", "remaining_time": "0:15:35"}
210
+ {"current_steps": 2100, "total_steps": 2156, "loss": 1.2381, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.318543764516961e-08, "epoch": 1.95, "percentage": 97.4, "elapsed_time": "8:15:59", "remaining_time": "0:13:13"}
211
+ {"current_steps": 2110, "total_steps": 2156, "loss": 1.2288, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.613908145939428e-08, "epoch": 1.96, "percentage": 97.87, "elapsed_time": "8:18:23", "remaining_time": "0:10:51"}
212
+ {"current_steps": 2120, "total_steps": 2156, "loss": 1.2387, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.438885024322769e-08, "epoch": 1.97, "percentage": 98.33, "elapsed_time": "8:20:45", "remaining_time": "0:08:30"}
213
+ {"current_steps": 2130, "total_steps": 2156, "loss": 1.2357, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7939362044494534e-08, "epoch": 1.97, "percentage": 98.79, "elapsed_time": "8:23:04", "remaining_time": "0:06:08"}
214
+ {"current_steps": 2140, "total_steps": 2156, "loss": 1.2338, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.794109447824659e-09, "epoch": 1.98, "percentage": 99.26, "elapsed_time": "8:25:26", "remaining_time": "0:03:46"}
215
+ {"current_steps": 2150, "total_steps": 2156, "loss": 1.2226, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.554588330934478e-10, "epoch": 1.99, "percentage": 99.72, "elapsed_time": "8:27:54", "remaining_time": "0:01:25"}
216
+ {"current_steps": 2156, "total_steps": 2156, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.0, "percentage": 100.0, "elapsed_time": "8:29:20", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,1318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.999072786277237,
5
+ "eval_steps": 500,
6
+ "global_step": 2156,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01,
13
+ "learning_rate": 4.999734597774032e-05,
14
+ "loss": 1.6199,
15
+ "step": 10
16
+ },
17
+ {
18
+ "epoch": 0.02,
19
+ "learning_rate": 4.998938447446803e-05,
20
+ "loss": 1.5062,
21
+ "step": 20
22
+ },
23
+ {
24
+ "epoch": 0.03,
25
+ "learning_rate": 4.997611718058365e-05,
26
+ "loss": 1.4138,
27
+ "step": 30
28
+ },
29
+ {
30
+ "epoch": 0.04,
31
+ "learning_rate": 4.9957546913022665e-05,
32
+ "loss": 1.369,
33
+ "step": 40
34
+ },
35
+ {
36
+ "epoch": 0.05,
37
+ "learning_rate": 4.993367761465736e-05,
38
+ "loss": 1.3408,
39
+ "step": 50
40
+ },
41
+ {
42
+ "epoch": 0.06,
43
+ "learning_rate": 4.9904514353459654e-05,
44
+ "loss": 1.3321,
45
+ "step": 60
46
+ },
47
+ {
48
+ "epoch": 0.06,
49
+ "learning_rate": 4.9870063321425105e-05,
50
+ "loss": 1.3251,
51
+ "step": 70
52
+ },
53
+ {
54
+ "epoch": 0.07,
55
+ "learning_rate": 4.983033183325818e-05,
56
+ "loss": 1.3228,
57
+ "step": 80
58
+ },
59
+ {
60
+ "epoch": 0.08,
61
+ "learning_rate": 4.97853283248192e-05,
62
+ "loss": 1.3111,
63
+ "step": 90
64
+ },
65
+ {
66
+ "epoch": 0.09,
67
+ "learning_rate": 4.973506235133323e-05,
68
+ "loss": 1.3013,
69
+ "step": 100
70
+ },
71
+ {
72
+ "epoch": 0.1,
73
+ "learning_rate": 4.967954458536126e-05,
74
+ "loss": 1.3004,
75
+ "step": 110
76
+ },
77
+ {
78
+ "epoch": 0.11,
79
+ "learning_rate": 4.9618786814534226e-05,
80
+ "loss": 1.2959,
81
+ "step": 120
82
+ },
83
+ {
84
+ "epoch": 0.12,
85
+ "learning_rate": 4.955280193905022e-05,
86
+ "loss": 1.2969,
87
+ "step": 130
88
+ },
89
+ {
90
+ "epoch": 0.13,
91
+ "learning_rate": 4.948160396893553e-05,
92
+ "loss": 1.2879,
93
+ "step": 140
94
+ },
95
+ {
96
+ "epoch": 0.14,
97
+ "learning_rate": 4.9405208021069946e-05,
98
+ "loss": 1.277,
99
+ "step": 150
100
+ },
101
+ {
102
+ "epoch": 0.15,
103
+ "learning_rate": 4.9323630315977156e-05,
104
+ "loss": 1.283,
105
+ "step": 160
106
+ },
107
+ {
108
+ "epoch": 0.16,
109
+ "learning_rate": 4.9236888174380784e-05,
110
+ "loss": 1.288,
111
+ "step": 170
112
+ },
113
+ {
114
+ "epoch": 0.17,
115
+ "learning_rate": 4.91450000135268e-05,
116
+ "loss": 1.287,
117
+ "step": 180
118
+ },
119
+ {
120
+ "epoch": 0.18,
121
+ "learning_rate": 4.9047985343273154e-05,
122
+ "loss": 1.2726,
123
+ "step": 190
124
+ },
125
+ {
126
+ "epoch": 0.19,
127
+ "learning_rate": 4.894586476194739e-05,
128
+ "loss": 1.2808,
129
+ "step": 200
130
+ },
131
+ {
132
+ "epoch": 0.19,
133
+ "learning_rate": 4.883865995197319e-05,
134
+ "loss": 1.2657,
135
+ "step": 210
136
+ },
137
+ {
138
+ "epoch": 0.2,
139
+ "learning_rate": 4.8726393675266716e-05,
140
+ "loss": 1.275,
141
+ "step": 220
142
+ },
143
+ {
144
+ "epoch": 0.21,
145
+ "learning_rate": 4.860908976840376e-05,
146
+ "loss": 1.2667,
147
+ "step": 230
148
+ },
149
+ {
150
+ "epoch": 0.22,
151
+ "learning_rate": 4.848677313755872e-05,
152
+ "loss": 1.2715,
153
+ "step": 240
154
+ },
155
+ {
156
+ "epoch": 0.23,
157
+ "learning_rate": 4.835946975321647e-05,
158
+ "loss": 1.273,
159
+ "step": 250
160
+ },
161
+ {
162
+ "epoch": 0.24,
163
+ "learning_rate": 4.822720664465827e-05,
164
+ "loss": 1.2605,
165
+ "step": 260
166
+ },
167
+ {
168
+ "epoch": 0.25,
169
+ "learning_rate": 4.809001189422287e-05,
170
+ "loss": 1.2766,
171
+ "step": 270
172
+ },
173
+ {
174
+ "epoch": 0.26,
175
+ "learning_rate": 4.794791463134399e-05,
176
+ "loss": 1.262,
177
+ "step": 280
178
+ },
179
+ {
180
+ "epoch": 0.27,
181
+ "learning_rate": 4.780094502636552e-05,
182
+ "loss": 1.255,
183
+ "step": 290
184
+ },
185
+ {
186
+ "epoch": 0.28,
187
+ "learning_rate": 4.764913428413572e-05,
188
+ "loss": 1.2652,
189
+ "step": 300
190
+ },
191
+ {
192
+ "epoch": 0.29,
193
+ "learning_rate": 4.7492514637381727e-05,
194
+ "loss": 1.2668,
195
+ "step": 310
196
+ },
197
+ {
198
+ "epoch": 0.3,
199
+ "learning_rate": 4.733111933986583e-05,
200
+ "loss": 1.2621,
201
+ "step": 320
202
+ },
203
+ {
204
+ "epoch": 0.31,
205
+ "learning_rate": 4.716498265932501e-05,
206
+ "loss": 1.257,
207
+ "step": 330
208
+ },
209
+ {
210
+ "epoch": 0.32,
211
+ "learning_rate": 4.699413987019512e-05,
212
+ "loss": 1.2789,
213
+ "step": 340
214
+ },
215
+ {
216
+ "epoch": 0.32,
217
+ "learning_rate": 4.681862724612141e-05,
218
+ "loss": 1.2634,
219
+ "step": 350
220
+ },
221
+ {
222
+ "epoch": 0.33,
223
+ "learning_rate": 4.663848205225674e-05,
224
+ "loss": 1.2594,
225
+ "step": 360
226
+ },
227
+ {
228
+ "epoch": 0.34,
229
+ "learning_rate": 4.645374253734949e-05,
230
+ "loss": 1.26,
231
+ "step": 370
232
+ },
233
+ {
234
+ "epoch": 0.35,
235
+ "learning_rate": 4.626444792562244e-05,
236
+ "loss": 1.2514,
237
+ "step": 380
238
+ },
239
+ {
240
+ "epoch": 0.36,
241
+ "learning_rate": 4.607063840844463e-05,
242
+ "loss": 1.2506,
243
+ "step": 390
244
+ },
245
+ {
246
+ "epoch": 0.37,
247
+ "learning_rate": 4.587235513579791e-05,
248
+ "loss": 1.2648,
249
+ "step": 400
250
+ },
251
+ {
252
+ "epoch": 0.38,
253
+ "learning_rate": 4.5669640207539786e-05,
254
+ "loss": 1.2511,
255
+ "step": 410
256
+ },
257
+ {
258
+ "epoch": 0.39,
259
+ "learning_rate": 4.546253666446484e-05,
260
+ "loss": 1.2594,
261
+ "step": 420
262
+ },
263
+ {
264
+ "epoch": 0.4,
265
+ "learning_rate": 4.525108847916614e-05,
266
+ "loss": 1.2608,
267
+ "step": 430
268
+ },
269
+ {
270
+ "epoch": 0.41,
271
+ "learning_rate": 4.503534054669892e-05,
272
+ "loss": 1.2597,
273
+ "step": 440
274
+ },
275
+ {
276
+ "epoch": 0.42,
277
+ "learning_rate": 4.481533867504841e-05,
278
+ "loss": 1.2609,
279
+ "step": 450
280
+ },
281
+ {
282
+ "epoch": 0.43,
283
+ "learning_rate": 4.4591129575403765e-05,
284
+ "loss": 1.2505,
285
+ "step": 460
286
+ },
287
+ {
288
+ "epoch": 0.44,
289
+ "learning_rate": 4.43627608522403e-05,
290
+ "loss": 1.2481,
291
+ "step": 470
292
+ },
293
+ {
294
+ "epoch": 0.45,
295
+ "learning_rate": 4.4130280993211974e-05,
296
+ "loss": 1.2612,
297
+ "step": 480
298
+ },
299
+ {
300
+ "epoch": 0.45,
301
+ "learning_rate": 4.389373935885646e-05,
302
+ "loss": 1.2504,
303
+ "step": 490
304
+ },
305
+ {
306
+ "epoch": 0.46,
307
+ "learning_rate": 4.365318617211479e-05,
308
+ "loss": 1.2518,
309
+ "step": 500
310
+ },
311
+ {
312
+ "epoch": 0.47,
313
+ "learning_rate": 4.340867250766794e-05,
314
+ "loss": 1.2458,
315
+ "step": 510
316
+ },
317
+ {
318
+ "epoch": 0.48,
319
+ "learning_rate": 4.316025028109258e-05,
320
+ "loss": 1.2345,
321
+ "step": 520
322
+ },
323
+ {
324
+ "epoch": 0.49,
325
+ "learning_rate": 4.2907972237838225e-05,
326
+ "loss": 1.2521,
327
+ "step": 530
328
+ },
329
+ {
330
+ "epoch": 0.5,
331
+ "learning_rate": 4.2651891942028274e-05,
332
+ "loss": 1.2528,
333
+ "step": 540
334
+ },
335
+ {
336
+ "epoch": 0.51,
337
+ "learning_rate": 4.239206376508717e-05,
338
+ "loss": 1.2462,
339
+ "step": 550
340
+ },
341
+ {
342
+ "epoch": 0.52,
343
+ "learning_rate": 4.212854287419611e-05,
344
+ "loss": 1.2401,
345
+ "step": 560
346
+ },
347
+ {
348
+ "epoch": 0.53,
349
+ "learning_rate": 4.1861385220579934e-05,
350
+ "loss": 1.2496,
351
+ "step": 570
352
+ },
353
+ {
354
+ "epoch": 0.54,
355
+ "learning_rate": 4.1590647527627404e-05,
356
+ "loss": 1.2522,
357
+ "step": 580
358
+ },
359
+ {
360
+ "epoch": 0.55,
361
+ "learning_rate": 4.131638727884762e-05,
362
+ "loss": 1.2377,
363
+ "step": 590
364
+ },
365
+ {
366
+ "epoch": 0.56,
367
+ "learning_rate": 4.103866270566498e-05,
368
+ "loss": 1.2467,
369
+ "step": 600
370
+ },
371
+ {
372
+ "epoch": 0.57,
373
+ "learning_rate": 4.075753277505544e-05,
374
+ "loss": 1.2421,
375
+ "step": 610
376
+ },
377
+ {
378
+ "epoch": 0.57,
379
+ "learning_rate": 4.0473057177026484e-05,
380
+ "loss": 1.2455,
381
+ "step": 620
382
+ },
383
+ {
384
+ "epoch": 0.58,
385
+ "learning_rate": 4.018529631194369e-05,
386
+ "loss": 1.2294,
387
+ "step": 630
388
+ },
389
+ {
390
+ "epoch": 0.59,
391
+ "learning_rate": 3.989431127770635e-05,
392
+ "loss": 1.2509,
393
+ "step": 640
394
+ },
395
+ {
396
+ "epoch": 0.6,
397
+ "learning_rate": 3.960016385677513e-05,
398
+ "loss": 1.2354,
399
+ "step": 650
400
+ },
401
+ {
402
+ "epoch": 0.61,
403
+ "learning_rate": 3.9302916503054246e-05,
404
+ "loss": 1.2318,
405
+ "step": 660
406
+ },
407
+ {
408
+ "epoch": 0.62,
409
+ "learning_rate": 3.9002632328631164e-05,
410
+ "loss": 1.2376,
411
+ "step": 670
412
+ },
413
+ {
414
+ "epoch": 0.63,
415
+ "learning_rate": 3.8699375090376534e-05,
416
+ "loss": 1.2412,
417
+ "step": 680
418
+ },
419
+ {
420
+ "epoch": 0.64,
421
+ "learning_rate": 3.8393209176407223e-05,
422
+ "loss": 1.2479,
423
+ "step": 690
424
+ },
425
+ {
426
+ "epoch": 0.65,
427
+ "learning_rate": 3.8084199592415305e-05,
428
+ "loss": 1.2431,
429
+ "step": 700
430
+ },
431
+ {
432
+ "epoch": 0.66,
433
+ "learning_rate": 3.777241194786591e-05,
434
+ "loss": 1.245,
435
+ "step": 710
436
+ },
437
+ {
438
+ "epoch": 0.67,
439
+ "learning_rate": 3.745791244206697e-05,
440
+ "loss": 1.2393,
441
+ "step": 720
442
+ },
443
+ {
444
+ "epoch": 0.68,
445
+ "learning_rate": 3.714076785011359e-05,
446
+ "loss": 1.2473,
447
+ "step": 730
448
+ },
449
+ {
450
+ "epoch": 0.69,
451
+ "learning_rate": 3.682104550871031e-05,
452
+ "loss": 1.2552,
453
+ "step": 740
454
+ },
455
+ {
456
+ "epoch": 0.7,
457
+ "learning_rate": 3.649881330187401e-05,
458
+ "loss": 1.2356,
459
+ "step": 750
460
+ },
461
+ {
462
+ "epoch": 0.7,
463
+ "learning_rate": 3.617413964652067e-05,
464
+ "loss": 1.2442,
465
+ "step": 760
466
+ },
467
+ {
468
+ "epoch": 0.71,
469
+ "learning_rate": 3.5847093477938956e-05,
470
+ "loss": 1.2605,
471
+ "step": 770
472
+ },
473
+ {
474
+ "epoch": 0.72,
475
+ "learning_rate": 3.551774423515378e-05,
476
+ "loss": 1.2364,
477
+ "step": 780
478
+ },
479
+ {
480
+ "epoch": 0.73,
481
+ "learning_rate": 3.518616184618288e-05,
482
+ "loss": 1.2482,
483
+ "step": 790
484
+ },
485
+ {
486
+ "epoch": 0.74,
487
+ "learning_rate": 3.4852416713189526e-05,
488
+ "loss": 1.2327,
489
+ "step": 800
490
+ },
491
+ {
492
+ "epoch": 0.75,
493
+ "learning_rate": 3.4516579697534705e-05,
494
+ "loss": 1.2548,
495
+ "step": 810
496
+ },
497
+ {
498
+ "epoch": 0.76,
499
+ "learning_rate": 3.417872210473162e-05,
500
+ "loss": 1.2469,
501
+ "step": 820
502
+ },
503
+ {
504
+ "epoch": 0.77,
505
+ "learning_rate": 3.3838915669306034e-05,
506
+ "loss": 1.2424,
507
+ "step": 830
508
+ },
509
+ {
510
+ "epoch": 0.78,
511
+ "learning_rate": 3.349723253956542e-05,
512
+ "loss": 1.2438,
513
+ "step": 840
514
+ },
515
+ {
516
+ "epoch": 0.79,
517
+ "learning_rate": 3.315374526228036e-05,
518
+ "loss": 1.243,
519
+ "step": 850
520
+ },
521
+ {
522
+ "epoch": 0.8,
523
+ "learning_rate": 3.2808526767281225e-05,
524
+ "loss": 1.2416,
525
+ "step": 860
526
+ },
527
+ {
528
+ "epoch": 0.81,
529
+ "learning_rate": 3.246165035197364e-05,
530
+ "loss": 1.2399,
531
+ "step": 870
532
+ },
533
+ {
534
+ "epoch": 0.82,
535
+ "learning_rate": 3.211318966577581e-05,
536
+ "loss": 1.239,
537
+ "step": 880
538
+ },
539
+ {
540
+ "epoch": 0.83,
541
+ "learning_rate": 3.176321869448116e-05,
542
+ "loss": 1.2498,
543
+ "step": 890
544
+ },
545
+ {
546
+ "epoch": 0.83,
547
+ "learning_rate": 3.1411811744549536e-05,
548
+ "loss": 1.2438,
549
+ "step": 900
550
+ },
551
+ {
552
+ "epoch": 0.84,
553
+ "learning_rate": 3.105904342733032e-05,
554
+ "loss": 1.233,
555
+ "step": 910
556
+ },
557
+ {
558
+ "epoch": 0.85,
559
+ "learning_rate": 3.070498864322081e-05,
560
+ "loss": 1.2315,
561
+ "step": 920
562
+ },
563
+ {
564
+ "epoch": 0.86,
565
+ "learning_rate": 3.034972256576328e-05,
566
+ "loss": 1.232,
567
+ "step": 930
568
+ },
569
+ {
570
+ "epoch": 0.87,
571
+ "learning_rate": 2.999332062568395e-05,
572
+ "loss": 1.2467,
573
+ "step": 940
574
+ },
575
+ {
576
+ "epoch": 0.88,
577
+ "learning_rate": 2.9635858494877384e-05,
578
+ "loss": 1.2483,
579
+ "step": 950
580
+ },
581
+ {
582
+ "epoch": 0.89,
583
+ "learning_rate": 2.9277412070339782e-05,
584
+ "loss": 1.2377,
585
+ "step": 960
586
+ },
587
+ {
588
+ "epoch": 0.9,
589
+ "learning_rate": 2.891805745805429e-05,
590
+ "loss": 1.2347,
591
+ "step": 970
592
+ },
593
+ {
594
+ "epoch": 0.91,
595
+ "learning_rate": 2.8557870956832132e-05,
596
+ "loss": 1.2428,
597
+ "step": 980
598
+ },
599
+ {
600
+ "epoch": 0.92,
601
+ "learning_rate": 2.8196929042112652e-05,
602
+ "loss": 1.244,
603
+ "step": 990
604
+ },
605
+ {
606
+ "epoch": 0.93,
607
+ "learning_rate": 2.783530834972594e-05,
608
+ "loss": 1.2317,
609
+ "step": 1000
610
+ },
611
+ {
612
+ "epoch": 0.94,
613
+ "learning_rate": 2.7473085659621377e-05,
614
+ "loss": 1.2421,
615
+ "step": 1010
616
+ },
617
+ {
618
+ "epoch": 0.95,
619
+ "learning_rate": 2.711033787956555e-05,
620
+ "loss": 1.2446,
621
+ "step": 1020
622
+ },
623
+ {
624
+ "epoch": 0.96,
625
+ "learning_rate": 2.6747142028813105e-05,
626
+ "loss": 1.2471,
627
+ "step": 1030
628
+ },
629
+ {
630
+ "epoch": 0.96,
631
+ "learning_rate": 2.638357522175383e-05,
632
+ "loss": 1.2403,
633
+ "step": 1040
634
+ },
635
+ {
636
+ "epoch": 0.97,
637
+ "learning_rate": 2.6019714651539646e-05,
638
+ "loss": 1.2413,
639
+ "step": 1050
640
+ },
641
+ {
642
+ "epoch": 0.98,
643
+ "learning_rate": 2.565563757369475e-05,
644
+ "loss": 1.2341,
645
+ "step": 1060
646
+ },
647
+ {
648
+ "epoch": 0.99,
649
+ "learning_rate": 2.529142128971268e-05,
650
+ "loss": 1.236,
651
+ "step": 1070
652
+ },
653
+ {
654
+ "epoch": 1.0,
655
+ "learning_rate": 2.492714313064342e-05,
656
+ "loss": 1.2423,
657
+ "step": 1080
658
+ },
659
+ {
660
+ "epoch": 1.01,
661
+ "learning_rate": 2.45628804406744e-05,
662
+ "loss": 1.2253,
663
+ "step": 1090
664
+ },
665
+ {
666
+ "epoch": 1.02,
667
+ "learning_rate": 2.419871056070862e-05,
668
+ "loss": 1.2377,
669
+ "step": 1100
670
+ },
671
+ {
672
+ "epoch": 1.03,
673
+ "learning_rate": 2.3834710811943514e-05,
674
+ "loss": 1.2395,
675
+ "step": 1110
676
+ },
677
+ {
678
+ "epoch": 1.04,
679
+ "learning_rate": 2.3470958479453938e-05,
680
+ "loss": 1.2238,
681
+ "step": 1120
682
+ },
683
+ {
684
+ "epoch": 1.05,
685
+ "learning_rate": 2.3107530795782877e-05,
686
+ "loss": 1.2045,
687
+ "step": 1130
688
+ },
689
+ {
690
+ "epoch": 1.06,
691
+ "learning_rate": 2.2744504924543313e-05,
692
+ "loss": 1.2302,
693
+ "step": 1140
694
+ },
695
+ {
696
+ "epoch": 1.07,
697
+ "learning_rate": 2.23819579440347e-05,
698
+ "loss": 1.233,
699
+ "step": 1150
700
+ },
701
+ {
702
+ "epoch": 1.08,
703
+ "learning_rate": 2.2019966830877545e-05,
704
+ "loss": 1.2341,
705
+ "step": 1160
706
+ },
707
+ {
708
+ "epoch": 1.08,
709
+ "learning_rate": 2.1658608443669635e-05,
710
+ "loss": 1.2294,
711
+ "step": 1170
712
+ },
713
+ {
714
+ "epoch": 1.09,
715
+ "learning_rate": 2.1297959506667224e-05,
716
+ "loss": 1.2308,
717
+ "step": 1180
718
+ },
719
+ {
720
+ "epoch": 1.1,
721
+ "learning_rate": 2.0938096593494855e-05,
722
+ "loss": 1.2427,
723
+ "step": 1190
724
+ },
725
+ {
726
+ "epoch": 1.11,
727
+ "learning_rate": 2.057909611088709e-05,
728
+ "loss": 1.2404,
729
+ "step": 1200
730
+ },
731
+ {
732
+ "epoch": 1.12,
733
+ "learning_rate": 2.02210342824657e-05,
734
+ "loss": 1.2245,
735
+ "step": 1210
736
+ },
737
+ {
738
+ "epoch": 1.13,
739
+ "learning_rate": 1.9863987132555706e-05,
740
+ "loss": 1.2288,
741
+ "step": 1220
742
+ },
743
+ {
744
+ "epoch": 1.14,
745
+ "learning_rate": 1.9508030470043806e-05,
746
+ "loss": 1.2401,
747
+ "step": 1230
748
+ },
749
+ {
750
+ "epoch": 1.15,
751
+ "learning_rate": 1.915323987228247e-05,
752
+ "loss": 1.2275,
753
+ "step": 1240
754
+ },
755
+ {
756
+ "epoch": 1.16,
757
+ "learning_rate": 1.8799690669043212e-05,
758
+ "loss": 1.2251,
759
+ "step": 1250
760
+ },
761
+ {
762
+ "epoch": 1.17,
763
+ "learning_rate": 1.8447457926522454e-05,
764
+ "loss": 1.238,
765
+ "step": 1260
766
+ },
767
+ {
768
+ "epoch": 1.18,
769
+ "learning_rate": 1.8096616431403325e-05,
770
+ "loss": 1.2291,
771
+ "step": 1270
772
+ },
773
+ {
774
+ "epoch": 1.19,
775
+ "learning_rate": 1.7747240674976857e-05,
776
+ "loss": 1.2162,
777
+ "step": 1280
778
+ },
779
+ {
780
+ "epoch": 1.2,
781
+ "learning_rate": 1.7399404837325796e-05,
782
+ "loss": 1.2315,
783
+ "step": 1290
784
+ },
785
+ {
786
+ "epoch": 1.21,
787
+ "learning_rate": 1.7053182771574633e-05,
788
+ "loss": 1.2383,
789
+ "step": 1300
790
+ },
791
+ {
792
+ "epoch": 1.21,
793
+ "learning_rate": 1.6708647988208887e-05,
794
+ "loss": 1.2254,
795
+ "step": 1310
796
+ },
797
+ {
798
+ "epoch": 1.22,
799
+ "learning_rate": 1.6365873639467315e-05,
800
+ "loss": 1.2388,
801
+ "step": 1320
802
+ },
803
+ {
804
+ "epoch": 1.23,
805
+ "learning_rate": 1.602493250381003e-05,
806
+ "loss": 1.2213,
807
+ "step": 1330
808
+ },
809
+ {
810
+ "epoch": 1.24,
811
+ "learning_rate": 1.5685896970466123e-05,
812
+ "loss": 1.2366,
813
+ "step": 1340
814
+ },
815
+ {
816
+ "epoch": 1.25,
817
+ "learning_rate": 1.534883902406375e-05,
818
+ "loss": 1.2265,
819
+ "step": 1350
820
+ },
821
+ {
822
+ "epoch": 1.26,
823
+ "learning_rate": 1.5013830229346326e-05,
824
+ "loss": 1.2094,
825
+ "step": 1360
826
+ },
827
+ {
828
+ "epoch": 1.27,
829
+ "learning_rate": 1.4680941715977722e-05,
830
+ "loss": 1.2314,
831
+ "step": 1370
832
+ },
833
+ {
834
+ "epoch": 1.28,
835
+ "learning_rate": 1.4350244163439892e-05,
836
+ "loss": 1.2254,
837
+ "step": 1380
838
+ },
839
+ {
840
+ "epoch": 1.29,
841
+ "learning_rate": 1.4021807786026108e-05,
842
+ "loss": 1.2287,
843
+ "step": 1390
844
+ },
845
+ {
846
+ "epoch": 1.3,
847
+ "learning_rate": 1.3695702317932862e-05,
848
+ "loss": 1.2235,
849
+ "step": 1400
850
+ },
851
+ {
852
+ "epoch": 1.31,
853
+ "learning_rate": 1.337199699845387e-05,
854
+ "loss": 1.2209,
855
+ "step": 1410
856
+ },
857
+ {
858
+ "epoch": 1.32,
859
+ "learning_rate": 1.3050760557279023e-05,
860
+ "loss": 1.2296,
861
+ "step": 1420
862
+ },
863
+ {
864
+ "epoch": 1.33,
865
+ "learning_rate": 1.2732061199901562e-05,
866
+ "loss": 1.2363,
867
+ "step": 1430
868
+ },
869
+ {
870
+ "epoch": 1.34,
871
+ "learning_rate": 1.2415966593136547e-05,
872
+ "loss": 1.2116,
873
+ "step": 1440
874
+ },
875
+ {
876
+ "epoch": 1.34,
877
+ "learning_rate": 1.2102543850753808e-05,
878
+ "loss": 1.2299,
879
+ "step": 1450
880
+ },
881
+ {
882
+ "epoch": 1.35,
883
+ "learning_rate": 1.1791859519228138e-05,
884
+ "loss": 1.2204,
885
+ "step": 1460
886
+ },
887
+ {
888
+ "epoch": 1.36,
889
+ "learning_rate": 1.148397956361007e-05,
890
+ "loss": 1.2271,
891
+ "step": 1470
892
+ },
893
+ {
894
+ "epoch": 1.37,
895
+ "learning_rate": 1.1178969353520018e-05,
896
+ "loss": 1.2327,
897
+ "step": 1480
898
+ },
899
+ {
900
+ "epoch": 1.38,
901
+ "learning_rate": 1.087689364926897e-05,
902
+ "loss": 1.2252,
903
+ "step": 1490
904
+ },
905
+ {
906
+ "epoch": 1.39,
907
+ "learning_rate": 1.0577816588108378e-05,
908
+ "loss": 1.2305,
909
+ "step": 1500
910
+ },
911
+ {
912
+ "epoch": 1.4,
913
+ "learning_rate": 1.0281801670612449e-05,
914
+ "loss": 1.2179,
915
+ "step": 1510
916
+ },
917
+ {
918
+ "epoch": 1.41,
919
+ "learning_rate": 9.988911747195603e-06,
920
+ "loss": 1.229,
921
+ "step": 1520
922
+ },
923
+ {
924
+ "epoch": 1.42,
925
+ "learning_rate": 9.699209004767953e-06,
926
+ "loss": 1.2192,
927
+ "step": 1530
928
+ },
929
+ {
930
+ "epoch": 1.43,
931
+ "learning_rate": 9.412754953531663e-06,
932
+ "loss": 1.2322,
933
+ "step": 1540
934
+ },
935
+ {
936
+ "epoch": 1.44,
937
+ "learning_rate": 9.129610413921005e-06,
938
+ "loss": 1.2193,
939
+ "step": 1550
940
+ },
941
+ {
942
+ "epoch": 1.45,
943
+ "learning_rate": 8.849835503688847e-06,
944
+ "loss": 1.2149,
945
+ "step": 1560
946
+ },
947
+ {
948
+ "epoch": 1.46,
949
+ "learning_rate": 8.57348962514234e-06,
950
+ "loss": 1.217,
951
+ "step": 1570
952
+ },
953
+ {
954
+ "epoch": 1.46,
955
+ "learning_rate": 8.30063145253053e-06,
956
+ "loss": 1.2368,
957
+ "step": 1580
958
+ },
959
+ {
960
+ "epoch": 1.47,
961
+ "learning_rate": 8.031318919586523e-06,
962
+ "loss": 1.2264,
963
+ "step": 1590
964
+ },
965
+ {
966
+ "epoch": 1.48,
967
+ "learning_rate": 7.7656092072269e-06,
968
+ "loss": 1.2243,
969
+ "step": 1600
970
+ },
971
+ {
972
+ "epoch": 1.49,
973
+ "learning_rate": 7.503558731410959e-06,
974
+ "loss": 1.2427,
975
+ "step": 1610
976
+ },
977
+ {
978
+ "epoch": 1.5,
979
+ "learning_rate": 7.245223131162376e-06,
980
+ "loss": 1.2207,
981
+ "step": 1620
982
+ },
983
+ {
984
+ "epoch": 1.51,
985
+ "learning_rate": 6.9906572567558285e-06,
986
+ "loss": 1.2211,
987
+ "step": 1630
988
+ },
989
+ {
990
+ "epoch": 1.52,
991
+ "learning_rate": 6.739915158071106e-06,
992
+ "loss": 1.2343,
993
+ "step": 1640
994
+ },
995
+ {
996
+ "epoch": 1.53,
997
+ "learning_rate": 6.493050073117116e-06,
998
+ "loss": 1.2348,
999
+ "step": 1650
1000
+ },
1001
+ {
1002
+ "epoch": 1.54,
1003
+ "learning_rate": 6.250114416728298e-06,
1004
+ "loss": 1.2189,
1005
+ "step": 1660
1006
+ },
1007
+ {
1008
+ "epoch": 1.55,
1009
+ "learning_rate": 6.011159769435823e-06,
1010
+ "loss": 1.2207,
1011
+ "step": 1670
1012
+ },
1013
+ {
1014
+ "epoch": 1.56,
1015
+ "learning_rate": 5.776236866515947e-06,
1016
+ "loss": 1.2315,
1017
+ "step": 1680
1018
+ },
1019
+ {
1020
+ "epoch": 1.57,
1021
+ "learning_rate": 5.545395587217763e-06,
1022
+ "loss": 1.2334,
1023
+ "step": 1690
1024
+ },
1025
+ {
1026
+ "epoch": 1.58,
1027
+ "learning_rate": 5.318684944172752e-06,
1028
+ "loss": 1.232,
1029
+ "step": 1700
1030
+ },
1031
+ {
1032
+ "epoch": 1.59,
1033
+ "learning_rate": 5.096153072988366e-06,
1034
+ "loss": 1.2235,
1035
+ "step": 1710
1036
+ },
1037
+ {
1038
+ "epoch": 1.59,
1039
+ "learning_rate": 4.8778472220277874e-06,
1040
+ "loss": 1.2244,
1041
+ "step": 1720
1042
+ },
1043
+ {
1044
+ "epoch": 1.6,
1045
+ "learning_rate": 4.6638137423780395e-06,
1046
+ "loss": 1.2197,
1047
+ "step": 1730
1048
+ },
1049
+ {
1050
+ "epoch": 1.61,
1051
+ "learning_rate": 4.454098078008667e-06,
1052
+ "loss": 1.2242,
1053
+ "step": 1740
1054
+ },
1055
+ {
1056
+ "epoch": 1.62,
1057
+ "learning_rate": 4.248744756122986e-06,
1058
+ "loss": 1.2382,
1059
+ "step": 1750
1060
+ },
1061
+ {
1062
+ "epoch": 1.63,
1063
+ "learning_rate": 4.047797377703985e-06,
1064
+ "loss": 1.2265,
1065
+ "step": 1760
1066
+ },
1067
+ {
1068
+ "epoch": 1.64,
1069
+ "learning_rate": 3.851298608256892e-06,
1070
+ "loss": 1.2153,
1071
+ "step": 1770
1072
+ },
1073
+ {
1074
+ "epoch": 1.65,
1075
+ "learning_rate": 3.6592901687503566e-06,
1076
+ "loss": 1.2198,
1077
+ "step": 1780
1078
+ },
1079
+ {
1080
+ "epoch": 1.66,
1081
+ "learning_rate": 3.471812826758178e-06,
1082
+ "loss": 1.23,
1083
+ "step": 1790
1084
+ },
1085
+ {
1086
+ "epoch": 1.67,
1087
+ "learning_rate": 3.288906387803464e-06,
1088
+ "loss": 1.2268,
1089
+ "step": 1800
1090
+ },
1091
+ {
1092
+ "epoch": 1.68,
1093
+ "learning_rate": 3.1106096869070483e-06,
1094
+ "loss": 1.2228,
1095
+ "step": 1810
1096
+ },
1097
+ {
1098
+ "epoch": 1.69,
1099
+ "learning_rate": 2.9369605803419715e-06,
1100
+ "loss": 1.2288,
1101
+ "step": 1820
1102
+ },
1103
+ {
1104
+ "epoch": 1.7,
1105
+ "learning_rate": 2.767995937595766e-06,
1106
+ "loss": 1.228,
1107
+ "step": 1830
1108
+ },
1109
+ {
1110
+ "epoch": 1.71,
1111
+ "learning_rate": 2.6037516335422728e-06,
1112
+ "loss": 1.2178,
1113
+ "step": 1840
1114
+ },
1115
+ {
1116
+ "epoch": 1.72,
1117
+ "learning_rate": 2.4442625408246074e-06,
1118
+ "loss": 1.2386,
1119
+ "step": 1850
1120
+ },
1121
+ {
1122
+ "epoch": 1.72,
1123
+ "learning_rate": 2.289562522450947e-06,
1124
+ "loss": 1.2274,
1125
+ "step": 1860
1126
+ },
1127
+ {
1128
+ "epoch": 1.73,
1129
+ "learning_rate": 2.1396844246046903e-06,
1130
+ "loss": 1.2341,
1131
+ "step": 1870
1132
+ },
1133
+ {
1134
+ "epoch": 1.74,
1135
+ "learning_rate": 1.9946600696704592e-06,
1136
+ "loss": 1.2118,
1137
+ "step": 1880
1138
+ },
1139
+ {
1140
+ "epoch": 1.75,
1141
+ "learning_rate": 1.8545202494775509e-06,
1142
+ "loss": 1.2219,
1143
+ "step": 1890
1144
+ },
1145
+ {
1146
+ "epoch": 1.76,
1147
+ "learning_rate": 1.7192947187621434e-06,
1148
+ "loss": 1.2268,
1149
+ "step": 1900
1150
+ },
1151
+ {
1152
+ "epoch": 1.77,
1153
+ "learning_rate": 1.5890121888497366e-06,
1154
+ "loss": 1.2183,
1155
+ "step": 1910
1156
+ },
1157
+ {
1158
+ "epoch": 1.78,
1159
+ "learning_rate": 1.463700321559075e-06,
1160
+ "loss": 1.2253,
1161
+ "step": 1920
1162
+ },
1163
+ {
1164
+ "epoch": 1.79,
1165
+ "learning_rate": 1.3433857233289714e-06,
1166
+ "loss": 1.2262,
1167
+ "step": 1930
1168
+ },
1169
+ {
1170
+ "epoch": 1.8,
1171
+ "learning_rate": 1.2280939395691859e-06,
1172
+ "loss": 1.2232,
1173
+ "step": 1940
1174
+ },
1175
+ {
1176
+ "epoch": 1.81,
1177
+ "learning_rate": 1.1178494492365465e-06,
1178
+ "loss": 1.2376,
1179
+ "step": 1950
1180
+ },
1181
+ {
1182
+ "epoch": 1.82,
1183
+ "learning_rate": 1.0126756596375686e-06,
1184
+ "loss": 1.2287,
1185
+ "step": 1960
1186
+ },
1187
+ {
1188
+ "epoch": 1.83,
1189
+ "learning_rate": 9.125949014585383e-07,
1190
+ "loss": 1.2354,
1191
+ "step": 1970
1192
+ },
1193
+ {
1194
+ "epoch": 1.84,
1195
+ "learning_rate": 8.176284240242638e-07,
1196
+ "loss": 1.224,
1197
+ "step": 1980
1198
+ },
1199
+ {
1200
+ "epoch": 1.85,
1201
+ "learning_rate": 7.277963907863478e-07,
1202
+ "loss": 1.2214,
1203
+ "step": 1990
1204
+ },
1205
+ {
1206
+ "epoch": 1.85,
1207
+ "learning_rate": 6.431178750420513e-07,
1208
+ "loss": 1.2214,
1209
+ "step": 2000
1210
+ },
1211
+ {
1212
+ "epoch": 1.86,
1213
+ "learning_rate": 5.636108558846359e-07,
1214
+ "loss": 1.225,
1215
+ "step": 2010
1216
+ },
1217
+ {
1218
+ "epoch": 1.87,
1219
+ "learning_rate": 4.892922143859918e-07,
1220
+ "loss": 1.2373,
1221
+ "step": 2020
1222
+ },
1223
+ {
1224
+ "epoch": 1.88,
1225
+ "learning_rate": 4.201777300124249e-07,
1226
+ "loss": 1.2268,
1227
+ "step": 2030
1228
+ },
1229
+ {
1230
+ "epoch": 1.89,
1231
+ "learning_rate": 3.562820772743392e-07,
1232
+ "loss": 1.2209,
1233
+ "step": 2040
1234
+ },
1235
+ {
1236
+ "epoch": 1.9,
1237
+ "learning_rate": 2.9761882261050143e-07,
1238
+ "loss": 1.2184,
1239
+ "step": 2050
1240
+ },
1241
+ {
1242
+ "epoch": 1.91,
1243
+ "learning_rate": 2.4420042150761215e-07,
1244
+ "loss": 1.2265,
1245
+ "step": 2060
1246
+ },
1247
+ {
1248
+ "epoch": 1.92,
1249
+ "learning_rate": 1.9603821585572058e-07,
1250
+ "loss": 1.227,
1251
+ "step": 2070
1252
+ },
1253
+ {
1254
+ "epoch": 1.93,
1255
+ "learning_rate": 1.531424315400931e-07,
1256
+ "loss": 1.2172,
1257
+ "step": 2080
1258
+ },
1259
+ {
1260
+ "epoch": 1.94,
1261
+ "learning_rate": 1.1552217627004425e-07,
1262
+ "loss": 1.2196,
1263
+ "step": 2090
1264
+ },
1265
+ {
1266
+ "epoch": 1.95,
1267
+ "learning_rate": 8.318543764516961e-08,
1268
+ "loss": 1.2381,
1269
+ "step": 2100
1270
+ },
1271
+ {
1272
+ "epoch": 1.96,
1273
+ "learning_rate": 5.613908145939428e-08,
1274
+ "loss": 1.2288,
1275
+ "step": 2110
1276
+ },
1277
+ {
1278
+ "epoch": 1.97,
1279
+ "learning_rate": 3.438885024322769e-08,
1280
+ "loss": 1.2387,
1281
+ "step": 2120
1282
+ },
1283
+ {
1284
+ "epoch": 1.97,
1285
+ "learning_rate": 1.7939362044494534e-08,
1286
+ "loss": 1.2357,
1287
+ "step": 2130
1288
+ },
1289
+ {
1290
+ "epoch": 1.98,
1291
+ "learning_rate": 6.794109447824659e-09,
1292
+ "loss": 1.2338,
1293
+ "step": 2140
1294
+ },
1295
+ {
1296
+ "epoch": 1.99,
1297
+ "learning_rate": 9.554588330934478e-10,
1298
+ "loss": 1.2226,
1299
+ "step": 2150
1300
+ },
1301
+ {
1302
+ "epoch": 2.0,
1303
+ "step": 2156,
1304
+ "total_flos": 3.386848502305784e+18,
1305
+ "train_loss": 1.2461530792912217,
1306
+ "train_runtime": 30560.3748,
1307
+ "train_samples_per_second": 4.517,
1308
+ "train_steps_per_second": 0.071
1309
+ }
1310
+ ],
1311
+ "logging_steps": 10,
1312
+ "max_steps": 2156,
1313
+ "num_train_epochs": 2,
1314
+ "save_steps": 1000,
1315
+ "total_flos": 3.386848502305784e+18,
1316
+ "trial_name": null,
1317
+ "trial_params": null
1318
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1ca0e4372bd1720a48da585ce6946399a06d0c984dcfcf5a2b7104827c4603f
3
+ size 4219
training_loss.png ADDED