ggkk2012 committed on
Commit
0fe17ec
1 Parent(s): 37e307e

Upload 11 files

Browse files
adapter_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen-7B-Chat",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 32,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 16,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "c_attn"
24
+ ],
25
+ "task_type": "CAUSAL_LM",
26
+ "use_dora": false,
27
+ "use_rslora": false
28
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ccb0d95949741c36479d6bf4467bccf75c1ef0b0201a2bd35f030ff99504b58
3
+ size 33562824
all_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "train_loss": 0.886494568245396,
4
+ "train_runtime": 33369.3902,
5
+ "train_samples_per_second": 1.148,
6
+ "train_steps_per_second": 0.287
7
+ }
qwen.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eos_token": {
3
+ "content": "<|im_end|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "pad_token": "<|im_end|>"
10
+ }
tokenization_qwen.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """Tokenization classes for QWen."""
7
+
8
+ import base64
9
+ import logging
10
+ import os
11
+ import unicodedata
12
+ from typing import Collection, Dict, List, Set, Tuple, Union
13
+
14
+ import tiktoken
15
+ from transformers import PreTrainedTokenizer, AddedToken
16
+
17
logger = logging.getLogger(__name__)


# Maps the tokenizer's vocab-file argument name to the file name on disk.
VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}

# Regular expression used to split text before BPE merging.
PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""

ENDOFTEXT = "<|endoftext|>"
IMSTART = "<|im_start|>"
IMEND = "<|im_end|>"

# Special tokens are allowed inside regular text by default, so their
# surface forms are kept as distinctive as possible to minimize the impact.
EXTRAS = tuple(f"<|extra_{i}|>" for i in range(205))

# Id of the first special token; hard-coded (an actual index) to avoid
# misconfiguration with vocabulary expansion.
SPECIAL_START_ID = 151643

# Pairs of (token id, surface form), numbered consecutively from
# SPECIAL_START_ID in the order: ENDOFTEXT, IMSTART, IMEND, then EXTRAS.
_SPECIAL_SURFACE_FORMS = (ENDOFTEXT, IMSTART, IMEND) + EXTRAS
SPECIAL_TOKENS = tuple(enumerate(_SPECIAL_SURFACE_FORMS, start=SPECIAL_START_ID))

# Surface forms only, for fast membership checks.
SPECIAL_TOKENS_SET = {surface for _, surface in SPECIAL_TOKENS}
46
+
47
+
48
+ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
49
+ with open(tiktoken_bpe_file, "rb") as f:
50
+ contents = f.read()
51
+ return {
52
+ base64.b64decode(token): int(rank)
53
+ for token, rank in (line.split() for line in contents.splitlines() if line)
54
+ }
55
+
56
+
57
class QWenTokenizer(PreTrainedTokenizer):
    """QWen tokenizer.

    Wraps a ``tiktoken.Encoding`` built from a BPE rank file plus the fixed
    table of special tokens declared in ``SPECIAL_TOKENS``. Regular tokens
    are represented as ``bytes``; special tokens are represented as ``str``.
    """

    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        errors="replace",
        extra_vocab_file=None,
        **kwargs,
    ):
        """Build the tokenizer from a tiktoken vocabulary file.

        Args:
            vocab_file: Path of the base BPE rank file.
            errors: How to handle errors when decoding UTF-8 byte sequences
                (use "ignore" for streaming inference).
            extra_vocab_file: Optional path of an additional BPE rank file
                whose entries are merged into the base vocabulary. Entries
                whose token or index is already taken are skipped.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
        """
        super().__init__(**kwargs)

        # how to handle errors in decoding UTF-8 byte sequences
        # use ignore if you are in streaming inference
        self.errors = errors

        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: Dict[bytes, int]
        self.special_tokens = {token: index for index, token in SPECIAL_TOKENS}

        # try load extra vocab from file
        if extra_vocab_file is not None:
            used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
            extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
            for token, index in extra_mergeable_ranks.items():
                if token in self.mergeable_ranks:
                    logger.info("extra token %s exists, skipping", token)
                    continue
                if index in used_ids:
                    logger.info("the index %s for extra token %s exists, skipping", index, token)
                    continue
                self.mergeable_ranks[token] = index
                # Record the id so a later extra token cannot claim it too.
                used_ids.add(index)
            # the index may be sparse after this, but don't worry tiktoken.Encoding will handle this

        enc = self._build_encoding()
        assert (
            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"

        # id -> surface form, covering both regular (bytes) and special (str) tokens
        self.decoder = {
            v: k for k, v in self.mergeable_ranks.items()
        }  # type: dict[int, bytes|str]
        self.decoder.update({v: k for k, v in self.special_tokens.items()})

        self.tokenizer = enc  # type: tiktoken.Encoding

        self.eod_id = self.tokenizer.eot_token
        self.im_start_id = self.special_tokens[IMSTART]
        self.im_end_id = self.special_tokens[IMEND]

    def _build_encoding(self):
        """Create the tiktoken encoding from the current rank and special-token tables."""
        return tiktoken.Encoding(
            "Qwen",
            pat_str=PAT_STR,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )

    def __getstate__(self):
        """Return the picklable state: everything except the tiktoken encoding."""
        # for pickle lovers
        state = self.__dict__.copy()
        del state["tokenizer"]
        return state

    def __setstate__(self, state):
        """Restore pickled state and rebuild the tiktoken encoding."""
        # tokenizer is not python native; don't pass it; rebuild it
        self.__dict__.update(state)
        self.tokenizer = self._build_encoding()

    def __len__(self) -> int:
        """Return the vocabulary size reported by the underlying encoding."""
        return self.tokenizer.n_vocab

    def get_vocab(self) -> Dict[bytes, int]:
        """Return the regular-token rank table (special tokens are not included)."""
        return self.mergeable_ranks

    def convert_tokens_to_ids(
        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
    ) -> Union[int, None, List[Union[int, None]]]:
        """Map one token or a list of tokens to their ids.

        Args:
            tokens: A single token (``bytes`` or ``str``) or a list of tokens.

        Returns:
            For a single token, its id (or ``None`` when it is not in the
            vocabulary). For a list, a list of ids with ``None`` in place of
            unknown tokens.
        """
        if isinstance(tokens, (str, bytes)):
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            return self.mergeable_ranks.get(tokens)

        ids = []
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.mergeable_ranks.get(token))
        return ids

    def _add_tokens(
        self,
        new_tokens: Union[List[str], List[AddedToken]],
        special_tokens: bool = False,
    ) -> int:
        """Validate a request to add tokens; the vocabulary itself is never changed.

        Args:
            new_tokens: Tokens requested to be added.
            special_tokens: Whether the tokens are special tokens.

        Returns:
            Always 0, since no token is actually added.

        Raises:
            ValueError: If regular tokens are requested, or if a special
                token is not one of the predefined special tokens.
        """
        if not special_tokens and new_tokens:
            raise ValueError("Adding regular tokens is not supported")
        for token in new_tokens:
            surface_form = token.content if isinstance(token, AddedToken) else token
            if surface_form not in SPECIAL_TOKENS_SET:
                raise ValueError("Adding unknown special tokens is not supported")
        return 0

    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
        """
        Save only the vocabulary of the tokenizer (vocabulary).

        Each regular token is written as its base64 form, a space, and its rank.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        file_path = os.path.join(save_directory, "qwen.tiktoken")
        with open(file_path, "w", encoding="utf8") as w:
            for k, v in self.mergeable_ranks.items():
                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
                w.write(line)
        return (file_path,)

    def tokenize(
        self,
        text: str,
        allowed_special: Union[Set, str] = "all",
        disallowed_special: Union[Collection, str] = (),
        **kwargs,
    ) -> List[Union[bytes, str]]:
        """
        Converts a string in a sequence of tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            allowed_special (`Literal["all"]` or `set`):
                The surface forms of the tokens to be encoded as special tokens in regular texts.
                Default to "all".
            disallowed_special (`Literal["all"]` or `Collection`):
                The surface forms of the tokens that should not be in regular texts and trigger errors.
                Default to an empty tuple.

            kwargs (additional keyword arguments, *optional*):
                Accepted for interface compatibility; not used by this implementation.

        Returns:
            `List[bytes|str]`: The list of tokens.
        """
        text = unicodedata.normalize("NFC", text)

        # this implementation takes a detour: text -> token id -> token surface forms
        token_ids = self.tokenizer.encode(
            text, allowed_special=allowed_special, disallowed_special=disallowed_special
        )
        return [self.decoder[t] for t in token_ids]

    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
        """
        Converts a sequence of tokens in a single string.

        Consecutive ``bytes`` tokens are concatenated and decoded together as
        UTF-8 (using ``self.errors``); ``str`` tokens are appended unchanged.

        Raises:
            TypeError: If a token is neither ``bytes`` nor ``str``.
        """
        text = ""
        temp = b""
        for t in tokens:
            if isinstance(t, str):
                # Flush pending bytes before appending the str token.
                if temp:
                    text += temp.decode("utf-8", errors=self.errors)
                    temp = b""
                text += t
            elif isinstance(t, bytes):
                temp += t
            else:
                raise TypeError("token should only be of type bytes or str")
        if temp:
            text += temp.decode("utf-8", errors=self.errors)
        return text

    @property
    def vocab_size(self):
        """Vocabulary size reported by the underlying encoding."""
        return self.tokenizer.n_vocab

    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
        """Converts an id to a token, special tokens included"""
        if index in self.decoder:
            return self.decoder[index]
        raise ValueError("unknown ids")

    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
        """Converts a token to an id using the vocab, special tokens included"""
        if token in self.special_tokens:
            return self.special_tokens[token]
        if token in self.mergeable_ranks:
            return self.mergeable_ranks[token]
        raise ValueError("unknown token")

    def _tokenize(self, text: str, **kwargs):
        """
        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

        Do NOT take care of added tokens.

        Not implemented for this tokenizer; `tokenize` is overridden instead.
        """
        raise NotImplementedError

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        errors: Union[str, None] = None,
        **kwargs,
    ) -> str:
        """Decode one id or a list of ids into text.

        Args:
            token_ids: A single token id or a list of token ids.
            skip_special_tokens: When true, ids greater than or equal to
                ``self.eod_id`` are dropped before decoding.
            errors: UTF-8 error handling for this call; falls back to
                ``self.errors`` when not given.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            The decoded string.
        """
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [i for i in token_ids if i < self.eod_id]
        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {},
3
+ "auto_map": {
4
+ "AutoTokenizer": [
5
+ "tokenization_qwen.QWenTokenizer",
6
+ null
7
+ ]
8
+ },
9
+ "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}",
10
+ "clean_up_tokenization_spaces": true,
11
+ "eos_token": "<|im_end|>",
12
+ "model_max_length": 32768,
13
+ "pad_token": "<|im_end|>",
14
+ "padding_side": "right",
15
+ "split_special_tokens": false,
16
+ "tokenizer_class": "QWenTokenizer"
17
+ }
train_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "train_loss": 0.886494568245396,
4
+ "train_runtime": 33369.3902,
5
+ "train_samples_per_second": 1.148,
6
+ "train_steps_per_second": 0.287
7
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 100, "total_steps": 9580, "loss": 1.8222, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019994623498004714, "epoch": 0.05, "percentage": 1.04, "elapsed_time": "0:05:45", "remaining_time": "9:05:27"}
2
+ {"current_steps": 200, "total_steps": 9580, "loss": 1.8565, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019978499773373596, "epoch": 0.1, "percentage": 2.09, "elapsed_time": "0:11:32", "remaining_time": "9:01:24"}
3
+ {"current_steps": 300, "total_steps": 9580, "loss": 1.7994, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019951646163954176, "epoch": 0.16, "percentage": 3.13, "elapsed_time": "0:17:22", "remaining_time": "8:57:18"}
4
+ {"current_steps": 400, "total_steps": 9580, "loss": 1.6984, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001991409154544338, "epoch": 0.21, "percentage": 4.18, "elapsed_time": "0:23:03", "remaining_time": "8:49:00"}
5
+ {"current_steps": 500, "total_steps": 9580, "loss": 1.6973, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019865876300337478, "epoch": 0.26, "percentage": 5.22, "elapsed_time": "0:28:50", "remaining_time": "8:43:38"}
6
+ {"current_steps": 600, "total_steps": 9580, "loss": 1.7661, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019807052274508773, "epoch": 0.31, "percentage": 6.26, "elapsed_time": "0:34:35", "remaining_time": "8:37:45"}
7
+ {"current_steps": 700, "total_steps": 9580, "loss": 1.6948, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019737682721455714, "epoch": 0.37, "percentage": 7.31, "elapsed_time": "0:40:22", "remaining_time": "8:32:05"}
8
+ {"current_steps": 800, "total_steps": 9580, "loss": 1.8664, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001965784223428638, "epoch": 0.42, "percentage": 8.35, "elapsed_time": "0:46:07", "remaining_time": "8:26:11"}
9
+ {"current_steps": 900, "total_steps": 9580, "loss": 1.7087, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019567616665508485, "epoch": 0.47, "percentage": 9.39, "elapsed_time": "0:51:54", "remaining_time": "8:20:35"}
10
+ {"current_steps": 1000, "total_steps": 9580, "loss": 1.6826, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001946710303471214, "epoch": 0.52, "percentage": 10.44, "elapsed_time": "0:57:38", "remaining_time": "8:14:34"}
11
+ {"current_steps": 1100, "total_steps": 9580, "loss": 1.5399, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019356409424244655, "epoch": 0.57, "percentage": 11.48, "elapsed_time": "1:03:33", "remaining_time": "8:09:55"}
12
+ {"current_steps": 1200, "total_steps": 9580, "loss": 1.6394, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019235654862989537, "epoch": 0.63, "percentage": 12.53, "elapsed_time": "1:09:19", "remaining_time": "8:04:10"}
13
+ {"current_steps": 1300, "total_steps": 9580, "loss": 1.645, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019104969198374688, "epoch": 0.68, "percentage": 13.57, "elapsed_time": "1:15:06", "remaining_time": "7:58:25"}
14
+ {"current_steps": 1400, "total_steps": 9580, "loss": 1.5853, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00018964492956747425, "epoch": 0.73, "percentage": 14.61, "elapsed_time": "1:20:53", "remaining_time": "7:52:37"}
15
+ {"current_steps": 1500, "total_steps": 9580, "loss": 1.5778, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00018814377192266423, "epoch": 0.78, "percentage": 15.66, "elapsed_time": "1:26:39", "remaining_time": "7:46:48"}
16
+ {"current_steps": 1600, "total_steps": 9580, "loss": 1.6368, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00018654783324473137, "epoch": 0.84, "percentage": 16.7, "elapsed_time": "1:32:22", "remaining_time": "7:40:40"}
17
+ {"current_steps": 1700, "total_steps": 9580, "loss": 1.6269, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00018487617447307124, "epoch": 0.89, "percentage": 17.75, "elapsed_time": "1:38:04", "remaining_time": "7:34:38"}
18
+ {"current_steps": 1800, "total_steps": 9580, "loss": 1.7073, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00018309682531549338, "epoch": 0.94, "percentage": 18.79, "elapsed_time": "1:43:50", "remaining_time": "7:28:50"}
19
+ {"current_steps": 1900, "total_steps": 9580, "loss": 1.5544, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00018122812210849337, "epoch": 0.99, "percentage": 19.83, "elapsed_time": "1:49:36", "remaining_time": "7:23:02"}
20
+ {"current_steps": 2000, "total_steps": 9580, "loss": 1.3285, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00017927207426937544, "epoch": 1.04, "percentage": 20.88, "elapsed_time": "1:55:21", "remaining_time": "7:17:13"}
21
+ {"current_steps": 2100, "total_steps": 9580, "loss": 1.2529, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00017723078513716157, "epoch": 1.1, "percentage": 21.92, "elapsed_time": "2:01:14", "remaining_time": "7:11:51"}
22
+ {"current_steps": 2200, "total_steps": 9580, "loss": 1.3708, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00017510644971087015, "epoch": 1.15, "percentage": 22.96, "elapsed_time": "2:07:00", "remaining_time": "7:06:04"}
23
+ {"current_steps": 2300, "total_steps": 9580, "loss": 1.3309, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001729013522892329, "epoch": 1.2, "percentage": 24.01, "elapsed_time": "2:12:47", "remaining_time": "7:00:17"}
24
+ {"current_steps": 2400, "total_steps": 9580, "loss": 1.4201, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001706178640143872, "epoch": 1.25, "percentage": 25.05, "elapsed_time": "2:18:33", "remaining_time": "6:54:30"}
25
+ {"current_steps": 2500, "total_steps": 9580, "loss": 1.2956, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00016825844032218625, "epoch": 1.3, "percentage": 26.1, "elapsed_time": "2:24:20", "remaining_time": "6:48:46"}
26
+ {"current_steps": 2600, "total_steps": 9580, "loss": 1.3375, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00016582561830186785, "epoch": 1.36, "percentage": 27.14, "elapsed_time": "2:30:11", "remaining_time": "6:43:13"}
27
+ {"current_steps": 2700, "total_steps": 9580, "loss": 1.2897, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00016332201396792123, "epoch": 1.41, "percentage": 28.18, "elapsed_time": "2:35:57", "remaining_time": "6:37:24"}
28
+ {"current_steps": 2800, "total_steps": 9580, "loss": 1.3323, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00016075031944708584, "epoch": 1.46, "percentage": 29.23, "elapsed_time": "2:41:43", "remaining_time": "6:31:36"}
29
+ {"current_steps": 2900, "total_steps": 9580, "loss": 1.2841, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001581133000835061, "epoch": 1.51, "percentage": 30.27, "elapsed_time": "2:47:30", "remaining_time": "6:25:50"}
30
+ {"current_steps": 3000, "total_steps": 9580, "loss": 1.2666, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00015541379146515603, "epoch": 1.57, "percentage": 31.32, "elapsed_time": "2:53:19", "remaining_time": "6:20:09"}
31
+ {"current_steps": 3100, "total_steps": 9580, "loss": 1.2336, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001526546963747302, "epoch": 1.62, "percentage": 32.36, "elapsed_time": "2:59:09", "remaining_time": "6:14:30"}
32
+ {"current_steps": 3200, "total_steps": 9580, "loss": 1.2627, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014986740918973633, "epoch": 1.67, "percentage": 33.4, "elapsed_time": "3:04:55", "remaining_time": "6:08:42"}
33
+ {"current_steps": 3300, "total_steps": 9580, "loss": 1.2683, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014699862334591993, "epoch": 1.72, "percentage": 34.45, "elapsed_time": "3:10:38", "remaining_time": "6:02:48"}
34
+ {"current_steps": 3400, "total_steps": 9580, "loss": 1.3076, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014407929986366458, "epoch": 1.77, "percentage": 35.49, "elapsed_time": "3:16:26", "remaining_time": "5:57:03"}
35
+ {"current_steps": 3500, "total_steps": 9580, "loss": 1.3296, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001411125778926756, "epoch": 1.83, "percentage": 36.53, "elapsed_time": "3:22:09", "remaining_time": "5:51:11"}
36
+ {"current_steps": 3600, "total_steps": 9580, "loss": 1.2321, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001381016475502724, "epoch": 1.88, "percentage": 37.58, "elapsed_time": "3:27:54", "remaining_time": "5:45:22"}
37
+ {"current_steps": 3700, "total_steps": 9580, "loss": 1.1754, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00013504974649105364, "epoch": 1.93, "percentage": 38.62, "elapsed_time": "3:33:44", "remaining_time": "5:39:39"}
38
+ {"current_steps": 3800, "total_steps": 9580, "loss": 1.2665, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001319601564254462, "epoch": 1.98, "percentage": 39.67, "elapsed_time": "3:39:26", "remaining_time": "5:33:46"}
39
+ {"current_steps": 3900, "total_steps": 9580, "loss": 1.0379, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00012883619959088054, "epoch": 2.04, "percentage": 40.71, "elapsed_time": "3:45:13", "remaining_time": "5:28:00"}
40
+ {"current_steps": 4000, "total_steps": 9580, "loss": 0.8172, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001256812351793875, "epoch": 2.09, "percentage": 41.75, "elapsed_time": "3:50:58", "remaining_time": "5:22:13"}
41
+ {"current_steps": 4100, "total_steps": 9580, "loss": 0.8519, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001224986557254578, "epoch": 2.14, "percentage": 42.8, "elapsed_time": "3:56:48", "remaining_time": "5:16:31"}
42
+ {"current_steps": 4200, "total_steps": 9580, "loss": 0.8636, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011929188345804825, "epoch": 2.19, "percentage": 43.84, "elapsed_time": "4:02:42", "remaining_time": "5:10:53"}
43
+ {"current_steps": 4300, "total_steps": 9580, "loss": 0.8727, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011606436662065767, "epoch": 2.24, "percentage": 44.89, "elapsed_time": "4:08:31", "remaining_time": "5:05:09"}
44
+ {"current_steps": 4400, "total_steps": 9580, "loss": 0.8618, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011281957576342934, "epoch": 2.3, "percentage": 45.93, "elapsed_time": "4:14:18", "remaining_time": "4:59:23"}
45
+ {"current_steps": 4500, "total_steps": 9580, "loss": 0.7976, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00010956100001126682, "epoch": 2.35, "percentage": 46.97, "elapsed_time": "4:20:10", "remaining_time": "4:53:42"}
46
+ {"current_steps": 4600, "total_steps": 9580, "loss": 0.7871, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00010629214331197683, "epoch": 2.4, "percentage": 48.02, "elapsed_time": "4:26:02", "remaining_time": "4:48:00"}
47
+ {"current_steps": 4700, "total_steps": 9580, "loss": 0.8557, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00010301652066847249, "epoch": 2.45, "percentage": 49.06, "elapsed_time": "4:31:52", "remaining_time": "4:42:16"}
48
+ {"current_steps": 4800, "total_steps": 9580, "loss": 0.8898, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.973765435908962e-05, "epoch": 2.51, "percentage": 50.1, "elapsed_time": "4:37:40", "remaining_time": "4:36:30"}
49
+ {"current_steps": 4900, "total_steps": 9580, "loss": 0.8269, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.64590701500791e-05, "epoch": 2.56, "percentage": 51.15, "elapsed_time": "4:43:31", "remaining_time": "4:30:47"}
50
+ {"current_steps": 5000, "total_steps": 9580, "loss": 0.7502, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.318429350434922e-05, "epoch": 2.61, "percentage": 52.19, "elapsed_time": "4:49:18", "remaining_time": "4:25:00"}
51
+ {"current_steps": 5100, "total_steps": 9580, "loss": 0.8201, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.991684579053403e-05, "epoch": 2.66, "percentage": 53.24, "elapsed_time": "4:55:12", "remaining_time": "4:19:18"}
52
+ {"current_steps": 5200, "total_steps": 9580, "loss": 0.8618, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.666024049646397e-05, "epoch": 2.71, "percentage": 54.28, "elapsed_time": "5:00:57", "remaining_time": "4:13:30"}
53
+ {"current_steps": 5300, "total_steps": 9580, "loss": 0.8255, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.341797945111142e-05, "epoch": 2.77, "percentage": 55.32, "elapsed_time": "5:06:44", "remaining_time": "4:07:42"}
54
+ {"current_steps": 5400, "total_steps": 9580, "loss": 0.759, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.019354905907224e-05, "epoch": 2.82, "percentage": 56.37, "elapsed_time": "5:12:31", "remaining_time": "4:01:55"}
55
+ {"current_steps": 5500, "total_steps": 9580, "loss": 0.8287, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.699041655163378e-05, "epoch": 2.87, "percentage": 57.41, "elapsed_time": "5:18:21", "remaining_time": "3:56:09"}
56
+ {"current_steps": 5600, "total_steps": 9580, "loss": 0.8356, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.381202625845948e-05, "epoch": 2.92, "percentage": 58.46, "elapsed_time": "5:24:03", "remaining_time": "3:50:18"}
57
+ {"current_steps": 5700, "total_steps": 9580, "loss": 0.9181, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.066179590389994e-05, "epoch": 2.97, "percentage": 59.5, "elapsed_time": "5:29:47", "remaining_time": "3:44:29"}
58
+ {"current_steps": 5800, "total_steps": 9580, "loss": 0.5811, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.754311293191257e-05, "epoch": 3.03, "percentage": 60.54, "elapsed_time": "5:35:33", "remaining_time": "3:38:41"}
59
+ {"current_steps": 5900, "total_steps": 9580, "loss": 0.4604, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.44593308635417e-05, "epoch": 3.08, "percentage": 61.59, "elapsed_time": "5:41:22", "remaining_time": "3:32:55"}
60
+ {"current_steps": 6000, "total_steps": 9580, "loss": 0.412, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.1413765690876e-05, "epoch": 3.13, "percentage": 62.63, "elapsed_time": "5:47:10", "remaining_time": "3:27:08"}
61
+ {"current_steps": 6100, "total_steps": 9580, "loss": 0.4478, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.840969231136102e-05, "epoch": 3.18, "percentage": 63.67, "elapsed_time": "5:53:05", "remaining_time": "3:21:26"}
62
+ {"current_steps": 6200, "total_steps": 9580, "loss": 0.3905, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.5450341006300535e-05, "epoch": 3.24, "percentage": 64.72, "elapsed_time": "5:58:53", "remaining_time": "3:15:39"}
63
+ {"current_steps": 6300, "total_steps": 9580, "loss": 0.401, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.2538893967333866e-05, "epoch": 3.29, "percentage": 65.76, "elapsed_time": "6:04:41", "remaining_time": "3:09:52"}
64
+ {"current_steps": 6400, "total_steps": 9580, "loss": 0.417, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9678481874623836e-05, "epoch": 3.34, "percentage": 66.81, "elapsed_time": "6:10:36", "remaining_time": "3:04:08"}
65
+ {"current_steps": 6500, "total_steps": 9580, "loss": 0.4213, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.687218053043516e-05, "epoch": 3.39, "percentage": 67.85, "elapsed_time": "6:16:27", "remaining_time": "2:58:22"}
66
+ {"current_steps": 6600, "total_steps": 9580, "loss": 0.4371, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.412300755172314e-05, "epoch": 3.44, "percentage": 68.89, "elapsed_time": "6:22:19", "remaining_time": "2:52:37"}
67
+ {"current_steps": 6700, "total_steps": 9580, "loss": 0.4686, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1433919125288914e-05, "epoch": 3.5, "percentage": 69.94, "elapsed_time": "6:28:07", "remaining_time": "2:46:50"}
68
+ {"current_steps": 6800, "total_steps": 9580, "loss": 0.4017, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.8807806828990455e-05, "epoch": 3.55, "percentage": 70.98, "elapsed_time": "6:33:56", "remaining_time": "2:41:03"}
69
+ {"current_steps": 6900, "total_steps": 9580, "loss": 0.4227, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.624749452242799e-05, "epoch": 3.6, "percentage": 72.03, "elapsed_time": "6:39:45", "remaining_time": "2:35:16"}
70
+ {"current_steps": 7000, "total_steps": 9580, "loss": 0.4035, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.375573531044645e-05, "epoch": 3.65, "percentage": 73.07, "elapsed_time": "6:45:34", "remaining_time": "2:29:28"}
71
+ {"current_steps": 7100, "total_steps": 9580, "loss": 0.4271, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.1335208582720856e-05, "epoch": 3.71, "percentage": 74.11, "elapsed_time": "6:51:27", "remaining_time": "2:23:43"}
72
+ {"current_steps": 7200, "total_steps": 9580, "loss": 0.4546, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8988517132607428e-05, "epoch": 3.76, "percentage": 75.16, "elapsed_time": "6:57:16", "remaining_time": "2:17:56"}
73
+ {"current_steps": 7300, "total_steps": 9580, "loss": 0.3988, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.6718184358358956e-05, "epoch": 3.81, "percentage": 76.2, "elapsed_time": "7:03:10", "remaining_time": "2:12:10"}
74
+ {"current_steps": 7400, "total_steps": 9580, "loss": 0.3839, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.4526651549713608e-05, "epoch": 3.86, "percentage": 77.24, "elapsed_time": "7:09:06", "remaining_time": "2:06:24"}
75
+ {"current_steps": 7500, "total_steps": 9580, "loss": 0.3956, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2436969776075456e-05, "epoch": 3.91, "percentage": 78.29, "elapsed_time": "7:14:52", "remaining_time": "2:00:36"}
76
+ {"current_steps": 7600, "total_steps": 9580, "loss": 0.4, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.040917410080746e-05, "epoch": 3.97, "percentage": 79.33, "elapsed_time": "7:20:45", "remaining_time": "1:54:49"}
77
+ {"current_steps": 7700, "total_steps": 9580, "loss": 0.3287, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8466962472390136e-05, "epoch": 4.02, "percentage": 80.38, "elapsed_time": "7:26:32", "remaining_time": "1:49:01"}
78
+ {"current_steps": 7800, "total_steps": 9580, "loss": 0.2069, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.661242335176261e-05, "epoch": 4.07, "percentage": 81.42, "elapsed_time": "7:32:22", "remaining_time": "1:43:13"}
79
+ {"current_steps": 7900, "total_steps": 9580, "loss": 0.1645, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4847550925581377e-05, "epoch": 4.12, "percentage": 82.46, "elapsed_time": "7:38:07", "remaining_time": "1:37:25"}
80
+ {"current_steps": 8000, "total_steps": 9580, "loss": 0.1888, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3174242961870542e-05, "epoch": 4.18, "percentage": 83.51, "elapsed_time": "7:44:02", "remaining_time": "1:31:38"}
81
+ {"current_steps": 8100, "total_steps": 9580, "loss": 0.1644, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1594298769351286e-05, "epoch": 4.23, "percentage": 84.55, "elapsed_time": "7:49:54", "remaining_time": "1:25:51"}
82
+ {"current_steps": 8200, "total_steps": 9580, "loss": 0.2169, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0109417262644261e-05, "epoch": 4.28, "percentage": 85.59, "elapsed_time": "7:55:43", "remaining_time": "1:20:03"}
83
+ {"current_steps": 8300, "total_steps": 9580, "loss": 0.1992, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.72119513542623e-06, "epoch": 4.33, "percentage": 86.64, "elapsed_time": "8:01:31", "remaining_time": "1:14:15"}
84
+ {"current_steps": 8400, "total_steps": 9580, "loss": 0.2011, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.431125143504525e-06, "epoch": 4.38, "percentage": 87.68, "elapsed_time": "8:07:19", "remaining_time": "1:08:27"}
85
+ {"current_steps": 8500, "total_steps": 9580, "loss": 0.1783, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.240594499656316e-06, "epoch": 4.44, "percentage": 88.73, "elapsed_time": "8:13:07", "remaining_time": "1:02:39"}
86
+ {"current_steps": 8600, "total_steps": 9580, "loss": 0.1813, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.150883381957983e-06, "epoch": 4.49, "percentage": 89.77, "elapsed_time": "8:18:54", "remaining_time": "0:56:51"}
87
+ {"current_steps": 8700, "total_steps": 9580, "loss": 0.2027, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1631635572092706e-06, "epoch": 4.54, "percentage": 90.81, "elapsed_time": "8:24:39", "remaining_time": "0:51:02"}
88
+ {"current_steps": 8800, "total_steps": 9580, "loss": 0.216, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.2784971209318673e-06, "epoch": 4.59, "percentage": 91.86, "elapsed_time": "8:30:38", "remaining_time": "0:45:15"}
89
+ {"current_steps": 8900, "total_steps": 9580, "loss": 0.202, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.4978353552977398e-06, "epoch": 4.65, "percentage": 92.9, "elapsed_time": "8:36:28", "remaining_time": "0:39:27"}
90
+ {"current_steps": 9000, "total_steps": 9580, "loss": 0.1762, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.822017706215029e-06, "epoch": 4.7, "percentage": 93.95, "elapsed_time": "8:42:21", "remaining_time": "0:33:39"}
91
+ {"current_steps": 9100, "total_steps": 9580, "loss": 0.217, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2517708806714657e-06, "epoch": 4.75, "percentage": 94.99, "elapsed_time": "8:48:12", "remaining_time": "0:27:51"}
92
+ {"current_steps": 9200, "total_steps": 9580, "loss": 0.2045, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.877080653061031e-07, "epoch": 4.8, "percentage": 96.03, "elapsed_time": "8:54:06", "remaining_time": "0:22:03"}
93
+ {"current_steps": 9300, "total_steps": 9580, "loss": 0.2263, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.303282670495068e-07, "epoch": 4.85, "percentage": 97.08, "elapsed_time": "8:59:54", "remaining_time": "0:16:15"}
94
+ {"current_steps": 9400, "total_steps": 9580, "loss": 0.192, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.800157765413535e-07, "epoch": 4.91, "percentage": 98.12, "elapsed_time": "9:05:44", "remaining_time": "0:10:27"}
95
+ {"current_steps": 9500, "total_steps": 9580, "loss": 0.2436, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.703975490257916e-08, "epoch": 4.96, "percentage": 99.16, "elapsed_time": "9:11:33", "remaining_time": "0:04:38"}
96
+ {"current_steps": 9580, "total_steps": 9580, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "9:16:09", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,695 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
+ "eval_steps": 500,
6
+ "global_step": 9580,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05,
13
+ "grad_norm": 1.0843530893325806,
14
+ "learning_rate": 0.00019994623498004714,
15
+ "loss": 1.8222,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.1,
20
+ "grad_norm": 0.9015785455703735,
21
+ "learning_rate": 0.00019978499773373596,
22
+ "loss": 1.8565,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.16,
27
+ "grad_norm": 1.2422339916229248,
28
+ "learning_rate": 0.00019951646163954176,
29
+ "loss": 1.7994,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.21,
34
+ "grad_norm": 1.0499353408813477,
35
+ "learning_rate": 0.0001991409154544338,
36
+ "loss": 1.6984,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.26,
41
+ "grad_norm": 1.122926950454712,
42
+ "learning_rate": 0.00019865876300337478,
43
+ "loss": 1.6973,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.31,
48
+ "grad_norm": 1.3406814336776733,
49
+ "learning_rate": 0.00019807052274508773,
50
+ "loss": 1.7661,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 0.37,
55
+ "grad_norm": 0.7354043126106262,
56
+ "learning_rate": 0.00019737682721455714,
57
+ "loss": 1.6948,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.42,
62
+ "grad_norm": 2.118002414703369,
63
+ "learning_rate": 0.0001965784223428638,
64
+ "loss": 1.8664,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 0.47,
69
+ "grad_norm": 1.6859912872314453,
70
+ "learning_rate": 0.00019567616665508485,
71
+ "loss": 1.7087,
72
+ "step": 900
73
+ },
74
+ {
75
+ "epoch": 0.52,
76
+ "grad_norm": 1.2144606113433838,
77
+ "learning_rate": 0.0001946710303471214,
78
+ "loss": 1.6826,
79
+ "step": 1000
80
+ },
81
+ {
82
+ "epoch": 0.57,
83
+ "grad_norm": 1.680190920829773,
84
+ "learning_rate": 0.00019356409424244655,
85
+ "loss": 1.5399,
86
+ "step": 1100
87
+ },
88
+ {
89
+ "epoch": 0.63,
90
+ "grad_norm": 1.834701418876648,
91
+ "learning_rate": 0.00019235654862989537,
92
+ "loss": 1.6394,
93
+ "step": 1200
94
+ },
95
+ {
96
+ "epoch": 0.68,
97
+ "grad_norm": 1.9838495254516602,
98
+ "learning_rate": 0.00019104969198374688,
99
+ "loss": 1.645,
100
+ "step": 1300
101
+ },
102
+ {
103
+ "epoch": 0.73,
104
+ "grad_norm": 2.141674280166626,
105
+ "learning_rate": 0.00018964492956747425,
106
+ "loss": 1.5853,
107
+ "step": 1400
108
+ },
109
+ {
110
+ "epoch": 0.78,
111
+ "grad_norm": 2.6233127117156982,
112
+ "learning_rate": 0.00018814377192266423,
113
+ "loss": 1.5778,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 0.84,
118
+ "grad_norm": 1.6887329816818237,
119
+ "learning_rate": 0.00018654783324473137,
120
+ "loss": 1.6368,
121
+ "step": 1600
122
+ },
123
+ {
124
+ "epoch": 0.89,
125
+ "grad_norm": 3.113426446914673,
126
+ "learning_rate": 0.00018487617447307124,
127
+ "loss": 1.6269,
128
+ "step": 1700
129
+ },
130
+ {
131
+ "epoch": 0.94,
132
+ "grad_norm": 2.1365339756011963,
133
+ "learning_rate": 0.00018309682531549338,
134
+ "loss": 1.7073,
135
+ "step": 1800
136
+ },
137
+ {
138
+ "epoch": 0.99,
139
+ "grad_norm": 2.3787343502044678,
140
+ "learning_rate": 0.00018122812210849337,
141
+ "loss": 1.5544,
142
+ "step": 1900
143
+ },
144
+ {
145
+ "epoch": 1.04,
146
+ "grad_norm": 2.1635191440582275,
147
+ "learning_rate": 0.00017927207426937544,
148
+ "loss": 1.3285,
149
+ "step": 2000
150
+ },
151
+ {
152
+ "epoch": 1.1,
153
+ "grad_norm": 2.331688642501831,
154
+ "learning_rate": 0.00017723078513716157,
155
+ "loss": 1.2529,
156
+ "step": 2100
157
+ },
158
+ {
159
+ "epoch": 1.15,
160
+ "grad_norm": 5.185802936553955,
161
+ "learning_rate": 0.00017510644971087015,
162
+ "loss": 1.3708,
163
+ "step": 2200
164
+ },
165
+ {
166
+ "epoch": 1.2,
167
+ "grad_norm": 3.1339027881622314,
168
+ "learning_rate": 0.0001729013522892329,
169
+ "loss": 1.3309,
170
+ "step": 2300
171
+ },
172
+ {
173
+ "epoch": 1.25,
174
+ "grad_norm": 3.4236795902252197,
175
+ "learning_rate": 0.0001706178640143872,
176
+ "loss": 1.4201,
177
+ "step": 2400
178
+ },
179
+ {
180
+ "epoch": 1.3,
181
+ "grad_norm": 3.849432945251465,
182
+ "learning_rate": 0.00016825844032218625,
183
+ "loss": 1.2956,
184
+ "step": 2500
185
+ },
186
+ {
187
+ "epoch": 1.36,
188
+ "grad_norm": 3.8217601776123047,
189
+ "learning_rate": 0.00016582561830186785,
190
+ "loss": 1.3375,
191
+ "step": 2600
192
+ },
193
+ {
194
+ "epoch": 1.41,
195
+ "grad_norm": 1.5418843030929565,
196
+ "learning_rate": 0.00016332201396792123,
197
+ "loss": 1.2897,
198
+ "step": 2700
199
+ },
200
+ {
201
+ "epoch": 1.46,
202
+ "grad_norm": 2.2709996700286865,
203
+ "learning_rate": 0.00016075031944708584,
204
+ "loss": 1.3323,
205
+ "step": 2800
206
+ },
207
+ {
208
+ "epoch": 1.51,
209
+ "grad_norm": 4.869096279144287,
210
+ "learning_rate": 0.0001581133000835061,
211
+ "loss": 1.2841,
212
+ "step": 2900
213
+ },
214
+ {
215
+ "epoch": 1.57,
216
+ "grad_norm": 3.3247766494750977,
217
+ "learning_rate": 0.00015541379146515603,
218
+ "loss": 1.2666,
219
+ "step": 3000
220
+ },
221
+ {
222
+ "epoch": 1.62,
223
+ "grad_norm": 4.0471110343933105,
224
+ "learning_rate": 0.0001526546963747302,
225
+ "loss": 1.2336,
226
+ "step": 3100
227
+ },
228
+ {
229
+ "epoch": 1.67,
230
+ "grad_norm": 2.8835229873657227,
231
+ "learning_rate": 0.00014986740918973633,
232
+ "loss": 1.2627,
233
+ "step": 3200
234
+ },
235
+ {
236
+ "epoch": 1.72,
237
+ "grad_norm": 4.162999629974365,
238
+ "learning_rate": 0.00014699862334591993,
239
+ "loss": 1.2683,
240
+ "step": 3300
241
+ },
242
+ {
243
+ "epoch": 1.77,
244
+ "grad_norm": 2.161064624786377,
245
+ "learning_rate": 0.00014407929986366458,
246
+ "loss": 1.3076,
247
+ "step": 3400
248
+ },
249
+ {
250
+ "epoch": 1.83,
251
+ "grad_norm": 2.2110209465026855,
252
+ "learning_rate": 0.0001411125778926756,
253
+ "loss": 1.3296,
254
+ "step": 3500
255
+ },
256
+ {
257
+ "epoch": 1.88,
258
+ "grad_norm": 3.9512176513671875,
259
+ "learning_rate": 0.0001381016475502724,
260
+ "loss": 1.2321,
261
+ "step": 3600
262
+ },
263
+ {
264
+ "epoch": 1.93,
265
+ "grad_norm": 1.0244474411010742,
266
+ "learning_rate": 0.00013504974649105364,
267
+ "loss": 1.1754,
268
+ "step": 3700
269
+ },
270
+ {
271
+ "epoch": 1.98,
272
+ "grad_norm": 4.0455217361450195,
273
+ "learning_rate": 0.0001319601564254462,
274
+ "loss": 1.2665,
275
+ "step": 3800
276
+ },
277
+ {
278
+ "epoch": 2.04,
279
+ "grad_norm": 4.162383556365967,
280
+ "learning_rate": 0.00012883619959088054,
281
+ "loss": 1.0379,
282
+ "step": 3900
283
+ },
284
+ {
285
+ "epoch": 2.09,
286
+ "grad_norm": 2.3285574913024902,
287
+ "learning_rate": 0.0001256812351793875,
288
+ "loss": 0.8172,
289
+ "step": 4000
290
+ },
291
+ {
292
+ "epoch": 2.14,
293
+ "grad_norm": 5.439870834350586,
294
+ "learning_rate": 0.0001224986557254578,
295
+ "loss": 0.8519,
296
+ "step": 4100
297
+ },
298
+ {
299
+ "epoch": 2.19,
300
+ "grad_norm": 2.932978391647339,
301
+ "learning_rate": 0.00011929188345804825,
302
+ "loss": 0.8636,
303
+ "step": 4200
304
+ },
305
+ {
306
+ "epoch": 2.24,
307
+ "grad_norm": 5.512314319610596,
308
+ "learning_rate": 0.00011606436662065767,
309
+ "loss": 0.8727,
310
+ "step": 4300
311
+ },
312
+ {
313
+ "epoch": 2.3,
314
+ "grad_norm": 4.129833698272705,
315
+ "learning_rate": 0.00011281957576342934,
316
+ "loss": 0.8618,
317
+ "step": 4400
318
+ },
319
+ {
320
+ "epoch": 2.35,
321
+ "grad_norm": 2.5295193195343018,
322
+ "learning_rate": 0.00010956100001126682,
323
+ "loss": 0.7976,
324
+ "step": 4500
325
+ },
326
+ {
327
+ "epoch": 2.4,
328
+ "grad_norm": 4.148761749267578,
329
+ "learning_rate": 0.00010629214331197683,
330
+ "loss": 0.7871,
331
+ "step": 4600
332
+ },
333
+ {
334
+ "epoch": 2.45,
335
+ "grad_norm": 5.00084924697876,
336
+ "learning_rate": 0.00010301652066847249,
337
+ "loss": 0.8557,
338
+ "step": 4700
339
+ },
340
+ {
341
+ "epoch": 2.51,
342
+ "grad_norm": 5.721644878387451,
343
+ "learning_rate": 9.973765435908962e-05,
344
+ "loss": 0.8898,
345
+ "step": 4800
346
+ },
347
+ {
348
+ "epoch": 2.56,
349
+ "grad_norm": 5.872501850128174,
350
+ "learning_rate": 9.64590701500791e-05,
351
+ "loss": 0.8269,
352
+ "step": 4900
353
+ },
354
+ {
355
+ "epoch": 2.61,
356
+ "grad_norm": 5.41200065612793,
357
+ "learning_rate": 9.318429350434922e-05,
358
+ "loss": 0.7502,
359
+ "step": 5000
360
+ },
361
+ {
362
+ "epoch": 2.66,
363
+ "grad_norm": 3.3696563243865967,
364
+ "learning_rate": 8.991684579053403e-05,
365
+ "loss": 0.8201,
366
+ "step": 5100
367
+ },
368
+ {
369
+ "epoch": 2.71,
370
+ "grad_norm": 3.6817996501922607,
371
+ "learning_rate": 8.666024049646397e-05,
372
+ "loss": 0.8618,
373
+ "step": 5200
374
+ },
375
+ {
376
+ "epoch": 2.77,
377
+ "grad_norm": 4.630326747894287,
378
+ "learning_rate": 8.341797945111142e-05,
379
+ "loss": 0.8255,
380
+ "step": 5300
381
+ },
382
+ {
383
+ "epoch": 2.82,
384
+ "grad_norm": 2.672938823699951,
385
+ "learning_rate": 8.019354905907224e-05,
386
+ "loss": 0.759,
387
+ "step": 5400
388
+ },
389
+ {
390
+ "epoch": 2.87,
391
+ "grad_norm": 3.0967512130737305,
392
+ "learning_rate": 7.699041655163378e-05,
393
+ "loss": 0.8287,
394
+ "step": 5500
395
+ },
396
+ {
397
+ "epoch": 2.92,
398
+ "grad_norm": 4.052151679992676,
399
+ "learning_rate": 7.381202625845948e-05,
400
+ "loss": 0.8356,
401
+ "step": 5600
402
+ },
403
+ {
404
+ "epoch": 2.97,
405
+ "grad_norm": 3.5286691188812256,
406
+ "learning_rate": 7.066179590389994e-05,
407
+ "loss": 0.9181,
408
+ "step": 5700
409
+ },
410
+ {
411
+ "epoch": 3.03,
412
+ "grad_norm": 4.84535026550293,
413
+ "learning_rate": 6.754311293191257e-05,
414
+ "loss": 0.5811,
415
+ "step": 5800
416
+ },
417
+ {
418
+ "epoch": 3.08,
419
+ "grad_norm": 2.9033737182617188,
420
+ "learning_rate": 6.44593308635417e-05,
421
+ "loss": 0.4604,
422
+ "step": 5900
423
+ },
424
+ {
425
+ "epoch": 3.13,
426
+ "grad_norm": 1.6759638786315918,
427
+ "learning_rate": 6.1413765690876e-05,
428
+ "loss": 0.412,
429
+ "step": 6000
430
+ },
431
+ {
432
+ "epoch": 3.18,
433
+ "grad_norm": 3.905642032623291,
434
+ "learning_rate": 5.840969231136102e-05,
435
+ "loss": 0.4478,
436
+ "step": 6100
437
+ },
438
+ {
439
+ "epoch": 3.24,
440
+ "grad_norm": 7.030938148498535,
441
+ "learning_rate": 5.5450341006300535e-05,
442
+ "loss": 0.3905,
443
+ "step": 6200
444
+ },
445
+ {
446
+ "epoch": 3.29,
447
+ "grad_norm": 5.1767988204956055,
448
+ "learning_rate": 5.2538893967333866e-05,
449
+ "loss": 0.401,
450
+ "step": 6300
451
+ },
452
+ {
453
+ "epoch": 3.34,
454
+ "grad_norm": 2.0529091358184814,
455
+ "learning_rate": 4.9678481874623836e-05,
456
+ "loss": 0.417,
457
+ "step": 6400
458
+ },
459
+ {
460
+ "epoch": 3.39,
461
+ "grad_norm": 4.730635643005371,
462
+ "learning_rate": 4.687218053043516e-05,
463
+ "loss": 0.4213,
464
+ "step": 6500
465
+ },
466
+ {
467
+ "epoch": 3.44,
468
+ "grad_norm": 5.270570278167725,
469
+ "learning_rate": 4.412300755172314e-05,
470
+ "loss": 0.4371,
471
+ "step": 6600
472
+ },
473
+ {
474
+ "epoch": 3.5,
475
+ "grad_norm": 5.049856185913086,
476
+ "learning_rate": 4.1433919125288914e-05,
477
+ "loss": 0.4686,
478
+ "step": 6700
479
+ },
480
+ {
481
+ "epoch": 3.55,
482
+ "grad_norm": 5.543980121612549,
483
+ "learning_rate": 3.8807806828990455e-05,
484
+ "loss": 0.4017,
485
+ "step": 6800
486
+ },
487
+ {
488
+ "epoch": 3.6,
489
+ "grad_norm": 5.048495292663574,
490
+ "learning_rate": 3.624749452242799e-05,
491
+ "loss": 0.4227,
492
+ "step": 6900
493
+ },
494
+ {
495
+ "epoch": 3.65,
496
+ "grad_norm": 6.684240818023682,
497
+ "learning_rate": 3.375573531044645e-05,
498
+ "loss": 0.4035,
499
+ "step": 7000
500
+ },
501
+ {
502
+ "epoch": 3.71,
503
+ "grad_norm": 6.066633701324463,
504
+ "learning_rate": 3.1335208582720856e-05,
505
+ "loss": 0.4271,
506
+ "step": 7100
507
+ },
508
+ {
509
+ "epoch": 3.76,
510
+ "grad_norm": 4.037696361541748,
511
+ "learning_rate": 2.8988517132607428e-05,
512
+ "loss": 0.4546,
513
+ "step": 7200
514
+ },
515
+ {
516
+ "epoch": 3.81,
517
+ "grad_norm": 4.5075602531433105,
518
+ "learning_rate": 2.6718184358358956e-05,
519
+ "loss": 0.3988,
520
+ "step": 7300
521
+ },
522
+ {
523
+ "epoch": 3.86,
524
+ "grad_norm": 2.9646005630493164,
525
+ "learning_rate": 2.4526651549713608e-05,
526
+ "loss": 0.3839,
527
+ "step": 7400
528
+ },
529
+ {
530
+ "epoch": 3.91,
531
+ "grad_norm": 6.707378387451172,
532
+ "learning_rate": 2.2436969776075456e-05,
533
+ "loss": 0.3956,
534
+ "step": 7500
535
+ },
536
+ {
537
+ "epoch": 3.97,
538
+ "grad_norm": 6.770478248596191,
539
+ "learning_rate": 2.040917410080746e-05,
540
+ "loss": 0.4,
541
+ "step": 7600
542
+ },
543
+ {
544
+ "epoch": 4.02,
545
+ "grad_norm": 3.7659125328063965,
546
+ "learning_rate": 1.8466962472390136e-05,
547
+ "loss": 0.3287,
548
+ "step": 7700
549
+ },
550
+ {
551
+ "epoch": 4.07,
552
+ "grad_norm": 2.905404567718506,
553
+ "learning_rate": 1.661242335176261e-05,
554
+ "loss": 0.2069,
555
+ "step": 7800
556
+ },
557
+ {
558
+ "epoch": 4.12,
559
+ "grad_norm": 5.363249778747559,
560
+ "learning_rate": 1.4847550925581377e-05,
561
+ "loss": 0.1645,
562
+ "step": 7900
563
+ },
564
+ {
565
+ "epoch": 4.18,
566
+ "grad_norm": 5.363257884979248,
567
+ "learning_rate": 1.3174242961870542e-05,
568
+ "loss": 0.1888,
569
+ "step": 8000
570
+ },
571
+ {
572
+ "epoch": 4.23,
573
+ "grad_norm": 2.431536912918091,
574
+ "learning_rate": 1.1594298769351286e-05,
575
+ "loss": 0.1644,
576
+ "step": 8100
577
+ },
578
+ {
579
+ "epoch": 4.28,
580
+ "grad_norm": 2.4400837421417236,
581
+ "learning_rate": 1.0109417262644261e-05,
582
+ "loss": 0.2169,
583
+ "step": 8200
584
+ },
585
+ {
586
+ "epoch": 4.33,
587
+ "grad_norm": 8.288840293884277,
588
+ "learning_rate": 8.72119513542623e-06,
589
+ "loss": 0.1992,
590
+ "step": 8300
591
+ },
592
+ {
593
+ "epoch": 4.38,
594
+ "grad_norm": 3.36771297454834,
595
+ "learning_rate": 7.431125143504525e-06,
596
+ "loss": 0.2011,
597
+ "step": 8400
598
+ },
599
+ {
600
+ "epoch": 4.44,
601
+ "grad_norm": 0.056750617921352386,
602
+ "learning_rate": 6.240594499656316e-06,
603
+ "loss": 0.1783,
604
+ "step": 8500
605
+ },
606
+ {
607
+ "epoch": 4.49,
608
+ "grad_norm": 4.8992743492126465,
609
+ "learning_rate": 5.150883381957983e-06,
610
+ "loss": 0.1813,
611
+ "step": 8600
612
+ },
613
+ {
614
+ "epoch": 4.54,
615
+ "grad_norm": 2.5150370597839355,
616
+ "learning_rate": 4.1631635572092706e-06,
617
+ "loss": 0.2027,
618
+ "step": 8700
619
+ },
620
+ {
621
+ "epoch": 4.59,
622
+ "grad_norm": 4.545584201812744,
623
+ "learning_rate": 3.2784971209318673e-06,
624
+ "loss": 0.216,
625
+ "step": 8800
626
+ },
627
+ {
628
+ "epoch": 4.65,
629
+ "grad_norm": 6.020129680633545,
630
+ "learning_rate": 2.4978353552977398e-06,
631
+ "loss": 0.202,
632
+ "step": 8900
633
+ },
634
+ {
635
+ "epoch": 4.7,
636
+ "grad_norm": 5.093991279602051,
637
+ "learning_rate": 1.822017706215029e-06,
638
+ "loss": 0.1762,
639
+ "step": 9000
640
+ },
641
+ {
642
+ "epoch": 4.75,
643
+ "grad_norm": 2.0878984928131104,
644
+ "learning_rate": 1.2517708806714657e-06,
645
+ "loss": 0.217,
646
+ "step": 9100
647
+ },
648
+ {
649
+ "epoch": 4.8,
650
+ "grad_norm": 2.145759105682373,
651
+ "learning_rate": 7.877080653061031e-07,
652
+ "loss": 0.2045,
653
+ "step": 9200
654
+ },
655
+ {
656
+ "epoch": 4.85,
657
+ "grad_norm": 8.151817321777344,
658
+ "learning_rate": 4.303282670495068e-07,
659
+ "loss": 0.2263,
660
+ "step": 9300
661
+ },
662
+ {
663
+ "epoch": 4.91,
664
+ "grad_norm": 6.520354270935059,
665
+ "learning_rate": 1.800157765413535e-07,
666
+ "loss": 0.192,
667
+ "step": 9400
668
+ },
669
+ {
670
+ "epoch": 4.96,
671
+ "grad_norm": 7.271907806396484,
672
+ "learning_rate": 3.703975490257916e-08,
673
+ "loss": 0.2436,
674
+ "step": 9500
675
+ },
676
+ {
677
+ "epoch": 5.0,
678
+ "step": 9580,
679
+ "total_flos": 1.5336527740560998e+18,
680
+ "train_loss": 0.886494568245396,
681
+ "train_runtime": 33369.3902,
682
+ "train_samples_per_second": 1.148,
683
+ "train_steps_per_second": 0.287
684
+ }
685
+ ],
686
+ "logging_steps": 100,
687
+ "max_steps": 9580,
688
+ "num_input_tokens_seen": 0,
689
+ "num_train_epochs": 5,
690
+ "save_steps": 1000,
691
+ "total_flos": 1.5336527740560998e+18,
692
+ "train_batch_size": 4,
693
+ "trial_name": null,
694
+ "trial_params": null
695
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:347246aece17461af2bf7ca00e633da815261b88a5734925bcb6f918b0b7dd08
3
+ size 5048