diff --git a/checkpoint-1000/README.md b/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-1000/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-1000/adapter_config.json b/checkpoint-1000/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-1000/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1000/adapter_model.safetensors b/checkpoint-1000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4245d3b036a6c6ac5b23e62f817077453e37edac --- /dev/null +++ b/checkpoint-1000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d53edf161f3a07e2d551714247c93fc284b893fb5528a49a55fadf70ff8eed41 +size 7807744 diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0861a6937a494837de1439bb3a6ee2496f88943d --- /dev/null +++ b/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7670653bcd7d18fd04a1f4d7fbb09d9a94c9d2796d6ae96f3b88f0a70245ba2b +size 15644485 diff --git a/checkpoint-1000/rng_state.pth b/checkpoint-1000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ec8ada51fa2b45b13768fc6f74db64f1b797309e --- /dev/null +++ b/checkpoint-1000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:092276df08ab03d1c657bebf8c8ca092758eefddc44fbdcfe4fa09446ad954a3 +size 14575 diff --git 
a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfd932f83964ec016edf1edc168405bc6749bf2b --- /dev/null +++ b/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a724eb8dd004def7e4c5b752d32b5f2b92570d2204069c06be7d67c6e7933312 +size 627 diff --git a/checkpoint-1000/special_tokens_map.json b/checkpoint-1000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-1000/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ] +} diff --git a/checkpoint-1000/tokenization_chatglm.py b/checkpoint-1000/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-1000/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == 
self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) + return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab 
+ + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + 
json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. 
+ + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. 
+ if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-1000/tokenizer.model b/checkpoint-1000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-1000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-1000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ b/checkpoint-1000/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 
1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..60bd1636abc99ebc41711a5d533f1f3e4e0ea86b --- /dev/null +++ b/checkpoint-1000/trainer_state.json @@ -0,0 +1,1221 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5018190942165349, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 
65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 
165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + "step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 
265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + "step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 
365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + "step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 
465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + 
"step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + 
"step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + "loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + 
"step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + "loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, 
+ "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 
2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 5.320804662283469e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475 diff --git a/checkpoint-1500/README.md b/checkpoint-1500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-1500/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information 
Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-1500/adapter_config.json b/checkpoint-1500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-1500/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1500/adapter_model.safetensors b/checkpoint-1500/adapter_model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..8a38a88cd7e29d50b45fbedd78d2ef0c94e94484 --- /dev/null +++ b/checkpoint-1500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84490235df6826161e1e701ff9a1c761a3a25670fd30944a4b41f5c5ec1504b0 +size 7807744 diff --git a/checkpoint-1500/optimizer.pt b/checkpoint-1500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..28b6a2fd54aad576bc50597507eccbb737b330e0 --- /dev/null +++ b/checkpoint-1500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6a722ed8329e8f09783e7b151a4b15eb1b7717e70e9a6487f0a9b6f72685224 +size 15644485 diff --git a/checkpoint-1500/rng_state.pth b/checkpoint-1500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d36b77d22b73c753fcb05ee42510111f58ee9415 --- /dev/null +++ b/checkpoint-1500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4d4bea37dabe731e99dacf209629d05725ac46e96f42aecaad63dd5bf6d55f7 +size 14575 diff --git a/checkpoint-1500/scheduler.pt b/checkpoint-1500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5ea52a77bcd7baa6da7a5196e224a0dbb72b82d --- /dev/null +++ b/checkpoint-1500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c08c85a6d7b0767f91057e7c98cc42bdfa246f54b1e5570219edec6128d8b66 +size 627 diff --git a/checkpoint-1500/special_tokens_map.json b/checkpoint-1500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-1500/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false 
+ } + ] +} diff --git a/checkpoint-1500/tokenization_chatglm.py b/checkpoint-1500/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-1500/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) 
+ return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + 
def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. 
+ + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). 
+ return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-1500/tokenizer.model b/checkpoint-1500/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-1500/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-1500/tokenizer_config.json b/checkpoint-1500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ 
b/checkpoint-1500/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-1500/trainer_state.json b/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f416c25db5c5efdbb23d056e0fa0b56e87771d76 --- /dev/null +++ b/checkpoint-1500/trainer_state.json @@ -0,0 +1,1821 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7527286413248024, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + 
"step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 
130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + 
"step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + 
"step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + 
"step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 
2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + "step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 
2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + "step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + 
"loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + "step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + 
"loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, + "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 
4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 4.8759297640025235e-05, + "loss": 2.9469, + "step": 1005 + }, + { + "epoch": 0.51, + "learning_rate": 4.874700152955661e-05, + "loss": 2.9745, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 4.8734646354147936e-05, + "loss": 2.9341, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 4.8722232144529754e-05, + "loss": 3.0511, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 4.870975893157941e-05, + "loss": 2.5396, + "step": 1025 + }, + { + "epoch": 0.52, + 
"learning_rate": 4.8697226746321004e-05, + "loss": 2.6699, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 4.868463561992532e-05, + "loss": 2.4887, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 4.867198558370977e-05, + "loss": 2.4773, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 4.865927666913825e-05, + "loss": 2.7612, + "step": 1045 + }, + { + "epoch": 0.53, + "learning_rate": 4.864650890782113e-05, + "loss": 2.9116, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 4.863368233151514e-05, + "loss": 2.8205, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 4.862079697212329e-05, + "loss": 2.6711, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 4.8607852861694804e-05, + "loss": 3.1138, + "step": 1065 + }, + { + "epoch": 0.54, + "learning_rate": 4.859485003242503e-05, + "loss": 2.5603, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 4.858178851665539e-05, + "loss": 2.7981, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 4.856866834687323e-05, + "loss": 2.507, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 4.855548955571183e-05, + "loss": 3.0315, + "step": 1085 + }, + { + "epoch": 0.55, + "learning_rate": 4.8542252175950244e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 4.852895624051326e-05, + "loss": 2.6794, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 4.851560178247132e-05, + "loss": 2.8278, + "step": 1100 + }, + { + "epoch": 0.55, + "learning_rate": 4.850218883504041e-05, + "loss": 2.6993, + "step": 1105 + }, + { + "epoch": 0.56, + "learning_rate": 4.8488717431582005e-05, + "loss": 2.875, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 4.8475187605602974e-05, + "loss": 2.6057, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 4.84615993907555e-05, + "loss": 2.847, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 4.844795282083697e-05, + "loss": 2.7652, + "step": 1125 + }, + { 
+ "epoch": 0.57, + "learning_rate": 4.843424792978997e-05, + "loss": 2.8128, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 4.842048475170209e-05, + "loss": 2.7077, + "step": 1135 + }, + { + "epoch": 0.57, + "learning_rate": 4.840666332080592e-05, + "loss": 2.7081, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 4.8392783671478934e-05, + "loss": 2.6479, + "step": 1145 + }, + { + "epoch": 0.58, + "learning_rate": 4.837884583824342e-05, + "loss": 2.667, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 4.836484985576638e-05, + "loss": 2.7514, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 4.835079575885944e-05, + "loss": 2.5058, + "step": 1160 + }, + { + "epoch": 0.58, + "learning_rate": 4.833668358247876e-05, + "loss": 2.6183, + "step": 1165 + }, + { + "epoch": 0.59, + "learning_rate": 4.8322513361725006e-05, + "loss": 2.9959, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 4.830828513184317e-05, + "loss": 2.5714, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 4.8293998928222536e-05, + "loss": 2.965, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 4.827965478639661e-05, + "loss": 2.2106, + "step": 1185 + }, + { + "epoch": 0.6, + "learning_rate": 4.8265252742042965e-05, + "loss": 2.7456, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 4.8250792830983225e-05, + "loss": 2.5694, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 4.8236275089182936e-05, + "loss": 2.6826, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 4.8221699552751465e-05, + "loss": 2.878, + "step": 1205 + }, + { + "epoch": 0.61, + "learning_rate": 4.820706625794196e-05, + "loss": 2.8191, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 4.81923752411512e-05, + "loss": 2.739, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 4.8177626538919565e-05, + "loss": 3.0544, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 4.8162820187930875e-05, + "loss": 2.8393, + 
"step": 1225 + }, + { + "epoch": 0.62, + "learning_rate": 4.814795622501237e-05, + "loss": 2.4457, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 4.813303468713456e-05, + "loss": 2.8575, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 4.8118055611411197e-05, + "loss": 2.8307, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 4.810301903509909e-05, + "loss": 2.7951, + "step": 1245 + }, + { + "epoch": 0.63, + "learning_rate": 4.8087924995598125e-05, + "loss": 2.8456, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 4.807277353045106e-05, + "loss": 2.3564, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 4.8057564677343524e-05, + "loss": 2.5076, + "step": 1260 + }, + { + "epoch": 0.63, + "learning_rate": 4.8042298474103884e-05, + "loss": 2.605, + "step": 1265 + }, + { + "epoch": 0.64, + "learning_rate": 4.8026974958703116e-05, + "loss": 2.4782, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 4.8011594169254784e-05, + "loss": 2.7193, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 4.799615614401488e-05, + "loss": 2.8284, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 4.798066092138178e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.65, + "learning_rate": 4.796510853989612e-05, + "loss": 2.7396, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 4.794949903824069e-05, + "loss": 2.7948, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 4.793383245524035e-05, + "loss": 2.7818, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 4.791810882986197e-05, + "loss": 2.7334, + "step": 1305 + }, + { + "epoch": 0.66, + "learning_rate": 4.7902328201214256e-05, + "loss": 2.4824, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 4.7886490608547727e-05, + "loss": 2.6131, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 4.7870596091254584e-05, + "loss": 2.7778, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 
4.7854644688868594e-05, + "loss": 2.5263, + "step": 1325 + }, + { + "epoch": 0.67, + "learning_rate": 4.783863644106502e-05, + "loss": 2.7825, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 4.782257138766053e-05, + "loss": 2.7902, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 4.7806449568613066e-05, + "loss": 2.8333, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 4.779027102402177e-05, + "loss": 2.856, + "step": 1345 + }, + { + "epoch": 0.68, + "learning_rate": 4.777403579412686e-05, + "loss": 2.8021, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 4.775774391930956e-05, + "loss": 2.6639, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741395440091976e-05, + "loss": 2.5226, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.772499039713702e-05, + "loss": 2.6803, + "step": 1365 + }, + { + "epoch": 0.69, + "learning_rate": 4.7708528831248274e-05, + "loss": 2.4608, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.769201078336991e-05, + "loss": 2.786, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.7675436294586586e-05, + "loss": 2.8294, + "step": 1380 + }, + { + "epoch": 0.7, + "learning_rate": 4.7658805406123356e-05, + "loss": 2.6776, + "step": 1385 + }, + { + "epoch": 0.7, + "learning_rate": 4.7642118159345544e-05, + "loss": 2.8003, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.762537459575865e-05, + "loss": 2.7796, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.7608574757008245e-05, + "loss": 2.9156, + "step": 1400 + }, + { + "epoch": 0.71, + "learning_rate": 4.7591718684879883e-05, + "loss": 3.0521, + "step": 1405 + }, + { + "epoch": 0.71, + "learning_rate": 4.7574806421298976e-05, + "loss": 2.6469, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.755783800833071e-05, + "loss": 2.8512, + "step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.754081348817991e-05, + "loss": 2.66, + "step": 1420 + }, + { + "epoch": 
0.72, + "learning_rate": 4.752373290319096e-05, + "loss": 2.6625, + "step": 1425 + }, + { + "epoch": 0.72, + "learning_rate": 4.7506596295847716e-05, + "loss": 2.6711, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 4.7489403708773346e-05, + "loss": 2.8951, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 4.747215518473026e-05, + "loss": 2.7375, + "step": 1440 + }, + { + "epoch": 0.73, + "learning_rate": 4.745485076662e-05, + "loss": 2.8037, + "step": 1445 + }, + { + "epoch": 0.73, + "learning_rate": 4.743749049748315e-05, + "loss": 2.5375, + "step": 1450 + }, + { + "epoch": 0.73, + "learning_rate": 4.742007442049918e-05, + "loss": 2.7664, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 4.7402602578986374e-05, + "loss": 2.9644, + "step": 1460 + }, + { + "epoch": 0.74, + "learning_rate": 4.738507501640175e-05, + "loss": 2.8212, + "step": 1465 + }, + { + "epoch": 0.74, + "learning_rate": 4.736749177634087e-05, + "loss": 2.7201, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 4.734985290253782e-05, + "loss": 2.7087, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 4.7332158438865035e-05, + "loss": 2.5502, + "step": 1480 + }, + { + "epoch": 0.75, + "learning_rate": 4.731440842933322e-05, + "loss": 2.7607, + "step": 1485 + }, + { + "epoch": 0.75, + "learning_rate": 4.729660291809126e-05, + "loss": 2.5601, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 4.727874194942606e-05, + "loss": 2.5562, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 4.7260825567762486e-05, + "loss": 2.5539, + "step": 1500 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 7.938917340256666e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1500/training_args.bin b/checkpoint-1500/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-1500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475 diff --git a/checkpoint-2000/README.md b/checkpoint-2000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-2000/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-2000/adapter_config.json b/checkpoint-2000/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-2000/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2000/adapter_model.safetensors b/checkpoint-2000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d86af836c88136d3483d45d71dcce9d410a70bca --- /dev/null +++ b/checkpoint-2000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c587894dfb0d6a36b1b2b2bd92c9853c7e8efe4659095de4b7806907caa047f +size 7807744 diff --git a/checkpoint-2000/optimizer.pt b/checkpoint-2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e76430091bf74b8af6d7c4f7c5fcd4b1b7d23005 --- /dev/null +++ b/checkpoint-2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8fdc41063e12d72eb10d41d8c4ee533b99fd680d9681612b843ce985e48672e +size 15644485 diff --git a/checkpoint-2000/rng_state.pth b/checkpoint-2000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a3d88dd1b880316135090d6cf3615a1a887911da --- /dev/null +++ b/checkpoint-2000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44baa97c777d40935f5d263141ec5c7c84944e0c2da4b888420b7db30896fc13 +size 14575 diff --git 
a/checkpoint-2000/scheduler.pt b/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb998259288f4232f53b9ba1ba11a9c0e761341b --- /dev/null +++ b/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a9fc87da7bbd820e1299c4cd3906d309a71ffb7d550c30c19efae928b59c8ee +size 627 diff --git a/checkpoint-2000/special_tokens_map.json b/checkpoint-2000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-2000/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ] +} diff --git a/checkpoint-2000/tokenization_chatglm.py b/checkpoint-2000/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-2000/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == 
self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) + return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab 
+ + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + 
json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. 
+ + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. 
+ if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-2000/tokenizer.model b/checkpoint-2000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-2000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-2000/tokenizer_config.json b/checkpoint-2000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ b/checkpoint-2000/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 
1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..70b96535688689d1e3e4f8e6857b0471e984be27 --- /dev/null +++ b/checkpoint-2000/trainer_state.json @@ -0,0 +1,2421 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0036381884330698, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 
65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 
165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + "step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 
265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + "step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 
365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + "step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 
465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + 
"step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + 
"step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + "loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + 
"step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + "loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, 
+ "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 
2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 4.8759297640025235e-05, + "loss": 2.9469, + "step": 1005 + }, + { + "epoch": 0.51, + "learning_rate": 4.874700152955661e-05, + "loss": 2.9745, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 4.8734646354147936e-05, + "loss": 2.9341, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 4.8722232144529754e-05, + "loss": 3.0511, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 4.870975893157941e-05, + "loss": 2.5396, + "step": 1025 + }, + { + "epoch": 0.52, + "learning_rate": 4.8697226746321004e-05, + "loss": 2.6699, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 4.868463561992532e-05, + "loss": 2.4887, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 4.867198558370977e-05, + "loss": 2.4773, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 4.865927666913825e-05, + "loss": 2.7612, + "step": 1045 + }, + { + "epoch": 0.53, + "learning_rate": 4.864650890782113e-05, + "loss": 2.9116, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 4.863368233151514e-05, + "loss": 2.8205, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 4.862079697212329e-05, + "loss": 2.6711, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 
4.8607852861694804e-05, + "loss": 3.1138, + "step": 1065 + }, + { + "epoch": 0.54, + "learning_rate": 4.859485003242503e-05, + "loss": 2.5603, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 4.858178851665539e-05, + "loss": 2.7981, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 4.856866834687323e-05, + "loss": 2.507, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 4.855548955571183e-05, + "loss": 3.0315, + "step": 1085 + }, + { + "epoch": 0.55, + "learning_rate": 4.8542252175950244e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 4.852895624051326e-05, + "loss": 2.6794, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 4.851560178247132e-05, + "loss": 2.8278, + "step": 1100 + }, + { + "epoch": 0.55, + "learning_rate": 4.850218883504041e-05, + "loss": 2.6993, + "step": 1105 + }, + { + "epoch": 0.56, + "learning_rate": 4.8488717431582005e-05, + "loss": 2.875, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 4.8475187605602974e-05, + "loss": 2.6057, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 4.84615993907555e-05, + "loss": 2.847, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 4.844795282083697e-05, + "loss": 2.7652, + "step": 1125 + }, + { + "epoch": 0.57, + "learning_rate": 4.843424792978997e-05, + "loss": 2.8128, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 4.842048475170209e-05, + "loss": 2.7077, + "step": 1135 + }, + { + "epoch": 0.57, + "learning_rate": 4.840666332080592e-05, + "loss": 2.7081, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 4.8392783671478934e-05, + "loss": 2.6479, + "step": 1145 + }, + { + "epoch": 0.58, + "learning_rate": 4.837884583824342e-05, + "loss": 2.667, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 4.836484985576638e-05, + "loss": 2.7514, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 4.835079575885944e-05, + "loss": 2.5058, + "step": 1160 + }, + { + "epoch": 0.58, 
+ "learning_rate": 4.833668358247876e-05, + "loss": 2.6183, + "step": 1165 + }, + { + "epoch": 0.59, + "learning_rate": 4.8322513361725006e-05, + "loss": 2.9959, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 4.830828513184317e-05, + "loss": 2.5714, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 4.8293998928222536e-05, + "loss": 2.965, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 4.827965478639661e-05, + "loss": 2.2106, + "step": 1185 + }, + { + "epoch": 0.6, + "learning_rate": 4.8265252742042965e-05, + "loss": 2.7456, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 4.8250792830983225e-05, + "loss": 2.5694, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 4.8236275089182936e-05, + "loss": 2.6826, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 4.8221699552751465e-05, + "loss": 2.878, + "step": 1205 + }, + { + "epoch": 0.61, + "learning_rate": 4.820706625794196e-05, + "loss": 2.8191, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 4.81923752411512e-05, + "loss": 2.739, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 4.8177626538919565e-05, + "loss": 3.0544, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 4.8162820187930875e-05, + "loss": 2.8393, + "step": 1225 + }, + { + "epoch": 0.62, + "learning_rate": 4.814795622501237e-05, + "loss": 2.4457, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 4.813303468713456e-05, + "loss": 2.8575, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 4.8118055611411197e-05, + "loss": 2.8307, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 4.810301903509909e-05, + "loss": 2.7951, + "step": 1245 + }, + { + "epoch": 0.63, + "learning_rate": 4.8087924995598125e-05, + "loss": 2.8456, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 4.807277353045106e-05, + "loss": 2.3564, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 4.8057564677343524e-05, + "loss": 2.5076, + "step": 1260 + 
}, + { + "epoch": 0.63, + "learning_rate": 4.8042298474103884e-05, + "loss": 2.605, + "step": 1265 + }, + { + "epoch": 0.64, + "learning_rate": 4.8026974958703116e-05, + "loss": 2.4782, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 4.8011594169254784e-05, + "loss": 2.7193, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 4.799615614401488e-05, + "loss": 2.8284, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 4.798066092138178e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.65, + "learning_rate": 4.796510853989612e-05, + "loss": 2.7396, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 4.794949903824069e-05, + "loss": 2.7948, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 4.793383245524035e-05, + "loss": 2.7818, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 4.791810882986197e-05, + "loss": 2.7334, + "step": 1305 + }, + { + "epoch": 0.66, + "learning_rate": 4.7902328201214256e-05, + "loss": 2.4824, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 4.7886490608547727e-05, + "loss": 2.6131, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 4.7870596091254584e-05, + "loss": 2.7778, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 4.7854644688868594e-05, + "loss": 2.5263, + "step": 1325 + }, + { + "epoch": 0.67, + "learning_rate": 4.783863644106502e-05, + "loss": 2.7825, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 4.782257138766053e-05, + "loss": 2.7902, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 4.7806449568613066e-05, + "loss": 2.8333, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 4.779027102402177e-05, + "loss": 2.856, + "step": 1345 + }, + { + "epoch": 0.68, + "learning_rate": 4.777403579412686e-05, + "loss": 2.8021, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 4.775774391930956e-05, + "loss": 2.6639, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741395440091976e-05, + 
"loss": 2.5226, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.772499039713702e-05, + "loss": 2.6803, + "step": 1365 + }, + { + "epoch": 0.69, + "learning_rate": 4.7708528831248274e-05, + "loss": 2.4608, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.769201078336991e-05, + "loss": 2.786, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.7675436294586586e-05, + "loss": 2.8294, + "step": 1380 + }, + { + "epoch": 0.7, + "learning_rate": 4.7658805406123356e-05, + "loss": 2.6776, + "step": 1385 + }, + { + "epoch": 0.7, + "learning_rate": 4.7642118159345544e-05, + "loss": 2.8003, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.762537459575865e-05, + "loss": 2.7796, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.7608574757008245e-05, + "loss": 2.9156, + "step": 1400 + }, + { + "epoch": 0.71, + "learning_rate": 4.7591718684879883e-05, + "loss": 3.0521, + "step": 1405 + }, + { + "epoch": 0.71, + "learning_rate": 4.7574806421298976e-05, + "loss": 2.6469, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.755783800833071e-05, + "loss": 2.8512, + "step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.754081348817991e-05, + "loss": 2.66, + "step": 1420 + }, + { + "epoch": 0.72, + "learning_rate": 4.752373290319096e-05, + "loss": 2.6625, + "step": 1425 + }, + { + "epoch": 0.72, + "learning_rate": 4.7506596295847716e-05, + "loss": 2.6711, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 4.7489403708773346e-05, + "loss": 2.8951, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 4.747215518473026e-05, + "loss": 2.7375, + "step": 1440 + }, + { + "epoch": 0.73, + "learning_rate": 4.745485076662e-05, + "loss": 2.8037, + "step": 1445 + }, + { + "epoch": 0.73, + "learning_rate": 4.743749049748315e-05, + "loss": 2.5375, + "step": 1450 + }, + { + "epoch": 0.73, + "learning_rate": 4.742007442049918e-05, + "loss": 2.7664, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 
4.7402602578986374e-05, + "loss": 2.9644, + "step": 1460 + }, + { + "epoch": 0.74, + "learning_rate": 4.738507501640175e-05, + "loss": 2.8212, + "step": 1465 + }, + { + "epoch": 0.74, + "learning_rate": 4.736749177634087e-05, + "loss": 2.7201, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 4.734985290253782e-05, + "loss": 2.7087, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 4.7332158438865035e-05, + "loss": 2.5502, + "step": 1480 + }, + { + "epoch": 0.75, + "learning_rate": 4.731440842933322e-05, + "loss": 2.7607, + "step": 1485 + }, + { + "epoch": 0.75, + "learning_rate": 4.729660291809126e-05, + "loss": 2.5601, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 4.727874194942606e-05, + "loss": 2.5562, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 4.7260825567762486e-05, + "loss": 2.5539, + "step": 1500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7242853817663204e-05, + "loss": 2.6405, + "step": 1505 + }, + { + "epoch": 0.76, + "learning_rate": 4.72248267438286e-05, + "loss": 2.9237, + "step": 1510 + }, + { + "epoch": 0.76, + "learning_rate": 4.72067443910967e-05, + "loss": 3.0453, + "step": 1515 + }, + { + "epoch": 0.76, + "learning_rate": 4.718860680444297e-05, + "loss": 2.7176, + "step": 1520 + }, + { + "epoch": 0.77, + "learning_rate": 4.71704140289803e-05, + "loss": 2.6713, + "step": 1525 + }, + { + "epoch": 0.77, + "learning_rate": 4.715216610995883e-05, + "loss": 2.7284, + "step": 1530 + }, + { + "epoch": 0.77, + "learning_rate": 4.713386309276585e-05, + "loss": 2.5022, + "step": 1535 + }, + { + "epoch": 0.77, + "learning_rate": 4.7115505022925706e-05, + "loss": 2.8323, + "step": 1540 + }, + { + "epoch": 0.78, + "learning_rate": 4.7097091946099666e-05, + "loss": 2.7005, + "step": 1545 + }, + { + "epoch": 0.78, + "learning_rate": 4.7078623908085825e-05, + "loss": 2.8142, + "step": 1550 + }, + { + "epoch": 0.78, + "learning_rate": 4.7060100954818974e-05, + "loss": 2.8505, + "step": 1555 + }, + { + "epoch": 
0.78, + "learning_rate": 4.70415231323705e-05, + "loss": 2.5892, + "step": 1560 + }, + { + "epoch": 0.79, + "learning_rate": 4.7022890486948236e-05, + "loss": 2.8951, + "step": 1565 + }, + { + "epoch": 0.79, + "learning_rate": 4.700420306489641e-05, + "loss": 2.7551, + "step": 1570 + }, + { + "epoch": 0.79, + "learning_rate": 4.698546091269547e-05, + "loss": 2.6814, + "step": 1575 + }, + { + "epoch": 0.79, + "learning_rate": 4.696666407696201e-05, + "loss": 2.8698, + "step": 1580 + }, + { + "epoch": 0.8, + "learning_rate": 4.694781260444862e-05, + "loss": 2.8389, + "step": 1585 + }, + { + "epoch": 0.8, + "learning_rate": 4.6928906542043786e-05, + "loss": 2.8353, + "step": 1590 + }, + { + "epoch": 0.8, + "learning_rate": 4.690994593677179e-05, + "loss": 2.6565, + "step": 1595 + }, + { + "epoch": 0.8, + "learning_rate": 4.689093083579256e-05, + "loss": 2.6958, + "step": 1600 + }, + { + "epoch": 0.81, + "learning_rate": 4.687186128640157e-05, + "loss": 2.6768, + "step": 1605 + }, + { + "epoch": 0.81, + "learning_rate": 4.685273733602975e-05, + "loss": 2.734, + "step": 1610 + }, + { + "epoch": 0.81, + "learning_rate": 4.6833559032243284e-05, + "loss": 2.5348, + "step": 1615 + }, + { + "epoch": 0.81, + "learning_rate": 4.6814326422743594e-05, + "loss": 2.7775, + "step": 1620 + }, + { + "epoch": 0.82, + "learning_rate": 4.679503955536715e-05, + "loss": 2.6578, + "step": 1625 + }, + { + "epoch": 0.82, + "learning_rate": 4.6775698478085393e-05, + "loss": 2.8792, + "step": 1630 + }, + { + "epoch": 0.82, + "learning_rate": 4.675630323900458e-05, + "loss": 2.7883, + "step": 1635 + }, + { + "epoch": 0.82, + "learning_rate": 4.67368538863657e-05, + "loss": 2.8824, + "step": 1640 + }, + { + "epoch": 0.83, + "learning_rate": 4.671735046854433e-05, + "loss": 2.6775, + "step": 1645 + }, + { + "epoch": 0.83, + "learning_rate": 4.669779303405051e-05, + "loss": 2.5658, + "step": 1650 + }, + { + "epoch": 0.83, + "learning_rate": 4.667818163152864e-05, + "loss": 2.5262, + "step": 1655 + 
}, + { + "epoch": 0.83, + "learning_rate": 4.665851630975736e-05, + "loss": 2.5749, + "step": 1660 + }, + { + "epoch": 0.84, + "learning_rate": 4.6638797117649424e-05, + "loss": 2.8718, + "step": 1665 + }, + { + "epoch": 0.84, + "learning_rate": 4.661902410425155e-05, + "loss": 2.8039, + "step": 1670 + }, + { + "epoch": 0.84, + "learning_rate": 4.659919731874435e-05, + "loss": 2.8089, + "step": 1675 + }, + { + "epoch": 0.84, + "learning_rate": 4.6579316810442174e-05, + "loss": 2.8144, + "step": 1680 + }, + { + "epoch": 0.85, + "learning_rate": 4.6559382628792995e-05, + "loss": 2.5963, + "step": 1685 + }, + { + "epoch": 0.85, + "learning_rate": 4.653939482337828e-05, + "loss": 2.5755, + "step": 1690 + }, + { + "epoch": 0.85, + "learning_rate": 4.651935344391286e-05, + "loss": 2.7631, + "step": 1695 + }, + { + "epoch": 0.85, + "learning_rate": 4.649925854024486e-05, + "loss": 2.8117, + "step": 1700 + }, + { + "epoch": 0.86, + "learning_rate": 4.647911016235549e-05, + "loss": 2.7352, + "step": 1705 + }, + { + "epoch": 0.86, + "learning_rate": 4.6458908360358985e-05, + "loss": 2.5572, + "step": 1710 + }, + { + "epoch": 0.86, + "learning_rate": 4.643865318450246e-05, + "loss": 2.8372, + "step": 1715 + }, + { + "epoch": 0.86, + "learning_rate": 4.6418344685165774e-05, + "loss": 2.7892, + "step": 1720 + }, + { + "epoch": 0.87, + "learning_rate": 4.639798291286143e-05, + "loss": 2.7832, + "step": 1725 + }, + { + "epoch": 0.87, + "learning_rate": 4.637756791823442e-05, + "loss": 2.5401, + "step": 1730 + }, + { + "epoch": 0.87, + "learning_rate": 4.635709975206213e-05, + "loss": 2.9286, + "step": 1735 + }, + { + "epoch": 0.87, + "learning_rate": 4.633657846525417e-05, + "loss": 2.6474, + "step": 1740 + }, + { + "epoch": 0.88, + "learning_rate": 4.6316004108852305e-05, + "loss": 2.5047, + "step": 1745 + }, + { + "epoch": 0.88, + "learning_rate": 4.629537673403029e-05, + "loss": 2.8374, + "step": 1750 + }, + { + "epoch": 0.88, + "learning_rate": 4.627469639209373e-05, + 
"loss": 2.634, + "step": 1755 + }, + { + "epoch": 0.88, + "learning_rate": 4.6253963134480006e-05, + "loss": 2.6884, + "step": 1760 + }, + { + "epoch": 0.89, + "learning_rate": 4.623317701275809e-05, + "loss": 2.6334, + "step": 1765 + }, + { + "epoch": 0.89, + "learning_rate": 4.621233807862844e-05, + "loss": 2.764, + "step": 1770 + }, + { + "epoch": 0.89, + "learning_rate": 4.6191446383922886e-05, + "loss": 3.0864, + "step": 1775 + }, + { + "epoch": 0.89, + "learning_rate": 4.617050198060448e-05, + "loss": 2.7964, + "step": 1780 + }, + { + "epoch": 0.9, + "learning_rate": 4.6149504920767376e-05, + "loss": 2.4538, + "step": 1785 + }, + { + "epoch": 0.9, + "learning_rate": 4.6128455256636706e-05, + "loss": 2.7352, + "step": 1790 + }, + { + "epoch": 0.9, + "learning_rate": 4.6107353040568416e-05, + "loss": 2.6893, + "step": 1795 + }, + { + "epoch": 0.9, + "learning_rate": 4.6086198325049185e-05, + "loss": 2.8609, + "step": 1800 + }, + { + "epoch": 0.91, + "learning_rate": 4.6064991162696275e-05, + "loss": 2.5285, + "step": 1805 + }, + { + "epoch": 0.91, + "learning_rate": 4.604373160625739e-05, + "loss": 2.5707, + "step": 1810 + }, + { + "epoch": 0.91, + "learning_rate": 4.602241970861053e-05, + "loss": 2.7198, + "step": 1815 + }, + { + "epoch": 0.91, + "learning_rate": 4.6001055522763926e-05, + "loss": 2.855, + "step": 1820 + }, + { + "epoch": 0.92, + "learning_rate": 4.597963910185582e-05, + "loss": 2.4175, + "step": 1825 + }, + { + "epoch": 0.92, + "learning_rate": 4.595817049915441e-05, + "loss": 2.5444, + "step": 1830 + }, + { + "epoch": 0.92, + "learning_rate": 4.5936649768057646e-05, + "loss": 2.6893, + "step": 1835 + }, + { + "epoch": 0.92, + "learning_rate": 4.591507696209318e-05, + "loss": 2.6725, + "step": 1840 + }, + { + "epoch": 0.93, + "learning_rate": 4.589345213491817e-05, + "loss": 2.834, + "step": 1845 + }, + { + "epoch": 0.93, + "learning_rate": 4.587177534031914e-05, + "loss": 2.8259, + "step": 1850 + }, + { + "epoch": 0.93, + "learning_rate": 
4.585004663221188e-05, + "loss": 2.6146, + "step": 1855 + }, + { + "epoch": 0.93, + "learning_rate": 4.582826606464134e-05, + "loss": 2.4673, + "step": 1860 + }, + { + "epoch": 0.94, + "learning_rate": 4.5806433691781416e-05, + "loss": 2.9178, + "step": 1865 + }, + { + "epoch": 0.94, + "learning_rate": 4.578454956793487e-05, + "loss": 2.9224, + "step": 1870 + }, + { + "epoch": 0.94, + "learning_rate": 4.576261374753318e-05, + "loss": 2.7987, + "step": 1875 + }, + { + "epoch": 0.94, + "learning_rate": 4.574062628513642e-05, + "loss": 2.3848, + "step": 1880 + }, + { + "epoch": 0.95, + "learning_rate": 4.57185872354331e-05, + "loss": 2.6054, + "step": 1885 + }, + { + "epoch": 0.95, + "learning_rate": 4.569649665324003e-05, + "loss": 2.6743, + "step": 1890 + }, + { + "epoch": 0.95, + "learning_rate": 4.567435459350222e-05, + "loss": 2.9375, + "step": 1895 + }, + { + "epoch": 0.95, + "learning_rate": 4.565216111129269e-05, + "loss": 2.9372, + "step": 1900 + }, + { + "epoch": 0.96, + "learning_rate": 4.562991626181239e-05, + "loss": 2.669, + "step": 1905 + }, + { + "epoch": 0.96, + "learning_rate": 4.560762010039001e-05, + "loss": 2.7644, + "step": 1910 + }, + { + "epoch": 0.96, + "learning_rate": 4.558527268248187e-05, + "loss": 2.6886, + "step": 1915 + }, + { + "epoch": 0.96, + "learning_rate": 4.55628740636718e-05, + "loss": 2.8618, + "step": 1920 + }, + { + "epoch": 0.97, + "learning_rate": 4.554042429967095e-05, + "loss": 2.8411, + "step": 1925 + }, + { + "epoch": 0.97, + "learning_rate": 4.55179234463177e-05, + "loss": 2.6047, + "step": 1930 + }, + { + "epoch": 0.97, + "learning_rate": 4.5495371559577496e-05, + "loss": 2.8675, + "step": 1935 + }, + { + "epoch": 0.97, + "learning_rate": 4.547276869554271e-05, + "loss": 2.751, + "step": 1940 + }, + { + "epoch": 0.98, + "learning_rate": 4.545011491043253e-05, + "loss": 2.8638, + "step": 1945 + }, + { + "epoch": 0.98, + "learning_rate": 4.5427410260592775e-05, + "loss": 2.5092, + "step": 1950 + }, + { + "epoch": 0.98, 
+ "learning_rate": 4.540465480249579e-05, + "loss": 2.5059, + "step": 1955 + }, + { + "epoch": 0.98, + "learning_rate": 4.5381848592740285e-05, + "loss": 2.9433, + "step": 1960 + }, + { + "epoch": 0.99, + "learning_rate": 4.535899168805121e-05, + "loss": 2.6959, + "step": 1965 + }, + { + "epoch": 0.99, + "learning_rate": 4.533608414527961e-05, + "loss": 2.552, + "step": 1970 + }, + { + "epoch": 0.99, + "learning_rate": 4.5313126021402465e-05, + "loss": 2.7169, + "step": 1975 + }, + { + "epoch": 0.99, + "learning_rate": 4.529011737352258e-05, + "loss": 2.5672, + "step": 1980 + }, + { + "epoch": 1.0, + "learning_rate": 4.526705825886841e-05, + "loss": 2.6434, + "step": 1985 + }, + { + "epoch": 1.0, + "learning_rate": 4.5243948734793947e-05, + "loss": 2.8062, + "step": 1990 + }, + { + "epoch": 1.0, + "learning_rate": 4.5220788858778556e-05, + "loss": 2.6929, + "step": 1995 + }, + { + "epoch": 1.0, + "learning_rate": 4.519757868842684e-05, + "loss": 2.5837, + "step": 2000 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 1.0586878161626726e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475 diff --git a/checkpoint-2500/README.md b/checkpoint-2500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-2500/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model 
Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. 
(2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-2500/adapter_config.json b/checkpoint-2500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-2500/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2500/adapter_model.safetensors b/checkpoint-2500/adapter_model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..498c98eab838095c35eafffba8aaf1c16cfff8d5 --- /dev/null +++ b/checkpoint-2500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d40bc7150f5ecf32de16baf18217bae81b4927e8bfdd73a03a697fb34ae256da +size 7807744 diff --git a/checkpoint-2500/optimizer.pt b/checkpoint-2500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..71c9c0aef394a4cf559cf322dd1b1a1e0eaf6938 --- /dev/null +++ b/checkpoint-2500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ea1db76a2f24e5349ca56065395a53258d6acb5f690b358d35067da24c96f32 +size 15644485 diff --git a/checkpoint-2500/rng_state.pth b/checkpoint-2500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a972a355e9285bf12eacd7b2350dc86cd022f14c --- /dev/null +++ b/checkpoint-2500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0423f88bd1e68d212dffb01bfbbae16c786b716d88f4e45eb7c085b41fde4a35 +size 14575 diff --git a/checkpoint-2500/scheduler.pt b/checkpoint-2500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6102e1fe7ea3e095e91ac6f832852781bef12d0b --- /dev/null +++ b/checkpoint-2500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24559f8836de5461d15dc4bbc244e10bf347c0266d3ef00c7465dff3b9068e19 +size 627 diff --git a/checkpoint-2500/special_tokens_map.json b/checkpoint-2500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-2500/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false 
+ } + ] +} diff --git a/checkpoint-2500/tokenization_chatglm.py b/checkpoint-2500/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-2500/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) 
+ return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + 
def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. 
+ + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). 
+ return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-2500/tokenizer.model b/checkpoint-2500/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-2500/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-2500/tokenizer_config.json b/checkpoint-2500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ 
b/checkpoint-2500/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-2500/trainer_state.json b/checkpoint-2500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..50d9d4d9b05554eba6aeffcccf4be167cf541185 --- /dev/null +++ b/checkpoint-2500/trainer_state.json @@ -0,0 +1,3021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.2545477355413372, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + 
"step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 
130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + 
"step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + 
"step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + 
"step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 
2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + "step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 
2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + "step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + 
"loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + "step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + 
"loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, + "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 
4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 4.8759297640025235e-05, + "loss": 2.9469, + "step": 1005 + }, + { + "epoch": 0.51, + "learning_rate": 4.874700152955661e-05, + "loss": 2.9745, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 4.8734646354147936e-05, + "loss": 2.9341, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 4.8722232144529754e-05, + "loss": 3.0511, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 4.870975893157941e-05, + "loss": 2.5396, + "step": 1025 + }, + { + "epoch": 0.52, + 
"learning_rate": 4.8697226746321004e-05, + "loss": 2.6699, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 4.868463561992532e-05, + "loss": 2.4887, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 4.867198558370977e-05, + "loss": 2.4773, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 4.865927666913825e-05, + "loss": 2.7612, + "step": 1045 + }, + { + "epoch": 0.53, + "learning_rate": 4.864650890782113e-05, + "loss": 2.9116, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 4.863368233151514e-05, + "loss": 2.8205, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 4.862079697212329e-05, + "loss": 2.6711, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 4.8607852861694804e-05, + "loss": 3.1138, + "step": 1065 + }, + { + "epoch": 0.54, + "learning_rate": 4.859485003242503e-05, + "loss": 2.5603, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 4.858178851665539e-05, + "loss": 2.7981, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 4.856866834687323e-05, + "loss": 2.507, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 4.855548955571183e-05, + "loss": 3.0315, + "step": 1085 + }, + { + "epoch": 0.55, + "learning_rate": 4.8542252175950244e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 4.852895624051326e-05, + "loss": 2.6794, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 4.851560178247132e-05, + "loss": 2.8278, + "step": 1100 + }, + { + "epoch": 0.55, + "learning_rate": 4.850218883504041e-05, + "loss": 2.6993, + "step": 1105 + }, + { + "epoch": 0.56, + "learning_rate": 4.8488717431582005e-05, + "loss": 2.875, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 4.8475187605602974e-05, + "loss": 2.6057, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 4.84615993907555e-05, + "loss": 2.847, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 4.844795282083697e-05, + "loss": 2.7652, + "step": 1125 + }, + { 
+ "epoch": 0.57, + "learning_rate": 4.843424792978997e-05, + "loss": 2.8128, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 4.842048475170209e-05, + "loss": 2.7077, + "step": 1135 + }, + { + "epoch": 0.57, + "learning_rate": 4.840666332080592e-05, + "loss": 2.7081, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 4.8392783671478934e-05, + "loss": 2.6479, + "step": 1145 + }, + { + "epoch": 0.58, + "learning_rate": 4.837884583824342e-05, + "loss": 2.667, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 4.836484985576638e-05, + "loss": 2.7514, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 4.835079575885944e-05, + "loss": 2.5058, + "step": 1160 + }, + { + "epoch": 0.58, + "learning_rate": 4.833668358247876e-05, + "loss": 2.6183, + "step": 1165 + }, + { + "epoch": 0.59, + "learning_rate": 4.8322513361725006e-05, + "loss": 2.9959, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 4.830828513184317e-05, + "loss": 2.5714, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 4.8293998928222536e-05, + "loss": 2.965, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 4.827965478639661e-05, + "loss": 2.2106, + "step": 1185 + }, + { + "epoch": 0.6, + "learning_rate": 4.8265252742042965e-05, + "loss": 2.7456, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 4.8250792830983225e-05, + "loss": 2.5694, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 4.8236275089182936e-05, + "loss": 2.6826, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 4.8221699552751465e-05, + "loss": 2.878, + "step": 1205 + }, + { + "epoch": 0.61, + "learning_rate": 4.820706625794196e-05, + "loss": 2.8191, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 4.81923752411512e-05, + "loss": 2.739, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 4.8177626538919565e-05, + "loss": 3.0544, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 4.8162820187930875e-05, + "loss": 2.8393, + 
"step": 1225 + }, + { + "epoch": 0.62, + "learning_rate": 4.814795622501237e-05, + "loss": 2.4457, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 4.813303468713456e-05, + "loss": 2.8575, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 4.8118055611411197e-05, + "loss": 2.8307, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 4.810301903509909e-05, + "loss": 2.7951, + "step": 1245 + }, + { + "epoch": 0.63, + "learning_rate": 4.8087924995598125e-05, + "loss": 2.8456, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 4.807277353045106e-05, + "loss": 2.3564, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 4.8057564677343524e-05, + "loss": 2.5076, + "step": 1260 + }, + { + "epoch": 0.63, + "learning_rate": 4.8042298474103884e-05, + "loss": 2.605, + "step": 1265 + }, + { + "epoch": 0.64, + "learning_rate": 4.8026974958703116e-05, + "loss": 2.4782, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 4.8011594169254784e-05, + "loss": 2.7193, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 4.799615614401488e-05, + "loss": 2.8284, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 4.798066092138178e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.65, + "learning_rate": 4.796510853989612e-05, + "loss": 2.7396, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 4.794949903824069e-05, + "loss": 2.7948, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 4.793383245524035e-05, + "loss": 2.7818, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 4.791810882986197e-05, + "loss": 2.7334, + "step": 1305 + }, + { + "epoch": 0.66, + "learning_rate": 4.7902328201214256e-05, + "loss": 2.4824, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 4.7886490608547727e-05, + "loss": 2.6131, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 4.7870596091254584e-05, + "loss": 2.7778, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 
4.7854644688868594e-05, + "loss": 2.5263, + "step": 1325 + }, + { + "epoch": 0.67, + "learning_rate": 4.783863644106502e-05, + "loss": 2.7825, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 4.782257138766053e-05, + "loss": 2.7902, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 4.7806449568613066e-05, + "loss": 2.8333, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 4.779027102402177e-05, + "loss": 2.856, + "step": 1345 + }, + { + "epoch": 0.68, + "learning_rate": 4.777403579412686e-05, + "loss": 2.8021, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 4.775774391930956e-05, + "loss": 2.6639, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741395440091976e-05, + "loss": 2.5226, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.772499039713702e-05, + "loss": 2.6803, + "step": 1365 + }, + { + "epoch": 0.69, + "learning_rate": 4.7708528831248274e-05, + "loss": 2.4608, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.769201078336991e-05, + "loss": 2.786, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.7675436294586586e-05, + "loss": 2.8294, + "step": 1380 + }, + { + "epoch": 0.7, + "learning_rate": 4.7658805406123356e-05, + "loss": 2.6776, + "step": 1385 + }, + { + "epoch": 0.7, + "learning_rate": 4.7642118159345544e-05, + "loss": 2.8003, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.762537459575865e-05, + "loss": 2.7796, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.7608574757008245e-05, + "loss": 2.9156, + "step": 1400 + }, + { + "epoch": 0.71, + "learning_rate": 4.7591718684879883e-05, + "loss": 3.0521, + "step": 1405 + }, + { + "epoch": 0.71, + "learning_rate": 4.7574806421298976e-05, + "loss": 2.6469, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.755783800833071e-05, + "loss": 2.8512, + "step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.754081348817991e-05, + "loss": 2.66, + "step": 1420 + }, + { + "epoch": 
0.72, + "learning_rate": 4.752373290319096e-05, + "loss": 2.6625, + "step": 1425 + }, + { + "epoch": 0.72, + "learning_rate": 4.7506596295847716e-05, + "loss": 2.6711, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 4.7489403708773346e-05, + "loss": 2.8951, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 4.747215518473026e-05, + "loss": 2.7375, + "step": 1440 + }, + { + "epoch": 0.73, + "learning_rate": 4.745485076662e-05, + "loss": 2.8037, + "step": 1445 + }, + { + "epoch": 0.73, + "learning_rate": 4.743749049748315e-05, + "loss": 2.5375, + "step": 1450 + }, + { + "epoch": 0.73, + "learning_rate": 4.742007442049918e-05, + "loss": 2.7664, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 4.7402602578986374e-05, + "loss": 2.9644, + "step": 1460 + }, + { + "epoch": 0.74, + "learning_rate": 4.738507501640175e-05, + "loss": 2.8212, + "step": 1465 + }, + { + "epoch": 0.74, + "learning_rate": 4.736749177634087e-05, + "loss": 2.7201, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 4.734985290253782e-05, + "loss": 2.7087, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 4.7332158438865035e-05, + "loss": 2.5502, + "step": 1480 + }, + { + "epoch": 0.75, + "learning_rate": 4.731440842933322e-05, + "loss": 2.7607, + "step": 1485 + }, + { + "epoch": 0.75, + "learning_rate": 4.729660291809126e-05, + "loss": 2.5601, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 4.727874194942606e-05, + "loss": 2.5562, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 4.7260825567762486e-05, + "loss": 2.5539, + "step": 1500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7242853817663204e-05, + "loss": 2.6405, + "step": 1505 + }, + { + "epoch": 0.76, + "learning_rate": 4.72248267438286e-05, + "loss": 2.9237, + "step": 1510 + }, + { + "epoch": 0.76, + "learning_rate": 4.72067443910967e-05, + "loss": 3.0453, + "step": 1515 + }, + { + "epoch": 0.76, + "learning_rate": 4.718860680444297e-05, + "loss": 2.7176, + "step": 
1520 + }, + { + "epoch": 0.77, + "learning_rate": 4.71704140289803e-05, + "loss": 2.6713, + "step": 1525 + }, + { + "epoch": 0.77, + "learning_rate": 4.715216610995883e-05, + "loss": 2.7284, + "step": 1530 + }, + { + "epoch": 0.77, + "learning_rate": 4.713386309276585e-05, + "loss": 2.5022, + "step": 1535 + }, + { + "epoch": 0.77, + "learning_rate": 4.7115505022925706e-05, + "loss": 2.8323, + "step": 1540 + }, + { + "epoch": 0.78, + "learning_rate": 4.7097091946099666e-05, + "loss": 2.7005, + "step": 1545 + }, + { + "epoch": 0.78, + "learning_rate": 4.7078623908085825e-05, + "loss": 2.8142, + "step": 1550 + }, + { + "epoch": 0.78, + "learning_rate": 4.7060100954818974e-05, + "loss": 2.8505, + "step": 1555 + }, + { + "epoch": 0.78, + "learning_rate": 4.70415231323705e-05, + "loss": 2.5892, + "step": 1560 + }, + { + "epoch": 0.79, + "learning_rate": 4.7022890486948236e-05, + "loss": 2.8951, + "step": 1565 + }, + { + "epoch": 0.79, + "learning_rate": 4.700420306489641e-05, + "loss": 2.7551, + "step": 1570 + }, + { + "epoch": 0.79, + "learning_rate": 4.698546091269547e-05, + "loss": 2.6814, + "step": 1575 + }, + { + "epoch": 0.79, + "learning_rate": 4.696666407696201e-05, + "loss": 2.8698, + "step": 1580 + }, + { + "epoch": 0.8, + "learning_rate": 4.694781260444862e-05, + "loss": 2.8389, + "step": 1585 + }, + { + "epoch": 0.8, + "learning_rate": 4.6928906542043786e-05, + "loss": 2.8353, + "step": 1590 + }, + { + "epoch": 0.8, + "learning_rate": 4.690994593677179e-05, + "loss": 2.6565, + "step": 1595 + }, + { + "epoch": 0.8, + "learning_rate": 4.689093083579256e-05, + "loss": 2.6958, + "step": 1600 + }, + { + "epoch": 0.81, + "learning_rate": 4.687186128640157e-05, + "loss": 2.6768, + "step": 1605 + }, + { + "epoch": 0.81, + "learning_rate": 4.685273733602975e-05, + "loss": 2.734, + "step": 1610 + }, + { + "epoch": 0.81, + "learning_rate": 4.6833559032243284e-05, + "loss": 2.5348, + "step": 1615 + }, + { + "epoch": 0.81, + "learning_rate": 4.6814326422743594e-05, + 
"loss": 2.7775, + "step": 1620 + }, + { + "epoch": 0.82, + "learning_rate": 4.679503955536715e-05, + "loss": 2.6578, + "step": 1625 + }, + { + "epoch": 0.82, + "learning_rate": 4.6775698478085393e-05, + "loss": 2.8792, + "step": 1630 + }, + { + "epoch": 0.82, + "learning_rate": 4.675630323900458e-05, + "loss": 2.7883, + "step": 1635 + }, + { + "epoch": 0.82, + "learning_rate": 4.67368538863657e-05, + "loss": 2.8824, + "step": 1640 + }, + { + "epoch": 0.83, + "learning_rate": 4.671735046854433e-05, + "loss": 2.6775, + "step": 1645 + }, + { + "epoch": 0.83, + "learning_rate": 4.669779303405051e-05, + "loss": 2.5658, + "step": 1650 + }, + { + "epoch": 0.83, + "learning_rate": 4.667818163152864e-05, + "loss": 2.5262, + "step": 1655 + }, + { + "epoch": 0.83, + "learning_rate": 4.665851630975736e-05, + "loss": 2.5749, + "step": 1660 + }, + { + "epoch": 0.84, + "learning_rate": 4.6638797117649424e-05, + "loss": 2.8718, + "step": 1665 + }, + { + "epoch": 0.84, + "learning_rate": 4.661902410425155e-05, + "loss": 2.8039, + "step": 1670 + }, + { + "epoch": 0.84, + "learning_rate": 4.659919731874435e-05, + "loss": 2.8089, + "step": 1675 + }, + { + "epoch": 0.84, + "learning_rate": 4.6579316810442174e-05, + "loss": 2.8144, + "step": 1680 + }, + { + "epoch": 0.85, + "learning_rate": 4.6559382628792995e-05, + "loss": 2.5963, + "step": 1685 + }, + { + "epoch": 0.85, + "learning_rate": 4.653939482337828e-05, + "loss": 2.5755, + "step": 1690 + }, + { + "epoch": 0.85, + "learning_rate": 4.651935344391286e-05, + "loss": 2.7631, + "step": 1695 + }, + { + "epoch": 0.85, + "learning_rate": 4.649925854024486e-05, + "loss": 2.8117, + "step": 1700 + }, + { + "epoch": 0.86, + "learning_rate": 4.647911016235549e-05, + "loss": 2.7352, + "step": 1705 + }, + { + "epoch": 0.86, + "learning_rate": 4.6458908360358985e-05, + "loss": 2.5572, + "step": 1710 + }, + { + "epoch": 0.86, + "learning_rate": 4.643865318450246e-05, + "loss": 2.8372, + "step": 1715 + }, + { + "epoch": 0.86, + "learning_rate": 
4.6418344685165774e-05, + "loss": 2.7892, + "step": 1720 + }, + { + "epoch": 0.87, + "learning_rate": 4.639798291286143e-05, + "loss": 2.7832, + "step": 1725 + }, + { + "epoch": 0.87, + "learning_rate": 4.637756791823442e-05, + "loss": 2.5401, + "step": 1730 + }, + { + "epoch": 0.87, + "learning_rate": 4.635709975206213e-05, + "loss": 2.9286, + "step": 1735 + }, + { + "epoch": 0.87, + "learning_rate": 4.633657846525417e-05, + "loss": 2.6474, + "step": 1740 + }, + { + "epoch": 0.88, + "learning_rate": 4.6316004108852305e-05, + "loss": 2.5047, + "step": 1745 + }, + { + "epoch": 0.88, + "learning_rate": 4.629537673403029e-05, + "loss": 2.8374, + "step": 1750 + }, + { + "epoch": 0.88, + "learning_rate": 4.627469639209373e-05, + "loss": 2.634, + "step": 1755 + }, + { + "epoch": 0.88, + "learning_rate": 4.6253963134480006e-05, + "loss": 2.6884, + "step": 1760 + }, + { + "epoch": 0.89, + "learning_rate": 4.623317701275809e-05, + "loss": 2.6334, + "step": 1765 + }, + { + "epoch": 0.89, + "learning_rate": 4.621233807862844e-05, + "loss": 2.764, + "step": 1770 + }, + { + "epoch": 0.89, + "learning_rate": 4.6191446383922886e-05, + "loss": 3.0864, + "step": 1775 + }, + { + "epoch": 0.89, + "learning_rate": 4.617050198060448e-05, + "loss": 2.7964, + "step": 1780 + }, + { + "epoch": 0.9, + "learning_rate": 4.6149504920767376e-05, + "loss": 2.4538, + "step": 1785 + }, + { + "epoch": 0.9, + "learning_rate": 4.6128455256636706e-05, + "loss": 2.7352, + "step": 1790 + }, + { + "epoch": 0.9, + "learning_rate": 4.6107353040568416e-05, + "loss": 2.6893, + "step": 1795 + }, + { + "epoch": 0.9, + "learning_rate": 4.6086198325049185e-05, + "loss": 2.8609, + "step": 1800 + }, + { + "epoch": 0.91, + "learning_rate": 4.6064991162696275e-05, + "loss": 2.5285, + "step": 1805 + }, + { + "epoch": 0.91, + "learning_rate": 4.604373160625739e-05, + "loss": 2.5707, + "step": 1810 + }, + { + "epoch": 0.91, + "learning_rate": 4.602241970861053e-05, + "loss": 2.7198, + "step": 1815 + }, + { + "epoch": 
0.91, + "learning_rate": 4.6001055522763926e-05, + "loss": 2.855, + "step": 1820 + }, + { + "epoch": 0.92, + "learning_rate": 4.597963910185582e-05, + "loss": 2.4175, + "step": 1825 + }, + { + "epoch": 0.92, + "learning_rate": 4.595817049915441e-05, + "loss": 2.5444, + "step": 1830 + }, + { + "epoch": 0.92, + "learning_rate": 4.5936649768057646e-05, + "loss": 2.6893, + "step": 1835 + }, + { + "epoch": 0.92, + "learning_rate": 4.591507696209318e-05, + "loss": 2.6725, + "step": 1840 + }, + { + "epoch": 0.93, + "learning_rate": 4.589345213491817e-05, + "loss": 2.834, + "step": 1845 + }, + { + "epoch": 0.93, + "learning_rate": 4.587177534031914e-05, + "loss": 2.8259, + "step": 1850 + }, + { + "epoch": 0.93, + "learning_rate": 4.585004663221188e-05, + "loss": 2.6146, + "step": 1855 + }, + { + "epoch": 0.93, + "learning_rate": 4.582826606464134e-05, + "loss": 2.4673, + "step": 1860 + }, + { + "epoch": 0.94, + "learning_rate": 4.5806433691781416e-05, + "loss": 2.9178, + "step": 1865 + }, + { + "epoch": 0.94, + "learning_rate": 4.578454956793487e-05, + "loss": 2.9224, + "step": 1870 + }, + { + "epoch": 0.94, + "learning_rate": 4.576261374753318e-05, + "loss": 2.7987, + "step": 1875 + }, + { + "epoch": 0.94, + "learning_rate": 4.574062628513642e-05, + "loss": 2.3848, + "step": 1880 + }, + { + "epoch": 0.95, + "learning_rate": 4.57185872354331e-05, + "loss": 2.6054, + "step": 1885 + }, + { + "epoch": 0.95, + "learning_rate": 4.569649665324003e-05, + "loss": 2.6743, + "step": 1890 + }, + { + "epoch": 0.95, + "learning_rate": 4.567435459350222e-05, + "loss": 2.9375, + "step": 1895 + }, + { + "epoch": 0.95, + "learning_rate": 4.565216111129269e-05, + "loss": 2.9372, + "step": 1900 + }, + { + "epoch": 0.96, + "learning_rate": 4.562991626181239e-05, + "loss": 2.669, + "step": 1905 + }, + { + "epoch": 0.96, + "learning_rate": 4.560762010039001e-05, + "loss": 2.7644, + "step": 1910 + }, + { + "epoch": 0.96, + "learning_rate": 4.558527268248187e-05, + "loss": 2.6886, + "step": 1915 
+ }, + { + "epoch": 0.96, + "learning_rate": 4.55628740636718e-05, + "loss": 2.8618, + "step": 1920 + }, + { + "epoch": 0.97, + "learning_rate": 4.554042429967095e-05, + "loss": 2.8411, + "step": 1925 + }, + { + "epoch": 0.97, + "learning_rate": 4.55179234463177e-05, + "loss": 2.6047, + "step": 1930 + }, + { + "epoch": 0.97, + "learning_rate": 4.5495371559577496e-05, + "loss": 2.8675, + "step": 1935 + }, + { + "epoch": 0.97, + "learning_rate": 4.547276869554271e-05, + "loss": 2.751, + "step": 1940 + }, + { + "epoch": 0.98, + "learning_rate": 4.545011491043253e-05, + "loss": 2.8638, + "step": 1945 + }, + { + "epoch": 0.98, + "learning_rate": 4.5427410260592775e-05, + "loss": 2.5092, + "step": 1950 + }, + { + "epoch": 0.98, + "learning_rate": 4.540465480249579e-05, + "loss": 2.5059, + "step": 1955 + }, + { + "epoch": 0.98, + "learning_rate": 4.5381848592740285e-05, + "loss": 2.9433, + "step": 1960 + }, + { + "epoch": 0.99, + "learning_rate": 4.535899168805121e-05, + "loss": 2.6959, + "step": 1965 + }, + { + "epoch": 0.99, + "learning_rate": 4.533608414527961e-05, + "loss": 2.552, + "step": 1970 + }, + { + "epoch": 0.99, + "learning_rate": 4.5313126021402465e-05, + "loss": 2.7169, + "step": 1975 + }, + { + "epoch": 0.99, + "learning_rate": 4.529011737352258e-05, + "loss": 2.5672, + "step": 1980 + }, + { + "epoch": 1.0, + "learning_rate": 4.526705825886841e-05, + "loss": 2.6434, + "step": 1985 + }, + { + "epoch": 1.0, + "learning_rate": 4.5243948734793947e-05, + "loss": 2.8062, + "step": 1990 + }, + { + "epoch": 1.0, + "learning_rate": 4.5220788858778556e-05, + "loss": 2.6929, + "step": 1995 + }, + { + "epoch": 1.0, + "learning_rate": 4.519757868842684e-05, + "loss": 2.5837, + "step": 2000 + }, + { + "epoch": 1.01, + "learning_rate": 4.517431828146852e-05, + "loss": 2.821, + "step": 2005 + }, + { + "epoch": 1.01, + "learning_rate": 4.515100769575824e-05, + "loss": 2.7913, + "step": 2010 + }, + { + "epoch": 1.01, + "learning_rate": 4.512764698927545e-05, + "loss": 
2.699, + "step": 2015 + }, + { + "epoch": 1.01, + "learning_rate": 4.5104236220124286e-05, + "loss": 2.6574, + "step": 2020 + }, + { + "epoch": 1.02, + "learning_rate": 4.508077544653338e-05, + "loss": 2.5909, + "step": 2025 + }, + { + "epoch": 1.02, + "learning_rate": 4.5057264726855765e-05, + "loss": 2.5695, + "step": 2030 + }, + { + "epoch": 1.02, + "learning_rate": 4.5033704119568675e-05, + "loss": 2.4937, + "step": 2035 + }, + { + "epoch": 1.02, + "learning_rate": 4.501009368327344e-05, + "loss": 2.7253, + "step": 2040 + }, + { + "epoch": 1.03, + "learning_rate": 4.4986433476695334e-05, + "loss": 2.8681, + "step": 2045 + }, + { + "epoch": 1.03, + "learning_rate": 4.496272355868341e-05, + "loss": 2.74, + "step": 2050 + }, + { + "epoch": 1.03, + "learning_rate": 4.4938963988210365e-05, + "loss": 2.7439, + "step": 2055 + }, + { + "epoch": 1.03, + "learning_rate": 4.491515482437242e-05, + "loss": 2.9539, + "step": 2060 + }, + { + "epoch": 1.04, + "learning_rate": 4.4891296126389104e-05, + "loss": 2.8165, + "step": 2065 + }, + { + "epoch": 1.04, + "learning_rate": 4.48673879536032e-05, + "loss": 2.601, + "step": 2070 + }, + { + "epoch": 1.04, + "learning_rate": 4.484343036548051e-05, + "loss": 2.5189, + "step": 2075 + }, + { + "epoch": 1.04, + "learning_rate": 4.481942342160976e-05, + "loss": 2.6276, + "step": 2080 + }, + { + "epoch": 1.05, + "learning_rate": 4.479536718170243e-05, + "loss": 2.5027, + "step": 2085 + }, + { + "epoch": 1.05, + "learning_rate": 4.477126170559262e-05, + "loss": 2.8654, + "step": 2090 + }, + { + "epoch": 1.05, + "learning_rate": 4.474710705323688e-05, + "loss": 2.8511, + "step": 2095 + }, + { + "epoch": 1.05, + "learning_rate": 4.47229032847141e-05, + "loss": 2.6129, + "step": 2100 + }, + { + "epoch": 1.06, + "learning_rate": 4.469865046022531e-05, + "loss": 2.8964, + "step": 2105 + }, + { + "epoch": 1.06, + "learning_rate": 4.4674348640093554e-05, + "loss": 2.7502, + "step": 2110 + }, + { + "epoch": 1.06, + "learning_rate": 
4.4649997884763765e-05, + "loss": 2.591, + "step": 2115 + }, + { + "epoch": 1.06, + "learning_rate": 4.462559825480257e-05, + "loss": 2.7982, + "step": 2120 + }, + { + "epoch": 1.07, + "learning_rate": 4.460114981089815e-05, + "loss": 2.576, + "step": 2125 + }, + { + "epoch": 1.07, + "learning_rate": 4.457665261386014e-05, + "loss": 2.692, + "step": 2130 + }, + { + "epoch": 1.07, + "learning_rate": 4.455210672461938e-05, + "loss": 2.5818, + "step": 2135 + }, + { + "epoch": 1.07, + "learning_rate": 4.452751220422787e-05, + "loss": 2.6792, + "step": 2140 + }, + { + "epoch": 1.08, + "learning_rate": 4.450286911385856e-05, + "loss": 2.8352, + "step": 2145 + }, + { + "epoch": 1.08, + "learning_rate": 4.4478177514805166e-05, + "loss": 2.5787, + "step": 2150 + }, + { + "epoch": 1.08, + "learning_rate": 4.4453437468482103e-05, + "loss": 2.655, + "step": 2155 + }, + { + "epoch": 1.08, + "learning_rate": 4.442864903642428e-05, + "loss": 2.7664, + "step": 2160 + }, + { + "epoch": 1.09, + "learning_rate": 4.440381228028692e-05, + "loss": 2.7231, + "step": 2165 + }, + { + "epoch": 1.09, + "learning_rate": 4.437892726184548e-05, + "loss": 2.416, + "step": 2170 + }, + { + "epoch": 1.09, + "learning_rate": 4.4353994042995446e-05, + "loss": 2.4619, + "step": 2175 + }, + { + "epoch": 1.09, + "learning_rate": 4.4329012685752183e-05, + "loss": 2.7816, + "step": 2180 + }, + { + "epoch": 1.1, + "learning_rate": 4.430398325225078e-05, + "loss": 2.8678, + "step": 2185 + }, + { + "epoch": 1.1, + "learning_rate": 4.427890580474594e-05, + "loss": 2.6007, + "step": 2190 + }, + { + "epoch": 1.1, + "learning_rate": 4.4253780405611754e-05, + "loss": 2.8195, + "step": 2195 + }, + { + "epoch": 1.1, + "learning_rate": 4.42286071173416e-05, + "loss": 2.5117, + "step": 2200 + }, + { + "epoch": 1.11, + "learning_rate": 4.4203386002547956e-05, + "loss": 2.5527, + "step": 2205 + }, + { + "epoch": 1.11, + "learning_rate": 4.417811712396226e-05, + "loss": 2.662, + "step": 2210 + }, + { + "epoch": 1.11, + 
"learning_rate": 4.415280054443477e-05, + "loss": 2.7563, + "step": 2215 + }, + { + "epoch": 1.11, + "learning_rate": 4.4127436326934354e-05, + "loss": 2.5118, + "step": 2220 + }, + { + "epoch": 1.12, + "learning_rate": 4.41020245345484e-05, + "loss": 2.7208, + "step": 2225 + }, + { + "epoch": 1.12, + "learning_rate": 4.4076565230482607e-05, + "loss": 2.649, + "step": 2230 + }, + { + "epoch": 1.12, + "learning_rate": 4.4051058478060856e-05, + "loss": 2.652, + "step": 2235 + }, + { + "epoch": 1.12, + "learning_rate": 4.402550434072505e-05, + "loss": 2.6885, + "step": 2240 + }, + { + "epoch": 1.13, + "learning_rate": 4.3999902882034935e-05, + "loss": 2.8545, + "step": 2245 + }, + { + "epoch": 1.13, + "learning_rate": 4.397425416566797e-05, + "loss": 2.9435, + "step": 2250 + }, + { + "epoch": 1.13, + "learning_rate": 4.3948558255419146e-05, + "loss": 2.6555, + "step": 2255 + }, + { + "epoch": 1.13, + "learning_rate": 4.392281521520085e-05, + "loss": 2.6108, + "step": 2260 + }, + { + "epoch": 1.14, + "learning_rate": 4.389702510904269e-05, + "loss": 2.6256, + "step": 2265 + }, + { + "epoch": 1.14, + "learning_rate": 4.387118800109133e-05, + "loss": 2.7996, + "step": 2270 + }, + { + "epoch": 1.14, + "learning_rate": 4.384530395561035e-05, + "loss": 2.6649, + "step": 2275 + }, + { + "epoch": 1.14, + "learning_rate": 4.381937303698006e-05, + "loss": 3.0073, + "step": 2280 + }, + { + "epoch": 1.15, + "learning_rate": 4.379339530969738e-05, + "loss": 2.8493, + "step": 2285 + }, + { + "epoch": 1.15, + "learning_rate": 4.3767370838375635e-05, + "loss": 2.5572, + "step": 2290 + }, + { + "epoch": 1.15, + "learning_rate": 4.374129968774443e-05, + "loss": 2.3837, + "step": 2295 + }, + { + "epoch": 1.15, + "learning_rate": 4.371518192264946e-05, + "loss": 2.4604, + "step": 2300 + }, + { + "epoch": 1.16, + "learning_rate": 4.3689017608052374e-05, + "loss": 2.8866, + "step": 2305 + }, + { + "epoch": 1.16, + "learning_rate": 4.3662806809030585e-05, + "loss": 2.8943, + "step": 2310 + 
}, + { + "epoch": 1.16, + "learning_rate": 4.3636549590777144e-05, + "loss": 2.7629, + "step": 2315 + }, + { + "epoch": 1.16, + "learning_rate": 4.361024601860054e-05, + "loss": 2.8629, + "step": 2320 + }, + { + "epoch": 1.17, + "learning_rate": 4.3583896157924574e-05, + "loss": 2.7952, + "step": 2325 + }, + { + "epoch": 1.17, + "learning_rate": 4.355750007428817e-05, + "loss": 2.2057, + "step": 2330 + }, + { + "epoch": 1.17, + "learning_rate": 4.3531057833345216e-05, + "loss": 2.9143, + "step": 2335 + }, + { + "epoch": 1.17, + "learning_rate": 4.3504569500864424e-05, + "loss": 2.5354, + "step": 2340 + }, + { + "epoch": 1.18, + "learning_rate": 4.347803514272911e-05, + "loss": 2.7451, + "step": 2345 + }, + { + "epoch": 1.18, + "learning_rate": 4.3451454824937113e-05, + "loss": 3.0827, + "step": 2350 + }, + { + "epoch": 1.18, + "learning_rate": 4.3424828613600555e-05, + "loss": 2.5842, + "step": 2355 + }, + { + "epoch": 1.18, + "learning_rate": 4.339815657494572e-05, + "loss": 2.4492, + "step": 2360 + }, + { + "epoch": 1.19, + "learning_rate": 4.3371438775312865e-05, + "loss": 2.5067, + "step": 2365 + }, + { + "epoch": 1.19, + "learning_rate": 4.334467528115608e-05, + "loss": 2.5902, + "step": 2370 + }, + { + "epoch": 1.19, + "learning_rate": 4.33178661590431e-05, + "loss": 2.8618, + "step": 2375 + }, + { + "epoch": 1.19, + "learning_rate": 4.329101147565515e-05, + "loss": 2.6831, + "step": 2380 + }, + { + "epoch": 1.2, + "learning_rate": 4.3264111297786794e-05, + "loss": 2.7531, + "step": 2385 + }, + { + "epoch": 1.2, + "learning_rate": 4.323716569234572e-05, + "loss": 2.819, + "step": 2390 + }, + { + "epoch": 1.2, + "learning_rate": 4.321017472635263e-05, + "loss": 2.6319, + "step": 2395 + }, + { + "epoch": 1.2, + "learning_rate": 4.318313846694105e-05, + "loss": 2.3078, + "step": 2400 + }, + { + "epoch": 1.21, + "learning_rate": 4.315605698135714e-05, + "loss": 2.7962, + "step": 2405 + }, + { + "epoch": 1.21, + "learning_rate": 4.312893033695958e-05, + "loss": 
2.8479, + "step": 2410 + }, + { + "epoch": 1.21, + "learning_rate": 4.310175860121936e-05, + "loss": 2.6289, + "step": 2415 + }, + { + "epoch": 1.21, + "learning_rate": 4.30745418417196e-05, + "loss": 2.7003, + "step": 2420 + }, + { + "epoch": 1.22, + "learning_rate": 4.304728012615543e-05, + "loss": 2.4947, + "step": 2425 + }, + { + "epoch": 1.22, + "learning_rate": 4.3019973522333815e-05, + "loss": 2.6911, + "step": 2430 + }, + { + "epoch": 1.22, + "learning_rate": 4.2992622098173334e-05, + "loss": 2.6931, + "step": 2435 + }, + { + "epoch": 1.22, + "learning_rate": 4.296522592170406e-05, + "loss": 2.774, + "step": 2440 + }, + { + "epoch": 1.23, + "learning_rate": 4.293778506106737e-05, + "loss": 2.759, + "step": 2445 + }, + { + "epoch": 1.23, + "learning_rate": 4.29102995845158e-05, + "loss": 2.5543, + "step": 2450 + }, + { + "epoch": 1.23, + "learning_rate": 4.288276956041284e-05, + "loss": 2.7702, + "step": 2455 + }, + { + "epoch": 1.23, + "learning_rate": 4.285519505723278e-05, + "loss": 2.835, + "step": 2460 + }, + { + "epoch": 1.24, + "learning_rate": 4.282757614356055e-05, + "loss": 2.6516, + "step": 2465 + }, + { + "epoch": 1.24, + "learning_rate": 4.2799912888091544e-05, + "loss": 2.7392, + "step": 2470 + }, + { + "epoch": 1.24, + "learning_rate": 4.277220535963143e-05, + "loss": 2.6522, + "step": 2475 + }, + { + "epoch": 1.24, + "learning_rate": 4.274445362709601e-05, + "loss": 2.6514, + "step": 2480 + }, + { + "epoch": 1.25, + "learning_rate": 4.271665775951104e-05, + "loss": 2.7507, + "step": 2485 + }, + { + "epoch": 1.25, + "learning_rate": 4.2688817826012005e-05, + "loss": 3.0499, + "step": 2490 + }, + { + "epoch": 1.25, + "learning_rate": 4.2660933895844055e-05, + "loss": 2.6927, + "step": 2495 + }, + { + "epoch": 1.25, + "learning_rate": 4.2633006038361736e-05, + "loss": 2.8456, + "step": 2500 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 
1.3252546968069734e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2500/training_args.bin b/checkpoint-2500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-2500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475 diff --git a/checkpoint-3000/README.md b/checkpoint-3000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-3000/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-3000/adapter_config.json b/checkpoint-3000/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-3000/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3000/adapter_model.safetensors b/checkpoint-3000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9ae4073c06d30e374d8a96dd4de5fff74c911570 --- /dev/null +++ b/checkpoint-3000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d319648a494b00766b3b9a758e8492bc2dcb112810924c069bf26f51112c9ef6 +size 7807744 diff --git a/checkpoint-3000/optimizer.pt b/checkpoint-3000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4b695c6365c3d51997022fcf3b302ad2f8c4cf1 --- /dev/null +++ b/checkpoint-3000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb2598b8eb93be91b40b702edd4aaed3b7a7c180809a3d10d6c71fe80f922443 +size 15644485 diff --git a/checkpoint-3000/rng_state.pth b/checkpoint-3000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9dcb54cb5b1b817249393aa121bc1e33976a2277 --- /dev/null +++ b/checkpoint-3000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fde22b325f5cc8addc1a4a4081f512c41aa6548bc14595191ba5adcbcd1b3da +size 14575 diff --git 
a/checkpoint-3000/scheduler.pt b/checkpoint-3000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..486d13788b7a57bb7a0ee083ea319fe5433fddd3 --- /dev/null +++ b/checkpoint-3000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf1413422559e6bcd0383ec960fc1d82525cae6243fee50e97cb5395485731fa +size 627 diff --git a/checkpoint-3000/special_tokens_map.json b/checkpoint-3000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-3000/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ] +} diff --git a/checkpoint-3000/tokenization_chatglm.py b/checkpoint-3000/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-3000/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == 
self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) + return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab 
+ + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + 
json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. 
+ + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. 
+ if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-3000/tokenizer.model b/checkpoint-3000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-3000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-3000/tokenizer_config.json b/checkpoint-3000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ b/checkpoint-3000/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 
1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-3000/trainer_state.json b/checkpoint-3000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..92b816befe4bf54050ab3881f5f3637e78e122b1 --- /dev/null +++ b/checkpoint-3000/trainer_state.json @@ -0,0 +1,3621 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5054572826496049, + "eval_steps": 500, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 
65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 
165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + "step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 
265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + "step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 
365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + "step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 
465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + 
"step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + 
"step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + "loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + 
"step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + "loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, 
+ "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 
2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 4.8759297640025235e-05, + "loss": 2.9469, + "step": 1005 + }, + { + "epoch": 0.51, + "learning_rate": 4.874700152955661e-05, + "loss": 2.9745, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 4.8734646354147936e-05, + "loss": 2.9341, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 4.8722232144529754e-05, + "loss": 3.0511, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 4.870975893157941e-05, + "loss": 2.5396, + "step": 1025 + }, + { + "epoch": 0.52, + "learning_rate": 4.8697226746321004e-05, + "loss": 2.6699, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 4.868463561992532e-05, + "loss": 2.4887, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 4.867198558370977e-05, + "loss": 2.4773, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 4.865927666913825e-05, + "loss": 2.7612, + "step": 1045 + }, + { + "epoch": 0.53, + "learning_rate": 4.864650890782113e-05, + "loss": 2.9116, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 4.863368233151514e-05, + "loss": 2.8205, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 4.862079697212329e-05, + "loss": 2.6711, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 
4.8607852861694804e-05, + "loss": 3.1138, + "step": 1065 + }, + { + "epoch": 0.54, + "learning_rate": 4.859485003242503e-05, + "loss": 2.5603, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 4.858178851665539e-05, + "loss": 2.7981, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 4.856866834687323e-05, + "loss": 2.507, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 4.855548955571183e-05, + "loss": 3.0315, + "step": 1085 + }, + { + "epoch": 0.55, + "learning_rate": 4.8542252175950244e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 4.852895624051326e-05, + "loss": 2.6794, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 4.851560178247132e-05, + "loss": 2.8278, + "step": 1100 + }, + { + "epoch": 0.55, + "learning_rate": 4.850218883504041e-05, + "loss": 2.6993, + "step": 1105 + }, + { + "epoch": 0.56, + "learning_rate": 4.8488717431582005e-05, + "loss": 2.875, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 4.8475187605602974e-05, + "loss": 2.6057, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 4.84615993907555e-05, + "loss": 2.847, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 4.844795282083697e-05, + "loss": 2.7652, + "step": 1125 + }, + { + "epoch": 0.57, + "learning_rate": 4.843424792978997e-05, + "loss": 2.8128, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 4.842048475170209e-05, + "loss": 2.7077, + "step": 1135 + }, + { + "epoch": 0.57, + "learning_rate": 4.840666332080592e-05, + "loss": 2.7081, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 4.8392783671478934e-05, + "loss": 2.6479, + "step": 1145 + }, + { + "epoch": 0.58, + "learning_rate": 4.837884583824342e-05, + "loss": 2.667, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 4.836484985576638e-05, + "loss": 2.7514, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 4.835079575885944e-05, + "loss": 2.5058, + "step": 1160 + }, + { + "epoch": 0.58, 
+ "learning_rate": 4.833668358247876e-05, + "loss": 2.6183, + "step": 1165 + }, + { + "epoch": 0.59, + "learning_rate": 4.8322513361725006e-05, + "loss": 2.9959, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 4.830828513184317e-05, + "loss": 2.5714, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 4.8293998928222536e-05, + "loss": 2.965, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 4.827965478639661e-05, + "loss": 2.2106, + "step": 1185 + }, + { + "epoch": 0.6, + "learning_rate": 4.8265252742042965e-05, + "loss": 2.7456, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 4.8250792830983225e-05, + "loss": 2.5694, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 4.8236275089182936e-05, + "loss": 2.6826, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 4.8221699552751465e-05, + "loss": 2.878, + "step": 1205 + }, + { + "epoch": 0.61, + "learning_rate": 4.820706625794196e-05, + "loss": 2.8191, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 4.81923752411512e-05, + "loss": 2.739, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 4.8177626538919565e-05, + "loss": 3.0544, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 4.8162820187930875e-05, + "loss": 2.8393, + "step": 1225 + }, + { + "epoch": 0.62, + "learning_rate": 4.814795622501237e-05, + "loss": 2.4457, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 4.813303468713456e-05, + "loss": 2.8575, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 4.8118055611411197e-05, + "loss": 2.8307, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 4.810301903509909e-05, + "loss": 2.7951, + "step": 1245 + }, + { + "epoch": 0.63, + "learning_rate": 4.8087924995598125e-05, + "loss": 2.8456, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 4.807277353045106e-05, + "loss": 2.3564, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 4.8057564677343524e-05, + "loss": 2.5076, + "step": 1260 + 
}, + { + "epoch": 0.63, + "learning_rate": 4.8042298474103884e-05, + "loss": 2.605, + "step": 1265 + }, + { + "epoch": 0.64, + "learning_rate": 4.8026974958703116e-05, + "loss": 2.4782, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 4.8011594169254784e-05, + "loss": 2.7193, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 4.799615614401488e-05, + "loss": 2.8284, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 4.798066092138178e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.65, + "learning_rate": 4.796510853989612e-05, + "loss": 2.7396, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 4.794949903824069e-05, + "loss": 2.7948, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 4.793383245524035e-05, + "loss": 2.7818, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 4.791810882986197e-05, + "loss": 2.7334, + "step": 1305 + }, + { + "epoch": 0.66, + "learning_rate": 4.7902328201214256e-05, + "loss": 2.4824, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 4.7886490608547727e-05, + "loss": 2.6131, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 4.7870596091254584e-05, + "loss": 2.7778, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 4.7854644688868594e-05, + "loss": 2.5263, + "step": 1325 + }, + { + "epoch": 0.67, + "learning_rate": 4.783863644106502e-05, + "loss": 2.7825, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 4.782257138766053e-05, + "loss": 2.7902, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 4.7806449568613066e-05, + "loss": 2.8333, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 4.779027102402177e-05, + "loss": 2.856, + "step": 1345 + }, + { + "epoch": 0.68, + "learning_rate": 4.777403579412686e-05, + "loss": 2.8021, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 4.775774391930956e-05, + "loss": 2.6639, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741395440091976e-05, + 
"loss": 2.5226, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.772499039713702e-05, + "loss": 2.6803, + "step": 1365 + }, + { + "epoch": 0.69, + "learning_rate": 4.7708528831248274e-05, + "loss": 2.4608, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.769201078336991e-05, + "loss": 2.786, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.7675436294586586e-05, + "loss": 2.8294, + "step": 1380 + }, + { + "epoch": 0.7, + "learning_rate": 4.7658805406123356e-05, + "loss": 2.6776, + "step": 1385 + }, + { + "epoch": 0.7, + "learning_rate": 4.7642118159345544e-05, + "loss": 2.8003, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.762537459575865e-05, + "loss": 2.7796, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.7608574757008245e-05, + "loss": 2.9156, + "step": 1400 + }, + { + "epoch": 0.71, + "learning_rate": 4.7591718684879883e-05, + "loss": 3.0521, + "step": 1405 + }, + { + "epoch": 0.71, + "learning_rate": 4.7574806421298976e-05, + "loss": 2.6469, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.755783800833071e-05, + "loss": 2.8512, + "step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.754081348817991e-05, + "loss": 2.66, + "step": 1420 + }, + { + "epoch": 0.72, + "learning_rate": 4.752373290319096e-05, + "loss": 2.6625, + "step": 1425 + }, + { + "epoch": 0.72, + "learning_rate": 4.7506596295847716e-05, + "loss": 2.6711, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 4.7489403708773346e-05, + "loss": 2.8951, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 4.747215518473026e-05, + "loss": 2.7375, + "step": 1440 + }, + { + "epoch": 0.73, + "learning_rate": 4.745485076662e-05, + "loss": 2.8037, + "step": 1445 + }, + { + "epoch": 0.73, + "learning_rate": 4.743749049748315e-05, + "loss": 2.5375, + "step": 1450 + }, + { + "epoch": 0.73, + "learning_rate": 4.742007442049918e-05, + "loss": 2.7664, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 
4.7402602578986374e-05, + "loss": 2.9644, + "step": 1460 + }, + { + "epoch": 0.74, + "learning_rate": 4.738507501640175e-05, + "loss": 2.8212, + "step": 1465 + }, + { + "epoch": 0.74, + "learning_rate": 4.736749177634087e-05, + "loss": 2.7201, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 4.734985290253782e-05, + "loss": 2.7087, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 4.7332158438865035e-05, + "loss": 2.5502, + "step": 1480 + }, + { + "epoch": 0.75, + "learning_rate": 4.731440842933322e-05, + "loss": 2.7607, + "step": 1485 + }, + { + "epoch": 0.75, + "learning_rate": 4.729660291809126e-05, + "loss": 2.5601, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 4.727874194942606e-05, + "loss": 2.5562, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 4.7260825567762486e-05, + "loss": 2.5539, + "step": 1500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7242853817663204e-05, + "loss": 2.6405, + "step": 1505 + }, + { + "epoch": 0.76, + "learning_rate": 4.72248267438286e-05, + "loss": 2.9237, + "step": 1510 + }, + { + "epoch": 0.76, + "learning_rate": 4.72067443910967e-05, + "loss": 3.0453, + "step": 1515 + }, + { + "epoch": 0.76, + "learning_rate": 4.718860680444297e-05, + "loss": 2.7176, + "step": 1520 + }, + { + "epoch": 0.77, + "learning_rate": 4.71704140289803e-05, + "loss": 2.6713, + "step": 1525 + }, + { + "epoch": 0.77, + "learning_rate": 4.715216610995883e-05, + "loss": 2.7284, + "step": 1530 + }, + { + "epoch": 0.77, + "learning_rate": 4.713386309276585e-05, + "loss": 2.5022, + "step": 1535 + }, + { + "epoch": 0.77, + "learning_rate": 4.7115505022925706e-05, + "loss": 2.8323, + "step": 1540 + }, + { + "epoch": 0.78, + "learning_rate": 4.7097091946099666e-05, + "loss": 2.7005, + "step": 1545 + }, + { + "epoch": 0.78, + "learning_rate": 4.7078623908085825e-05, + "loss": 2.8142, + "step": 1550 + }, + { + "epoch": 0.78, + "learning_rate": 4.7060100954818974e-05, + "loss": 2.8505, + "step": 1555 + }, + { + "epoch": 
0.78, + "learning_rate": 4.70415231323705e-05, + "loss": 2.5892, + "step": 1560 + }, + { + "epoch": 0.79, + "learning_rate": 4.7022890486948236e-05, + "loss": 2.8951, + "step": 1565 + }, + { + "epoch": 0.79, + "learning_rate": 4.700420306489641e-05, + "loss": 2.7551, + "step": 1570 + }, + { + "epoch": 0.79, + "learning_rate": 4.698546091269547e-05, + "loss": 2.6814, + "step": 1575 + }, + { + "epoch": 0.79, + "learning_rate": 4.696666407696201e-05, + "loss": 2.8698, + "step": 1580 + }, + { + "epoch": 0.8, + "learning_rate": 4.694781260444862e-05, + "loss": 2.8389, + "step": 1585 + }, + { + "epoch": 0.8, + "learning_rate": 4.6928906542043786e-05, + "loss": 2.8353, + "step": 1590 + }, + { + "epoch": 0.8, + "learning_rate": 4.690994593677179e-05, + "loss": 2.6565, + "step": 1595 + }, + { + "epoch": 0.8, + "learning_rate": 4.689093083579256e-05, + "loss": 2.6958, + "step": 1600 + }, + { + "epoch": 0.81, + "learning_rate": 4.687186128640157e-05, + "loss": 2.6768, + "step": 1605 + }, + { + "epoch": 0.81, + "learning_rate": 4.685273733602975e-05, + "loss": 2.734, + "step": 1610 + }, + { + "epoch": 0.81, + "learning_rate": 4.6833559032243284e-05, + "loss": 2.5348, + "step": 1615 + }, + { + "epoch": 0.81, + "learning_rate": 4.6814326422743594e-05, + "loss": 2.7775, + "step": 1620 + }, + { + "epoch": 0.82, + "learning_rate": 4.679503955536715e-05, + "loss": 2.6578, + "step": 1625 + }, + { + "epoch": 0.82, + "learning_rate": 4.6775698478085393e-05, + "loss": 2.8792, + "step": 1630 + }, + { + "epoch": 0.82, + "learning_rate": 4.675630323900458e-05, + "loss": 2.7883, + "step": 1635 + }, + { + "epoch": 0.82, + "learning_rate": 4.67368538863657e-05, + "loss": 2.8824, + "step": 1640 + }, + { + "epoch": 0.83, + "learning_rate": 4.671735046854433e-05, + "loss": 2.6775, + "step": 1645 + }, + { + "epoch": 0.83, + "learning_rate": 4.669779303405051e-05, + "loss": 2.5658, + "step": 1650 + }, + { + "epoch": 0.83, + "learning_rate": 4.667818163152864e-05, + "loss": 2.5262, + "step": 1655 + 
}, + { + "epoch": 0.83, + "learning_rate": 4.665851630975736e-05, + "loss": 2.5749, + "step": 1660 + }, + { + "epoch": 0.84, + "learning_rate": 4.6638797117649424e-05, + "loss": 2.8718, + "step": 1665 + }, + { + "epoch": 0.84, + "learning_rate": 4.661902410425155e-05, + "loss": 2.8039, + "step": 1670 + }, + { + "epoch": 0.84, + "learning_rate": 4.659919731874435e-05, + "loss": 2.8089, + "step": 1675 + }, + { + "epoch": 0.84, + "learning_rate": 4.6579316810442174e-05, + "loss": 2.8144, + "step": 1680 + }, + { + "epoch": 0.85, + "learning_rate": 4.6559382628792995e-05, + "loss": 2.5963, + "step": 1685 + }, + { + "epoch": 0.85, + "learning_rate": 4.653939482337828e-05, + "loss": 2.5755, + "step": 1690 + }, + { + "epoch": 0.85, + "learning_rate": 4.651935344391286e-05, + "loss": 2.7631, + "step": 1695 + }, + { + "epoch": 0.85, + "learning_rate": 4.649925854024486e-05, + "loss": 2.8117, + "step": 1700 + }, + { + "epoch": 0.86, + "learning_rate": 4.647911016235549e-05, + "loss": 2.7352, + "step": 1705 + }, + { + "epoch": 0.86, + "learning_rate": 4.6458908360358985e-05, + "loss": 2.5572, + "step": 1710 + }, + { + "epoch": 0.86, + "learning_rate": 4.643865318450246e-05, + "loss": 2.8372, + "step": 1715 + }, + { + "epoch": 0.86, + "learning_rate": 4.6418344685165774e-05, + "loss": 2.7892, + "step": 1720 + }, + { + "epoch": 0.87, + "learning_rate": 4.639798291286143e-05, + "loss": 2.7832, + "step": 1725 + }, + { + "epoch": 0.87, + "learning_rate": 4.637756791823442e-05, + "loss": 2.5401, + "step": 1730 + }, + { + "epoch": 0.87, + "learning_rate": 4.635709975206213e-05, + "loss": 2.9286, + "step": 1735 + }, + { + "epoch": 0.87, + "learning_rate": 4.633657846525417e-05, + "loss": 2.6474, + "step": 1740 + }, + { + "epoch": 0.88, + "learning_rate": 4.6316004108852305e-05, + "loss": 2.5047, + "step": 1745 + }, + { + "epoch": 0.88, + "learning_rate": 4.629537673403029e-05, + "loss": 2.8374, + "step": 1750 + }, + { + "epoch": 0.88, + "learning_rate": 4.627469639209373e-05, + 
"loss": 2.634, + "step": 1755 + }, + { + "epoch": 0.88, + "learning_rate": 4.6253963134480006e-05, + "loss": 2.6884, + "step": 1760 + }, + { + "epoch": 0.89, + "learning_rate": 4.623317701275809e-05, + "loss": 2.6334, + "step": 1765 + }, + { + "epoch": 0.89, + "learning_rate": 4.621233807862844e-05, + "loss": 2.764, + "step": 1770 + }, + { + "epoch": 0.89, + "learning_rate": 4.6191446383922886e-05, + "loss": 3.0864, + "step": 1775 + }, + { + "epoch": 0.89, + "learning_rate": 4.617050198060448e-05, + "loss": 2.7964, + "step": 1780 + }, + { + "epoch": 0.9, + "learning_rate": 4.6149504920767376e-05, + "loss": 2.4538, + "step": 1785 + }, + { + "epoch": 0.9, + "learning_rate": 4.6128455256636706e-05, + "loss": 2.7352, + "step": 1790 + }, + { + "epoch": 0.9, + "learning_rate": 4.6107353040568416e-05, + "loss": 2.6893, + "step": 1795 + }, + { + "epoch": 0.9, + "learning_rate": 4.6086198325049185e-05, + "loss": 2.8609, + "step": 1800 + }, + { + "epoch": 0.91, + "learning_rate": 4.6064991162696275e-05, + "loss": 2.5285, + "step": 1805 + }, + { + "epoch": 0.91, + "learning_rate": 4.604373160625739e-05, + "loss": 2.5707, + "step": 1810 + }, + { + "epoch": 0.91, + "learning_rate": 4.602241970861053e-05, + "loss": 2.7198, + "step": 1815 + }, + { + "epoch": 0.91, + "learning_rate": 4.6001055522763926e-05, + "loss": 2.855, + "step": 1820 + }, + { + "epoch": 0.92, + "learning_rate": 4.597963910185582e-05, + "loss": 2.4175, + "step": 1825 + }, + { + "epoch": 0.92, + "learning_rate": 4.595817049915441e-05, + "loss": 2.5444, + "step": 1830 + }, + { + "epoch": 0.92, + "learning_rate": 4.5936649768057646e-05, + "loss": 2.6893, + "step": 1835 + }, + { + "epoch": 0.92, + "learning_rate": 4.591507696209318e-05, + "loss": 2.6725, + "step": 1840 + }, + { + "epoch": 0.93, + "learning_rate": 4.589345213491817e-05, + "loss": 2.834, + "step": 1845 + }, + { + "epoch": 0.93, + "learning_rate": 4.587177534031914e-05, + "loss": 2.8259, + "step": 1850 + }, + { + "epoch": 0.93, + "learning_rate": 
4.585004663221188e-05, + "loss": 2.6146, + "step": 1855 + }, + { + "epoch": 0.93, + "learning_rate": 4.582826606464134e-05, + "loss": 2.4673, + "step": 1860 + }, + { + "epoch": 0.94, + "learning_rate": 4.5806433691781416e-05, + "loss": 2.9178, + "step": 1865 + }, + { + "epoch": 0.94, + "learning_rate": 4.578454956793487e-05, + "loss": 2.9224, + "step": 1870 + }, + { + "epoch": 0.94, + "learning_rate": 4.576261374753318e-05, + "loss": 2.7987, + "step": 1875 + }, + { + "epoch": 0.94, + "learning_rate": 4.574062628513642e-05, + "loss": 2.3848, + "step": 1880 + }, + { + "epoch": 0.95, + "learning_rate": 4.57185872354331e-05, + "loss": 2.6054, + "step": 1885 + }, + { + "epoch": 0.95, + "learning_rate": 4.569649665324003e-05, + "loss": 2.6743, + "step": 1890 + }, + { + "epoch": 0.95, + "learning_rate": 4.567435459350222e-05, + "loss": 2.9375, + "step": 1895 + }, + { + "epoch": 0.95, + "learning_rate": 4.565216111129269e-05, + "loss": 2.9372, + "step": 1900 + }, + { + "epoch": 0.96, + "learning_rate": 4.562991626181239e-05, + "loss": 2.669, + "step": 1905 + }, + { + "epoch": 0.96, + "learning_rate": 4.560762010039001e-05, + "loss": 2.7644, + "step": 1910 + }, + { + "epoch": 0.96, + "learning_rate": 4.558527268248187e-05, + "loss": 2.6886, + "step": 1915 + }, + { + "epoch": 0.96, + "learning_rate": 4.55628740636718e-05, + "loss": 2.8618, + "step": 1920 + }, + { + "epoch": 0.97, + "learning_rate": 4.554042429967095e-05, + "loss": 2.8411, + "step": 1925 + }, + { + "epoch": 0.97, + "learning_rate": 4.55179234463177e-05, + "loss": 2.6047, + "step": 1930 + }, + { + "epoch": 0.97, + "learning_rate": 4.5495371559577496e-05, + "loss": 2.8675, + "step": 1935 + }, + { + "epoch": 0.97, + "learning_rate": 4.547276869554271e-05, + "loss": 2.751, + "step": 1940 + }, + { + "epoch": 0.98, + "learning_rate": 4.545011491043253e-05, + "loss": 2.8638, + "step": 1945 + }, + { + "epoch": 0.98, + "learning_rate": 4.5427410260592775e-05, + "loss": 2.5092, + "step": 1950 + }, + { + "epoch": 0.98, 
+ "learning_rate": 4.540465480249579e-05, + "loss": 2.5059, + "step": 1955 + }, + { + "epoch": 0.98, + "learning_rate": 4.5381848592740285e-05, + "loss": 2.9433, + "step": 1960 + }, + { + "epoch": 0.99, + "learning_rate": 4.535899168805121e-05, + "loss": 2.6959, + "step": 1965 + }, + { + "epoch": 0.99, + "learning_rate": 4.533608414527961e-05, + "loss": 2.552, + "step": 1970 + }, + { + "epoch": 0.99, + "learning_rate": 4.5313126021402465e-05, + "loss": 2.7169, + "step": 1975 + }, + { + "epoch": 0.99, + "learning_rate": 4.529011737352258e-05, + "loss": 2.5672, + "step": 1980 + }, + { + "epoch": 1.0, + "learning_rate": 4.526705825886841e-05, + "loss": 2.6434, + "step": 1985 + }, + { + "epoch": 1.0, + "learning_rate": 4.5243948734793947e-05, + "loss": 2.8062, + "step": 1990 + }, + { + "epoch": 1.0, + "learning_rate": 4.5220788858778556e-05, + "loss": 2.6929, + "step": 1995 + }, + { + "epoch": 1.0, + "learning_rate": 4.519757868842684e-05, + "loss": 2.5837, + "step": 2000 + }, + { + "epoch": 1.01, + "learning_rate": 4.517431828146852e-05, + "loss": 2.821, + "step": 2005 + }, + { + "epoch": 1.01, + "learning_rate": 4.515100769575824e-05, + "loss": 2.7913, + "step": 2010 + }, + { + "epoch": 1.01, + "learning_rate": 4.512764698927545e-05, + "loss": 2.699, + "step": 2015 + }, + { + "epoch": 1.01, + "learning_rate": 4.5104236220124286e-05, + "loss": 2.6574, + "step": 2020 + }, + { + "epoch": 1.02, + "learning_rate": 4.508077544653338e-05, + "loss": 2.5909, + "step": 2025 + }, + { + "epoch": 1.02, + "learning_rate": 4.5057264726855765e-05, + "loss": 2.5695, + "step": 2030 + }, + { + "epoch": 1.02, + "learning_rate": 4.5033704119568675e-05, + "loss": 2.4937, + "step": 2035 + }, + { + "epoch": 1.02, + "learning_rate": 4.501009368327344e-05, + "loss": 2.7253, + "step": 2040 + }, + { + "epoch": 1.03, + "learning_rate": 4.4986433476695334e-05, + "loss": 2.8681, + "step": 2045 + }, + { + "epoch": 1.03, + "learning_rate": 4.496272355868341e-05, + "loss": 2.74, + "step": 2050 + }, + 
{ + "epoch": 1.03, + "learning_rate": 4.4938963988210365e-05, + "loss": 2.7439, + "step": 2055 + }, + { + "epoch": 1.03, + "learning_rate": 4.491515482437242e-05, + "loss": 2.9539, + "step": 2060 + }, + { + "epoch": 1.04, + "learning_rate": 4.4891296126389104e-05, + "loss": 2.8165, + "step": 2065 + }, + { + "epoch": 1.04, + "learning_rate": 4.48673879536032e-05, + "loss": 2.601, + "step": 2070 + }, + { + "epoch": 1.04, + "learning_rate": 4.484343036548051e-05, + "loss": 2.5189, + "step": 2075 + }, + { + "epoch": 1.04, + "learning_rate": 4.481942342160976e-05, + "loss": 2.6276, + "step": 2080 + }, + { + "epoch": 1.05, + "learning_rate": 4.479536718170243e-05, + "loss": 2.5027, + "step": 2085 + }, + { + "epoch": 1.05, + "learning_rate": 4.477126170559262e-05, + "loss": 2.8654, + "step": 2090 + }, + { + "epoch": 1.05, + "learning_rate": 4.474710705323688e-05, + "loss": 2.8511, + "step": 2095 + }, + { + "epoch": 1.05, + "learning_rate": 4.47229032847141e-05, + "loss": 2.6129, + "step": 2100 + }, + { + "epoch": 1.06, + "learning_rate": 4.469865046022531e-05, + "loss": 2.8964, + "step": 2105 + }, + { + "epoch": 1.06, + "learning_rate": 4.4674348640093554e-05, + "loss": 2.7502, + "step": 2110 + }, + { + "epoch": 1.06, + "learning_rate": 4.4649997884763765e-05, + "loss": 2.591, + "step": 2115 + }, + { + "epoch": 1.06, + "learning_rate": 4.462559825480257e-05, + "loss": 2.7982, + "step": 2120 + }, + { + "epoch": 1.07, + "learning_rate": 4.460114981089815e-05, + "loss": 2.576, + "step": 2125 + }, + { + "epoch": 1.07, + "learning_rate": 4.457665261386014e-05, + "loss": 2.692, + "step": 2130 + }, + { + "epoch": 1.07, + "learning_rate": 4.455210672461938e-05, + "loss": 2.5818, + "step": 2135 + }, + { + "epoch": 1.07, + "learning_rate": 4.452751220422787e-05, + "loss": 2.6792, + "step": 2140 + }, + { + "epoch": 1.08, + "learning_rate": 4.450286911385856e-05, + "loss": 2.8352, + "step": 2145 + }, + { + "epoch": 1.08, + "learning_rate": 4.4478177514805166e-05, + "loss": 2.5787, + 
"step": 2150 + }, + { + "epoch": 1.08, + "learning_rate": 4.4453437468482103e-05, + "loss": 2.655, + "step": 2155 + }, + { + "epoch": 1.08, + "learning_rate": 4.442864903642428e-05, + "loss": 2.7664, + "step": 2160 + }, + { + "epoch": 1.09, + "learning_rate": 4.440381228028692e-05, + "loss": 2.7231, + "step": 2165 + }, + { + "epoch": 1.09, + "learning_rate": 4.437892726184548e-05, + "loss": 2.416, + "step": 2170 + }, + { + "epoch": 1.09, + "learning_rate": 4.4353994042995446e-05, + "loss": 2.4619, + "step": 2175 + }, + { + "epoch": 1.09, + "learning_rate": 4.4329012685752183e-05, + "loss": 2.7816, + "step": 2180 + }, + { + "epoch": 1.1, + "learning_rate": 4.430398325225078e-05, + "loss": 2.8678, + "step": 2185 + }, + { + "epoch": 1.1, + "learning_rate": 4.427890580474594e-05, + "loss": 2.6007, + "step": 2190 + }, + { + "epoch": 1.1, + "learning_rate": 4.4253780405611754e-05, + "loss": 2.8195, + "step": 2195 + }, + { + "epoch": 1.1, + "learning_rate": 4.42286071173416e-05, + "loss": 2.5117, + "step": 2200 + }, + { + "epoch": 1.11, + "learning_rate": 4.4203386002547956e-05, + "loss": 2.5527, + "step": 2205 + }, + { + "epoch": 1.11, + "learning_rate": 4.417811712396226e-05, + "loss": 2.662, + "step": 2210 + }, + { + "epoch": 1.11, + "learning_rate": 4.415280054443477e-05, + "loss": 2.7563, + "step": 2215 + }, + { + "epoch": 1.11, + "learning_rate": 4.4127436326934354e-05, + "loss": 2.5118, + "step": 2220 + }, + { + "epoch": 1.12, + "learning_rate": 4.41020245345484e-05, + "loss": 2.7208, + "step": 2225 + }, + { + "epoch": 1.12, + "learning_rate": 4.4076565230482607e-05, + "loss": 2.649, + "step": 2230 + }, + { + "epoch": 1.12, + "learning_rate": 4.4051058478060856e-05, + "loss": 2.652, + "step": 2235 + }, + { + "epoch": 1.12, + "learning_rate": 4.402550434072505e-05, + "loss": 2.6885, + "step": 2240 + }, + { + "epoch": 1.13, + "learning_rate": 4.3999902882034935e-05, + "loss": 2.8545, + "step": 2245 + }, + { + "epoch": 1.13, + "learning_rate": 4.397425416566797e-05, + 
"loss": 2.9435, + "step": 2250 + }, + { + "epoch": 1.13, + "learning_rate": 4.3948558255419146e-05, + "loss": 2.6555, + "step": 2255 + }, + { + "epoch": 1.13, + "learning_rate": 4.392281521520085e-05, + "loss": 2.6108, + "step": 2260 + }, + { + "epoch": 1.14, + "learning_rate": 4.389702510904269e-05, + "loss": 2.6256, + "step": 2265 + }, + { + "epoch": 1.14, + "learning_rate": 4.387118800109133e-05, + "loss": 2.7996, + "step": 2270 + }, + { + "epoch": 1.14, + "learning_rate": 4.384530395561035e-05, + "loss": 2.6649, + "step": 2275 + }, + { + "epoch": 1.14, + "learning_rate": 4.381937303698006e-05, + "loss": 3.0073, + "step": 2280 + }, + { + "epoch": 1.15, + "learning_rate": 4.379339530969738e-05, + "loss": 2.8493, + "step": 2285 + }, + { + "epoch": 1.15, + "learning_rate": 4.3767370838375635e-05, + "loss": 2.5572, + "step": 2290 + }, + { + "epoch": 1.15, + "learning_rate": 4.374129968774443e-05, + "loss": 2.3837, + "step": 2295 + }, + { + "epoch": 1.15, + "learning_rate": 4.371518192264946e-05, + "loss": 2.4604, + "step": 2300 + }, + { + "epoch": 1.16, + "learning_rate": 4.3689017608052374e-05, + "loss": 2.8866, + "step": 2305 + }, + { + "epoch": 1.16, + "learning_rate": 4.3662806809030585e-05, + "loss": 2.8943, + "step": 2310 + }, + { + "epoch": 1.16, + "learning_rate": 4.3636549590777144e-05, + "loss": 2.7629, + "step": 2315 + }, + { + "epoch": 1.16, + "learning_rate": 4.361024601860054e-05, + "loss": 2.8629, + "step": 2320 + }, + { + "epoch": 1.17, + "learning_rate": 4.3583896157924574e-05, + "loss": 2.7952, + "step": 2325 + }, + { + "epoch": 1.17, + "learning_rate": 4.355750007428817e-05, + "loss": 2.2057, + "step": 2330 + }, + { + "epoch": 1.17, + "learning_rate": 4.3531057833345216e-05, + "loss": 2.9143, + "step": 2335 + }, + { + "epoch": 1.17, + "learning_rate": 4.3504569500864424e-05, + "loss": 2.5354, + "step": 2340 + }, + { + "epoch": 1.18, + "learning_rate": 4.347803514272911e-05, + "loss": 2.7451, + "step": 2345 + }, + { + "epoch": 1.18, + 
"learning_rate": 4.3451454824937113e-05, + "loss": 3.0827, + "step": 2350 + }, + { + "epoch": 1.18, + "learning_rate": 4.3424828613600555e-05, + "loss": 2.5842, + "step": 2355 + }, + { + "epoch": 1.18, + "learning_rate": 4.339815657494572e-05, + "loss": 2.4492, + "step": 2360 + }, + { + "epoch": 1.19, + "learning_rate": 4.3371438775312865e-05, + "loss": 2.5067, + "step": 2365 + }, + { + "epoch": 1.19, + "learning_rate": 4.334467528115608e-05, + "loss": 2.5902, + "step": 2370 + }, + { + "epoch": 1.19, + "learning_rate": 4.33178661590431e-05, + "loss": 2.8618, + "step": 2375 + }, + { + "epoch": 1.19, + "learning_rate": 4.329101147565515e-05, + "loss": 2.6831, + "step": 2380 + }, + { + "epoch": 1.2, + "learning_rate": 4.3264111297786794e-05, + "loss": 2.7531, + "step": 2385 + }, + { + "epoch": 1.2, + "learning_rate": 4.323716569234572e-05, + "loss": 2.819, + "step": 2390 + }, + { + "epoch": 1.2, + "learning_rate": 4.321017472635263e-05, + "loss": 2.6319, + "step": 2395 + }, + { + "epoch": 1.2, + "learning_rate": 4.318313846694105e-05, + "loss": 2.3078, + "step": 2400 + }, + { + "epoch": 1.21, + "learning_rate": 4.315605698135714e-05, + "loss": 2.7962, + "step": 2405 + }, + { + "epoch": 1.21, + "learning_rate": 4.312893033695958e-05, + "loss": 2.8479, + "step": 2410 + }, + { + "epoch": 1.21, + "learning_rate": 4.310175860121936e-05, + "loss": 2.6289, + "step": 2415 + }, + { + "epoch": 1.21, + "learning_rate": 4.30745418417196e-05, + "loss": 2.7003, + "step": 2420 + }, + { + "epoch": 1.22, + "learning_rate": 4.304728012615543e-05, + "loss": 2.4947, + "step": 2425 + }, + { + "epoch": 1.22, + "learning_rate": 4.3019973522333815e-05, + "loss": 2.6911, + "step": 2430 + }, + { + "epoch": 1.22, + "learning_rate": 4.2992622098173334e-05, + "loss": 2.6931, + "step": 2435 + }, + { + "epoch": 1.22, + "learning_rate": 4.296522592170406e-05, + "loss": 2.774, + "step": 2440 + }, + { + "epoch": 1.23, + "learning_rate": 4.293778506106737e-05, + "loss": 2.759, + "step": 2445 + }, + { + 
"epoch": 1.23, + "learning_rate": 4.29102995845158e-05, + "loss": 2.5543, + "step": 2450 + }, + { + "epoch": 1.23, + "learning_rate": 4.288276956041284e-05, + "loss": 2.7702, + "step": 2455 + }, + { + "epoch": 1.23, + "learning_rate": 4.285519505723278e-05, + "loss": 2.835, + "step": 2460 + }, + { + "epoch": 1.24, + "learning_rate": 4.282757614356055e-05, + "loss": 2.6516, + "step": 2465 + }, + { + "epoch": 1.24, + "learning_rate": 4.2799912888091544e-05, + "loss": 2.7392, + "step": 2470 + }, + { + "epoch": 1.24, + "learning_rate": 4.277220535963143e-05, + "loss": 2.6522, + "step": 2475 + }, + { + "epoch": 1.24, + "learning_rate": 4.274445362709601e-05, + "loss": 2.6514, + "step": 2480 + }, + { + "epoch": 1.25, + "learning_rate": 4.271665775951104e-05, + "loss": 2.7507, + "step": 2485 + }, + { + "epoch": 1.25, + "learning_rate": 4.2688817826012005e-05, + "loss": 3.0499, + "step": 2490 + }, + { + "epoch": 1.25, + "learning_rate": 4.2660933895844055e-05, + "loss": 2.6927, + "step": 2495 + }, + { + "epoch": 1.25, + "learning_rate": 4.2633006038361736e-05, + "loss": 2.8456, + "step": 2500 + }, + { + "epoch": 1.26, + "learning_rate": 4.2605034323028844e-05, + "loss": 2.671, + "step": 2505 + }, + { + "epoch": 1.26, + "learning_rate": 4.2577018819418296e-05, + "loss": 2.6543, + "step": 2510 + }, + { + "epoch": 1.26, + "learning_rate": 4.254895959721189e-05, + "loss": 2.566, + "step": 2515 + }, + { + "epoch": 1.26, + "learning_rate": 4.252085672620019e-05, + "loss": 2.4943, + "step": 2520 + }, + { + "epoch": 1.27, + "learning_rate": 4.249271027628228e-05, + "loss": 2.5115, + "step": 2525 + }, + { + "epoch": 1.27, + "learning_rate": 4.24645203174657e-05, + "loss": 2.3859, + "step": 2530 + }, + { + "epoch": 1.27, + "learning_rate": 4.243628691986617e-05, + "loss": 2.8148, + "step": 2535 + }, + { + "epoch": 1.27, + "learning_rate": 4.240801015370743e-05, + "loss": 2.5597, + "step": 2540 + }, + { + "epoch": 1.28, + "learning_rate": 4.2379690089321145e-05, + "loss": 2.805, + 
"step": 2545 + }, + { + "epoch": 1.28, + "learning_rate": 4.235132679714664e-05, + "loss": 2.7308, + "step": 2550 + }, + { + "epoch": 1.28, + "learning_rate": 4.232292034773076e-05, + "loss": 2.675, + "step": 2555 + }, + { + "epoch": 1.28, + "learning_rate": 4.2294470811727704e-05, + "loss": 2.6359, + "step": 2560 + }, + { + "epoch": 1.29, + "learning_rate": 4.226597825989883e-05, + "loss": 2.4288, + "step": 2565 + }, + { + "epoch": 1.29, + "learning_rate": 4.223744276311249e-05, + "loss": 2.642, + "step": 2570 + }, + { + "epoch": 1.29, + "learning_rate": 4.220886439234385e-05, + "loss": 2.7375, + "step": 2575 + }, + { + "epoch": 1.29, + "learning_rate": 4.218024321867472e-05, + "loss": 2.6114, + "step": 2580 + }, + { + "epoch": 1.3, + "learning_rate": 4.2151579313293364e-05, + "loss": 2.4941, + "step": 2585 + }, + { + "epoch": 1.3, + "learning_rate": 4.212287274749434e-05, + "loss": 2.6086, + "step": 2590 + }, + { + "epoch": 1.3, + "learning_rate": 4.2094123592678295e-05, + "loss": 2.7854, + "step": 2595 + }, + { + "epoch": 1.3, + "learning_rate": 4.206533192035184e-05, + "loss": 2.8291, + "step": 2600 + }, + { + "epoch": 1.31, + "learning_rate": 4.2036497802127294e-05, + "loss": 2.6846, + "step": 2605 + }, + { + "epoch": 1.31, + "learning_rate": 4.200762130972259e-05, + "loss": 2.6667, + "step": 2610 + }, + { + "epoch": 1.31, + "learning_rate": 4.197870251496103e-05, + "loss": 2.7895, + "step": 2615 + }, + { + "epoch": 1.31, + "learning_rate": 4.1949741489771155e-05, + "loss": 2.6958, + "step": 2620 + }, + { + "epoch": 1.32, + "learning_rate": 4.192073830618652e-05, + "loss": 2.8536, + "step": 2625 + }, + { + "epoch": 1.32, + "learning_rate": 4.189169303634555e-05, + "loss": 2.5218, + "step": 2630 + }, + { + "epoch": 1.32, + "learning_rate": 4.186260575249136e-05, + "loss": 2.6141, + "step": 2635 + }, + { + "epoch": 1.32, + "learning_rate": 4.1833476526971546e-05, + "loss": 2.6231, + "step": 2640 + }, + { + "epoch": 1.33, + "learning_rate": 
4.1804305432238036e-05, + "loss": 2.8229, + "step": 2645 + }, + { + "epoch": 1.33, + "learning_rate": 4.1775092540846885e-05, + "loss": 2.4763, + "step": 2650 + }, + { + "epoch": 1.33, + "learning_rate": 4.174583792545813e-05, + "loss": 2.5162, + "step": 2655 + }, + { + "epoch": 1.33, + "learning_rate": 4.1716541658835574e-05, + "loss": 2.7301, + "step": 2660 + }, + { + "epoch": 1.34, + "learning_rate": 4.16872038138466e-05, + "loss": 2.6055, + "step": 2665 + }, + { + "epoch": 1.34, + "learning_rate": 4.165782446346203e-05, + "loss": 2.6529, + "step": 2670 + }, + { + "epoch": 1.34, + "learning_rate": 4.162840368075591e-05, + "loss": 2.7934, + "step": 2675 + }, + { + "epoch": 1.34, + "learning_rate": 4.159894153890536e-05, + "loss": 2.9108, + "step": 2680 + }, + { + "epoch": 1.35, + "learning_rate": 4.1569438111190326e-05, + "loss": 2.6543, + "step": 2685 + }, + { + "epoch": 1.35, + "learning_rate": 4.1539893470993496e-05, + "loss": 2.6038, + "step": 2690 + }, + { + "epoch": 1.35, + "learning_rate": 4.151030769180002e-05, + "loss": 2.5741, + "step": 2695 + }, + { + "epoch": 1.35, + "learning_rate": 4.14806808471974e-05, + "loss": 2.8525, + "step": 2700 + }, + { + "epoch": 1.36, + "learning_rate": 4.1451013010875275e-05, + "loss": 2.5151, + "step": 2705 + }, + { + "epoch": 1.36, + "learning_rate": 4.1421304256625206e-05, + "loss": 2.5666, + "step": 2710 + }, + { + "epoch": 1.36, + "learning_rate": 4.139155465834058e-05, + "loss": 2.7287, + "step": 2715 + }, + { + "epoch": 1.36, + "learning_rate": 4.136176429001634e-05, + "loss": 2.8867, + "step": 2720 + }, + { + "epoch": 1.37, + "learning_rate": 4.133193322574885e-05, + "loss": 2.6398, + "step": 2725 + }, + { + "epoch": 1.37, + "learning_rate": 4.130206153973568e-05, + "loss": 2.699, + "step": 2730 + }, + { + "epoch": 1.37, + "learning_rate": 4.1272149306275447e-05, + "loss": 2.5248, + "step": 2735 + }, + { + "epoch": 1.37, + "learning_rate": 4.124219659976762e-05, + "loss": 2.8947, + "step": 2740 + }, + { + "epoch": 
1.38, + "learning_rate": 4.1212203494712344e-05, + "loss": 2.6473, + "step": 2745 + }, + { + "epoch": 1.38, + "learning_rate": 4.1182170065710226e-05, + "loss": 2.3734, + "step": 2750 + }, + { + "epoch": 1.38, + "learning_rate": 4.115209638746218e-05, + "loss": 2.685, + "step": 2755 + }, + { + "epoch": 1.39, + "learning_rate": 4.1121982534769224e-05, + "loss": 2.9712, + "step": 2760 + }, + { + "epoch": 1.39, + "learning_rate": 4.109182858253231e-05, + "loss": 2.6578, + "step": 2765 + }, + { + "epoch": 1.39, + "learning_rate": 4.106163460575212e-05, + "loss": 2.6842, + "step": 2770 + }, + { + "epoch": 1.39, + "learning_rate": 4.10314006795289e-05, + "loss": 2.511, + "step": 2775 + }, + { + "epoch": 1.4, + "learning_rate": 4.100112687906224e-05, + "loss": 2.8426, + "step": 2780 + }, + { + "epoch": 1.4, + "learning_rate": 4.097081327965092e-05, + "loss": 2.8462, + "step": 2785 + }, + { + "epoch": 1.4, + "learning_rate": 4.094045995669271e-05, + "loss": 2.7552, + "step": 2790 + }, + { + "epoch": 1.4, + "learning_rate": 4.091006698568419e-05, + "loss": 2.3802, + "step": 2795 + }, + { + "epoch": 1.41, + "learning_rate": 4.087963444222054e-05, + "loss": 2.3967, + "step": 2800 + }, + { + "epoch": 1.41, + "learning_rate": 4.084916240199537e-05, + "loss": 2.7945, + "step": 2805 + }, + { + "epoch": 1.41, + "learning_rate": 4.0818650940800524e-05, + "loss": 2.5304, + "step": 2810 + }, + { + "epoch": 1.41, + "learning_rate": 4.0788100134525925e-05, + "loss": 2.6779, + "step": 2815 + }, + { + "epoch": 1.42, + "learning_rate": 4.075751005915932e-05, + "loss": 2.6202, + "step": 2820 + }, + { + "epoch": 1.42, + "learning_rate": 4.072688079078616e-05, + "loss": 2.5915, + "step": 2825 + }, + { + "epoch": 1.42, + "learning_rate": 4.069621240558935e-05, + "loss": 2.5139, + "step": 2830 + }, + { + "epoch": 1.42, + "learning_rate": 4.066550497984911e-05, + "loss": 2.5506, + "step": 2835 + }, + { + "epoch": 1.43, + "learning_rate": 4.063475858994276e-05, + "loss": 2.7154, + "step": 2840 + 
}, + { + "epoch": 1.43, + "learning_rate": 4.060397331234452e-05, + "loss": 2.6309, + "step": 2845 + }, + { + "epoch": 1.43, + "learning_rate": 4.0573149223625365e-05, + "loss": 2.6922, + "step": 2850 + }, + { + "epoch": 1.43, + "learning_rate": 4.054228640045276e-05, + "loss": 2.703, + "step": 2855 + }, + { + "epoch": 1.44, + "learning_rate": 4.051138491959053e-05, + "loss": 2.3475, + "step": 2860 + }, + { + "epoch": 1.44, + "learning_rate": 4.048044485789869e-05, + "loss": 2.7186, + "step": 2865 + }, + { + "epoch": 1.44, + "learning_rate": 4.044946629233316e-05, + "loss": 2.4667, + "step": 2870 + }, + { + "epoch": 1.44, + "learning_rate": 4.041844929994566e-05, + "loss": 2.562, + "step": 2875 + }, + { + "epoch": 1.45, + "learning_rate": 4.038739395788347e-05, + "loss": 2.4116, + "step": 2880 + }, + { + "epoch": 1.45, + "learning_rate": 4.0356300343389276e-05, + "loss": 2.6133, + "step": 2885 + }, + { + "epoch": 1.45, + "learning_rate": 4.032516853380094e-05, + "loss": 2.6893, + "step": 2890 + }, + { + "epoch": 1.45, + "learning_rate": 4.029399860655132e-05, + "loss": 2.8196, + "step": 2895 + }, + { + "epoch": 1.46, + "learning_rate": 4.026279063916811e-05, + "loss": 2.8933, + "step": 2900 + }, + { + "epoch": 1.46, + "learning_rate": 4.023154470927361e-05, + "loss": 3.0289, + "step": 2905 + }, + { + "epoch": 1.46, + "learning_rate": 4.0200260894584516e-05, + "loss": 2.5586, + "step": 2910 + }, + { + "epoch": 1.46, + "learning_rate": 4.016893927291179e-05, + "loss": 2.785, + "step": 2915 + }, + { + "epoch": 1.47, + "learning_rate": 4.0137579922160395e-05, + "loss": 2.5293, + "step": 2920 + }, + { + "epoch": 1.47, + "learning_rate": 4.010618292032917e-05, + "loss": 2.6696, + "step": 2925 + }, + { + "epoch": 1.47, + "learning_rate": 4.007474834551058e-05, + "loss": 2.7003, + "step": 2930 + }, + { + "epoch": 1.47, + "learning_rate": 4.004327627589056e-05, + "loss": 2.6005, + "step": 2935 + }, + { + "epoch": 1.48, + "learning_rate": 4.001176678974828e-05, + "loss": 
2.7198, + "step": 2940 + }, + { + "epoch": 1.48, + "learning_rate": 3.998021996545599e-05, + "loss": 2.5329, + "step": 2945 + }, + { + "epoch": 1.48, + "learning_rate": 3.994863588147881e-05, + "loss": 2.7051, + "step": 2950 + }, + { + "epoch": 1.48, + "learning_rate": 3.9917014616374535e-05, + "loss": 2.9277, + "step": 2955 + }, + { + "epoch": 1.49, + "learning_rate": 3.988535624879344e-05, + "loss": 2.3729, + "step": 2960 + }, + { + "epoch": 1.49, + "learning_rate": 3.985366085747808e-05, + "loss": 2.5122, + "step": 2965 + }, + { + "epoch": 1.49, + "learning_rate": 3.982192852126309e-05, + "loss": 2.4413, + "step": 2970 + }, + { + "epoch": 1.49, + "learning_rate": 3.979015931907501e-05, + "loss": 2.6104, + "step": 2975 + }, + { + "epoch": 1.5, + "learning_rate": 3.975835332993207e-05, + "loss": 2.8855, + "step": 2980 + }, + { + "epoch": 1.5, + "learning_rate": 3.9726510632944e-05, + "loss": 2.7487, + "step": 2985 + }, + { + "epoch": 1.5, + "learning_rate": 3.969463130731183e-05, + "loss": 2.7085, + "step": 2990 + }, + { + "epoch": 1.5, + "learning_rate": 3.966271543232769e-05, + "loss": 2.6464, + "step": 2995 + }, + { + "epoch": 1.51, + "learning_rate": 3.9630763087374625e-05, + "loss": 2.4105, + "step": 3000 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 1.5893562356072448e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3000/training_args.bin b/checkpoint-3000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-3000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475 diff --git a/checkpoint-3500/README.md b/checkpoint-3500/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-3500/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-3500/adapter_config.json b/checkpoint-3500/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-3500/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3500/adapter_model.safetensors b/checkpoint-3500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aafc7049f0e3dfe8db1ccf622f568cf97b90843e --- /dev/null +++ b/checkpoint-3500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aa15481d408031b11a832ea88669f0b08e6976ce0b7a93565f3fc1c63521384 +size 7807744 diff --git a/checkpoint-3500/optimizer.pt b/checkpoint-3500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d81a120fd45c89deb8b8cf74e29b6ba1888f73e3 --- /dev/null +++ b/checkpoint-3500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b14ee417f2abbfc154c463f0ba8ba18b9794d11aa209acafcfe3a287cfc0458 +size 15644485 diff --git a/checkpoint-3500/rng_state.pth b/checkpoint-3500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2836a41c055db35b7727fd9d0fe53b425e7a0665 --- /dev/null +++ b/checkpoint-3500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72fc7a1a8b50b196047c7feda73c709041b1a29adb7a8ee86acd6168932d7443 +size 14575 diff --git 
a/checkpoint-3500/scheduler.pt b/checkpoint-3500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d90530febe9ac01456c3f4cf6031ced13660f60 --- /dev/null +++ b/checkpoint-3500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08ab60099d2d7b21aeb8150414614583db64899df22f4e15c13e550874b09788 +size 627 diff --git a/checkpoint-3500/special_tokens_map.json b/checkpoint-3500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-3500/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ] +} diff --git a/checkpoint-3500/tokenization_chatglm.py b/checkpoint-3500/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-3500/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == 
self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) + return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab 
+ + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + 
json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. 
+ + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. 
+ if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-3500/tokenizer.model b/checkpoint-3500/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-3500/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-3500/tokenizer_config.json b/checkpoint-3500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ b/checkpoint-3500/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 
1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-3500/trainer_state.json b/checkpoint-3500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ba4e6dd013ccda7e502f5cada8f0fb0e9e03c663 --- /dev/null +++ b/checkpoint-3500/trainer_state.json @@ -0,0 +1,4221 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7563668297578723, + "eval_steps": 500, + "global_step": 3500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 
65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 
165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + "step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 
265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + "step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 
365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + "step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 
465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + 
"step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + 
"step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + "loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + 
"step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + "loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, 
+ "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 
2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 4.8759297640025235e-05, + "loss": 2.9469, + "step": 1005 + }, + { + "epoch": 0.51, + "learning_rate": 4.874700152955661e-05, + "loss": 2.9745, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 4.8734646354147936e-05, + "loss": 2.9341, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 4.8722232144529754e-05, + "loss": 3.0511, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 4.870975893157941e-05, + "loss": 2.5396, + "step": 1025 + }, + { + "epoch": 0.52, + "learning_rate": 4.8697226746321004e-05, + "loss": 2.6699, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 4.868463561992532e-05, + "loss": 2.4887, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 4.867198558370977e-05, + "loss": 2.4773, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 4.865927666913825e-05, + "loss": 2.7612, + "step": 1045 + }, + { + "epoch": 0.53, + "learning_rate": 4.864650890782113e-05, + "loss": 2.9116, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 4.863368233151514e-05, + "loss": 2.8205, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 4.862079697212329e-05, + "loss": 2.6711, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 
4.8607852861694804e-05, + "loss": 3.1138, + "step": 1065 + }, + { + "epoch": 0.54, + "learning_rate": 4.859485003242503e-05, + "loss": 2.5603, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 4.858178851665539e-05, + "loss": 2.7981, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 4.856866834687323e-05, + "loss": 2.507, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 4.855548955571183e-05, + "loss": 3.0315, + "step": 1085 + }, + { + "epoch": 0.55, + "learning_rate": 4.8542252175950244e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 4.852895624051326e-05, + "loss": 2.6794, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 4.851560178247132e-05, + "loss": 2.8278, + "step": 1100 + }, + { + "epoch": 0.55, + "learning_rate": 4.850218883504041e-05, + "loss": 2.6993, + "step": 1105 + }, + { + "epoch": 0.56, + "learning_rate": 4.8488717431582005e-05, + "loss": 2.875, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 4.8475187605602974e-05, + "loss": 2.6057, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 4.84615993907555e-05, + "loss": 2.847, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 4.844795282083697e-05, + "loss": 2.7652, + "step": 1125 + }, + { + "epoch": 0.57, + "learning_rate": 4.843424792978997e-05, + "loss": 2.8128, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 4.842048475170209e-05, + "loss": 2.7077, + "step": 1135 + }, + { + "epoch": 0.57, + "learning_rate": 4.840666332080592e-05, + "loss": 2.7081, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 4.8392783671478934e-05, + "loss": 2.6479, + "step": 1145 + }, + { + "epoch": 0.58, + "learning_rate": 4.837884583824342e-05, + "loss": 2.667, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 4.836484985576638e-05, + "loss": 2.7514, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 4.835079575885944e-05, + "loss": 2.5058, + "step": 1160 + }, + { + "epoch": 0.58, 
+ "learning_rate": 4.833668358247876e-05, + "loss": 2.6183, + "step": 1165 + }, + { + "epoch": 0.59, + "learning_rate": 4.8322513361725006e-05, + "loss": 2.9959, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 4.830828513184317e-05, + "loss": 2.5714, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 4.8293998928222536e-05, + "loss": 2.965, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 4.827965478639661e-05, + "loss": 2.2106, + "step": 1185 + }, + { + "epoch": 0.6, + "learning_rate": 4.8265252742042965e-05, + "loss": 2.7456, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 4.8250792830983225e-05, + "loss": 2.5694, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 4.8236275089182936e-05, + "loss": 2.6826, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 4.8221699552751465e-05, + "loss": 2.878, + "step": 1205 + }, + { + "epoch": 0.61, + "learning_rate": 4.820706625794196e-05, + "loss": 2.8191, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 4.81923752411512e-05, + "loss": 2.739, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 4.8177626538919565e-05, + "loss": 3.0544, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 4.8162820187930875e-05, + "loss": 2.8393, + "step": 1225 + }, + { + "epoch": 0.62, + "learning_rate": 4.814795622501237e-05, + "loss": 2.4457, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 4.813303468713456e-05, + "loss": 2.8575, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 4.8118055611411197e-05, + "loss": 2.8307, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 4.810301903509909e-05, + "loss": 2.7951, + "step": 1245 + }, + { + "epoch": 0.63, + "learning_rate": 4.8087924995598125e-05, + "loss": 2.8456, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 4.807277353045106e-05, + "loss": 2.3564, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 4.8057564677343524e-05, + "loss": 2.5076, + "step": 1260 + 
}, + { + "epoch": 0.63, + "learning_rate": 4.8042298474103884e-05, + "loss": 2.605, + "step": 1265 + }, + { + "epoch": 0.64, + "learning_rate": 4.8026974958703116e-05, + "loss": 2.4782, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 4.8011594169254784e-05, + "loss": 2.7193, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 4.799615614401488e-05, + "loss": 2.8284, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 4.798066092138178e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.65, + "learning_rate": 4.796510853989612e-05, + "loss": 2.7396, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 4.794949903824069e-05, + "loss": 2.7948, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 4.793383245524035e-05, + "loss": 2.7818, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 4.791810882986197e-05, + "loss": 2.7334, + "step": 1305 + }, + { + "epoch": 0.66, + "learning_rate": 4.7902328201214256e-05, + "loss": 2.4824, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 4.7886490608547727e-05, + "loss": 2.6131, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 4.7870596091254584e-05, + "loss": 2.7778, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 4.7854644688868594e-05, + "loss": 2.5263, + "step": 1325 + }, + { + "epoch": 0.67, + "learning_rate": 4.783863644106502e-05, + "loss": 2.7825, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 4.782257138766053e-05, + "loss": 2.7902, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 4.7806449568613066e-05, + "loss": 2.8333, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 4.779027102402177e-05, + "loss": 2.856, + "step": 1345 + }, + { + "epoch": 0.68, + "learning_rate": 4.777403579412686e-05, + "loss": 2.8021, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 4.775774391930956e-05, + "loss": 2.6639, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741395440091976e-05, + 
"loss": 2.5226, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.772499039713702e-05, + "loss": 2.6803, + "step": 1365 + }, + { + "epoch": 0.69, + "learning_rate": 4.7708528831248274e-05, + "loss": 2.4608, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.769201078336991e-05, + "loss": 2.786, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.7675436294586586e-05, + "loss": 2.8294, + "step": 1380 + }, + { + "epoch": 0.7, + "learning_rate": 4.7658805406123356e-05, + "loss": 2.6776, + "step": 1385 + }, + { + "epoch": 0.7, + "learning_rate": 4.7642118159345544e-05, + "loss": 2.8003, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.762537459575865e-05, + "loss": 2.7796, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.7608574757008245e-05, + "loss": 2.9156, + "step": 1400 + }, + { + "epoch": 0.71, + "learning_rate": 4.7591718684879883e-05, + "loss": 3.0521, + "step": 1405 + }, + { + "epoch": 0.71, + "learning_rate": 4.7574806421298976e-05, + "loss": 2.6469, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.755783800833071e-05, + "loss": 2.8512, + "step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.754081348817991e-05, + "loss": 2.66, + "step": 1420 + }, + { + "epoch": 0.72, + "learning_rate": 4.752373290319096e-05, + "loss": 2.6625, + "step": 1425 + }, + { + "epoch": 0.72, + "learning_rate": 4.7506596295847716e-05, + "loss": 2.6711, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 4.7489403708773346e-05, + "loss": 2.8951, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 4.747215518473026e-05, + "loss": 2.7375, + "step": 1440 + }, + { + "epoch": 0.73, + "learning_rate": 4.745485076662e-05, + "loss": 2.8037, + "step": 1445 + }, + { + "epoch": 0.73, + "learning_rate": 4.743749049748315e-05, + "loss": 2.5375, + "step": 1450 + }, + { + "epoch": 0.73, + "learning_rate": 4.742007442049918e-05, + "loss": 2.7664, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 
4.7402602578986374e-05, + "loss": 2.9644, + "step": 1460 + }, + { + "epoch": 0.74, + "learning_rate": 4.738507501640175e-05, + "loss": 2.8212, + "step": 1465 + }, + { + "epoch": 0.74, + "learning_rate": 4.736749177634087e-05, + "loss": 2.7201, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 4.734985290253782e-05, + "loss": 2.7087, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 4.7332158438865035e-05, + "loss": 2.5502, + "step": 1480 + }, + { + "epoch": 0.75, + "learning_rate": 4.731440842933322e-05, + "loss": 2.7607, + "step": 1485 + }, + { + "epoch": 0.75, + "learning_rate": 4.729660291809126e-05, + "loss": 2.5601, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 4.727874194942606e-05, + "loss": 2.5562, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 4.7260825567762486e-05, + "loss": 2.5539, + "step": 1500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7242853817663204e-05, + "loss": 2.6405, + "step": 1505 + }, + { + "epoch": 0.76, + "learning_rate": 4.72248267438286e-05, + "loss": 2.9237, + "step": 1510 + }, + { + "epoch": 0.76, + "learning_rate": 4.72067443910967e-05, + "loss": 3.0453, + "step": 1515 + }, + { + "epoch": 0.76, + "learning_rate": 4.718860680444297e-05, + "loss": 2.7176, + "step": 1520 + }, + { + "epoch": 0.77, + "learning_rate": 4.71704140289803e-05, + "loss": 2.6713, + "step": 1525 + }, + { + "epoch": 0.77, + "learning_rate": 4.715216610995883e-05, + "loss": 2.7284, + "step": 1530 + }, + { + "epoch": 0.77, + "learning_rate": 4.713386309276585e-05, + "loss": 2.5022, + "step": 1535 + }, + { + "epoch": 0.77, + "learning_rate": 4.7115505022925706e-05, + "loss": 2.8323, + "step": 1540 + }, + { + "epoch": 0.78, + "learning_rate": 4.7097091946099666e-05, + "loss": 2.7005, + "step": 1545 + }, + { + "epoch": 0.78, + "learning_rate": 4.7078623908085825e-05, + "loss": 2.8142, + "step": 1550 + }, + { + "epoch": 0.78, + "learning_rate": 4.7060100954818974e-05, + "loss": 2.8505, + "step": 1555 + }, + { + "epoch": 
0.78, + "learning_rate": 4.70415231323705e-05, + "loss": 2.5892, + "step": 1560 + }, + { + "epoch": 0.79, + "learning_rate": 4.7022890486948236e-05, + "loss": 2.8951, + "step": 1565 + }, + { + "epoch": 0.79, + "learning_rate": 4.700420306489641e-05, + "loss": 2.7551, + "step": 1570 + }, + { + "epoch": 0.79, + "learning_rate": 4.698546091269547e-05, + "loss": 2.6814, + "step": 1575 + }, + { + "epoch": 0.79, + "learning_rate": 4.696666407696201e-05, + "loss": 2.8698, + "step": 1580 + }, + { + "epoch": 0.8, + "learning_rate": 4.694781260444862e-05, + "loss": 2.8389, + "step": 1585 + }, + { + "epoch": 0.8, + "learning_rate": 4.6928906542043786e-05, + "loss": 2.8353, + "step": 1590 + }, + { + "epoch": 0.8, + "learning_rate": 4.690994593677179e-05, + "loss": 2.6565, + "step": 1595 + }, + { + "epoch": 0.8, + "learning_rate": 4.689093083579256e-05, + "loss": 2.6958, + "step": 1600 + }, + { + "epoch": 0.81, + "learning_rate": 4.687186128640157e-05, + "loss": 2.6768, + "step": 1605 + }, + { + "epoch": 0.81, + "learning_rate": 4.685273733602975e-05, + "loss": 2.734, + "step": 1610 + }, + { + "epoch": 0.81, + "learning_rate": 4.6833559032243284e-05, + "loss": 2.5348, + "step": 1615 + }, + { + "epoch": 0.81, + "learning_rate": 4.6814326422743594e-05, + "loss": 2.7775, + "step": 1620 + }, + { + "epoch": 0.82, + "learning_rate": 4.679503955536715e-05, + "loss": 2.6578, + "step": 1625 + }, + { + "epoch": 0.82, + "learning_rate": 4.6775698478085393e-05, + "loss": 2.8792, + "step": 1630 + }, + { + "epoch": 0.82, + "learning_rate": 4.675630323900458e-05, + "loss": 2.7883, + "step": 1635 + }, + { + "epoch": 0.82, + "learning_rate": 4.67368538863657e-05, + "loss": 2.8824, + "step": 1640 + }, + { + "epoch": 0.83, + "learning_rate": 4.671735046854433e-05, + "loss": 2.6775, + "step": 1645 + }, + { + "epoch": 0.83, + "learning_rate": 4.669779303405051e-05, + "loss": 2.5658, + "step": 1650 + }, + { + "epoch": 0.83, + "learning_rate": 4.667818163152864e-05, + "loss": 2.5262, + "step": 1655 + 
}, + { + "epoch": 0.83, + "learning_rate": 4.665851630975736e-05, + "loss": 2.5749, + "step": 1660 + }, + { + "epoch": 0.84, + "learning_rate": 4.6638797117649424e-05, + "loss": 2.8718, + "step": 1665 + }, + { + "epoch": 0.84, + "learning_rate": 4.661902410425155e-05, + "loss": 2.8039, + "step": 1670 + }, + { + "epoch": 0.84, + "learning_rate": 4.659919731874435e-05, + "loss": 2.8089, + "step": 1675 + }, + { + "epoch": 0.84, + "learning_rate": 4.6579316810442174e-05, + "loss": 2.8144, + "step": 1680 + }, + { + "epoch": 0.85, + "learning_rate": 4.6559382628792995e-05, + "loss": 2.5963, + "step": 1685 + }, + { + "epoch": 0.85, + "learning_rate": 4.653939482337828e-05, + "loss": 2.5755, + "step": 1690 + }, + { + "epoch": 0.85, + "learning_rate": 4.651935344391286e-05, + "loss": 2.7631, + "step": 1695 + }, + { + "epoch": 0.85, + "learning_rate": 4.649925854024486e-05, + "loss": 2.8117, + "step": 1700 + }, + { + "epoch": 0.86, + "learning_rate": 4.647911016235549e-05, + "loss": 2.7352, + "step": 1705 + }, + { + "epoch": 0.86, + "learning_rate": 4.6458908360358985e-05, + "loss": 2.5572, + "step": 1710 + }, + { + "epoch": 0.86, + "learning_rate": 4.643865318450246e-05, + "loss": 2.8372, + "step": 1715 + }, + { + "epoch": 0.86, + "learning_rate": 4.6418344685165774e-05, + "loss": 2.7892, + "step": 1720 + }, + { + "epoch": 0.87, + "learning_rate": 4.639798291286143e-05, + "loss": 2.7832, + "step": 1725 + }, + { + "epoch": 0.87, + "learning_rate": 4.637756791823442e-05, + "loss": 2.5401, + "step": 1730 + }, + { + "epoch": 0.87, + "learning_rate": 4.635709975206213e-05, + "loss": 2.9286, + "step": 1735 + }, + { + "epoch": 0.87, + "learning_rate": 4.633657846525417e-05, + "loss": 2.6474, + "step": 1740 + }, + { + "epoch": 0.88, + "learning_rate": 4.6316004108852305e-05, + "loss": 2.5047, + "step": 1745 + }, + { + "epoch": 0.88, + "learning_rate": 4.629537673403029e-05, + "loss": 2.8374, + "step": 1750 + }, + { + "epoch": 0.88, + "learning_rate": 4.627469639209373e-05, + 
"loss": 2.634, + "step": 1755 + }, + { + "epoch": 0.88, + "learning_rate": 4.6253963134480006e-05, + "loss": 2.6884, + "step": 1760 + }, + { + "epoch": 0.89, + "learning_rate": 4.623317701275809e-05, + "loss": 2.6334, + "step": 1765 + }, + { + "epoch": 0.89, + "learning_rate": 4.621233807862844e-05, + "loss": 2.764, + "step": 1770 + }, + { + "epoch": 0.89, + "learning_rate": 4.6191446383922886e-05, + "loss": 3.0864, + "step": 1775 + }, + { + "epoch": 0.89, + "learning_rate": 4.617050198060448e-05, + "loss": 2.7964, + "step": 1780 + }, + { + "epoch": 0.9, + "learning_rate": 4.6149504920767376e-05, + "loss": 2.4538, + "step": 1785 + }, + { + "epoch": 0.9, + "learning_rate": 4.6128455256636706e-05, + "loss": 2.7352, + "step": 1790 + }, + { + "epoch": 0.9, + "learning_rate": 4.6107353040568416e-05, + "loss": 2.6893, + "step": 1795 + }, + { + "epoch": 0.9, + "learning_rate": 4.6086198325049185e-05, + "loss": 2.8609, + "step": 1800 + }, + { + "epoch": 0.91, + "learning_rate": 4.6064991162696275e-05, + "loss": 2.5285, + "step": 1805 + }, + { + "epoch": 0.91, + "learning_rate": 4.604373160625739e-05, + "loss": 2.5707, + "step": 1810 + }, + { + "epoch": 0.91, + "learning_rate": 4.602241970861053e-05, + "loss": 2.7198, + "step": 1815 + }, + { + "epoch": 0.91, + "learning_rate": 4.6001055522763926e-05, + "loss": 2.855, + "step": 1820 + }, + { + "epoch": 0.92, + "learning_rate": 4.597963910185582e-05, + "loss": 2.4175, + "step": 1825 + }, + { + "epoch": 0.92, + "learning_rate": 4.595817049915441e-05, + "loss": 2.5444, + "step": 1830 + }, + { + "epoch": 0.92, + "learning_rate": 4.5936649768057646e-05, + "loss": 2.6893, + "step": 1835 + }, + { + "epoch": 0.92, + "learning_rate": 4.591507696209318e-05, + "loss": 2.6725, + "step": 1840 + }, + { + "epoch": 0.93, + "learning_rate": 4.589345213491817e-05, + "loss": 2.834, + "step": 1845 + }, + { + "epoch": 0.93, + "learning_rate": 4.587177534031914e-05, + "loss": 2.8259, + "step": 1850 + }, + { + "epoch": 0.93, + "learning_rate": 
4.585004663221188e-05, + "loss": 2.6146, + "step": 1855 + }, + { + "epoch": 0.93, + "learning_rate": 4.582826606464134e-05, + "loss": 2.4673, + "step": 1860 + }, + { + "epoch": 0.94, + "learning_rate": 4.5806433691781416e-05, + "loss": 2.9178, + "step": 1865 + }, + { + "epoch": 0.94, + "learning_rate": 4.578454956793487e-05, + "loss": 2.9224, + "step": 1870 + }, + { + "epoch": 0.94, + "learning_rate": 4.576261374753318e-05, + "loss": 2.7987, + "step": 1875 + }, + { + "epoch": 0.94, + "learning_rate": 4.574062628513642e-05, + "loss": 2.3848, + "step": 1880 + }, + { + "epoch": 0.95, + "learning_rate": 4.57185872354331e-05, + "loss": 2.6054, + "step": 1885 + }, + { + "epoch": 0.95, + "learning_rate": 4.569649665324003e-05, + "loss": 2.6743, + "step": 1890 + }, + { + "epoch": 0.95, + "learning_rate": 4.567435459350222e-05, + "loss": 2.9375, + "step": 1895 + }, + { + "epoch": 0.95, + "learning_rate": 4.565216111129269e-05, + "loss": 2.9372, + "step": 1900 + }, + { + "epoch": 0.96, + "learning_rate": 4.562991626181239e-05, + "loss": 2.669, + "step": 1905 + }, + { + "epoch": 0.96, + "learning_rate": 4.560762010039001e-05, + "loss": 2.7644, + "step": 1910 + }, + { + "epoch": 0.96, + "learning_rate": 4.558527268248187e-05, + "loss": 2.6886, + "step": 1915 + }, + { + "epoch": 0.96, + "learning_rate": 4.55628740636718e-05, + "loss": 2.8618, + "step": 1920 + }, + { + "epoch": 0.97, + "learning_rate": 4.554042429967095e-05, + "loss": 2.8411, + "step": 1925 + }, + { + "epoch": 0.97, + "learning_rate": 4.55179234463177e-05, + "loss": 2.6047, + "step": 1930 + }, + { + "epoch": 0.97, + "learning_rate": 4.5495371559577496e-05, + "loss": 2.8675, + "step": 1935 + }, + { + "epoch": 0.97, + "learning_rate": 4.547276869554271e-05, + "loss": 2.751, + "step": 1940 + }, + { + "epoch": 0.98, + "learning_rate": 4.545011491043253e-05, + "loss": 2.8638, + "step": 1945 + }, + { + "epoch": 0.98, + "learning_rate": 4.5427410260592775e-05, + "loss": 2.5092, + "step": 1950 + }, + { + "epoch": 0.98, 
+ "learning_rate": 4.540465480249579e-05, + "loss": 2.5059, + "step": 1955 + }, + { + "epoch": 0.98, + "learning_rate": 4.5381848592740285e-05, + "loss": 2.9433, + "step": 1960 + }, + { + "epoch": 0.99, + "learning_rate": 4.535899168805121e-05, + "loss": 2.6959, + "step": 1965 + }, + { + "epoch": 0.99, + "learning_rate": 4.533608414527961e-05, + "loss": 2.552, + "step": 1970 + }, + { + "epoch": 0.99, + "learning_rate": 4.5313126021402465e-05, + "loss": 2.7169, + "step": 1975 + }, + { + "epoch": 0.99, + "learning_rate": 4.529011737352258e-05, + "loss": 2.5672, + "step": 1980 + }, + { + "epoch": 1.0, + "learning_rate": 4.526705825886841e-05, + "loss": 2.6434, + "step": 1985 + }, + { + "epoch": 1.0, + "learning_rate": 4.5243948734793947e-05, + "loss": 2.8062, + "step": 1990 + }, + { + "epoch": 1.0, + "learning_rate": 4.5220788858778556e-05, + "loss": 2.6929, + "step": 1995 + }, + { + "epoch": 1.0, + "learning_rate": 4.519757868842684e-05, + "loss": 2.5837, + "step": 2000 + }, + { + "epoch": 1.01, + "learning_rate": 4.517431828146852e-05, + "loss": 2.821, + "step": 2005 + }, + { + "epoch": 1.01, + "learning_rate": 4.515100769575824e-05, + "loss": 2.7913, + "step": 2010 + }, + { + "epoch": 1.01, + "learning_rate": 4.512764698927545e-05, + "loss": 2.699, + "step": 2015 + }, + { + "epoch": 1.01, + "learning_rate": 4.5104236220124286e-05, + "loss": 2.6574, + "step": 2020 + }, + { + "epoch": 1.02, + "learning_rate": 4.508077544653338e-05, + "loss": 2.5909, + "step": 2025 + }, + { + "epoch": 1.02, + "learning_rate": 4.5057264726855765e-05, + "loss": 2.5695, + "step": 2030 + }, + { + "epoch": 1.02, + "learning_rate": 4.5033704119568675e-05, + "loss": 2.4937, + "step": 2035 + }, + { + "epoch": 1.02, + "learning_rate": 4.501009368327344e-05, + "loss": 2.7253, + "step": 2040 + }, + { + "epoch": 1.03, + "learning_rate": 4.4986433476695334e-05, + "loss": 2.8681, + "step": 2045 + }, + { + "epoch": 1.03, + "learning_rate": 4.496272355868341e-05, + "loss": 2.74, + "step": 2050 + }, + 
{ + "epoch": 1.03, + "learning_rate": 4.4938963988210365e-05, + "loss": 2.7439, + "step": 2055 + }, + { + "epoch": 1.03, + "learning_rate": 4.491515482437242e-05, + "loss": 2.9539, + "step": 2060 + }, + { + "epoch": 1.04, + "learning_rate": 4.4891296126389104e-05, + "loss": 2.8165, + "step": 2065 + }, + { + "epoch": 1.04, + "learning_rate": 4.48673879536032e-05, + "loss": 2.601, + "step": 2070 + }, + { + "epoch": 1.04, + "learning_rate": 4.484343036548051e-05, + "loss": 2.5189, + "step": 2075 + }, + { + "epoch": 1.04, + "learning_rate": 4.481942342160976e-05, + "loss": 2.6276, + "step": 2080 + }, + { + "epoch": 1.05, + "learning_rate": 4.479536718170243e-05, + "loss": 2.5027, + "step": 2085 + }, + { + "epoch": 1.05, + "learning_rate": 4.477126170559262e-05, + "loss": 2.8654, + "step": 2090 + }, + { + "epoch": 1.05, + "learning_rate": 4.474710705323688e-05, + "loss": 2.8511, + "step": 2095 + }, + { + "epoch": 1.05, + "learning_rate": 4.47229032847141e-05, + "loss": 2.6129, + "step": 2100 + }, + { + "epoch": 1.06, + "learning_rate": 4.469865046022531e-05, + "loss": 2.8964, + "step": 2105 + }, + { + "epoch": 1.06, + "learning_rate": 4.4674348640093554e-05, + "loss": 2.7502, + "step": 2110 + }, + { + "epoch": 1.06, + "learning_rate": 4.4649997884763765e-05, + "loss": 2.591, + "step": 2115 + }, + { + "epoch": 1.06, + "learning_rate": 4.462559825480257e-05, + "loss": 2.7982, + "step": 2120 + }, + { + "epoch": 1.07, + "learning_rate": 4.460114981089815e-05, + "loss": 2.576, + "step": 2125 + }, + { + "epoch": 1.07, + "learning_rate": 4.457665261386014e-05, + "loss": 2.692, + "step": 2130 + }, + { + "epoch": 1.07, + "learning_rate": 4.455210672461938e-05, + "loss": 2.5818, + "step": 2135 + }, + { + "epoch": 1.07, + "learning_rate": 4.452751220422787e-05, + "loss": 2.6792, + "step": 2140 + }, + { + "epoch": 1.08, + "learning_rate": 4.450286911385856e-05, + "loss": 2.8352, + "step": 2145 + }, + { + "epoch": 1.08, + "learning_rate": 4.4478177514805166e-05, + "loss": 2.5787, + 
"step": 2150 + }, + { + "epoch": 1.08, + "learning_rate": 4.4453437468482103e-05, + "loss": 2.655, + "step": 2155 + }, + { + "epoch": 1.08, + "learning_rate": 4.442864903642428e-05, + "loss": 2.7664, + "step": 2160 + }, + { + "epoch": 1.09, + "learning_rate": 4.440381228028692e-05, + "loss": 2.7231, + "step": 2165 + }, + { + "epoch": 1.09, + "learning_rate": 4.437892726184548e-05, + "loss": 2.416, + "step": 2170 + }, + { + "epoch": 1.09, + "learning_rate": 4.4353994042995446e-05, + "loss": 2.4619, + "step": 2175 + }, + { + "epoch": 1.09, + "learning_rate": 4.4329012685752183e-05, + "loss": 2.7816, + "step": 2180 + }, + { + "epoch": 1.1, + "learning_rate": 4.430398325225078e-05, + "loss": 2.8678, + "step": 2185 + }, + { + "epoch": 1.1, + "learning_rate": 4.427890580474594e-05, + "loss": 2.6007, + "step": 2190 + }, + { + "epoch": 1.1, + "learning_rate": 4.4253780405611754e-05, + "loss": 2.8195, + "step": 2195 + }, + { + "epoch": 1.1, + "learning_rate": 4.42286071173416e-05, + "loss": 2.5117, + "step": 2200 + }, + { + "epoch": 1.11, + "learning_rate": 4.4203386002547956e-05, + "loss": 2.5527, + "step": 2205 + }, + { + "epoch": 1.11, + "learning_rate": 4.417811712396226e-05, + "loss": 2.662, + "step": 2210 + }, + { + "epoch": 1.11, + "learning_rate": 4.415280054443477e-05, + "loss": 2.7563, + "step": 2215 + }, + { + "epoch": 1.11, + "learning_rate": 4.4127436326934354e-05, + "loss": 2.5118, + "step": 2220 + }, + { + "epoch": 1.12, + "learning_rate": 4.41020245345484e-05, + "loss": 2.7208, + "step": 2225 + }, + { + "epoch": 1.12, + "learning_rate": 4.4076565230482607e-05, + "loss": 2.649, + "step": 2230 + }, + { + "epoch": 1.12, + "learning_rate": 4.4051058478060856e-05, + "loss": 2.652, + "step": 2235 + }, + { + "epoch": 1.12, + "learning_rate": 4.402550434072505e-05, + "loss": 2.6885, + "step": 2240 + }, + { + "epoch": 1.13, + "learning_rate": 4.3999902882034935e-05, + "loss": 2.8545, + "step": 2245 + }, + { + "epoch": 1.13, + "learning_rate": 4.397425416566797e-05, + 
"loss": 2.9435, + "step": 2250 + }, + { + "epoch": 1.13, + "learning_rate": 4.3948558255419146e-05, + "loss": 2.6555, + "step": 2255 + }, + { + "epoch": 1.13, + "learning_rate": 4.392281521520085e-05, + "loss": 2.6108, + "step": 2260 + }, + { + "epoch": 1.14, + "learning_rate": 4.389702510904269e-05, + "loss": 2.6256, + "step": 2265 + }, + { + "epoch": 1.14, + "learning_rate": 4.387118800109133e-05, + "loss": 2.7996, + "step": 2270 + }, + { + "epoch": 1.14, + "learning_rate": 4.384530395561035e-05, + "loss": 2.6649, + "step": 2275 + }, + { + "epoch": 1.14, + "learning_rate": 4.381937303698006e-05, + "loss": 3.0073, + "step": 2280 + }, + { + "epoch": 1.15, + "learning_rate": 4.379339530969738e-05, + "loss": 2.8493, + "step": 2285 + }, + { + "epoch": 1.15, + "learning_rate": 4.3767370838375635e-05, + "loss": 2.5572, + "step": 2290 + }, + { + "epoch": 1.15, + "learning_rate": 4.374129968774443e-05, + "loss": 2.3837, + "step": 2295 + }, + { + "epoch": 1.15, + "learning_rate": 4.371518192264946e-05, + "loss": 2.4604, + "step": 2300 + }, + { + "epoch": 1.16, + "learning_rate": 4.3689017608052374e-05, + "loss": 2.8866, + "step": 2305 + }, + { + "epoch": 1.16, + "learning_rate": 4.3662806809030585e-05, + "loss": 2.8943, + "step": 2310 + }, + { + "epoch": 1.16, + "learning_rate": 4.3636549590777144e-05, + "loss": 2.7629, + "step": 2315 + }, + { + "epoch": 1.16, + "learning_rate": 4.361024601860054e-05, + "loss": 2.8629, + "step": 2320 + }, + { + "epoch": 1.17, + "learning_rate": 4.3583896157924574e-05, + "loss": 2.7952, + "step": 2325 + }, + { + "epoch": 1.17, + "learning_rate": 4.355750007428817e-05, + "loss": 2.2057, + "step": 2330 + }, + { + "epoch": 1.17, + "learning_rate": 4.3531057833345216e-05, + "loss": 2.9143, + "step": 2335 + }, + { + "epoch": 1.17, + "learning_rate": 4.3504569500864424e-05, + "loss": 2.5354, + "step": 2340 + }, + { + "epoch": 1.18, + "learning_rate": 4.347803514272911e-05, + "loss": 2.7451, + "step": 2345 + }, + { + "epoch": 1.18, + 
"learning_rate": 4.3451454824937113e-05, + "loss": 3.0827, + "step": 2350 + }, + { + "epoch": 1.18, + "learning_rate": 4.3424828613600555e-05, + "loss": 2.5842, + "step": 2355 + }, + { + "epoch": 1.18, + "learning_rate": 4.339815657494572e-05, + "loss": 2.4492, + "step": 2360 + }, + { + "epoch": 1.19, + "learning_rate": 4.3371438775312865e-05, + "loss": 2.5067, + "step": 2365 + }, + { + "epoch": 1.19, + "learning_rate": 4.334467528115608e-05, + "loss": 2.5902, + "step": 2370 + }, + { + "epoch": 1.19, + "learning_rate": 4.33178661590431e-05, + "loss": 2.8618, + "step": 2375 + }, + { + "epoch": 1.19, + "learning_rate": 4.329101147565515e-05, + "loss": 2.6831, + "step": 2380 + }, + { + "epoch": 1.2, + "learning_rate": 4.3264111297786794e-05, + "loss": 2.7531, + "step": 2385 + }, + { + "epoch": 1.2, + "learning_rate": 4.323716569234572e-05, + "loss": 2.819, + "step": 2390 + }, + { + "epoch": 1.2, + "learning_rate": 4.321017472635263e-05, + "loss": 2.6319, + "step": 2395 + }, + { + "epoch": 1.2, + "learning_rate": 4.318313846694105e-05, + "loss": 2.3078, + "step": 2400 + }, + { + "epoch": 1.21, + "learning_rate": 4.315605698135714e-05, + "loss": 2.7962, + "step": 2405 + }, + { + "epoch": 1.21, + "learning_rate": 4.312893033695958e-05, + "loss": 2.8479, + "step": 2410 + }, + { + "epoch": 1.21, + "learning_rate": 4.310175860121936e-05, + "loss": 2.6289, + "step": 2415 + }, + { + "epoch": 1.21, + "learning_rate": 4.30745418417196e-05, + "loss": 2.7003, + "step": 2420 + }, + { + "epoch": 1.22, + "learning_rate": 4.304728012615543e-05, + "loss": 2.4947, + "step": 2425 + }, + { + "epoch": 1.22, + "learning_rate": 4.3019973522333815e-05, + "loss": 2.6911, + "step": 2430 + }, + { + "epoch": 1.22, + "learning_rate": 4.2992622098173334e-05, + "loss": 2.6931, + "step": 2435 + }, + { + "epoch": 1.22, + "learning_rate": 4.296522592170406e-05, + "loss": 2.774, + "step": 2440 + }, + { + "epoch": 1.23, + "learning_rate": 4.293778506106737e-05, + "loss": 2.759, + "step": 2445 + }, + { + 
"epoch": 1.23, + "learning_rate": 4.29102995845158e-05, + "loss": 2.5543, + "step": 2450 + }, + { + "epoch": 1.23, + "learning_rate": 4.288276956041284e-05, + "loss": 2.7702, + "step": 2455 + }, + { + "epoch": 1.23, + "learning_rate": 4.285519505723278e-05, + "loss": 2.835, + "step": 2460 + }, + { + "epoch": 1.24, + "learning_rate": 4.282757614356055e-05, + "loss": 2.6516, + "step": 2465 + }, + { + "epoch": 1.24, + "learning_rate": 4.2799912888091544e-05, + "loss": 2.7392, + "step": 2470 + }, + { + "epoch": 1.24, + "learning_rate": 4.277220535963143e-05, + "loss": 2.6522, + "step": 2475 + }, + { + "epoch": 1.24, + "learning_rate": 4.274445362709601e-05, + "loss": 2.6514, + "step": 2480 + }, + { + "epoch": 1.25, + "learning_rate": 4.271665775951104e-05, + "loss": 2.7507, + "step": 2485 + }, + { + "epoch": 1.25, + "learning_rate": 4.2688817826012005e-05, + "loss": 3.0499, + "step": 2490 + }, + { + "epoch": 1.25, + "learning_rate": 4.2660933895844055e-05, + "loss": 2.6927, + "step": 2495 + }, + { + "epoch": 1.25, + "learning_rate": 4.2633006038361736e-05, + "loss": 2.8456, + "step": 2500 + }, + { + "epoch": 1.26, + "learning_rate": 4.2605034323028844e-05, + "loss": 2.671, + "step": 2505 + }, + { + "epoch": 1.26, + "learning_rate": 4.2577018819418296e-05, + "loss": 2.6543, + "step": 2510 + }, + { + "epoch": 1.26, + "learning_rate": 4.254895959721189e-05, + "loss": 2.566, + "step": 2515 + }, + { + "epoch": 1.26, + "learning_rate": 4.252085672620019e-05, + "loss": 2.4943, + "step": 2520 + }, + { + "epoch": 1.27, + "learning_rate": 4.249271027628228e-05, + "loss": 2.5115, + "step": 2525 + }, + { + "epoch": 1.27, + "learning_rate": 4.24645203174657e-05, + "loss": 2.3859, + "step": 2530 + }, + { + "epoch": 1.27, + "learning_rate": 4.243628691986617e-05, + "loss": 2.8148, + "step": 2535 + }, + { + "epoch": 1.27, + "learning_rate": 4.240801015370743e-05, + "loss": 2.5597, + "step": 2540 + }, + { + "epoch": 1.28, + "learning_rate": 4.2379690089321145e-05, + "loss": 2.805, + 
"step": 2545 + }, + { + "epoch": 1.28, + "learning_rate": 4.235132679714664e-05, + "loss": 2.7308, + "step": 2550 + }, + { + "epoch": 1.28, + "learning_rate": 4.232292034773076e-05, + "loss": 2.675, + "step": 2555 + }, + { + "epoch": 1.28, + "learning_rate": 4.2294470811727704e-05, + "loss": 2.6359, + "step": 2560 + }, + { + "epoch": 1.29, + "learning_rate": 4.226597825989883e-05, + "loss": 2.4288, + "step": 2565 + }, + { + "epoch": 1.29, + "learning_rate": 4.223744276311249e-05, + "loss": 2.642, + "step": 2570 + }, + { + "epoch": 1.29, + "learning_rate": 4.220886439234385e-05, + "loss": 2.7375, + "step": 2575 + }, + { + "epoch": 1.29, + "learning_rate": 4.218024321867472e-05, + "loss": 2.6114, + "step": 2580 + }, + { + "epoch": 1.3, + "learning_rate": 4.2151579313293364e-05, + "loss": 2.4941, + "step": 2585 + }, + { + "epoch": 1.3, + "learning_rate": 4.212287274749434e-05, + "loss": 2.6086, + "step": 2590 + }, + { + "epoch": 1.3, + "learning_rate": 4.2094123592678295e-05, + "loss": 2.7854, + "step": 2595 + }, + { + "epoch": 1.3, + "learning_rate": 4.206533192035184e-05, + "loss": 2.8291, + "step": 2600 + }, + { + "epoch": 1.31, + "learning_rate": 4.2036497802127294e-05, + "loss": 2.6846, + "step": 2605 + }, + { + "epoch": 1.31, + "learning_rate": 4.200762130972259e-05, + "loss": 2.6667, + "step": 2610 + }, + { + "epoch": 1.31, + "learning_rate": 4.197870251496103e-05, + "loss": 2.7895, + "step": 2615 + }, + { + "epoch": 1.31, + "learning_rate": 4.1949741489771155e-05, + "loss": 2.6958, + "step": 2620 + }, + { + "epoch": 1.32, + "learning_rate": 4.192073830618652e-05, + "loss": 2.8536, + "step": 2625 + }, + { + "epoch": 1.32, + "learning_rate": 4.189169303634555e-05, + "loss": 2.5218, + "step": 2630 + }, + { + "epoch": 1.32, + "learning_rate": 4.186260575249136e-05, + "loss": 2.6141, + "step": 2635 + }, + { + "epoch": 1.32, + "learning_rate": 4.1833476526971546e-05, + "loss": 2.6231, + "step": 2640 + }, + { + "epoch": 1.33, + "learning_rate": 
4.1804305432238036e-05, + "loss": 2.8229, + "step": 2645 + }, + { + "epoch": 1.33, + "learning_rate": 4.1775092540846885e-05, + "loss": 2.4763, + "step": 2650 + }, + { + "epoch": 1.33, + "learning_rate": 4.174583792545813e-05, + "loss": 2.5162, + "step": 2655 + }, + { + "epoch": 1.33, + "learning_rate": 4.1716541658835574e-05, + "loss": 2.7301, + "step": 2660 + }, + { + "epoch": 1.34, + "learning_rate": 4.16872038138466e-05, + "loss": 2.6055, + "step": 2665 + }, + { + "epoch": 1.34, + "learning_rate": 4.165782446346203e-05, + "loss": 2.6529, + "step": 2670 + }, + { + "epoch": 1.34, + "learning_rate": 4.162840368075591e-05, + "loss": 2.7934, + "step": 2675 + }, + { + "epoch": 1.34, + "learning_rate": 4.159894153890536e-05, + "loss": 2.9108, + "step": 2680 + }, + { + "epoch": 1.35, + "learning_rate": 4.1569438111190326e-05, + "loss": 2.6543, + "step": 2685 + }, + { + "epoch": 1.35, + "learning_rate": 4.1539893470993496e-05, + "loss": 2.6038, + "step": 2690 + }, + { + "epoch": 1.35, + "learning_rate": 4.151030769180002e-05, + "loss": 2.5741, + "step": 2695 + }, + { + "epoch": 1.35, + "learning_rate": 4.14806808471974e-05, + "loss": 2.8525, + "step": 2700 + }, + { + "epoch": 1.36, + "learning_rate": 4.1451013010875275e-05, + "loss": 2.5151, + "step": 2705 + }, + { + "epoch": 1.36, + "learning_rate": 4.1421304256625206e-05, + "loss": 2.5666, + "step": 2710 + }, + { + "epoch": 1.36, + "learning_rate": 4.139155465834058e-05, + "loss": 2.7287, + "step": 2715 + }, + { + "epoch": 1.36, + "learning_rate": 4.136176429001634e-05, + "loss": 2.8867, + "step": 2720 + }, + { + "epoch": 1.37, + "learning_rate": 4.133193322574885e-05, + "loss": 2.6398, + "step": 2725 + }, + { + "epoch": 1.37, + "learning_rate": 4.130206153973568e-05, + "loss": 2.699, + "step": 2730 + }, + { + "epoch": 1.37, + "learning_rate": 4.1272149306275447e-05, + "loss": 2.5248, + "step": 2735 + }, + { + "epoch": 1.37, + "learning_rate": 4.124219659976762e-05, + "loss": 2.8947, + "step": 2740 + }, + { + "epoch": 
1.38, + "learning_rate": 4.1212203494712344e-05, + "loss": 2.6473, + "step": 2745 + }, + { + "epoch": 1.38, + "learning_rate": 4.1182170065710226e-05, + "loss": 2.3734, + "step": 2750 + }, + { + "epoch": 1.38, + "learning_rate": 4.115209638746218e-05, + "loss": 2.685, + "step": 2755 + }, + { + "epoch": 1.39, + "learning_rate": 4.1121982534769224e-05, + "loss": 2.9712, + "step": 2760 + }, + { + "epoch": 1.39, + "learning_rate": 4.109182858253231e-05, + "loss": 2.6578, + "step": 2765 + }, + { + "epoch": 1.39, + "learning_rate": 4.106163460575212e-05, + "loss": 2.6842, + "step": 2770 + }, + { + "epoch": 1.39, + "learning_rate": 4.10314006795289e-05, + "loss": 2.511, + "step": 2775 + }, + { + "epoch": 1.4, + "learning_rate": 4.100112687906224e-05, + "loss": 2.8426, + "step": 2780 + }, + { + "epoch": 1.4, + "learning_rate": 4.097081327965092e-05, + "loss": 2.8462, + "step": 2785 + }, + { + "epoch": 1.4, + "learning_rate": 4.094045995669271e-05, + "loss": 2.7552, + "step": 2790 + }, + { + "epoch": 1.4, + "learning_rate": 4.091006698568419e-05, + "loss": 2.3802, + "step": 2795 + }, + { + "epoch": 1.41, + "learning_rate": 4.087963444222054e-05, + "loss": 2.3967, + "step": 2800 + }, + { + "epoch": 1.41, + "learning_rate": 4.084916240199537e-05, + "loss": 2.7945, + "step": 2805 + }, + { + "epoch": 1.41, + "learning_rate": 4.0818650940800524e-05, + "loss": 2.5304, + "step": 2810 + }, + { + "epoch": 1.41, + "learning_rate": 4.0788100134525925e-05, + "loss": 2.6779, + "step": 2815 + }, + { + "epoch": 1.42, + "learning_rate": 4.075751005915932e-05, + "loss": 2.6202, + "step": 2820 + }, + { + "epoch": 1.42, + "learning_rate": 4.072688079078616e-05, + "loss": 2.5915, + "step": 2825 + }, + { + "epoch": 1.42, + "learning_rate": 4.069621240558935e-05, + "loss": 2.5139, + "step": 2830 + }, + { + "epoch": 1.42, + "learning_rate": 4.066550497984911e-05, + "loss": 2.5506, + "step": 2835 + }, + { + "epoch": 1.43, + "learning_rate": 4.063475858994276e-05, + "loss": 2.7154, + "step": 2840 + 
}, + { + "epoch": 1.43, + "learning_rate": 4.060397331234452e-05, + "loss": 2.6309, + "step": 2845 + }, + { + "epoch": 1.43, + "learning_rate": 4.0573149223625365e-05, + "loss": 2.6922, + "step": 2850 + }, + { + "epoch": 1.43, + "learning_rate": 4.054228640045276e-05, + "loss": 2.703, + "step": 2855 + }, + { + "epoch": 1.44, + "learning_rate": 4.051138491959053e-05, + "loss": 2.3475, + "step": 2860 + }, + { + "epoch": 1.44, + "learning_rate": 4.048044485789869e-05, + "loss": 2.7186, + "step": 2865 + }, + { + "epoch": 1.44, + "learning_rate": 4.044946629233316e-05, + "loss": 2.4667, + "step": 2870 + }, + { + "epoch": 1.44, + "learning_rate": 4.041844929994566e-05, + "loss": 2.562, + "step": 2875 + }, + { + "epoch": 1.45, + "learning_rate": 4.038739395788347e-05, + "loss": 2.4116, + "step": 2880 + }, + { + "epoch": 1.45, + "learning_rate": 4.0356300343389276e-05, + "loss": 2.6133, + "step": 2885 + }, + { + "epoch": 1.45, + "learning_rate": 4.032516853380094e-05, + "loss": 2.6893, + "step": 2890 + }, + { + "epoch": 1.45, + "learning_rate": 4.029399860655132e-05, + "loss": 2.8196, + "step": 2895 + }, + { + "epoch": 1.46, + "learning_rate": 4.026279063916811e-05, + "loss": 2.8933, + "step": 2900 + }, + { + "epoch": 1.46, + "learning_rate": 4.023154470927361e-05, + "loss": 3.0289, + "step": 2905 + }, + { + "epoch": 1.46, + "learning_rate": 4.0200260894584516e-05, + "loss": 2.5586, + "step": 2910 + }, + { + "epoch": 1.46, + "learning_rate": 4.016893927291179e-05, + "loss": 2.785, + "step": 2915 + }, + { + "epoch": 1.47, + "learning_rate": 4.0137579922160395e-05, + "loss": 2.5293, + "step": 2920 + }, + { + "epoch": 1.47, + "learning_rate": 4.010618292032917e-05, + "loss": 2.6696, + "step": 2925 + }, + { + "epoch": 1.47, + "learning_rate": 4.007474834551058e-05, + "loss": 2.7003, + "step": 2930 + }, + { + "epoch": 1.47, + "learning_rate": 4.004327627589056e-05, + "loss": 2.6005, + "step": 2935 + }, + { + "epoch": 1.48, + "learning_rate": 4.001176678974828e-05, + "loss": 
2.7198, + "step": 2940 + }, + { + "epoch": 1.48, + "learning_rate": 3.998021996545599e-05, + "loss": 2.5329, + "step": 2945 + }, + { + "epoch": 1.48, + "learning_rate": 3.994863588147881e-05, + "loss": 2.7051, + "step": 2950 + }, + { + "epoch": 1.48, + "learning_rate": 3.9917014616374535e-05, + "loss": 2.9277, + "step": 2955 + }, + { + "epoch": 1.49, + "learning_rate": 3.988535624879344e-05, + "loss": 2.3729, + "step": 2960 + }, + { + "epoch": 1.49, + "learning_rate": 3.985366085747808e-05, + "loss": 2.5122, + "step": 2965 + }, + { + "epoch": 1.49, + "learning_rate": 3.982192852126309e-05, + "loss": 2.4413, + "step": 2970 + }, + { + "epoch": 1.49, + "learning_rate": 3.979015931907501e-05, + "loss": 2.6104, + "step": 2975 + }, + { + "epoch": 1.5, + "learning_rate": 3.975835332993207e-05, + "loss": 2.8855, + "step": 2980 + }, + { + "epoch": 1.5, + "learning_rate": 3.9726510632944e-05, + "loss": 2.7487, + "step": 2985 + }, + { + "epoch": 1.5, + "learning_rate": 3.969463130731183e-05, + "loss": 2.7085, + "step": 2990 + }, + { + "epoch": 1.5, + "learning_rate": 3.966271543232769e-05, + "loss": 2.6464, + "step": 2995 + }, + { + "epoch": 1.51, + "learning_rate": 3.9630763087374625e-05, + "loss": 2.4105, + "step": 3000 + }, + { + "epoch": 1.51, + "learning_rate": 3.9598774351926394e-05, + "loss": 2.7296, + "step": 3005 + }, + { + "epoch": 1.51, + "learning_rate": 3.956674930554725e-05, + "loss": 2.5319, + "step": 3010 + }, + { + "epoch": 1.51, + "learning_rate": 3.9534688027891785e-05, + "loss": 2.5544, + "step": 3015 + }, + { + "epoch": 1.52, + "learning_rate": 3.9502590598704696e-05, + "loss": 2.7981, + "step": 3020 + }, + { + "epoch": 1.52, + "learning_rate": 3.94704570978206e-05, + "loss": 2.6186, + "step": 3025 + }, + { + "epoch": 1.52, + "learning_rate": 3.943828760516382e-05, + "loss": 2.6443, + "step": 3030 + }, + { + "epoch": 1.52, + "learning_rate": 3.940608220074822e-05, + "loss": 2.4564, + "step": 3035 + }, + { + "epoch": 1.53, + "learning_rate": 
3.9373840964676976e-05, + "loss": 2.8394, + "step": 3040 + }, + { + "epoch": 1.53, + "learning_rate": 3.934156397714238e-05, + "loss": 2.3683, + "step": 3045 + }, + { + "epoch": 1.53, + "learning_rate": 3.930925131842567e-05, + "loss": 2.875, + "step": 3050 + }, + { + "epoch": 1.53, + "learning_rate": 3.9276903068896784e-05, + "loss": 2.6381, + "step": 3055 + }, + { + "epoch": 1.54, + "learning_rate": 3.9244519309014206e-05, + "loss": 2.8077, + "step": 3060 + }, + { + "epoch": 1.54, + "learning_rate": 3.9212100119324706e-05, + "loss": 2.6208, + "step": 3065 + }, + { + "epoch": 1.54, + "learning_rate": 3.917964558046322e-05, + "loss": 2.9, + "step": 3070 + }, + { + "epoch": 1.54, + "learning_rate": 3.9147155773152586e-05, + "loss": 2.5678, + "step": 3075 + }, + { + "epoch": 1.55, + "learning_rate": 3.911463077820336e-05, + "loss": 2.7059, + "step": 3080 + }, + { + "epoch": 1.55, + "learning_rate": 3.90820706765136e-05, + "loss": 2.7433, + "step": 3085 + }, + { + "epoch": 1.55, + "learning_rate": 3.9049475549068757e-05, + "loss": 2.5916, + "step": 3090 + }, + { + "epoch": 1.55, + "learning_rate": 3.901684547694132e-05, + "loss": 2.6978, + "step": 3095 + }, + { + "epoch": 1.56, + "learning_rate": 3.8984180541290724e-05, + "loss": 2.8709, + "step": 3100 + }, + { + "epoch": 1.56, + "learning_rate": 3.895148082336313e-05, + "loss": 2.8652, + "step": 3105 + }, + { + "epoch": 1.56, + "learning_rate": 3.89187464044912e-05, + "loss": 3.0331, + "step": 3110 + }, + { + "epoch": 1.56, + "learning_rate": 3.8885977366093905e-05, + "loss": 2.7334, + "step": 3115 + }, + { + "epoch": 1.57, + "learning_rate": 3.885317378967633e-05, + "loss": 2.6026, + "step": 3120 + }, + { + "epoch": 1.57, + "learning_rate": 3.882033575682945e-05, + "loss": 2.831, + "step": 3125 + }, + { + "epoch": 1.57, + "learning_rate": 3.878746334922996e-05, + "loss": 2.5585, + "step": 3130 + }, + { + "epoch": 1.57, + "learning_rate": 3.875455664864005e-05, + "loss": 2.762, + "step": 3135 + }, + { + "epoch": 
1.58, + "learning_rate": 3.8721615736907205e-05, + "loss": 2.769, + "step": 3140 + }, + { + "epoch": 1.58, + "learning_rate": 3.868864069596399e-05, + "loss": 2.6496, + "step": 3145 + }, + { + "epoch": 1.58, + "learning_rate": 3.8655631607827876e-05, + "loss": 2.486, + "step": 3150 + }, + { + "epoch": 1.58, + "learning_rate": 3.8622588554601e-05, + "loss": 2.8387, + "step": 3155 + }, + { + "epoch": 1.59, + "learning_rate": 3.858951161847001e-05, + "loss": 2.816, + "step": 3160 + }, + { + "epoch": 1.59, + "learning_rate": 3.855640088170579e-05, + "loss": 2.8128, + "step": 3165 + }, + { + "epoch": 1.59, + "learning_rate": 3.8523256426663313e-05, + "loss": 2.5875, + "step": 3170 + }, + { + "epoch": 1.59, + "learning_rate": 3.8490078335781423e-05, + "loss": 2.5853, + "step": 3175 + }, + { + "epoch": 1.6, + "learning_rate": 3.845686669158263e-05, + "loss": 2.7638, + "step": 3180 + }, + { + "epoch": 1.6, + "learning_rate": 3.842362157667287e-05, + "loss": 2.8281, + "step": 3185 + }, + { + "epoch": 1.6, + "learning_rate": 3.839700144139754e-05, + "loss": 2.3574, + "step": 3190 + }, + { + "epoch": 1.6, + "learning_rate": 3.836369628764067e-05, + "loss": 2.5533, + "step": 3195 + }, + { + "epoch": 1.61, + "learning_rate": 3.833035789491177e-05, + "loss": 2.4513, + "step": 3200 + }, + { + "epoch": 1.61, + "learning_rate": 3.8296986346132036e-05, + "loss": 2.567, + "step": 3205 + }, + { + "epoch": 1.61, + "learning_rate": 3.826358172430516e-05, + "loss": 2.6501, + "step": 3210 + }, + { + "epoch": 1.61, + "learning_rate": 3.823014411251708e-05, + "loss": 2.6927, + "step": 3215 + }, + { + "epoch": 1.62, + "learning_rate": 3.819667359393578e-05, + "loss": 2.6094, + "step": 3220 + }, + { + "epoch": 1.62, + "learning_rate": 3.816317025181111e-05, + "loss": 2.6776, + "step": 3225 + }, + { + "epoch": 1.62, + "learning_rate": 3.8129634169474563e-05, + "loss": 2.5833, + "step": 3230 + }, + { + "epoch": 1.62, + "learning_rate": 3.809606543033905e-05, + "loss": 2.6483, + "step": 3235 + 
}, + { + "epoch": 1.63, + "learning_rate": 3.8062464117898724e-05, + "loss": 2.4814, + "step": 3240 + }, + { + "epoch": 1.63, + "learning_rate": 3.802883031572874e-05, + "loss": 2.5217, + "step": 3245 + }, + { + "epoch": 1.63, + "learning_rate": 3.799516410748506e-05, + "loss": 2.5127, + "step": 3250 + }, + { + "epoch": 1.63, + "learning_rate": 3.796146557690428e-05, + "loss": 2.9325, + "step": 3255 + }, + { + "epoch": 1.64, + "learning_rate": 3.792773480780335e-05, + "loss": 2.4567, + "step": 3260 + }, + { + "epoch": 1.64, + "learning_rate": 3.789397188407944e-05, + "loss": 2.6045, + "step": 3265 + }, + { + "epoch": 1.64, + "learning_rate": 3.786017688970967e-05, + "loss": 2.5031, + "step": 3270 + }, + { + "epoch": 1.64, + "learning_rate": 3.782634990875094e-05, + "loss": 2.7692, + "step": 3275 + }, + { + "epoch": 1.65, + "learning_rate": 3.779249102533972e-05, + "loss": 2.6893, + "step": 3280 + }, + { + "epoch": 1.65, + "learning_rate": 3.7758600323691806e-05, + "loss": 2.604, + "step": 3285 + }, + { + "epoch": 1.65, + "learning_rate": 3.7724677888102145e-05, + "loss": 2.6633, + "step": 3290 + }, + { + "epoch": 1.65, + "learning_rate": 3.769072380294463e-05, + "loss": 2.4804, + "step": 3295 + }, + { + "epoch": 1.66, + "learning_rate": 3.765673815267184e-05, + "loss": 2.5951, + "step": 3300 + }, + { + "epoch": 1.66, + "learning_rate": 3.76227210218149e-05, + "loss": 2.598, + "step": 3305 + }, + { + "epoch": 1.66, + "learning_rate": 3.758867249498321e-05, + "loss": 2.8652, + "step": 3310 + }, + { + "epoch": 1.66, + "learning_rate": 3.7554592656864285e-05, + "loss": 2.5312, + "step": 3315 + }, + { + "epoch": 1.67, + "learning_rate": 3.752048159222349e-05, + "loss": 2.7432, + "step": 3320 + }, + { + "epoch": 1.67, + "learning_rate": 3.748633938590388e-05, + "loss": 2.64, + "step": 3325 + }, + { + "epoch": 1.67, + "learning_rate": 3.745216612282596e-05, + "loss": 2.751, + "step": 3330 + }, + { + "epoch": 1.67, + "learning_rate": 3.741796188798747e-05, + "loss": 
2.7636, + "step": 3335 + }, + { + "epoch": 1.68, + "learning_rate": 3.738372676646321e-05, + "loss": 2.8103, + "step": 3340 + }, + { + "epoch": 1.68, + "learning_rate": 3.7349460843404796e-05, + "loss": 2.6672, + "step": 3345 + }, + { + "epoch": 1.68, + "learning_rate": 3.731516420404043e-05, + "loss": 2.5272, + "step": 3350 + }, + { + "epoch": 1.68, + "learning_rate": 3.728083693367474e-05, + "loss": 2.6674, + "step": 3355 + }, + { + "epoch": 1.69, + "learning_rate": 3.724647911768854e-05, + "loss": 2.708, + "step": 3360 + }, + { + "epoch": 1.69, + "learning_rate": 3.72120908415386e-05, + "loss": 2.3902, + "step": 3365 + }, + { + "epoch": 1.69, + "learning_rate": 3.7177672190757476e-05, + "loss": 2.8144, + "step": 3370 + }, + { + "epoch": 1.69, + "learning_rate": 3.714322325095325e-05, + "loss": 2.8525, + "step": 3375 + }, + { + "epoch": 1.7, + "learning_rate": 3.7108744107809364e-05, + "loss": 2.4671, + "step": 3380 + }, + { + "epoch": 1.7, + "learning_rate": 3.7074234847084363e-05, + "loss": 2.3723, + "step": 3385 + }, + { + "epoch": 1.7, + "learning_rate": 3.703969555461173e-05, + "loss": 2.8825, + "step": 3390 + }, + { + "epoch": 1.7, + "learning_rate": 3.700512631629961e-05, + "loss": 2.768, + "step": 3395 + }, + { + "epoch": 1.71, + "learning_rate": 3.6970527218130644e-05, + "loss": 2.4601, + "step": 3400 + }, + { + "epoch": 1.71, + "learning_rate": 3.693589834616176e-05, + "loss": 2.7482, + "step": 3405 + }, + { + "epoch": 1.71, + "learning_rate": 3.6901239786523914e-05, + "loss": 2.4536, + "step": 3410 + }, + { + "epoch": 1.71, + "learning_rate": 3.686655162542192e-05, + "loss": 2.571, + "step": 3415 + }, + { + "epoch": 1.72, + "learning_rate": 3.683183394913422e-05, + "loss": 2.5796, + "step": 3420 + }, + { + "epoch": 1.72, + "learning_rate": 3.6797086844012654e-05, + "loss": 2.7312, + "step": 3425 + }, + { + "epoch": 1.72, + "learning_rate": 3.676231039648227e-05, + "loss": 2.5584, + "step": 3430 + }, + { + "epoch": 1.72, + "learning_rate": 
3.67275046930411e-05, + "loss": 2.6298, + "step": 3435 + }, + { + "epoch": 1.73, + "learning_rate": 3.669266982025993e-05, + "loss": 2.628, + "step": 3440 + }, + { + "epoch": 1.73, + "learning_rate": 3.6657805864782116e-05, + "loss": 2.5943, + "step": 3445 + }, + { + "epoch": 1.73, + "learning_rate": 3.662291291332333e-05, + "loss": 2.6121, + "step": 3450 + }, + { + "epoch": 1.73, + "learning_rate": 3.658799105267138e-05, + "loss": 2.6501, + "step": 3455 + }, + { + "epoch": 1.74, + "learning_rate": 3.655304036968597e-05, + "loss": 2.5708, + "step": 3460 + }, + { + "epoch": 1.74, + "learning_rate": 3.651806095129849e-05, + "loss": 2.8039, + "step": 3465 + }, + { + "epoch": 1.74, + "learning_rate": 3.648305288451183e-05, + "loss": 2.8159, + "step": 3470 + }, + { + "epoch": 1.74, + "learning_rate": 3.64480162564001e-05, + "loss": 2.6506, + "step": 3475 + }, + { + "epoch": 1.75, + "learning_rate": 3.641295115410847e-05, + "loss": 2.8982, + "step": 3480 + }, + { + "epoch": 1.75, + "learning_rate": 3.637785766485291e-05, + "loss": 2.8018, + "step": 3485 + }, + { + "epoch": 1.75, + "learning_rate": 3.634273587592003e-05, + "loss": 2.5278, + "step": 3490 + }, + { + "epoch": 1.75, + "learning_rate": 3.630758587466681e-05, + "loss": 2.8937, + "step": 3495 + }, + { + "epoch": 1.76, + "learning_rate": 3.6272407748520394e-05, + "loss": 2.1922, + "step": 3500 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 1.8545254195367117e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3500/training_args.bin b/checkpoint-3500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-3500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 
4475 diff --git a/checkpoint-4000/README.md b/checkpoint-4000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-4000/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-4000/adapter_config.json b/checkpoint-4000/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-4000/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4000/adapter_model.safetensors b/checkpoint-4000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..54a97e74d9d419a718122c1e75e670451535eb23 --- /dev/null +++ b/checkpoint-4000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bbe1c217f9fb7bc1a8408fcd117262d03f13f89e296af7022817cd0aa334f16 +size 7807744 diff --git a/checkpoint-4000/optimizer.pt b/checkpoint-4000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..03b5eab5f05157e12de6abb013156d5192562142 --- /dev/null +++ b/checkpoint-4000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09445e4c7249c0901329fef524bb55a19e15bbef71b18b4d7553381b70e4ae12 +size 15644485 diff --git a/checkpoint-4000/rng_state.pth b/checkpoint-4000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5a6f2ead853d509688413394b36d660da8f9a95f --- /dev/null +++ b/checkpoint-4000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55c2539cd66fdcec58e2de98c9eb76d63d340e79905c42e2045f0329b9fa7cf8 +size 14575 diff --git 
a/checkpoint-4000/scheduler.pt b/checkpoint-4000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d299e8252e87d57218debd515e946ce6366e4a5 --- /dev/null +++ b/checkpoint-4000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7d2e4d7f9054fbc822f278024ae2c37e06725d5ef1704b75110e7cc575c7ac2 +size 627 diff --git a/checkpoint-4000/special_tokens_map.json b/checkpoint-4000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-4000/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ] +} diff --git a/checkpoint-4000/tokenization_chatglm.py b/checkpoint-4000/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-4000/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == 
self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) + return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab 
+ + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + 
json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. 
+ + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. 
+ if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-4000/tokenizer.model b/checkpoint-4000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-4000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-4000/tokenizer_config.json b/checkpoint-4000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ b/checkpoint-4000/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 
1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-4000/trainer_state.json b/checkpoint-4000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1a111aca4a37af13439b8af816a8277522ad7f72 --- /dev/null +++ b/checkpoint-4000/trainer_state.json @@ -0,0 +1,4821 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0072763768661397, + "eval_steps": 500, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 
65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 
165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + "step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 
265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + "step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 
365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + "step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 
465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + 
"step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + 
"step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + "loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + 
"step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + "loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, 
+ "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 
2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 4.8759297640025235e-05, + "loss": 2.9469, + "step": 1005 + }, + { + "epoch": 0.51, + "learning_rate": 4.874700152955661e-05, + "loss": 2.9745, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 4.8734646354147936e-05, + "loss": 2.9341, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 4.8722232144529754e-05, + "loss": 3.0511, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 4.870975893157941e-05, + "loss": 2.5396, + "step": 1025 + }, + { + "epoch": 0.52, + "learning_rate": 4.8697226746321004e-05, + "loss": 2.6699, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 4.868463561992532e-05, + "loss": 2.4887, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 4.867198558370977e-05, + "loss": 2.4773, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 4.865927666913825e-05, + "loss": 2.7612, + "step": 1045 + }, + { + "epoch": 0.53, + "learning_rate": 4.864650890782113e-05, + "loss": 2.9116, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 4.863368233151514e-05, + "loss": 2.8205, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 4.862079697212329e-05, + "loss": 2.6711, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 
4.8607852861694804e-05, + "loss": 3.1138, + "step": 1065 + }, + { + "epoch": 0.54, + "learning_rate": 4.859485003242503e-05, + "loss": 2.5603, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 4.858178851665539e-05, + "loss": 2.7981, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 4.856866834687323e-05, + "loss": 2.507, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 4.855548955571183e-05, + "loss": 3.0315, + "step": 1085 + }, + { + "epoch": 0.55, + "learning_rate": 4.8542252175950244e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 4.852895624051326e-05, + "loss": 2.6794, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 4.851560178247132e-05, + "loss": 2.8278, + "step": 1100 + }, + { + "epoch": 0.55, + "learning_rate": 4.850218883504041e-05, + "loss": 2.6993, + "step": 1105 + }, + { + "epoch": 0.56, + "learning_rate": 4.8488717431582005e-05, + "loss": 2.875, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 4.8475187605602974e-05, + "loss": 2.6057, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 4.84615993907555e-05, + "loss": 2.847, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 4.844795282083697e-05, + "loss": 2.7652, + "step": 1125 + }, + { + "epoch": 0.57, + "learning_rate": 4.843424792978997e-05, + "loss": 2.8128, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 4.842048475170209e-05, + "loss": 2.7077, + "step": 1135 + }, + { + "epoch": 0.57, + "learning_rate": 4.840666332080592e-05, + "loss": 2.7081, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 4.8392783671478934e-05, + "loss": 2.6479, + "step": 1145 + }, + { + "epoch": 0.58, + "learning_rate": 4.837884583824342e-05, + "loss": 2.667, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 4.836484985576638e-05, + "loss": 2.7514, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 4.835079575885944e-05, + "loss": 2.5058, + "step": 1160 + }, + { + "epoch": 0.58, 
+ "learning_rate": 4.833668358247876e-05, + "loss": 2.6183, + "step": 1165 + }, + { + "epoch": 0.59, + "learning_rate": 4.8322513361725006e-05, + "loss": 2.9959, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 4.830828513184317e-05, + "loss": 2.5714, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 4.8293998928222536e-05, + "loss": 2.965, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 4.827965478639661e-05, + "loss": 2.2106, + "step": 1185 + }, + { + "epoch": 0.6, + "learning_rate": 4.8265252742042965e-05, + "loss": 2.7456, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 4.8250792830983225e-05, + "loss": 2.5694, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 4.8236275089182936e-05, + "loss": 2.6826, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 4.8221699552751465e-05, + "loss": 2.878, + "step": 1205 + }, + { + "epoch": 0.61, + "learning_rate": 4.820706625794196e-05, + "loss": 2.8191, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 4.81923752411512e-05, + "loss": 2.739, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 4.8177626538919565e-05, + "loss": 3.0544, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 4.8162820187930875e-05, + "loss": 2.8393, + "step": 1225 + }, + { + "epoch": 0.62, + "learning_rate": 4.814795622501237e-05, + "loss": 2.4457, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 4.813303468713456e-05, + "loss": 2.8575, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 4.8118055611411197e-05, + "loss": 2.8307, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 4.810301903509909e-05, + "loss": 2.7951, + "step": 1245 + }, + { + "epoch": 0.63, + "learning_rate": 4.8087924995598125e-05, + "loss": 2.8456, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 4.807277353045106e-05, + "loss": 2.3564, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 4.8057564677343524e-05, + "loss": 2.5076, + "step": 1260 + 
}, + { + "epoch": 0.63, + "learning_rate": 4.8042298474103884e-05, + "loss": 2.605, + "step": 1265 + }, + { + "epoch": 0.64, + "learning_rate": 4.8026974958703116e-05, + "loss": 2.4782, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 4.8011594169254784e-05, + "loss": 2.7193, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 4.799615614401488e-05, + "loss": 2.8284, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 4.798066092138178e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.65, + "learning_rate": 4.796510853989612e-05, + "loss": 2.7396, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 4.794949903824069e-05, + "loss": 2.7948, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 4.793383245524035e-05, + "loss": 2.7818, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 4.791810882986197e-05, + "loss": 2.7334, + "step": 1305 + }, + { + "epoch": 0.66, + "learning_rate": 4.7902328201214256e-05, + "loss": 2.4824, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 4.7886490608547727e-05, + "loss": 2.6131, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 4.7870596091254584e-05, + "loss": 2.7778, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 4.7854644688868594e-05, + "loss": 2.5263, + "step": 1325 + }, + { + "epoch": 0.67, + "learning_rate": 4.783863644106502e-05, + "loss": 2.7825, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 4.782257138766053e-05, + "loss": 2.7902, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 4.7806449568613066e-05, + "loss": 2.8333, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 4.779027102402177e-05, + "loss": 2.856, + "step": 1345 + }, + { + "epoch": 0.68, + "learning_rate": 4.777403579412686e-05, + "loss": 2.8021, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 4.775774391930956e-05, + "loss": 2.6639, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741395440091976e-05, + 
"loss": 2.5226, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.772499039713702e-05, + "loss": 2.6803, + "step": 1365 + }, + { + "epoch": 0.69, + "learning_rate": 4.7708528831248274e-05, + "loss": 2.4608, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.769201078336991e-05, + "loss": 2.786, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.7675436294586586e-05, + "loss": 2.8294, + "step": 1380 + }, + { + "epoch": 0.7, + "learning_rate": 4.7658805406123356e-05, + "loss": 2.6776, + "step": 1385 + }, + { + "epoch": 0.7, + "learning_rate": 4.7642118159345544e-05, + "loss": 2.8003, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.762537459575865e-05, + "loss": 2.7796, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.7608574757008245e-05, + "loss": 2.9156, + "step": 1400 + }, + { + "epoch": 0.71, + "learning_rate": 4.7591718684879883e-05, + "loss": 3.0521, + "step": 1405 + }, + { + "epoch": 0.71, + "learning_rate": 4.7574806421298976e-05, + "loss": 2.6469, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.755783800833071e-05, + "loss": 2.8512, + "step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.754081348817991e-05, + "loss": 2.66, + "step": 1420 + }, + { + "epoch": 0.72, + "learning_rate": 4.752373290319096e-05, + "loss": 2.6625, + "step": 1425 + }, + { + "epoch": 0.72, + "learning_rate": 4.7506596295847716e-05, + "loss": 2.6711, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 4.7489403708773346e-05, + "loss": 2.8951, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 4.747215518473026e-05, + "loss": 2.7375, + "step": 1440 + }, + { + "epoch": 0.73, + "learning_rate": 4.745485076662e-05, + "loss": 2.8037, + "step": 1445 + }, + { + "epoch": 0.73, + "learning_rate": 4.743749049748315e-05, + "loss": 2.5375, + "step": 1450 + }, + { + "epoch": 0.73, + "learning_rate": 4.742007442049918e-05, + "loss": 2.7664, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 
4.7402602578986374e-05, + "loss": 2.9644, + "step": 1460 + }, + { + "epoch": 0.74, + "learning_rate": 4.738507501640175e-05, + "loss": 2.8212, + "step": 1465 + }, + { + "epoch": 0.74, + "learning_rate": 4.736749177634087e-05, + "loss": 2.7201, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 4.734985290253782e-05, + "loss": 2.7087, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 4.7332158438865035e-05, + "loss": 2.5502, + "step": 1480 + }, + { + "epoch": 0.75, + "learning_rate": 4.731440842933322e-05, + "loss": 2.7607, + "step": 1485 + }, + { + "epoch": 0.75, + "learning_rate": 4.729660291809126e-05, + "loss": 2.5601, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 4.727874194942606e-05, + "loss": 2.5562, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 4.7260825567762486e-05, + "loss": 2.5539, + "step": 1500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7242853817663204e-05, + "loss": 2.6405, + "step": 1505 + }, + { + "epoch": 0.76, + "learning_rate": 4.72248267438286e-05, + "loss": 2.9237, + "step": 1510 + }, + { + "epoch": 0.76, + "learning_rate": 4.72067443910967e-05, + "loss": 3.0453, + "step": 1515 + }, + { + "epoch": 0.76, + "learning_rate": 4.718860680444297e-05, + "loss": 2.7176, + "step": 1520 + }, + { + "epoch": 0.77, + "learning_rate": 4.71704140289803e-05, + "loss": 2.6713, + "step": 1525 + }, + { + "epoch": 0.77, + "learning_rate": 4.715216610995883e-05, + "loss": 2.7284, + "step": 1530 + }, + { + "epoch": 0.77, + "learning_rate": 4.713386309276585e-05, + "loss": 2.5022, + "step": 1535 + }, + { + "epoch": 0.77, + "learning_rate": 4.7115505022925706e-05, + "loss": 2.8323, + "step": 1540 + }, + { + "epoch": 0.78, + "learning_rate": 4.7097091946099666e-05, + "loss": 2.7005, + "step": 1545 + }, + { + "epoch": 0.78, + "learning_rate": 4.7078623908085825e-05, + "loss": 2.8142, + "step": 1550 + }, + { + "epoch": 0.78, + "learning_rate": 4.7060100954818974e-05, + "loss": 2.8505, + "step": 1555 + }, + { + "epoch": 
0.78, + "learning_rate": 4.70415231323705e-05, + "loss": 2.5892, + "step": 1560 + }, + { + "epoch": 0.79, + "learning_rate": 4.7022890486948236e-05, + "loss": 2.8951, + "step": 1565 + }, + { + "epoch": 0.79, + "learning_rate": 4.700420306489641e-05, + "loss": 2.7551, + "step": 1570 + }, + { + "epoch": 0.79, + "learning_rate": 4.698546091269547e-05, + "loss": 2.6814, + "step": 1575 + }, + { + "epoch": 0.79, + "learning_rate": 4.696666407696201e-05, + "loss": 2.8698, + "step": 1580 + }, + { + "epoch": 0.8, + "learning_rate": 4.694781260444862e-05, + "loss": 2.8389, + "step": 1585 + }, + { + "epoch": 0.8, + "learning_rate": 4.6928906542043786e-05, + "loss": 2.8353, + "step": 1590 + }, + { + "epoch": 0.8, + "learning_rate": 4.690994593677179e-05, + "loss": 2.6565, + "step": 1595 + }, + { + "epoch": 0.8, + "learning_rate": 4.689093083579256e-05, + "loss": 2.6958, + "step": 1600 + }, + { + "epoch": 0.81, + "learning_rate": 4.687186128640157e-05, + "loss": 2.6768, + "step": 1605 + }, + { + "epoch": 0.81, + "learning_rate": 4.685273733602975e-05, + "loss": 2.734, + "step": 1610 + }, + { + "epoch": 0.81, + "learning_rate": 4.6833559032243284e-05, + "loss": 2.5348, + "step": 1615 + }, + { + "epoch": 0.81, + "learning_rate": 4.6814326422743594e-05, + "loss": 2.7775, + "step": 1620 + }, + { + "epoch": 0.82, + "learning_rate": 4.679503955536715e-05, + "loss": 2.6578, + "step": 1625 + }, + { + "epoch": 0.82, + "learning_rate": 4.6775698478085393e-05, + "loss": 2.8792, + "step": 1630 + }, + { + "epoch": 0.82, + "learning_rate": 4.675630323900458e-05, + "loss": 2.7883, + "step": 1635 + }, + { + "epoch": 0.82, + "learning_rate": 4.67368538863657e-05, + "loss": 2.8824, + "step": 1640 + }, + { + "epoch": 0.83, + "learning_rate": 4.671735046854433e-05, + "loss": 2.6775, + "step": 1645 + }, + { + "epoch": 0.83, + "learning_rate": 4.669779303405051e-05, + "loss": 2.5658, + "step": 1650 + }, + { + "epoch": 0.83, + "learning_rate": 4.667818163152864e-05, + "loss": 2.5262, + "step": 1655 + 
}, + { + "epoch": 0.83, + "learning_rate": 4.665851630975736e-05, + "loss": 2.5749, + "step": 1660 + }, + { + "epoch": 0.84, + "learning_rate": 4.6638797117649424e-05, + "loss": 2.8718, + "step": 1665 + }, + { + "epoch": 0.84, + "learning_rate": 4.661902410425155e-05, + "loss": 2.8039, + "step": 1670 + }, + { + "epoch": 0.84, + "learning_rate": 4.659919731874435e-05, + "loss": 2.8089, + "step": 1675 + }, + { + "epoch": 0.84, + "learning_rate": 4.6579316810442174e-05, + "loss": 2.8144, + "step": 1680 + }, + { + "epoch": 0.85, + "learning_rate": 4.6559382628792995e-05, + "loss": 2.5963, + "step": 1685 + }, + { + "epoch": 0.85, + "learning_rate": 4.653939482337828e-05, + "loss": 2.5755, + "step": 1690 + }, + { + "epoch": 0.85, + "learning_rate": 4.651935344391286e-05, + "loss": 2.7631, + "step": 1695 + }, + { + "epoch": 0.85, + "learning_rate": 4.649925854024486e-05, + "loss": 2.8117, + "step": 1700 + }, + { + "epoch": 0.86, + "learning_rate": 4.647911016235549e-05, + "loss": 2.7352, + "step": 1705 + }, + { + "epoch": 0.86, + "learning_rate": 4.6458908360358985e-05, + "loss": 2.5572, + "step": 1710 + }, + { + "epoch": 0.86, + "learning_rate": 4.643865318450246e-05, + "loss": 2.8372, + "step": 1715 + }, + { + "epoch": 0.86, + "learning_rate": 4.6418344685165774e-05, + "loss": 2.7892, + "step": 1720 + }, + { + "epoch": 0.87, + "learning_rate": 4.639798291286143e-05, + "loss": 2.7832, + "step": 1725 + }, + { + "epoch": 0.87, + "learning_rate": 4.637756791823442e-05, + "loss": 2.5401, + "step": 1730 + }, + { + "epoch": 0.87, + "learning_rate": 4.635709975206213e-05, + "loss": 2.9286, + "step": 1735 + }, + { + "epoch": 0.87, + "learning_rate": 4.633657846525417e-05, + "loss": 2.6474, + "step": 1740 + }, + { + "epoch": 0.88, + "learning_rate": 4.6316004108852305e-05, + "loss": 2.5047, + "step": 1745 + }, + { + "epoch": 0.88, + "learning_rate": 4.629537673403029e-05, + "loss": 2.8374, + "step": 1750 + }, + { + "epoch": 0.88, + "learning_rate": 4.627469639209373e-05, + 
"loss": 2.634, + "step": 1755 + }, + { + "epoch": 0.88, + "learning_rate": 4.6253963134480006e-05, + "loss": 2.6884, + "step": 1760 + }, + { + "epoch": 0.89, + "learning_rate": 4.623317701275809e-05, + "loss": 2.6334, + "step": 1765 + }, + { + "epoch": 0.89, + "learning_rate": 4.621233807862844e-05, + "loss": 2.764, + "step": 1770 + }, + { + "epoch": 0.89, + "learning_rate": 4.6191446383922886e-05, + "loss": 3.0864, + "step": 1775 + }, + { + "epoch": 0.89, + "learning_rate": 4.617050198060448e-05, + "loss": 2.7964, + "step": 1780 + }, + { + "epoch": 0.9, + "learning_rate": 4.6149504920767376e-05, + "loss": 2.4538, + "step": 1785 + }, + { + "epoch": 0.9, + "learning_rate": 4.6128455256636706e-05, + "loss": 2.7352, + "step": 1790 + }, + { + "epoch": 0.9, + "learning_rate": 4.6107353040568416e-05, + "loss": 2.6893, + "step": 1795 + }, + { + "epoch": 0.9, + "learning_rate": 4.6086198325049185e-05, + "loss": 2.8609, + "step": 1800 + }, + { + "epoch": 0.91, + "learning_rate": 4.6064991162696275e-05, + "loss": 2.5285, + "step": 1805 + }, + { + "epoch": 0.91, + "learning_rate": 4.604373160625739e-05, + "loss": 2.5707, + "step": 1810 + }, + { + "epoch": 0.91, + "learning_rate": 4.602241970861053e-05, + "loss": 2.7198, + "step": 1815 + }, + { + "epoch": 0.91, + "learning_rate": 4.6001055522763926e-05, + "loss": 2.855, + "step": 1820 + }, + { + "epoch": 0.92, + "learning_rate": 4.597963910185582e-05, + "loss": 2.4175, + "step": 1825 + }, + { + "epoch": 0.92, + "learning_rate": 4.595817049915441e-05, + "loss": 2.5444, + "step": 1830 + }, + { + "epoch": 0.92, + "learning_rate": 4.5936649768057646e-05, + "loss": 2.6893, + "step": 1835 + }, + { + "epoch": 0.92, + "learning_rate": 4.591507696209318e-05, + "loss": 2.6725, + "step": 1840 + }, + { + "epoch": 0.93, + "learning_rate": 4.589345213491817e-05, + "loss": 2.834, + "step": 1845 + }, + { + "epoch": 0.93, + "learning_rate": 4.587177534031914e-05, + "loss": 2.8259, + "step": 1850 + }, + { + "epoch": 0.93, + "learning_rate": 
4.585004663221188e-05, + "loss": 2.6146, + "step": 1855 + }, + { + "epoch": 0.93, + "learning_rate": 4.582826606464134e-05, + "loss": 2.4673, + "step": 1860 + }, + { + "epoch": 0.94, + "learning_rate": 4.5806433691781416e-05, + "loss": 2.9178, + "step": 1865 + }, + { + "epoch": 0.94, + "learning_rate": 4.578454956793487e-05, + "loss": 2.9224, + "step": 1870 + }, + { + "epoch": 0.94, + "learning_rate": 4.576261374753318e-05, + "loss": 2.7987, + "step": 1875 + }, + { + "epoch": 0.94, + "learning_rate": 4.574062628513642e-05, + "loss": 2.3848, + "step": 1880 + }, + { + "epoch": 0.95, + "learning_rate": 4.57185872354331e-05, + "loss": 2.6054, + "step": 1885 + }, + { + "epoch": 0.95, + "learning_rate": 4.569649665324003e-05, + "loss": 2.6743, + "step": 1890 + }, + { + "epoch": 0.95, + "learning_rate": 4.567435459350222e-05, + "loss": 2.9375, + "step": 1895 + }, + { + "epoch": 0.95, + "learning_rate": 4.565216111129269e-05, + "loss": 2.9372, + "step": 1900 + }, + { + "epoch": 0.96, + "learning_rate": 4.562991626181239e-05, + "loss": 2.669, + "step": 1905 + }, + { + "epoch": 0.96, + "learning_rate": 4.560762010039001e-05, + "loss": 2.7644, + "step": 1910 + }, + { + "epoch": 0.96, + "learning_rate": 4.558527268248187e-05, + "loss": 2.6886, + "step": 1915 + }, + { + "epoch": 0.96, + "learning_rate": 4.55628740636718e-05, + "loss": 2.8618, + "step": 1920 + }, + { + "epoch": 0.97, + "learning_rate": 4.554042429967095e-05, + "loss": 2.8411, + "step": 1925 + }, + { + "epoch": 0.97, + "learning_rate": 4.55179234463177e-05, + "loss": 2.6047, + "step": 1930 + }, + { + "epoch": 0.97, + "learning_rate": 4.5495371559577496e-05, + "loss": 2.8675, + "step": 1935 + }, + { + "epoch": 0.97, + "learning_rate": 4.547276869554271e-05, + "loss": 2.751, + "step": 1940 + }, + { + "epoch": 0.98, + "learning_rate": 4.545011491043253e-05, + "loss": 2.8638, + "step": 1945 + }, + { + "epoch": 0.98, + "learning_rate": 4.5427410260592775e-05, + "loss": 2.5092, + "step": 1950 + }, + { + "epoch": 0.98, 
+ "learning_rate": 4.540465480249579e-05, + "loss": 2.5059, + "step": 1955 + }, + { + "epoch": 0.98, + "learning_rate": 4.5381848592740285e-05, + "loss": 2.9433, + "step": 1960 + }, + { + "epoch": 0.99, + "learning_rate": 4.535899168805121e-05, + "loss": 2.6959, + "step": 1965 + }, + { + "epoch": 0.99, + "learning_rate": 4.533608414527961e-05, + "loss": 2.552, + "step": 1970 + }, + { + "epoch": 0.99, + "learning_rate": 4.5313126021402465e-05, + "loss": 2.7169, + "step": 1975 + }, + { + "epoch": 0.99, + "learning_rate": 4.529011737352258e-05, + "loss": 2.5672, + "step": 1980 + }, + { + "epoch": 1.0, + "learning_rate": 4.526705825886841e-05, + "loss": 2.6434, + "step": 1985 + }, + { + "epoch": 1.0, + "learning_rate": 4.5243948734793947e-05, + "loss": 2.8062, + "step": 1990 + }, + { + "epoch": 1.0, + "learning_rate": 4.5220788858778556e-05, + "loss": 2.6929, + "step": 1995 + }, + { + "epoch": 1.0, + "learning_rate": 4.519757868842684e-05, + "loss": 2.5837, + "step": 2000 + }, + { + "epoch": 1.01, + "learning_rate": 4.517431828146852e-05, + "loss": 2.821, + "step": 2005 + }, + { + "epoch": 1.01, + "learning_rate": 4.515100769575824e-05, + "loss": 2.7913, + "step": 2010 + }, + { + "epoch": 1.01, + "learning_rate": 4.512764698927545e-05, + "loss": 2.699, + "step": 2015 + }, + { + "epoch": 1.01, + "learning_rate": 4.5104236220124286e-05, + "loss": 2.6574, + "step": 2020 + }, + { + "epoch": 1.02, + "learning_rate": 4.508077544653338e-05, + "loss": 2.5909, + "step": 2025 + }, + { + "epoch": 1.02, + "learning_rate": 4.5057264726855765e-05, + "loss": 2.5695, + "step": 2030 + }, + { + "epoch": 1.02, + "learning_rate": 4.5033704119568675e-05, + "loss": 2.4937, + "step": 2035 + }, + { + "epoch": 1.02, + "learning_rate": 4.501009368327344e-05, + "loss": 2.7253, + "step": 2040 + }, + { + "epoch": 1.03, + "learning_rate": 4.4986433476695334e-05, + "loss": 2.8681, + "step": 2045 + }, + { + "epoch": 1.03, + "learning_rate": 4.496272355868341e-05, + "loss": 2.74, + "step": 2050 + }, + 
{ + "epoch": 1.03, + "learning_rate": 4.4938963988210365e-05, + "loss": 2.7439, + "step": 2055 + }, + { + "epoch": 1.03, + "learning_rate": 4.491515482437242e-05, + "loss": 2.9539, + "step": 2060 + }, + { + "epoch": 1.04, + "learning_rate": 4.4891296126389104e-05, + "loss": 2.8165, + "step": 2065 + }, + { + "epoch": 1.04, + "learning_rate": 4.48673879536032e-05, + "loss": 2.601, + "step": 2070 + }, + { + "epoch": 1.04, + "learning_rate": 4.484343036548051e-05, + "loss": 2.5189, + "step": 2075 + }, + { + "epoch": 1.04, + "learning_rate": 4.481942342160976e-05, + "loss": 2.6276, + "step": 2080 + }, + { + "epoch": 1.05, + "learning_rate": 4.479536718170243e-05, + "loss": 2.5027, + "step": 2085 + }, + { + "epoch": 1.05, + "learning_rate": 4.477126170559262e-05, + "loss": 2.8654, + "step": 2090 + }, + { + "epoch": 1.05, + "learning_rate": 4.474710705323688e-05, + "loss": 2.8511, + "step": 2095 + }, + { + "epoch": 1.05, + "learning_rate": 4.47229032847141e-05, + "loss": 2.6129, + "step": 2100 + }, + { + "epoch": 1.06, + "learning_rate": 4.469865046022531e-05, + "loss": 2.8964, + "step": 2105 + }, + { + "epoch": 1.06, + "learning_rate": 4.4674348640093554e-05, + "loss": 2.7502, + "step": 2110 + }, + { + "epoch": 1.06, + "learning_rate": 4.4649997884763765e-05, + "loss": 2.591, + "step": 2115 + }, + { + "epoch": 1.06, + "learning_rate": 4.462559825480257e-05, + "loss": 2.7982, + "step": 2120 + }, + { + "epoch": 1.07, + "learning_rate": 4.460114981089815e-05, + "loss": 2.576, + "step": 2125 + }, + { + "epoch": 1.07, + "learning_rate": 4.457665261386014e-05, + "loss": 2.692, + "step": 2130 + }, + { + "epoch": 1.07, + "learning_rate": 4.455210672461938e-05, + "loss": 2.5818, + "step": 2135 + }, + { + "epoch": 1.07, + "learning_rate": 4.452751220422787e-05, + "loss": 2.6792, + "step": 2140 + }, + { + "epoch": 1.08, + "learning_rate": 4.450286911385856e-05, + "loss": 2.8352, + "step": 2145 + }, + { + "epoch": 1.08, + "learning_rate": 4.4478177514805166e-05, + "loss": 2.5787, + 
"step": 2150 + }, + { + "epoch": 1.08, + "learning_rate": 4.4453437468482103e-05, + "loss": 2.655, + "step": 2155 + }, + { + "epoch": 1.08, + "learning_rate": 4.442864903642428e-05, + "loss": 2.7664, + "step": 2160 + }, + { + "epoch": 1.09, + "learning_rate": 4.440381228028692e-05, + "loss": 2.7231, + "step": 2165 + }, + { + "epoch": 1.09, + "learning_rate": 4.437892726184548e-05, + "loss": 2.416, + "step": 2170 + }, + { + "epoch": 1.09, + "learning_rate": 4.4353994042995446e-05, + "loss": 2.4619, + "step": 2175 + }, + { + "epoch": 1.09, + "learning_rate": 4.4329012685752183e-05, + "loss": 2.7816, + "step": 2180 + }, + { + "epoch": 1.1, + "learning_rate": 4.430398325225078e-05, + "loss": 2.8678, + "step": 2185 + }, + { + "epoch": 1.1, + "learning_rate": 4.427890580474594e-05, + "loss": 2.6007, + "step": 2190 + }, + { + "epoch": 1.1, + "learning_rate": 4.4253780405611754e-05, + "loss": 2.8195, + "step": 2195 + }, + { + "epoch": 1.1, + "learning_rate": 4.42286071173416e-05, + "loss": 2.5117, + "step": 2200 + }, + { + "epoch": 1.11, + "learning_rate": 4.4203386002547956e-05, + "loss": 2.5527, + "step": 2205 + }, + { + "epoch": 1.11, + "learning_rate": 4.417811712396226e-05, + "loss": 2.662, + "step": 2210 + }, + { + "epoch": 1.11, + "learning_rate": 4.415280054443477e-05, + "loss": 2.7563, + "step": 2215 + }, + { + "epoch": 1.11, + "learning_rate": 4.4127436326934354e-05, + "loss": 2.5118, + "step": 2220 + }, + { + "epoch": 1.12, + "learning_rate": 4.41020245345484e-05, + "loss": 2.7208, + "step": 2225 + }, + { + "epoch": 1.12, + "learning_rate": 4.4076565230482607e-05, + "loss": 2.649, + "step": 2230 + }, + { + "epoch": 1.12, + "learning_rate": 4.4051058478060856e-05, + "loss": 2.652, + "step": 2235 + }, + { + "epoch": 1.12, + "learning_rate": 4.402550434072505e-05, + "loss": 2.6885, + "step": 2240 + }, + { + "epoch": 1.13, + "learning_rate": 4.3999902882034935e-05, + "loss": 2.8545, + "step": 2245 + }, + { + "epoch": 1.13, + "learning_rate": 4.397425416566797e-05, + 
"loss": 2.9435, + "step": 2250 + }, + { + "epoch": 1.13, + "learning_rate": 4.3948558255419146e-05, + "loss": 2.6555, + "step": 2255 + }, + { + "epoch": 1.13, + "learning_rate": 4.392281521520085e-05, + "loss": 2.6108, + "step": 2260 + }, + { + "epoch": 1.14, + "learning_rate": 4.389702510904269e-05, + "loss": 2.6256, + "step": 2265 + }, + { + "epoch": 1.14, + "learning_rate": 4.387118800109133e-05, + "loss": 2.7996, + "step": 2270 + }, + { + "epoch": 1.14, + "learning_rate": 4.384530395561035e-05, + "loss": 2.6649, + "step": 2275 + }, + { + "epoch": 1.14, + "learning_rate": 4.381937303698006e-05, + "loss": 3.0073, + "step": 2280 + }, + { + "epoch": 1.15, + "learning_rate": 4.379339530969738e-05, + "loss": 2.8493, + "step": 2285 + }, + { + "epoch": 1.15, + "learning_rate": 4.3767370838375635e-05, + "loss": 2.5572, + "step": 2290 + }, + { + "epoch": 1.15, + "learning_rate": 4.374129968774443e-05, + "loss": 2.3837, + "step": 2295 + }, + { + "epoch": 1.15, + "learning_rate": 4.371518192264946e-05, + "loss": 2.4604, + "step": 2300 + }, + { + "epoch": 1.16, + "learning_rate": 4.3689017608052374e-05, + "loss": 2.8866, + "step": 2305 + }, + { + "epoch": 1.16, + "learning_rate": 4.3662806809030585e-05, + "loss": 2.8943, + "step": 2310 + }, + { + "epoch": 1.16, + "learning_rate": 4.3636549590777144e-05, + "loss": 2.7629, + "step": 2315 + }, + { + "epoch": 1.16, + "learning_rate": 4.361024601860054e-05, + "loss": 2.8629, + "step": 2320 + }, + { + "epoch": 1.17, + "learning_rate": 4.3583896157924574e-05, + "loss": 2.7952, + "step": 2325 + }, + { + "epoch": 1.17, + "learning_rate": 4.355750007428817e-05, + "loss": 2.2057, + "step": 2330 + }, + { + "epoch": 1.17, + "learning_rate": 4.3531057833345216e-05, + "loss": 2.9143, + "step": 2335 + }, + { + "epoch": 1.17, + "learning_rate": 4.3504569500864424e-05, + "loss": 2.5354, + "step": 2340 + }, + { + "epoch": 1.18, + "learning_rate": 4.347803514272911e-05, + "loss": 2.7451, + "step": 2345 + }, + { + "epoch": 1.18, + 
"learning_rate": 4.3451454824937113e-05, + "loss": 3.0827, + "step": 2350 + }, + { + "epoch": 1.18, + "learning_rate": 4.3424828613600555e-05, + "loss": 2.5842, + "step": 2355 + }, + { + "epoch": 1.18, + "learning_rate": 4.339815657494572e-05, + "loss": 2.4492, + "step": 2360 + }, + { + "epoch": 1.19, + "learning_rate": 4.3371438775312865e-05, + "loss": 2.5067, + "step": 2365 + }, + { + "epoch": 1.19, + "learning_rate": 4.334467528115608e-05, + "loss": 2.5902, + "step": 2370 + }, + { + "epoch": 1.19, + "learning_rate": 4.33178661590431e-05, + "loss": 2.8618, + "step": 2375 + }, + { + "epoch": 1.19, + "learning_rate": 4.329101147565515e-05, + "loss": 2.6831, + "step": 2380 + }, + { + "epoch": 1.2, + "learning_rate": 4.3264111297786794e-05, + "loss": 2.7531, + "step": 2385 + }, + { + "epoch": 1.2, + "learning_rate": 4.323716569234572e-05, + "loss": 2.819, + "step": 2390 + }, + { + "epoch": 1.2, + "learning_rate": 4.321017472635263e-05, + "loss": 2.6319, + "step": 2395 + }, + { + "epoch": 1.2, + "learning_rate": 4.318313846694105e-05, + "loss": 2.3078, + "step": 2400 + }, + { + "epoch": 1.21, + "learning_rate": 4.315605698135714e-05, + "loss": 2.7962, + "step": 2405 + }, + { + "epoch": 1.21, + "learning_rate": 4.312893033695958e-05, + "loss": 2.8479, + "step": 2410 + }, + { + "epoch": 1.21, + "learning_rate": 4.310175860121936e-05, + "loss": 2.6289, + "step": 2415 + }, + { + "epoch": 1.21, + "learning_rate": 4.30745418417196e-05, + "loss": 2.7003, + "step": 2420 + }, + { + "epoch": 1.22, + "learning_rate": 4.304728012615543e-05, + "loss": 2.4947, + "step": 2425 + }, + { + "epoch": 1.22, + "learning_rate": 4.3019973522333815e-05, + "loss": 2.6911, + "step": 2430 + }, + { + "epoch": 1.22, + "learning_rate": 4.2992622098173334e-05, + "loss": 2.6931, + "step": 2435 + }, + { + "epoch": 1.22, + "learning_rate": 4.296522592170406e-05, + "loss": 2.774, + "step": 2440 + }, + { + "epoch": 1.23, + "learning_rate": 4.293778506106737e-05, + "loss": 2.759, + "step": 2445 + }, + { + 
"epoch": 1.23, + "learning_rate": 4.29102995845158e-05, + "loss": 2.5543, + "step": 2450 + }, + { + "epoch": 1.23, + "learning_rate": 4.288276956041284e-05, + "loss": 2.7702, + "step": 2455 + }, + { + "epoch": 1.23, + "learning_rate": 4.285519505723278e-05, + "loss": 2.835, + "step": 2460 + }, + { + "epoch": 1.24, + "learning_rate": 4.282757614356055e-05, + "loss": 2.6516, + "step": 2465 + }, + { + "epoch": 1.24, + "learning_rate": 4.2799912888091544e-05, + "loss": 2.7392, + "step": 2470 + }, + { + "epoch": 1.24, + "learning_rate": 4.277220535963143e-05, + "loss": 2.6522, + "step": 2475 + }, + { + "epoch": 1.24, + "learning_rate": 4.274445362709601e-05, + "loss": 2.6514, + "step": 2480 + }, + { + "epoch": 1.25, + "learning_rate": 4.271665775951104e-05, + "loss": 2.7507, + "step": 2485 + }, + { + "epoch": 1.25, + "learning_rate": 4.2688817826012005e-05, + "loss": 3.0499, + "step": 2490 + }, + { + "epoch": 1.25, + "learning_rate": 4.2660933895844055e-05, + "loss": 2.6927, + "step": 2495 + }, + { + "epoch": 1.25, + "learning_rate": 4.2633006038361736e-05, + "loss": 2.8456, + "step": 2500 + }, + { + "epoch": 1.26, + "learning_rate": 4.2605034323028844e-05, + "loss": 2.671, + "step": 2505 + }, + { + "epoch": 1.26, + "learning_rate": 4.2577018819418296e-05, + "loss": 2.6543, + "step": 2510 + }, + { + "epoch": 1.26, + "learning_rate": 4.254895959721189e-05, + "loss": 2.566, + "step": 2515 + }, + { + "epoch": 1.26, + "learning_rate": 4.252085672620019e-05, + "loss": 2.4943, + "step": 2520 + }, + { + "epoch": 1.27, + "learning_rate": 4.249271027628228e-05, + "loss": 2.5115, + "step": 2525 + }, + { + "epoch": 1.27, + "learning_rate": 4.24645203174657e-05, + "loss": 2.3859, + "step": 2530 + }, + { + "epoch": 1.27, + "learning_rate": 4.243628691986617e-05, + "loss": 2.8148, + "step": 2535 + }, + { + "epoch": 1.27, + "learning_rate": 4.240801015370743e-05, + "loss": 2.5597, + "step": 2540 + }, + { + "epoch": 1.28, + "learning_rate": 4.2379690089321145e-05, + "loss": 2.805, + 
"step": 2545 + }, + { + "epoch": 1.28, + "learning_rate": 4.235132679714664e-05, + "loss": 2.7308, + "step": 2550 + }, + { + "epoch": 1.28, + "learning_rate": 4.232292034773076e-05, + "loss": 2.675, + "step": 2555 + }, + { + "epoch": 1.28, + "learning_rate": 4.2294470811727704e-05, + "loss": 2.6359, + "step": 2560 + }, + { + "epoch": 1.29, + "learning_rate": 4.226597825989883e-05, + "loss": 2.4288, + "step": 2565 + }, + { + "epoch": 1.29, + "learning_rate": 4.223744276311249e-05, + "loss": 2.642, + "step": 2570 + }, + { + "epoch": 1.29, + "learning_rate": 4.220886439234385e-05, + "loss": 2.7375, + "step": 2575 + }, + { + "epoch": 1.29, + "learning_rate": 4.218024321867472e-05, + "loss": 2.6114, + "step": 2580 + }, + { + "epoch": 1.3, + "learning_rate": 4.2151579313293364e-05, + "loss": 2.4941, + "step": 2585 + }, + { + "epoch": 1.3, + "learning_rate": 4.212287274749434e-05, + "loss": 2.6086, + "step": 2590 + }, + { + "epoch": 1.3, + "learning_rate": 4.2094123592678295e-05, + "loss": 2.7854, + "step": 2595 + }, + { + "epoch": 1.3, + "learning_rate": 4.206533192035184e-05, + "loss": 2.8291, + "step": 2600 + }, + { + "epoch": 1.31, + "learning_rate": 4.2036497802127294e-05, + "loss": 2.6846, + "step": 2605 + }, + { + "epoch": 1.31, + "learning_rate": 4.200762130972259e-05, + "loss": 2.6667, + "step": 2610 + }, + { + "epoch": 1.31, + "learning_rate": 4.197870251496103e-05, + "loss": 2.7895, + "step": 2615 + }, + { + "epoch": 1.31, + "learning_rate": 4.1949741489771155e-05, + "loss": 2.6958, + "step": 2620 + }, + { + "epoch": 1.32, + "learning_rate": 4.192073830618652e-05, + "loss": 2.8536, + "step": 2625 + }, + { + "epoch": 1.32, + "learning_rate": 4.189169303634555e-05, + "loss": 2.5218, + "step": 2630 + }, + { + "epoch": 1.32, + "learning_rate": 4.186260575249136e-05, + "loss": 2.6141, + "step": 2635 + }, + { + "epoch": 1.32, + "learning_rate": 4.1833476526971546e-05, + "loss": 2.6231, + "step": 2640 + }, + { + "epoch": 1.33, + "learning_rate": 
4.1804305432238036e-05, + "loss": 2.8229, + "step": 2645 + }, + { + "epoch": 1.33, + "learning_rate": 4.1775092540846885e-05, + "loss": 2.4763, + "step": 2650 + }, + { + "epoch": 1.33, + "learning_rate": 4.174583792545813e-05, + "loss": 2.5162, + "step": 2655 + }, + { + "epoch": 1.33, + "learning_rate": 4.1716541658835574e-05, + "loss": 2.7301, + "step": 2660 + }, + { + "epoch": 1.34, + "learning_rate": 4.16872038138466e-05, + "loss": 2.6055, + "step": 2665 + }, + { + "epoch": 1.34, + "learning_rate": 4.165782446346203e-05, + "loss": 2.6529, + "step": 2670 + }, + { + "epoch": 1.34, + "learning_rate": 4.162840368075591e-05, + "loss": 2.7934, + "step": 2675 + }, + { + "epoch": 1.34, + "learning_rate": 4.159894153890536e-05, + "loss": 2.9108, + "step": 2680 + }, + { + "epoch": 1.35, + "learning_rate": 4.1569438111190326e-05, + "loss": 2.6543, + "step": 2685 + }, + { + "epoch": 1.35, + "learning_rate": 4.1539893470993496e-05, + "loss": 2.6038, + "step": 2690 + }, + { + "epoch": 1.35, + "learning_rate": 4.151030769180002e-05, + "loss": 2.5741, + "step": 2695 + }, + { + "epoch": 1.35, + "learning_rate": 4.14806808471974e-05, + "loss": 2.8525, + "step": 2700 + }, + { + "epoch": 1.36, + "learning_rate": 4.1451013010875275e-05, + "loss": 2.5151, + "step": 2705 + }, + { + "epoch": 1.36, + "learning_rate": 4.1421304256625206e-05, + "loss": 2.5666, + "step": 2710 + }, + { + "epoch": 1.36, + "learning_rate": 4.139155465834058e-05, + "loss": 2.7287, + "step": 2715 + }, + { + "epoch": 1.36, + "learning_rate": 4.136176429001634e-05, + "loss": 2.8867, + "step": 2720 + }, + { + "epoch": 1.37, + "learning_rate": 4.133193322574885e-05, + "loss": 2.6398, + "step": 2725 + }, + { + "epoch": 1.37, + "learning_rate": 4.130206153973568e-05, + "loss": 2.699, + "step": 2730 + }, + { + "epoch": 1.37, + "learning_rate": 4.1272149306275447e-05, + "loss": 2.5248, + "step": 2735 + }, + { + "epoch": 1.37, + "learning_rate": 4.124219659976762e-05, + "loss": 2.8947, + "step": 2740 + }, + { + "epoch": 
1.38, + "learning_rate": 4.1212203494712344e-05, + "loss": 2.6473, + "step": 2745 + }, + { + "epoch": 1.38, + "learning_rate": 4.1182170065710226e-05, + "loss": 2.3734, + "step": 2750 + }, + { + "epoch": 1.38, + "learning_rate": 4.115209638746218e-05, + "loss": 2.685, + "step": 2755 + }, + { + "epoch": 1.39, + "learning_rate": 4.1121982534769224e-05, + "loss": 2.9712, + "step": 2760 + }, + { + "epoch": 1.39, + "learning_rate": 4.109182858253231e-05, + "loss": 2.6578, + "step": 2765 + }, + { + "epoch": 1.39, + "learning_rate": 4.106163460575212e-05, + "loss": 2.6842, + "step": 2770 + }, + { + "epoch": 1.39, + "learning_rate": 4.10314006795289e-05, + "loss": 2.511, + "step": 2775 + }, + { + "epoch": 1.4, + "learning_rate": 4.100112687906224e-05, + "loss": 2.8426, + "step": 2780 + }, + { + "epoch": 1.4, + "learning_rate": 4.097081327965092e-05, + "loss": 2.8462, + "step": 2785 + }, + { + "epoch": 1.4, + "learning_rate": 4.094045995669271e-05, + "loss": 2.7552, + "step": 2790 + }, + { + "epoch": 1.4, + "learning_rate": 4.091006698568419e-05, + "loss": 2.3802, + "step": 2795 + }, + { + "epoch": 1.41, + "learning_rate": 4.087963444222054e-05, + "loss": 2.3967, + "step": 2800 + }, + { + "epoch": 1.41, + "learning_rate": 4.084916240199537e-05, + "loss": 2.7945, + "step": 2805 + }, + { + "epoch": 1.41, + "learning_rate": 4.0818650940800524e-05, + "loss": 2.5304, + "step": 2810 + }, + { + "epoch": 1.41, + "learning_rate": 4.0788100134525925e-05, + "loss": 2.6779, + "step": 2815 + }, + { + "epoch": 1.42, + "learning_rate": 4.075751005915932e-05, + "loss": 2.6202, + "step": 2820 + }, + { + "epoch": 1.42, + "learning_rate": 4.072688079078616e-05, + "loss": 2.5915, + "step": 2825 + }, + { + "epoch": 1.42, + "learning_rate": 4.069621240558935e-05, + "loss": 2.5139, + "step": 2830 + }, + { + "epoch": 1.42, + "learning_rate": 4.066550497984911e-05, + "loss": 2.5506, + "step": 2835 + }, + { + "epoch": 1.43, + "learning_rate": 4.063475858994276e-05, + "loss": 2.7154, + "step": 2840 + 
}, + { + "epoch": 1.43, + "learning_rate": 4.060397331234452e-05, + "loss": 2.6309, + "step": 2845 + }, + { + "epoch": 1.43, + "learning_rate": 4.0573149223625365e-05, + "loss": 2.6922, + "step": 2850 + }, + { + "epoch": 1.43, + "learning_rate": 4.054228640045276e-05, + "loss": 2.703, + "step": 2855 + }, + { + "epoch": 1.44, + "learning_rate": 4.051138491959053e-05, + "loss": 2.3475, + "step": 2860 + }, + { + "epoch": 1.44, + "learning_rate": 4.048044485789869e-05, + "loss": 2.7186, + "step": 2865 + }, + { + "epoch": 1.44, + "learning_rate": 4.044946629233316e-05, + "loss": 2.4667, + "step": 2870 + }, + { + "epoch": 1.44, + "learning_rate": 4.041844929994566e-05, + "loss": 2.562, + "step": 2875 + }, + { + "epoch": 1.45, + "learning_rate": 4.038739395788347e-05, + "loss": 2.4116, + "step": 2880 + }, + { + "epoch": 1.45, + "learning_rate": 4.0356300343389276e-05, + "loss": 2.6133, + "step": 2885 + }, + { + "epoch": 1.45, + "learning_rate": 4.032516853380094e-05, + "loss": 2.6893, + "step": 2890 + }, + { + "epoch": 1.45, + "learning_rate": 4.029399860655132e-05, + "loss": 2.8196, + "step": 2895 + }, + { + "epoch": 1.46, + "learning_rate": 4.026279063916811e-05, + "loss": 2.8933, + "step": 2900 + }, + { + "epoch": 1.46, + "learning_rate": 4.023154470927361e-05, + "loss": 3.0289, + "step": 2905 + }, + { + "epoch": 1.46, + "learning_rate": 4.0200260894584516e-05, + "loss": 2.5586, + "step": 2910 + }, + { + "epoch": 1.46, + "learning_rate": 4.016893927291179e-05, + "loss": 2.785, + "step": 2915 + }, + { + "epoch": 1.47, + "learning_rate": 4.0137579922160395e-05, + "loss": 2.5293, + "step": 2920 + }, + { + "epoch": 1.47, + "learning_rate": 4.010618292032917e-05, + "loss": 2.6696, + "step": 2925 + }, + { + "epoch": 1.47, + "learning_rate": 4.007474834551058e-05, + "loss": 2.7003, + "step": 2930 + }, + { + "epoch": 1.47, + "learning_rate": 4.004327627589056e-05, + "loss": 2.6005, + "step": 2935 + }, + { + "epoch": 1.48, + "learning_rate": 4.001176678974828e-05, + "loss": 
2.7198, + "step": 2940 + }, + { + "epoch": 1.48, + "learning_rate": 3.998021996545599e-05, + "loss": 2.5329, + "step": 2945 + }, + { + "epoch": 1.48, + "learning_rate": 3.994863588147881e-05, + "loss": 2.7051, + "step": 2950 + }, + { + "epoch": 1.48, + "learning_rate": 3.9917014616374535e-05, + "loss": 2.9277, + "step": 2955 + }, + { + "epoch": 1.49, + "learning_rate": 3.988535624879344e-05, + "loss": 2.3729, + "step": 2960 + }, + { + "epoch": 1.49, + "learning_rate": 3.985366085747808e-05, + "loss": 2.5122, + "step": 2965 + }, + { + "epoch": 1.49, + "learning_rate": 3.982192852126309e-05, + "loss": 2.4413, + "step": 2970 + }, + { + "epoch": 1.49, + "learning_rate": 3.979015931907501e-05, + "loss": 2.6104, + "step": 2975 + }, + { + "epoch": 1.5, + "learning_rate": 3.975835332993207e-05, + "loss": 2.8855, + "step": 2980 + }, + { + "epoch": 1.5, + "learning_rate": 3.9726510632944e-05, + "loss": 2.7487, + "step": 2985 + }, + { + "epoch": 1.5, + "learning_rate": 3.969463130731183e-05, + "loss": 2.7085, + "step": 2990 + }, + { + "epoch": 1.5, + "learning_rate": 3.966271543232769e-05, + "loss": 2.6464, + "step": 2995 + }, + { + "epoch": 1.51, + "learning_rate": 3.9630763087374625e-05, + "loss": 2.4105, + "step": 3000 + }, + { + "epoch": 1.51, + "learning_rate": 3.9598774351926394e-05, + "loss": 2.7296, + "step": 3005 + }, + { + "epoch": 1.51, + "learning_rate": 3.956674930554725e-05, + "loss": 2.5319, + "step": 3010 + }, + { + "epoch": 1.51, + "learning_rate": 3.9534688027891785e-05, + "loss": 2.5544, + "step": 3015 + }, + { + "epoch": 1.52, + "learning_rate": 3.9502590598704696e-05, + "loss": 2.7981, + "step": 3020 + }, + { + "epoch": 1.52, + "learning_rate": 3.94704570978206e-05, + "loss": 2.6186, + "step": 3025 + }, + { + "epoch": 1.52, + "learning_rate": 3.943828760516382e-05, + "loss": 2.6443, + "step": 3030 + }, + { + "epoch": 1.52, + "learning_rate": 3.940608220074822e-05, + "loss": 2.4564, + "step": 3035 + }, + { + "epoch": 1.53, + "learning_rate": 
3.9373840964676976e-05, + "loss": 2.8394, + "step": 3040 + }, + { + "epoch": 1.53, + "learning_rate": 3.934156397714238e-05, + "loss": 2.3683, + "step": 3045 + }, + { + "epoch": 1.53, + "learning_rate": 3.930925131842567e-05, + "loss": 2.875, + "step": 3050 + }, + { + "epoch": 1.53, + "learning_rate": 3.9276903068896784e-05, + "loss": 2.6381, + "step": 3055 + }, + { + "epoch": 1.54, + "learning_rate": 3.9244519309014206e-05, + "loss": 2.8077, + "step": 3060 + }, + { + "epoch": 1.54, + "learning_rate": 3.9212100119324706e-05, + "loss": 2.6208, + "step": 3065 + }, + { + "epoch": 1.54, + "learning_rate": 3.917964558046322e-05, + "loss": 2.9, + "step": 3070 + }, + { + "epoch": 1.54, + "learning_rate": 3.9147155773152586e-05, + "loss": 2.5678, + "step": 3075 + }, + { + "epoch": 1.55, + "learning_rate": 3.911463077820336e-05, + "loss": 2.7059, + "step": 3080 + }, + { + "epoch": 1.55, + "learning_rate": 3.90820706765136e-05, + "loss": 2.7433, + "step": 3085 + }, + { + "epoch": 1.55, + "learning_rate": 3.9049475549068757e-05, + "loss": 2.5916, + "step": 3090 + }, + { + "epoch": 1.55, + "learning_rate": 3.901684547694132e-05, + "loss": 2.6978, + "step": 3095 + }, + { + "epoch": 1.56, + "learning_rate": 3.8984180541290724e-05, + "loss": 2.8709, + "step": 3100 + }, + { + "epoch": 1.56, + "learning_rate": 3.895148082336313e-05, + "loss": 2.8652, + "step": 3105 + }, + { + "epoch": 1.56, + "learning_rate": 3.89187464044912e-05, + "loss": 3.0331, + "step": 3110 + }, + { + "epoch": 1.56, + "learning_rate": 3.8885977366093905e-05, + "loss": 2.7334, + "step": 3115 + }, + { + "epoch": 1.57, + "learning_rate": 3.885317378967633e-05, + "loss": 2.6026, + "step": 3120 + }, + { + "epoch": 1.57, + "learning_rate": 3.882033575682945e-05, + "loss": 2.831, + "step": 3125 + }, + { + "epoch": 1.57, + "learning_rate": 3.878746334922996e-05, + "loss": 2.5585, + "step": 3130 + }, + { + "epoch": 1.57, + "learning_rate": 3.875455664864005e-05, + "loss": 2.762, + "step": 3135 + }, + { + "epoch": 
1.58, + "learning_rate": 3.8721615736907205e-05, + "loss": 2.769, + "step": 3140 + }, + { + "epoch": 1.58, + "learning_rate": 3.868864069596399e-05, + "loss": 2.6496, + "step": 3145 + }, + { + "epoch": 1.58, + "learning_rate": 3.8655631607827876e-05, + "loss": 2.486, + "step": 3150 + }, + { + "epoch": 1.58, + "learning_rate": 3.8622588554601e-05, + "loss": 2.8387, + "step": 3155 + }, + { + "epoch": 1.59, + "learning_rate": 3.858951161847001e-05, + "loss": 2.816, + "step": 3160 + }, + { + "epoch": 1.59, + "learning_rate": 3.855640088170579e-05, + "loss": 2.8128, + "step": 3165 + }, + { + "epoch": 1.59, + "learning_rate": 3.8523256426663313e-05, + "loss": 2.5875, + "step": 3170 + }, + { + "epoch": 1.59, + "learning_rate": 3.8490078335781423e-05, + "loss": 2.5853, + "step": 3175 + }, + { + "epoch": 1.6, + "learning_rate": 3.845686669158263e-05, + "loss": 2.7638, + "step": 3180 + }, + { + "epoch": 1.6, + "learning_rate": 3.842362157667287e-05, + "loss": 2.8281, + "step": 3185 + }, + { + "epoch": 1.6, + "learning_rate": 3.839700144139754e-05, + "loss": 2.3574, + "step": 3190 + }, + { + "epoch": 1.6, + "learning_rate": 3.836369628764067e-05, + "loss": 2.5533, + "step": 3195 + }, + { + "epoch": 1.61, + "learning_rate": 3.833035789491177e-05, + "loss": 2.4513, + "step": 3200 + }, + { + "epoch": 1.61, + "learning_rate": 3.8296986346132036e-05, + "loss": 2.567, + "step": 3205 + }, + { + "epoch": 1.61, + "learning_rate": 3.826358172430516e-05, + "loss": 2.6501, + "step": 3210 + }, + { + "epoch": 1.61, + "learning_rate": 3.823014411251708e-05, + "loss": 2.6927, + "step": 3215 + }, + { + "epoch": 1.62, + "learning_rate": 3.819667359393578e-05, + "loss": 2.6094, + "step": 3220 + }, + { + "epoch": 1.62, + "learning_rate": 3.816317025181111e-05, + "loss": 2.6776, + "step": 3225 + }, + { + "epoch": 1.62, + "learning_rate": 3.8129634169474563e-05, + "loss": 2.5833, + "step": 3230 + }, + { + "epoch": 1.62, + "learning_rate": 3.809606543033905e-05, + "loss": 2.6483, + "step": 3235 + 
}, + { + "epoch": 1.63, + "learning_rate": 3.8062464117898724e-05, + "loss": 2.4814, + "step": 3240 + }, + { + "epoch": 1.63, + "learning_rate": 3.802883031572874e-05, + "loss": 2.5217, + "step": 3245 + }, + { + "epoch": 1.63, + "learning_rate": 3.799516410748506e-05, + "loss": 2.5127, + "step": 3250 + }, + { + "epoch": 1.63, + "learning_rate": 3.796146557690428e-05, + "loss": 2.9325, + "step": 3255 + }, + { + "epoch": 1.64, + "learning_rate": 3.792773480780335e-05, + "loss": 2.4567, + "step": 3260 + }, + { + "epoch": 1.64, + "learning_rate": 3.789397188407944e-05, + "loss": 2.6045, + "step": 3265 + }, + { + "epoch": 1.64, + "learning_rate": 3.786017688970967e-05, + "loss": 2.5031, + "step": 3270 + }, + { + "epoch": 1.64, + "learning_rate": 3.782634990875094e-05, + "loss": 2.7692, + "step": 3275 + }, + { + "epoch": 1.65, + "learning_rate": 3.779249102533972e-05, + "loss": 2.6893, + "step": 3280 + }, + { + "epoch": 1.65, + "learning_rate": 3.7758600323691806e-05, + "loss": 2.604, + "step": 3285 + }, + { + "epoch": 1.65, + "learning_rate": 3.7724677888102145e-05, + "loss": 2.6633, + "step": 3290 + }, + { + "epoch": 1.65, + "learning_rate": 3.769072380294463e-05, + "loss": 2.4804, + "step": 3295 + }, + { + "epoch": 1.66, + "learning_rate": 3.765673815267184e-05, + "loss": 2.5951, + "step": 3300 + }, + { + "epoch": 1.66, + "learning_rate": 3.76227210218149e-05, + "loss": 2.598, + "step": 3305 + }, + { + "epoch": 1.66, + "learning_rate": 3.758867249498321e-05, + "loss": 2.8652, + "step": 3310 + }, + { + "epoch": 1.66, + "learning_rate": 3.7554592656864285e-05, + "loss": 2.5312, + "step": 3315 + }, + { + "epoch": 1.67, + "learning_rate": 3.752048159222349e-05, + "loss": 2.7432, + "step": 3320 + }, + { + "epoch": 1.67, + "learning_rate": 3.748633938590388e-05, + "loss": 2.64, + "step": 3325 + }, + { + "epoch": 1.67, + "learning_rate": 3.745216612282596e-05, + "loss": 2.751, + "step": 3330 + }, + { + "epoch": 1.67, + "learning_rate": 3.741796188798747e-05, + "loss": 
2.7636, + "step": 3335 + }, + { + "epoch": 1.68, + "learning_rate": 3.738372676646321e-05, + "loss": 2.8103, + "step": 3340 + }, + { + "epoch": 1.68, + "learning_rate": 3.7349460843404796e-05, + "loss": 2.6672, + "step": 3345 + }, + { + "epoch": 1.68, + "learning_rate": 3.731516420404043e-05, + "loss": 2.5272, + "step": 3350 + }, + { + "epoch": 1.68, + "learning_rate": 3.728083693367474e-05, + "loss": 2.6674, + "step": 3355 + }, + { + "epoch": 1.69, + "learning_rate": 3.724647911768854e-05, + "loss": 2.708, + "step": 3360 + }, + { + "epoch": 1.69, + "learning_rate": 3.72120908415386e-05, + "loss": 2.3902, + "step": 3365 + }, + { + "epoch": 1.69, + "learning_rate": 3.7177672190757476e-05, + "loss": 2.8144, + "step": 3370 + }, + { + "epoch": 1.69, + "learning_rate": 3.714322325095325e-05, + "loss": 2.8525, + "step": 3375 + }, + { + "epoch": 1.7, + "learning_rate": 3.7108744107809364e-05, + "loss": 2.4671, + "step": 3380 + }, + { + "epoch": 1.7, + "learning_rate": 3.7074234847084363e-05, + "loss": 2.3723, + "step": 3385 + }, + { + "epoch": 1.7, + "learning_rate": 3.703969555461173e-05, + "loss": 2.8825, + "step": 3390 + }, + { + "epoch": 1.7, + "learning_rate": 3.700512631629961e-05, + "loss": 2.768, + "step": 3395 + }, + { + "epoch": 1.71, + "learning_rate": 3.6970527218130644e-05, + "loss": 2.4601, + "step": 3400 + }, + { + "epoch": 1.71, + "learning_rate": 3.693589834616176e-05, + "loss": 2.7482, + "step": 3405 + }, + { + "epoch": 1.71, + "learning_rate": 3.6901239786523914e-05, + "loss": 2.4536, + "step": 3410 + }, + { + "epoch": 1.71, + "learning_rate": 3.686655162542192e-05, + "loss": 2.571, + "step": 3415 + }, + { + "epoch": 1.72, + "learning_rate": 3.683183394913422e-05, + "loss": 2.5796, + "step": 3420 + }, + { + "epoch": 1.72, + "learning_rate": 3.6797086844012654e-05, + "loss": 2.7312, + "step": 3425 + }, + { + "epoch": 1.72, + "learning_rate": 3.676231039648227e-05, + "loss": 2.5584, + "step": 3430 + }, + { + "epoch": 1.72, + "learning_rate": 
3.67275046930411e-05, + "loss": 2.6298, + "step": 3435 + }, + { + "epoch": 1.73, + "learning_rate": 3.669266982025993e-05, + "loss": 2.628, + "step": 3440 + }, + { + "epoch": 1.73, + "learning_rate": 3.6657805864782116e-05, + "loss": 2.5943, + "step": 3445 + }, + { + "epoch": 1.73, + "learning_rate": 3.662291291332333e-05, + "loss": 2.6121, + "step": 3450 + }, + { + "epoch": 1.73, + "learning_rate": 3.658799105267138e-05, + "loss": 2.6501, + "step": 3455 + }, + { + "epoch": 1.74, + "learning_rate": 3.655304036968597e-05, + "loss": 2.5708, + "step": 3460 + }, + { + "epoch": 1.74, + "learning_rate": 3.651806095129849e-05, + "loss": 2.8039, + "step": 3465 + }, + { + "epoch": 1.74, + "learning_rate": 3.648305288451183e-05, + "loss": 2.8159, + "step": 3470 + }, + { + "epoch": 1.74, + "learning_rate": 3.64480162564001e-05, + "loss": 2.6506, + "step": 3475 + }, + { + "epoch": 1.75, + "learning_rate": 3.641295115410847e-05, + "loss": 2.8982, + "step": 3480 + }, + { + "epoch": 1.75, + "learning_rate": 3.637785766485291e-05, + "loss": 2.8018, + "step": 3485 + }, + { + "epoch": 1.75, + "learning_rate": 3.634273587592003e-05, + "loss": 2.5278, + "step": 3490 + }, + { + "epoch": 1.75, + "learning_rate": 3.630758587466681e-05, + "loss": 2.8937, + "step": 3495 + }, + { + "epoch": 1.76, + "learning_rate": 3.6272407748520394e-05, + "loss": 2.1922, + "step": 3500 + }, + { + "epoch": 1.76, + "learning_rate": 3.623720158497789e-05, + "loss": 2.4288, + "step": 3505 + }, + { + "epoch": 1.76, + "learning_rate": 3.620196747160613e-05, + "loss": 2.9423, + "step": 3510 + }, + { + "epoch": 1.76, + "learning_rate": 3.61667054960415e-05, + "loss": 2.5559, + "step": 3515 + }, + { + "epoch": 1.77, + "learning_rate": 3.613141574598965e-05, + "loss": 2.7228, + "step": 3520 + }, + { + "epoch": 1.77, + "learning_rate": 3.6096098309225325e-05, + "loss": 2.6933, + "step": 3525 + }, + { + "epoch": 1.77, + "learning_rate": 3.606075327359212e-05, + "loss": 2.5865, + "step": 3530 + }, + { + "epoch": 1.77, 
+ "learning_rate": 3.602538072700229e-05, + "loss": 2.5846, + "step": 3535 + }, + { + "epoch": 1.78, + "learning_rate": 3.598998075743653e-05, + "loss": 2.8847, + "step": 3540 + }, + { + "epoch": 1.78, + "learning_rate": 3.595455345294372e-05, + "loss": 2.5386, + "step": 3545 + }, + { + "epoch": 1.78, + "learning_rate": 3.5919098901640735e-05, + "loss": 2.5932, + "step": 3550 + }, + { + "epoch": 1.78, + "learning_rate": 3.588361719171223e-05, + "loss": 2.4481, + "step": 3555 + }, + { + "epoch": 1.79, + "learning_rate": 3.584810841141039e-05, + "loss": 2.5773, + "step": 3560 + }, + { + "epoch": 1.79, + "learning_rate": 3.581257264905476e-05, + "loss": 2.6305, + "step": 3565 + }, + { + "epoch": 1.79, + "learning_rate": 3.5777009993031955e-05, + "loss": 2.7, + "step": 3570 + }, + { + "epoch": 1.79, + "learning_rate": 3.574142053179553e-05, + "loss": 2.4408, + "step": 3575 + }, + { + "epoch": 1.8, + "learning_rate": 3.5705804353865677e-05, + "loss": 2.7292, + "step": 3580 + }, + { + "epoch": 1.8, + "learning_rate": 3.567016154782906e-05, + "loss": 2.4371, + "step": 3585 + }, + { + "epoch": 1.8, + "learning_rate": 3.563449220233855e-05, + "loss": 2.7977, + "step": 3590 + }, + { + "epoch": 1.8, + "learning_rate": 3.559879640611305e-05, + "loss": 2.7357, + "step": 3595 + }, + { + "epoch": 1.81, + "learning_rate": 3.5563074247937244e-05, + "loss": 2.6255, + "step": 3600 + }, + { + "epoch": 1.81, + "learning_rate": 3.552732581666139e-05, + "loss": 2.8134, + "step": 3605 + }, + { + "epoch": 1.81, + "learning_rate": 3.549155120120109e-05, + "loss": 2.702, + "step": 3610 + }, + { + "epoch": 1.81, + "learning_rate": 3.545575049053707e-05, + "loss": 2.4854, + "step": 3615 + }, + { + "epoch": 1.82, + "learning_rate": 3.541992377371497e-05, + "loss": 2.7596, + "step": 3620 + }, + { + "epoch": 1.82, + "learning_rate": 3.53840711398451e-05, + "loss": 2.6891, + "step": 3625 + }, + { + "epoch": 1.82, + "learning_rate": 3.5348192678102254e-05, + "loss": 2.5619, + "step": 3630 + }, + { 
+ "epoch": 1.82, + "learning_rate": 3.531228847772544e-05, + "loss": 2.5719, + "step": 3635 + }, + { + "epoch": 1.83, + "learning_rate": 3.52763586280177e-05, + "loss": 2.6297, + "step": 3640 + }, + { + "epoch": 1.83, + "learning_rate": 3.524040321834589e-05, + "loss": 2.5114, + "step": 3645 + }, + { + "epoch": 1.83, + "learning_rate": 3.52044223381404e-05, + "loss": 2.8536, + "step": 3650 + }, + { + "epoch": 1.83, + "learning_rate": 3.516841607689501e-05, + "loss": 2.5271, + "step": 3655 + }, + { + "epoch": 1.84, + "learning_rate": 3.51323845241666e-05, + "loss": 2.5654, + "step": 3660 + }, + { + "epoch": 1.84, + "learning_rate": 3.509632776957497e-05, + "loss": 2.6269, + "step": 3665 + }, + { + "epoch": 1.84, + "learning_rate": 3.50602459028026e-05, + "loss": 2.4401, + "step": 3670 + }, + { + "epoch": 1.84, + "learning_rate": 3.502413901359445e-05, + "loss": 2.4368, + "step": 3675 + }, + { + "epoch": 1.85, + "learning_rate": 3.498800719175768e-05, + "loss": 2.6087, + "step": 3680 + }, + { + "epoch": 1.85, + "learning_rate": 3.4951850527161495e-05, + "loss": 2.5927, + "step": 3685 + }, + { + "epoch": 1.85, + "learning_rate": 3.4915669109736876e-05, + "loss": 2.8352, + "step": 3690 + }, + { + "epoch": 1.85, + "learning_rate": 3.487946302947637e-05, + "loss": 2.8485, + "step": 3695 + }, + { + "epoch": 1.86, + "learning_rate": 3.484323237643389e-05, + "loss": 2.6611, + "step": 3700 + }, + { + "epoch": 1.86, + "learning_rate": 3.4806977240724423e-05, + "loss": 2.5251, + "step": 3705 + }, + { + "epoch": 1.86, + "learning_rate": 3.47706977125239e-05, + "loss": 2.6043, + "step": 3710 + }, + { + "epoch": 1.86, + "learning_rate": 3.473439388206887e-05, + "loss": 2.5911, + "step": 3715 + }, + { + "epoch": 1.87, + "learning_rate": 3.469806583965639e-05, + "loss": 2.7092, + "step": 3720 + }, + { + "epoch": 1.87, + "learning_rate": 3.466171367564368e-05, + "loss": 2.4572, + "step": 3725 + }, + { + "epoch": 1.87, + "learning_rate": 3.462533748044801e-05, + "loss": 2.464, + 
"step": 3730 + }, + { + "epoch": 1.87, + "learning_rate": 3.458893734454636e-05, + "loss": 2.8654, + "step": 3735 + }, + { + "epoch": 1.88, + "learning_rate": 3.455251335847531e-05, + "loss": 2.752, + "step": 3740 + }, + { + "epoch": 1.88, + "learning_rate": 3.451606561283074e-05, + "loss": 2.59, + "step": 3745 + }, + { + "epoch": 1.88, + "learning_rate": 3.447959419826763e-05, + "loss": 2.4732, + "step": 3750 + }, + { + "epoch": 1.88, + "learning_rate": 3.444309920549983e-05, + "loss": 2.7611, + "step": 3755 + }, + { + "epoch": 1.89, + "learning_rate": 3.440658072529983e-05, + "loss": 2.5923, + "step": 3760 + }, + { + "epoch": 1.89, + "learning_rate": 3.437003884849854e-05, + "loss": 2.5405, + "step": 3765 + }, + { + "epoch": 1.89, + "learning_rate": 3.433347366598506e-05, + "loss": 2.6368, + "step": 3770 + }, + { + "epoch": 1.89, + "learning_rate": 3.429688526870649e-05, + "loss": 2.5877, + "step": 3775 + }, + { + "epoch": 1.9, + "learning_rate": 3.4260273747667635e-05, + "loss": 2.5978, + "step": 3780 + }, + { + "epoch": 1.9, + "learning_rate": 3.422363919393082e-05, + "loss": 2.7375, + "step": 3785 + }, + { + "epoch": 1.9, + "learning_rate": 3.418698169861567e-05, + "loss": 2.4161, + "step": 3790 + }, + { + "epoch": 1.9, + "learning_rate": 3.415030135289884e-05, + "loss": 2.3959, + "step": 3795 + }, + { + "epoch": 1.91, + "learning_rate": 3.4113598248013875e-05, + "loss": 2.9642, + "step": 3800 + }, + { + "epoch": 1.91, + "learning_rate": 3.40768724752509e-05, + "loss": 2.7313, + "step": 3805 + }, + { + "epoch": 1.91, + "learning_rate": 3.404012412595639e-05, + "loss": 2.6311, + "step": 3810 + }, + { + "epoch": 1.91, + "learning_rate": 3.400335329153304e-05, + "loss": 2.6268, + "step": 3815 + }, + { + "epoch": 1.92, + "learning_rate": 3.396656006343939e-05, + "loss": 2.5252, + "step": 3820 + }, + { + "epoch": 1.92, + "learning_rate": 3.392974453318975e-05, + "loss": 2.5863, + "step": 3825 + }, + { + "epoch": 1.92, + "learning_rate": 3.3892906792353886e-05, + 
"loss": 2.4848, + "step": 3830 + }, + { + "epoch": 1.92, + "learning_rate": 3.3856046932556775e-05, + "loss": 2.5517, + "step": 3835 + }, + { + "epoch": 1.93, + "learning_rate": 3.3819165045478426e-05, + "loss": 2.6679, + "step": 3840 + }, + { + "epoch": 1.93, + "learning_rate": 3.3782261222853654e-05, + "loss": 2.4474, + "step": 3845 + }, + { + "epoch": 1.93, + "learning_rate": 3.37453355564718e-05, + "loss": 2.5378, + "step": 3850 + }, + { + "epoch": 1.93, + "learning_rate": 3.3708388138176584e-05, + "loss": 2.8168, + "step": 3855 + }, + { + "epoch": 1.94, + "learning_rate": 3.367141905986578e-05, + "loss": 2.6556, + "step": 3860 + }, + { + "epoch": 1.94, + "learning_rate": 3.3634428413491054e-05, + "loss": 2.5122, + "step": 3865 + }, + { + "epoch": 1.94, + "learning_rate": 3.359741629105773e-05, + "loss": 2.7044, + "step": 3870 + }, + { + "epoch": 1.94, + "learning_rate": 3.3560382784624524e-05, + "loss": 2.6193, + "step": 3875 + }, + { + "epoch": 1.95, + "learning_rate": 3.352332798630336e-05, + "loss": 3.0051, + "step": 3880 + }, + { + "epoch": 1.95, + "learning_rate": 3.3486251988259134e-05, + "loss": 2.5351, + "step": 3885 + }, + { + "epoch": 1.95, + "learning_rate": 3.344915488270942e-05, + "loss": 2.7273, + "step": 3890 + }, + { + "epoch": 1.95, + "learning_rate": 3.341203676192433e-05, + "loss": 2.3833, + "step": 3895 + }, + { + "epoch": 1.96, + "learning_rate": 3.3374897718226236e-05, + "loss": 2.5073, + "step": 3900 + }, + { + "epoch": 1.96, + "learning_rate": 3.333773784398957e-05, + "loss": 2.7216, + "step": 3905 + }, + { + "epoch": 1.96, + "learning_rate": 3.330055723164055e-05, + "loss": 2.6666, + "step": 3910 + }, + { + "epoch": 1.96, + "learning_rate": 3.326335597365698e-05, + "loss": 2.4959, + "step": 3915 + }, + { + "epoch": 1.97, + "learning_rate": 3.322613416256802e-05, + "loss": 2.8799, + "step": 3920 + }, + { + "epoch": 1.97, + "learning_rate": 3.3188891890953956e-05, + "loss": 2.6041, + "step": 3925 + }, + { + "epoch": 1.97, + 
"learning_rate": 3.315162925144595e-05, + "loss": 2.7423, + "step": 3930 + }, + { + "epoch": 1.97, + "learning_rate": 3.3114346336725834e-05, + "loss": 2.5765, + "step": 3935 + }, + { + "epoch": 1.98, + "learning_rate": 3.3077043239525874e-05, + "loss": 2.152, + "step": 3940 + }, + { + "epoch": 1.98, + "learning_rate": 3.303972005262852e-05, + "loss": 2.6474, + "step": 3945 + }, + { + "epoch": 1.98, + "learning_rate": 3.3002376868866206e-05, + "loss": 2.473, + "step": 3950 + }, + { + "epoch": 1.98, + "learning_rate": 3.296501378112109e-05, + "loss": 2.6779, + "step": 3955 + }, + { + "epoch": 1.99, + "learning_rate": 3.292763088232485e-05, + "loss": 2.7439, + "step": 3960 + }, + { + "epoch": 1.99, + "learning_rate": 3.289022826545844e-05, + "loss": 2.8874, + "step": 3965 + }, + { + "epoch": 1.99, + "learning_rate": 3.2852806023551834e-05, + "loss": 2.7191, + "step": 3970 + }, + { + "epoch": 1.99, + "learning_rate": 3.281536424968383e-05, + "loss": 2.563, + "step": 3975 + }, + { + "epoch": 2.0, + "learning_rate": 3.2777903036981836e-05, + "loss": 2.7884, + "step": 3980 + }, + { + "epoch": 2.0, + "learning_rate": 3.274042247862158e-05, + "loss": 2.5221, + "step": 3985 + }, + { + "epoch": 2.0, + "learning_rate": 3.270292266782689e-05, + "loss": 2.462, + "step": 3990 + }, + { + "epoch": 2.0, + "learning_rate": 3.266540369786953e-05, + "loss": 2.5059, + "step": 3995 + }, + { + "epoch": 2.01, + "learning_rate": 3.262786566206888e-05, + "loss": 2.688, + "step": 4000 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 2.1173584122426163e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4000/training_args.bin b/checkpoint-4000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-4000/training_args.bin @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475 diff --git a/checkpoint-4500/README.md b/checkpoint-4500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-4500/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-4500/adapter_config.json b/checkpoint-4500/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-4500/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4500/adapter_model.safetensors b/checkpoint-4500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1b251b4968a79b10cbe25c0091d6a61397223d1a --- /dev/null +++ b/checkpoint-4500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6f30b5794b8e7ffe4b244596c31a10c656fd898dca4c14a4240ddf186603fbd +size 7807744 diff --git a/checkpoint-4500/optimizer.pt b/checkpoint-4500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e081d000235ff19da574a78f725d77dc0996250 --- /dev/null +++ b/checkpoint-4500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b60ea38ab4e6de3696a04cce57324cdf705d69b93ee36949a3d16f360e17deab +size 15644485 diff --git a/checkpoint-4500/rng_state.pth b/checkpoint-4500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..110da527956368745aaab1201125738bccd4fdde --- /dev/null +++ b/checkpoint-4500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74bc2172885c9ff5af9eff74447617064c0a26db597d3c6dc30599c93ee30598 +size 14575 diff --git 
a/checkpoint-4500/scheduler.pt b/checkpoint-4500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..15358edf03881bea55b32918df8c79008cefdf22 --- /dev/null +++ b/checkpoint-4500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e52890236d6514c86628654afd12d1c574ddd01f645ce3ba5199df1800652f6 +size 627 diff --git a/checkpoint-4500/special_tokens_map.json b/checkpoint-4500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-4500/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ] +} diff --git a/checkpoint-4500/tokenization_chatglm.py b/checkpoint-4500/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-4500/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == 
self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) + return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab 
+ + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + 
json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. 
+ + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. 
+ if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-4500/tokenizer.model b/checkpoint-4500/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-4500/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-4500/tokenizer_config.json b/checkpoint-4500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ b/checkpoint-4500/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 
1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-4500/trainer_state.json b/checkpoint-4500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ad9cd8c4f4dc405ec92167be9119c75a5fca3c0c --- /dev/null +++ b/checkpoint-4500/trainer_state.json @@ -0,0 +1,5421 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.2581859239744073, + "eval_steps": 500, + "global_step": 4500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 
65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 
165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + "step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 
265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + "step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 
365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + "step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 
465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + 
"step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + 
"step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + "loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + 
"step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + "loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, 
+ "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 
2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 4.8759297640025235e-05, + "loss": 2.9469, + "step": 1005 + }, + { + "epoch": 0.51, + "learning_rate": 4.874700152955661e-05, + "loss": 2.9745, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 4.8734646354147936e-05, + "loss": 2.9341, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 4.8722232144529754e-05, + "loss": 3.0511, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 4.870975893157941e-05, + "loss": 2.5396, + "step": 1025 + }, + { + "epoch": 0.52, + "learning_rate": 4.8697226746321004e-05, + "loss": 2.6699, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 4.868463561992532e-05, + "loss": 2.4887, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 4.867198558370977e-05, + "loss": 2.4773, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 4.865927666913825e-05, + "loss": 2.7612, + "step": 1045 + }, + { + "epoch": 0.53, + "learning_rate": 4.864650890782113e-05, + "loss": 2.9116, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 4.863368233151514e-05, + "loss": 2.8205, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 4.862079697212329e-05, + "loss": 2.6711, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 
4.8607852861694804e-05, + "loss": 3.1138, + "step": 1065 + }, + { + "epoch": 0.54, + "learning_rate": 4.859485003242503e-05, + "loss": 2.5603, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 4.858178851665539e-05, + "loss": 2.7981, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 4.856866834687323e-05, + "loss": 2.507, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 4.855548955571183e-05, + "loss": 3.0315, + "step": 1085 + }, + { + "epoch": 0.55, + "learning_rate": 4.8542252175950244e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 4.852895624051326e-05, + "loss": 2.6794, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 4.851560178247132e-05, + "loss": 2.8278, + "step": 1100 + }, + { + "epoch": 0.55, + "learning_rate": 4.850218883504041e-05, + "loss": 2.6993, + "step": 1105 + }, + { + "epoch": 0.56, + "learning_rate": 4.8488717431582005e-05, + "loss": 2.875, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 4.8475187605602974e-05, + "loss": 2.6057, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 4.84615993907555e-05, + "loss": 2.847, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 4.844795282083697e-05, + "loss": 2.7652, + "step": 1125 + }, + { + "epoch": 0.57, + "learning_rate": 4.843424792978997e-05, + "loss": 2.8128, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 4.842048475170209e-05, + "loss": 2.7077, + "step": 1135 + }, + { + "epoch": 0.57, + "learning_rate": 4.840666332080592e-05, + "loss": 2.7081, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 4.8392783671478934e-05, + "loss": 2.6479, + "step": 1145 + }, + { + "epoch": 0.58, + "learning_rate": 4.837884583824342e-05, + "loss": 2.667, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 4.836484985576638e-05, + "loss": 2.7514, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 4.835079575885944e-05, + "loss": 2.5058, + "step": 1160 + }, + { + "epoch": 0.58, 
+ "learning_rate": 4.833668358247876e-05, + "loss": 2.6183, + "step": 1165 + }, + { + "epoch": 0.59, + "learning_rate": 4.8322513361725006e-05, + "loss": 2.9959, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 4.830828513184317e-05, + "loss": 2.5714, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 4.8293998928222536e-05, + "loss": 2.965, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 4.827965478639661e-05, + "loss": 2.2106, + "step": 1185 + }, + { + "epoch": 0.6, + "learning_rate": 4.8265252742042965e-05, + "loss": 2.7456, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 4.8250792830983225e-05, + "loss": 2.5694, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 4.8236275089182936e-05, + "loss": 2.6826, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 4.8221699552751465e-05, + "loss": 2.878, + "step": 1205 + }, + { + "epoch": 0.61, + "learning_rate": 4.820706625794196e-05, + "loss": 2.8191, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 4.81923752411512e-05, + "loss": 2.739, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 4.8177626538919565e-05, + "loss": 3.0544, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 4.8162820187930875e-05, + "loss": 2.8393, + "step": 1225 + }, + { + "epoch": 0.62, + "learning_rate": 4.814795622501237e-05, + "loss": 2.4457, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 4.813303468713456e-05, + "loss": 2.8575, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 4.8118055611411197e-05, + "loss": 2.8307, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 4.810301903509909e-05, + "loss": 2.7951, + "step": 1245 + }, + { + "epoch": 0.63, + "learning_rate": 4.8087924995598125e-05, + "loss": 2.8456, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 4.807277353045106e-05, + "loss": 2.3564, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 4.8057564677343524e-05, + "loss": 2.5076, + "step": 1260 + 
}, + { + "epoch": 0.63, + "learning_rate": 4.8042298474103884e-05, + "loss": 2.605, + "step": 1265 + }, + { + "epoch": 0.64, + "learning_rate": 4.8026974958703116e-05, + "loss": 2.4782, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 4.8011594169254784e-05, + "loss": 2.7193, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 4.799615614401488e-05, + "loss": 2.8284, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 4.798066092138178e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.65, + "learning_rate": 4.796510853989612e-05, + "loss": 2.7396, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 4.794949903824069e-05, + "loss": 2.7948, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 4.793383245524035e-05, + "loss": 2.7818, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 4.791810882986197e-05, + "loss": 2.7334, + "step": 1305 + }, + { + "epoch": 0.66, + "learning_rate": 4.7902328201214256e-05, + "loss": 2.4824, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 4.7886490608547727e-05, + "loss": 2.6131, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 4.7870596091254584e-05, + "loss": 2.7778, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 4.7854644688868594e-05, + "loss": 2.5263, + "step": 1325 + }, + { + "epoch": 0.67, + "learning_rate": 4.783863644106502e-05, + "loss": 2.7825, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 4.782257138766053e-05, + "loss": 2.7902, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 4.7806449568613066e-05, + "loss": 2.8333, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 4.779027102402177e-05, + "loss": 2.856, + "step": 1345 + }, + { + "epoch": 0.68, + "learning_rate": 4.777403579412686e-05, + "loss": 2.8021, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 4.775774391930956e-05, + "loss": 2.6639, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741395440091976e-05, + 
"loss": 2.5226, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.772499039713702e-05, + "loss": 2.6803, + "step": 1365 + }, + { + "epoch": 0.69, + "learning_rate": 4.7708528831248274e-05, + "loss": 2.4608, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.769201078336991e-05, + "loss": 2.786, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.7675436294586586e-05, + "loss": 2.8294, + "step": 1380 + }, + { + "epoch": 0.7, + "learning_rate": 4.7658805406123356e-05, + "loss": 2.6776, + "step": 1385 + }, + { + "epoch": 0.7, + "learning_rate": 4.7642118159345544e-05, + "loss": 2.8003, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.762537459575865e-05, + "loss": 2.7796, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.7608574757008245e-05, + "loss": 2.9156, + "step": 1400 + }, + { + "epoch": 0.71, + "learning_rate": 4.7591718684879883e-05, + "loss": 3.0521, + "step": 1405 + }, + { + "epoch": 0.71, + "learning_rate": 4.7574806421298976e-05, + "loss": 2.6469, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.755783800833071e-05, + "loss": 2.8512, + "step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.754081348817991e-05, + "loss": 2.66, + "step": 1420 + }, + { + "epoch": 0.72, + "learning_rate": 4.752373290319096e-05, + "loss": 2.6625, + "step": 1425 + }, + { + "epoch": 0.72, + "learning_rate": 4.7506596295847716e-05, + "loss": 2.6711, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 4.7489403708773346e-05, + "loss": 2.8951, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 4.747215518473026e-05, + "loss": 2.7375, + "step": 1440 + }, + { + "epoch": 0.73, + "learning_rate": 4.745485076662e-05, + "loss": 2.8037, + "step": 1445 + }, + { + "epoch": 0.73, + "learning_rate": 4.743749049748315e-05, + "loss": 2.5375, + "step": 1450 + }, + { + "epoch": 0.73, + "learning_rate": 4.742007442049918e-05, + "loss": 2.7664, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 
4.7402602578986374e-05, + "loss": 2.9644, + "step": 1460 + }, + { + "epoch": 0.74, + "learning_rate": 4.738507501640175e-05, + "loss": 2.8212, + "step": 1465 + }, + { + "epoch": 0.74, + "learning_rate": 4.736749177634087e-05, + "loss": 2.7201, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 4.734985290253782e-05, + "loss": 2.7087, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 4.7332158438865035e-05, + "loss": 2.5502, + "step": 1480 + }, + { + "epoch": 0.75, + "learning_rate": 4.731440842933322e-05, + "loss": 2.7607, + "step": 1485 + }, + { + "epoch": 0.75, + "learning_rate": 4.729660291809126e-05, + "loss": 2.5601, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 4.727874194942606e-05, + "loss": 2.5562, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 4.7260825567762486e-05, + "loss": 2.5539, + "step": 1500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7242853817663204e-05, + "loss": 2.6405, + "step": 1505 + }, + { + "epoch": 0.76, + "learning_rate": 4.72248267438286e-05, + "loss": 2.9237, + "step": 1510 + }, + { + "epoch": 0.76, + "learning_rate": 4.72067443910967e-05, + "loss": 3.0453, + "step": 1515 + }, + { + "epoch": 0.76, + "learning_rate": 4.718860680444297e-05, + "loss": 2.7176, + "step": 1520 + }, + { + "epoch": 0.77, + "learning_rate": 4.71704140289803e-05, + "loss": 2.6713, + "step": 1525 + }, + { + "epoch": 0.77, + "learning_rate": 4.715216610995883e-05, + "loss": 2.7284, + "step": 1530 + }, + { + "epoch": 0.77, + "learning_rate": 4.713386309276585e-05, + "loss": 2.5022, + "step": 1535 + }, + { + "epoch": 0.77, + "learning_rate": 4.7115505022925706e-05, + "loss": 2.8323, + "step": 1540 + }, + { + "epoch": 0.78, + "learning_rate": 4.7097091946099666e-05, + "loss": 2.7005, + "step": 1545 + }, + { + "epoch": 0.78, + "learning_rate": 4.7078623908085825e-05, + "loss": 2.8142, + "step": 1550 + }, + { + "epoch": 0.78, + "learning_rate": 4.7060100954818974e-05, + "loss": 2.8505, + "step": 1555 + }, + { + "epoch": 
0.78, + "learning_rate": 4.70415231323705e-05, + "loss": 2.5892, + "step": 1560 + }, + { + "epoch": 0.79, + "learning_rate": 4.7022890486948236e-05, + "loss": 2.8951, + "step": 1565 + }, + { + "epoch": 0.79, + "learning_rate": 4.700420306489641e-05, + "loss": 2.7551, + "step": 1570 + }, + { + "epoch": 0.79, + "learning_rate": 4.698546091269547e-05, + "loss": 2.6814, + "step": 1575 + }, + { + "epoch": 0.79, + "learning_rate": 4.696666407696201e-05, + "loss": 2.8698, + "step": 1580 + }, + { + "epoch": 0.8, + "learning_rate": 4.694781260444862e-05, + "loss": 2.8389, + "step": 1585 + }, + { + "epoch": 0.8, + "learning_rate": 4.6928906542043786e-05, + "loss": 2.8353, + "step": 1590 + }, + { + "epoch": 0.8, + "learning_rate": 4.690994593677179e-05, + "loss": 2.6565, + "step": 1595 + }, + { + "epoch": 0.8, + "learning_rate": 4.689093083579256e-05, + "loss": 2.6958, + "step": 1600 + }, + { + "epoch": 0.81, + "learning_rate": 4.687186128640157e-05, + "loss": 2.6768, + "step": 1605 + }, + { + "epoch": 0.81, + "learning_rate": 4.685273733602975e-05, + "loss": 2.734, + "step": 1610 + }, + { + "epoch": 0.81, + "learning_rate": 4.6833559032243284e-05, + "loss": 2.5348, + "step": 1615 + }, + { + "epoch": 0.81, + "learning_rate": 4.6814326422743594e-05, + "loss": 2.7775, + "step": 1620 + }, + { + "epoch": 0.82, + "learning_rate": 4.679503955536715e-05, + "loss": 2.6578, + "step": 1625 + }, + { + "epoch": 0.82, + "learning_rate": 4.6775698478085393e-05, + "loss": 2.8792, + "step": 1630 + }, + { + "epoch": 0.82, + "learning_rate": 4.675630323900458e-05, + "loss": 2.7883, + "step": 1635 + }, + { + "epoch": 0.82, + "learning_rate": 4.67368538863657e-05, + "loss": 2.8824, + "step": 1640 + }, + { + "epoch": 0.83, + "learning_rate": 4.671735046854433e-05, + "loss": 2.6775, + "step": 1645 + }, + { + "epoch": 0.83, + "learning_rate": 4.669779303405051e-05, + "loss": 2.5658, + "step": 1650 + }, + { + "epoch": 0.83, + "learning_rate": 4.667818163152864e-05, + "loss": 2.5262, + "step": 1655 + 
}, + { + "epoch": 0.83, + "learning_rate": 4.665851630975736e-05, + "loss": 2.5749, + "step": 1660 + }, + { + "epoch": 0.84, + "learning_rate": 4.6638797117649424e-05, + "loss": 2.8718, + "step": 1665 + }, + { + "epoch": 0.84, + "learning_rate": 4.661902410425155e-05, + "loss": 2.8039, + "step": 1670 + }, + { + "epoch": 0.84, + "learning_rate": 4.659919731874435e-05, + "loss": 2.8089, + "step": 1675 + }, + { + "epoch": 0.84, + "learning_rate": 4.6579316810442174e-05, + "loss": 2.8144, + "step": 1680 + }, + { + "epoch": 0.85, + "learning_rate": 4.6559382628792995e-05, + "loss": 2.5963, + "step": 1685 + }, + { + "epoch": 0.85, + "learning_rate": 4.653939482337828e-05, + "loss": 2.5755, + "step": 1690 + }, + { + "epoch": 0.85, + "learning_rate": 4.651935344391286e-05, + "loss": 2.7631, + "step": 1695 + }, + { + "epoch": 0.85, + "learning_rate": 4.649925854024486e-05, + "loss": 2.8117, + "step": 1700 + }, + { + "epoch": 0.86, + "learning_rate": 4.647911016235549e-05, + "loss": 2.7352, + "step": 1705 + }, + { + "epoch": 0.86, + "learning_rate": 4.6458908360358985e-05, + "loss": 2.5572, + "step": 1710 + }, + { + "epoch": 0.86, + "learning_rate": 4.643865318450246e-05, + "loss": 2.8372, + "step": 1715 + }, + { + "epoch": 0.86, + "learning_rate": 4.6418344685165774e-05, + "loss": 2.7892, + "step": 1720 + }, + { + "epoch": 0.87, + "learning_rate": 4.639798291286143e-05, + "loss": 2.7832, + "step": 1725 + }, + { + "epoch": 0.87, + "learning_rate": 4.637756791823442e-05, + "loss": 2.5401, + "step": 1730 + }, + { + "epoch": 0.87, + "learning_rate": 4.635709975206213e-05, + "loss": 2.9286, + "step": 1735 + }, + { + "epoch": 0.87, + "learning_rate": 4.633657846525417e-05, + "loss": 2.6474, + "step": 1740 + }, + { + "epoch": 0.88, + "learning_rate": 4.6316004108852305e-05, + "loss": 2.5047, + "step": 1745 + }, + { + "epoch": 0.88, + "learning_rate": 4.629537673403029e-05, + "loss": 2.8374, + "step": 1750 + }, + { + "epoch": 0.88, + "learning_rate": 4.627469639209373e-05, + 
"loss": 2.634, + "step": 1755 + }, + { + "epoch": 0.88, + "learning_rate": 4.6253963134480006e-05, + "loss": 2.6884, + "step": 1760 + }, + { + "epoch": 0.89, + "learning_rate": 4.623317701275809e-05, + "loss": 2.6334, + "step": 1765 + }, + { + "epoch": 0.89, + "learning_rate": 4.621233807862844e-05, + "loss": 2.764, + "step": 1770 + }, + { + "epoch": 0.89, + "learning_rate": 4.6191446383922886e-05, + "loss": 3.0864, + "step": 1775 + }, + { + "epoch": 0.89, + "learning_rate": 4.617050198060448e-05, + "loss": 2.7964, + "step": 1780 + }, + { + "epoch": 0.9, + "learning_rate": 4.6149504920767376e-05, + "loss": 2.4538, + "step": 1785 + }, + { + "epoch": 0.9, + "learning_rate": 4.6128455256636706e-05, + "loss": 2.7352, + "step": 1790 + }, + { + "epoch": 0.9, + "learning_rate": 4.6107353040568416e-05, + "loss": 2.6893, + "step": 1795 + }, + { + "epoch": 0.9, + "learning_rate": 4.6086198325049185e-05, + "loss": 2.8609, + "step": 1800 + }, + { + "epoch": 0.91, + "learning_rate": 4.6064991162696275e-05, + "loss": 2.5285, + "step": 1805 + }, + { + "epoch": 0.91, + "learning_rate": 4.604373160625739e-05, + "loss": 2.5707, + "step": 1810 + }, + { + "epoch": 0.91, + "learning_rate": 4.602241970861053e-05, + "loss": 2.7198, + "step": 1815 + }, + { + "epoch": 0.91, + "learning_rate": 4.6001055522763926e-05, + "loss": 2.855, + "step": 1820 + }, + { + "epoch": 0.92, + "learning_rate": 4.597963910185582e-05, + "loss": 2.4175, + "step": 1825 + }, + { + "epoch": 0.92, + "learning_rate": 4.595817049915441e-05, + "loss": 2.5444, + "step": 1830 + }, + { + "epoch": 0.92, + "learning_rate": 4.5936649768057646e-05, + "loss": 2.6893, + "step": 1835 + }, + { + "epoch": 0.92, + "learning_rate": 4.591507696209318e-05, + "loss": 2.6725, + "step": 1840 + }, + { + "epoch": 0.93, + "learning_rate": 4.589345213491817e-05, + "loss": 2.834, + "step": 1845 + }, + { + "epoch": 0.93, + "learning_rate": 4.587177534031914e-05, + "loss": 2.8259, + "step": 1850 + }, + { + "epoch": 0.93, + "learning_rate": 
4.585004663221188e-05, + "loss": 2.6146, + "step": 1855 + }, + { + "epoch": 0.93, + "learning_rate": 4.582826606464134e-05, + "loss": 2.4673, + "step": 1860 + }, + { + "epoch": 0.94, + "learning_rate": 4.5806433691781416e-05, + "loss": 2.9178, + "step": 1865 + }, + { + "epoch": 0.94, + "learning_rate": 4.578454956793487e-05, + "loss": 2.9224, + "step": 1870 + }, + { + "epoch": 0.94, + "learning_rate": 4.576261374753318e-05, + "loss": 2.7987, + "step": 1875 + }, + { + "epoch": 0.94, + "learning_rate": 4.574062628513642e-05, + "loss": 2.3848, + "step": 1880 + }, + { + "epoch": 0.95, + "learning_rate": 4.57185872354331e-05, + "loss": 2.6054, + "step": 1885 + }, + { + "epoch": 0.95, + "learning_rate": 4.569649665324003e-05, + "loss": 2.6743, + "step": 1890 + }, + { + "epoch": 0.95, + "learning_rate": 4.567435459350222e-05, + "loss": 2.9375, + "step": 1895 + }, + { + "epoch": 0.95, + "learning_rate": 4.565216111129269e-05, + "loss": 2.9372, + "step": 1900 + }, + { + "epoch": 0.96, + "learning_rate": 4.562991626181239e-05, + "loss": 2.669, + "step": 1905 + }, + { + "epoch": 0.96, + "learning_rate": 4.560762010039001e-05, + "loss": 2.7644, + "step": 1910 + }, + { + "epoch": 0.96, + "learning_rate": 4.558527268248187e-05, + "loss": 2.6886, + "step": 1915 + }, + { + "epoch": 0.96, + "learning_rate": 4.55628740636718e-05, + "loss": 2.8618, + "step": 1920 + }, + { + "epoch": 0.97, + "learning_rate": 4.554042429967095e-05, + "loss": 2.8411, + "step": 1925 + }, + { + "epoch": 0.97, + "learning_rate": 4.55179234463177e-05, + "loss": 2.6047, + "step": 1930 + }, + { + "epoch": 0.97, + "learning_rate": 4.5495371559577496e-05, + "loss": 2.8675, + "step": 1935 + }, + { + "epoch": 0.97, + "learning_rate": 4.547276869554271e-05, + "loss": 2.751, + "step": 1940 + }, + { + "epoch": 0.98, + "learning_rate": 4.545011491043253e-05, + "loss": 2.8638, + "step": 1945 + }, + { + "epoch": 0.98, + "learning_rate": 4.5427410260592775e-05, + "loss": 2.5092, + "step": 1950 + }, + { + "epoch": 0.98, 
+ "learning_rate": 4.540465480249579e-05, + "loss": 2.5059, + "step": 1955 + }, + { + "epoch": 0.98, + "learning_rate": 4.5381848592740285e-05, + "loss": 2.9433, + "step": 1960 + }, + { + "epoch": 0.99, + "learning_rate": 4.535899168805121e-05, + "loss": 2.6959, + "step": 1965 + }, + { + "epoch": 0.99, + "learning_rate": 4.533608414527961e-05, + "loss": 2.552, + "step": 1970 + }, + { + "epoch": 0.99, + "learning_rate": 4.5313126021402465e-05, + "loss": 2.7169, + "step": 1975 + }, + { + "epoch": 0.99, + "learning_rate": 4.529011737352258e-05, + "loss": 2.5672, + "step": 1980 + }, + { + "epoch": 1.0, + "learning_rate": 4.526705825886841e-05, + "loss": 2.6434, + "step": 1985 + }, + { + "epoch": 1.0, + "learning_rate": 4.5243948734793947e-05, + "loss": 2.8062, + "step": 1990 + }, + { + "epoch": 1.0, + "learning_rate": 4.5220788858778556e-05, + "loss": 2.6929, + "step": 1995 + }, + { + "epoch": 1.0, + "learning_rate": 4.519757868842684e-05, + "loss": 2.5837, + "step": 2000 + }, + { + "epoch": 1.01, + "learning_rate": 4.517431828146852e-05, + "loss": 2.821, + "step": 2005 + }, + { + "epoch": 1.01, + "learning_rate": 4.515100769575824e-05, + "loss": 2.7913, + "step": 2010 + }, + { + "epoch": 1.01, + "learning_rate": 4.512764698927545e-05, + "loss": 2.699, + "step": 2015 + }, + { + "epoch": 1.01, + "learning_rate": 4.5104236220124286e-05, + "loss": 2.6574, + "step": 2020 + }, + { + "epoch": 1.02, + "learning_rate": 4.508077544653338e-05, + "loss": 2.5909, + "step": 2025 + }, + { + "epoch": 1.02, + "learning_rate": 4.5057264726855765e-05, + "loss": 2.5695, + "step": 2030 + }, + { + "epoch": 1.02, + "learning_rate": 4.5033704119568675e-05, + "loss": 2.4937, + "step": 2035 + }, + { + "epoch": 1.02, + "learning_rate": 4.501009368327344e-05, + "loss": 2.7253, + "step": 2040 + }, + { + "epoch": 1.03, + "learning_rate": 4.4986433476695334e-05, + "loss": 2.8681, + "step": 2045 + }, + { + "epoch": 1.03, + "learning_rate": 4.496272355868341e-05, + "loss": 2.74, + "step": 2050 + }, + 
{ + "epoch": 1.03, + "learning_rate": 4.4938963988210365e-05, + "loss": 2.7439, + "step": 2055 + }, + { + "epoch": 1.03, + "learning_rate": 4.491515482437242e-05, + "loss": 2.9539, + "step": 2060 + }, + { + "epoch": 1.04, + "learning_rate": 4.4891296126389104e-05, + "loss": 2.8165, + "step": 2065 + }, + { + "epoch": 1.04, + "learning_rate": 4.48673879536032e-05, + "loss": 2.601, + "step": 2070 + }, + { + "epoch": 1.04, + "learning_rate": 4.484343036548051e-05, + "loss": 2.5189, + "step": 2075 + }, + { + "epoch": 1.04, + "learning_rate": 4.481942342160976e-05, + "loss": 2.6276, + "step": 2080 + }, + { + "epoch": 1.05, + "learning_rate": 4.479536718170243e-05, + "loss": 2.5027, + "step": 2085 + }, + { + "epoch": 1.05, + "learning_rate": 4.477126170559262e-05, + "loss": 2.8654, + "step": 2090 + }, + { + "epoch": 1.05, + "learning_rate": 4.474710705323688e-05, + "loss": 2.8511, + "step": 2095 + }, + { + "epoch": 1.05, + "learning_rate": 4.47229032847141e-05, + "loss": 2.6129, + "step": 2100 + }, + { + "epoch": 1.06, + "learning_rate": 4.469865046022531e-05, + "loss": 2.8964, + "step": 2105 + }, + { + "epoch": 1.06, + "learning_rate": 4.4674348640093554e-05, + "loss": 2.7502, + "step": 2110 + }, + { + "epoch": 1.06, + "learning_rate": 4.4649997884763765e-05, + "loss": 2.591, + "step": 2115 + }, + { + "epoch": 1.06, + "learning_rate": 4.462559825480257e-05, + "loss": 2.7982, + "step": 2120 + }, + { + "epoch": 1.07, + "learning_rate": 4.460114981089815e-05, + "loss": 2.576, + "step": 2125 + }, + { + "epoch": 1.07, + "learning_rate": 4.457665261386014e-05, + "loss": 2.692, + "step": 2130 + }, + { + "epoch": 1.07, + "learning_rate": 4.455210672461938e-05, + "loss": 2.5818, + "step": 2135 + }, + { + "epoch": 1.07, + "learning_rate": 4.452751220422787e-05, + "loss": 2.6792, + "step": 2140 + }, + { + "epoch": 1.08, + "learning_rate": 4.450286911385856e-05, + "loss": 2.8352, + "step": 2145 + }, + { + "epoch": 1.08, + "learning_rate": 4.4478177514805166e-05, + "loss": 2.5787, + 
"step": 2150 + }, + { + "epoch": 1.08, + "learning_rate": 4.4453437468482103e-05, + "loss": 2.655, + "step": 2155 + }, + { + "epoch": 1.08, + "learning_rate": 4.442864903642428e-05, + "loss": 2.7664, + "step": 2160 + }, + { + "epoch": 1.09, + "learning_rate": 4.440381228028692e-05, + "loss": 2.7231, + "step": 2165 + }, + { + "epoch": 1.09, + "learning_rate": 4.437892726184548e-05, + "loss": 2.416, + "step": 2170 + }, + { + "epoch": 1.09, + "learning_rate": 4.4353994042995446e-05, + "loss": 2.4619, + "step": 2175 + }, + { + "epoch": 1.09, + "learning_rate": 4.4329012685752183e-05, + "loss": 2.7816, + "step": 2180 + }, + { + "epoch": 1.1, + "learning_rate": 4.430398325225078e-05, + "loss": 2.8678, + "step": 2185 + }, + { + "epoch": 1.1, + "learning_rate": 4.427890580474594e-05, + "loss": 2.6007, + "step": 2190 + }, + { + "epoch": 1.1, + "learning_rate": 4.4253780405611754e-05, + "loss": 2.8195, + "step": 2195 + }, + { + "epoch": 1.1, + "learning_rate": 4.42286071173416e-05, + "loss": 2.5117, + "step": 2200 + }, + { + "epoch": 1.11, + "learning_rate": 4.4203386002547956e-05, + "loss": 2.5527, + "step": 2205 + }, + { + "epoch": 1.11, + "learning_rate": 4.417811712396226e-05, + "loss": 2.662, + "step": 2210 + }, + { + "epoch": 1.11, + "learning_rate": 4.415280054443477e-05, + "loss": 2.7563, + "step": 2215 + }, + { + "epoch": 1.11, + "learning_rate": 4.4127436326934354e-05, + "loss": 2.5118, + "step": 2220 + }, + { + "epoch": 1.12, + "learning_rate": 4.41020245345484e-05, + "loss": 2.7208, + "step": 2225 + }, + { + "epoch": 1.12, + "learning_rate": 4.4076565230482607e-05, + "loss": 2.649, + "step": 2230 + }, + { + "epoch": 1.12, + "learning_rate": 4.4051058478060856e-05, + "loss": 2.652, + "step": 2235 + }, + { + "epoch": 1.12, + "learning_rate": 4.402550434072505e-05, + "loss": 2.6885, + "step": 2240 + }, + { + "epoch": 1.13, + "learning_rate": 4.3999902882034935e-05, + "loss": 2.8545, + "step": 2245 + }, + { + "epoch": 1.13, + "learning_rate": 4.397425416566797e-05, + 
"loss": 2.9435, + "step": 2250 + }, + { + "epoch": 1.13, + "learning_rate": 4.3948558255419146e-05, + "loss": 2.6555, + "step": 2255 + }, + { + "epoch": 1.13, + "learning_rate": 4.392281521520085e-05, + "loss": 2.6108, + "step": 2260 + }, + { + "epoch": 1.14, + "learning_rate": 4.389702510904269e-05, + "loss": 2.6256, + "step": 2265 + }, + { + "epoch": 1.14, + "learning_rate": 4.387118800109133e-05, + "loss": 2.7996, + "step": 2270 + }, + { + "epoch": 1.14, + "learning_rate": 4.384530395561035e-05, + "loss": 2.6649, + "step": 2275 + }, + { + "epoch": 1.14, + "learning_rate": 4.381937303698006e-05, + "loss": 3.0073, + "step": 2280 + }, + { + "epoch": 1.15, + "learning_rate": 4.379339530969738e-05, + "loss": 2.8493, + "step": 2285 + }, + { + "epoch": 1.15, + "learning_rate": 4.3767370838375635e-05, + "loss": 2.5572, + "step": 2290 + }, + { + "epoch": 1.15, + "learning_rate": 4.374129968774443e-05, + "loss": 2.3837, + "step": 2295 + }, + { + "epoch": 1.15, + "learning_rate": 4.371518192264946e-05, + "loss": 2.4604, + "step": 2300 + }, + { + "epoch": 1.16, + "learning_rate": 4.3689017608052374e-05, + "loss": 2.8866, + "step": 2305 + }, + { + "epoch": 1.16, + "learning_rate": 4.3662806809030585e-05, + "loss": 2.8943, + "step": 2310 + }, + { + "epoch": 1.16, + "learning_rate": 4.3636549590777144e-05, + "loss": 2.7629, + "step": 2315 + }, + { + "epoch": 1.16, + "learning_rate": 4.361024601860054e-05, + "loss": 2.8629, + "step": 2320 + }, + { + "epoch": 1.17, + "learning_rate": 4.3583896157924574e-05, + "loss": 2.7952, + "step": 2325 + }, + { + "epoch": 1.17, + "learning_rate": 4.355750007428817e-05, + "loss": 2.2057, + "step": 2330 + }, + { + "epoch": 1.17, + "learning_rate": 4.3531057833345216e-05, + "loss": 2.9143, + "step": 2335 + }, + { + "epoch": 1.17, + "learning_rate": 4.3504569500864424e-05, + "loss": 2.5354, + "step": 2340 + }, + { + "epoch": 1.18, + "learning_rate": 4.347803514272911e-05, + "loss": 2.7451, + "step": 2345 + }, + { + "epoch": 1.18, + 
"learning_rate": 4.3451454824937113e-05, + "loss": 3.0827, + "step": 2350 + }, + { + "epoch": 1.18, + "learning_rate": 4.3424828613600555e-05, + "loss": 2.5842, + "step": 2355 + }, + { + "epoch": 1.18, + "learning_rate": 4.339815657494572e-05, + "loss": 2.4492, + "step": 2360 + }, + { + "epoch": 1.19, + "learning_rate": 4.3371438775312865e-05, + "loss": 2.5067, + "step": 2365 + }, + { + "epoch": 1.19, + "learning_rate": 4.334467528115608e-05, + "loss": 2.5902, + "step": 2370 + }, + { + "epoch": 1.19, + "learning_rate": 4.33178661590431e-05, + "loss": 2.8618, + "step": 2375 + }, + { + "epoch": 1.19, + "learning_rate": 4.329101147565515e-05, + "loss": 2.6831, + "step": 2380 + }, + { + "epoch": 1.2, + "learning_rate": 4.3264111297786794e-05, + "loss": 2.7531, + "step": 2385 + }, + { + "epoch": 1.2, + "learning_rate": 4.323716569234572e-05, + "loss": 2.819, + "step": 2390 + }, + { + "epoch": 1.2, + "learning_rate": 4.321017472635263e-05, + "loss": 2.6319, + "step": 2395 + }, + { + "epoch": 1.2, + "learning_rate": 4.318313846694105e-05, + "loss": 2.3078, + "step": 2400 + }, + { + "epoch": 1.21, + "learning_rate": 4.315605698135714e-05, + "loss": 2.7962, + "step": 2405 + }, + { + "epoch": 1.21, + "learning_rate": 4.312893033695958e-05, + "loss": 2.8479, + "step": 2410 + }, + { + "epoch": 1.21, + "learning_rate": 4.310175860121936e-05, + "loss": 2.6289, + "step": 2415 + }, + { + "epoch": 1.21, + "learning_rate": 4.30745418417196e-05, + "loss": 2.7003, + "step": 2420 + }, + { + "epoch": 1.22, + "learning_rate": 4.304728012615543e-05, + "loss": 2.4947, + "step": 2425 + }, + { + "epoch": 1.22, + "learning_rate": 4.3019973522333815e-05, + "loss": 2.6911, + "step": 2430 + }, + { + "epoch": 1.22, + "learning_rate": 4.2992622098173334e-05, + "loss": 2.6931, + "step": 2435 + }, + { + "epoch": 1.22, + "learning_rate": 4.296522592170406e-05, + "loss": 2.774, + "step": 2440 + }, + { + "epoch": 1.23, + "learning_rate": 4.293778506106737e-05, + "loss": 2.759, + "step": 2445 + }, + { + 
"epoch": 1.23, + "learning_rate": 4.29102995845158e-05, + "loss": 2.5543, + "step": 2450 + }, + { + "epoch": 1.23, + "learning_rate": 4.288276956041284e-05, + "loss": 2.7702, + "step": 2455 + }, + { + "epoch": 1.23, + "learning_rate": 4.285519505723278e-05, + "loss": 2.835, + "step": 2460 + }, + { + "epoch": 1.24, + "learning_rate": 4.282757614356055e-05, + "loss": 2.6516, + "step": 2465 + }, + { + "epoch": 1.24, + "learning_rate": 4.2799912888091544e-05, + "loss": 2.7392, + "step": 2470 + }, + { + "epoch": 1.24, + "learning_rate": 4.277220535963143e-05, + "loss": 2.6522, + "step": 2475 + }, + { + "epoch": 1.24, + "learning_rate": 4.274445362709601e-05, + "loss": 2.6514, + "step": 2480 + }, + { + "epoch": 1.25, + "learning_rate": 4.271665775951104e-05, + "loss": 2.7507, + "step": 2485 + }, + { + "epoch": 1.25, + "learning_rate": 4.2688817826012005e-05, + "loss": 3.0499, + "step": 2490 + }, + { + "epoch": 1.25, + "learning_rate": 4.2660933895844055e-05, + "loss": 2.6927, + "step": 2495 + }, + { + "epoch": 1.25, + "learning_rate": 4.2633006038361736e-05, + "loss": 2.8456, + "step": 2500 + }, + { + "epoch": 1.26, + "learning_rate": 4.2605034323028844e-05, + "loss": 2.671, + "step": 2505 + }, + { + "epoch": 1.26, + "learning_rate": 4.2577018819418296e-05, + "loss": 2.6543, + "step": 2510 + }, + { + "epoch": 1.26, + "learning_rate": 4.254895959721189e-05, + "loss": 2.566, + "step": 2515 + }, + { + "epoch": 1.26, + "learning_rate": 4.252085672620019e-05, + "loss": 2.4943, + "step": 2520 + }, + { + "epoch": 1.27, + "learning_rate": 4.249271027628228e-05, + "loss": 2.5115, + "step": 2525 + }, + { + "epoch": 1.27, + "learning_rate": 4.24645203174657e-05, + "loss": 2.3859, + "step": 2530 + }, + { + "epoch": 1.27, + "learning_rate": 4.243628691986617e-05, + "loss": 2.8148, + "step": 2535 + }, + { + "epoch": 1.27, + "learning_rate": 4.240801015370743e-05, + "loss": 2.5597, + "step": 2540 + }, + { + "epoch": 1.28, + "learning_rate": 4.2379690089321145e-05, + "loss": 2.805, + 
"step": 2545 + }, + { + "epoch": 1.28, + "learning_rate": 4.235132679714664e-05, + "loss": 2.7308, + "step": 2550 + }, + { + "epoch": 1.28, + "learning_rate": 4.232292034773076e-05, + "loss": 2.675, + "step": 2555 + }, + { + "epoch": 1.28, + "learning_rate": 4.2294470811727704e-05, + "loss": 2.6359, + "step": 2560 + }, + { + "epoch": 1.29, + "learning_rate": 4.226597825989883e-05, + "loss": 2.4288, + "step": 2565 + }, + { + "epoch": 1.29, + "learning_rate": 4.223744276311249e-05, + "loss": 2.642, + "step": 2570 + }, + { + "epoch": 1.29, + "learning_rate": 4.220886439234385e-05, + "loss": 2.7375, + "step": 2575 + }, + { + "epoch": 1.29, + "learning_rate": 4.218024321867472e-05, + "loss": 2.6114, + "step": 2580 + }, + { + "epoch": 1.3, + "learning_rate": 4.2151579313293364e-05, + "loss": 2.4941, + "step": 2585 + }, + { + "epoch": 1.3, + "learning_rate": 4.212287274749434e-05, + "loss": 2.6086, + "step": 2590 + }, + { + "epoch": 1.3, + "learning_rate": 4.2094123592678295e-05, + "loss": 2.7854, + "step": 2595 + }, + { + "epoch": 1.3, + "learning_rate": 4.206533192035184e-05, + "loss": 2.8291, + "step": 2600 + }, + { + "epoch": 1.31, + "learning_rate": 4.2036497802127294e-05, + "loss": 2.6846, + "step": 2605 + }, + { + "epoch": 1.31, + "learning_rate": 4.200762130972259e-05, + "loss": 2.6667, + "step": 2610 + }, + { + "epoch": 1.31, + "learning_rate": 4.197870251496103e-05, + "loss": 2.7895, + "step": 2615 + }, + { + "epoch": 1.31, + "learning_rate": 4.1949741489771155e-05, + "loss": 2.6958, + "step": 2620 + }, + { + "epoch": 1.32, + "learning_rate": 4.192073830618652e-05, + "loss": 2.8536, + "step": 2625 + }, + { + "epoch": 1.32, + "learning_rate": 4.189169303634555e-05, + "loss": 2.5218, + "step": 2630 + }, + { + "epoch": 1.32, + "learning_rate": 4.186260575249136e-05, + "loss": 2.6141, + "step": 2635 + }, + { + "epoch": 1.32, + "learning_rate": 4.1833476526971546e-05, + "loss": 2.6231, + "step": 2640 + }, + { + "epoch": 1.33, + "learning_rate": 
4.1804305432238036e-05, + "loss": 2.8229, + "step": 2645 + }, + { + "epoch": 1.33, + "learning_rate": 4.1775092540846885e-05, + "loss": 2.4763, + "step": 2650 + }, + { + "epoch": 1.33, + "learning_rate": 4.174583792545813e-05, + "loss": 2.5162, + "step": 2655 + }, + { + "epoch": 1.33, + "learning_rate": 4.1716541658835574e-05, + "loss": 2.7301, + "step": 2660 + }, + { + "epoch": 1.34, + "learning_rate": 4.16872038138466e-05, + "loss": 2.6055, + "step": 2665 + }, + { + "epoch": 1.34, + "learning_rate": 4.165782446346203e-05, + "loss": 2.6529, + "step": 2670 + }, + { + "epoch": 1.34, + "learning_rate": 4.162840368075591e-05, + "loss": 2.7934, + "step": 2675 + }, + { + "epoch": 1.34, + "learning_rate": 4.159894153890536e-05, + "loss": 2.9108, + "step": 2680 + }, + { + "epoch": 1.35, + "learning_rate": 4.1569438111190326e-05, + "loss": 2.6543, + "step": 2685 + }, + { + "epoch": 1.35, + "learning_rate": 4.1539893470993496e-05, + "loss": 2.6038, + "step": 2690 + }, + { + "epoch": 1.35, + "learning_rate": 4.151030769180002e-05, + "loss": 2.5741, + "step": 2695 + }, + { + "epoch": 1.35, + "learning_rate": 4.14806808471974e-05, + "loss": 2.8525, + "step": 2700 + }, + { + "epoch": 1.36, + "learning_rate": 4.1451013010875275e-05, + "loss": 2.5151, + "step": 2705 + }, + { + "epoch": 1.36, + "learning_rate": 4.1421304256625206e-05, + "loss": 2.5666, + "step": 2710 + }, + { + "epoch": 1.36, + "learning_rate": 4.139155465834058e-05, + "loss": 2.7287, + "step": 2715 + }, + { + "epoch": 1.36, + "learning_rate": 4.136176429001634e-05, + "loss": 2.8867, + "step": 2720 + }, + { + "epoch": 1.37, + "learning_rate": 4.133193322574885e-05, + "loss": 2.6398, + "step": 2725 + }, + { + "epoch": 1.37, + "learning_rate": 4.130206153973568e-05, + "loss": 2.699, + "step": 2730 + }, + { + "epoch": 1.37, + "learning_rate": 4.1272149306275447e-05, + "loss": 2.5248, + "step": 2735 + }, + { + "epoch": 1.37, + "learning_rate": 4.124219659976762e-05, + "loss": 2.8947, + "step": 2740 + }, + { + "epoch": 
1.38, + "learning_rate": 4.1212203494712344e-05, + "loss": 2.6473, + "step": 2745 + }, + { + "epoch": 1.38, + "learning_rate": 4.1182170065710226e-05, + "loss": 2.3734, + "step": 2750 + }, + { + "epoch": 1.38, + "learning_rate": 4.115209638746218e-05, + "loss": 2.685, + "step": 2755 + }, + { + "epoch": 1.39, + "learning_rate": 4.1121982534769224e-05, + "loss": 2.9712, + "step": 2760 + }, + { + "epoch": 1.39, + "learning_rate": 4.109182858253231e-05, + "loss": 2.6578, + "step": 2765 + }, + { + "epoch": 1.39, + "learning_rate": 4.106163460575212e-05, + "loss": 2.6842, + "step": 2770 + }, + { + "epoch": 1.39, + "learning_rate": 4.10314006795289e-05, + "loss": 2.511, + "step": 2775 + }, + { + "epoch": 1.4, + "learning_rate": 4.100112687906224e-05, + "loss": 2.8426, + "step": 2780 + }, + { + "epoch": 1.4, + "learning_rate": 4.097081327965092e-05, + "loss": 2.8462, + "step": 2785 + }, + { + "epoch": 1.4, + "learning_rate": 4.094045995669271e-05, + "loss": 2.7552, + "step": 2790 + }, + { + "epoch": 1.4, + "learning_rate": 4.091006698568419e-05, + "loss": 2.3802, + "step": 2795 + }, + { + "epoch": 1.41, + "learning_rate": 4.087963444222054e-05, + "loss": 2.3967, + "step": 2800 + }, + { + "epoch": 1.41, + "learning_rate": 4.084916240199537e-05, + "loss": 2.7945, + "step": 2805 + }, + { + "epoch": 1.41, + "learning_rate": 4.0818650940800524e-05, + "loss": 2.5304, + "step": 2810 + }, + { + "epoch": 1.41, + "learning_rate": 4.0788100134525925e-05, + "loss": 2.6779, + "step": 2815 + }, + { + "epoch": 1.42, + "learning_rate": 4.075751005915932e-05, + "loss": 2.6202, + "step": 2820 + }, + { + "epoch": 1.42, + "learning_rate": 4.072688079078616e-05, + "loss": 2.5915, + "step": 2825 + }, + { + "epoch": 1.42, + "learning_rate": 4.069621240558935e-05, + "loss": 2.5139, + "step": 2830 + }, + { + "epoch": 1.42, + "learning_rate": 4.066550497984911e-05, + "loss": 2.5506, + "step": 2835 + }, + { + "epoch": 1.43, + "learning_rate": 4.063475858994276e-05, + "loss": 2.7154, + "step": 2840 + 
}, + { + "epoch": 1.43, + "learning_rate": 4.060397331234452e-05, + "loss": 2.6309, + "step": 2845 + }, + { + "epoch": 1.43, + "learning_rate": 4.0573149223625365e-05, + "loss": 2.6922, + "step": 2850 + }, + { + "epoch": 1.43, + "learning_rate": 4.054228640045276e-05, + "loss": 2.703, + "step": 2855 + }, + { + "epoch": 1.44, + "learning_rate": 4.051138491959053e-05, + "loss": 2.3475, + "step": 2860 + }, + { + "epoch": 1.44, + "learning_rate": 4.048044485789869e-05, + "loss": 2.7186, + "step": 2865 + }, + { + "epoch": 1.44, + "learning_rate": 4.044946629233316e-05, + "loss": 2.4667, + "step": 2870 + }, + { + "epoch": 1.44, + "learning_rate": 4.041844929994566e-05, + "loss": 2.562, + "step": 2875 + }, + { + "epoch": 1.45, + "learning_rate": 4.038739395788347e-05, + "loss": 2.4116, + "step": 2880 + }, + { + "epoch": 1.45, + "learning_rate": 4.0356300343389276e-05, + "loss": 2.6133, + "step": 2885 + }, + { + "epoch": 1.45, + "learning_rate": 4.032516853380094e-05, + "loss": 2.6893, + "step": 2890 + }, + { + "epoch": 1.45, + "learning_rate": 4.029399860655132e-05, + "loss": 2.8196, + "step": 2895 + }, + { + "epoch": 1.46, + "learning_rate": 4.026279063916811e-05, + "loss": 2.8933, + "step": 2900 + }, + { + "epoch": 1.46, + "learning_rate": 4.023154470927361e-05, + "loss": 3.0289, + "step": 2905 + }, + { + "epoch": 1.46, + "learning_rate": 4.0200260894584516e-05, + "loss": 2.5586, + "step": 2910 + }, + { + "epoch": 1.46, + "learning_rate": 4.016893927291179e-05, + "loss": 2.785, + "step": 2915 + }, + { + "epoch": 1.47, + "learning_rate": 4.0137579922160395e-05, + "loss": 2.5293, + "step": 2920 + }, + { + "epoch": 1.47, + "learning_rate": 4.010618292032917e-05, + "loss": 2.6696, + "step": 2925 + }, + { + "epoch": 1.47, + "learning_rate": 4.007474834551058e-05, + "loss": 2.7003, + "step": 2930 + }, + { + "epoch": 1.47, + "learning_rate": 4.004327627589056e-05, + "loss": 2.6005, + "step": 2935 + }, + { + "epoch": 1.48, + "learning_rate": 4.001176678974828e-05, + "loss": 
2.7198, + "step": 2940 + }, + { + "epoch": 1.48, + "learning_rate": 3.998021996545599e-05, + "loss": 2.5329, + "step": 2945 + }, + { + "epoch": 1.48, + "learning_rate": 3.994863588147881e-05, + "loss": 2.7051, + "step": 2950 + }, + { + "epoch": 1.48, + "learning_rate": 3.9917014616374535e-05, + "loss": 2.9277, + "step": 2955 + }, + { + "epoch": 1.49, + "learning_rate": 3.988535624879344e-05, + "loss": 2.3729, + "step": 2960 + }, + { + "epoch": 1.49, + "learning_rate": 3.985366085747808e-05, + "loss": 2.5122, + "step": 2965 + }, + { + "epoch": 1.49, + "learning_rate": 3.982192852126309e-05, + "loss": 2.4413, + "step": 2970 + }, + { + "epoch": 1.49, + "learning_rate": 3.979015931907501e-05, + "loss": 2.6104, + "step": 2975 + }, + { + "epoch": 1.5, + "learning_rate": 3.975835332993207e-05, + "loss": 2.8855, + "step": 2980 + }, + { + "epoch": 1.5, + "learning_rate": 3.9726510632944e-05, + "loss": 2.7487, + "step": 2985 + }, + { + "epoch": 1.5, + "learning_rate": 3.969463130731183e-05, + "loss": 2.7085, + "step": 2990 + }, + { + "epoch": 1.5, + "learning_rate": 3.966271543232769e-05, + "loss": 2.6464, + "step": 2995 + }, + { + "epoch": 1.51, + "learning_rate": 3.9630763087374625e-05, + "loss": 2.4105, + "step": 3000 + }, + { + "epoch": 1.51, + "learning_rate": 3.9598774351926394e-05, + "loss": 2.7296, + "step": 3005 + }, + { + "epoch": 1.51, + "learning_rate": 3.956674930554725e-05, + "loss": 2.5319, + "step": 3010 + }, + { + "epoch": 1.51, + "learning_rate": 3.9534688027891785e-05, + "loss": 2.5544, + "step": 3015 + }, + { + "epoch": 1.52, + "learning_rate": 3.9502590598704696e-05, + "loss": 2.7981, + "step": 3020 + }, + { + "epoch": 1.52, + "learning_rate": 3.94704570978206e-05, + "loss": 2.6186, + "step": 3025 + }, + { + "epoch": 1.52, + "learning_rate": 3.943828760516382e-05, + "loss": 2.6443, + "step": 3030 + }, + { + "epoch": 1.52, + "learning_rate": 3.940608220074822e-05, + "loss": 2.4564, + "step": 3035 + }, + { + "epoch": 1.53, + "learning_rate": 
3.9373840964676976e-05, + "loss": 2.8394, + "step": 3040 + }, + { + "epoch": 1.53, + "learning_rate": 3.934156397714238e-05, + "loss": 2.3683, + "step": 3045 + }, + { + "epoch": 1.53, + "learning_rate": 3.930925131842567e-05, + "loss": 2.875, + "step": 3050 + }, + { + "epoch": 1.53, + "learning_rate": 3.9276903068896784e-05, + "loss": 2.6381, + "step": 3055 + }, + { + "epoch": 1.54, + "learning_rate": 3.9244519309014206e-05, + "loss": 2.8077, + "step": 3060 + }, + { + "epoch": 1.54, + "learning_rate": 3.9212100119324706e-05, + "loss": 2.6208, + "step": 3065 + }, + { + "epoch": 1.54, + "learning_rate": 3.917964558046322e-05, + "loss": 2.9, + "step": 3070 + }, + { + "epoch": 1.54, + "learning_rate": 3.9147155773152586e-05, + "loss": 2.5678, + "step": 3075 + }, + { + "epoch": 1.55, + "learning_rate": 3.911463077820336e-05, + "loss": 2.7059, + "step": 3080 + }, + { + "epoch": 1.55, + "learning_rate": 3.90820706765136e-05, + "loss": 2.7433, + "step": 3085 + }, + { + "epoch": 1.55, + "learning_rate": 3.9049475549068757e-05, + "loss": 2.5916, + "step": 3090 + }, + { + "epoch": 1.55, + "learning_rate": 3.901684547694132e-05, + "loss": 2.6978, + "step": 3095 + }, + { + "epoch": 1.56, + "learning_rate": 3.8984180541290724e-05, + "loss": 2.8709, + "step": 3100 + }, + { + "epoch": 1.56, + "learning_rate": 3.895148082336313e-05, + "loss": 2.8652, + "step": 3105 + }, + { + "epoch": 1.56, + "learning_rate": 3.89187464044912e-05, + "loss": 3.0331, + "step": 3110 + }, + { + "epoch": 1.56, + "learning_rate": 3.8885977366093905e-05, + "loss": 2.7334, + "step": 3115 + }, + { + "epoch": 1.57, + "learning_rate": 3.885317378967633e-05, + "loss": 2.6026, + "step": 3120 + }, + { + "epoch": 1.57, + "learning_rate": 3.882033575682945e-05, + "loss": 2.831, + "step": 3125 + }, + { + "epoch": 1.57, + "learning_rate": 3.878746334922996e-05, + "loss": 2.5585, + "step": 3130 + }, + { + "epoch": 1.57, + "learning_rate": 3.875455664864005e-05, + "loss": 2.762, + "step": 3135 + }, + { + "epoch": 
1.58, + "learning_rate": 3.8721615736907205e-05, + "loss": 2.769, + "step": 3140 + }, + { + "epoch": 1.58, + "learning_rate": 3.868864069596399e-05, + "loss": 2.6496, + "step": 3145 + }, + { + "epoch": 1.58, + "learning_rate": 3.8655631607827876e-05, + "loss": 2.486, + "step": 3150 + }, + { + "epoch": 1.58, + "learning_rate": 3.8622588554601e-05, + "loss": 2.8387, + "step": 3155 + }, + { + "epoch": 1.59, + "learning_rate": 3.858951161847001e-05, + "loss": 2.816, + "step": 3160 + }, + { + "epoch": 1.59, + "learning_rate": 3.855640088170579e-05, + "loss": 2.8128, + "step": 3165 + }, + { + "epoch": 1.59, + "learning_rate": 3.8523256426663313e-05, + "loss": 2.5875, + "step": 3170 + }, + { + "epoch": 1.59, + "learning_rate": 3.8490078335781423e-05, + "loss": 2.5853, + "step": 3175 + }, + { + "epoch": 1.6, + "learning_rate": 3.845686669158263e-05, + "loss": 2.7638, + "step": 3180 + }, + { + "epoch": 1.6, + "learning_rate": 3.842362157667287e-05, + "loss": 2.8281, + "step": 3185 + }, + { + "epoch": 1.6, + "learning_rate": 3.839700144139754e-05, + "loss": 2.3574, + "step": 3190 + }, + { + "epoch": 1.6, + "learning_rate": 3.836369628764067e-05, + "loss": 2.5533, + "step": 3195 + }, + { + "epoch": 1.61, + "learning_rate": 3.833035789491177e-05, + "loss": 2.4513, + "step": 3200 + }, + { + "epoch": 1.61, + "learning_rate": 3.8296986346132036e-05, + "loss": 2.567, + "step": 3205 + }, + { + "epoch": 1.61, + "learning_rate": 3.826358172430516e-05, + "loss": 2.6501, + "step": 3210 + }, + { + "epoch": 1.61, + "learning_rate": 3.823014411251708e-05, + "loss": 2.6927, + "step": 3215 + }, + { + "epoch": 1.62, + "learning_rate": 3.819667359393578e-05, + "loss": 2.6094, + "step": 3220 + }, + { + "epoch": 1.62, + "learning_rate": 3.816317025181111e-05, + "loss": 2.6776, + "step": 3225 + }, + { + "epoch": 1.62, + "learning_rate": 3.8129634169474563e-05, + "loss": 2.5833, + "step": 3230 + }, + { + "epoch": 1.62, + "learning_rate": 3.809606543033905e-05, + "loss": 2.6483, + "step": 3235 + 
}, + { + "epoch": 1.63, + "learning_rate": 3.8062464117898724e-05, + "loss": 2.4814, + "step": 3240 + }, + { + "epoch": 1.63, + "learning_rate": 3.802883031572874e-05, + "loss": 2.5217, + "step": 3245 + }, + { + "epoch": 1.63, + "learning_rate": 3.799516410748506e-05, + "loss": 2.5127, + "step": 3250 + }, + { + "epoch": 1.63, + "learning_rate": 3.796146557690428e-05, + "loss": 2.9325, + "step": 3255 + }, + { + "epoch": 1.64, + "learning_rate": 3.792773480780335e-05, + "loss": 2.4567, + "step": 3260 + }, + { + "epoch": 1.64, + "learning_rate": 3.789397188407944e-05, + "loss": 2.6045, + "step": 3265 + }, + { + "epoch": 1.64, + "learning_rate": 3.786017688970967e-05, + "loss": 2.5031, + "step": 3270 + }, + { + "epoch": 1.64, + "learning_rate": 3.782634990875094e-05, + "loss": 2.7692, + "step": 3275 + }, + { + "epoch": 1.65, + "learning_rate": 3.779249102533972e-05, + "loss": 2.6893, + "step": 3280 + }, + { + "epoch": 1.65, + "learning_rate": 3.7758600323691806e-05, + "loss": 2.604, + "step": 3285 + }, + { + "epoch": 1.65, + "learning_rate": 3.7724677888102145e-05, + "loss": 2.6633, + "step": 3290 + }, + { + "epoch": 1.65, + "learning_rate": 3.769072380294463e-05, + "loss": 2.4804, + "step": 3295 + }, + { + "epoch": 1.66, + "learning_rate": 3.765673815267184e-05, + "loss": 2.5951, + "step": 3300 + }, + { + "epoch": 1.66, + "learning_rate": 3.76227210218149e-05, + "loss": 2.598, + "step": 3305 + }, + { + "epoch": 1.66, + "learning_rate": 3.758867249498321e-05, + "loss": 2.8652, + "step": 3310 + }, + { + "epoch": 1.66, + "learning_rate": 3.7554592656864285e-05, + "loss": 2.5312, + "step": 3315 + }, + { + "epoch": 1.67, + "learning_rate": 3.752048159222349e-05, + "loss": 2.7432, + "step": 3320 + }, + { + "epoch": 1.67, + "learning_rate": 3.748633938590388e-05, + "loss": 2.64, + "step": 3325 + }, + { + "epoch": 1.67, + "learning_rate": 3.745216612282596e-05, + "loss": 2.751, + "step": 3330 + }, + { + "epoch": 1.67, + "learning_rate": 3.741796188798747e-05, + "loss": 
2.7636, + "step": 3335 + }, + { + "epoch": 1.68, + "learning_rate": 3.738372676646321e-05, + "loss": 2.8103, + "step": 3340 + }, + { + "epoch": 1.68, + "learning_rate": 3.7349460843404796e-05, + "loss": 2.6672, + "step": 3345 + }, + { + "epoch": 1.68, + "learning_rate": 3.731516420404043e-05, + "loss": 2.5272, + "step": 3350 + }, + { + "epoch": 1.68, + "learning_rate": 3.728083693367474e-05, + "loss": 2.6674, + "step": 3355 + }, + { + "epoch": 1.69, + "learning_rate": 3.724647911768854e-05, + "loss": 2.708, + "step": 3360 + }, + { + "epoch": 1.69, + "learning_rate": 3.72120908415386e-05, + "loss": 2.3902, + "step": 3365 + }, + { + "epoch": 1.69, + "learning_rate": 3.7177672190757476e-05, + "loss": 2.8144, + "step": 3370 + }, + { + "epoch": 1.69, + "learning_rate": 3.714322325095325e-05, + "loss": 2.8525, + "step": 3375 + }, + { + "epoch": 1.7, + "learning_rate": 3.7108744107809364e-05, + "loss": 2.4671, + "step": 3380 + }, + { + "epoch": 1.7, + "learning_rate": 3.7074234847084363e-05, + "loss": 2.3723, + "step": 3385 + }, + { + "epoch": 1.7, + "learning_rate": 3.703969555461173e-05, + "loss": 2.8825, + "step": 3390 + }, + { + "epoch": 1.7, + "learning_rate": 3.700512631629961e-05, + "loss": 2.768, + "step": 3395 + }, + { + "epoch": 1.71, + "learning_rate": 3.6970527218130644e-05, + "loss": 2.4601, + "step": 3400 + }, + { + "epoch": 1.71, + "learning_rate": 3.693589834616176e-05, + "loss": 2.7482, + "step": 3405 + }, + { + "epoch": 1.71, + "learning_rate": 3.6901239786523914e-05, + "loss": 2.4536, + "step": 3410 + }, + { + "epoch": 1.71, + "learning_rate": 3.686655162542192e-05, + "loss": 2.571, + "step": 3415 + }, + { + "epoch": 1.72, + "learning_rate": 3.683183394913422e-05, + "loss": 2.5796, + "step": 3420 + }, + { + "epoch": 1.72, + "learning_rate": 3.6797086844012654e-05, + "loss": 2.7312, + "step": 3425 + }, + { + "epoch": 1.72, + "learning_rate": 3.676231039648227e-05, + "loss": 2.5584, + "step": 3430 + }, + { + "epoch": 1.72, + "learning_rate": 
3.67275046930411e-05, + "loss": 2.6298, + "step": 3435 + }, + { + "epoch": 1.73, + "learning_rate": 3.669266982025993e-05, + "loss": 2.628, + "step": 3440 + }, + { + "epoch": 1.73, + "learning_rate": 3.6657805864782116e-05, + "loss": 2.5943, + "step": 3445 + }, + { + "epoch": 1.73, + "learning_rate": 3.662291291332333e-05, + "loss": 2.6121, + "step": 3450 + }, + { + "epoch": 1.73, + "learning_rate": 3.658799105267138e-05, + "loss": 2.6501, + "step": 3455 + }, + { + "epoch": 1.74, + "learning_rate": 3.655304036968597e-05, + "loss": 2.5708, + "step": 3460 + }, + { + "epoch": 1.74, + "learning_rate": 3.651806095129849e-05, + "loss": 2.8039, + "step": 3465 + }, + { + "epoch": 1.74, + "learning_rate": 3.648305288451183e-05, + "loss": 2.8159, + "step": 3470 + }, + { + "epoch": 1.74, + "learning_rate": 3.64480162564001e-05, + "loss": 2.6506, + "step": 3475 + }, + { + "epoch": 1.75, + "learning_rate": 3.641295115410847e-05, + "loss": 2.8982, + "step": 3480 + }, + { + "epoch": 1.75, + "learning_rate": 3.637785766485291e-05, + "loss": 2.8018, + "step": 3485 + }, + { + "epoch": 1.75, + "learning_rate": 3.634273587592003e-05, + "loss": 2.5278, + "step": 3490 + }, + { + "epoch": 1.75, + "learning_rate": 3.630758587466681e-05, + "loss": 2.8937, + "step": 3495 + }, + { + "epoch": 1.76, + "learning_rate": 3.6272407748520394e-05, + "loss": 2.1922, + "step": 3500 + }, + { + "epoch": 1.76, + "learning_rate": 3.623720158497789e-05, + "loss": 2.4288, + "step": 3505 + }, + { + "epoch": 1.76, + "learning_rate": 3.620196747160613e-05, + "loss": 2.9423, + "step": 3510 + }, + { + "epoch": 1.76, + "learning_rate": 3.61667054960415e-05, + "loss": 2.5559, + "step": 3515 + }, + { + "epoch": 1.77, + "learning_rate": 3.613141574598965e-05, + "loss": 2.7228, + "step": 3520 + }, + { + "epoch": 1.77, + "learning_rate": 3.6096098309225325e-05, + "loss": 2.6933, + "step": 3525 + }, + { + "epoch": 1.77, + "learning_rate": 3.606075327359212e-05, + "loss": 2.5865, + "step": 3530 + }, + { + "epoch": 1.77, 
+ "learning_rate": 3.602538072700229e-05, + "loss": 2.5846, + "step": 3535 + }, + { + "epoch": 1.78, + "learning_rate": 3.598998075743653e-05, + "loss": 2.8847, + "step": 3540 + }, + { + "epoch": 1.78, + "learning_rate": 3.595455345294372e-05, + "loss": 2.5386, + "step": 3545 + }, + { + "epoch": 1.78, + "learning_rate": 3.5919098901640735e-05, + "loss": 2.5932, + "step": 3550 + }, + { + "epoch": 1.78, + "learning_rate": 3.588361719171223e-05, + "loss": 2.4481, + "step": 3555 + }, + { + "epoch": 1.79, + "learning_rate": 3.584810841141039e-05, + "loss": 2.5773, + "step": 3560 + }, + { + "epoch": 1.79, + "learning_rate": 3.581257264905476e-05, + "loss": 2.6305, + "step": 3565 + }, + { + "epoch": 1.79, + "learning_rate": 3.5777009993031955e-05, + "loss": 2.7, + "step": 3570 + }, + { + "epoch": 1.79, + "learning_rate": 3.574142053179553e-05, + "loss": 2.4408, + "step": 3575 + }, + { + "epoch": 1.8, + "learning_rate": 3.5705804353865677e-05, + "loss": 2.7292, + "step": 3580 + }, + { + "epoch": 1.8, + "learning_rate": 3.567016154782906e-05, + "loss": 2.4371, + "step": 3585 + }, + { + "epoch": 1.8, + "learning_rate": 3.563449220233855e-05, + "loss": 2.7977, + "step": 3590 + }, + { + "epoch": 1.8, + "learning_rate": 3.559879640611305e-05, + "loss": 2.7357, + "step": 3595 + }, + { + "epoch": 1.81, + "learning_rate": 3.5563074247937244e-05, + "loss": 2.6255, + "step": 3600 + }, + { + "epoch": 1.81, + "learning_rate": 3.552732581666139e-05, + "loss": 2.8134, + "step": 3605 + }, + { + "epoch": 1.81, + "learning_rate": 3.549155120120109e-05, + "loss": 2.702, + "step": 3610 + }, + { + "epoch": 1.81, + "learning_rate": 3.545575049053707e-05, + "loss": 2.4854, + "step": 3615 + }, + { + "epoch": 1.82, + "learning_rate": 3.541992377371497e-05, + "loss": 2.7596, + "step": 3620 + }, + { + "epoch": 1.82, + "learning_rate": 3.53840711398451e-05, + "loss": 2.6891, + "step": 3625 + }, + { + "epoch": 1.82, + "learning_rate": 3.5348192678102254e-05, + "loss": 2.5619, + "step": 3630 + }, + { 
+ "epoch": 1.82, + "learning_rate": 3.531228847772544e-05, + "loss": 2.5719, + "step": 3635 + }, + { + "epoch": 1.83, + "learning_rate": 3.52763586280177e-05, + "loss": 2.6297, + "step": 3640 + }, + { + "epoch": 1.83, + "learning_rate": 3.524040321834589e-05, + "loss": 2.5114, + "step": 3645 + }, + { + "epoch": 1.83, + "learning_rate": 3.52044223381404e-05, + "loss": 2.8536, + "step": 3650 + }, + { + "epoch": 1.83, + "learning_rate": 3.516841607689501e-05, + "loss": 2.5271, + "step": 3655 + }, + { + "epoch": 1.84, + "learning_rate": 3.51323845241666e-05, + "loss": 2.5654, + "step": 3660 + }, + { + "epoch": 1.84, + "learning_rate": 3.509632776957497e-05, + "loss": 2.6269, + "step": 3665 + }, + { + "epoch": 1.84, + "learning_rate": 3.50602459028026e-05, + "loss": 2.4401, + "step": 3670 + }, + { + "epoch": 1.84, + "learning_rate": 3.502413901359445e-05, + "loss": 2.4368, + "step": 3675 + }, + { + "epoch": 1.85, + "learning_rate": 3.498800719175768e-05, + "loss": 2.6087, + "step": 3680 + }, + { + "epoch": 1.85, + "learning_rate": 3.4951850527161495e-05, + "loss": 2.5927, + "step": 3685 + }, + { + "epoch": 1.85, + "learning_rate": 3.4915669109736876e-05, + "loss": 2.8352, + "step": 3690 + }, + { + "epoch": 1.85, + "learning_rate": 3.487946302947637e-05, + "loss": 2.8485, + "step": 3695 + }, + { + "epoch": 1.86, + "learning_rate": 3.484323237643389e-05, + "loss": 2.6611, + "step": 3700 + }, + { + "epoch": 1.86, + "learning_rate": 3.4806977240724423e-05, + "loss": 2.5251, + "step": 3705 + }, + { + "epoch": 1.86, + "learning_rate": 3.47706977125239e-05, + "loss": 2.6043, + "step": 3710 + }, + { + "epoch": 1.86, + "learning_rate": 3.473439388206887e-05, + "loss": 2.5911, + "step": 3715 + }, + { + "epoch": 1.87, + "learning_rate": 3.469806583965639e-05, + "loss": 2.7092, + "step": 3720 + }, + { + "epoch": 1.87, + "learning_rate": 3.466171367564368e-05, + "loss": 2.4572, + "step": 3725 + }, + { + "epoch": 1.87, + "learning_rate": 3.462533748044801e-05, + "loss": 2.464, + 
"step": 3730 + }, + { + "epoch": 1.87, + "learning_rate": 3.458893734454636e-05, + "loss": 2.8654, + "step": 3735 + }, + { + "epoch": 1.88, + "learning_rate": 3.455251335847531e-05, + "loss": 2.752, + "step": 3740 + }, + { + "epoch": 1.88, + "learning_rate": 3.451606561283074e-05, + "loss": 2.59, + "step": 3745 + }, + { + "epoch": 1.88, + "learning_rate": 3.447959419826763e-05, + "loss": 2.4732, + "step": 3750 + }, + { + "epoch": 1.88, + "learning_rate": 3.444309920549983e-05, + "loss": 2.7611, + "step": 3755 + }, + { + "epoch": 1.89, + "learning_rate": 3.440658072529983e-05, + "loss": 2.5923, + "step": 3760 + }, + { + "epoch": 1.89, + "learning_rate": 3.437003884849854e-05, + "loss": 2.5405, + "step": 3765 + }, + { + "epoch": 1.89, + "learning_rate": 3.433347366598506e-05, + "loss": 2.6368, + "step": 3770 + }, + { + "epoch": 1.89, + "learning_rate": 3.429688526870649e-05, + "loss": 2.5877, + "step": 3775 + }, + { + "epoch": 1.9, + "learning_rate": 3.4260273747667635e-05, + "loss": 2.5978, + "step": 3780 + }, + { + "epoch": 1.9, + "learning_rate": 3.422363919393082e-05, + "loss": 2.7375, + "step": 3785 + }, + { + "epoch": 1.9, + "learning_rate": 3.418698169861567e-05, + "loss": 2.4161, + "step": 3790 + }, + { + "epoch": 1.9, + "learning_rate": 3.415030135289884e-05, + "loss": 2.3959, + "step": 3795 + }, + { + "epoch": 1.91, + "learning_rate": 3.4113598248013875e-05, + "loss": 2.9642, + "step": 3800 + }, + { + "epoch": 1.91, + "learning_rate": 3.40768724752509e-05, + "loss": 2.7313, + "step": 3805 + }, + { + "epoch": 1.91, + "learning_rate": 3.404012412595639e-05, + "loss": 2.6311, + "step": 3810 + }, + { + "epoch": 1.91, + "learning_rate": 3.400335329153304e-05, + "loss": 2.6268, + "step": 3815 + }, + { + "epoch": 1.92, + "learning_rate": 3.396656006343939e-05, + "loss": 2.5252, + "step": 3820 + }, + { + "epoch": 1.92, + "learning_rate": 3.392974453318975e-05, + "loss": 2.5863, + "step": 3825 + }, + { + "epoch": 1.92, + "learning_rate": 3.3892906792353886e-05, + 
"loss": 2.4848, + "step": 3830 + }, + { + "epoch": 1.92, + "learning_rate": 3.3856046932556775e-05, + "loss": 2.5517, + "step": 3835 + }, + { + "epoch": 1.93, + "learning_rate": 3.3819165045478426e-05, + "loss": 2.6679, + "step": 3840 + }, + { + "epoch": 1.93, + "learning_rate": 3.3782261222853654e-05, + "loss": 2.4474, + "step": 3845 + }, + { + "epoch": 1.93, + "learning_rate": 3.37453355564718e-05, + "loss": 2.5378, + "step": 3850 + }, + { + "epoch": 1.93, + "learning_rate": 3.3708388138176584e-05, + "loss": 2.8168, + "step": 3855 + }, + { + "epoch": 1.94, + "learning_rate": 3.367141905986578e-05, + "loss": 2.6556, + "step": 3860 + }, + { + "epoch": 1.94, + "learning_rate": 3.3634428413491054e-05, + "loss": 2.5122, + "step": 3865 + }, + { + "epoch": 1.94, + "learning_rate": 3.359741629105773e-05, + "loss": 2.7044, + "step": 3870 + }, + { + "epoch": 1.94, + "learning_rate": 3.3560382784624524e-05, + "loss": 2.6193, + "step": 3875 + }, + { + "epoch": 1.95, + "learning_rate": 3.352332798630336e-05, + "loss": 3.0051, + "step": 3880 + }, + { + "epoch": 1.95, + "learning_rate": 3.3486251988259134e-05, + "loss": 2.5351, + "step": 3885 + }, + { + "epoch": 1.95, + "learning_rate": 3.344915488270942e-05, + "loss": 2.7273, + "step": 3890 + }, + { + "epoch": 1.95, + "learning_rate": 3.341203676192433e-05, + "loss": 2.3833, + "step": 3895 + }, + { + "epoch": 1.96, + "learning_rate": 3.3374897718226236e-05, + "loss": 2.5073, + "step": 3900 + }, + { + "epoch": 1.96, + "learning_rate": 3.333773784398957e-05, + "loss": 2.7216, + "step": 3905 + }, + { + "epoch": 1.96, + "learning_rate": 3.330055723164055e-05, + "loss": 2.6666, + "step": 3910 + }, + { + "epoch": 1.96, + "learning_rate": 3.326335597365698e-05, + "loss": 2.4959, + "step": 3915 + }, + { + "epoch": 1.97, + "learning_rate": 3.322613416256802e-05, + "loss": 2.8799, + "step": 3920 + }, + { + "epoch": 1.97, + "learning_rate": 3.3188891890953956e-05, + "loss": 2.6041, + "step": 3925 + }, + { + "epoch": 1.97, + 
"learning_rate": 3.315162925144595e-05, + "loss": 2.7423, + "step": 3930 + }, + { + "epoch": 1.97, + "learning_rate": 3.3114346336725834e-05, + "loss": 2.5765, + "step": 3935 + }, + { + "epoch": 1.98, + "learning_rate": 3.3077043239525874e-05, + "loss": 2.152, + "step": 3940 + }, + { + "epoch": 1.98, + "learning_rate": 3.303972005262852e-05, + "loss": 2.6474, + "step": 3945 + }, + { + "epoch": 1.98, + "learning_rate": 3.3002376868866206e-05, + "loss": 2.473, + "step": 3950 + }, + { + "epoch": 1.98, + "learning_rate": 3.296501378112109e-05, + "loss": 2.6779, + "step": 3955 + }, + { + "epoch": 1.99, + "learning_rate": 3.292763088232485e-05, + "loss": 2.7439, + "step": 3960 + }, + { + "epoch": 1.99, + "learning_rate": 3.289022826545844e-05, + "loss": 2.8874, + "step": 3965 + }, + { + "epoch": 1.99, + "learning_rate": 3.2852806023551834e-05, + "loss": 2.7191, + "step": 3970 + }, + { + "epoch": 1.99, + "learning_rate": 3.281536424968383e-05, + "loss": 2.563, + "step": 3975 + }, + { + "epoch": 2.0, + "learning_rate": 3.2777903036981836e-05, + "loss": 2.7884, + "step": 3980 + }, + { + "epoch": 2.0, + "learning_rate": 3.274042247862158e-05, + "loss": 2.5221, + "step": 3985 + }, + { + "epoch": 2.0, + "learning_rate": 3.270292266782689e-05, + "loss": 2.462, + "step": 3990 + }, + { + "epoch": 2.0, + "learning_rate": 3.266540369786953e-05, + "loss": 2.5059, + "step": 3995 + }, + { + "epoch": 2.01, + "learning_rate": 3.262786566206888e-05, + "loss": 2.688, + "step": 4000 + }, + { + "epoch": 2.01, + "learning_rate": 3.259030865379174e-05, + "loss": 2.7194, + "step": 4005 + }, + { + "epoch": 2.01, + "learning_rate": 3.255273276645214e-05, + "loss": 2.4145, + "step": 4010 + }, + { + "epoch": 2.01, + "learning_rate": 3.251513809351101e-05, + "loss": 2.5469, + "step": 4015 + }, + { + "epoch": 2.02, + "learning_rate": 3.247752472847605e-05, + "loss": 2.7916, + "step": 4020 + }, + { + "epoch": 2.02, + "learning_rate": 3.243989276490143e-05, + "loss": 2.5973, + "step": 4025 + }, + { + 
"epoch": 2.02, + "learning_rate": 3.240224229638758e-05, + "loss": 2.6732, + "step": 4030 + }, + { + "epoch": 2.02, + "learning_rate": 3.236457341658097e-05, + "loss": 2.2159, + "step": 4035 + }, + { + "epoch": 2.03, + "learning_rate": 3.232688621917386e-05, + "loss": 2.6783, + "step": 4040 + }, + { + "epoch": 2.03, + "learning_rate": 3.2289180797904055e-05, + "loss": 2.8193, + "step": 4045 + }, + { + "epoch": 2.03, + "learning_rate": 3.22514572465547e-05, + "loss": 2.3885, + "step": 4050 + }, + { + "epoch": 2.03, + "learning_rate": 3.221371565895404e-05, + "loss": 2.6288, + "step": 4055 + }, + { + "epoch": 2.04, + "learning_rate": 3.217595612897516e-05, + "loss": 2.5372, + "step": 4060 + }, + { + "epoch": 2.04, + "learning_rate": 3.21381787505358e-05, + "loss": 2.7257, + "step": 4065 + }, + { + "epoch": 2.04, + "learning_rate": 3.210038361759807e-05, + "loss": 2.6329, + "step": 4070 + }, + { + "epoch": 2.04, + "learning_rate": 3.206257082416825e-05, + "loss": 2.6518, + "step": 4075 + }, + { + "epoch": 2.05, + "learning_rate": 3.2024740464296544e-05, + "loss": 2.4218, + "step": 4080 + }, + { + "epoch": 2.05, + "learning_rate": 3.198689263207686e-05, + "loss": 2.7266, + "step": 4085 + }, + { + "epoch": 2.05, + "learning_rate": 3.1949027421646546e-05, + "loss": 2.4466, + "step": 4090 + }, + { + "epoch": 2.05, + "learning_rate": 3.1911144927186185e-05, + "loss": 2.6014, + "step": 4095 + }, + { + "epoch": 2.06, + "learning_rate": 3.1873245242919354e-05, + "loss": 2.7072, + "step": 4100 + }, + { + "epoch": 2.06, + "learning_rate": 3.183532846311236e-05, + "loss": 2.5211, + "step": 4105 + }, + { + "epoch": 2.06, + "learning_rate": 3.179739468207406e-05, + "loss": 2.5787, + "step": 4110 + }, + { + "epoch": 2.06, + "learning_rate": 3.175944399415559e-05, + "loss": 2.5141, + "step": 4115 + }, + { + "epoch": 2.07, + "learning_rate": 3.1721476493750134e-05, + "loss": 2.6807, + "step": 4120 + }, + { + "epoch": 2.07, + "learning_rate": 3.1683492275292695e-05, + "loss": 2.5611, 
+ "step": 4125 + }, + { + "epoch": 2.07, + "learning_rate": 3.164549143325985e-05, + "loss": 2.5864, + "step": 4130 + }, + { + "epoch": 2.08, + "learning_rate": 3.160747406216953e-05, + "loss": 2.7378, + "step": 4135 + }, + { + "epoch": 2.08, + "learning_rate": 3.156944025658079e-05, + "loss": 2.7334, + "step": 4140 + }, + { + "epoch": 2.08, + "learning_rate": 3.153139011109354e-05, + "loss": 2.4465, + "step": 4145 + }, + { + "epoch": 2.08, + "learning_rate": 3.149332372034834e-05, + "loss": 2.5482, + "step": 4150 + }, + { + "epoch": 2.09, + "learning_rate": 3.145524117902617e-05, + "loss": 2.7633, + "step": 4155 + }, + { + "epoch": 2.09, + "learning_rate": 3.141714258184816e-05, + "loss": 2.6796, + "step": 4160 + }, + { + "epoch": 2.09, + "learning_rate": 3.137902802357538e-05, + "loss": 2.6622, + "step": 4165 + }, + { + "epoch": 2.09, + "learning_rate": 3.134089759900861e-05, + "loss": 2.9277, + "step": 4170 + }, + { + "epoch": 2.1, + "learning_rate": 3.130275140298808e-05, + "loss": 2.351, + "step": 4175 + }, + { + "epoch": 2.1, + "learning_rate": 3.1264589530393263e-05, + "loss": 2.654, + "step": 4180 + }, + { + "epoch": 2.1, + "learning_rate": 3.12264120761426e-05, + "loss": 2.6229, + "step": 4185 + }, + { + "epoch": 2.1, + "learning_rate": 3.118821913519333e-05, + "loss": 2.6415, + "step": 4190 + }, + { + "epoch": 2.11, + "learning_rate": 3.115001080254115e-05, + "loss": 2.6029, + "step": 4195 + }, + { + "epoch": 2.11, + "learning_rate": 3.1111787173220095e-05, + "loss": 2.6775, + "step": 4200 + }, + { + "epoch": 2.11, + "learning_rate": 3.107354834230223e-05, + "loss": 2.7393, + "step": 4205 + }, + { + "epoch": 2.11, + "learning_rate": 3.1035294404897396e-05, + "loss": 2.5886, + "step": 4210 + }, + { + "epoch": 2.12, + "learning_rate": 3.099702545615307e-05, + "loss": 2.3812, + "step": 4215 + }, + { + "epoch": 2.12, + "learning_rate": 3.0958741591254026e-05, + "loss": 3.1271, + "step": 4220 + }, + { + "epoch": 2.12, + "learning_rate": 3.0920442905422145e-05, 
+ "loss": 2.7758, + "step": 4225 + }, + { + "epoch": 2.12, + "learning_rate": 3.0882129493916167e-05, + "loss": 2.7682, + "step": 4230 + }, + { + "epoch": 2.13, + "learning_rate": 3.0843801452031466e-05, + "loss": 2.2826, + "step": 4235 + }, + { + "epoch": 2.13, + "learning_rate": 3.0805458875099804e-05, + "loss": 2.5934, + "step": 4240 + }, + { + "epoch": 2.13, + "learning_rate": 3.0767101858489103e-05, + "loss": 2.7693, + "step": 4245 + }, + { + "epoch": 2.13, + "learning_rate": 3.072873049760319e-05, + "loss": 2.6955, + "step": 4250 + }, + { + "epoch": 2.14, + "learning_rate": 3.0690344887881565e-05, + "loss": 2.7344, + "step": 4255 + }, + { + "epoch": 2.14, + "learning_rate": 3.0651945124799185e-05, + "loss": 2.6215, + "step": 4260 + }, + { + "epoch": 2.14, + "learning_rate": 3.061353130386619e-05, + "loss": 2.5794, + "step": 4265 + }, + { + "epoch": 2.14, + "learning_rate": 3.05751035206277e-05, + "loss": 2.671, + "step": 4270 + }, + { + "epoch": 2.15, + "learning_rate": 3.0536661870663576e-05, + "loss": 2.7265, + "step": 4275 + }, + { + "epoch": 2.15, + "learning_rate": 3.0498206449588136e-05, + "loss": 2.9018, + "step": 4280 + }, + { + "epoch": 2.15, + "learning_rate": 3.045973735304996e-05, + "loss": 2.4823, + "step": 4285 + }, + { + "epoch": 2.15, + "learning_rate": 3.0421254676731664e-05, + "loss": 2.5454, + "step": 4290 + }, + { + "epoch": 2.16, + "learning_rate": 3.0382758516349623e-05, + "loss": 2.7888, + "step": 4295 + }, + { + "epoch": 2.16, + "learning_rate": 3.0344248967653754e-05, + "loss": 2.596, + "step": 4300 + }, + { + "epoch": 2.16, + "learning_rate": 3.030572612642727e-05, + "loss": 2.7905, + "step": 4305 + }, + { + "epoch": 2.16, + "learning_rate": 3.0267190088486452e-05, + "loss": 2.5676, + "step": 4310 + }, + { + "epoch": 2.17, + "learning_rate": 3.0228640949680388e-05, + "loss": 2.5836, + "step": 4315 + }, + { + "epoch": 2.17, + "learning_rate": 3.0190078805890786e-05, + "loss": 2.7138, + "step": 4320 + }, + { + "epoch": 2.17, + 
"learning_rate": 3.015150375303168e-05, + "loss": 2.4747, + "step": 4325 + }, + { + "epoch": 2.17, + "learning_rate": 3.011291588704919e-05, + "loss": 2.5188, + "step": 4330 + }, + { + "epoch": 2.18, + "learning_rate": 3.0074315303921353e-05, + "loss": 2.2952, + "step": 4335 + }, + { + "epoch": 2.18, + "learning_rate": 3.0035702099657787e-05, + "loss": 2.4875, + "step": 4340 + }, + { + "epoch": 2.18, + "learning_rate": 2.9997076370299544e-05, + "loss": 2.5085, + "step": 4345 + }, + { + "epoch": 2.18, + "learning_rate": 2.9958438211918805e-05, + "loss": 2.7452, + "step": 4350 + }, + { + "epoch": 2.19, + "learning_rate": 2.9919787720618676e-05, + "loss": 2.334, + "step": 4355 + }, + { + "epoch": 2.19, + "learning_rate": 2.9881124992532933e-05, + "loss": 2.6904, + "step": 4360 + }, + { + "epoch": 2.19, + "learning_rate": 2.984245012382579e-05, + "loss": 2.5706, + "step": 4365 + }, + { + "epoch": 2.19, + "learning_rate": 2.980376321069165e-05, + "loss": 2.6657, + "step": 4370 + }, + { + "epoch": 2.2, + "learning_rate": 2.976506434935489e-05, + "loss": 2.4117, + "step": 4375 + }, + { + "epoch": 2.2, + "learning_rate": 2.9726353636069582e-05, + "loss": 2.4446, + "step": 4380 + }, + { + "epoch": 2.2, + "learning_rate": 2.968763116711931e-05, + "loss": 2.5681, + "step": 4385 + }, + { + "epoch": 2.2, + "learning_rate": 2.964889703881686e-05, + "loss": 2.7244, + "step": 4390 + }, + { + "epoch": 2.21, + "learning_rate": 2.9610151347504044e-05, + "loss": 2.577, + "step": 4395 + }, + { + "epoch": 2.21, + "learning_rate": 2.957139418955143e-05, + "loss": 2.6176, + "step": 4400 + }, + { + "epoch": 2.21, + "learning_rate": 2.9532625661358105e-05, + "loss": 2.5708, + "step": 4405 + }, + { + "epoch": 2.21, + "learning_rate": 2.949384585935143e-05, + "loss": 2.4873, + "step": 4410 + }, + { + "epoch": 2.22, + "learning_rate": 2.9455054879986797e-05, + "loss": 2.5953, + "step": 4415 + }, + { + "epoch": 2.22, + "learning_rate": 2.941625281974743e-05, + "loss": 2.5578, + "step": 4420 + 
}, + { + "epoch": 2.22, + "learning_rate": 2.93774397751441e-05, + "loss": 2.5782, + "step": 4425 + }, + { + "epoch": 2.22, + "learning_rate": 2.9338615842714883e-05, + "loss": 2.8227, + "step": 4430 + }, + { + "epoch": 2.23, + "learning_rate": 2.9299781119024956e-05, + "loss": 2.4317, + "step": 4435 + }, + { + "epoch": 2.23, + "learning_rate": 2.926093570066633e-05, + "loss": 2.7054, + "step": 4440 + }, + { + "epoch": 2.23, + "learning_rate": 2.9222079684257608e-05, + "loss": 2.7207, + "step": 4445 + }, + { + "epoch": 2.23, + "learning_rate": 2.9183213166443778e-05, + "loss": 2.6732, + "step": 4450 + }, + { + "epoch": 2.24, + "learning_rate": 2.9144336243895927e-05, + "loss": 2.4768, + "step": 4455 + }, + { + "epoch": 2.24, + "learning_rate": 2.910544901331101e-05, + "loss": 2.5146, + "step": 4460 + }, + { + "epoch": 2.24, + "learning_rate": 2.9066551571411645e-05, + "loss": 2.4941, + "step": 4465 + }, + { + "epoch": 2.24, + "learning_rate": 2.902764401494584e-05, + "loss": 2.1425, + "step": 4470 + }, + { + "epoch": 2.25, + "learning_rate": 2.898872644068676e-05, + "loss": 2.6335, + "step": 4475 + }, + { + "epoch": 2.25, + "learning_rate": 2.8949798945432483e-05, + "loss": 2.4093, + "step": 4480 + }, + { + "epoch": 2.25, + "learning_rate": 2.8910861626005776e-05, + "loss": 2.4006, + "step": 4485 + }, + { + "epoch": 2.25, + "learning_rate": 2.8871914579253824e-05, + "loss": 2.4762, + "step": 4490 + }, + { + "epoch": 2.26, + "learning_rate": 2.8832957902048014e-05, + "loss": 2.6309, + "step": 4495 + }, + { + "epoch": 2.26, + "learning_rate": 2.8793991691283684e-05, + "loss": 2.7734, + "step": 4500 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 2.3803234255827763e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4500/training_args.bin b/checkpoint-4500/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-4500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475 diff --git a/checkpoint-500/README.md b/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-500/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-500/adapter_config.json b/checkpoint-500/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-500/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-500/adapter_model.safetensors b/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..526bea6e734e21d0eb6b6f2399e385adf5a9867b --- /dev/null +++ b/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:facb1c1c9a0c3a8fbded436b8f1e56574477b4fe04c4734871e4fd5127ccf2bd +size 7807744 diff --git a/checkpoint-500/optimizer.pt b/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e06f1e83504e7c8000f8872707496f8b72c5b826 --- /dev/null +++ b/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb342d4e4f50a9e88b5b25d1b6f9ac3a84a2c5635f77dc3fef8bbd4321c5b367 +size 15644485 diff --git a/checkpoint-500/rng_state.pth b/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d09833d0e017893c46c4777d5806a19adb5d3dac --- /dev/null +++ b/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ca89b98c65775118f8563e7a3cf99577e01333bb35e016cc7b027cb14ba6512 +size 14575 diff --git a/checkpoint-500/scheduler.pt 
b/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3bd24beefd9b268ff91b70b84a80e9fd92fc357d --- /dev/null +++ b/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:409fe0b16e73d74c3f54c738d3690ad059453ac0cbbcf8448fb4dc600b8f25b5 +size 627 diff --git a/checkpoint-500/special_tokens_map.json b/checkpoint-500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-500/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ] +} diff --git a/checkpoint-500/tokenization_chatglm.py b/checkpoint-500/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-500/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + role_special_tokens = 
["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) + return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab 
+ + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + 
json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. 
+ + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. 
+ if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-500/tokenizer.model b/checkpoint-500/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-500/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-500/tokenizer_config.json b/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ b/checkpoint-500/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + 
"pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..54e03965183d113c4611778508532429b9c5db30 --- /dev/null +++ b/checkpoint-500/trainer_state.json @@ -0,0 +1,621 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.25090954710826746, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 65 + }, + { + "epoch": 0.04, + 
"learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 165 + }, + { + "epoch": 0.09, + 
"learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + "step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 265 + }, + { + "epoch": 0.14, + 
"learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + "step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 365 + }, + { + "epoch": 0.19, + 
"learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + "step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 465 + }, + { + "epoch": 0.24, + 
"learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 2.6430243976544256e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-500/training_args.bin b/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475 diff --git a/checkpoint-5000/README.md b/checkpoint-5000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-5000/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] 
+- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-5000/adapter_config.json b/checkpoint-5000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-5000/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-5000/adapter_model.safetensors b/checkpoint-5000/adapter_model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..82df0725a785b5aca2ab24f14bc802d511ef67d1 --- /dev/null +++ b/checkpoint-5000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1662e5d71681afad84e785cc14af097bc9ae2c462792f90c06b54300bfa5c51b +size 7807744 diff --git a/checkpoint-5000/optimizer.pt b/checkpoint-5000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c519f09038576f85926753eb10207d39b0ec2674 --- /dev/null +++ b/checkpoint-5000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c609f71f0174ec21579915197f24b51864247a2c1ab0fcb7c30504bfc091dc60 +size 15644485 diff --git a/checkpoint-5000/rng_state.pth b/checkpoint-5000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..c9e1b423be47a37d2c9b9f3823a473b875209b84 --- /dev/null +++ b/checkpoint-5000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb41c358388171d319da16152836a879ac6ee17c5a016644234cefa47506514a +size 14575 diff --git a/checkpoint-5000/scheduler.pt b/checkpoint-5000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..644b3422f3ba480ee7fe518b87f13856fcb80aca --- /dev/null +++ b/checkpoint-5000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27a3d21fe1e6e9b649efa67c7f24a1e5f96448bb3bea2d1ce32c7853768191c4 +size 627 diff --git a/checkpoint-5000/special_tokens_map.json b/checkpoint-5000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-5000/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false 
+ } + ] +} diff --git a/checkpoint-5000/tokenization_chatglm.py b/checkpoint-5000/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-5000/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) 
+ return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + 
def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. 
+ + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). 
+ return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-5000/tokenizer.model b/checkpoint-5000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-5000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-5000/tokenizer_config.json b/checkpoint-5000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ 
b/checkpoint-5000/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-5000/trainer_state.json b/checkpoint-5000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d1f0e8a550d9cee4d82dd52b14a1d6f26e3bb92c --- /dev/null +++ b/checkpoint-5000/trainer_state.json @@ -0,0 +1,6021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.5090954710826745, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + 
"step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 
130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + 
"step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + 
"step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + 
"step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 
2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + "step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 
2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + "step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + 
"loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + "step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + 
"loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, + "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 
4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 4.8759297640025235e-05, + "loss": 2.9469, + "step": 1005 + }, + { + "epoch": 0.51, + "learning_rate": 4.874700152955661e-05, + "loss": 2.9745, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 4.8734646354147936e-05, + "loss": 2.9341, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 4.8722232144529754e-05, + "loss": 3.0511, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 4.870975893157941e-05, + "loss": 2.5396, + "step": 1025 + }, + { + "epoch": 0.52, + 
"learning_rate": 4.8697226746321004e-05, + "loss": 2.6699, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 4.868463561992532e-05, + "loss": 2.4887, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 4.867198558370977e-05, + "loss": 2.4773, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 4.865927666913825e-05, + "loss": 2.7612, + "step": 1045 + }, + { + "epoch": 0.53, + "learning_rate": 4.864650890782113e-05, + "loss": 2.9116, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 4.863368233151514e-05, + "loss": 2.8205, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 4.862079697212329e-05, + "loss": 2.6711, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 4.8607852861694804e-05, + "loss": 3.1138, + "step": 1065 + }, + { + "epoch": 0.54, + "learning_rate": 4.859485003242503e-05, + "loss": 2.5603, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 4.858178851665539e-05, + "loss": 2.7981, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 4.856866834687323e-05, + "loss": 2.507, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 4.855548955571183e-05, + "loss": 3.0315, + "step": 1085 + }, + { + "epoch": 0.55, + "learning_rate": 4.8542252175950244e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 4.852895624051326e-05, + "loss": 2.6794, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 4.851560178247132e-05, + "loss": 2.8278, + "step": 1100 + }, + { + "epoch": 0.55, + "learning_rate": 4.850218883504041e-05, + "loss": 2.6993, + "step": 1105 + }, + { + "epoch": 0.56, + "learning_rate": 4.8488717431582005e-05, + "loss": 2.875, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 4.8475187605602974e-05, + "loss": 2.6057, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 4.84615993907555e-05, + "loss": 2.847, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 4.844795282083697e-05, + "loss": 2.7652, + "step": 1125 + }, + { 
+ "epoch": 0.57, + "learning_rate": 4.843424792978997e-05, + "loss": 2.8128, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 4.842048475170209e-05, + "loss": 2.7077, + "step": 1135 + }, + { + "epoch": 0.57, + "learning_rate": 4.840666332080592e-05, + "loss": 2.7081, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 4.8392783671478934e-05, + "loss": 2.6479, + "step": 1145 + }, + { + "epoch": 0.58, + "learning_rate": 4.837884583824342e-05, + "loss": 2.667, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 4.836484985576638e-05, + "loss": 2.7514, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 4.835079575885944e-05, + "loss": 2.5058, + "step": 1160 + }, + { + "epoch": 0.58, + "learning_rate": 4.833668358247876e-05, + "loss": 2.6183, + "step": 1165 + }, + { + "epoch": 0.59, + "learning_rate": 4.8322513361725006e-05, + "loss": 2.9959, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 4.830828513184317e-05, + "loss": 2.5714, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 4.8293998928222536e-05, + "loss": 2.965, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 4.827965478639661e-05, + "loss": 2.2106, + "step": 1185 + }, + { + "epoch": 0.6, + "learning_rate": 4.8265252742042965e-05, + "loss": 2.7456, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 4.8250792830983225e-05, + "loss": 2.5694, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 4.8236275089182936e-05, + "loss": 2.6826, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 4.8221699552751465e-05, + "loss": 2.878, + "step": 1205 + }, + { + "epoch": 0.61, + "learning_rate": 4.820706625794196e-05, + "loss": 2.8191, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 4.81923752411512e-05, + "loss": 2.739, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 4.8177626538919565e-05, + "loss": 3.0544, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 4.8162820187930875e-05, + "loss": 2.8393, + 
"step": 1225 + }, + { + "epoch": 0.62, + "learning_rate": 4.814795622501237e-05, + "loss": 2.4457, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 4.813303468713456e-05, + "loss": 2.8575, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 4.8118055611411197e-05, + "loss": 2.8307, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 4.810301903509909e-05, + "loss": 2.7951, + "step": 1245 + }, + { + "epoch": 0.63, + "learning_rate": 4.8087924995598125e-05, + "loss": 2.8456, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 4.807277353045106e-05, + "loss": 2.3564, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 4.8057564677343524e-05, + "loss": 2.5076, + "step": 1260 + }, + { + "epoch": 0.63, + "learning_rate": 4.8042298474103884e-05, + "loss": 2.605, + "step": 1265 + }, + { + "epoch": 0.64, + "learning_rate": 4.8026974958703116e-05, + "loss": 2.4782, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 4.8011594169254784e-05, + "loss": 2.7193, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 4.799615614401488e-05, + "loss": 2.8284, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 4.798066092138178e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.65, + "learning_rate": 4.796510853989612e-05, + "loss": 2.7396, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 4.794949903824069e-05, + "loss": 2.7948, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 4.793383245524035e-05, + "loss": 2.7818, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 4.791810882986197e-05, + "loss": 2.7334, + "step": 1305 + }, + { + "epoch": 0.66, + "learning_rate": 4.7902328201214256e-05, + "loss": 2.4824, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 4.7886490608547727e-05, + "loss": 2.6131, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 4.7870596091254584e-05, + "loss": 2.7778, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 
4.7854644688868594e-05, + "loss": 2.5263, + "step": 1325 + }, + { + "epoch": 0.67, + "learning_rate": 4.783863644106502e-05, + "loss": 2.7825, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 4.782257138766053e-05, + "loss": 2.7902, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 4.7806449568613066e-05, + "loss": 2.8333, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 4.779027102402177e-05, + "loss": 2.856, + "step": 1345 + }, + { + "epoch": 0.68, + "learning_rate": 4.777403579412686e-05, + "loss": 2.8021, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 4.775774391930956e-05, + "loss": 2.6639, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741395440091976e-05, + "loss": 2.5226, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.772499039713702e-05, + "loss": 2.6803, + "step": 1365 + }, + { + "epoch": 0.69, + "learning_rate": 4.7708528831248274e-05, + "loss": 2.4608, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.769201078336991e-05, + "loss": 2.786, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.7675436294586586e-05, + "loss": 2.8294, + "step": 1380 + }, + { + "epoch": 0.7, + "learning_rate": 4.7658805406123356e-05, + "loss": 2.6776, + "step": 1385 + }, + { + "epoch": 0.7, + "learning_rate": 4.7642118159345544e-05, + "loss": 2.8003, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.762537459575865e-05, + "loss": 2.7796, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.7608574757008245e-05, + "loss": 2.9156, + "step": 1400 + }, + { + "epoch": 0.71, + "learning_rate": 4.7591718684879883e-05, + "loss": 3.0521, + "step": 1405 + }, + { + "epoch": 0.71, + "learning_rate": 4.7574806421298976e-05, + "loss": 2.6469, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.755783800833071e-05, + "loss": 2.8512, + "step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.754081348817991e-05, + "loss": 2.66, + "step": 1420 + }, + { + "epoch": 
0.72, + "learning_rate": 4.752373290319096e-05, + "loss": 2.6625, + "step": 1425 + }, + { + "epoch": 0.72, + "learning_rate": 4.7506596295847716e-05, + "loss": 2.6711, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 4.7489403708773346e-05, + "loss": 2.8951, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 4.747215518473026e-05, + "loss": 2.7375, + "step": 1440 + }, + { + "epoch": 0.73, + "learning_rate": 4.745485076662e-05, + "loss": 2.8037, + "step": 1445 + }, + { + "epoch": 0.73, + "learning_rate": 4.743749049748315e-05, + "loss": 2.5375, + "step": 1450 + }, + { + "epoch": 0.73, + "learning_rate": 4.742007442049918e-05, + "loss": 2.7664, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 4.7402602578986374e-05, + "loss": 2.9644, + "step": 1460 + }, + { + "epoch": 0.74, + "learning_rate": 4.738507501640175e-05, + "loss": 2.8212, + "step": 1465 + }, + { + "epoch": 0.74, + "learning_rate": 4.736749177634087e-05, + "loss": 2.7201, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 4.734985290253782e-05, + "loss": 2.7087, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 4.7332158438865035e-05, + "loss": 2.5502, + "step": 1480 + }, + { + "epoch": 0.75, + "learning_rate": 4.731440842933322e-05, + "loss": 2.7607, + "step": 1485 + }, + { + "epoch": 0.75, + "learning_rate": 4.729660291809126e-05, + "loss": 2.5601, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 4.727874194942606e-05, + "loss": 2.5562, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 4.7260825567762486e-05, + "loss": 2.5539, + "step": 1500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7242853817663204e-05, + "loss": 2.6405, + "step": 1505 + }, + { + "epoch": 0.76, + "learning_rate": 4.72248267438286e-05, + "loss": 2.9237, + "step": 1510 + }, + { + "epoch": 0.76, + "learning_rate": 4.72067443910967e-05, + "loss": 3.0453, + "step": 1515 + }, + { + "epoch": 0.76, + "learning_rate": 4.718860680444297e-05, + "loss": 2.7176, + "step": 
1520 + }, + { + "epoch": 0.77, + "learning_rate": 4.71704140289803e-05, + "loss": 2.6713, + "step": 1525 + }, + { + "epoch": 0.77, + "learning_rate": 4.715216610995883e-05, + "loss": 2.7284, + "step": 1530 + }, + { + "epoch": 0.77, + "learning_rate": 4.713386309276585e-05, + "loss": 2.5022, + "step": 1535 + }, + { + "epoch": 0.77, + "learning_rate": 4.7115505022925706e-05, + "loss": 2.8323, + "step": 1540 + }, + { + "epoch": 0.78, + "learning_rate": 4.7097091946099666e-05, + "loss": 2.7005, + "step": 1545 + }, + { + "epoch": 0.78, + "learning_rate": 4.7078623908085825e-05, + "loss": 2.8142, + "step": 1550 + }, + { + "epoch": 0.78, + "learning_rate": 4.7060100954818974e-05, + "loss": 2.8505, + "step": 1555 + }, + { + "epoch": 0.78, + "learning_rate": 4.70415231323705e-05, + "loss": 2.5892, + "step": 1560 + }, + { + "epoch": 0.79, + "learning_rate": 4.7022890486948236e-05, + "loss": 2.8951, + "step": 1565 + }, + { + "epoch": 0.79, + "learning_rate": 4.700420306489641e-05, + "loss": 2.7551, + "step": 1570 + }, + { + "epoch": 0.79, + "learning_rate": 4.698546091269547e-05, + "loss": 2.6814, + "step": 1575 + }, + { + "epoch": 0.79, + "learning_rate": 4.696666407696201e-05, + "loss": 2.8698, + "step": 1580 + }, + { + "epoch": 0.8, + "learning_rate": 4.694781260444862e-05, + "loss": 2.8389, + "step": 1585 + }, + { + "epoch": 0.8, + "learning_rate": 4.6928906542043786e-05, + "loss": 2.8353, + "step": 1590 + }, + { + "epoch": 0.8, + "learning_rate": 4.690994593677179e-05, + "loss": 2.6565, + "step": 1595 + }, + { + "epoch": 0.8, + "learning_rate": 4.689093083579256e-05, + "loss": 2.6958, + "step": 1600 + }, + { + "epoch": 0.81, + "learning_rate": 4.687186128640157e-05, + "loss": 2.6768, + "step": 1605 + }, + { + "epoch": 0.81, + "learning_rate": 4.685273733602975e-05, + "loss": 2.734, + "step": 1610 + }, + { + "epoch": 0.81, + "learning_rate": 4.6833559032243284e-05, + "loss": 2.5348, + "step": 1615 + }, + { + "epoch": 0.81, + "learning_rate": 4.6814326422743594e-05, + 
"loss": 2.7775, + "step": 1620 + }, + { + "epoch": 0.82, + "learning_rate": 4.679503955536715e-05, + "loss": 2.6578, + "step": 1625 + }, + { + "epoch": 0.82, + "learning_rate": 4.6775698478085393e-05, + "loss": 2.8792, + "step": 1630 + }, + { + "epoch": 0.82, + "learning_rate": 4.675630323900458e-05, + "loss": 2.7883, + "step": 1635 + }, + { + "epoch": 0.82, + "learning_rate": 4.67368538863657e-05, + "loss": 2.8824, + "step": 1640 + }, + { + "epoch": 0.83, + "learning_rate": 4.671735046854433e-05, + "loss": 2.6775, + "step": 1645 + }, + { + "epoch": 0.83, + "learning_rate": 4.669779303405051e-05, + "loss": 2.5658, + "step": 1650 + }, + { + "epoch": 0.83, + "learning_rate": 4.667818163152864e-05, + "loss": 2.5262, + "step": 1655 + }, + { + "epoch": 0.83, + "learning_rate": 4.665851630975736e-05, + "loss": 2.5749, + "step": 1660 + }, + { + "epoch": 0.84, + "learning_rate": 4.6638797117649424e-05, + "loss": 2.8718, + "step": 1665 + }, + { + "epoch": 0.84, + "learning_rate": 4.661902410425155e-05, + "loss": 2.8039, + "step": 1670 + }, + { + "epoch": 0.84, + "learning_rate": 4.659919731874435e-05, + "loss": 2.8089, + "step": 1675 + }, + { + "epoch": 0.84, + "learning_rate": 4.6579316810442174e-05, + "loss": 2.8144, + "step": 1680 + }, + { + "epoch": 0.85, + "learning_rate": 4.6559382628792995e-05, + "loss": 2.5963, + "step": 1685 + }, + { + "epoch": 0.85, + "learning_rate": 4.653939482337828e-05, + "loss": 2.5755, + "step": 1690 + }, + { + "epoch": 0.85, + "learning_rate": 4.651935344391286e-05, + "loss": 2.7631, + "step": 1695 + }, + { + "epoch": 0.85, + "learning_rate": 4.649925854024486e-05, + "loss": 2.8117, + "step": 1700 + }, + { + "epoch": 0.86, + "learning_rate": 4.647911016235549e-05, + "loss": 2.7352, + "step": 1705 + }, + { + "epoch": 0.86, + "learning_rate": 4.6458908360358985e-05, + "loss": 2.5572, + "step": 1710 + }, + { + "epoch": 0.86, + "learning_rate": 4.643865318450246e-05, + "loss": 2.8372, + "step": 1715 + }, + { + "epoch": 0.86, + "learning_rate": 
4.6418344685165774e-05, + "loss": 2.7892, + "step": 1720 + }, + { + "epoch": 0.87, + "learning_rate": 4.639798291286143e-05, + "loss": 2.7832, + "step": 1725 + }, + { + "epoch": 0.87, + "learning_rate": 4.637756791823442e-05, + "loss": 2.5401, + "step": 1730 + }, + { + "epoch": 0.87, + "learning_rate": 4.635709975206213e-05, + "loss": 2.9286, + "step": 1735 + }, + { + "epoch": 0.87, + "learning_rate": 4.633657846525417e-05, + "loss": 2.6474, + "step": 1740 + }, + { + "epoch": 0.88, + "learning_rate": 4.6316004108852305e-05, + "loss": 2.5047, + "step": 1745 + }, + { + "epoch": 0.88, + "learning_rate": 4.629537673403029e-05, + "loss": 2.8374, + "step": 1750 + }, + { + "epoch": 0.88, + "learning_rate": 4.627469639209373e-05, + "loss": 2.634, + "step": 1755 + }, + { + "epoch": 0.88, + "learning_rate": 4.6253963134480006e-05, + "loss": 2.6884, + "step": 1760 + }, + { + "epoch": 0.89, + "learning_rate": 4.623317701275809e-05, + "loss": 2.6334, + "step": 1765 + }, + { + "epoch": 0.89, + "learning_rate": 4.621233807862844e-05, + "loss": 2.764, + "step": 1770 + }, + { + "epoch": 0.89, + "learning_rate": 4.6191446383922886e-05, + "loss": 3.0864, + "step": 1775 + }, + { + "epoch": 0.89, + "learning_rate": 4.617050198060448e-05, + "loss": 2.7964, + "step": 1780 + }, + { + "epoch": 0.9, + "learning_rate": 4.6149504920767376e-05, + "loss": 2.4538, + "step": 1785 + }, + { + "epoch": 0.9, + "learning_rate": 4.6128455256636706e-05, + "loss": 2.7352, + "step": 1790 + }, + { + "epoch": 0.9, + "learning_rate": 4.6107353040568416e-05, + "loss": 2.6893, + "step": 1795 + }, + { + "epoch": 0.9, + "learning_rate": 4.6086198325049185e-05, + "loss": 2.8609, + "step": 1800 + }, + { + "epoch": 0.91, + "learning_rate": 4.6064991162696275e-05, + "loss": 2.5285, + "step": 1805 + }, + { + "epoch": 0.91, + "learning_rate": 4.604373160625739e-05, + "loss": 2.5707, + "step": 1810 + }, + { + "epoch": 0.91, + "learning_rate": 4.602241970861053e-05, + "loss": 2.7198, + "step": 1815 + }, + { + "epoch": 
0.91, + "learning_rate": 4.6001055522763926e-05, + "loss": 2.855, + "step": 1820 + }, + { + "epoch": 0.92, + "learning_rate": 4.597963910185582e-05, + "loss": 2.4175, + "step": 1825 + }, + { + "epoch": 0.92, + "learning_rate": 4.595817049915441e-05, + "loss": 2.5444, + "step": 1830 + }, + { + "epoch": 0.92, + "learning_rate": 4.5936649768057646e-05, + "loss": 2.6893, + "step": 1835 + }, + { + "epoch": 0.92, + "learning_rate": 4.591507696209318e-05, + "loss": 2.6725, + "step": 1840 + }, + { + "epoch": 0.93, + "learning_rate": 4.589345213491817e-05, + "loss": 2.834, + "step": 1845 + }, + { + "epoch": 0.93, + "learning_rate": 4.587177534031914e-05, + "loss": 2.8259, + "step": 1850 + }, + { + "epoch": 0.93, + "learning_rate": 4.585004663221188e-05, + "loss": 2.6146, + "step": 1855 + }, + { + "epoch": 0.93, + "learning_rate": 4.582826606464134e-05, + "loss": 2.4673, + "step": 1860 + }, + { + "epoch": 0.94, + "learning_rate": 4.5806433691781416e-05, + "loss": 2.9178, + "step": 1865 + }, + { + "epoch": 0.94, + "learning_rate": 4.578454956793487e-05, + "loss": 2.9224, + "step": 1870 + }, + { + "epoch": 0.94, + "learning_rate": 4.576261374753318e-05, + "loss": 2.7987, + "step": 1875 + }, + { + "epoch": 0.94, + "learning_rate": 4.574062628513642e-05, + "loss": 2.3848, + "step": 1880 + }, + { + "epoch": 0.95, + "learning_rate": 4.57185872354331e-05, + "loss": 2.6054, + "step": 1885 + }, + { + "epoch": 0.95, + "learning_rate": 4.569649665324003e-05, + "loss": 2.6743, + "step": 1890 + }, + { + "epoch": 0.95, + "learning_rate": 4.567435459350222e-05, + "loss": 2.9375, + "step": 1895 + }, + { + "epoch": 0.95, + "learning_rate": 4.565216111129269e-05, + "loss": 2.9372, + "step": 1900 + }, + { + "epoch": 0.96, + "learning_rate": 4.562991626181239e-05, + "loss": 2.669, + "step": 1905 + }, + { + "epoch": 0.96, + "learning_rate": 4.560762010039001e-05, + "loss": 2.7644, + "step": 1910 + }, + { + "epoch": 0.96, + "learning_rate": 4.558527268248187e-05, + "loss": 2.6886, + "step": 1915 
+ }, + { + "epoch": 0.96, + "learning_rate": 4.55628740636718e-05, + "loss": 2.8618, + "step": 1920 + }, + { + "epoch": 0.97, + "learning_rate": 4.554042429967095e-05, + "loss": 2.8411, + "step": 1925 + }, + { + "epoch": 0.97, + "learning_rate": 4.55179234463177e-05, + "loss": 2.6047, + "step": 1930 + }, + { + "epoch": 0.97, + "learning_rate": 4.5495371559577496e-05, + "loss": 2.8675, + "step": 1935 + }, + { + "epoch": 0.97, + "learning_rate": 4.547276869554271e-05, + "loss": 2.751, + "step": 1940 + }, + { + "epoch": 0.98, + "learning_rate": 4.545011491043253e-05, + "loss": 2.8638, + "step": 1945 + }, + { + "epoch": 0.98, + "learning_rate": 4.5427410260592775e-05, + "loss": 2.5092, + "step": 1950 + }, + { + "epoch": 0.98, + "learning_rate": 4.540465480249579e-05, + "loss": 2.5059, + "step": 1955 + }, + { + "epoch": 0.98, + "learning_rate": 4.5381848592740285e-05, + "loss": 2.9433, + "step": 1960 + }, + { + "epoch": 0.99, + "learning_rate": 4.535899168805121e-05, + "loss": 2.6959, + "step": 1965 + }, + { + "epoch": 0.99, + "learning_rate": 4.533608414527961e-05, + "loss": 2.552, + "step": 1970 + }, + { + "epoch": 0.99, + "learning_rate": 4.5313126021402465e-05, + "loss": 2.7169, + "step": 1975 + }, + { + "epoch": 0.99, + "learning_rate": 4.529011737352258e-05, + "loss": 2.5672, + "step": 1980 + }, + { + "epoch": 1.0, + "learning_rate": 4.526705825886841e-05, + "loss": 2.6434, + "step": 1985 + }, + { + "epoch": 1.0, + "learning_rate": 4.5243948734793947e-05, + "loss": 2.8062, + "step": 1990 + }, + { + "epoch": 1.0, + "learning_rate": 4.5220788858778556e-05, + "loss": 2.6929, + "step": 1995 + }, + { + "epoch": 1.0, + "learning_rate": 4.519757868842684e-05, + "loss": 2.5837, + "step": 2000 + }, + { + "epoch": 1.01, + "learning_rate": 4.517431828146852e-05, + "loss": 2.821, + "step": 2005 + }, + { + "epoch": 1.01, + "learning_rate": 4.515100769575824e-05, + "loss": 2.7913, + "step": 2010 + }, + { + "epoch": 1.01, + "learning_rate": 4.512764698927545e-05, + "loss": 
2.699, + "step": 2015 + }, + { + "epoch": 1.01, + "learning_rate": 4.5104236220124286e-05, + "loss": 2.6574, + "step": 2020 + }, + { + "epoch": 1.02, + "learning_rate": 4.508077544653338e-05, + "loss": 2.5909, + "step": 2025 + }, + { + "epoch": 1.02, + "learning_rate": 4.5057264726855765e-05, + "loss": 2.5695, + "step": 2030 + }, + { + "epoch": 1.02, + "learning_rate": 4.5033704119568675e-05, + "loss": 2.4937, + "step": 2035 + }, + { + "epoch": 1.02, + "learning_rate": 4.501009368327344e-05, + "loss": 2.7253, + "step": 2040 + }, + { + "epoch": 1.03, + "learning_rate": 4.4986433476695334e-05, + "loss": 2.8681, + "step": 2045 + }, + { + "epoch": 1.03, + "learning_rate": 4.496272355868341e-05, + "loss": 2.74, + "step": 2050 + }, + { + "epoch": 1.03, + "learning_rate": 4.4938963988210365e-05, + "loss": 2.7439, + "step": 2055 + }, + { + "epoch": 1.03, + "learning_rate": 4.491515482437242e-05, + "loss": 2.9539, + "step": 2060 + }, + { + "epoch": 1.04, + "learning_rate": 4.4891296126389104e-05, + "loss": 2.8165, + "step": 2065 + }, + { + "epoch": 1.04, + "learning_rate": 4.48673879536032e-05, + "loss": 2.601, + "step": 2070 + }, + { + "epoch": 1.04, + "learning_rate": 4.484343036548051e-05, + "loss": 2.5189, + "step": 2075 + }, + { + "epoch": 1.04, + "learning_rate": 4.481942342160976e-05, + "loss": 2.6276, + "step": 2080 + }, + { + "epoch": 1.05, + "learning_rate": 4.479536718170243e-05, + "loss": 2.5027, + "step": 2085 + }, + { + "epoch": 1.05, + "learning_rate": 4.477126170559262e-05, + "loss": 2.8654, + "step": 2090 + }, + { + "epoch": 1.05, + "learning_rate": 4.474710705323688e-05, + "loss": 2.8511, + "step": 2095 + }, + { + "epoch": 1.05, + "learning_rate": 4.47229032847141e-05, + "loss": 2.6129, + "step": 2100 + }, + { + "epoch": 1.06, + "learning_rate": 4.469865046022531e-05, + "loss": 2.8964, + "step": 2105 + }, + { + "epoch": 1.06, + "learning_rate": 4.4674348640093554e-05, + "loss": 2.7502, + "step": 2110 + }, + { + "epoch": 1.06, + "learning_rate": 
4.4649997884763765e-05, + "loss": 2.591, + "step": 2115 + }, + { + "epoch": 1.06, + "learning_rate": 4.462559825480257e-05, + "loss": 2.7982, + "step": 2120 + }, + { + "epoch": 1.07, + "learning_rate": 4.460114981089815e-05, + "loss": 2.576, + "step": 2125 + }, + { + "epoch": 1.07, + "learning_rate": 4.457665261386014e-05, + "loss": 2.692, + "step": 2130 + }, + { + "epoch": 1.07, + "learning_rate": 4.455210672461938e-05, + "loss": 2.5818, + "step": 2135 + }, + { + "epoch": 1.07, + "learning_rate": 4.452751220422787e-05, + "loss": 2.6792, + "step": 2140 + }, + { + "epoch": 1.08, + "learning_rate": 4.450286911385856e-05, + "loss": 2.8352, + "step": 2145 + }, + { + "epoch": 1.08, + "learning_rate": 4.4478177514805166e-05, + "loss": 2.5787, + "step": 2150 + }, + { + "epoch": 1.08, + "learning_rate": 4.4453437468482103e-05, + "loss": 2.655, + "step": 2155 + }, + { + "epoch": 1.08, + "learning_rate": 4.442864903642428e-05, + "loss": 2.7664, + "step": 2160 + }, + { + "epoch": 1.09, + "learning_rate": 4.440381228028692e-05, + "loss": 2.7231, + "step": 2165 + }, + { + "epoch": 1.09, + "learning_rate": 4.437892726184548e-05, + "loss": 2.416, + "step": 2170 + }, + { + "epoch": 1.09, + "learning_rate": 4.4353994042995446e-05, + "loss": 2.4619, + "step": 2175 + }, + { + "epoch": 1.09, + "learning_rate": 4.4329012685752183e-05, + "loss": 2.7816, + "step": 2180 + }, + { + "epoch": 1.1, + "learning_rate": 4.430398325225078e-05, + "loss": 2.8678, + "step": 2185 + }, + { + "epoch": 1.1, + "learning_rate": 4.427890580474594e-05, + "loss": 2.6007, + "step": 2190 + }, + { + "epoch": 1.1, + "learning_rate": 4.4253780405611754e-05, + "loss": 2.8195, + "step": 2195 + }, + { + "epoch": 1.1, + "learning_rate": 4.42286071173416e-05, + "loss": 2.5117, + "step": 2200 + }, + { + "epoch": 1.11, + "learning_rate": 4.4203386002547956e-05, + "loss": 2.5527, + "step": 2205 + }, + { + "epoch": 1.11, + "learning_rate": 4.417811712396226e-05, + "loss": 2.662, + "step": 2210 + }, + { + "epoch": 1.11, + 
"learning_rate": 4.415280054443477e-05, + "loss": 2.7563, + "step": 2215 + }, + { + "epoch": 1.11, + "learning_rate": 4.4127436326934354e-05, + "loss": 2.5118, + "step": 2220 + }, + { + "epoch": 1.12, + "learning_rate": 4.41020245345484e-05, + "loss": 2.7208, + "step": 2225 + }, + { + "epoch": 1.12, + "learning_rate": 4.4076565230482607e-05, + "loss": 2.649, + "step": 2230 + }, + { + "epoch": 1.12, + "learning_rate": 4.4051058478060856e-05, + "loss": 2.652, + "step": 2235 + }, + { + "epoch": 1.12, + "learning_rate": 4.402550434072505e-05, + "loss": 2.6885, + "step": 2240 + }, + { + "epoch": 1.13, + "learning_rate": 4.3999902882034935e-05, + "loss": 2.8545, + "step": 2245 + }, + { + "epoch": 1.13, + "learning_rate": 4.397425416566797e-05, + "loss": 2.9435, + "step": 2250 + }, + { + "epoch": 1.13, + "learning_rate": 4.3948558255419146e-05, + "loss": 2.6555, + "step": 2255 + }, + { + "epoch": 1.13, + "learning_rate": 4.392281521520085e-05, + "loss": 2.6108, + "step": 2260 + }, + { + "epoch": 1.14, + "learning_rate": 4.389702510904269e-05, + "loss": 2.6256, + "step": 2265 + }, + { + "epoch": 1.14, + "learning_rate": 4.387118800109133e-05, + "loss": 2.7996, + "step": 2270 + }, + { + "epoch": 1.14, + "learning_rate": 4.384530395561035e-05, + "loss": 2.6649, + "step": 2275 + }, + { + "epoch": 1.14, + "learning_rate": 4.381937303698006e-05, + "loss": 3.0073, + "step": 2280 + }, + { + "epoch": 1.15, + "learning_rate": 4.379339530969738e-05, + "loss": 2.8493, + "step": 2285 + }, + { + "epoch": 1.15, + "learning_rate": 4.3767370838375635e-05, + "loss": 2.5572, + "step": 2290 + }, + { + "epoch": 1.15, + "learning_rate": 4.374129968774443e-05, + "loss": 2.3837, + "step": 2295 + }, + { + "epoch": 1.15, + "learning_rate": 4.371518192264946e-05, + "loss": 2.4604, + "step": 2300 + }, + { + "epoch": 1.16, + "learning_rate": 4.3689017608052374e-05, + "loss": 2.8866, + "step": 2305 + }, + { + "epoch": 1.16, + "learning_rate": 4.3662806809030585e-05, + "loss": 2.8943, + "step": 2310 + 
}, + { + "epoch": 1.16, + "learning_rate": 4.3636549590777144e-05, + "loss": 2.7629, + "step": 2315 + }, + { + "epoch": 1.16, + "learning_rate": 4.361024601860054e-05, + "loss": 2.8629, + "step": 2320 + }, + { + "epoch": 1.17, + "learning_rate": 4.3583896157924574e-05, + "loss": 2.7952, + "step": 2325 + }, + { + "epoch": 1.17, + "learning_rate": 4.355750007428817e-05, + "loss": 2.2057, + "step": 2330 + }, + { + "epoch": 1.17, + "learning_rate": 4.3531057833345216e-05, + "loss": 2.9143, + "step": 2335 + }, + { + "epoch": 1.17, + "learning_rate": 4.3504569500864424e-05, + "loss": 2.5354, + "step": 2340 + }, + { + "epoch": 1.18, + "learning_rate": 4.347803514272911e-05, + "loss": 2.7451, + "step": 2345 + }, + { + "epoch": 1.18, + "learning_rate": 4.3451454824937113e-05, + "loss": 3.0827, + "step": 2350 + }, + { + "epoch": 1.18, + "learning_rate": 4.3424828613600555e-05, + "loss": 2.5842, + "step": 2355 + }, + { + "epoch": 1.18, + "learning_rate": 4.339815657494572e-05, + "loss": 2.4492, + "step": 2360 + }, + { + "epoch": 1.19, + "learning_rate": 4.3371438775312865e-05, + "loss": 2.5067, + "step": 2365 + }, + { + "epoch": 1.19, + "learning_rate": 4.334467528115608e-05, + "loss": 2.5902, + "step": 2370 + }, + { + "epoch": 1.19, + "learning_rate": 4.33178661590431e-05, + "loss": 2.8618, + "step": 2375 + }, + { + "epoch": 1.19, + "learning_rate": 4.329101147565515e-05, + "loss": 2.6831, + "step": 2380 + }, + { + "epoch": 1.2, + "learning_rate": 4.3264111297786794e-05, + "loss": 2.7531, + "step": 2385 + }, + { + "epoch": 1.2, + "learning_rate": 4.323716569234572e-05, + "loss": 2.819, + "step": 2390 + }, + { + "epoch": 1.2, + "learning_rate": 4.321017472635263e-05, + "loss": 2.6319, + "step": 2395 + }, + { + "epoch": 1.2, + "learning_rate": 4.318313846694105e-05, + "loss": 2.3078, + "step": 2400 + }, + { + "epoch": 1.21, + "learning_rate": 4.315605698135714e-05, + "loss": 2.7962, + "step": 2405 + }, + { + "epoch": 1.21, + "learning_rate": 4.312893033695958e-05, + "loss": 
2.8479, + "step": 2410 + }, + { + "epoch": 1.21, + "learning_rate": 4.310175860121936e-05, + "loss": 2.6289, + "step": 2415 + }, + { + "epoch": 1.21, + "learning_rate": 4.30745418417196e-05, + "loss": 2.7003, + "step": 2420 + }, + { + "epoch": 1.22, + "learning_rate": 4.304728012615543e-05, + "loss": 2.4947, + "step": 2425 + }, + { + "epoch": 1.22, + "learning_rate": 4.3019973522333815e-05, + "loss": 2.6911, + "step": 2430 + }, + { + "epoch": 1.22, + "learning_rate": 4.2992622098173334e-05, + "loss": 2.6931, + "step": 2435 + }, + { + "epoch": 1.22, + "learning_rate": 4.296522592170406e-05, + "loss": 2.774, + "step": 2440 + }, + { + "epoch": 1.23, + "learning_rate": 4.293778506106737e-05, + "loss": 2.759, + "step": 2445 + }, + { + "epoch": 1.23, + "learning_rate": 4.29102995845158e-05, + "loss": 2.5543, + "step": 2450 + }, + { + "epoch": 1.23, + "learning_rate": 4.288276956041284e-05, + "loss": 2.7702, + "step": 2455 + }, + { + "epoch": 1.23, + "learning_rate": 4.285519505723278e-05, + "loss": 2.835, + "step": 2460 + }, + { + "epoch": 1.24, + "learning_rate": 4.282757614356055e-05, + "loss": 2.6516, + "step": 2465 + }, + { + "epoch": 1.24, + "learning_rate": 4.2799912888091544e-05, + "loss": 2.7392, + "step": 2470 + }, + { + "epoch": 1.24, + "learning_rate": 4.277220535963143e-05, + "loss": 2.6522, + "step": 2475 + }, + { + "epoch": 1.24, + "learning_rate": 4.274445362709601e-05, + "loss": 2.6514, + "step": 2480 + }, + { + "epoch": 1.25, + "learning_rate": 4.271665775951104e-05, + "loss": 2.7507, + "step": 2485 + }, + { + "epoch": 1.25, + "learning_rate": 4.2688817826012005e-05, + "loss": 3.0499, + "step": 2490 + }, + { + "epoch": 1.25, + "learning_rate": 4.2660933895844055e-05, + "loss": 2.6927, + "step": 2495 + }, + { + "epoch": 1.25, + "learning_rate": 4.2633006038361736e-05, + "loss": 2.8456, + "step": 2500 + }, + { + "epoch": 1.26, + "learning_rate": 4.2605034323028844e-05, + "loss": 2.671, + "step": 2505 + }, + { + "epoch": 1.26, + "learning_rate": 
4.2577018819418296e-05, + "loss": 2.6543, + "step": 2510 + }, + { + "epoch": 1.26, + "learning_rate": 4.254895959721189e-05, + "loss": 2.566, + "step": 2515 + }, + { + "epoch": 1.26, + "learning_rate": 4.252085672620019e-05, + "loss": 2.4943, + "step": 2520 + }, + { + "epoch": 1.27, + "learning_rate": 4.249271027628228e-05, + "loss": 2.5115, + "step": 2525 + }, + { + "epoch": 1.27, + "learning_rate": 4.24645203174657e-05, + "loss": 2.3859, + "step": 2530 + }, + { + "epoch": 1.27, + "learning_rate": 4.243628691986617e-05, + "loss": 2.8148, + "step": 2535 + }, + { + "epoch": 1.27, + "learning_rate": 4.240801015370743e-05, + "loss": 2.5597, + "step": 2540 + }, + { + "epoch": 1.28, + "learning_rate": 4.2379690089321145e-05, + "loss": 2.805, + "step": 2545 + }, + { + "epoch": 1.28, + "learning_rate": 4.235132679714664e-05, + "loss": 2.7308, + "step": 2550 + }, + { + "epoch": 1.28, + "learning_rate": 4.232292034773076e-05, + "loss": 2.675, + "step": 2555 + }, + { + "epoch": 1.28, + "learning_rate": 4.2294470811727704e-05, + "loss": 2.6359, + "step": 2560 + }, + { + "epoch": 1.29, + "learning_rate": 4.226597825989883e-05, + "loss": 2.4288, + "step": 2565 + }, + { + "epoch": 1.29, + "learning_rate": 4.223744276311249e-05, + "loss": 2.642, + "step": 2570 + }, + { + "epoch": 1.29, + "learning_rate": 4.220886439234385e-05, + "loss": 2.7375, + "step": 2575 + }, + { + "epoch": 1.29, + "learning_rate": 4.218024321867472e-05, + "loss": 2.6114, + "step": 2580 + }, + { + "epoch": 1.3, + "learning_rate": 4.2151579313293364e-05, + "loss": 2.4941, + "step": 2585 + }, + { + "epoch": 1.3, + "learning_rate": 4.212287274749434e-05, + "loss": 2.6086, + "step": 2590 + }, + { + "epoch": 1.3, + "learning_rate": 4.2094123592678295e-05, + "loss": 2.7854, + "step": 2595 + }, + { + "epoch": 1.3, + "learning_rate": 4.206533192035184e-05, + "loss": 2.8291, + "step": 2600 + }, + { + "epoch": 1.31, + "learning_rate": 4.2036497802127294e-05, + "loss": 2.6846, + "step": 2605 + }, + { + "epoch": 1.31, + 
"learning_rate": 4.200762130972259e-05, + "loss": 2.6667, + "step": 2610 + }, + { + "epoch": 1.31, + "learning_rate": 4.197870251496103e-05, + "loss": 2.7895, + "step": 2615 + }, + { + "epoch": 1.31, + "learning_rate": 4.1949741489771155e-05, + "loss": 2.6958, + "step": 2620 + }, + { + "epoch": 1.32, + "learning_rate": 4.192073830618652e-05, + "loss": 2.8536, + "step": 2625 + }, + { + "epoch": 1.32, + "learning_rate": 4.189169303634555e-05, + "loss": 2.5218, + "step": 2630 + }, + { + "epoch": 1.32, + "learning_rate": 4.186260575249136e-05, + "loss": 2.6141, + "step": 2635 + }, + { + "epoch": 1.32, + "learning_rate": 4.1833476526971546e-05, + "loss": 2.6231, + "step": 2640 + }, + { + "epoch": 1.33, + "learning_rate": 4.1804305432238036e-05, + "loss": 2.8229, + "step": 2645 + }, + { + "epoch": 1.33, + "learning_rate": 4.1775092540846885e-05, + "loss": 2.4763, + "step": 2650 + }, + { + "epoch": 1.33, + "learning_rate": 4.174583792545813e-05, + "loss": 2.5162, + "step": 2655 + }, + { + "epoch": 1.33, + "learning_rate": 4.1716541658835574e-05, + "loss": 2.7301, + "step": 2660 + }, + { + "epoch": 1.34, + "learning_rate": 4.16872038138466e-05, + "loss": 2.6055, + "step": 2665 + }, + { + "epoch": 1.34, + "learning_rate": 4.165782446346203e-05, + "loss": 2.6529, + "step": 2670 + }, + { + "epoch": 1.34, + "learning_rate": 4.162840368075591e-05, + "loss": 2.7934, + "step": 2675 + }, + { + "epoch": 1.34, + "learning_rate": 4.159894153890536e-05, + "loss": 2.9108, + "step": 2680 + }, + { + "epoch": 1.35, + "learning_rate": 4.1569438111190326e-05, + "loss": 2.6543, + "step": 2685 + }, + { + "epoch": 1.35, + "learning_rate": 4.1539893470993496e-05, + "loss": 2.6038, + "step": 2690 + }, + { + "epoch": 1.35, + "learning_rate": 4.151030769180002e-05, + "loss": 2.5741, + "step": 2695 + }, + { + "epoch": 1.35, + "learning_rate": 4.14806808471974e-05, + "loss": 2.8525, + "step": 2700 + }, + { + "epoch": 1.36, + "learning_rate": 4.1451013010875275e-05, + "loss": 2.5151, + "step": 2705 + 
}, + { + "epoch": 1.36, + "learning_rate": 4.1421304256625206e-05, + "loss": 2.5666, + "step": 2710 + }, + { + "epoch": 1.36, + "learning_rate": 4.139155465834058e-05, + "loss": 2.7287, + "step": 2715 + }, + { + "epoch": 1.36, + "learning_rate": 4.136176429001634e-05, + "loss": 2.8867, + "step": 2720 + }, + { + "epoch": 1.37, + "learning_rate": 4.133193322574885e-05, + "loss": 2.6398, + "step": 2725 + }, + { + "epoch": 1.37, + "learning_rate": 4.130206153973568e-05, + "loss": 2.699, + "step": 2730 + }, + { + "epoch": 1.37, + "learning_rate": 4.1272149306275447e-05, + "loss": 2.5248, + "step": 2735 + }, + { + "epoch": 1.37, + "learning_rate": 4.124219659976762e-05, + "loss": 2.8947, + "step": 2740 + }, + { + "epoch": 1.38, + "learning_rate": 4.1212203494712344e-05, + "loss": 2.6473, + "step": 2745 + }, + { + "epoch": 1.38, + "learning_rate": 4.1182170065710226e-05, + "loss": 2.3734, + "step": 2750 + }, + { + "epoch": 1.38, + "learning_rate": 4.115209638746218e-05, + "loss": 2.685, + "step": 2755 + }, + { + "epoch": 1.39, + "learning_rate": 4.1121982534769224e-05, + "loss": 2.9712, + "step": 2760 + }, + { + "epoch": 1.39, + "learning_rate": 4.109182858253231e-05, + "loss": 2.6578, + "step": 2765 + }, + { + "epoch": 1.39, + "learning_rate": 4.106163460575212e-05, + "loss": 2.6842, + "step": 2770 + }, + { + "epoch": 1.39, + "learning_rate": 4.10314006795289e-05, + "loss": 2.511, + "step": 2775 + }, + { + "epoch": 1.4, + "learning_rate": 4.100112687906224e-05, + "loss": 2.8426, + "step": 2780 + }, + { + "epoch": 1.4, + "learning_rate": 4.097081327965092e-05, + "loss": 2.8462, + "step": 2785 + }, + { + "epoch": 1.4, + "learning_rate": 4.094045995669271e-05, + "loss": 2.7552, + "step": 2790 + }, + { + "epoch": 1.4, + "learning_rate": 4.091006698568419e-05, + "loss": 2.3802, + "step": 2795 + }, + { + "epoch": 1.41, + "learning_rate": 4.087963444222054e-05, + "loss": 2.3967, + "step": 2800 + }, + { + "epoch": 1.41, + "learning_rate": 4.084916240199537e-05, + "loss": 2.7945, 
+ "step": 2805 + }, + { + "epoch": 1.41, + "learning_rate": 4.0818650940800524e-05, + "loss": 2.5304, + "step": 2810 + }, + { + "epoch": 1.41, + "learning_rate": 4.0788100134525925e-05, + "loss": 2.6779, + "step": 2815 + }, + { + "epoch": 1.42, + "learning_rate": 4.075751005915932e-05, + "loss": 2.6202, + "step": 2820 + }, + { + "epoch": 1.42, + "learning_rate": 4.072688079078616e-05, + "loss": 2.5915, + "step": 2825 + }, + { + "epoch": 1.42, + "learning_rate": 4.069621240558935e-05, + "loss": 2.5139, + "step": 2830 + }, + { + "epoch": 1.42, + "learning_rate": 4.066550497984911e-05, + "loss": 2.5506, + "step": 2835 + }, + { + "epoch": 1.43, + "learning_rate": 4.063475858994276e-05, + "loss": 2.7154, + "step": 2840 + }, + { + "epoch": 1.43, + "learning_rate": 4.060397331234452e-05, + "loss": 2.6309, + "step": 2845 + }, + { + "epoch": 1.43, + "learning_rate": 4.0573149223625365e-05, + "loss": 2.6922, + "step": 2850 + }, + { + "epoch": 1.43, + "learning_rate": 4.054228640045276e-05, + "loss": 2.703, + "step": 2855 + }, + { + "epoch": 1.44, + "learning_rate": 4.051138491959053e-05, + "loss": 2.3475, + "step": 2860 + }, + { + "epoch": 1.44, + "learning_rate": 4.048044485789869e-05, + "loss": 2.7186, + "step": 2865 + }, + { + "epoch": 1.44, + "learning_rate": 4.044946629233316e-05, + "loss": 2.4667, + "step": 2870 + }, + { + "epoch": 1.44, + "learning_rate": 4.041844929994566e-05, + "loss": 2.562, + "step": 2875 + }, + { + "epoch": 1.45, + "learning_rate": 4.038739395788347e-05, + "loss": 2.4116, + "step": 2880 + }, + { + "epoch": 1.45, + "learning_rate": 4.0356300343389276e-05, + "loss": 2.6133, + "step": 2885 + }, + { + "epoch": 1.45, + "learning_rate": 4.032516853380094e-05, + "loss": 2.6893, + "step": 2890 + }, + { + "epoch": 1.45, + "learning_rate": 4.029399860655132e-05, + "loss": 2.8196, + "step": 2895 + }, + { + "epoch": 1.46, + "learning_rate": 4.026279063916811e-05, + "loss": 2.8933, + "step": 2900 + }, + { + "epoch": 1.46, + "learning_rate": 
4.023154470927361e-05, + "loss": 3.0289, + "step": 2905 + }, + { + "epoch": 1.46, + "learning_rate": 4.0200260894584516e-05, + "loss": 2.5586, + "step": 2910 + }, + { + "epoch": 1.46, + "learning_rate": 4.016893927291179e-05, + "loss": 2.785, + "step": 2915 + }, + { + "epoch": 1.47, + "learning_rate": 4.0137579922160395e-05, + "loss": 2.5293, + "step": 2920 + }, + { + "epoch": 1.47, + "learning_rate": 4.010618292032917e-05, + "loss": 2.6696, + "step": 2925 + }, + { + "epoch": 1.47, + "learning_rate": 4.007474834551058e-05, + "loss": 2.7003, + "step": 2930 + }, + { + "epoch": 1.47, + "learning_rate": 4.004327627589056e-05, + "loss": 2.6005, + "step": 2935 + }, + { + "epoch": 1.48, + "learning_rate": 4.001176678974828e-05, + "loss": 2.7198, + "step": 2940 + }, + { + "epoch": 1.48, + "learning_rate": 3.998021996545599e-05, + "loss": 2.5329, + "step": 2945 + }, + { + "epoch": 1.48, + "learning_rate": 3.994863588147881e-05, + "loss": 2.7051, + "step": 2950 + }, + { + "epoch": 1.48, + "learning_rate": 3.9917014616374535e-05, + "loss": 2.9277, + "step": 2955 + }, + { + "epoch": 1.49, + "learning_rate": 3.988535624879344e-05, + "loss": 2.3729, + "step": 2960 + }, + { + "epoch": 1.49, + "learning_rate": 3.985366085747808e-05, + "loss": 2.5122, + "step": 2965 + }, + { + "epoch": 1.49, + "learning_rate": 3.982192852126309e-05, + "loss": 2.4413, + "step": 2970 + }, + { + "epoch": 1.49, + "learning_rate": 3.979015931907501e-05, + "loss": 2.6104, + "step": 2975 + }, + { + "epoch": 1.5, + "learning_rate": 3.975835332993207e-05, + "loss": 2.8855, + "step": 2980 + }, + { + "epoch": 1.5, + "learning_rate": 3.9726510632944e-05, + "loss": 2.7487, + "step": 2985 + }, + { + "epoch": 1.5, + "learning_rate": 3.969463130731183e-05, + "loss": 2.7085, + "step": 2990 + }, + { + "epoch": 1.5, + "learning_rate": 3.966271543232769e-05, + "loss": 2.6464, + "step": 2995 + }, + { + "epoch": 1.51, + "learning_rate": 3.9630763087374625e-05, + "loss": 2.4105, + "step": 3000 + }, + { + "epoch": 1.51, + 
"learning_rate": 3.9598774351926394e-05, + "loss": 2.7296, + "step": 3005 + }, + { + "epoch": 1.51, + "learning_rate": 3.956674930554725e-05, + "loss": 2.5319, + "step": 3010 + }, + { + "epoch": 1.51, + "learning_rate": 3.9534688027891785e-05, + "loss": 2.5544, + "step": 3015 + }, + { + "epoch": 1.52, + "learning_rate": 3.9502590598704696e-05, + "loss": 2.7981, + "step": 3020 + }, + { + "epoch": 1.52, + "learning_rate": 3.94704570978206e-05, + "loss": 2.6186, + "step": 3025 + }, + { + "epoch": 1.52, + "learning_rate": 3.943828760516382e-05, + "loss": 2.6443, + "step": 3030 + }, + { + "epoch": 1.52, + "learning_rate": 3.940608220074822e-05, + "loss": 2.4564, + "step": 3035 + }, + { + "epoch": 1.53, + "learning_rate": 3.9373840964676976e-05, + "loss": 2.8394, + "step": 3040 + }, + { + "epoch": 1.53, + "learning_rate": 3.934156397714238e-05, + "loss": 2.3683, + "step": 3045 + }, + { + "epoch": 1.53, + "learning_rate": 3.930925131842567e-05, + "loss": 2.875, + "step": 3050 + }, + { + "epoch": 1.53, + "learning_rate": 3.9276903068896784e-05, + "loss": 2.6381, + "step": 3055 + }, + { + "epoch": 1.54, + "learning_rate": 3.9244519309014206e-05, + "loss": 2.8077, + "step": 3060 + }, + { + "epoch": 1.54, + "learning_rate": 3.9212100119324706e-05, + "loss": 2.6208, + "step": 3065 + }, + { + "epoch": 1.54, + "learning_rate": 3.917964558046322e-05, + "loss": 2.9, + "step": 3070 + }, + { + "epoch": 1.54, + "learning_rate": 3.9147155773152586e-05, + "loss": 2.5678, + "step": 3075 + }, + { + "epoch": 1.55, + "learning_rate": 3.911463077820336e-05, + "loss": 2.7059, + "step": 3080 + }, + { + "epoch": 1.55, + "learning_rate": 3.90820706765136e-05, + "loss": 2.7433, + "step": 3085 + }, + { + "epoch": 1.55, + "learning_rate": 3.9049475549068757e-05, + "loss": 2.5916, + "step": 3090 + }, + { + "epoch": 1.55, + "learning_rate": 3.901684547694132e-05, + "loss": 2.6978, + "step": 3095 + }, + { + "epoch": 1.56, + "learning_rate": 3.8984180541290724e-05, + "loss": 2.8709, + "step": 3100 + 
}, + { + "epoch": 1.56, + "learning_rate": 3.895148082336313e-05, + "loss": 2.8652, + "step": 3105 + }, + { + "epoch": 1.56, + "learning_rate": 3.89187464044912e-05, + "loss": 3.0331, + "step": 3110 + }, + { + "epoch": 1.56, + "learning_rate": 3.8885977366093905e-05, + "loss": 2.7334, + "step": 3115 + }, + { + "epoch": 1.57, + "learning_rate": 3.885317378967633e-05, + "loss": 2.6026, + "step": 3120 + }, + { + "epoch": 1.57, + "learning_rate": 3.882033575682945e-05, + "loss": 2.831, + "step": 3125 + }, + { + "epoch": 1.57, + "learning_rate": 3.878746334922996e-05, + "loss": 2.5585, + "step": 3130 + }, + { + "epoch": 1.57, + "learning_rate": 3.875455664864005e-05, + "loss": 2.762, + "step": 3135 + }, + { + "epoch": 1.58, + "learning_rate": 3.8721615736907205e-05, + "loss": 2.769, + "step": 3140 + }, + { + "epoch": 1.58, + "learning_rate": 3.868864069596399e-05, + "loss": 2.6496, + "step": 3145 + }, + { + "epoch": 1.58, + "learning_rate": 3.8655631607827876e-05, + "loss": 2.486, + "step": 3150 + }, + { + "epoch": 1.58, + "learning_rate": 3.8622588554601e-05, + "loss": 2.8387, + "step": 3155 + }, + { + "epoch": 1.59, + "learning_rate": 3.858951161847001e-05, + "loss": 2.816, + "step": 3160 + }, + { + "epoch": 1.59, + "learning_rate": 3.855640088170579e-05, + "loss": 2.8128, + "step": 3165 + }, + { + "epoch": 1.59, + "learning_rate": 3.8523256426663313e-05, + "loss": 2.5875, + "step": 3170 + }, + { + "epoch": 1.59, + "learning_rate": 3.8490078335781423e-05, + "loss": 2.5853, + "step": 3175 + }, + { + "epoch": 1.6, + "learning_rate": 3.845686669158263e-05, + "loss": 2.7638, + "step": 3180 + }, + { + "epoch": 1.6, + "learning_rate": 3.842362157667287e-05, + "loss": 2.8281, + "step": 3185 + }, + { + "epoch": 1.6, + "learning_rate": 3.839700144139754e-05, + "loss": 2.3574, + "step": 3190 + }, + { + "epoch": 1.6, + "learning_rate": 3.836369628764067e-05, + "loss": 2.5533, + "step": 3195 + }, + { + "epoch": 1.61, + "learning_rate": 3.833035789491177e-05, + "loss": 2.4513, + 
"step": 3200 + }, + { + "epoch": 1.61, + "learning_rate": 3.8296986346132036e-05, + "loss": 2.567, + "step": 3205 + }, + { + "epoch": 1.61, + "learning_rate": 3.826358172430516e-05, + "loss": 2.6501, + "step": 3210 + }, + { + "epoch": 1.61, + "learning_rate": 3.823014411251708e-05, + "loss": 2.6927, + "step": 3215 + }, + { + "epoch": 1.62, + "learning_rate": 3.819667359393578e-05, + "loss": 2.6094, + "step": 3220 + }, + { + "epoch": 1.62, + "learning_rate": 3.816317025181111e-05, + "loss": 2.6776, + "step": 3225 + }, + { + "epoch": 1.62, + "learning_rate": 3.8129634169474563e-05, + "loss": 2.5833, + "step": 3230 + }, + { + "epoch": 1.62, + "learning_rate": 3.809606543033905e-05, + "loss": 2.6483, + "step": 3235 + }, + { + "epoch": 1.63, + "learning_rate": 3.8062464117898724e-05, + "loss": 2.4814, + "step": 3240 + }, + { + "epoch": 1.63, + "learning_rate": 3.802883031572874e-05, + "loss": 2.5217, + "step": 3245 + }, + { + "epoch": 1.63, + "learning_rate": 3.799516410748506e-05, + "loss": 2.5127, + "step": 3250 + }, + { + "epoch": 1.63, + "learning_rate": 3.796146557690428e-05, + "loss": 2.9325, + "step": 3255 + }, + { + "epoch": 1.64, + "learning_rate": 3.792773480780335e-05, + "loss": 2.4567, + "step": 3260 + }, + { + "epoch": 1.64, + "learning_rate": 3.789397188407944e-05, + "loss": 2.6045, + "step": 3265 + }, + { + "epoch": 1.64, + "learning_rate": 3.786017688970967e-05, + "loss": 2.5031, + "step": 3270 + }, + { + "epoch": 1.64, + "learning_rate": 3.782634990875094e-05, + "loss": 2.7692, + "step": 3275 + }, + { + "epoch": 1.65, + "learning_rate": 3.779249102533972e-05, + "loss": 2.6893, + "step": 3280 + }, + { + "epoch": 1.65, + "learning_rate": 3.7758600323691806e-05, + "loss": 2.604, + "step": 3285 + }, + { + "epoch": 1.65, + "learning_rate": 3.7724677888102145e-05, + "loss": 2.6633, + "step": 3290 + }, + { + "epoch": 1.65, + "learning_rate": 3.769072380294463e-05, + "loss": 2.4804, + "step": 3295 + }, + { + "epoch": 1.66, + "learning_rate": 
3.765673815267184e-05, + "loss": 2.5951, + "step": 3300 + }, + { + "epoch": 1.66, + "learning_rate": 3.76227210218149e-05, + "loss": 2.598, + "step": 3305 + }, + { + "epoch": 1.66, + "learning_rate": 3.758867249498321e-05, + "loss": 2.8652, + "step": 3310 + }, + { + "epoch": 1.66, + "learning_rate": 3.7554592656864285e-05, + "loss": 2.5312, + "step": 3315 + }, + { + "epoch": 1.67, + "learning_rate": 3.752048159222349e-05, + "loss": 2.7432, + "step": 3320 + }, + { + "epoch": 1.67, + "learning_rate": 3.748633938590388e-05, + "loss": 2.64, + "step": 3325 + }, + { + "epoch": 1.67, + "learning_rate": 3.745216612282596e-05, + "loss": 2.751, + "step": 3330 + }, + { + "epoch": 1.67, + "learning_rate": 3.741796188798747e-05, + "loss": 2.7636, + "step": 3335 + }, + { + "epoch": 1.68, + "learning_rate": 3.738372676646321e-05, + "loss": 2.8103, + "step": 3340 + }, + { + "epoch": 1.68, + "learning_rate": 3.7349460843404796e-05, + "loss": 2.6672, + "step": 3345 + }, + { + "epoch": 1.68, + "learning_rate": 3.731516420404043e-05, + "loss": 2.5272, + "step": 3350 + }, + { + "epoch": 1.68, + "learning_rate": 3.728083693367474e-05, + "loss": 2.6674, + "step": 3355 + }, + { + "epoch": 1.69, + "learning_rate": 3.724647911768854e-05, + "loss": 2.708, + "step": 3360 + }, + { + "epoch": 1.69, + "learning_rate": 3.72120908415386e-05, + "loss": 2.3902, + "step": 3365 + }, + { + "epoch": 1.69, + "learning_rate": 3.7177672190757476e-05, + "loss": 2.8144, + "step": 3370 + }, + { + "epoch": 1.69, + "learning_rate": 3.714322325095325e-05, + "loss": 2.8525, + "step": 3375 + }, + { + "epoch": 1.7, + "learning_rate": 3.7108744107809364e-05, + "loss": 2.4671, + "step": 3380 + }, + { + "epoch": 1.7, + "learning_rate": 3.7074234847084363e-05, + "loss": 2.3723, + "step": 3385 + }, + { + "epoch": 1.7, + "learning_rate": 3.703969555461173e-05, + "loss": 2.8825, + "step": 3390 + }, + { + "epoch": 1.7, + "learning_rate": 3.700512631629961e-05, + "loss": 2.768, + "step": 3395 + }, + { + "epoch": 1.71, + 
"learning_rate": 3.6970527218130644e-05, + "loss": 2.4601, + "step": 3400 + }, + { + "epoch": 1.71, + "learning_rate": 3.693589834616176e-05, + "loss": 2.7482, + "step": 3405 + }, + { + "epoch": 1.71, + "learning_rate": 3.6901239786523914e-05, + "loss": 2.4536, + "step": 3410 + }, + { + "epoch": 1.71, + "learning_rate": 3.686655162542192e-05, + "loss": 2.571, + "step": 3415 + }, + { + "epoch": 1.72, + "learning_rate": 3.683183394913422e-05, + "loss": 2.5796, + "step": 3420 + }, + { + "epoch": 1.72, + "learning_rate": 3.6797086844012654e-05, + "loss": 2.7312, + "step": 3425 + }, + { + "epoch": 1.72, + "learning_rate": 3.676231039648227e-05, + "loss": 2.5584, + "step": 3430 + }, + { + "epoch": 1.72, + "learning_rate": 3.67275046930411e-05, + "loss": 2.6298, + "step": 3435 + }, + { + "epoch": 1.73, + "learning_rate": 3.669266982025993e-05, + "loss": 2.628, + "step": 3440 + }, + { + "epoch": 1.73, + "learning_rate": 3.6657805864782116e-05, + "loss": 2.5943, + "step": 3445 + }, + { + "epoch": 1.73, + "learning_rate": 3.662291291332333e-05, + "loss": 2.6121, + "step": 3450 + }, + { + "epoch": 1.73, + "learning_rate": 3.658799105267138e-05, + "loss": 2.6501, + "step": 3455 + }, + { + "epoch": 1.74, + "learning_rate": 3.655304036968597e-05, + "loss": 2.5708, + "step": 3460 + }, + { + "epoch": 1.74, + "learning_rate": 3.651806095129849e-05, + "loss": 2.8039, + "step": 3465 + }, + { + "epoch": 1.74, + "learning_rate": 3.648305288451183e-05, + "loss": 2.8159, + "step": 3470 + }, + { + "epoch": 1.74, + "learning_rate": 3.64480162564001e-05, + "loss": 2.6506, + "step": 3475 + }, + { + "epoch": 1.75, + "learning_rate": 3.641295115410847e-05, + "loss": 2.8982, + "step": 3480 + }, + { + "epoch": 1.75, + "learning_rate": 3.637785766485291e-05, + "loss": 2.8018, + "step": 3485 + }, + { + "epoch": 1.75, + "learning_rate": 3.634273587592003e-05, + "loss": 2.5278, + "step": 3490 + }, + { + "epoch": 1.75, + "learning_rate": 3.630758587466681e-05, + "loss": 2.8937, + "step": 3495 + }, + 
{ + "epoch": 1.76, + "learning_rate": 3.6272407748520394e-05, + "loss": 2.1922, + "step": 3500 + }, + { + "epoch": 1.76, + "learning_rate": 3.623720158497789e-05, + "loss": 2.4288, + "step": 3505 + }, + { + "epoch": 1.76, + "learning_rate": 3.620196747160613e-05, + "loss": 2.9423, + "step": 3510 + }, + { + "epoch": 1.76, + "learning_rate": 3.61667054960415e-05, + "loss": 2.5559, + "step": 3515 + }, + { + "epoch": 1.77, + "learning_rate": 3.613141574598965e-05, + "loss": 2.7228, + "step": 3520 + }, + { + "epoch": 1.77, + "learning_rate": 3.6096098309225325e-05, + "loss": 2.6933, + "step": 3525 + }, + { + "epoch": 1.77, + "learning_rate": 3.606075327359212e-05, + "loss": 2.5865, + "step": 3530 + }, + { + "epoch": 1.77, + "learning_rate": 3.602538072700229e-05, + "loss": 2.5846, + "step": 3535 + }, + { + "epoch": 1.78, + "learning_rate": 3.598998075743653e-05, + "loss": 2.8847, + "step": 3540 + }, + { + "epoch": 1.78, + "learning_rate": 3.595455345294372e-05, + "loss": 2.5386, + "step": 3545 + }, + { + "epoch": 1.78, + "learning_rate": 3.5919098901640735e-05, + "loss": 2.5932, + "step": 3550 + }, + { + "epoch": 1.78, + "learning_rate": 3.588361719171223e-05, + "loss": 2.4481, + "step": 3555 + }, + { + "epoch": 1.79, + "learning_rate": 3.584810841141039e-05, + "loss": 2.5773, + "step": 3560 + }, + { + "epoch": 1.79, + "learning_rate": 3.581257264905476e-05, + "loss": 2.6305, + "step": 3565 + }, + { + "epoch": 1.79, + "learning_rate": 3.5777009993031955e-05, + "loss": 2.7, + "step": 3570 + }, + { + "epoch": 1.79, + "learning_rate": 3.574142053179553e-05, + "loss": 2.4408, + "step": 3575 + }, + { + "epoch": 1.8, + "learning_rate": 3.5705804353865677e-05, + "loss": 2.7292, + "step": 3580 + }, + { + "epoch": 1.8, + "learning_rate": 3.567016154782906e-05, + "loss": 2.4371, + "step": 3585 + }, + { + "epoch": 1.8, + "learning_rate": 3.563449220233855e-05, + "loss": 2.7977, + "step": 3590 + }, + { + "epoch": 1.8, + "learning_rate": 3.559879640611305e-05, + "loss": 2.7357, + 
"step": 3595 + }, + { + "epoch": 1.81, + "learning_rate": 3.5563074247937244e-05, + "loss": 2.6255, + "step": 3600 + }, + { + "epoch": 1.81, + "learning_rate": 3.552732581666139e-05, + "loss": 2.8134, + "step": 3605 + }, + { + "epoch": 1.81, + "learning_rate": 3.549155120120109e-05, + "loss": 2.702, + "step": 3610 + }, + { + "epoch": 1.81, + "learning_rate": 3.545575049053707e-05, + "loss": 2.4854, + "step": 3615 + }, + { + "epoch": 1.82, + "learning_rate": 3.541992377371497e-05, + "loss": 2.7596, + "step": 3620 + }, + { + "epoch": 1.82, + "learning_rate": 3.53840711398451e-05, + "loss": 2.6891, + "step": 3625 + }, + { + "epoch": 1.82, + "learning_rate": 3.5348192678102254e-05, + "loss": 2.5619, + "step": 3630 + }, + { + "epoch": 1.82, + "learning_rate": 3.531228847772544e-05, + "loss": 2.5719, + "step": 3635 + }, + { + "epoch": 1.83, + "learning_rate": 3.52763586280177e-05, + "loss": 2.6297, + "step": 3640 + }, + { + "epoch": 1.83, + "learning_rate": 3.524040321834589e-05, + "loss": 2.5114, + "step": 3645 + }, + { + "epoch": 1.83, + "learning_rate": 3.52044223381404e-05, + "loss": 2.8536, + "step": 3650 + }, + { + "epoch": 1.83, + "learning_rate": 3.516841607689501e-05, + "loss": 2.5271, + "step": 3655 + }, + { + "epoch": 1.84, + "learning_rate": 3.51323845241666e-05, + "loss": 2.5654, + "step": 3660 + }, + { + "epoch": 1.84, + "learning_rate": 3.509632776957497e-05, + "loss": 2.6269, + "step": 3665 + }, + { + "epoch": 1.84, + "learning_rate": 3.50602459028026e-05, + "loss": 2.4401, + "step": 3670 + }, + { + "epoch": 1.84, + "learning_rate": 3.502413901359445e-05, + "loss": 2.4368, + "step": 3675 + }, + { + "epoch": 1.85, + "learning_rate": 3.498800719175768e-05, + "loss": 2.6087, + "step": 3680 + }, + { + "epoch": 1.85, + "learning_rate": 3.4951850527161495e-05, + "loss": 2.5927, + "step": 3685 + }, + { + "epoch": 1.85, + "learning_rate": 3.4915669109736876e-05, + "loss": 2.8352, + "step": 3690 + }, + { + "epoch": 1.85, + "learning_rate": 3.487946302947637e-05, + 
"loss": 2.8485, + "step": 3695 + }, + { + "epoch": 1.86, + "learning_rate": 3.484323237643389e-05, + "loss": 2.6611, + "step": 3700 + }, + { + "epoch": 1.86, + "learning_rate": 3.4806977240724423e-05, + "loss": 2.5251, + "step": 3705 + }, + { + "epoch": 1.86, + "learning_rate": 3.47706977125239e-05, + "loss": 2.6043, + "step": 3710 + }, + { + "epoch": 1.86, + "learning_rate": 3.473439388206887e-05, + "loss": 2.5911, + "step": 3715 + }, + { + "epoch": 1.87, + "learning_rate": 3.469806583965639e-05, + "loss": 2.7092, + "step": 3720 + }, + { + "epoch": 1.87, + "learning_rate": 3.466171367564368e-05, + "loss": 2.4572, + "step": 3725 + }, + { + "epoch": 1.87, + "learning_rate": 3.462533748044801e-05, + "loss": 2.464, + "step": 3730 + }, + { + "epoch": 1.87, + "learning_rate": 3.458893734454636e-05, + "loss": 2.8654, + "step": 3735 + }, + { + "epoch": 1.88, + "learning_rate": 3.455251335847531e-05, + "loss": 2.752, + "step": 3740 + }, + { + "epoch": 1.88, + "learning_rate": 3.451606561283074e-05, + "loss": 2.59, + "step": 3745 + }, + { + "epoch": 1.88, + "learning_rate": 3.447959419826763e-05, + "loss": 2.4732, + "step": 3750 + }, + { + "epoch": 1.88, + "learning_rate": 3.444309920549983e-05, + "loss": 2.7611, + "step": 3755 + }, + { + "epoch": 1.89, + "learning_rate": 3.440658072529983e-05, + "loss": 2.5923, + "step": 3760 + }, + { + "epoch": 1.89, + "learning_rate": 3.437003884849854e-05, + "loss": 2.5405, + "step": 3765 + }, + { + "epoch": 1.89, + "learning_rate": 3.433347366598506e-05, + "loss": 2.6368, + "step": 3770 + }, + { + "epoch": 1.89, + "learning_rate": 3.429688526870649e-05, + "loss": 2.5877, + "step": 3775 + }, + { + "epoch": 1.9, + "learning_rate": 3.4260273747667635e-05, + "loss": 2.5978, + "step": 3780 + }, + { + "epoch": 1.9, + "learning_rate": 3.422363919393082e-05, + "loss": 2.7375, + "step": 3785 + }, + { + "epoch": 1.9, + "learning_rate": 3.418698169861567e-05, + "loss": 2.4161, + "step": 3790 + }, + { + "epoch": 1.9, + "learning_rate": 
3.415030135289884e-05, + "loss": 2.3959, + "step": 3795 + }, + { + "epoch": 1.91, + "learning_rate": 3.4113598248013875e-05, + "loss": 2.9642, + "step": 3800 + }, + { + "epoch": 1.91, + "learning_rate": 3.40768724752509e-05, + "loss": 2.7313, + "step": 3805 + }, + { + "epoch": 1.91, + "learning_rate": 3.404012412595639e-05, + "loss": 2.6311, + "step": 3810 + }, + { + "epoch": 1.91, + "learning_rate": 3.400335329153304e-05, + "loss": 2.6268, + "step": 3815 + }, + { + "epoch": 1.92, + "learning_rate": 3.396656006343939e-05, + "loss": 2.5252, + "step": 3820 + }, + { + "epoch": 1.92, + "learning_rate": 3.392974453318975e-05, + "loss": 2.5863, + "step": 3825 + }, + { + "epoch": 1.92, + "learning_rate": 3.3892906792353886e-05, + "loss": 2.4848, + "step": 3830 + }, + { + "epoch": 1.92, + "learning_rate": 3.3856046932556775e-05, + "loss": 2.5517, + "step": 3835 + }, + { + "epoch": 1.93, + "learning_rate": 3.3819165045478426e-05, + "loss": 2.6679, + "step": 3840 + }, + { + "epoch": 1.93, + "learning_rate": 3.3782261222853654e-05, + "loss": 2.4474, + "step": 3845 + }, + { + "epoch": 1.93, + "learning_rate": 3.37453355564718e-05, + "loss": 2.5378, + "step": 3850 + }, + { + "epoch": 1.93, + "learning_rate": 3.3708388138176584e-05, + "loss": 2.8168, + "step": 3855 + }, + { + "epoch": 1.94, + "learning_rate": 3.367141905986578e-05, + "loss": 2.6556, + "step": 3860 + }, + { + "epoch": 1.94, + "learning_rate": 3.3634428413491054e-05, + "loss": 2.5122, + "step": 3865 + }, + { + "epoch": 1.94, + "learning_rate": 3.359741629105773e-05, + "loss": 2.7044, + "step": 3870 + }, + { + "epoch": 1.94, + "learning_rate": 3.3560382784624524e-05, + "loss": 2.6193, + "step": 3875 + }, + { + "epoch": 1.95, + "learning_rate": 3.352332798630336e-05, + "loss": 3.0051, + "step": 3880 + }, + { + "epoch": 1.95, + "learning_rate": 3.3486251988259134e-05, + "loss": 2.5351, + "step": 3885 + }, + { + "epoch": 1.95, + "learning_rate": 3.344915488270942e-05, + "loss": 2.7273, + "step": 3890 + }, + { + 
"epoch": 1.95, + "learning_rate": 3.341203676192433e-05, + "loss": 2.3833, + "step": 3895 + }, + { + "epoch": 1.96, + "learning_rate": 3.3374897718226236e-05, + "loss": 2.5073, + "step": 3900 + }, + { + "epoch": 1.96, + "learning_rate": 3.333773784398957e-05, + "loss": 2.7216, + "step": 3905 + }, + { + "epoch": 1.96, + "learning_rate": 3.330055723164055e-05, + "loss": 2.6666, + "step": 3910 + }, + { + "epoch": 1.96, + "learning_rate": 3.326335597365698e-05, + "loss": 2.4959, + "step": 3915 + }, + { + "epoch": 1.97, + "learning_rate": 3.322613416256802e-05, + "loss": 2.8799, + "step": 3920 + }, + { + "epoch": 1.97, + "learning_rate": 3.3188891890953956e-05, + "loss": 2.6041, + "step": 3925 + }, + { + "epoch": 1.97, + "learning_rate": 3.315162925144595e-05, + "loss": 2.7423, + "step": 3930 + }, + { + "epoch": 1.97, + "learning_rate": 3.3114346336725834e-05, + "loss": 2.5765, + "step": 3935 + }, + { + "epoch": 1.98, + "learning_rate": 3.3077043239525874e-05, + "loss": 2.152, + "step": 3940 + }, + { + "epoch": 1.98, + "learning_rate": 3.303972005262852e-05, + "loss": 2.6474, + "step": 3945 + }, + { + "epoch": 1.98, + "learning_rate": 3.3002376868866206e-05, + "loss": 2.473, + "step": 3950 + }, + { + "epoch": 1.98, + "learning_rate": 3.296501378112109e-05, + "loss": 2.6779, + "step": 3955 + }, + { + "epoch": 1.99, + "learning_rate": 3.292763088232485e-05, + "loss": 2.7439, + "step": 3960 + }, + { + "epoch": 1.99, + "learning_rate": 3.289022826545844e-05, + "loss": 2.8874, + "step": 3965 + }, + { + "epoch": 1.99, + "learning_rate": 3.2852806023551834e-05, + "loss": 2.7191, + "step": 3970 + }, + { + "epoch": 1.99, + "learning_rate": 3.281536424968383e-05, + "loss": 2.563, + "step": 3975 + }, + { + "epoch": 2.0, + "learning_rate": 3.2777903036981836e-05, + "loss": 2.7884, + "step": 3980 + }, + { + "epoch": 2.0, + "learning_rate": 3.274042247862158e-05, + "loss": 2.5221, + "step": 3985 + }, + { + "epoch": 2.0, + "learning_rate": 3.270292266782689e-05, + "loss": 2.462, + 
"step": 3990 + }, + { + "epoch": 2.0, + "learning_rate": 3.266540369786953e-05, + "loss": 2.5059, + "step": 3995 + }, + { + "epoch": 2.01, + "learning_rate": 3.262786566206888e-05, + "loss": 2.688, + "step": 4000 + }, + { + "epoch": 2.01, + "learning_rate": 3.259030865379174e-05, + "loss": 2.7194, + "step": 4005 + }, + { + "epoch": 2.01, + "learning_rate": 3.255273276645214e-05, + "loss": 2.4145, + "step": 4010 + }, + { + "epoch": 2.01, + "learning_rate": 3.251513809351101e-05, + "loss": 2.5469, + "step": 4015 + }, + { + "epoch": 2.02, + "learning_rate": 3.247752472847605e-05, + "loss": 2.7916, + "step": 4020 + }, + { + "epoch": 2.02, + "learning_rate": 3.243989276490143e-05, + "loss": 2.5973, + "step": 4025 + }, + { + "epoch": 2.02, + "learning_rate": 3.240224229638758e-05, + "loss": 2.6732, + "step": 4030 + }, + { + "epoch": 2.02, + "learning_rate": 3.236457341658097e-05, + "loss": 2.2159, + "step": 4035 + }, + { + "epoch": 2.03, + "learning_rate": 3.232688621917386e-05, + "loss": 2.6783, + "step": 4040 + }, + { + "epoch": 2.03, + "learning_rate": 3.2289180797904055e-05, + "loss": 2.8193, + "step": 4045 + }, + { + "epoch": 2.03, + "learning_rate": 3.22514572465547e-05, + "loss": 2.3885, + "step": 4050 + }, + { + "epoch": 2.03, + "learning_rate": 3.221371565895404e-05, + "loss": 2.6288, + "step": 4055 + }, + { + "epoch": 2.04, + "learning_rate": 3.217595612897516e-05, + "loss": 2.5372, + "step": 4060 + }, + { + "epoch": 2.04, + "learning_rate": 3.21381787505358e-05, + "loss": 2.7257, + "step": 4065 + }, + { + "epoch": 2.04, + "learning_rate": 3.210038361759807e-05, + "loss": 2.6329, + "step": 4070 + }, + { + "epoch": 2.04, + "learning_rate": 3.206257082416825e-05, + "loss": 2.6518, + "step": 4075 + }, + { + "epoch": 2.05, + "learning_rate": 3.2024740464296544e-05, + "loss": 2.4218, + "step": 4080 + }, + { + "epoch": 2.05, + "learning_rate": 3.198689263207686e-05, + "loss": 2.7266, + "step": 4085 + }, + { + "epoch": 2.05, + "learning_rate": 3.1949027421646546e-05, 
+ "loss": 2.4466, + "step": 4090 + }, + { + "epoch": 2.05, + "learning_rate": 3.1911144927186185e-05, + "loss": 2.6014, + "step": 4095 + }, + { + "epoch": 2.06, + "learning_rate": 3.1873245242919354e-05, + "loss": 2.7072, + "step": 4100 + }, + { + "epoch": 2.06, + "learning_rate": 3.183532846311236e-05, + "loss": 2.5211, + "step": 4105 + }, + { + "epoch": 2.06, + "learning_rate": 3.179739468207406e-05, + "loss": 2.5787, + "step": 4110 + }, + { + "epoch": 2.06, + "learning_rate": 3.175944399415559e-05, + "loss": 2.5141, + "step": 4115 + }, + { + "epoch": 2.07, + "learning_rate": 3.1721476493750134e-05, + "loss": 2.6807, + "step": 4120 + }, + { + "epoch": 2.07, + "learning_rate": 3.1683492275292695e-05, + "loss": 2.5611, + "step": 4125 + }, + { + "epoch": 2.07, + "learning_rate": 3.164549143325985e-05, + "loss": 2.5864, + "step": 4130 + }, + { + "epoch": 2.08, + "learning_rate": 3.160747406216953e-05, + "loss": 2.7378, + "step": 4135 + }, + { + "epoch": 2.08, + "learning_rate": 3.156944025658079e-05, + "loss": 2.7334, + "step": 4140 + }, + { + "epoch": 2.08, + "learning_rate": 3.153139011109354e-05, + "loss": 2.4465, + "step": 4145 + }, + { + "epoch": 2.08, + "learning_rate": 3.149332372034834e-05, + "loss": 2.5482, + "step": 4150 + }, + { + "epoch": 2.09, + "learning_rate": 3.145524117902617e-05, + "loss": 2.7633, + "step": 4155 + }, + { + "epoch": 2.09, + "learning_rate": 3.141714258184816e-05, + "loss": 2.6796, + "step": 4160 + }, + { + "epoch": 2.09, + "learning_rate": 3.137902802357538e-05, + "loss": 2.6622, + "step": 4165 + }, + { + "epoch": 2.09, + "learning_rate": 3.134089759900861e-05, + "loss": 2.9277, + "step": 4170 + }, + { + "epoch": 2.1, + "learning_rate": 3.130275140298808e-05, + "loss": 2.351, + "step": 4175 + }, + { + "epoch": 2.1, + "learning_rate": 3.1264589530393263e-05, + "loss": 2.654, + "step": 4180 + }, + { + "epoch": 2.1, + "learning_rate": 3.12264120761426e-05, + "loss": 2.6229, + "step": 4185 + }, + { + "epoch": 2.1, + "learning_rate": 
3.118821913519333e-05, + "loss": 2.6415, + "step": 4190 + }, + { + "epoch": 2.11, + "learning_rate": 3.115001080254115e-05, + "loss": 2.6029, + "step": 4195 + }, + { + "epoch": 2.11, + "learning_rate": 3.1111787173220095e-05, + "loss": 2.6775, + "step": 4200 + }, + { + "epoch": 2.11, + "learning_rate": 3.107354834230223e-05, + "loss": 2.7393, + "step": 4205 + }, + { + "epoch": 2.11, + "learning_rate": 3.1035294404897396e-05, + "loss": 2.5886, + "step": 4210 + }, + { + "epoch": 2.12, + "learning_rate": 3.099702545615307e-05, + "loss": 2.3812, + "step": 4215 + }, + { + "epoch": 2.12, + "learning_rate": 3.0958741591254026e-05, + "loss": 3.1271, + "step": 4220 + }, + { + "epoch": 2.12, + "learning_rate": 3.0920442905422145e-05, + "loss": 2.7758, + "step": 4225 + }, + { + "epoch": 2.12, + "learning_rate": 3.0882129493916167e-05, + "loss": 2.7682, + "step": 4230 + }, + { + "epoch": 2.13, + "learning_rate": 3.0843801452031466e-05, + "loss": 2.2826, + "step": 4235 + }, + { + "epoch": 2.13, + "learning_rate": 3.0805458875099804e-05, + "loss": 2.5934, + "step": 4240 + }, + { + "epoch": 2.13, + "learning_rate": 3.0767101858489103e-05, + "loss": 2.7693, + "step": 4245 + }, + { + "epoch": 2.13, + "learning_rate": 3.072873049760319e-05, + "loss": 2.6955, + "step": 4250 + }, + { + "epoch": 2.14, + "learning_rate": 3.0690344887881565e-05, + "loss": 2.7344, + "step": 4255 + }, + { + "epoch": 2.14, + "learning_rate": 3.0651945124799185e-05, + "loss": 2.6215, + "step": 4260 + }, + { + "epoch": 2.14, + "learning_rate": 3.061353130386619e-05, + "loss": 2.5794, + "step": 4265 + }, + { + "epoch": 2.14, + "learning_rate": 3.05751035206277e-05, + "loss": 2.671, + "step": 4270 + }, + { + "epoch": 2.15, + "learning_rate": 3.0536661870663576e-05, + "loss": 2.7265, + "step": 4275 + }, + { + "epoch": 2.15, + "learning_rate": 3.0498206449588136e-05, + "loss": 2.9018, + "step": 4280 + }, + { + "epoch": 2.15, + "learning_rate": 3.045973735304996e-05, + "loss": 2.4823, + "step": 4285 + }, + { + 
"epoch": 2.15, + "learning_rate": 3.0421254676731664e-05, + "loss": 2.5454, + "step": 4290 + }, + { + "epoch": 2.16, + "learning_rate": 3.0382758516349623e-05, + "loss": 2.7888, + "step": 4295 + }, + { + "epoch": 2.16, + "learning_rate": 3.0344248967653754e-05, + "loss": 2.596, + "step": 4300 + }, + { + "epoch": 2.16, + "learning_rate": 3.030572612642727e-05, + "loss": 2.7905, + "step": 4305 + }, + { + "epoch": 2.16, + "learning_rate": 3.0267190088486452e-05, + "loss": 2.5676, + "step": 4310 + }, + { + "epoch": 2.17, + "learning_rate": 3.0228640949680388e-05, + "loss": 2.5836, + "step": 4315 + }, + { + "epoch": 2.17, + "learning_rate": 3.0190078805890786e-05, + "loss": 2.7138, + "step": 4320 + }, + { + "epoch": 2.17, + "learning_rate": 3.015150375303168e-05, + "loss": 2.4747, + "step": 4325 + }, + { + "epoch": 2.17, + "learning_rate": 3.011291588704919e-05, + "loss": 2.5188, + "step": 4330 + }, + { + "epoch": 2.18, + "learning_rate": 3.0074315303921353e-05, + "loss": 2.2952, + "step": 4335 + }, + { + "epoch": 2.18, + "learning_rate": 3.0035702099657787e-05, + "loss": 2.4875, + "step": 4340 + }, + { + "epoch": 2.18, + "learning_rate": 2.9997076370299544e-05, + "loss": 2.5085, + "step": 4345 + }, + { + "epoch": 2.18, + "learning_rate": 2.9958438211918805e-05, + "loss": 2.7452, + "step": 4350 + }, + { + "epoch": 2.19, + "learning_rate": 2.9919787720618676e-05, + "loss": 2.334, + "step": 4355 + }, + { + "epoch": 2.19, + "learning_rate": 2.9881124992532933e-05, + "loss": 2.6904, + "step": 4360 + }, + { + "epoch": 2.19, + "learning_rate": 2.984245012382579e-05, + "loss": 2.5706, + "step": 4365 + }, + { + "epoch": 2.19, + "learning_rate": 2.980376321069165e-05, + "loss": 2.6657, + "step": 4370 + }, + { + "epoch": 2.2, + "learning_rate": 2.976506434935489e-05, + "loss": 2.4117, + "step": 4375 + }, + { + "epoch": 2.2, + "learning_rate": 2.9726353636069582e-05, + "loss": 2.4446, + "step": 4380 + }, + { + "epoch": 2.2, + "learning_rate": 2.968763116711931e-05, + "loss": 
2.5681, + "step": 4385 + }, + { + "epoch": 2.2, + "learning_rate": 2.964889703881686e-05, + "loss": 2.7244, + "step": 4390 + }, + { + "epoch": 2.21, + "learning_rate": 2.9610151347504044e-05, + "loss": 2.577, + "step": 4395 + }, + { + "epoch": 2.21, + "learning_rate": 2.957139418955143e-05, + "loss": 2.6176, + "step": 4400 + }, + { + "epoch": 2.21, + "learning_rate": 2.9532625661358105e-05, + "loss": 2.5708, + "step": 4405 + }, + { + "epoch": 2.21, + "learning_rate": 2.949384585935143e-05, + "loss": 2.4873, + "step": 4410 + }, + { + "epoch": 2.22, + "learning_rate": 2.9455054879986797e-05, + "loss": 2.5953, + "step": 4415 + }, + { + "epoch": 2.22, + "learning_rate": 2.941625281974743e-05, + "loss": 2.5578, + "step": 4420 + }, + { + "epoch": 2.22, + "learning_rate": 2.93774397751441e-05, + "loss": 2.5782, + "step": 4425 + }, + { + "epoch": 2.22, + "learning_rate": 2.9338615842714883e-05, + "loss": 2.8227, + "step": 4430 + }, + { + "epoch": 2.23, + "learning_rate": 2.9299781119024956e-05, + "loss": 2.4317, + "step": 4435 + }, + { + "epoch": 2.23, + "learning_rate": 2.926093570066633e-05, + "loss": 2.7054, + "step": 4440 + }, + { + "epoch": 2.23, + "learning_rate": 2.9222079684257608e-05, + "loss": 2.7207, + "step": 4445 + }, + { + "epoch": 2.23, + "learning_rate": 2.9183213166443778e-05, + "loss": 2.6732, + "step": 4450 + }, + { + "epoch": 2.24, + "learning_rate": 2.9144336243895927e-05, + "loss": 2.4768, + "step": 4455 + }, + { + "epoch": 2.24, + "learning_rate": 2.910544901331101e-05, + "loss": 2.5146, + "step": 4460 + }, + { + "epoch": 2.24, + "learning_rate": 2.9066551571411645e-05, + "loss": 2.4941, + "step": 4465 + }, + { + "epoch": 2.24, + "learning_rate": 2.902764401494584e-05, + "loss": 2.1425, + "step": 4470 + }, + { + "epoch": 2.25, + "learning_rate": 2.898872644068676e-05, + "loss": 2.6335, + "step": 4475 + }, + { + "epoch": 2.25, + "learning_rate": 2.8949798945432483e-05, + "loss": 2.4093, + "step": 4480 + }, + { + "epoch": 2.25, + "learning_rate": 
2.8910861626005776e-05, + "loss": 2.4006, + "step": 4485 + }, + { + "epoch": 2.25, + "learning_rate": 2.8871914579253824e-05, + "loss": 2.4762, + "step": 4490 + }, + { + "epoch": 2.26, + "learning_rate": 2.8832957902048014e-05, + "loss": 2.6309, + "step": 4495 + }, + { + "epoch": 2.26, + "learning_rate": 2.8793991691283684e-05, + "loss": 2.7734, + "step": 4500 + }, + { + "epoch": 2.26, + "learning_rate": 2.87550160438799e-05, + "loss": 2.5444, + "step": 4505 + }, + { + "epoch": 2.26, + "learning_rate": 2.871603105677917e-05, + "loss": 2.5277, + "step": 4510 + }, + { + "epoch": 2.27, + "learning_rate": 2.8677036826947264e-05, + "loss": 2.5775, + "step": 4515 + }, + { + "epoch": 2.27, + "learning_rate": 2.863803345137291e-05, + "loss": 2.4524, + "step": 4520 + }, + { + "epoch": 2.27, + "learning_rate": 2.8599021027067608e-05, + "loss": 2.572, + "step": 4525 + }, + { + "epoch": 2.27, + "learning_rate": 2.855999965106536e-05, + "loss": 2.8112, + "step": 4530 + }, + { + "epoch": 2.28, + "learning_rate": 2.8520969420422427e-05, + "loss": 2.65, + "step": 4535 + }, + { + "epoch": 2.28, + "learning_rate": 2.8481930432217096e-05, + "loss": 2.4675, + "step": 4540 + }, + { + "epoch": 2.28, + "learning_rate": 2.844288278354943e-05, + "loss": 2.668, + "step": 4545 + }, + { + "epoch": 2.28, + "learning_rate": 2.8403826571541046e-05, + "loss": 2.7135, + "step": 4550 + }, + { + "epoch": 2.29, + "learning_rate": 2.8364761893334858e-05, + "loss": 2.5212, + "step": 4555 + }, + { + "epoch": 2.29, + "learning_rate": 2.8325688846094817e-05, + "loss": 2.698, + "step": 4560 + }, + { + "epoch": 2.29, + "learning_rate": 2.828660752700572e-05, + "loss": 2.6517, + "step": 4565 + }, + { + "epoch": 2.29, + "learning_rate": 2.8247518033272924e-05, + "loss": 2.6687, + "step": 4570 + }, + { + "epoch": 2.3, + "learning_rate": 2.8208420462122105e-05, + "loss": 2.4934, + "step": 4575 + }, + { + "epoch": 2.3, + "learning_rate": 2.816931491079906e-05, + "loss": 2.3762, + "step": 4580 + }, + { + "epoch": 
2.3, + "learning_rate": 2.8130201476569413e-05, + "loss": 2.6873, + "step": 4585 + }, + { + "epoch": 2.3, + "learning_rate": 2.8091080256718398e-05, + "loss": 2.6981, + "step": 4590 + }, + { + "epoch": 2.31, + "learning_rate": 2.805195134855061e-05, + "loss": 2.6814, + "step": 4595 + }, + { + "epoch": 2.31, + "learning_rate": 2.8012814849389785e-05, + "loss": 2.6641, + "step": 4600 + }, + { + "epoch": 2.31, + "learning_rate": 2.797367085657852e-05, + "loss": 2.488, + "step": 4605 + }, + { + "epoch": 2.31, + "learning_rate": 2.793451946747806e-05, + "loss": 2.4358, + "step": 4610 + }, + { + "epoch": 2.32, + "learning_rate": 2.7895360779468044e-05, + "loss": 2.7458, + "step": 4615 + }, + { + "epoch": 2.32, + "learning_rate": 2.785619488994627e-05, + "loss": 2.7558, + "step": 4620 + }, + { + "epoch": 2.32, + "learning_rate": 2.7817021896328427e-05, + "loss": 2.6906, + "step": 4625 + }, + { + "epoch": 2.32, + "learning_rate": 2.7777841896047914e-05, + "loss": 2.5384, + "step": 4630 + }, + { + "epoch": 2.33, + "learning_rate": 2.7738654986555523e-05, + "loss": 2.8354, + "step": 4635 + }, + { + "epoch": 2.33, + "learning_rate": 2.7699461265319242e-05, + "loss": 2.6352, + "step": 4640 + }, + { + "epoch": 2.33, + "learning_rate": 2.7660260829824003e-05, + "loss": 2.5602, + "step": 4645 + }, + { + "epoch": 2.33, + "learning_rate": 2.7621053777571425e-05, + "loss": 2.7109, + "step": 4650 + }, + { + "epoch": 2.34, + "learning_rate": 2.7581840206079616e-05, + "loss": 2.7317, + "step": 4655 + }, + { + "epoch": 2.34, + "learning_rate": 2.754262021288287e-05, + "loss": 2.6874, + "step": 4660 + }, + { + "epoch": 2.34, + "learning_rate": 2.750339389553146e-05, + "loss": 2.5854, + "step": 4665 + }, + { + "epoch": 2.34, + "learning_rate": 2.7464161351591393e-05, + "loss": 2.4774, + "step": 4670 + }, + { + "epoch": 2.35, + "learning_rate": 2.7424922678644172e-05, + "loss": 2.3301, + "step": 4675 + }, + { + "epoch": 2.35, + "learning_rate": 2.7385677974286517e-05, + "loss": 2.8886, + 
"step": 4680 + }, + { + "epoch": 2.35, + "learning_rate": 2.7346427336130175e-05, + "loss": 2.4401, + "step": 4685 + }, + { + "epoch": 2.35, + "learning_rate": 2.7307170861801644e-05, + "loss": 2.4788, + "step": 4690 + }, + { + "epoch": 2.36, + "learning_rate": 2.7267908648941938e-05, + "loss": 2.4079, + "step": 4695 + }, + { + "epoch": 2.36, + "learning_rate": 2.7228640795206344e-05, + "loss": 2.3763, + "step": 4700 + }, + { + "epoch": 2.36, + "learning_rate": 2.718936739826417e-05, + "loss": 2.4751, + "step": 4705 + }, + { + "epoch": 2.36, + "learning_rate": 2.7150088555798537e-05, + "loss": 2.5827, + "step": 4710 + }, + { + "epoch": 2.37, + "learning_rate": 2.7110804365506083e-05, + "loss": 2.8181, + "step": 4715 + }, + { + "epoch": 2.37, + "learning_rate": 2.7071514925096762e-05, + "loss": 2.5951, + "step": 4720 + }, + { + "epoch": 2.37, + "learning_rate": 2.703222033229359e-05, + "loss": 2.8812, + "step": 4725 + }, + { + "epoch": 2.37, + "learning_rate": 2.6992920684832374e-05, + "loss": 2.8793, + "step": 4730 + }, + { + "epoch": 2.38, + "learning_rate": 2.6953616080461526e-05, + "loss": 2.7234, + "step": 4735 + }, + { + "epoch": 2.38, + "learning_rate": 2.6914306616941764e-05, + "loss": 2.566, + "step": 4740 + }, + { + "epoch": 2.38, + "learning_rate": 2.68749923920459e-05, + "loss": 2.7924, + "step": 4745 + }, + { + "epoch": 2.38, + "learning_rate": 2.683567350355859e-05, + "loss": 2.5467, + "step": 4750 + }, + { + "epoch": 2.39, + "learning_rate": 2.679635004927608e-05, + "loss": 2.8032, + "step": 4755 + }, + { + "epoch": 2.39, + "learning_rate": 2.6757022127006e-05, + "loss": 2.4976, + "step": 4760 + }, + { + "epoch": 2.39, + "learning_rate": 2.6717689834567055e-05, + "loss": 2.7968, + "step": 4765 + }, + { + "epoch": 2.39, + "learning_rate": 2.6678353269788854e-05, + "loss": 2.4099, + "step": 4770 + }, + { + "epoch": 2.4, + "learning_rate": 2.66390125305116e-05, + "loss": 2.4246, + "step": 4775 + }, + { + "epoch": 2.4, + "learning_rate": 
2.659966771458589e-05, + "loss": 2.7296, + "step": 4780 + }, + { + "epoch": 2.4, + "learning_rate": 2.656031891987249e-05, + "loss": 2.5013, + "step": 4785 + }, + { + "epoch": 2.4, + "learning_rate": 2.6520966244242024e-05, + "loss": 2.4029, + "step": 4790 + }, + { + "epoch": 2.41, + "learning_rate": 2.648160978557479e-05, + "loss": 2.405, + "step": 4795 + }, + { + "epoch": 2.41, + "learning_rate": 2.644224964176048e-05, + "loss": 2.5121, + "step": 4800 + }, + { + "epoch": 2.41, + "learning_rate": 2.6402885910697966e-05, + "loss": 2.7865, + "step": 4805 + }, + { + "epoch": 2.41, + "learning_rate": 2.6363518690295035e-05, + "loss": 2.3725, + "step": 4810 + }, + { + "epoch": 2.42, + "learning_rate": 2.632414807846816e-05, + "loss": 2.612, + "step": 4815 + }, + { + "epoch": 2.42, + "learning_rate": 2.6284774173142233e-05, + "loss": 2.5412, + "step": 4820 + }, + { + "epoch": 2.42, + "learning_rate": 2.624539707225036e-05, + "loss": 2.4828, + "step": 4825 + }, + { + "epoch": 2.42, + "learning_rate": 2.6206016873733574e-05, + "loss": 2.6912, + "step": 4830 + }, + { + "epoch": 2.43, + "learning_rate": 2.6166633675540635e-05, + "loss": 2.9305, + "step": 4835 + }, + { + "epoch": 2.43, + "learning_rate": 2.612724757562775e-05, + "loss": 2.4224, + "step": 4840 + }, + { + "epoch": 2.43, + "learning_rate": 2.608785867195834e-05, + "loss": 2.448, + "step": 4845 + }, + { + "epoch": 2.43, + "learning_rate": 2.60484670625028e-05, + "loss": 2.356, + "step": 4850 + }, + { + "epoch": 2.44, + "learning_rate": 2.600907284523827e-05, + "loss": 2.3987, + "step": 4855 + }, + { + "epoch": 2.44, + "learning_rate": 2.5969676118148358e-05, + "loss": 1.9985, + "step": 4860 + }, + { + "epoch": 2.44, + "learning_rate": 2.5930276979222927e-05, + "loss": 2.7017, + "step": 4865 + }, + { + "epoch": 2.44, + "learning_rate": 2.5890875526457836e-05, + "loss": 2.5223, + "step": 4870 + }, + { + "epoch": 2.45, + "learning_rate": 2.5851471857854697e-05, + "loss": 2.5558, + "step": 4875 + }, + { + "epoch": 
2.45, + "learning_rate": 2.5812066071420632e-05, + "loss": 2.3048, + "step": 4880 + }, + { + "epoch": 2.45, + "learning_rate": 2.5772658265168025e-05, + "loss": 2.6089, + "step": 4885 + }, + { + "epoch": 2.45, + "learning_rate": 2.5733248537114306e-05, + "loss": 2.6711, + "step": 4890 + }, + { + "epoch": 2.46, + "learning_rate": 2.569383698528166e-05, + "loss": 2.6202, + "step": 4895 + }, + { + "epoch": 2.46, + "learning_rate": 2.5654423707696833e-05, + "loss": 2.6619, + "step": 4900 + }, + { + "epoch": 2.46, + "learning_rate": 2.5615008802390834e-05, + "loss": 2.6442, + "step": 4905 + }, + { + "epoch": 2.46, + "learning_rate": 2.5575592367398733e-05, + "loss": 2.7031, + "step": 4910 + }, + { + "epoch": 2.47, + "learning_rate": 2.5536174500759417e-05, + "loss": 2.5043, + "step": 4915 + }, + { + "epoch": 2.47, + "learning_rate": 2.5496755300515323e-05, + "loss": 2.3442, + "step": 4920 + }, + { + "epoch": 2.47, + "learning_rate": 2.5457334864712206e-05, + "loss": 2.7005, + "step": 4925 + }, + { + "epoch": 2.47, + "learning_rate": 2.5417913291398892e-05, + "loss": 2.8544, + "step": 4930 + }, + { + "epoch": 2.48, + "learning_rate": 2.5378490678627043e-05, + "loss": 2.3004, + "step": 4935 + }, + { + "epoch": 2.48, + "learning_rate": 2.5339067124450887e-05, + "loss": 2.527, + "step": 4940 + }, + { + "epoch": 2.48, + "learning_rate": 2.5299642726927035e-05, + "loss": 2.689, + "step": 4945 + }, + { + "epoch": 2.48, + "learning_rate": 2.526021758411415e-05, + "loss": 2.7824, + "step": 4950 + }, + { + "epoch": 2.49, + "learning_rate": 2.5220791794072774e-05, + "loss": 2.5121, + "step": 4955 + }, + { + "epoch": 2.49, + "learning_rate": 2.518136545486504e-05, + "loss": 2.3836, + "step": 4960 + }, + { + "epoch": 2.49, + "learning_rate": 2.5141938664554482e-05, + "loss": 2.6548, + "step": 4965 + }, + { + "epoch": 2.49, + "learning_rate": 2.5102511521205718e-05, + "loss": 2.5556, + "step": 4970 + }, + { + "epoch": 2.5, + "learning_rate": 2.5063084122884267e-05, + "loss": 2.4746, 
+ "step": 4975 + }, + { + "epoch": 2.5, + "learning_rate": 2.5023656567656272e-05, + "loss": 2.5669, + "step": 4980 + }, + { + "epoch": 2.5, + "learning_rate": 2.498422895358827e-05, + "loss": 2.4125, + "step": 4985 + }, + { + "epoch": 2.5, + "learning_rate": 2.4944801378746935e-05, + "loss": 2.6531, + "step": 4990 + }, + { + "epoch": 2.51, + "learning_rate": 2.4905373941198864e-05, + "loss": 2.7697, + "step": 4995 + }, + { + "epoch": 2.51, + "learning_rate": 2.48659467390103e-05, + "loss": 2.5167, + "step": 5000 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 2.6457365606842368e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5000/training_args.bin b/checkpoint-5000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-5000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475 diff --git a/checkpoint-5500/README.md b/checkpoint-5500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-5500/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- 
**Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-5500/adapter_config.json b/checkpoint-5500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-5500/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-5500/adapter_model.safetensors b/checkpoint-5500/adapter_model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..13c414e01bc4fc7190eb7ef1d776d036f12f151b --- /dev/null +++ b/checkpoint-5500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7aa993e71206b0eb33aa40d02307a4da7c123d83b8fa544ba21f975a3bafc9e +size 7807744 diff --git a/checkpoint-5500/optimizer.pt b/checkpoint-5500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..07dccd9ae697d9842cd7c929baa715315a48c3c0 --- /dev/null +++ b/checkpoint-5500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c3f7029e9acdcd526a27c6078249e24ea59fa1ea10d865ebcc431864098501a +size 15644485 diff --git a/checkpoint-5500/rng_state.pth b/checkpoint-5500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a4ba868dcafbde4be8c1d086674bd812d8ea872 --- /dev/null +++ b/checkpoint-5500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:323d662c38855f7facc47e3400349efc6fe164f3a1b6e719be65e851be9a1ed8 +size 14575 diff --git a/checkpoint-5500/scheduler.pt b/checkpoint-5500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8bf8ccc7a1f34500f5adcaad933462af9b09c2c --- /dev/null +++ b/checkpoint-5500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bd1e203ee398c199685a164902bc3c09b5ac1b941bd4c0ef8c19af40a167615 +size 627 diff --git a/checkpoint-5500/special_tokens_map.json b/checkpoint-5500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-5500/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false 
+ } + ] +} diff --git a/checkpoint-5500/tokenization_chatglm.py b/checkpoint-5500/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-5500/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) 
+ return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + 
def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. 
+ + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). 
+ return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-5500/tokenizer.model b/checkpoint-5500/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-5500/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-5500/tokenizer_config.json b/checkpoint-5500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ 
b/checkpoint-5500/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-5500/trainer_state.json b/checkpoint-5500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..71b3dba42cda0d1118012602957156f7f7209524 --- /dev/null +++ b/checkpoint-5500/trainer_state.json @@ -0,0 +1,6621 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.760005018190942, + "eval_steps": 500, + "global_step": 5500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + 
"step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 
130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + 
"step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + 
"step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + 
"step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 
2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + "step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 
2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + "step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + 
"loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + "step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + 
"loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, + "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 
4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 4.8759297640025235e-05, + "loss": 2.9469, + "step": 1005 + }, + { + "epoch": 0.51, + "learning_rate": 4.874700152955661e-05, + "loss": 2.9745, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 4.8734646354147936e-05, + "loss": 2.9341, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 4.8722232144529754e-05, + "loss": 3.0511, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 4.870975893157941e-05, + "loss": 2.5396, + "step": 1025 + }, + { + "epoch": 0.52, + 
"learning_rate": 4.8697226746321004e-05, + "loss": 2.6699, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 4.868463561992532e-05, + "loss": 2.4887, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 4.867198558370977e-05, + "loss": 2.4773, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 4.865927666913825e-05, + "loss": 2.7612, + "step": 1045 + }, + { + "epoch": 0.53, + "learning_rate": 4.864650890782113e-05, + "loss": 2.9116, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 4.863368233151514e-05, + "loss": 2.8205, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 4.862079697212329e-05, + "loss": 2.6711, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 4.8607852861694804e-05, + "loss": 3.1138, + "step": 1065 + }, + { + "epoch": 0.54, + "learning_rate": 4.859485003242503e-05, + "loss": 2.5603, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 4.858178851665539e-05, + "loss": 2.7981, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 4.856866834687323e-05, + "loss": 2.507, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 4.855548955571183e-05, + "loss": 3.0315, + "step": 1085 + }, + { + "epoch": 0.55, + "learning_rate": 4.8542252175950244e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 4.852895624051326e-05, + "loss": 2.6794, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 4.851560178247132e-05, + "loss": 2.8278, + "step": 1100 + }, + { + "epoch": 0.55, + "learning_rate": 4.850218883504041e-05, + "loss": 2.6993, + "step": 1105 + }, + { + "epoch": 0.56, + "learning_rate": 4.8488717431582005e-05, + "loss": 2.875, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 4.8475187605602974e-05, + "loss": 2.6057, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 4.84615993907555e-05, + "loss": 2.847, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 4.844795282083697e-05, + "loss": 2.7652, + "step": 1125 + }, + { 
+ "epoch": 0.57, + "learning_rate": 4.843424792978997e-05, + "loss": 2.8128, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 4.842048475170209e-05, + "loss": 2.7077, + "step": 1135 + }, + { + "epoch": 0.57, + "learning_rate": 4.840666332080592e-05, + "loss": 2.7081, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 4.8392783671478934e-05, + "loss": 2.6479, + "step": 1145 + }, + { + "epoch": 0.58, + "learning_rate": 4.837884583824342e-05, + "loss": 2.667, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 4.836484985576638e-05, + "loss": 2.7514, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 4.835079575885944e-05, + "loss": 2.5058, + "step": 1160 + }, + { + "epoch": 0.58, + "learning_rate": 4.833668358247876e-05, + "loss": 2.6183, + "step": 1165 + }, + { + "epoch": 0.59, + "learning_rate": 4.8322513361725006e-05, + "loss": 2.9959, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 4.830828513184317e-05, + "loss": 2.5714, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 4.8293998928222536e-05, + "loss": 2.965, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 4.827965478639661e-05, + "loss": 2.2106, + "step": 1185 + }, + { + "epoch": 0.6, + "learning_rate": 4.8265252742042965e-05, + "loss": 2.7456, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 4.8250792830983225e-05, + "loss": 2.5694, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 4.8236275089182936e-05, + "loss": 2.6826, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 4.8221699552751465e-05, + "loss": 2.878, + "step": 1205 + }, + { + "epoch": 0.61, + "learning_rate": 4.820706625794196e-05, + "loss": 2.8191, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 4.81923752411512e-05, + "loss": 2.739, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 4.8177626538919565e-05, + "loss": 3.0544, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 4.8162820187930875e-05, + "loss": 2.8393, + 
"step": 1225 + }, + { + "epoch": 0.62, + "learning_rate": 4.814795622501237e-05, + "loss": 2.4457, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 4.813303468713456e-05, + "loss": 2.8575, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 4.8118055611411197e-05, + "loss": 2.8307, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 4.810301903509909e-05, + "loss": 2.7951, + "step": 1245 + }, + { + "epoch": 0.63, + "learning_rate": 4.8087924995598125e-05, + "loss": 2.8456, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 4.807277353045106e-05, + "loss": 2.3564, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 4.8057564677343524e-05, + "loss": 2.5076, + "step": 1260 + }, + { + "epoch": 0.63, + "learning_rate": 4.8042298474103884e-05, + "loss": 2.605, + "step": 1265 + }, + { + "epoch": 0.64, + "learning_rate": 4.8026974958703116e-05, + "loss": 2.4782, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 4.8011594169254784e-05, + "loss": 2.7193, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 4.799615614401488e-05, + "loss": 2.8284, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 4.798066092138178e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.65, + "learning_rate": 4.796510853989612e-05, + "loss": 2.7396, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 4.794949903824069e-05, + "loss": 2.7948, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 4.793383245524035e-05, + "loss": 2.7818, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 4.791810882986197e-05, + "loss": 2.7334, + "step": 1305 + }, + { + "epoch": 0.66, + "learning_rate": 4.7902328201214256e-05, + "loss": 2.4824, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 4.7886490608547727e-05, + "loss": 2.6131, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 4.7870596091254584e-05, + "loss": 2.7778, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 
4.7854644688868594e-05, + "loss": 2.5263, + "step": 1325 + }, + { + "epoch": 0.67, + "learning_rate": 4.783863644106502e-05, + "loss": 2.7825, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 4.782257138766053e-05, + "loss": 2.7902, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 4.7806449568613066e-05, + "loss": 2.8333, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 4.779027102402177e-05, + "loss": 2.856, + "step": 1345 + }, + { + "epoch": 0.68, + "learning_rate": 4.777403579412686e-05, + "loss": 2.8021, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 4.775774391930956e-05, + "loss": 2.6639, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741395440091976e-05, + "loss": 2.5226, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.772499039713702e-05, + "loss": 2.6803, + "step": 1365 + }, + { + "epoch": 0.69, + "learning_rate": 4.7708528831248274e-05, + "loss": 2.4608, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.769201078336991e-05, + "loss": 2.786, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.7675436294586586e-05, + "loss": 2.8294, + "step": 1380 + }, + { + "epoch": 0.7, + "learning_rate": 4.7658805406123356e-05, + "loss": 2.6776, + "step": 1385 + }, + { + "epoch": 0.7, + "learning_rate": 4.7642118159345544e-05, + "loss": 2.8003, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.762537459575865e-05, + "loss": 2.7796, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.7608574757008245e-05, + "loss": 2.9156, + "step": 1400 + }, + { + "epoch": 0.71, + "learning_rate": 4.7591718684879883e-05, + "loss": 3.0521, + "step": 1405 + }, + { + "epoch": 0.71, + "learning_rate": 4.7574806421298976e-05, + "loss": 2.6469, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.755783800833071e-05, + "loss": 2.8512, + "step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.754081348817991e-05, + "loss": 2.66, + "step": 1420 + }, + { + "epoch": 
0.72, + "learning_rate": 4.752373290319096e-05, + "loss": 2.6625, + "step": 1425 + }, + { + "epoch": 0.72, + "learning_rate": 4.7506596295847716e-05, + "loss": 2.6711, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 4.7489403708773346e-05, + "loss": 2.8951, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 4.747215518473026e-05, + "loss": 2.7375, + "step": 1440 + }, + { + "epoch": 0.73, + "learning_rate": 4.745485076662e-05, + "loss": 2.8037, + "step": 1445 + }, + { + "epoch": 0.73, + "learning_rate": 4.743749049748315e-05, + "loss": 2.5375, + "step": 1450 + }, + { + "epoch": 0.73, + "learning_rate": 4.742007442049918e-05, + "loss": 2.7664, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 4.7402602578986374e-05, + "loss": 2.9644, + "step": 1460 + }, + { + "epoch": 0.74, + "learning_rate": 4.738507501640175e-05, + "loss": 2.8212, + "step": 1465 + }, + { + "epoch": 0.74, + "learning_rate": 4.736749177634087e-05, + "loss": 2.7201, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 4.734985290253782e-05, + "loss": 2.7087, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 4.7332158438865035e-05, + "loss": 2.5502, + "step": 1480 + }, + { + "epoch": 0.75, + "learning_rate": 4.731440842933322e-05, + "loss": 2.7607, + "step": 1485 + }, + { + "epoch": 0.75, + "learning_rate": 4.729660291809126e-05, + "loss": 2.5601, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 4.727874194942606e-05, + "loss": 2.5562, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 4.7260825567762486e-05, + "loss": 2.5539, + "step": 1500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7242853817663204e-05, + "loss": 2.6405, + "step": 1505 + }, + { + "epoch": 0.76, + "learning_rate": 4.72248267438286e-05, + "loss": 2.9237, + "step": 1510 + }, + { + "epoch": 0.76, + "learning_rate": 4.72067443910967e-05, + "loss": 3.0453, + "step": 1515 + }, + { + "epoch": 0.76, + "learning_rate": 4.718860680444297e-05, + "loss": 2.7176, + "step": 
1520 + }, + { + "epoch": 0.77, + "learning_rate": 4.71704140289803e-05, + "loss": 2.6713, + "step": 1525 + }, + { + "epoch": 0.77, + "learning_rate": 4.715216610995883e-05, + "loss": 2.7284, + "step": 1530 + }, + { + "epoch": 0.77, + "learning_rate": 4.713386309276585e-05, + "loss": 2.5022, + "step": 1535 + }, + { + "epoch": 0.77, + "learning_rate": 4.7115505022925706e-05, + "loss": 2.8323, + "step": 1540 + }, + { + "epoch": 0.78, + "learning_rate": 4.7097091946099666e-05, + "loss": 2.7005, + "step": 1545 + }, + { + "epoch": 0.78, + "learning_rate": 4.7078623908085825e-05, + "loss": 2.8142, + "step": 1550 + }, + { + "epoch": 0.78, + "learning_rate": 4.7060100954818974e-05, + "loss": 2.8505, + "step": 1555 + }, + { + "epoch": 0.78, + "learning_rate": 4.70415231323705e-05, + "loss": 2.5892, + "step": 1560 + }, + { + "epoch": 0.79, + "learning_rate": 4.7022890486948236e-05, + "loss": 2.8951, + "step": 1565 + }, + { + "epoch": 0.79, + "learning_rate": 4.700420306489641e-05, + "loss": 2.7551, + "step": 1570 + }, + { + "epoch": 0.79, + "learning_rate": 4.698546091269547e-05, + "loss": 2.6814, + "step": 1575 + }, + { + "epoch": 0.79, + "learning_rate": 4.696666407696201e-05, + "loss": 2.8698, + "step": 1580 + }, + { + "epoch": 0.8, + "learning_rate": 4.694781260444862e-05, + "loss": 2.8389, + "step": 1585 + }, + { + "epoch": 0.8, + "learning_rate": 4.6928906542043786e-05, + "loss": 2.8353, + "step": 1590 + }, + { + "epoch": 0.8, + "learning_rate": 4.690994593677179e-05, + "loss": 2.6565, + "step": 1595 + }, + { + "epoch": 0.8, + "learning_rate": 4.689093083579256e-05, + "loss": 2.6958, + "step": 1600 + }, + { + "epoch": 0.81, + "learning_rate": 4.687186128640157e-05, + "loss": 2.6768, + "step": 1605 + }, + { + "epoch": 0.81, + "learning_rate": 4.685273733602975e-05, + "loss": 2.734, + "step": 1610 + }, + { + "epoch": 0.81, + "learning_rate": 4.6833559032243284e-05, + "loss": 2.5348, + "step": 1615 + }, + { + "epoch": 0.81, + "learning_rate": 4.6814326422743594e-05, + 
"loss": 2.7775, + "step": 1620 + }, + { + "epoch": 0.82, + "learning_rate": 4.679503955536715e-05, + "loss": 2.6578, + "step": 1625 + }, + { + "epoch": 0.82, + "learning_rate": 4.6775698478085393e-05, + "loss": 2.8792, + "step": 1630 + }, + { + "epoch": 0.82, + "learning_rate": 4.675630323900458e-05, + "loss": 2.7883, + "step": 1635 + }, + { + "epoch": 0.82, + "learning_rate": 4.67368538863657e-05, + "loss": 2.8824, + "step": 1640 + }, + { + "epoch": 0.83, + "learning_rate": 4.671735046854433e-05, + "loss": 2.6775, + "step": 1645 + }, + { + "epoch": 0.83, + "learning_rate": 4.669779303405051e-05, + "loss": 2.5658, + "step": 1650 + }, + { + "epoch": 0.83, + "learning_rate": 4.667818163152864e-05, + "loss": 2.5262, + "step": 1655 + }, + { + "epoch": 0.83, + "learning_rate": 4.665851630975736e-05, + "loss": 2.5749, + "step": 1660 + }, + { + "epoch": 0.84, + "learning_rate": 4.6638797117649424e-05, + "loss": 2.8718, + "step": 1665 + }, + { + "epoch": 0.84, + "learning_rate": 4.661902410425155e-05, + "loss": 2.8039, + "step": 1670 + }, + { + "epoch": 0.84, + "learning_rate": 4.659919731874435e-05, + "loss": 2.8089, + "step": 1675 + }, + { + "epoch": 0.84, + "learning_rate": 4.6579316810442174e-05, + "loss": 2.8144, + "step": 1680 + }, + { + "epoch": 0.85, + "learning_rate": 4.6559382628792995e-05, + "loss": 2.5963, + "step": 1685 + }, + { + "epoch": 0.85, + "learning_rate": 4.653939482337828e-05, + "loss": 2.5755, + "step": 1690 + }, + { + "epoch": 0.85, + "learning_rate": 4.651935344391286e-05, + "loss": 2.7631, + "step": 1695 + }, + { + "epoch": 0.85, + "learning_rate": 4.649925854024486e-05, + "loss": 2.8117, + "step": 1700 + }, + { + "epoch": 0.86, + "learning_rate": 4.647911016235549e-05, + "loss": 2.7352, + "step": 1705 + }, + { + "epoch": 0.86, + "learning_rate": 4.6458908360358985e-05, + "loss": 2.5572, + "step": 1710 + }, + { + "epoch": 0.86, + "learning_rate": 4.643865318450246e-05, + "loss": 2.8372, + "step": 1715 + }, + { + "epoch": 0.86, + "learning_rate": 
4.6418344685165774e-05, + "loss": 2.7892, + "step": 1720 + }, + { + "epoch": 0.87, + "learning_rate": 4.639798291286143e-05, + "loss": 2.7832, + "step": 1725 + }, + { + "epoch": 0.87, + "learning_rate": 4.637756791823442e-05, + "loss": 2.5401, + "step": 1730 + }, + { + "epoch": 0.87, + "learning_rate": 4.635709975206213e-05, + "loss": 2.9286, + "step": 1735 + }, + { + "epoch": 0.87, + "learning_rate": 4.633657846525417e-05, + "loss": 2.6474, + "step": 1740 + }, + { + "epoch": 0.88, + "learning_rate": 4.6316004108852305e-05, + "loss": 2.5047, + "step": 1745 + }, + { + "epoch": 0.88, + "learning_rate": 4.629537673403029e-05, + "loss": 2.8374, + "step": 1750 + }, + { + "epoch": 0.88, + "learning_rate": 4.627469639209373e-05, + "loss": 2.634, + "step": 1755 + }, + { + "epoch": 0.88, + "learning_rate": 4.6253963134480006e-05, + "loss": 2.6884, + "step": 1760 + }, + { + "epoch": 0.89, + "learning_rate": 4.623317701275809e-05, + "loss": 2.6334, + "step": 1765 + }, + { + "epoch": 0.89, + "learning_rate": 4.621233807862844e-05, + "loss": 2.764, + "step": 1770 + }, + { + "epoch": 0.89, + "learning_rate": 4.6191446383922886e-05, + "loss": 3.0864, + "step": 1775 + }, + { + "epoch": 0.89, + "learning_rate": 4.617050198060448e-05, + "loss": 2.7964, + "step": 1780 + }, + { + "epoch": 0.9, + "learning_rate": 4.6149504920767376e-05, + "loss": 2.4538, + "step": 1785 + }, + { + "epoch": 0.9, + "learning_rate": 4.6128455256636706e-05, + "loss": 2.7352, + "step": 1790 + }, + { + "epoch": 0.9, + "learning_rate": 4.6107353040568416e-05, + "loss": 2.6893, + "step": 1795 + }, + { + "epoch": 0.9, + "learning_rate": 4.6086198325049185e-05, + "loss": 2.8609, + "step": 1800 + }, + { + "epoch": 0.91, + "learning_rate": 4.6064991162696275e-05, + "loss": 2.5285, + "step": 1805 + }, + { + "epoch": 0.91, + "learning_rate": 4.604373160625739e-05, + "loss": 2.5707, + "step": 1810 + }, + { + "epoch": 0.91, + "learning_rate": 4.602241970861053e-05, + "loss": 2.7198, + "step": 1815 + }, + { + "epoch": 
0.91, + "learning_rate": 4.6001055522763926e-05, + "loss": 2.855, + "step": 1820 + }, + { + "epoch": 0.92, + "learning_rate": 4.597963910185582e-05, + "loss": 2.4175, + "step": 1825 + }, + { + "epoch": 0.92, + "learning_rate": 4.595817049915441e-05, + "loss": 2.5444, + "step": 1830 + }, + { + "epoch": 0.92, + "learning_rate": 4.5936649768057646e-05, + "loss": 2.6893, + "step": 1835 + }, + { + "epoch": 0.92, + "learning_rate": 4.591507696209318e-05, + "loss": 2.6725, + "step": 1840 + }, + { + "epoch": 0.93, + "learning_rate": 4.589345213491817e-05, + "loss": 2.834, + "step": 1845 + }, + { + "epoch": 0.93, + "learning_rate": 4.587177534031914e-05, + "loss": 2.8259, + "step": 1850 + }, + { + "epoch": 0.93, + "learning_rate": 4.585004663221188e-05, + "loss": 2.6146, + "step": 1855 + }, + { + "epoch": 0.93, + "learning_rate": 4.582826606464134e-05, + "loss": 2.4673, + "step": 1860 + }, + { + "epoch": 0.94, + "learning_rate": 4.5806433691781416e-05, + "loss": 2.9178, + "step": 1865 + }, + { + "epoch": 0.94, + "learning_rate": 4.578454956793487e-05, + "loss": 2.9224, + "step": 1870 + }, + { + "epoch": 0.94, + "learning_rate": 4.576261374753318e-05, + "loss": 2.7987, + "step": 1875 + }, + { + "epoch": 0.94, + "learning_rate": 4.574062628513642e-05, + "loss": 2.3848, + "step": 1880 + }, + { + "epoch": 0.95, + "learning_rate": 4.57185872354331e-05, + "loss": 2.6054, + "step": 1885 + }, + { + "epoch": 0.95, + "learning_rate": 4.569649665324003e-05, + "loss": 2.6743, + "step": 1890 + }, + { + "epoch": 0.95, + "learning_rate": 4.567435459350222e-05, + "loss": 2.9375, + "step": 1895 + }, + { + "epoch": 0.95, + "learning_rate": 4.565216111129269e-05, + "loss": 2.9372, + "step": 1900 + }, + { + "epoch": 0.96, + "learning_rate": 4.562991626181239e-05, + "loss": 2.669, + "step": 1905 + }, + { + "epoch": 0.96, + "learning_rate": 4.560762010039001e-05, + "loss": 2.7644, + "step": 1910 + }, + { + "epoch": 0.96, + "learning_rate": 4.558527268248187e-05, + "loss": 2.6886, + "step": 1915 
+ }, + { + "epoch": 0.96, + "learning_rate": 4.55628740636718e-05, + "loss": 2.8618, + "step": 1920 + }, + { + "epoch": 0.97, + "learning_rate": 4.554042429967095e-05, + "loss": 2.8411, + "step": 1925 + }, + { + "epoch": 0.97, + "learning_rate": 4.55179234463177e-05, + "loss": 2.6047, + "step": 1930 + }, + { + "epoch": 0.97, + "learning_rate": 4.5495371559577496e-05, + "loss": 2.8675, + "step": 1935 + }, + { + "epoch": 0.97, + "learning_rate": 4.547276869554271e-05, + "loss": 2.751, + "step": 1940 + }, + { + "epoch": 0.98, + "learning_rate": 4.545011491043253e-05, + "loss": 2.8638, + "step": 1945 + }, + { + "epoch": 0.98, + "learning_rate": 4.5427410260592775e-05, + "loss": 2.5092, + "step": 1950 + }, + { + "epoch": 0.98, + "learning_rate": 4.540465480249579e-05, + "loss": 2.5059, + "step": 1955 + }, + { + "epoch": 0.98, + "learning_rate": 4.5381848592740285e-05, + "loss": 2.9433, + "step": 1960 + }, + { + "epoch": 0.99, + "learning_rate": 4.535899168805121e-05, + "loss": 2.6959, + "step": 1965 + }, + { + "epoch": 0.99, + "learning_rate": 4.533608414527961e-05, + "loss": 2.552, + "step": 1970 + }, + { + "epoch": 0.99, + "learning_rate": 4.5313126021402465e-05, + "loss": 2.7169, + "step": 1975 + }, + { + "epoch": 0.99, + "learning_rate": 4.529011737352258e-05, + "loss": 2.5672, + "step": 1980 + }, + { + "epoch": 1.0, + "learning_rate": 4.526705825886841e-05, + "loss": 2.6434, + "step": 1985 + }, + { + "epoch": 1.0, + "learning_rate": 4.5243948734793947e-05, + "loss": 2.8062, + "step": 1990 + }, + { + "epoch": 1.0, + "learning_rate": 4.5220788858778556e-05, + "loss": 2.6929, + "step": 1995 + }, + { + "epoch": 1.0, + "learning_rate": 4.519757868842684e-05, + "loss": 2.5837, + "step": 2000 + }, + { + "epoch": 1.01, + "learning_rate": 4.517431828146852e-05, + "loss": 2.821, + "step": 2005 + }, + { + "epoch": 1.01, + "learning_rate": 4.515100769575824e-05, + "loss": 2.7913, + "step": 2010 + }, + { + "epoch": 1.01, + "learning_rate": 4.512764698927545e-05, + "loss": 
2.699, + "step": 2015 + }, + { + "epoch": 1.01, + "learning_rate": 4.5104236220124286e-05, + "loss": 2.6574, + "step": 2020 + }, + { + "epoch": 1.02, + "learning_rate": 4.508077544653338e-05, + "loss": 2.5909, + "step": 2025 + }, + { + "epoch": 1.02, + "learning_rate": 4.5057264726855765e-05, + "loss": 2.5695, + "step": 2030 + }, + { + "epoch": 1.02, + "learning_rate": 4.5033704119568675e-05, + "loss": 2.4937, + "step": 2035 + }, + { + "epoch": 1.02, + "learning_rate": 4.501009368327344e-05, + "loss": 2.7253, + "step": 2040 + }, + { + "epoch": 1.03, + "learning_rate": 4.4986433476695334e-05, + "loss": 2.8681, + "step": 2045 + }, + { + "epoch": 1.03, + "learning_rate": 4.496272355868341e-05, + "loss": 2.74, + "step": 2050 + }, + { + "epoch": 1.03, + "learning_rate": 4.4938963988210365e-05, + "loss": 2.7439, + "step": 2055 + }, + { + "epoch": 1.03, + "learning_rate": 4.491515482437242e-05, + "loss": 2.9539, + "step": 2060 + }, + { + "epoch": 1.04, + "learning_rate": 4.4891296126389104e-05, + "loss": 2.8165, + "step": 2065 + }, + { + "epoch": 1.04, + "learning_rate": 4.48673879536032e-05, + "loss": 2.601, + "step": 2070 + }, + { + "epoch": 1.04, + "learning_rate": 4.484343036548051e-05, + "loss": 2.5189, + "step": 2075 + }, + { + "epoch": 1.04, + "learning_rate": 4.481942342160976e-05, + "loss": 2.6276, + "step": 2080 + }, + { + "epoch": 1.05, + "learning_rate": 4.479536718170243e-05, + "loss": 2.5027, + "step": 2085 + }, + { + "epoch": 1.05, + "learning_rate": 4.477126170559262e-05, + "loss": 2.8654, + "step": 2090 + }, + { + "epoch": 1.05, + "learning_rate": 4.474710705323688e-05, + "loss": 2.8511, + "step": 2095 + }, + { + "epoch": 1.05, + "learning_rate": 4.47229032847141e-05, + "loss": 2.6129, + "step": 2100 + }, + { + "epoch": 1.06, + "learning_rate": 4.469865046022531e-05, + "loss": 2.8964, + "step": 2105 + }, + { + "epoch": 1.06, + "learning_rate": 4.4674348640093554e-05, + "loss": 2.7502, + "step": 2110 + }, + { + "epoch": 1.06, + "learning_rate": 
4.4649997884763765e-05, + "loss": 2.591, + "step": 2115 + }, + { + "epoch": 1.06, + "learning_rate": 4.462559825480257e-05, + "loss": 2.7982, + "step": 2120 + }, + { + "epoch": 1.07, + "learning_rate": 4.460114981089815e-05, + "loss": 2.576, + "step": 2125 + }, + { + "epoch": 1.07, + "learning_rate": 4.457665261386014e-05, + "loss": 2.692, + "step": 2130 + }, + { + "epoch": 1.07, + "learning_rate": 4.455210672461938e-05, + "loss": 2.5818, + "step": 2135 + }, + { + "epoch": 1.07, + "learning_rate": 4.452751220422787e-05, + "loss": 2.6792, + "step": 2140 + }, + { + "epoch": 1.08, + "learning_rate": 4.450286911385856e-05, + "loss": 2.8352, + "step": 2145 + }, + { + "epoch": 1.08, + "learning_rate": 4.4478177514805166e-05, + "loss": 2.5787, + "step": 2150 + }, + { + "epoch": 1.08, + "learning_rate": 4.4453437468482103e-05, + "loss": 2.655, + "step": 2155 + }, + { + "epoch": 1.08, + "learning_rate": 4.442864903642428e-05, + "loss": 2.7664, + "step": 2160 + }, + { + "epoch": 1.09, + "learning_rate": 4.440381228028692e-05, + "loss": 2.7231, + "step": 2165 + }, + { + "epoch": 1.09, + "learning_rate": 4.437892726184548e-05, + "loss": 2.416, + "step": 2170 + }, + { + "epoch": 1.09, + "learning_rate": 4.4353994042995446e-05, + "loss": 2.4619, + "step": 2175 + }, + { + "epoch": 1.09, + "learning_rate": 4.4329012685752183e-05, + "loss": 2.7816, + "step": 2180 + }, + { + "epoch": 1.1, + "learning_rate": 4.430398325225078e-05, + "loss": 2.8678, + "step": 2185 + }, + { + "epoch": 1.1, + "learning_rate": 4.427890580474594e-05, + "loss": 2.6007, + "step": 2190 + }, + { + "epoch": 1.1, + "learning_rate": 4.4253780405611754e-05, + "loss": 2.8195, + "step": 2195 + }, + { + "epoch": 1.1, + "learning_rate": 4.42286071173416e-05, + "loss": 2.5117, + "step": 2200 + }, + { + "epoch": 1.11, + "learning_rate": 4.4203386002547956e-05, + "loss": 2.5527, + "step": 2205 + }, + { + "epoch": 1.11, + "learning_rate": 4.417811712396226e-05, + "loss": 2.662, + "step": 2210 + }, + { + "epoch": 1.11, + 
"learning_rate": 4.415280054443477e-05, + "loss": 2.7563, + "step": 2215 + }, + { + "epoch": 1.11, + "learning_rate": 4.4127436326934354e-05, + "loss": 2.5118, + "step": 2220 + }, + { + "epoch": 1.12, + "learning_rate": 4.41020245345484e-05, + "loss": 2.7208, + "step": 2225 + }, + { + "epoch": 1.12, + "learning_rate": 4.4076565230482607e-05, + "loss": 2.649, + "step": 2230 + }, + { + "epoch": 1.12, + "learning_rate": 4.4051058478060856e-05, + "loss": 2.652, + "step": 2235 + }, + { + "epoch": 1.12, + "learning_rate": 4.402550434072505e-05, + "loss": 2.6885, + "step": 2240 + }, + { + "epoch": 1.13, + "learning_rate": 4.3999902882034935e-05, + "loss": 2.8545, + "step": 2245 + }, + { + "epoch": 1.13, + "learning_rate": 4.397425416566797e-05, + "loss": 2.9435, + "step": 2250 + }, + { + "epoch": 1.13, + "learning_rate": 4.3948558255419146e-05, + "loss": 2.6555, + "step": 2255 + }, + { + "epoch": 1.13, + "learning_rate": 4.392281521520085e-05, + "loss": 2.6108, + "step": 2260 + }, + { + "epoch": 1.14, + "learning_rate": 4.389702510904269e-05, + "loss": 2.6256, + "step": 2265 + }, + { + "epoch": 1.14, + "learning_rate": 4.387118800109133e-05, + "loss": 2.7996, + "step": 2270 + }, + { + "epoch": 1.14, + "learning_rate": 4.384530395561035e-05, + "loss": 2.6649, + "step": 2275 + }, + { + "epoch": 1.14, + "learning_rate": 4.381937303698006e-05, + "loss": 3.0073, + "step": 2280 + }, + { + "epoch": 1.15, + "learning_rate": 4.379339530969738e-05, + "loss": 2.8493, + "step": 2285 + }, + { + "epoch": 1.15, + "learning_rate": 4.3767370838375635e-05, + "loss": 2.5572, + "step": 2290 + }, + { + "epoch": 1.15, + "learning_rate": 4.374129968774443e-05, + "loss": 2.3837, + "step": 2295 + }, + { + "epoch": 1.15, + "learning_rate": 4.371518192264946e-05, + "loss": 2.4604, + "step": 2300 + }, + { + "epoch": 1.16, + "learning_rate": 4.3689017608052374e-05, + "loss": 2.8866, + "step": 2305 + }, + { + "epoch": 1.16, + "learning_rate": 4.3662806809030585e-05, + "loss": 2.8943, + "step": 2310 + 
}, + { + "epoch": 1.16, + "learning_rate": 4.3636549590777144e-05, + "loss": 2.7629, + "step": 2315 + }, + { + "epoch": 1.16, + "learning_rate": 4.361024601860054e-05, + "loss": 2.8629, + "step": 2320 + }, + { + "epoch": 1.17, + "learning_rate": 4.3583896157924574e-05, + "loss": 2.7952, + "step": 2325 + }, + { + "epoch": 1.17, + "learning_rate": 4.355750007428817e-05, + "loss": 2.2057, + "step": 2330 + }, + { + "epoch": 1.17, + "learning_rate": 4.3531057833345216e-05, + "loss": 2.9143, + "step": 2335 + }, + { + "epoch": 1.17, + "learning_rate": 4.3504569500864424e-05, + "loss": 2.5354, + "step": 2340 + }, + { + "epoch": 1.18, + "learning_rate": 4.347803514272911e-05, + "loss": 2.7451, + "step": 2345 + }, + { + "epoch": 1.18, + "learning_rate": 4.3451454824937113e-05, + "loss": 3.0827, + "step": 2350 + }, + { + "epoch": 1.18, + "learning_rate": 4.3424828613600555e-05, + "loss": 2.5842, + "step": 2355 + }, + { + "epoch": 1.18, + "learning_rate": 4.339815657494572e-05, + "loss": 2.4492, + "step": 2360 + }, + { + "epoch": 1.19, + "learning_rate": 4.3371438775312865e-05, + "loss": 2.5067, + "step": 2365 + }, + { + "epoch": 1.19, + "learning_rate": 4.334467528115608e-05, + "loss": 2.5902, + "step": 2370 + }, + { + "epoch": 1.19, + "learning_rate": 4.33178661590431e-05, + "loss": 2.8618, + "step": 2375 + }, + { + "epoch": 1.19, + "learning_rate": 4.329101147565515e-05, + "loss": 2.6831, + "step": 2380 + }, + { + "epoch": 1.2, + "learning_rate": 4.3264111297786794e-05, + "loss": 2.7531, + "step": 2385 + }, + { + "epoch": 1.2, + "learning_rate": 4.323716569234572e-05, + "loss": 2.819, + "step": 2390 + }, + { + "epoch": 1.2, + "learning_rate": 4.321017472635263e-05, + "loss": 2.6319, + "step": 2395 + }, + { + "epoch": 1.2, + "learning_rate": 4.318313846694105e-05, + "loss": 2.3078, + "step": 2400 + }, + { + "epoch": 1.21, + "learning_rate": 4.315605698135714e-05, + "loss": 2.7962, + "step": 2405 + }, + { + "epoch": 1.21, + "learning_rate": 4.312893033695958e-05, + "loss": 
2.8479, + "step": 2410 + }, + { + "epoch": 1.21, + "learning_rate": 4.310175860121936e-05, + "loss": 2.6289, + "step": 2415 + }, + { + "epoch": 1.21, + "learning_rate": 4.30745418417196e-05, + "loss": 2.7003, + "step": 2420 + }, + { + "epoch": 1.22, + "learning_rate": 4.304728012615543e-05, + "loss": 2.4947, + "step": 2425 + }, + { + "epoch": 1.22, + "learning_rate": 4.3019973522333815e-05, + "loss": 2.6911, + "step": 2430 + }, + { + "epoch": 1.22, + "learning_rate": 4.2992622098173334e-05, + "loss": 2.6931, + "step": 2435 + }, + { + "epoch": 1.22, + "learning_rate": 4.296522592170406e-05, + "loss": 2.774, + "step": 2440 + }, + { + "epoch": 1.23, + "learning_rate": 4.293778506106737e-05, + "loss": 2.759, + "step": 2445 + }, + { + "epoch": 1.23, + "learning_rate": 4.29102995845158e-05, + "loss": 2.5543, + "step": 2450 + }, + { + "epoch": 1.23, + "learning_rate": 4.288276956041284e-05, + "loss": 2.7702, + "step": 2455 + }, + { + "epoch": 1.23, + "learning_rate": 4.285519505723278e-05, + "loss": 2.835, + "step": 2460 + }, + { + "epoch": 1.24, + "learning_rate": 4.282757614356055e-05, + "loss": 2.6516, + "step": 2465 + }, + { + "epoch": 1.24, + "learning_rate": 4.2799912888091544e-05, + "loss": 2.7392, + "step": 2470 + }, + { + "epoch": 1.24, + "learning_rate": 4.277220535963143e-05, + "loss": 2.6522, + "step": 2475 + }, + { + "epoch": 1.24, + "learning_rate": 4.274445362709601e-05, + "loss": 2.6514, + "step": 2480 + }, + { + "epoch": 1.25, + "learning_rate": 4.271665775951104e-05, + "loss": 2.7507, + "step": 2485 + }, + { + "epoch": 1.25, + "learning_rate": 4.2688817826012005e-05, + "loss": 3.0499, + "step": 2490 + }, + { + "epoch": 1.25, + "learning_rate": 4.2660933895844055e-05, + "loss": 2.6927, + "step": 2495 + }, + { + "epoch": 1.25, + "learning_rate": 4.2633006038361736e-05, + "loss": 2.8456, + "step": 2500 + }, + { + "epoch": 1.26, + "learning_rate": 4.2605034323028844e-05, + "loss": 2.671, + "step": 2505 + }, + { + "epoch": 1.26, + "learning_rate": 
4.2577018819418296e-05, + "loss": 2.6543, + "step": 2510 + }, + { + "epoch": 1.26, + "learning_rate": 4.254895959721189e-05, + "loss": 2.566, + "step": 2515 + }, + { + "epoch": 1.26, + "learning_rate": 4.252085672620019e-05, + "loss": 2.4943, + "step": 2520 + }, + { + "epoch": 1.27, + "learning_rate": 4.249271027628228e-05, + "loss": 2.5115, + "step": 2525 + }, + { + "epoch": 1.27, + "learning_rate": 4.24645203174657e-05, + "loss": 2.3859, + "step": 2530 + }, + { + "epoch": 1.27, + "learning_rate": 4.243628691986617e-05, + "loss": 2.8148, + "step": 2535 + }, + { + "epoch": 1.27, + "learning_rate": 4.240801015370743e-05, + "loss": 2.5597, + "step": 2540 + }, + { + "epoch": 1.28, + "learning_rate": 4.2379690089321145e-05, + "loss": 2.805, + "step": 2545 + }, + { + "epoch": 1.28, + "learning_rate": 4.235132679714664e-05, + "loss": 2.7308, + "step": 2550 + }, + { + "epoch": 1.28, + "learning_rate": 4.232292034773076e-05, + "loss": 2.675, + "step": 2555 + }, + { + "epoch": 1.28, + "learning_rate": 4.2294470811727704e-05, + "loss": 2.6359, + "step": 2560 + }, + { + "epoch": 1.29, + "learning_rate": 4.226597825989883e-05, + "loss": 2.4288, + "step": 2565 + }, + { + "epoch": 1.29, + "learning_rate": 4.223744276311249e-05, + "loss": 2.642, + "step": 2570 + }, + { + "epoch": 1.29, + "learning_rate": 4.220886439234385e-05, + "loss": 2.7375, + "step": 2575 + }, + { + "epoch": 1.29, + "learning_rate": 4.218024321867472e-05, + "loss": 2.6114, + "step": 2580 + }, + { + "epoch": 1.3, + "learning_rate": 4.2151579313293364e-05, + "loss": 2.4941, + "step": 2585 + }, + { + "epoch": 1.3, + "learning_rate": 4.212287274749434e-05, + "loss": 2.6086, + "step": 2590 + }, + { + "epoch": 1.3, + "learning_rate": 4.2094123592678295e-05, + "loss": 2.7854, + "step": 2595 + }, + { + "epoch": 1.3, + "learning_rate": 4.206533192035184e-05, + "loss": 2.8291, + "step": 2600 + }, + { + "epoch": 1.31, + "learning_rate": 4.2036497802127294e-05, + "loss": 2.6846, + "step": 2605 + }, + { + "epoch": 1.31, + 
"learning_rate": 4.200762130972259e-05, + "loss": 2.6667, + "step": 2610 + }, + { + "epoch": 1.31, + "learning_rate": 4.197870251496103e-05, + "loss": 2.7895, + "step": 2615 + }, + { + "epoch": 1.31, + "learning_rate": 4.1949741489771155e-05, + "loss": 2.6958, + "step": 2620 + }, + { + "epoch": 1.32, + "learning_rate": 4.192073830618652e-05, + "loss": 2.8536, + "step": 2625 + }, + { + "epoch": 1.32, + "learning_rate": 4.189169303634555e-05, + "loss": 2.5218, + "step": 2630 + }, + { + "epoch": 1.32, + "learning_rate": 4.186260575249136e-05, + "loss": 2.6141, + "step": 2635 + }, + { + "epoch": 1.32, + "learning_rate": 4.1833476526971546e-05, + "loss": 2.6231, + "step": 2640 + }, + { + "epoch": 1.33, + "learning_rate": 4.1804305432238036e-05, + "loss": 2.8229, + "step": 2645 + }, + { + "epoch": 1.33, + "learning_rate": 4.1775092540846885e-05, + "loss": 2.4763, + "step": 2650 + }, + { + "epoch": 1.33, + "learning_rate": 4.174583792545813e-05, + "loss": 2.5162, + "step": 2655 + }, + { + "epoch": 1.33, + "learning_rate": 4.1716541658835574e-05, + "loss": 2.7301, + "step": 2660 + }, + { + "epoch": 1.34, + "learning_rate": 4.16872038138466e-05, + "loss": 2.6055, + "step": 2665 + }, + { + "epoch": 1.34, + "learning_rate": 4.165782446346203e-05, + "loss": 2.6529, + "step": 2670 + }, + { + "epoch": 1.34, + "learning_rate": 4.162840368075591e-05, + "loss": 2.7934, + "step": 2675 + }, + { + "epoch": 1.34, + "learning_rate": 4.159894153890536e-05, + "loss": 2.9108, + "step": 2680 + }, + { + "epoch": 1.35, + "learning_rate": 4.1569438111190326e-05, + "loss": 2.6543, + "step": 2685 + }, + { + "epoch": 1.35, + "learning_rate": 4.1539893470993496e-05, + "loss": 2.6038, + "step": 2690 + }, + { + "epoch": 1.35, + "learning_rate": 4.151030769180002e-05, + "loss": 2.5741, + "step": 2695 + }, + { + "epoch": 1.35, + "learning_rate": 4.14806808471974e-05, + "loss": 2.8525, + "step": 2700 + }, + { + "epoch": 1.36, + "learning_rate": 4.1451013010875275e-05, + "loss": 2.5151, + "step": 2705 + 
}, + { + "epoch": 1.36, + "learning_rate": 4.1421304256625206e-05, + "loss": 2.5666, + "step": 2710 + }, + { + "epoch": 1.36, + "learning_rate": 4.139155465834058e-05, + "loss": 2.7287, + "step": 2715 + }, + { + "epoch": 1.36, + "learning_rate": 4.136176429001634e-05, + "loss": 2.8867, + "step": 2720 + }, + { + "epoch": 1.37, + "learning_rate": 4.133193322574885e-05, + "loss": 2.6398, + "step": 2725 + }, + { + "epoch": 1.37, + "learning_rate": 4.130206153973568e-05, + "loss": 2.699, + "step": 2730 + }, + { + "epoch": 1.37, + "learning_rate": 4.1272149306275447e-05, + "loss": 2.5248, + "step": 2735 + }, + { + "epoch": 1.37, + "learning_rate": 4.124219659976762e-05, + "loss": 2.8947, + "step": 2740 + }, + { + "epoch": 1.38, + "learning_rate": 4.1212203494712344e-05, + "loss": 2.6473, + "step": 2745 + }, + { + "epoch": 1.38, + "learning_rate": 4.1182170065710226e-05, + "loss": 2.3734, + "step": 2750 + }, + { + "epoch": 1.38, + "learning_rate": 4.115209638746218e-05, + "loss": 2.685, + "step": 2755 + }, + { + "epoch": 1.39, + "learning_rate": 4.1121982534769224e-05, + "loss": 2.9712, + "step": 2760 + }, + { + "epoch": 1.39, + "learning_rate": 4.109182858253231e-05, + "loss": 2.6578, + "step": 2765 + }, + { + "epoch": 1.39, + "learning_rate": 4.106163460575212e-05, + "loss": 2.6842, + "step": 2770 + }, + { + "epoch": 1.39, + "learning_rate": 4.10314006795289e-05, + "loss": 2.511, + "step": 2775 + }, + { + "epoch": 1.4, + "learning_rate": 4.100112687906224e-05, + "loss": 2.8426, + "step": 2780 + }, + { + "epoch": 1.4, + "learning_rate": 4.097081327965092e-05, + "loss": 2.8462, + "step": 2785 + }, + { + "epoch": 1.4, + "learning_rate": 4.094045995669271e-05, + "loss": 2.7552, + "step": 2790 + }, + { + "epoch": 1.4, + "learning_rate": 4.091006698568419e-05, + "loss": 2.3802, + "step": 2795 + }, + { + "epoch": 1.41, + "learning_rate": 4.087963444222054e-05, + "loss": 2.3967, + "step": 2800 + }, + { + "epoch": 1.41, + "learning_rate": 4.084916240199537e-05, + "loss": 2.7945, 
+ "step": 2805 + }, + { + "epoch": 1.41, + "learning_rate": 4.0818650940800524e-05, + "loss": 2.5304, + "step": 2810 + }, + { + "epoch": 1.41, + "learning_rate": 4.0788100134525925e-05, + "loss": 2.6779, + "step": 2815 + }, + { + "epoch": 1.42, + "learning_rate": 4.075751005915932e-05, + "loss": 2.6202, + "step": 2820 + }, + { + "epoch": 1.42, + "learning_rate": 4.072688079078616e-05, + "loss": 2.5915, + "step": 2825 + }, + { + "epoch": 1.42, + "learning_rate": 4.069621240558935e-05, + "loss": 2.5139, + "step": 2830 + }, + { + "epoch": 1.42, + "learning_rate": 4.066550497984911e-05, + "loss": 2.5506, + "step": 2835 + }, + { + "epoch": 1.43, + "learning_rate": 4.063475858994276e-05, + "loss": 2.7154, + "step": 2840 + }, + { + "epoch": 1.43, + "learning_rate": 4.060397331234452e-05, + "loss": 2.6309, + "step": 2845 + }, + { + "epoch": 1.43, + "learning_rate": 4.0573149223625365e-05, + "loss": 2.6922, + "step": 2850 + }, + { + "epoch": 1.43, + "learning_rate": 4.054228640045276e-05, + "loss": 2.703, + "step": 2855 + }, + { + "epoch": 1.44, + "learning_rate": 4.051138491959053e-05, + "loss": 2.3475, + "step": 2860 + }, + { + "epoch": 1.44, + "learning_rate": 4.048044485789869e-05, + "loss": 2.7186, + "step": 2865 + }, + { + "epoch": 1.44, + "learning_rate": 4.044946629233316e-05, + "loss": 2.4667, + "step": 2870 + }, + { + "epoch": 1.44, + "learning_rate": 4.041844929994566e-05, + "loss": 2.562, + "step": 2875 + }, + { + "epoch": 1.45, + "learning_rate": 4.038739395788347e-05, + "loss": 2.4116, + "step": 2880 + }, + { + "epoch": 1.45, + "learning_rate": 4.0356300343389276e-05, + "loss": 2.6133, + "step": 2885 + }, + { + "epoch": 1.45, + "learning_rate": 4.032516853380094e-05, + "loss": 2.6893, + "step": 2890 + }, + { + "epoch": 1.45, + "learning_rate": 4.029399860655132e-05, + "loss": 2.8196, + "step": 2895 + }, + { + "epoch": 1.46, + "learning_rate": 4.026279063916811e-05, + "loss": 2.8933, + "step": 2900 + }, + { + "epoch": 1.46, + "learning_rate": 
4.023154470927361e-05, + "loss": 3.0289, + "step": 2905 + }, + { + "epoch": 1.46, + "learning_rate": 4.0200260894584516e-05, + "loss": 2.5586, + "step": 2910 + }, + { + "epoch": 1.46, + "learning_rate": 4.016893927291179e-05, + "loss": 2.785, + "step": 2915 + }, + { + "epoch": 1.47, + "learning_rate": 4.0137579922160395e-05, + "loss": 2.5293, + "step": 2920 + }, + { + "epoch": 1.47, + "learning_rate": 4.010618292032917e-05, + "loss": 2.6696, + "step": 2925 + }, + { + "epoch": 1.47, + "learning_rate": 4.007474834551058e-05, + "loss": 2.7003, + "step": 2930 + }, + { + "epoch": 1.47, + "learning_rate": 4.004327627589056e-05, + "loss": 2.6005, + "step": 2935 + }, + { + "epoch": 1.48, + "learning_rate": 4.001176678974828e-05, + "loss": 2.7198, + "step": 2940 + }, + { + "epoch": 1.48, + "learning_rate": 3.998021996545599e-05, + "loss": 2.5329, + "step": 2945 + }, + { + "epoch": 1.48, + "learning_rate": 3.994863588147881e-05, + "loss": 2.7051, + "step": 2950 + }, + { + "epoch": 1.48, + "learning_rate": 3.9917014616374535e-05, + "loss": 2.9277, + "step": 2955 + }, + { + "epoch": 1.49, + "learning_rate": 3.988535624879344e-05, + "loss": 2.3729, + "step": 2960 + }, + { + "epoch": 1.49, + "learning_rate": 3.985366085747808e-05, + "loss": 2.5122, + "step": 2965 + }, + { + "epoch": 1.49, + "learning_rate": 3.982192852126309e-05, + "loss": 2.4413, + "step": 2970 + }, + { + "epoch": 1.49, + "learning_rate": 3.979015931907501e-05, + "loss": 2.6104, + "step": 2975 + }, + { + "epoch": 1.5, + "learning_rate": 3.975835332993207e-05, + "loss": 2.8855, + "step": 2980 + }, + { + "epoch": 1.5, + "learning_rate": 3.9726510632944e-05, + "loss": 2.7487, + "step": 2985 + }, + { + "epoch": 1.5, + "learning_rate": 3.969463130731183e-05, + "loss": 2.7085, + "step": 2990 + }, + { + "epoch": 1.5, + "learning_rate": 3.966271543232769e-05, + "loss": 2.6464, + "step": 2995 + }, + { + "epoch": 1.51, + "learning_rate": 3.9630763087374625e-05, + "loss": 2.4105, + "step": 3000 + }, + { + "epoch": 1.51, + 
"learning_rate": 3.9598774351926394e-05, + "loss": 2.7296, + "step": 3005 + }, + { + "epoch": 1.51, + "learning_rate": 3.956674930554725e-05, + "loss": 2.5319, + "step": 3010 + }, + { + "epoch": 1.51, + "learning_rate": 3.9534688027891785e-05, + "loss": 2.5544, + "step": 3015 + }, + { + "epoch": 1.52, + "learning_rate": 3.9502590598704696e-05, + "loss": 2.7981, + "step": 3020 + }, + { + "epoch": 1.52, + "learning_rate": 3.94704570978206e-05, + "loss": 2.6186, + "step": 3025 + }, + { + "epoch": 1.52, + "learning_rate": 3.943828760516382e-05, + "loss": 2.6443, + "step": 3030 + }, + { + "epoch": 1.52, + "learning_rate": 3.940608220074822e-05, + "loss": 2.4564, + "step": 3035 + }, + { + "epoch": 1.53, + "learning_rate": 3.9373840964676976e-05, + "loss": 2.8394, + "step": 3040 + }, + { + "epoch": 1.53, + "learning_rate": 3.934156397714238e-05, + "loss": 2.3683, + "step": 3045 + }, + { + "epoch": 1.53, + "learning_rate": 3.930925131842567e-05, + "loss": 2.875, + "step": 3050 + }, + { + "epoch": 1.53, + "learning_rate": 3.9276903068896784e-05, + "loss": 2.6381, + "step": 3055 + }, + { + "epoch": 1.54, + "learning_rate": 3.9244519309014206e-05, + "loss": 2.8077, + "step": 3060 + }, + { + "epoch": 1.54, + "learning_rate": 3.9212100119324706e-05, + "loss": 2.6208, + "step": 3065 + }, + { + "epoch": 1.54, + "learning_rate": 3.917964558046322e-05, + "loss": 2.9, + "step": 3070 + }, + { + "epoch": 1.54, + "learning_rate": 3.9147155773152586e-05, + "loss": 2.5678, + "step": 3075 + }, + { + "epoch": 1.55, + "learning_rate": 3.911463077820336e-05, + "loss": 2.7059, + "step": 3080 + }, + { + "epoch": 1.55, + "learning_rate": 3.90820706765136e-05, + "loss": 2.7433, + "step": 3085 + }, + { + "epoch": 1.55, + "learning_rate": 3.9049475549068757e-05, + "loss": 2.5916, + "step": 3090 + }, + { + "epoch": 1.55, + "learning_rate": 3.901684547694132e-05, + "loss": 2.6978, + "step": 3095 + }, + { + "epoch": 1.56, + "learning_rate": 3.8984180541290724e-05, + "loss": 2.8709, + "step": 3100 + 
}, + { + "epoch": 1.56, + "learning_rate": 3.895148082336313e-05, + "loss": 2.8652, + "step": 3105 + }, + { + "epoch": 1.56, + "learning_rate": 3.89187464044912e-05, + "loss": 3.0331, + "step": 3110 + }, + { + "epoch": 1.56, + "learning_rate": 3.8885977366093905e-05, + "loss": 2.7334, + "step": 3115 + }, + { + "epoch": 1.57, + "learning_rate": 3.885317378967633e-05, + "loss": 2.6026, + "step": 3120 + }, + { + "epoch": 1.57, + "learning_rate": 3.882033575682945e-05, + "loss": 2.831, + "step": 3125 + }, + { + "epoch": 1.57, + "learning_rate": 3.878746334922996e-05, + "loss": 2.5585, + "step": 3130 + }, + { + "epoch": 1.57, + "learning_rate": 3.875455664864005e-05, + "loss": 2.762, + "step": 3135 + }, + { + "epoch": 1.58, + "learning_rate": 3.8721615736907205e-05, + "loss": 2.769, + "step": 3140 + }, + { + "epoch": 1.58, + "learning_rate": 3.868864069596399e-05, + "loss": 2.6496, + "step": 3145 + }, + { + "epoch": 1.58, + "learning_rate": 3.8655631607827876e-05, + "loss": 2.486, + "step": 3150 + }, + { + "epoch": 1.58, + "learning_rate": 3.8622588554601e-05, + "loss": 2.8387, + "step": 3155 + }, + { + "epoch": 1.59, + "learning_rate": 3.858951161847001e-05, + "loss": 2.816, + "step": 3160 + }, + { + "epoch": 1.59, + "learning_rate": 3.855640088170579e-05, + "loss": 2.8128, + "step": 3165 + }, + { + "epoch": 1.59, + "learning_rate": 3.8523256426663313e-05, + "loss": 2.5875, + "step": 3170 + }, + { + "epoch": 1.59, + "learning_rate": 3.8490078335781423e-05, + "loss": 2.5853, + "step": 3175 + }, + { + "epoch": 1.6, + "learning_rate": 3.845686669158263e-05, + "loss": 2.7638, + "step": 3180 + }, + { + "epoch": 1.6, + "learning_rate": 3.842362157667287e-05, + "loss": 2.8281, + "step": 3185 + }, + { + "epoch": 1.6, + "learning_rate": 3.839700144139754e-05, + "loss": 2.3574, + "step": 3190 + }, + { + "epoch": 1.6, + "learning_rate": 3.836369628764067e-05, + "loss": 2.5533, + "step": 3195 + }, + { + "epoch": 1.61, + "learning_rate": 3.833035789491177e-05, + "loss": 2.4513, + 
"step": 3200 + }, + { + "epoch": 1.61, + "learning_rate": 3.8296986346132036e-05, + "loss": 2.567, + "step": 3205 + }, + { + "epoch": 1.61, + "learning_rate": 3.826358172430516e-05, + "loss": 2.6501, + "step": 3210 + }, + { + "epoch": 1.61, + "learning_rate": 3.823014411251708e-05, + "loss": 2.6927, + "step": 3215 + }, + { + "epoch": 1.62, + "learning_rate": 3.819667359393578e-05, + "loss": 2.6094, + "step": 3220 + }, + { + "epoch": 1.62, + "learning_rate": 3.816317025181111e-05, + "loss": 2.6776, + "step": 3225 + }, + { + "epoch": 1.62, + "learning_rate": 3.8129634169474563e-05, + "loss": 2.5833, + "step": 3230 + }, + { + "epoch": 1.62, + "learning_rate": 3.809606543033905e-05, + "loss": 2.6483, + "step": 3235 + }, + { + "epoch": 1.63, + "learning_rate": 3.8062464117898724e-05, + "loss": 2.4814, + "step": 3240 + }, + { + "epoch": 1.63, + "learning_rate": 3.802883031572874e-05, + "loss": 2.5217, + "step": 3245 + }, + { + "epoch": 1.63, + "learning_rate": 3.799516410748506e-05, + "loss": 2.5127, + "step": 3250 + }, + { + "epoch": 1.63, + "learning_rate": 3.796146557690428e-05, + "loss": 2.9325, + "step": 3255 + }, + { + "epoch": 1.64, + "learning_rate": 3.792773480780335e-05, + "loss": 2.4567, + "step": 3260 + }, + { + "epoch": 1.64, + "learning_rate": 3.789397188407944e-05, + "loss": 2.6045, + "step": 3265 + }, + { + "epoch": 1.64, + "learning_rate": 3.786017688970967e-05, + "loss": 2.5031, + "step": 3270 + }, + { + "epoch": 1.64, + "learning_rate": 3.782634990875094e-05, + "loss": 2.7692, + "step": 3275 + }, + { + "epoch": 1.65, + "learning_rate": 3.779249102533972e-05, + "loss": 2.6893, + "step": 3280 + }, + { + "epoch": 1.65, + "learning_rate": 3.7758600323691806e-05, + "loss": 2.604, + "step": 3285 + }, + { + "epoch": 1.65, + "learning_rate": 3.7724677888102145e-05, + "loss": 2.6633, + "step": 3290 + }, + { + "epoch": 1.65, + "learning_rate": 3.769072380294463e-05, + "loss": 2.4804, + "step": 3295 + }, + { + "epoch": 1.66, + "learning_rate": 
3.765673815267184e-05, + "loss": 2.5951, + "step": 3300 + }, + { + "epoch": 1.66, + "learning_rate": 3.76227210218149e-05, + "loss": 2.598, + "step": 3305 + }, + { + "epoch": 1.66, + "learning_rate": 3.758867249498321e-05, + "loss": 2.8652, + "step": 3310 + }, + { + "epoch": 1.66, + "learning_rate": 3.7554592656864285e-05, + "loss": 2.5312, + "step": 3315 + }, + { + "epoch": 1.67, + "learning_rate": 3.752048159222349e-05, + "loss": 2.7432, + "step": 3320 + }, + { + "epoch": 1.67, + "learning_rate": 3.748633938590388e-05, + "loss": 2.64, + "step": 3325 + }, + { + "epoch": 1.67, + "learning_rate": 3.745216612282596e-05, + "loss": 2.751, + "step": 3330 + }, + { + "epoch": 1.67, + "learning_rate": 3.741796188798747e-05, + "loss": 2.7636, + "step": 3335 + }, + { + "epoch": 1.68, + "learning_rate": 3.738372676646321e-05, + "loss": 2.8103, + "step": 3340 + }, + { + "epoch": 1.68, + "learning_rate": 3.7349460843404796e-05, + "loss": 2.6672, + "step": 3345 + }, + { + "epoch": 1.68, + "learning_rate": 3.731516420404043e-05, + "loss": 2.5272, + "step": 3350 + }, + { + "epoch": 1.68, + "learning_rate": 3.728083693367474e-05, + "loss": 2.6674, + "step": 3355 + }, + { + "epoch": 1.69, + "learning_rate": 3.724647911768854e-05, + "loss": 2.708, + "step": 3360 + }, + { + "epoch": 1.69, + "learning_rate": 3.72120908415386e-05, + "loss": 2.3902, + "step": 3365 + }, + { + "epoch": 1.69, + "learning_rate": 3.7177672190757476e-05, + "loss": 2.8144, + "step": 3370 + }, + { + "epoch": 1.69, + "learning_rate": 3.714322325095325e-05, + "loss": 2.8525, + "step": 3375 + }, + { + "epoch": 1.7, + "learning_rate": 3.7108744107809364e-05, + "loss": 2.4671, + "step": 3380 + }, + { + "epoch": 1.7, + "learning_rate": 3.7074234847084363e-05, + "loss": 2.3723, + "step": 3385 + }, + { + "epoch": 1.7, + "learning_rate": 3.703969555461173e-05, + "loss": 2.8825, + "step": 3390 + }, + { + "epoch": 1.7, + "learning_rate": 3.700512631629961e-05, + "loss": 2.768, + "step": 3395 + }, + { + "epoch": 1.71, + 
"learning_rate": 3.6970527218130644e-05, + "loss": 2.4601, + "step": 3400 + }, + { + "epoch": 1.71, + "learning_rate": 3.693589834616176e-05, + "loss": 2.7482, + "step": 3405 + }, + { + "epoch": 1.71, + "learning_rate": 3.6901239786523914e-05, + "loss": 2.4536, + "step": 3410 + }, + { + "epoch": 1.71, + "learning_rate": 3.686655162542192e-05, + "loss": 2.571, + "step": 3415 + }, + { + "epoch": 1.72, + "learning_rate": 3.683183394913422e-05, + "loss": 2.5796, + "step": 3420 + }, + { + "epoch": 1.72, + "learning_rate": 3.6797086844012654e-05, + "loss": 2.7312, + "step": 3425 + }, + { + "epoch": 1.72, + "learning_rate": 3.676231039648227e-05, + "loss": 2.5584, + "step": 3430 + }, + { + "epoch": 1.72, + "learning_rate": 3.67275046930411e-05, + "loss": 2.6298, + "step": 3435 + }, + { + "epoch": 1.73, + "learning_rate": 3.669266982025993e-05, + "loss": 2.628, + "step": 3440 + }, + { + "epoch": 1.73, + "learning_rate": 3.6657805864782116e-05, + "loss": 2.5943, + "step": 3445 + }, + { + "epoch": 1.73, + "learning_rate": 3.662291291332333e-05, + "loss": 2.6121, + "step": 3450 + }, + { + "epoch": 1.73, + "learning_rate": 3.658799105267138e-05, + "loss": 2.6501, + "step": 3455 + }, + { + "epoch": 1.74, + "learning_rate": 3.655304036968597e-05, + "loss": 2.5708, + "step": 3460 + }, + { + "epoch": 1.74, + "learning_rate": 3.651806095129849e-05, + "loss": 2.8039, + "step": 3465 + }, + { + "epoch": 1.74, + "learning_rate": 3.648305288451183e-05, + "loss": 2.8159, + "step": 3470 + }, + { + "epoch": 1.74, + "learning_rate": 3.64480162564001e-05, + "loss": 2.6506, + "step": 3475 + }, + { + "epoch": 1.75, + "learning_rate": 3.641295115410847e-05, + "loss": 2.8982, + "step": 3480 + }, + { + "epoch": 1.75, + "learning_rate": 3.637785766485291e-05, + "loss": 2.8018, + "step": 3485 + }, + { + "epoch": 1.75, + "learning_rate": 3.634273587592003e-05, + "loss": 2.5278, + "step": 3490 + }, + { + "epoch": 1.75, + "learning_rate": 3.630758587466681e-05, + "loss": 2.8937, + "step": 3495 + }, + 
{ + "epoch": 1.76, + "learning_rate": 3.6272407748520394e-05, + "loss": 2.1922, + "step": 3500 + }, + { + "epoch": 1.76, + "learning_rate": 3.623720158497789e-05, + "loss": 2.4288, + "step": 3505 + }, + { + "epoch": 1.76, + "learning_rate": 3.620196747160613e-05, + "loss": 2.9423, + "step": 3510 + }, + { + "epoch": 1.76, + "learning_rate": 3.61667054960415e-05, + "loss": 2.5559, + "step": 3515 + }, + { + "epoch": 1.77, + "learning_rate": 3.613141574598965e-05, + "loss": 2.7228, + "step": 3520 + }, + { + "epoch": 1.77, + "learning_rate": 3.6096098309225325e-05, + "loss": 2.6933, + "step": 3525 + }, + { + "epoch": 1.77, + "learning_rate": 3.606075327359212e-05, + "loss": 2.5865, + "step": 3530 + }, + { + "epoch": 1.77, + "learning_rate": 3.602538072700229e-05, + "loss": 2.5846, + "step": 3535 + }, + { + "epoch": 1.78, + "learning_rate": 3.598998075743653e-05, + "loss": 2.8847, + "step": 3540 + }, + { + "epoch": 1.78, + "learning_rate": 3.595455345294372e-05, + "loss": 2.5386, + "step": 3545 + }, + { + "epoch": 1.78, + "learning_rate": 3.5919098901640735e-05, + "loss": 2.5932, + "step": 3550 + }, + { + "epoch": 1.78, + "learning_rate": 3.588361719171223e-05, + "loss": 2.4481, + "step": 3555 + }, + { + "epoch": 1.79, + "learning_rate": 3.584810841141039e-05, + "loss": 2.5773, + "step": 3560 + }, + { + "epoch": 1.79, + "learning_rate": 3.581257264905476e-05, + "loss": 2.6305, + "step": 3565 + }, + { + "epoch": 1.79, + "learning_rate": 3.5777009993031955e-05, + "loss": 2.7, + "step": 3570 + }, + { + "epoch": 1.79, + "learning_rate": 3.574142053179553e-05, + "loss": 2.4408, + "step": 3575 + }, + { + "epoch": 1.8, + "learning_rate": 3.5705804353865677e-05, + "loss": 2.7292, + "step": 3580 + }, + { + "epoch": 1.8, + "learning_rate": 3.567016154782906e-05, + "loss": 2.4371, + "step": 3585 + }, + { + "epoch": 1.8, + "learning_rate": 3.563449220233855e-05, + "loss": 2.7977, + "step": 3590 + }, + { + "epoch": 1.8, + "learning_rate": 3.559879640611305e-05, + "loss": 2.7357, + 
"step": 3595 + }, + { + "epoch": 1.81, + "learning_rate": 3.5563074247937244e-05, + "loss": 2.6255, + "step": 3600 + }, + { + "epoch": 1.81, + "learning_rate": 3.552732581666139e-05, + "loss": 2.8134, + "step": 3605 + }, + { + "epoch": 1.81, + "learning_rate": 3.549155120120109e-05, + "loss": 2.702, + "step": 3610 + }, + { + "epoch": 1.81, + "learning_rate": 3.545575049053707e-05, + "loss": 2.4854, + "step": 3615 + }, + { + "epoch": 1.82, + "learning_rate": 3.541992377371497e-05, + "loss": 2.7596, + "step": 3620 + }, + { + "epoch": 1.82, + "learning_rate": 3.53840711398451e-05, + "loss": 2.6891, + "step": 3625 + }, + { + "epoch": 1.82, + "learning_rate": 3.5348192678102254e-05, + "loss": 2.5619, + "step": 3630 + }, + { + "epoch": 1.82, + "learning_rate": 3.531228847772544e-05, + "loss": 2.5719, + "step": 3635 + }, + { + "epoch": 1.83, + "learning_rate": 3.52763586280177e-05, + "loss": 2.6297, + "step": 3640 + }, + { + "epoch": 1.83, + "learning_rate": 3.524040321834589e-05, + "loss": 2.5114, + "step": 3645 + }, + { + "epoch": 1.83, + "learning_rate": 3.52044223381404e-05, + "loss": 2.8536, + "step": 3650 + }, + { + "epoch": 1.83, + "learning_rate": 3.516841607689501e-05, + "loss": 2.5271, + "step": 3655 + }, + { + "epoch": 1.84, + "learning_rate": 3.51323845241666e-05, + "loss": 2.5654, + "step": 3660 + }, + { + "epoch": 1.84, + "learning_rate": 3.509632776957497e-05, + "loss": 2.6269, + "step": 3665 + }, + { + "epoch": 1.84, + "learning_rate": 3.50602459028026e-05, + "loss": 2.4401, + "step": 3670 + }, + { + "epoch": 1.84, + "learning_rate": 3.502413901359445e-05, + "loss": 2.4368, + "step": 3675 + }, + { + "epoch": 1.85, + "learning_rate": 3.498800719175768e-05, + "loss": 2.6087, + "step": 3680 + }, + { + "epoch": 1.85, + "learning_rate": 3.4951850527161495e-05, + "loss": 2.5927, + "step": 3685 + }, + { + "epoch": 1.85, + "learning_rate": 3.4915669109736876e-05, + "loss": 2.8352, + "step": 3690 + }, + { + "epoch": 1.85, + "learning_rate": 3.487946302947637e-05, + 
"loss": 2.8485, + "step": 3695 + }, + { + "epoch": 1.86, + "learning_rate": 3.484323237643389e-05, + "loss": 2.6611, + "step": 3700 + }, + { + "epoch": 1.86, + "learning_rate": 3.4806977240724423e-05, + "loss": 2.5251, + "step": 3705 + }, + { + "epoch": 1.86, + "learning_rate": 3.47706977125239e-05, + "loss": 2.6043, + "step": 3710 + }, + { + "epoch": 1.86, + "learning_rate": 3.473439388206887e-05, + "loss": 2.5911, + "step": 3715 + }, + { + "epoch": 1.87, + "learning_rate": 3.469806583965639e-05, + "loss": 2.7092, + "step": 3720 + }, + { + "epoch": 1.87, + "learning_rate": 3.466171367564368e-05, + "loss": 2.4572, + "step": 3725 + }, + { + "epoch": 1.87, + "learning_rate": 3.462533748044801e-05, + "loss": 2.464, + "step": 3730 + }, + { + "epoch": 1.87, + "learning_rate": 3.458893734454636e-05, + "loss": 2.8654, + "step": 3735 + }, + { + "epoch": 1.88, + "learning_rate": 3.455251335847531e-05, + "loss": 2.752, + "step": 3740 + }, + { + "epoch": 1.88, + "learning_rate": 3.451606561283074e-05, + "loss": 2.59, + "step": 3745 + }, + { + "epoch": 1.88, + "learning_rate": 3.447959419826763e-05, + "loss": 2.4732, + "step": 3750 + }, + { + "epoch": 1.88, + "learning_rate": 3.444309920549983e-05, + "loss": 2.7611, + "step": 3755 + }, + { + "epoch": 1.89, + "learning_rate": 3.440658072529983e-05, + "loss": 2.5923, + "step": 3760 + }, + { + "epoch": 1.89, + "learning_rate": 3.437003884849854e-05, + "loss": 2.5405, + "step": 3765 + }, + { + "epoch": 1.89, + "learning_rate": 3.433347366598506e-05, + "loss": 2.6368, + "step": 3770 + }, + { + "epoch": 1.89, + "learning_rate": 3.429688526870649e-05, + "loss": 2.5877, + "step": 3775 + }, + { + "epoch": 1.9, + "learning_rate": 3.4260273747667635e-05, + "loss": 2.5978, + "step": 3780 + }, + { + "epoch": 1.9, + "learning_rate": 3.422363919393082e-05, + "loss": 2.7375, + "step": 3785 + }, + { + "epoch": 1.9, + "learning_rate": 3.418698169861567e-05, + "loss": 2.4161, + "step": 3790 + }, + { + "epoch": 1.9, + "learning_rate": 
3.415030135289884e-05, + "loss": 2.3959, + "step": 3795 + }, + { + "epoch": 1.91, + "learning_rate": 3.4113598248013875e-05, + "loss": 2.9642, + "step": 3800 + }, + { + "epoch": 1.91, + "learning_rate": 3.40768724752509e-05, + "loss": 2.7313, + "step": 3805 + }, + { + "epoch": 1.91, + "learning_rate": 3.404012412595639e-05, + "loss": 2.6311, + "step": 3810 + }, + { + "epoch": 1.91, + "learning_rate": 3.400335329153304e-05, + "loss": 2.6268, + "step": 3815 + }, + { + "epoch": 1.92, + "learning_rate": 3.396656006343939e-05, + "loss": 2.5252, + "step": 3820 + }, + { + "epoch": 1.92, + "learning_rate": 3.392974453318975e-05, + "loss": 2.5863, + "step": 3825 + }, + { + "epoch": 1.92, + "learning_rate": 3.3892906792353886e-05, + "loss": 2.4848, + "step": 3830 + }, + { + "epoch": 1.92, + "learning_rate": 3.3856046932556775e-05, + "loss": 2.5517, + "step": 3835 + }, + { + "epoch": 1.93, + "learning_rate": 3.3819165045478426e-05, + "loss": 2.6679, + "step": 3840 + }, + { + "epoch": 1.93, + "learning_rate": 3.3782261222853654e-05, + "loss": 2.4474, + "step": 3845 + }, + { + "epoch": 1.93, + "learning_rate": 3.37453355564718e-05, + "loss": 2.5378, + "step": 3850 + }, + { + "epoch": 1.93, + "learning_rate": 3.3708388138176584e-05, + "loss": 2.8168, + "step": 3855 + }, + { + "epoch": 1.94, + "learning_rate": 3.367141905986578e-05, + "loss": 2.6556, + "step": 3860 + }, + { + "epoch": 1.94, + "learning_rate": 3.3634428413491054e-05, + "loss": 2.5122, + "step": 3865 + }, + { + "epoch": 1.94, + "learning_rate": 3.359741629105773e-05, + "loss": 2.7044, + "step": 3870 + }, + { + "epoch": 1.94, + "learning_rate": 3.3560382784624524e-05, + "loss": 2.6193, + "step": 3875 + }, + { + "epoch": 1.95, + "learning_rate": 3.352332798630336e-05, + "loss": 3.0051, + "step": 3880 + }, + { + "epoch": 1.95, + "learning_rate": 3.3486251988259134e-05, + "loss": 2.5351, + "step": 3885 + }, + { + "epoch": 1.95, + "learning_rate": 3.344915488270942e-05, + "loss": 2.7273, + "step": 3890 + }, + { + 
"epoch": 1.95, + "learning_rate": 3.341203676192433e-05, + "loss": 2.3833, + "step": 3895 + }, + { + "epoch": 1.96, + "learning_rate": 3.3374897718226236e-05, + "loss": 2.5073, + "step": 3900 + }, + { + "epoch": 1.96, + "learning_rate": 3.333773784398957e-05, + "loss": 2.7216, + "step": 3905 + }, + { + "epoch": 1.96, + "learning_rate": 3.330055723164055e-05, + "loss": 2.6666, + "step": 3910 + }, + { + "epoch": 1.96, + "learning_rate": 3.326335597365698e-05, + "loss": 2.4959, + "step": 3915 + }, + { + "epoch": 1.97, + "learning_rate": 3.322613416256802e-05, + "loss": 2.8799, + "step": 3920 + }, + { + "epoch": 1.97, + "learning_rate": 3.3188891890953956e-05, + "loss": 2.6041, + "step": 3925 + }, + { + "epoch": 1.97, + "learning_rate": 3.315162925144595e-05, + "loss": 2.7423, + "step": 3930 + }, + { + "epoch": 1.97, + "learning_rate": 3.3114346336725834e-05, + "loss": 2.5765, + "step": 3935 + }, + { + "epoch": 1.98, + "learning_rate": 3.3077043239525874e-05, + "loss": 2.152, + "step": 3940 + }, + { + "epoch": 1.98, + "learning_rate": 3.303972005262852e-05, + "loss": 2.6474, + "step": 3945 + }, + { + "epoch": 1.98, + "learning_rate": 3.3002376868866206e-05, + "loss": 2.473, + "step": 3950 + }, + { + "epoch": 1.98, + "learning_rate": 3.296501378112109e-05, + "loss": 2.6779, + "step": 3955 + }, + { + "epoch": 1.99, + "learning_rate": 3.292763088232485e-05, + "loss": 2.7439, + "step": 3960 + }, + { + "epoch": 1.99, + "learning_rate": 3.289022826545844e-05, + "loss": 2.8874, + "step": 3965 + }, + { + "epoch": 1.99, + "learning_rate": 3.2852806023551834e-05, + "loss": 2.7191, + "step": 3970 + }, + { + "epoch": 1.99, + "learning_rate": 3.281536424968383e-05, + "loss": 2.563, + "step": 3975 + }, + { + "epoch": 2.0, + "learning_rate": 3.2777903036981836e-05, + "loss": 2.7884, + "step": 3980 + }, + { + "epoch": 2.0, + "learning_rate": 3.274042247862158e-05, + "loss": 2.5221, + "step": 3985 + }, + { + "epoch": 2.0, + "learning_rate": 3.270292266782689e-05, + "loss": 2.462, + 
"step": 3990 + }, + { + "epoch": 2.0, + "learning_rate": 3.266540369786953e-05, + "loss": 2.5059, + "step": 3995 + }, + { + "epoch": 2.01, + "learning_rate": 3.262786566206888e-05, + "loss": 2.688, + "step": 4000 + }, + { + "epoch": 2.01, + "learning_rate": 3.259030865379174e-05, + "loss": 2.7194, + "step": 4005 + }, + { + "epoch": 2.01, + "learning_rate": 3.255273276645214e-05, + "loss": 2.4145, + "step": 4010 + }, + { + "epoch": 2.01, + "learning_rate": 3.251513809351101e-05, + "loss": 2.5469, + "step": 4015 + }, + { + "epoch": 2.02, + "learning_rate": 3.247752472847605e-05, + "loss": 2.7916, + "step": 4020 + }, + { + "epoch": 2.02, + "learning_rate": 3.243989276490143e-05, + "loss": 2.5973, + "step": 4025 + }, + { + "epoch": 2.02, + "learning_rate": 3.240224229638758e-05, + "loss": 2.6732, + "step": 4030 + }, + { + "epoch": 2.02, + "learning_rate": 3.236457341658097e-05, + "loss": 2.2159, + "step": 4035 + }, + { + "epoch": 2.03, + "learning_rate": 3.232688621917386e-05, + "loss": 2.6783, + "step": 4040 + }, + { + "epoch": 2.03, + "learning_rate": 3.2289180797904055e-05, + "loss": 2.8193, + "step": 4045 + }, + { + "epoch": 2.03, + "learning_rate": 3.22514572465547e-05, + "loss": 2.3885, + "step": 4050 + }, + { + "epoch": 2.03, + "learning_rate": 3.221371565895404e-05, + "loss": 2.6288, + "step": 4055 + }, + { + "epoch": 2.04, + "learning_rate": 3.217595612897516e-05, + "loss": 2.5372, + "step": 4060 + }, + { + "epoch": 2.04, + "learning_rate": 3.21381787505358e-05, + "loss": 2.7257, + "step": 4065 + }, + { + "epoch": 2.04, + "learning_rate": 3.210038361759807e-05, + "loss": 2.6329, + "step": 4070 + }, + { + "epoch": 2.04, + "learning_rate": 3.206257082416825e-05, + "loss": 2.6518, + "step": 4075 + }, + { + "epoch": 2.05, + "learning_rate": 3.2024740464296544e-05, + "loss": 2.4218, + "step": 4080 + }, + { + "epoch": 2.05, + "learning_rate": 3.198689263207686e-05, + "loss": 2.7266, + "step": 4085 + }, + { + "epoch": 2.05, + "learning_rate": 3.1949027421646546e-05, 
+ "loss": 2.4466, + "step": 4090 + }, + { + "epoch": 2.05, + "learning_rate": 3.1911144927186185e-05, + "loss": 2.6014, + "step": 4095 + }, + { + "epoch": 2.06, + "learning_rate": 3.1873245242919354e-05, + "loss": 2.7072, + "step": 4100 + }, + { + "epoch": 2.06, + "learning_rate": 3.183532846311236e-05, + "loss": 2.5211, + "step": 4105 + }, + { + "epoch": 2.06, + "learning_rate": 3.179739468207406e-05, + "loss": 2.5787, + "step": 4110 + }, + { + "epoch": 2.06, + "learning_rate": 3.175944399415559e-05, + "loss": 2.5141, + "step": 4115 + }, + { + "epoch": 2.07, + "learning_rate": 3.1721476493750134e-05, + "loss": 2.6807, + "step": 4120 + }, + { + "epoch": 2.07, + "learning_rate": 3.1683492275292695e-05, + "loss": 2.5611, + "step": 4125 + }, + { + "epoch": 2.07, + "learning_rate": 3.164549143325985e-05, + "loss": 2.5864, + "step": 4130 + }, + { + "epoch": 2.08, + "learning_rate": 3.160747406216953e-05, + "loss": 2.7378, + "step": 4135 + }, + { + "epoch": 2.08, + "learning_rate": 3.156944025658079e-05, + "loss": 2.7334, + "step": 4140 + }, + { + "epoch": 2.08, + "learning_rate": 3.153139011109354e-05, + "loss": 2.4465, + "step": 4145 + }, + { + "epoch": 2.08, + "learning_rate": 3.149332372034834e-05, + "loss": 2.5482, + "step": 4150 + }, + { + "epoch": 2.09, + "learning_rate": 3.145524117902617e-05, + "loss": 2.7633, + "step": 4155 + }, + { + "epoch": 2.09, + "learning_rate": 3.141714258184816e-05, + "loss": 2.6796, + "step": 4160 + }, + { + "epoch": 2.09, + "learning_rate": 3.137902802357538e-05, + "loss": 2.6622, + "step": 4165 + }, + { + "epoch": 2.09, + "learning_rate": 3.134089759900861e-05, + "loss": 2.9277, + "step": 4170 + }, + { + "epoch": 2.1, + "learning_rate": 3.130275140298808e-05, + "loss": 2.351, + "step": 4175 + }, + { + "epoch": 2.1, + "learning_rate": 3.1264589530393263e-05, + "loss": 2.654, + "step": 4180 + }, + { + "epoch": 2.1, + "learning_rate": 3.12264120761426e-05, + "loss": 2.6229, + "step": 4185 + }, + { + "epoch": 2.1, + "learning_rate": 
3.118821913519333e-05, + "loss": 2.6415, + "step": 4190 + }, + { + "epoch": 2.11, + "learning_rate": 3.115001080254115e-05, + "loss": 2.6029, + "step": 4195 + }, + { + "epoch": 2.11, + "learning_rate": 3.1111787173220095e-05, + "loss": 2.6775, + "step": 4200 + }, + { + "epoch": 2.11, + "learning_rate": 3.107354834230223e-05, + "loss": 2.7393, + "step": 4205 + }, + { + "epoch": 2.11, + "learning_rate": 3.1035294404897396e-05, + "loss": 2.5886, + "step": 4210 + }, + { + "epoch": 2.12, + "learning_rate": 3.099702545615307e-05, + "loss": 2.3812, + "step": 4215 + }, + { + "epoch": 2.12, + "learning_rate": 3.0958741591254026e-05, + "loss": 3.1271, + "step": 4220 + }, + { + "epoch": 2.12, + "learning_rate": 3.0920442905422145e-05, + "loss": 2.7758, + "step": 4225 + }, + { + "epoch": 2.12, + "learning_rate": 3.0882129493916167e-05, + "loss": 2.7682, + "step": 4230 + }, + { + "epoch": 2.13, + "learning_rate": 3.0843801452031466e-05, + "loss": 2.2826, + "step": 4235 + }, + { + "epoch": 2.13, + "learning_rate": 3.0805458875099804e-05, + "loss": 2.5934, + "step": 4240 + }, + { + "epoch": 2.13, + "learning_rate": 3.0767101858489103e-05, + "loss": 2.7693, + "step": 4245 + }, + { + "epoch": 2.13, + "learning_rate": 3.072873049760319e-05, + "loss": 2.6955, + "step": 4250 + }, + { + "epoch": 2.14, + "learning_rate": 3.0690344887881565e-05, + "loss": 2.7344, + "step": 4255 + }, + { + "epoch": 2.14, + "learning_rate": 3.0651945124799185e-05, + "loss": 2.6215, + "step": 4260 + }, + { + "epoch": 2.14, + "learning_rate": 3.061353130386619e-05, + "loss": 2.5794, + "step": 4265 + }, + { + "epoch": 2.14, + "learning_rate": 3.05751035206277e-05, + "loss": 2.671, + "step": 4270 + }, + { + "epoch": 2.15, + "learning_rate": 3.0536661870663576e-05, + "loss": 2.7265, + "step": 4275 + }, + { + "epoch": 2.15, + "learning_rate": 3.0498206449588136e-05, + "loss": 2.9018, + "step": 4280 + }, + { + "epoch": 2.15, + "learning_rate": 3.045973735304996e-05, + "loss": 2.4823, + "step": 4285 + }, + { + 
"epoch": 2.15, + "learning_rate": 3.0421254676731664e-05, + "loss": 2.5454, + "step": 4290 + }, + { + "epoch": 2.16, + "learning_rate": 3.0382758516349623e-05, + "loss": 2.7888, + "step": 4295 + }, + { + "epoch": 2.16, + "learning_rate": 3.0344248967653754e-05, + "loss": 2.596, + "step": 4300 + }, + { + "epoch": 2.16, + "learning_rate": 3.030572612642727e-05, + "loss": 2.7905, + "step": 4305 + }, + { + "epoch": 2.16, + "learning_rate": 3.0267190088486452e-05, + "loss": 2.5676, + "step": 4310 + }, + { + "epoch": 2.17, + "learning_rate": 3.0228640949680388e-05, + "loss": 2.5836, + "step": 4315 + }, + { + "epoch": 2.17, + "learning_rate": 3.0190078805890786e-05, + "loss": 2.7138, + "step": 4320 + }, + { + "epoch": 2.17, + "learning_rate": 3.015150375303168e-05, + "loss": 2.4747, + "step": 4325 + }, + { + "epoch": 2.17, + "learning_rate": 3.011291588704919e-05, + "loss": 2.5188, + "step": 4330 + }, + { + "epoch": 2.18, + "learning_rate": 3.0074315303921353e-05, + "loss": 2.2952, + "step": 4335 + }, + { + "epoch": 2.18, + "learning_rate": 3.0035702099657787e-05, + "loss": 2.4875, + "step": 4340 + }, + { + "epoch": 2.18, + "learning_rate": 2.9997076370299544e-05, + "loss": 2.5085, + "step": 4345 + }, + { + "epoch": 2.18, + "learning_rate": 2.9958438211918805e-05, + "loss": 2.7452, + "step": 4350 + }, + { + "epoch": 2.19, + "learning_rate": 2.9919787720618676e-05, + "loss": 2.334, + "step": 4355 + }, + { + "epoch": 2.19, + "learning_rate": 2.9881124992532933e-05, + "loss": 2.6904, + "step": 4360 + }, + { + "epoch": 2.19, + "learning_rate": 2.984245012382579e-05, + "loss": 2.5706, + "step": 4365 + }, + { + "epoch": 2.19, + "learning_rate": 2.980376321069165e-05, + "loss": 2.6657, + "step": 4370 + }, + { + "epoch": 2.2, + "learning_rate": 2.976506434935489e-05, + "loss": 2.4117, + "step": 4375 + }, + { + "epoch": 2.2, + "learning_rate": 2.9726353636069582e-05, + "loss": 2.4446, + "step": 4380 + }, + { + "epoch": 2.2, + "learning_rate": 2.968763116711931e-05, + "loss": 
2.5681, + "step": 4385 + }, + { + "epoch": 2.2, + "learning_rate": 2.964889703881686e-05, + "loss": 2.7244, + "step": 4390 + }, + { + "epoch": 2.21, + "learning_rate": 2.9610151347504044e-05, + "loss": 2.577, + "step": 4395 + }, + { + "epoch": 2.21, + "learning_rate": 2.957139418955143e-05, + "loss": 2.6176, + "step": 4400 + }, + { + "epoch": 2.21, + "learning_rate": 2.9532625661358105e-05, + "loss": 2.5708, + "step": 4405 + }, + { + "epoch": 2.21, + "learning_rate": 2.949384585935143e-05, + "loss": 2.4873, + "step": 4410 + }, + { + "epoch": 2.22, + "learning_rate": 2.9455054879986797e-05, + "loss": 2.5953, + "step": 4415 + }, + { + "epoch": 2.22, + "learning_rate": 2.941625281974743e-05, + "loss": 2.5578, + "step": 4420 + }, + { + "epoch": 2.22, + "learning_rate": 2.93774397751441e-05, + "loss": 2.5782, + "step": 4425 + }, + { + "epoch": 2.22, + "learning_rate": 2.9338615842714883e-05, + "loss": 2.8227, + "step": 4430 + }, + { + "epoch": 2.23, + "learning_rate": 2.9299781119024956e-05, + "loss": 2.4317, + "step": 4435 + }, + { + "epoch": 2.23, + "learning_rate": 2.926093570066633e-05, + "loss": 2.7054, + "step": 4440 + }, + { + "epoch": 2.23, + "learning_rate": 2.9222079684257608e-05, + "loss": 2.7207, + "step": 4445 + }, + { + "epoch": 2.23, + "learning_rate": 2.9183213166443778e-05, + "loss": 2.6732, + "step": 4450 + }, + { + "epoch": 2.24, + "learning_rate": 2.9144336243895927e-05, + "loss": 2.4768, + "step": 4455 + }, + { + "epoch": 2.24, + "learning_rate": 2.910544901331101e-05, + "loss": 2.5146, + "step": 4460 + }, + { + "epoch": 2.24, + "learning_rate": 2.9066551571411645e-05, + "loss": 2.4941, + "step": 4465 + }, + { + "epoch": 2.24, + "learning_rate": 2.902764401494584e-05, + "loss": 2.1425, + "step": 4470 + }, + { + "epoch": 2.25, + "learning_rate": 2.898872644068676e-05, + "loss": 2.6335, + "step": 4475 + }, + { + "epoch": 2.25, + "learning_rate": 2.8949798945432483e-05, + "loss": 2.4093, + "step": 4480 + }, + { + "epoch": 2.25, + "learning_rate": 
2.8910861626005776e-05, + "loss": 2.4006, + "step": 4485 + }, + { + "epoch": 2.25, + "learning_rate": 2.8871914579253824e-05, + "loss": 2.4762, + "step": 4490 + }, + { + "epoch": 2.26, + "learning_rate": 2.8832957902048014e-05, + "loss": 2.6309, + "step": 4495 + }, + { + "epoch": 2.26, + "learning_rate": 2.8793991691283684e-05, + "loss": 2.7734, + "step": 4500 + }, + { + "epoch": 2.26, + "learning_rate": 2.87550160438799e-05, + "loss": 2.5444, + "step": 4505 + }, + { + "epoch": 2.26, + "learning_rate": 2.871603105677917e-05, + "loss": 2.5277, + "step": 4510 + }, + { + "epoch": 2.27, + "learning_rate": 2.8677036826947264e-05, + "loss": 2.5775, + "step": 4515 + }, + { + "epoch": 2.27, + "learning_rate": 2.863803345137291e-05, + "loss": 2.4524, + "step": 4520 + }, + { + "epoch": 2.27, + "learning_rate": 2.8599021027067608e-05, + "loss": 2.572, + "step": 4525 + }, + { + "epoch": 2.27, + "learning_rate": 2.855999965106536e-05, + "loss": 2.8112, + "step": 4530 + }, + { + "epoch": 2.28, + "learning_rate": 2.8520969420422427e-05, + "loss": 2.65, + "step": 4535 + }, + { + "epoch": 2.28, + "learning_rate": 2.8481930432217096e-05, + "loss": 2.4675, + "step": 4540 + }, + { + "epoch": 2.28, + "learning_rate": 2.844288278354943e-05, + "loss": 2.668, + "step": 4545 + }, + { + "epoch": 2.28, + "learning_rate": 2.8403826571541046e-05, + "loss": 2.7135, + "step": 4550 + }, + { + "epoch": 2.29, + "learning_rate": 2.8364761893334858e-05, + "loss": 2.5212, + "step": 4555 + }, + { + "epoch": 2.29, + "learning_rate": 2.8325688846094817e-05, + "loss": 2.698, + "step": 4560 + }, + { + "epoch": 2.29, + "learning_rate": 2.828660752700572e-05, + "loss": 2.6517, + "step": 4565 + }, + { + "epoch": 2.29, + "learning_rate": 2.8247518033272924e-05, + "loss": 2.6687, + "step": 4570 + }, + { + "epoch": 2.3, + "learning_rate": 2.8208420462122105e-05, + "loss": 2.4934, + "step": 4575 + }, + { + "epoch": 2.3, + "learning_rate": 2.816931491079906e-05, + "loss": 2.3762, + "step": 4580 + }, + { + "epoch": 
2.3, + "learning_rate": 2.8130201476569413e-05, + "loss": 2.6873, + "step": 4585 + }, + { + "epoch": 2.3, + "learning_rate": 2.8091080256718398e-05, + "loss": 2.6981, + "step": 4590 + }, + { + "epoch": 2.31, + "learning_rate": 2.805195134855061e-05, + "loss": 2.6814, + "step": 4595 + }, + { + "epoch": 2.31, + "learning_rate": 2.8012814849389785e-05, + "loss": 2.6641, + "step": 4600 + }, + { + "epoch": 2.31, + "learning_rate": 2.797367085657852e-05, + "loss": 2.488, + "step": 4605 + }, + { + "epoch": 2.31, + "learning_rate": 2.793451946747806e-05, + "loss": 2.4358, + "step": 4610 + }, + { + "epoch": 2.32, + "learning_rate": 2.7895360779468044e-05, + "loss": 2.7458, + "step": 4615 + }, + { + "epoch": 2.32, + "learning_rate": 2.785619488994627e-05, + "loss": 2.7558, + "step": 4620 + }, + { + "epoch": 2.32, + "learning_rate": 2.7817021896328427e-05, + "loss": 2.6906, + "step": 4625 + }, + { + "epoch": 2.32, + "learning_rate": 2.7777841896047914e-05, + "loss": 2.5384, + "step": 4630 + }, + { + "epoch": 2.33, + "learning_rate": 2.7738654986555523e-05, + "loss": 2.8354, + "step": 4635 + }, + { + "epoch": 2.33, + "learning_rate": 2.7699461265319242e-05, + "loss": 2.6352, + "step": 4640 + }, + { + "epoch": 2.33, + "learning_rate": 2.7660260829824003e-05, + "loss": 2.5602, + "step": 4645 + }, + { + "epoch": 2.33, + "learning_rate": 2.7621053777571425e-05, + "loss": 2.7109, + "step": 4650 + }, + { + "epoch": 2.34, + "learning_rate": 2.7581840206079616e-05, + "loss": 2.7317, + "step": 4655 + }, + { + "epoch": 2.34, + "learning_rate": 2.754262021288287e-05, + "loss": 2.6874, + "step": 4660 + }, + { + "epoch": 2.34, + "learning_rate": 2.750339389553146e-05, + "loss": 2.5854, + "step": 4665 + }, + { + "epoch": 2.34, + "learning_rate": 2.7464161351591393e-05, + "loss": 2.4774, + "step": 4670 + }, + { + "epoch": 2.35, + "learning_rate": 2.7424922678644172e-05, + "loss": 2.3301, + "step": 4675 + }, + { + "epoch": 2.35, + "learning_rate": 2.7385677974286517e-05, + "loss": 2.8886, + 
"step": 4680 + }, + { + "epoch": 2.35, + "learning_rate": 2.7346427336130175e-05, + "loss": 2.4401, + "step": 4685 + }, + { + "epoch": 2.35, + "learning_rate": 2.7307170861801644e-05, + "loss": 2.4788, + "step": 4690 + }, + { + "epoch": 2.36, + "learning_rate": 2.7267908648941938e-05, + "loss": 2.4079, + "step": 4695 + }, + { + "epoch": 2.36, + "learning_rate": 2.7228640795206344e-05, + "loss": 2.3763, + "step": 4700 + }, + { + "epoch": 2.36, + "learning_rate": 2.718936739826417e-05, + "loss": 2.4751, + "step": 4705 + }, + { + "epoch": 2.36, + "learning_rate": 2.7150088555798537e-05, + "loss": 2.5827, + "step": 4710 + }, + { + "epoch": 2.37, + "learning_rate": 2.7110804365506083e-05, + "loss": 2.8181, + "step": 4715 + }, + { + "epoch": 2.37, + "learning_rate": 2.7071514925096762e-05, + "loss": 2.5951, + "step": 4720 + }, + { + "epoch": 2.37, + "learning_rate": 2.703222033229359e-05, + "loss": 2.8812, + "step": 4725 + }, + { + "epoch": 2.37, + "learning_rate": 2.6992920684832374e-05, + "loss": 2.8793, + "step": 4730 + }, + { + "epoch": 2.38, + "learning_rate": 2.6953616080461526e-05, + "loss": 2.7234, + "step": 4735 + }, + { + "epoch": 2.38, + "learning_rate": 2.6914306616941764e-05, + "loss": 2.566, + "step": 4740 + }, + { + "epoch": 2.38, + "learning_rate": 2.68749923920459e-05, + "loss": 2.7924, + "step": 4745 + }, + { + "epoch": 2.38, + "learning_rate": 2.683567350355859e-05, + "loss": 2.5467, + "step": 4750 + }, + { + "epoch": 2.39, + "learning_rate": 2.679635004927608e-05, + "loss": 2.8032, + "step": 4755 + }, + { + "epoch": 2.39, + "learning_rate": 2.6757022127006e-05, + "loss": 2.4976, + "step": 4760 + }, + { + "epoch": 2.39, + "learning_rate": 2.6717689834567055e-05, + "loss": 2.7968, + "step": 4765 + }, + { + "epoch": 2.39, + "learning_rate": 2.6678353269788854e-05, + "loss": 2.4099, + "step": 4770 + }, + { + "epoch": 2.4, + "learning_rate": 2.66390125305116e-05, + "loss": 2.4246, + "step": 4775 + }, + { + "epoch": 2.4, + "learning_rate": 
2.659966771458589e-05, + "loss": 2.7296, + "step": 4780 + }, + { + "epoch": 2.4, + "learning_rate": 2.656031891987249e-05, + "loss": 2.5013, + "step": 4785 + }, + { + "epoch": 2.4, + "learning_rate": 2.6520966244242024e-05, + "loss": 2.4029, + "step": 4790 + }, + { + "epoch": 2.41, + "learning_rate": 2.648160978557479e-05, + "loss": 2.405, + "step": 4795 + }, + { + "epoch": 2.41, + "learning_rate": 2.644224964176048e-05, + "loss": 2.5121, + "step": 4800 + }, + { + "epoch": 2.41, + "learning_rate": 2.6402885910697966e-05, + "loss": 2.7865, + "step": 4805 + }, + { + "epoch": 2.41, + "learning_rate": 2.6363518690295035e-05, + "loss": 2.3725, + "step": 4810 + }, + { + "epoch": 2.42, + "learning_rate": 2.632414807846816e-05, + "loss": 2.612, + "step": 4815 + }, + { + "epoch": 2.42, + "learning_rate": 2.6284774173142233e-05, + "loss": 2.5412, + "step": 4820 + }, + { + "epoch": 2.42, + "learning_rate": 2.624539707225036e-05, + "loss": 2.4828, + "step": 4825 + }, + { + "epoch": 2.42, + "learning_rate": 2.6206016873733574e-05, + "loss": 2.6912, + "step": 4830 + }, + { + "epoch": 2.43, + "learning_rate": 2.6166633675540635e-05, + "loss": 2.9305, + "step": 4835 + }, + { + "epoch": 2.43, + "learning_rate": 2.612724757562775e-05, + "loss": 2.4224, + "step": 4840 + }, + { + "epoch": 2.43, + "learning_rate": 2.608785867195834e-05, + "loss": 2.448, + "step": 4845 + }, + { + "epoch": 2.43, + "learning_rate": 2.60484670625028e-05, + "loss": 2.356, + "step": 4850 + }, + { + "epoch": 2.44, + "learning_rate": 2.600907284523827e-05, + "loss": 2.3987, + "step": 4855 + }, + { + "epoch": 2.44, + "learning_rate": 2.5969676118148358e-05, + "loss": 1.9985, + "step": 4860 + }, + { + "epoch": 2.44, + "learning_rate": 2.5930276979222927e-05, + "loss": 2.7017, + "step": 4865 + }, + { + "epoch": 2.44, + "learning_rate": 2.5890875526457836e-05, + "loss": 2.5223, + "step": 4870 + }, + { + "epoch": 2.45, + "learning_rate": 2.5851471857854697e-05, + "loss": 2.5558, + "step": 4875 + }, + { + "epoch": 
2.45, + "learning_rate": 2.5812066071420632e-05, + "loss": 2.3048, + "step": 4880 + }, + { + "epoch": 2.45, + "learning_rate": 2.5772658265168025e-05, + "loss": 2.6089, + "step": 4885 + }, + { + "epoch": 2.45, + "learning_rate": 2.5733248537114306e-05, + "loss": 2.6711, + "step": 4890 + }, + { + "epoch": 2.46, + "learning_rate": 2.569383698528166e-05, + "loss": 2.6202, + "step": 4895 + }, + { + "epoch": 2.46, + "learning_rate": 2.5654423707696833e-05, + "loss": 2.6619, + "step": 4900 + }, + { + "epoch": 2.46, + "learning_rate": 2.5615008802390834e-05, + "loss": 2.6442, + "step": 4905 + }, + { + "epoch": 2.46, + "learning_rate": 2.5575592367398733e-05, + "loss": 2.7031, + "step": 4910 + }, + { + "epoch": 2.47, + "learning_rate": 2.5536174500759417e-05, + "loss": 2.5043, + "step": 4915 + }, + { + "epoch": 2.47, + "learning_rate": 2.5496755300515323e-05, + "loss": 2.3442, + "step": 4920 + }, + { + "epoch": 2.47, + "learning_rate": 2.5457334864712206e-05, + "loss": 2.7005, + "step": 4925 + }, + { + "epoch": 2.47, + "learning_rate": 2.5417913291398892e-05, + "loss": 2.8544, + "step": 4930 + }, + { + "epoch": 2.48, + "learning_rate": 2.5378490678627043e-05, + "loss": 2.3004, + "step": 4935 + }, + { + "epoch": 2.48, + "learning_rate": 2.5339067124450887e-05, + "loss": 2.527, + "step": 4940 + }, + { + "epoch": 2.48, + "learning_rate": 2.5299642726927035e-05, + "loss": 2.689, + "step": 4945 + }, + { + "epoch": 2.48, + "learning_rate": 2.526021758411415e-05, + "loss": 2.7824, + "step": 4950 + }, + { + "epoch": 2.49, + "learning_rate": 2.5220791794072774e-05, + "loss": 2.5121, + "step": 4955 + }, + { + "epoch": 2.49, + "learning_rate": 2.518136545486504e-05, + "loss": 2.3836, + "step": 4960 + }, + { + "epoch": 2.49, + "learning_rate": 2.5141938664554482e-05, + "loss": 2.6548, + "step": 4965 + }, + { + "epoch": 2.49, + "learning_rate": 2.5102511521205718e-05, + "loss": 2.5556, + "step": 4970 + }, + { + "epoch": 2.5, + "learning_rate": 2.5063084122884267e-05, + "loss": 2.4746, 
+ "step": 4975 + }, + { + "epoch": 2.5, + "learning_rate": 2.5023656567656272e-05, + "loss": 2.5669, + "step": 4980 + }, + { + "epoch": 2.5, + "learning_rate": 2.498422895358827e-05, + "loss": 2.4125, + "step": 4985 + }, + { + "epoch": 2.5, + "learning_rate": 2.4944801378746935e-05, + "loss": 2.6531, + "step": 4990 + }, + { + "epoch": 2.51, + "learning_rate": 2.4905373941198864e-05, + "loss": 2.7697, + "step": 4995 + }, + { + "epoch": 2.51, + "learning_rate": 2.48659467390103e-05, + "loss": 2.5167, + "step": 5000 + }, + { + "epoch": 2.51, + "learning_rate": 2.4826519870246897e-05, + "loss": 2.6029, + "step": 5005 + }, + { + "epoch": 2.51, + "learning_rate": 2.478709343297349e-05, + "loss": 2.3878, + "step": 5010 + }, + { + "epoch": 2.52, + "learning_rate": 2.4747667525253825e-05, + "loss": 2.6694, + "step": 5015 + }, + { + "epoch": 2.52, + "learning_rate": 2.470824224515034e-05, + "loss": 2.606, + "step": 5020 + }, + { + "epoch": 2.52, + "learning_rate": 2.466881769072392e-05, + "loss": 2.6548, + "step": 5025 + }, + { + "epoch": 2.52, + "learning_rate": 2.4629393960033633e-05, + "loss": 2.2612, + "step": 5030 + }, + { + "epoch": 2.53, + "learning_rate": 2.4589971151136503e-05, + "loss": 2.6507, + "step": 5035 + }, + { + "epoch": 2.53, + "learning_rate": 2.4550549362087258e-05, + "loss": 2.4926, + "step": 5040 + }, + { + "epoch": 2.53, + "learning_rate": 2.45111286909381e-05, + "loss": 2.4079, + "step": 5045 + }, + { + "epoch": 2.53, + "learning_rate": 2.447170923573844e-05, + "loss": 2.5999, + "step": 5050 + }, + { + "epoch": 2.54, + "learning_rate": 2.443229109453467e-05, + "loss": 2.4915, + "step": 5055 + }, + { + "epoch": 2.54, + "learning_rate": 2.43928743653699e-05, + "loss": 2.4626, + "step": 5060 + }, + { + "epoch": 2.54, + "learning_rate": 2.4353459146283743e-05, + "loss": 2.405, + "step": 5065 + }, + { + "epoch": 2.54, + "learning_rate": 2.431404553531206e-05, + "loss": 2.857, + "step": 5070 + }, + { + "epoch": 2.55, + "learning_rate": 
2.42746336304867e-05, + "loss": 2.8089, + "step": 5075 + }, + { + "epoch": 2.55, + "learning_rate": 2.423522352983527e-05, + "loss": 2.5966, + "step": 5080 + }, + { + "epoch": 2.55, + "learning_rate": 2.41958153313809e-05, + "loss": 2.4951, + "step": 5085 + }, + { + "epoch": 2.55, + "learning_rate": 2.4156409133141967e-05, + "loss": 2.519, + "step": 5090 + }, + { + "epoch": 2.56, + "learning_rate": 2.4117005033131894e-05, + "loss": 2.4516, + "step": 5095 + }, + { + "epoch": 2.56, + "learning_rate": 2.40776031293589e-05, + "loss": 2.4225, + "step": 5100 + }, + { + "epoch": 2.56, + "learning_rate": 2.4038203519825676e-05, + "loss": 2.47, + "step": 5105 + }, + { + "epoch": 2.56, + "learning_rate": 2.399880630252928e-05, + "loss": 2.4996, + "step": 5110 + }, + { + "epoch": 2.57, + "learning_rate": 2.3959411575460777e-05, + "loss": 2.4737, + "step": 5115 + }, + { + "epoch": 2.57, + "learning_rate": 2.392001943660506e-05, + "loss": 2.636, + "step": 5120 + }, + { + "epoch": 2.57, + "learning_rate": 2.3880629983940572e-05, + "loss": 2.5773, + "step": 5125 + }, + { + "epoch": 2.57, + "learning_rate": 2.3841243315439077e-05, + "loss": 2.7459, + "step": 5130 + }, + { + "epoch": 2.58, + "learning_rate": 2.3801859529065416e-05, + "loss": 2.1073, + "step": 5135 + }, + { + "epoch": 2.58, + "learning_rate": 2.3762478722777264e-05, + "loss": 2.3601, + "step": 5140 + }, + { + "epoch": 2.58, + "learning_rate": 2.3723100994524873e-05, + "loss": 2.5435, + "step": 5145 + }, + { + "epoch": 2.58, + "learning_rate": 2.3683726442250853e-05, + "loss": 2.3054, + "step": 5150 + }, + { + "epoch": 2.59, + "learning_rate": 2.3644355163889908e-05, + "loss": 2.6588, + "step": 5155 + }, + { + "epoch": 2.59, + "learning_rate": 2.3604987257368596e-05, + "loss": 2.6469, + "step": 5160 + }, + { + "epoch": 2.59, + "learning_rate": 2.3565622820605096e-05, + "loss": 2.4276, + "step": 5165 + }, + { + "epoch": 2.59, + "learning_rate": 2.3526261951508947e-05, + "loss": 2.8147, + "step": 5170 + }, + { + 
"epoch": 2.6, + "learning_rate": 2.3486904747980817e-05, + "loss": 2.5942, + "step": 5175 + }, + { + "epoch": 2.6, + "learning_rate": 2.344755130791227e-05, + "loss": 2.7043, + "step": 5180 + }, + { + "epoch": 2.6, + "learning_rate": 2.340820172918549e-05, + "loss": 2.7551, + "step": 5185 + }, + { + "epoch": 2.6, + "learning_rate": 2.336885610967308e-05, + "loss": 2.4544, + "step": 5190 + }, + { + "epoch": 2.61, + "learning_rate": 2.3329514547237757e-05, + "loss": 2.5914, + "step": 5195 + }, + { + "epoch": 2.61, + "learning_rate": 2.3290177139732186e-05, + "loss": 2.5679, + "step": 5200 + }, + { + "epoch": 2.61, + "learning_rate": 2.325084398499868e-05, + "loss": 2.5207, + "step": 5205 + }, + { + "epoch": 2.61, + "learning_rate": 2.3211515180868972e-05, + "loss": 2.4572, + "step": 5210 + }, + { + "epoch": 2.62, + "learning_rate": 2.3172190825163987e-05, + "loss": 2.6281, + "step": 5215 + }, + { + "epoch": 2.62, + "learning_rate": 2.3132871015693566e-05, + "loss": 2.5488, + "step": 5220 + }, + { + "epoch": 2.62, + "learning_rate": 2.309355585025627e-05, + "loss": 2.74, + "step": 5225 + }, + { + "epoch": 2.62, + "learning_rate": 2.3054245426639078e-05, + "loss": 2.5737, + "step": 5230 + }, + { + "epoch": 2.63, + "learning_rate": 2.3014939842617197e-05, + "loss": 2.5987, + "step": 5235 + }, + { + "epoch": 2.63, + "learning_rate": 2.297563919595379e-05, + "loss": 2.5215, + "step": 5240 + }, + { + "epoch": 2.63, + "learning_rate": 2.293634358439973e-05, + "loss": 2.4514, + "step": 5245 + }, + { + "epoch": 2.63, + "learning_rate": 2.289705310569338e-05, + "loss": 2.8411, + "step": 5250 + }, + { + "epoch": 2.64, + "learning_rate": 2.2857767857560337e-05, + "loss": 2.9463, + "step": 5255 + }, + { + "epoch": 2.64, + "learning_rate": 2.281848793771318e-05, + "loss": 2.6361, + "step": 5260 + }, + { + "epoch": 2.64, + "learning_rate": 2.2779213443851233e-05, + "loss": 2.4919, + "step": 5265 + }, + { + "epoch": 2.64, + "learning_rate": 2.2739944473660332e-05, + "loss": 2.3893, 
+ "step": 5270 + }, + { + "epoch": 2.65, + "learning_rate": 2.2700681124812574e-05, + "loss": 2.6469, + "step": 5275 + }, + { + "epoch": 2.65, + "learning_rate": 2.2661423494966074e-05, + "loss": 2.7062, + "step": 5280 + }, + { + "epoch": 2.65, + "learning_rate": 2.2622171681764706e-05, + "loss": 2.6681, + "step": 5285 + }, + { + "epoch": 2.65, + "learning_rate": 2.2582925782837898e-05, + "loss": 2.5944, + "step": 5290 + }, + { + "epoch": 2.66, + "learning_rate": 2.254368589580036e-05, + "loss": 2.6699, + "step": 5295 + }, + { + "epoch": 2.66, + "learning_rate": 2.250445211825185e-05, + "loss": 2.7243, + "step": 5300 + }, + { + "epoch": 2.66, + "learning_rate": 2.2465224547776934e-05, + "loss": 2.5652, + "step": 5305 + }, + { + "epoch": 2.66, + "learning_rate": 2.2426003281944725e-05, + "loss": 2.4402, + "step": 5310 + }, + { + "epoch": 2.67, + "learning_rate": 2.238678841830867e-05, + "loss": 2.5059, + "step": 5315 + }, + { + "epoch": 2.67, + "learning_rate": 2.234758005440628e-05, + "loss": 2.5083, + "step": 5320 + }, + { + "epoch": 2.67, + "learning_rate": 2.2308378287758906e-05, + "loss": 2.3585, + "step": 5325 + }, + { + "epoch": 2.67, + "learning_rate": 2.22691832158715e-05, + "loss": 2.8297, + "step": 5330 + }, + { + "epoch": 2.68, + "learning_rate": 2.2229994936232346e-05, + "loss": 2.4266, + "step": 5335 + }, + { + "epoch": 2.68, + "learning_rate": 2.219081354631284e-05, + "loss": 2.3729, + "step": 5340 + }, + { + "epoch": 2.68, + "learning_rate": 2.2151639143567236e-05, + "loss": 2.4558, + "step": 5345 + }, + { + "epoch": 2.68, + "learning_rate": 2.211247182543242e-05, + "loss": 2.5797, + "step": 5350 + }, + { + "epoch": 2.69, + "learning_rate": 2.2073311689327648e-05, + "loss": 2.4579, + "step": 5355 + }, + { + "epoch": 2.69, + "learning_rate": 2.203415883265432e-05, + "loss": 2.5511, + "step": 5360 + }, + { + "epoch": 2.69, + "learning_rate": 2.1995013352795725e-05, + "loss": 2.4825, + "step": 5365 + }, + { + "epoch": 2.69, + "learning_rate": 
2.1955875347116808e-05, + "loss": 2.4933, + "step": 5370 + }, + { + "epoch": 2.7, + "learning_rate": 2.191674491296391e-05, + "loss": 2.434, + "step": 5375 + }, + { + "epoch": 2.7, + "learning_rate": 2.187762214766456e-05, + "loss": 2.5205, + "step": 5380 + }, + { + "epoch": 2.7, + "learning_rate": 2.1838507148527197e-05, + "loss": 2.5461, + "step": 5385 + }, + { + "epoch": 2.7, + "learning_rate": 2.179940001284095e-05, + "loss": 2.4265, + "step": 5390 + }, + { + "epoch": 2.71, + "learning_rate": 2.176030083787539e-05, + "loss": 2.6166, + "step": 5395 + }, + { + "epoch": 2.71, + "learning_rate": 2.1721209720880277e-05, + "loss": 2.5101, + "step": 5400 + }, + { + "epoch": 2.71, + "learning_rate": 2.168212675908536e-05, + "loss": 2.4824, + "step": 5405 + }, + { + "epoch": 2.71, + "learning_rate": 2.1643052049700066e-05, + "loss": 2.347, + "step": 5410 + }, + { + "epoch": 2.72, + "learning_rate": 2.1603985689913317e-05, + "loss": 2.5476, + "step": 5415 + }, + { + "epoch": 2.72, + "learning_rate": 2.1564927776893264e-05, + "loss": 2.456, + "step": 5420 + }, + { + "epoch": 2.72, + "learning_rate": 2.152587840778704e-05, + "loss": 2.4415, + "step": 5425 + }, + { + "epoch": 2.72, + "learning_rate": 2.1486837679720535e-05, + "loss": 2.7548, + "step": 5430 + }, + { + "epoch": 2.73, + "learning_rate": 2.144780568979816e-05, + "loss": 2.3779, + "step": 5435 + }, + { + "epoch": 2.73, + "learning_rate": 2.1408782535102566e-05, + "loss": 2.5284, + "step": 5440 + }, + { + "epoch": 2.73, + "learning_rate": 2.136976831269444e-05, + "loss": 2.6886, + "step": 5445 + }, + { + "epoch": 2.73, + "learning_rate": 2.133076311961226e-05, + "loss": 2.5138, + "step": 5450 + }, + { + "epoch": 2.74, + "learning_rate": 2.1291767052872035e-05, + "loss": 2.664, + "step": 5455 + }, + { + "epoch": 2.74, + "learning_rate": 2.1252780209467073e-05, + "loss": 2.4325, + "step": 5460 + }, + { + "epoch": 2.74, + "learning_rate": 2.121380268636775e-05, + "loss": 2.5471, + "step": 5465 + }, + { + "epoch": 
2.74, + "learning_rate": 2.1174834580521253e-05, + "loss": 2.4363, + "step": 5470 + }, + { + "epoch": 2.75, + "learning_rate": 2.113587598885135e-05, + "loss": 2.6272, + "step": 5475 + }, + { + "epoch": 2.75, + "learning_rate": 2.109692700825814e-05, + "loss": 2.5261, + "step": 5480 + }, + { + "epoch": 2.75, + "learning_rate": 2.105798773561783e-05, + "loss": 2.9262, + "step": 5485 + }, + { + "epoch": 2.75, + "learning_rate": 2.101905826778246e-05, + "loss": 2.3964, + "step": 5490 + }, + { + "epoch": 2.76, + "learning_rate": 2.0980138701579704e-05, + "loss": 2.6239, + "step": 5495 + }, + { + "epoch": 2.76, + "learning_rate": 2.0941229133812593e-05, + "loss": 2.53, + "step": 5500 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 2.9096831187399475e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5500/training_args.bin b/checkpoint-5500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-5500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475 diff --git a/checkpoint-6000/README.md b/checkpoint-6000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-6000/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More 
Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-6000/adapter_config.json b/checkpoint-6000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-6000/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-6000/adapter_model.safetensors b/checkpoint-6000/adapter_model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..b001ad0d11472099a08080aa3c5d7cdf15342790 --- /dev/null +++ b/checkpoint-6000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dad0444094c174f5a7dae5097f63542d7d9b4aab492883ed281e7cc1d5e7db5 +size 7807744 diff --git a/checkpoint-6000/optimizer.pt b/checkpoint-6000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..24cfce97704293833eb030a9b87843d3c45c0954 --- /dev/null +++ b/checkpoint-6000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f31be04aecacbe2ca94fcd64fc612a9eac6cc74fe0d09306b1e414277e841ab +size 15644485 diff --git a/checkpoint-6000/rng_state.pth b/checkpoint-6000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7841aec05d27174353c30114e4a0588c0a14d59c --- /dev/null +++ b/checkpoint-6000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeadcd36465b189eced59f640409c6b3316913827e101967470720b90c3eb8c2 +size 14575 diff --git a/checkpoint-6000/scheduler.pt b/checkpoint-6000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f7771e4409f177da7d9bc1523fb10b1a2836741 --- /dev/null +++ b/checkpoint-6000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:131d06786b2c4990bbe0b740d22c77341397abcf6540bba055dc036655eac60b +size 627 diff --git a/checkpoint-6000/special_tokens_map.json b/checkpoint-6000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-6000/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false 
+ } + ] +} diff --git a/checkpoint-6000/tokenization_chatglm.py b/checkpoint-6000/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-6000/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) 
+ return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + 
def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. 
+ + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). 
+ return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-6000/tokenizer.model b/checkpoint-6000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-6000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-6000/tokenizer_config.json b/checkpoint-6000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ 
b/checkpoint-6000/tokenizer_config.json
@@ -0,0 +1,41 @@
+{
+  "added_tokens_decoder": {
+    "64795": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "64797": {
+      "content": "<|observation|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|user|>",
+    "<|observation|>"
+  ],
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_chatglm.ChatGLMTokenizer",
+      null
+    ]
+  },
+  "clean_up_tokenization_spaces": false,
+  "do_lower_case": false,
+  "encode_special_tokens": false,
+  "eos_token": "</s>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<unk>",
+  "padding_side": "right",
+  "remove_space": false,
+  "split_special_tokens": false,
+  "tokenizer_class": "ChatGLMTokenizer",
+  "unk_token": "<unk>"
+}
diff --git a/checkpoint-6000/trainer_state.json b/checkpoint-6000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c88d6e99506c763eb5b0903a1c38f9207fe2468
--- /dev/null
+++ b/checkpoint-6000/trainer_state.json
@@ -0,0 +1,7221 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0109145652992098,
+  "eval_steps": 500,
+  "global_step": 6000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "learning_rate": 4.9999980101927616e-05,
+      "loss": 3.2533,
+      "step": 5
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 4.99999204077421e-05,
+      "loss": 3.2279,
+      "step": 10
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 4.999978982687695e-05,
+      "loss": 3.193,
+      "step": 15
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 4.9999597065062966e-05,
+      "loss": 3.2863,
+      "step": 20
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 4.999934212277958e-05,
+      "loss": 3.1724,
+      "step": 25
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 4.999902500066093e-05,
+      "loss": 3.1088,
+
"step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 
130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + 
"step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + 
"step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + 
"step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 
2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + "step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 
2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + "step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + 
"loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + "step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + 
"loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, + "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 
4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 4.8759297640025235e-05, + "loss": 2.9469, + "step": 1005 + }, + { + "epoch": 0.51, + "learning_rate": 4.874700152955661e-05, + "loss": 2.9745, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 4.8734646354147936e-05, + "loss": 2.9341, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 4.8722232144529754e-05, + "loss": 3.0511, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 4.870975893157941e-05, + "loss": 2.5396, + "step": 1025 + }, + { + "epoch": 0.52, + 
"learning_rate": 4.8697226746321004e-05, + "loss": 2.6699, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 4.868463561992532e-05, + "loss": 2.4887, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 4.867198558370977e-05, + "loss": 2.4773, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 4.865927666913825e-05, + "loss": 2.7612, + "step": 1045 + }, + { + "epoch": 0.53, + "learning_rate": 4.864650890782113e-05, + "loss": 2.9116, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 4.863368233151514e-05, + "loss": 2.8205, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 4.862079697212329e-05, + "loss": 2.6711, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 4.8607852861694804e-05, + "loss": 3.1138, + "step": 1065 + }, + { + "epoch": 0.54, + "learning_rate": 4.859485003242503e-05, + "loss": 2.5603, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 4.858178851665539e-05, + "loss": 2.7981, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 4.856866834687323e-05, + "loss": 2.507, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 4.855548955571183e-05, + "loss": 3.0315, + "step": 1085 + }, + { + "epoch": 0.55, + "learning_rate": 4.8542252175950244e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 4.852895624051326e-05, + "loss": 2.6794, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 4.851560178247132e-05, + "loss": 2.8278, + "step": 1100 + }, + { + "epoch": 0.55, + "learning_rate": 4.850218883504041e-05, + "loss": 2.6993, + "step": 1105 + }, + { + "epoch": 0.56, + "learning_rate": 4.8488717431582005e-05, + "loss": 2.875, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 4.8475187605602974e-05, + "loss": 2.6057, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 4.84615993907555e-05, + "loss": 2.847, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 4.844795282083697e-05, + "loss": 2.7652, + "step": 1125 + }, + { 
+ "epoch": 0.57, + "learning_rate": 4.843424792978997e-05, + "loss": 2.8128, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 4.842048475170209e-05, + "loss": 2.7077, + "step": 1135 + }, + { + "epoch": 0.57, + "learning_rate": 4.840666332080592e-05, + "loss": 2.7081, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 4.8392783671478934e-05, + "loss": 2.6479, + "step": 1145 + }, + { + "epoch": 0.58, + "learning_rate": 4.837884583824342e-05, + "loss": 2.667, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 4.836484985576638e-05, + "loss": 2.7514, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 4.835079575885944e-05, + "loss": 2.5058, + "step": 1160 + }, + { + "epoch": 0.58, + "learning_rate": 4.833668358247876e-05, + "loss": 2.6183, + "step": 1165 + }, + { + "epoch": 0.59, + "learning_rate": 4.8322513361725006e-05, + "loss": 2.9959, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 4.830828513184317e-05, + "loss": 2.5714, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 4.8293998928222536e-05, + "loss": 2.965, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 4.827965478639661e-05, + "loss": 2.2106, + "step": 1185 + }, + { + "epoch": 0.6, + "learning_rate": 4.8265252742042965e-05, + "loss": 2.7456, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 4.8250792830983225e-05, + "loss": 2.5694, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 4.8236275089182936e-05, + "loss": 2.6826, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 4.8221699552751465e-05, + "loss": 2.878, + "step": 1205 + }, + { + "epoch": 0.61, + "learning_rate": 4.820706625794196e-05, + "loss": 2.8191, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 4.81923752411512e-05, + "loss": 2.739, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 4.8177626538919565e-05, + "loss": 3.0544, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 4.8162820187930875e-05, + "loss": 2.8393, + 
"step": 1225 + }, + { + "epoch": 0.62, + "learning_rate": 4.814795622501237e-05, + "loss": 2.4457, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 4.813303468713456e-05, + "loss": 2.8575, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 4.8118055611411197e-05, + "loss": 2.8307, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 4.810301903509909e-05, + "loss": 2.7951, + "step": 1245 + }, + { + "epoch": 0.63, + "learning_rate": 4.8087924995598125e-05, + "loss": 2.8456, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 4.807277353045106e-05, + "loss": 2.3564, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 4.8057564677343524e-05, + "loss": 2.5076, + "step": 1260 + }, + { + "epoch": 0.63, + "learning_rate": 4.8042298474103884e-05, + "loss": 2.605, + "step": 1265 + }, + { + "epoch": 0.64, + "learning_rate": 4.8026974958703116e-05, + "loss": 2.4782, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 4.8011594169254784e-05, + "loss": 2.7193, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 4.799615614401488e-05, + "loss": 2.8284, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 4.798066092138178e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.65, + "learning_rate": 4.796510853989612e-05, + "loss": 2.7396, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 4.794949903824069e-05, + "loss": 2.7948, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 4.793383245524035e-05, + "loss": 2.7818, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 4.791810882986197e-05, + "loss": 2.7334, + "step": 1305 + }, + { + "epoch": 0.66, + "learning_rate": 4.7902328201214256e-05, + "loss": 2.4824, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 4.7886490608547727e-05, + "loss": 2.6131, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 4.7870596091254584e-05, + "loss": 2.7778, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 
4.7854644688868594e-05, + "loss": 2.5263, + "step": 1325 + }, + { + "epoch": 0.67, + "learning_rate": 4.783863644106502e-05, + "loss": 2.7825, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 4.782257138766053e-05, + "loss": 2.7902, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 4.7806449568613066e-05, + "loss": 2.8333, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 4.779027102402177e-05, + "loss": 2.856, + "step": 1345 + }, + { + "epoch": 0.68, + "learning_rate": 4.777403579412686e-05, + "loss": 2.8021, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 4.775774391930956e-05, + "loss": 2.6639, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741395440091976e-05, + "loss": 2.5226, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.772499039713702e-05, + "loss": 2.6803, + "step": 1365 + }, + { + "epoch": 0.69, + "learning_rate": 4.7708528831248274e-05, + "loss": 2.4608, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.769201078336991e-05, + "loss": 2.786, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.7675436294586586e-05, + "loss": 2.8294, + "step": 1380 + }, + { + "epoch": 0.7, + "learning_rate": 4.7658805406123356e-05, + "loss": 2.6776, + "step": 1385 + }, + { + "epoch": 0.7, + "learning_rate": 4.7642118159345544e-05, + "loss": 2.8003, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.762537459575865e-05, + "loss": 2.7796, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.7608574757008245e-05, + "loss": 2.9156, + "step": 1400 + }, + { + "epoch": 0.71, + "learning_rate": 4.7591718684879883e-05, + "loss": 3.0521, + "step": 1405 + }, + { + "epoch": 0.71, + "learning_rate": 4.7574806421298976e-05, + "loss": 2.6469, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.755783800833071e-05, + "loss": 2.8512, + "step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.754081348817991e-05, + "loss": 2.66, + "step": 1420 + }, + { + "epoch": 
0.72, + "learning_rate": 4.752373290319096e-05, + "loss": 2.6625, + "step": 1425 + }, + { + "epoch": 0.72, + "learning_rate": 4.7506596295847716e-05, + "loss": 2.6711, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 4.7489403708773346e-05, + "loss": 2.8951, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 4.747215518473026e-05, + "loss": 2.7375, + "step": 1440 + }, + { + "epoch": 0.73, + "learning_rate": 4.745485076662e-05, + "loss": 2.8037, + "step": 1445 + }, + { + "epoch": 0.73, + "learning_rate": 4.743749049748315e-05, + "loss": 2.5375, + "step": 1450 + }, + { + "epoch": 0.73, + "learning_rate": 4.742007442049918e-05, + "loss": 2.7664, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 4.7402602578986374e-05, + "loss": 2.9644, + "step": 1460 + }, + { + "epoch": 0.74, + "learning_rate": 4.738507501640175e-05, + "loss": 2.8212, + "step": 1465 + }, + { + "epoch": 0.74, + "learning_rate": 4.736749177634087e-05, + "loss": 2.7201, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 4.734985290253782e-05, + "loss": 2.7087, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 4.7332158438865035e-05, + "loss": 2.5502, + "step": 1480 + }, + { + "epoch": 0.75, + "learning_rate": 4.731440842933322e-05, + "loss": 2.7607, + "step": 1485 + }, + { + "epoch": 0.75, + "learning_rate": 4.729660291809126e-05, + "loss": 2.5601, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 4.727874194942606e-05, + "loss": 2.5562, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 4.7260825567762486e-05, + "loss": 2.5539, + "step": 1500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7242853817663204e-05, + "loss": 2.6405, + "step": 1505 + }, + { + "epoch": 0.76, + "learning_rate": 4.72248267438286e-05, + "loss": 2.9237, + "step": 1510 + }, + { + "epoch": 0.76, + "learning_rate": 4.72067443910967e-05, + "loss": 3.0453, + "step": 1515 + }, + { + "epoch": 0.76, + "learning_rate": 4.718860680444297e-05, + "loss": 2.7176, + "step": 
1520 + }, + { + "epoch": 0.77, + "learning_rate": 4.71704140289803e-05, + "loss": 2.6713, + "step": 1525 + }, + { + "epoch": 0.77, + "learning_rate": 4.715216610995883e-05, + "loss": 2.7284, + "step": 1530 + }, + { + "epoch": 0.77, + "learning_rate": 4.713386309276585e-05, + "loss": 2.5022, + "step": 1535 + }, + { + "epoch": 0.77, + "learning_rate": 4.7115505022925706e-05, + "loss": 2.8323, + "step": 1540 + }, + { + "epoch": 0.78, + "learning_rate": 4.7097091946099666e-05, + "loss": 2.7005, + "step": 1545 + }, + { + "epoch": 0.78, + "learning_rate": 4.7078623908085825e-05, + "loss": 2.8142, + "step": 1550 + }, + { + "epoch": 0.78, + "learning_rate": 4.7060100954818974e-05, + "loss": 2.8505, + "step": 1555 + }, + { + "epoch": 0.78, + "learning_rate": 4.70415231323705e-05, + "loss": 2.5892, + "step": 1560 + }, + { + "epoch": 0.79, + "learning_rate": 4.7022890486948236e-05, + "loss": 2.8951, + "step": 1565 + }, + { + "epoch": 0.79, + "learning_rate": 4.700420306489641e-05, + "loss": 2.7551, + "step": 1570 + }, + { + "epoch": 0.79, + "learning_rate": 4.698546091269547e-05, + "loss": 2.6814, + "step": 1575 + }, + { + "epoch": 0.79, + "learning_rate": 4.696666407696201e-05, + "loss": 2.8698, + "step": 1580 + }, + { + "epoch": 0.8, + "learning_rate": 4.694781260444862e-05, + "loss": 2.8389, + "step": 1585 + }, + { + "epoch": 0.8, + "learning_rate": 4.6928906542043786e-05, + "loss": 2.8353, + "step": 1590 + }, + { + "epoch": 0.8, + "learning_rate": 4.690994593677179e-05, + "loss": 2.6565, + "step": 1595 + }, + { + "epoch": 0.8, + "learning_rate": 4.689093083579256e-05, + "loss": 2.6958, + "step": 1600 + }, + { + "epoch": 0.81, + "learning_rate": 4.687186128640157e-05, + "loss": 2.6768, + "step": 1605 + }, + { + "epoch": 0.81, + "learning_rate": 4.685273733602975e-05, + "loss": 2.734, + "step": 1610 + }, + { + "epoch": 0.81, + "learning_rate": 4.6833559032243284e-05, + "loss": 2.5348, + "step": 1615 + }, + { + "epoch": 0.81, + "learning_rate": 4.6814326422743594e-05, + 
"loss": 2.7775, + "step": 1620 + }, + { + "epoch": 0.82, + "learning_rate": 4.679503955536715e-05, + "loss": 2.6578, + "step": 1625 + }, + { + "epoch": 0.82, + "learning_rate": 4.6775698478085393e-05, + "loss": 2.8792, + "step": 1630 + }, + { + "epoch": 0.82, + "learning_rate": 4.675630323900458e-05, + "loss": 2.7883, + "step": 1635 + }, + { + "epoch": 0.82, + "learning_rate": 4.67368538863657e-05, + "loss": 2.8824, + "step": 1640 + }, + { + "epoch": 0.83, + "learning_rate": 4.671735046854433e-05, + "loss": 2.6775, + "step": 1645 + }, + { + "epoch": 0.83, + "learning_rate": 4.669779303405051e-05, + "loss": 2.5658, + "step": 1650 + }, + { + "epoch": 0.83, + "learning_rate": 4.667818163152864e-05, + "loss": 2.5262, + "step": 1655 + }, + { + "epoch": 0.83, + "learning_rate": 4.665851630975736e-05, + "loss": 2.5749, + "step": 1660 + }, + { + "epoch": 0.84, + "learning_rate": 4.6638797117649424e-05, + "loss": 2.8718, + "step": 1665 + }, + { + "epoch": 0.84, + "learning_rate": 4.661902410425155e-05, + "loss": 2.8039, + "step": 1670 + }, + { + "epoch": 0.84, + "learning_rate": 4.659919731874435e-05, + "loss": 2.8089, + "step": 1675 + }, + { + "epoch": 0.84, + "learning_rate": 4.6579316810442174e-05, + "loss": 2.8144, + "step": 1680 + }, + { + "epoch": 0.85, + "learning_rate": 4.6559382628792995e-05, + "loss": 2.5963, + "step": 1685 + }, + { + "epoch": 0.85, + "learning_rate": 4.653939482337828e-05, + "loss": 2.5755, + "step": 1690 + }, + { + "epoch": 0.85, + "learning_rate": 4.651935344391286e-05, + "loss": 2.7631, + "step": 1695 + }, + { + "epoch": 0.85, + "learning_rate": 4.649925854024486e-05, + "loss": 2.8117, + "step": 1700 + }, + { + "epoch": 0.86, + "learning_rate": 4.647911016235549e-05, + "loss": 2.7352, + "step": 1705 + }, + { + "epoch": 0.86, + "learning_rate": 4.6458908360358985e-05, + "loss": 2.5572, + "step": 1710 + }, + { + "epoch": 0.86, + "learning_rate": 4.643865318450246e-05, + "loss": 2.8372, + "step": 1715 + }, + { + "epoch": 0.86, + "learning_rate": 
4.6418344685165774e-05, + "loss": 2.7892, + "step": 1720 + }, + { + "epoch": 0.87, + "learning_rate": 4.639798291286143e-05, + "loss": 2.7832, + "step": 1725 + }, + { + "epoch": 0.87, + "learning_rate": 4.637756791823442e-05, + "loss": 2.5401, + "step": 1730 + }, + { + "epoch": 0.87, + "learning_rate": 4.635709975206213e-05, + "loss": 2.9286, + "step": 1735 + }, + { + "epoch": 0.87, + "learning_rate": 4.633657846525417e-05, + "loss": 2.6474, + "step": 1740 + }, + { + "epoch": 0.88, + "learning_rate": 4.6316004108852305e-05, + "loss": 2.5047, + "step": 1745 + }, + { + "epoch": 0.88, + "learning_rate": 4.629537673403029e-05, + "loss": 2.8374, + "step": 1750 + }, + { + "epoch": 0.88, + "learning_rate": 4.627469639209373e-05, + "loss": 2.634, + "step": 1755 + }, + { + "epoch": 0.88, + "learning_rate": 4.6253963134480006e-05, + "loss": 2.6884, + "step": 1760 + }, + { + "epoch": 0.89, + "learning_rate": 4.623317701275809e-05, + "loss": 2.6334, + "step": 1765 + }, + { + "epoch": 0.89, + "learning_rate": 4.621233807862844e-05, + "loss": 2.764, + "step": 1770 + }, + { + "epoch": 0.89, + "learning_rate": 4.6191446383922886e-05, + "loss": 3.0864, + "step": 1775 + }, + { + "epoch": 0.89, + "learning_rate": 4.617050198060448e-05, + "loss": 2.7964, + "step": 1780 + }, + { + "epoch": 0.9, + "learning_rate": 4.6149504920767376e-05, + "loss": 2.4538, + "step": 1785 + }, + { + "epoch": 0.9, + "learning_rate": 4.6128455256636706e-05, + "loss": 2.7352, + "step": 1790 + }, + { + "epoch": 0.9, + "learning_rate": 4.6107353040568416e-05, + "loss": 2.6893, + "step": 1795 + }, + { + "epoch": 0.9, + "learning_rate": 4.6086198325049185e-05, + "loss": 2.8609, + "step": 1800 + }, + { + "epoch": 0.91, + "learning_rate": 4.6064991162696275e-05, + "loss": 2.5285, + "step": 1805 + }, + { + "epoch": 0.91, + "learning_rate": 4.604373160625739e-05, + "loss": 2.5707, + "step": 1810 + }, + { + "epoch": 0.91, + "learning_rate": 4.602241970861053e-05, + "loss": 2.7198, + "step": 1815 + }, + { + "epoch": 
0.91, + "learning_rate": 4.6001055522763926e-05, + "loss": 2.855, + "step": 1820 + }, + { + "epoch": 0.92, + "learning_rate": 4.597963910185582e-05, + "loss": 2.4175, + "step": 1825 + }, + { + "epoch": 0.92, + "learning_rate": 4.595817049915441e-05, + "loss": 2.5444, + "step": 1830 + }, + { + "epoch": 0.92, + "learning_rate": 4.5936649768057646e-05, + "loss": 2.6893, + "step": 1835 + }, + { + "epoch": 0.92, + "learning_rate": 4.591507696209318e-05, + "loss": 2.6725, + "step": 1840 + }, + { + "epoch": 0.93, + "learning_rate": 4.589345213491817e-05, + "loss": 2.834, + "step": 1845 + }, + { + "epoch": 0.93, + "learning_rate": 4.587177534031914e-05, + "loss": 2.8259, + "step": 1850 + }, + { + "epoch": 0.93, + "learning_rate": 4.585004663221188e-05, + "loss": 2.6146, + "step": 1855 + }, + { + "epoch": 0.93, + "learning_rate": 4.582826606464134e-05, + "loss": 2.4673, + "step": 1860 + }, + { + "epoch": 0.94, + "learning_rate": 4.5806433691781416e-05, + "loss": 2.9178, + "step": 1865 + }, + { + "epoch": 0.94, + "learning_rate": 4.578454956793487e-05, + "loss": 2.9224, + "step": 1870 + }, + { + "epoch": 0.94, + "learning_rate": 4.576261374753318e-05, + "loss": 2.7987, + "step": 1875 + }, + { + "epoch": 0.94, + "learning_rate": 4.574062628513642e-05, + "loss": 2.3848, + "step": 1880 + }, + { + "epoch": 0.95, + "learning_rate": 4.57185872354331e-05, + "loss": 2.6054, + "step": 1885 + }, + { + "epoch": 0.95, + "learning_rate": 4.569649665324003e-05, + "loss": 2.6743, + "step": 1890 + }, + { + "epoch": 0.95, + "learning_rate": 4.567435459350222e-05, + "loss": 2.9375, + "step": 1895 + }, + { + "epoch": 0.95, + "learning_rate": 4.565216111129269e-05, + "loss": 2.9372, + "step": 1900 + }, + { + "epoch": 0.96, + "learning_rate": 4.562991626181239e-05, + "loss": 2.669, + "step": 1905 + }, + { + "epoch": 0.96, + "learning_rate": 4.560762010039001e-05, + "loss": 2.7644, + "step": 1910 + }, + { + "epoch": 0.96, + "learning_rate": 4.558527268248187e-05, + "loss": 2.6886, + "step": 1915 
+ }, + { + "epoch": 0.96, + "learning_rate": 4.55628740636718e-05, + "loss": 2.8618, + "step": 1920 + }, + { + "epoch": 0.97, + "learning_rate": 4.554042429967095e-05, + "loss": 2.8411, + "step": 1925 + }, + { + "epoch": 0.97, + "learning_rate": 4.55179234463177e-05, + "loss": 2.6047, + "step": 1930 + }, + { + "epoch": 0.97, + "learning_rate": 4.5495371559577496e-05, + "loss": 2.8675, + "step": 1935 + }, + { + "epoch": 0.97, + "learning_rate": 4.547276869554271e-05, + "loss": 2.751, + "step": 1940 + }, + { + "epoch": 0.98, + "learning_rate": 4.545011491043253e-05, + "loss": 2.8638, + "step": 1945 + }, + { + "epoch": 0.98, + "learning_rate": 4.5427410260592775e-05, + "loss": 2.5092, + "step": 1950 + }, + { + "epoch": 0.98, + "learning_rate": 4.540465480249579e-05, + "loss": 2.5059, + "step": 1955 + }, + { + "epoch": 0.98, + "learning_rate": 4.5381848592740285e-05, + "loss": 2.9433, + "step": 1960 + }, + { + "epoch": 0.99, + "learning_rate": 4.535899168805121e-05, + "loss": 2.6959, + "step": 1965 + }, + { + "epoch": 0.99, + "learning_rate": 4.533608414527961e-05, + "loss": 2.552, + "step": 1970 + }, + { + "epoch": 0.99, + "learning_rate": 4.5313126021402465e-05, + "loss": 2.7169, + "step": 1975 + }, + { + "epoch": 0.99, + "learning_rate": 4.529011737352258e-05, + "loss": 2.5672, + "step": 1980 + }, + { + "epoch": 1.0, + "learning_rate": 4.526705825886841e-05, + "loss": 2.6434, + "step": 1985 + }, + { + "epoch": 1.0, + "learning_rate": 4.5243948734793947e-05, + "loss": 2.8062, + "step": 1990 + }, + { + "epoch": 1.0, + "learning_rate": 4.5220788858778556e-05, + "loss": 2.6929, + "step": 1995 + }, + { + "epoch": 1.0, + "learning_rate": 4.519757868842684e-05, + "loss": 2.5837, + "step": 2000 + }, + { + "epoch": 1.01, + "learning_rate": 4.517431828146852e-05, + "loss": 2.821, + "step": 2005 + }, + { + "epoch": 1.01, + "learning_rate": 4.515100769575824e-05, + "loss": 2.7913, + "step": 2010 + }, + { + "epoch": 1.01, + "learning_rate": 4.512764698927545e-05, + "loss": 
2.699, + "step": 2015 + }, + { + "epoch": 1.01, + "learning_rate": 4.5104236220124286e-05, + "loss": 2.6574, + "step": 2020 + }, + { + "epoch": 1.02, + "learning_rate": 4.508077544653338e-05, + "loss": 2.5909, + "step": 2025 + }, + { + "epoch": 1.02, + "learning_rate": 4.5057264726855765e-05, + "loss": 2.5695, + "step": 2030 + }, + { + "epoch": 1.02, + "learning_rate": 4.5033704119568675e-05, + "loss": 2.4937, + "step": 2035 + }, + { + "epoch": 1.02, + "learning_rate": 4.501009368327344e-05, + "loss": 2.7253, + "step": 2040 + }, + { + "epoch": 1.03, + "learning_rate": 4.4986433476695334e-05, + "loss": 2.8681, + "step": 2045 + }, + { + "epoch": 1.03, + "learning_rate": 4.496272355868341e-05, + "loss": 2.74, + "step": 2050 + }, + { + "epoch": 1.03, + "learning_rate": 4.4938963988210365e-05, + "loss": 2.7439, + "step": 2055 + }, + { + "epoch": 1.03, + "learning_rate": 4.491515482437242e-05, + "loss": 2.9539, + "step": 2060 + }, + { + "epoch": 1.04, + "learning_rate": 4.4891296126389104e-05, + "loss": 2.8165, + "step": 2065 + }, + { + "epoch": 1.04, + "learning_rate": 4.48673879536032e-05, + "loss": 2.601, + "step": 2070 + }, + { + "epoch": 1.04, + "learning_rate": 4.484343036548051e-05, + "loss": 2.5189, + "step": 2075 + }, + { + "epoch": 1.04, + "learning_rate": 4.481942342160976e-05, + "loss": 2.6276, + "step": 2080 + }, + { + "epoch": 1.05, + "learning_rate": 4.479536718170243e-05, + "loss": 2.5027, + "step": 2085 + }, + { + "epoch": 1.05, + "learning_rate": 4.477126170559262e-05, + "loss": 2.8654, + "step": 2090 + }, + { + "epoch": 1.05, + "learning_rate": 4.474710705323688e-05, + "loss": 2.8511, + "step": 2095 + }, + { + "epoch": 1.05, + "learning_rate": 4.47229032847141e-05, + "loss": 2.6129, + "step": 2100 + }, + { + "epoch": 1.06, + "learning_rate": 4.469865046022531e-05, + "loss": 2.8964, + "step": 2105 + }, + { + "epoch": 1.06, + "learning_rate": 4.4674348640093554e-05, + "loss": 2.7502, + "step": 2110 + }, + { + "epoch": 1.06, + "learning_rate": 
4.4649997884763765e-05, + "loss": 2.591, + "step": 2115 + }, + { + "epoch": 1.06, + "learning_rate": 4.462559825480257e-05, + "loss": 2.7982, + "step": 2120 + }, + { + "epoch": 1.07, + "learning_rate": 4.460114981089815e-05, + "loss": 2.576, + "step": 2125 + }, + { + "epoch": 1.07, + "learning_rate": 4.457665261386014e-05, + "loss": 2.692, + "step": 2130 + }, + { + "epoch": 1.07, + "learning_rate": 4.455210672461938e-05, + "loss": 2.5818, + "step": 2135 + }, + { + "epoch": 1.07, + "learning_rate": 4.452751220422787e-05, + "loss": 2.6792, + "step": 2140 + }, + { + "epoch": 1.08, + "learning_rate": 4.450286911385856e-05, + "loss": 2.8352, + "step": 2145 + }, + { + "epoch": 1.08, + "learning_rate": 4.4478177514805166e-05, + "loss": 2.5787, + "step": 2150 + }, + { + "epoch": 1.08, + "learning_rate": 4.4453437468482103e-05, + "loss": 2.655, + "step": 2155 + }, + { + "epoch": 1.08, + "learning_rate": 4.442864903642428e-05, + "loss": 2.7664, + "step": 2160 + }, + { + "epoch": 1.09, + "learning_rate": 4.440381228028692e-05, + "loss": 2.7231, + "step": 2165 + }, + { + "epoch": 1.09, + "learning_rate": 4.437892726184548e-05, + "loss": 2.416, + "step": 2170 + }, + { + "epoch": 1.09, + "learning_rate": 4.4353994042995446e-05, + "loss": 2.4619, + "step": 2175 + }, + { + "epoch": 1.09, + "learning_rate": 4.4329012685752183e-05, + "loss": 2.7816, + "step": 2180 + }, + { + "epoch": 1.1, + "learning_rate": 4.430398325225078e-05, + "loss": 2.8678, + "step": 2185 + }, + { + "epoch": 1.1, + "learning_rate": 4.427890580474594e-05, + "loss": 2.6007, + "step": 2190 + }, + { + "epoch": 1.1, + "learning_rate": 4.4253780405611754e-05, + "loss": 2.8195, + "step": 2195 + }, + { + "epoch": 1.1, + "learning_rate": 4.42286071173416e-05, + "loss": 2.5117, + "step": 2200 + }, + { + "epoch": 1.11, + "learning_rate": 4.4203386002547956e-05, + "loss": 2.5527, + "step": 2205 + }, + { + "epoch": 1.11, + "learning_rate": 4.417811712396226e-05, + "loss": 2.662, + "step": 2210 + }, + { + "epoch": 1.11, + 
"learning_rate": 4.415280054443477e-05, + "loss": 2.7563, + "step": 2215 + }, + { + "epoch": 1.11, + "learning_rate": 4.4127436326934354e-05, + "loss": 2.5118, + "step": 2220 + }, + { + "epoch": 1.12, + "learning_rate": 4.41020245345484e-05, + "loss": 2.7208, + "step": 2225 + }, + { + "epoch": 1.12, + "learning_rate": 4.4076565230482607e-05, + "loss": 2.649, + "step": 2230 + }, + { + "epoch": 1.12, + "learning_rate": 4.4051058478060856e-05, + "loss": 2.652, + "step": 2235 + }, + { + "epoch": 1.12, + "learning_rate": 4.402550434072505e-05, + "loss": 2.6885, + "step": 2240 + }, + { + "epoch": 1.13, + "learning_rate": 4.3999902882034935e-05, + "loss": 2.8545, + "step": 2245 + }, + { + "epoch": 1.13, + "learning_rate": 4.397425416566797e-05, + "loss": 2.9435, + "step": 2250 + }, + { + "epoch": 1.13, + "learning_rate": 4.3948558255419146e-05, + "loss": 2.6555, + "step": 2255 + }, + { + "epoch": 1.13, + "learning_rate": 4.392281521520085e-05, + "loss": 2.6108, + "step": 2260 + }, + { + "epoch": 1.14, + "learning_rate": 4.389702510904269e-05, + "loss": 2.6256, + "step": 2265 + }, + { + "epoch": 1.14, + "learning_rate": 4.387118800109133e-05, + "loss": 2.7996, + "step": 2270 + }, + { + "epoch": 1.14, + "learning_rate": 4.384530395561035e-05, + "loss": 2.6649, + "step": 2275 + }, + { + "epoch": 1.14, + "learning_rate": 4.381937303698006e-05, + "loss": 3.0073, + "step": 2280 + }, + { + "epoch": 1.15, + "learning_rate": 4.379339530969738e-05, + "loss": 2.8493, + "step": 2285 + }, + { + "epoch": 1.15, + "learning_rate": 4.3767370838375635e-05, + "loss": 2.5572, + "step": 2290 + }, + { + "epoch": 1.15, + "learning_rate": 4.374129968774443e-05, + "loss": 2.3837, + "step": 2295 + }, + { + "epoch": 1.15, + "learning_rate": 4.371518192264946e-05, + "loss": 2.4604, + "step": 2300 + }, + { + "epoch": 1.16, + "learning_rate": 4.3689017608052374e-05, + "loss": 2.8866, + "step": 2305 + }, + { + "epoch": 1.16, + "learning_rate": 4.3662806809030585e-05, + "loss": 2.8943, + "step": 2310 + 
}, + { + "epoch": 1.16, + "learning_rate": 4.3636549590777144e-05, + "loss": 2.7629, + "step": 2315 + }, + { + "epoch": 1.16, + "learning_rate": 4.361024601860054e-05, + "loss": 2.8629, + "step": 2320 + }, + { + "epoch": 1.17, + "learning_rate": 4.3583896157924574e-05, + "loss": 2.7952, + "step": 2325 + }, + { + "epoch": 1.17, + "learning_rate": 4.355750007428817e-05, + "loss": 2.2057, + "step": 2330 + }, + { + "epoch": 1.17, + "learning_rate": 4.3531057833345216e-05, + "loss": 2.9143, + "step": 2335 + }, + { + "epoch": 1.17, + "learning_rate": 4.3504569500864424e-05, + "loss": 2.5354, + "step": 2340 + }, + { + "epoch": 1.18, + "learning_rate": 4.347803514272911e-05, + "loss": 2.7451, + "step": 2345 + }, + { + "epoch": 1.18, + "learning_rate": 4.3451454824937113e-05, + "loss": 3.0827, + "step": 2350 + }, + { + "epoch": 1.18, + "learning_rate": 4.3424828613600555e-05, + "loss": 2.5842, + "step": 2355 + }, + { + "epoch": 1.18, + "learning_rate": 4.339815657494572e-05, + "loss": 2.4492, + "step": 2360 + }, + { + "epoch": 1.19, + "learning_rate": 4.3371438775312865e-05, + "loss": 2.5067, + "step": 2365 + }, + { + "epoch": 1.19, + "learning_rate": 4.334467528115608e-05, + "loss": 2.5902, + "step": 2370 + }, + { + "epoch": 1.19, + "learning_rate": 4.33178661590431e-05, + "loss": 2.8618, + "step": 2375 + }, + { + "epoch": 1.19, + "learning_rate": 4.329101147565515e-05, + "loss": 2.6831, + "step": 2380 + }, + { + "epoch": 1.2, + "learning_rate": 4.3264111297786794e-05, + "loss": 2.7531, + "step": 2385 + }, + { + "epoch": 1.2, + "learning_rate": 4.323716569234572e-05, + "loss": 2.819, + "step": 2390 + }, + { + "epoch": 1.2, + "learning_rate": 4.321017472635263e-05, + "loss": 2.6319, + "step": 2395 + }, + { + "epoch": 1.2, + "learning_rate": 4.318313846694105e-05, + "loss": 2.3078, + "step": 2400 + }, + { + "epoch": 1.21, + "learning_rate": 4.315605698135714e-05, + "loss": 2.7962, + "step": 2405 + }, + { + "epoch": 1.21, + "learning_rate": 4.312893033695958e-05, + "loss": 
2.8479, + "step": 2410 + }, + { + "epoch": 1.21, + "learning_rate": 4.310175860121936e-05, + "loss": 2.6289, + "step": 2415 + }, + { + "epoch": 1.21, + "learning_rate": 4.30745418417196e-05, + "loss": 2.7003, + "step": 2420 + }, + { + "epoch": 1.22, + "learning_rate": 4.304728012615543e-05, + "loss": 2.4947, + "step": 2425 + }, + { + "epoch": 1.22, + "learning_rate": 4.3019973522333815e-05, + "loss": 2.6911, + "step": 2430 + }, + { + "epoch": 1.22, + "learning_rate": 4.2992622098173334e-05, + "loss": 2.6931, + "step": 2435 + }, + { + "epoch": 1.22, + "learning_rate": 4.296522592170406e-05, + "loss": 2.774, + "step": 2440 + }, + { + "epoch": 1.23, + "learning_rate": 4.293778506106737e-05, + "loss": 2.759, + "step": 2445 + }, + { + "epoch": 1.23, + "learning_rate": 4.29102995845158e-05, + "loss": 2.5543, + "step": 2450 + }, + { + "epoch": 1.23, + "learning_rate": 4.288276956041284e-05, + "loss": 2.7702, + "step": 2455 + }, + { + "epoch": 1.23, + "learning_rate": 4.285519505723278e-05, + "loss": 2.835, + "step": 2460 + }, + { + "epoch": 1.24, + "learning_rate": 4.282757614356055e-05, + "loss": 2.6516, + "step": 2465 + }, + { + "epoch": 1.24, + "learning_rate": 4.2799912888091544e-05, + "loss": 2.7392, + "step": 2470 + }, + { + "epoch": 1.24, + "learning_rate": 4.277220535963143e-05, + "loss": 2.6522, + "step": 2475 + }, + { + "epoch": 1.24, + "learning_rate": 4.274445362709601e-05, + "loss": 2.6514, + "step": 2480 + }, + { + "epoch": 1.25, + "learning_rate": 4.271665775951104e-05, + "loss": 2.7507, + "step": 2485 + }, + { + "epoch": 1.25, + "learning_rate": 4.2688817826012005e-05, + "loss": 3.0499, + "step": 2490 + }, + { + "epoch": 1.25, + "learning_rate": 4.2660933895844055e-05, + "loss": 2.6927, + "step": 2495 + }, + { + "epoch": 1.25, + "learning_rate": 4.2633006038361736e-05, + "loss": 2.8456, + "step": 2500 + }, + { + "epoch": 1.26, + "learning_rate": 4.2605034323028844e-05, + "loss": 2.671, + "step": 2505 + }, + { + "epoch": 1.26, + "learning_rate": 
4.2577018819418296e-05, + "loss": 2.6543, + "step": 2510 + }, + { + "epoch": 1.26, + "learning_rate": 4.254895959721189e-05, + "loss": 2.566, + "step": 2515 + }, + { + "epoch": 1.26, + "learning_rate": 4.252085672620019e-05, + "loss": 2.4943, + "step": 2520 + }, + { + "epoch": 1.27, + "learning_rate": 4.249271027628228e-05, + "loss": 2.5115, + "step": 2525 + }, + { + "epoch": 1.27, + "learning_rate": 4.24645203174657e-05, + "loss": 2.3859, + "step": 2530 + }, + { + "epoch": 1.27, + "learning_rate": 4.243628691986617e-05, + "loss": 2.8148, + "step": 2535 + }, + { + "epoch": 1.27, + "learning_rate": 4.240801015370743e-05, + "loss": 2.5597, + "step": 2540 + }, + { + "epoch": 1.28, + "learning_rate": 4.2379690089321145e-05, + "loss": 2.805, + "step": 2545 + }, + { + "epoch": 1.28, + "learning_rate": 4.235132679714664e-05, + "loss": 2.7308, + "step": 2550 + }, + { + "epoch": 1.28, + "learning_rate": 4.232292034773076e-05, + "loss": 2.675, + "step": 2555 + }, + { + "epoch": 1.28, + "learning_rate": 4.2294470811727704e-05, + "loss": 2.6359, + "step": 2560 + }, + { + "epoch": 1.29, + "learning_rate": 4.226597825989883e-05, + "loss": 2.4288, + "step": 2565 + }, + { + "epoch": 1.29, + "learning_rate": 4.223744276311249e-05, + "loss": 2.642, + "step": 2570 + }, + { + "epoch": 1.29, + "learning_rate": 4.220886439234385e-05, + "loss": 2.7375, + "step": 2575 + }, + { + "epoch": 1.29, + "learning_rate": 4.218024321867472e-05, + "loss": 2.6114, + "step": 2580 + }, + { + "epoch": 1.3, + "learning_rate": 4.2151579313293364e-05, + "loss": 2.4941, + "step": 2585 + }, + { + "epoch": 1.3, + "learning_rate": 4.212287274749434e-05, + "loss": 2.6086, + "step": 2590 + }, + { + "epoch": 1.3, + "learning_rate": 4.2094123592678295e-05, + "loss": 2.7854, + "step": 2595 + }, + { + "epoch": 1.3, + "learning_rate": 4.206533192035184e-05, + "loss": 2.8291, + "step": 2600 + }, + { + "epoch": 1.31, + "learning_rate": 4.2036497802127294e-05, + "loss": 2.6846, + "step": 2605 + }, + { + "epoch": 1.31, + 
"learning_rate": 4.200762130972259e-05, + "loss": 2.6667, + "step": 2610 + }, + { + "epoch": 1.31, + "learning_rate": 4.197870251496103e-05, + "loss": 2.7895, + "step": 2615 + }, + { + "epoch": 1.31, + "learning_rate": 4.1949741489771155e-05, + "loss": 2.6958, + "step": 2620 + }, + { + "epoch": 1.32, + "learning_rate": 4.192073830618652e-05, + "loss": 2.8536, + "step": 2625 + }, + { + "epoch": 1.32, + "learning_rate": 4.189169303634555e-05, + "loss": 2.5218, + "step": 2630 + }, + { + "epoch": 1.32, + "learning_rate": 4.186260575249136e-05, + "loss": 2.6141, + "step": 2635 + }, + { + "epoch": 1.32, + "learning_rate": 4.1833476526971546e-05, + "loss": 2.6231, + "step": 2640 + }, + { + "epoch": 1.33, + "learning_rate": 4.1804305432238036e-05, + "loss": 2.8229, + "step": 2645 + }, + { + "epoch": 1.33, + "learning_rate": 4.1775092540846885e-05, + "loss": 2.4763, + "step": 2650 + }, + { + "epoch": 1.33, + "learning_rate": 4.174583792545813e-05, + "loss": 2.5162, + "step": 2655 + }, + { + "epoch": 1.33, + "learning_rate": 4.1716541658835574e-05, + "loss": 2.7301, + "step": 2660 + }, + { + "epoch": 1.34, + "learning_rate": 4.16872038138466e-05, + "loss": 2.6055, + "step": 2665 + }, + { + "epoch": 1.34, + "learning_rate": 4.165782446346203e-05, + "loss": 2.6529, + "step": 2670 + }, + { + "epoch": 1.34, + "learning_rate": 4.162840368075591e-05, + "loss": 2.7934, + "step": 2675 + }, + { + "epoch": 1.34, + "learning_rate": 4.159894153890536e-05, + "loss": 2.9108, + "step": 2680 + }, + { + "epoch": 1.35, + "learning_rate": 4.1569438111190326e-05, + "loss": 2.6543, + "step": 2685 + }, + { + "epoch": 1.35, + "learning_rate": 4.1539893470993496e-05, + "loss": 2.6038, + "step": 2690 + }, + { + "epoch": 1.35, + "learning_rate": 4.151030769180002e-05, + "loss": 2.5741, + "step": 2695 + }, + { + "epoch": 1.35, + "learning_rate": 4.14806808471974e-05, + "loss": 2.8525, + "step": 2700 + }, + { + "epoch": 1.36, + "learning_rate": 4.1451013010875275e-05, + "loss": 2.5151, + "step": 2705 + 
}, + { + "epoch": 1.36, + "learning_rate": 4.1421304256625206e-05, + "loss": 2.5666, + "step": 2710 + }, + { + "epoch": 1.36, + "learning_rate": 4.139155465834058e-05, + "loss": 2.7287, + "step": 2715 + }, + { + "epoch": 1.36, + "learning_rate": 4.136176429001634e-05, + "loss": 2.8867, + "step": 2720 + }, + { + "epoch": 1.37, + "learning_rate": 4.133193322574885e-05, + "loss": 2.6398, + "step": 2725 + }, + { + "epoch": 1.37, + "learning_rate": 4.130206153973568e-05, + "loss": 2.699, + "step": 2730 + }, + { + "epoch": 1.37, + "learning_rate": 4.1272149306275447e-05, + "loss": 2.5248, + "step": 2735 + }, + { + "epoch": 1.37, + "learning_rate": 4.124219659976762e-05, + "loss": 2.8947, + "step": 2740 + }, + { + "epoch": 1.38, + "learning_rate": 4.1212203494712344e-05, + "loss": 2.6473, + "step": 2745 + }, + { + "epoch": 1.38, + "learning_rate": 4.1182170065710226e-05, + "loss": 2.3734, + "step": 2750 + }, + { + "epoch": 1.38, + "learning_rate": 4.115209638746218e-05, + "loss": 2.685, + "step": 2755 + }, + { + "epoch": 1.39, + "learning_rate": 4.1121982534769224e-05, + "loss": 2.9712, + "step": 2760 + }, + { + "epoch": 1.39, + "learning_rate": 4.109182858253231e-05, + "loss": 2.6578, + "step": 2765 + }, + { + "epoch": 1.39, + "learning_rate": 4.106163460575212e-05, + "loss": 2.6842, + "step": 2770 + }, + { + "epoch": 1.39, + "learning_rate": 4.10314006795289e-05, + "loss": 2.511, + "step": 2775 + }, + { + "epoch": 1.4, + "learning_rate": 4.100112687906224e-05, + "loss": 2.8426, + "step": 2780 + }, + { + "epoch": 1.4, + "learning_rate": 4.097081327965092e-05, + "loss": 2.8462, + "step": 2785 + }, + { + "epoch": 1.4, + "learning_rate": 4.094045995669271e-05, + "loss": 2.7552, + "step": 2790 + }, + { + "epoch": 1.4, + "learning_rate": 4.091006698568419e-05, + "loss": 2.3802, + "step": 2795 + }, + { + "epoch": 1.41, + "learning_rate": 4.087963444222054e-05, + "loss": 2.3967, + "step": 2800 + }, + { + "epoch": 1.41, + "learning_rate": 4.084916240199537e-05, + "loss": 2.7945, 
+ "step": 2805 + }, + { + "epoch": 1.41, + "learning_rate": 4.0818650940800524e-05, + "loss": 2.5304, + "step": 2810 + }, + { + "epoch": 1.41, + "learning_rate": 4.0788100134525925e-05, + "loss": 2.6779, + "step": 2815 + }, + { + "epoch": 1.42, + "learning_rate": 4.075751005915932e-05, + "loss": 2.6202, + "step": 2820 + }, + { + "epoch": 1.42, + "learning_rate": 4.072688079078616e-05, + "loss": 2.5915, + "step": 2825 + }, + { + "epoch": 1.42, + "learning_rate": 4.069621240558935e-05, + "loss": 2.5139, + "step": 2830 + }, + { + "epoch": 1.42, + "learning_rate": 4.066550497984911e-05, + "loss": 2.5506, + "step": 2835 + }, + { + "epoch": 1.43, + "learning_rate": 4.063475858994276e-05, + "loss": 2.7154, + "step": 2840 + }, + { + "epoch": 1.43, + "learning_rate": 4.060397331234452e-05, + "loss": 2.6309, + "step": 2845 + }, + { + "epoch": 1.43, + "learning_rate": 4.0573149223625365e-05, + "loss": 2.6922, + "step": 2850 + }, + { + "epoch": 1.43, + "learning_rate": 4.054228640045276e-05, + "loss": 2.703, + "step": 2855 + }, + { + "epoch": 1.44, + "learning_rate": 4.051138491959053e-05, + "loss": 2.3475, + "step": 2860 + }, + { + "epoch": 1.44, + "learning_rate": 4.048044485789869e-05, + "loss": 2.7186, + "step": 2865 + }, + { + "epoch": 1.44, + "learning_rate": 4.044946629233316e-05, + "loss": 2.4667, + "step": 2870 + }, + { + "epoch": 1.44, + "learning_rate": 4.041844929994566e-05, + "loss": 2.562, + "step": 2875 + }, + { + "epoch": 1.45, + "learning_rate": 4.038739395788347e-05, + "loss": 2.4116, + "step": 2880 + }, + { + "epoch": 1.45, + "learning_rate": 4.0356300343389276e-05, + "loss": 2.6133, + "step": 2885 + }, + { + "epoch": 1.45, + "learning_rate": 4.032516853380094e-05, + "loss": 2.6893, + "step": 2890 + }, + { + "epoch": 1.45, + "learning_rate": 4.029399860655132e-05, + "loss": 2.8196, + "step": 2895 + }, + { + "epoch": 1.46, + "learning_rate": 4.026279063916811e-05, + "loss": 2.8933, + "step": 2900 + }, + { + "epoch": 1.46, + "learning_rate": 
4.023154470927361e-05, + "loss": 3.0289, + "step": 2905 + }, + { + "epoch": 1.46, + "learning_rate": 4.0200260894584516e-05, + "loss": 2.5586, + "step": 2910 + }, + { + "epoch": 1.46, + "learning_rate": 4.016893927291179e-05, + "loss": 2.785, + "step": 2915 + }, + { + "epoch": 1.47, + "learning_rate": 4.0137579922160395e-05, + "loss": 2.5293, + "step": 2920 + }, + { + "epoch": 1.47, + "learning_rate": 4.010618292032917e-05, + "loss": 2.6696, + "step": 2925 + }, + { + "epoch": 1.47, + "learning_rate": 4.007474834551058e-05, + "loss": 2.7003, + "step": 2930 + }, + { + "epoch": 1.47, + "learning_rate": 4.004327627589056e-05, + "loss": 2.6005, + "step": 2935 + }, + { + "epoch": 1.48, + "learning_rate": 4.001176678974828e-05, + "loss": 2.7198, + "step": 2940 + }, + { + "epoch": 1.48, + "learning_rate": 3.998021996545599e-05, + "loss": 2.5329, + "step": 2945 + }, + { + "epoch": 1.48, + "learning_rate": 3.994863588147881e-05, + "loss": 2.7051, + "step": 2950 + }, + { + "epoch": 1.48, + "learning_rate": 3.9917014616374535e-05, + "loss": 2.9277, + "step": 2955 + }, + { + "epoch": 1.49, + "learning_rate": 3.988535624879344e-05, + "loss": 2.3729, + "step": 2960 + }, + { + "epoch": 1.49, + "learning_rate": 3.985366085747808e-05, + "loss": 2.5122, + "step": 2965 + }, + { + "epoch": 1.49, + "learning_rate": 3.982192852126309e-05, + "loss": 2.4413, + "step": 2970 + }, + { + "epoch": 1.49, + "learning_rate": 3.979015931907501e-05, + "loss": 2.6104, + "step": 2975 + }, + { + "epoch": 1.5, + "learning_rate": 3.975835332993207e-05, + "loss": 2.8855, + "step": 2980 + }, + { + "epoch": 1.5, + "learning_rate": 3.9726510632944e-05, + "loss": 2.7487, + "step": 2985 + }, + { + "epoch": 1.5, + "learning_rate": 3.969463130731183e-05, + "loss": 2.7085, + "step": 2990 + }, + { + "epoch": 1.5, + "learning_rate": 3.966271543232769e-05, + "loss": 2.6464, + "step": 2995 + }, + { + "epoch": 1.51, + "learning_rate": 3.9630763087374625e-05, + "loss": 2.4105, + "step": 3000 + }, + { + "epoch": 1.51, + 
"learning_rate": 3.9598774351926394e-05, + "loss": 2.7296, + "step": 3005 + }, + { + "epoch": 1.51, + "learning_rate": 3.956674930554725e-05, + "loss": 2.5319, + "step": 3010 + }, + { + "epoch": 1.51, + "learning_rate": 3.9534688027891785e-05, + "loss": 2.5544, + "step": 3015 + }, + { + "epoch": 1.52, + "learning_rate": 3.9502590598704696e-05, + "loss": 2.7981, + "step": 3020 + }, + { + "epoch": 1.52, + "learning_rate": 3.94704570978206e-05, + "loss": 2.6186, + "step": 3025 + }, + { + "epoch": 1.52, + "learning_rate": 3.943828760516382e-05, + "loss": 2.6443, + "step": 3030 + }, + { + "epoch": 1.52, + "learning_rate": 3.940608220074822e-05, + "loss": 2.4564, + "step": 3035 + }, + { + "epoch": 1.53, + "learning_rate": 3.9373840964676976e-05, + "loss": 2.8394, + "step": 3040 + }, + { + "epoch": 1.53, + "learning_rate": 3.934156397714238e-05, + "loss": 2.3683, + "step": 3045 + }, + { + "epoch": 1.53, + "learning_rate": 3.930925131842567e-05, + "loss": 2.875, + "step": 3050 + }, + { + "epoch": 1.53, + "learning_rate": 3.9276903068896784e-05, + "loss": 2.6381, + "step": 3055 + }, + { + "epoch": 1.54, + "learning_rate": 3.9244519309014206e-05, + "loss": 2.8077, + "step": 3060 + }, + { + "epoch": 1.54, + "learning_rate": 3.9212100119324706e-05, + "loss": 2.6208, + "step": 3065 + }, + { + "epoch": 1.54, + "learning_rate": 3.917964558046322e-05, + "loss": 2.9, + "step": 3070 + }, + { + "epoch": 1.54, + "learning_rate": 3.9147155773152586e-05, + "loss": 2.5678, + "step": 3075 + }, + { + "epoch": 1.55, + "learning_rate": 3.911463077820336e-05, + "loss": 2.7059, + "step": 3080 + }, + { + "epoch": 1.55, + "learning_rate": 3.90820706765136e-05, + "loss": 2.7433, + "step": 3085 + }, + { + "epoch": 1.55, + "learning_rate": 3.9049475549068757e-05, + "loss": 2.5916, + "step": 3090 + }, + { + "epoch": 1.55, + "learning_rate": 3.901684547694132e-05, + "loss": 2.6978, + "step": 3095 + }, + { + "epoch": 1.56, + "learning_rate": 3.8984180541290724e-05, + "loss": 2.8709, + "step": 3100 + 
}, + { + "epoch": 1.56, + "learning_rate": 3.895148082336313e-05, + "loss": 2.8652, + "step": 3105 + }, + { + "epoch": 1.56, + "learning_rate": 3.89187464044912e-05, + "loss": 3.0331, + "step": 3110 + }, + { + "epoch": 1.56, + "learning_rate": 3.8885977366093905e-05, + "loss": 2.7334, + "step": 3115 + }, + { + "epoch": 1.57, + "learning_rate": 3.885317378967633e-05, + "loss": 2.6026, + "step": 3120 + }, + { + "epoch": 1.57, + "learning_rate": 3.882033575682945e-05, + "loss": 2.831, + "step": 3125 + }, + { + "epoch": 1.57, + "learning_rate": 3.878746334922996e-05, + "loss": 2.5585, + "step": 3130 + }, + { + "epoch": 1.57, + "learning_rate": 3.875455664864005e-05, + "loss": 2.762, + "step": 3135 + }, + { + "epoch": 1.58, + "learning_rate": 3.8721615736907205e-05, + "loss": 2.769, + "step": 3140 + }, + { + "epoch": 1.58, + "learning_rate": 3.868864069596399e-05, + "loss": 2.6496, + "step": 3145 + }, + { + "epoch": 1.58, + "learning_rate": 3.8655631607827876e-05, + "loss": 2.486, + "step": 3150 + }, + { + "epoch": 1.58, + "learning_rate": 3.8622588554601e-05, + "loss": 2.8387, + "step": 3155 + }, + { + "epoch": 1.59, + "learning_rate": 3.858951161847001e-05, + "loss": 2.816, + "step": 3160 + }, + { + "epoch": 1.59, + "learning_rate": 3.855640088170579e-05, + "loss": 2.8128, + "step": 3165 + }, + { + "epoch": 1.59, + "learning_rate": 3.8523256426663313e-05, + "loss": 2.5875, + "step": 3170 + }, + { + "epoch": 1.59, + "learning_rate": 3.8490078335781423e-05, + "loss": 2.5853, + "step": 3175 + }, + { + "epoch": 1.6, + "learning_rate": 3.845686669158263e-05, + "loss": 2.7638, + "step": 3180 + }, + { + "epoch": 1.6, + "learning_rate": 3.842362157667287e-05, + "loss": 2.8281, + "step": 3185 + }, + { + "epoch": 1.6, + "learning_rate": 3.839700144139754e-05, + "loss": 2.3574, + "step": 3190 + }, + { + "epoch": 1.6, + "learning_rate": 3.836369628764067e-05, + "loss": 2.5533, + "step": 3195 + }, + { + "epoch": 1.61, + "learning_rate": 3.833035789491177e-05, + "loss": 2.4513, + 
"step": 3200 + }, + { + "epoch": 1.61, + "learning_rate": 3.8296986346132036e-05, + "loss": 2.567, + "step": 3205 + }, + { + "epoch": 1.61, + "learning_rate": 3.826358172430516e-05, + "loss": 2.6501, + "step": 3210 + }, + { + "epoch": 1.61, + "learning_rate": 3.823014411251708e-05, + "loss": 2.6927, + "step": 3215 + }, + { + "epoch": 1.62, + "learning_rate": 3.819667359393578e-05, + "loss": 2.6094, + "step": 3220 + }, + { + "epoch": 1.62, + "learning_rate": 3.816317025181111e-05, + "loss": 2.6776, + "step": 3225 + }, + { + "epoch": 1.62, + "learning_rate": 3.8129634169474563e-05, + "loss": 2.5833, + "step": 3230 + }, + { + "epoch": 1.62, + "learning_rate": 3.809606543033905e-05, + "loss": 2.6483, + "step": 3235 + }, + { + "epoch": 1.63, + "learning_rate": 3.8062464117898724e-05, + "loss": 2.4814, + "step": 3240 + }, + { + "epoch": 1.63, + "learning_rate": 3.802883031572874e-05, + "loss": 2.5217, + "step": 3245 + }, + { + "epoch": 1.63, + "learning_rate": 3.799516410748506e-05, + "loss": 2.5127, + "step": 3250 + }, + { + "epoch": 1.63, + "learning_rate": 3.796146557690428e-05, + "loss": 2.9325, + "step": 3255 + }, + { + "epoch": 1.64, + "learning_rate": 3.792773480780335e-05, + "loss": 2.4567, + "step": 3260 + }, + { + "epoch": 1.64, + "learning_rate": 3.789397188407944e-05, + "loss": 2.6045, + "step": 3265 + }, + { + "epoch": 1.64, + "learning_rate": 3.786017688970967e-05, + "loss": 2.5031, + "step": 3270 + }, + { + "epoch": 1.64, + "learning_rate": 3.782634990875094e-05, + "loss": 2.7692, + "step": 3275 + }, + { + "epoch": 1.65, + "learning_rate": 3.779249102533972e-05, + "loss": 2.6893, + "step": 3280 + }, + { + "epoch": 1.65, + "learning_rate": 3.7758600323691806e-05, + "loss": 2.604, + "step": 3285 + }, + { + "epoch": 1.65, + "learning_rate": 3.7724677888102145e-05, + "loss": 2.6633, + "step": 3290 + }, + { + "epoch": 1.65, + "learning_rate": 3.769072380294463e-05, + "loss": 2.4804, + "step": 3295 + }, + { + "epoch": 1.66, + "learning_rate": 
3.765673815267184e-05, + "loss": 2.5951, + "step": 3300 + }, + { + "epoch": 1.66, + "learning_rate": 3.76227210218149e-05, + "loss": 2.598, + "step": 3305 + }, + { + "epoch": 1.66, + "learning_rate": 3.758867249498321e-05, + "loss": 2.8652, + "step": 3310 + }, + { + "epoch": 1.66, + "learning_rate": 3.7554592656864285e-05, + "loss": 2.5312, + "step": 3315 + }, + { + "epoch": 1.67, + "learning_rate": 3.752048159222349e-05, + "loss": 2.7432, + "step": 3320 + }, + { + "epoch": 1.67, + "learning_rate": 3.748633938590388e-05, + "loss": 2.64, + "step": 3325 + }, + { + "epoch": 1.67, + "learning_rate": 3.745216612282596e-05, + "loss": 2.751, + "step": 3330 + }, + { + "epoch": 1.67, + "learning_rate": 3.741796188798747e-05, + "loss": 2.7636, + "step": 3335 + }, + { + "epoch": 1.68, + "learning_rate": 3.738372676646321e-05, + "loss": 2.8103, + "step": 3340 + }, + { + "epoch": 1.68, + "learning_rate": 3.7349460843404796e-05, + "loss": 2.6672, + "step": 3345 + }, + { + "epoch": 1.68, + "learning_rate": 3.731516420404043e-05, + "loss": 2.5272, + "step": 3350 + }, + { + "epoch": 1.68, + "learning_rate": 3.728083693367474e-05, + "loss": 2.6674, + "step": 3355 + }, + { + "epoch": 1.69, + "learning_rate": 3.724647911768854e-05, + "loss": 2.708, + "step": 3360 + }, + { + "epoch": 1.69, + "learning_rate": 3.72120908415386e-05, + "loss": 2.3902, + "step": 3365 + }, + { + "epoch": 1.69, + "learning_rate": 3.7177672190757476e-05, + "loss": 2.8144, + "step": 3370 + }, + { + "epoch": 1.69, + "learning_rate": 3.714322325095325e-05, + "loss": 2.8525, + "step": 3375 + }, + { + "epoch": 1.7, + "learning_rate": 3.7108744107809364e-05, + "loss": 2.4671, + "step": 3380 + }, + { + "epoch": 1.7, + "learning_rate": 3.7074234847084363e-05, + "loss": 2.3723, + "step": 3385 + }, + { + "epoch": 1.7, + "learning_rate": 3.703969555461173e-05, + "loss": 2.8825, + "step": 3390 + }, + { + "epoch": 1.7, + "learning_rate": 3.700512631629961e-05, + "loss": 2.768, + "step": 3395 + }, + { + "epoch": 1.71, + 
"learning_rate": 3.6970527218130644e-05, + "loss": 2.4601, + "step": 3400 + }, + { + "epoch": 1.71, + "learning_rate": 3.693589834616176e-05, + "loss": 2.7482, + "step": 3405 + }, + { + "epoch": 1.71, + "learning_rate": 3.6901239786523914e-05, + "loss": 2.4536, + "step": 3410 + }, + { + "epoch": 1.71, + "learning_rate": 3.686655162542192e-05, + "loss": 2.571, + "step": 3415 + }, + { + "epoch": 1.72, + "learning_rate": 3.683183394913422e-05, + "loss": 2.5796, + "step": 3420 + }, + { + "epoch": 1.72, + "learning_rate": 3.6797086844012654e-05, + "loss": 2.7312, + "step": 3425 + }, + { + "epoch": 1.72, + "learning_rate": 3.676231039648227e-05, + "loss": 2.5584, + "step": 3430 + }, + { + "epoch": 1.72, + "learning_rate": 3.67275046930411e-05, + "loss": 2.6298, + "step": 3435 + }, + { + "epoch": 1.73, + "learning_rate": 3.669266982025993e-05, + "loss": 2.628, + "step": 3440 + }, + { + "epoch": 1.73, + "learning_rate": 3.6657805864782116e-05, + "loss": 2.5943, + "step": 3445 + }, + { + "epoch": 1.73, + "learning_rate": 3.662291291332333e-05, + "loss": 2.6121, + "step": 3450 + }, + { + "epoch": 1.73, + "learning_rate": 3.658799105267138e-05, + "loss": 2.6501, + "step": 3455 + }, + { + "epoch": 1.74, + "learning_rate": 3.655304036968597e-05, + "loss": 2.5708, + "step": 3460 + }, + { + "epoch": 1.74, + "learning_rate": 3.651806095129849e-05, + "loss": 2.8039, + "step": 3465 + }, + { + "epoch": 1.74, + "learning_rate": 3.648305288451183e-05, + "loss": 2.8159, + "step": 3470 + }, + { + "epoch": 1.74, + "learning_rate": 3.64480162564001e-05, + "loss": 2.6506, + "step": 3475 + }, + { + "epoch": 1.75, + "learning_rate": 3.641295115410847e-05, + "loss": 2.8982, + "step": 3480 + }, + { + "epoch": 1.75, + "learning_rate": 3.637785766485291e-05, + "loss": 2.8018, + "step": 3485 + }, + { + "epoch": 1.75, + "learning_rate": 3.634273587592003e-05, + "loss": 2.5278, + "step": 3490 + }, + { + "epoch": 1.75, + "learning_rate": 3.630758587466681e-05, + "loss": 2.8937, + "step": 3495 + }, + 
{ + "epoch": 1.76, + "learning_rate": 3.6272407748520394e-05, + "loss": 2.1922, + "step": 3500 + }, + { + "epoch": 1.76, + "learning_rate": 3.623720158497789e-05, + "loss": 2.4288, + "step": 3505 + }, + { + "epoch": 1.76, + "learning_rate": 3.620196747160613e-05, + "loss": 2.9423, + "step": 3510 + }, + { + "epoch": 1.76, + "learning_rate": 3.61667054960415e-05, + "loss": 2.5559, + "step": 3515 + }, + { + "epoch": 1.77, + "learning_rate": 3.613141574598965e-05, + "loss": 2.7228, + "step": 3520 + }, + { + "epoch": 1.77, + "learning_rate": 3.6096098309225325e-05, + "loss": 2.6933, + "step": 3525 + }, + { + "epoch": 1.77, + "learning_rate": 3.606075327359212e-05, + "loss": 2.5865, + "step": 3530 + }, + { + "epoch": 1.77, + "learning_rate": 3.602538072700229e-05, + "loss": 2.5846, + "step": 3535 + }, + { + "epoch": 1.78, + "learning_rate": 3.598998075743653e-05, + "loss": 2.8847, + "step": 3540 + }, + { + "epoch": 1.78, + "learning_rate": 3.595455345294372e-05, + "loss": 2.5386, + "step": 3545 + }, + { + "epoch": 1.78, + "learning_rate": 3.5919098901640735e-05, + "loss": 2.5932, + "step": 3550 + }, + { + "epoch": 1.78, + "learning_rate": 3.588361719171223e-05, + "loss": 2.4481, + "step": 3555 + }, + { + "epoch": 1.79, + "learning_rate": 3.584810841141039e-05, + "loss": 2.5773, + "step": 3560 + }, + { + "epoch": 1.79, + "learning_rate": 3.581257264905476e-05, + "loss": 2.6305, + "step": 3565 + }, + { + "epoch": 1.79, + "learning_rate": 3.5777009993031955e-05, + "loss": 2.7, + "step": 3570 + }, + { + "epoch": 1.79, + "learning_rate": 3.574142053179553e-05, + "loss": 2.4408, + "step": 3575 + }, + { + "epoch": 1.8, + "learning_rate": 3.5705804353865677e-05, + "loss": 2.7292, + "step": 3580 + }, + { + "epoch": 1.8, + "learning_rate": 3.567016154782906e-05, + "loss": 2.4371, + "step": 3585 + }, + { + "epoch": 1.8, + "learning_rate": 3.563449220233855e-05, + "loss": 2.7977, + "step": 3590 + }, + { + "epoch": 1.8, + "learning_rate": 3.559879640611305e-05, + "loss": 2.7357, + 
"step": 3595 + }, + { + "epoch": 1.81, + "learning_rate": 3.5563074247937244e-05, + "loss": 2.6255, + "step": 3600 + }, + { + "epoch": 1.81, + "learning_rate": 3.552732581666139e-05, + "loss": 2.8134, + "step": 3605 + }, + { + "epoch": 1.81, + "learning_rate": 3.549155120120109e-05, + "loss": 2.702, + "step": 3610 + }, + { + "epoch": 1.81, + "learning_rate": 3.545575049053707e-05, + "loss": 2.4854, + "step": 3615 + }, + { + "epoch": 1.82, + "learning_rate": 3.541992377371497e-05, + "loss": 2.7596, + "step": 3620 + }, + { + "epoch": 1.82, + "learning_rate": 3.53840711398451e-05, + "loss": 2.6891, + "step": 3625 + }, + { + "epoch": 1.82, + "learning_rate": 3.5348192678102254e-05, + "loss": 2.5619, + "step": 3630 + }, + { + "epoch": 1.82, + "learning_rate": 3.531228847772544e-05, + "loss": 2.5719, + "step": 3635 + }, + { + "epoch": 1.83, + "learning_rate": 3.52763586280177e-05, + "loss": 2.6297, + "step": 3640 + }, + { + "epoch": 1.83, + "learning_rate": 3.524040321834589e-05, + "loss": 2.5114, + "step": 3645 + }, + { + "epoch": 1.83, + "learning_rate": 3.52044223381404e-05, + "loss": 2.8536, + "step": 3650 + }, + { + "epoch": 1.83, + "learning_rate": 3.516841607689501e-05, + "loss": 2.5271, + "step": 3655 + }, + { + "epoch": 1.84, + "learning_rate": 3.51323845241666e-05, + "loss": 2.5654, + "step": 3660 + }, + { + "epoch": 1.84, + "learning_rate": 3.509632776957497e-05, + "loss": 2.6269, + "step": 3665 + }, + { + "epoch": 1.84, + "learning_rate": 3.50602459028026e-05, + "loss": 2.4401, + "step": 3670 + }, + { + "epoch": 1.84, + "learning_rate": 3.502413901359445e-05, + "loss": 2.4368, + "step": 3675 + }, + { + "epoch": 1.85, + "learning_rate": 3.498800719175768e-05, + "loss": 2.6087, + "step": 3680 + }, + { + "epoch": 1.85, + "learning_rate": 3.4951850527161495e-05, + "loss": 2.5927, + "step": 3685 + }, + { + "epoch": 1.85, + "learning_rate": 3.4915669109736876e-05, + "loss": 2.8352, + "step": 3690 + }, + { + "epoch": 1.85, + "learning_rate": 3.487946302947637e-05, + 
"loss": 2.8485, + "step": 3695 + }, + { + "epoch": 1.86, + "learning_rate": 3.484323237643389e-05, + "loss": 2.6611, + "step": 3700 + }, + { + "epoch": 1.86, + "learning_rate": 3.4806977240724423e-05, + "loss": 2.5251, + "step": 3705 + }, + { + "epoch": 1.86, + "learning_rate": 3.47706977125239e-05, + "loss": 2.6043, + "step": 3710 + }, + { + "epoch": 1.86, + "learning_rate": 3.473439388206887e-05, + "loss": 2.5911, + "step": 3715 + }, + { + "epoch": 1.87, + "learning_rate": 3.469806583965639e-05, + "loss": 2.7092, + "step": 3720 + }, + { + "epoch": 1.87, + "learning_rate": 3.466171367564368e-05, + "loss": 2.4572, + "step": 3725 + }, + { + "epoch": 1.87, + "learning_rate": 3.462533748044801e-05, + "loss": 2.464, + "step": 3730 + }, + { + "epoch": 1.87, + "learning_rate": 3.458893734454636e-05, + "loss": 2.8654, + "step": 3735 + }, + { + "epoch": 1.88, + "learning_rate": 3.455251335847531e-05, + "loss": 2.752, + "step": 3740 + }, + { + "epoch": 1.88, + "learning_rate": 3.451606561283074e-05, + "loss": 2.59, + "step": 3745 + }, + { + "epoch": 1.88, + "learning_rate": 3.447959419826763e-05, + "loss": 2.4732, + "step": 3750 + }, + { + "epoch": 1.88, + "learning_rate": 3.444309920549983e-05, + "loss": 2.7611, + "step": 3755 + }, + { + "epoch": 1.89, + "learning_rate": 3.440658072529983e-05, + "loss": 2.5923, + "step": 3760 + }, + { + "epoch": 1.89, + "learning_rate": 3.437003884849854e-05, + "loss": 2.5405, + "step": 3765 + }, + { + "epoch": 1.89, + "learning_rate": 3.433347366598506e-05, + "loss": 2.6368, + "step": 3770 + }, + { + "epoch": 1.89, + "learning_rate": 3.429688526870649e-05, + "loss": 2.5877, + "step": 3775 + }, + { + "epoch": 1.9, + "learning_rate": 3.4260273747667635e-05, + "loss": 2.5978, + "step": 3780 + }, + { + "epoch": 1.9, + "learning_rate": 3.422363919393082e-05, + "loss": 2.7375, + "step": 3785 + }, + { + "epoch": 1.9, + "learning_rate": 3.418698169861567e-05, + "loss": 2.4161, + "step": 3790 + }, + { + "epoch": 1.9, + "learning_rate": 
3.415030135289884e-05, + "loss": 2.3959, + "step": 3795 + }, + { + "epoch": 1.91, + "learning_rate": 3.4113598248013875e-05, + "loss": 2.9642, + "step": 3800 + }, + { + "epoch": 1.91, + "learning_rate": 3.40768724752509e-05, + "loss": 2.7313, + "step": 3805 + }, + { + "epoch": 1.91, + "learning_rate": 3.404012412595639e-05, + "loss": 2.6311, + "step": 3810 + }, + { + "epoch": 1.91, + "learning_rate": 3.400335329153304e-05, + "loss": 2.6268, + "step": 3815 + }, + { + "epoch": 1.92, + "learning_rate": 3.396656006343939e-05, + "loss": 2.5252, + "step": 3820 + }, + { + "epoch": 1.92, + "learning_rate": 3.392974453318975e-05, + "loss": 2.5863, + "step": 3825 + }, + { + "epoch": 1.92, + "learning_rate": 3.3892906792353886e-05, + "loss": 2.4848, + "step": 3830 + }, + { + "epoch": 1.92, + "learning_rate": 3.3856046932556775e-05, + "loss": 2.5517, + "step": 3835 + }, + { + "epoch": 1.93, + "learning_rate": 3.3819165045478426e-05, + "loss": 2.6679, + "step": 3840 + }, + { + "epoch": 1.93, + "learning_rate": 3.3782261222853654e-05, + "loss": 2.4474, + "step": 3845 + }, + { + "epoch": 1.93, + "learning_rate": 3.37453355564718e-05, + "loss": 2.5378, + "step": 3850 + }, + { + "epoch": 1.93, + "learning_rate": 3.3708388138176584e-05, + "loss": 2.8168, + "step": 3855 + }, + { + "epoch": 1.94, + "learning_rate": 3.367141905986578e-05, + "loss": 2.6556, + "step": 3860 + }, + { + "epoch": 1.94, + "learning_rate": 3.3634428413491054e-05, + "loss": 2.5122, + "step": 3865 + }, + { + "epoch": 1.94, + "learning_rate": 3.359741629105773e-05, + "loss": 2.7044, + "step": 3870 + }, + { + "epoch": 1.94, + "learning_rate": 3.3560382784624524e-05, + "loss": 2.6193, + "step": 3875 + }, + { + "epoch": 1.95, + "learning_rate": 3.352332798630336e-05, + "loss": 3.0051, + "step": 3880 + }, + { + "epoch": 1.95, + "learning_rate": 3.3486251988259134e-05, + "loss": 2.5351, + "step": 3885 + }, + { + "epoch": 1.95, + "learning_rate": 3.344915488270942e-05, + "loss": 2.7273, + "step": 3890 + }, + { + 
"epoch": 1.95, + "learning_rate": 3.341203676192433e-05, + "loss": 2.3833, + "step": 3895 + }, + { + "epoch": 1.96, + "learning_rate": 3.3374897718226236e-05, + "loss": 2.5073, + "step": 3900 + }, + { + "epoch": 1.96, + "learning_rate": 3.333773784398957e-05, + "loss": 2.7216, + "step": 3905 + }, + { + "epoch": 1.96, + "learning_rate": 3.330055723164055e-05, + "loss": 2.6666, + "step": 3910 + }, + { + "epoch": 1.96, + "learning_rate": 3.326335597365698e-05, + "loss": 2.4959, + "step": 3915 + }, + { + "epoch": 1.97, + "learning_rate": 3.322613416256802e-05, + "loss": 2.8799, + "step": 3920 + }, + { + "epoch": 1.97, + "learning_rate": 3.3188891890953956e-05, + "loss": 2.6041, + "step": 3925 + }, + { + "epoch": 1.97, + "learning_rate": 3.315162925144595e-05, + "loss": 2.7423, + "step": 3930 + }, + { + "epoch": 1.97, + "learning_rate": 3.3114346336725834e-05, + "loss": 2.5765, + "step": 3935 + }, + { + "epoch": 1.98, + "learning_rate": 3.3077043239525874e-05, + "loss": 2.152, + "step": 3940 + }, + { + "epoch": 1.98, + "learning_rate": 3.303972005262852e-05, + "loss": 2.6474, + "step": 3945 + }, + { + "epoch": 1.98, + "learning_rate": 3.3002376868866206e-05, + "loss": 2.473, + "step": 3950 + }, + { + "epoch": 1.98, + "learning_rate": 3.296501378112109e-05, + "loss": 2.6779, + "step": 3955 + }, + { + "epoch": 1.99, + "learning_rate": 3.292763088232485e-05, + "loss": 2.7439, + "step": 3960 + }, + { + "epoch": 1.99, + "learning_rate": 3.289022826545844e-05, + "loss": 2.8874, + "step": 3965 + }, + { + "epoch": 1.99, + "learning_rate": 3.2852806023551834e-05, + "loss": 2.7191, + "step": 3970 + }, + { + "epoch": 1.99, + "learning_rate": 3.281536424968383e-05, + "loss": 2.563, + "step": 3975 + }, + { + "epoch": 2.0, + "learning_rate": 3.2777903036981836e-05, + "loss": 2.7884, + "step": 3980 + }, + { + "epoch": 2.0, + "learning_rate": 3.274042247862158e-05, + "loss": 2.5221, + "step": 3985 + }, + { + "epoch": 2.0, + "learning_rate": 3.270292266782689e-05, + "loss": 2.462, + 
"step": 3990 + }, + { + "epoch": 2.0, + "learning_rate": 3.266540369786953e-05, + "loss": 2.5059, + "step": 3995 + }, + { + "epoch": 2.01, + "learning_rate": 3.262786566206888e-05, + "loss": 2.688, + "step": 4000 + }, + { + "epoch": 2.01, + "learning_rate": 3.259030865379174e-05, + "loss": 2.7194, + "step": 4005 + }, + { + "epoch": 2.01, + "learning_rate": 3.255273276645214e-05, + "loss": 2.4145, + "step": 4010 + }, + { + "epoch": 2.01, + "learning_rate": 3.251513809351101e-05, + "loss": 2.5469, + "step": 4015 + }, + { + "epoch": 2.02, + "learning_rate": 3.247752472847605e-05, + "loss": 2.7916, + "step": 4020 + }, + { + "epoch": 2.02, + "learning_rate": 3.243989276490143e-05, + "loss": 2.5973, + "step": 4025 + }, + { + "epoch": 2.02, + "learning_rate": 3.240224229638758e-05, + "loss": 2.6732, + "step": 4030 + }, + { + "epoch": 2.02, + "learning_rate": 3.236457341658097e-05, + "loss": 2.2159, + "step": 4035 + }, + { + "epoch": 2.03, + "learning_rate": 3.232688621917386e-05, + "loss": 2.6783, + "step": 4040 + }, + { + "epoch": 2.03, + "learning_rate": 3.2289180797904055e-05, + "loss": 2.8193, + "step": 4045 + }, + { + "epoch": 2.03, + "learning_rate": 3.22514572465547e-05, + "loss": 2.3885, + "step": 4050 + }, + { + "epoch": 2.03, + "learning_rate": 3.221371565895404e-05, + "loss": 2.6288, + "step": 4055 + }, + { + "epoch": 2.04, + "learning_rate": 3.217595612897516e-05, + "loss": 2.5372, + "step": 4060 + }, + { + "epoch": 2.04, + "learning_rate": 3.21381787505358e-05, + "loss": 2.7257, + "step": 4065 + }, + { + "epoch": 2.04, + "learning_rate": 3.210038361759807e-05, + "loss": 2.6329, + "step": 4070 + }, + { + "epoch": 2.04, + "learning_rate": 3.206257082416825e-05, + "loss": 2.6518, + "step": 4075 + }, + { + "epoch": 2.05, + "learning_rate": 3.2024740464296544e-05, + "loss": 2.4218, + "step": 4080 + }, + { + "epoch": 2.05, + "learning_rate": 3.198689263207686e-05, + "loss": 2.7266, + "step": 4085 + }, + { + "epoch": 2.05, + "learning_rate": 3.1949027421646546e-05, 
+ "loss": 2.4466, + "step": 4090 + }, + { + "epoch": 2.05, + "learning_rate": 3.1911144927186185e-05, + "loss": 2.6014, + "step": 4095 + }, + { + "epoch": 2.06, + "learning_rate": 3.1873245242919354e-05, + "loss": 2.7072, + "step": 4100 + }, + { + "epoch": 2.06, + "learning_rate": 3.183532846311236e-05, + "loss": 2.5211, + "step": 4105 + }, + { + "epoch": 2.06, + "learning_rate": 3.179739468207406e-05, + "loss": 2.5787, + "step": 4110 + }, + { + "epoch": 2.06, + "learning_rate": 3.175944399415559e-05, + "loss": 2.5141, + "step": 4115 + }, + { + "epoch": 2.07, + "learning_rate": 3.1721476493750134e-05, + "loss": 2.6807, + "step": 4120 + }, + { + "epoch": 2.07, + "learning_rate": 3.1683492275292695e-05, + "loss": 2.5611, + "step": 4125 + }, + { + "epoch": 2.07, + "learning_rate": 3.164549143325985e-05, + "loss": 2.5864, + "step": 4130 + }, + { + "epoch": 2.08, + "learning_rate": 3.160747406216953e-05, + "loss": 2.7378, + "step": 4135 + }, + { + "epoch": 2.08, + "learning_rate": 3.156944025658079e-05, + "loss": 2.7334, + "step": 4140 + }, + { + "epoch": 2.08, + "learning_rate": 3.153139011109354e-05, + "loss": 2.4465, + "step": 4145 + }, + { + "epoch": 2.08, + "learning_rate": 3.149332372034834e-05, + "loss": 2.5482, + "step": 4150 + }, + { + "epoch": 2.09, + "learning_rate": 3.145524117902617e-05, + "loss": 2.7633, + "step": 4155 + }, + { + "epoch": 2.09, + "learning_rate": 3.141714258184816e-05, + "loss": 2.6796, + "step": 4160 + }, + { + "epoch": 2.09, + "learning_rate": 3.137902802357538e-05, + "loss": 2.6622, + "step": 4165 + }, + { + "epoch": 2.09, + "learning_rate": 3.134089759900861e-05, + "loss": 2.9277, + "step": 4170 + }, + { + "epoch": 2.1, + "learning_rate": 3.130275140298808e-05, + "loss": 2.351, + "step": 4175 + }, + { + "epoch": 2.1, + "learning_rate": 3.1264589530393263e-05, + "loss": 2.654, + "step": 4180 + }, + { + "epoch": 2.1, + "learning_rate": 3.12264120761426e-05, + "loss": 2.6229, + "step": 4185 + }, + { + "epoch": 2.1, + "learning_rate": 
3.118821913519333e-05, + "loss": 2.6415, + "step": 4190 + }, + { + "epoch": 2.11, + "learning_rate": 3.115001080254115e-05, + "loss": 2.6029, + "step": 4195 + }, + { + "epoch": 2.11, + "learning_rate": 3.1111787173220095e-05, + "loss": 2.6775, + "step": 4200 + }, + { + "epoch": 2.11, + "learning_rate": 3.107354834230223e-05, + "loss": 2.7393, + "step": 4205 + }, + { + "epoch": 2.11, + "learning_rate": 3.1035294404897396e-05, + "loss": 2.5886, + "step": 4210 + }, + { + "epoch": 2.12, + "learning_rate": 3.099702545615307e-05, + "loss": 2.3812, + "step": 4215 + }, + { + "epoch": 2.12, + "learning_rate": 3.0958741591254026e-05, + "loss": 3.1271, + "step": 4220 + }, + { + "epoch": 2.12, + "learning_rate": 3.0920442905422145e-05, + "loss": 2.7758, + "step": 4225 + }, + { + "epoch": 2.12, + "learning_rate": 3.0882129493916167e-05, + "loss": 2.7682, + "step": 4230 + }, + { + "epoch": 2.13, + "learning_rate": 3.0843801452031466e-05, + "loss": 2.2826, + "step": 4235 + }, + { + "epoch": 2.13, + "learning_rate": 3.0805458875099804e-05, + "loss": 2.5934, + "step": 4240 + }, + { + "epoch": 2.13, + "learning_rate": 3.0767101858489103e-05, + "loss": 2.7693, + "step": 4245 + }, + { + "epoch": 2.13, + "learning_rate": 3.072873049760319e-05, + "loss": 2.6955, + "step": 4250 + }, + { + "epoch": 2.14, + "learning_rate": 3.0690344887881565e-05, + "loss": 2.7344, + "step": 4255 + }, + { + "epoch": 2.14, + "learning_rate": 3.0651945124799185e-05, + "loss": 2.6215, + "step": 4260 + }, + { + "epoch": 2.14, + "learning_rate": 3.061353130386619e-05, + "loss": 2.5794, + "step": 4265 + }, + { + "epoch": 2.14, + "learning_rate": 3.05751035206277e-05, + "loss": 2.671, + "step": 4270 + }, + { + "epoch": 2.15, + "learning_rate": 3.0536661870663576e-05, + "loss": 2.7265, + "step": 4275 + }, + { + "epoch": 2.15, + "learning_rate": 3.0498206449588136e-05, + "loss": 2.9018, + "step": 4280 + }, + { + "epoch": 2.15, + "learning_rate": 3.045973735304996e-05, + "loss": 2.4823, + "step": 4285 + }, + { + 
"epoch": 2.15, + "learning_rate": 3.0421254676731664e-05, + "loss": 2.5454, + "step": 4290 + }, + { + "epoch": 2.16, + "learning_rate": 3.0382758516349623e-05, + "loss": 2.7888, + "step": 4295 + }, + { + "epoch": 2.16, + "learning_rate": 3.0344248967653754e-05, + "loss": 2.596, + "step": 4300 + }, + { + "epoch": 2.16, + "learning_rate": 3.030572612642727e-05, + "loss": 2.7905, + "step": 4305 + }, + { + "epoch": 2.16, + "learning_rate": 3.0267190088486452e-05, + "loss": 2.5676, + "step": 4310 + }, + { + "epoch": 2.17, + "learning_rate": 3.0228640949680388e-05, + "loss": 2.5836, + "step": 4315 + }, + { + "epoch": 2.17, + "learning_rate": 3.0190078805890786e-05, + "loss": 2.7138, + "step": 4320 + }, + { + "epoch": 2.17, + "learning_rate": 3.015150375303168e-05, + "loss": 2.4747, + "step": 4325 + }, + { + "epoch": 2.17, + "learning_rate": 3.011291588704919e-05, + "loss": 2.5188, + "step": 4330 + }, + { + "epoch": 2.18, + "learning_rate": 3.0074315303921353e-05, + "loss": 2.2952, + "step": 4335 + }, + { + "epoch": 2.18, + "learning_rate": 3.0035702099657787e-05, + "loss": 2.4875, + "step": 4340 + }, + { + "epoch": 2.18, + "learning_rate": 2.9997076370299544e-05, + "loss": 2.5085, + "step": 4345 + }, + { + "epoch": 2.18, + "learning_rate": 2.9958438211918805e-05, + "loss": 2.7452, + "step": 4350 + }, + { + "epoch": 2.19, + "learning_rate": 2.9919787720618676e-05, + "loss": 2.334, + "step": 4355 + }, + { + "epoch": 2.19, + "learning_rate": 2.9881124992532933e-05, + "loss": 2.6904, + "step": 4360 + }, + { + "epoch": 2.19, + "learning_rate": 2.984245012382579e-05, + "loss": 2.5706, + "step": 4365 + }, + { + "epoch": 2.19, + "learning_rate": 2.980376321069165e-05, + "loss": 2.6657, + "step": 4370 + }, + { + "epoch": 2.2, + "learning_rate": 2.976506434935489e-05, + "loss": 2.4117, + "step": 4375 + }, + { + "epoch": 2.2, + "learning_rate": 2.9726353636069582e-05, + "loss": 2.4446, + "step": 4380 + }, + { + "epoch": 2.2, + "learning_rate": 2.968763116711931e-05, + "loss": 
2.5681, + "step": 4385 + }, + { + "epoch": 2.2, + "learning_rate": 2.964889703881686e-05, + "loss": 2.7244, + "step": 4390 + }, + { + "epoch": 2.21, + "learning_rate": 2.9610151347504044e-05, + "loss": 2.577, + "step": 4395 + }, + { + "epoch": 2.21, + "learning_rate": 2.957139418955143e-05, + "loss": 2.6176, + "step": 4400 + }, + { + "epoch": 2.21, + "learning_rate": 2.9532625661358105e-05, + "loss": 2.5708, + "step": 4405 + }, + { + "epoch": 2.21, + "learning_rate": 2.949384585935143e-05, + "loss": 2.4873, + "step": 4410 + }, + { + "epoch": 2.22, + "learning_rate": 2.9455054879986797e-05, + "loss": 2.5953, + "step": 4415 + }, + { + "epoch": 2.22, + "learning_rate": 2.941625281974743e-05, + "loss": 2.5578, + "step": 4420 + }, + { + "epoch": 2.22, + "learning_rate": 2.93774397751441e-05, + "loss": 2.5782, + "step": 4425 + }, + { + "epoch": 2.22, + "learning_rate": 2.9338615842714883e-05, + "loss": 2.8227, + "step": 4430 + }, + { + "epoch": 2.23, + "learning_rate": 2.9299781119024956e-05, + "loss": 2.4317, + "step": 4435 + }, + { + "epoch": 2.23, + "learning_rate": 2.926093570066633e-05, + "loss": 2.7054, + "step": 4440 + }, + { + "epoch": 2.23, + "learning_rate": 2.9222079684257608e-05, + "loss": 2.7207, + "step": 4445 + }, + { + "epoch": 2.23, + "learning_rate": 2.9183213166443778e-05, + "loss": 2.6732, + "step": 4450 + }, + { + "epoch": 2.24, + "learning_rate": 2.9144336243895927e-05, + "loss": 2.4768, + "step": 4455 + }, + { + "epoch": 2.24, + "learning_rate": 2.910544901331101e-05, + "loss": 2.5146, + "step": 4460 + }, + { + "epoch": 2.24, + "learning_rate": 2.9066551571411645e-05, + "loss": 2.4941, + "step": 4465 + }, + { + "epoch": 2.24, + "learning_rate": 2.902764401494584e-05, + "loss": 2.1425, + "step": 4470 + }, + { + "epoch": 2.25, + "learning_rate": 2.898872644068676e-05, + "loss": 2.6335, + "step": 4475 + }, + { + "epoch": 2.25, + "learning_rate": 2.8949798945432483e-05, + "loss": 2.4093, + "step": 4480 + }, + { + "epoch": 2.25, + "learning_rate": 
2.8910861626005776e-05, + "loss": 2.4006, + "step": 4485 + }, + { + "epoch": 2.25, + "learning_rate": 2.8871914579253824e-05, + "loss": 2.4762, + "step": 4490 + }, + { + "epoch": 2.26, + "learning_rate": 2.8832957902048014e-05, + "loss": 2.6309, + "step": 4495 + }, + { + "epoch": 2.26, + "learning_rate": 2.8793991691283684e-05, + "loss": 2.7734, + "step": 4500 + }, + { + "epoch": 2.26, + "learning_rate": 2.87550160438799e-05, + "loss": 2.5444, + "step": 4505 + }, + { + "epoch": 2.26, + "learning_rate": 2.871603105677917e-05, + "loss": 2.5277, + "step": 4510 + }, + { + "epoch": 2.27, + "learning_rate": 2.8677036826947264e-05, + "loss": 2.5775, + "step": 4515 + }, + { + "epoch": 2.27, + "learning_rate": 2.863803345137291e-05, + "loss": 2.4524, + "step": 4520 + }, + { + "epoch": 2.27, + "learning_rate": 2.8599021027067608e-05, + "loss": 2.572, + "step": 4525 + }, + { + "epoch": 2.27, + "learning_rate": 2.855999965106536e-05, + "loss": 2.8112, + "step": 4530 + }, + { + "epoch": 2.28, + "learning_rate": 2.8520969420422427e-05, + "loss": 2.65, + "step": 4535 + }, + { + "epoch": 2.28, + "learning_rate": 2.8481930432217096e-05, + "loss": 2.4675, + "step": 4540 + }, + { + "epoch": 2.28, + "learning_rate": 2.844288278354943e-05, + "loss": 2.668, + "step": 4545 + }, + { + "epoch": 2.28, + "learning_rate": 2.8403826571541046e-05, + "loss": 2.7135, + "step": 4550 + }, + { + "epoch": 2.29, + "learning_rate": 2.8364761893334858e-05, + "loss": 2.5212, + "step": 4555 + }, + { + "epoch": 2.29, + "learning_rate": 2.8325688846094817e-05, + "loss": 2.698, + "step": 4560 + }, + { + "epoch": 2.29, + "learning_rate": 2.828660752700572e-05, + "loss": 2.6517, + "step": 4565 + }, + { + "epoch": 2.29, + "learning_rate": 2.8247518033272924e-05, + "loss": 2.6687, + "step": 4570 + }, + { + "epoch": 2.3, + "learning_rate": 2.8208420462122105e-05, + "loss": 2.4934, + "step": 4575 + }, + { + "epoch": 2.3, + "learning_rate": 2.816931491079906e-05, + "loss": 2.3762, + "step": 4580 + }, + { + "epoch": 
2.3, + "learning_rate": 2.8130201476569413e-05, + "loss": 2.6873, + "step": 4585 + }, + { + "epoch": 2.3, + "learning_rate": 2.8091080256718398e-05, + "loss": 2.6981, + "step": 4590 + }, + { + "epoch": 2.31, + "learning_rate": 2.805195134855061e-05, + "loss": 2.6814, + "step": 4595 + }, + { + "epoch": 2.31, + "learning_rate": 2.8012814849389785e-05, + "loss": 2.6641, + "step": 4600 + }, + { + "epoch": 2.31, + "learning_rate": 2.797367085657852e-05, + "loss": 2.488, + "step": 4605 + }, + { + "epoch": 2.31, + "learning_rate": 2.793451946747806e-05, + "loss": 2.4358, + "step": 4610 + }, + { + "epoch": 2.32, + "learning_rate": 2.7895360779468044e-05, + "loss": 2.7458, + "step": 4615 + }, + { + "epoch": 2.32, + "learning_rate": 2.785619488994627e-05, + "loss": 2.7558, + "step": 4620 + }, + { + "epoch": 2.32, + "learning_rate": 2.7817021896328427e-05, + "loss": 2.6906, + "step": 4625 + }, + { + "epoch": 2.32, + "learning_rate": 2.7777841896047914e-05, + "loss": 2.5384, + "step": 4630 + }, + { + "epoch": 2.33, + "learning_rate": 2.7738654986555523e-05, + "loss": 2.8354, + "step": 4635 + }, + { + "epoch": 2.33, + "learning_rate": 2.7699461265319242e-05, + "loss": 2.6352, + "step": 4640 + }, + { + "epoch": 2.33, + "learning_rate": 2.7660260829824003e-05, + "loss": 2.5602, + "step": 4645 + }, + { + "epoch": 2.33, + "learning_rate": 2.7621053777571425e-05, + "loss": 2.7109, + "step": 4650 + }, + { + "epoch": 2.34, + "learning_rate": 2.7581840206079616e-05, + "loss": 2.7317, + "step": 4655 + }, + { + "epoch": 2.34, + "learning_rate": 2.754262021288287e-05, + "loss": 2.6874, + "step": 4660 + }, + { + "epoch": 2.34, + "learning_rate": 2.750339389553146e-05, + "loss": 2.5854, + "step": 4665 + }, + { + "epoch": 2.34, + "learning_rate": 2.7464161351591393e-05, + "loss": 2.4774, + "step": 4670 + }, + { + "epoch": 2.35, + "learning_rate": 2.7424922678644172e-05, + "loss": 2.3301, + "step": 4675 + }, + { + "epoch": 2.35, + "learning_rate": 2.7385677974286517e-05, + "loss": 2.8886, + 
"step": 4680 + }, + { + "epoch": 2.35, + "learning_rate": 2.7346427336130175e-05, + "loss": 2.4401, + "step": 4685 + }, + { + "epoch": 2.35, + "learning_rate": 2.7307170861801644e-05, + "loss": 2.4788, + "step": 4690 + }, + { + "epoch": 2.36, + "learning_rate": 2.7267908648941938e-05, + "loss": 2.4079, + "step": 4695 + }, + { + "epoch": 2.36, + "learning_rate": 2.7228640795206344e-05, + "loss": 2.3763, + "step": 4700 + }, + { + "epoch": 2.36, + "learning_rate": 2.718936739826417e-05, + "loss": 2.4751, + "step": 4705 + }, + { + "epoch": 2.36, + "learning_rate": 2.7150088555798537e-05, + "loss": 2.5827, + "step": 4710 + }, + { + "epoch": 2.37, + "learning_rate": 2.7110804365506083e-05, + "loss": 2.8181, + "step": 4715 + }, + { + "epoch": 2.37, + "learning_rate": 2.7071514925096762e-05, + "loss": 2.5951, + "step": 4720 + }, + { + "epoch": 2.37, + "learning_rate": 2.703222033229359e-05, + "loss": 2.8812, + "step": 4725 + }, + { + "epoch": 2.37, + "learning_rate": 2.6992920684832374e-05, + "loss": 2.8793, + "step": 4730 + }, + { + "epoch": 2.38, + "learning_rate": 2.6953616080461526e-05, + "loss": 2.7234, + "step": 4735 + }, + { + "epoch": 2.38, + "learning_rate": 2.6914306616941764e-05, + "loss": 2.566, + "step": 4740 + }, + { + "epoch": 2.38, + "learning_rate": 2.68749923920459e-05, + "loss": 2.7924, + "step": 4745 + }, + { + "epoch": 2.38, + "learning_rate": 2.683567350355859e-05, + "loss": 2.5467, + "step": 4750 + }, + { + "epoch": 2.39, + "learning_rate": 2.679635004927608e-05, + "loss": 2.8032, + "step": 4755 + }, + { + "epoch": 2.39, + "learning_rate": 2.6757022127006e-05, + "loss": 2.4976, + "step": 4760 + }, + { + "epoch": 2.39, + "learning_rate": 2.6717689834567055e-05, + "loss": 2.7968, + "step": 4765 + }, + { + "epoch": 2.39, + "learning_rate": 2.6678353269788854e-05, + "loss": 2.4099, + "step": 4770 + }, + { + "epoch": 2.4, + "learning_rate": 2.66390125305116e-05, + "loss": 2.4246, + "step": 4775 + }, + { + "epoch": 2.4, + "learning_rate": 
2.659966771458589e-05, + "loss": 2.7296, + "step": 4780 + }, + { + "epoch": 2.4, + "learning_rate": 2.656031891987249e-05, + "loss": 2.5013, + "step": 4785 + }, + { + "epoch": 2.4, + "learning_rate": 2.6520966244242024e-05, + "loss": 2.4029, + "step": 4790 + }, + { + "epoch": 2.41, + "learning_rate": 2.648160978557479e-05, + "loss": 2.405, + "step": 4795 + }, + { + "epoch": 2.41, + "learning_rate": 2.644224964176048e-05, + "loss": 2.5121, + "step": 4800 + }, + { + "epoch": 2.41, + "learning_rate": 2.6402885910697966e-05, + "loss": 2.7865, + "step": 4805 + }, + { + "epoch": 2.41, + "learning_rate": 2.6363518690295035e-05, + "loss": 2.3725, + "step": 4810 + }, + { + "epoch": 2.42, + "learning_rate": 2.632414807846816e-05, + "loss": 2.612, + "step": 4815 + }, + { + "epoch": 2.42, + "learning_rate": 2.6284774173142233e-05, + "loss": 2.5412, + "step": 4820 + }, + { + "epoch": 2.42, + "learning_rate": 2.624539707225036e-05, + "loss": 2.4828, + "step": 4825 + }, + { + "epoch": 2.42, + "learning_rate": 2.6206016873733574e-05, + "loss": 2.6912, + "step": 4830 + }, + { + "epoch": 2.43, + "learning_rate": 2.6166633675540635e-05, + "loss": 2.9305, + "step": 4835 + }, + { + "epoch": 2.43, + "learning_rate": 2.612724757562775e-05, + "loss": 2.4224, + "step": 4840 + }, + { + "epoch": 2.43, + "learning_rate": 2.608785867195834e-05, + "loss": 2.448, + "step": 4845 + }, + { + "epoch": 2.43, + "learning_rate": 2.60484670625028e-05, + "loss": 2.356, + "step": 4850 + }, + { + "epoch": 2.44, + "learning_rate": 2.600907284523827e-05, + "loss": 2.3987, + "step": 4855 + }, + { + "epoch": 2.44, + "learning_rate": 2.5969676118148358e-05, + "loss": 1.9985, + "step": 4860 + }, + { + "epoch": 2.44, + "learning_rate": 2.5930276979222927e-05, + "loss": 2.7017, + "step": 4865 + }, + { + "epoch": 2.44, + "learning_rate": 2.5890875526457836e-05, + "loss": 2.5223, + "step": 4870 + }, + { + "epoch": 2.45, + "learning_rate": 2.5851471857854697e-05, + "loss": 2.5558, + "step": 4875 + }, + { + "epoch": 
2.45, + "learning_rate": 2.5812066071420632e-05, + "loss": 2.3048, + "step": 4880 + }, + { + "epoch": 2.45, + "learning_rate": 2.5772658265168025e-05, + "loss": 2.6089, + "step": 4885 + }, + { + "epoch": 2.45, + "learning_rate": 2.5733248537114306e-05, + "loss": 2.6711, + "step": 4890 + }, + { + "epoch": 2.46, + "learning_rate": 2.569383698528166e-05, + "loss": 2.6202, + "step": 4895 + }, + { + "epoch": 2.46, + "learning_rate": 2.5654423707696833e-05, + "loss": 2.6619, + "step": 4900 + }, + { + "epoch": 2.46, + "learning_rate": 2.5615008802390834e-05, + "loss": 2.6442, + "step": 4905 + }, + { + "epoch": 2.46, + "learning_rate": 2.5575592367398733e-05, + "loss": 2.7031, + "step": 4910 + }, + { + "epoch": 2.47, + "learning_rate": 2.5536174500759417e-05, + "loss": 2.5043, + "step": 4915 + }, + { + "epoch": 2.47, + "learning_rate": 2.5496755300515323e-05, + "loss": 2.3442, + "step": 4920 + }, + { + "epoch": 2.47, + "learning_rate": 2.5457334864712206e-05, + "loss": 2.7005, + "step": 4925 + }, + { + "epoch": 2.47, + "learning_rate": 2.5417913291398892e-05, + "loss": 2.8544, + "step": 4930 + }, + { + "epoch": 2.48, + "learning_rate": 2.5378490678627043e-05, + "loss": 2.3004, + "step": 4935 + }, + { + "epoch": 2.48, + "learning_rate": 2.5339067124450887e-05, + "loss": 2.527, + "step": 4940 + }, + { + "epoch": 2.48, + "learning_rate": 2.5299642726927035e-05, + "loss": 2.689, + "step": 4945 + }, + { + "epoch": 2.48, + "learning_rate": 2.526021758411415e-05, + "loss": 2.7824, + "step": 4950 + }, + { + "epoch": 2.49, + "learning_rate": 2.5220791794072774e-05, + "loss": 2.5121, + "step": 4955 + }, + { + "epoch": 2.49, + "learning_rate": 2.518136545486504e-05, + "loss": 2.3836, + "step": 4960 + }, + { + "epoch": 2.49, + "learning_rate": 2.5141938664554482e-05, + "loss": 2.6548, + "step": 4965 + }, + { + "epoch": 2.49, + "learning_rate": 2.5102511521205718e-05, + "loss": 2.5556, + "step": 4970 + }, + { + "epoch": 2.5, + "learning_rate": 2.5063084122884267e-05, + "loss": 2.4746, 
+ "step": 4975 + }, + { + "epoch": 2.5, + "learning_rate": 2.5023656567656272e-05, + "loss": 2.5669, + "step": 4980 + }, + { + "epoch": 2.5, + "learning_rate": 2.498422895358827e-05, + "loss": 2.4125, + "step": 4985 + }, + { + "epoch": 2.5, + "learning_rate": 2.4944801378746935e-05, + "loss": 2.6531, + "step": 4990 + }, + { + "epoch": 2.51, + "learning_rate": 2.4905373941198864e-05, + "loss": 2.7697, + "step": 4995 + }, + { + "epoch": 2.51, + "learning_rate": 2.48659467390103e-05, + "loss": 2.5167, + "step": 5000 + }, + { + "epoch": 2.51, + "learning_rate": 2.4826519870246897e-05, + "loss": 2.6029, + "step": 5005 + }, + { + "epoch": 2.51, + "learning_rate": 2.478709343297349e-05, + "loss": 2.3878, + "step": 5010 + }, + { + "epoch": 2.52, + "learning_rate": 2.4747667525253825e-05, + "loss": 2.6694, + "step": 5015 + }, + { + "epoch": 2.52, + "learning_rate": 2.470824224515034e-05, + "loss": 2.606, + "step": 5020 + }, + { + "epoch": 2.52, + "learning_rate": 2.466881769072392e-05, + "loss": 2.6548, + "step": 5025 + }, + { + "epoch": 2.52, + "learning_rate": 2.4629393960033633e-05, + "loss": 2.2612, + "step": 5030 + }, + { + "epoch": 2.53, + "learning_rate": 2.4589971151136503e-05, + "loss": 2.6507, + "step": 5035 + }, + { + "epoch": 2.53, + "learning_rate": 2.4550549362087258e-05, + "loss": 2.4926, + "step": 5040 + }, + { + "epoch": 2.53, + "learning_rate": 2.45111286909381e-05, + "loss": 2.4079, + "step": 5045 + }, + { + "epoch": 2.53, + "learning_rate": 2.447170923573844e-05, + "loss": 2.5999, + "step": 5050 + }, + { + "epoch": 2.54, + "learning_rate": 2.443229109453467e-05, + "loss": 2.4915, + "step": 5055 + }, + { + "epoch": 2.54, + "learning_rate": 2.43928743653699e-05, + "loss": 2.4626, + "step": 5060 + }, + { + "epoch": 2.54, + "learning_rate": 2.4353459146283743e-05, + "loss": 2.405, + "step": 5065 + }, + { + "epoch": 2.54, + "learning_rate": 2.431404553531206e-05, + "loss": 2.857, + "step": 5070 + }, + { + "epoch": 2.55, + "learning_rate": 
2.42746336304867e-05, + "loss": 2.8089, + "step": 5075 + }, + { + "epoch": 2.55, + "learning_rate": 2.423522352983527e-05, + "loss": 2.5966, + "step": 5080 + }, + { + "epoch": 2.55, + "learning_rate": 2.41958153313809e-05, + "loss": 2.4951, + "step": 5085 + }, + { + "epoch": 2.55, + "learning_rate": 2.4156409133141967e-05, + "loss": 2.519, + "step": 5090 + }, + { + "epoch": 2.56, + "learning_rate": 2.4117005033131894e-05, + "loss": 2.4516, + "step": 5095 + }, + { + "epoch": 2.56, + "learning_rate": 2.40776031293589e-05, + "loss": 2.4225, + "step": 5100 + }, + { + "epoch": 2.56, + "learning_rate": 2.4038203519825676e-05, + "loss": 2.47, + "step": 5105 + }, + { + "epoch": 2.56, + "learning_rate": 2.399880630252928e-05, + "loss": 2.4996, + "step": 5110 + }, + { + "epoch": 2.57, + "learning_rate": 2.3959411575460777e-05, + "loss": 2.4737, + "step": 5115 + }, + { + "epoch": 2.57, + "learning_rate": 2.392001943660506e-05, + "loss": 2.636, + "step": 5120 + }, + { + "epoch": 2.57, + "learning_rate": 2.3880629983940572e-05, + "loss": 2.5773, + "step": 5125 + }, + { + "epoch": 2.57, + "learning_rate": 2.3841243315439077e-05, + "loss": 2.7459, + "step": 5130 + }, + { + "epoch": 2.58, + "learning_rate": 2.3801859529065416e-05, + "loss": 2.1073, + "step": 5135 + }, + { + "epoch": 2.58, + "learning_rate": 2.3762478722777264e-05, + "loss": 2.3601, + "step": 5140 + }, + { + "epoch": 2.58, + "learning_rate": 2.3723100994524873e-05, + "loss": 2.5435, + "step": 5145 + }, + { + "epoch": 2.58, + "learning_rate": 2.3683726442250853e-05, + "loss": 2.3054, + "step": 5150 + }, + { + "epoch": 2.59, + "learning_rate": 2.3644355163889908e-05, + "loss": 2.6588, + "step": 5155 + }, + { + "epoch": 2.59, + "learning_rate": 2.3604987257368596e-05, + "loss": 2.6469, + "step": 5160 + }, + { + "epoch": 2.59, + "learning_rate": 2.3565622820605096e-05, + "loss": 2.4276, + "step": 5165 + }, + { + "epoch": 2.59, + "learning_rate": 2.3526261951508947e-05, + "loss": 2.8147, + "step": 5170 + }, + { + 
"epoch": 2.6, + "learning_rate": 2.3486904747980817e-05, + "loss": 2.5942, + "step": 5175 + }, + { + "epoch": 2.6, + "learning_rate": 2.344755130791227e-05, + "loss": 2.7043, + "step": 5180 + }, + { + "epoch": 2.6, + "learning_rate": 2.340820172918549e-05, + "loss": 2.7551, + "step": 5185 + }, + { + "epoch": 2.6, + "learning_rate": 2.336885610967308e-05, + "loss": 2.4544, + "step": 5190 + }, + { + "epoch": 2.61, + "learning_rate": 2.3329514547237757e-05, + "loss": 2.5914, + "step": 5195 + }, + { + "epoch": 2.61, + "learning_rate": 2.3290177139732186e-05, + "loss": 2.5679, + "step": 5200 + }, + { + "epoch": 2.61, + "learning_rate": 2.325084398499868e-05, + "loss": 2.5207, + "step": 5205 + }, + { + "epoch": 2.61, + "learning_rate": 2.3211515180868972e-05, + "loss": 2.4572, + "step": 5210 + }, + { + "epoch": 2.62, + "learning_rate": 2.3172190825163987e-05, + "loss": 2.6281, + "step": 5215 + }, + { + "epoch": 2.62, + "learning_rate": 2.3132871015693566e-05, + "loss": 2.5488, + "step": 5220 + }, + { + "epoch": 2.62, + "learning_rate": 2.309355585025627e-05, + "loss": 2.74, + "step": 5225 + }, + { + "epoch": 2.62, + "learning_rate": 2.3054245426639078e-05, + "loss": 2.5737, + "step": 5230 + }, + { + "epoch": 2.63, + "learning_rate": 2.3014939842617197e-05, + "loss": 2.5987, + "step": 5235 + }, + { + "epoch": 2.63, + "learning_rate": 2.297563919595379e-05, + "loss": 2.5215, + "step": 5240 + }, + { + "epoch": 2.63, + "learning_rate": 2.293634358439973e-05, + "loss": 2.4514, + "step": 5245 + }, + { + "epoch": 2.63, + "learning_rate": 2.289705310569338e-05, + "loss": 2.8411, + "step": 5250 + }, + { + "epoch": 2.64, + "learning_rate": 2.2857767857560337e-05, + "loss": 2.9463, + "step": 5255 + }, + { + "epoch": 2.64, + "learning_rate": 2.281848793771318e-05, + "loss": 2.6361, + "step": 5260 + }, + { + "epoch": 2.64, + "learning_rate": 2.2779213443851233e-05, + "loss": 2.4919, + "step": 5265 + }, + { + "epoch": 2.64, + "learning_rate": 2.2739944473660332e-05, + "loss": 2.3893, 
+ "step": 5270 + }, + { + "epoch": 2.65, + "learning_rate": 2.2700681124812574e-05, + "loss": 2.6469, + "step": 5275 + }, + { + "epoch": 2.65, + "learning_rate": 2.2661423494966074e-05, + "loss": 2.7062, + "step": 5280 + }, + { + "epoch": 2.65, + "learning_rate": 2.2622171681764706e-05, + "loss": 2.6681, + "step": 5285 + }, + { + "epoch": 2.65, + "learning_rate": 2.2582925782837898e-05, + "loss": 2.5944, + "step": 5290 + }, + { + "epoch": 2.66, + "learning_rate": 2.254368589580036e-05, + "loss": 2.6699, + "step": 5295 + }, + { + "epoch": 2.66, + "learning_rate": 2.250445211825185e-05, + "loss": 2.7243, + "step": 5300 + }, + { + "epoch": 2.66, + "learning_rate": 2.2465224547776934e-05, + "loss": 2.5652, + "step": 5305 + }, + { + "epoch": 2.66, + "learning_rate": 2.2426003281944725e-05, + "loss": 2.4402, + "step": 5310 + }, + { + "epoch": 2.67, + "learning_rate": 2.238678841830867e-05, + "loss": 2.5059, + "step": 5315 + }, + { + "epoch": 2.67, + "learning_rate": 2.234758005440628e-05, + "loss": 2.5083, + "step": 5320 + }, + { + "epoch": 2.67, + "learning_rate": 2.2308378287758906e-05, + "loss": 2.3585, + "step": 5325 + }, + { + "epoch": 2.67, + "learning_rate": 2.22691832158715e-05, + "loss": 2.8297, + "step": 5330 + }, + { + "epoch": 2.68, + "learning_rate": 2.2229994936232346e-05, + "loss": 2.4266, + "step": 5335 + }, + { + "epoch": 2.68, + "learning_rate": 2.219081354631284e-05, + "loss": 2.3729, + "step": 5340 + }, + { + "epoch": 2.68, + "learning_rate": 2.2151639143567236e-05, + "loss": 2.4558, + "step": 5345 + }, + { + "epoch": 2.68, + "learning_rate": 2.211247182543242e-05, + "loss": 2.5797, + "step": 5350 + }, + { + "epoch": 2.69, + "learning_rate": 2.2073311689327648e-05, + "loss": 2.4579, + "step": 5355 + }, + { + "epoch": 2.69, + "learning_rate": 2.203415883265432e-05, + "loss": 2.5511, + "step": 5360 + }, + { + "epoch": 2.69, + "learning_rate": 2.1995013352795725e-05, + "loss": 2.4825, + "step": 5365 + }, + { + "epoch": 2.69, + "learning_rate": 
2.1955875347116808e-05, + "loss": 2.4933, + "step": 5370 + }, + { + "epoch": 2.7, + "learning_rate": 2.191674491296391e-05, + "loss": 2.434, + "step": 5375 + }, + { + "epoch": 2.7, + "learning_rate": 2.187762214766456e-05, + "loss": 2.5205, + "step": 5380 + }, + { + "epoch": 2.7, + "learning_rate": 2.1838507148527197e-05, + "loss": 2.5461, + "step": 5385 + }, + { + "epoch": 2.7, + "learning_rate": 2.179940001284095e-05, + "loss": 2.4265, + "step": 5390 + }, + { + "epoch": 2.71, + "learning_rate": 2.176030083787539e-05, + "loss": 2.6166, + "step": 5395 + }, + { + "epoch": 2.71, + "learning_rate": 2.1721209720880277e-05, + "loss": 2.5101, + "step": 5400 + }, + { + "epoch": 2.71, + "learning_rate": 2.168212675908536e-05, + "loss": 2.4824, + "step": 5405 + }, + { + "epoch": 2.71, + "learning_rate": 2.1643052049700066e-05, + "loss": 2.347, + "step": 5410 + }, + { + "epoch": 2.72, + "learning_rate": 2.1603985689913317e-05, + "loss": 2.5476, + "step": 5415 + }, + { + "epoch": 2.72, + "learning_rate": 2.1564927776893264e-05, + "loss": 2.456, + "step": 5420 + }, + { + "epoch": 2.72, + "learning_rate": 2.152587840778704e-05, + "loss": 2.4415, + "step": 5425 + }, + { + "epoch": 2.72, + "learning_rate": 2.1486837679720535e-05, + "loss": 2.7548, + "step": 5430 + }, + { + "epoch": 2.73, + "learning_rate": 2.144780568979816e-05, + "loss": 2.3779, + "step": 5435 + }, + { + "epoch": 2.73, + "learning_rate": 2.1408782535102566e-05, + "loss": 2.5284, + "step": 5440 + }, + { + "epoch": 2.73, + "learning_rate": 2.136976831269444e-05, + "loss": 2.6886, + "step": 5445 + }, + { + "epoch": 2.73, + "learning_rate": 2.133076311961226e-05, + "loss": 2.5138, + "step": 5450 + }, + { + "epoch": 2.74, + "learning_rate": 2.1291767052872035e-05, + "loss": 2.664, + "step": 5455 + }, + { + "epoch": 2.74, + "learning_rate": 2.1252780209467073e-05, + "loss": 2.4325, + "step": 5460 + }, + { + "epoch": 2.74, + "learning_rate": 2.121380268636775e-05, + "loss": 2.5471, + "step": 5465 + }, + { + "epoch": 
2.74, + "learning_rate": 2.1174834580521253e-05, + "loss": 2.4363, + "step": 5470 + }, + { + "epoch": 2.75, + "learning_rate": 2.113587598885135e-05, + "loss": 2.6272, + "step": 5475 + }, + { + "epoch": 2.75, + "learning_rate": 2.109692700825814e-05, + "loss": 2.5261, + "step": 5480 + }, + { + "epoch": 2.75, + "learning_rate": 2.105798773561783e-05, + "loss": 2.9262, + "step": 5485 + }, + { + "epoch": 2.75, + "learning_rate": 2.101905826778246e-05, + "loss": 2.3964, + "step": 5490 + }, + { + "epoch": 2.76, + "learning_rate": 2.0980138701579704e-05, + "loss": 2.6239, + "step": 5495 + }, + { + "epoch": 2.76, + "learning_rate": 2.0941229133812593e-05, + "loss": 2.53, + "step": 5500 + }, + { + "epoch": 2.76, + "learning_rate": 2.0902329661259293e-05, + "loss": 2.7035, + "step": 5505 + }, + { + "epoch": 2.77, + "learning_rate": 2.0863440380672856e-05, + "loss": 2.5909, + "step": 5510 + }, + { + "epoch": 2.77, + "learning_rate": 2.0824561388781005e-05, + "loss": 2.1592, + "step": 5515 + }, + { + "epoch": 2.77, + "learning_rate": 2.078569278228585e-05, + "loss": 2.5515, + "step": 5520 + }, + { + "epoch": 2.77, + "learning_rate": 2.0746834657863672e-05, + "loss": 2.6217, + "step": 5525 + }, + { + "epoch": 2.78, + "learning_rate": 2.0707987112164692e-05, + "loss": 2.6302, + "step": 5530 + }, + { + "epoch": 2.78, + "learning_rate": 2.0669150241812807e-05, + "loss": 2.3984, + "step": 5535 + }, + { + "epoch": 2.78, + "learning_rate": 2.0630324143405372e-05, + "loss": 2.6425, + "step": 5540 + }, + { + "epoch": 2.78, + "learning_rate": 2.0591508913512954e-05, + "loss": 2.6817, + "step": 5545 + }, + { + "epoch": 2.79, + "learning_rate": 2.055270464867906e-05, + "loss": 2.3904, + "step": 5550 + }, + { + "epoch": 2.79, + "learning_rate": 2.0513911445419936e-05, + "loss": 2.6625, + "step": 5555 + }, + { + "epoch": 2.79, + "learning_rate": 2.0475129400224337e-05, + "loss": 2.6876, + "step": 5560 + }, + { + "epoch": 2.79, + "learning_rate": 2.043635860955325e-05, + "loss": 2.4981, + 
"step": 5565 + }, + { + "epoch": 2.8, + "learning_rate": 2.039759916983966e-05, + "loss": 2.3809, + "step": 5570 + }, + { + "epoch": 2.8, + "learning_rate": 2.0358851177488326e-05, + "loss": 2.4396, + "step": 5575 + }, + { + "epoch": 2.8, + "learning_rate": 2.0320114728875538e-05, + "loss": 2.526, + "step": 5580 + }, + { + "epoch": 2.8, + "learning_rate": 2.028138992034887e-05, + "loss": 2.6806, + "step": 5585 + }, + { + "epoch": 2.81, + "learning_rate": 2.0242676848226948e-05, + "loss": 2.5842, + "step": 5590 + }, + { + "epoch": 2.81, + "learning_rate": 2.02039756087992e-05, + "loss": 2.4016, + "step": 5595 + }, + { + "epoch": 2.81, + "learning_rate": 2.0165286298325638e-05, + "loss": 2.5254, + "step": 5600 + }, + { + "epoch": 2.81, + "learning_rate": 2.0126609013036575e-05, + "loss": 2.4995, + "step": 5605 + }, + { + "epoch": 2.82, + "learning_rate": 2.0087943849132446e-05, + "loss": 2.473, + "step": 5610 + }, + { + "epoch": 2.82, + "learning_rate": 2.004929090278351e-05, + "loss": 2.668, + "step": 5615 + }, + { + "epoch": 2.82, + "learning_rate": 2.001065027012966e-05, + "loss": 2.693, + "step": 5620 + }, + { + "epoch": 2.82, + "learning_rate": 1.9972022047280154e-05, + "loss": 2.5113, + "step": 5625 + }, + { + "epoch": 2.83, + "learning_rate": 1.9933406330313374e-05, + "loss": 2.7322, + "step": 5630 + }, + { + "epoch": 2.83, + "learning_rate": 1.989480321527661e-05, + "loss": 2.6501, + "step": 5635 + }, + { + "epoch": 2.83, + "learning_rate": 1.9856212798185798e-05, + "loss": 2.546, + "step": 5640 + }, + { + "epoch": 2.83, + "learning_rate": 1.9817635175025295e-05, + "loss": 2.8634, + "step": 5645 + }, + { + "epoch": 2.84, + "learning_rate": 1.9779070441747638e-05, + "loss": 2.2475, + "step": 5650 + }, + { + "epoch": 2.84, + "learning_rate": 1.97405186942733e-05, + "loss": 2.5021, + "step": 5655 + }, + { + "epoch": 2.84, + "learning_rate": 1.9701980028490452e-05, + "loss": 2.4607, + "step": 5660 + }, + { + "epoch": 2.84, + "learning_rate": 
1.9663454540254744e-05, + "loss": 2.4587, + "step": 5665 + }, + { + "epoch": 2.85, + "learning_rate": 1.9624942325389032e-05, + "loss": 2.6975, + "step": 5670 + }, + { + "epoch": 2.85, + "learning_rate": 1.9586443479683164e-05, + "loss": 2.728, + "step": 5675 + }, + { + "epoch": 2.85, + "learning_rate": 1.9547958098893734e-05, + "loss": 2.6458, + "step": 5680 + }, + { + "epoch": 2.85, + "learning_rate": 1.9509486278743847e-05, + "loss": 2.7608, + "step": 5685 + }, + { + "epoch": 2.86, + "learning_rate": 1.9471028114922873e-05, + "loss": 2.7753, + "step": 5690 + }, + { + "epoch": 2.86, + "learning_rate": 1.9432583703086235e-05, + "loss": 2.3438, + "step": 5695 + }, + { + "epoch": 2.86, + "learning_rate": 1.9394153138855127e-05, + "loss": 2.5513, + "step": 5700 + }, + { + "epoch": 2.86, + "learning_rate": 1.9355736517816313e-05, + "loss": 2.6064, + "step": 5705 + }, + { + "epoch": 2.87, + "learning_rate": 1.9317333935521872e-05, + "loss": 2.6884, + "step": 5710 + }, + { + "epoch": 2.87, + "learning_rate": 1.927894548748897e-05, + "loss": 2.479, + "step": 5715 + }, + { + "epoch": 2.87, + "learning_rate": 1.9240571269199607e-05, + "loss": 2.741, + "step": 5720 + }, + { + "epoch": 2.87, + "learning_rate": 1.9202211376100427e-05, + "loss": 2.6736, + "step": 5725 + }, + { + "epoch": 2.88, + "learning_rate": 1.9163865903602374e-05, + "loss": 2.5357, + "step": 5730 + }, + { + "epoch": 2.88, + "learning_rate": 1.9125534947080574e-05, + "loss": 2.4667, + "step": 5735 + }, + { + "epoch": 2.88, + "learning_rate": 1.908721860187406e-05, + "loss": 2.3793, + "step": 5740 + }, + { + "epoch": 2.88, + "learning_rate": 1.904891696328548e-05, + "loss": 2.6379, + "step": 5745 + }, + { + "epoch": 2.89, + "learning_rate": 1.901063012658093e-05, + "loss": 2.5932, + "step": 5750 + }, + { + "epoch": 2.89, + "learning_rate": 1.897235818698969e-05, + "loss": 2.3479, + "step": 5755 + }, + { + "epoch": 2.89, + "learning_rate": 1.8934101239703973e-05, + "loss": 2.7664, + "step": 5760 + }, + { + 
"epoch": 2.89, + "learning_rate": 1.889585937987871e-05, + "loss": 2.6163, + "step": 5765 + }, + { + "epoch": 2.9, + "learning_rate": 1.885763270263131e-05, + "loss": 2.644, + "step": 5770 + }, + { + "epoch": 2.9, + "learning_rate": 1.881942130304142e-05, + "loss": 2.6584, + "step": 5775 + }, + { + "epoch": 2.9, + "learning_rate": 1.8781225276150675e-05, + "loss": 2.7552, + "step": 5780 + }, + { + "epoch": 2.9, + "learning_rate": 1.874304471696248e-05, + "loss": 2.3176, + "step": 5785 + }, + { + "epoch": 2.91, + "learning_rate": 1.8704879720441773e-05, + "loss": 2.9294, + "step": 5790 + }, + { + "epoch": 2.91, + "learning_rate": 1.8666730381514774e-05, + "loss": 2.5388, + "step": 5795 + }, + { + "epoch": 2.91, + "learning_rate": 1.8628596795068776e-05, + "loss": 2.5343, + "step": 5800 + }, + { + "epoch": 2.91, + "learning_rate": 1.859047905595187e-05, + "loss": 2.5239, + "step": 5805 + }, + { + "epoch": 2.92, + "learning_rate": 1.8552377258972747e-05, + "loss": 2.382, + "step": 5810 + }, + { + "epoch": 2.92, + "learning_rate": 1.851429149890044e-05, + "loss": 2.7965, + "step": 5815 + }, + { + "epoch": 2.92, + "learning_rate": 1.8476221870464083e-05, + "loss": 2.738, + "step": 5820 + }, + { + "epoch": 2.92, + "learning_rate": 1.84381684683527e-05, + "loss": 2.7431, + "step": 5825 + }, + { + "epoch": 2.93, + "learning_rate": 1.8400131387214964e-05, + "loss": 2.6551, + "step": 5830 + }, + { + "epoch": 2.93, + "learning_rate": 1.8362110721658927e-05, + "loss": 2.5836, + "step": 5835 + }, + { + "epoch": 2.93, + "learning_rate": 1.832410656625183e-05, + "loss": 2.3278, + "step": 5840 + }, + { + "epoch": 2.93, + "learning_rate": 1.8286119015519852e-05, + "loss": 2.5348, + "step": 5845 + }, + { + "epoch": 2.94, + "learning_rate": 1.8248148163947866e-05, + "loss": 2.388, + "step": 5850 + }, + { + "epoch": 2.94, + "learning_rate": 1.8210194105979205e-05, + "loss": 2.5839, + "step": 5855 + }, + { + "epoch": 2.94, + "learning_rate": 1.817225693601543e-05, + "loss": 2.7362, + 
"step": 5860 + }, + { + "epoch": 2.94, + "learning_rate": 1.8134336748416115e-05, + "loss": 2.2506, + "step": 5865 + }, + { + "epoch": 2.95, + "learning_rate": 1.8096433637498574e-05, + "loss": 2.6163, + "step": 5870 + }, + { + "epoch": 2.95, + "learning_rate": 1.8058547697537655e-05, + "loss": 2.6588, + "step": 5875 + }, + { + "epoch": 2.95, + "learning_rate": 1.802067902276551e-05, + "loss": 2.6955, + "step": 5880 + }, + { + "epoch": 2.95, + "learning_rate": 1.7982827707371326e-05, + "loss": 2.5438, + "step": 5885 + }, + { + "epoch": 2.96, + "learning_rate": 1.7944993845501118e-05, + "loss": 2.6858, + "step": 5890 + }, + { + "epoch": 2.96, + "learning_rate": 1.7907177531257507e-05, + "loss": 2.5499, + "step": 5895 + }, + { + "epoch": 2.96, + "learning_rate": 1.7869378858699452e-05, + "loss": 2.572, + "step": 5900 + }, + { + "epoch": 2.96, + "learning_rate": 1.783159792184203e-05, + "loss": 2.6499, + "step": 5905 + }, + { + "epoch": 2.97, + "learning_rate": 1.779383481465622e-05, + "loss": 2.6526, + "step": 5910 + }, + { + "epoch": 2.97, + "learning_rate": 1.775608963106863e-05, + "loss": 2.5916, + "step": 5915 + }, + { + "epoch": 2.97, + "learning_rate": 1.7718362464961314e-05, + "loss": 2.7463, + "step": 5920 + }, + { + "epoch": 2.97, + "learning_rate": 1.76806534101715e-05, + "loss": 2.6052, + "step": 5925 + }, + { + "epoch": 2.98, + "learning_rate": 1.764296256049137e-05, + "loss": 2.4542, + "step": 5930 + }, + { + "epoch": 2.98, + "learning_rate": 1.760529000966782e-05, + "loss": 2.5414, + "step": 5935 + }, + { + "epoch": 2.98, + "learning_rate": 1.7567635851402238e-05, + "loss": 2.6181, + "step": 5940 + }, + { + "epoch": 2.98, + "learning_rate": 1.753000017935026e-05, + "loss": 2.4822, + "step": 5945 + }, + { + "epoch": 2.99, + "learning_rate": 1.7492383087121546e-05, + "loss": 2.6495, + "step": 5950 + }, + { + "epoch": 2.99, + "learning_rate": 1.7454784668279546e-05, + "loss": 2.455, + "step": 5955 + }, + { + "epoch": 2.99, + "learning_rate": 
1.7417205016341258e-05, + "loss": 2.3959, + "step": 5960 + }, + { + "epoch": 2.99, + "learning_rate": 1.7379644224777004e-05, + "loss": 2.5139, + "step": 5965 + }, + { + "epoch": 3.0, + "learning_rate": 1.7342102387010194e-05, + "loss": 2.6985, + "step": 5970 + }, + { + "epoch": 3.0, + "learning_rate": 1.7304579596417104e-05, + "loss": 2.6598, + "step": 5975 + }, + { + "epoch": 3.0, + "learning_rate": 1.726707594632661e-05, + "loss": 2.6638, + "step": 5980 + }, + { + "epoch": 3.0, + "learning_rate": 1.7229591530020022e-05, + "loss": 2.446, + "step": 5985 + }, + { + "epoch": 3.01, + "learning_rate": 1.7192126440730784e-05, + "loss": 2.5736, + "step": 5990 + }, + { + "epoch": 3.01, + "learning_rate": 1.7154680771644242e-05, + "loss": 2.4385, + "step": 5995 + }, + { + "epoch": 3.01, + "learning_rate": 1.7117254615897497e-05, + "loss": 2.5651, + "step": 6000 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 3.175526755909632e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6000/training_args.bin b/checkpoint-6000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-6000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475 diff --git a/checkpoint-6500/README.md b/checkpoint-6500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-6500/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] 
+- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-6500/adapter_config.json b/checkpoint-6500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-6500/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-6500/adapter_model.safetensors b/checkpoint-6500/adapter_model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..7721faeefc1f790008f6aec6a9581e66ccf1c88e --- /dev/null +++ b/checkpoint-6500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5dd4c1b72e62c91f3889103bf4430b8887b3e1b815690104ca081717e67bb3b +size 7807744 diff --git a/checkpoint-6500/optimizer.pt b/checkpoint-6500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2d4c7cdeba43b20b8d7c537cb6322930f2ec89a --- /dev/null +++ b/checkpoint-6500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13d1cd353db23d73df0f852f9e7aeea2f0ec747d5b3bb10f25902088f943782c +size 15644485 diff --git a/checkpoint-6500/rng_state.pth b/checkpoint-6500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5165af2723d65416f2e7fa8c7e43d78b6c8e3105 --- /dev/null +++ b/checkpoint-6500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:944c6924bcfb7e600a4b395ffb27b0af369036b1abe3d48aeadfb986e39ec480 +size 14575 diff --git a/checkpoint-6500/scheduler.pt b/checkpoint-6500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4c40c054e5928bb8408676849530288041d64f9 --- /dev/null +++ b/checkpoint-6500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:051300151788bbed2b24c2ddc86ff7a7b8ed56af8bfab30e428cb88724058d9e +size 627 diff --git a/checkpoint-6500/special_tokens_map.json b/checkpoint-6500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-6500/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false 
+ } + ] +} diff --git a/checkpoint-6500/tokenization_chatglm.py b/checkpoint-6500/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-6500/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) 
+ return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + 
def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. 
+ + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). 
+ return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-6500/tokenizer.model b/checkpoint-6500/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-6500/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-6500/tokenizer_config.json b/checkpoint-6500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ 
b/checkpoint-6500/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-6500/trainer_state.json b/checkpoint-6500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..05d5e0521967aa02ba036c8fb7c9435949f270a5 --- /dev/null +++ b/checkpoint-6500/trainer_state.json @@ -0,0 +1,7821 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.261824112407477, + "eval_steps": 500, + "global_step": 6500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + 
"step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 
130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + 
"step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + 
"step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + 
"step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 
2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + "step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 
2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + "step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + 
"loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + "step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + 
"loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, + "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 
4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 4.8759297640025235e-05, + "loss": 2.9469, + "step": 1005 + }, + { + "epoch": 0.51, + "learning_rate": 4.874700152955661e-05, + "loss": 2.9745, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 4.8734646354147936e-05, + "loss": 2.9341, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 4.8722232144529754e-05, + "loss": 3.0511, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 4.870975893157941e-05, + "loss": 2.5396, + "step": 1025 + }, + { + "epoch": 0.52, + 
"learning_rate": 4.8697226746321004e-05, + "loss": 2.6699, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 4.868463561992532e-05, + "loss": 2.4887, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 4.867198558370977e-05, + "loss": 2.4773, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 4.865927666913825e-05, + "loss": 2.7612, + "step": 1045 + }, + { + "epoch": 0.53, + "learning_rate": 4.864650890782113e-05, + "loss": 2.9116, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 4.863368233151514e-05, + "loss": 2.8205, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 4.862079697212329e-05, + "loss": 2.6711, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 4.8607852861694804e-05, + "loss": 3.1138, + "step": 1065 + }, + { + "epoch": 0.54, + "learning_rate": 4.859485003242503e-05, + "loss": 2.5603, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 4.858178851665539e-05, + "loss": 2.7981, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 4.856866834687323e-05, + "loss": 2.507, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 4.855548955571183e-05, + "loss": 3.0315, + "step": 1085 + }, + { + "epoch": 0.55, + "learning_rate": 4.8542252175950244e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 4.852895624051326e-05, + "loss": 2.6794, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 4.851560178247132e-05, + "loss": 2.8278, + "step": 1100 + }, + { + "epoch": 0.55, + "learning_rate": 4.850218883504041e-05, + "loss": 2.6993, + "step": 1105 + }, + { + "epoch": 0.56, + "learning_rate": 4.8488717431582005e-05, + "loss": 2.875, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 4.8475187605602974e-05, + "loss": 2.6057, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 4.84615993907555e-05, + "loss": 2.847, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 4.844795282083697e-05, + "loss": 2.7652, + "step": 1125 + }, + { 
+ "epoch": 0.57, + "learning_rate": 4.843424792978997e-05, + "loss": 2.8128, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 4.842048475170209e-05, + "loss": 2.7077, + "step": 1135 + }, + { + "epoch": 0.57, + "learning_rate": 4.840666332080592e-05, + "loss": 2.7081, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 4.8392783671478934e-05, + "loss": 2.6479, + "step": 1145 + }, + { + "epoch": 0.58, + "learning_rate": 4.837884583824342e-05, + "loss": 2.667, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 4.836484985576638e-05, + "loss": 2.7514, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 4.835079575885944e-05, + "loss": 2.5058, + "step": 1160 + }, + { + "epoch": 0.58, + "learning_rate": 4.833668358247876e-05, + "loss": 2.6183, + "step": 1165 + }, + { + "epoch": 0.59, + "learning_rate": 4.8322513361725006e-05, + "loss": 2.9959, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 4.830828513184317e-05, + "loss": 2.5714, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 4.8293998928222536e-05, + "loss": 2.965, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 4.827965478639661e-05, + "loss": 2.2106, + "step": 1185 + }, + { + "epoch": 0.6, + "learning_rate": 4.8265252742042965e-05, + "loss": 2.7456, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 4.8250792830983225e-05, + "loss": 2.5694, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 4.8236275089182936e-05, + "loss": 2.6826, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 4.8221699552751465e-05, + "loss": 2.878, + "step": 1205 + }, + { + "epoch": 0.61, + "learning_rate": 4.820706625794196e-05, + "loss": 2.8191, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 4.81923752411512e-05, + "loss": 2.739, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 4.8177626538919565e-05, + "loss": 3.0544, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 4.8162820187930875e-05, + "loss": 2.8393, + 
"step": 1225 + }, + { + "epoch": 0.62, + "learning_rate": 4.814795622501237e-05, + "loss": 2.4457, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 4.813303468713456e-05, + "loss": 2.8575, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 4.8118055611411197e-05, + "loss": 2.8307, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 4.810301903509909e-05, + "loss": 2.7951, + "step": 1245 + }, + { + "epoch": 0.63, + "learning_rate": 4.8087924995598125e-05, + "loss": 2.8456, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 4.807277353045106e-05, + "loss": 2.3564, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 4.8057564677343524e-05, + "loss": 2.5076, + "step": 1260 + }, + { + "epoch": 0.63, + "learning_rate": 4.8042298474103884e-05, + "loss": 2.605, + "step": 1265 + }, + { + "epoch": 0.64, + "learning_rate": 4.8026974958703116e-05, + "loss": 2.4782, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 4.8011594169254784e-05, + "loss": 2.7193, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 4.799615614401488e-05, + "loss": 2.8284, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 4.798066092138178e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.65, + "learning_rate": 4.796510853989612e-05, + "loss": 2.7396, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 4.794949903824069e-05, + "loss": 2.7948, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 4.793383245524035e-05, + "loss": 2.7818, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 4.791810882986197e-05, + "loss": 2.7334, + "step": 1305 + }, + { + "epoch": 0.66, + "learning_rate": 4.7902328201214256e-05, + "loss": 2.4824, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 4.7886490608547727e-05, + "loss": 2.6131, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 4.7870596091254584e-05, + "loss": 2.7778, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 
4.7854644688868594e-05, + "loss": 2.5263, + "step": 1325 + }, + { + "epoch": 0.67, + "learning_rate": 4.783863644106502e-05, + "loss": 2.7825, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 4.782257138766053e-05, + "loss": 2.7902, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 4.7806449568613066e-05, + "loss": 2.8333, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 4.779027102402177e-05, + "loss": 2.856, + "step": 1345 + }, + { + "epoch": 0.68, + "learning_rate": 4.777403579412686e-05, + "loss": 2.8021, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 4.775774391930956e-05, + "loss": 2.6639, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741395440091976e-05, + "loss": 2.5226, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.772499039713702e-05, + "loss": 2.6803, + "step": 1365 + }, + { + "epoch": 0.69, + "learning_rate": 4.7708528831248274e-05, + "loss": 2.4608, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.769201078336991e-05, + "loss": 2.786, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.7675436294586586e-05, + "loss": 2.8294, + "step": 1380 + }, + { + "epoch": 0.7, + "learning_rate": 4.7658805406123356e-05, + "loss": 2.6776, + "step": 1385 + }, + { + "epoch": 0.7, + "learning_rate": 4.7642118159345544e-05, + "loss": 2.8003, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.762537459575865e-05, + "loss": 2.7796, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.7608574757008245e-05, + "loss": 2.9156, + "step": 1400 + }, + { + "epoch": 0.71, + "learning_rate": 4.7591718684879883e-05, + "loss": 3.0521, + "step": 1405 + }, + { + "epoch": 0.71, + "learning_rate": 4.7574806421298976e-05, + "loss": 2.6469, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.755783800833071e-05, + "loss": 2.8512, + "step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.754081348817991e-05, + "loss": 2.66, + "step": 1420 + }, + { + "epoch": 
0.72, + "learning_rate": 4.752373290319096e-05, + "loss": 2.6625, + "step": 1425 + }, + { + "epoch": 0.72, + "learning_rate": 4.7506596295847716e-05, + "loss": 2.6711, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 4.7489403708773346e-05, + "loss": 2.8951, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 4.747215518473026e-05, + "loss": 2.7375, + "step": 1440 + }, + { + "epoch": 0.73, + "learning_rate": 4.745485076662e-05, + "loss": 2.8037, + "step": 1445 + }, + { + "epoch": 0.73, + "learning_rate": 4.743749049748315e-05, + "loss": 2.5375, + "step": 1450 + }, + { + "epoch": 0.73, + "learning_rate": 4.742007442049918e-05, + "loss": 2.7664, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 4.7402602578986374e-05, + "loss": 2.9644, + "step": 1460 + }, + { + "epoch": 0.74, + "learning_rate": 4.738507501640175e-05, + "loss": 2.8212, + "step": 1465 + }, + { + "epoch": 0.74, + "learning_rate": 4.736749177634087e-05, + "loss": 2.7201, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 4.734985290253782e-05, + "loss": 2.7087, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 4.7332158438865035e-05, + "loss": 2.5502, + "step": 1480 + }, + { + "epoch": 0.75, + "learning_rate": 4.731440842933322e-05, + "loss": 2.7607, + "step": 1485 + }, + { + "epoch": 0.75, + "learning_rate": 4.729660291809126e-05, + "loss": 2.5601, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 4.727874194942606e-05, + "loss": 2.5562, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 4.7260825567762486e-05, + "loss": 2.5539, + "step": 1500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7242853817663204e-05, + "loss": 2.6405, + "step": 1505 + }, + { + "epoch": 0.76, + "learning_rate": 4.72248267438286e-05, + "loss": 2.9237, + "step": 1510 + }, + { + "epoch": 0.76, + "learning_rate": 4.72067443910967e-05, + "loss": 3.0453, + "step": 1515 + }, + { + "epoch": 0.76, + "learning_rate": 4.718860680444297e-05, + "loss": 2.7176, + "step": 
1520 + }, + { + "epoch": 0.77, + "learning_rate": 4.71704140289803e-05, + "loss": 2.6713, + "step": 1525 + }, + { + "epoch": 0.77, + "learning_rate": 4.715216610995883e-05, + "loss": 2.7284, + "step": 1530 + }, + { + "epoch": 0.77, + "learning_rate": 4.713386309276585e-05, + "loss": 2.5022, + "step": 1535 + }, + { + "epoch": 0.77, + "learning_rate": 4.7115505022925706e-05, + "loss": 2.8323, + "step": 1540 + }, + { + "epoch": 0.78, + "learning_rate": 4.7097091946099666e-05, + "loss": 2.7005, + "step": 1545 + }, + { + "epoch": 0.78, + "learning_rate": 4.7078623908085825e-05, + "loss": 2.8142, + "step": 1550 + }, + { + "epoch": 0.78, + "learning_rate": 4.7060100954818974e-05, + "loss": 2.8505, + "step": 1555 + }, + { + "epoch": 0.78, + "learning_rate": 4.70415231323705e-05, + "loss": 2.5892, + "step": 1560 + }, + { + "epoch": 0.79, + "learning_rate": 4.7022890486948236e-05, + "loss": 2.8951, + "step": 1565 + }, + { + "epoch": 0.79, + "learning_rate": 4.700420306489641e-05, + "loss": 2.7551, + "step": 1570 + }, + { + "epoch": 0.79, + "learning_rate": 4.698546091269547e-05, + "loss": 2.6814, + "step": 1575 + }, + { + "epoch": 0.79, + "learning_rate": 4.696666407696201e-05, + "loss": 2.8698, + "step": 1580 + }, + { + "epoch": 0.8, + "learning_rate": 4.694781260444862e-05, + "loss": 2.8389, + "step": 1585 + }, + { + "epoch": 0.8, + "learning_rate": 4.6928906542043786e-05, + "loss": 2.8353, + "step": 1590 + }, + { + "epoch": 0.8, + "learning_rate": 4.690994593677179e-05, + "loss": 2.6565, + "step": 1595 + }, + { + "epoch": 0.8, + "learning_rate": 4.689093083579256e-05, + "loss": 2.6958, + "step": 1600 + }, + { + "epoch": 0.81, + "learning_rate": 4.687186128640157e-05, + "loss": 2.6768, + "step": 1605 + }, + { + "epoch": 0.81, + "learning_rate": 4.685273733602975e-05, + "loss": 2.734, + "step": 1610 + }, + { + "epoch": 0.81, + "learning_rate": 4.6833559032243284e-05, + "loss": 2.5348, + "step": 1615 + }, + { + "epoch": 0.81, + "learning_rate": 4.6814326422743594e-05, + 
"loss": 2.7775, + "step": 1620 + }, + { + "epoch": 0.82, + "learning_rate": 4.679503955536715e-05, + "loss": 2.6578, + "step": 1625 + }, + { + "epoch": 0.82, + "learning_rate": 4.6775698478085393e-05, + "loss": 2.8792, + "step": 1630 + }, + { + "epoch": 0.82, + "learning_rate": 4.675630323900458e-05, + "loss": 2.7883, + "step": 1635 + }, + { + "epoch": 0.82, + "learning_rate": 4.67368538863657e-05, + "loss": 2.8824, + "step": 1640 + }, + { + "epoch": 0.83, + "learning_rate": 4.671735046854433e-05, + "loss": 2.6775, + "step": 1645 + }, + { + "epoch": 0.83, + "learning_rate": 4.669779303405051e-05, + "loss": 2.5658, + "step": 1650 + }, + { + "epoch": 0.83, + "learning_rate": 4.667818163152864e-05, + "loss": 2.5262, + "step": 1655 + }, + { + "epoch": 0.83, + "learning_rate": 4.665851630975736e-05, + "loss": 2.5749, + "step": 1660 + }, + { + "epoch": 0.84, + "learning_rate": 4.6638797117649424e-05, + "loss": 2.8718, + "step": 1665 + }, + { + "epoch": 0.84, + "learning_rate": 4.661902410425155e-05, + "loss": 2.8039, + "step": 1670 + }, + { + "epoch": 0.84, + "learning_rate": 4.659919731874435e-05, + "loss": 2.8089, + "step": 1675 + }, + { + "epoch": 0.84, + "learning_rate": 4.6579316810442174e-05, + "loss": 2.8144, + "step": 1680 + }, + { + "epoch": 0.85, + "learning_rate": 4.6559382628792995e-05, + "loss": 2.5963, + "step": 1685 + }, + { + "epoch": 0.85, + "learning_rate": 4.653939482337828e-05, + "loss": 2.5755, + "step": 1690 + }, + { + "epoch": 0.85, + "learning_rate": 4.651935344391286e-05, + "loss": 2.7631, + "step": 1695 + }, + { + "epoch": 0.85, + "learning_rate": 4.649925854024486e-05, + "loss": 2.8117, + "step": 1700 + }, + { + "epoch": 0.86, + "learning_rate": 4.647911016235549e-05, + "loss": 2.7352, + "step": 1705 + }, + { + "epoch": 0.86, + "learning_rate": 4.6458908360358985e-05, + "loss": 2.5572, + "step": 1710 + }, + { + "epoch": 0.86, + "learning_rate": 4.643865318450246e-05, + "loss": 2.8372, + "step": 1715 + }, + { + "epoch": 0.86, + "learning_rate": 
4.6418344685165774e-05, + "loss": 2.7892, + "step": 1720 + }, + { + "epoch": 0.87, + "learning_rate": 4.639798291286143e-05, + "loss": 2.7832, + "step": 1725 + }, + { + "epoch": 0.87, + "learning_rate": 4.637756791823442e-05, + "loss": 2.5401, + "step": 1730 + }, + { + "epoch": 0.87, + "learning_rate": 4.635709975206213e-05, + "loss": 2.9286, + "step": 1735 + }, + { + "epoch": 0.87, + "learning_rate": 4.633657846525417e-05, + "loss": 2.6474, + "step": 1740 + }, + { + "epoch": 0.88, + "learning_rate": 4.6316004108852305e-05, + "loss": 2.5047, + "step": 1745 + }, + { + "epoch": 0.88, + "learning_rate": 4.629537673403029e-05, + "loss": 2.8374, + "step": 1750 + }, + { + "epoch": 0.88, + "learning_rate": 4.627469639209373e-05, + "loss": 2.634, + "step": 1755 + }, + { + "epoch": 0.88, + "learning_rate": 4.6253963134480006e-05, + "loss": 2.6884, + "step": 1760 + }, + { + "epoch": 0.89, + "learning_rate": 4.623317701275809e-05, + "loss": 2.6334, + "step": 1765 + }, + { + "epoch": 0.89, + "learning_rate": 4.621233807862844e-05, + "loss": 2.764, + "step": 1770 + }, + { + "epoch": 0.89, + "learning_rate": 4.6191446383922886e-05, + "loss": 3.0864, + "step": 1775 + }, + { + "epoch": 0.89, + "learning_rate": 4.617050198060448e-05, + "loss": 2.7964, + "step": 1780 + }, + { + "epoch": 0.9, + "learning_rate": 4.6149504920767376e-05, + "loss": 2.4538, + "step": 1785 + }, + { + "epoch": 0.9, + "learning_rate": 4.6128455256636706e-05, + "loss": 2.7352, + "step": 1790 + }, + { + "epoch": 0.9, + "learning_rate": 4.6107353040568416e-05, + "loss": 2.6893, + "step": 1795 + }, + { + "epoch": 0.9, + "learning_rate": 4.6086198325049185e-05, + "loss": 2.8609, + "step": 1800 + }, + { + "epoch": 0.91, + "learning_rate": 4.6064991162696275e-05, + "loss": 2.5285, + "step": 1805 + }, + { + "epoch": 0.91, + "learning_rate": 4.604373160625739e-05, + "loss": 2.5707, + "step": 1810 + }, + { + "epoch": 0.91, + "learning_rate": 4.602241970861053e-05, + "loss": 2.7198, + "step": 1815 + }, + { + "epoch": 
0.91, + "learning_rate": 4.6001055522763926e-05, + "loss": 2.855, + "step": 1820 + }, + { + "epoch": 0.92, + "learning_rate": 4.597963910185582e-05, + "loss": 2.4175, + "step": 1825 + }, + { + "epoch": 0.92, + "learning_rate": 4.595817049915441e-05, + "loss": 2.5444, + "step": 1830 + }, + { + "epoch": 0.92, + "learning_rate": 4.5936649768057646e-05, + "loss": 2.6893, + "step": 1835 + }, + { + "epoch": 0.92, + "learning_rate": 4.591507696209318e-05, + "loss": 2.6725, + "step": 1840 + }, + { + "epoch": 0.93, + "learning_rate": 4.589345213491817e-05, + "loss": 2.834, + "step": 1845 + }, + { + "epoch": 0.93, + "learning_rate": 4.587177534031914e-05, + "loss": 2.8259, + "step": 1850 + }, + { + "epoch": 0.93, + "learning_rate": 4.585004663221188e-05, + "loss": 2.6146, + "step": 1855 + }, + { + "epoch": 0.93, + "learning_rate": 4.582826606464134e-05, + "loss": 2.4673, + "step": 1860 + }, + { + "epoch": 0.94, + "learning_rate": 4.5806433691781416e-05, + "loss": 2.9178, + "step": 1865 + }, + { + "epoch": 0.94, + "learning_rate": 4.578454956793487e-05, + "loss": 2.9224, + "step": 1870 + }, + { + "epoch": 0.94, + "learning_rate": 4.576261374753318e-05, + "loss": 2.7987, + "step": 1875 + }, + { + "epoch": 0.94, + "learning_rate": 4.574062628513642e-05, + "loss": 2.3848, + "step": 1880 + }, + { + "epoch": 0.95, + "learning_rate": 4.57185872354331e-05, + "loss": 2.6054, + "step": 1885 + }, + { + "epoch": 0.95, + "learning_rate": 4.569649665324003e-05, + "loss": 2.6743, + "step": 1890 + }, + { + "epoch": 0.95, + "learning_rate": 4.567435459350222e-05, + "loss": 2.9375, + "step": 1895 + }, + { + "epoch": 0.95, + "learning_rate": 4.565216111129269e-05, + "loss": 2.9372, + "step": 1900 + }, + { + "epoch": 0.96, + "learning_rate": 4.562991626181239e-05, + "loss": 2.669, + "step": 1905 + }, + { + "epoch": 0.96, + "learning_rate": 4.560762010039001e-05, + "loss": 2.7644, + "step": 1910 + }, + { + "epoch": 0.96, + "learning_rate": 4.558527268248187e-05, + "loss": 2.6886, + "step": 1915 
+ }, + { + "epoch": 0.96, + "learning_rate": 4.55628740636718e-05, + "loss": 2.8618, + "step": 1920 + }, + { + "epoch": 0.97, + "learning_rate": 4.554042429967095e-05, + "loss": 2.8411, + "step": 1925 + }, + { + "epoch": 0.97, + "learning_rate": 4.55179234463177e-05, + "loss": 2.6047, + "step": 1930 + }, + { + "epoch": 0.97, + "learning_rate": 4.5495371559577496e-05, + "loss": 2.8675, + "step": 1935 + }, + { + "epoch": 0.97, + "learning_rate": 4.547276869554271e-05, + "loss": 2.751, + "step": 1940 + }, + { + "epoch": 0.98, + "learning_rate": 4.545011491043253e-05, + "loss": 2.8638, + "step": 1945 + }, + { + "epoch": 0.98, + "learning_rate": 4.5427410260592775e-05, + "loss": 2.5092, + "step": 1950 + }, + { + "epoch": 0.98, + "learning_rate": 4.540465480249579e-05, + "loss": 2.5059, + "step": 1955 + }, + { + "epoch": 0.98, + "learning_rate": 4.5381848592740285e-05, + "loss": 2.9433, + "step": 1960 + }, + { + "epoch": 0.99, + "learning_rate": 4.535899168805121e-05, + "loss": 2.6959, + "step": 1965 + }, + { + "epoch": 0.99, + "learning_rate": 4.533608414527961e-05, + "loss": 2.552, + "step": 1970 + }, + { + "epoch": 0.99, + "learning_rate": 4.5313126021402465e-05, + "loss": 2.7169, + "step": 1975 + }, + { + "epoch": 0.99, + "learning_rate": 4.529011737352258e-05, + "loss": 2.5672, + "step": 1980 + }, + { + "epoch": 1.0, + "learning_rate": 4.526705825886841e-05, + "loss": 2.6434, + "step": 1985 + }, + { + "epoch": 1.0, + "learning_rate": 4.5243948734793947e-05, + "loss": 2.8062, + "step": 1990 + }, + { + "epoch": 1.0, + "learning_rate": 4.5220788858778556e-05, + "loss": 2.6929, + "step": 1995 + }, + { + "epoch": 1.0, + "learning_rate": 4.519757868842684e-05, + "loss": 2.5837, + "step": 2000 + }, + { + "epoch": 1.01, + "learning_rate": 4.517431828146852e-05, + "loss": 2.821, + "step": 2005 + }, + { + "epoch": 1.01, + "learning_rate": 4.515100769575824e-05, + "loss": 2.7913, + "step": 2010 + }, + { + "epoch": 1.01, + "learning_rate": 4.512764698927545e-05, + "loss": 
2.699, + "step": 2015 + }, + { + "epoch": 1.01, + "learning_rate": 4.5104236220124286e-05, + "loss": 2.6574, + "step": 2020 + }, + { + "epoch": 1.02, + "learning_rate": 4.508077544653338e-05, + "loss": 2.5909, + "step": 2025 + }, + { + "epoch": 1.02, + "learning_rate": 4.5057264726855765e-05, + "loss": 2.5695, + "step": 2030 + }, + { + "epoch": 1.02, + "learning_rate": 4.5033704119568675e-05, + "loss": 2.4937, + "step": 2035 + }, + { + "epoch": 1.02, + "learning_rate": 4.501009368327344e-05, + "loss": 2.7253, + "step": 2040 + }, + { + "epoch": 1.03, + "learning_rate": 4.4986433476695334e-05, + "loss": 2.8681, + "step": 2045 + }, + { + "epoch": 1.03, + "learning_rate": 4.496272355868341e-05, + "loss": 2.74, + "step": 2050 + }, + { + "epoch": 1.03, + "learning_rate": 4.4938963988210365e-05, + "loss": 2.7439, + "step": 2055 + }, + { + "epoch": 1.03, + "learning_rate": 4.491515482437242e-05, + "loss": 2.9539, + "step": 2060 + }, + { + "epoch": 1.04, + "learning_rate": 4.4891296126389104e-05, + "loss": 2.8165, + "step": 2065 + }, + { + "epoch": 1.04, + "learning_rate": 4.48673879536032e-05, + "loss": 2.601, + "step": 2070 + }, + { + "epoch": 1.04, + "learning_rate": 4.484343036548051e-05, + "loss": 2.5189, + "step": 2075 + }, + { + "epoch": 1.04, + "learning_rate": 4.481942342160976e-05, + "loss": 2.6276, + "step": 2080 + }, + { + "epoch": 1.05, + "learning_rate": 4.479536718170243e-05, + "loss": 2.5027, + "step": 2085 + }, + { + "epoch": 1.05, + "learning_rate": 4.477126170559262e-05, + "loss": 2.8654, + "step": 2090 + }, + { + "epoch": 1.05, + "learning_rate": 4.474710705323688e-05, + "loss": 2.8511, + "step": 2095 + }, + { + "epoch": 1.05, + "learning_rate": 4.47229032847141e-05, + "loss": 2.6129, + "step": 2100 + }, + { + "epoch": 1.06, + "learning_rate": 4.469865046022531e-05, + "loss": 2.8964, + "step": 2105 + }, + { + "epoch": 1.06, + "learning_rate": 4.4674348640093554e-05, + "loss": 2.7502, + "step": 2110 + }, + { + "epoch": 1.06, + "learning_rate": 
4.4649997884763765e-05, + "loss": 2.591, + "step": 2115 + }, + { + "epoch": 1.06, + "learning_rate": 4.462559825480257e-05, + "loss": 2.7982, + "step": 2120 + }, + { + "epoch": 1.07, + "learning_rate": 4.460114981089815e-05, + "loss": 2.576, + "step": 2125 + }, + { + "epoch": 1.07, + "learning_rate": 4.457665261386014e-05, + "loss": 2.692, + "step": 2130 + }, + { + "epoch": 1.07, + "learning_rate": 4.455210672461938e-05, + "loss": 2.5818, + "step": 2135 + }, + { + "epoch": 1.07, + "learning_rate": 4.452751220422787e-05, + "loss": 2.6792, + "step": 2140 + }, + { + "epoch": 1.08, + "learning_rate": 4.450286911385856e-05, + "loss": 2.8352, + "step": 2145 + }, + { + "epoch": 1.08, + "learning_rate": 4.4478177514805166e-05, + "loss": 2.5787, + "step": 2150 + }, + { + "epoch": 1.08, + "learning_rate": 4.4453437468482103e-05, + "loss": 2.655, + "step": 2155 + }, + { + "epoch": 1.08, + "learning_rate": 4.442864903642428e-05, + "loss": 2.7664, + "step": 2160 + }, + { + "epoch": 1.09, + "learning_rate": 4.440381228028692e-05, + "loss": 2.7231, + "step": 2165 + }, + { + "epoch": 1.09, + "learning_rate": 4.437892726184548e-05, + "loss": 2.416, + "step": 2170 + }, + { + "epoch": 1.09, + "learning_rate": 4.4353994042995446e-05, + "loss": 2.4619, + "step": 2175 + }, + { + "epoch": 1.09, + "learning_rate": 4.4329012685752183e-05, + "loss": 2.7816, + "step": 2180 + }, + { + "epoch": 1.1, + "learning_rate": 4.430398325225078e-05, + "loss": 2.8678, + "step": 2185 + }, + { + "epoch": 1.1, + "learning_rate": 4.427890580474594e-05, + "loss": 2.6007, + "step": 2190 + }, + { + "epoch": 1.1, + "learning_rate": 4.4253780405611754e-05, + "loss": 2.8195, + "step": 2195 + }, + { + "epoch": 1.1, + "learning_rate": 4.42286071173416e-05, + "loss": 2.5117, + "step": 2200 + }, + { + "epoch": 1.11, + "learning_rate": 4.4203386002547956e-05, + "loss": 2.5527, + "step": 2205 + }, + { + "epoch": 1.11, + "learning_rate": 4.417811712396226e-05, + "loss": 2.662, + "step": 2210 + }, + { + "epoch": 1.11, + 
"learning_rate": 4.415280054443477e-05, + "loss": 2.7563, + "step": 2215 + }, + { + "epoch": 1.11, + "learning_rate": 4.4127436326934354e-05, + "loss": 2.5118, + "step": 2220 + }, + { + "epoch": 1.12, + "learning_rate": 4.41020245345484e-05, + "loss": 2.7208, + "step": 2225 + }, + { + "epoch": 1.12, + "learning_rate": 4.4076565230482607e-05, + "loss": 2.649, + "step": 2230 + }, + { + "epoch": 1.12, + "learning_rate": 4.4051058478060856e-05, + "loss": 2.652, + "step": 2235 + }, + { + "epoch": 1.12, + "learning_rate": 4.402550434072505e-05, + "loss": 2.6885, + "step": 2240 + }, + { + "epoch": 1.13, + "learning_rate": 4.3999902882034935e-05, + "loss": 2.8545, + "step": 2245 + }, + { + "epoch": 1.13, + "learning_rate": 4.397425416566797e-05, + "loss": 2.9435, + "step": 2250 + }, + { + "epoch": 1.13, + "learning_rate": 4.3948558255419146e-05, + "loss": 2.6555, + "step": 2255 + }, + { + "epoch": 1.13, + "learning_rate": 4.392281521520085e-05, + "loss": 2.6108, + "step": 2260 + }, + { + "epoch": 1.14, + "learning_rate": 4.389702510904269e-05, + "loss": 2.6256, + "step": 2265 + }, + { + "epoch": 1.14, + "learning_rate": 4.387118800109133e-05, + "loss": 2.7996, + "step": 2270 + }, + { + "epoch": 1.14, + "learning_rate": 4.384530395561035e-05, + "loss": 2.6649, + "step": 2275 + }, + { + "epoch": 1.14, + "learning_rate": 4.381937303698006e-05, + "loss": 3.0073, + "step": 2280 + }, + { + "epoch": 1.15, + "learning_rate": 4.379339530969738e-05, + "loss": 2.8493, + "step": 2285 + }, + { + "epoch": 1.15, + "learning_rate": 4.3767370838375635e-05, + "loss": 2.5572, + "step": 2290 + }, + { + "epoch": 1.15, + "learning_rate": 4.374129968774443e-05, + "loss": 2.3837, + "step": 2295 + }, + { + "epoch": 1.15, + "learning_rate": 4.371518192264946e-05, + "loss": 2.4604, + "step": 2300 + }, + { + "epoch": 1.16, + "learning_rate": 4.3689017608052374e-05, + "loss": 2.8866, + "step": 2305 + }, + { + "epoch": 1.16, + "learning_rate": 4.3662806809030585e-05, + "loss": 2.8943, + "step": 2310 + 
}, + { + "epoch": 1.16, + "learning_rate": 4.3636549590777144e-05, + "loss": 2.7629, + "step": 2315 + }, + { + "epoch": 1.16, + "learning_rate": 4.361024601860054e-05, + "loss": 2.8629, + "step": 2320 + }, + { + "epoch": 1.17, + "learning_rate": 4.3583896157924574e-05, + "loss": 2.7952, + "step": 2325 + }, + { + "epoch": 1.17, + "learning_rate": 4.355750007428817e-05, + "loss": 2.2057, + "step": 2330 + }, + { + "epoch": 1.17, + "learning_rate": 4.3531057833345216e-05, + "loss": 2.9143, + "step": 2335 + }, + { + "epoch": 1.17, + "learning_rate": 4.3504569500864424e-05, + "loss": 2.5354, + "step": 2340 + }, + { + "epoch": 1.18, + "learning_rate": 4.347803514272911e-05, + "loss": 2.7451, + "step": 2345 + }, + { + "epoch": 1.18, + "learning_rate": 4.3451454824937113e-05, + "loss": 3.0827, + "step": 2350 + }, + { + "epoch": 1.18, + "learning_rate": 4.3424828613600555e-05, + "loss": 2.5842, + "step": 2355 + }, + { + "epoch": 1.18, + "learning_rate": 4.339815657494572e-05, + "loss": 2.4492, + "step": 2360 + }, + { + "epoch": 1.19, + "learning_rate": 4.3371438775312865e-05, + "loss": 2.5067, + "step": 2365 + }, + { + "epoch": 1.19, + "learning_rate": 4.334467528115608e-05, + "loss": 2.5902, + "step": 2370 + }, + { + "epoch": 1.19, + "learning_rate": 4.33178661590431e-05, + "loss": 2.8618, + "step": 2375 + }, + { + "epoch": 1.19, + "learning_rate": 4.329101147565515e-05, + "loss": 2.6831, + "step": 2380 + }, + { + "epoch": 1.2, + "learning_rate": 4.3264111297786794e-05, + "loss": 2.7531, + "step": 2385 + }, + { + "epoch": 1.2, + "learning_rate": 4.323716569234572e-05, + "loss": 2.819, + "step": 2390 + }, + { + "epoch": 1.2, + "learning_rate": 4.321017472635263e-05, + "loss": 2.6319, + "step": 2395 + }, + { + "epoch": 1.2, + "learning_rate": 4.318313846694105e-05, + "loss": 2.3078, + "step": 2400 + }, + { + "epoch": 1.21, + "learning_rate": 4.315605698135714e-05, + "loss": 2.7962, + "step": 2405 + }, + { + "epoch": 1.21, + "learning_rate": 4.312893033695958e-05, + "loss": 
2.8479, + "step": 2410 + }, + { + "epoch": 1.21, + "learning_rate": 4.310175860121936e-05, + "loss": 2.6289, + "step": 2415 + }, + { + "epoch": 1.21, + "learning_rate": 4.30745418417196e-05, + "loss": 2.7003, + "step": 2420 + }, + { + "epoch": 1.22, + "learning_rate": 4.304728012615543e-05, + "loss": 2.4947, + "step": 2425 + }, + { + "epoch": 1.22, + "learning_rate": 4.3019973522333815e-05, + "loss": 2.6911, + "step": 2430 + }, + { + "epoch": 1.22, + "learning_rate": 4.2992622098173334e-05, + "loss": 2.6931, + "step": 2435 + }, + { + "epoch": 1.22, + "learning_rate": 4.296522592170406e-05, + "loss": 2.774, + "step": 2440 + }, + { + "epoch": 1.23, + "learning_rate": 4.293778506106737e-05, + "loss": 2.759, + "step": 2445 + }, + { + "epoch": 1.23, + "learning_rate": 4.29102995845158e-05, + "loss": 2.5543, + "step": 2450 + }, + { + "epoch": 1.23, + "learning_rate": 4.288276956041284e-05, + "loss": 2.7702, + "step": 2455 + }, + { + "epoch": 1.23, + "learning_rate": 4.285519505723278e-05, + "loss": 2.835, + "step": 2460 + }, + { + "epoch": 1.24, + "learning_rate": 4.282757614356055e-05, + "loss": 2.6516, + "step": 2465 + }, + { + "epoch": 1.24, + "learning_rate": 4.2799912888091544e-05, + "loss": 2.7392, + "step": 2470 + }, + { + "epoch": 1.24, + "learning_rate": 4.277220535963143e-05, + "loss": 2.6522, + "step": 2475 + }, + { + "epoch": 1.24, + "learning_rate": 4.274445362709601e-05, + "loss": 2.6514, + "step": 2480 + }, + { + "epoch": 1.25, + "learning_rate": 4.271665775951104e-05, + "loss": 2.7507, + "step": 2485 + }, + { + "epoch": 1.25, + "learning_rate": 4.2688817826012005e-05, + "loss": 3.0499, + "step": 2490 + }, + { + "epoch": 1.25, + "learning_rate": 4.2660933895844055e-05, + "loss": 2.6927, + "step": 2495 + }, + { + "epoch": 1.25, + "learning_rate": 4.2633006038361736e-05, + "loss": 2.8456, + "step": 2500 + }, + { + "epoch": 1.26, + "learning_rate": 4.2605034323028844e-05, + "loss": 2.671, + "step": 2505 + }, + { + "epoch": 1.26, + "learning_rate": 
4.2577018819418296e-05, + "loss": 2.6543, + "step": 2510 + }, + { + "epoch": 1.26, + "learning_rate": 4.254895959721189e-05, + "loss": 2.566, + "step": 2515 + }, + { + "epoch": 1.26, + "learning_rate": 4.252085672620019e-05, + "loss": 2.4943, + "step": 2520 + }, + { + "epoch": 1.27, + "learning_rate": 4.249271027628228e-05, + "loss": 2.5115, + "step": 2525 + }, + { + "epoch": 1.27, + "learning_rate": 4.24645203174657e-05, + "loss": 2.3859, + "step": 2530 + }, + { + "epoch": 1.27, + "learning_rate": 4.243628691986617e-05, + "loss": 2.8148, + "step": 2535 + }, + { + "epoch": 1.27, + "learning_rate": 4.240801015370743e-05, + "loss": 2.5597, + "step": 2540 + }, + { + "epoch": 1.28, + "learning_rate": 4.2379690089321145e-05, + "loss": 2.805, + "step": 2545 + }, + { + "epoch": 1.28, + "learning_rate": 4.235132679714664e-05, + "loss": 2.7308, + "step": 2550 + }, + { + "epoch": 1.28, + "learning_rate": 4.232292034773076e-05, + "loss": 2.675, + "step": 2555 + }, + { + "epoch": 1.28, + "learning_rate": 4.2294470811727704e-05, + "loss": 2.6359, + "step": 2560 + }, + { + "epoch": 1.29, + "learning_rate": 4.226597825989883e-05, + "loss": 2.4288, + "step": 2565 + }, + { + "epoch": 1.29, + "learning_rate": 4.223744276311249e-05, + "loss": 2.642, + "step": 2570 + }, + { + "epoch": 1.29, + "learning_rate": 4.220886439234385e-05, + "loss": 2.7375, + "step": 2575 + }, + { + "epoch": 1.29, + "learning_rate": 4.218024321867472e-05, + "loss": 2.6114, + "step": 2580 + }, + { + "epoch": 1.3, + "learning_rate": 4.2151579313293364e-05, + "loss": 2.4941, + "step": 2585 + }, + { + "epoch": 1.3, + "learning_rate": 4.212287274749434e-05, + "loss": 2.6086, + "step": 2590 + }, + { + "epoch": 1.3, + "learning_rate": 4.2094123592678295e-05, + "loss": 2.7854, + "step": 2595 + }, + { + "epoch": 1.3, + "learning_rate": 4.206533192035184e-05, + "loss": 2.8291, + "step": 2600 + }, + { + "epoch": 1.31, + "learning_rate": 4.2036497802127294e-05, + "loss": 2.6846, + "step": 2605 + }, + { + "epoch": 1.31, + 
"learning_rate": 4.200762130972259e-05, + "loss": 2.6667, + "step": 2610 + }, + { + "epoch": 1.31, + "learning_rate": 4.197870251496103e-05, + "loss": 2.7895, + "step": 2615 + }, + { + "epoch": 1.31, + "learning_rate": 4.1949741489771155e-05, + "loss": 2.6958, + "step": 2620 + }, + { + "epoch": 1.32, + "learning_rate": 4.192073830618652e-05, + "loss": 2.8536, + "step": 2625 + }, + { + "epoch": 1.32, + "learning_rate": 4.189169303634555e-05, + "loss": 2.5218, + "step": 2630 + }, + { + "epoch": 1.32, + "learning_rate": 4.186260575249136e-05, + "loss": 2.6141, + "step": 2635 + }, + { + "epoch": 1.32, + "learning_rate": 4.1833476526971546e-05, + "loss": 2.6231, + "step": 2640 + }, + { + "epoch": 1.33, + "learning_rate": 4.1804305432238036e-05, + "loss": 2.8229, + "step": 2645 + }, + { + "epoch": 1.33, + "learning_rate": 4.1775092540846885e-05, + "loss": 2.4763, + "step": 2650 + }, + { + "epoch": 1.33, + "learning_rate": 4.174583792545813e-05, + "loss": 2.5162, + "step": 2655 + }, + { + "epoch": 1.33, + "learning_rate": 4.1716541658835574e-05, + "loss": 2.7301, + "step": 2660 + }, + { + "epoch": 1.34, + "learning_rate": 4.16872038138466e-05, + "loss": 2.6055, + "step": 2665 + }, + { + "epoch": 1.34, + "learning_rate": 4.165782446346203e-05, + "loss": 2.6529, + "step": 2670 + }, + { + "epoch": 1.34, + "learning_rate": 4.162840368075591e-05, + "loss": 2.7934, + "step": 2675 + }, + { + "epoch": 1.34, + "learning_rate": 4.159894153890536e-05, + "loss": 2.9108, + "step": 2680 + }, + { + "epoch": 1.35, + "learning_rate": 4.1569438111190326e-05, + "loss": 2.6543, + "step": 2685 + }, + { + "epoch": 1.35, + "learning_rate": 4.1539893470993496e-05, + "loss": 2.6038, + "step": 2690 + }, + { + "epoch": 1.35, + "learning_rate": 4.151030769180002e-05, + "loss": 2.5741, + "step": 2695 + }, + { + "epoch": 1.35, + "learning_rate": 4.14806808471974e-05, + "loss": 2.8525, + "step": 2700 + }, + { + "epoch": 1.36, + "learning_rate": 4.1451013010875275e-05, + "loss": 2.5151, + "step": 2705 + 
}, + { + "epoch": 1.36, + "learning_rate": 4.1421304256625206e-05, + "loss": 2.5666, + "step": 2710 + }, + { + "epoch": 1.36, + "learning_rate": 4.139155465834058e-05, + "loss": 2.7287, + "step": 2715 + }, + { + "epoch": 1.36, + "learning_rate": 4.136176429001634e-05, + "loss": 2.8867, + "step": 2720 + }, + { + "epoch": 1.37, + "learning_rate": 4.133193322574885e-05, + "loss": 2.6398, + "step": 2725 + }, + { + "epoch": 1.37, + "learning_rate": 4.130206153973568e-05, + "loss": 2.699, + "step": 2730 + }, + { + "epoch": 1.37, + "learning_rate": 4.1272149306275447e-05, + "loss": 2.5248, + "step": 2735 + }, + { + "epoch": 1.37, + "learning_rate": 4.124219659976762e-05, + "loss": 2.8947, + "step": 2740 + }, + { + "epoch": 1.38, + "learning_rate": 4.1212203494712344e-05, + "loss": 2.6473, + "step": 2745 + }, + { + "epoch": 1.38, + "learning_rate": 4.1182170065710226e-05, + "loss": 2.3734, + "step": 2750 + }, + { + "epoch": 1.38, + "learning_rate": 4.115209638746218e-05, + "loss": 2.685, + "step": 2755 + }, + { + "epoch": 1.39, + "learning_rate": 4.1121982534769224e-05, + "loss": 2.9712, + "step": 2760 + }, + { + "epoch": 1.39, + "learning_rate": 4.109182858253231e-05, + "loss": 2.6578, + "step": 2765 + }, + { + "epoch": 1.39, + "learning_rate": 4.106163460575212e-05, + "loss": 2.6842, + "step": 2770 + }, + { + "epoch": 1.39, + "learning_rate": 4.10314006795289e-05, + "loss": 2.511, + "step": 2775 + }, + { + "epoch": 1.4, + "learning_rate": 4.100112687906224e-05, + "loss": 2.8426, + "step": 2780 + }, + { + "epoch": 1.4, + "learning_rate": 4.097081327965092e-05, + "loss": 2.8462, + "step": 2785 + }, + { + "epoch": 1.4, + "learning_rate": 4.094045995669271e-05, + "loss": 2.7552, + "step": 2790 + }, + { + "epoch": 1.4, + "learning_rate": 4.091006698568419e-05, + "loss": 2.3802, + "step": 2795 + }, + { + "epoch": 1.41, + "learning_rate": 4.087963444222054e-05, + "loss": 2.3967, + "step": 2800 + }, + { + "epoch": 1.41, + "learning_rate": 4.084916240199537e-05, + "loss": 2.7945, 
+ "step": 2805 + }, + { + "epoch": 1.41, + "learning_rate": 4.0818650940800524e-05, + "loss": 2.5304, + "step": 2810 + }, + { + "epoch": 1.41, + "learning_rate": 4.0788100134525925e-05, + "loss": 2.6779, + "step": 2815 + }, + { + "epoch": 1.42, + "learning_rate": 4.075751005915932e-05, + "loss": 2.6202, + "step": 2820 + }, + { + "epoch": 1.42, + "learning_rate": 4.072688079078616e-05, + "loss": 2.5915, + "step": 2825 + }, + { + "epoch": 1.42, + "learning_rate": 4.069621240558935e-05, + "loss": 2.5139, + "step": 2830 + }, + { + "epoch": 1.42, + "learning_rate": 4.066550497984911e-05, + "loss": 2.5506, + "step": 2835 + }, + { + "epoch": 1.43, + "learning_rate": 4.063475858994276e-05, + "loss": 2.7154, + "step": 2840 + }, + { + "epoch": 1.43, + "learning_rate": 4.060397331234452e-05, + "loss": 2.6309, + "step": 2845 + }, + { + "epoch": 1.43, + "learning_rate": 4.0573149223625365e-05, + "loss": 2.6922, + "step": 2850 + }, + { + "epoch": 1.43, + "learning_rate": 4.054228640045276e-05, + "loss": 2.703, + "step": 2855 + }, + { + "epoch": 1.44, + "learning_rate": 4.051138491959053e-05, + "loss": 2.3475, + "step": 2860 + }, + { + "epoch": 1.44, + "learning_rate": 4.048044485789869e-05, + "loss": 2.7186, + "step": 2865 + }, + { + "epoch": 1.44, + "learning_rate": 4.044946629233316e-05, + "loss": 2.4667, + "step": 2870 + }, + { + "epoch": 1.44, + "learning_rate": 4.041844929994566e-05, + "loss": 2.562, + "step": 2875 + }, + { + "epoch": 1.45, + "learning_rate": 4.038739395788347e-05, + "loss": 2.4116, + "step": 2880 + }, + { + "epoch": 1.45, + "learning_rate": 4.0356300343389276e-05, + "loss": 2.6133, + "step": 2885 + }, + { + "epoch": 1.45, + "learning_rate": 4.032516853380094e-05, + "loss": 2.6893, + "step": 2890 + }, + { + "epoch": 1.45, + "learning_rate": 4.029399860655132e-05, + "loss": 2.8196, + "step": 2895 + }, + { + "epoch": 1.46, + "learning_rate": 4.026279063916811e-05, + "loss": 2.8933, + "step": 2900 + }, + { + "epoch": 1.46, + "learning_rate": 
4.023154470927361e-05, + "loss": 3.0289, + "step": 2905 + }, + { + "epoch": 1.46, + "learning_rate": 4.0200260894584516e-05, + "loss": 2.5586, + "step": 2910 + }, + { + "epoch": 1.46, + "learning_rate": 4.016893927291179e-05, + "loss": 2.785, + "step": 2915 + }, + { + "epoch": 1.47, + "learning_rate": 4.0137579922160395e-05, + "loss": 2.5293, + "step": 2920 + }, + { + "epoch": 1.47, + "learning_rate": 4.010618292032917e-05, + "loss": 2.6696, + "step": 2925 + }, + { + "epoch": 1.47, + "learning_rate": 4.007474834551058e-05, + "loss": 2.7003, + "step": 2930 + }, + { + "epoch": 1.47, + "learning_rate": 4.004327627589056e-05, + "loss": 2.6005, + "step": 2935 + }, + { + "epoch": 1.48, + "learning_rate": 4.001176678974828e-05, + "loss": 2.7198, + "step": 2940 + }, + { + "epoch": 1.48, + "learning_rate": 3.998021996545599e-05, + "loss": 2.5329, + "step": 2945 + }, + { + "epoch": 1.48, + "learning_rate": 3.994863588147881e-05, + "loss": 2.7051, + "step": 2950 + }, + { + "epoch": 1.48, + "learning_rate": 3.9917014616374535e-05, + "loss": 2.9277, + "step": 2955 + }, + { + "epoch": 1.49, + "learning_rate": 3.988535624879344e-05, + "loss": 2.3729, + "step": 2960 + }, + { + "epoch": 1.49, + "learning_rate": 3.985366085747808e-05, + "loss": 2.5122, + "step": 2965 + }, + { + "epoch": 1.49, + "learning_rate": 3.982192852126309e-05, + "loss": 2.4413, + "step": 2970 + }, + { + "epoch": 1.49, + "learning_rate": 3.979015931907501e-05, + "loss": 2.6104, + "step": 2975 + }, + { + "epoch": 1.5, + "learning_rate": 3.975835332993207e-05, + "loss": 2.8855, + "step": 2980 + }, + { + "epoch": 1.5, + "learning_rate": 3.9726510632944e-05, + "loss": 2.7487, + "step": 2985 + }, + { + "epoch": 1.5, + "learning_rate": 3.969463130731183e-05, + "loss": 2.7085, + "step": 2990 + }, + { + "epoch": 1.5, + "learning_rate": 3.966271543232769e-05, + "loss": 2.6464, + "step": 2995 + }, + { + "epoch": 1.51, + "learning_rate": 3.9630763087374625e-05, + "loss": 2.4105, + "step": 3000 + }, + { + "epoch": 1.51, + 
"learning_rate": 3.9598774351926394e-05, + "loss": 2.7296, + "step": 3005 + }, + { + "epoch": 1.51, + "learning_rate": 3.956674930554725e-05, + "loss": 2.5319, + "step": 3010 + }, + { + "epoch": 1.51, + "learning_rate": 3.9534688027891785e-05, + "loss": 2.5544, + "step": 3015 + }, + { + "epoch": 1.52, + "learning_rate": 3.9502590598704696e-05, + "loss": 2.7981, + "step": 3020 + }, + { + "epoch": 1.52, + "learning_rate": 3.94704570978206e-05, + "loss": 2.6186, + "step": 3025 + }, + { + "epoch": 1.52, + "learning_rate": 3.943828760516382e-05, + "loss": 2.6443, + "step": 3030 + }, + { + "epoch": 1.52, + "learning_rate": 3.940608220074822e-05, + "loss": 2.4564, + "step": 3035 + }, + { + "epoch": 1.53, + "learning_rate": 3.9373840964676976e-05, + "loss": 2.8394, + "step": 3040 + }, + { + "epoch": 1.53, + "learning_rate": 3.934156397714238e-05, + "loss": 2.3683, + "step": 3045 + }, + { + "epoch": 1.53, + "learning_rate": 3.930925131842567e-05, + "loss": 2.875, + "step": 3050 + }, + { + "epoch": 1.53, + "learning_rate": 3.9276903068896784e-05, + "loss": 2.6381, + "step": 3055 + }, + { + "epoch": 1.54, + "learning_rate": 3.9244519309014206e-05, + "loss": 2.8077, + "step": 3060 + }, + { + "epoch": 1.54, + "learning_rate": 3.9212100119324706e-05, + "loss": 2.6208, + "step": 3065 + }, + { + "epoch": 1.54, + "learning_rate": 3.917964558046322e-05, + "loss": 2.9, + "step": 3070 + }, + { + "epoch": 1.54, + "learning_rate": 3.9147155773152586e-05, + "loss": 2.5678, + "step": 3075 + }, + { + "epoch": 1.55, + "learning_rate": 3.911463077820336e-05, + "loss": 2.7059, + "step": 3080 + }, + { + "epoch": 1.55, + "learning_rate": 3.90820706765136e-05, + "loss": 2.7433, + "step": 3085 + }, + { + "epoch": 1.55, + "learning_rate": 3.9049475549068757e-05, + "loss": 2.5916, + "step": 3090 + }, + { + "epoch": 1.55, + "learning_rate": 3.901684547694132e-05, + "loss": 2.6978, + "step": 3095 + }, + { + "epoch": 1.56, + "learning_rate": 3.8984180541290724e-05, + "loss": 2.8709, + "step": 3100 + 
}, + { + "epoch": 1.56, + "learning_rate": 3.895148082336313e-05, + "loss": 2.8652, + "step": 3105 + }, + { + "epoch": 1.56, + "learning_rate": 3.89187464044912e-05, + "loss": 3.0331, + "step": 3110 + }, + { + "epoch": 1.56, + "learning_rate": 3.8885977366093905e-05, + "loss": 2.7334, + "step": 3115 + }, + { + "epoch": 1.57, + "learning_rate": 3.885317378967633e-05, + "loss": 2.6026, + "step": 3120 + }, + { + "epoch": 1.57, + "learning_rate": 3.882033575682945e-05, + "loss": 2.831, + "step": 3125 + }, + { + "epoch": 1.57, + "learning_rate": 3.878746334922996e-05, + "loss": 2.5585, + "step": 3130 + }, + { + "epoch": 1.57, + "learning_rate": 3.875455664864005e-05, + "loss": 2.762, + "step": 3135 + }, + { + "epoch": 1.58, + "learning_rate": 3.8721615736907205e-05, + "loss": 2.769, + "step": 3140 + }, + { + "epoch": 1.58, + "learning_rate": 3.868864069596399e-05, + "loss": 2.6496, + "step": 3145 + }, + { + "epoch": 1.58, + "learning_rate": 3.8655631607827876e-05, + "loss": 2.486, + "step": 3150 + }, + { + "epoch": 1.58, + "learning_rate": 3.8622588554601e-05, + "loss": 2.8387, + "step": 3155 + }, + { + "epoch": 1.59, + "learning_rate": 3.858951161847001e-05, + "loss": 2.816, + "step": 3160 + }, + { + "epoch": 1.59, + "learning_rate": 3.855640088170579e-05, + "loss": 2.8128, + "step": 3165 + }, + { + "epoch": 1.59, + "learning_rate": 3.8523256426663313e-05, + "loss": 2.5875, + "step": 3170 + }, + { + "epoch": 1.59, + "learning_rate": 3.8490078335781423e-05, + "loss": 2.5853, + "step": 3175 + }, + { + "epoch": 1.6, + "learning_rate": 3.845686669158263e-05, + "loss": 2.7638, + "step": 3180 + }, + { + "epoch": 1.6, + "learning_rate": 3.842362157667287e-05, + "loss": 2.8281, + "step": 3185 + }, + { + "epoch": 1.6, + "learning_rate": 3.839700144139754e-05, + "loss": 2.3574, + "step": 3190 + }, + { + "epoch": 1.6, + "learning_rate": 3.836369628764067e-05, + "loss": 2.5533, + "step": 3195 + }, + { + "epoch": 1.61, + "learning_rate": 3.833035789491177e-05, + "loss": 2.4513, + 
"step": 3200 + }, + { + "epoch": 1.61, + "learning_rate": 3.8296986346132036e-05, + "loss": 2.567, + "step": 3205 + }, + { + "epoch": 1.61, + "learning_rate": 3.826358172430516e-05, + "loss": 2.6501, + "step": 3210 + }, + { + "epoch": 1.61, + "learning_rate": 3.823014411251708e-05, + "loss": 2.6927, + "step": 3215 + }, + { + "epoch": 1.62, + "learning_rate": 3.819667359393578e-05, + "loss": 2.6094, + "step": 3220 + }, + { + "epoch": 1.62, + "learning_rate": 3.816317025181111e-05, + "loss": 2.6776, + "step": 3225 + }, + { + "epoch": 1.62, + "learning_rate": 3.8129634169474563e-05, + "loss": 2.5833, + "step": 3230 + }, + { + "epoch": 1.62, + "learning_rate": 3.809606543033905e-05, + "loss": 2.6483, + "step": 3235 + }, + { + "epoch": 1.63, + "learning_rate": 3.8062464117898724e-05, + "loss": 2.4814, + "step": 3240 + }, + { + "epoch": 1.63, + "learning_rate": 3.802883031572874e-05, + "loss": 2.5217, + "step": 3245 + }, + { + "epoch": 1.63, + "learning_rate": 3.799516410748506e-05, + "loss": 2.5127, + "step": 3250 + }, + { + "epoch": 1.63, + "learning_rate": 3.796146557690428e-05, + "loss": 2.9325, + "step": 3255 + }, + { + "epoch": 1.64, + "learning_rate": 3.792773480780335e-05, + "loss": 2.4567, + "step": 3260 + }, + { + "epoch": 1.64, + "learning_rate": 3.789397188407944e-05, + "loss": 2.6045, + "step": 3265 + }, + { + "epoch": 1.64, + "learning_rate": 3.786017688970967e-05, + "loss": 2.5031, + "step": 3270 + }, + { + "epoch": 1.64, + "learning_rate": 3.782634990875094e-05, + "loss": 2.7692, + "step": 3275 + }, + { + "epoch": 1.65, + "learning_rate": 3.779249102533972e-05, + "loss": 2.6893, + "step": 3280 + }, + { + "epoch": 1.65, + "learning_rate": 3.7758600323691806e-05, + "loss": 2.604, + "step": 3285 + }, + { + "epoch": 1.65, + "learning_rate": 3.7724677888102145e-05, + "loss": 2.6633, + "step": 3290 + }, + { + "epoch": 1.65, + "learning_rate": 3.769072380294463e-05, + "loss": 2.4804, + "step": 3295 + }, + { + "epoch": 1.66, + "learning_rate": 
3.765673815267184e-05, + "loss": 2.5951, + "step": 3300 + }, + { + "epoch": 1.66, + "learning_rate": 3.76227210218149e-05, + "loss": 2.598, + "step": 3305 + }, + { + "epoch": 1.66, + "learning_rate": 3.758867249498321e-05, + "loss": 2.8652, + "step": 3310 + }, + { + "epoch": 1.66, + "learning_rate": 3.7554592656864285e-05, + "loss": 2.5312, + "step": 3315 + }, + { + "epoch": 1.67, + "learning_rate": 3.752048159222349e-05, + "loss": 2.7432, + "step": 3320 + }, + { + "epoch": 1.67, + "learning_rate": 3.748633938590388e-05, + "loss": 2.64, + "step": 3325 + }, + { + "epoch": 1.67, + "learning_rate": 3.745216612282596e-05, + "loss": 2.751, + "step": 3330 + }, + { + "epoch": 1.67, + "learning_rate": 3.741796188798747e-05, + "loss": 2.7636, + "step": 3335 + }, + { + "epoch": 1.68, + "learning_rate": 3.738372676646321e-05, + "loss": 2.8103, + "step": 3340 + }, + { + "epoch": 1.68, + "learning_rate": 3.7349460843404796e-05, + "loss": 2.6672, + "step": 3345 + }, + { + "epoch": 1.68, + "learning_rate": 3.731516420404043e-05, + "loss": 2.5272, + "step": 3350 + }, + { + "epoch": 1.68, + "learning_rate": 3.728083693367474e-05, + "loss": 2.6674, + "step": 3355 + }, + { + "epoch": 1.69, + "learning_rate": 3.724647911768854e-05, + "loss": 2.708, + "step": 3360 + }, + { + "epoch": 1.69, + "learning_rate": 3.72120908415386e-05, + "loss": 2.3902, + "step": 3365 + }, + { + "epoch": 1.69, + "learning_rate": 3.7177672190757476e-05, + "loss": 2.8144, + "step": 3370 + }, + { + "epoch": 1.69, + "learning_rate": 3.714322325095325e-05, + "loss": 2.8525, + "step": 3375 + }, + { + "epoch": 1.7, + "learning_rate": 3.7108744107809364e-05, + "loss": 2.4671, + "step": 3380 + }, + { + "epoch": 1.7, + "learning_rate": 3.7074234847084363e-05, + "loss": 2.3723, + "step": 3385 + }, + { + "epoch": 1.7, + "learning_rate": 3.703969555461173e-05, + "loss": 2.8825, + "step": 3390 + }, + { + "epoch": 1.7, + "learning_rate": 3.700512631629961e-05, + "loss": 2.768, + "step": 3395 + }, + { + "epoch": 1.71, + 
"learning_rate": 3.6970527218130644e-05, + "loss": 2.4601, + "step": 3400 + }, + { + "epoch": 1.71, + "learning_rate": 3.693589834616176e-05, + "loss": 2.7482, + "step": 3405 + }, + { + "epoch": 1.71, + "learning_rate": 3.6901239786523914e-05, + "loss": 2.4536, + "step": 3410 + }, + { + "epoch": 1.71, + "learning_rate": 3.686655162542192e-05, + "loss": 2.571, + "step": 3415 + }, + { + "epoch": 1.72, + "learning_rate": 3.683183394913422e-05, + "loss": 2.5796, + "step": 3420 + }, + { + "epoch": 1.72, + "learning_rate": 3.6797086844012654e-05, + "loss": 2.7312, + "step": 3425 + }, + { + "epoch": 1.72, + "learning_rate": 3.676231039648227e-05, + "loss": 2.5584, + "step": 3430 + }, + { + "epoch": 1.72, + "learning_rate": 3.67275046930411e-05, + "loss": 2.6298, + "step": 3435 + }, + { + "epoch": 1.73, + "learning_rate": 3.669266982025993e-05, + "loss": 2.628, + "step": 3440 + }, + { + "epoch": 1.73, + "learning_rate": 3.6657805864782116e-05, + "loss": 2.5943, + "step": 3445 + }, + { + "epoch": 1.73, + "learning_rate": 3.662291291332333e-05, + "loss": 2.6121, + "step": 3450 + }, + { + "epoch": 1.73, + "learning_rate": 3.658799105267138e-05, + "loss": 2.6501, + "step": 3455 + }, + { + "epoch": 1.74, + "learning_rate": 3.655304036968597e-05, + "loss": 2.5708, + "step": 3460 + }, + { + "epoch": 1.74, + "learning_rate": 3.651806095129849e-05, + "loss": 2.8039, + "step": 3465 + }, + { + "epoch": 1.74, + "learning_rate": 3.648305288451183e-05, + "loss": 2.8159, + "step": 3470 + }, + { + "epoch": 1.74, + "learning_rate": 3.64480162564001e-05, + "loss": 2.6506, + "step": 3475 + }, + { + "epoch": 1.75, + "learning_rate": 3.641295115410847e-05, + "loss": 2.8982, + "step": 3480 + }, + { + "epoch": 1.75, + "learning_rate": 3.637785766485291e-05, + "loss": 2.8018, + "step": 3485 + }, + { + "epoch": 1.75, + "learning_rate": 3.634273587592003e-05, + "loss": 2.5278, + "step": 3490 + }, + { + "epoch": 1.75, + "learning_rate": 3.630758587466681e-05, + "loss": 2.8937, + "step": 3495 + }, + 
{ + "epoch": 1.76, + "learning_rate": 3.6272407748520394e-05, + "loss": 2.1922, + "step": 3500 + }, + { + "epoch": 1.76, + "learning_rate": 3.623720158497789e-05, + "loss": 2.4288, + "step": 3505 + }, + { + "epoch": 1.76, + "learning_rate": 3.620196747160613e-05, + "loss": 2.9423, + "step": 3510 + }, + { + "epoch": 1.76, + "learning_rate": 3.61667054960415e-05, + "loss": 2.5559, + "step": 3515 + }, + { + "epoch": 1.77, + "learning_rate": 3.613141574598965e-05, + "loss": 2.7228, + "step": 3520 + }, + { + "epoch": 1.77, + "learning_rate": 3.6096098309225325e-05, + "loss": 2.6933, + "step": 3525 + }, + { + "epoch": 1.77, + "learning_rate": 3.606075327359212e-05, + "loss": 2.5865, + "step": 3530 + }, + { + "epoch": 1.77, + "learning_rate": 3.602538072700229e-05, + "loss": 2.5846, + "step": 3535 + }, + { + "epoch": 1.78, + "learning_rate": 3.598998075743653e-05, + "loss": 2.8847, + "step": 3540 + }, + { + "epoch": 1.78, + "learning_rate": 3.595455345294372e-05, + "loss": 2.5386, + "step": 3545 + }, + { + "epoch": 1.78, + "learning_rate": 3.5919098901640735e-05, + "loss": 2.5932, + "step": 3550 + }, + { + "epoch": 1.78, + "learning_rate": 3.588361719171223e-05, + "loss": 2.4481, + "step": 3555 + }, + { + "epoch": 1.79, + "learning_rate": 3.584810841141039e-05, + "loss": 2.5773, + "step": 3560 + }, + { + "epoch": 1.79, + "learning_rate": 3.581257264905476e-05, + "loss": 2.6305, + "step": 3565 + }, + { + "epoch": 1.79, + "learning_rate": 3.5777009993031955e-05, + "loss": 2.7, + "step": 3570 + }, + { + "epoch": 1.79, + "learning_rate": 3.574142053179553e-05, + "loss": 2.4408, + "step": 3575 + }, + { + "epoch": 1.8, + "learning_rate": 3.5705804353865677e-05, + "loss": 2.7292, + "step": 3580 + }, + { + "epoch": 1.8, + "learning_rate": 3.567016154782906e-05, + "loss": 2.4371, + "step": 3585 + }, + { + "epoch": 1.8, + "learning_rate": 3.563449220233855e-05, + "loss": 2.7977, + "step": 3590 + }, + { + "epoch": 1.8, + "learning_rate": 3.559879640611305e-05, + "loss": 2.7357, + 
"step": 3595 + }, + { + "epoch": 1.81, + "learning_rate": 3.5563074247937244e-05, + "loss": 2.6255, + "step": 3600 + }, + { + "epoch": 1.81, + "learning_rate": 3.552732581666139e-05, + "loss": 2.8134, + "step": 3605 + }, + { + "epoch": 1.81, + "learning_rate": 3.549155120120109e-05, + "loss": 2.702, + "step": 3610 + }, + { + "epoch": 1.81, + "learning_rate": 3.545575049053707e-05, + "loss": 2.4854, + "step": 3615 + }, + { + "epoch": 1.82, + "learning_rate": 3.541992377371497e-05, + "loss": 2.7596, + "step": 3620 + }, + { + "epoch": 1.82, + "learning_rate": 3.53840711398451e-05, + "loss": 2.6891, + "step": 3625 + }, + { + "epoch": 1.82, + "learning_rate": 3.5348192678102254e-05, + "loss": 2.5619, + "step": 3630 + }, + { + "epoch": 1.82, + "learning_rate": 3.531228847772544e-05, + "loss": 2.5719, + "step": 3635 + }, + { + "epoch": 1.83, + "learning_rate": 3.52763586280177e-05, + "loss": 2.6297, + "step": 3640 + }, + { + "epoch": 1.83, + "learning_rate": 3.524040321834589e-05, + "loss": 2.5114, + "step": 3645 + }, + { + "epoch": 1.83, + "learning_rate": 3.52044223381404e-05, + "loss": 2.8536, + "step": 3650 + }, + { + "epoch": 1.83, + "learning_rate": 3.516841607689501e-05, + "loss": 2.5271, + "step": 3655 + }, + { + "epoch": 1.84, + "learning_rate": 3.51323845241666e-05, + "loss": 2.5654, + "step": 3660 + }, + { + "epoch": 1.84, + "learning_rate": 3.509632776957497e-05, + "loss": 2.6269, + "step": 3665 + }, + { + "epoch": 1.84, + "learning_rate": 3.50602459028026e-05, + "loss": 2.4401, + "step": 3670 + }, + { + "epoch": 1.84, + "learning_rate": 3.502413901359445e-05, + "loss": 2.4368, + "step": 3675 + }, + { + "epoch": 1.85, + "learning_rate": 3.498800719175768e-05, + "loss": 2.6087, + "step": 3680 + }, + { + "epoch": 1.85, + "learning_rate": 3.4951850527161495e-05, + "loss": 2.5927, + "step": 3685 + }, + { + "epoch": 1.85, + "learning_rate": 3.4915669109736876e-05, + "loss": 2.8352, + "step": 3690 + }, + { + "epoch": 1.85, + "learning_rate": 3.487946302947637e-05, + 
"loss": 2.8485, + "step": 3695 + }, + { + "epoch": 1.86, + "learning_rate": 3.484323237643389e-05, + "loss": 2.6611, + "step": 3700 + }, + { + "epoch": 1.86, + "learning_rate": 3.4806977240724423e-05, + "loss": 2.5251, + "step": 3705 + }, + { + "epoch": 1.86, + "learning_rate": 3.47706977125239e-05, + "loss": 2.6043, + "step": 3710 + }, + { + "epoch": 1.86, + "learning_rate": 3.473439388206887e-05, + "loss": 2.5911, + "step": 3715 + }, + { + "epoch": 1.87, + "learning_rate": 3.469806583965639e-05, + "loss": 2.7092, + "step": 3720 + }, + { + "epoch": 1.87, + "learning_rate": 3.466171367564368e-05, + "loss": 2.4572, + "step": 3725 + }, + { + "epoch": 1.87, + "learning_rate": 3.462533748044801e-05, + "loss": 2.464, + "step": 3730 + }, + { + "epoch": 1.87, + "learning_rate": 3.458893734454636e-05, + "loss": 2.8654, + "step": 3735 + }, + { + "epoch": 1.88, + "learning_rate": 3.455251335847531e-05, + "loss": 2.752, + "step": 3740 + }, + { + "epoch": 1.88, + "learning_rate": 3.451606561283074e-05, + "loss": 2.59, + "step": 3745 + }, + { + "epoch": 1.88, + "learning_rate": 3.447959419826763e-05, + "loss": 2.4732, + "step": 3750 + }, + { + "epoch": 1.88, + "learning_rate": 3.444309920549983e-05, + "loss": 2.7611, + "step": 3755 + }, + { + "epoch": 1.89, + "learning_rate": 3.440658072529983e-05, + "loss": 2.5923, + "step": 3760 + }, + { + "epoch": 1.89, + "learning_rate": 3.437003884849854e-05, + "loss": 2.5405, + "step": 3765 + }, + { + "epoch": 1.89, + "learning_rate": 3.433347366598506e-05, + "loss": 2.6368, + "step": 3770 + }, + { + "epoch": 1.89, + "learning_rate": 3.429688526870649e-05, + "loss": 2.5877, + "step": 3775 + }, + { + "epoch": 1.9, + "learning_rate": 3.4260273747667635e-05, + "loss": 2.5978, + "step": 3780 + }, + { + "epoch": 1.9, + "learning_rate": 3.422363919393082e-05, + "loss": 2.7375, + "step": 3785 + }, + { + "epoch": 1.9, + "learning_rate": 3.418698169861567e-05, + "loss": 2.4161, + "step": 3790 + }, + { + "epoch": 1.9, + "learning_rate": 
3.415030135289884e-05, + "loss": 2.3959, + "step": 3795 + }, + { + "epoch": 1.91, + "learning_rate": 3.4113598248013875e-05, + "loss": 2.9642, + "step": 3800 + }, + { + "epoch": 1.91, + "learning_rate": 3.40768724752509e-05, + "loss": 2.7313, + "step": 3805 + }, + { + "epoch": 1.91, + "learning_rate": 3.404012412595639e-05, + "loss": 2.6311, + "step": 3810 + }, + { + "epoch": 1.91, + "learning_rate": 3.400335329153304e-05, + "loss": 2.6268, + "step": 3815 + }, + { + "epoch": 1.92, + "learning_rate": 3.396656006343939e-05, + "loss": 2.5252, + "step": 3820 + }, + { + "epoch": 1.92, + "learning_rate": 3.392974453318975e-05, + "loss": 2.5863, + "step": 3825 + }, + { + "epoch": 1.92, + "learning_rate": 3.3892906792353886e-05, + "loss": 2.4848, + "step": 3830 + }, + { + "epoch": 1.92, + "learning_rate": 3.3856046932556775e-05, + "loss": 2.5517, + "step": 3835 + }, + { + "epoch": 1.93, + "learning_rate": 3.3819165045478426e-05, + "loss": 2.6679, + "step": 3840 + }, + { + "epoch": 1.93, + "learning_rate": 3.3782261222853654e-05, + "loss": 2.4474, + "step": 3845 + }, + { + "epoch": 1.93, + "learning_rate": 3.37453355564718e-05, + "loss": 2.5378, + "step": 3850 + }, + { + "epoch": 1.93, + "learning_rate": 3.3708388138176584e-05, + "loss": 2.8168, + "step": 3855 + }, + { + "epoch": 1.94, + "learning_rate": 3.367141905986578e-05, + "loss": 2.6556, + "step": 3860 + }, + { + "epoch": 1.94, + "learning_rate": 3.3634428413491054e-05, + "loss": 2.5122, + "step": 3865 + }, + { + "epoch": 1.94, + "learning_rate": 3.359741629105773e-05, + "loss": 2.7044, + "step": 3870 + }, + { + "epoch": 1.94, + "learning_rate": 3.3560382784624524e-05, + "loss": 2.6193, + "step": 3875 + }, + { + "epoch": 1.95, + "learning_rate": 3.352332798630336e-05, + "loss": 3.0051, + "step": 3880 + }, + { + "epoch": 1.95, + "learning_rate": 3.3486251988259134e-05, + "loss": 2.5351, + "step": 3885 + }, + { + "epoch": 1.95, + "learning_rate": 3.344915488270942e-05, + "loss": 2.7273, + "step": 3890 + }, + { + 
"epoch": 1.95, + "learning_rate": 3.341203676192433e-05, + "loss": 2.3833, + "step": 3895 + }, + { + "epoch": 1.96, + "learning_rate": 3.3374897718226236e-05, + "loss": 2.5073, + "step": 3900 + }, + { + "epoch": 1.96, + "learning_rate": 3.333773784398957e-05, + "loss": 2.7216, + "step": 3905 + }, + { + "epoch": 1.96, + "learning_rate": 3.330055723164055e-05, + "loss": 2.6666, + "step": 3910 + }, + { + "epoch": 1.96, + "learning_rate": 3.326335597365698e-05, + "loss": 2.4959, + "step": 3915 + }, + { + "epoch": 1.97, + "learning_rate": 3.322613416256802e-05, + "loss": 2.8799, + "step": 3920 + }, + { + "epoch": 1.97, + "learning_rate": 3.3188891890953956e-05, + "loss": 2.6041, + "step": 3925 + }, + { + "epoch": 1.97, + "learning_rate": 3.315162925144595e-05, + "loss": 2.7423, + "step": 3930 + }, + { + "epoch": 1.97, + "learning_rate": 3.3114346336725834e-05, + "loss": 2.5765, + "step": 3935 + }, + { + "epoch": 1.98, + "learning_rate": 3.3077043239525874e-05, + "loss": 2.152, + "step": 3940 + }, + { + "epoch": 1.98, + "learning_rate": 3.303972005262852e-05, + "loss": 2.6474, + "step": 3945 + }, + { + "epoch": 1.98, + "learning_rate": 3.3002376868866206e-05, + "loss": 2.473, + "step": 3950 + }, + { + "epoch": 1.98, + "learning_rate": 3.296501378112109e-05, + "loss": 2.6779, + "step": 3955 + }, + { + "epoch": 1.99, + "learning_rate": 3.292763088232485e-05, + "loss": 2.7439, + "step": 3960 + }, + { + "epoch": 1.99, + "learning_rate": 3.289022826545844e-05, + "loss": 2.8874, + "step": 3965 + }, + { + "epoch": 1.99, + "learning_rate": 3.2852806023551834e-05, + "loss": 2.7191, + "step": 3970 + }, + { + "epoch": 1.99, + "learning_rate": 3.281536424968383e-05, + "loss": 2.563, + "step": 3975 + }, + { + "epoch": 2.0, + "learning_rate": 3.2777903036981836e-05, + "loss": 2.7884, + "step": 3980 + }, + { + "epoch": 2.0, + "learning_rate": 3.274042247862158e-05, + "loss": 2.5221, + "step": 3985 + }, + { + "epoch": 2.0, + "learning_rate": 3.270292266782689e-05, + "loss": 2.462, + 
"step": 3990 + }, + { + "epoch": 2.0, + "learning_rate": 3.266540369786953e-05, + "loss": 2.5059, + "step": 3995 + }, + { + "epoch": 2.01, + "learning_rate": 3.262786566206888e-05, + "loss": 2.688, + "step": 4000 + }, + { + "epoch": 2.01, + "learning_rate": 3.259030865379174e-05, + "loss": 2.7194, + "step": 4005 + }, + { + "epoch": 2.01, + "learning_rate": 3.255273276645214e-05, + "loss": 2.4145, + "step": 4010 + }, + { + "epoch": 2.01, + "learning_rate": 3.251513809351101e-05, + "loss": 2.5469, + "step": 4015 + }, + { + "epoch": 2.02, + "learning_rate": 3.247752472847605e-05, + "loss": 2.7916, + "step": 4020 + }, + { + "epoch": 2.02, + "learning_rate": 3.243989276490143e-05, + "loss": 2.5973, + "step": 4025 + }, + { + "epoch": 2.02, + "learning_rate": 3.240224229638758e-05, + "loss": 2.6732, + "step": 4030 + }, + { + "epoch": 2.02, + "learning_rate": 3.236457341658097e-05, + "loss": 2.2159, + "step": 4035 + }, + { + "epoch": 2.03, + "learning_rate": 3.232688621917386e-05, + "loss": 2.6783, + "step": 4040 + }, + { + "epoch": 2.03, + "learning_rate": 3.2289180797904055e-05, + "loss": 2.8193, + "step": 4045 + }, + { + "epoch": 2.03, + "learning_rate": 3.22514572465547e-05, + "loss": 2.3885, + "step": 4050 + }, + { + "epoch": 2.03, + "learning_rate": 3.221371565895404e-05, + "loss": 2.6288, + "step": 4055 + }, + { + "epoch": 2.04, + "learning_rate": 3.217595612897516e-05, + "loss": 2.5372, + "step": 4060 + }, + { + "epoch": 2.04, + "learning_rate": 3.21381787505358e-05, + "loss": 2.7257, + "step": 4065 + }, + { + "epoch": 2.04, + "learning_rate": 3.210038361759807e-05, + "loss": 2.6329, + "step": 4070 + }, + { + "epoch": 2.04, + "learning_rate": 3.206257082416825e-05, + "loss": 2.6518, + "step": 4075 + }, + { + "epoch": 2.05, + "learning_rate": 3.2024740464296544e-05, + "loss": 2.4218, + "step": 4080 + }, + { + "epoch": 2.05, + "learning_rate": 3.198689263207686e-05, + "loss": 2.7266, + "step": 4085 + }, + { + "epoch": 2.05, + "learning_rate": 3.1949027421646546e-05, 
+ "loss": 2.4466, + "step": 4090 + }, + { + "epoch": 2.05, + "learning_rate": 3.1911144927186185e-05, + "loss": 2.6014, + "step": 4095 + }, + { + "epoch": 2.06, + "learning_rate": 3.1873245242919354e-05, + "loss": 2.7072, + "step": 4100 + }, + { + "epoch": 2.06, + "learning_rate": 3.183532846311236e-05, + "loss": 2.5211, + "step": 4105 + }, + { + "epoch": 2.06, + "learning_rate": 3.179739468207406e-05, + "loss": 2.5787, + "step": 4110 + }, + { + "epoch": 2.06, + "learning_rate": 3.175944399415559e-05, + "loss": 2.5141, + "step": 4115 + }, + { + "epoch": 2.07, + "learning_rate": 3.1721476493750134e-05, + "loss": 2.6807, + "step": 4120 + }, + { + "epoch": 2.07, + "learning_rate": 3.1683492275292695e-05, + "loss": 2.5611, + "step": 4125 + }, + { + "epoch": 2.07, + "learning_rate": 3.164549143325985e-05, + "loss": 2.5864, + "step": 4130 + }, + { + "epoch": 2.08, + "learning_rate": 3.160747406216953e-05, + "loss": 2.7378, + "step": 4135 + }, + { + "epoch": 2.08, + "learning_rate": 3.156944025658079e-05, + "loss": 2.7334, + "step": 4140 + }, + { + "epoch": 2.08, + "learning_rate": 3.153139011109354e-05, + "loss": 2.4465, + "step": 4145 + }, + { + "epoch": 2.08, + "learning_rate": 3.149332372034834e-05, + "loss": 2.5482, + "step": 4150 + }, + { + "epoch": 2.09, + "learning_rate": 3.145524117902617e-05, + "loss": 2.7633, + "step": 4155 + }, + { + "epoch": 2.09, + "learning_rate": 3.141714258184816e-05, + "loss": 2.6796, + "step": 4160 + }, + { + "epoch": 2.09, + "learning_rate": 3.137902802357538e-05, + "loss": 2.6622, + "step": 4165 + }, + { + "epoch": 2.09, + "learning_rate": 3.134089759900861e-05, + "loss": 2.9277, + "step": 4170 + }, + { + "epoch": 2.1, + "learning_rate": 3.130275140298808e-05, + "loss": 2.351, + "step": 4175 + }, + { + "epoch": 2.1, + "learning_rate": 3.1264589530393263e-05, + "loss": 2.654, + "step": 4180 + }, + { + "epoch": 2.1, + "learning_rate": 3.12264120761426e-05, + "loss": 2.6229, + "step": 4185 + }, + { + "epoch": 2.1, + "learning_rate": 
3.118821913519333e-05, + "loss": 2.6415, + "step": 4190 + }, + { + "epoch": 2.11, + "learning_rate": 3.115001080254115e-05, + "loss": 2.6029, + "step": 4195 + }, + { + "epoch": 2.11, + "learning_rate": 3.1111787173220095e-05, + "loss": 2.6775, + "step": 4200 + }, + { + "epoch": 2.11, + "learning_rate": 3.107354834230223e-05, + "loss": 2.7393, + "step": 4205 + }, + { + "epoch": 2.11, + "learning_rate": 3.1035294404897396e-05, + "loss": 2.5886, + "step": 4210 + }, + { + "epoch": 2.12, + "learning_rate": 3.099702545615307e-05, + "loss": 2.3812, + "step": 4215 + }, + { + "epoch": 2.12, + "learning_rate": 3.0958741591254026e-05, + "loss": 3.1271, + "step": 4220 + }, + { + "epoch": 2.12, + "learning_rate": 3.0920442905422145e-05, + "loss": 2.7758, + "step": 4225 + }, + { + "epoch": 2.12, + "learning_rate": 3.0882129493916167e-05, + "loss": 2.7682, + "step": 4230 + }, + { + "epoch": 2.13, + "learning_rate": 3.0843801452031466e-05, + "loss": 2.2826, + "step": 4235 + }, + { + "epoch": 2.13, + "learning_rate": 3.0805458875099804e-05, + "loss": 2.5934, + "step": 4240 + }, + { + "epoch": 2.13, + "learning_rate": 3.0767101858489103e-05, + "loss": 2.7693, + "step": 4245 + }, + { + "epoch": 2.13, + "learning_rate": 3.072873049760319e-05, + "loss": 2.6955, + "step": 4250 + }, + { + "epoch": 2.14, + "learning_rate": 3.0690344887881565e-05, + "loss": 2.7344, + "step": 4255 + }, + { + "epoch": 2.14, + "learning_rate": 3.0651945124799185e-05, + "loss": 2.6215, + "step": 4260 + }, + { + "epoch": 2.14, + "learning_rate": 3.061353130386619e-05, + "loss": 2.5794, + "step": 4265 + }, + { + "epoch": 2.14, + "learning_rate": 3.05751035206277e-05, + "loss": 2.671, + "step": 4270 + }, + { + "epoch": 2.15, + "learning_rate": 3.0536661870663576e-05, + "loss": 2.7265, + "step": 4275 + }, + { + "epoch": 2.15, + "learning_rate": 3.0498206449588136e-05, + "loss": 2.9018, + "step": 4280 + }, + { + "epoch": 2.15, + "learning_rate": 3.045973735304996e-05, + "loss": 2.4823, + "step": 4285 + }, + { + 
"epoch": 2.15, + "learning_rate": 3.0421254676731664e-05, + "loss": 2.5454, + "step": 4290 + }, + { + "epoch": 2.16, + "learning_rate": 3.0382758516349623e-05, + "loss": 2.7888, + "step": 4295 + }, + { + "epoch": 2.16, + "learning_rate": 3.0344248967653754e-05, + "loss": 2.596, + "step": 4300 + }, + { + "epoch": 2.16, + "learning_rate": 3.030572612642727e-05, + "loss": 2.7905, + "step": 4305 + }, + { + "epoch": 2.16, + "learning_rate": 3.0267190088486452e-05, + "loss": 2.5676, + "step": 4310 + }, + { + "epoch": 2.17, + "learning_rate": 3.0228640949680388e-05, + "loss": 2.5836, + "step": 4315 + }, + { + "epoch": 2.17, + "learning_rate": 3.0190078805890786e-05, + "loss": 2.7138, + "step": 4320 + }, + { + "epoch": 2.17, + "learning_rate": 3.015150375303168e-05, + "loss": 2.4747, + "step": 4325 + }, + { + "epoch": 2.17, + "learning_rate": 3.011291588704919e-05, + "loss": 2.5188, + "step": 4330 + }, + { + "epoch": 2.18, + "learning_rate": 3.0074315303921353e-05, + "loss": 2.2952, + "step": 4335 + }, + { + "epoch": 2.18, + "learning_rate": 3.0035702099657787e-05, + "loss": 2.4875, + "step": 4340 + }, + { + "epoch": 2.18, + "learning_rate": 2.9997076370299544e-05, + "loss": 2.5085, + "step": 4345 + }, + { + "epoch": 2.18, + "learning_rate": 2.9958438211918805e-05, + "loss": 2.7452, + "step": 4350 + }, + { + "epoch": 2.19, + "learning_rate": 2.9919787720618676e-05, + "loss": 2.334, + "step": 4355 + }, + { + "epoch": 2.19, + "learning_rate": 2.9881124992532933e-05, + "loss": 2.6904, + "step": 4360 + }, + { + "epoch": 2.19, + "learning_rate": 2.984245012382579e-05, + "loss": 2.5706, + "step": 4365 + }, + { + "epoch": 2.19, + "learning_rate": 2.980376321069165e-05, + "loss": 2.6657, + "step": 4370 + }, + { + "epoch": 2.2, + "learning_rate": 2.976506434935489e-05, + "loss": 2.4117, + "step": 4375 + }, + { + "epoch": 2.2, + "learning_rate": 2.9726353636069582e-05, + "loss": 2.4446, + "step": 4380 + }, + { + "epoch": 2.2, + "learning_rate": 2.968763116711931e-05, + "loss": 
2.5681, + "step": 4385 + }, + { + "epoch": 2.2, + "learning_rate": 2.964889703881686e-05, + "loss": 2.7244, + "step": 4390 + }, + { + "epoch": 2.21, + "learning_rate": 2.9610151347504044e-05, + "loss": 2.577, + "step": 4395 + }, + { + "epoch": 2.21, + "learning_rate": 2.957139418955143e-05, + "loss": 2.6176, + "step": 4400 + }, + { + "epoch": 2.21, + "learning_rate": 2.9532625661358105e-05, + "loss": 2.5708, + "step": 4405 + }, + { + "epoch": 2.21, + "learning_rate": 2.949384585935143e-05, + "loss": 2.4873, + "step": 4410 + }, + { + "epoch": 2.22, + "learning_rate": 2.9455054879986797e-05, + "loss": 2.5953, + "step": 4415 + }, + { + "epoch": 2.22, + "learning_rate": 2.941625281974743e-05, + "loss": 2.5578, + "step": 4420 + }, + { + "epoch": 2.22, + "learning_rate": 2.93774397751441e-05, + "loss": 2.5782, + "step": 4425 + }, + { + "epoch": 2.22, + "learning_rate": 2.9338615842714883e-05, + "loss": 2.8227, + "step": 4430 + }, + { + "epoch": 2.23, + "learning_rate": 2.9299781119024956e-05, + "loss": 2.4317, + "step": 4435 + }, + { + "epoch": 2.23, + "learning_rate": 2.926093570066633e-05, + "loss": 2.7054, + "step": 4440 + }, + { + "epoch": 2.23, + "learning_rate": 2.9222079684257608e-05, + "loss": 2.7207, + "step": 4445 + }, + { + "epoch": 2.23, + "learning_rate": 2.9183213166443778e-05, + "loss": 2.6732, + "step": 4450 + }, + { + "epoch": 2.24, + "learning_rate": 2.9144336243895927e-05, + "loss": 2.4768, + "step": 4455 + }, + { + "epoch": 2.24, + "learning_rate": 2.910544901331101e-05, + "loss": 2.5146, + "step": 4460 + }, + { + "epoch": 2.24, + "learning_rate": 2.9066551571411645e-05, + "loss": 2.4941, + "step": 4465 + }, + { + "epoch": 2.24, + "learning_rate": 2.902764401494584e-05, + "loss": 2.1425, + "step": 4470 + }, + { + "epoch": 2.25, + "learning_rate": 2.898872644068676e-05, + "loss": 2.6335, + "step": 4475 + }, + { + "epoch": 2.25, + "learning_rate": 2.8949798945432483e-05, + "loss": 2.4093, + "step": 4480 + }, + { + "epoch": 2.25, + "learning_rate": 
2.8910861626005776e-05, + "loss": 2.4006, + "step": 4485 + }, + { + "epoch": 2.25, + "learning_rate": 2.8871914579253824e-05, + "loss": 2.4762, + "step": 4490 + }, + { + "epoch": 2.26, + "learning_rate": 2.8832957902048014e-05, + "loss": 2.6309, + "step": 4495 + }, + { + "epoch": 2.26, + "learning_rate": 2.8793991691283684e-05, + "loss": 2.7734, + "step": 4500 + }, + { + "epoch": 2.26, + "learning_rate": 2.87550160438799e-05, + "loss": 2.5444, + "step": 4505 + }, + { + "epoch": 2.26, + "learning_rate": 2.871603105677917e-05, + "loss": 2.5277, + "step": 4510 + }, + { + "epoch": 2.27, + "learning_rate": 2.8677036826947264e-05, + "loss": 2.5775, + "step": 4515 + }, + { + "epoch": 2.27, + "learning_rate": 2.863803345137291e-05, + "loss": 2.4524, + "step": 4520 + }, + { + "epoch": 2.27, + "learning_rate": 2.8599021027067608e-05, + "loss": 2.572, + "step": 4525 + }, + { + "epoch": 2.27, + "learning_rate": 2.855999965106536e-05, + "loss": 2.8112, + "step": 4530 + }, + { + "epoch": 2.28, + "learning_rate": 2.8520969420422427e-05, + "loss": 2.65, + "step": 4535 + }, + { + "epoch": 2.28, + "learning_rate": 2.8481930432217096e-05, + "loss": 2.4675, + "step": 4540 + }, + { + "epoch": 2.28, + "learning_rate": 2.844288278354943e-05, + "loss": 2.668, + "step": 4545 + }, + { + "epoch": 2.28, + "learning_rate": 2.8403826571541046e-05, + "loss": 2.7135, + "step": 4550 + }, + { + "epoch": 2.29, + "learning_rate": 2.8364761893334858e-05, + "loss": 2.5212, + "step": 4555 + }, + { + "epoch": 2.29, + "learning_rate": 2.8325688846094817e-05, + "loss": 2.698, + "step": 4560 + }, + { + "epoch": 2.29, + "learning_rate": 2.828660752700572e-05, + "loss": 2.6517, + "step": 4565 + }, + { + "epoch": 2.29, + "learning_rate": 2.8247518033272924e-05, + "loss": 2.6687, + "step": 4570 + }, + { + "epoch": 2.3, + "learning_rate": 2.8208420462122105e-05, + "loss": 2.4934, + "step": 4575 + }, + { + "epoch": 2.3, + "learning_rate": 2.816931491079906e-05, + "loss": 2.3762, + "step": 4580 + }, + { + "epoch": 
2.3, + "learning_rate": 2.8130201476569413e-05, + "loss": 2.6873, + "step": 4585 + }, + { + "epoch": 2.3, + "learning_rate": 2.8091080256718398e-05, + "loss": 2.6981, + "step": 4590 + }, + { + "epoch": 2.31, + "learning_rate": 2.805195134855061e-05, + "loss": 2.6814, + "step": 4595 + }, + { + "epoch": 2.31, + "learning_rate": 2.8012814849389785e-05, + "loss": 2.6641, + "step": 4600 + }, + { + "epoch": 2.31, + "learning_rate": 2.797367085657852e-05, + "loss": 2.488, + "step": 4605 + }, + { + "epoch": 2.31, + "learning_rate": 2.793451946747806e-05, + "loss": 2.4358, + "step": 4610 + }, + { + "epoch": 2.32, + "learning_rate": 2.7895360779468044e-05, + "loss": 2.7458, + "step": 4615 + }, + { + "epoch": 2.32, + "learning_rate": 2.785619488994627e-05, + "loss": 2.7558, + "step": 4620 + }, + { + "epoch": 2.32, + "learning_rate": 2.7817021896328427e-05, + "loss": 2.6906, + "step": 4625 + }, + { + "epoch": 2.32, + "learning_rate": 2.7777841896047914e-05, + "loss": 2.5384, + "step": 4630 + }, + { + "epoch": 2.33, + "learning_rate": 2.7738654986555523e-05, + "loss": 2.8354, + "step": 4635 + }, + { + "epoch": 2.33, + "learning_rate": 2.7699461265319242e-05, + "loss": 2.6352, + "step": 4640 + }, + { + "epoch": 2.33, + "learning_rate": 2.7660260829824003e-05, + "loss": 2.5602, + "step": 4645 + }, + { + "epoch": 2.33, + "learning_rate": 2.7621053777571425e-05, + "loss": 2.7109, + "step": 4650 + }, + { + "epoch": 2.34, + "learning_rate": 2.7581840206079616e-05, + "loss": 2.7317, + "step": 4655 + }, + { + "epoch": 2.34, + "learning_rate": 2.754262021288287e-05, + "loss": 2.6874, + "step": 4660 + }, + { + "epoch": 2.34, + "learning_rate": 2.750339389553146e-05, + "loss": 2.5854, + "step": 4665 + }, + { + "epoch": 2.34, + "learning_rate": 2.7464161351591393e-05, + "loss": 2.4774, + "step": 4670 + }, + { + "epoch": 2.35, + "learning_rate": 2.7424922678644172e-05, + "loss": 2.3301, + "step": 4675 + }, + { + "epoch": 2.35, + "learning_rate": 2.7385677974286517e-05, + "loss": 2.8886, + 
"step": 4680 + }, + { + "epoch": 2.35, + "learning_rate": 2.7346427336130175e-05, + "loss": 2.4401, + "step": 4685 + }, + { + "epoch": 2.35, + "learning_rate": 2.7307170861801644e-05, + "loss": 2.4788, + "step": 4690 + }, + { + "epoch": 2.36, + "learning_rate": 2.7267908648941938e-05, + "loss": 2.4079, + "step": 4695 + }, + { + "epoch": 2.36, + "learning_rate": 2.7228640795206344e-05, + "loss": 2.3763, + "step": 4700 + }, + { + "epoch": 2.36, + "learning_rate": 2.718936739826417e-05, + "loss": 2.4751, + "step": 4705 + }, + { + "epoch": 2.36, + "learning_rate": 2.7150088555798537e-05, + "loss": 2.5827, + "step": 4710 + }, + { + "epoch": 2.37, + "learning_rate": 2.7110804365506083e-05, + "loss": 2.8181, + "step": 4715 + }, + { + "epoch": 2.37, + "learning_rate": 2.7071514925096762e-05, + "loss": 2.5951, + "step": 4720 + }, + { + "epoch": 2.37, + "learning_rate": 2.703222033229359e-05, + "loss": 2.8812, + "step": 4725 + }, + { + "epoch": 2.37, + "learning_rate": 2.6992920684832374e-05, + "loss": 2.8793, + "step": 4730 + }, + { + "epoch": 2.38, + "learning_rate": 2.6953616080461526e-05, + "loss": 2.7234, + "step": 4735 + }, + { + "epoch": 2.38, + "learning_rate": 2.6914306616941764e-05, + "loss": 2.566, + "step": 4740 + }, + { + "epoch": 2.38, + "learning_rate": 2.68749923920459e-05, + "loss": 2.7924, + "step": 4745 + }, + { + "epoch": 2.38, + "learning_rate": 2.683567350355859e-05, + "loss": 2.5467, + "step": 4750 + }, + { + "epoch": 2.39, + "learning_rate": 2.679635004927608e-05, + "loss": 2.8032, + "step": 4755 + }, + { + "epoch": 2.39, + "learning_rate": 2.6757022127006e-05, + "loss": 2.4976, + "step": 4760 + }, + { + "epoch": 2.39, + "learning_rate": 2.6717689834567055e-05, + "loss": 2.7968, + "step": 4765 + }, + { + "epoch": 2.39, + "learning_rate": 2.6678353269788854e-05, + "loss": 2.4099, + "step": 4770 + }, + { + "epoch": 2.4, + "learning_rate": 2.66390125305116e-05, + "loss": 2.4246, + "step": 4775 + }, + { + "epoch": 2.4, + "learning_rate": 
2.659966771458589e-05, + "loss": 2.7296, + "step": 4780 + }, + { + "epoch": 2.4, + "learning_rate": 2.656031891987249e-05, + "loss": 2.5013, + "step": 4785 + }, + { + "epoch": 2.4, + "learning_rate": 2.6520966244242024e-05, + "loss": 2.4029, + "step": 4790 + }, + { + "epoch": 2.41, + "learning_rate": 2.648160978557479e-05, + "loss": 2.405, + "step": 4795 + }, + { + "epoch": 2.41, + "learning_rate": 2.644224964176048e-05, + "loss": 2.5121, + "step": 4800 + }, + { + "epoch": 2.41, + "learning_rate": 2.6402885910697966e-05, + "loss": 2.7865, + "step": 4805 + }, + { + "epoch": 2.41, + "learning_rate": 2.6363518690295035e-05, + "loss": 2.3725, + "step": 4810 + }, + { + "epoch": 2.42, + "learning_rate": 2.632414807846816e-05, + "loss": 2.612, + "step": 4815 + }, + { + "epoch": 2.42, + "learning_rate": 2.6284774173142233e-05, + "loss": 2.5412, + "step": 4820 + }, + { + "epoch": 2.42, + "learning_rate": 2.624539707225036e-05, + "loss": 2.4828, + "step": 4825 + }, + { + "epoch": 2.42, + "learning_rate": 2.6206016873733574e-05, + "loss": 2.6912, + "step": 4830 + }, + { + "epoch": 2.43, + "learning_rate": 2.6166633675540635e-05, + "loss": 2.9305, + "step": 4835 + }, + { + "epoch": 2.43, + "learning_rate": 2.612724757562775e-05, + "loss": 2.4224, + "step": 4840 + }, + { + "epoch": 2.43, + "learning_rate": 2.608785867195834e-05, + "loss": 2.448, + "step": 4845 + }, + { + "epoch": 2.43, + "learning_rate": 2.60484670625028e-05, + "loss": 2.356, + "step": 4850 + }, + { + "epoch": 2.44, + "learning_rate": 2.600907284523827e-05, + "loss": 2.3987, + "step": 4855 + }, + { + "epoch": 2.44, + "learning_rate": 2.5969676118148358e-05, + "loss": 1.9985, + "step": 4860 + }, + { + "epoch": 2.44, + "learning_rate": 2.5930276979222927e-05, + "loss": 2.7017, + "step": 4865 + }, + { + "epoch": 2.44, + "learning_rate": 2.5890875526457836e-05, + "loss": 2.5223, + "step": 4870 + }, + { + "epoch": 2.45, + "learning_rate": 2.5851471857854697e-05, + "loss": 2.5558, + "step": 4875 + }, + { + "epoch": 
2.45, + "learning_rate": 2.5812066071420632e-05, + "loss": 2.3048, + "step": 4880 + }, + { + "epoch": 2.45, + "learning_rate": 2.5772658265168025e-05, + "loss": 2.6089, + "step": 4885 + }, + { + "epoch": 2.45, + "learning_rate": 2.5733248537114306e-05, + "loss": 2.6711, + "step": 4890 + }, + { + "epoch": 2.46, + "learning_rate": 2.569383698528166e-05, + "loss": 2.6202, + "step": 4895 + }, + { + "epoch": 2.46, + "learning_rate": 2.5654423707696833e-05, + "loss": 2.6619, + "step": 4900 + }, + { + "epoch": 2.46, + "learning_rate": 2.5615008802390834e-05, + "loss": 2.6442, + "step": 4905 + }, + { + "epoch": 2.46, + "learning_rate": 2.5575592367398733e-05, + "loss": 2.7031, + "step": 4910 + }, + { + "epoch": 2.47, + "learning_rate": 2.5536174500759417e-05, + "loss": 2.5043, + "step": 4915 + }, + { + "epoch": 2.47, + "learning_rate": 2.5496755300515323e-05, + "loss": 2.3442, + "step": 4920 + }, + { + "epoch": 2.47, + "learning_rate": 2.5457334864712206e-05, + "loss": 2.7005, + "step": 4925 + }, + { + "epoch": 2.47, + "learning_rate": 2.5417913291398892e-05, + "loss": 2.8544, + "step": 4930 + }, + { + "epoch": 2.48, + "learning_rate": 2.5378490678627043e-05, + "loss": 2.3004, + "step": 4935 + }, + { + "epoch": 2.48, + "learning_rate": 2.5339067124450887e-05, + "loss": 2.527, + "step": 4940 + }, + { + "epoch": 2.48, + "learning_rate": 2.5299642726927035e-05, + "loss": 2.689, + "step": 4945 + }, + { + "epoch": 2.48, + "learning_rate": 2.526021758411415e-05, + "loss": 2.7824, + "step": 4950 + }, + { + "epoch": 2.49, + "learning_rate": 2.5220791794072774e-05, + "loss": 2.5121, + "step": 4955 + }, + { + "epoch": 2.49, + "learning_rate": 2.518136545486504e-05, + "loss": 2.3836, + "step": 4960 + }, + { + "epoch": 2.49, + "learning_rate": 2.5141938664554482e-05, + "loss": 2.6548, + "step": 4965 + }, + { + "epoch": 2.49, + "learning_rate": 2.5102511521205718e-05, + "loss": 2.5556, + "step": 4970 + }, + { + "epoch": 2.5, + "learning_rate": 2.5063084122884267e-05, + "loss": 2.4746, 
+ "step": 4975 + }, + { + "epoch": 2.5, + "learning_rate": 2.5023656567656272e-05, + "loss": 2.5669, + "step": 4980 + }, + { + "epoch": 2.5, + "learning_rate": 2.498422895358827e-05, + "loss": 2.4125, + "step": 4985 + }, + { + "epoch": 2.5, + "learning_rate": 2.4944801378746935e-05, + "loss": 2.6531, + "step": 4990 + }, + { + "epoch": 2.51, + "learning_rate": 2.4905373941198864e-05, + "loss": 2.7697, + "step": 4995 + }, + { + "epoch": 2.51, + "learning_rate": 2.48659467390103e-05, + "loss": 2.5167, + "step": 5000 + }, + { + "epoch": 2.51, + "learning_rate": 2.4826519870246897e-05, + "loss": 2.6029, + "step": 5005 + }, + { + "epoch": 2.51, + "learning_rate": 2.478709343297349e-05, + "loss": 2.3878, + "step": 5010 + }, + { + "epoch": 2.52, + "learning_rate": 2.4747667525253825e-05, + "loss": 2.6694, + "step": 5015 + }, + { + "epoch": 2.52, + "learning_rate": 2.470824224515034e-05, + "loss": 2.606, + "step": 5020 + }, + { + "epoch": 2.52, + "learning_rate": 2.466881769072392e-05, + "loss": 2.6548, + "step": 5025 + }, + { + "epoch": 2.52, + "learning_rate": 2.4629393960033633e-05, + "loss": 2.2612, + "step": 5030 + }, + { + "epoch": 2.53, + "learning_rate": 2.4589971151136503e-05, + "loss": 2.6507, + "step": 5035 + }, + { + "epoch": 2.53, + "learning_rate": 2.4550549362087258e-05, + "loss": 2.4926, + "step": 5040 + }, + { + "epoch": 2.53, + "learning_rate": 2.45111286909381e-05, + "loss": 2.4079, + "step": 5045 + }, + { + "epoch": 2.53, + "learning_rate": 2.447170923573844e-05, + "loss": 2.5999, + "step": 5050 + }, + { + "epoch": 2.54, + "learning_rate": 2.443229109453467e-05, + "loss": 2.4915, + "step": 5055 + }, + { + "epoch": 2.54, + "learning_rate": 2.43928743653699e-05, + "loss": 2.4626, + "step": 5060 + }, + { + "epoch": 2.54, + "learning_rate": 2.4353459146283743e-05, + "loss": 2.405, + "step": 5065 + }, + { + "epoch": 2.54, + "learning_rate": 2.431404553531206e-05, + "loss": 2.857, + "step": 5070 + }, + { + "epoch": 2.55, + "learning_rate": 
2.42746336304867e-05, + "loss": 2.8089, + "step": 5075 + }, + { + "epoch": 2.55, + "learning_rate": 2.423522352983527e-05, + "loss": 2.5966, + "step": 5080 + }, + { + "epoch": 2.55, + "learning_rate": 2.41958153313809e-05, + "loss": 2.4951, + "step": 5085 + }, + { + "epoch": 2.55, + "learning_rate": 2.4156409133141967e-05, + "loss": 2.519, + "step": 5090 + }, + { + "epoch": 2.56, + "learning_rate": 2.4117005033131894e-05, + "loss": 2.4516, + "step": 5095 + }, + { + "epoch": 2.56, + "learning_rate": 2.40776031293589e-05, + "loss": 2.4225, + "step": 5100 + }, + { + "epoch": 2.56, + "learning_rate": 2.4038203519825676e-05, + "loss": 2.47, + "step": 5105 + }, + { + "epoch": 2.56, + "learning_rate": 2.399880630252928e-05, + "loss": 2.4996, + "step": 5110 + }, + { + "epoch": 2.57, + "learning_rate": 2.3959411575460777e-05, + "loss": 2.4737, + "step": 5115 + }, + { + "epoch": 2.57, + "learning_rate": 2.392001943660506e-05, + "loss": 2.636, + "step": 5120 + }, + { + "epoch": 2.57, + "learning_rate": 2.3880629983940572e-05, + "loss": 2.5773, + "step": 5125 + }, + { + "epoch": 2.57, + "learning_rate": 2.3841243315439077e-05, + "loss": 2.7459, + "step": 5130 + }, + { + "epoch": 2.58, + "learning_rate": 2.3801859529065416e-05, + "loss": 2.1073, + "step": 5135 + }, + { + "epoch": 2.58, + "learning_rate": 2.3762478722777264e-05, + "loss": 2.3601, + "step": 5140 + }, + { + "epoch": 2.58, + "learning_rate": 2.3723100994524873e-05, + "loss": 2.5435, + "step": 5145 + }, + { + "epoch": 2.58, + "learning_rate": 2.3683726442250853e-05, + "loss": 2.3054, + "step": 5150 + }, + { + "epoch": 2.59, + "learning_rate": 2.3644355163889908e-05, + "loss": 2.6588, + "step": 5155 + }, + { + "epoch": 2.59, + "learning_rate": 2.3604987257368596e-05, + "loss": 2.6469, + "step": 5160 + }, + { + "epoch": 2.59, + "learning_rate": 2.3565622820605096e-05, + "loss": 2.4276, + "step": 5165 + }, + { + "epoch": 2.59, + "learning_rate": 2.3526261951508947e-05, + "loss": 2.8147, + "step": 5170 + }, + { + 
"epoch": 2.6, + "learning_rate": 2.3486904747980817e-05, + "loss": 2.5942, + "step": 5175 + }, + { + "epoch": 2.6, + "learning_rate": 2.344755130791227e-05, + "loss": 2.7043, + "step": 5180 + }, + { + "epoch": 2.6, + "learning_rate": 2.340820172918549e-05, + "loss": 2.7551, + "step": 5185 + }, + { + "epoch": 2.6, + "learning_rate": 2.336885610967308e-05, + "loss": 2.4544, + "step": 5190 + }, + { + "epoch": 2.61, + "learning_rate": 2.3329514547237757e-05, + "loss": 2.5914, + "step": 5195 + }, + { + "epoch": 2.61, + "learning_rate": 2.3290177139732186e-05, + "loss": 2.5679, + "step": 5200 + }, + { + "epoch": 2.61, + "learning_rate": 2.325084398499868e-05, + "loss": 2.5207, + "step": 5205 + }, + { + "epoch": 2.61, + "learning_rate": 2.3211515180868972e-05, + "loss": 2.4572, + "step": 5210 + }, + { + "epoch": 2.62, + "learning_rate": 2.3172190825163987e-05, + "loss": 2.6281, + "step": 5215 + }, + { + "epoch": 2.62, + "learning_rate": 2.3132871015693566e-05, + "loss": 2.5488, + "step": 5220 + }, + { + "epoch": 2.62, + "learning_rate": 2.309355585025627e-05, + "loss": 2.74, + "step": 5225 + }, + { + "epoch": 2.62, + "learning_rate": 2.3054245426639078e-05, + "loss": 2.5737, + "step": 5230 + }, + { + "epoch": 2.63, + "learning_rate": 2.3014939842617197e-05, + "loss": 2.5987, + "step": 5235 + }, + { + "epoch": 2.63, + "learning_rate": 2.297563919595379e-05, + "loss": 2.5215, + "step": 5240 + }, + { + "epoch": 2.63, + "learning_rate": 2.293634358439973e-05, + "loss": 2.4514, + "step": 5245 + }, + { + "epoch": 2.63, + "learning_rate": 2.289705310569338e-05, + "loss": 2.8411, + "step": 5250 + }, + { + "epoch": 2.64, + "learning_rate": 2.2857767857560337e-05, + "loss": 2.9463, + "step": 5255 + }, + { + "epoch": 2.64, + "learning_rate": 2.281848793771318e-05, + "loss": 2.6361, + "step": 5260 + }, + { + "epoch": 2.64, + "learning_rate": 2.2779213443851233e-05, + "loss": 2.4919, + "step": 5265 + }, + { + "epoch": 2.64, + "learning_rate": 2.2739944473660332e-05, + "loss": 2.3893, 
+ "step": 5270 + }, + { + "epoch": 2.65, + "learning_rate": 2.2700681124812574e-05, + "loss": 2.6469, + "step": 5275 + }, + { + "epoch": 2.65, + "learning_rate": 2.2661423494966074e-05, + "loss": 2.7062, + "step": 5280 + }, + { + "epoch": 2.65, + "learning_rate": 2.2622171681764706e-05, + "loss": 2.6681, + "step": 5285 + }, + { + "epoch": 2.65, + "learning_rate": 2.2582925782837898e-05, + "loss": 2.5944, + "step": 5290 + }, + { + "epoch": 2.66, + "learning_rate": 2.254368589580036e-05, + "loss": 2.6699, + "step": 5295 + }, + { + "epoch": 2.66, + "learning_rate": 2.250445211825185e-05, + "loss": 2.7243, + "step": 5300 + }, + { + "epoch": 2.66, + "learning_rate": 2.2465224547776934e-05, + "loss": 2.5652, + "step": 5305 + }, + { + "epoch": 2.66, + "learning_rate": 2.2426003281944725e-05, + "loss": 2.4402, + "step": 5310 + }, + { + "epoch": 2.67, + "learning_rate": 2.238678841830867e-05, + "loss": 2.5059, + "step": 5315 + }, + { + "epoch": 2.67, + "learning_rate": 2.234758005440628e-05, + "loss": 2.5083, + "step": 5320 + }, + { + "epoch": 2.67, + "learning_rate": 2.2308378287758906e-05, + "loss": 2.3585, + "step": 5325 + }, + { + "epoch": 2.67, + "learning_rate": 2.22691832158715e-05, + "loss": 2.8297, + "step": 5330 + }, + { + "epoch": 2.68, + "learning_rate": 2.2229994936232346e-05, + "loss": 2.4266, + "step": 5335 + }, + { + "epoch": 2.68, + "learning_rate": 2.219081354631284e-05, + "loss": 2.3729, + "step": 5340 + }, + { + "epoch": 2.68, + "learning_rate": 2.2151639143567236e-05, + "loss": 2.4558, + "step": 5345 + }, + { + "epoch": 2.68, + "learning_rate": 2.211247182543242e-05, + "loss": 2.5797, + "step": 5350 + }, + { + "epoch": 2.69, + "learning_rate": 2.2073311689327648e-05, + "loss": 2.4579, + "step": 5355 + }, + { + "epoch": 2.69, + "learning_rate": 2.203415883265432e-05, + "loss": 2.5511, + "step": 5360 + }, + { + "epoch": 2.69, + "learning_rate": 2.1995013352795725e-05, + "loss": 2.4825, + "step": 5365 + }, + { + "epoch": 2.69, + "learning_rate": 
2.1955875347116808e-05, + "loss": 2.4933, + "step": 5370 + }, + { + "epoch": 2.7, + "learning_rate": 2.191674491296391e-05, + "loss": 2.434, + "step": 5375 + }, + { + "epoch": 2.7, + "learning_rate": 2.187762214766456e-05, + "loss": 2.5205, + "step": 5380 + }, + { + "epoch": 2.7, + "learning_rate": 2.1838507148527197e-05, + "loss": 2.5461, + "step": 5385 + }, + { + "epoch": 2.7, + "learning_rate": 2.179940001284095e-05, + "loss": 2.4265, + "step": 5390 + }, + { + "epoch": 2.71, + "learning_rate": 2.176030083787539e-05, + "loss": 2.6166, + "step": 5395 + }, + { + "epoch": 2.71, + "learning_rate": 2.1721209720880277e-05, + "loss": 2.5101, + "step": 5400 + }, + { + "epoch": 2.71, + "learning_rate": 2.168212675908536e-05, + "loss": 2.4824, + "step": 5405 + }, + { + "epoch": 2.71, + "learning_rate": 2.1643052049700066e-05, + "loss": 2.347, + "step": 5410 + }, + { + "epoch": 2.72, + "learning_rate": 2.1603985689913317e-05, + "loss": 2.5476, + "step": 5415 + }, + { + "epoch": 2.72, + "learning_rate": 2.1564927776893264e-05, + "loss": 2.456, + "step": 5420 + }, + { + "epoch": 2.72, + "learning_rate": 2.152587840778704e-05, + "loss": 2.4415, + "step": 5425 + }, + { + "epoch": 2.72, + "learning_rate": 2.1486837679720535e-05, + "loss": 2.7548, + "step": 5430 + }, + { + "epoch": 2.73, + "learning_rate": 2.144780568979816e-05, + "loss": 2.3779, + "step": 5435 + }, + { + "epoch": 2.73, + "learning_rate": 2.1408782535102566e-05, + "loss": 2.5284, + "step": 5440 + }, + { + "epoch": 2.73, + "learning_rate": 2.136976831269444e-05, + "loss": 2.6886, + "step": 5445 + }, + { + "epoch": 2.73, + "learning_rate": 2.133076311961226e-05, + "loss": 2.5138, + "step": 5450 + }, + { + "epoch": 2.74, + "learning_rate": 2.1291767052872035e-05, + "loss": 2.664, + "step": 5455 + }, + { + "epoch": 2.74, + "learning_rate": 2.1252780209467073e-05, + "loss": 2.4325, + "step": 5460 + }, + { + "epoch": 2.74, + "learning_rate": 2.121380268636775e-05, + "loss": 2.5471, + "step": 5465 + }, + { + "epoch": 
2.74, + "learning_rate": 2.1174834580521253e-05, + "loss": 2.4363, + "step": 5470 + }, + { + "epoch": 2.75, + "learning_rate": 2.113587598885135e-05, + "loss": 2.6272, + "step": 5475 + }, + { + "epoch": 2.75, + "learning_rate": 2.109692700825814e-05, + "loss": 2.5261, + "step": 5480 + }, + { + "epoch": 2.75, + "learning_rate": 2.105798773561783e-05, + "loss": 2.9262, + "step": 5485 + }, + { + "epoch": 2.75, + "learning_rate": 2.101905826778246e-05, + "loss": 2.3964, + "step": 5490 + }, + { + "epoch": 2.76, + "learning_rate": 2.0980138701579704e-05, + "loss": 2.6239, + "step": 5495 + }, + { + "epoch": 2.76, + "learning_rate": 2.0941229133812593e-05, + "loss": 2.53, + "step": 5500 + }, + { + "epoch": 2.76, + "learning_rate": 2.0902329661259293e-05, + "loss": 2.7035, + "step": 5505 + }, + { + "epoch": 2.77, + "learning_rate": 2.0863440380672856e-05, + "loss": 2.5909, + "step": 5510 + }, + { + "epoch": 2.77, + "learning_rate": 2.0824561388781005e-05, + "loss": 2.1592, + "step": 5515 + }, + { + "epoch": 2.77, + "learning_rate": 2.078569278228585e-05, + "loss": 2.5515, + "step": 5520 + }, + { + "epoch": 2.77, + "learning_rate": 2.0746834657863672e-05, + "loss": 2.6217, + "step": 5525 + }, + { + "epoch": 2.78, + "learning_rate": 2.0707987112164692e-05, + "loss": 2.6302, + "step": 5530 + }, + { + "epoch": 2.78, + "learning_rate": 2.0669150241812807e-05, + "loss": 2.3984, + "step": 5535 + }, + { + "epoch": 2.78, + "learning_rate": 2.0630324143405372e-05, + "loss": 2.6425, + "step": 5540 + }, + { + "epoch": 2.78, + "learning_rate": 2.0591508913512954e-05, + "loss": 2.6817, + "step": 5545 + }, + { + "epoch": 2.79, + "learning_rate": 2.055270464867906e-05, + "loss": 2.3904, + "step": 5550 + }, + { + "epoch": 2.79, + "learning_rate": 2.0513911445419936e-05, + "loss": 2.6625, + "step": 5555 + }, + { + "epoch": 2.79, + "learning_rate": 2.0475129400224337e-05, + "loss": 2.6876, + "step": 5560 + }, + { + "epoch": 2.79, + "learning_rate": 2.043635860955325e-05, + "loss": 2.4981, + 
"step": 5565 + }, + { + "epoch": 2.8, + "learning_rate": 2.039759916983966e-05, + "loss": 2.3809, + "step": 5570 + }, + { + "epoch": 2.8, + "learning_rate": 2.0358851177488326e-05, + "loss": 2.4396, + "step": 5575 + }, + { + "epoch": 2.8, + "learning_rate": 2.0320114728875538e-05, + "loss": 2.526, + "step": 5580 + }, + { + "epoch": 2.8, + "learning_rate": 2.028138992034887e-05, + "loss": 2.6806, + "step": 5585 + }, + { + "epoch": 2.81, + "learning_rate": 2.0242676848226948e-05, + "loss": 2.5842, + "step": 5590 + }, + { + "epoch": 2.81, + "learning_rate": 2.02039756087992e-05, + "loss": 2.4016, + "step": 5595 + }, + { + "epoch": 2.81, + "learning_rate": 2.0165286298325638e-05, + "loss": 2.5254, + "step": 5600 + }, + { + "epoch": 2.81, + "learning_rate": 2.0126609013036575e-05, + "loss": 2.4995, + "step": 5605 + }, + { + "epoch": 2.82, + "learning_rate": 2.0087943849132446e-05, + "loss": 2.473, + "step": 5610 + }, + { + "epoch": 2.82, + "learning_rate": 2.004929090278351e-05, + "loss": 2.668, + "step": 5615 + }, + { + "epoch": 2.82, + "learning_rate": 2.001065027012966e-05, + "loss": 2.693, + "step": 5620 + }, + { + "epoch": 2.82, + "learning_rate": 1.9972022047280154e-05, + "loss": 2.5113, + "step": 5625 + }, + { + "epoch": 2.83, + "learning_rate": 1.9933406330313374e-05, + "loss": 2.7322, + "step": 5630 + }, + { + "epoch": 2.83, + "learning_rate": 1.989480321527661e-05, + "loss": 2.6501, + "step": 5635 + }, + { + "epoch": 2.83, + "learning_rate": 1.9856212798185798e-05, + "loss": 2.546, + "step": 5640 + }, + { + "epoch": 2.83, + "learning_rate": 1.9817635175025295e-05, + "loss": 2.8634, + "step": 5645 + }, + { + "epoch": 2.84, + "learning_rate": 1.9779070441747638e-05, + "loss": 2.2475, + "step": 5650 + }, + { + "epoch": 2.84, + "learning_rate": 1.97405186942733e-05, + "loss": 2.5021, + "step": 5655 + }, + { + "epoch": 2.84, + "learning_rate": 1.9701980028490452e-05, + "loss": 2.4607, + "step": 5660 + }, + { + "epoch": 2.84, + "learning_rate": 
1.9663454540254744e-05, + "loss": 2.4587, + "step": 5665 + }, + { + "epoch": 2.85, + "learning_rate": 1.9624942325389032e-05, + "loss": 2.6975, + "step": 5670 + }, + { + "epoch": 2.85, + "learning_rate": 1.9586443479683164e-05, + "loss": 2.728, + "step": 5675 + }, + { + "epoch": 2.85, + "learning_rate": 1.9547958098893734e-05, + "loss": 2.6458, + "step": 5680 + }, + { + "epoch": 2.85, + "learning_rate": 1.9509486278743847e-05, + "loss": 2.7608, + "step": 5685 + }, + { + "epoch": 2.86, + "learning_rate": 1.9471028114922873e-05, + "loss": 2.7753, + "step": 5690 + }, + { + "epoch": 2.86, + "learning_rate": 1.9432583703086235e-05, + "loss": 2.3438, + "step": 5695 + }, + { + "epoch": 2.86, + "learning_rate": 1.9394153138855127e-05, + "loss": 2.5513, + "step": 5700 + }, + { + "epoch": 2.86, + "learning_rate": 1.9355736517816313e-05, + "loss": 2.6064, + "step": 5705 + }, + { + "epoch": 2.87, + "learning_rate": 1.9317333935521872e-05, + "loss": 2.6884, + "step": 5710 + }, + { + "epoch": 2.87, + "learning_rate": 1.927894548748897e-05, + "loss": 2.479, + "step": 5715 + }, + { + "epoch": 2.87, + "learning_rate": 1.9240571269199607e-05, + "loss": 2.741, + "step": 5720 + }, + { + "epoch": 2.87, + "learning_rate": 1.9202211376100427e-05, + "loss": 2.6736, + "step": 5725 + }, + { + "epoch": 2.88, + "learning_rate": 1.9163865903602374e-05, + "loss": 2.5357, + "step": 5730 + }, + { + "epoch": 2.88, + "learning_rate": 1.9125534947080574e-05, + "loss": 2.4667, + "step": 5735 + }, + { + "epoch": 2.88, + "learning_rate": 1.908721860187406e-05, + "loss": 2.3793, + "step": 5740 + }, + { + "epoch": 2.88, + "learning_rate": 1.904891696328548e-05, + "loss": 2.6379, + "step": 5745 + }, + { + "epoch": 2.89, + "learning_rate": 1.901063012658093e-05, + "loss": 2.5932, + "step": 5750 + }, + { + "epoch": 2.89, + "learning_rate": 1.897235818698969e-05, + "loss": 2.3479, + "step": 5755 + }, + { + "epoch": 2.89, + "learning_rate": 1.8934101239703973e-05, + "loss": 2.7664, + "step": 5760 + }, + { + 
"epoch": 2.89, + "learning_rate": 1.889585937987871e-05, + "loss": 2.6163, + "step": 5765 + }, + { + "epoch": 2.9, + "learning_rate": 1.885763270263131e-05, + "loss": 2.644, + "step": 5770 + }, + { + "epoch": 2.9, + "learning_rate": 1.881942130304142e-05, + "loss": 2.6584, + "step": 5775 + }, + { + "epoch": 2.9, + "learning_rate": 1.8781225276150675e-05, + "loss": 2.7552, + "step": 5780 + }, + { + "epoch": 2.9, + "learning_rate": 1.874304471696248e-05, + "loss": 2.3176, + "step": 5785 + }, + { + "epoch": 2.91, + "learning_rate": 1.8704879720441773e-05, + "loss": 2.9294, + "step": 5790 + }, + { + "epoch": 2.91, + "learning_rate": 1.8666730381514774e-05, + "loss": 2.5388, + "step": 5795 + }, + { + "epoch": 2.91, + "learning_rate": 1.8628596795068776e-05, + "loss": 2.5343, + "step": 5800 + }, + { + "epoch": 2.91, + "learning_rate": 1.859047905595187e-05, + "loss": 2.5239, + "step": 5805 + }, + { + "epoch": 2.92, + "learning_rate": 1.8552377258972747e-05, + "loss": 2.382, + "step": 5810 + }, + { + "epoch": 2.92, + "learning_rate": 1.851429149890044e-05, + "loss": 2.7965, + "step": 5815 + }, + { + "epoch": 2.92, + "learning_rate": 1.8476221870464083e-05, + "loss": 2.738, + "step": 5820 + }, + { + "epoch": 2.92, + "learning_rate": 1.84381684683527e-05, + "loss": 2.7431, + "step": 5825 + }, + { + "epoch": 2.93, + "learning_rate": 1.8400131387214964e-05, + "loss": 2.6551, + "step": 5830 + }, + { + "epoch": 2.93, + "learning_rate": 1.8362110721658927e-05, + "loss": 2.5836, + "step": 5835 + }, + { + "epoch": 2.93, + "learning_rate": 1.832410656625183e-05, + "loss": 2.3278, + "step": 5840 + }, + { + "epoch": 2.93, + "learning_rate": 1.8286119015519852e-05, + "loss": 2.5348, + "step": 5845 + }, + { + "epoch": 2.94, + "learning_rate": 1.8248148163947866e-05, + "loss": 2.388, + "step": 5850 + }, + { + "epoch": 2.94, + "learning_rate": 1.8210194105979205e-05, + "loss": 2.5839, + "step": 5855 + }, + { + "epoch": 2.94, + "learning_rate": 1.817225693601543e-05, + "loss": 2.7362, + 
"step": 5860 + }, + { + "epoch": 2.94, + "learning_rate": 1.8134336748416115e-05, + "loss": 2.2506, + "step": 5865 + }, + { + "epoch": 2.95, + "learning_rate": 1.8096433637498574e-05, + "loss": 2.6163, + "step": 5870 + }, + { + "epoch": 2.95, + "learning_rate": 1.8058547697537655e-05, + "loss": 2.6588, + "step": 5875 + }, + { + "epoch": 2.95, + "learning_rate": 1.802067902276551e-05, + "loss": 2.6955, + "step": 5880 + }, + { + "epoch": 2.95, + "learning_rate": 1.7982827707371326e-05, + "loss": 2.5438, + "step": 5885 + }, + { + "epoch": 2.96, + "learning_rate": 1.7944993845501118e-05, + "loss": 2.6858, + "step": 5890 + }, + { + "epoch": 2.96, + "learning_rate": 1.7907177531257507e-05, + "loss": 2.5499, + "step": 5895 + }, + { + "epoch": 2.96, + "learning_rate": 1.7869378858699452e-05, + "loss": 2.572, + "step": 5900 + }, + { + "epoch": 2.96, + "learning_rate": 1.783159792184203e-05, + "loss": 2.6499, + "step": 5905 + }, + { + "epoch": 2.97, + "learning_rate": 1.779383481465622e-05, + "loss": 2.6526, + "step": 5910 + }, + { + "epoch": 2.97, + "learning_rate": 1.775608963106863e-05, + "loss": 2.5916, + "step": 5915 + }, + { + "epoch": 2.97, + "learning_rate": 1.7718362464961314e-05, + "loss": 2.7463, + "step": 5920 + }, + { + "epoch": 2.97, + "learning_rate": 1.76806534101715e-05, + "loss": 2.6052, + "step": 5925 + }, + { + "epoch": 2.98, + "learning_rate": 1.764296256049137e-05, + "loss": 2.4542, + "step": 5930 + }, + { + "epoch": 2.98, + "learning_rate": 1.760529000966782e-05, + "loss": 2.5414, + "step": 5935 + }, + { + "epoch": 2.98, + "learning_rate": 1.7567635851402238e-05, + "loss": 2.6181, + "step": 5940 + }, + { + "epoch": 2.98, + "learning_rate": 1.753000017935026e-05, + "loss": 2.4822, + "step": 5945 + }, + { + "epoch": 2.99, + "learning_rate": 1.7492383087121546e-05, + "loss": 2.6495, + "step": 5950 + }, + { + "epoch": 2.99, + "learning_rate": 1.7454784668279546e-05, + "loss": 2.455, + "step": 5955 + }, + { + "epoch": 2.99, + "learning_rate": 
1.7417205016341258e-05, + "loss": 2.3959, + "step": 5960 + }, + { + "epoch": 2.99, + "learning_rate": 1.7379644224777004e-05, + "loss": 2.5139, + "step": 5965 + }, + { + "epoch": 3.0, + "learning_rate": 1.7342102387010194e-05, + "loss": 2.6985, + "step": 5970 + }, + { + "epoch": 3.0, + "learning_rate": 1.7304579596417104e-05, + "loss": 2.6598, + "step": 5975 + }, + { + "epoch": 3.0, + "learning_rate": 1.726707594632661e-05, + "loss": 2.6638, + "step": 5980 + }, + { + "epoch": 3.0, + "learning_rate": 1.7229591530020022e-05, + "loss": 2.446, + "step": 5985 + }, + { + "epoch": 3.01, + "learning_rate": 1.7192126440730784e-05, + "loss": 2.5736, + "step": 5990 + }, + { + "epoch": 3.01, + "learning_rate": 1.7154680771644242e-05, + "loss": 2.4385, + "step": 5995 + }, + { + "epoch": 3.01, + "learning_rate": 1.7117254615897497e-05, + "loss": 2.5651, + "step": 6000 + }, + { + "epoch": 3.01, + "learning_rate": 1.707984806657908e-05, + "loss": 2.5767, + "step": 6005 + }, + { + "epoch": 3.02, + "learning_rate": 1.7042461216728756e-05, + "loss": 2.5527, + "step": 6010 + }, + { + "epoch": 3.02, + "learning_rate": 1.7005094159337307e-05, + "loss": 2.4275, + "step": 6015 + }, + { + "epoch": 3.02, + "learning_rate": 1.6967746987346272e-05, + "loss": 2.5136, + "step": 6020 + }, + { + "epoch": 3.02, + "learning_rate": 1.6930419793647735e-05, + "loss": 2.5035, + "step": 6025 + }, + { + "epoch": 3.03, + "learning_rate": 1.6893112671084094e-05, + "loss": 2.3627, + "step": 6030 + }, + { + "epoch": 3.03, + "learning_rate": 1.6855825712447822e-05, + "loss": 2.5038, + "step": 6035 + }, + { + "epoch": 3.03, + "learning_rate": 1.6818559010481226e-05, + "loss": 2.3229, + "step": 6040 + }, + { + "epoch": 3.03, + "learning_rate": 1.6781312657876254e-05, + "loss": 2.3298, + "step": 6045 + }, + { + "epoch": 3.04, + "learning_rate": 1.6744086747274224e-05, + "loss": 2.5965, + "step": 6050 + }, + { + "epoch": 3.04, + "learning_rate": 1.67068813712656e-05, + "loss": 2.6745, + "step": 6055 + }, + { + 
"epoch": 3.04, + "learning_rate": 1.6669696622389797e-05, + "loss": 2.5358, + "step": 6060 + }, + { + "epoch": 3.04, + "learning_rate": 1.6632532593134907e-05, + "loss": 2.1572, + "step": 6065 + }, + { + "epoch": 3.05, + "learning_rate": 1.6595389375937488e-05, + "loss": 2.3413, + "step": 6070 + }, + { + "epoch": 3.05, + "learning_rate": 1.6558267063182342e-05, + "loss": 2.4968, + "step": 6075 + }, + { + "epoch": 3.05, + "learning_rate": 1.6521165747202276e-05, + "loss": 2.528, + "step": 6080 + }, + { + "epoch": 3.05, + "learning_rate": 1.6484085520277847e-05, + "loss": 2.4744, + "step": 6085 + }, + { + "epoch": 3.06, + "learning_rate": 1.6447026474637194e-05, + "loss": 2.5993, + "step": 6090 + }, + { + "epoch": 3.06, + "learning_rate": 1.640998870245575e-05, + "loss": 2.5334, + "step": 6095 + }, + { + "epoch": 3.06, + "learning_rate": 1.637297229585604e-05, + "loss": 2.3431, + "step": 6100 + }, + { + "epoch": 3.06, + "learning_rate": 1.633597734690746e-05, + "loss": 2.3361, + "step": 6105 + }, + { + "epoch": 3.07, + "learning_rate": 1.6299003947626017e-05, + "loss": 2.4223, + "step": 6110 + }, + { + "epoch": 3.07, + "learning_rate": 1.6262052189974125e-05, + "loss": 2.6129, + "step": 6115 + }, + { + "epoch": 3.07, + "learning_rate": 1.622512216586038e-05, + "loss": 2.5798, + "step": 6120 + }, + { + "epoch": 3.07, + "learning_rate": 1.61882139671393e-05, + "loss": 2.7164, + "step": 6125 + }, + { + "epoch": 3.08, + "learning_rate": 1.6151327685611127e-05, + "loss": 2.627, + "step": 6130 + }, + { + "epoch": 3.08, + "learning_rate": 1.6114463413021612e-05, + "loss": 2.5533, + "step": 6135 + }, + { + "epoch": 3.08, + "learning_rate": 1.6077621241061725e-05, + "loss": 2.4149, + "step": 6140 + }, + { + "epoch": 3.08, + "learning_rate": 1.6040801261367493e-05, + "loss": 2.5167, + "step": 6145 + }, + { + "epoch": 3.09, + "learning_rate": 1.6004003565519734e-05, + "loss": 2.3775, + "step": 6150 + }, + { + "epoch": 3.09, + "learning_rate": 1.596722824504385e-05, + "loss": 
2.7508, + "step": 6155 + }, + { + "epoch": 3.09, + "learning_rate": 1.5930475391409562e-05, + "loss": 2.2924, + "step": 6160 + }, + { + "epoch": 3.09, + "learning_rate": 1.5893745096030754e-05, + "loss": 2.6994, + "step": 6165 + }, + { + "epoch": 3.1, + "learning_rate": 1.5857037450265176e-05, + "loss": 2.4325, + "step": 6170 + }, + { + "epoch": 3.1, + "learning_rate": 1.5820352545414232e-05, + "loss": 2.7048, + "step": 6175 + }, + { + "epoch": 3.1, + "learning_rate": 1.5783690472722785e-05, + "loss": 2.5557, + "step": 6180 + }, + { + "epoch": 3.1, + "learning_rate": 1.5747051323378903e-05, + "loss": 2.5968, + "step": 6185 + }, + { + "epoch": 3.11, + "learning_rate": 1.5710435188513627e-05, + "loss": 2.8964, + "step": 6190 + }, + { + "epoch": 3.11, + "learning_rate": 1.5673842159200768e-05, + "loss": 2.4664, + "step": 6195 + }, + { + "epoch": 3.11, + "learning_rate": 1.5637272326456666e-05, + "loss": 2.6002, + "step": 6200 + }, + { + "epoch": 3.11, + "learning_rate": 1.560072578123995e-05, + "loss": 2.1335, + "step": 6205 + }, + { + "epoch": 3.12, + "learning_rate": 1.5564202614451352e-05, + "loss": 2.4466, + "step": 6210 + }, + { + "epoch": 3.12, + "learning_rate": 1.5527702916933436e-05, + "loss": 2.4236, + "step": 6215 + }, + { + "epoch": 3.12, + "learning_rate": 1.54912267794704e-05, + "loss": 2.6926, + "step": 6220 + }, + { + "epoch": 3.12, + "learning_rate": 1.5454774292787837e-05, + "loss": 2.6268, + "step": 6225 + }, + { + "epoch": 3.13, + "learning_rate": 1.541834554755252e-05, + "loss": 2.5849, + "step": 6230 + }, + { + "epoch": 3.13, + "learning_rate": 1.5381940634372165e-05, + "loss": 2.4988, + "step": 6235 + }, + { + "epoch": 3.13, + "learning_rate": 1.534555964379522e-05, + "loss": 2.8093, + "step": 6240 + }, + { + "epoch": 3.13, + "learning_rate": 1.5309202666310622e-05, + "loss": 2.6214, + "step": 6245 + }, + { + "epoch": 3.14, + "learning_rate": 1.5272869792347595e-05, + "loss": 2.4958, + "step": 6250 + }, + { + "epoch": 3.14, + "learning_rate": 
1.5236561112275394e-05, + "loss": 2.5731, + "step": 6255 + }, + { + "epoch": 3.14, + "learning_rate": 1.5200276716403103e-05, + "loss": 2.4501, + "step": 6260 + }, + { + "epoch": 3.14, + "learning_rate": 1.5164016694979411e-05, + "loss": 2.3793, + "step": 6265 + }, + { + "epoch": 3.15, + "learning_rate": 1.5127781138192374e-05, + "loss": 2.4751, + "step": 6270 + }, + { + "epoch": 3.15, + "learning_rate": 1.5091570136169206e-05, + "loss": 2.2213, + "step": 6275 + }, + { + "epoch": 3.15, + "learning_rate": 1.505538377897604e-05, + "loss": 2.4721, + "step": 6280 + }, + { + "epoch": 3.15, + "learning_rate": 1.5019222156617712e-05, + "loss": 2.5355, + "step": 6285 + }, + { + "epoch": 3.16, + "learning_rate": 1.4983085359037547e-05, + "loss": 2.6066, + "step": 6290 + }, + { + "epoch": 3.16, + "learning_rate": 1.4946973476117105e-05, + "loss": 2.5482, + "step": 6295 + }, + { + "epoch": 3.16, + "learning_rate": 1.4910886597675994e-05, + "loss": 2.6717, + "step": 6300 + }, + { + "epoch": 3.16, + "learning_rate": 1.4874824813471616e-05, + "loss": 2.5616, + "step": 6305 + }, + { + "epoch": 3.17, + "learning_rate": 1.4838788213198965e-05, + "loss": 2.5877, + "step": 6310 + }, + { + "epoch": 3.17, + "learning_rate": 1.48027768864904e-05, + "loss": 2.4685, + "step": 6315 + }, + { + "epoch": 3.17, + "learning_rate": 1.4766790922915405e-05, + "loss": 2.459, + "step": 6320 + }, + { + "epoch": 3.17, + "learning_rate": 1.4730830411980393e-05, + "loss": 2.4626, + "step": 6325 + }, + { + "epoch": 3.18, + "learning_rate": 1.469489544312846e-05, + "loss": 2.4486, + "step": 6330 + }, + { + "epoch": 3.18, + "learning_rate": 1.4658986105739175e-05, + "loss": 2.6828, + "step": 6335 + }, + { + "epoch": 3.18, + "learning_rate": 1.4623102489128353e-05, + "loss": 2.702, + "step": 6340 + }, + { + "epoch": 3.18, + "learning_rate": 1.4587244682547857e-05, + "loss": 2.8563, + "step": 6345 + }, + { + "epoch": 3.19, + "learning_rate": 1.4551412775185308e-05, + "loss": 2.5647, + "step": 6350 + }, + { + 
"epoch": 3.19, + "learning_rate": 1.4515606856163949e-05, + "loss": 2.5023, + "step": 6355 + }, + { + "epoch": 3.19, + "learning_rate": 1.4479827014542363e-05, + "loss": 2.347, + "step": 6360 + }, + { + "epoch": 3.19, + "learning_rate": 1.4444073339314284e-05, + "loss": 2.6892, + "step": 6365 + }, + { + "epoch": 3.2, + "learning_rate": 1.4408345919408359e-05, + "loss": 2.5874, + "step": 6370 + }, + { + "epoch": 3.2, + "learning_rate": 1.4372644843687922e-05, + "loss": 2.3453, + "step": 6375 + }, + { + "epoch": 3.2, + "learning_rate": 1.4336970200950794e-05, + "loss": 2.4236, + "step": 6380 + }, + { + "epoch": 3.2, + "learning_rate": 1.4301322079929053e-05, + "loss": 2.6329, + "step": 6385 + }, + { + "epoch": 3.21, + "learning_rate": 1.4265700569288792e-05, + "loss": 2.7761, + "step": 6390 + }, + { + "epoch": 3.21, + "learning_rate": 1.4230105757629936e-05, + "loss": 2.6791, + "step": 6395 + }, + { + "epoch": 3.21, + "learning_rate": 1.4194537733485994e-05, + "loss": 2.6064, + "step": 6400 + }, + { + "epoch": 3.21, + "learning_rate": 1.4158996585323841e-05, + "loss": 2.3809, + "step": 6405 + }, + { + "epoch": 3.22, + "learning_rate": 1.4123482401543531e-05, + "loss": 2.5205, + "step": 6410 + }, + { + "epoch": 3.22, + "learning_rate": 1.4087995270478021e-05, + "loss": 2.524, + "step": 6415 + }, + { + "epoch": 3.22, + "learning_rate": 1.4052535280392999e-05, + "loss": 2.2721, + "step": 6420 + }, + { + "epoch": 3.22, + "learning_rate": 1.401710251948663e-05, + "loss": 2.5879, + "step": 6425 + }, + { + "epoch": 3.23, + "learning_rate": 1.3981697075889372e-05, + "loss": 2.6147, + "step": 6430 + }, + { + "epoch": 3.23, + "learning_rate": 1.394631903766373e-05, + "loss": 2.5308, + "step": 6435 + }, + { + "epoch": 3.23, + "learning_rate": 1.3910968492804028e-05, + "loss": 2.4739, + "step": 6440 + }, + { + "epoch": 3.23, + "learning_rate": 1.3875645529236234e-05, + "loss": 2.4483, + "step": 6445 + }, + { + "epoch": 3.24, + "learning_rate": 1.3840350234817686e-05, + "loss": 
2.6367, + "step": 6450 + }, + { + "epoch": 3.24, + "learning_rate": 1.3805082697336943e-05, + "loss": 2.6567, + "step": 6455 + }, + { + "epoch": 3.24, + "learning_rate": 1.3769843004513489e-05, + "loss": 2.52, + "step": 6460 + }, + { + "epoch": 3.24, + "learning_rate": 1.3734631243997561e-05, + "loss": 2.6544, + "step": 6465 + }, + { + "epoch": 3.25, + "learning_rate": 1.3699447503369925e-05, + "loss": 2.3696, + "step": 6470 + }, + { + "epoch": 3.25, + "learning_rate": 1.3664291870141649e-05, + "loss": 2.4517, + "step": 6475 + }, + { + "epoch": 3.25, + "learning_rate": 1.3629164431753894e-05, + "loss": 2.6313, + "step": 6480 + }, + { + "epoch": 3.25, + "learning_rate": 1.3594065275577692e-05, + "loss": 2.4032, + "step": 6485 + }, + { + "epoch": 3.26, + "learning_rate": 1.3558994488913731e-05, + "loss": 2.7063, + "step": 6490 + }, + { + "epoch": 3.26, + "learning_rate": 1.3523952158992136e-05, + "loss": 2.6109, + "step": 6495 + }, + { + "epoch": 3.26, + "learning_rate": 1.3488938372972257e-05, + "loss": 2.633, + "step": 6500 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 3.439312593193206e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6500/training_args.bin b/checkpoint-6500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-6500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475 diff --git a/checkpoint-7000/README.md b/checkpoint-7000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-7000/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for 
Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-7000/adapter_config.json b/checkpoint-7000/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-7000/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-7000/adapter_model.safetensors b/checkpoint-7000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7b0821ebf1def1d1ee8f665fa1437ef1bd838b1b --- /dev/null +++ b/checkpoint-7000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2bf2659535511ce25bbb71469cfdda635f0367a573c3322f7d02e7f5821bf0a +size 7807744 diff --git a/checkpoint-7000/optimizer.pt b/checkpoint-7000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fec75851fc81276dd0b0eafaa66e2cdd3a453b62 --- /dev/null +++ b/checkpoint-7000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bf9966a256685903e68e994297ba4356de4230f98c454b90380cfee4a33c15d +size 15644485 diff --git a/checkpoint-7000/rng_state.pth b/checkpoint-7000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8f827a85f7a0f99c92b391ecb531c4539b2ffdd6 --- /dev/null +++ b/checkpoint-7000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9668641252e97c69fa217a8cee9765ef2a073e779a1fcd3c3332adb1a31d25dc +size 14575 diff --git 
a/checkpoint-7000/scheduler.pt b/checkpoint-7000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e875925bee3cdf885896faa153853f2e921e5a7 --- /dev/null +++ b/checkpoint-7000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6df921c09a8b09dd0b0ed76a34dacafb2bc65fd989a7ef0fd8dc2071b4e2c250 +size 627 diff --git a/checkpoint-7000/special_tokens_map.json b/checkpoint-7000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-7000/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ] +} diff --git a/checkpoint-7000/tokenization_chatglm.py b/checkpoint-7000/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-7000/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == 
self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) + return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab 
+ + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + 
json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. 
+ + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. 
+ if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-7000/tokenizer.model b/checkpoint-7000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-7000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-7000/tokenizer_config.json b/checkpoint-7000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ b/checkpoint-7000/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 
1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-7000/trainer_state.json b/checkpoint-7000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..88cf18cbb99bf724f1e6f5823822a20345922d27 --- /dev/null +++ b/checkpoint-7000/trainer_state.json @@ -0,0 +1,8421 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.5127336595157446, + "eval_steps": 500, + "global_step": 7000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 
65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 
165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + "step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 
265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + "step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 
365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + "step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 
465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + 
"step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + 
"step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + "loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + 
"step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + "loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, 
+ "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 
2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 4.8759297640025235e-05, + "loss": 2.9469, + "step": 1005 + }, + { + "epoch": 0.51, + "learning_rate": 4.874700152955661e-05, + "loss": 2.9745, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 4.8734646354147936e-05, + "loss": 2.9341, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 4.8722232144529754e-05, + "loss": 3.0511, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 4.870975893157941e-05, + "loss": 2.5396, + "step": 1025 + }, + { + "epoch": 0.52, + "learning_rate": 4.8697226746321004e-05, + "loss": 2.6699, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 4.868463561992532e-05, + "loss": 2.4887, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 4.867198558370977e-05, + "loss": 2.4773, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 4.865927666913825e-05, + "loss": 2.7612, + "step": 1045 + }, + { + "epoch": 0.53, + "learning_rate": 4.864650890782113e-05, + "loss": 2.9116, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 4.863368233151514e-05, + "loss": 2.8205, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 4.862079697212329e-05, + "loss": 2.6711, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 
4.8607852861694804e-05, + "loss": 3.1138, + "step": 1065 + }, + { + "epoch": 0.54, + "learning_rate": 4.859485003242503e-05, + "loss": 2.5603, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 4.858178851665539e-05, + "loss": 2.7981, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 4.856866834687323e-05, + "loss": 2.507, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 4.855548955571183e-05, + "loss": 3.0315, + "step": 1085 + }, + { + "epoch": 0.55, + "learning_rate": 4.8542252175950244e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 4.852895624051326e-05, + "loss": 2.6794, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 4.851560178247132e-05, + "loss": 2.8278, + "step": 1100 + }, + { + "epoch": 0.55, + "learning_rate": 4.850218883504041e-05, + "loss": 2.6993, + "step": 1105 + }, + { + "epoch": 0.56, + "learning_rate": 4.8488717431582005e-05, + "loss": 2.875, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 4.8475187605602974e-05, + "loss": 2.6057, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 4.84615993907555e-05, + "loss": 2.847, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 4.844795282083697e-05, + "loss": 2.7652, + "step": 1125 + }, + { + "epoch": 0.57, + "learning_rate": 4.843424792978997e-05, + "loss": 2.8128, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 4.842048475170209e-05, + "loss": 2.7077, + "step": 1135 + }, + { + "epoch": 0.57, + "learning_rate": 4.840666332080592e-05, + "loss": 2.7081, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 4.8392783671478934e-05, + "loss": 2.6479, + "step": 1145 + }, + { + "epoch": 0.58, + "learning_rate": 4.837884583824342e-05, + "loss": 2.667, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 4.836484985576638e-05, + "loss": 2.7514, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 4.835079575885944e-05, + "loss": 2.5058, + "step": 1160 + }, + { + "epoch": 0.58, 
+ "learning_rate": 4.833668358247876e-05, + "loss": 2.6183, + "step": 1165 + }, + { + "epoch": 0.59, + "learning_rate": 4.8322513361725006e-05, + "loss": 2.9959, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 4.830828513184317e-05, + "loss": 2.5714, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 4.8293998928222536e-05, + "loss": 2.965, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 4.827965478639661e-05, + "loss": 2.2106, + "step": 1185 + }, + { + "epoch": 0.6, + "learning_rate": 4.8265252742042965e-05, + "loss": 2.7456, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 4.8250792830983225e-05, + "loss": 2.5694, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 4.8236275089182936e-05, + "loss": 2.6826, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 4.8221699552751465e-05, + "loss": 2.878, + "step": 1205 + }, + { + "epoch": 0.61, + "learning_rate": 4.820706625794196e-05, + "loss": 2.8191, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 4.81923752411512e-05, + "loss": 2.739, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 4.8177626538919565e-05, + "loss": 3.0544, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 4.8162820187930875e-05, + "loss": 2.8393, + "step": 1225 + }, + { + "epoch": 0.62, + "learning_rate": 4.814795622501237e-05, + "loss": 2.4457, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 4.813303468713456e-05, + "loss": 2.8575, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 4.8118055611411197e-05, + "loss": 2.8307, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 4.810301903509909e-05, + "loss": 2.7951, + "step": 1245 + }, + { + "epoch": 0.63, + "learning_rate": 4.8087924995598125e-05, + "loss": 2.8456, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 4.807277353045106e-05, + "loss": 2.3564, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 4.8057564677343524e-05, + "loss": 2.5076, + "step": 1260 + 
}, + { + "epoch": 0.63, + "learning_rate": 4.8042298474103884e-05, + "loss": 2.605, + "step": 1265 + }, + { + "epoch": 0.64, + "learning_rate": 4.8026974958703116e-05, + "loss": 2.4782, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 4.8011594169254784e-05, + "loss": 2.7193, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 4.799615614401488e-05, + "loss": 2.8284, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 4.798066092138178e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.65, + "learning_rate": 4.796510853989612e-05, + "loss": 2.7396, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 4.794949903824069e-05, + "loss": 2.7948, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 4.793383245524035e-05, + "loss": 2.7818, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 4.791810882986197e-05, + "loss": 2.7334, + "step": 1305 + }, + { + "epoch": 0.66, + "learning_rate": 4.7902328201214256e-05, + "loss": 2.4824, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 4.7886490608547727e-05, + "loss": 2.6131, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 4.7870596091254584e-05, + "loss": 2.7778, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 4.7854644688868594e-05, + "loss": 2.5263, + "step": 1325 + }, + { + "epoch": 0.67, + "learning_rate": 4.783863644106502e-05, + "loss": 2.7825, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 4.782257138766053e-05, + "loss": 2.7902, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 4.7806449568613066e-05, + "loss": 2.8333, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 4.779027102402177e-05, + "loss": 2.856, + "step": 1345 + }, + { + "epoch": 0.68, + "learning_rate": 4.777403579412686e-05, + "loss": 2.8021, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 4.775774391930956e-05, + "loss": 2.6639, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741395440091976e-05, + 
"loss": 2.5226, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.772499039713702e-05, + "loss": 2.6803, + "step": 1365 + }, + { + "epoch": 0.69, + "learning_rate": 4.7708528831248274e-05, + "loss": 2.4608, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.769201078336991e-05, + "loss": 2.786, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.7675436294586586e-05, + "loss": 2.8294, + "step": 1380 + }, + { + "epoch": 0.7, + "learning_rate": 4.7658805406123356e-05, + "loss": 2.6776, + "step": 1385 + }, + { + "epoch": 0.7, + "learning_rate": 4.7642118159345544e-05, + "loss": 2.8003, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.762537459575865e-05, + "loss": 2.7796, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.7608574757008245e-05, + "loss": 2.9156, + "step": 1400 + }, + { + "epoch": 0.71, + "learning_rate": 4.7591718684879883e-05, + "loss": 3.0521, + "step": 1405 + }, + { + "epoch": 0.71, + "learning_rate": 4.7574806421298976e-05, + "loss": 2.6469, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.755783800833071e-05, + "loss": 2.8512, + "step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.754081348817991e-05, + "loss": 2.66, + "step": 1420 + }, + { + "epoch": 0.72, + "learning_rate": 4.752373290319096e-05, + "loss": 2.6625, + "step": 1425 + }, + { + "epoch": 0.72, + "learning_rate": 4.7506596295847716e-05, + "loss": 2.6711, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 4.7489403708773346e-05, + "loss": 2.8951, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 4.747215518473026e-05, + "loss": 2.7375, + "step": 1440 + }, + { + "epoch": 0.73, + "learning_rate": 4.745485076662e-05, + "loss": 2.8037, + "step": 1445 + }, + { + "epoch": 0.73, + "learning_rate": 4.743749049748315e-05, + "loss": 2.5375, + "step": 1450 + }, + { + "epoch": 0.73, + "learning_rate": 4.742007442049918e-05, + "loss": 2.7664, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 
4.7402602578986374e-05, + "loss": 2.9644, + "step": 1460 + }, + { + "epoch": 0.74, + "learning_rate": 4.738507501640175e-05, + "loss": 2.8212, + "step": 1465 + }, + { + "epoch": 0.74, + "learning_rate": 4.736749177634087e-05, + "loss": 2.7201, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 4.734985290253782e-05, + "loss": 2.7087, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 4.7332158438865035e-05, + "loss": 2.5502, + "step": 1480 + }, + { + "epoch": 0.75, + "learning_rate": 4.731440842933322e-05, + "loss": 2.7607, + "step": 1485 + }, + { + "epoch": 0.75, + "learning_rate": 4.729660291809126e-05, + "loss": 2.5601, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 4.727874194942606e-05, + "loss": 2.5562, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 4.7260825567762486e-05, + "loss": 2.5539, + "step": 1500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7242853817663204e-05, + "loss": 2.6405, + "step": 1505 + }, + { + "epoch": 0.76, + "learning_rate": 4.72248267438286e-05, + "loss": 2.9237, + "step": 1510 + }, + { + "epoch": 0.76, + "learning_rate": 4.72067443910967e-05, + "loss": 3.0453, + "step": 1515 + }, + { + "epoch": 0.76, + "learning_rate": 4.718860680444297e-05, + "loss": 2.7176, + "step": 1520 + }, + { + "epoch": 0.77, + "learning_rate": 4.71704140289803e-05, + "loss": 2.6713, + "step": 1525 + }, + { + "epoch": 0.77, + "learning_rate": 4.715216610995883e-05, + "loss": 2.7284, + "step": 1530 + }, + { + "epoch": 0.77, + "learning_rate": 4.713386309276585e-05, + "loss": 2.5022, + "step": 1535 + }, + { + "epoch": 0.77, + "learning_rate": 4.7115505022925706e-05, + "loss": 2.8323, + "step": 1540 + }, + { + "epoch": 0.78, + "learning_rate": 4.7097091946099666e-05, + "loss": 2.7005, + "step": 1545 + }, + { + "epoch": 0.78, + "learning_rate": 4.7078623908085825e-05, + "loss": 2.8142, + "step": 1550 + }, + { + "epoch": 0.78, + "learning_rate": 4.7060100954818974e-05, + "loss": 2.8505, + "step": 1555 + }, + { + "epoch": 
0.78, + "learning_rate": 4.70415231323705e-05, + "loss": 2.5892, + "step": 1560 + }, + { + "epoch": 0.79, + "learning_rate": 4.7022890486948236e-05, + "loss": 2.8951, + "step": 1565 + }, + { + "epoch": 0.79, + "learning_rate": 4.700420306489641e-05, + "loss": 2.7551, + "step": 1570 + }, + { + "epoch": 0.79, + "learning_rate": 4.698546091269547e-05, + "loss": 2.6814, + "step": 1575 + }, + { + "epoch": 0.79, + "learning_rate": 4.696666407696201e-05, + "loss": 2.8698, + "step": 1580 + }, + { + "epoch": 0.8, + "learning_rate": 4.694781260444862e-05, + "loss": 2.8389, + "step": 1585 + }, + { + "epoch": 0.8, + "learning_rate": 4.6928906542043786e-05, + "loss": 2.8353, + "step": 1590 + }, + { + "epoch": 0.8, + "learning_rate": 4.690994593677179e-05, + "loss": 2.6565, + "step": 1595 + }, + { + "epoch": 0.8, + "learning_rate": 4.689093083579256e-05, + "loss": 2.6958, + "step": 1600 + }, + { + "epoch": 0.81, + "learning_rate": 4.687186128640157e-05, + "loss": 2.6768, + "step": 1605 + }, + { + "epoch": 0.81, + "learning_rate": 4.685273733602975e-05, + "loss": 2.734, + "step": 1610 + }, + { + "epoch": 0.81, + "learning_rate": 4.6833559032243284e-05, + "loss": 2.5348, + "step": 1615 + }, + { + "epoch": 0.81, + "learning_rate": 4.6814326422743594e-05, + "loss": 2.7775, + "step": 1620 + }, + { + "epoch": 0.82, + "learning_rate": 4.679503955536715e-05, + "loss": 2.6578, + "step": 1625 + }, + { + "epoch": 0.82, + "learning_rate": 4.6775698478085393e-05, + "loss": 2.8792, + "step": 1630 + }, + { + "epoch": 0.82, + "learning_rate": 4.675630323900458e-05, + "loss": 2.7883, + "step": 1635 + }, + { + "epoch": 0.82, + "learning_rate": 4.67368538863657e-05, + "loss": 2.8824, + "step": 1640 + }, + { + "epoch": 0.83, + "learning_rate": 4.671735046854433e-05, + "loss": 2.6775, + "step": 1645 + }, + { + "epoch": 0.83, + "learning_rate": 4.669779303405051e-05, + "loss": 2.5658, + "step": 1650 + }, + { + "epoch": 0.83, + "learning_rate": 4.667818163152864e-05, + "loss": 2.5262, + "step": 1655 + 
}, + { + "epoch": 0.83, + "learning_rate": 4.665851630975736e-05, + "loss": 2.5749, + "step": 1660 + }, + { + "epoch": 0.84, + "learning_rate": 4.6638797117649424e-05, + "loss": 2.8718, + "step": 1665 + }, + { + "epoch": 0.84, + "learning_rate": 4.661902410425155e-05, + "loss": 2.8039, + "step": 1670 + }, + { + "epoch": 0.84, + "learning_rate": 4.659919731874435e-05, + "loss": 2.8089, + "step": 1675 + }, + { + "epoch": 0.84, + "learning_rate": 4.6579316810442174e-05, + "loss": 2.8144, + "step": 1680 + }, + { + "epoch": 0.85, + "learning_rate": 4.6559382628792995e-05, + "loss": 2.5963, + "step": 1685 + }, + { + "epoch": 0.85, + "learning_rate": 4.653939482337828e-05, + "loss": 2.5755, + "step": 1690 + }, + { + "epoch": 0.85, + "learning_rate": 4.651935344391286e-05, + "loss": 2.7631, + "step": 1695 + }, + { + "epoch": 0.85, + "learning_rate": 4.649925854024486e-05, + "loss": 2.8117, + "step": 1700 + }, + { + "epoch": 0.86, + "learning_rate": 4.647911016235549e-05, + "loss": 2.7352, + "step": 1705 + }, + { + "epoch": 0.86, + "learning_rate": 4.6458908360358985e-05, + "loss": 2.5572, + "step": 1710 + }, + { + "epoch": 0.86, + "learning_rate": 4.643865318450246e-05, + "loss": 2.8372, + "step": 1715 + }, + { + "epoch": 0.86, + "learning_rate": 4.6418344685165774e-05, + "loss": 2.7892, + "step": 1720 + }, + { + "epoch": 0.87, + "learning_rate": 4.639798291286143e-05, + "loss": 2.7832, + "step": 1725 + }, + { + "epoch": 0.87, + "learning_rate": 4.637756791823442e-05, + "loss": 2.5401, + "step": 1730 + }, + { + "epoch": 0.87, + "learning_rate": 4.635709975206213e-05, + "loss": 2.9286, + "step": 1735 + }, + { + "epoch": 0.87, + "learning_rate": 4.633657846525417e-05, + "loss": 2.6474, + "step": 1740 + }, + { + "epoch": 0.88, + "learning_rate": 4.6316004108852305e-05, + "loss": 2.5047, + "step": 1745 + }, + { + "epoch": 0.88, + "learning_rate": 4.629537673403029e-05, + "loss": 2.8374, + "step": 1750 + }, + { + "epoch": 0.88, + "learning_rate": 4.627469639209373e-05, + 
"loss": 2.634, + "step": 1755 + }, + { + "epoch": 0.88, + "learning_rate": 4.6253963134480006e-05, + "loss": 2.6884, + "step": 1760 + }, + { + "epoch": 0.89, + "learning_rate": 4.623317701275809e-05, + "loss": 2.6334, + "step": 1765 + }, + { + "epoch": 0.89, + "learning_rate": 4.621233807862844e-05, + "loss": 2.764, + "step": 1770 + }, + { + "epoch": 0.89, + "learning_rate": 4.6191446383922886e-05, + "loss": 3.0864, + "step": 1775 + }, + { + "epoch": 0.89, + "learning_rate": 4.617050198060448e-05, + "loss": 2.7964, + "step": 1780 + }, + { + "epoch": 0.9, + "learning_rate": 4.6149504920767376e-05, + "loss": 2.4538, + "step": 1785 + }, + { + "epoch": 0.9, + "learning_rate": 4.6128455256636706e-05, + "loss": 2.7352, + "step": 1790 + }, + { + "epoch": 0.9, + "learning_rate": 4.6107353040568416e-05, + "loss": 2.6893, + "step": 1795 + }, + { + "epoch": 0.9, + "learning_rate": 4.6086198325049185e-05, + "loss": 2.8609, + "step": 1800 + }, + { + "epoch": 0.91, + "learning_rate": 4.6064991162696275e-05, + "loss": 2.5285, + "step": 1805 + }, + { + "epoch": 0.91, + "learning_rate": 4.604373160625739e-05, + "loss": 2.5707, + "step": 1810 + }, + { + "epoch": 0.91, + "learning_rate": 4.602241970861053e-05, + "loss": 2.7198, + "step": 1815 + }, + { + "epoch": 0.91, + "learning_rate": 4.6001055522763926e-05, + "loss": 2.855, + "step": 1820 + }, + { + "epoch": 0.92, + "learning_rate": 4.597963910185582e-05, + "loss": 2.4175, + "step": 1825 + }, + { + "epoch": 0.92, + "learning_rate": 4.595817049915441e-05, + "loss": 2.5444, + "step": 1830 + }, + { + "epoch": 0.92, + "learning_rate": 4.5936649768057646e-05, + "loss": 2.6893, + "step": 1835 + }, + { + "epoch": 0.92, + "learning_rate": 4.591507696209318e-05, + "loss": 2.6725, + "step": 1840 + }, + { + "epoch": 0.93, + "learning_rate": 4.589345213491817e-05, + "loss": 2.834, + "step": 1845 + }, + { + "epoch": 0.93, + "learning_rate": 4.587177534031914e-05, + "loss": 2.8259, + "step": 1850 + }, + { + "epoch": 0.93, + "learning_rate": 
4.585004663221188e-05, + "loss": 2.6146, + "step": 1855 + }, + { + "epoch": 0.93, + "learning_rate": 4.582826606464134e-05, + "loss": 2.4673, + "step": 1860 + }, + { + "epoch": 0.94, + "learning_rate": 4.5806433691781416e-05, + "loss": 2.9178, + "step": 1865 + }, + { + "epoch": 0.94, + "learning_rate": 4.578454956793487e-05, + "loss": 2.9224, + "step": 1870 + }, + { + "epoch": 0.94, + "learning_rate": 4.576261374753318e-05, + "loss": 2.7987, + "step": 1875 + }, + { + "epoch": 0.94, + "learning_rate": 4.574062628513642e-05, + "loss": 2.3848, + "step": 1880 + }, + { + "epoch": 0.95, + "learning_rate": 4.57185872354331e-05, + "loss": 2.6054, + "step": 1885 + }, + { + "epoch": 0.95, + "learning_rate": 4.569649665324003e-05, + "loss": 2.6743, + "step": 1890 + }, + { + "epoch": 0.95, + "learning_rate": 4.567435459350222e-05, + "loss": 2.9375, + "step": 1895 + }, + { + "epoch": 0.95, + "learning_rate": 4.565216111129269e-05, + "loss": 2.9372, + "step": 1900 + }, + { + "epoch": 0.96, + "learning_rate": 4.562991626181239e-05, + "loss": 2.669, + "step": 1905 + }, + { + "epoch": 0.96, + "learning_rate": 4.560762010039001e-05, + "loss": 2.7644, + "step": 1910 + }, + { + "epoch": 0.96, + "learning_rate": 4.558527268248187e-05, + "loss": 2.6886, + "step": 1915 + }, + { + "epoch": 0.96, + "learning_rate": 4.55628740636718e-05, + "loss": 2.8618, + "step": 1920 + }, + { + "epoch": 0.97, + "learning_rate": 4.554042429967095e-05, + "loss": 2.8411, + "step": 1925 + }, + { + "epoch": 0.97, + "learning_rate": 4.55179234463177e-05, + "loss": 2.6047, + "step": 1930 + }, + { + "epoch": 0.97, + "learning_rate": 4.5495371559577496e-05, + "loss": 2.8675, + "step": 1935 + }, + { + "epoch": 0.97, + "learning_rate": 4.547276869554271e-05, + "loss": 2.751, + "step": 1940 + }, + { + "epoch": 0.98, + "learning_rate": 4.545011491043253e-05, + "loss": 2.8638, + "step": 1945 + }, + { + "epoch": 0.98, + "learning_rate": 4.5427410260592775e-05, + "loss": 2.5092, + "step": 1950 + }, + { + "epoch": 0.98, 
+ "learning_rate": 4.540465480249579e-05, + "loss": 2.5059, + "step": 1955 + }, + { + "epoch": 0.98, + "learning_rate": 4.5381848592740285e-05, + "loss": 2.9433, + "step": 1960 + }, + { + "epoch": 0.99, + "learning_rate": 4.535899168805121e-05, + "loss": 2.6959, + "step": 1965 + }, + { + "epoch": 0.99, + "learning_rate": 4.533608414527961e-05, + "loss": 2.552, + "step": 1970 + }, + { + "epoch": 0.99, + "learning_rate": 4.5313126021402465e-05, + "loss": 2.7169, + "step": 1975 + }, + { + "epoch": 0.99, + "learning_rate": 4.529011737352258e-05, + "loss": 2.5672, + "step": 1980 + }, + { + "epoch": 1.0, + "learning_rate": 4.526705825886841e-05, + "loss": 2.6434, + "step": 1985 + }, + { + "epoch": 1.0, + "learning_rate": 4.5243948734793947e-05, + "loss": 2.8062, + "step": 1990 + }, + { + "epoch": 1.0, + "learning_rate": 4.5220788858778556e-05, + "loss": 2.6929, + "step": 1995 + }, + { + "epoch": 1.0, + "learning_rate": 4.519757868842684e-05, + "loss": 2.5837, + "step": 2000 + }, + { + "epoch": 1.01, + "learning_rate": 4.517431828146852e-05, + "loss": 2.821, + "step": 2005 + }, + { + "epoch": 1.01, + "learning_rate": 4.515100769575824e-05, + "loss": 2.7913, + "step": 2010 + }, + { + "epoch": 1.01, + "learning_rate": 4.512764698927545e-05, + "loss": 2.699, + "step": 2015 + }, + { + "epoch": 1.01, + "learning_rate": 4.5104236220124286e-05, + "loss": 2.6574, + "step": 2020 + }, + { + "epoch": 1.02, + "learning_rate": 4.508077544653338e-05, + "loss": 2.5909, + "step": 2025 + }, + { + "epoch": 1.02, + "learning_rate": 4.5057264726855765e-05, + "loss": 2.5695, + "step": 2030 + }, + { + "epoch": 1.02, + "learning_rate": 4.5033704119568675e-05, + "loss": 2.4937, + "step": 2035 + }, + { + "epoch": 1.02, + "learning_rate": 4.501009368327344e-05, + "loss": 2.7253, + "step": 2040 + }, + { + "epoch": 1.03, + "learning_rate": 4.4986433476695334e-05, + "loss": 2.8681, + "step": 2045 + }, + { + "epoch": 1.03, + "learning_rate": 4.496272355868341e-05, + "loss": 2.74, + "step": 2050 + }, + 
{ + "epoch": 1.03, + "learning_rate": 4.4938963988210365e-05, + "loss": 2.7439, + "step": 2055 + }, + { + "epoch": 1.03, + "learning_rate": 4.491515482437242e-05, + "loss": 2.9539, + "step": 2060 + }, + { + "epoch": 1.04, + "learning_rate": 4.4891296126389104e-05, + "loss": 2.8165, + "step": 2065 + }, + { + "epoch": 1.04, + "learning_rate": 4.48673879536032e-05, + "loss": 2.601, + "step": 2070 + }, + { + "epoch": 1.04, + "learning_rate": 4.484343036548051e-05, + "loss": 2.5189, + "step": 2075 + }, + { + "epoch": 1.04, + "learning_rate": 4.481942342160976e-05, + "loss": 2.6276, + "step": 2080 + }, + { + "epoch": 1.05, + "learning_rate": 4.479536718170243e-05, + "loss": 2.5027, + "step": 2085 + }, + { + "epoch": 1.05, + "learning_rate": 4.477126170559262e-05, + "loss": 2.8654, + "step": 2090 + }, + { + "epoch": 1.05, + "learning_rate": 4.474710705323688e-05, + "loss": 2.8511, + "step": 2095 + }, + { + "epoch": 1.05, + "learning_rate": 4.47229032847141e-05, + "loss": 2.6129, + "step": 2100 + }, + { + "epoch": 1.06, + "learning_rate": 4.469865046022531e-05, + "loss": 2.8964, + "step": 2105 + }, + { + "epoch": 1.06, + "learning_rate": 4.4674348640093554e-05, + "loss": 2.7502, + "step": 2110 + }, + { + "epoch": 1.06, + "learning_rate": 4.4649997884763765e-05, + "loss": 2.591, + "step": 2115 + }, + { + "epoch": 1.06, + "learning_rate": 4.462559825480257e-05, + "loss": 2.7982, + "step": 2120 + }, + { + "epoch": 1.07, + "learning_rate": 4.460114981089815e-05, + "loss": 2.576, + "step": 2125 + }, + { + "epoch": 1.07, + "learning_rate": 4.457665261386014e-05, + "loss": 2.692, + "step": 2130 + }, + { + "epoch": 1.07, + "learning_rate": 4.455210672461938e-05, + "loss": 2.5818, + "step": 2135 + }, + { + "epoch": 1.07, + "learning_rate": 4.452751220422787e-05, + "loss": 2.6792, + "step": 2140 + }, + { + "epoch": 1.08, + "learning_rate": 4.450286911385856e-05, + "loss": 2.8352, + "step": 2145 + }, + { + "epoch": 1.08, + "learning_rate": 4.4478177514805166e-05, + "loss": 2.5787, + 
"step": 2150 + }, + { + "epoch": 1.08, + "learning_rate": 4.4453437468482103e-05, + "loss": 2.655, + "step": 2155 + }, + { + "epoch": 1.08, + "learning_rate": 4.442864903642428e-05, + "loss": 2.7664, + "step": 2160 + }, + { + "epoch": 1.09, + "learning_rate": 4.440381228028692e-05, + "loss": 2.7231, + "step": 2165 + }, + { + "epoch": 1.09, + "learning_rate": 4.437892726184548e-05, + "loss": 2.416, + "step": 2170 + }, + { + "epoch": 1.09, + "learning_rate": 4.4353994042995446e-05, + "loss": 2.4619, + "step": 2175 + }, + { + "epoch": 1.09, + "learning_rate": 4.4329012685752183e-05, + "loss": 2.7816, + "step": 2180 + }, + { + "epoch": 1.1, + "learning_rate": 4.430398325225078e-05, + "loss": 2.8678, + "step": 2185 + }, + { + "epoch": 1.1, + "learning_rate": 4.427890580474594e-05, + "loss": 2.6007, + "step": 2190 + }, + { + "epoch": 1.1, + "learning_rate": 4.4253780405611754e-05, + "loss": 2.8195, + "step": 2195 + }, + { + "epoch": 1.1, + "learning_rate": 4.42286071173416e-05, + "loss": 2.5117, + "step": 2200 + }, + { + "epoch": 1.11, + "learning_rate": 4.4203386002547956e-05, + "loss": 2.5527, + "step": 2205 + }, + { + "epoch": 1.11, + "learning_rate": 4.417811712396226e-05, + "loss": 2.662, + "step": 2210 + }, + { + "epoch": 1.11, + "learning_rate": 4.415280054443477e-05, + "loss": 2.7563, + "step": 2215 + }, + { + "epoch": 1.11, + "learning_rate": 4.4127436326934354e-05, + "loss": 2.5118, + "step": 2220 + }, + { + "epoch": 1.12, + "learning_rate": 4.41020245345484e-05, + "loss": 2.7208, + "step": 2225 + }, + { + "epoch": 1.12, + "learning_rate": 4.4076565230482607e-05, + "loss": 2.649, + "step": 2230 + }, + { + "epoch": 1.12, + "learning_rate": 4.4051058478060856e-05, + "loss": 2.652, + "step": 2235 + }, + { + "epoch": 1.12, + "learning_rate": 4.402550434072505e-05, + "loss": 2.6885, + "step": 2240 + }, + { + "epoch": 1.13, + "learning_rate": 4.3999902882034935e-05, + "loss": 2.8545, + "step": 2245 + }, + { + "epoch": 1.13, + "learning_rate": 4.397425416566797e-05, + 
"loss": 2.9435, + "step": 2250 + }, + { + "epoch": 1.13, + "learning_rate": 4.3948558255419146e-05, + "loss": 2.6555, + "step": 2255 + }, + { + "epoch": 1.13, + "learning_rate": 4.392281521520085e-05, + "loss": 2.6108, + "step": 2260 + }, + { + "epoch": 1.14, + "learning_rate": 4.389702510904269e-05, + "loss": 2.6256, + "step": 2265 + }, + { + "epoch": 1.14, + "learning_rate": 4.387118800109133e-05, + "loss": 2.7996, + "step": 2270 + }, + { + "epoch": 1.14, + "learning_rate": 4.384530395561035e-05, + "loss": 2.6649, + "step": 2275 + }, + { + "epoch": 1.14, + "learning_rate": 4.381937303698006e-05, + "loss": 3.0073, + "step": 2280 + }, + { + "epoch": 1.15, + "learning_rate": 4.379339530969738e-05, + "loss": 2.8493, + "step": 2285 + }, + { + "epoch": 1.15, + "learning_rate": 4.3767370838375635e-05, + "loss": 2.5572, + "step": 2290 + }, + { + "epoch": 1.15, + "learning_rate": 4.374129968774443e-05, + "loss": 2.3837, + "step": 2295 + }, + { + "epoch": 1.15, + "learning_rate": 4.371518192264946e-05, + "loss": 2.4604, + "step": 2300 + }, + { + "epoch": 1.16, + "learning_rate": 4.3689017608052374e-05, + "loss": 2.8866, + "step": 2305 + }, + { + "epoch": 1.16, + "learning_rate": 4.3662806809030585e-05, + "loss": 2.8943, + "step": 2310 + }, + { + "epoch": 1.16, + "learning_rate": 4.3636549590777144e-05, + "loss": 2.7629, + "step": 2315 + }, + { + "epoch": 1.16, + "learning_rate": 4.361024601860054e-05, + "loss": 2.8629, + "step": 2320 + }, + { + "epoch": 1.17, + "learning_rate": 4.3583896157924574e-05, + "loss": 2.7952, + "step": 2325 + }, + { + "epoch": 1.17, + "learning_rate": 4.355750007428817e-05, + "loss": 2.2057, + "step": 2330 + }, + { + "epoch": 1.17, + "learning_rate": 4.3531057833345216e-05, + "loss": 2.9143, + "step": 2335 + }, + { + "epoch": 1.17, + "learning_rate": 4.3504569500864424e-05, + "loss": 2.5354, + "step": 2340 + }, + { + "epoch": 1.18, + "learning_rate": 4.347803514272911e-05, + "loss": 2.7451, + "step": 2345 + }, + { + "epoch": 1.18, + 
"learning_rate": 4.3451454824937113e-05, + "loss": 3.0827, + "step": 2350 + }, + { + "epoch": 1.18, + "learning_rate": 4.3424828613600555e-05, + "loss": 2.5842, + "step": 2355 + }, + { + "epoch": 1.18, + "learning_rate": 4.339815657494572e-05, + "loss": 2.4492, + "step": 2360 + }, + { + "epoch": 1.19, + "learning_rate": 4.3371438775312865e-05, + "loss": 2.5067, + "step": 2365 + }, + { + "epoch": 1.19, + "learning_rate": 4.334467528115608e-05, + "loss": 2.5902, + "step": 2370 + }, + { + "epoch": 1.19, + "learning_rate": 4.33178661590431e-05, + "loss": 2.8618, + "step": 2375 + }, + { + "epoch": 1.19, + "learning_rate": 4.329101147565515e-05, + "loss": 2.6831, + "step": 2380 + }, + { + "epoch": 1.2, + "learning_rate": 4.3264111297786794e-05, + "loss": 2.7531, + "step": 2385 + }, + { + "epoch": 1.2, + "learning_rate": 4.323716569234572e-05, + "loss": 2.819, + "step": 2390 + }, + { + "epoch": 1.2, + "learning_rate": 4.321017472635263e-05, + "loss": 2.6319, + "step": 2395 + }, + { + "epoch": 1.2, + "learning_rate": 4.318313846694105e-05, + "loss": 2.3078, + "step": 2400 + }, + { + "epoch": 1.21, + "learning_rate": 4.315605698135714e-05, + "loss": 2.7962, + "step": 2405 + }, + { + "epoch": 1.21, + "learning_rate": 4.312893033695958e-05, + "loss": 2.8479, + "step": 2410 + }, + { + "epoch": 1.21, + "learning_rate": 4.310175860121936e-05, + "loss": 2.6289, + "step": 2415 + }, + { + "epoch": 1.21, + "learning_rate": 4.30745418417196e-05, + "loss": 2.7003, + "step": 2420 + }, + { + "epoch": 1.22, + "learning_rate": 4.304728012615543e-05, + "loss": 2.4947, + "step": 2425 + }, + { + "epoch": 1.22, + "learning_rate": 4.3019973522333815e-05, + "loss": 2.6911, + "step": 2430 + }, + { + "epoch": 1.22, + "learning_rate": 4.2992622098173334e-05, + "loss": 2.6931, + "step": 2435 + }, + { + "epoch": 1.22, + "learning_rate": 4.296522592170406e-05, + "loss": 2.774, + "step": 2440 + }, + { + "epoch": 1.23, + "learning_rate": 4.293778506106737e-05, + "loss": 2.759, + "step": 2445 + }, + { + 
"epoch": 1.23, + "learning_rate": 4.29102995845158e-05, + "loss": 2.5543, + "step": 2450 + }, + { + "epoch": 1.23, + "learning_rate": 4.288276956041284e-05, + "loss": 2.7702, + "step": 2455 + }, + { + "epoch": 1.23, + "learning_rate": 4.285519505723278e-05, + "loss": 2.835, + "step": 2460 + }, + { + "epoch": 1.24, + "learning_rate": 4.282757614356055e-05, + "loss": 2.6516, + "step": 2465 + }, + { + "epoch": 1.24, + "learning_rate": 4.2799912888091544e-05, + "loss": 2.7392, + "step": 2470 + }, + { + "epoch": 1.24, + "learning_rate": 4.277220535963143e-05, + "loss": 2.6522, + "step": 2475 + }, + { + "epoch": 1.24, + "learning_rate": 4.274445362709601e-05, + "loss": 2.6514, + "step": 2480 + }, + { + "epoch": 1.25, + "learning_rate": 4.271665775951104e-05, + "loss": 2.7507, + "step": 2485 + }, + { + "epoch": 1.25, + "learning_rate": 4.2688817826012005e-05, + "loss": 3.0499, + "step": 2490 + }, + { + "epoch": 1.25, + "learning_rate": 4.2660933895844055e-05, + "loss": 2.6927, + "step": 2495 + }, + { + "epoch": 1.25, + "learning_rate": 4.2633006038361736e-05, + "loss": 2.8456, + "step": 2500 + }, + { + "epoch": 1.26, + "learning_rate": 4.2605034323028844e-05, + "loss": 2.671, + "step": 2505 + }, + { + "epoch": 1.26, + "learning_rate": 4.2577018819418296e-05, + "loss": 2.6543, + "step": 2510 + }, + { + "epoch": 1.26, + "learning_rate": 4.254895959721189e-05, + "loss": 2.566, + "step": 2515 + }, + { + "epoch": 1.26, + "learning_rate": 4.252085672620019e-05, + "loss": 2.4943, + "step": 2520 + }, + { + "epoch": 1.27, + "learning_rate": 4.249271027628228e-05, + "loss": 2.5115, + "step": 2525 + }, + { + "epoch": 1.27, + "learning_rate": 4.24645203174657e-05, + "loss": 2.3859, + "step": 2530 + }, + { + "epoch": 1.27, + "learning_rate": 4.243628691986617e-05, + "loss": 2.8148, + "step": 2535 + }, + { + "epoch": 1.27, + "learning_rate": 4.240801015370743e-05, + "loss": 2.5597, + "step": 2540 + }, + { + "epoch": 1.28, + "learning_rate": 4.2379690089321145e-05, + "loss": 2.805, + 
"step": 2545 + }, + { + "epoch": 1.28, + "learning_rate": 4.235132679714664e-05, + "loss": 2.7308, + "step": 2550 + }, + { + "epoch": 1.28, + "learning_rate": 4.232292034773076e-05, + "loss": 2.675, + "step": 2555 + }, + { + "epoch": 1.28, + "learning_rate": 4.2294470811727704e-05, + "loss": 2.6359, + "step": 2560 + }, + { + "epoch": 1.29, + "learning_rate": 4.226597825989883e-05, + "loss": 2.4288, + "step": 2565 + }, + { + "epoch": 1.29, + "learning_rate": 4.223744276311249e-05, + "loss": 2.642, + "step": 2570 + }, + { + "epoch": 1.29, + "learning_rate": 4.220886439234385e-05, + "loss": 2.7375, + "step": 2575 + }, + { + "epoch": 1.29, + "learning_rate": 4.218024321867472e-05, + "loss": 2.6114, + "step": 2580 + }, + { + "epoch": 1.3, + "learning_rate": 4.2151579313293364e-05, + "loss": 2.4941, + "step": 2585 + }, + { + "epoch": 1.3, + "learning_rate": 4.212287274749434e-05, + "loss": 2.6086, + "step": 2590 + }, + { + "epoch": 1.3, + "learning_rate": 4.2094123592678295e-05, + "loss": 2.7854, + "step": 2595 + }, + { + "epoch": 1.3, + "learning_rate": 4.206533192035184e-05, + "loss": 2.8291, + "step": 2600 + }, + { + "epoch": 1.31, + "learning_rate": 4.2036497802127294e-05, + "loss": 2.6846, + "step": 2605 + }, + { + "epoch": 1.31, + "learning_rate": 4.200762130972259e-05, + "loss": 2.6667, + "step": 2610 + }, + { + "epoch": 1.31, + "learning_rate": 4.197870251496103e-05, + "loss": 2.7895, + "step": 2615 + }, + { + "epoch": 1.31, + "learning_rate": 4.1949741489771155e-05, + "loss": 2.6958, + "step": 2620 + }, + { + "epoch": 1.32, + "learning_rate": 4.192073830618652e-05, + "loss": 2.8536, + "step": 2625 + }, + { + "epoch": 1.32, + "learning_rate": 4.189169303634555e-05, + "loss": 2.5218, + "step": 2630 + }, + { + "epoch": 1.32, + "learning_rate": 4.186260575249136e-05, + "loss": 2.6141, + "step": 2635 + }, + { + "epoch": 1.32, + "learning_rate": 4.1833476526971546e-05, + "loss": 2.6231, + "step": 2640 + }, + { + "epoch": 1.33, + "learning_rate": 
4.1804305432238036e-05, + "loss": 2.8229, + "step": 2645 + }, + { + "epoch": 1.33, + "learning_rate": 4.1775092540846885e-05, + "loss": 2.4763, + "step": 2650 + }, + { + "epoch": 1.33, + "learning_rate": 4.174583792545813e-05, + "loss": 2.5162, + "step": 2655 + }, + { + "epoch": 1.33, + "learning_rate": 4.1716541658835574e-05, + "loss": 2.7301, + "step": 2660 + }, + { + "epoch": 1.34, + "learning_rate": 4.16872038138466e-05, + "loss": 2.6055, + "step": 2665 + }, + { + "epoch": 1.34, + "learning_rate": 4.165782446346203e-05, + "loss": 2.6529, + "step": 2670 + }, + { + "epoch": 1.34, + "learning_rate": 4.162840368075591e-05, + "loss": 2.7934, + "step": 2675 + }, + { + "epoch": 1.34, + "learning_rate": 4.159894153890536e-05, + "loss": 2.9108, + "step": 2680 + }, + { + "epoch": 1.35, + "learning_rate": 4.1569438111190326e-05, + "loss": 2.6543, + "step": 2685 + }, + { + "epoch": 1.35, + "learning_rate": 4.1539893470993496e-05, + "loss": 2.6038, + "step": 2690 + }, + { + "epoch": 1.35, + "learning_rate": 4.151030769180002e-05, + "loss": 2.5741, + "step": 2695 + }, + { + "epoch": 1.35, + "learning_rate": 4.14806808471974e-05, + "loss": 2.8525, + "step": 2700 + }, + { + "epoch": 1.36, + "learning_rate": 4.1451013010875275e-05, + "loss": 2.5151, + "step": 2705 + }, + { + "epoch": 1.36, + "learning_rate": 4.1421304256625206e-05, + "loss": 2.5666, + "step": 2710 + }, + { + "epoch": 1.36, + "learning_rate": 4.139155465834058e-05, + "loss": 2.7287, + "step": 2715 + }, + { + "epoch": 1.36, + "learning_rate": 4.136176429001634e-05, + "loss": 2.8867, + "step": 2720 + }, + { + "epoch": 1.37, + "learning_rate": 4.133193322574885e-05, + "loss": 2.6398, + "step": 2725 + }, + { + "epoch": 1.37, + "learning_rate": 4.130206153973568e-05, + "loss": 2.699, + "step": 2730 + }, + { + "epoch": 1.37, + "learning_rate": 4.1272149306275447e-05, + "loss": 2.5248, + "step": 2735 + }, + { + "epoch": 1.37, + "learning_rate": 4.124219659976762e-05, + "loss": 2.8947, + "step": 2740 + }, + { + "epoch": 
1.38, + "learning_rate": 4.1212203494712344e-05, + "loss": 2.6473, + "step": 2745 + }, + { + "epoch": 1.38, + "learning_rate": 4.1182170065710226e-05, + "loss": 2.3734, + "step": 2750 + }, + { + "epoch": 1.38, + "learning_rate": 4.115209638746218e-05, + "loss": 2.685, + "step": 2755 + }, + { + "epoch": 1.39, + "learning_rate": 4.1121982534769224e-05, + "loss": 2.9712, + "step": 2760 + }, + { + "epoch": 1.39, + "learning_rate": 4.109182858253231e-05, + "loss": 2.6578, + "step": 2765 + }, + { + "epoch": 1.39, + "learning_rate": 4.106163460575212e-05, + "loss": 2.6842, + "step": 2770 + }, + { + "epoch": 1.39, + "learning_rate": 4.10314006795289e-05, + "loss": 2.511, + "step": 2775 + }, + { + "epoch": 1.4, + "learning_rate": 4.100112687906224e-05, + "loss": 2.8426, + "step": 2780 + }, + { + "epoch": 1.4, + "learning_rate": 4.097081327965092e-05, + "loss": 2.8462, + "step": 2785 + }, + { + "epoch": 1.4, + "learning_rate": 4.094045995669271e-05, + "loss": 2.7552, + "step": 2790 + }, + { + "epoch": 1.4, + "learning_rate": 4.091006698568419e-05, + "loss": 2.3802, + "step": 2795 + }, + { + "epoch": 1.41, + "learning_rate": 4.087963444222054e-05, + "loss": 2.3967, + "step": 2800 + }, + { + "epoch": 1.41, + "learning_rate": 4.084916240199537e-05, + "loss": 2.7945, + "step": 2805 + }, + { + "epoch": 1.41, + "learning_rate": 4.0818650940800524e-05, + "loss": 2.5304, + "step": 2810 + }, + { + "epoch": 1.41, + "learning_rate": 4.0788100134525925e-05, + "loss": 2.6779, + "step": 2815 + }, + { + "epoch": 1.42, + "learning_rate": 4.075751005915932e-05, + "loss": 2.6202, + "step": 2820 + }, + { + "epoch": 1.42, + "learning_rate": 4.072688079078616e-05, + "loss": 2.5915, + "step": 2825 + }, + { + "epoch": 1.42, + "learning_rate": 4.069621240558935e-05, + "loss": 2.5139, + "step": 2830 + }, + { + "epoch": 1.42, + "learning_rate": 4.066550497984911e-05, + "loss": 2.5506, + "step": 2835 + }, + { + "epoch": 1.43, + "learning_rate": 4.063475858994276e-05, + "loss": 2.7154, + "step": 2840 + 
}, + { + "epoch": 1.43, + "learning_rate": 4.060397331234452e-05, + "loss": 2.6309, + "step": 2845 + }, + { + "epoch": 1.43, + "learning_rate": 4.0573149223625365e-05, + "loss": 2.6922, + "step": 2850 + }, + { + "epoch": 1.43, + "learning_rate": 4.054228640045276e-05, + "loss": 2.703, + "step": 2855 + }, + { + "epoch": 1.44, + "learning_rate": 4.051138491959053e-05, + "loss": 2.3475, + "step": 2860 + }, + { + "epoch": 1.44, + "learning_rate": 4.048044485789869e-05, + "loss": 2.7186, + "step": 2865 + }, + { + "epoch": 1.44, + "learning_rate": 4.044946629233316e-05, + "loss": 2.4667, + "step": 2870 + }, + { + "epoch": 1.44, + "learning_rate": 4.041844929994566e-05, + "loss": 2.562, + "step": 2875 + }, + { + "epoch": 1.45, + "learning_rate": 4.038739395788347e-05, + "loss": 2.4116, + "step": 2880 + }, + { + "epoch": 1.45, + "learning_rate": 4.0356300343389276e-05, + "loss": 2.6133, + "step": 2885 + }, + { + "epoch": 1.45, + "learning_rate": 4.032516853380094e-05, + "loss": 2.6893, + "step": 2890 + }, + { + "epoch": 1.45, + "learning_rate": 4.029399860655132e-05, + "loss": 2.8196, + "step": 2895 + }, + { + "epoch": 1.46, + "learning_rate": 4.026279063916811e-05, + "loss": 2.8933, + "step": 2900 + }, + { + "epoch": 1.46, + "learning_rate": 4.023154470927361e-05, + "loss": 3.0289, + "step": 2905 + }, + { + "epoch": 1.46, + "learning_rate": 4.0200260894584516e-05, + "loss": 2.5586, + "step": 2910 + }, + { + "epoch": 1.46, + "learning_rate": 4.016893927291179e-05, + "loss": 2.785, + "step": 2915 + }, + { + "epoch": 1.47, + "learning_rate": 4.0137579922160395e-05, + "loss": 2.5293, + "step": 2920 + }, + { + "epoch": 1.47, + "learning_rate": 4.010618292032917e-05, + "loss": 2.6696, + "step": 2925 + }, + { + "epoch": 1.47, + "learning_rate": 4.007474834551058e-05, + "loss": 2.7003, + "step": 2930 + }, + { + "epoch": 1.47, + "learning_rate": 4.004327627589056e-05, + "loss": 2.6005, + "step": 2935 + }, + { + "epoch": 1.48, + "learning_rate": 4.001176678974828e-05, + "loss": 
2.7198, + "step": 2940 + }, + { + "epoch": 1.48, + "learning_rate": 3.998021996545599e-05, + "loss": 2.5329, + "step": 2945 + }, + { + "epoch": 1.48, + "learning_rate": 3.994863588147881e-05, + "loss": 2.7051, + "step": 2950 + }, + { + "epoch": 1.48, + "learning_rate": 3.9917014616374535e-05, + "loss": 2.9277, + "step": 2955 + }, + { + "epoch": 1.49, + "learning_rate": 3.988535624879344e-05, + "loss": 2.3729, + "step": 2960 + }, + { + "epoch": 1.49, + "learning_rate": 3.985366085747808e-05, + "loss": 2.5122, + "step": 2965 + }, + { + "epoch": 1.49, + "learning_rate": 3.982192852126309e-05, + "loss": 2.4413, + "step": 2970 + }, + { + "epoch": 1.49, + "learning_rate": 3.979015931907501e-05, + "loss": 2.6104, + "step": 2975 + }, + { + "epoch": 1.5, + "learning_rate": 3.975835332993207e-05, + "loss": 2.8855, + "step": 2980 + }, + { + "epoch": 1.5, + "learning_rate": 3.9726510632944e-05, + "loss": 2.7487, + "step": 2985 + }, + { + "epoch": 1.5, + "learning_rate": 3.969463130731183e-05, + "loss": 2.7085, + "step": 2990 + }, + { + "epoch": 1.5, + "learning_rate": 3.966271543232769e-05, + "loss": 2.6464, + "step": 2995 + }, + { + "epoch": 1.51, + "learning_rate": 3.9630763087374625e-05, + "loss": 2.4105, + "step": 3000 + }, + { + "epoch": 1.51, + "learning_rate": 3.9598774351926394e-05, + "loss": 2.7296, + "step": 3005 + }, + { + "epoch": 1.51, + "learning_rate": 3.956674930554725e-05, + "loss": 2.5319, + "step": 3010 + }, + { + "epoch": 1.51, + "learning_rate": 3.9534688027891785e-05, + "loss": 2.5544, + "step": 3015 + }, + { + "epoch": 1.52, + "learning_rate": 3.9502590598704696e-05, + "loss": 2.7981, + "step": 3020 + }, + { + "epoch": 1.52, + "learning_rate": 3.94704570978206e-05, + "loss": 2.6186, + "step": 3025 + }, + { + "epoch": 1.52, + "learning_rate": 3.943828760516382e-05, + "loss": 2.6443, + "step": 3030 + }, + { + "epoch": 1.52, + "learning_rate": 3.940608220074822e-05, + "loss": 2.4564, + "step": 3035 + }, + { + "epoch": 1.53, + "learning_rate": 
3.9373840964676976e-05, + "loss": 2.8394, + "step": 3040 + }, + { + "epoch": 1.53, + "learning_rate": 3.934156397714238e-05, + "loss": 2.3683, + "step": 3045 + }, + { + "epoch": 1.53, + "learning_rate": 3.930925131842567e-05, + "loss": 2.875, + "step": 3050 + }, + { + "epoch": 1.53, + "learning_rate": 3.9276903068896784e-05, + "loss": 2.6381, + "step": 3055 + }, + { + "epoch": 1.54, + "learning_rate": 3.9244519309014206e-05, + "loss": 2.8077, + "step": 3060 + }, + { + "epoch": 1.54, + "learning_rate": 3.9212100119324706e-05, + "loss": 2.6208, + "step": 3065 + }, + { + "epoch": 1.54, + "learning_rate": 3.917964558046322e-05, + "loss": 2.9, + "step": 3070 + }, + { + "epoch": 1.54, + "learning_rate": 3.9147155773152586e-05, + "loss": 2.5678, + "step": 3075 + }, + { + "epoch": 1.55, + "learning_rate": 3.911463077820336e-05, + "loss": 2.7059, + "step": 3080 + }, + { + "epoch": 1.55, + "learning_rate": 3.90820706765136e-05, + "loss": 2.7433, + "step": 3085 + }, + { + "epoch": 1.55, + "learning_rate": 3.9049475549068757e-05, + "loss": 2.5916, + "step": 3090 + }, + { + "epoch": 1.55, + "learning_rate": 3.901684547694132e-05, + "loss": 2.6978, + "step": 3095 + }, + { + "epoch": 1.56, + "learning_rate": 3.8984180541290724e-05, + "loss": 2.8709, + "step": 3100 + }, + { + "epoch": 1.56, + "learning_rate": 3.895148082336313e-05, + "loss": 2.8652, + "step": 3105 + }, + { + "epoch": 1.56, + "learning_rate": 3.89187464044912e-05, + "loss": 3.0331, + "step": 3110 + }, + { + "epoch": 1.56, + "learning_rate": 3.8885977366093905e-05, + "loss": 2.7334, + "step": 3115 + }, + { + "epoch": 1.57, + "learning_rate": 3.885317378967633e-05, + "loss": 2.6026, + "step": 3120 + }, + { + "epoch": 1.57, + "learning_rate": 3.882033575682945e-05, + "loss": 2.831, + "step": 3125 + }, + { + "epoch": 1.57, + "learning_rate": 3.878746334922996e-05, + "loss": 2.5585, + "step": 3130 + }, + { + "epoch": 1.57, + "learning_rate": 3.875455664864005e-05, + "loss": 2.762, + "step": 3135 + }, + { + "epoch": 
1.58, + "learning_rate": 3.8721615736907205e-05, + "loss": 2.769, + "step": 3140 + }, + { + "epoch": 1.58, + "learning_rate": 3.868864069596399e-05, + "loss": 2.6496, + "step": 3145 + }, + { + "epoch": 1.58, + "learning_rate": 3.8655631607827876e-05, + "loss": 2.486, + "step": 3150 + }, + { + "epoch": 1.58, + "learning_rate": 3.8622588554601e-05, + "loss": 2.8387, + "step": 3155 + }, + { + "epoch": 1.59, + "learning_rate": 3.858951161847001e-05, + "loss": 2.816, + "step": 3160 + }, + { + "epoch": 1.59, + "learning_rate": 3.855640088170579e-05, + "loss": 2.8128, + "step": 3165 + }, + { + "epoch": 1.59, + "learning_rate": 3.8523256426663313e-05, + "loss": 2.5875, + "step": 3170 + }, + { + "epoch": 1.59, + "learning_rate": 3.8490078335781423e-05, + "loss": 2.5853, + "step": 3175 + }, + { + "epoch": 1.6, + "learning_rate": 3.845686669158263e-05, + "loss": 2.7638, + "step": 3180 + }, + { + "epoch": 1.6, + "learning_rate": 3.842362157667287e-05, + "loss": 2.8281, + "step": 3185 + }, + { + "epoch": 1.6, + "learning_rate": 3.839700144139754e-05, + "loss": 2.3574, + "step": 3190 + }, + { + "epoch": 1.6, + "learning_rate": 3.836369628764067e-05, + "loss": 2.5533, + "step": 3195 + }, + { + "epoch": 1.61, + "learning_rate": 3.833035789491177e-05, + "loss": 2.4513, + "step": 3200 + }, + { + "epoch": 1.61, + "learning_rate": 3.8296986346132036e-05, + "loss": 2.567, + "step": 3205 + }, + { + "epoch": 1.61, + "learning_rate": 3.826358172430516e-05, + "loss": 2.6501, + "step": 3210 + }, + { + "epoch": 1.61, + "learning_rate": 3.823014411251708e-05, + "loss": 2.6927, + "step": 3215 + }, + { + "epoch": 1.62, + "learning_rate": 3.819667359393578e-05, + "loss": 2.6094, + "step": 3220 + }, + { + "epoch": 1.62, + "learning_rate": 3.816317025181111e-05, + "loss": 2.6776, + "step": 3225 + }, + { + "epoch": 1.62, + "learning_rate": 3.8129634169474563e-05, + "loss": 2.5833, + "step": 3230 + }, + { + "epoch": 1.62, + "learning_rate": 3.809606543033905e-05, + "loss": 2.6483, + "step": 3235 + 
}, + { + "epoch": 1.63, + "learning_rate": 3.8062464117898724e-05, + "loss": 2.4814, + "step": 3240 + }, + { + "epoch": 1.63, + "learning_rate": 3.802883031572874e-05, + "loss": 2.5217, + "step": 3245 + }, + { + "epoch": 1.63, + "learning_rate": 3.799516410748506e-05, + "loss": 2.5127, + "step": 3250 + }, + { + "epoch": 1.63, + "learning_rate": 3.796146557690428e-05, + "loss": 2.9325, + "step": 3255 + }, + { + "epoch": 1.64, + "learning_rate": 3.792773480780335e-05, + "loss": 2.4567, + "step": 3260 + }, + { + "epoch": 1.64, + "learning_rate": 3.789397188407944e-05, + "loss": 2.6045, + "step": 3265 + }, + { + "epoch": 1.64, + "learning_rate": 3.786017688970967e-05, + "loss": 2.5031, + "step": 3270 + }, + { + "epoch": 1.64, + "learning_rate": 3.782634990875094e-05, + "loss": 2.7692, + "step": 3275 + }, + { + "epoch": 1.65, + "learning_rate": 3.779249102533972e-05, + "loss": 2.6893, + "step": 3280 + }, + { + "epoch": 1.65, + "learning_rate": 3.7758600323691806e-05, + "loss": 2.604, + "step": 3285 + }, + { + "epoch": 1.65, + "learning_rate": 3.7724677888102145e-05, + "loss": 2.6633, + "step": 3290 + }, + { + "epoch": 1.65, + "learning_rate": 3.769072380294463e-05, + "loss": 2.4804, + "step": 3295 + }, + { + "epoch": 1.66, + "learning_rate": 3.765673815267184e-05, + "loss": 2.5951, + "step": 3300 + }, + { + "epoch": 1.66, + "learning_rate": 3.76227210218149e-05, + "loss": 2.598, + "step": 3305 + }, + { + "epoch": 1.66, + "learning_rate": 3.758867249498321e-05, + "loss": 2.8652, + "step": 3310 + }, + { + "epoch": 1.66, + "learning_rate": 3.7554592656864285e-05, + "loss": 2.5312, + "step": 3315 + }, + { + "epoch": 1.67, + "learning_rate": 3.752048159222349e-05, + "loss": 2.7432, + "step": 3320 + }, + { + "epoch": 1.67, + "learning_rate": 3.748633938590388e-05, + "loss": 2.64, + "step": 3325 + }, + { + "epoch": 1.67, + "learning_rate": 3.745216612282596e-05, + "loss": 2.751, + "step": 3330 + }, + { + "epoch": 1.67, + "learning_rate": 3.741796188798747e-05, + "loss": 
2.7636, + "step": 3335 + }, + { + "epoch": 1.68, + "learning_rate": 3.738372676646321e-05, + "loss": 2.8103, + "step": 3340 + }, + { + "epoch": 1.68, + "learning_rate": 3.7349460843404796e-05, + "loss": 2.6672, + "step": 3345 + }, + { + "epoch": 1.68, + "learning_rate": 3.731516420404043e-05, + "loss": 2.5272, + "step": 3350 + }, + { + "epoch": 1.68, + "learning_rate": 3.728083693367474e-05, + "loss": 2.6674, + "step": 3355 + }, + { + "epoch": 1.69, + "learning_rate": 3.724647911768854e-05, + "loss": 2.708, + "step": 3360 + }, + { + "epoch": 1.69, + "learning_rate": 3.72120908415386e-05, + "loss": 2.3902, + "step": 3365 + }, + { + "epoch": 1.69, + "learning_rate": 3.7177672190757476e-05, + "loss": 2.8144, + "step": 3370 + }, + { + "epoch": 1.69, + "learning_rate": 3.714322325095325e-05, + "loss": 2.8525, + "step": 3375 + }, + { + "epoch": 1.7, + "learning_rate": 3.7108744107809364e-05, + "loss": 2.4671, + "step": 3380 + }, + { + "epoch": 1.7, + "learning_rate": 3.7074234847084363e-05, + "loss": 2.3723, + "step": 3385 + }, + { + "epoch": 1.7, + "learning_rate": 3.703969555461173e-05, + "loss": 2.8825, + "step": 3390 + }, + { + "epoch": 1.7, + "learning_rate": 3.700512631629961e-05, + "loss": 2.768, + "step": 3395 + }, + { + "epoch": 1.71, + "learning_rate": 3.6970527218130644e-05, + "loss": 2.4601, + "step": 3400 + }, + { + "epoch": 1.71, + "learning_rate": 3.693589834616176e-05, + "loss": 2.7482, + "step": 3405 + }, + { + "epoch": 1.71, + "learning_rate": 3.6901239786523914e-05, + "loss": 2.4536, + "step": 3410 + }, + { + "epoch": 1.71, + "learning_rate": 3.686655162542192e-05, + "loss": 2.571, + "step": 3415 + }, + { + "epoch": 1.72, + "learning_rate": 3.683183394913422e-05, + "loss": 2.5796, + "step": 3420 + }, + { + "epoch": 1.72, + "learning_rate": 3.6797086844012654e-05, + "loss": 2.7312, + "step": 3425 + }, + { + "epoch": 1.72, + "learning_rate": 3.676231039648227e-05, + "loss": 2.5584, + "step": 3430 + }, + { + "epoch": 1.72, + "learning_rate": 
3.67275046930411e-05, + "loss": 2.6298, + "step": 3435 + }, + { + "epoch": 1.73, + "learning_rate": 3.669266982025993e-05, + "loss": 2.628, + "step": 3440 + }, + { + "epoch": 1.73, + "learning_rate": 3.6657805864782116e-05, + "loss": 2.5943, + "step": 3445 + }, + { + "epoch": 1.73, + "learning_rate": 3.662291291332333e-05, + "loss": 2.6121, + "step": 3450 + }, + { + "epoch": 1.73, + "learning_rate": 3.658799105267138e-05, + "loss": 2.6501, + "step": 3455 + }, + { + "epoch": 1.74, + "learning_rate": 3.655304036968597e-05, + "loss": 2.5708, + "step": 3460 + }, + { + "epoch": 1.74, + "learning_rate": 3.651806095129849e-05, + "loss": 2.8039, + "step": 3465 + }, + { + "epoch": 1.74, + "learning_rate": 3.648305288451183e-05, + "loss": 2.8159, + "step": 3470 + }, + { + "epoch": 1.74, + "learning_rate": 3.64480162564001e-05, + "loss": 2.6506, + "step": 3475 + }, + { + "epoch": 1.75, + "learning_rate": 3.641295115410847e-05, + "loss": 2.8982, + "step": 3480 + }, + { + "epoch": 1.75, + "learning_rate": 3.637785766485291e-05, + "loss": 2.8018, + "step": 3485 + }, + { + "epoch": 1.75, + "learning_rate": 3.634273587592003e-05, + "loss": 2.5278, + "step": 3490 + }, + { + "epoch": 1.75, + "learning_rate": 3.630758587466681e-05, + "loss": 2.8937, + "step": 3495 + }, + { + "epoch": 1.76, + "learning_rate": 3.6272407748520394e-05, + "loss": 2.1922, + "step": 3500 + }, + { + "epoch": 1.76, + "learning_rate": 3.623720158497789e-05, + "loss": 2.4288, + "step": 3505 + }, + { + "epoch": 1.76, + "learning_rate": 3.620196747160613e-05, + "loss": 2.9423, + "step": 3510 + }, + { + "epoch": 1.76, + "learning_rate": 3.61667054960415e-05, + "loss": 2.5559, + "step": 3515 + }, + { + "epoch": 1.77, + "learning_rate": 3.613141574598965e-05, + "loss": 2.7228, + "step": 3520 + }, + { + "epoch": 1.77, + "learning_rate": 3.6096098309225325e-05, + "loss": 2.6933, + "step": 3525 + }, + { + "epoch": 1.77, + "learning_rate": 3.606075327359212e-05, + "loss": 2.5865, + "step": 3530 + }, + { + "epoch": 1.77, 
+ "learning_rate": 3.602538072700229e-05, + "loss": 2.5846, + "step": 3535 + }, + { + "epoch": 1.78, + "learning_rate": 3.598998075743653e-05, + "loss": 2.8847, + "step": 3540 + }, + { + "epoch": 1.78, + "learning_rate": 3.595455345294372e-05, + "loss": 2.5386, + "step": 3545 + }, + { + "epoch": 1.78, + "learning_rate": 3.5919098901640735e-05, + "loss": 2.5932, + "step": 3550 + }, + { + "epoch": 1.78, + "learning_rate": 3.588361719171223e-05, + "loss": 2.4481, + "step": 3555 + }, + { + "epoch": 1.79, + "learning_rate": 3.584810841141039e-05, + "loss": 2.5773, + "step": 3560 + }, + { + "epoch": 1.79, + "learning_rate": 3.581257264905476e-05, + "loss": 2.6305, + "step": 3565 + }, + { + "epoch": 1.79, + "learning_rate": 3.5777009993031955e-05, + "loss": 2.7, + "step": 3570 + }, + { + "epoch": 1.79, + "learning_rate": 3.574142053179553e-05, + "loss": 2.4408, + "step": 3575 + }, + { + "epoch": 1.8, + "learning_rate": 3.5705804353865677e-05, + "loss": 2.7292, + "step": 3580 + }, + { + "epoch": 1.8, + "learning_rate": 3.567016154782906e-05, + "loss": 2.4371, + "step": 3585 + }, + { + "epoch": 1.8, + "learning_rate": 3.563449220233855e-05, + "loss": 2.7977, + "step": 3590 + }, + { + "epoch": 1.8, + "learning_rate": 3.559879640611305e-05, + "loss": 2.7357, + "step": 3595 + }, + { + "epoch": 1.81, + "learning_rate": 3.5563074247937244e-05, + "loss": 2.6255, + "step": 3600 + }, + { + "epoch": 1.81, + "learning_rate": 3.552732581666139e-05, + "loss": 2.8134, + "step": 3605 + }, + { + "epoch": 1.81, + "learning_rate": 3.549155120120109e-05, + "loss": 2.702, + "step": 3610 + }, + { + "epoch": 1.81, + "learning_rate": 3.545575049053707e-05, + "loss": 2.4854, + "step": 3615 + }, + { + "epoch": 1.82, + "learning_rate": 3.541992377371497e-05, + "loss": 2.7596, + "step": 3620 + }, + { + "epoch": 1.82, + "learning_rate": 3.53840711398451e-05, + "loss": 2.6891, + "step": 3625 + }, + { + "epoch": 1.82, + "learning_rate": 3.5348192678102254e-05, + "loss": 2.5619, + "step": 3630 + }, + { 
+ "epoch": 1.82, + "learning_rate": 3.531228847772544e-05, + "loss": 2.5719, + "step": 3635 + }, + { + "epoch": 1.83, + "learning_rate": 3.52763586280177e-05, + "loss": 2.6297, + "step": 3640 + }, + { + "epoch": 1.83, + "learning_rate": 3.524040321834589e-05, + "loss": 2.5114, + "step": 3645 + }, + { + "epoch": 1.83, + "learning_rate": 3.52044223381404e-05, + "loss": 2.8536, + "step": 3650 + }, + { + "epoch": 1.83, + "learning_rate": 3.516841607689501e-05, + "loss": 2.5271, + "step": 3655 + }, + { + "epoch": 1.84, + "learning_rate": 3.51323845241666e-05, + "loss": 2.5654, + "step": 3660 + }, + { + "epoch": 1.84, + "learning_rate": 3.509632776957497e-05, + "loss": 2.6269, + "step": 3665 + }, + { + "epoch": 1.84, + "learning_rate": 3.50602459028026e-05, + "loss": 2.4401, + "step": 3670 + }, + { + "epoch": 1.84, + "learning_rate": 3.502413901359445e-05, + "loss": 2.4368, + "step": 3675 + }, + { + "epoch": 1.85, + "learning_rate": 3.498800719175768e-05, + "loss": 2.6087, + "step": 3680 + }, + { + "epoch": 1.85, + "learning_rate": 3.4951850527161495e-05, + "loss": 2.5927, + "step": 3685 + }, + { + "epoch": 1.85, + "learning_rate": 3.4915669109736876e-05, + "loss": 2.8352, + "step": 3690 + }, + { + "epoch": 1.85, + "learning_rate": 3.487946302947637e-05, + "loss": 2.8485, + "step": 3695 + }, + { + "epoch": 1.86, + "learning_rate": 3.484323237643389e-05, + "loss": 2.6611, + "step": 3700 + }, + { + "epoch": 1.86, + "learning_rate": 3.4806977240724423e-05, + "loss": 2.5251, + "step": 3705 + }, + { + "epoch": 1.86, + "learning_rate": 3.47706977125239e-05, + "loss": 2.6043, + "step": 3710 + }, + { + "epoch": 1.86, + "learning_rate": 3.473439388206887e-05, + "loss": 2.5911, + "step": 3715 + }, + { + "epoch": 1.87, + "learning_rate": 3.469806583965639e-05, + "loss": 2.7092, + "step": 3720 + }, + { + "epoch": 1.87, + "learning_rate": 3.466171367564368e-05, + "loss": 2.4572, + "step": 3725 + }, + { + "epoch": 1.87, + "learning_rate": 3.462533748044801e-05, + "loss": 2.464, + 
"step": 3730 + }, + { + "epoch": 1.87, + "learning_rate": 3.458893734454636e-05, + "loss": 2.8654, + "step": 3735 + }, + { + "epoch": 1.88, + "learning_rate": 3.455251335847531e-05, + "loss": 2.752, + "step": 3740 + }, + { + "epoch": 1.88, + "learning_rate": 3.451606561283074e-05, + "loss": 2.59, + "step": 3745 + }, + { + "epoch": 1.88, + "learning_rate": 3.447959419826763e-05, + "loss": 2.4732, + "step": 3750 + }, + { + "epoch": 1.88, + "learning_rate": 3.444309920549983e-05, + "loss": 2.7611, + "step": 3755 + }, + { + "epoch": 1.89, + "learning_rate": 3.440658072529983e-05, + "loss": 2.5923, + "step": 3760 + }, + { + "epoch": 1.89, + "learning_rate": 3.437003884849854e-05, + "loss": 2.5405, + "step": 3765 + }, + { + "epoch": 1.89, + "learning_rate": 3.433347366598506e-05, + "loss": 2.6368, + "step": 3770 + }, + { + "epoch": 1.89, + "learning_rate": 3.429688526870649e-05, + "loss": 2.5877, + "step": 3775 + }, + { + "epoch": 1.9, + "learning_rate": 3.4260273747667635e-05, + "loss": 2.5978, + "step": 3780 + }, + { + "epoch": 1.9, + "learning_rate": 3.422363919393082e-05, + "loss": 2.7375, + "step": 3785 + }, + { + "epoch": 1.9, + "learning_rate": 3.418698169861567e-05, + "loss": 2.4161, + "step": 3790 + }, + { + "epoch": 1.9, + "learning_rate": 3.415030135289884e-05, + "loss": 2.3959, + "step": 3795 + }, + { + "epoch": 1.91, + "learning_rate": 3.4113598248013875e-05, + "loss": 2.9642, + "step": 3800 + }, + { + "epoch": 1.91, + "learning_rate": 3.40768724752509e-05, + "loss": 2.7313, + "step": 3805 + }, + { + "epoch": 1.91, + "learning_rate": 3.404012412595639e-05, + "loss": 2.6311, + "step": 3810 + }, + { + "epoch": 1.91, + "learning_rate": 3.400335329153304e-05, + "loss": 2.6268, + "step": 3815 + }, + { + "epoch": 1.92, + "learning_rate": 3.396656006343939e-05, + "loss": 2.5252, + "step": 3820 + }, + { + "epoch": 1.92, + "learning_rate": 3.392974453318975e-05, + "loss": 2.5863, + "step": 3825 + }, + { + "epoch": 1.92, + "learning_rate": 3.3892906792353886e-05, + 
"loss": 2.4848, + "step": 3830 + }, + { + "epoch": 1.92, + "learning_rate": 3.3856046932556775e-05, + "loss": 2.5517, + "step": 3835 + }, + { + "epoch": 1.93, + "learning_rate": 3.3819165045478426e-05, + "loss": 2.6679, + "step": 3840 + }, + { + "epoch": 1.93, + "learning_rate": 3.3782261222853654e-05, + "loss": 2.4474, + "step": 3845 + }, + { + "epoch": 1.93, + "learning_rate": 3.37453355564718e-05, + "loss": 2.5378, + "step": 3850 + }, + { + "epoch": 1.93, + "learning_rate": 3.3708388138176584e-05, + "loss": 2.8168, + "step": 3855 + }, + { + "epoch": 1.94, + "learning_rate": 3.367141905986578e-05, + "loss": 2.6556, + "step": 3860 + }, + { + "epoch": 1.94, + "learning_rate": 3.3634428413491054e-05, + "loss": 2.5122, + "step": 3865 + }, + { + "epoch": 1.94, + "learning_rate": 3.359741629105773e-05, + "loss": 2.7044, + "step": 3870 + }, + { + "epoch": 1.94, + "learning_rate": 3.3560382784624524e-05, + "loss": 2.6193, + "step": 3875 + }, + { + "epoch": 1.95, + "learning_rate": 3.352332798630336e-05, + "loss": 3.0051, + "step": 3880 + }, + { + "epoch": 1.95, + "learning_rate": 3.3486251988259134e-05, + "loss": 2.5351, + "step": 3885 + }, + { + "epoch": 1.95, + "learning_rate": 3.344915488270942e-05, + "loss": 2.7273, + "step": 3890 + }, + { + "epoch": 1.95, + "learning_rate": 3.341203676192433e-05, + "loss": 2.3833, + "step": 3895 + }, + { + "epoch": 1.96, + "learning_rate": 3.3374897718226236e-05, + "loss": 2.5073, + "step": 3900 + }, + { + "epoch": 1.96, + "learning_rate": 3.333773784398957e-05, + "loss": 2.7216, + "step": 3905 + }, + { + "epoch": 1.96, + "learning_rate": 3.330055723164055e-05, + "loss": 2.6666, + "step": 3910 + }, + { + "epoch": 1.96, + "learning_rate": 3.326335597365698e-05, + "loss": 2.4959, + "step": 3915 + }, + { + "epoch": 1.97, + "learning_rate": 3.322613416256802e-05, + "loss": 2.8799, + "step": 3920 + }, + { + "epoch": 1.97, + "learning_rate": 3.3188891890953956e-05, + "loss": 2.6041, + "step": 3925 + }, + { + "epoch": 1.97, + 
"learning_rate": 3.315162925144595e-05, + "loss": 2.7423, + "step": 3930 + }, + { + "epoch": 1.97, + "learning_rate": 3.3114346336725834e-05, + "loss": 2.5765, + "step": 3935 + }, + { + "epoch": 1.98, + "learning_rate": 3.3077043239525874e-05, + "loss": 2.152, + "step": 3940 + }, + { + "epoch": 1.98, + "learning_rate": 3.303972005262852e-05, + "loss": 2.6474, + "step": 3945 + }, + { + "epoch": 1.98, + "learning_rate": 3.3002376868866206e-05, + "loss": 2.473, + "step": 3950 + }, + { + "epoch": 1.98, + "learning_rate": 3.296501378112109e-05, + "loss": 2.6779, + "step": 3955 + }, + { + "epoch": 1.99, + "learning_rate": 3.292763088232485e-05, + "loss": 2.7439, + "step": 3960 + }, + { + "epoch": 1.99, + "learning_rate": 3.289022826545844e-05, + "loss": 2.8874, + "step": 3965 + }, + { + "epoch": 1.99, + "learning_rate": 3.2852806023551834e-05, + "loss": 2.7191, + "step": 3970 + }, + { + "epoch": 1.99, + "learning_rate": 3.281536424968383e-05, + "loss": 2.563, + "step": 3975 + }, + { + "epoch": 2.0, + "learning_rate": 3.2777903036981836e-05, + "loss": 2.7884, + "step": 3980 + }, + { + "epoch": 2.0, + "learning_rate": 3.274042247862158e-05, + "loss": 2.5221, + "step": 3985 + }, + { + "epoch": 2.0, + "learning_rate": 3.270292266782689e-05, + "loss": 2.462, + "step": 3990 + }, + { + "epoch": 2.0, + "learning_rate": 3.266540369786953e-05, + "loss": 2.5059, + "step": 3995 + }, + { + "epoch": 2.01, + "learning_rate": 3.262786566206888e-05, + "loss": 2.688, + "step": 4000 + }, + { + "epoch": 2.01, + "learning_rate": 3.259030865379174e-05, + "loss": 2.7194, + "step": 4005 + }, + { + "epoch": 2.01, + "learning_rate": 3.255273276645214e-05, + "loss": 2.4145, + "step": 4010 + }, + { + "epoch": 2.01, + "learning_rate": 3.251513809351101e-05, + "loss": 2.5469, + "step": 4015 + }, + { + "epoch": 2.02, + "learning_rate": 3.247752472847605e-05, + "loss": 2.7916, + "step": 4020 + }, + { + "epoch": 2.02, + "learning_rate": 3.243989276490143e-05, + "loss": 2.5973, + "step": 4025 + }, + { + 
"epoch": 2.02, + "learning_rate": 3.240224229638758e-05, + "loss": 2.6732, + "step": 4030 + }, + { + "epoch": 2.02, + "learning_rate": 3.236457341658097e-05, + "loss": 2.2159, + "step": 4035 + }, + { + "epoch": 2.03, + "learning_rate": 3.232688621917386e-05, + "loss": 2.6783, + "step": 4040 + }, + { + "epoch": 2.03, + "learning_rate": 3.2289180797904055e-05, + "loss": 2.8193, + "step": 4045 + }, + { + "epoch": 2.03, + "learning_rate": 3.22514572465547e-05, + "loss": 2.3885, + "step": 4050 + }, + { + "epoch": 2.03, + "learning_rate": 3.221371565895404e-05, + "loss": 2.6288, + "step": 4055 + }, + { + "epoch": 2.04, + "learning_rate": 3.217595612897516e-05, + "loss": 2.5372, + "step": 4060 + }, + { + "epoch": 2.04, + "learning_rate": 3.21381787505358e-05, + "loss": 2.7257, + "step": 4065 + }, + { + "epoch": 2.04, + "learning_rate": 3.210038361759807e-05, + "loss": 2.6329, + "step": 4070 + }, + { + "epoch": 2.04, + "learning_rate": 3.206257082416825e-05, + "loss": 2.6518, + "step": 4075 + }, + { + "epoch": 2.05, + "learning_rate": 3.2024740464296544e-05, + "loss": 2.4218, + "step": 4080 + }, + { + "epoch": 2.05, + "learning_rate": 3.198689263207686e-05, + "loss": 2.7266, + "step": 4085 + }, + { + "epoch": 2.05, + "learning_rate": 3.1949027421646546e-05, + "loss": 2.4466, + "step": 4090 + }, + { + "epoch": 2.05, + "learning_rate": 3.1911144927186185e-05, + "loss": 2.6014, + "step": 4095 + }, + { + "epoch": 2.06, + "learning_rate": 3.1873245242919354e-05, + "loss": 2.7072, + "step": 4100 + }, + { + "epoch": 2.06, + "learning_rate": 3.183532846311236e-05, + "loss": 2.5211, + "step": 4105 + }, + { + "epoch": 2.06, + "learning_rate": 3.179739468207406e-05, + "loss": 2.5787, + "step": 4110 + }, + { + "epoch": 2.06, + "learning_rate": 3.175944399415559e-05, + "loss": 2.5141, + "step": 4115 + }, + { + "epoch": 2.07, + "learning_rate": 3.1721476493750134e-05, + "loss": 2.6807, + "step": 4120 + }, + { + "epoch": 2.07, + "learning_rate": 3.1683492275292695e-05, + "loss": 2.5611, 
+ "step": 4125 + }, + { + "epoch": 2.07, + "learning_rate": 3.164549143325985e-05, + "loss": 2.5864, + "step": 4130 + }, + { + "epoch": 2.08, + "learning_rate": 3.160747406216953e-05, + "loss": 2.7378, + "step": 4135 + }, + { + "epoch": 2.08, + "learning_rate": 3.156944025658079e-05, + "loss": 2.7334, + "step": 4140 + }, + { + "epoch": 2.08, + "learning_rate": 3.153139011109354e-05, + "loss": 2.4465, + "step": 4145 + }, + { + "epoch": 2.08, + "learning_rate": 3.149332372034834e-05, + "loss": 2.5482, + "step": 4150 + }, + { + "epoch": 2.09, + "learning_rate": 3.145524117902617e-05, + "loss": 2.7633, + "step": 4155 + }, + { + "epoch": 2.09, + "learning_rate": 3.141714258184816e-05, + "loss": 2.6796, + "step": 4160 + }, + { + "epoch": 2.09, + "learning_rate": 3.137902802357538e-05, + "loss": 2.6622, + "step": 4165 + }, + { + "epoch": 2.09, + "learning_rate": 3.134089759900861e-05, + "loss": 2.9277, + "step": 4170 + }, + { + "epoch": 2.1, + "learning_rate": 3.130275140298808e-05, + "loss": 2.351, + "step": 4175 + }, + { + "epoch": 2.1, + "learning_rate": 3.1264589530393263e-05, + "loss": 2.654, + "step": 4180 + }, + { + "epoch": 2.1, + "learning_rate": 3.12264120761426e-05, + "loss": 2.6229, + "step": 4185 + }, + { + "epoch": 2.1, + "learning_rate": 3.118821913519333e-05, + "loss": 2.6415, + "step": 4190 + }, + { + "epoch": 2.11, + "learning_rate": 3.115001080254115e-05, + "loss": 2.6029, + "step": 4195 + }, + { + "epoch": 2.11, + "learning_rate": 3.1111787173220095e-05, + "loss": 2.6775, + "step": 4200 + }, + { + "epoch": 2.11, + "learning_rate": 3.107354834230223e-05, + "loss": 2.7393, + "step": 4205 + }, + { + "epoch": 2.11, + "learning_rate": 3.1035294404897396e-05, + "loss": 2.5886, + "step": 4210 + }, + { + "epoch": 2.12, + "learning_rate": 3.099702545615307e-05, + "loss": 2.3812, + "step": 4215 + }, + { + "epoch": 2.12, + "learning_rate": 3.0958741591254026e-05, + "loss": 3.1271, + "step": 4220 + }, + { + "epoch": 2.12, + "learning_rate": 3.0920442905422145e-05, 
+ "loss": 2.7758, + "step": 4225 + }, + { + "epoch": 2.12, + "learning_rate": 3.0882129493916167e-05, + "loss": 2.7682, + "step": 4230 + }, + { + "epoch": 2.13, + "learning_rate": 3.0843801452031466e-05, + "loss": 2.2826, + "step": 4235 + }, + { + "epoch": 2.13, + "learning_rate": 3.0805458875099804e-05, + "loss": 2.5934, + "step": 4240 + }, + { + "epoch": 2.13, + "learning_rate": 3.0767101858489103e-05, + "loss": 2.7693, + "step": 4245 + }, + { + "epoch": 2.13, + "learning_rate": 3.072873049760319e-05, + "loss": 2.6955, + "step": 4250 + }, + { + "epoch": 2.14, + "learning_rate": 3.0690344887881565e-05, + "loss": 2.7344, + "step": 4255 + }, + { + "epoch": 2.14, + "learning_rate": 3.0651945124799185e-05, + "loss": 2.6215, + "step": 4260 + }, + { + "epoch": 2.14, + "learning_rate": 3.061353130386619e-05, + "loss": 2.5794, + "step": 4265 + }, + { + "epoch": 2.14, + "learning_rate": 3.05751035206277e-05, + "loss": 2.671, + "step": 4270 + }, + { + "epoch": 2.15, + "learning_rate": 3.0536661870663576e-05, + "loss": 2.7265, + "step": 4275 + }, + { + "epoch": 2.15, + "learning_rate": 3.0498206449588136e-05, + "loss": 2.9018, + "step": 4280 + }, + { + "epoch": 2.15, + "learning_rate": 3.045973735304996e-05, + "loss": 2.4823, + "step": 4285 + }, + { + "epoch": 2.15, + "learning_rate": 3.0421254676731664e-05, + "loss": 2.5454, + "step": 4290 + }, + { + "epoch": 2.16, + "learning_rate": 3.0382758516349623e-05, + "loss": 2.7888, + "step": 4295 + }, + { + "epoch": 2.16, + "learning_rate": 3.0344248967653754e-05, + "loss": 2.596, + "step": 4300 + }, + { + "epoch": 2.16, + "learning_rate": 3.030572612642727e-05, + "loss": 2.7905, + "step": 4305 + }, + { + "epoch": 2.16, + "learning_rate": 3.0267190088486452e-05, + "loss": 2.5676, + "step": 4310 + }, + { + "epoch": 2.17, + "learning_rate": 3.0228640949680388e-05, + "loss": 2.5836, + "step": 4315 + }, + { + "epoch": 2.17, + "learning_rate": 3.0190078805890786e-05, + "loss": 2.7138, + "step": 4320 + }, + { + "epoch": 2.17, + 
"learning_rate": 3.015150375303168e-05, + "loss": 2.4747, + "step": 4325 + }, + { + "epoch": 2.17, + "learning_rate": 3.011291588704919e-05, + "loss": 2.5188, + "step": 4330 + }, + { + "epoch": 2.18, + "learning_rate": 3.0074315303921353e-05, + "loss": 2.2952, + "step": 4335 + }, + { + "epoch": 2.18, + "learning_rate": 3.0035702099657787e-05, + "loss": 2.4875, + "step": 4340 + }, + { + "epoch": 2.18, + "learning_rate": 2.9997076370299544e-05, + "loss": 2.5085, + "step": 4345 + }, + { + "epoch": 2.18, + "learning_rate": 2.9958438211918805e-05, + "loss": 2.7452, + "step": 4350 + }, + { + "epoch": 2.19, + "learning_rate": 2.9919787720618676e-05, + "loss": 2.334, + "step": 4355 + }, + { + "epoch": 2.19, + "learning_rate": 2.9881124992532933e-05, + "loss": 2.6904, + "step": 4360 + }, + { + "epoch": 2.19, + "learning_rate": 2.984245012382579e-05, + "loss": 2.5706, + "step": 4365 + }, + { + "epoch": 2.19, + "learning_rate": 2.980376321069165e-05, + "loss": 2.6657, + "step": 4370 + }, + { + "epoch": 2.2, + "learning_rate": 2.976506434935489e-05, + "loss": 2.4117, + "step": 4375 + }, + { + "epoch": 2.2, + "learning_rate": 2.9726353636069582e-05, + "loss": 2.4446, + "step": 4380 + }, + { + "epoch": 2.2, + "learning_rate": 2.968763116711931e-05, + "loss": 2.5681, + "step": 4385 + }, + { + "epoch": 2.2, + "learning_rate": 2.964889703881686e-05, + "loss": 2.7244, + "step": 4390 + }, + { + "epoch": 2.21, + "learning_rate": 2.9610151347504044e-05, + "loss": 2.577, + "step": 4395 + }, + { + "epoch": 2.21, + "learning_rate": 2.957139418955143e-05, + "loss": 2.6176, + "step": 4400 + }, + { + "epoch": 2.21, + "learning_rate": 2.9532625661358105e-05, + "loss": 2.5708, + "step": 4405 + }, + { + "epoch": 2.21, + "learning_rate": 2.949384585935143e-05, + "loss": 2.4873, + "step": 4410 + }, + { + "epoch": 2.22, + "learning_rate": 2.9455054879986797e-05, + "loss": 2.5953, + "step": 4415 + }, + { + "epoch": 2.22, + "learning_rate": 2.941625281974743e-05, + "loss": 2.5578, + "step": 4420 + 
}, + { + "epoch": 2.22, + "learning_rate": 2.93774397751441e-05, + "loss": 2.5782, + "step": 4425 + }, + { + "epoch": 2.22, + "learning_rate": 2.9338615842714883e-05, + "loss": 2.8227, + "step": 4430 + }, + { + "epoch": 2.23, + "learning_rate": 2.9299781119024956e-05, + "loss": 2.4317, + "step": 4435 + }, + { + "epoch": 2.23, + "learning_rate": 2.926093570066633e-05, + "loss": 2.7054, + "step": 4440 + }, + { + "epoch": 2.23, + "learning_rate": 2.9222079684257608e-05, + "loss": 2.7207, + "step": 4445 + }, + { + "epoch": 2.23, + "learning_rate": 2.9183213166443778e-05, + "loss": 2.6732, + "step": 4450 + }, + { + "epoch": 2.24, + "learning_rate": 2.9144336243895927e-05, + "loss": 2.4768, + "step": 4455 + }, + { + "epoch": 2.24, + "learning_rate": 2.910544901331101e-05, + "loss": 2.5146, + "step": 4460 + }, + { + "epoch": 2.24, + "learning_rate": 2.9066551571411645e-05, + "loss": 2.4941, + "step": 4465 + }, + { + "epoch": 2.24, + "learning_rate": 2.902764401494584e-05, + "loss": 2.1425, + "step": 4470 + }, + { + "epoch": 2.25, + "learning_rate": 2.898872644068676e-05, + "loss": 2.6335, + "step": 4475 + }, + { + "epoch": 2.25, + "learning_rate": 2.8949798945432483e-05, + "loss": 2.4093, + "step": 4480 + }, + { + "epoch": 2.25, + "learning_rate": 2.8910861626005776e-05, + "loss": 2.4006, + "step": 4485 + }, + { + "epoch": 2.25, + "learning_rate": 2.8871914579253824e-05, + "loss": 2.4762, + "step": 4490 + }, + { + "epoch": 2.26, + "learning_rate": 2.8832957902048014e-05, + "loss": 2.6309, + "step": 4495 + }, + { + "epoch": 2.26, + "learning_rate": 2.8793991691283684e-05, + "loss": 2.7734, + "step": 4500 + }, + { + "epoch": 2.26, + "learning_rate": 2.87550160438799e-05, + "loss": 2.5444, + "step": 4505 + }, + { + "epoch": 2.26, + "learning_rate": 2.871603105677917e-05, + "loss": 2.5277, + "step": 4510 + }, + { + "epoch": 2.27, + "learning_rate": 2.8677036826947264e-05, + "loss": 2.5775, + "step": 4515 + }, + { + "epoch": 2.27, + "learning_rate": 2.863803345137291e-05, + 
"loss": 2.4524, + "step": 4520 + }, + { + "epoch": 2.27, + "learning_rate": 2.8599021027067608e-05, + "loss": 2.572, + "step": 4525 + }, + { + "epoch": 2.27, + "learning_rate": 2.855999965106536e-05, + "loss": 2.8112, + "step": 4530 + }, + { + "epoch": 2.28, + "learning_rate": 2.8520969420422427e-05, + "loss": 2.65, + "step": 4535 + }, + { + "epoch": 2.28, + "learning_rate": 2.8481930432217096e-05, + "loss": 2.4675, + "step": 4540 + }, + { + "epoch": 2.28, + "learning_rate": 2.844288278354943e-05, + "loss": 2.668, + "step": 4545 + }, + { + "epoch": 2.28, + "learning_rate": 2.8403826571541046e-05, + "loss": 2.7135, + "step": 4550 + }, + { + "epoch": 2.29, + "learning_rate": 2.8364761893334858e-05, + "loss": 2.5212, + "step": 4555 + }, + { + "epoch": 2.29, + "learning_rate": 2.8325688846094817e-05, + "loss": 2.698, + "step": 4560 + }, + { + "epoch": 2.29, + "learning_rate": 2.828660752700572e-05, + "loss": 2.6517, + "step": 4565 + }, + { + "epoch": 2.29, + "learning_rate": 2.8247518033272924e-05, + "loss": 2.6687, + "step": 4570 + }, + { + "epoch": 2.3, + "learning_rate": 2.8208420462122105e-05, + "loss": 2.4934, + "step": 4575 + }, + { + "epoch": 2.3, + "learning_rate": 2.816931491079906e-05, + "loss": 2.3762, + "step": 4580 + }, + { + "epoch": 2.3, + "learning_rate": 2.8130201476569413e-05, + "loss": 2.6873, + "step": 4585 + }, + { + "epoch": 2.3, + "learning_rate": 2.8091080256718398e-05, + "loss": 2.6981, + "step": 4590 + }, + { + "epoch": 2.31, + "learning_rate": 2.805195134855061e-05, + "loss": 2.6814, + "step": 4595 + }, + { + "epoch": 2.31, + "learning_rate": 2.8012814849389785e-05, + "loss": 2.6641, + "step": 4600 + }, + { + "epoch": 2.31, + "learning_rate": 2.797367085657852e-05, + "loss": 2.488, + "step": 4605 + }, + { + "epoch": 2.31, + "learning_rate": 2.793451946747806e-05, + "loss": 2.4358, + "step": 4610 + }, + { + "epoch": 2.32, + "learning_rate": 2.7895360779468044e-05, + "loss": 2.7458, + "step": 4615 + }, + { + "epoch": 2.32, + "learning_rate": 
2.785619488994627e-05, + "loss": 2.7558, + "step": 4620 + }, + { + "epoch": 2.32, + "learning_rate": 2.7817021896328427e-05, + "loss": 2.6906, + "step": 4625 + }, + { + "epoch": 2.32, + "learning_rate": 2.7777841896047914e-05, + "loss": 2.5384, + "step": 4630 + }, + { + "epoch": 2.33, + "learning_rate": 2.7738654986555523e-05, + "loss": 2.8354, + "step": 4635 + }, + { + "epoch": 2.33, + "learning_rate": 2.7699461265319242e-05, + "loss": 2.6352, + "step": 4640 + }, + { + "epoch": 2.33, + "learning_rate": 2.7660260829824003e-05, + "loss": 2.5602, + "step": 4645 + }, + { + "epoch": 2.33, + "learning_rate": 2.7621053777571425e-05, + "loss": 2.7109, + "step": 4650 + }, + { + "epoch": 2.34, + "learning_rate": 2.7581840206079616e-05, + "loss": 2.7317, + "step": 4655 + }, + { + "epoch": 2.34, + "learning_rate": 2.754262021288287e-05, + "loss": 2.6874, + "step": 4660 + }, + { + "epoch": 2.34, + "learning_rate": 2.750339389553146e-05, + "loss": 2.5854, + "step": 4665 + }, + { + "epoch": 2.34, + "learning_rate": 2.7464161351591393e-05, + "loss": 2.4774, + "step": 4670 + }, + { + "epoch": 2.35, + "learning_rate": 2.7424922678644172e-05, + "loss": 2.3301, + "step": 4675 + }, + { + "epoch": 2.35, + "learning_rate": 2.7385677974286517e-05, + "loss": 2.8886, + "step": 4680 + }, + { + "epoch": 2.35, + "learning_rate": 2.7346427336130175e-05, + "loss": 2.4401, + "step": 4685 + }, + { + "epoch": 2.35, + "learning_rate": 2.7307170861801644e-05, + "loss": 2.4788, + "step": 4690 + }, + { + "epoch": 2.36, + "learning_rate": 2.7267908648941938e-05, + "loss": 2.4079, + "step": 4695 + }, + { + "epoch": 2.36, + "learning_rate": 2.7228640795206344e-05, + "loss": 2.3763, + "step": 4700 + }, + { + "epoch": 2.36, + "learning_rate": 2.718936739826417e-05, + "loss": 2.4751, + "step": 4705 + }, + { + "epoch": 2.36, + "learning_rate": 2.7150088555798537e-05, + "loss": 2.5827, + "step": 4710 + }, + { + "epoch": 2.37, + "learning_rate": 2.7110804365506083e-05, + "loss": 2.8181, + "step": 4715 + }, + { 
+ "epoch": 2.37, + "learning_rate": 2.7071514925096762e-05, + "loss": 2.5951, + "step": 4720 + }, + { + "epoch": 2.37, + "learning_rate": 2.703222033229359e-05, + "loss": 2.8812, + "step": 4725 + }, + { + "epoch": 2.37, + "learning_rate": 2.6992920684832374e-05, + "loss": 2.8793, + "step": 4730 + }, + { + "epoch": 2.38, + "learning_rate": 2.6953616080461526e-05, + "loss": 2.7234, + "step": 4735 + }, + { + "epoch": 2.38, + "learning_rate": 2.6914306616941764e-05, + "loss": 2.566, + "step": 4740 + }, + { + "epoch": 2.38, + "learning_rate": 2.68749923920459e-05, + "loss": 2.7924, + "step": 4745 + }, + { + "epoch": 2.38, + "learning_rate": 2.683567350355859e-05, + "loss": 2.5467, + "step": 4750 + }, + { + "epoch": 2.39, + "learning_rate": 2.679635004927608e-05, + "loss": 2.8032, + "step": 4755 + }, + { + "epoch": 2.39, + "learning_rate": 2.6757022127006e-05, + "loss": 2.4976, + "step": 4760 + }, + { + "epoch": 2.39, + "learning_rate": 2.6717689834567055e-05, + "loss": 2.7968, + "step": 4765 + }, + { + "epoch": 2.39, + "learning_rate": 2.6678353269788854e-05, + "loss": 2.4099, + "step": 4770 + }, + { + "epoch": 2.4, + "learning_rate": 2.66390125305116e-05, + "loss": 2.4246, + "step": 4775 + }, + { + "epoch": 2.4, + "learning_rate": 2.659966771458589e-05, + "loss": 2.7296, + "step": 4780 + }, + { + "epoch": 2.4, + "learning_rate": 2.656031891987249e-05, + "loss": 2.5013, + "step": 4785 + }, + { + "epoch": 2.4, + "learning_rate": 2.6520966244242024e-05, + "loss": 2.4029, + "step": 4790 + }, + { + "epoch": 2.41, + "learning_rate": 2.648160978557479e-05, + "loss": 2.405, + "step": 4795 + }, + { + "epoch": 2.41, + "learning_rate": 2.644224964176048e-05, + "loss": 2.5121, + "step": 4800 + }, + { + "epoch": 2.41, + "learning_rate": 2.6402885910697966e-05, + "loss": 2.7865, + "step": 4805 + }, + { + "epoch": 2.41, + "learning_rate": 2.6363518690295035e-05, + "loss": 2.3725, + "step": 4810 + }, + { + "epoch": 2.42, + "learning_rate": 2.632414807846816e-05, + "loss": 2.612, + 
"step": 4815 + }, + { + "epoch": 2.42, + "learning_rate": 2.6284774173142233e-05, + "loss": 2.5412, + "step": 4820 + }, + { + "epoch": 2.42, + "learning_rate": 2.624539707225036e-05, + "loss": 2.4828, + "step": 4825 + }, + { + "epoch": 2.42, + "learning_rate": 2.6206016873733574e-05, + "loss": 2.6912, + "step": 4830 + }, + { + "epoch": 2.43, + "learning_rate": 2.6166633675540635e-05, + "loss": 2.9305, + "step": 4835 + }, + { + "epoch": 2.43, + "learning_rate": 2.612724757562775e-05, + "loss": 2.4224, + "step": 4840 + }, + { + "epoch": 2.43, + "learning_rate": 2.608785867195834e-05, + "loss": 2.448, + "step": 4845 + }, + { + "epoch": 2.43, + "learning_rate": 2.60484670625028e-05, + "loss": 2.356, + "step": 4850 + }, + { + "epoch": 2.44, + "learning_rate": 2.600907284523827e-05, + "loss": 2.3987, + "step": 4855 + }, + { + "epoch": 2.44, + "learning_rate": 2.5969676118148358e-05, + "loss": 1.9985, + "step": 4860 + }, + { + "epoch": 2.44, + "learning_rate": 2.5930276979222927e-05, + "loss": 2.7017, + "step": 4865 + }, + { + "epoch": 2.44, + "learning_rate": 2.5890875526457836e-05, + "loss": 2.5223, + "step": 4870 + }, + { + "epoch": 2.45, + "learning_rate": 2.5851471857854697e-05, + "loss": 2.5558, + "step": 4875 + }, + { + "epoch": 2.45, + "learning_rate": 2.5812066071420632e-05, + "loss": 2.3048, + "step": 4880 + }, + { + "epoch": 2.45, + "learning_rate": 2.5772658265168025e-05, + "loss": 2.6089, + "step": 4885 + }, + { + "epoch": 2.45, + "learning_rate": 2.5733248537114306e-05, + "loss": 2.6711, + "step": 4890 + }, + { + "epoch": 2.46, + "learning_rate": 2.569383698528166e-05, + "loss": 2.6202, + "step": 4895 + }, + { + "epoch": 2.46, + "learning_rate": 2.5654423707696833e-05, + "loss": 2.6619, + "step": 4900 + }, + { + "epoch": 2.46, + "learning_rate": 2.5615008802390834e-05, + "loss": 2.6442, + "step": 4905 + }, + { + "epoch": 2.46, + "learning_rate": 2.5575592367398733e-05, + "loss": 2.7031, + "step": 4910 + }, + { + "epoch": 2.47, + "learning_rate": 
2.5536174500759417e-05, + "loss": 2.5043, + "step": 4915 + }, + { + "epoch": 2.47, + "learning_rate": 2.5496755300515323e-05, + "loss": 2.3442, + "step": 4920 + }, + { + "epoch": 2.47, + "learning_rate": 2.5457334864712206e-05, + "loss": 2.7005, + "step": 4925 + }, + { + "epoch": 2.47, + "learning_rate": 2.5417913291398892e-05, + "loss": 2.8544, + "step": 4930 + }, + { + "epoch": 2.48, + "learning_rate": 2.5378490678627043e-05, + "loss": 2.3004, + "step": 4935 + }, + { + "epoch": 2.48, + "learning_rate": 2.5339067124450887e-05, + "loss": 2.527, + "step": 4940 + }, + { + "epoch": 2.48, + "learning_rate": 2.5299642726927035e-05, + "loss": 2.689, + "step": 4945 + }, + { + "epoch": 2.48, + "learning_rate": 2.526021758411415e-05, + "loss": 2.7824, + "step": 4950 + }, + { + "epoch": 2.49, + "learning_rate": 2.5220791794072774e-05, + "loss": 2.5121, + "step": 4955 + }, + { + "epoch": 2.49, + "learning_rate": 2.518136545486504e-05, + "loss": 2.3836, + "step": 4960 + }, + { + "epoch": 2.49, + "learning_rate": 2.5141938664554482e-05, + "loss": 2.6548, + "step": 4965 + }, + { + "epoch": 2.49, + "learning_rate": 2.5102511521205718e-05, + "loss": 2.5556, + "step": 4970 + }, + { + "epoch": 2.5, + "learning_rate": 2.5063084122884267e-05, + "loss": 2.4746, + "step": 4975 + }, + { + "epoch": 2.5, + "learning_rate": 2.5023656567656272e-05, + "loss": 2.5669, + "step": 4980 + }, + { + "epoch": 2.5, + "learning_rate": 2.498422895358827e-05, + "loss": 2.4125, + "step": 4985 + }, + { + "epoch": 2.5, + "learning_rate": 2.4944801378746935e-05, + "loss": 2.6531, + "step": 4990 + }, + { + "epoch": 2.51, + "learning_rate": 2.4905373941198864e-05, + "loss": 2.7697, + "step": 4995 + }, + { + "epoch": 2.51, + "learning_rate": 2.48659467390103e-05, + "loss": 2.5167, + "step": 5000 + }, + { + "epoch": 2.51, + "learning_rate": 2.4826519870246897e-05, + "loss": 2.6029, + "step": 5005 + }, + { + "epoch": 2.51, + "learning_rate": 2.478709343297349e-05, + "loss": 2.3878, + "step": 5010 + }, + { + 
"epoch": 2.52, + "learning_rate": 2.4747667525253825e-05, + "loss": 2.6694, + "step": 5015 + }, + { + "epoch": 2.52, + "learning_rate": 2.470824224515034e-05, + "loss": 2.606, + "step": 5020 + }, + { + "epoch": 2.52, + "learning_rate": 2.466881769072392e-05, + "loss": 2.6548, + "step": 5025 + }, + { + "epoch": 2.52, + "learning_rate": 2.4629393960033633e-05, + "loss": 2.2612, + "step": 5030 + }, + { + "epoch": 2.53, + "learning_rate": 2.4589971151136503e-05, + "loss": 2.6507, + "step": 5035 + }, + { + "epoch": 2.53, + "learning_rate": 2.4550549362087258e-05, + "loss": 2.4926, + "step": 5040 + }, + { + "epoch": 2.53, + "learning_rate": 2.45111286909381e-05, + "loss": 2.4079, + "step": 5045 + }, + { + "epoch": 2.53, + "learning_rate": 2.447170923573844e-05, + "loss": 2.5999, + "step": 5050 + }, + { + "epoch": 2.54, + "learning_rate": 2.443229109453467e-05, + "loss": 2.4915, + "step": 5055 + }, + { + "epoch": 2.54, + "learning_rate": 2.43928743653699e-05, + "loss": 2.4626, + "step": 5060 + }, + { + "epoch": 2.54, + "learning_rate": 2.4353459146283743e-05, + "loss": 2.405, + "step": 5065 + }, + { + "epoch": 2.54, + "learning_rate": 2.431404553531206e-05, + "loss": 2.857, + "step": 5070 + }, + { + "epoch": 2.55, + "learning_rate": 2.42746336304867e-05, + "loss": 2.8089, + "step": 5075 + }, + { + "epoch": 2.55, + "learning_rate": 2.423522352983527e-05, + "loss": 2.5966, + "step": 5080 + }, + { + "epoch": 2.55, + "learning_rate": 2.41958153313809e-05, + "loss": 2.4951, + "step": 5085 + }, + { + "epoch": 2.55, + "learning_rate": 2.4156409133141967e-05, + "loss": 2.519, + "step": 5090 + }, + { + "epoch": 2.56, + "learning_rate": 2.4117005033131894e-05, + "loss": 2.4516, + "step": 5095 + }, + { + "epoch": 2.56, + "learning_rate": 2.40776031293589e-05, + "loss": 2.4225, + "step": 5100 + }, + { + "epoch": 2.56, + "learning_rate": 2.4038203519825676e-05, + "loss": 2.47, + "step": 5105 + }, + { + "epoch": 2.56, + "learning_rate": 2.399880630252928e-05, + "loss": 2.4996, + 
"step": 5110 + }, + { + "epoch": 2.57, + "learning_rate": 2.3959411575460777e-05, + "loss": 2.4737, + "step": 5115 + }, + { + "epoch": 2.57, + "learning_rate": 2.392001943660506e-05, + "loss": 2.636, + "step": 5120 + }, + { + "epoch": 2.57, + "learning_rate": 2.3880629983940572e-05, + "loss": 2.5773, + "step": 5125 + }, + { + "epoch": 2.57, + "learning_rate": 2.3841243315439077e-05, + "loss": 2.7459, + "step": 5130 + }, + { + "epoch": 2.58, + "learning_rate": 2.3801859529065416e-05, + "loss": 2.1073, + "step": 5135 + }, + { + "epoch": 2.58, + "learning_rate": 2.3762478722777264e-05, + "loss": 2.3601, + "step": 5140 + }, + { + "epoch": 2.58, + "learning_rate": 2.3723100994524873e-05, + "loss": 2.5435, + "step": 5145 + }, + { + "epoch": 2.58, + "learning_rate": 2.3683726442250853e-05, + "loss": 2.3054, + "step": 5150 + }, + { + "epoch": 2.59, + "learning_rate": 2.3644355163889908e-05, + "loss": 2.6588, + "step": 5155 + }, + { + "epoch": 2.59, + "learning_rate": 2.3604987257368596e-05, + "loss": 2.6469, + "step": 5160 + }, + { + "epoch": 2.59, + "learning_rate": 2.3565622820605096e-05, + "loss": 2.4276, + "step": 5165 + }, + { + "epoch": 2.59, + "learning_rate": 2.3526261951508947e-05, + "loss": 2.8147, + "step": 5170 + }, + { + "epoch": 2.6, + "learning_rate": 2.3486904747980817e-05, + "loss": 2.5942, + "step": 5175 + }, + { + "epoch": 2.6, + "learning_rate": 2.344755130791227e-05, + "loss": 2.7043, + "step": 5180 + }, + { + "epoch": 2.6, + "learning_rate": 2.340820172918549e-05, + "loss": 2.7551, + "step": 5185 + }, + { + "epoch": 2.6, + "learning_rate": 2.336885610967308e-05, + "loss": 2.4544, + "step": 5190 + }, + { + "epoch": 2.61, + "learning_rate": 2.3329514547237757e-05, + "loss": 2.5914, + "step": 5195 + }, + { + "epoch": 2.61, + "learning_rate": 2.3290177139732186e-05, + "loss": 2.5679, + "step": 5200 + }, + { + "epoch": 2.61, + "learning_rate": 2.325084398499868e-05, + "loss": 2.5207, + "step": 5205 + }, + { + "epoch": 2.61, + "learning_rate": 
2.3211515180868972e-05, + "loss": 2.4572, + "step": 5210 + }, + { + "epoch": 2.62, + "learning_rate": 2.3172190825163987e-05, + "loss": 2.6281, + "step": 5215 + }, + { + "epoch": 2.62, + "learning_rate": 2.3132871015693566e-05, + "loss": 2.5488, + "step": 5220 + }, + { + "epoch": 2.62, + "learning_rate": 2.309355585025627e-05, + "loss": 2.74, + "step": 5225 + }, + { + "epoch": 2.62, + "learning_rate": 2.3054245426639078e-05, + "loss": 2.5737, + "step": 5230 + }, + { + "epoch": 2.63, + "learning_rate": 2.3014939842617197e-05, + "loss": 2.5987, + "step": 5235 + }, + { + "epoch": 2.63, + "learning_rate": 2.297563919595379e-05, + "loss": 2.5215, + "step": 5240 + }, + { + "epoch": 2.63, + "learning_rate": 2.293634358439973e-05, + "loss": 2.4514, + "step": 5245 + }, + { + "epoch": 2.63, + "learning_rate": 2.289705310569338e-05, + "loss": 2.8411, + "step": 5250 + }, + { + "epoch": 2.64, + "learning_rate": 2.2857767857560337e-05, + "loss": 2.9463, + "step": 5255 + }, + { + "epoch": 2.64, + "learning_rate": 2.281848793771318e-05, + "loss": 2.6361, + "step": 5260 + }, + { + "epoch": 2.64, + "learning_rate": 2.2779213443851233e-05, + "loss": 2.4919, + "step": 5265 + }, + { + "epoch": 2.64, + "learning_rate": 2.2739944473660332e-05, + "loss": 2.3893, + "step": 5270 + }, + { + "epoch": 2.65, + "learning_rate": 2.2700681124812574e-05, + "loss": 2.6469, + "step": 5275 + }, + { + "epoch": 2.65, + "learning_rate": 2.2661423494966074e-05, + "loss": 2.7062, + "step": 5280 + }, + { + "epoch": 2.65, + "learning_rate": 2.2622171681764706e-05, + "loss": 2.6681, + "step": 5285 + }, + { + "epoch": 2.65, + "learning_rate": 2.2582925782837898e-05, + "loss": 2.5944, + "step": 5290 + }, + { + "epoch": 2.66, + "learning_rate": 2.254368589580036e-05, + "loss": 2.6699, + "step": 5295 + }, + { + "epoch": 2.66, + "learning_rate": 2.250445211825185e-05, + "loss": 2.7243, + "step": 5300 + }, + { + "epoch": 2.66, + "learning_rate": 2.2465224547776934e-05, + "loss": 2.5652, + "step": 5305 + }, + { + 
"epoch": 2.66, + "learning_rate": 2.2426003281944725e-05, + "loss": 2.4402, + "step": 5310 + }, + { + "epoch": 2.67, + "learning_rate": 2.238678841830867e-05, + "loss": 2.5059, + "step": 5315 + }, + { + "epoch": 2.67, + "learning_rate": 2.234758005440628e-05, + "loss": 2.5083, + "step": 5320 + }, + { + "epoch": 2.67, + "learning_rate": 2.2308378287758906e-05, + "loss": 2.3585, + "step": 5325 + }, + { + "epoch": 2.67, + "learning_rate": 2.22691832158715e-05, + "loss": 2.8297, + "step": 5330 + }, + { + "epoch": 2.68, + "learning_rate": 2.2229994936232346e-05, + "loss": 2.4266, + "step": 5335 + }, + { + "epoch": 2.68, + "learning_rate": 2.219081354631284e-05, + "loss": 2.3729, + "step": 5340 + }, + { + "epoch": 2.68, + "learning_rate": 2.2151639143567236e-05, + "loss": 2.4558, + "step": 5345 + }, + { + "epoch": 2.68, + "learning_rate": 2.211247182543242e-05, + "loss": 2.5797, + "step": 5350 + }, + { + "epoch": 2.69, + "learning_rate": 2.2073311689327648e-05, + "loss": 2.4579, + "step": 5355 + }, + { + "epoch": 2.69, + "learning_rate": 2.203415883265432e-05, + "loss": 2.5511, + "step": 5360 + }, + { + "epoch": 2.69, + "learning_rate": 2.1995013352795725e-05, + "loss": 2.4825, + "step": 5365 + }, + { + "epoch": 2.69, + "learning_rate": 2.1955875347116808e-05, + "loss": 2.4933, + "step": 5370 + }, + { + "epoch": 2.7, + "learning_rate": 2.191674491296391e-05, + "loss": 2.434, + "step": 5375 + }, + { + "epoch": 2.7, + "learning_rate": 2.187762214766456e-05, + "loss": 2.5205, + "step": 5380 + }, + { + "epoch": 2.7, + "learning_rate": 2.1838507148527197e-05, + "loss": 2.5461, + "step": 5385 + }, + { + "epoch": 2.7, + "learning_rate": 2.179940001284095e-05, + "loss": 2.4265, + "step": 5390 + }, + { + "epoch": 2.71, + "learning_rate": 2.176030083787539e-05, + "loss": 2.6166, + "step": 5395 + }, + { + "epoch": 2.71, + "learning_rate": 2.1721209720880277e-05, + "loss": 2.5101, + "step": 5400 + }, + { + "epoch": 2.71, + "learning_rate": 2.168212675908536e-05, + "loss": 2.4824, + 
"step": 5405 + }, + { + "epoch": 2.71, + "learning_rate": 2.1643052049700066e-05, + "loss": 2.347, + "step": 5410 + }, + { + "epoch": 2.72, + "learning_rate": 2.1603985689913317e-05, + "loss": 2.5476, + "step": 5415 + }, + { + "epoch": 2.72, + "learning_rate": 2.1564927776893264e-05, + "loss": 2.456, + "step": 5420 + }, + { + "epoch": 2.72, + "learning_rate": 2.152587840778704e-05, + "loss": 2.4415, + "step": 5425 + }, + { + "epoch": 2.72, + "learning_rate": 2.1486837679720535e-05, + "loss": 2.7548, + "step": 5430 + }, + { + "epoch": 2.73, + "learning_rate": 2.144780568979816e-05, + "loss": 2.3779, + "step": 5435 + }, + { + "epoch": 2.73, + "learning_rate": 2.1408782535102566e-05, + "loss": 2.5284, + "step": 5440 + }, + { + "epoch": 2.73, + "learning_rate": 2.136976831269444e-05, + "loss": 2.6886, + "step": 5445 + }, + { + "epoch": 2.73, + "learning_rate": 2.133076311961226e-05, + "loss": 2.5138, + "step": 5450 + }, + { + "epoch": 2.74, + "learning_rate": 2.1291767052872035e-05, + "loss": 2.664, + "step": 5455 + }, + { + "epoch": 2.74, + "learning_rate": 2.1252780209467073e-05, + "loss": 2.4325, + "step": 5460 + }, + { + "epoch": 2.74, + "learning_rate": 2.121380268636775e-05, + "loss": 2.5471, + "step": 5465 + }, + { + "epoch": 2.74, + "learning_rate": 2.1174834580521253e-05, + "loss": 2.4363, + "step": 5470 + }, + { + "epoch": 2.75, + "learning_rate": 2.113587598885135e-05, + "loss": 2.6272, + "step": 5475 + }, + { + "epoch": 2.75, + "learning_rate": 2.109692700825814e-05, + "loss": 2.5261, + "step": 5480 + }, + { + "epoch": 2.75, + "learning_rate": 2.105798773561783e-05, + "loss": 2.9262, + "step": 5485 + }, + { + "epoch": 2.75, + "learning_rate": 2.101905826778246e-05, + "loss": 2.3964, + "step": 5490 + }, + { + "epoch": 2.76, + "learning_rate": 2.0980138701579704e-05, + "loss": 2.6239, + "step": 5495 + }, + { + "epoch": 2.76, + "learning_rate": 2.0941229133812593e-05, + "loss": 2.53, + "step": 5500 + }, + { + "epoch": 2.76, + "learning_rate": 
2.0902329661259293e-05, + "loss": 2.7035, + "step": 5505 + }, + { + "epoch": 2.77, + "learning_rate": 2.0863440380672856e-05, + "loss": 2.5909, + "step": 5510 + }, + { + "epoch": 2.77, + "learning_rate": 2.0824561388781005e-05, + "loss": 2.1592, + "step": 5515 + }, + { + "epoch": 2.77, + "learning_rate": 2.078569278228585e-05, + "loss": 2.5515, + "step": 5520 + }, + { + "epoch": 2.77, + "learning_rate": 2.0746834657863672e-05, + "loss": 2.6217, + "step": 5525 + }, + { + "epoch": 2.78, + "learning_rate": 2.0707987112164692e-05, + "loss": 2.6302, + "step": 5530 + }, + { + "epoch": 2.78, + "learning_rate": 2.0669150241812807e-05, + "loss": 2.3984, + "step": 5535 + }, + { + "epoch": 2.78, + "learning_rate": 2.0630324143405372e-05, + "loss": 2.6425, + "step": 5540 + }, + { + "epoch": 2.78, + "learning_rate": 2.0591508913512954e-05, + "loss": 2.6817, + "step": 5545 + }, + { + "epoch": 2.79, + "learning_rate": 2.055270464867906e-05, + "loss": 2.3904, + "step": 5550 + }, + { + "epoch": 2.79, + "learning_rate": 2.0513911445419936e-05, + "loss": 2.6625, + "step": 5555 + }, + { + "epoch": 2.79, + "learning_rate": 2.0475129400224337e-05, + "loss": 2.6876, + "step": 5560 + }, + { + "epoch": 2.79, + "learning_rate": 2.043635860955325e-05, + "loss": 2.4981, + "step": 5565 + }, + { + "epoch": 2.8, + "learning_rate": 2.039759916983966e-05, + "loss": 2.3809, + "step": 5570 + }, + { + "epoch": 2.8, + "learning_rate": 2.0358851177488326e-05, + "loss": 2.4396, + "step": 5575 + }, + { + "epoch": 2.8, + "learning_rate": 2.0320114728875538e-05, + "loss": 2.526, + "step": 5580 + }, + { + "epoch": 2.8, + "learning_rate": 2.028138992034887e-05, + "loss": 2.6806, + "step": 5585 + }, + { + "epoch": 2.81, + "learning_rate": 2.0242676848226948e-05, + "loss": 2.5842, + "step": 5590 + }, + { + "epoch": 2.81, + "learning_rate": 2.02039756087992e-05, + "loss": 2.4016, + "step": 5595 + }, + { + "epoch": 2.81, + "learning_rate": 2.0165286298325638e-05, + "loss": 2.5254, + "step": 5600 + }, + { + 
"epoch": 2.81, + "learning_rate": 2.0126609013036575e-05, + "loss": 2.4995, + "step": 5605 + }, + { + "epoch": 2.82, + "learning_rate": 2.0087943849132446e-05, + "loss": 2.473, + "step": 5610 + }, + { + "epoch": 2.82, + "learning_rate": 2.004929090278351e-05, + "loss": 2.668, + "step": 5615 + }, + { + "epoch": 2.82, + "learning_rate": 2.001065027012966e-05, + "loss": 2.693, + "step": 5620 + }, + { + "epoch": 2.82, + "learning_rate": 1.9972022047280154e-05, + "loss": 2.5113, + "step": 5625 + }, + { + "epoch": 2.83, + "learning_rate": 1.9933406330313374e-05, + "loss": 2.7322, + "step": 5630 + }, + { + "epoch": 2.83, + "learning_rate": 1.989480321527661e-05, + "loss": 2.6501, + "step": 5635 + }, + { + "epoch": 2.83, + "learning_rate": 1.9856212798185798e-05, + "loss": 2.546, + "step": 5640 + }, + { + "epoch": 2.83, + "learning_rate": 1.9817635175025295e-05, + "loss": 2.8634, + "step": 5645 + }, + { + "epoch": 2.84, + "learning_rate": 1.9779070441747638e-05, + "loss": 2.2475, + "step": 5650 + }, + { + "epoch": 2.84, + "learning_rate": 1.97405186942733e-05, + "loss": 2.5021, + "step": 5655 + }, + { + "epoch": 2.84, + "learning_rate": 1.9701980028490452e-05, + "loss": 2.4607, + "step": 5660 + }, + { + "epoch": 2.84, + "learning_rate": 1.9663454540254744e-05, + "loss": 2.4587, + "step": 5665 + }, + { + "epoch": 2.85, + "learning_rate": 1.9624942325389032e-05, + "loss": 2.6975, + "step": 5670 + }, + { + "epoch": 2.85, + "learning_rate": 1.9586443479683164e-05, + "loss": 2.728, + "step": 5675 + }, + { + "epoch": 2.85, + "learning_rate": 1.9547958098893734e-05, + "loss": 2.6458, + "step": 5680 + }, + { + "epoch": 2.85, + "learning_rate": 1.9509486278743847e-05, + "loss": 2.7608, + "step": 5685 + }, + { + "epoch": 2.86, + "learning_rate": 1.9471028114922873e-05, + "loss": 2.7753, + "step": 5690 + }, + { + "epoch": 2.86, + "learning_rate": 1.9432583703086235e-05, + "loss": 2.3438, + "step": 5695 + }, + { + "epoch": 2.86, + "learning_rate": 1.9394153138855127e-05, + "loss": 
2.5513, + "step": 5700 + }, + { + "epoch": 2.86, + "learning_rate": 1.9355736517816313e-05, + "loss": 2.6064, + "step": 5705 + }, + { + "epoch": 2.87, + "learning_rate": 1.9317333935521872e-05, + "loss": 2.6884, + "step": 5710 + }, + { + "epoch": 2.87, + "learning_rate": 1.927894548748897e-05, + "loss": 2.479, + "step": 5715 + }, + { + "epoch": 2.87, + "learning_rate": 1.9240571269199607e-05, + "loss": 2.741, + "step": 5720 + }, + { + "epoch": 2.87, + "learning_rate": 1.9202211376100427e-05, + "loss": 2.6736, + "step": 5725 + }, + { + "epoch": 2.88, + "learning_rate": 1.9163865903602374e-05, + "loss": 2.5357, + "step": 5730 + }, + { + "epoch": 2.88, + "learning_rate": 1.9125534947080574e-05, + "loss": 2.4667, + "step": 5735 + }, + { + "epoch": 2.88, + "learning_rate": 1.908721860187406e-05, + "loss": 2.3793, + "step": 5740 + }, + { + "epoch": 2.88, + "learning_rate": 1.904891696328548e-05, + "loss": 2.6379, + "step": 5745 + }, + { + "epoch": 2.89, + "learning_rate": 1.901063012658093e-05, + "loss": 2.5932, + "step": 5750 + }, + { + "epoch": 2.89, + "learning_rate": 1.897235818698969e-05, + "loss": 2.3479, + "step": 5755 + }, + { + "epoch": 2.89, + "learning_rate": 1.8934101239703973e-05, + "loss": 2.7664, + "step": 5760 + }, + { + "epoch": 2.89, + "learning_rate": 1.889585937987871e-05, + "loss": 2.6163, + "step": 5765 + }, + { + "epoch": 2.9, + "learning_rate": 1.885763270263131e-05, + "loss": 2.644, + "step": 5770 + }, + { + "epoch": 2.9, + "learning_rate": 1.881942130304142e-05, + "loss": 2.6584, + "step": 5775 + }, + { + "epoch": 2.9, + "learning_rate": 1.8781225276150675e-05, + "loss": 2.7552, + "step": 5780 + }, + { + "epoch": 2.9, + "learning_rate": 1.874304471696248e-05, + "loss": 2.3176, + "step": 5785 + }, + { + "epoch": 2.91, + "learning_rate": 1.8704879720441773e-05, + "loss": 2.9294, + "step": 5790 + }, + { + "epoch": 2.91, + "learning_rate": 1.8666730381514774e-05, + "loss": 2.5388, + "step": 5795 + }, + { + "epoch": 2.91, + "learning_rate": 
1.8628596795068776e-05, + "loss": 2.5343, + "step": 5800 + }, + { + "epoch": 2.91, + "learning_rate": 1.859047905595187e-05, + "loss": 2.5239, + "step": 5805 + }, + { + "epoch": 2.92, + "learning_rate": 1.8552377258972747e-05, + "loss": 2.382, + "step": 5810 + }, + { + "epoch": 2.92, + "learning_rate": 1.851429149890044e-05, + "loss": 2.7965, + "step": 5815 + }, + { + "epoch": 2.92, + "learning_rate": 1.8476221870464083e-05, + "loss": 2.738, + "step": 5820 + }, + { + "epoch": 2.92, + "learning_rate": 1.84381684683527e-05, + "loss": 2.7431, + "step": 5825 + }, + { + "epoch": 2.93, + "learning_rate": 1.8400131387214964e-05, + "loss": 2.6551, + "step": 5830 + }, + { + "epoch": 2.93, + "learning_rate": 1.8362110721658927e-05, + "loss": 2.5836, + "step": 5835 + }, + { + "epoch": 2.93, + "learning_rate": 1.832410656625183e-05, + "loss": 2.3278, + "step": 5840 + }, + { + "epoch": 2.93, + "learning_rate": 1.8286119015519852e-05, + "loss": 2.5348, + "step": 5845 + }, + { + "epoch": 2.94, + "learning_rate": 1.8248148163947866e-05, + "loss": 2.388, + "step": 5850 + }, + { + "epoch": 2.94, + "learning_rate": 1.8210194105979205e-05, + "loss": 2.5839, + "step": 5855 + }, + { + "epoch": 2.94, + "learning_rate": 1.817225693601543e-05, + "loss": 2.7362, + "step": 5860 + }, + { + "epoch": 2.94, + "learning_rate": 1.8134336748416115e-05, + "loss": 2.2506, + "step": 5865 + }, + { + "epoch": 2.95, + "learning_rate": 1.8096433637498574e-05, + "loss": 2.6163, + "step": 5870 + }, + { + "epoch": 2.95, + "learning_rate": 1.8058547697537655e-05, + "loss": 2.6588, + "step": 5875 + }, + { + "epoch": 2.95, + "learning_rate": 1.802067902276551e-05, + "loss": 2.6955, + "step": 5880 + }, + { + "epoch": 2.95, + "learning_rate": 1.7982827707371326e-05, + "loss": 2.5438, + "step": 5885 + }, + { + "epoch": 2.96, + "learning_rate": 1.7944993845501118e-05, + "loss": 2.6858, + "step": 5890 + }, + { + "epoch": 2.96, + "learning_rate": 1.7907177531257507e-05, + "loss": 2.5499, + "step": 5895 + }, + { + 
"epoch": 2.96, + "learning_rate": 1.7869378858699452e-05, + "loss": 2.572, + "step": 5900 + }, + { + "epoch": 2.96, + "learning_rate": 1.783159792184203e-05, + "loss": 2.6499, + "step": 5905 + }, + { + "epoch": 2.97, + "learning_rate": 1.779383481465622e-05, + "loss": 2.6526, + "step": 5910 + }, + { + "epoch": 2.97, + "learning_rate": 1.775608963106863e-05, + "loss": 2.5916, + "step": 5915 + }, + { + "epoch": 2.97, + "learning_rate": 1.7718362464961314e-05, + "loss": 2.7463, + "step": 5920 + }, + { + "epoch": 2.97, + "learning_rate": 1.76806534101715e-05, + "loss": 2.6052, + "step": 5925 + }, + { + "epoch": 2.98, + "learning_rate": 1.764296256049137e-05, + "loss": 2.4542, + "step": 5930 + }, + { + "epoch": 2.98, + "learning_rate": 1.760529000966782e-05, + "loss": 2.5414, + "step": 5935 + }, + { + "epoch": 2.98, + "learning_rate": 1.7567635851402238e-05, + "loss": 2.6181, + "step": 5940 + }, + { + "epoch": 2.98, + "learning_rate": 1.753000017935026e-05, + "loss": 2.4822, + "step": 5945 + }, + { + "epoch": 2.99, + "learning_rate": 1.7492383087121546e-05, + "loss": 2.6495, + "step": 5950 + }, + { + "epoch": 2.99, + "learning_rate": 1.7454784668279546e-05, + "loss": 2.455, + "step": 5955 + }, + { + "epoch": 2.99, + "learning_rate": 1.7417205016341258e-05, + "loss": 2.3959, + "step": 5960 + }, + { + "epoch": 2.99, + "learning_rate": 1.7379644224777004e-05, + "loss": 2.5139, + "step": 5965 + }, + { + "epoch": 3.0, + "learning_rate": 1.7342102387010194e-05, + "loss": 2.6985, + "step": 5970 + }, + { + "epoch": 3.0, + "learning_rate": 1.7304579596417104e-05, + "loss": 2.6598, + "step": 5975 + }, + { + "epoch": 3.0, + "learning_rate": 1.726707594632661e-05, + "loss": 2.6638, + "step": 5980 + }, + { + "epoch": 3.0, + "learning_rate": 1.7229591530020022e-05, + "loss": 2.446, + "step": 5985 + }, + { + "epoch": 3.01, + "learning_rate": 1.7192126440730784e-05, + "loss": 2.5736, + "step": 5990 + }, + { + "epoch": 3.01, + "learning_rate": 1.7154680771644242e-05, + "loss": 2.4385, + 
"step": 5995 + }, + { + "epoch": 3.01, + "learning_rate": 1.7117254615897497e-05, + "loss": 2.5651, + "step": 6000 + }, + { + "epoch": 3.01, + "learning_rate": 1.707984806657908e-05, + "loss": 2.5767, + "step": 6005 + }, + { + "epoch": 3.02, + "learning_rate": 1.7042461216728756e-05, + "loss": 2.5527, + "step": 6010 + }, + { + "epoch": 3.02, + "learning_rate": 1.7005094159337307e-05, + "loss": 2.4275, + "step": 6015 + }, + { + "epoch": 3.02, + "learning_rate": 1.6967746987346272e-05, + "loss": 2.5136, + "step": 6020 + }, + { + "epoch": 3.02, + "learning_rate": 1.6930419793647735e-05, + "loss": 2.5035, + "step": 6025 + }, + { + "epoch": 3.03, + "learning_rate": 1.6893112671084094e-05, + "loss": 2.3627, + "step": 6030 + }, + { + "epoch": 3.03, + "learning_rate": 1.6855825712447822e-05, + "loss": 2.5038, + "step": 6035 + }, + { + "epoch": 3.03, + "learning_rate": 1.6818559010481226e-05, + "loss": 2.3229, + "step": 6040 + }, + { + "epoch": 3.03, + "learning_rate": 1.6781312657876254e-05, + "loss": 2.3298, + "step": 6045 + }, + { + "epoch": 3.04, + "learning_rate": 1.6744086747274224e-05, + "loss": 2.5965, + "step": 6050 + }, + { + "epoch": 3.04, + "learning_rate": 1.67068813712656e-05, + "loss": 2.6745, + "step": 6055 + }, + { + "epoch": 3.04, + "learning_rate": 1.6669696622389797e-05, + "loss": 2.5358, + "step": 6060 + }, + { + "epoch": 3.04, + "learning_rate": 1.6632532593134907e-05, + "loss": 2.1572, + "step": 6065 + }, + { + "epoch": 3.05, + "learning_rate": 1.6595389375937488e-05, + "loss": 2.3413, + "step": 6070 + }, + { + "epoch": 3.05, + "learning_rate": 1.6558267063182342e-05, + "loss": 2.4968, + "step": 6075 + }, + { + "epoch": 3.05, + "learning_rate": 1.6521165747202276e-05, + "loss": 2.528, + "step": 6080 + }, + { + "epoch": 3.05, + "learning_rate": 1.6484085520277847e-05, + "loss": 2.4744, + "step": 6085 + }, + { + "epoch": 3.06, + "learning_rate": 1.6447026474637194e-05, + "loss": 2.5993, + "step": 6090 + }, + { + "epoch": 3.06, + "learning_rate": 
1.640998870245575e-05, + "loss": 2.5334, + "step": 6095 + }, + { + "epoch": 3.06, + "learning_rate": 1.637297229585604e-05, + "loss": 2.3431, + "step": 6100 + }, + { + "epoch": 3.06, + "learning_rate": 1.633597734690746e-05, + "loss": 2.3361, + "step": 6105 + }, + { + "epoch": 3.07, + "learning_rate": 1.6299003947626017e-05, + "loss": 2.4223, + "step": 6110 + }, + { + "epoch": 3.07, + "learning_rate": 1.6262052189974125e-05, + "loss": 2.6129, + "step": 6115 + }, + { + "epoch": 3.07, + "learning_rate": 1.622512216586038e-05, + "loss": 2.5798, + "step": 6120 + }, + { + "epoch": 3.07, + "learning_rate": 1.61882139671393e-05, + "loss": 2.7164, + "step": 6125 + }, + { + "epoch": 3.08, + "learning_rate": 1.6151327685611127e-05, + "loss": 2.627, + "step": 6130 + }, + { + "epoch": 3.08, + "learning_rate": 1.6114463413021612e-05, + "loss": 2.5533, + "step": 6135 + }, + { + "epoch": 3.08, + "learning_rate": 1.6077621241061725e-05, + "loss": 2.4149, + "step": 6140 + }, + { + "epoch": 3.08, + "learning_rate": 1.6040801261367493e-05, + "loss": 2.5167, + "step": 6145 + }, + { + "epoch": 3.09, + "learning_rate": 1.6004003565519734e-05, + "loss": 2.3775, + "step": 6150 + }, + { + "epoch": 3.09, + "learning_rate": 1.596722824504385e-05, + "loss": 2.7508, + "step": 6155 + }, + { + "epoch": 3.09, + "learning_rate": 1.5930475391409562e-05, + "loss": 2.2924, + "step": 6160 + }, + { + "epoch": 3.09, + "learning_rate": 1.5893745096030754e-05, + "loss": 2.6994, + "step": 6165 + }, + { + "epoch": 3.1, + "learning_rate": 1.5857037450265176e-05, + "loss": 2.4325, + "step": 6170 + }, + { + "epoch": 3.1, + "learning_rate": 1.5820352545414232e-05, + "loss": 2.7048, + "step": 6175 + }, + { + "epoch": 3.1, + "learning_rate": 1.5783690472722785e-05, + "loss": 2.5557, + "step": 6180 + }, + { + "epoch": 3.1, + "learning_rate": 1.5747051323378903e-05, + "loss": 2.5968, + "step": 6185 + }, + { + "epoch": 3.11, + "learning_rate": 1.5710435188513627e-05, + "loss": 2.8964, + "step": 6190 + }, + { + 
"epoch": 3.11, + "learning_rate": 1.5673842159200768e-05, + "loss": 2.4664, + "step": 6195 + }, + { + "epoch": 3.11, + "learning_rate": 1.5637272326456666e-05, + "loss": 2.6002, + "step": 6200 + }, + { + "epoch": 3.11, + "learning_rate": 1.560072578123995e-05, + "loss": 2.1335, + "step": 6205 + }, + { + "epoch": 3.12, + "learning_rate": 1.5564202614451352e-05, + "loss": 2.4466, + "step": 6210 + }, + { + "epoch": 3.12, + "learning_rate": 1.5527702916933436e-05, + "loss": 2.4236, + "step": 6215 + }, + { + "epoch": 3.12, + "learning_rate": 1.54912267794704e-05, + "loss": 2.6926, + "step": 6220 + }, + { + "epoch": 3.12, + "learning_rate": 1.5454774292787837e-05, + "loss": 2.6268, + "step": 6225 + }, + { + "epoch": 3.13, + "learning_rate": 1.541834554755252e-05, + "loss": 2.5849, + "step": 6230 + }, + { + "epoch": 3.13, + "learning_rate": 1.5381940634372165e-05, + "loss": 2.4988, + "step": 6235 + }, + { + "epoch": 3.13, + "learning_rate": 1.534555964379522e-05, + "loss": 2.8093, + "step": 6240 + }, + { + "epoch": 3.13, + "learning_rate": 1.5309202666310622e-05, + "loss": 2.6214, + "step": 6245 + }, + { + "epoch": 3.14, + "learning_rate": 1.5272869792347595e-05, + "loss": 2.4958, + "step": 6250 + }, + { + "epoch": 3.14, + "learning_rate": 1.5236561112275394e-05, + "loss": 2.5731, + "step": 6255 + }, + { + "epoch": 3.14, + "learning_rate": 1.5200276716403103e-05, + "loss": 2.4501, + "step": 6260 + }, + { + "epoch": 3.14, + "learning_rate": 1.5164016694979411e-05, + "loss": 2.3793, + "step": 6265 + }, + { + "epoch": 3.15, + "learning_rate": 1.5127781138192374e-05, + "loss": 2.4751, + "step": 6270 + }, + { + "epoch": 3.15, + "learning_rate": 1.5091570136169206e-05, + "loss": 2.2213, + "step": 6275 + }, + { + "epoch": 3.15, + "learning_rate": 1.505538377897604e-05, + "loss": 2.4721, + "step": 6280 + }, + { + "epoch": 3.15, + "learning_rate": 1.5019222156617712e-05, + "loss": 2.5355, + "step": 6285 + }, + { + "epoch": 3.16, + "learning_rate": 1.4983085359037547e-05, + "loss": 
2.6066, + "step": 6290 + }, + { + "epoch": 3.16, + "learning_rate": 1.4946973476117105e-05, + "loss": 2.5482, + "step": 6295 + }, + { + "epoch": 3.16, + "learning_rate": 1.4910886597675994e-05, + "loss": 2.6717, + "step": 6300 + }, + { + "epoch": 3.16, + "learning_rate": 1.4874824813471616e-05, + "loss": 2.5616, + "step": 6305 + }, + { + "epoch": 3.17, + "learning_rate": 1.4838788213198965e-05, + "loss": 2.5877, + "step": 6310 + }, + { + "epoch": 3.17, + "learning_rate": 1.48027768864904e-05, + "loss": 2.4685, + "step": 6315 + }, + { + "epoch": 3.17, + "learning_rate": 1.4766790922915405e-05, + "loss": 2.459, + "step": 6320 + }, + { + "epoch": 3.17, + "learning_rate": 1.4730830411980393e-05, + "loss": 2.4626, + "step": 6325 + }, + { + "epoch": 3.18, + "learning_rate": 1.469489544312846e-05, + "loss": 2.4486, + "step": 6330 + }, + { + "epoch": 3.18, + "learning_rate": 1.4658986105739175e-05, + "loss": 2.6828, + "step": 6335 + }, + { + "epoch": 3.18, + "learning_rate": 1.4623102489128353e-05, + "loss": 2.702, + "step": 6340 + }, + { + "epoch": 3.18, + "learning_rate": 1.4587244682547857e-05, + "loss": 2.8563, + "step": 6345 + }, + { + "epoch": 3.19, + "learning_rate": 1.4551412775185308e-05, + "loss": 2.5647, + "step": 6350 + }, + { + "epoch": 3.19, + "learning_rate": 1.4515606856163949e-05, + "loss": 2.5023, + "step": 6355 + }, + { + "epoch": 3.19, + "learning_rate": 1.4479827014542363e-05, + "loss": 2.347, + "step": 6360 + }, + { + "epoch": 3.19, + "learning_rate": 1.4444073339314284e-05, + "loss": 2.6892, + "step": 6365 + }, + { + "epoch": 3.2, + "learning_rate": 1.4408345919408359e-05, + "loss": 2.5874, + "step": 6370 + }, + { + "epoch": 3.2, + "learning_rate": 1.4372644843687922e-05, + "loss": 2.3453, + "step": 6375 + }, + { + "epoch": 3.2, + "learning_rate": 1.4336970200950794e-05, + "loss": 2.4236, + "step": 6380 + }, + { + "epoch": 3.2, + "learning_rate": 1.4301322079929053e-05, + "loss": 2.6329, + "step": 6385 + }, + { + "epoch": 3.21, + "learning_rate": 
1.4265700569288792e-05, + "loss": 2.7761, + "step": 6390 + }, + { + "epoch": 3.21, + "learning_rate": 1.4230105757629936e-05, + "loss": 2.6791, + "step": 6395 + }, + { + "epoch": 3.21, + "learning_rate": 1.4194537733485994e-05, + "loss": 2.6064, + "step": 6400 + }, + { + "epoch": 3.21, + "learning_rate": 1.4158996585323841e-05, + "loss": 2.3809, + "step": 6405 + }, + { + "epoch": 3.22, + "learning_rate": 1.4123482401543531e-05, + "loss": 2.5205, + "step": 6410 + }, + { + "epoch": 3.22, + "learning_rate": 1.4087995270478021e-05, + "loss": 2.524, + "step": 6415 + }, + { + "epoch": 3.22, + "learning_rate": 1.4052535280392999e-05, + "loss": 2.2721, + "step": 6420 + }, + { + "epoch": 3.22, + "learning_rate": 1.401710251948663e-05, + "loss": 2.5879, + "step": 6425 + }, + { + "epoch": 3.23, + "learning_rate": 1.3981697075889372e-05, + "loss": 2.6147, + "step": 6430 + }, + { + "epoch": 3.23, + "learning_rate": 1.394631903766373e-05, + "loss": 2.5308, + "step": 6435 + }, + { + "epoch": 3.23, + "learning_rate": 1.3910968492804028e-05, + "loss": 2.4739, + "step": 6440 + }, + { + "epoch": 3.23, + "learning_rate": 1.3875645529236234e-05, + "loss": 2.4483, + "step": 6445 + }, + { + "epoch": 3.24, + "learning_rate": 1.3840350234817686e-05, + "loss": 2.6367, + "step": 6450 + }, + { + "epoch": 3.24, + "learning_rate": 1.3805082697336943e-05, + "loss": 2.6567, + "step": 6455 + }, + { + "epoch": 3.24, + "learning_rate": 1.3769843004513489e-05, + "loss": 2.52, + "step": 6460 + }, + { + "epoch": 3.24, + "learning_rate": 1.3734631243997561e-05, + "loss": 2.6544, + "step": 6465 + }, + { + "epoch": 3.25, + "learning_rate": 1.3699447503369925e-05, + "loss": 2.3696, + "step": 6470 + }, + { + "epoch": 3.25, + "learning_rate": 1.3664291870141649e-05, + "loss": 2.4517, + "step": 6475 + }, + { + "epoch": 3.25, + "learning_rate": 1.3629164431753894e-05, + "loss": 2.6313, + "step": 6480 + }, + { + "epoch": 3.25, + "learning_rate": 1.3594065275577692e-05, + "loss": 2.4032, + "step": 6485 + }, + { 
+ "epoch": 3.26, + "learning_rate": 1.3558994488913731e-05, + "loss": 2.7063, + "step": 6490 + }, + { + "epoch": 3.26, + "learning_rate": 1.3523952158992136e-05, + "loss": 2.6109, + "step": 6495 + }, + { + "epoch": 3.26, + "learning_rate": 1.3488938372972257e-05, + "loss": 2.633, + "step": 6500 + }, + { + "epoch": 3.26, + "learning_rate": 1.3453953217942436e-05, + "loss": 2.5565, + "step": 6505 + }, + { + "epoch": 3.27, + "learning_rate": 1.3418996780919804e-05, + "loss": 2.5866, + "step": 6510 + }, + { + "epoch": 3.27, + "learning_rate": 1.3384069148850087e-05, + "loss": 2.5992, + "step": 6515 + }, + { + "epoch": 3.27, + "learning_rate": 1.3349170408607342e-05, + "loss": 2.4388, + "step": 6520 + }, + { + "epoch": 3.27, + "learning_rate": 1.3314300646993771e-05, + "loss": 2.2734, + "step": 6525 + }, + { + "epoch": 3.28, + "learning_rate": 1.3279459950739489e-05, + "loss": 2.7683, + "step": 6530 + }, + { + "epoch": 3.28, + "learning_rate": 1.3244648406502331e-05, + "loss": 2.3653, + "step": 6535 + }, + { + "epoch": 3.28, + "learning_rate": 1.3209866100867613e-05, + "loss": 2.6401, + "step": 6540 + }, + { + "epoch": 3.28, + "learning_rate": 1.3175113120347943e-05, + "loss": 2.5218, + "step": 6545 + }, + { + "epoch": 3.29, + "learning_rate": 1.3140389551382975e-05, + "loss": 2.4681, + "step": 6550 + }, + { + "epoch": 3.29, + "learning_rate": 1.3105695480339206e-05, + "loss": 2.4681, + "step": 6555 + }, + { + "epoch": 3.29, + "learning_rate": 1.3071030993509788e-05, + "loss": 2.5743, + "step": 6560 + }, + { + "epoch": 3.29, + "learning_rate": 1.303639617711427e-05, + "loss": 2.5423, + "step": 6565 + }, + { + "epoch": 3.3, + "learning_rate": 1.3001791117298395e-05, + "loss": 2.4267, + "step": 6570 + }, + { + "epoch": 3.3, + "learning_rate": 1.2967215900133911e-05, + "loss": 2.5537, + "step": 6575 + }, + { + "epoch": 3.3, + "learning_rate": 1.2932670611618336e-05, + "loss": 2.5451, + "step": 6580 + }, + { + "epoch": 3.3, + "learning_rate": 1.2898155337674744e-05, + 
"loss": 2.4048, + "step": 6585 + }, + { + "epoch": 3.31, + "learning_rate": 1.2863670164151551e-05, + "loss": 2.6769, + "step": 6590 + }, + { + "epoch": 3.31, + "learning_rate": 1.2829215176822316e-05, + "loss": 2.2118, + "step": 6595 + }, + { + "epoch": 3.31, + "learning_rate": 1.2794790461385508e-05, + "loss": 2.2912, + "step": 6600 + }, + { + "epoch": 3.31, + "learning_rate": 1.2760396103464309e-05, + "loss": 2.3978, + "step": 6605 + }, + { + "epoch": 3.32, + "learning_rate": 1.2726032188606388e-05, + "loss": 2.4801, + "step": 6610 + }, + { + "epoch": 3.32, + "learning_rate": 1.2691698802283697e-05, + "loss": 2.5522, + "step": 6615 + }, + { + "epoch": 3.32, + "learning_rate": 1.2657396029892258e-05, + "loss": 2.6728, + "step": 6620 + }, + { + "epoch": 3.32, + "learning_rate": 1.2623123956751943e-05, + "loss": 2.2937, + "step": 6625 + }, + { + "epoch": 3.33, + "learning_rate": 1.258888266810627e-05, + "loss": 2.5459, + "step": 6630 + }, + { + "epoch": 3.33, + "learning_rate": 1.2554672249122187e-05, + "loss": 2.6329, + "step": 6635 + }, + { + "epoch": 3.33, + "learning_rate": 1.2520492784889865e-05, + "loss": 2.4845, + "step": 6640 + }, + { + "epoch": 3.33, + "learning_rate": 1.2486344360422475e-05, + "loss": 2.5023, + "step": 6645 + }, + { + "epoch": 3.34, + "learning_rate": 1.2452227060655993e-05, + "loss": 2.5674, + "step": 6650 + }, + { + "epoch": 3.34, + "learning_rate": 1.2418140970448975e-05, + "loss": 2.4996, + "step": 6655 + }, + { + "epoch": 3.34, + "learning_rate": 1.2384086174582336e-05, + "loss": 2.4704, + "step": 6660 + }, + { + "epoch": 3.34, + "learning_rate": 1.2350062757759193e-05, + "loss": 2.4928, + "step": 6665 + }, + { + "epoch": 3.35, + "learning_rate": 1.2316070804604576e-05, + "loss": 2.5498, + "step": 6670 + }, + { + "epoch": 3.35, + "learning_rate": 1.228211039966528e-05, + "loss": 2.5641, + "step": 6675 + }, + { + "epoch": 3.35, + "learning_rate": 1.2248181627409619e-05, + "loss": 2.5725, + "step": 6680 + }, + { + "epoch": 3.35, + 
"learning_rate": 1.221428457222723e-05, + "loss": 2.5827, + "step": 6685 + }, + { + "epoch": 3.36, + "learning_rate": 1.2180419318428868e-05, + "loss": 2.3591, + "step": 6690 + }, + { + "epoch": 3.36, + "learning_rate": 1.2146585950246186e-05, + "loss": 2.5772, + "step": 6695 + }, + { + "epoch": 3.36, + "learning_rate": 1.2112784551831533e-05, + "loss": 2.4008, + "step": 6700 + }, + { + "epoch": 3.36, + "learning_rate": 1.2079015207257724e-05, + "loss": 2.3334, + "step": 6705 + }, + { + "epoch": 3.37, + "learning_rate": 1.2045278000517857e-05, + "loss": 2.8023, + "step": 6710 + }, + { + "epoch": 3.37, + "learning_rate": 1.2011573015525118e-05, + "loss": 2.6145, + "step": 6715 + }, + { + "epoch": 3.37, + "learning_rate": 1.1977900336112519e-05, + "loss": 2.6568, + "step": 6720 + }, + { + "epoch": 3.37, + "learning_rate": 1.1944260046032735e-05, + "loss": 2.1771, + "step": 6725 + }, + { + "epoch": 3.38, + "learning_rate": 1.1910652228957872e-05, + "loss": 2.4932, + "step": 6730 + }, + { + "epoch": 3.38, + "learning_rate": 1.187707696847927e-05, + "loss": 2.3883, + "step": 6735 + }, + { + "epoch": 3.38, + "learning_rate": 1.1843534348107294e-05, + "loss": 2.7792, + "step": 6740 + }, + { + "epoch": 3.38, + "learning_rate": 1.1810024451271125e-05, + "loss": 2.5825, + "step": 6745 + }, + { + "epoch": 3.39, + "learning_rate": 1.1776547361318551e-05, + "loss": 2.406, + "step": 6750 + }, + { + "epoch": 3.39, + "learning_rate": 1.1743103161515762e-05, + "loss": 2.5823, + "step": 6755 + }, + { + "epoch": 3.39, + "learning_rate": 1.1709691935047137e-05, + "loss": 2.7587, + "step": 6760 + }, + { + "epoch": 3.39, + "learning_rate": 1.1676313765015038e-05, + "loss": 2.5183, + "step": 6765 + }, + { + "epoch": 3.4, + "learning_rate": 1.1642968734439633e-05, + "loss": 2.6452, + "step": 6770 + }, + { + "epoch": 3.4, + "learning_rate": 1.1609656926258634e-05, + "loss": 2.4641, + "step": 6775 + }, + { + "epoch": 3.4, + "learning_rate": 1.1576378423327131e-05, + "loss": 2.6462, + 
"step": 6780 + }, + { + "epoch": 3.4, + "learning_rate": 1.1543133308417378e-05, + "loss": 2.5271, + "step": 6785 + }, + { + "epoch": 3.41, + "learning_rate": 1.1509921664218587e-05, + "loss": 2.4245, + "step": 6790 + }, + { + "epoch": 3.41, + "learning_rate": 1.14767435733367e-05, + "loss": 2.3622, + "step": 6795 + }, + { + "epoch": 3.41, + "learning_rate": 1.1443599118294227e-05, + "loss": 2.5564, + "step": 6800 + }, + { + "epoch": 3.41, + "learning_rate": 1.1410488381530005e-05, + "loss": 2.342, + "step": 6805 + }, + { + "epoch": 3.42, + "learning_rate": 1.1377411445399006e-05, + "loss": 2.4976, + "step": 6810 + }, + { + "epoch": 3.42, + "learning_rate": 1.1344368392172125e-05, + "loss": 2.4792, + "step": 6815 + }, + { + "epoch": 3.42, + "learning_rate": 1.1311359304036013e-05, + "loss": 2.4829, + "step": 6820 + }, + { + "epoch": 3.42, + "learning_rate": 1.1278384263092797e-05, + "loss": 2.3949, + "step": 6825 + }, + { + "epoch": 3.43, + "learning_rate": 1.124544335135995e-05, + "loss": 2.5555, + "step": 6830 + }, + { + "epoch": 3.43, + "learning_rate": 1.1212536650770041e-05, + "loss": 2.5479, + "step": 6835 + }, + { + "epoch": 3.43, + "learning_rate": 1.1179664243170554e-05, + "loss": 2.5333, + "step": 6840 + }, + { + "epoch": 3.43, + "learning_rate": 1.1146826210323677e-05, + "loss": 2.0832, + "step": 6845 + }, + { + "epoch": 3.44, + "learning_rate": 1.1114022633906096e-05, + "loss": 2.7639, + "step": 6850 + }, + { + "epoch": 3.44, + "learning_rate": 1.10812535955088e-05, + "loss": 2.663, + "step": 6855 + }, + { + "epoch": 3.44, + "learning_rate": 1.104851917663687e-05, + "loss": 2.485, + "step": 6860 + }, + { + "epoch": 3.44, + "learning_rate": 1.1015819458709279e-05, + "loss": 2.3004, + "step": 6865 + }, + { + "epoch": 3.45, + "learning_rate": 1.0983154523058687e-05, + "loss": 2.3924, + "step": 6870 + }, + { + "epoch": 3.45, + "learning_rate": 1.095052445093124e-05, + "loss": 2.4694, + "step": 6875 + }, + { + "epoch": 3.45, + "learning_rate": 
1.0917929323486398e-05, + "loss": 2.5255, + "step": 6880 + }, + { + "epoch": 3.46, + "learning_rate": 1.0885369221796657e-05, + "loss": 2.211, + "step": 6885 + }, + { + "epoch": 3.46, + "learning_rate": 1.0852844226847425e-05, + "loss": 2.5446, + "step": 6890 + }, + { + "epoch": 3.46, + "learning_rate": 1.0820354419536786e-05, + "loss": 2.778, + "step": 6895 + }, + { + "epoch": 3.46, + "learning_rate": 1.0787899880675298e-05, + "loss": 2.5628, + "step": 6900 + }, + { + "epoch": 3.47, + "learning_rate": 1.0755480690985803e-05, + "loss": 2.5333, + "step": 6905 + }, + { + "epoch": 3.47, + "learning_rate": 1.0723096931103218e-05, + "loss": 2.7511, + "step": 6910 + }, + { + "epoch": 3.47, + "learning_rate": 1.0690748681574336e-05, + "loss": 2.2807, + "step": 6915 + }, + { + "epoch": 3.47, + "learning_rate": 1.0658436022857617e-05, + "loss": 2.5652, + "step": 6920 + }, + { + "epoch": 3.48, + "learning_rate": 1.062615903532303e-05, + "loss": 2.7855, + "step": 6925 + }, + { + "epoch": 3.48, + "learning_rate": 1.0593917799251785e-05, + "loss": 2.5029, + "step": 6930 + }, + { + "epoch": 3.48, + "learning_rate": 1.0561712394836184e-05, + "loss": 2.3403, + "step": 6935 + }, + { + "epoch": 3.48, + "learning_rate": 1.0529542902179406e-05, + "loss": 2.748, + "step": 6940 + }, + { + "epoch": 3.49, + "learning_rate": 1.0497409401295303e-05, + "loss": 2.4717, + "step": 6945 + }, + { + "epoch": 3.49, + "learning_rate": 1.0465311972108214e-05, + "loss": 2.6532, + "step": 6950 + }, + { + "epoch": 3.49, + "learning_rate": 1.043325069445275e-05, + "loss": 2.3954, + "step": 6955 + }, + { + "epoch": 3.49, + "learning_rate": 1.0401225648073612e-05, + "loss": 2.4491, + "step": 6960 + }, + { + "epoch": 3.5, + "learning_rate": 1.0369236912625377e-05, + "loss": 2.8167, + "step": 6965 + }, + { + "epoch": 3.5, + "learning_rate": 1.0337284567672314e-05, + "loss": 2.4416, + "step": 6970 + }, + { + "epoch": 3.5, + "learning_rate": 1.0305368692688174e-05, + "loss": 2.4095, + "step": 6975 + }, + { + 
"epoch": 3.5, + "learning_rate": 1.0273489367056002e-05, + "loss": 2.6135, + "step": 6980 + }, + { + "epoch": 3.51, + "learning_rate": 1.0241646670067932e-05, + "loss": 2.7131, + "step": 6985 + }, + { + "epoch": 3.51, + "learning_rate": 1.0209840680924993e-05, + "loss": 2.45, + "step": 6990 + }, + { + "epoch": 3.51, + "learning_rate": 1.0178071478736914e-05, + "loss": 2.4902, + "step": 6995 + }, + { + "epoch": 3.51, + "learning_rate": 1.0146339142521926e-05, + "loss": 2.572, + "step": 7000 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 3.7024871175399014e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-7000/training_args.bin b/checkpoint-7000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-7000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475 diff --git a/checkpoint-7500/README.md b/checkpoint-7500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-7500/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** 
[More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-7500/adapter_config.json b/checkpoint-7500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-7500/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-7500/adapter_model.safetensors b/checkpoint-7500/adapter_model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..100d323572326ec6e02316e861b6cf1aa851c5d3 --- /dev/null +++ b/checkpoint-7500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cffe8636987892e7f3f95ae26e2cb0e0991dde07159cb5b0d808f9c8d0dfb790 +size 7807744 diff --git a/checkpoint-7500/optimizer.pt b/checkpoint-7500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a008a7493a9304dbe8b5465c6fe0bdfe6aeb4acb --- /dev/null +++ b/checkpoint-7500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:493b62aa9f6b830ea91473ff52ecd6764a575927a31ab7dbc5a11256a492dff6 +size 15644485 diff --git a/checkpoint-7500/rng_state.pth b/checkpoint-7500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b6e7913ea3bda6fc22803344d449323969514ead --- /dev/null +++ b/checkpoint-7500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a41f448ddee37556724620e34d74fb60abcc9fe6ace04c55105e7faebc5ad88d +size 14575 diff --git a/checkpoint-7500/scheduler.pt b/checkpoint-7500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fbdcd39687b080d2277f24504ff01f5510344bf9 --- /dev/null +++ b/checkpoint-7500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1e677f66ff90d1aa11a61c09c2fae158912436bb0d6d36c43167d0a868e4234 +size 627 diff --git a/checkpoint-7500/special_tokens_map.json b/checkpoint-7500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-7500/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false 
+ } + ] +} diff --git a/checkpoint-7500/tokenization_chatglm.py b/checkpoint-7500/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-7500/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) 
+ return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + 
def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. 
+ + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). 
+ return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-7500/tokenizer.model b/checkpoint-7500/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-7500/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-7500/tokenizer_config.json b/checkpoint-7500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ 
b/checkpoint-7500/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-7500/trainer_state.json b/checkpoint-7500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..efe71feab7f140b116d588fde8ea58ab1509ebab --- /dev/null +++ b/checkpoint-7500/trainer_state.json @@ -0,0 +1,9021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.763643206624012, + "eval_steps": 500, + "global_step": 7500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + 
"step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 
130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + 
"step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + 
"step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + 
"step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 
2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + "step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 
2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + "step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + 
"loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + "step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + 
"loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, + "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 
4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 4.8759297640025235e-05, + "loss": 2.9469, + "step": 1005 + }, + { + "epoch": 0.51, + "learning_rate": 4.874700152955661e-05, + "loss": 2.9745, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 4.8734646354147936e-05, + "loss": 2.9341, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 4.8722232144529754e-05, + "loss": 3.0511, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 4.870975893157941e-05, + "loss": 2.5396, + "step": 1025 + }, + { + "epoch": 0.52, + 
"learning_rate": 4.8697226746321004e-05, + "loss": 2.6699, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 4.868463561992532e-05, + "loss": 2.4887, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 4.867198558370977e-05, + "loss": 2.4773, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 4.865927666913825e-05, + "loss": 2.7612, + "step": 1045 + }, + { + "epoch": 0.53, + "learning_rate": 4.864650890782113e-05, + "loss": 2.9116, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 4.863368233151514e-05, + "loss": 2.8205, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 4.862079697212329e-05, + "loss": 2.6711, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 4.8607852861694804e-05, + "loss": 3.1138, + "step": 1065 + }, + { + "epoch": 0.54, + "learning_rate": 4.859485003242503e-05, + "loss": 2.5603, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 4.858178851665539e-05, + "loss": 2.7981, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 4.856866834687323e-05, + "loss": 2.507, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 4.855548955571183e-05, + "loss": 3.0315, + "step": 1085 + }, + { + "epoch": 0.55, + "learning_rate": 4.8542252175950244e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 4.852895624051326e-05, + "loss": 2.6794, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 4.851560178247132e-05, + "loss": 2.8278, + "step": 1100 + }, + { + "epoch": 0.55, + "learning_rate": 4.850218883504041e-05, + "loss": 2.6993, + "step": 1105 + }, + { + "epoch": 0.56, + "learning_rate": 4.8488717431582005e-05, + "loss": 2.875, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 4.8475187605602974e-05, + "loss": 2.6057, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 4.84615993907555e-05, + "loss": 2.847, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 4.844795282083697e-05, + "loss": 2.7652, + "step": 1125 + }, + { 
+ "epoch": 0.57, + "learning_rate": 4.843424792978997e-05, + "loss": 2.8128, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 4.842048475170209e-05, + "loss": 2.7077, + "step": 1135 + }, + { + "epoch": 0.57, + "learning_rate": 4.840666332080592e-05, + "loss": 2.7081, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 4.8392783671478934e-05, + "loss": 2.6479, + "step": 1145 + }, + { + "epoch": 0.58, + "learning_rate": 4.837884583824342e-05, + "loss": 2.667, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 4.836484985576638e-05, + "loss": 2.7514, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 4.835079575885944e-05, + "loss": 2.5058, + "step": 1160 + }, + { + "epoch": 0.58, + "learning_rate": 4.833668358247876e-05, + "loss": 2.6183, + "step": 1165 + }, + { + "epoch": 0.59, + "learning_rate": 4.8322513361725006e-05, + "loss": 2.9959, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 4.830828513184317e-05, + "loss": 2.5714, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 4.8293998928222536e-05, + "loss": 2.965, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 4.827965478639661e-05, + "loss": 2.2106, + "step": 1185 + }, + { + "epoch": 0.6, + "learning_rate": 4.8265252742042965e-05, + "loss": 2.7456, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 4.8250792830983225e-05, + "loss": 2.5694, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 4.8236275089182936e-05, + "loss": 2.6826, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 4.8221699552751465e-05, + "loss": 2.878, + "step": 1205 + }, + { + "epoch": 0.61, + "learning_rate": 4.820706625794196e-05, + "loss": 2.8191, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 4.81923752411512e-05, + "loss": 2.739, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 4.8177626538919565e-05, + "loss": 3.0544, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 4.8162820187930875e-05, + "loss": 2.8393, + 
"step": 1225 + }, + { + "epoch": 0.62, + "learning_rate": 4.814795622501237e-05, + "loss": 2.4457, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 4.813303468713456e-05, + "loss": 2.8575, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 4.8118055611411197e-05, + "loss": 2.8307, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 4.810301903509909e-05, + "loss": 2.7951, + "step": 1245 + }, + { + "epoch": 0.63, + "learning_rate": 4.8087924995598125e-05, + "loss": 2.8456, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 4.807277353045106e-05, + "loss": 2.3564, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 4.8057564677343524e-05, + "loss": 2.5076, + "step": 1260 + }, + { + "epoch": 0.63, + "learning_rate": 4.8042298474103884e-05, + "loss": 2.605, + "step": 1265 + }, + { + "epoch": 0.64, + "learning_rate": 4.8026974958703116e-05, + "loss": 2.4782, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 4.8011594169254784e-05, + "loss": 2.7193, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 4.799615614401488e-05, + "loss": 2.8284, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 4.798066092138178e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.65, + "learning_rate": 4.796510853989612e-05, + "loss": 2.7396, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 4.794949903824069e-05, + "loss": 2.7948, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 4.793383245524035e-05, + "loss": 2.7818, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 4.791810882986197e-05, + "loss": 2.7334, + "step": 1305 + }, + { + "epoch": 0.66, + "learning_rate": 4.7902328201214256e-05, + "loss": 2.4824, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 4.7886490608547727e-05, + "loss": 2.6131, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 4.7870596091254584e-05, + "loss": 2.7778, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 
4.7854644688868594e-05, + "loss": 2.5263, + "step": 1325 + }, + { + "epoch": 0.67, + "learning_rate": 4.783863644106502e-05, + "loss": 2.7825, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 4.782257138766053e-05, + "loss": 2.7902, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 4.7806449568613066e-05, + "loss": 2.8333, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 4.779027102402177e-05, + "loss": 2.856, + "step": 1345 + }, + { + "epoch": 0.68, + "learning_rate": 4.777403579412686e-05, + "loss": 2.8021, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 4.775774391930956e-05, + "loss": 2.6639, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741395440091976e-05, + "loss": 2.5226, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.772499039713702e-05, + "loss": 2.6803, + "step": 1365 + }, + { + "epoch": 0.69, + "learning_rate": 4.7708528831248274e-05, + "loss": 2.4608, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.769201078336991e-05, + "loss": 2.786, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.7675436294586586e-05, + "loss": 2.8294, + "step": 1380 + }, + { + "epoch": 0.7, + "learning_rate": 4.7658805406123356e-05, + "loss": 2.6776, + "step": 1385 + }, + { + "epoch": 0.7, + "learning_rate": 4.7642118159345544e-05, + "loss": 2.8003, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.762537459575865e-05, + "loss": 2.7796, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.7608574757008245e-05, + "loss": 2.9156, + "step": 1400 + }, + { + "epoch": 0.71, + "learning_rate": 4.7591718684879883e-05, + "loss": 3.0521, + "step": 1405 + }, + { + "epoch": 0.71, + "learning_rate": 4.7574806421298976e-05, + "loss": 2.6469, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.755783800833071e-05, + "loss": 2.8512, + "step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.754081348817991e-05, + "loss": 2.66, + "step": 1420 + }, + { + "epoch": 
0.72, + "learning_rate": 4.752373290319096e-05, + "loss": 2.6625, + "step": 1425 + }, + { + "epoch": 0.72, + "learning_rate": 4.7506596295847716e-05, + "loss": 2.6711, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 4.7489403708773346e-05, + "loss": 2.8951, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 4.747215518473026e-05, + "loss": 2.7375, + "step": 1440 + }, + { + "epoch": 0.73, + "learning_rate": 4.745485076662e-05, + "loss": 2.8037, + "step": 1445 + }, + { + "epoch": 0.73, + "learning_rate": 4.743749049748315e-05, + "loss": 2.5375, + "step": 1450 + }, + { + "epoch": 0.73, + "learning_rate": 4.742007442049918e-05, + "loss": 2.7664, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 4.7402602578986374e-05, + "loss": 2.9644, + "step": 1460 + }, + { + "epoch": 0.74, + "learning_rate": 4.738507501640175e-05, + "loss": 2.8212, + "step": 1465 + }, + { + "epoch": 0.74, + "learning_rate": 4.736749177634087e-05, + "loss": 2.7201, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 4.734985290253782e-05, + "loss": 2.7087, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 4.7332158438865035e-05, + "loss": 2.5502, + "step": 1480 + }, + { + "epoch": 0.75, + "learning_rate": 4.731440842933322e-05, + "loss": 2.7607, + "step": 1485 + }, + { + "epoch": 0.75, + "learning_rate": 4.729660291809126e-05, + "loss": 2.5601, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 4.727874194942606e-05, + "loss": 2.5562, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 4.7260825567762486e-05, + "loss": 2.5539, + "step": 1500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7242853817663204e-05, + "loss": 2.6405, + "step": 1505 + }, + { + "epoch": 0.76, + "learning_rate": 4.72248267438286e-05, + "loss": 2.9237, + "step": 1510 + }, + { + "epoch": 0.76, + "learning_rate": 4.72067443910967e-05, + "loss": 3.0453, + "step": 1515 + }, + { + "epoch": 0.76, + "learning_rate": 4.718860680444297e-05, + "loss": 2.7176, + "step": 
1520 + }, + { + "epoch": 0.77, + "learning_rate": 4.71704140289803e-05, + "loss": 2.6713, + "step": 1525 + }, + { + "epoch": 0.77, + "learning_rate": 4.715216610995883e-05, + "loss": 2.7284, + "step": 1530 + }, + { + "epoch": 0.77, + "learning_rate": 4.713386309276585e-05, + "loss": 2.5022, + "step": 1535 + }, + { + "epoch": 0.77, + "learning_rate": 4.7115505022925706e-05, + "loss": 2.8323, + "step": 1540 + }, + { + "epoch": 0.78, + "learning_rate": 4.7097091946099666e-05, + "loss": 2.7005, + "step": 1545 + }, + { + "epoch": 0.78, + "learning_rate": 4.7078623908085825e-05, + "loss": 2.8142, + "step": 1550 + }, + { + "epoch": 0.78, + "learning_rate": 4.7060100954818974e-05, + "loss": 2.8505, + "step": 1555 + }, + { + "epoch": 0.78, + "learning_rate": 4.70415231323705e-05, + "loss": 2.5892, + "step": 1560 + }, + { + "epoch": 0.79, + "learning_rate": 4.7022890486948236e-05, + "loss": 2.8951, + "step": 1565 + }, + { + "epoch": 0.79, + "learning_rate": 4.700420306489641e-05, + "loss": 2.7551, + "step": 1570 + }, + { + "epoch": 0.79, + "learning_rate": 4.698546091269547e-05, + "loss": 2.6814, + "step": 1575 + }, + { + "epoch": 0.79, + "learning_rate": 4.696666407696201e-05, + "loss": 2.8698, + "step": 1580 + }, + { + "epoch": 0.8, + "learning_rate": 4.694781260444862e-05, + "loss": 2.8389, + "step": 1585 + }, + { + "epoch": 0.8, + "learning_rate": 4.6928906542043786e-05, + "loss": 2.8353, + "step": 1590 + }, + { + "epoch": 0.8, + "learning_rate": 4.690994593677179e-05, + "loss": 2.6565, + "step": 1595 + }, + { + "epoch": 0.8, + "learning_rate": 4.689093083579256e-05, + "loss": 2.6958, + "step": 1600 + }, + { + "epoch": 0.81, + "learning_rate": 4.687186128640157e-05, + "loss": 2.6768, + "step": 1605 + }, + { + "epoch": 0.81, + "learning_rate": 4.685273733602975e-05, + "loss": 2.734, + "step": 1610 + }, + { + "epoch": 0.81, + "learning_rate": 4.6833559032243284e-05, + "loss": 2.5348, + "step": 1615 + }, + { + "epoch": 0.81, + "learning_rate": 4.6814326422743594e-05, + 
"loss": 2.7775, + "step": 1620 + }, + { + "epoch": 0.82, + "learning_rate": 4.679503955536715e-05, + "loss": 2.6578, + "step": 1625 + }, + { + "epoch": 0.82, + "learning_rate": 4.6775698478085393e-05, + "loss": 2.8792, + "step": 1630 + }, + { + "epoch": 0.82, + "learning_rate": 4.675630323900458e-05, + "loss": 2.7883, + "step": 1635 + }, + { + "epoch": 0.82, + "learning_rate": 4.67368538863657e-05, + "loss": 2.8824, + "step": 1640 + }, + { + "epoch": 0.83, + "learning_rate": 4.671735046854433e-05, + "loss": 2.6775, + "step": 1645 + }, + { + "epoch": 0.83, + "learning_rate": 4.669779303405051e-05, + "loss": 2.5658, + "step": 1650 + }, + { + "epoch": 0.83, + "learning_rate": 4.667818163152864e-05, + "loss": 2.5262, + "step": 1655 + }, + { + "epoch": 0.83, + "learning_rate": 4.665851630975736e-05, + "loss": 2.5749, + "step": 1660 + }, + { + "epoch": 0.84, + "learning_rate": 4.6638797117649424e-05, + "loss": 2.8718, + "step": 1665 + }, + { + "epoch": 0.84, + "learning_rate": 4.661902410425155e-05, + "loss": 2.8039, + "step": 1670 + }, + { + "epoch": 0.84, + "learning_rate": 4.659919731874435e-05, + "loss": 2.8089, + "step": 1675 + }, + { + "epoch": 0.84, + "learning_rate": 4.6579316810442174e-05, + "loss": 2.8144, + "step": 1680 + }, + { + "epoch": 0.85, + "learning_rate": 4.6559382628792995e-05, + "loss": 2.5963, + "step": 1685 + }, + { + "epoch": 0.85, + "learning_rate": 4.653939482337828e-05, + "loss": 2.5755, + "step": 1690 + }, + { + "epoch": 0.85, + "learning_rate": 4.651935344391286e-05, + "loss": 2.7631, + "step": 1695 + }, + { + "epoch": 0.85, + "learning_rate": 4.649925854024486e-05, + "loss": 2.8117, + "step": 1700 + }, + { + "epoch": 0.86, + "learning_rate": 4.647911016235549e-05, + "loss": 2.7352, + "step": 1705 + }, + { + "epoch": 0.86, + "learning_rate": 4.6458908360358985e-05, + "loss": 2.5572, + "step": 1710 + }, + { + "epoch": 0.86, + "learning_rate": 4.643865318450246e-05, + "loss": 2.8372, + "step": 1715 + }, + { + "epoch": 0.86, + "learning_rate": 
4.6418344685165774e-05, + "loss": 2.7892, + "step": 1720 + }, + { + "epoch": 0.87, + "learning_rate": 4.639798291286143e-05, + "loss": 2.7832, + "step": 1725 + }, + { + "epoch": 0.87, + "learning_rate": 4.637756791823442e-05, + "loss": 2.5401, + "step": 1730 + }, + { + "epoch": 0.87, + "learning_rate": 4.635709975206213e-05, + "loss": 2.9286, + "step": 1735 + }, + { + "epoch": 0.87, + "learning_rate": 4.633657846525417e-05, + "loss": 2.6474, + "step": 1740 + }, + { + "epoch": 0.88, + "learning_rate": 4.6316004108852305e-05, + "loss": 2.5047, + "step": 1745 + }, + { + "epoch": 0.88, + "learning_rate": 4.629537673403029e-05, + "loss": 2.8374, + "step": 1750 + }, + { + "epoch": 0.88, + "learning_rate": 4.627469639209373e-05, + "loss": 2.634, + "step": 1755 + }, + { + "epoch": 0.88, + "learning_rate": 4.6253963134480006e-05, + "loss": 2.6884, + "step": 1760 + }, + { + "epoch": 0.89, + "learning_rate": 4.623317701275809e-05, + "loss": 2.6334, + "step": 1765 + }, + { + "epoch": 0.89, + "learning_rate": 4.621233807862844e-05, + "loss": 2.764, + "step": 1770 + }, + { + "epoch": 0.89, + "learning_rate": 4.6191446383922886e-05, + "loss": 3.0864, + "step": 1775 + }, + { + "epoch": 0.89, + "learning_rate": 4.617050198060448e-05, + "loss": 2.7964, + "step": 1780 + }, + { + "epoch": 0.9, + "learning_rate": 4.6149504920767376e-05, + "loss": 2.4538, + "step": 1785 + }, + { + "epoch": 0.9, + "learning_rate": 4.6128455256636706e-05, + "loss": 2.7352, + "step": 1790 + }, + { + "epoch": 0.9, + "learning_rate": 4.6107353040568416e-05, + "loss": 2.6893, + "step": 1795 + }, + { + "epoch": 0.9, + "learning_rate": 4.6086198325049185e-05, + "loss": 2.8609, + "step": 1800 + }, + { + "epoch": 0.91, + "learning_rate": 4.6064991162696275e-05, + "loss": 2.5285, + "step": 1805 + }, + { + "epoch": 0.91, + "learning_rate": 4.604373160625739e-05, + "loss": 2.5707, + "step": 1810 + }, + { + "epoch": 0.91, + "learning_rate": 4.602241970861053e-05, + "loss": 2.7198, + "step": 1815 + }, + { + "epoch": 
0.91, + "learning_rate": 4.6001055522763926e-05, + "loss": 2.855, + "step": 1820 + }, + { + "epoch": 0.92, + "learning_rate": 4.597963910185582e-05, + "loss": 2.4175, + "step": 1825 + }, + { + "epoch": 0.92, + "learning_rate": 4.595817049915441e-05, + "loss": 2.5444, + "step": 1830 + }, + { + "epoch": 0.92, + "learning_rate": 4.5936649768057646e-05, + "loss": 2.6893, + "step": 1835 + }, + { + "epoch": 0.92, + "learning_rate": 4.591507696209318e-05, + "loss": 2.6725, + "step": 1840 + }, + { + "epoch": 0.93, + "learning_rate": 4.589345213491817e-05, + "loss": 2.834, + "step": 1845 + }, + { + "epoch": 0.93, + "learning_rate": 4.587177534031914e-05, + "loss": 2.8259, + "step": 1850 + }, + { + "epoch": 0.93, + "learning_rate": 4.585004663221188e-05, + "loss": 2.6146, + "step": 1855 + }, + { + "epoch": 0.93, + "learning_rate": 4.582826606464134e-05, + "loss": 2.4673, + "step": 1860 + }, + { + "epoch": 0.94, + "learning_rate": 4.5806433691781416e-05, + "loss": 2.9178, + "step": 1865 + }, + { + "epoch": 0.94, + "learning_rate": 4.578454956793487e-05, + "loss": 2.9224, + "step": 1870 + }, + { + "epoch": 0.94, + "learning_rate": 4.576261374753318e-05, + "loss": 2.7987, + "step": 1875 + }, + { + "epoch": 0.94, + "learning_rate": 4.574062628513642e-05, + "loss": 2.3848, + "step": 1880 + }, + { + "epoch": 0.95, + "learning_rate": 4.57185872354331e-05, + "loss": 2.6054, + "step": 1885 + }, + { + "epoch": 0.95, + "learning_rate": 4.569649665324003e-05, + "loss": 2.6743, + "step": 1890 + }, + { + "epoch": 0.95, + "learning_rate": 4.567435459350222e-05, + "loss": 2.9375, + "step": 1895 + }, + { + "epoch": 0.95, + "learning_rate": 4.565216111129269e-05, + "loss": 2.9372, + "step": 1900 + }, + { + "epoch": 0.96, + "learning_rate": 4.562991626181239e-05, + "loss": 2.669, + "step": 1905 + }, + { + "epoch": 0.96, + "learning_rate": 4.560762010039001e-05, + "loss": 2.7644, + "step": 1910 + }, + { + "epoch": 0.96, + "learning_rate": 4.558527268248187e-05, + "loss": 2.6886, + "step": 1915 
+ }, + { + "epoch": 0.96, + "learning_rate": 4.55628740636718e-05, + "loss": 2.8618, + "step": 1920 + }, + { + "epoch": 0.97, + "learning_rate": 4.554042429967095e-05, + "loss": 2.8411, + "step": 1925 + }, + { + "epoch": 0.97, + "learning_rate": 4.55179234463177e-05, + "loss": 2.6047, + "step": 1930 + }, + { + "epoch": 0.97, + "learning_rate": 4.5495371559577496e-05, + "loss": 2.8675, + "step": 1935 + }, + { + "epoch": 0.97, + "learning_rate": 4.547276869554271e-05, + "loss": 2.751, + "step": 1940 + }, + { + "epoch": 0.98, + "learning_rate": 4.545011491043253e-05, + "loss": 2.8638, + "step": 1945 + }, + { + "epoch": 0.98, + "learning_rate": 4.5427410260592775e-05, + "loss": 2.5092, + "step": 1950 + }, + { + "epoch": 0.98, + "learning_rate": 4.540465480249579e-05, + "loss": 2.5059, + "step": 1955 + }, + { + "epoch": 0.98, + "learning_rate": 4.5381848592740285e-05, + "loss": 2.9433, + "step": 1960 + }, + { + "epoch": 0.99, + "learning_rate": 4.535899168805121e-05, + "loss": 2.6959, + "step": 1965 + }, + { + "epoch": 0.99, + "learning_rate": 4.533608414527961e-05, + "loss": 2.552, + "step": 1970 + }, + { + "epoch": 0.99, + "learning_rate": 4.5313126021402465e-05, + "loss": 2.7169, + "step": 1975 + }, + { + "epoch": 0.99, + "learning_rate": 4.529011737352258e-05, + "loss": 2.5672, + "step": 1980 + }, + { + "epoch": 1.0, + "learning_rate": 4.526705825886841e-05, + "loss": 2.6434, + "step": 1985 + }, + { + "epoch": 1.0, + "learning_rate": 4.5243948734793947e-05, + "loss": 2.8062, + "step": 1990 + }, + { + "epoch": 1.0, + "learning_rate": 4.5220788858778556e-05, + "loss": 2.6929, + "step": 1995 + }, + { + "epoch": 1.0, + "learning_rate": 4.519757868842684e-05, + "loss": 2.5837, + "step": 2000 + }, + { + "epoch": 1.01, + "learning_rate": 4.517431828146852e-05, + "loss": 2.821, + "step": 2005 + }, + { + "epoch": 1.01, + "learning_rate": 4.515100769575824e-05, + "loss": 2.7913, + "step": 2010 + }, + { + "epoch": 1.01, + "learning_rate": 4.512764698927545e-05, + "loss": 
2.699, + "step": 2015 + }, + { + "epoch": 1.01, + "learning_rate": 4.5104236220124286e-05, + "loss": 2.6574, + "step": 2020 + }, + { + "epoch": 1.02, + "learning_rate": 4.508077544653338e-05, + "loss": 2.5909, + "step": 2025 + }, + { + "epoch": 1.02, + "learning_rate": 4.5057264726855765e-05, + "loss": 2.5695, + "step": 2030 + }, + { + "epoch": 1.02, + "learning_rate": 4.5033704119568675e-05, + "loss": 2.4937, + "step": 2035 + }, + { + "epoch": 1.02, + "learning_rate": 4.501009368327344e-05, + "loss": 2.7253, + "step": 2040 + }, + { + "epoch": 1.03, + "learning_rate": 4.4986433476695334e-05, + "loss": 2.8681, + "step": 2045 + }, + { + "epoch": 1.03, + "learning_rate": 4.496272355868341e-05, + "loss": 2.74, + "step": 2050 + }, + { + "epoch": 1.03, + "learning_rate": 4.4938963988210365e-05, + "loss": 2.7439, + "step": 2055 + }, + { + "epoch": 1.03, + "learning_rate": 4.491515482437242e-05, + "loss": 2.9539, + "step": 2060 + }, + { + "epoch": 1.04, + "learning_rate": 4.4891296126389104e-05, + "loss": 2.8165, + "step": 2065 + }, + { + "epoch": 1.04, + "learning_rate": 4.48673879536032e-05, + "loss": 2.601, + "step": 2070 + }, + { + "epoch": 1.04, + "learning_rate": 4.484343036548051e-05, + "loss": 2.5189, + "step": 2075 + }, + { + "epoch": 1.04, + "learning_rate": 4.481942342160976e-05, + "loss": 2.6276, + "step": 2080 + }, + { + "epoch": 1.05, + "learning_rate": 4.479536718170243e-05, + "loss": 2.5027, + "step": 2085 + }, + { + "epoch": 1.05, + "learning_rate": 4.477126170559262e-05, + "loss": 2.8654, + "step": 2090 + }, + { + "epoch": 1.05, + "learning_rate": 4.474710705323688e-05, + "loss": 2.8511, + "step": 2095 + }, + { + "epoch": 1.05, + "learning_rate": 4.47229032847141e-05, + "loss": 2.6129, + "step": 2100 + }, + { + "epoch": 1.06, + "learning_rate": 4.469865046022531e-05, + "loss": 2.8964, + "step": 2105 + }, + { + "epoch": 1.06, + "learning_rate": 4.4674348640093554e-05, + "loss": 2.7502, + "step": 2110 + }, + { + "epoch": 1.06, + "learning_rate": 
4.4649997884763765e-05, + "loss": 2.591, + "step": 2115 + }, + { + "epoch": 1.06, + "learning_rate": 4.462559825480257e-05, + "loss": 2.7982, + "step": 2120 + }, + { + "epoch": 1.07, + "learning_rate": 4.460114981089815e-05, + "loss": 2.576, + "step": 2125 + }, + { + "epoch": 1.07, + "learning_rate": 4.457665261386014e-05, + "loss": 2.692, + "step": 2130 + }, + { + "epoch": 1.07, + "learning_rate": 4.455210672461938e-05, + "loss": 2.5818, + "step": 2135 + }, + { + "epoch": 1.07, + "learning_rate": 4.452751220422787e-05, + "loss": 2.6792, + "step": 2140 + }, + { + "epoch": 1.08, + "learning_rate": 4.450286911385856e-05, + "loss": 2.8352, + "step": 2145 + }, + { + "epoch": 1.08, + "learning_rate": 4.4478177514805166e-05, + "loss": 2.5787, + "step": 2150 + }, + { + "epoch": 1.08, + "learning_rate": 4.4453437468482103e-05, + "loss": 2.655, + "step": 2155 + }, + { + "epoch": 1.08, + "learning_rate": 4.442864903642428e-05, + "loss": 2.7664, + "step": 2160 + }, + { + "epoch": 1.09, + "learning_rate": 4.440381228028692e-05, + "loss": 2.7231, + "step": 2165 + }, + { + "epoch": 1.09, + "learning_rate": 4.437892726184548e-05, + "loss": 2.416, + "step": 2170 + }, + { + "epoch": 1.09, + "learning_rate": 4.4353994042995446e-05, + "loss": 2.4619, + "step": 2175 + }, + { + "epoch": 1.09, + "learning_rate": 4.4329012685752183e-05, + "loss": 2.7816, + "step": 2180 + }, + { + "epoch": 1.1, + "learning_rate": 4.430398325225078e-05, + "loss": 2.8678, + "step": 2185 + }, + { + "epoch": 1.1, + "learning_rate": 4.427890580474594e-05, + "loss": 2.6007, + "step": 2190 + }, + { + "epoch": 1.1, + "learning_rate": 4.4253780405611754e-05, + "loss": 2.8195, + "step": 2195 + }, + { + "epoch": 1.1, + "learning_rate": 4.42286071173416e-05, + "loss": 2.5117, + "step": 2200 + }, + { + "epoch": 1.11, + "learning_rate": 4.4203386002547956e-05, + "loss": 2.5527, + "step": 2205 + }, + { + "epoch": 1.11, + "learning_rate": 4.417811712396226e-05, + "loss": 2.662, + "step": 2210 + }, + { + "epoch": 1.11, + 
"learning_rate": 4.415280054443477e-05, + "loss": 2.7563, + "step": 2215 + }, + { + "epoch": 1.11, + "learning_rate": 4.4127436326934354e-05, + "loss": 2.5118, + "step": 2220 + }, + { + "epoch": 1.12, + "learning_rate": 4.41020245345484e-05, + "loss": 2.7208, + "step": 2225 + }, + { + "epoch": 1.12, + "learning_rate": 4.4076565230482607e-05, + "loss": 2.649, + "step": 2230 + }, + { + "epoch": 1.12, + "learning_rate": 4.4051058478060856e-05, + "loss": 2.652, + "step": 2235 + }, + { + "epoch": 1.12, + "learning_rate": 4.402550434072505e-05, + "loss": 2.6885, + "step": 2240 + }, + { + "epoch": 1.13, + "learning_rate": 4.3999902882034935e-05, + "loss": 2.8545, + "step": 2245 + }, + { + "epoch": 1.13, + "learning_rate": 4.397425416566797e-05, + "loss": 2.9435, + "step": 2250 + }, + { + "epoch": 1.13, + "learning_rate": 4.3948558255419146e-05, + "loss": 2.6555, + "step": 2255 + }, + { + "epoch": 1.13, + "learning_rate": 4.392281521520085e-05, + "loss": 2.6108, + "step": 2260 + }, + { + "epoch": 1.14, + "learning_rate": 4.389702510904269e-05, + "loss": 2.6256, + "step": 2265 + }, + { + "epoch": 1.14, + "learning_rate": 4.387118800109133e-05, + "loss": 2.7996, + "step": 2270 + }, + { + "epoch": 1.14, + "learning_rate": 4.384530395561035e-05, + "loss": 2.6649, + "step": 2275 + }, + { + "epoch": 1.14, + "learning_rate": 4.381937303698006e-05, + "loss": 3.0073, + "step": 2280 + }, + { + "epoch": 1.15, + "learning_rate": 4.379339530969738e-05, + "loss": 2.8493, + "step": 2285 + }, + { + "epoch": 1.15, + "learning_rate": 4.3767370838375635e-05, + "loss": 2.5572, + "step": 2290 + }, + { + "epoch": 1.15, + "learning_rate": 4.374129968774443e-05, + "loss": 2.3837, + "step": 2295 + }, + { + "epoch": 1.15, + "learning_rate": 4.371518192264946e-05, + "loss": 2.4604, + "step": 2300 + }, + { + "epoch": 1.16, + "learning_rate": 4.3689017608052374e-05, + "loss": 2.8866, + "step": 2305 + }, + { + "epoch": 1.16, + "learning_rate": 4.3662806809030585e-05, + "loss": 2.8943, + "step": 2310 + 
}, + { + "epoch": 1.16, + "learning_rate": 4.3636549590777144e-05, + "loss": 2.7629, + "step": 2315 + }, + { + "epoch": 1.16, + "learning_rate": 4.361024601860054e-05, + "loss": 2.8629, + "step": 2320 + }, + { + "epoch": 1.17, + "learning_rate": 4.3583896157924574e-05, + "loss": 2.7952, + "step": 2325 + }, + { + "epoch": 1.17, + "learning_rate": 4.355750007428817e-05, + "loss": 2.2057, + "step": 2330 + }, + { + "epoch": 1.17, + "learning_rate": 4.3531057833345216e-05, + "loss": 2.9143, + "step": 2335 + }, + { + "epoch": 1.17, + "learning_rate": 4.3504569500864424e-05, + "loss": 2.5354, + "step": 2340 + }, + { + "epoch": 1.18, + "learning_rate": 4.347803514272911e-05, + "loss": 2.7451, + "step": 2345 + }, + { + "epoch": 1.18, + "learning_rate": 4.3451454824937113e-05, + "loss": 3.0827, + "step": 2350 + }, + { + "epoch": 1.18, + "learning_rate": 4.3424828613600555e-05, + "loss": 2.5842, + "step": 2355 + }, + { + "epoch": 1.18, + "learning_rate": 4.339815657494572e-05, + "loss": 2.4492, + "step": 2360 + }, + { + "epoch": 1.19, + "learning_rate": 4.3371438775312865e-05, + "loss": 2.5067, + "step": 2365 + }, + { + "epoch": 1.19, + "learning_rate": 4.334467528115608e-05, + "loss": 2.5902, + "step": 2370 + }, + { + "epoch": 1.19, + "learning_rate": 4.33178661590431e-05, + "loss": 2.8618, + "step": 2375 + }, + { + "epoch": 1.19, + "learning_rate": 4.329101147565515e-05, + "loss": 2.6831, + "step": 2380 + }, + { + "epoch": 1.2, + "learning_rate": 4.3264111297786794e-05, + "loss": 2.7531, + "step": 2385 + }, + { + "epoch": 1.2, + "learning_rate": 4.323716569234572e-05, + "loss": 2.819, + "step": 2390 + }, + { + "epoch": 1.2, + "learning_rate": 4.321017472635263e-05, + "loss": 2.6319, + "step": 2395 + }, + { + "epoch": 1.2, + "learning_rate": 4.318313846694105e-05, + "loss": 2.3078, + "step": 2400 + }, + { + "epoch": 1.21, + "learning_rate": 4.315605698135714e-05, + "loss": 2.7962, + "step": 2405 + }, + { + "epoch": 1.21, + "learning_rate": 4.312893033695958e-05, + "loss": 
2.8479, + "step": 2410 + }, + { + "epoch": 1.21, + "learning_rate": 4.310175860121936e-05, + "loss": 2.6289, + "step": 2415 + }, + { + "epoch": 1.21, + "learning_rate": 4.30745418417196e-05, + "loss": 2.7003, + "step": 2420 + }, + { + "epoch": 1.22, + "learning_rate": 4.304728012615543e-05, + "loss": 2.4947, + "step": 2425 + }, + { + "epoch": 1.22, + "learning_rate": 4.3019973522333815e-05, + "loss": 2.6911, + "step": 2430 + }, + { + "epoch": 1.22, + "learning_rate": 4.2992622098173334e-05, + "loss": 2.6931, + "step": 2435 + }, + { + "epoch": 1.22, + "learning_rate": 4.296522592170406e-05, + "loss": 2.774, + "step": 2440 + }, + { + "epoch": 1.23, + "learning_rate": 4.293778506106737e-05, + "loss": 2.759, + "step": 2445 + }, + { + "epoch": 1.23, + "learning_rate": 4.29102995845158e-05, + "loss": 2.5543, + "step": 2450 + }, + { + "epoch": 1.23, + "learning_rate": 4.288276956041284e-05, + "loss": 2.7702, + "step": 2455 + }, + { + "epoch": 1.23, + "learning_rate": 4.285519505723278e-05, + "loss": 2.835, + "step": 2460 + }, + { + "epoch": 1.24, + "learning_rate": 4.282757614356055e-05, + "loss": 2.6516, + "step": 2465 + }, + { + "epoch": 1.24, + "learning_rate": 4.2799912888091544e-05, + "loss": 2.7392, + "step": 2470 + }, + { + "epoch": 1.24, + "learning_rate": 4.277220535963143e-05, + "loss": 2.6522, + "step": 2475 + }, + { + "epoch": 1.24, + "learning_rate": 4.274445362709601e-05, + "loss": 2.6514, + "step": 2480 + }, + { + "epoch": 1.25, + "learning_rate": 4.271665775951104e-05, + "loss": 2.7507, + "step": 2485 + }, + { + "epoch": 1.25, + "learning_rate": 4.2688817826012005e-05, + "loss": 3.0499, + "step": 2490 + }, + { + "epoch": 1.25, + "learning_rate": 4.2660933895844055e-05, + "loss": 2.6927, + "step": 2495 + }, + { + "epoch": 1.25, + "learning_rate": 4.2633006038361736e-05, + "loss": 2.8456, + "step": 2500 + }, + { + "epoch": 1.26, + "learning_rate": 4.2605034323028844e-05, + "loss": 2.671, + "step": 2505 + }, + { + "epoch": 1.26, + "learning_rate": 
4.2577018819418296e-05, + "loss": 2.6543, + "step": 2510 + }, + { + "epoch": 1.26, + "learning_rate": 4.254895959721189e-05, + "loss": 2.566, + "step": 2515 + }, + { + "epoch": 1.26, + "learning_rate": 4.252085672620019e-05, + "loss": 2.4943, + "step": 2520 + }, + { + "epoch": 1.27, + "learning_rate": 4.249271027628228e-05, + "loss": 2.5115, + "step": 2525 + }, + { + "epoch": 1.27, + "learning_rate": 4.24645203174657e-05, + "loss": 2.3859, + "step": 2530 + }, + { + "epoch": 1.27, + "learning_rate": 4.243628691986617e-05, + "loss": 2.8148, + "step": 2535 + }, + { + "epoch": 1.27, + "learning_rate": 4.240801015370743e-05, + "loss": 2.5597, + "step": 2540 + }, + { + "epoch": 1.28, + "learning_rate": 4.2379690089321145e-05, + "loss": 2.805, + "step": 2545 + }, + { + "epoch": 1.28, + "learning_rate": 4.235132679714664e-05, + "loss": 2.7308, + "step": 2550 + }, + { + "epoch": 1.28, + "learning_rate": 4.232292034773076e-05, + "loss": 2.675, + "step": 2555 + }, + { + "epoch": 1.28, + "learning_rate": 4.2294470811727704e-05, + "loss": 2.6359, + "step": 2560 + }, + { + "epoch": 1.29, + "learning_rate": 4.226597825989883e-05, + "loss": 2.4288, + "step": 2565 + }, + { + "epoch": 1.29, + "learning_rate": 4.223744276311249e-05, + "loss": 2.642, + "step": 2570 + }, + { + "epoch": 1.29, + "learning_rate": 4.220886439234385e-05, + "loss": 2.7375, + "step": 2575 + }, + { + "epoch": 1.29, + "learning_rate": 4.218024321867472e-05, + "loss": 2.6114, + "step": 2580 + }, + { + "epoch": 1.3, + "learning_rate": 4.2151579313293364e-05, + "loss": 2.4941, + "step": 2585 + }, + { + "epoch": 1.3, + "learning_rate": 4.212287274749434e-05, + "loss": 2.6086, + "step": 2590 + }, + { + "epoch": 1.3, + "learning_rate": 4.2094123592678295e-05, + "loss": 2.7854, + "step": 2595 + }, + { + "epoch": 1.3, + "learning_rate": 4.206533192035184e-05, + "loss": 2.8291, + "step": 2600 + }, + { + "epoch": 1.31, + "learning_rate": 4.2036497802127294e-05, + "loss": 2.6846, + "step": 2605 + }, + { + "epoch": 1.31, + 
"learning_rate": 4.200762130972259e-05, + "loss": 2.6667, + "step": 2610 + }, + { + "epoch": 1.31, + "learning_rate": 4.197870251496103e-05, + "loss": 2.7895, + "step": 2615 + }, + { + "epoch": 1.31, + "learning_rate": 4.1949741489771155e-05, + "loss": 2.6958, + "step": 2620 + }, + { + "epoch": 1.32, + "learning_rate": 4.192073830618652e-05, + "loss": 2.8536, + "step": 2625 + }, + { + "epoch": 1.32, + "learning_rate": 4.189169303634555e-05, + "loss": 2.5218, + "step": 2630 + }, + { + "epoch": 1.32, + "learning_rate": 4.186260575249136e-05, + "loss": 2.6141, + "step": 2635 + }, + { + "epoch": 1.32, + "learning_rate": 4.1833476526971546e-05, + "loss": 2.6231, + "step": 2640 + }, + { + "epoch": 1.33, + "learning_rate": 4.1804305432238036e-05, + "loss": 2.8229, + "step": 2645 + }, + { + "epoch": 1.33, + "learning_rate": 4.1775092540846885e-05, + "loss": 2.4763, + "step": 2650 + }, + { + "epoch": 1.33, + "learning_rate": 4.174583792545813e-05, + "loss": 2.5162, + "step": 2655 + }, + { + "epoch": 1.33, + "learning_rate": 4.1716541658835574e-05, + "loss": 2.7301, + "step": 2660 + }, + { + "epoch": 1.34, + "learning_rate": 4.16872038138466e-05, + "loss": 2.6055, + "step": 2665 + }, + { + "epoch": 1.34, + "learning_rate": 4.165782446346203e-05, + "loss": 2.6529, + "step": 2670 + }, + { + "epoch": 1.34, + "learning_rate": 4.162840368075591e-05, + "loss": 2.7934, + "step": 2675 + }, + { + "epoch": 1.34, + "learning_rate": 4.159894153890536e-05, + "loss": 2.9108, + "step": 2680 + }, + { + "epoch": 1.35, + "learning_rate": 4.1569438111190326e-05, + "loss": 2.6543, + "step": 2685 + }, + { + "epoch": 1.35, + "learning_rate": 4.1539893470993496e-05, + "loss": 2.6038, + "step": 2690 + }, + { + "epoch": 1.35, + "learning_rate": 4.151030769180002e-05, + "loss": 2.5741, + "step": 2695 + }, + { + "epoch": 1.35, + "learning_rate": 4.14806808471974e-05, + "loss": 2.8525, + "step": 2700 + }, + { + "epoch": 1.36, + "learning_rate": 4.1451013010875275e-05, + "loss": 2.5151, + "step": 2705 + 
}, + { + "epoch": 1.36, + "learning_rate": 4.1421304256625206e-05, + "loss": 2.5666, + "step": 2710 + }, + { + "epoch": 1.36, + "learning_rate": 4.139155465834058e-05, + "loss": 2.7287, + "step": 2715 + }, + { + "epoch": 1.36, + "learning_rate": 4.136176429001634e-05, + "loss": 2.8867, + "step": 2720 + }, + { + "epoch": 1.37, + "learning_rate": 4.133193322574885e-05, + "loss": 2.6398, + "step": 2725 + }, + { + "epoch": 1.37, + "learning_rate": 4.130206153973568e-05, + "loss": 2.699, + "step": 2730 + }, + { + "epoch": 1.37, + "learning_rate": 4.1272149306275447e-05, + "loss": 2.5248, + "step": 2735 + }, + { + "epoch": 1.37, + "learning_rate": 4.124219659976762e-05, + "loss": 2.8947, + "step": 2740 + }, + { + "epoch": 1.38, + "learning_rate": 4.1212203494712344e-05, + "loss": 2.6473, + "step": 2745 + }, + { + "epoch": 1.38, + "learning_rate": 4.1182170065710226e-05, + "loss": 2.3734, + "step": 2750 + }, + { + "epoch": 1.38, + "learning_rate": 4.115209638746218e-05, + "loss": 2.685, + "step": 2755 + }, + { + "epoch": 1.39, + "learning_rate": 4.1121982534769224e-05, + "loss": 2.9712, + "step": 2760 + }, + { + "epoch": 1.39, + "learning_rate": 4.109182858253231e-05, + "loss": 2.6578, + "step": 2765 + }, + { + "epoch": 1.39, + "learning_rate": 4.106163460575212e-05, + "loss": 2.6842, + "step": 2770 + }, + { + "epoch": 1.39, + "learning_rate": 4.10314006795289e-05, + "loss": 2.511, + "step": 2775 + }, + { + "epoch": 1.4, + "learning_rate": 4.100112687906224e-05, + "loss": 2.8426, + "step": 2780 + }, + { + "epoch": 1.4, + "learning_rate": 4.097081327965092e-05, + "loss": 2.8462, + "step": 2785 + }, + { + "epoch": 1.4, + "learning_rate": 4.094045995669271e-05, + "loss": 2.7552, + "step": 2790 + }, + { + "epoch": 1.4, + "learning_rate": 4.091006698568419e-05, + "loss": 2.3802, + "step": 2795 + }, + { + "epoch": 1.41, + "learning_rate": 4.087963444222054e-05, + "loss": 2.3967, + "step": 2800 + }, + { + "epoch": 1.41, + "learning_rate": 4.084916240199537e-05, + "loss": 2.7945, 
+ "step": 2805 + }, + { + "epoch": 1.41, + "learning_rate": 4.0818650940800524e-05, + "loss": 2.5304, + "step": 2810 + }, + { + "epoch": 1.41, + "learning_rate": 4.0788100134525925e-05, + "loss": 2.6779, + "step": 2815 + }, + { + "epoch": 1.42, + "learning_rate": 4.075751005915932e-05, + "loss": 2.6202, + "step": 2820 + }, + { + "epoch": 1.42, + "learning_rate": 4.072688079078616e-05, + "loss": 2.5915, + "step": 2825 + }, + { + "epoch": 1.42, + "learning_rate": 4.069621240558935e-05, + "loss": 2.5139, + "step": 2830 + }, + { + "epoch": 1.42, + "learning_rate": 4.066550497984911e-05, + "loss": 2.5506, + "step": 2835 + }, + { + "epoch": 1.43, + "learning_rate": 4.063475858994276e-05, + "loss": 2.7154, + "step": 2840 + }, + { + "epoch": 1.43, + "learning_rate": 4.060397331234452e-05, + "loss": 2.6309, + "step": 2845 + }, + { + "epoch": 1.43, + "learning_rate": 4.0573149223625365e-05, + "loss": 2.6922, + "step": 2850 + }, + { + "epoch": 1.43, + "learning_rate": 4.054228640045276e-05, + "loss": 2.703, + "step": 2855 + }, + { + "epoch": 1.44, + "learning_rate": 4.051138491959053e-05, + "loss": 2.3475, + "step": 2860 + }, + { + "epoch": 1.44, + "learning_rate": 4.048044485789869e-05, + "loss": 2.7186, + "step": 2865 + }, + { + "epoch": 1.44, + "learning_rate": 4.044946629233316e-05, + "loss": 2.4667, + "step": 2870 + }, + { + "epoch": 1.44, + "learning_rate": 4.041844929994566e-05, + "loss": 2.562, + "step": 2875 + }, + { + "epoch": 1.45, + "learning_rate": 4.038739395788347e-05, + "loss": 2.4116, + "step": 2880 + }, + { + "epoch": 1.45, + "learning_rate": 4.0356300343389276e-05, + "loss": 2.6133, + "step": 2885 + }, + { + "epoch": 1.45, + "learning_rate": 4.032516853380094e-05, + "loss": 2.6893, + "step": 2890 + }, + { + "epoch": 1.45, + "learning_rate": 4.029399860655132e-05, + "loss": 2.8196, + "step": 2895 + }, + { + "epoch": 1.46, + "learning_rate": 4.026279063916811e-05, + "loss": 2.8933, + "step": 2900 + }, + { + "epoch": 1.46, + "learning_rate": 
4.023154470927361e-05, + "loss": 3.0289, + "step": 2905 + }, + { + "epoch": 1.46, + "learning_rate": 4.0200260894584516e-05, + "loss": 2.5586, + "step": 2910 + }, + { + "epoch": 1.46, + "learning_rate": 4.016893927291179e-05, + "loss": 2.785, + "step": 2915 + }, + { + "epoch": 1.47, + "learning_rate": 4.0137579922160395e-05, + "loss": 2.5293, + "step": 2920 + }, + { + "epoch": 1.47, + "learning_rate": 4.010618292032917e-05, + "loss": 2.6696, + "step": 2925 + }, + { + "epoch": 1.47, + "learning_rate": 4.007474834551058e-05, + "loss": 2.7003, + "step": 2930 + }, + { + "epoch": 1.47, + "learning_rate": 4.004327627589056e-05, + "loss": 2.6005, + "step": 2935 + }, + { + "epoch": 1.48, + "learning_rate": 4.001176678974828e-05, + "loss": 2.7198, + "step": 2940 + }, + { + "epoch": 1.48, + "learning_rate": 3.998021996545599e-05, + "loss": 2.5329, + "step": 2945 + }, + { + "epoch": 1.48, + "learning_rate": 3.994863588147881e-05, + "loss": 2.7051, + "step": 2950 + }, + { + "epoch": 1.48, + "learning_rate": 3.9917014616374535e-05, + "loss": 2.9277, + "step": 2955 + }, + { + "epoch": 1.49, + "learning_rate": 3.988535624879344e-05, + "loss": 2.3729, + "step": 2960 + }, + { + "epoch": 1.49, + "learning_rate": 3.985366085747808e-05, + "loss": 2.5122, + "step": 2965 + }, + { + "epoch": 1.49, + "learning_rate": 3.982192852126309e-05, + "loss": 2.4413, + "step": 2970 + }, + { + "epoch": 1.49, + "learning_rate": 3.979015931907501e-05, + "loss": 2.6104, + "step": 2975 + }, + { + "epoch": 1.5, + "learning_rate": 3.975835332993207e-05, + "loss": 2.8855, + "step": 2980 + }, + { + "epoch": 1.5, + "learning_rate": 3.9726510632944e-05, + "loss": 2.7487, + "step": 2985 + }, + { + "epoch": 1.5, + "learning_rate": 3.969463130731183e-05, + "loss": 2.7085, + "step": 2990 + }, + { + "epoch": 1.5, + "learning_rate": 3.966271543232769e-05, + "loss": 2.6464, + "step": 2995 + }, + { + "epoch": 1.51, + "learning_rate": 3.9630763087374625e-05, + "loss": 2.4105, + "step": 3000 + }, + { + "epoch": 1.51, + 
"learning_rate": 3.9598774351926394e-05, + "loss": 2.7296, + "step": 3005 + }, + { + "epoch": 1.51, + "learning_rate": 3.956674930554725e-05, + "loss": 2.5319, + "step": 3010 + }, + { + "epoch": 1.51, + "learning_rate": 3.9534688027891785e-05, + "loss": 2.5544, + "step": 3015 + }, + { + "epoch": 1.52, + "learning_rate": 3.9502590598704696e-05, + "loss": 2.7981, + "step": 3020 + }, + { + "epoch": 1.52, + "learning_rate": 3.94704570978206e-05, + "loss": 2.6186, + "step": 3025 + }, + { + "epoch": 1.52, + "learning_rate": 3.943828760516382e-05, + "loss": 2.6443, + "step": 3030 + }, + { + "epoch": 1.52, + "learning_rate": 3.940608220074822e-05, + "loss": 2.4564, + "step": 3035 + }, + { + "epoch": 1.53, + "learning_rate": 3.9373840964676976e-05, + "loss": 2.8394, + "step": 3040 + }, + { + "epoch": 1.53, + "learning_rate": 3.934156397714238e-05, + "loss": 2.3683, + "step": 3045 + }, + { + "epoch": 1.53, + "learning_rate": 3.930925131842567e-05, + "loss": 2.875, + "step": 3050 + }, + { + "epoch": 1.53, + "learning_rate": 3.9276903068896784e-05, + "loss": 2.6381, + "step": 3055 + }, + { + "epoch": 1.54, + "learning_rate": 3.9244519309014206e-05, + "loss": 2.8077, + "step": 3060 + }, + { + "epoch": 1.54, + "learning_rate": 3.9212100119324706e-05, + "loss": 2.6208, + "step": 3065 + }, + { + "epoch": 1.54, + "learning_rate": 3.917964558046322e-05, + "loss": 2.9, + "step": 3070 + }, + { + "epoch": 1.54, + "learning_rate": 3.9147155773152586e-05, + "loss": 2.5678, + "step": 3075 + }, + { + "epoch": 1.55, + "learning_rate": 3.911463077820336e-05, + "loss": 2.7059, + "step": 3080 + }, + { + "epoch": 1.55, + "learning_rate": 3.90820706765136e-05, + "loss": 2.7433, + "step": 3085 + }, + { + "epoch": 1.55, + "learning_rate": 3.9049475549068757e-05, + "loss": 2.5916, + "step": 3090 + }, + { + "epoch": 1.55, + "learning_rate": 3.901684547694132e-05, + "loss": 2.6978, + "step": 3095 + }, + { + "epoch": 1.56, + "learning_rate": 3.8984180541290724e-05, + "loss": 2.8709, + "step": 3100 + 
}, + { + "epoch": 1.56, + "learning_rate": 3.895148082336313e-05, + "loss": 2.8652, + "step": 3105 + }, + { + "epoch": 1.56, + "learning_rate": 3.89187464044912e-05, + "loss": 3.0331, + "step": 3110 + }, + { + "epoch": 1.56, + "learning_rate": 3.8885977366093905e-05, + "loss": 2.7334, + "step": 3115 + }, + { + "epoch": 1.57, + "learning_rate": 3.885317378967633e-05, + "loss": 2.6026, + "step": 3120 + }, + { + "epoch": 1.57, + "learning_rate": 3.882033575682945e-05, + "loss": 2.831, + "step": 3125 + }, + { + "epoch": 1.57, + "learning_rate": 3.878746334922996e-05, + "loss": 2.5585, + "step": 3130 + }, + { + "epoch": 1.57, + "learning_rate": 3.875455664864005e-05, + "loss": 2.762, + "step": 3135 + }, + { + "epoch": 1.58, + "learning_rate": 3.8721615736907205e-05, + "loss": 2.769, + "step": 3140 + }, + { + "epoch": 1.58, + "learning_rate": 3.868864069596399e-05, + "loss": 2.6496, + "step": 3145 + }, + { + "epoch": 1.58, + "learning_rate": 3.8655631607827876e-05, + "loss": 2.486, + "step": 3150 + }, + { + "epoch": 1.58, + "learning_rate": 3.8622588554601e-05, + "loss": 2.8387, + "step": 3155 + }, + { + "epoch": 1.59, + "learning_rate": 3.858951161847001e-05, + "loss": 2.816, + "step": 3160 + }, + { + "epoch": 1.59, + "learning_rate": 3.855640088170579e-05, + "loss": 2.8128, + "step": 3165 + }, + { + "epoch": 1.59, + "learning_rate": 3.8523256426663313e-05, + "loss": 2.5875, + "step": 3170 + }, + { + "epoch": 1.59, + "learning_rate": 3.8490078335781423e-05, + "loss": 2.5853, + "step": 3175 + }, + { + "epoch": 1.6, + "learning_rate": 3.845686669158263e-05, + "loss": 2.7638, + "step": 3180 + }, + { + "epoch": 1.6, + "learning_rate": 3.842362157667287e-05, + "loss": 2.8281, + "step": 3185 + }, + { + "epoch": 1.6, + "learning_rate": 3.839700144139754e-05, + "loss": 2.3574, + "step": 3190 + }, + { + "epoch": 1.6, + "learning_rate": 3.836369628764067e-05, + "loss": 2.5533, + "step": 3195 + }, + { + "epoch": 1.61, + "learning_rate": 3.833035789491177e-05, + "loss": 2.4513, + 
"step": 3200 + }, + { + "epoch": 1.61, + "learning_rate": 3.8296986346132036e-05, + "loss": 2.567, + "step": 3205 + }, + { + "epoch": 1.61, + "learning_rate": 3.826358172430516e-05, + "loss": 2.6501, + "step": 3210 + }, + { + "epoch": 1.61, + "learning_rate": 3.823014411251708e-05, + "loss": 2.6927, + "step": 3215 + }, + { + "epoch": 1.62, + "learning_rate": 3.819667359393578e-05, + "loss": 2.6094, + "step": 3220 + }, + { + "epoch": 1.62, + "learning_rate": 3.816317025181111e-05, + "loss": 2.6776, + "step": 3225 + }, + { + "epoch": 1.62, + "learning_rate": 3.8129634169474563e-05, + "loss": 2.5833, + "step": 3230 + }, + { + "epoch": 1.62, + "learning_rate": 3.809606543033905e-05, + "loss": 2.6483, + "step": 3235 + }, + { + "epoch": 1.63, + "learning_rate": 3.8062464117898724e-05, + "loss": 2.4814, + "step": 3240 + }, + { + "epoch": 1.63, + "learning_rate": 3.802883031572874e-05, + "loss": 2.5217, + "step": 3245 + }, + { + "epoch": 1.63, + "learning_rate": 3.799516410748506e-05, + "loss": 2.5127, + "step": 3250 + }, + { + "epoch": 1.63, + "learning_rate": 3.796146557690428e-05, + "loss": 2.9325, + "step": 3255 + }, + { + "epoch": 1.64, + "learning_rate": 3.792773480780335e-05, + "loss": 2.4567, + "step": 3260 + }, + { + "epoch": 1.64, + "learning_rate": 3.789397188407944e-05, + "loss": 2.6045, + "step": 3265 + }, + { + "epoch": 1.64, + "learning_rate": 3.786017688970967e-05, + "loss": 2.5031, + "step": 3270 + }, + { + "epoch": 1.64, + "learning_rate": 3.782634990875094e-05, + "loss": 2.7692, + "step": 3275 + }, + { + "epoch": 1.65, + "learning_rate": 3.779249102533972e-05, + "loss": 2.6893, + "step": 3280 + }, + { + "epoch": 1.65, + "learning_rate": 3.7758600323691806e-05, + "loss": 2.604, + "step": 3285 + }, + { + "epoch": 1.65, + "learning_rate": 3.7724677888102145e-05, + "loss": 2.6633, + "step": 3290 + }, + { + "epoch": 1.65, + "learning_rate": 3.769072380294463e-05, + "loss": 2.4804, + "step": 3295 + }, + { + "epoch": 1.66, + "learning_rate": 
3.765673815267184e-05, + "loss": 2.5951, + "step": 3300 + }, + { + "epoch": 1.66, + "learning_rate": 3.76227210218149e-05, + "loss": 2.598, + "step": 3305 + }, + { + "epoch": 1.66, + "learning_rate": 3.758867249498321e-05, + "loss": 2.8652, + "step": 3310 + }, + { + "epoch": 1.66, + "learning_rate": 3.7554592656864285e-05, + "loss": 2.5312, + "step": 3315 + }, + { + "epoch": 1.67, + "learning_rate": 3.752048159222349e-05, + "loss": 2.7432, + "step": 3320 + }, + { + "epoch": 1.67, + "learning_rate": 3.748633938590388e-05, + "loss": 2.64, + "step": 3325 + }, + { + "epoch": 1.67, + "learning_rate": 3.745216612282596e-05, + "loss": 2.751, + "step": 3330 + }, + { + "epoch": 1.67, + "learning_rate": 3.741796188798747e-05, + "loss": 2.7636, + "step": 3335 + }, + { + "epoch": 1.68, + "learning_rate": 3.738372676646321e-05, + "loss": 2.8103, + "step": 3340 + }, + { + "epoch": 1.68, + "learning_rate": 3.7349460843404796e-05, + "loss": 2.6672, + "step": 3345 + }, + { + "epoch": 1.68, + "learning_rate": 3.731516420404043e-05, + "loss": 2.5272, + "step": 3350 + }, + { + "epoch": 1.68, + "learning_rate": 3.728083693367474e-05, + "loss": 2.6674, + "step": 3355 + }, + { + "epoch": 1.69, + "learning_rate": 3.724647911768854e-05, + "loss": 2.708, + "step": 3360 + }, + { + "epoch": 1.69, + "learning_rate": 3.72120908415386e-05, + "loss": 2.3902, + "step": 3365 + }, + { + "epoch": 1.69, + "learning_rate": 3.7177672190757476e-05, + "loss": 2.8144, + "step": 3370 + }, + { + "epoch": 1.69, + "learning_rate": 3.714322325095325e-05, + "loss": 2.8525, + "step": 3375 + }, + { + "epoch": 1.7, + "learning_rate": 3.7108744107809364e-05, + "loss": 2.4671, + "step": 3380 + }, + { + "epoch": 1.7, + "learning_rate": 3.7074234847084363e-05, + "loss": 2.3723, + "step": 3385 + }, + { + "epoch": 1.7, + "learning_rate": 3.703969555461173e-05, + "loss": 2.8825, + "step": 3390 + }, + { + "epoch": 1.7, + "learning_rate": 3.700512631629961e-05, + "loss": 2.768, + "step": 3395 + }, + { + "epoch": 1.71, + 
"learning_rate": 3.6970527218130644e-05, + "loss": 2.4601, + "step": 3400 + }, + { + "epoch": 1.71, + "learning_rate": 3.693589834616176e-05, + "loss": 2.7482, + "step": 3405 + }, + { + "epoch": 1.71, + "learning_rate": 3.6901239786523914e-05, + "loss": 2.4536, + "step": 3410 + }, + { + "epoch": 1.71, + "learning_rate": 3.686655162542192e-05, + "loss": 2.571, + "step": 3415 + }, + { + "epoch": 1.72, + "learning_rate": 3.683183394913422e-05, + "loss": 2.5796, + "step": 3420 + }, + { + "epoch": 1.72, + "learning_rate": 3.6797086844012654e-05, + "loss": 2.7312, + "step": 3425 + }, + { + "epoch": 1.72, + "learning_rate": 3.676231039648227e-05, + "loss": 2.5584, + "step": 3430 + }, + { + "epoch": 1.72, + "learning_rate": 3.67275046930411e-05, + "loss": 2.6298, + "step": 3435 + }, + { + "epoch": 1.73, + "learning_rate": 3.669266982025993e-05, + "loss": 2.628, + "step": 3440 + }, + { + "epoch": 1.73, + "learning_rate": 3.6657805864782116e-05, + "loss": 2.5943, + "step": 3445 + }, + { + "epoch": 1.73, + "learning_rate": 3.662291291332333e-05, + "loss": 2.6121, + "step": 3450 + }, + { + "epoch": 1.73, + "learning_rate": 3.658799105267138e-05, + "loss": 2.6501, + "step": 3455 + }, + { + "epoch": 1.74, + "learning_rate": 3.655304036968597e-05, + "loss": 2.5708, + "step": 3460 + }, + { + "epoch": 1.74, + "learning_rate": 3.651806095129849e-05, + "loss": 2.8039, + "step": 3465 + }, + { + "epoch": 1.74, + "learning_rate": 3.648305288451183e-05, + "loss": 2.8159, + "step": 3470 + }, + { + "epoch": 1.74, + "learning_rate": 3.64480162564001e-05, + "loss": 2.6506, + "step": 3475 + }, + { + "epoch": 1.75, + "learning_rate": 3.641295115410847e-05, + "loss": 2.8982, + "step": 3480 + }, + { + "epoch": 1.75, + "learning_rate": 3.637785766485291e-05, + "loss": 2.8018, + "step": 3485 + }, + { + "epoch": 1.75, + "learning_rate": 3.634273587592003e-05, + "loss": 2.5278, + "step": 3490 + }, + { + "epoch": 1.75, + "learning_rate": 3.630758587466681e-05, + "loss": 2.8937, + "step": 3495 + }, + 
{ + "epoch": 1.76, + "learning_rate": 3.6272407748520394e-05, + "loss": 2.1922, + "step": 3500 + }, + { + "epoch": 1.76, + "learning_rate": 3.623720158497789e-05, + "loss": 2.4288, + "step": 3505 + }, + { + "epoch": 1.76, + "learning_rate": 3.620196747160613e-05, + "loss": 2.9423, + "step": 3510 + }, + { + "epoch": 1.76, + "learning_rate": 3.61667054960415e-05, + "loss": 2.5559, + "step": 3515 + }, + { + "epoch": 1.77, + "learning_rate": 3.613141574598965e-05, + "loss": 2.7228, + "step": 3520 + }, + { + "epoch": 1.77, + "learning_rate": 3.6096098309225325e-05, + "loss": 2.6933, + "step": 3525 + }, + { + "epoch": 1.77, + "learning_rate": 3.606075327359212e-05, + "loss": 2.5865, + "step": 3530 + }, + { + "epoch": 1.77, + "learning_rate": 3.602538072700229e-05, + "loss": 2.5846, + "step": 3535 + }, + { + "epoch": 1.78, + "learning_rate": 3.598998075743653e-05, + "loss": 2.8847, + "step": 3540 + }, + { + "epoch": 1.78, + "learning_rate": 3.595455345294372e-05, + "loss": 2.5386, + "step": 3545 + }, + { + "epoch": 1.78, + "learning_rate": 3.5919098901640735e-05, + "loss": 2.5932, + "step": 3550 + }, + { + "epoch": 1.78, + "learning_rate": 3.588361719171223e-05, + "loss": 2.4481, + "step": 3555 + }, + { + "epoch": 1.79, + "learning_rate": 3.584810841141039e-05, + "loss": 2.5773, + "step": 3560 + }, + { + "epoch": 1.79, + "learning_rate": 3.581257264905476e-05, + "loss": 2.6305, + "step": 3565 + }, + { + "epoch": 1.79, + "learning_rate": 3.5777009993031955e-05, + "loss": 2.7, + "step": 3570 + }, + { + "epoch": 1.79, + "learning_rate": 3.574142053179553e-05, + "loss": 2.4408, + "step": 3575 + }, + { + "epoch": 1.8, + "learning_rate": 3.5705804353865677e-05, + "loss": 2.7292, + "step": 3580 + }, + { + "epoch": 1.8, + "learning_rate": 3.567016154782906e-05, + "loss": 2.4371, + "step": 3585 + }, + { + "epoch": 1.8, + "learning_rate": 3.563449220233855e-05, + "loss": 2.7977, + "step": 3590 + }, + { + "epoch": 1.8, + "learning_rate": 3.559879640611305e-05, + "loss": 2.7357, + 
"step": 3595 + }, + { + "epoch": 1.81, + "learning_rate": 3.5563074247937244e-05, + "loss": 2.6255, + "step": 3600 + }, + { + "epoch": 1.81, + "learning_rate": 3.552732581666139e-05, + "loss": 2.8134, + "step": 3605 + }, + { + "epoch": 1.81, + "learning_rate": 3.549155120120109e-05, + "loss": 2.702, + "step": 3610 + }, + { + "epoch": 1.81, + "learning_rate": 3.545575049053707e-05, + "loss": 2.4854, + "step": 3615 + }, + { + "epoch": 1.82, + "learning_rate": 3.541992377371497e-05, + "loss": 2.7596, + "step": 3620 + }, + { + "epoch": 1.82, + "learning_rate": 3.53840711398451e-05, + "loss": 2.6891, + "step": 3625 + }, + { + "epoch": 1.82, + "learning_rate": 3.5348192678102254e-05, + "loss": 2.5619, + "step": 3630 + }, + { + "epoch": 1.82, + "learning_rate": 3.531228847772544e-05, + "loss": 2.5719, + "step": 3635 + }, + { + "epoch": 1.83, + "learning_rate": 3.52763586280177e-05, + "loss": 2.6297, + "step": 3640 + }, + { + "epoch": 1.83, + "learning_rate": 3.524040321834589e-05, + "loss": 2.5114, + "step": 3645 + }, + { + "epoch": 1.83, + "learning_rate": 3.52044223381404e-05, + "loss": 2.8536, + "step": 3650 + }, + { + "epoch": 1.83, + "learning_rate": 3.516841607689501e-05, + "loss": 2.5271, + "step": 3655 + }, + { + "epoch": 1.84, + "learning_rate": 3.51323845241666e-05, + "loss": 2.5654, + "step": 3660 + }, + { + "epoch": 1.84, + "learning_rate": 3.509632776957497e-05, + "loss": 2.6269, + "step": 3665 + }, + { + "epoch": 1.84, + "learning_rate": 3.50602459028026e-05, + "loss": 2.4401, + "step": 3670 + }, + { + "epoch": 1.84, + "learning_rate": 3.502413901359445e-05, + "loss": 2.4368, + "step": 3675 + }, + { + "epoch": 1.85, + "learning_rate": 3.498800719175768e-05, + "loss": 2.6087, + "step": 3680 + }, + { + "epoch": 1.85, + "learning_rate": 3.4951850527161495e-05, + "loss": 2.5927, + "step": 3685 + }, + { + "epoch": 1.85, + "learning_rate": 3.4915669109736876e-05, + "loss": 2.8352, + "step": 3690 + }, + { + "epoch": 1.85, + "learning_rate": 3.487946302947637e-05, + 
"loss": 2.8485, + "step": 3695 + }, + { + "epoch": 1.86, + "learning_rate": 3.484323237643389e-05, + "loss": 2.6611, + "step": 3700 + }, + { + "epoch": 1.86, + "learning_rate": 3.4806977240724423e-05, + "loss": 2.5251, + "step": 3705 + }, + { + "epoch": 1.86, + "learning_rate": 3.47706977125239e-05, + "loss": 2.6043, + "step": 3710 + }, + { + "epoch": 1.86, + "learning_rate": 3.473439388206887e-05, + "loss": 2.5911, + "step": 3715 + }, + { + "epoch": 1.87, + "learning_rate": 3.469806583965639e-05, + "loss": 2.7092, + "step": 3720 + }, + { + "epoch": 1.87, + "learning_rate": 3.466171367564368e-05, + "loss": 2.4572, + "step": 3725 + }, + { + "epoch": 1.87, + "learning_rate": 3.462533748044801e-05, + "loss": 2.464, + "step": 3730 + }, + { + "epoch": 1.87, + "learning_rate": 3.458893734454636e-05, + "loss": 2.8654, + "step": 3735 + }, + { + "epoch": 1.88, + "learning_rate": 3.455251335847531e-05, + "loss": 2.752, + "step": 3740 + }, + { + "epoch": 1.88, + "learning_rate": 3.451606561283074e-05, + "loss": 2.59, + "step": 3745 + }, + { + "epoch": 1.88, + "learning_rate": 3.447959419826763e-05, + "loss": 2.4732, + "step": 3750 + }, + { + "epoch": 1.88, + "learning_rate": 3.444309920549983e-05, + "loss": 2.7611, + "step": 3755 + }, + { + "epoch": 1.89, + "learning_rate": 3.440658072529983e-05, + "loss": 2.5923, + "step": 3760 + }, + { + "epoch": 1.89, + "learning_rate": 3.437003884849854e-05, + "loss": 2.5405, + "step": 3765 + }, + { + "epoch": 1.89, + "learning_rate": 3.433347366598506e-05, + "loss": 2.6368, + "step": 3770 + }, + { + "epoch": 1.89, + "learning_rate": 3.429688526870649e-05, + "loss": 2.5877, + "step": 3775 + }, + { + "epoch": 1.9, + "learning_rate": 3.4260273747667635e-05, + "loss": 2.5978, + "step": 3780 + }, + { + "epoch": 1.9, + "learning_rate": 3.422363919393082e-05, + "loss": 2.7375, + "step": 3785 + }, + { + "epoch": 1.9, + "learning_rate": 3.418698169861567e-05, + "loss": 2.4161, + "step": 3790 + }, + { + "epoch": 1.9, + "learning_rate": 
3.415030135289884e-05, + "loss": 2.3959, + "step": 3795 + }, + { + "epoch": 1.91, + "learning_rate": 3.4113598248013875e-05, + "loss": 2.9642, + "step": 3800 + }, + { + "epoch": 1.91, + "learning_rate": 3.40768724752509e-05, + "loss": 2.7313, + "step": 3805 + }, + { + "epoch": 1.91, + "learning_rate": 3.404012412595639e-05, + "loss": 2.6311, + "step": 3810 + }, + { + "epoch": 1.91, + "learning_rate": 3.400335329153304e-05, + "loss": 2.6268, + "step": 3815 + }, + { + "epoch": 1.92, + "learning_rate": 3.396656006343939e-05, + "loss": 2.5252, + "step": 3820 + }, + { + "epoch": 1.92, + "learning_rate": 3.392974453318975e-05, + "loss": 2.5863, + "step": 3825 + }, + { + "epoch": 1.92, + "learning_rate": 3.3892906792353886e-05, + "loss": 2.4848, + "step": 3830 + }, + { + "epoch": 1.92, + "learning_rate": 3.3856046932556775e-05, + "loss": 2.5517, + "step": 3835 + }, + { + "epoch": 1.93, + "learning_rate": 3.3819165045478426e-05, + "loss": 2.6679, + "step": 3840 + }, + { + "epoch": 1.93, + "learning_rate": 3.3782261222853654e-05, + "loss": 2.4474, + "step": 3845 + }, + { + "epoch": 1.93, + "learning_rate": 3.37453355564718e-05, + "loss": 2.5378, + "step": 3850 + }, + { + "epoch": 1.93, + "learning_rate": 3.3708388138176584e-05, + "loss": 2.8168, + "step": 3855 + }, + { + "epoch": 1.94, + "learning_rate": 3.367141905986578e-05, + "loss": 2.6556, + "step": 3860 + }, + { + "epoch": 1.94, + "learning_rate": 3.3634428413491054e-05, + "loss": 2.5122, + "step": 3865 + }, + { + "epoch": 1.94, + "learning_rate": 3.359741629105773e-05, + "loss": 2.7044, + "step": 3870 + }, + { + "epoch": 1.94, + "learning_rate": 3.3560382784624524e-05, + "loss": 2.6193, + "step": 3875 + }, + { + "epoch": 1.95, + "learning_rate": 3.352332798630336e-05, + "loss": 3.0051, + "step": 3880 + }, + { + "epoch": 1.95, + "learning_rate": 3.3486251988259134e-05, + "loss": 2.5351, + "step": 3885 + }, + { + "epoch": 1.95, + "learning_rate": 3.344915488270942e-05, + "loss": 2.7273, + "step": 3890 + }, + { + 
"epoch": 1.95, + "learning_rate": 3.341203676192433e-05, + "loss": 2.3833, + "step": 3895 + }, + { + "epoch": 1.96, + "learning_rate": 3.3374897718226236e-05, + "loss": 2.5073, + "step": 3900 + }, + { + "epoch": 1.96, + "learning_rate": 3.333773784398957e-05, + "loss": 2.7216, + "step": 3905 + }, + { + "epoch": 1.96, + "learning_rate": 3.330055723164055e-05, + "loss": 2.6666, + "step": 3910 + }, + { + "epoch": 1.96, + "learning_rate": 3.326335597365698e-05, + "loss": 2.4959, + "step": 3915 + }, + { + "epoch": 1.97, + "learning_rate": 3.322613416256802e-05, + "loss": 2.8799, + "step": 3920 + }, + { + "epoch": 1.97, + "learning_rate": 3.3188891890953956e-05, + "loss": 2.6041, + "step": 3925 + }, + { + "epoch": 1.97, + "learning_rate": 3.315162925144595e-05, + "loss": 2.7423, + "step": 3930 + }, + { + "epoch": 1.97, + "learning_rate": 3.3114346336725834e-05, + "loss": 2.5765, + "step": 3935 + }, + { + "epoch": 1.98, + "learning_rate": 3.3077043239525874e-05, + "loss": 2.152, + "step": 3940 + }, + { + "epoch": 1.98, + "learning_rate": 3.303972005262852e-05, + "loss": 2.6474, + "step": 3945 + }, + { + "epoch": 1.98, + "learning_rate": 3.3002376868866206e-05, + "loss": 2.473, + "step": 3950 + }, + { + "epoch": 1.98, + "learning_rate": 3.296501378112109e-05, + "loss": 2.6779, + "step": 3955 + }, + { + "epoch": 1.99, + "learning_rate": 3.292763088232485e-05, + "loss": 2.7439, + "step": 3960 + }, + { + "epoch": 1.99, + "learning_rate": 3.289022826545844e-05, + "loss": 2.8874, + "step": 3965 + }, + { + "epoch": 1.99, + "learning_rate": 3.2852806023551834e-05, + "loss": 2.7191, + "step": 3970 + }, + { + "epoch": 1.99, + "learning_rate": 3.281536424968383e-05, + "loss": 2.563, + "step": 3975 + }, + { + "epoch": 2.0, + "learning_rate": 3.2777903036981836e-05, + "loss": 2.7884, + "step": 3980 + }, + { + "epoch": 2.0, + "learning_rate": 3.274042247862158e-05, + "loss": 2.5221, + "step": 3985 + }, + { + "epoch": 2.0, + "learning_rate": 3.270292266782689e-05, + "loss": 2.462, + 
"step": 3990 + }, + { + "epoch": 2.0, + "learning_rate": 3.266540369786953e-05, + "loss": 2.5059, + "step": 3995 + }, + { + "epoch": 2.01, + "learning_rate": 3.262786566206888e-05, + "loss": 2.688, + "step": 4000 + }, + { + "epoch": 2.01, + "learning_rate": 3.259030865379174e-05, + "loss": 2.7194, + "step": 4005 + }, + { + "epoch": 2.01, + "learning_rate": 3.255273276645214e-05, + "loss": 2.4145, + "step": 4010 + }, + { + "epoch": 2.01, + "learning_rate": 3.251513809351101e-05, + "loss": 2.5469, + "step": 4015 + }, + { + "epoch": 2.02, + "learning_rate": 3.247752472847605e-05, + "loss": 2.7916, + "step": 4020 + }, + { + "epoch": 2.02, + "learning_rate": 3.243989276490143e-05, + "loss": 2.5973, + "step": 4025 + }, + { + "epoch": 2.02, + "learning_rate": 3.240224229638758e-05, + "loss": 2.6732, + "step": 4030 + }, + { + "epoch": 2.02, + "learning_rate": 3.236457341658097e-05, + "loss": 2.2159, + "step": 4035 + }, + { + "epoch": 2.03, + "learning_rate": 3.232688621917386e-05, + "loss": 2.6783, + "step": 4040 + }, + { + "epoch": 2.03, + "learning_rate": 3.2289180797904055e-05, + "loss": 2.8193, + "step": 4045 + }, + { + "epoch": 2.03, + "learning_rate": 3.22514572465547e-05, + "loss": 2.3885, + "step": 4050 + }, + { + "epoch": 2.03, + "learning_rate": 3.221371565895404e-05, + "loss": 2.6288, + "step": 4055 + }, + { + "epoch": 2.04, + "learning_rate": 3.217595612897516e-05, + "loss": 2.5372, + "step": 4060 + }, + { + "epoch": 2.04, + "learning_rate": 3.21381787505358e-05, + "loss": 2.7257, + "step": 4065 + }, + { + "epoch": 2.04, + "learning_rate": 3.210038361759807e-05, + "loss": 2.6329, + "step": 4070 + }, + { + "epoch": 2.04, + "learning_rate": 3.206257082416825e-05, + "loss": 2.6518, + "step": 4075 + }, + { + "epoch": 2.05, + "learning_rate": 3.2024740464296544e-05, + "loss": 2.4218, + "step": 4080 + }, + { + "epoch": 2.05, + "learning_rate": 3.198689263207686e-05, + "loss": 2.7266, + "step": 4085 + }, + { + "epoch": 2.05, + "learning_rate": 3.1949027421646546e-05, 
+ "loss": 2.4466, + "step": 4090 + }, + { + "epoch": 2.05, + "learning_rate": 3.1911144927186185e-05, + "loss": 2.6014, + "step": 4095 + }, + { + "epoch": 2.06, + "learning_rate": 3.1873245242919354e-05, + "loss": 2.7072, + "step": 4100 + }, + { + "epoch": 2.06, + "learning_rate": 3.183532846311236e-05, + "loss": 2.5211, + "step": 4105 + }, + { + "epoch": 2.06, + "learning_rate": 3.179739468207406e-05, + "loss": 2.5787, + "step": 4110 + }, + { + "epoch": 2.06, + "learning_rate": 3.175944399415559e-05, + "loss": 2.5141, + "step": 4115 + }, + { + "epoch": 2.07, + "learning_rate": 3.1721476493750134e-05, + "loss": 2.6807, + "step": 4120 + }, + { + "epoch": 2.07, + "learning_rate": 3.1683492275292695e-05, + "loss": 2.5611, + "step": 4125 + }, + { + "epoch": 2.07, + "learning_rate": 3.164549143325985e-05, + "loss": 2.5864, + "step": 4130 + }, + { + "epoch": 2.08, + "learning_rate": 3.160747406216953e-05, + "loss": 2.7378, + "step": 4135 + }, + { + "epoch": 2.08, + "learning_rate": 3.156944025658079e-05, + "loss": 2.7334, + "step": 4140 + }, + { + "epoch": 2.08, + "learning_rate": 3.153139011109354e-05, + "loss": 2.4465, + "step": 4145 + }, + { + "epoch": 2.08, + "learning_rate": 3.149332372034834e-05, + "loss": 2.5482, + "step": 4150 + }, + { + "epoch": 2.09, + "learning_rate": 3.145524117902617e-05, + "loss": 2.7633, + "step": 4155 + }, + { + "epoch": 2.09, + "learning_rate": 3.141714258184816e-05, + "loss": 2.6796, + "step": 4160 + }, + { + "epoch": 2.09, + "learning_rate": 3.137902802357538e-05, + "loss": 2.6622, + "step": 4165 + }, + { + "epoch": 2.09, + "learning_rate": 3.134089759900861e-05, + "loss": 2.9277, + "step": 4170 + }, + { + "epoch": 2.1, + "learning_rate": 3.130275140298808e-05, + "loss": 2.351, + "step": 4175 + }, + { + "epoch": 2.1, + "learning_rate": 3.1264589530393263e-05, + "loss": 2.654, + "step": 4180 + }, + { + "epoch": 2.1, + "learning_rate": 3.12264120761426e-05, + "loss": 2.6229, + "step": 4185 + }, + { + "epoch": 2.1, + "learning_rate": 
3.118821913519333e-05, + "loss": 2.6415, + "step": 4190 + }, + { + "epoch": 2.11, + "learning_rate": 3.115001080254115e-05, + "loss": 2.6029, + "step": 4195 + }, + { + "epoch": 2.11, + "learning_rate": 3.1111787173220095e-05, + "loss": 2.6775, + "step": 4200 + }, + { + "epoch": 2.11, + "learning_rate": 3.107354834230223e-05, + "loss": 2.7393, + "step": 4205 + }, + { + "epoch": 2.11, + "learning_rate": 3.1035294404897396e-05, + "loss": 2.5886, + "step": 4210 + }, + { + "epoch": 2.12, + "learning_rate": 3.099702545615307e-05, + "loss": 2.3812, + "step": 4215 + }, + { + "epoch": 2.12, + "learning_rate": 3.0958741591254026e-05, + "loss": 3.1271, + "step": 4220 + }, + { + "epoch": 2.12, + "learning_rate": 3.0920442905422145e-05, + "loss": 2.7758, + "step": 4225 + }, + { + "epoch": 2.12, + "learning_rate": 3.0882129493916167e-05, + "loss": 2.7682, + "step": 4230 + }, + { + "epoch": 2.13, + "learning_rate": 3.0843801452031466e-05, + "loss": 2.2826, + "step": 4235 + }, + { + "epoch": 2.13, + "learning_rate": 3.0805458875099804e-05, + "loss": 2.5934, + "step": 4240 + }, + { + "epoch": 2.13, + "learning_rate": 3.0767101858489103e-05, + "loss": 2.7693, + "step": 4245 + }, + { + "epoch": 2.13, + "learning_rate": 3.072873049760319e-05, + "loss": 2.6955, + "step": 4250 + }, + { + "epoch": 2.14, + "learning_rate": 3.0690344887881565e-05, + "loss": 2.7344, + "step": 4255 + }, + { + "epoch": 2.14, + "learning_rate": 3.0651945124799185e-05, + "loss": 2.6215, + "step": 4260 + }, + { + "epoch": 2.14, + "learning_rate": 3.061353130386619e-05, + "loss": 2.5794, + "step": 4265 + }, + { + "epoch": 2.14, + "learning_rate": 3.05751035206277e-05, + "loss": 2.671, + "step": 4270 + }, + { + "epoch": 2.15, + "learning_rate": 3.0536661870663576e-05, + "loss": 2.7265, + "step": 4275 + }, + { + "epoch": 2.15, + "learning_rate": 3.0498206449588136e-05, + "loss": 2.9018, + "step": 4280 + }, + { + "epoch": 2.15, + "learning_rate": 3.045973735304996e-05, + "loss": 2.4823, + "step": 4285 + }, + { + 
"epoch": 2.15, + "learning_rate": 3.0421254676731664e-05, + "loss": 2.5454, + "step": 4290 + }, + { + "epoch": 2.16, + "learning_rate": 3.0382758516349623e-05, + "loss": 2.7888, + "step": 4295 + }, + { + "epoch": 2.16, + "learning_rate": 3.0344248967653754e-05, + "loss": 2.596, + "step": 4300 + }, + { + "epoch": 2.16, + "learning_rate": 3.030572612642727e-05, + "loss": 2.7905, + "step": 4305 + }, + { + "epoch": 2.16, + "learning_rate": 3.0267190088486452e-05, + "loss": 2.5676, + "step": 4310 + }, + { + "epoch": 2.17, + "learning_rate": 3.0228640949680388e-05, + "loss": 2.5836, + "step": 4315 + }, + { + "epoch": 2.17, + "learning_rate": 3.0190078805890786e-05, + "loss": 2.7138, + "step": 4320 + }, + { + "epoch": 2.17, + "learning_rate": 3.015150375303168e-05, + "loss": 2.4747, + "step": 4325 + }, + { + "epoch": 2.17, + "learning_rate": 3.011291588704919e-05, + "loss": 2.5188, + "step": 4330 + }, + { + "epoch": 2.18, + "learning_rate": 3.0074315303921353e-05, + "loss": 2.2952, + "step": 4335 + }, + { + "epoch": 2.18, + "learning_rate": 3.0035702099657787e-05, + "loss": 2.4875, + "step": 4340 + }, + { + "epoch": 2.18, + "learning_rate": 2.9997076370299544e-05, + "loss": 2.5085, + "step": 4345 + }, + { + "epoch": 2.18, + "learning_rate": 2.9958438211918805e-05, + "loss": 2.7452, + "step": 4350 + }, + { + "epoch": 2.19, + "learning_rate": 2.9919787720618676e-05, + "loss": 2.334, + "step": 4355 + }, + { + "epoch": 2.19, + "learning_rate": 2.9881124992532933e-05, + "loss": 2.6904, + "step": 4360 + }, + { + "epoch": 2.19, + "learning_rate": 2.984245012382579e-05, + "loss": 2.5706, + "step": 4365 + }, + { + "epoch": 2.19, + "learning_rate": 2.980376321069165e-05, + "loss": 2.6657, + "step": 4370 + }, + { + "epoch": 2.2, + "learning_rate": 2.976506434935489e-05, + "loss": 2.4117, + "step": 4375 + }, + { + "epoch": 2.2, + "learning_rate": 2.9726353636069582e-05, + "loss": 2.4446, + "step": 4380 + }, + { + "epoch": 2.2, + "learning_rate": 2.968763116711931e-05, + "loss": 
2.5681, + "step": 4385 + }, + { + "epoch": 2.2, + "learning_rate": 2.964889703881686e-05, + "loss": 2.7244, + "step": 4390 + }, + { + "epoch": 2.21, + "learning_rate": 2.9610151347504044e-05, + "loss": 2.577, + "step": 4395 + }, + { + "epoch": 2.21, + "learning_rate": 2.957139418955143e-05, + "loss": 2.6176, + "step": 4400 + }, + { + "epoch": 2.21, + "learning_rate": 2.9532625661358105e-05, + "loss": 2.5708, + "step": 4405 + }, + { + "epoch": 2.21, + "learning_rate": 2.949384585935143e-05, + "loss": 2.4873, + "step": 4410 + }, + { + "epoch": 2.22, + "learning_rate": 2.9455054879986797e-05, + "loss": 2.5953, + "step": 4415 + }, + { + "epoch": 2.22, + "learning_rate": 2.941625281974743e-05, + "loss": 2.5578, + "step": 4420 + }, + { + "epoch": 2.22, + "learning_rate": 2.93774397751441e-05, + "loss": 2.5782, + "step": 4425 + }, + { + "epoch": 2.22, + "learning_rate": 2.9338615842714883e-05, + "loss": 2.8227, + "step": 4430 + }, + { + "epoch": 2.23, + "learning_rate": 2.9299781119024956e-05, + "loss": 2.4317, + "step": 4435 + }, + { + "epoch": 2.23, + "learning_rate": 2.926093570066633e-05, + "loss": 2.7054, + "step": 4440 + }, + { + "epoch": 2.23, + "learning_rate": 2.9222079684257608e-05, + "loss": 2.7207, + "step": 4445 + }, + { + "epoch": 2.23, + "learning_rate": 2.9183213166443778e-05, + "loss": 2.6732, + "step": 4450 + }, + { + "epoch": 2.24, + "learning_rate": 2.9144336243895927e-05, + "loss": 2.4768, + "step": 4455 + }, + { + "epoch": 2.24, + "learning_rate": 2.910544901331101e-05, + "loss": 2.5146, + "step": 4460 + }, + { + "epoch": 2.24, + "learning_rate": 2.9066551571411645e-05, + "loss": 2.4941, + "step": 4465 + }, + { + "epoch": 2.24, + "learning_rate": 2.902764401494584e-05, + "loss": 2.1425, + "step": 4470 + }, + { + "epoch": 2.25, + "learning_rate": 2.898872644068676e-05, + "loss": 2.6335, + "step": 4475 + }, + { + "epoch": 2.25, + "learning_rate": 2.8949798945432483e-05, + "loss": 2.4093, + "step": 4480 + }, + { + "epoch": 2.25, + "learning_rate": 
2.8910861626005776e-05, + "loss": 2.4006, + "step": 4485 + }, + { + "epoch": 2.25, + "learning_rate": 2.8871914579253824e-05, + "loss": 2.4762, + "step": 4490 + }, + { + "epoch": 2.26, + "learning_rate": 2.8832957902048014e-05, + "loss": 2.6309, + "step": 4495 + }, + { + "epoch": 2.26, + "learning_rate": 2.8793991691283684e-05, + "loss": 2.7734, + "step": 4500 + }, + { + "epoch": 2.26, + "learning_rate": 2.87550160438799e-05, + "loss": 2.5444, + "step": 4505 + }, + { + "epoch": 2.26, + "learning_rate": 2.871603105677917e-05, + "loss": 2.5277, + "step": 4510 + }, + { + "epoch": 2.27, + "learning_rate": 2.8677036826947264e-05, + "loss": 2.5775, + "step": 4515 + }, + { + "epoch": 2.27, + "learning_rate": 2.863803345137291e-05, + "loss": 2.4524, + "step": 4520 + }, + { + "epoch": 2.27, + "learning_rate": 2.8599021027067608e-05, + "loss": 2.572, + "step": 4525 + }, + { + "epoch": 2.27, + "learning_rate": 2.855999965106536e-05, + "loss": 2.8112, + "step": 4530 + }, + { + "epoch": 2.28, + "learning_rate": 2.8520969420422427e-05, + "loss": 2.65, + "step": 4535 + }, + { + "epoch": 2.28, + "learning_rate": 2.8481930432217096e-05, + "loss": 2.4675, + "step": 4540 + }, + { + "epoch": 2.28, + "learning_rate": 2.844288278354943e-05, + "loss": 2.668, + "step": 4545 + }, + { + "epoch": 2.28, + "learning_rate": 2.8403826571541046e-05, + "loss": 2.7135, + "step": 4550 + }, + { + "epoch": 2.29, + "learning_rate": 2.8364761893334858e-05, + "loss": 2.5212, + "step": 4555 + }, + { + "epoch": 2.29, + "learning_rate": 2.8325688846094817e-05, + "loss": 2.698, + "step": 4560 + }, + { + "epoch": 2.29, + "learning_rate": 2.828660752700572e-05, + "loss": 2.6517, + "step": 4565 + }, + { + "epoch": 2.29, + "learning_rate": 2.8247518033272924e-05, + "loss": 2.6687, + "step": 4570 + }, + { + "epoch": 2.3, + "learning_rate": 2.8208420462122105e-05, + "loss": 2.4934, + "step": 4575 + }, + { + "epoch": 2.3, + "learning_rate": 2.816931491079906e-05, + "loss": 2.3762, + "step": 4580 + }, + { + "epoch": 
2.3, + "learning_rate": 2.8130201476569413e-05, + "loss": 2.6873, + "step": 4585 + }, + { + "epoch": 2.3, + "learning_rate": 2.8091080256718398e-05, + "loss": 2.6981, + "step": 4590 + }, + { + "epoch": 2.31, + "learning_rate": 2.805195134855061e-05, + "loss": 2.6814, + "step": 4595 + }, + { + "epoch": 2.31, + "learning_rate": 2.8012814849389785e-05, + "loss": 2.6641, + "step": 4600 + }, + { + "epoch": 2.31, + "learning_rate": 2.797367085657852e-05, + "loss": 2.488, + "step": 4605 + }, + { + "epoch": 2.31, + "learning_rate": 2.793451946747806e-05, + "loss": 2.4358, + "step": 4610 + }, + { + "epoch": 2.32, + "learning_rate": 2.7895360779468044e-05, + "loss": 2.7458, + "step": 4615 + }, + { + "epoch": 2.32, + "learning_rate": 2.785619488994627e-05, + "loss": 2.7558, + "step": 4620 + }, + { + "epoch": 2.32, + "learning_rate": 2.7817021896328427e-05, + "loss": 2.6906, + "step": 4625 + }, + { + "epoch": 2.32, + "learning_rate": 2.7777841896047914e-05, + "loss": 2.5384, + "step": 4630 + }, + { + "epoch": 2.33, + "learning_rate": 2.7738654986555523e-05, + "loss": 2.8354, + "step": 4635 + }, + { + "epoch": 2.33, + "learning_rate": 2.7699461265319242e-05, + "loss": 2.6352, + "step": 4640 + }, + { + "epoch": 2.33, + "learning_rate": 2.7660260829824003e-05, + "loss": 2.5602, + "step": 4645 + }, + { + "epoch": 2.33, + "learning_rate": 2.7621053777571425e-05, + "loss": 2.7109, + "step": 4650 + }, + { + "epoch": 2.34, + "learning_rate": 2.7581840206079616e-05, + "loss": 2.7317, + "step": 4655 + }, + { + "epoch": 2.34, + "learning_rate": 2.754262021288287e-05, + "loss": 2.6874, + "step": 4660 + }, + { + "epoch": 2.34, + "learning_rate": 2.750339389553146e-05, + "loss": 2.5854, + "step": 4665 + }, + { + "epoch": 2.34, + "learning_rate": 2.7464161351591393e-05, + "loss": 2.4774, + "step": 4670 + }, + { + "epoch": 2.35, + "learning_rate": 2.7424922678644172e-05, + "loss": 2.3301, + "step": 4675 + }, + { + "epoch": 2.35, + "learning_rate": 2.7385677974286517e-05, + "loss": 2.8886, + 
"step": 4680 + }, + { + "epoch": 2.35, + "learning_rate": 2.7346427336130175e-05, + "loss": 2.4401, + "step": 4685 + }, + { + "epoch": 2.35, + "learning_rate": 2.7307170861801644e-05, + "loss": 2.4788, + "step": 4690 + }, + { + "epoch": 2.36, + "learning_rate": 2.7267908648941938e-05, + "loss": 2.4079, + "step": 4695 + }, + { + "epoch": 2.36, + "learning_rate": 2.7228640795206344e-05, + "loss": 2.3763, + "step": 4700 + }, + { + "epoch": 2.36, + "learning_rate": 2.718936739826417e-05, + "loss": 2.4751, + "step": 4705 + }, + { + "epoch": 2.36, + "learning_rate": 2.7150088555798537e-05, + "loss": 2.5827, + "step": 4710 + }, + { + "epoch": 2.37, + "learning_rate": 2.7110804365506083e-05, + "loss": 2.8181, + "step": 4715 + }, + { + "epoch": 2.37, + "learning_rate": 2.7071514925096762e-05, + "loss": 2.5951, + "step": 4720 + }, + { + "epoch": 2.37, + "learning_rate": 2.703222033229359e-05, + "loss": 2.8812, + "step": 4725 + }, + { + "epoch": 2.37, + "learning_rate": 2.6992920684832374e-05, + "loss": 2.8793, + "step": 4730 + }, + { + "epoch": 2.38, + "learning_rate": 2.6953616080461526e-05, + "loss": 2.7234, + "step": 4735 + }, + { + "epoch": 2.38, + "learning_rate": 2.6914306616941764e-05, + "loss": 2.566, + "step": 4740 + }, + { + "epoch": 2.38, + "learning_rate": 2.68749923920459e-05, + "loss": 2.7924, + "step": 4745 + }, + { + "epoch": 2.38, + "learning_rate": 2.683567350355859e-05, + "loss": 2.5467, + "step": 4750 + }, + { + "epoch": 2.39, + "learning_rate": 2.679635004927608e-05, + "loss": 2.8032, + "step": 4755 + }, + { + "epoch": 2.39, + "learning_rate": 2.6757022127006e-05, + "loss": 2.4976, + "step": 4760 + }, + { + "epoch": 2.39, + "learning_rate": 2.6717689834567055e-05, + "loss": 2.7968, + "step": 4765 + }, + { + "epoch": 2.39, + "learning_rate": 2.6678353269788854e-05, + "loss": 2.4099, + "step": 4770 + }, + { + "epoch": 2.4, + "learning_rate": 2.66390125305116e-05, + "loss": 2.4246, + "step": 4775 + }, + { + "epoch": 2.4, + "learning_rate": 
2.659966771458589e-05, + "loss": 2.7296, + "step": 4780 + }, + { + "epoch": 2.4, + "learning_rate": 2.656031891987249e-05, + "loss": 2.5013, + "step": 4785 + }, + { + "epoch": 2.4, + "learning_rate": 2.6520966244242024e-05, + "loss": 2.4029, + "step": 4790 + }, + { + "epoch": 2.41, + "learning_rate": 2.648160978557479e-05, + "loss": 2.405, + "step": 4795 + }, + { + "epoch": 2.41, + "learning_rate": 2.644224964176048e-05, + "loss": 2.5121, + "step": 4800 + }, + { + "epoch": 2.41, + "learning_rate": 2.6402885910697966e-05, + "loss": 2.7865, + "step": 4805 + }, + { + "epoch": 2.41, + "learning_rate": 2.6363518690295035e-05, + "loss": 2.3725, + "step": 4810 + }, + { + "epoch": 2.42, + "learning_rate": 2.632414807846816e-05, + "loss": 2.612, + "step": 4815 + }, + { + "epoch": 2.42, + "learning_rate": 2.6284774173142233e-05, + "loss": 2.5412, + "step": 4820 + }, + { + "epoch": 2.42, + "learning_rate": 2.624539707225036e-05, + "loss": 2.4828, + "step": 4825 + }, + { + "epoch": 2.42, + "learning_rate": 2.6206016873733574e-05, + "loss": 2.6912, + "step": 4830 + }, + { + "epoch": 2.43, + "learning_rate": 2.6166633675540635e-05, + "loss": 2.9305, + "step": 4835 + }, + { + "epoch": 2.43, + "learning_rate": 2.612724757562775e-05, + "loss": 2.4224, + "step": 4840 + }, + { + "epoch": 2.43, + "learning_rate": 2.608785867195834e-05, + "loss": 2.448, + "step": 4845 + }, + { + "epoch": 2.43, + "learning_rate": 2.60484670625028e-05, + "loss": 2.356, + "step": 4850 + }, + { + "epoch": 2.44, + "learning_rate": 2.600907284523827e-05, + "loss": 2.3987, + "step": 4855 + }, + { + "epoch": 2.44, + "learning_rate": 2.5969676118148358e-05, + "loss": 1.9985, + "step": 4860 + }, + { + "epoch": 2.44, + "learning_rate": 2.5930276979222927e-05, + "loss": 2.7017, + "step": 4865 + }, + { + "epoch": 2.44, + "learning_rate": 2.5890875526457836e-05, + "loss": 2.5223, + "step": 4870 + }, + { + "epoch": 2.45, + "learning_rate": 2.5851471857854697e-05, + "loss": 2.5558, + "step": 4875 + }, + { + "epoch": 
2.45, + "learning_rate": 2.5812066071420632e-05, + "loss": 2.3048, + "step": 4880 + }, + { + "epoch": 2.45, + "learning_rate": 2.5772658265168025e-05, + "loss": 2.6089, + "step": 4885 + }, + { + "epoch": 2.45, + "learning_rate": 2.5733248537114306e-05, + "loss": 2.6711, + "step": 4890 + }, + { + "epoch": 2.46, + "learning_rate": 2.569383698528166e-05, + "loss": 2.6202, + "step": 4895 + }, + { + "epoch": 2.46, + "learning_rate": 2.5654423707696833e-05, + "loss": 2.6619, + "step": 4900 + }, + { + "epoch": 2.46, + "learning_rate": 2.5615008802390834e-05, + "loss": 2.6442, + "step": 4905 + }, + { + "epoch": 2.46, + "learning_rate": 2.5575592367398733e-05, + "loss": 2.7031, + "step": 4910 + }, + { + "epoch": 2.47, + "learning_rate": 2.5536174500759417e-05, + "loss": 2.5043, + "step": 4915 + }, + { + "epoch": 2.47, + "learning_rate": 2.5496755300515323e-05, + "loss": 2.3442, + "step": 4920 + }, + { + "epoch": 2.47, + "learning_rate": 2.5457334864712206e-05, + "loss": 2.7005, + "step": 4925 + }, + { + "epoch": 2.47, + "learning_rate": 2.5417913291398892e-05, + "loss": 2.8544, + "step": 4930 + }, + { + "epoch": 2.48, + "learning_rate": 2.5378490678627043e-05, + "loss": 2.3004, + "step": 4935 + }, + { + "epoch": 2.48, + "learning_rate": 2.5339067124450887e-05, + "loss": 2.527, + "step": 4940 + }, + { + "epoch": 2.48, + "learning_rate": 2.5299642726927035e-05, + "loss": 2.689, + "step": 4945 + }, + { + "epoch": 2.48, + "learning_rate": 2.526021758411415e-05, + "loss": 2.7824, + "step": 4950 + }, + { + "epoch": 2.49, + "learning_rate": 2.5220791794072774e-05, + "loss": 2.5121, + "step": 4955 + }, + { + "epoch": 2.49, + "learning_rate": 2.518136545486504e-05, + "loss": 2.3836, + "step": 4960 + }, + { + "epoch": 2.49, + "learning_rate": 2.5141938664554482e-05, + "loss": 2.6548, + "step": 4965 + }, + { + "epoch": 2.49, + "learning_rate": 2.5102511521205718e-05, + "loss": 2.5556, + "step": 4970 + }, + { + "epoch": 2.5, + "learning_rate": 2.5063084122884267e-05, + "loss": 2.4746, 
+ "step": 4975 + }, + { + "epoch": 2.5, + "learning_rate": 2.5023656567656272e-05, + "loss": 2.5669, + "step": 4980 + }, + { + "epoch": 2.5, + "learning_rate": 2.498422895358827e-05, + "loss": 2.4125, + "step": 4985 + }, + { + "epoch": 2.5, + "learning_rate": 2.4944801378746935e-05, + "loss": 2.6531, + "step": 4990 + }, + { + "epoch": 2.51, + "learning_rate": 2.4905373941198864e-05, + "loss": 2.7697, + "step": 4995 + }, + { + "epoch": 2.51, + "learning_rate": 2.48659467390103e-05, + "loss": 2.5167, + "step": 5000 + }, + { + "epoch": 2.51, + "learning_rate": 2.4826519870246897e-05, + "loss": 2.6029, + "step": 5005 + }, + { + "epoch": 2.51, + "learning_rate": 2.478709343297349e-05, + "loss": 2.3878, + "step": 5010 + }, + { + "epoch": 2.52, + "learning_rate": 2.4747667525253825e-05, + "loss": 2.6694, + "step": 5015 + }, + { + "epoch": 2.52, + "learning_rate": 2.470824224515034e-05, + "loss": 2.606, + "step": 5020 + }, + { + "epoch": 2.52, + "learning_rate": 2.466881769072392e-05, + "loss": 2.6548, + "step": 5025 + }, + { + "epoch": 2.52, + "learning_rate": 2.4629393960033633e-05, + "loss": 2.2612, + "step": 5030 + }, + { + "epoch": 2.53, + "learning_rate": 2.4589971151136503e-05, + "loss": 2.6507, + "step": 5035 + }, + { + "epoch": 2.53, + "learning_rate": 2.4550549362087258e-05, + "loss": 2.4926, + "step": 5040 + }, + { + "epoch": 2.53, + "learning_rate": 2.45111286909381e-05, + "loss": 2.4079, + "step": 5045 + }, + { + "epoch": 2.53, + "learning_rate": 2.447170923573844e-05, + "loss": 2.5999, + "step": 5050 + }, + { + "epoch": 2.54, + "learning_rate": 2.443229109453467e-05, + "loss": 2.4915, + "step": 5055 + }, + { + "epoch": 2.54, + "learning_rate": 2.43928743653699e-05, + "loss": 2.4626, + "step": 5060 + }, + { + "epoch": 2.54, + "learning_rate": 2.4353459146283743e-05, + "loss": 2.405, + "step": 5065 + }, + { + "epoch": 2.54, + "learning_rate": 2.431404553531206e-05, + "loss": 2.857, + "step": 5070 + }, + { + "epoch": 2.55, + "learning_rate": 
2.42746336304867e-05, + "loss": 2.8089, + "step": 5075 + }, + { + "epoch": 2.55, + "learning_rate": 2.423522352983527e-05, + "loss": 2.5966, + "step": 5080 + }, + { + "epoch": 2.55, + "learning_rate": 2.41958153313809e-05, + "loss": 2.4951, + "step": 5085 + }, + { + "epoch": 2.55, + "learning_rate": 2.4156409133141967e-05, + "loss": 2.519, + "step": 5090 + }, + { + "epoch": 2.56, + "learning_rate": 2.4117005033131894e-05, + "loss": 2.4516, + "step": 5095 + }, + { + "epoch": 2.56, + "learning_rate": 2.40776031293589e-05, + "loss": 2.4225, + "step": 5100 + }, + { + "epoch": 2.56, + "learning_rate": 2.4038203519825676e-05, + "loss": 2.47, + "step": 5105 + }, + { + "epoch": 2.56, + "learning_rate": 2.399880630252928e-05, + "loss": 2.4996, + "step": 5110 + }, + { + "epoch": 2.57, + "learning_rate": 2.3959411575460777e-05, + "loss": 2.4737, + "step": 5115 + }, + { + "epoch": 2.57, + "learning_rate": 2.392001943660506e-05, + "loss": 2.636, + "step": 5120 + }, + { + "epoch": 2.57, + "learning_rate": 2.3880629983940572e-05, + "loss": 2.5773, + "step": 5125 + }, + { + "epoch": 2.57, + "learning_rate": 2.3841243315439077e-05, + "loss": 2.7459, + "step": 5130 + }, + { + "epoch": 2.58, + "learning_rate": 2.3801859529065416e-05, + "loss": 2.1073, + "step": 5135 + }, + { + "epoch": 2.58, + "learning_rate": 2.3762478722777264e-05, + "loss": 2.3601, + "step": 5140 + }, + { + "epoch": 2.58, + "learning_rate": 2.3723100994524873e-05, + "loss": 2.5435, + "step": 5145 + }, + { + "epoch": 2.58, + "learning_rate": 2.3683726442250853e-05, + "loss": 2.3054, + "step": 5150 + }, + { + "epoch": 2.59, + "learning_rate": 2.3644355163889908e-05, + "loss": 2.6588, + "step": 5155 + }, + { + "epoch": 2.59, + "learning_rate": 2.3604987257368596e-05, + "loss": 2.6469, + "step": 5160 + }, + { + "epoch": 2.59, + "learning_rate": 2.3565622820605096e-05, + "loss": 2.4276, + "step": 5165 + }, + { + "epoch": 2.59, + "learning_rate": 2.3526261951508947e-05, + "loss": 2.8147, + "step": 5170 + }, + { + 
"epoch": 2.6, + "learning_rate": 2.3486904747980817e-05, + "loss": 2.5942, + "step": 5175 + }, + { + "epoch": 2.6, + "learning_rate": 2.344755130791227e-05, + "loss": 2.7043, + "step": 5180 + }, + { + "epoch": 2.6, + "learning_rate": 2.340820172918549e-05, + "loss": 2.7551, + "step": 5185 + }, + { + "epoch": 2.6, + "learning_rate": 2.336885610967308e-05, + "loss": 2.4544, + "step": 5190 + }, + { + "epoch": 2.61, + "learning_rate": 2.3329514547237757e-05, + "loss": 2.5914, + "step": 5195 + }, + { + "epoch": 2.61, + "learning_rate": 2.3290177139732186e-05, + "loss": 2.5679, + "step": 5200 + }, + { + "epoch": 2.61, + "learning_rate": 2.325084398499868e-05, + "loss": 2.5207, + "step": 5205 + }, + { + "epoch": 2.61, + "learning_rate": 2.3211515180868972e-05, + "loss": 2.4572, + "step": 5210 + }, + { + "epoch": 2.62, + "learning_rate": 2.3172190825163987e-05, + "loss": 2.6281, + "step": 5215 + }, + { + "epoch": 2.62, + "learning_rate": 2.3132871015693566e-05, + "loss": 2.5488, + "step": 5220 + }, + { + "epoch": 2.62, + "learning_rate": 2.309355585025627e-05, + "loss": 2.74, + "step": 5225 + }, + { + "epoch": 2.62, + "learning_rate": 2.3054245426639078e-05, + "loss": 2.5737, + "step": 5230 + }, + { + "epoch": 2.63, + "learning_rate": 2.3014939842617197e-05, + "loss": 2.5987, + "step": 5235 + }, + { + "epoch": 2.63, + "learning_rate": 2.297563919595379e-05, + "loss": 2.5215, + "step": 5240 + }, + { + "epoch": 2.63, + "learning_rate": 2.293634358439973e-05, + "loss": 2.4514, + "step": 5245 + }, + { + "epoch": 2.63, + "learning_rate": 2.289705310569338e-05, + "loss": 2.8411, + "step": 5250 + }, + { + "epoch": 2.64, + "learning_rate": 2.2857767857560337e-05, + "loss": 2.9463, + "step": 5255 + }, + { + "epoch": 2.64, + "learning_rate": 2.281848793771318e-05, + "loss": 2.6361, + "step": 5260 + }, + { + "epoch": 2.64, + "learning_rate": 2.2779213443851233e-05, + "loss": 2.4919, + "step": 5265 + }, + { + "epoch": 2.64, + "learning_rate": 2.2739944473660332e-05, + "loss": 2.3893, 
+ "step": 5270 + }, + { + "epoch": 2.65, + "learning_rate": 2.2700681124812574e-05, + "loss": 2.6469, + "step": 5275 + }, + { + "epoch": 2.65, + "learning_rate": 2.2661423494966074e-05, + "loss": 2.7062, + "step": 5280 + }, + { + "epoch": 2.65, + "learning_rate": 2.2622171681764706e-05, + "loss": 2.6681, + "step": 5285 + }, + { + "epoch": 2.65, + "learning_rate": 2.2582925782837898e-05, + "loss": 2.5944, + "step": 5290 + }, + { + "epoch": 2.66, + "learning_rate": 2.254368589580036e-05, + "loss": 2.6699, + "step": 5295 + }, + { + "epoch": 2.66, + "learning_rate": 2.250445211825185e-05, + "loss": 2.7243, + "step": 5300 + }, + { + "epoch": 2.66, + "learning_rate": 2.2465224547776934e-05, + "loss": 2.5652, + "step": 5305 + }, + { + "epoch": 2.66, + "learning_rate": 2.2426003281944725e-05, + "loss": 2.4402, + "step": 5310 + }, + { + "epoch": 2.67, + "learning_rate": 2.238678841830867e-05, + "loss": 2.5059, + "step": 5315 + }, + { + "epoch": 2.67, + "learning_rate": 2.234758005440628e-05, + "loss": 2.5083, + "step": 5320 + }, + { + "epoch": 2.67, + "learning_rate": 2.2308378287758906e-05, + "loss": 2.3585, + "step": 5325 + }, + { + "epoch": 2.67, + "learning_rate": 2.22691832158715e-05, + "loss": 2.8297, + "step": 5330 + }, + { + "epoch": 2.68, + "learning_rate": 2.2229994936232346e-05, + "loss": 2.4266, + "step": 5335 + }, + { + "epoch": 2.68, + "learning_rate": 2.219081354631284e-05, + "loss": 2.3729, + "step": 5340 + }, + { + "epoch": 2.68, + "learning_rate": 2.2151639143567236e-05, + "loss": 2.4558, + "step": 5345 + }, + { + "epoch": 2.68, + "learning_rate": 2.211247182543242e-05, + "loss": 2.5797, + "step": 5350 + }, + { + "epoch": 2.69, + "learning_rate": 2.2073311689327648e-05, + "loss": 2.4579, + "step": 5355 + }, + { + "epoch": 2.69, + "learning_rate": 2.203415883265432e-05, + "loss": 2.5511, + "step": 5360 + }, + { + "epoch": 2.69, + "learning_rate": 2.1995013352795725e-05, + "loss": 2.4825, + "step": 5365 + }, + { + "epoch": 2.69, + "learning_rate": 
2.1955875347116808e-05, + "loss": 2.4933, + "step": 5370 + }, + { + "epoch": 2.7, + "learning_rate": 2.191674491296391e-05, + "loss": 2.434, + "step": 5375 + }, + { + "epoch": 2.7, + "learning_rate": 2.187762214766456e-05, + "loss": 2.5205, + "step": 5380 + }, + { + "epoch": 2.7, + "learning_rate": 2.1838507148527197e-05, + "loss": 2.5461, + "step": 5385 + }, + { + "epoch": 2.7, + "learning_rate": 2.179940001284095e-05, + "loss": 2.4265, + "step": 5390 + }, + { + "epoch": 2.71, + "learning_rate": 2.176030083787539e-05, + "loss": 2.6166, + "step": 5395 + }, + { + "epoch": 2.71, + "learning_rate": 2.1721209720880277e-05, + "loss": 2.5101, + "step": 5400 + }, + { + "epoch": 2.71, + "learning_rate": 2.168212675908536e-05, + "loss": 2.4824, + "step": 5405 + }, + { + "epoch": 2.71, + "learning_rate": 2.1643052049700066e-05, + "loss": 2.347, + "step": 5410 + }, + { + "epoch": 2.72, + "learning_rate": 2.1603985689913317e-05, + "loss": 2.5476, + "step": 5415 + }, + { + "epoch": 2.72, + "learning_rate": 2.1564927776893264e-05, + "loss": 2.456, + "step": 5420 + }, + { + "epoch": 2.72, + "learning_rate": 2.152587840778704e-05, + "loss": 2.4415, + "step": 5425 + }, + { + "epoch": 2.72, + "learning_rate": 2.1486837679720535e-05, + "loss": 2.7548, + "step": 5430 + }, + { + "epoch": 2.73, + "learning_rate": 2.144780568979816e-05, + "loss": 2.3779, + "step": 5435 + }, + { + "epoch": 2.73, + "learning_rate": 2.1408782535102566e-05, + "loss": 2.5284, + "step": 5440 + }, + { + "epoch": 2.73, + "learning_rate": 2.136976831269444e-05, + "loss": 2.6886, + "step": 5445 + }, + { + "epoch": 2.73, + "learning_rate": 2.133076311961226e-05, + "loss": 2.5138, + "step": 5450 + }, + { + "epoch": 2.74, + "learning_rate": 2.1291767052872035e-05, + "loss": 2.664, + "step": 5455 + }, + { + "epoch": 2.74, + "learning_rate": 2.1252780209467073e-05, + "loss": 2.4325, + "step": 5460 + }, + { + "epoch": 2.74, + "learning_rate": 2.121380268636775e-05, + "loss": 2.5471, + "step": 5465 + }, + { + "epoch": 
2.74, + "learning_rate": 2.1174834580521253e-05, + "loss": 2.4363, + "step": 5470 + }, + { + "epoch": 2.75, + "learning_rate": 2.113587598885135e-05, + "loss": 2.6272, + "step": 5475 + }, + { + "epoch": 2.75, + "learning_rate": 2.109692700825814e-05, + "loss": 2.5261, + "step": 5480 + }, + { + "epoch": 2.75, + "learning_rate": 2.105798773561783e-05, + "loss": 2.9262, + "step": 5485 + }, + { + "epoch": 2.75, + "learning_rate": 2.101905826778246e-05, + "loss": 2.3964, + "step": 5490 + }, + { + "epoch": 2.76, + "learning_rate": 2.0980138701579704e-05, + "loss": 2.6239, + "step": 5495 + }, + { + "epoch": 2.76, + "learning_rate": 2.0941229133812593e-05, + "loss": 2.53, + "step": 5500 + }, + { + "epoch": 2.76, + "learning_rate": 2.0902329661259293e-05, + "loss": 2.7035, + "step": 5505 + }, + { + "epoch": 2.77, + "learning_rate": 2.0863440380672856e-05, + "loss": 2.5909, + "step": 5510 + }, + { + "epoch": 2.77, + "learning_rate": 2.0824561388781005e-05, + "loss": 2.1592, + "step": 5515 + }, + { + "epoch": 2.77, + "learning_rate": 2.078569278228585e-05, + "loss": 2.5515, + "step": 5520 + }, + { + "epoch": 2.77, + "learning_rate": 2.0746834657863672e-05, + "loss": 2.6217, + "step": 5525 + }, + { + "epoch": 2.78, + "learning_rate": 2.0707987112164692e-05, + "loss": 2.6302, + "step": 5530 + }, + { + "epoch": 2.78, + "learning_rate": 2.0669150241812807e-05, + "loss": 2.3984, + "step": 5535 + }, + { + "epoch": 2.78, + "learning_rate": 2.0630324143405372e-05, + "loss": 2.6425, + "step": 5540 + }, + { + "epoch": 2.78, + "learning_rate": 2.0591508913512954e-05, + "loss": 2.6817, + "step": 5545 + }, + { + "epoch": 2.79, + "learning_rate": 2.055270464867906e-05, + "loss": 2.3904, + "step": 5550 + }, + { + "epoch": 2.79, + "learning_rate": 2.0513911445419936e-05, + "loss": 2.6625, + "step": 5555 + }, + { + "epoch": 2.79, + "learning_rate": 2.0475129400224337e-05, + "loss": 2.6876, + "step": 5560 + }, + { + "epoch": 2.79, + "learning_rate": 2.043635860955325e-05, + "loss": 2.4981, + 
"step": 5565 + }, + { + "epoch": 2.8, + "learning_rate": 2.039759916983966e-05, + "loss": 2.3809, + "step": 5570 + }, + { + "epoch": 2.8, + "learning_rate": 2.0358851177488326e-05, + "loss": 2.4396, + "step": 5575 + }, + { + "epoch": 2.8, + "learning_rate": 2.0320114728875538e-05, + "loss": 2.526, + "step": 5580 + }, + { + "epoch": 2.8, + "learning_rate": 2.028138992034887e-05, + "loss": 2.6806, + "step": 5585 + }, + { + "epoch": 2.81, + "learning_rate": 2.0242676848226948e-05, + "loss": 2.5842, + "step": 5590 + }, + { + "epoch": 2.81, + "learning_rate": 2.02039756087992e-05, + "loss": 2.4016, + "step": 5595 + }, + { + "epoch": 2.81, + "learning_rate": 2.0165286298325638e-05, + "loss": 2.5254, + "step": 5600 + }, + { + "epoch": 2.81, + "learning_rate": 2.0126609013036575e-05, + "loss": 2.4995, + "step": 5605 + }, + { + "epoch": 2.82, + "learning_rate": 2.0087943849132446e-05, + "loss": 2.473, + "step": 5610 + }, + { + "epoch": 2.82, + "learning_rate": 2.004929090278351e-05, + "loss": 2.668, + "step": 5615 + }, + { + "epoch": 2.82, + "learning_rate": 2.001065027012966e-05, + "loss": 2.693, + "step": 5620 + }, + { + "epoch": 2.82, + "learning_rate": 1.9972022047280154e-05, + "loss": 2.5113, + "step": 5625 + }, + { + "epoch": 2.83, + "learning_rate": 1.9933406330313374e-05, + "loss": 2.7322, + "step": 5630 + }, + { + "epoch": 2.83, + "learning_rate": 1.989480321527661e-05, + "loss": 2.6501, + "step": 5635 + }, + { + "epoch": 2.83, + "learning_rate": 1.9856212798185798e-05, + "loss": 2.546, + "step": 5640 + }, + { + "epoch": 2.83, + "learning_rate": 1.9817635175025295e-05, + "loss": 2.8634, + "step": 5645 + }, + { + "epoch": 2.84, + "learning_rate": 1.9779070441747638e-05, + "loss": 2.2475, + "step": 5650 + }, + { + "epoch": 2.84, + "learning_rate": 1.97405186942733e-05, + "loss": 2.5021, + "step": 5655 + }, + { + "epoch": 2.84, + "learning_rate": 1.9701980028490452e-05, + "loss": 2.4607, + "step": 5660 + }, + { + "epoch": 2.84, + "learning_rate": 
1.9663454540254744e-05, + "loss": 2.4587, + "step": 5665 + }, + { + "epoch": 2.85, + "learning_rate": 1.9624942325389032e-05, + "loss": 2.6975, + "step": 5670 + }, + { + "epoch": 2.85, + "learning_rate": 1.9586443479683164e-05, + "loss": 2.728, + "step": 5675 + }, + { + "epoch": 2.85, + "learning_rate": 1.9547958098893734e-05, + "loss": 2.6458, + "step": 5680 + }, + { + "epoch": 2.85, + "learning_rate": 1.9509486278743847e-05, + "loss": 2.7608, + "step": 5685 + }, + { + "epoch": 2.86, + "learning_rate": 1.9471028114922873e-05, + "loss": 2.7753, + "step": 5690 + }, + { + "epoch": 2.86, + "learning_rate": 1.9432583703086235e-05, + "loss": 2.3438, + "step": 5695 + }, + { + "epoch": 2.86, + "learning_rate": 1.9394153138855127e-05, + "loss": 2.5513, + "step": 5700 + }, + { + "epoch": 2.86, + "learning_rate": 1.9355736517816313e-05, + "loss": 2.6064, + "step": 5705 + }, + { + "epoch": 2.87, + "learning_rate": 1.9317333935521872e-05, + "loss": 2.6884, + "step": 5710 + }, + { + "epoch": 2.87, + "learning_rate": 1.927894548748897e-05, + "loss": 2.479, + "step": 5715 + }, + { + "epoch": 2.87, + "learning_rate": 1.9240571269199607e-05, + "loss": 2.741, + "step": 5720 + }, + { + "epoch": 2.87, + "learning_rate": 1.9202211376100427e-05, + "loss": 2.6736, + "step": 5725 + }, + { + "epoch": 2.88, + "learning_rate": 1.9163865903602374e-05, + "loss": 2.5357, + "step": 5730 + }, + { + "epoch": 2.88, + "learning_rate": 1.9125534947080574e-05, + "loss": 2.4667, + "step": 5735 + }, + { + "epoch": 2.88, + "learning_rate": 1.908721860187406e-05, + "loss": 2.3793, + "step": 5740 + }, + { + "epoch": 2.88, + "learning_rate": 1.904891696328548e-05, + "loss": 2.6379, + "step": 5745 + }, + { + "epoch": 2.89, + "learning_rate": 1.901063012658093e-05, + "loss": 2.5932, + "step": 5750 + }, + { + "epoch": 2.89, + "learning_rate": 1.897235818698969e-05, + "loss": 2.3479, + "step": 5755 + }, + { + "epoch": 2.89, + "learning_rate": 1.8934101239703973e-05, + "loss": 2.7664, + "step": 5760 + }, + { + 
"epoch": 2.89, + "learning_rate": 1.889585937987871e-05, + "loss": 2.6163, + "step": 5765 + }, + { + "epoch": 2.9, + "learning_rate": 1.885763270263131e-05, + "loss": 2.644, + "step": 5770 + }, + { + "epoch": 2.9, + "learning_rate": 1.881942130304142e-05, + "loss": 2.6584, + "step": 5775 + }, + { + "epoch": 2.9, + "learning_rate": 1.8781225276150675e-05, + "loss": 2.7552, + "step": 5780 + }, + { + "epoch": 2.9, + "learning_rate": 1.874304471696248e-05, + "loss": 2.3176, + "step": 5785 + }, + { + "epoch": 2.91, + "learning_rate": 1.8704879720441773e-05, + "loss": 2.9294, + "step": 5790 + }, + { + "epoch": 2.91, + "learning_rate": 1.8666730381514774e-05, + "loss": 2.5388, + "step": 5795 + }, + { + "epoch": 2.91, + "learning_rate": 1.8628596795068776e-05, + "loss": 2.5343, + "step": 5800 + }, + { + "epoch": 2.91, + "learning_rate": 1.859047905595187e-05, + "loss": 2.5239, + "step": 5805 + }, + { + "epoch": 2.92, + "learning_rate": 1.8552377258972747e-05, + "loss": 2.382, + "step": 5810 + }, + { + "epoch": 2.92, + "learning_rate": 1.851429149890044e-05, + "loss": 2.7965, + "step": 5815 + }, + { + "epoch": 2.92, + "learning_rate": 1.8476221870464083e-05, + "loss": 2.738, + "step": 5820 + }, + { + "epoch": 2.92, + "learning_rate": 1.84381684683527e-05, + "loss": 2.7431, + "step": 5825 + }, + { + "epoch": 2.93, + "learning_rate": 1.8400131387214964e-05, + "loss": 2.6551, + "step": 5830 + }, + { + "epoch": 2.93, + "learning_rate": 1.8362110721658927e-05, + "loss": 2.5836, + "step": 5835 + }, + { + "epoch": 2.93, + "learning_rate": 1.832410656625183e-05, + "loss": 2.3278, + "step": 5840 + }, + { + "epoch": 2.93, + "learning_rate": 1.8286119015519852e-05, + "loss": 2.5348, + "step": 5845 + }, + { + "epoch": 2.94, + "learning_rate": 1.8248148163947866e-05, + "loss": 2.388, + "step": 5850 + }, + { + "epoch": 2.94, + "learning_rate": 1.8210194105979205e-05, + "loss": 2.5839, + "step": 5855 + }, + { + "epoch": 2.94, + "learning_rate": 1.817225693601543e-05, + "loss": 2.7362, + 
"step": 5860 + }, + { + "epoch": 2.94, + "learning_rate": 1.8134336748416115e-05, + "loss": 2.2506, + "step": 5865 + }, + { + "epoch": 2.95, + "learning_rate": 1.8096433637498574e-05, + "loss": 2.6163, + "step": 5870 + }, + { + "epoch": 2.95, + "learning_rate": 1.8058547697537655e-05, + "loss": 2.6588, + "step": 5875 + }, + { + "epoch": 2.95, + "learning_rate": 1.802067902276551e-05, + "loss": 2.6955, + "step": 5880 + }, + { + "epoch": 2.95, + "learning_rate": 1.7982827707371326e-05, + "loss": 2.5438, + "step": 5885 + }, + { + "epoch": 2.96, + "learning_rate": 1.7944993845501118e-05, + "loss": 2.6858, + "step": 5890 + }, + { + "epoch": 2.96, + "learning_rate": 1.7907177531257507e-05, + "loss": 2.5499, + "step": 5895 + }, + { + "epoch": 2.96, + "learning_rate": 1.7869378858699452e-05, + "loss": 2.572, + "step": 5900 + }, + { + "epoch": 2.96, + "learning_rate": 1.783159792184203e-05, + "loss": 2.6499, + "step": 5905 + }, + { + "epoch": 2.97, + "learning_rate": 1.779383481465622e-05, + "loss": 2.6526, + "step": 5910 + }, + { + "epoch": 2.97, + "learning_rate": 1.775608963106863e-05, + "loss": 2.5916, + "step": 5915 + }, + { + "epoch": 2.97, + "learning_rate": 1.7718362464961314e-05, + "loss": 2.7463, + "step": 5920 + }, + { + "epoch": 2.97, + "learning_rate": 1.76806534101715e-05, + "loss": 2.6052, + "step": 5925 + }, + { + "epoch": 2.98, + "learning_rate": 1.764296256049137e-05, + "loss": 2.4542, + "step": 5930 + }, + { + "epoch": 2.98, + "learning_rate": 1.760529000966782e-05, + "loss": 2.5414, + "step": 5935 + }, + { + "epoch": 2.98, + "learning_rate": 1.7567635851402238e-05, + "loss": 2.6181, + "step": 5940 + }, + { + "epoch": 2.98, + "learning_rate": 1.753000017935026e-05, + "loss": 2.4822, + "step": 5945 + }, + { + "epoch": 2.99, + "learning_rate": 1.7492383087121546e-05, + "loss": 2.6495, + "step": 5950 + }, + { + "epoch": 2.99, + "learning_rate": 1.7454784668279546e-05, + "loss": 2.455, + "step": 5955 + }, + { + "epoch": 2.99, + "learning_rate": 
1.7417205016341258e-05, + "loss": 2.3959, + "step": 5960 + }, + { + "epoch": 2.99, + "learning_rate": 1.7379644224777004e-05, + "loss": 2.5139, + "step": 5965 + }, + { + "epoch": 3.0, + "learning_rate": 1.7342102387010194e-05, + "loss": 2.6985, + "step": 5970 + }, + { + "epoch": 3.0, + "learning_rate": 1.7304579596417104e-05, + "loss": 2.6598, + "step": 5975 + }, + { + "epoch": 3.0, + "learning_rate": 1.726707594632661e-05, + "loss": 2.6638, + "step": 5980 + }, + { + "epoch": 3.0, + "learning_rate": 1.7229591530020022e-05, + "loss": 2.446, + "step": 5985 + }, + { + "epoch": 3.01, + "learning_rate": 1.7192126440730784e-05, + "loss": 2.5736, + "step": 5990 + }, + { + "epoch": 3.01, + "learning_rate": 1.7154680771644242e-05, + "loss": 2.4385, + "step": 5995 + }, + { + "epoch": 3.01, + "learning_rate": 1.7117254615897497e-05, + "loss": 2.5651, + "step": 6000 + }, + { + "epoch": 3.01, + "learning_rate": 1.707984806657908e-05, + "loss": 2.5767, + "step": 6005 + }, + { + "epoch": 3.02, + "learning_rate": 1.7042461216728756e-05, + "loss": 2.5527, + "step": 6010 + }, + { + "epoch": 3.02, + "learning_rate": 1.7005094159337307e-05, + "loss": 2.4275, + "step": 6015 + }, + { + "epoch": 3.02, + "learning_rate": 1.6967746987346272e-05, + "loss": 2.5136, + "step": 6020 + }, + { + "epoch": 3.02, + "learning_rate": 1.6930419793647735e-05, + "loss": 2.5035, + "step": 6025 + }, + { + "epoch": 3.03, + "learning_rate": 1.6893112671084094e-05, + "loss": 2.3627, + "step": 6030 + }, + { + "epoch": 3.03, + "learning_rate": 1.6855825712447822e-05, + "loss": 2.5038, + "step": 6035 + }, + { + "epoch": 3.03, + "learning_rate": 1.6818559010481226e-05, + "loss": 2.3229, + "step": 6040 + }, + { + "epoch": 3.03, + "learning_rate": 1.6781312657876254e-05, + "loss": 2.3298, + "step": 6045 + }, + { + "epoch": 3.04, + "learning_rate": 1.6744086747274224e-05, + "loss": 2.5965, + "step": 6050 + }, + { + "epoch": 3.04, + "learning_rate": 1.67068813712656e-05, + "loss": 2.6745, + "step": 6055 + }, + { + 
"epoch": 3.04, + "learning_rate": 1.6669696622389797e-05, + "loss": 2.5358, + "step": 6060 + }, + { + "epoch": 3.04, + "learning_rate": 1.6632532593134907e-05, + "loss": 2.1572, + "step": 6065 + }, + { + "epoch": 3.05, + "learning_rate": 1.6595389375937488e-05, + "loss": 2.3413, + "step": 6070 + }, + { + "epoch": 3.05, + "learning_rate": 1.6558267063182342e-05, + "loss": 2.4968, + "step": 6075 + }, + { + "epoch": 3.05, + "learning_rate": 1.6521165747202276e-05, + "loss": 2.528, + "step": 6080 + }, + { + "epoch": 3.05, + "learning_rate": 1.6484085520277847e-05, + "loss": 2.4744, + "step": 6085 + }, + { + "epoch": 3.06, + "learning_rate": 1.6447026474637194e-05, + "loss": 2.5993, + "step": 6090 + }, + { + "epoch": 3.06, + "learning_rate": 1.640998870245575e-05, + "loss": 2.5334, + "step": 6095 + }, + { + "epoch": 3.06, + "learning_rate": 1.637297229585604e-05, + "loss": 2.3431, + "step": 6100 + }, + { + "epoch": 3.06, + "learning_rate": 1.633597734690746e-05, + "loss": 2.3361, + "step": 6105 + }, + { + "epoch": 3.07, + "learning_rate": 1.6299003947626017e-05, + "loss": 2.4223, + "step": 6110 + }, + { + "epoch": 3.07, + "learning_rate": 1.6262052189974125e-05, + "loss": 2.6129, + "step": 6115 + }, + { + "epoch": 3.07, + "learning_rate": 1.622512216586038e-05, + "loss": 2.5798, + "step": 6120 + }, + { + "epoch": 3.07, + "learning_rate": 1.61882139671393e-05, + "loss": 2.7164, + "step": 6125 + }, + { + "epoch": 3.08, + "learning_rate": 1.6151327685611127e-05, + "loss": 2.627, + "step": 6130 + }, + { + "epoch": 3.08, + "learning_rate": 1.6114463413021612e-05, + "loss": 2.5533, + "step": 6135 + }, + { + "epoch": 3.08, + "learning_rate": 1.6077621241061725e-05, + "loss": 2.4149, + "step": 6140 + }, + { + "epoch": 3.08, + "learning_rate": 1.6040801261367493e-05, + "loss": 2.5167, + "step": 6145 + }, + { + "epoch": 3.09, + "learning_rate": 1.6004003565519734e-05, + "loss": 2.3775, + "step": 6150 + }, + { + "epoch": 3.09, + "learning_rate": 1.596722824504385e-05, + "loss": 
2.7508, + "step": 6155 + }, + { + "epoch": 3.09, + "learning_rate": 1.5930475391409562e-05, + "loss": 2.2924, + "step": 6160 + }, + { + "epoch": 3.09, + "learning_rate": 1.5893745096030754e-05, + "loss": 2.6994, + "step": 6165 + }, + { + "epoch": 3.1, + "learning_rate": 1.5857037450265176e-05, + "loss": 2.4325, + "step": 6170 + }, + { + "epoch": 3.1, + "learning_rate": 1.5820352545414232e-05, + "loss": 2.7048, + "step": 6175 + }, + { + "epoch": 3.1, + "learning_rate": 1.5783690472722785e-05, + "loss": 2.5557, + "step": 6180 + }, + { + "epoch": 3.1, + "learning_rate": 1.5747051323378903e-05, + "loss": 2.5968, + "step": 6185 + }, + { + "epoch": 3.11, + "learning_rate": 1.5710435188513627e-05, + "loss": 2.8964, + "step": 6190 + }, + { + "epoch": 3.11, + "learning_rate": 1.5673842159200768e-05, + "loss": 2.4664, + "step": 6195 + }, + { + "epoch": 3.11, + "learning_rate": 1.5637272326456666e-05, + "loss": 2.6002, + "step": 6200 + }, + { + "epoch": 3.11, + "learning_rate": 1.560072578123995e-05, + "loss": 2.1335, + "step": 6205 + }, + { + "epoch": 3.12, + "learning_rate": 1.5564202614451352e-05, + "loss": 2.4466, + "step": 6210 + }, + { + "epoch": 3.12, + "learning_rate": 1.5527702916933436e-05, + "loss": 2.4236, + "step": 6215 + }, + { + "epoch": 3.12, + "learning_rate": 1.54912267794704e-05, + "loss": 2.6926, + "step": 6220 + }, + { + "epoch": 3.12, + "learning_rate": 1.5454774292787837e-05, + "loss": 2.6268, + "step": 6225 + }, + { + "epoch": 3.13, + "learning_rate": 1.541834554755252e-05, + "loss": 2.5849, + "step": 6230 + }, + { + "epoch": 3.13, + "learning_rate": 1.5381940634372165e-05, + "loss": 2.4988, + "step": 6235 + }, + { + "epoch": 3.13, + "learning_rate": 1.534555964379522e-05, + "loss": 2.8093, + "step": 6240 + }, + { + "epoch": 3.13, + "learning_rate": 1.5309202666310622e-05, + "loss": 2.6214, + "step": 6245 + }, + { + "epoch": 3.14, + "learning_rate": 1.5272869792347595e-05, + "loss": 2.4958, + "step": 6250 + }, + { + "epoch": 3.14, + "learning_rate": 
1.5236561112275394e-05, + "loss": 2.5731, + "step": 6255 + }, + { + "epoch": 3.14, + "learning_rate": 1.5200276716403103e-05, + "loss": 2.4501, + "step": 6260 + }, + { + "epoch": 3.14, + "learning_rate": 1.5164016694979411e-05, + "loss": 2.3793, + "step": 6265 + }, + { + "epoch": 3.15, + "learning_rate": 1.5127781138192374e-05, + "loss": 2.4751, + "step": 6270 + }, + { + "epoch": 3.15, + "learning_rate": 1.5091570136169206e-05, + "loss": 2.2213, + "step": 6275 + }, + { + "epoch": 3.15, + "learning_rate": 1.505538377897604e-05, + "loss": 2.4721, + "step": 6280 + }, + { + "epoch": 3.15, + "learning_rate": 1.5019222156617712e-05, + "loss": 2.5355, + "step": 6285 + }, + { + "epoch": 3.16, + "learning_rate": 1.4983085359037547e-05, + "loss": 2.6066, + "step": 6290 + }, + { + "epoch": 3.16, + "learning_rate": 1.4946973476117105e-05, + "loss": 2.5482, + "step": 6295 + }, + { + "epoch": 3.16, + "learning_rate": 1.4910886597675994e-05, + "loss": 2.6717, + "step": 6300 + }, + { + "epoch": 3.16, + "learning_rate": 1.4874824813471616e-05, + "loss": 2.5616, + "step": 6305 + }, + { + "epoch": 3.17, + "learning_rate": 1.4838788213198965e-05, + "loss": 2.5877, + "step": 6310 + }, + { + "epoch": 3.17, + "learning_rate": 1.48027768864904e-05, + "loss": 2.4685, + "step": 6315 + }, + { + "epoch": 3.17, + "learning_rate": 1.4766790922915405e-05, + "loss": 2.459, + "step": 6320 + }, + { + "epoch": 3.17, + "learning_rate": 1.4730830411980393e-05, + "loss": 2.4626, + "step": 6325 + }, + { + "epoch": 3.18, + "learning_rate": 1.469489544312846e-05, + "loss": 2.4486, + "step": 6330 + }, + { + "epoch": 3.18, + "learning_rate": 1.4658986105739175e-05, + "loss": 2.6828, + "step": 6335 + }, + { + "epoch": 3.18, + "learning_rate": 1.4623102489128353e-05, + "loss": 2.702, + "step": 6340 + }, + { + "epoch": 3.18, + "learning_rate": 1.4587244682547857e-05, + "loss": 2.8563, + "step": 6345 + }, + { + "epoch": 3.19, + "learning_rate": 1.4551412775185308e-05, + "loss": 2.5647, + "step": 6350 + }, + { + 
"epoch": 3.19, + "learning_rate": 1.4515606856163949e-05, + "loss": 2.5023, + "step": 6355 + }, + { + "epoch": 3.19, + "learning_rate": 1.4479827014542363e-05, + "loss": 2.347, + "step": 6360 + }, + { + "epoch": 3.19, + "learning_rate": 1.4444073339314284e-05, + "loss": 2.6892, + "step": 6365 + }, + { + "epoch": 3.2, + "learning_rate": 1.4408345919408359e-05, + "loss": 2.5874, + "step": 6370 + }, + { + "epoch": 3.2, + "learning_rate": 1.4372644843687922e-05, + "loss": 2.3453, + "step": 6375 + }, + { + "epoch": 3.2, + "learning_rate": 1.4336970200950794e-05, + "loss": 2.4236, + "step": 6380 + }, + { + "epoch": 3.2, + "learning_rate": 1.4301322079929053e-05, + "loss": 2.6329, + "step": 6385 + }, + { + "epoch": 3.21, + "learning_rate": 1.4265700569288792e-05, + "loss": 2.7761, + "step": 6390 + }, + { + "epoch": 3.21, + "learning_rate": 1.4230105757629936e-05, + "loss": 2.6791, + "step": 6395 + }, + { + "epoch": 3.21, + "learning_rate": 1.4194537733485994e-05, + "loss": 2.6064, + "step": 6400 + }, + { + "epoch": 3.21, + "learning_rate": 1.4158996585323841e-05, + "loss": 2.3809, + "step": 6405 + }, + { + "epoch": 3.22, + "learning_rate": 1.4123482401543531e-05, + "loss": 2.5205, + "step": 6410 + }, + { + "epoch": 3.22, + "learning_rate": 1.4087995270478021e-05, + "loss": 2.524, + "step": 6415 + }, + { + "epoch": 3.22, + "learning_rate": 1.4052535280392999e-05, + "loss": 2.2721, + "step": 6420 + }, + { + "epoch": 3.22, + "learning_rate": 1.401710251948663e-05, + "loss": 2.5879, + "step": 6425 + }, + { + "epoch": 3.23, + "learning_rate": 1.3981697075889372e-05, + "loss": 2.6147, + "step": 6430 + }, + { + "epoch": 3.23, + "learning_rate": 1.394631903766373e-05, + "loss": 2.5308, + "step": 6435 + }, + { + "epoch": 3.23, + "learning_rate": 1.3910968492804028e-05, + "loss": 2.4739, + "step": 6440 + }, + { + "epoch": 3.23, + "learning_rate": 1.3875645529236234e-05, + "loss": 2.4483, + "step": 6445 + }, + { + "epoch": 3.24, + "learning_rate": 1.3840350234817686e-05, + "loss": 
2.6367, + "step": 6450 + }, + { + "epoch": 3.24, + "learning_rate": 1.3805082697336943e-05, + "loss": 2.6567, + "step": 6455 + }, + { + "epoch": 3.24, + "learning_rate": 1.3769843004513489e-05, + "loss": 2.52, + "step": 6460 + }, + { + "epoch": 3.24, + "learning_rate": 1.3734631243997561e-05, + "loss": 2.6544, + "step": 6465 + }, + { + "epoch": 3.25, + "learning_rate": 1.3699447503369925e-05, + "loss": 2.3696, + "step": 6470 + }, + { + "epoch": 3.25, + "learning_rate": 1.3664291870141649e-05, + "loss": 2.4517, + "step": 6475 + }, + { + "epoch": 3.25, + "learning_rate": 1.3629164431753894e-05, + "loss": 2.6313, + "step": 6480 + }, + { + "epoch": 3.25, + "learning_rate": 1.3594065275577692e-05, + "loss": 2.4032, + "step": 6485 + }, + { + "epoch": 3.26, + "learning_rate": 1.3558994488913731e-05, + "loss": 2.7063, + "step": 6490 + }, + { + "epoch": 3.26, + "learning_rate": 1.3523952158992136e-05, + "loss": 2.6109, + "step": 6495 + }, + { + "epoch": 3.26, + "learning_rate": 1.3488938372972257e-05, + "loss": 2.633, + "step": 6500 + }, + { + "epoch": 3.26, + "learning_rate": 1.3453953217942436e-05, + "loss": 2.5565, + "step": 6505 + }, + { + "epoch": 3.27, + "learning_rate": 1.3418996780919804e-05, + "loss": 2.5866, + "step": 6510 + }, + { + "epoch": 3.27, + "learning_rate": 1.3384069148850087e-05, + "loss": 2.5992, + "step": 6515 + }, + { + "epoch": 3.27, + "learning_rate": 1.3349170408607342e-05, + "loss": 2.4388, + "step": 6520 + }, + { + "epoch": 3.27, + "learning_rate": 1.3314300646993771e-05, + "loss": 2.2734, + "step": 6525 + }, + { + "epoch": 3.28, + "learning_rate": 1.3279459950739489e-05, + "loss": 2.7683, + "step": 6530 + }, + { + "epoch": 3.28, + "learning_rate": 1.3244648406502331e-05, + "loss": 2.3653, + "step": 6535 + }, + { + "epoch": 3.28, + "learning_rate": 1.3209866100867613e-05, + "loss": 2.6401, + "step": 6540 + }, + { + "epoch": 3.28, + "learning_rate": 1.3175113120347943e-05, + "loss": 2.5218, + "step": 6545 + }, + { + "epoch": 3.29, + 
"learning_rate": 1.3140389551382975e-05, + "loss": 2.4681, + "step": 6550 + }, + { + "epoch": 3.29, + "learning_rate": 1.3105695480339206e-05, + "loss": 2.4681, + "step": 6555 + }, + { + "epoch": 3.29, + "learning_rate": 1.3071030993509788e-05, + "loss": 2.5743, + "step": 6560 + }, + { + "epoch": 3.29, + "learning_rate": 1.303639617711427e-05, + "loss": 2.5423, + "step": 6565 + }, + { + "epoch": 3.3, + "learning_rate": 1.3001791117298395e-05, + "loss": 2.4267, + "step": 6570 + }, + { + "epoch": 3.3, + "learning_rate": 1.2967215900133911e-05, + "loss": 2.5537, + "step": 6575 + }, + { + "epoch": 3.3, + "learning_rate": 1.2932670611618336e-05, + "loss": 2.5451, + "step": 6580 + }, + { + "epoch": 3.3, + "learning_rate": 1.2898155337674744e-05, + "loss": 2.4048, + "step": 6585 + }, + { + "epoch": 3.31, + "learning_rate": 1.2863670164151551e-05, + "loss": 2.6769, + "step": 6590 + }, + { + "epoch": 3.31, + "learning_rate": 1.2829215176822316e-05, + "loss": 2.2118, + "step": 6595 + }, + { + "epoch": 3.31, + "learning_rate": 1.2794790461385508e-05, + "loss": 2.2912, + "step": 6600 + }, + { + "epoch": 3.31, + "learning_rate": 1.2760396103464309e-05, + "loss": 2.3978, + "step": 6605 + }, + { + "epoch": 3.32, + "learning_rate": 1.2726032188606388e-05, + "loss": 2.4801, + "step": 6610 + }, + { + "epoch": 3.32, + "learning_rate": 1.2691698802283697e-05, + "loss": 2.5522, + "step": 6615 + }, + { + "epoch": 3.32, + "learning_rate": 1.2657396029892258e-05, + "loss": 2.6728, + "step": 6620 + }, + { + "epoch": 3.32, + "learning_rate": 1.2623123956751943e-05, + "loss": 2.2937, + "step": 6625 + }, + { + "epoch": 3.33, + "learning_rate": 1.258888266810627e-05, + "loss": 2.5459, + "step": 6630 + }, + { + "epoch": 3.33, + "learning_rate": 1.2554672249122187e-05, + "loss": 2.6329, + "step": 6635 + }, + { + "epoch": 3.33, + "learning_rate": 1.2520492784889865e-05, + "loss": 2.4845, + "step": 6640 + }, + { + "epoch": 3.33, + "learning_rate": 1.2486344360422475e-05, + "loss": 2.5023, + 
"step": 6645 + }, + { + "epoch": 3.34, + "learning_rate": 1.2452227060655993e-05, + "loss": 2.5674, + "step": 6650 + }, + { + "epoch": 3.34, + "learning_rate": 1.2418140970448975e-05, + "loss": 2.4996, + "step": 6655 + }, + { + "epoch": 3.34, + "learning_rate": 1.2384086174582336e-05, + "loss": 2.4704, + "step": 6660 + }, + { + "epoch": 3.34, + "learning_rate": 1.2350062757759193e-05, + "loss": 2.4928, + "step": 6665 + }, + { + "epoch": 3.35, + "learning_rate": 1.2316070804604576e-05, + "loss": 2.5498, + "step": 6670 + }, + { + "epoch": 3.35, + "learning_rate": 1.228211039966528e-05, + "loss": 2.5641, + "step": 6675 + }, + { + "epoch": 3.35, + "learning_rate": 1.2248181627409619e-05, + "loss": 2.5725, + "step": 6680 + }, + { + "epoch": 3.35, + "learning_rate": 1.221428457222723e-05, + "loss": 2.5827, + "step": 6685 + }, + { + "epoch": 3.36, + "learning_rate": 1.2180419318428868e-05, + "loss": 2.3591, + "step": 6690 + }, + { + "epoch": 3.36, + "learning_rate": 1.2146585950246186e-05, + "loss": 2.5772, + "step": 6695 + }, + { + "epoch": 3.36, + "learning_rate": 1.2112784551831533e-05, + "loss": 2.4008, + "step": 6700 + }, + { + "epoch": 3.36, + "learning_rate": 1.2079015207257724e-05, + "loss": 2.3334, + "step": 6705 + }, + { + "epoch": 3.37, + "learning_rate": 1.2045278000517857e-05, + "loss": 2.8023, + "step": 6710 + }, + { + "epoch": 3.37, + "learning_rate": 1.2011573015525118e-05, + "loss": 2.6145, + "step": 6715 + }, + { + "epoch": 3.37, + "learning_rate": 1.1977900336112519e-05, + "loss": 2.6568, + "step": 6720 + }, + { + "epoch": 3.37, + "learning_rate": 1.1944260046032735e-05, + "loss": 2.1771, + "step": 6725 + }, + { + "epoch": 3.38, + "learning_rate": 1.1910652228957872e-05, + "loss": 2.4932, + "step": 6730 + }, + { + "epoch": 3.38, + "learning_rate": 1.187707696847927e-05, + "loss": 2.3883, + "step": 6735 + }, + { + "epoch": 3.38, + "learning_rate": 1.1843534348107294e-05, + "loss": 2.7792, + "step": 6740 + }, + { + "epoch": 3.38, + "learning_rate": 
1.1810024451271125e-05, + "loss": 2.5825, + "step": 6745 + }, + { + "epoch": 3.39, + "learning_rate": 1.1776547361318551e-05, + "loss": 2.406, + "step": 6750 + }, + { + "epoch": 3.39, + "learning_rate": 1.1743103161515762e-05, + "loss": 2.5823, + "step": 6755 + }, + { + "epoch": 3.39, + "learning_rate": 1.1709691935047137e-05, + "loss": 2.7587, + "step": 6760 + }, + { + "epoch": 3.39, + "learning_rate": 1.1676313765015038e-05, + "loss": 2.5183, + "step": 6765 + }, + { + "epoch": 3.4, + "learning_rate": 1.1642968734439633e-05, + "loss": 2.6452, + "step": 6770 + }, + { + "epoch": 3.4, + "learning_rate": 1.1609656926258634e-05, + "loss": 2.4641, + "step": 6775 + }, + { + "epoch": 3.4, + "learning_rate": 1.1576378423327131e-05, + "loss": 2.6462, + "step": 6780 + }, + { + "epoch": 3.4, + "learning_rate": 1.1543133308417378e-05, + "loss": 2.5271, + "step": 6785 + }, + { + "epoch": 3.41, + "learning_rate": 1.1509921664218587e-05, + "loss": 2.4245, + "step": 6790 + }, + { + "epoch": 3.41, + "learning_rate": 1.14767435733367e-05, + "loss": 2.3622, + "step": 6795 + }, + { + "epoch": 3.41, + "learning_rate": 1.1443599118294227e-05, + "loss": 2.5564, + "step": 6800 + }, + { + "epoch": 3.41, + "learning_rate": 1.1410488381530005e-05, + "loss": 2.342, + "step": 6805 + }, + { + "epoch": 3.42, + "learning_rate": 1.1377411445399006e-05, + "loss": 2.4976, + "step": 6810 + }, + { + "epoch": 3.42, + "learning_rate": 1.1344368392172125e-05, + "loss": 2.4792, + "step": 6815 + }, + { + "epoch": 3.42, + "learning_rate": 1.1311359304036013e-05, + "loss": 2.4829, + "step": 6820 + }, + { + "epoch": 3.42, + "learning_rate": 1.1278384263092797e-05, + "loss": 2.3949, + "step": 6825 + }, + { + "epoch": 3.43, + "learning_rate": 1.124544335135995e-05, + "loss": 2.5555, + "step": 6830 + }, + { + "epoch": 3.43, + "learning_rate": 1.1212536650770041e-05, + "loss": 2.5479, + "step": 6835 + }, + { + "epoch": 3.43, + "learning_rate": 1.1179664243170554e-05, + "loss": 2.5333, + "step": 6840 + }, + { + 
"epoch": 3.43, + "learning_rate": 1.1146826210323677e-05, + "loss": 2.0832, + "step": 6845 + }, + { + "epoch": 3.44, + "learning_rate": 1.1114022633906096e-05, + "loss": 2.7639, + "step": 6850 + }, + { + "epoch": 3.44, + "learning_rate": 1.10812535955088e-05, + "loss": 2.663, + "step": 6855 + }, + { + "epoch": 3.44, + "learning_rate": 1.104851917663687e-05, + "loss": 2.485, + "step": 6860 + }, + { + "epoch": 3.44, + "learning_rate": 1.1015819458709279e-05, + "loss": 2.3004, + "step": 6865 + }, + { + "epoch": 3.45, + "learning_rate": 1.0983154523058687e-05, + "loss": 2.3924, + "step": 6870 + }, + { + "epoch": 3.45, + "learning_rate": 1.095052445093124e-05, + "loss": 2.4694, + "step": 6875 + }, + { + "epoch": 3.45, + "learning_rate": 1.0917929323486398e-05, + "loss": 2.5255, + "step": 6880 + }, + { + "epoch": 3.46, + "learning_rate": 1.0885369221796657e-05, + "loss": 2.211, + "step": 6885 + }, + { + "epoch": 3.46, + "learning_rate": 1.0852844226847425e-05, + "loss": 2.5446, + "step": 6890 + }, + { + "epoch": 3.46, + "learning_rate": 1.0820354419536786e-05, + "loss": 2.778, + "step": 6895 + }, + { + "epoch": 3.46, + "learning_rate": 1.0787899880675298e-05, + "loss": 2.5628, + "step": 6900 + }, + { + "epoch": 3.47, + "learning_rate": 1.0755480690985803e-05, + "loss": 2.5333, + "step": 6905 + }, + { + "epoch": 3.47, + "learning_rate": 1.0723096931103218e-05, + "loss": 2.7511, + "step": 6910 + }, + { + "epoch": 3.47, + "learning_rate": 1.0690748681574336e-05, + "loss": 2.2807, + "step": 6915 + }, + { + "epoch": 3.47, + "learning_rate": 1.0658436022857617e-05, + "loss": 2.5652, + "step": 6920 + }, + { + "epoch": 3.48, + "learning_rate": 1.062615903532303e-05, + "loss": 2.7855, + "step": 6925 + }, + { + "epoch": 3.48, + "learning_rate": 1.0593917799251785e-05, + "loss": 2.5029, + "step": 6930 + }, + { + "epoch": 3.48, + "learning_rate": 1.0561712394836184e-05, + "loss": 2.3403, + "step": 6935 + }, + { + "epoch": 3.48, + "learning_rate": 1.0529542902179406e-05, + "loss": 
2.748, + "step": 6940 + }, + { + "epoch": 3.49, + "learning_rate": 1.0497409401295303e-05, + "loss": 2.4717, + "step": 6945 + }, + { + "epoch": 3.49, + "learning_rate": 1.0465311972108214e-05, + "loss": 2.6532, + "step": 6950 + }, + { + "epoch": 3.49, + "learning_rate": 1.043325069445275e-05, + "loss": 2.3954, + "step": 6955 + }, + { + "epoch": 3.49, + "learning_rate": 1.0401225648073612e-05, + "loss": 2.4491, + "step": 6960 + }, + { + "epoch": 3.5, + "learning_rate": 1.0369236912625377e-05, + "loss": 2.8167, + "step": 6965 + }, + { + "epoch": 3.5, + "learning_rate": 1.0337284567672314e-05, + "loss": 2.4416, + "step": 6970 + }, + { + "epoch": 3.5, + "learning_rate": 1.0305368692688174e-05, + "loss": 2.4095, + "step": 6975 + }, + { + "epoch": 3.5, + "learning_rate": 1.0273489367056002e-05, + "loss": 2.6135, + "step": 6980 + }, + { + "epoch": 3.51, + "learning_rate": 1.0241646670067932e-05, + "loss": 2.7131, + "step": 6985 + }, + { + "epoch": 3.51, + "learning_rate": 1.0209840680924993e-05, + "loss": 2.45, + "step": 6990 + }, + { + "epoch": 3.51, + "learning_rate": 1.0178071478736914e-05, + "loss": 2.4902, + "step": 6995 + }, + { + "epoch": 3.51, + "learning_rate": 1.0146339142521926e-05, + "loss": 2.572, + "step": 7000 + }, + { + "epoch": 3.52, + "learning_rate": 1.0114643751206562e-05, + "loss": 2.3915, + "step": 7005 + }, + { + "epoch": 3.52, + "learning_rate": 1.0082985383625468e-05, + "loss": 2.3651, + "step": 7010 + }, + { + "epoch": 3.52, + "learning_rate": 1.0051364118521197e-05, + "loss": 2.0744, + "step": 7015 + }, + { + "epoch": 3.52, + "learning_rate": 1.0019780034544022e-05, + "loss": 2.3651, + "step": 7020 + }, + { + "epoch": 3.53, + "learning_rate": 9.988233210251723e-06, + "loss": 2.423, + "step": 7025 + }, + { + "epoch": 3.53, + "learning_rate": 9.956723724109441e-06, + "loss": 2.7706, + "step": 7030 + }, + { + "epoch": 3.53, + "learning_rate": 9.925251654489415e-06, + "loss": 2.4113, + "step": 7035 + }, + { + "epoch": 3.53, + "learning_rate": 
9.893817079670825e-06, + "loss": 2.2758, + "step": 7040 + }, + { + "epoch": 3.54, + "learning_rate": 9.8624200778396e-06, + "loss": 2.4551, + "step": 7045 + }, + { + "epoch": 3.54, + "learning_rate": 9.831060727088215e-06, + "loss": 2.3271, + "step": 7050 + }, + { + "epoch": 3.54, + "learning_rate": 9.799739105415483e-06, + "loss": 2.348, + "step": 7055 + }, + { + "epoch": 3.54, + "learning_rate": 9.768455290726402e-06, + "loss": 2.803, + "step": 7060 + }, + { + "epoch": 3.55, + "learning_rate": 9.737209360831895e-06, + "loss": 2.4977, + "step": 7065 + }, + { + "epoch": 3.55, + "learning_rate": 9.70600139344868e-06, + "loss": 2.6904, + "step": 7070 + }, + { + "epoch": 3.55, + "learning_rate": 9.67483146619907e-06, + "loss": 2.4839, + "step": 7075 + }, + { + "epoch": 3.55, + "learning_rate": 9.64369965661073e-06, + "loss": 2.6168, + "step": 7080 + }, + { + "epoch": 3.56, + "learning_rate": 9.612606042116535e-06, + "loss": 2.3343, + "step": 7085 + }, + { + "epoch": 3.56, + "learning_rate": 9.581550700054345e-06, + "loss": 2.4697, + "step": 7090 + }, + { + "epoch": 3.56, + "learning_rate": 9.550533707666842e-06, + "loss": 2.7164, + "step": 7095 + }, + { + "epoch": 3.56, + "learning_rate": 9.519555142101311e-06, + "loss": 2.5116, + "step": 7100 + }, + { + "epoch": 3.57, + "learning_rate": 9.488615080409468e-06, + "loss": 2.4768, + "step": 7105 + }, + { + "epoch": 3.57, + "learning_rate": 9.457713599547252e-06, + "loss": 2.4756, + "step": 7110 + }, + { + "epoch": 3.57, + "learning_rate": 9.426850776374646e-06, + "loss": 2.4257, + "step": 7115 + }, + { + "epoch": 3.57, + "learning_rate": 9.396026687655483e-06, + "loss": 2.5385, + "step": 7120 + }, + { + "epoch": 3.58, + "learning_rate": 9.365241410057246e-06, + "loss": 2.5497, + "step": 7125 + }, + { + "epoch": 3.58, + "learning_rate": 9.334495020150885e-06, + "loss": 2.5848, + "step": 7130 + }, + { + "epoch": 3.58, + "learning_rate": 9.303787594410648e-06, + "loss": 2.5811, + "step": 7135 + }, + { + "epoch": 3.58, + 
"learning_rate": 9.273119209213841e-06, + "loss": 2.2504, + "step": 7140 + }, + { + "epoch": 3.59, + "learning_rate": 9.242489940840684e-06, + "loss": 2.4348, + "step": 7145 + }, + { + "epoch": 3.59, + "learning_rate": 9.211899865474086e-06, + "loss": 2.6538, + "step": 7150 + }, + { + "epoch": 3.59, + "learning_rate": 9.181349059199484e-06, + "loss": 2.9365, + "step": 7155 + }, + { + "epoch": 3.59, + "learning_rate": 9.150837598004648e-06, + "loss": 2.4267, + "step": 7160 + }, + { + "epoch": 3.6, + "learning_rate": 9.120365557779472e-06, + "loss": 2.3872, + "step": 7165 + }, + { + "epoch": 3.6, + "learning_rate": 9.089933014315818e-06, + "loss": 2.5116, + "step": 7170 + }, + { + "epoch": 3.6, + "learning_rate": 9.059540043307293e-06, + "loss": 2.3202, + "step": 7175 + }, + { + "epoch": 3.6, + "learning_rate": 9.029186720349078e-06, + "loss": 2.7859, + "step": 7180 + }, + { + "epoch": 3.61, + "learning_rate": 8.998873120937762e-06, + "loss": 2.6064, + "step": 7185 + }, + { + "epoch": 3.61, + "learning_rate": 8.968599320471102e-06, + "loss": 2.8572, + "step": 7190 + }, + { + "epoch": 3.61, + "learning_rate": 8.938365394247877e-06, + "loss": 2.4965, + "step": 7195 + }, + { + "epoch": 3.61, + "learning_rate": 8.908171417467692e-06, + "loss": 2.7261, + "step": 7200 + }, + { + "epoch": 3.62, + "learning_rate": 8.878017465230778e-06, + "loss": 2.6582, + "step": 7205 + }, + { + "epoch": 3.62, + "learning_rate": 8.847903612537826e-06, + "loss": 2.3756, + "step": 7210 + }, + { + "epoch": 3.62, + "learning_rate": 8.817829934289775e-06, + "loss": 2.5582, + "step": 7215 + }, + { + "epoch": 3.62, + "learning_rate": 8.787796505287657e-06, + "loss": 2.6091, + "step": 7220 + }, + { + "epoch": 3.63, + "learning_rate": 8.757803400232379e-06, + "loss": 2.5523, + "step": 7225 + }, + { + "epoch": 3.63, + "learning_rate": 8.727850693724558e-06, + "loss": 2.6721, + "step": 7230 + }, + { + "epoch": 3.63, + "learning_rate": 8.697938460264326e-06, + "loss": 2.6035, + "step": 7235 + }, + { + 
"epoch": 3.63, + "learning_rate": 8.668066774251158e-06, + "loss": 2.4755, + "step": 7240 + }, + { + "epoch": 3.64, + "learning_rate": 8.638235709983664e-06, + "loss": 2.6591, + "step": 7245 + }, + { + "epoch": 3.64, + "learning_rate": 8.608445341659423e-06, + "loss": 2.3781, + "step": 7250 + }, + { + "epoch": 3.64, + "learning_rate": 8.578695743374798e-06, + "loss": 2.5149, + "step": 7255 + }, + { + "epoch": 3.64, + "learning_rate": 8.548986989124737e-06, + "loss": 2.6264, + "step": 7260 + }, + { + "epoch": 3.65, + "learning_rate": 8.519319152802601e-06, + "loss": 2.638, + "step": 7265 + }, + { + "epoch": 3.65, + "learning_rate": 8.489692308199981e-06, + "loss": 2.4959, + "step": 7270 + }, + { + "epoch": 3.65, + "learning_rate": 8.460106529006511e-06, + "loss": 2.3365, + "step": 7275 + }, + { + "epoch": 3.65, + "learning_rate": 8.430561888809676e-06, + "loss": 2.3178, + "step": 7280 + }, + { + "epoch": 3.66, + "learning_rate": 8.401058461094643e-06, + "loss": 2.5691, + "step": 7285 + }, + { + "epoch": 3.66, + "learning_rate": 8.371596319244087e-06, + "loss": 2.4521, + "step": 7290 + }, + { + "epoch": 3.66, + "learning_rate": 8.342175536537975e-06, + "loss": 2.6887, + "step": 7295 + }, + { + "epoch": 3.66, + "learning_rate": 8.312796186153405e-06, + "loss": 2.2551, + "step": 7300 + }, + { + "epoch": 3.67, + "learning_rate": 8.283458341164432e-06, + "loss": 2.5463, + "step": 7305 + }, + { + "epoch": 3.67, + "learning_rate": 8.254162074541868e-06, + "loss": 2.6583, + "step": 7310 + }, + { + "epoch": 3.67, + "learning_rate": 8.224907459153114e-06, + "loss": 2.5084, + "step": 7315 + }, + { + "epoch": 3.67, + "learning_rate": 8.195694567761968e-06, + "loss": 2.3259, + "step": 7320 + }, + { + "epoch": 3.68, + "learning_rate": 8.166523473028465e-06, + "loss": 2.3955, + "step": 7325 + }, + { + "epoch": 3.68, + "learning_rate": 8.137394247508644e-06, + "loss": 2.5088, + "step": 7330 + }, + { + "epoch": 3.68, + "learning_rate": 8.108306963654452e-06, + "loss": 2.5981, + 
"step": 7335 + }, + { + "epoch": 3.68, + "learning_rate": 8.079261693813487e-06, + "loss": 2.6233, + "step": 7340 + }, + { + "epoch": 3.69, + "learning_rate": 8.05025851022885e-06, + "loss": 2.4973, + "step": 7345 + }, + { + "epoch": 3.69, + "learning_rate": 8.02129748503897e-06, + "loss": 2.5353, + "step": 7350 + }, + { + "epoch": 3.69, + "learning_rate": 7.992378690277416e-06, + "loss": 2.5229, + "step": 7355 + }, + { + "epoch": 3.69, + "learning_rate": 7.96350219787271e-06, + "loss": 2.2312, + "step": 7360 + }, + { + "epoch": 3.7, + "learning_rate": 7.93466807964817e-06, + "loss": 2.6011, + "step": 7365 + }, + { + "epoch": 3.7, + "learning_rate": 7.905876407321711e-06, + "loss": 2.4813, + "step": 7370 + }, + { + "epoch": 3.7, + "learning_rate": 7.87712725250567e-06, + "loss": 2.4722, + "step": 7375 + }, + { + "epoch": 3.7, + "learning_rate": 7.848420686706643e-06, + "loss": 2.6481, + "step": 7380 + }, + { + "epoch": 3.71, + "learning_rate": 7.819756781325285e-06, + "loss": 2.5964, + "step": 7385 + }, + { + "epoch": 3.71, + "learning_rate": 7.791135607656147e-06, + "loss": 2.4698, + "step": 7390 + }, + { + "epoch": 3.71, + "learning_rate": 7.762557236887507e-06, + "loss": 2.6941, + "step": 7395 + }, + { + "epoch": 3.71, + "learning_rate": 7.734021740101168e-06, + "loss": 2.6679, + "step": 7400 + }, + { + "epoch": 3.72, + "learning_rate": 7.705529188272295e-06, + "loss": 2.7456, + "step": 7405 + }, + { + "epoch": 3.72, + "learning_rate": 7.67707965226924e-06, + "loss": 2.3179, + "step": 7410 + }, + { + "epoch": 3.72, + "learning_rate": 7.64867320285337e-06, + "loss": 2.5265, + "step": 7415 + }, + { + "epoch": 3.72, + "learning_rate": 7.620309910678866e-06, + "loss": 2.4766, + "step": 7420 + }, + { + "epoch": 3.73, + "learning_rate": 7.59198984629258e-06, + "loss": 2.5339, + "step": 7425 + }, + { + "epoch": 3.73, + "learning_rate": 7.56371308013385e-06, + "loss": 2.5665, + "step": 7430 + }, + { + "epoch": 3.73, + "learning_rate": 7.535479682534302e-06, + "loss": 
2.6048, + "step": 7435 + }, + { + "epoch": 3.73, + "learning_rate": 7.50728972371772e-06, + "loss": 2.5792, + "step": 7440 + }, + { + "epoch": 3.74, + "learning_rate": 7.479143273799818e-06, + "loss": 2.6327, + "step": 7445 + }, + { + "epoch": 3.74, + "learning_rate": 7.451040402788109e-06, + "loss": 2.4764, + "step": 7450 + }, + { + "epoch": 3.74, + "learning_rate": 7.4229811805817065e-06, + "loss": 2.5359, + "step": 7455 + }, + { + "epoch": 3.74, + "learning_rate": 7.394965676971158e-06, + "loss": 2.3672, + "step": 7460 + }, + { + "epoch": 3.75, + "learning_rate": 7.3669939616382744e-06, + "loss": 2.2471, + "step": 7465 + }, + { + "epoch": 3.75, + "learning_rate": 7.33906610415595e-06, + "loss": 2.2825, + "step": 7470 + }, + { + "epoch": 3.75, + "learning_rate": 7.311182173987999e-06, + "loss": 2.8013, + "step": 7475 + }, + { + "epoch": 3.75, + "learning_rate": 7.283342240488972e-06, + "loss": 2.6741, + "step": 7480 + }, + { + "epoch": 3.76, + "learning_rate": 7.25554637290399e-06, + "loss": 2.4988, + "step": 7485 + }, + { + "epoch": 3.76, + "learning_rate": 7.227794640368573e-06, + "loss": 2.6571, + "step": 7490 + }, + { + "epoch": 3.76, + "learning_rate": 7.2000871119084575e-06, + "loss": 2.7367, + "step": 7495 + }, + { + "epoch": 3.76, + "learning_rate": 7.172423856439459e-06, + "loss": 2.6667, + "step": 7500 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 3.967475490600714e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-7500/training_args.bin b/checkpoint-7500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-7500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475 diff --git 
a/checkpoint-8000/README.md b/checkpoint-8000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-8000/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-8000/adapter_config.json b/checkpoint-8000/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-8000/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-8000/adapter_model.safetensors b/checkpoint-8000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fe4e477231eb826557a2b73fb00ffd13b95c5723 --- /dev/null +++ b/checkpoint-8000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd01bce7876227cfee4e4947e5dc65617b15049fdb814a45a0639beb299ab95e +size 7807744 diff --git a/checkpoint-8000/optimizer.pt b/checkpoint-8000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..baf7c5fc7fa69a17b0568be5b72508ead716b299 --- /dev/null +++ b/checkpoint-8000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77c2feefdf40759094ac4de1a521a622360a3e4cbe1e4668007e0d907e4f0611 +size 15644485 diff --git a/checkpoint-8000/rng_state.pth b/checkpoint-8000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9df067fc9ec061e047f69d8dfcb9e17b5ff16a3d --- /dev/null +++ b/checkpoint-8000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c664f339134bb534b40e61f842ef393ff361e414075d086c15a341edc345fd9e +size 14575 diff --git 
a/checkpoint-8000/scheduler.pt b/checkpoint-8000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..826579e27d000f2e1e86006aa29986642385c80f --- /dev/null +++ b/checkpoint-8000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2a0a280200196c87a655f530dd06a545cba126993908796a763215260d42d50 +size 627 diff --git a/checkpoint-8000/special_tokens_map.json b/checkpoint-8000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-8000/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ] +} diff --git a/checkpoint-8000/tokenization_chatglm.py b/checkpoint-8000/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-8000/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == 
self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) + return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab 
+ + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + 
json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. 
+ + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. 
+ if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-8000/tokenizer.model b/checkpoint-8000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-8000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-8000/tokenizer_config.json b/checkpoint-8000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ b/checkpoint-8000/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 
1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-8000/trainer_state.json b/checkpoint-8000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..889d3dd573a59e37df9b010d5a45e5dd79da6d50 --- /dev/null +++ b/checkpoint-8000/trainer_state.json @@ -0,0 +1,9621 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.014552753732279, + "eval_steps": 500, + "global_step": 8000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 
65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 
165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + "step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 
265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + "step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 
365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + "step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 
465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + 
"step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + 
"step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + "loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + 
"step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + "loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, 
+ "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 
2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 4.8759297640025235e-05, + "loss": 2.9469, + "step": 1005 + }, + { + "epoch": 0.51, + "learning_rate": 4.874700152955661e-05, + "loss": 2.9745, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 4.8734646354147936e-05, + "loss": 2.9341, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 4.8722232144529754e-05, + "loss": 3.0511, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 4.870975893157941e-05, + "loss": 2.5396, + "step": 1025 + }, + { + "epoch": 0.52, + "learning_rate": 4.8697226746321004e-05, + "loss": 2.6699, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 4.868463561992532e-05, + "loss": 2.4887, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 4.867198558370977e-05, + "loss": 2.4773, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 4.865927666913825e-05, + "loss": 2.7612, + "step": 1045 + }, + { + "epoch": 0.53, + "learning_rate": 4.864650890782113e-05, + "loss": 2.9116, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 4.863368233151514e-05, + "loss": 2.8205, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 4.862079697212329e-05, + "loss": 2.6711, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 
4.8607852861694804e-05, + "loss": 3.1138, + "step": 1065 + }, + { + "epoch": 0.54, + "learning_rate": 4.859485003242503e-05, + "loss": 2.5603, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 4.858178851665539e-05, + "loss": 2.7981, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 4.856866834687323e-05, + "loss": 2.507, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 4.855548955571183e-05, + "loss": 3.0315, + "step": 1085 + }, + { + "epoch": 0.55, + "learning_rate": 4.8542252175950244e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 4.852895624051326e-05, + "loss": 2.6794, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 4.851560178247132e-05, + "loss": 2.8278, + "step": 1100 + }, + { + "epoch": 0.55, + "learning_rate": 4.850218883504041e-05, + "loss": 2.6993, + "step": 1105 + }, + { + "epoch": 0.56, + "learning_rate": 4.8488717431582005e-05, + "loss": 2.875, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 4.8475187605602974e-05, + "loss": 2.6057, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 4.84615993907555e-05, + "loss": 2.847, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 4.844795282083697e-05, + "loss": 2.7652, + "step": 1125 + }, + { + "epoch": 0.57, + "learning_rate": 4.843424792978997e-05, + "loss": 2.8128, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 4.842048475170209e-05, + "loss": 2.7077, + "step": 1135 + }, + { + "epoch": 0.57, + "learning_rate": 4.840666332080592e-05, + "loss": 2.7081, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 4.8392783671478934e-05, + "loss": 2.6479, + "step": 1145 + }, + { + "epoch": 0.58, + "learning_rate": 4.837884583824342e-05, + "loss": 2.667, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 4.836484985576638e-05, + "loss": 2.7514, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 4.835079575885944e-05, + "loss": 2.5058, + "step": 1160 + }, + { + "epoch": 0.58, 
+ "learning_rate": 4.833668358247876e-05, + "loss": 2.6183, + "step": 1165 + }, + { + "epoch": 0.59, + "learning_rate": 4.8322513361725006e-05, + "loss": 2.9959, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 4.830828513184317e-05, + "loss": 2.5714, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 4.8293998928222536e-05, + "loss": 2.965, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 4.827965478639661e-05, + "loss": 2.2106, + "step": 1185 + }, + { + "epoch": 0.6, + "learning_rate": 4.8265252742042965e-05, + "loss": 2.7456, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 4.8250792830983225e-05, + "loss": 2.5694, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 4.8236275089182936e-05, + "loss": 2.6826, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 4.8221699552751465e-05, + "loss": 2.878, + "step": 1205 + }, + { + "epoch": 0.61, + "learning_rate": 4.820706625794196e-05, + "loss": 2.8191, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 4.81923752411512e-05, + "loss": 2.739, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 4.8177626538919565e-05, + "loss": 3.0544, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 4.8162820187930875e-05, + "loss": 2.8393, + "step": 1225 + }, + { + "epoch": 0.62, + "learning_rate": 4.814795622501237e-05, + "loss": 2.4457, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 4.813303468713456e-05, + "loss": 2.8575, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 4.8118055611411197e-05, + "loss": 2.8307, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 4.810301903509909e-05, + "loss": 2.7951, + "step": 1245 + }, + { + "epoch": 0.63, + "learning_rate": 4.8087924995598125e-05, + "loss": 2.8456, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 4.807277353045106e-05, + "loss": 2.3564, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 4.8057564677343524e-05, + "loss": 2.5076, + "step": 1260 + 
}, + { + "epoch": 0.63, + "learning_rate": 4.8042298474103884e-05, + "loss": 2.605, + "step": 1265 + }, + { + "epoch": 0.64, + "learning_rate": 4.8026974958703116e-05, + "loss": 2.4782, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 4.8011594169254784e-05, + "loss": 2.7193, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 4.799615614401488e-05, + "loss": 2.8284, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 4.798066092138178e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.65, + "learning_rate": 4.796510853989612e-05, + "loss": 2.7396, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 4.794949903824069e-05, + "loss": 2.7948, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 4.793383245524035e-05, + "loss": 2.7818, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 4.791810882986197e-05, + "loss": 2.7334, + "step": 1305 + }, + { + "epoch": 0.66, + "learning_rate": 4.7902328201214256e-05, + "loss": 2.4824, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 4.7886490608547727e-05, + "loss": 2.6131, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 4.7870596091254584e-05, + "loss": 2.7778, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 4.7854644688868594e-05, + "loss": 2.5263, + "step": 1325 + }, + { + "epoch": 0.67, + "learning_rate": 4.783863644106502e-05, + "loss": 2.7825, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 4.782257138766053e-05, + "loss": 2.7902, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 4.7806449568613066e-05, + "loss": 2.8333, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 4.779027102402177e-05, + "loss": 2.856, + "step": 1345 + }, + { + "epoch": 0.68, + "learning_rate": 4.777403579412686e-05, + "loss": 2.8021, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 4.775774391930956e-05, + "loss": 2.6639, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741395440091976e-05, + 
"loss": 2.5226, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.772499039713702e-05, + "loss": 2.6803, + "step": 1365 + }, + { + "epoch": 0.69, + "learning_rate": 4.7708528831248274e-05, + "loss": 2.4608, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.769201078336991e-05, + "loss": 2.786, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.7675436294586586e-05, + "loss": 2.8294, + "step": 1380 + }, + { + "epoch": 0.7, + "learning_rate": 4.7658805406123356e-05, + "loss": 2.6776, + "step": 1385 + }, + { + "epoch": 0.7, + "learning_rate": 4.7642118159345544e-05, + "loss": 2.8003, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.762537459575865e-05, + "loss": 2.7796, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.7608574757008245e-05, + "loss": 2.9156, + "step": 1400 + }, + { + "epoch": 0.71, + "learning_rate": 4.7591718684879883e-05, + "loss": 3.0521, + "step": 1405 + }, + { + "epoch": 0.71, + "learning_rate": 4.7574806421298976e-05, + "loss": 2.6469, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.755783800833071e-05, + "loss": 2.8512, + "step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.754081348817991e-05, + "loss": 2.66, + "step": 1420 + }, + { + "epoch": 0.72, + "learning_rate": 4.752373290319096e-05, + "loss": 2.6625, + "step": 1425 + }, + { + "epoch": 0.72, + "learning_rate": 4.7506596295847716e-05, + "loss": 2.6711, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 4.7489403708773346e-05, + "loss": 2.8951, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 4.747215518473026e-05, + "loss": 2.7375, + "step": 1440 + }, + { + "epoch": 0.73, + "learning_rate": 4.745485076662e-05, + "loss": 2.8037, + "step": 1445 + }, + { + "epoch": 0.73, + "learning_rate": 4.743749049748315e-05, + "loss": 2.5375, + "step": 1450 + }, + { + "epoch": 0.73, + "learning_rate": 4.742007442049918e-05, + "loss": 2.7664, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 
4.7402602578986374e-05, + "loss": 2.9644, + "step": 1460 + }, + { + "epoch": 0.74, + "learning_rate": 4.738507501640175e-05, + "loss": 2.8212, + "step": 1465 + }, + { + "epoch": 0.74, + "learning_rate": 4.736749177634087e-05, + "loss": 2.7201, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 4.734985290253782e-05, + "loss": 2.7087, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 4.7332158438865035e-05, + "loss": 2.5502, + "step": 1480 + }, + { + "epoch": 0.75, + "learning_rate": 4.731440842933322e-05, + "loss": 2.7607, + "step": 1485 + }, + { + "epoch": 0.75, + "learning_rate": 4.729660291809126e-05, + "loss": 2.5601, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 4.727874194942606e-05, + "loss": 2.5562, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 4.7260825567762486e-05, + "loss": 2.5539, + "step": 1500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7242853817663204e-05, + "loss": 2.6405, + "step": 1505 + }, + { + "epoch": 0.76, + "learning_rate": 4.72248267438286e-05, + "loss": 2.9237, + "step": 1510 + }, + { + "epoch": 0.76, + "learning_rate": 4.72067443910967e-05, + "loss": 3.0453, + "step": 1515 + }, + { + "epoch": 0.76, + "learning_rate": 4.718860680444297e-05, + "loss": 2.7176, + "step": 1520 + }, + { + "epoch": 0.77, + "learning_rate": 4.71704140289803e-05, + "loss": 2.6713, + "step": 1525 + }, + { + "epoch": 0.77, + "learning_rate": 4.715216610995883e-05, + "loss": 2.7284, + "step": 1530 + }, + { + "epoch": 0.77, + "learning_rate": 4.713386309276585e-05, + "loss": 2.5022, + "step": 1535 + }, + { + "epoch": 0.77, + "learning_rate": 4.7115505022925706e-05, + "loss": 2.8323, + "step": 1540 + }, + { + "epoch": 0.78, + "learning_rate": 4.7097091946099666e-05, + "loss": 2.7005, + "step": 1545 + }, + { + "epoch": 0.78, + "learning_rate": 4.7078623908085825e-05, + "loss": 2.8142, + "step": 1550 + }, + { + "epoch": 0.78, + "learning_rate": 4.7060100954818974e-05, + "loss": 2.8505, + "step": 1555 + }, + { + "epoch": 
0.78, + "learning_rate": 4.70415231323705e-05, + "loss": 2.5892, + "step": 1560 + }, + { + "epoch": 0.79, + "learning_rate": 4.7022890486948236e-05, + "loss": 2.8951, + "step": 1565 + }, + { + "epoch": 0.79, + "learning_rate": 4.700420306489641e-05, + "loss": 2.7551, + "step": 1570 + }, + { + "epoch": 0.79, + "learning_rate": 4.698546091269547e-05, + "loss": 2.6814, + "step": 1575 + }, + { + "epoch": 0.79, + "learning_rate": 4.696666407696201e-05, + "loss": 2.8698, + "step": 1580 + }, + { + "epoch": 0.8, + "learning_rate": 4.694781260444862e-05, + "loss": 2.8389, + "step": 1585 + }, + { + "epoch": 0.8, + "learning_rate": 4.6928906542043786e-05, + "loss": 2.8353, + "step": 1590 + }, + { + "epoch": 0.8, + "learning_rate": 4.690994593677179e-05, + "loss": 2.6565, + "step": 1595 + }, + { + "epoch": 0.8, + "learning_rate": 4.689093083579256e-05, + "loss": 2.6958, + "step": 1600 + }, + { + "epoch": 0.81, + "learning_rate": 4.687186128640157e-05, + "loss": 2.6768, + "step": 1605 + }, + { + "epoch": 0.81, + "learning_rate": 4.685273733602975e-05, + "loss": 2.734, + "step": 1610 + }, + { + "epoch": 0.81, + "learning_rate": 4.6833559032243284e-05, + "loss": 2.5348, + "step": 1615 + }, + { + "epoch": 0.81, + "learning_rate": 4.6814326422743594e-05, + "loss": 2.7775, + "step": 1620 + }, + { + "epoch": 0.82, + "learning_rate": 4.679503955536715e-05, + "loss": 2.6578, + "step": 1625 + }, + { + "epoch": 0.82, + "learning_rate": 4.6775698478085393e-05, + "loss": 2.8792, + "step": 1630 + }, + { + "epoch": 0.82, + "learning_rate": 4.675630323900458e-05, + "loss": 2.7883, + "step": 1635 + }, + { + "epoch": 0.82, + "learning_rate": 4.67368538863657e-05, + "loss": 2.8824, + "step": 1640 + }, + { + "epoch": 0.83, + "learning_rate": 4.671735046854433e-05, + "loss": 2.6775, + "step": 1645 + }, + { + "epoch": 0.83, + "learning_rate": 4.669779303405051e-05, + "loss": 2.5658, + "step": 1650 + }, + { + "epoch": 0.83, + "learning_rate": 4.667818163152864e-05, + "loss": 2.5262, + "step": 1655 + 
}, + { + "epoch": 0.83, + "learning_rate": 4.665851630975736e-05, + "loss": 2.5749, + "step": 1660 + }, + { + "epoch": 0.84, + "learning_rate": 4.6638797117649424e-05, + "loss": 2.8718, + "step": 1665 + }, + { + "epoch": 0.84, + "learning_rate": 4.661902410425155e-05, + "loss": 2.8039, + "step": 1670 + }, + { + "epoch": 0.84, + "learning_rate": 4.659919731874435e-05, + "loss": 2.8089, + "step": 1675 + }, + { + "epoch": 0.84, + "learning_rate": 4.6579316810442174e-05, + "loss": 2.8144, + "step": 1680 + }, + { + "epoch": 0.85, + "learning_rate": 4.6559382628792995e-05, + "loss": 2.5963, + "step": 1685 + }, + { + "epoch": 0.85, + "learning_rate": 4.653939482337828e-05, + "loss": 2.5755, + "step": 1690 + }, + { + "epoch": 0.85, + "learning_rate": 4.651935344391286e-05, + "loss": 2.7631, + "step": 1695 + }, + { + "epoch": 0.85, + "learning_rate": 4.649925854024486e-05, + "loss": 2.8117, + "step": 1700 + }, + { + "epoch": 0.86, + "learning_rate": 4.647911016235549e-05, + "loss": 2.7352, + "step": 1705 + }, + { + "epoch": 0.86, + "learning_rate": 4.6458908360358985e-05, + "loss": 2.5572, + "step": 1710 + }, + { + "epoch": 0.86, + "learning_rate": 4.643865318450246e-05, + "loss": 2.8372, + "step": 1715 + }, + { + "epoch": 0.86, + "learning_rate": 4.6418344685165774e-05, + "loss": 2.7892, + "step": 1720 + }, + { + "epoch": 0.87, + "learning_rate": 4.639798291286143e-05, + "loss": 2.7832, + "step": 1725 + }, + { + "epoch": 0.87, + "learning_rate": 4.637756791823442e-05, + "loss": 2.5401, + "step": 1730 + }, + { + "epoch": 0.87, + "learning_rate": 4.635709975206213e-05, + "loss": 2.9286, + "step": 1735 + }, + { + "epoch": 0.87, + "learning_rate": 4.633657846525417e-05, + "loss": 2.6474, + "step": 1740 + }, + { + "epoch": 0.88, + "learning_rate": 4.6316004108852305e-05, + "loss": 2.5047, + "step": 1745 + }, + { + "epoch": 0.88, + "learning_rate": 4.629537673403029e-05, + "loss": 2.8374, + "step": 1750 + }, + { + "epoch": 0.88, + "learning_rate": 4.627469639209373e-05, + 
"loss": 2.634, + "step": 1755 + }, + { + "epoch": 0.88, + "learning_rate": 4.6253963134480006e-05, + "loss": 2.6884, + "step": 1760 + }, + { + "epoch": 0.89, + "learning_rate": 4.623317701275809e-05, + "loss": 2.6334, + "step": 1765 + }, + { + "epoch": 0.89, + "learning_rate": 4.621233807862844e-05, + "loss": 2.764, + "step": 1770 + }, + { + "epoch": 0.89, + "learning_rate": 4.6191446383922886e-05, + "loss": 3.0864, + "step": 1775 + }, + { + "epoch": 0.89, + "learning_rate": 4.617050198060448e-05, + "loss": 2.7964, + "step": 1780 + }, + { + "epoch": 0.9, + "learning_rate": 4.6149504920767376e-05, + "loss": 2.4538, + "step": 1785 + }, + { + "epoch": 0.9, + "learning_rate": 4.6128455256636706e-05, + "loss": 2.7352, + "step": 1790 + }, + { + "epoch": 0.9, + "learning_rate": 4.6107353040568416e-05, + "loss": 2.6893, + "step": 1795 + }, + { + "epoch": 0.9, + "learning_rate": 4.6086198325049185e-05, + "loss": 2.8609, + "step": 1800 + }, + { + "epoch": 0.91, + "learning_rate": 4.6064991162696275e-05, + "loss": 2.5285, + "step": 1805 + }, + { + "epoch": 0.91, + "learning_rate": 4.604373160625739e-05, + "loss": 2.5707, + "step": 1810 + }, + { + "epoch": 0.91, + "learning_rate": 4.602241970861053e-05, + "loss": 2.7198, + "step": 1815 + }, + { + "epoch": 0.91, + "learning_rate": 4.6001055522763926e-05, + "loss": 2.855, + "step": 1820 + }, + { + "epoch": 0.92, + "learning_rate": 4.597963910185582e-05, + "loss": 2.4175, + "step": 1825 + }, + { + "epoch": 0.92, + "learning_rate": 4.595817049915441e-05, + "loss": 2.5444, + "step": 1830 + }, + { + "epoch": 0.92, + "learning_rate": 4.5936649768057646e-05, + "loss": 2.6893, + "step": 1835 + }, + { + "epoch": 0.92, + "learning_rate": 4.591507696209318e-05, + "loss": 2.6725, + "step": 1840 + }, + { + "epoch": 0.93, + "learning_rate": 4.589345213491817e-05, + "loss": 2.834, + "step": 1845 + }, + { + "epoch": 0.93, + "learning_rate": 4.587177534031914e-05, + "loss": 2.8259, + "step": 1850 + }, + { + "epoch": 0.93, + "learning_rate": 
4.585004663221188e-05, + "loss": 2.6146, + "step": 1855 + }, + { + "epoch": 0.93, + "learning_rate": 4.582826606464134e-05, + "loss": 2.4673, + "step": 1860 + }, + { + "epoch": 0.94, + "learning_rate": 4.5806433691781416e-05, + "loss": 2.9178, + "step": 1865 + }, + { + "epoch": 0.94, + "learning_rate": 4.578454956793487e-05, + "loss": 2.9224, + "step": 1870 + }, + { + "epoch": 0.94, + "learning_rate": 4.576261374753318e-05, + "loss": 2.7987, + "step": 1875 + }, + { + "epoch": 0.94, + "learning_rate": 4.574062628513642e-05, + "loss": 2.3848, + "step": 1880 + }, + { + "epoch": 0.95, + "learning_rate": 4.57185872354331e-05, + "loss": 2.6054, + "step": 1885 + }, + { + "epoch": 0.95, + "learning_rate": 4.569649665324003e-05, + "loss": 2.6743, + "step": 1890 + }, + { + "epoch": 0.95, + "learning_rate": 4.567435459350222e-05, + "loss": 2.9375, + "step": 1895 + }, + { + "epoch": 0.95, + "learning_rate": 4.565216111129269e-05, + "loss": 2.9372, + "step": 1900 + }, + { + "epoch": 0.96, + "learning_rate": 4.562991626181239e-05, + "loss": 2.669, + "step": 1905 + }, + { + "epoch": 0.96, + "learning_rate": 4.560762010039001e-05, + "loss": 2.7644, + "step": 1910 + }, + { + "epoch": 0.96, + "learning_rate": 4.558527268248187e-05, + "loss": 2.6886, + "step": 1915 + }, + { + "epoch": 0.96, + "learning_rate": 4.55628740636718e-05, + "loss": 2.8618, + "step": 1920 + }, + { + "epoch": 0.97, + "learning_rate": 4.554042429967095e-05, + "loss": 2.8411, + "step": 1925 + }, + { + "epoch": 0.97, + "learning_rate": 4.55179234463177e-05, + "loss": 2.6047, + "step": 1930 + }, + { + "epoch": 0.97, + "learning_rate": 4.5495371559577496e-05, + "loss": 2.8675, + "step": 1935 + }, + { + "epoch": 0.97, + "learning_rate": 4.547276869554271e-05, + "loss": 2.751, + "step": 1940 + }, + { + "epoch": 0.98, + "learning_rate": 4.545011491043253e-05, + "loss": 2.8638, + "step": 1945 + }, + { + "epoch": 0.98, + "learning_rate": 4.5427410260592775e-05, + "loss": 2.5092, + "step": 1950 + }, + { + "epoch": 0.98, 
+ "learning_rate": 4.540465480249579e-05, + "loss": 2.5059, + "step": 1955 + }, + { + "epoch": 0.98, + "learning_rate": 4.5381848592740285e-05, + "loss": 2.9433, + "step": 1960 + }, + { + "epoch": 0.99, + "learning_rate": 4.535899168805121e-05, + "loss": 2.6959, + "step": 1965 + }, + { + "epoch": 0.99, + "learning_rate": 4.533608414527961e-05, + "loss": 2.552, + "step": 1970 + }, + { + "epoch": 0.99, + "learning_rate": 4.5313126021402465e-05, + "loss": 2.7169, + "step": 1975 + }, + { + "epoch": 0.99, + "learning_rate": 4.529011737352258e-05, + "loss": 2.5672, + "step": 1980 + }, + { + "epoch": 1.0, + "learning_rate": 4.526705825886841e-05, + "loss": 2.6434, + "step": 1985 + }, + { + "epoch": 1.0, + "learning_rate": 4.5243948734793947e-05, + "loss": 2.8062, + "step": 1990 + }, + { + "epoch": 1.0, + "learning_rate": 4.5220788858778556e-05, + "loss": 2.6929, + "step": 1995 + }, + { + "epoch": 1.0, + "learning_rate": 4.519757868842684e-05, + "loss": 2.5837, + "step": 2000 + }, + { + "epoch": 1.01, + "learning_rate": 4.517431828146852e-05, + "loss": 2.821, + "step": 2005 + }, + { + "epoch": 1.01, + "learning_rate": 4.515100769575824e-05, + "loss": 2.7913, + "step": 2010 + }, + { + "epoch": 1.01, + "learning_rate": 4.512764698927545e-05, + "loss": 2.699, + "step": 2015 + }, + { + "epoch": 1.01, + "learning_rate": 4.5104236220124286e-05, + "loss": 2.6574, + "step": 2020 + }, + { + "epoch": 1.02, + "learning_rate": 4.508077544653338e-05, + "loss": 2.5909, + "step": 2025 + }, + { + "epoch": 1.02, + "learning_rate": 4.5057264726855765e-05, + "loss": 2.5695, + "step": 2030 + }, + { + "epoch": 1.02, + "learning_rate": 4.5033704119568675e-05, + "loss": 2.4937, + "step": 2035 + }, + { + "epoch": 1.02, + "learning_rate": 4.501009368327344e-05, + "loss": 2.7253, + "step": 2040 + }, + { + "epoch": 1.03, + "learning_rate": 4.4986433476695334e-05, + "loss": 2.8681, + "step": 2045 + }, + { + "epoch": 1.03, + "learning_rate": 4.496272355868341e-05, + "loss": 2.74, + "step": 2050 + }, + 
{ + "epoch": 1.03, + "learning_rate": 4.4938963988210365e-05, + "loss": 2.7439, + "step": 2055 + }, + { + "epoch": 1.03, + "learning_rate": 4.491515482437242e-05, + "loss": 2.9539, + "step": 2060 + }, + { + "epoch": 1.04, + "learning_rate": 4.4891296126389104e-05, + "loss": 2.8165, + "step": 2065 + }, + { + "epoch": 1.04, + "learning_rate": 4.48673879536032e-05, + "loss": 2.601, + "step": 2070 + }, + { + "epoch": 1.04, + "learning_rate": 4.484343036548051e-05, + "loss": 2.5189, + "step": 2075 + }, + { + "epoch": 1.04, + "learning_rate": 4.481942342160976e-05, + "loss": 2.6276, + "step": 2080 + }, + { + "epoch": 1.05, + "learning_rate": 4.479536718170243e-05, + "loss": 2.5027, + "step": 2085 + }, + { + "epoch": 1.05, + "learning_rate": 4.477126170559262e-05, + "loss": 2.8654, + "step": 2090 + }, + { + "epoch": 1.05, + "learning_rate": 4.474710705323688e-05, + "loss": 2.8511, + "step": 2095 + }, + { + "epoch": 1.05, + "learning_rate": 4.47229032847141e-05, + "loss": 2.6129, + "step": 2100 + }, + { + "epoch": 1.06, + "learning_rate": 4.469865046022531e-05, + "loss": 2.8964, + "step": 2105 + }, + { + "epoch": 1.06, + "learning_rate": 4.4674348640093554e-05, + "loss": 2.7502, + "step": 2110 + }, + { + "epoch": 1.06, + "learning_rate": 4.4649997884763765e-05, + "loss": 2.591, + "step": 2115 + }, + { + "epoch": 1.06, + "learning_rate": 4.462559825480257e-05, + "loss": 2.7982, + "step": 2120 + }, + { + "epoch": 1.07, + "learning_rate": 4.460114981089815e-05, + "loss": 2.576, + "step": 2125 + }, + { + "epoch": 1.07, + "learning_rate": 4.457665261386014e-05, + "loss": 2.692, + "step": 2130 + }, + { + "epoch": 1.07, + "learning_rate": 4.455210672461938e-05, + "loss": 2.5818, + "step": 2135 + }, + { + "epoch": 1.07, + "learning_rate": 4.452751220422787e-05, + "loss": 2.6792, + "step": 2140 + }, + { + "epoch": 1.08, + "learning_rate": 4.450286911385856e-05, + "loss": 2.8352, + "step": 2145 + }, + { + "epoch": 1.08, + "learning_rate": 4.4478177514805166e-05, + "loss": 2.5787, + 
"step": 2150 + }, + { + "epoch": 1.08, + "learning_rate": 4.4453437468482103e-05, + "loss": 2.655, + "step": 2155 + }, + { + "epoch": 1.08, + "learning_rate": 4.442864903642428e-05, + "loss": 2.7664, + "step": 2160 + }, + { + "epoch": 1.09, + "learning_rate": 4.440381228028692e-05, + "loss": 2.7231, + "step": 2165 + }, + { + "epoch": 1.09, + "learning_rate": 4.437892726184548e-05, + "loss": 2.416, + "step": 2170 + }, + { + "epoch": 1.09, + "learning_rate": 4.4353994042995446e-05, + "loss": 2.4619, + "step": 2175 + }, + { + "epoch": 1.09, + "learning_rate": 4.4329012685752183e-05, + "loss": 2.7816, + "step": 2180 + }, + { + "epoch": 1.1, + "learning_rate": 4.430398325225078e-05, + "loss": 2.8678, + "step": 2185 + }, + { + "epoch": 1.1, + "learning_rate": 4.427890580474594e-05, + "loss": 2.6007, + "step": 2190 + }, + { + "epoch": 1.1, + "learning_rate": 4.4253780405611754e-05, + "loss": 2.8195, + "step": 2195 + }, + { + "epoch": 1.1, + "learning_rate": 4.42286071173416e-05, + "loss": 2.5117, + "step": 2200 + }, + { + "epoch": 1.11, + "learning_rate": 4.4203386002547956e-05, + "loss": 2.5527, + "step": 2205 + }, + { + "epoch": 1.11, + "learning_rate": 4.417811712396226e-05, + "loss": 2.662, + "step": 2210 + }, + { + "epoch": 1.11, + "learning_rate": 4.415280054443477e-05, + "loss": 2.7563, + "step": 2215 + }, + { + "epoch": 1.11, + "learning_rate": 4.4127436326934354e-05, + "loss": 2.5118, + "step": 2220 + }, + { + "epoch": 1.12, + "learning_rate": 4.41020245345484e-05, + "loss": 2.7208, + "step": 2225 + }, + { + "epoch": 1.12, + "learning_rate": 4.4076565230482607e-05, + "loss": 2.649, + "step": 2230 + }, + { + "epoch": 1.12, + "learning_rate": 4.4051058478060856e-05, + "loss": 2.652, + "step": 2235 + }, + { + "epoch": 1.12, + "learning_rate": 4.402550434072505e-05, + "loss": 2.6885, + "step": 2240 + }, + { + "epoch": 1.13, + "learning_rate": 4.3999902882034935e-05, + "loss": 2.8545, + "step": 2245 + }, + { + "epoch": 1.13, + "learning_rate": 4.397425416566797e-05, + 
"loss": 2.9435, + "step": 2250 + }, + { + "epoch": 1.13, + "learning_rate": 4.3948558255419146e-05, + "loss": 2.6555, + "step": 2255 + }, + { + "epoch": 1.13, + "learning_rate": 4.392281521520085e-05, + "loss": 2.6108, + "step": 2260 + }, + { + "epoch": 1.14, + "learning_rate": 4.389702510904269e-05, + "loss": 2.6256, + "step": 2265 + }, + { + "epoch": 1.14, + "learning_rate": 4.387118800109133e-05, + "loss": 2.7996, + "step": 2270 + }, + { + "epoch": 1.14, + "learning_rate": 4.384530395561035e-05, + "loss": 2.6649, + "step": 2275 + }, + { + "epoch": 1.14, + "learning_rate": 4.381937303698006e-05, + "loss": 3.0073, + "step": 2280 + }, + { + "epoch": 1.15, + "learning_rate": 4.379339530969738e-05, + "loss": 2.8493, + "step": 2285 + }, + { + "epoch": 1.15, + "learning_rate": 4.3767370838375635e-05, + "loss": 2.5572, + "step": 2290 + }, + { + "epoch": 1.15, + "learning_rate": 4.374129968774443e-05, + "loss": 2.3837, + "step": 2295 + }, + { + "epoch": 1.15, + "learning_rate": 4.371518192264946e-05, + "loss": 2.4604, + "step": 2300 + }, + { + "epoch": 1.16, + "learning_rate": 4.3689017608052374e-05, + "loss": 2.8866, + "step": 2305 + }, + { + "epoch": 1.16, + "learning_rate": 4.3662806809030585e-05, + "loss": 2.8943, + "step": 2310 + }, + { + "epoch": 1.16, + "learning_rate": 4.3636549590777144e-05, + "loss": 2.7629, + "step": 2315 + }, + { + "epoch": 1.16, + "learning_rate": 4.361024601860054e-05, + "loss": 2.8629, + "step": 2320 + }, + { + "epoch": 1.17, + "learning_rate": 4.3583896157924574e-05, + "loss": 2.7952, + "step": 2325 + }, + { + "epoch": 1.17, + "learning_rate": 4.355750007428817e-05, + "loss": 2.2057, + "step": 2330 + }, + { + "epoch": 1.17, + "learning_rate": 4.3531057833345216e-05, + "loss": 2.9143, + "step": 2335 + }, + { + "epoch": 1.17, + "learning_rate": 4.3504569500864424e-05, + "loss": 2.5354, + "step": 2340 + }, + { + "epoch": 1.18, + "learning_rate": 4.347803514272911e-05, + "loss": 2.7451, + "step": 2345 + }, + { + "epoch": 1.18, + 
"learning_rate": 4.3451454824937113e-05, + "loss": 3.0827, + "step": 2350 + }, + { + "epoch": 1.18, + "learning_rate": 4.3424828613600555e-05, + "loss": 2.5842, + "step": 2355 + }, + { + "epoch": 1.18, + "learning_rate": 4.339815657494572e-05, + "loss": 2.4492, + "step": 2360 + }, + { + "epoch": 1.19, + "learning_rate": 4.3371438775312865e-05, + "loss": 2.5067, + "step": 2365 + }, + { + "epoch": 1.19, + "learning_rate": 4.334467528115608e-05, + "loss": 2.5902, + "step": 2370 + }, + { + "epoch": 1.19, + "learning_rate": 4.33178661590431e-05, + "loss": 2.8618, + "step": 2375 + }, + { + "epoch": 1.19, + "learning_rate": 4.329101147565515e-05, + "loss": 2.6831, + "step": 2380 + }, + { + "epoch": 1.2, + "learning_rate": 4.3264111297786794e-05, + "loss": 2.7531, + "step": 2385 + }, + { + "epoch": 1.2, + "learning_rate": 4.323716569234572e-05, + "loss": 2.819, + "step": 2390 + }, + { + "epoch": 1.2, + "learning_rate": 4.321017472635263e-05, + "loss": 2.6319, + "step": 2395 + }, + { + "epoch": 1.2, + "learning_rate": 4.318313846694105e-05, + "loss": 2.3078, + "step": 2400 + }, + { + "epoch": 1.21, + "learning_rate": 4.315605698135714e-05, + "loss": 2.7962, + "step": 2405 + }, + { + "epoch": 1.21, + "learning_rate": 4.312893033695958e-05, + "loss": 2.8479, + "step": 2410 + }, + { + "epoch": 1.21, + "learning_rate": 4.310175860121936e-05, + "loss": 2.6289, + "step": 2415 + }, + { + "epoch": 1.21, + "learning_rate": 4.30745418417196e-05, + "loss": 2.7003, + "step": 2420 + }, + { + "epoch": 1.22, + "learning_rate": 4.304728012615543e-05, + "loss": 2.4947, + "step": 2425 + }, + { + "epoch": 1.22, + "learning_rate": 4.3019973522333815e-05, + "loss": 2.6911, + "step": 2430 + }, + { + "epoch": 1.22, + "learning_rate": 4.2992622098173334e-05, + "loss": 2.6931, + "step": 2435 + }, + { + "epoch": 1.22, + "learning_rate": 4.296522592170406e-05, + "loss": 2.774, + "step": 2440 + }, + { + "epoch": 1.23, + "learning_rate": 4.293778506106737e-05, + "loss": 2.759, + "step": 2445 + }, + { + 
"epoch": 1.23, + "learning_rate": 4.29102995845158e-05, + "loss": 2.5543, + "step": 2450 + }, + { + "epoch": 1.23, + "learning_rate": 4.288276956041284e-05, + "loss": 2.7702, + "step": 2455 + }, + { + "epoch": 1.23, + "learning_rate": 4.285519505723278e-05, + "loss": 2.835, + "step": 2460 + }, + { + "epoch": 1.24, + "learning_rate": 4.282757614356055e-05, + "loss": 2.6516, + "step": 2465 + }, + { + "epoch": 1.24, + "learning_rate": 4.2799912888091544e-05, + "loss": 2.7392, + "step": 2470 + }, + { + "epoch": 1.24, + "learning_rate": 4.277220535963143e-05, + "loss": 2.6522, + "step": 2475 + }, + { + "epoch": 1.24, + "learning_rate": 4.274445362709601e-05, + "loss": 2.6514, + "step": 2480 + }, + { + "epoch": 1.25, + "learning_rate": 4.271665775951104e-05, + "loss": 2.7507, + "step": 2485 + }, + { + "epoch": 1.25, + "learning_rate": 4.2688817826012005e-05, + "loss": 3.0499, + "step": 2490 + }, + { + "epoch": 1.25, + "learning_rate": 4.2660933895844055e-05, + "loss": 2.6927, + "step": 2495 + }, + { + "epoch": 1.25, + "learning_rate": 4.2633006038361736e-05, + "loss": 2.8456, + "step": 2500 + }, + { + "epoch": 1.26, + "learning_rate": 4.2605034323028844e-05, + "loss": 2.671, + "step": 2505 + }, + { + "epoch": 1.26, + "learning_rate": 4.2577018819418296e-05, + "loss": 2.6543, + "step": 2510 + }, + { + "epoch": 1.26, + "learning_rate": 4.254895959721189e-05, + "loss": 2.566, + "step": 2515 + }, + { + "epoch": 1.26, + "learning_rate": 4.252085672620019e-05, + "loss": 2.4943, + "step": 2520 + }, + { + "epoch": 1.27, + "learning_rate": 4.249271027628228e-05, + "loss": 2.5115, + "step": 2525 + }, + { + "epoch": 1.27, + "learning_rate": 4.24645203174657e-05, + "loss": 2.3859, + "step": 2530 + }, + { + "epoch": 1.27, + "learning_rate": 4.243628691986617e-05, + "loss": 2.8148, + "step": 2535 + }, + { + "epoch": 1.27, + "learning_rate": 4.240801015370743e-05, + "loss": 2.5597, + "step": 2540 + }, + { + "epoch": 1.28, + "learning_rate": 4.2379690089321145e-05, + "loss": 2.805, + 
"step": 2545 + }, + { + "epoch": 1.28, + "learning_rate": 4.235132679714664e-05, + "loss": 2.7308, + "step": 2550 + }, + { + "epoch": 1.28, + "learning_rate": 4.232292034773076e-05, + "loss": 2.675, + "step": 2555 + }, + { + "epoch": 1.28, + "learning_rate": 4.2294470811727704e-05, + "loss": 2.6359, + "step": 2560 + }, + { + "epoch": 1.29, + "learning_rate": 4.226597825989883e-05, + "loss": 2.4288, + "step": 2565 + }, + { + "epoch": 1.29, + "learning_rate": 4.223744276311249e-05, + "loss": 2.642, + "step": 2570 + }, + { + "epoch": 1.29, + "learning_rate": 4.220886439234385e-05, + "loss": 2.7375, + "step": 2575 + }, + { + "epoch": 1.29, + "learning_rate": 4.218024321867472e-05, + "loss": 2.6114, + "step": 2580 + }, + { + "epoch": 1.3, + "learning_rate": 4.2151579313293364e-05, + "loss": 2.4941, + "step": 2585 + }, + { + "epoch": 1.3, + "learning_rate": 4.212287274749434e-05, + "loss": 2.6086, + "step": 2590 + }, + { + "epoch": 1.3, + "learning_rate": 4.2094123592678295e-05, + "loss": 2.7854, + "step": 2595 + }, + { + "epoch": 1.3, + "learning_rate": 4.206533192035184e-05, + "loss": 2.8291, + "step": 2600 + }, + { + "epoch": 1.31, + "learning_rate": 4.2036497802127294e-05, + "loss": 2.6846, + "step": 2605 + }, + { + "epoch": 1.31, + "learning_rate": 4.200762130972259e-05, + "loss": 2.6667, + "step": 2610 + }, + { + "epoch": 1.31, + "learning_rate": 4.197870251496103e-05, + "loss": 2.7895, + "step": 2615 + }, + { + "epoch": 1.31, + "learning_rate": 4.1949741489771155e-05, + "loss": 2.6958, + "step": 2620 + }, + { + "epoch": 1.32, + "learning_rate": 4.192073830618652e-05, + "loss": 2.8536, + "step": 2625 + }, + { + "epoch": 1.32, + "learning_rate": 4.189169303634555e-05, + "loss": 2.5218, + "step": 2630 + }, + { + "epoch": 1.32, + "learning_rate": 4.186260575249136e-05, + "loss": 2.6141, + "step": 2635 + }, + { + "epoch": 1.32, + "learning_rate": 4.1833476526971546e-05, + "loss": 2.6231, + "step": 2640 + }, + { + "epoch": 1.33, + "learning_rate": 
4.1804305432238036e-05, + "loss": 2.8229, + "step": 2645 + }, + { + "epoch": 1.33, + "learning_rate": 4.1775092540846885e-05, + "loss": 2.4763, + "step": 2650 + }, + { + "epoch": 1.33, + "learning_rate": 4.174583792545813e-05, + "loss": 2.5162, + "step": 2655 + }, + { + "epoch": 1.33, + "learning_rate": 4.1716541658835574e-05, + "loss": 2.7301, + "step": 2660 + }, + { + "epoch": 1.34, + "learning_rate": 4.16872038138466e-05, + "loss": 2.6055, + "step": 2665 + }, + { + "epoch": 1.34, + "learning_rate": 4.165782446346203e-05, + "loss": 2.6529, + "step": 2670 + }, + { + "epoch": 1.34, + "learning_rate": 4.162840368075591e-05, + "loss": 2.7934, + "step": 2675 + }, + { + "epoch": 1.34, + "learning_rate": 4.159894153890536e-05, + "loss": 2.9108, + "step": 2680 + }, + { + "epoch": 1.35, + "learning_rate": 4.1569438111190326e-05, + "loss": 2.6543, + "step": 2685 + }, + { + "epoch": 1.35, + "learning_rate": 4.1539893470993496e-05, + "loss": 2.6038, + "step": 2690 + }, + { + "epoch": 1.35, + "learning_rate": 4.151030769180002e-05, + "loss": 2.5741, + "step": 2695 + }, + { + "epoch": 1.35, + "learning_rate": 4.14806808471974e-05, + "loss": 2.8525, + "step": 2700 + }, + { + "epoch": 1.36, + "learning_rate": 4.1451013010875275e-05, + "loss": 2.5151, + "step": 2705 + }, + { + "epoch": 1.36, + "learning_rate": 4.1421304256625206e-05, + "loss": 2.5666, + "step": 2710 + }, + { + "epoch": 1.36, + "learning_rate": 4.139155465834058e-05, + "loss": 2.7287, + "step": 2715 + }, + { + "epoch": 1.36, + "learning_rate": 4.136176429001634e-05, + "loss": 2.8867, + "step": 2720 + }, + { + "epoch": 1.37, + "learning_rate": 4.133193322574885e-05, + "loss": 2.6398, + "step": 2725 + }, + { + "epoch": 1.37, + "learning_rate": 4.130206153973568e-05, + "loss": 2.699, + "step": 2730 + }, + { + "epoch": 1.37, + "learning_rate": 4.1272149306275447e-05, + "loss": 2.5248, + "step": 2735 + }, + { + "epoch": 1.37, + "learning_rate": 4.124219659976762e-05, + "loss": 2.8947, + "step": 2740 + }, + { + "epoch": 
1.38, + "learning_rate": 4.1212203494712344e-05, + "loss": 2.6473, + "step": 2745 + }, + { + "epoch": 1.38, + "learning_rate": 4.1182170065710226e-05, + "loss": 2.3734, + "step": 2750 + }, + { + "epoch": 1.38, + "learning_rate": 4.115209638746218e-05, + "loss": 2.685, + "step": 2755 + }, + { + "epoch": 1.39, + "learning_rate": 4.1121982534769224e-05, + "loss": 2.9712, + "step": 2760 + }, + { + "epoch": 1.39, + "learning_rate": 4.109182858253231e-05, + "loss": 2.6578, + "step": 2765 + }, + { + "epoch": 1.39, + "learning_rate": 4.106163460575212e-05, + "loss": 2.6842, + "step": 2770 + }, + { + "epoch": 1.39, + "learning_rate": 4.10314006795289e-05, + "loss": 2.511, + "step": 2775 + }, + { + "epoch": 1.4, + "learning_rate": 4.100112687906224e-05, + "loss": 2.8426, + "step": 2780 + }, + { + "epoch": 1.4, + "learning_rate": 4.097081327965092e-05, + "loss": 2.8462, + "step": 2785 + }, + { + "epoch": 1.4, + "learning_rate": 4.094045995669271e-05, + "loss": 2.7552, + "step": 2790 + }, + { + "epoch": 1.4, + "learning_rate": 4.091006698568419e-05, + "loss": 2.3802, + "step": 2795 + }, + { + "epoch": 1.41, + "learning_rate": 4.087963444222054e-05, + "loss": 2.3967, + "step": 2800 + }, + { + "epoch": 1.41, + "learning_rate": 4.084916240199537e-05, + "loss": 2.7945, + "step": 2805 + }, + { + "epoch": 1.41, + "learning_rate": 4.0818650940800524e-05, + "loss": 2.5304, + "step": 2810 + }, + { + "epoch": 1.41, + "learning_rate": 4.0788100134525925e-05, + "loss": 2.6779, + "step": 2815 + }, + { + "epoch": 1.42, + "learning_rate": 4.075751005915932e-05, + "loss": 2.6202, + "step": 2820 + }, + { + "epoch": 1.42, + "learning_rate": 4.072688079078616e-05, + "loss": 2.5915, + "step": 2825 + }, + { + "epoch": 1.42, + "learning_rate": 4.069621240558935e-05, + "loss": 2.5139, + "step": 2830 + }, + { + "epoch": 1.42, + "learning_rate": 4.066550497984911e-05, + "loss": 2.5506, + "step": 2835 + }, + { + "epoch": 1.43, + "learning_rate": 4.063475858994276e-05, + "loss": 2.7154, + "step": 2840 + 
}, + { + "epoch": 1.43, + "learning_rate": 4.060397331234452e-05, + "loss": 2.6309, + "step": 2845 + }, + { + "epoch": 1.43, + "learning_rate": 4.0573149223625365e-05, + "loss": 2.6922, + "step": 2850 + }, + { + "epoch": 1.43, + "learning_rate": 4.054228640045276e-05, + "loss": 2.703, + "step": 2855 + }, + { + "epoch": 1.44, + "learning_rate": 4.051138491959053e-05, + "loss": 2.3475, + "step": 2860 + }, + { + "epoch": 1.44, + "learning_rate": 4.048044485789869e-05, + "loss": 2.7186, + "step": 2865 + }, + { + "epoch": 1.44, + "learning_rate": 4.044946629233316e-05, + "loss": 2.4667, + "step": 2870 + }, + { + "epoch": 1.44, + "learning_rate": 4.041844929994566e-05, + "loss": 2.562, + "step": 2875 + }, + { + "epoch": 1.45, + "learning_rate": 4.038739395788347e-05, + "loss": 2.4116, + "step": 2880 + }, + { + "epoch": 1.45, + "learning_rate": 4.0356300343389276e-05, + "loss": 2.6133, + "step": 2885 + }, + { + "epoch": 1.45, + "learning_rate": 4.032516853380094e-05, + "loss": 2.6893, + "step": 2890 + }, + { + "epoch": 1.45, + "learning_rate": 4.029399860655132e-05, + "loss": 2.8196, + "step": 2895 + }, + { + "epoch": 1.46, + "learning_rate": 4.026279063916811e-05, + "loss": 2.8933, + "step": 2900 + }, + { + "epoch": 1.46, + "learning_rate": 4.023154470927361e-05, + "loss": 3.0289, + "step": 2905 + }, + { + "epoch": 1.46, + "learning_rate": 4.0200260894584516e-05, + "loss": 2.5586, + "step": 2910 + }, + { + "epoch": 1.46, + "learning_rate": 4.016893927291179e-05, + "loss": 2.785, + "step": 2915 + }, + { + "epoch": 1.47, + "learning_rate": 4.0137579922160395e-05, + "loss": 2.5293, + "step": 2920 + }, + { + "epoch": 1.47, + "learning_rate": 4.010618292032917e-05, + "loss": 2.6696, + "step": 2925 + }, + { + "epoch": 1.47, + "learning_rate": 4.007474834551058e-05, + "loss": 2.7003, + "step": 2930 + }, + { + "epoch": 1.47, + "learning_rate": 4.004327627589056e-05, + "loss": 2.6005, + "step": 2935 + }, + { + "epoch": 1.48, + "learning_rate": 4.001176678974828e-05, + "loss": 
2.7198, + "step": 2940 + }, + { + "epoch": 1.48, + "learning_rate": 3.998021996545599e-05, + "loss": 2.5329, + "step": 2945 + }, + { + "epoch": 1.48, + "learning_rate": 3.994863588147881e-05, + "loss": 2.7051, + "step": 2950 + }, + { + "epoch": 1.48, + "learning_rate": 3.9917014616374535e-05, + "loss": 2.9277, + "step": 2955 + }, + { + "epoch": 1.49, + "learning_rate": 3.988535624879344e-05, + "loss": 2.3729, + "step": 2960 + }, + { + "epoch": 1.49, + "learning_rate": 3.985366085747808e-05, + "loss": 2.5122, + "step": 2965 + }, + { + "epoch": 1.49, + "learning_rate": 3.982192852126309e-05, + "loss": 2.4413, + "step": 2970 + }, + { + "epoch": 1.49, + "learning_rate": 3.979015931907501e-05, + "loss": 2.6104, + "step": 2975 + }, + { + "epoch": 1.5, + "learning_rate": 3.975835332993207e-05, + "loss": 2.8855, + "step": 2980 + }, + { + "epoch": 1.5, + "learning_rate": 3.9726510632944e-05, + "loss": 2.7487, + "step": 2985 + }, + { + "epoch": 1.5, + "learning_rate": 3.969463130731183e-05, + "loss": 2.7085, + "step": 2990 + }, + { + "epoch": 1.5, + "learning_rate": 3.966271543232769e-05, + "loss": 2.6464, + "step": 2995 + }, + { + "epoch": 1.51, + "learning_rate": 3.9630763087374625e-05, + "loss": 2.4105, + "step": 3000 + }, + { + "epoch": 1.51, + "learning_rate": 3.9598774351926394e-05, + "loss": 2.7296, + "step": 3005 + }, + { + "epoch": 1.51, + "learning_rate": 3.956674930554725e-05, + "loss": 2.5319, + "step": 3010 + }, + { + "epoch": 1.51, + "learning_rate": 3.9534688027891785e-05, + "loss": 2.5544, + "step": 3015 + }, + { + "epoch": 1.52, + "learning_rate": 3.9502590598704696e-05, + "loss": 2.7981, + "step": 3020 + }, + { + "epoch": 1.52, + "learning_rate": 3.94704570978206e-05, + "loss": 2.6186, + "step": 3025 + }, + { + "epoch": 1.52, + "learning_rate": 3.943828760516382e-05, + "loss": 2.6443, + "step": 3030 + }, + { + "epoch": 1.52, + "learning_rate": 3.940608220074822e-05, + "loss": 2.4564, + "step": 3035 + }, + { + "epoch": 1.53, + "learning_rate": 
3.9373840964676976e-05, + "loss": 2.8394, + "step": 3040 + }, + { + "epoch": 1.53, + "learning_rate": 3.934156397714238e-05, + "loss": 2.3683, + "step": 3045 + }, + { + "epoch": 1.53, + "learning_rate": 3.930925131842567e-05, + "loss": 2.875, + "step": 3050 + }, + { + "epoch": 1.53, + "learning_rate": 3.9276903068896784e-05, + "loss": 2.6381, + "step": 3055 + }, + { + "epoch": 1.54, + "learning_rate": 3.9244519309014206e-05, + "loss": 2.8077, + "step": 3060 + }, + { + "epoch": 1.54, + "learning_rate": 3.9212100119324706e-05, + "loss": 2.6208, + "step": 3065 + }, + { + "epoch": 1.54, + "learning_rate": 3.917964558046322e-05, + "loss": 2.9, + "step": 3070 + }, + { + "epoch": 1.54, + "learning_rate": 3.9147155773152586e-05, + "loss": 2.5678, + "step": 3075 + }, + { + "epoch": 1.55, + "learning_rate": 3.911463077820336e-05, + "loss": 2.7059, + "step": 3080 + }, + { + "epoch": 1.55, + "learning_rate": 3.90820706765136e-05, + "loss": 2.7433, + "step": 3085 + }, + { + "epoch": 1.55, + "learning_rate": 3.9049475549068757e-05, + "loss": 2.5916, + "step": 3090 + }, + { + "epoch": 1.55, + "learning_rate": 3.901684547694132e-05, + "loss": 2.6978, + "step": 3095 + }, + { + "epoch": 1.56, + "learning_rate": 3.8984180541290724e-05, + "loss": 2.8709, + "step": 3100 + }, + { + "epoch": 1.56, + "learning_rate": 3.895148082336313e-05, + "loss": 2.8652, + "step": 3105 + }, + { + "epoch": 1.56, + "learning_rate": 3.89187464044912e-05, + "loss": 3.0331, + "step": 3110 + }, + { + "epoch": 1.56, + "learning_rate": 3.8885977366093905e-05, + "loss": 2.7334, + "step": 3115 + }, + { + "epoch": 1.57, + "learning_rate": 3.885317378967633e-05, + "loss": 2.6026, + "step": 3120 + }, + { + "epoch": 1.57, + "learning_rate": 3.882033575682945e-05, + "loss": 2.831, + "step": 3125 + }, + { + "epoch": 1.57, + "learning_rate": 3.878746334922996e-05, + "loss": 2.5585, + "step": 3130 + }, + { + "epoch": 1.57, + "learning_rate": 3.875455664864005e-05, + "loss": 2.762, + "step": 3135 + }, + { + "epoch": 
1.58, + "learning_rate": 3.8721615736907205e-05, + "loss": 2.769, + "step": 3140 + }, + { + "epoch": 1.58, + "learning_rate": 3.868864069596399e-05, + "loss": 2.6496, + "step": 3145 + }, + { + "epoch": 1.58, + "learning_rate": 3.8655631607827876e-05, + "loss": 2.486, + "step": 3150 + }, + { + "epoch": 1.58, + "learning_rate": 3.8622588554601e-05, + "loss": 2.8387, + "step": 3155 + }, + { + "epoch": 1.59, + "learning_rate": 3.858951161847001e-05, + "loss": 2.816, + "step": 3160 + }, + { + "epoch": 1.59, + "learning_rate": 3.855640088170579e-05, + "loss": 2.8128, + "step": 3165 + }, + { + "epoch": 1.59, + "learning_rate": 3.8523256426663313e-05, + "loss": 2.5875, + "step": 3170 + }, + { + "epoch": 1.59, + "learning_rate": 3.8490078335781423e-05, + "loss": 2.5853, + "step": 3175 + }, + { + "epoch": 1.6, + "learning_rate": 3.845686669158263e-05, + "loss": 2.7638, + "step": 3180 + }, + { + "epoch": 1.6, + "learning_rate": 3.842362157667287e-05, + "loss": 2.8281, + "step": 3185 + }, + { + "epoch": 1.6, + "learning_rate": 3.839700144139754e-05, + "loss": 2.3574, + "step": 3190 + }, + { + "epoch": 1.6, + "learning_rate": 3.836369628764067e-05, + "loss": 2.5533, + "step": 3195 + }, + { + "epoch": 1.61, + "learning_rate": 3.833035789491177e-05, + "loss": 2.4513, + "step": 3200 + }, + { + "epoch": 1.61, + "learning_rate": 3.8296986346132036e-05, + "loss": 2.567, + "step": 3205 + }, + { + "epoch": 1.61, + "learning_rate": 3.826358172430516e-05, + "loss": 2.6501, + "step": 3210 + }, + { + "epoch": 1.61, + "learning_rate": 3.823014411251708e-05, + "loss": 2.6927, + "step": 3215 + }, + { + "epoch": 1.62, + "learning_rate": 3.819667359393578e-05, + "loss": 2.6094, + "step": 3220 + }, + { + "epoch": 1.62, + "learning_rate": 3.816317025181111e-05, + "loss": 2.6776, + "step": 3225 + }, + { + "epoch": 1.62, + "learning_rate": 3.8129634169474563e-05, + "loss": 2.5833, + "step": 3230 + }, + { + "epoch": 1.62, + "learning_rate": 3.809606543033905e-05, + "loss": 2.6483, + "step": 3235 + 
}, + { + "epoch": 1.63, + "learning_rate": 3.8062464117898724e-05, + "loss": 2.4814, + "step": 3240 + }, + { + "epoch": 1.63, + "learning_rate": 3.802883031572874e-05, + "loss": 2.5217, + "step": 3245 + }, + { + "epoch": 1.63, + "learning_rate": 3.799516410748506e-05, + "loss": 2.5127, + "step": 3250 + }, + { + "epoch": 1.63, + "learning_rate": 3.796146557690428e-05, + "loss": 2.9325, + "step": 3255 + }, + { + "epoch": 1.64, + "learning_rate": 3.792773480780335e-05, + "loss": 2.4567, + "step": 3260 + }, + { + "epoch": 1.64, + "learning_rate": 3.789397188407944e-05, + "loss": 2.6045, + "step": 3265 + }, + { + "epoch": 1.64, + "learning_rate": 3.786017688970967e-05, + "loss": 2.5031, + "step": 3270 + }, + { + "epoch": 1.64, + "learning_rate": 3.782634990875094e-05, + "loss": 2.7692, + "step": 3275 + }, + { + "epoch": 1.65, + "learning_rate": 3.779249102533972e-05, + "loss": 2.6893, + "step": 3280 + }, + { + "epoch": 1.65, + "learning_rate": 3.7758600323691806e-05, + "loss": 2.604, + "step": 3285 + }, + { + "epoch": 1.65, + "learning_rate": 3.7724677888102145e-05, + "loss": 2.6633, + "step": 3290 + }, + { + "epoch": 1.65, + "learning_rate": 3.769072380294463e-05, + "loss": 2.4804, + "step": 3295 + }, + { + "epoch": 1.66, + "learning_rate": 3.765673815267184e-05, + "loss": 2.5951, + "step": 3300 + }, + { + "epoch": 1.66, + "learning_rate": 3.76227210218149e-05, + "loss": 2.598, + "step": 3305 + }, + { + "epoch": 1.66, + "learning_rate": 3.758867249498321e-05, + "loss": 2.8652, + "step": 3310 + }, + { + "epoch": 1.66, + "learning_rate": 3.7554592656864285e-05, + "loss": 2.5312, + "step": 3315 + }, + { + "epoch": 1.67, + "learning_rate": 3.752048159222349e-05, + "loss": 2.7432, + "step": 3320 + }, + { + "epoch": 1.67, + "learning_rate": 3.748633938590388e-05, + "loss": 2.64, + "step": 3325 + }, + { + "epoch": 1.67, + "learning_rate": 3.745216612282596e-05, + "loss": 2.751, + "step": 3330 + }, + { + "epoch": 1.67, + "learning_rate": 3.741796188798747e-05, + "loss": 
2.7636, + "step": 3335 + }, + { + "epoch": 1.68, + "learning_rate": 3.738372676646321e-05, + "loss": 2.8103, + "step": 3340 + }, + { + "epoch": 1.68, + "learning_rate": 3.7349460843404796e-05, + "loss": 2.6672, + "step": 3345 + }, + { + "epoch": 1.68, + "learning_rate": 3.731516420404043e-05, + "loss": 2.5272, + "step": 3350 + }, + { + "epoch": 1.68, + "learning_rate": 3.728083693367474e-05, + "loss": 2.6674, + "step": 3355 + }, + { + "epoch": 1.69, + "learning_rate": 3.724647911768854e-05, + "loss": 2.708, + "step": 3360 + }, + { + "epoch": 1.69, + "learning_rate": 3.72120908415386e-05, + "loss": 2.3902, + "step": 3365 + }, + { + "epoch": 1.69, + "learning_rate": 3.7177672190757476e-05, + "loss": 2.8144, + "step": 3370 + }, + { + "epoch": 1.69, + "learning_rate": 3.714322325095325e-05, + "loss": 2.8525, + "step": 3375 + }, + { + "epoch": 1.7, + "learning_rate": 3.7108744107809364e-05, + "loss": 2.4671, + "step": 3380 + }, + { + "epoch": 1.7, + "learning_rate": 3.7074234847084363e-05, + "loss": 2.3723, + "step": 3385 + }, + { + "epoch": 1.7, + "learning_rate": 3.703969555461173e-05, + "loss": 2.8825, + "step": 3390 + }, + { + "epoch": 1.7, + "learning_rate": 3.700512631629961e-05, + "loss": 2.768, + "step": 3395 + }, + { + "epoch": 1.71, + "learning_rate": 3.6970527218130644e-05, + "loss": 2.4601, + "step": 3400 + }, + { + "epoch": 1.71, + "learning_rate": 3.693589834616176e-05, + "loss": 2.7482, + "step": 3405 + }, + { + "epoch": 1.71, + "learning_rate": 3.6901239786523914e-05, + "loss": 2.4536, + "step": 3410 + }, + { + "epoch": 1.71, + "learning_rate": 3.686655162542192e-05, + "loss": 2.571, + "step": 3415 + }, + { + "epoch": 1.72, + "learning_rate": 3.683183394913422e-05, + "loss": 2.5796, + "step": 3420 + }, + { + "epoch": 1.72, + "learning_rate": 3.6797086844012654e-05, + "loss": 2.7312, + "step": 3425 + }, + { + "epoch": 1.72, + "learning_rate": 3.676231039648227e-05, + "loss": 2.5584, + "step": 3430 + }, + { + "epoch": 1.72, + "learning_rate": 
3.67275046930411e-05, + "loss": 2.6298, + "step": 3435 + }, + { + "epoch": 1.73, + "learning_rate": 3.669266982025993e-05, + "loss": 2.628, + "step": 3440 + }, + { + "epoch": 1.73, + "learning_rate": 3.6657805864782116e-05, + "loss": 2.5943, + "step": 3445 + }, + { + "epoch": 1.73, + "learning_rate": 3.662291291332333e-05, + "loss": 2.6121, + "step": 3450 + }, + { + "epoch": 1.73, + "learning_rate": 3.658799105267138e-05, + "loss": 2.6501, + "step": 3455 + }, + { + "epoch": 1.74, + "learning_rate": 3.655304036968597e-05, + "loss": 2.5708, + "step": 3460 + }, + { + "epoch": 1.74, + "learning_rate": 3.651806095129849e-05, + "loss": 2.8039, + "step": 3465 + }, + { + "epoch": 1.74, + "learning_rate": 3.648305288451183e-05, + "loss": 2.8159, + "step": 3470 + }, + { + "epoch": 1.74, + "learning_rate": 3.64480162564001e-05, + "loss": 2.6506, + "step": 3475 + }, + { + "epoch": 1.75, + "learning_rate": 3.641295115410847e-05, + "loss": 2.8982, + "step": 3480 + }, + { + "epoch": 1.75, + "learning_rate": 3.637785766485291e-05, + "loss": 2.8018, + "step": 3485 + }, + { + "epoch": 1.75, + "learning_rate": 3.634273587592003e-05, + "loss": 2.5278, + "step": 3490 + }, + { + "epoch": 1.75, + "learning_rate": 3.630758587466681e-05, + "loss": 2.8937, + "step": 3495 + }, + { + "epoch": 1.76, + "learning_rate": 3.6272407748520394e-05, + "loss": 2.1922, + "step": 3500 + }, + { + "epoch": 1.76, + "learning_rate": 3.623720158497789e-05, + "loss": 2.4288, + "step": 3505 + }, + { + "epoch": 1.76, + "learning_rate": 3.620196747160613e-05, + "loss": 2.9423, + "step": 3510 + }, + { + "epoch": 1.76, + "learning_rate": 3.61667054960415e-05, + "loss": 2.5559, + "step": 3515 + }, + { + "epoch": 1.77, + "learning_rate": 3.613141574598965e-05, + "loss": 2.7228, + "step": 3520 + }, + { + "epoch": 1.77, + "learning_rate": 3.6096098309225325e-05, + "loss": 2.6933, + "step": 3525 + }, + { + "epoch": 1.77, + "learning_rate": 3.606075327359212e-05, + "loss": 2.5865, + "step": 3530 + }, + { + "epoch": 1.77, 
+ "learning_rate": 3.602538072700229e-05, + "loss": 2.5846, + "step": 3535 + }, + { + "epoch": 1.78, + "learning_rate": 3.598998075743653e-05, + "loss": 2.8847, + "step": 3540 + }, + { + "epoch": 1.78, + "learning_rate": 3.595455345294372e-05, + "loss": 2.5386, + "step": 3545 + }, + { + "epoch": 1.78, + "learning_rate": 3.5919098901640735e-05, + "loss": 2.5932, + "step": 3550 + }, + { + "epoch": 1.78, + "learning_rate": 3.588361719171223e-05, + "loss": 2.4481, + "step": 3555 + }, + { + "epoch": 1.79, + "learning_rate": 3.584810841141039e-05, + "loss": 2.5773, + "step": 3560 + }, + { + "epoch": 1.79, + "learning_rate": 3.581257264905476e-05, + "loss": 2.6305, + "step": 3565 + }, + { + "epoch": 1.79, + "learning_rate": 3.5777009993031955e-05, + "loss": 2.7, + "step": 3570 + }, + { + "epoch": 1.79, + "learning_rate": 3.574142053179553e-05, + "loss": 2.4408, + "step": 3575 + }, + { + "epoch": 1.8, + "learning_rate": 3.5705804353865677e-05, + "loss": 2.7292, + "step": 3580 + }, + { + "epoch": 1.8, + "learning_rate": 3.567016154782906e-05, + "loss": 2.4371, + "step": 3585 + }, + { + "epoch": 1.8, + "learning_rate": 3.563449220233855e-05, + "loss": 2.7977, + "step": 3590 + }, + { + "epoch": 1.8, + "learning_rate": 3.559879640611305e-05, + "loss": 2.7357, + "step": 3595 + }, + { + "epoch": 1.81, + "learning_rate": 3.5563074247937244e-05, + "loss": 2.6255, + "step": 3600 + }, + { + "epoch": 1.81, + "learning_rate": 3.552732581666139e-05, + "loss": 2.8134, + "step": 3605 + }, + { + "epoch": 1.81, + "learning_rate": 3.549155120120109e-05, + "loss": 2.702, + "step": 3610 + }, + { + "epoch": 1.81, + "learning_rate": 3.545575049053707e-05, + "loss": 2.4854, + "step": 3615 + }, + { + "epoch": 1.82, + "learning_rate": 3.541992377371497e-05, + "loss": 2.7596, + "step": 3620 + }, + { + "epoch": 1.82, + "learning_rate": 3.53840711398451e-05, + "loss": 2.6891, + "step": 3625 + }, + { + "epoch": 1.82, + "learning_rate": 3.5348192678102254e-05, + "loss": 2.5619, + "step": 3630 + }, + { 
+ "epoch": 1.82, + "learning_rate": 3.531228847772544e-05, + "loss": 2.5719, + "step": 3635 + }, + { + "epoch": 1.83, + "learning_rate": 3.52763586280177e-05, + "loss": 2.6297, + "step": 3640 + }, + { + "epoch": 1.83, + "learning_rate": 3.524040321834589e-05, + "loss": 2.5114, + "step": 3645 + }, + { + "epoch": 1.83, + "learning_rate": 3.52044223381404e-05, + "loss": 2.8536, + "step": 3650 + }, + { + "epoch": 1.83, + "learning_rate": 3.516841607689501e-05, + "loss": 2.5271, + "step": 3655 + }, + { + "epoch": 1.84, + "learning_rate": 3.51323845241666e-05, + "loss": 2.5654, + "step": 3660 + }, + { + "epoch": 1.84, + "learning_rate": 3.509632776957497e-05, + "loss": 2.6269, + "step": 3665 + }, + { + "epoch": 1.84, + "learning_rate": 3.50602459028026e-05, + "loss": 2.4401, + "step": 3670 + }, + { + "epoch": 1.84, + "learning_rate": 3.502413901359445e-05, + "loss": 2.4368, + "step": 3675 + }, + { + "epoch": 1.85, + "learning_rate": 3.498800719175768e-05, + "loss": 2.6087, + "step": 3680 + }, + { + "epoch": 1.85, + "learning_rate": 3.4951850527161495e-05, + "loss": 2.5927, + "step": 3685 + }, + { + "epoch": 1.85, + "learning_rate": 3.4915669109736876e-05, + "loss": 2.8352, + "step": 3690 + }, + { + "epoch": 1.85, + "learning_rate": 3.487946302947637e-05, + "loss": 2.8485, + "step": 3695 + }, + { + "epoch": 1.86, + "learning_rate": 3.484323237643389e-05, + "loss": 2.6611, + "step": 3700 + }, + { + "epoch": 1.86, + "learning_rate": 3.4806977240724423e-05, + "loss": 2.5251, + "step": 3705 + }, + { + "epoch": 1.86, + "learning_rate": 3.47706977125239e-05, + "loss": 2.6043, + "step": 3710 + }, + { + "epoch": 1.86, + "learning_rate": 3.473439388206887e-05, + "loss": 2.5911, + "step": 3715 + }, + { + "epoch": 1.87, + "learning_rate": 3.469806583965639e-05, + "loss": 2.7092, + "step": 3720 + }, + { + "epoch": 1.87, + "learning_rate": 3.466171367564368e-05, + "loss": 2.4572, + "step": 3725 + }, + { + "epoch": 1.87, + "learning_rate": 3.462533748044801e-05, + "loss": 2.464, + 
"step": 3730 + }, + { + "epoch": 1.87, + "learning_rate": 3.458893734454636e-05, + "loss": 2.8654, + "step": 3735 + }, + { + "epoch": 1.88, + "learning_rate": 3.455251335847531e-05, + "loss": 2.752, + "step": 3740 + }, + { + "epoch": 1.88, + "learning_rate": 3.451606561283074e-05, + "loss": 2.59, + "step": 3745 + }, + { + "epoch": 1.88, + "learning_rate": 3.447959419826763e-05, + "loss": 2.4732, + "step": 3750 + }, + { + "epoch": 1.88, + "learning_rate": 3.444309920549983e-05, + "loss": 2.7611, + "step": 3755 + }, + { + "epoch": 1.89, + "learning_rate": 3.440658072529983e-05, + "loss": 2.5923, + "step": 3760 + }, + { + "epoch": 1.89, + "learning_rate": 3.437003884849854e-05, + "loss": 2.5405, + "step": 3765 + }, + { + "epoch": 1.89, + "learning_rate": 3.433347366598506e-05, + "loss": 2.6368, + "step": 3770 + }, + { + "epoch": 1.89, + "learning_rate": 3.429688526870649e-05, + "loss": 2.5877, + "step": 3775 + }, + { + "epoch": 1.9, + "learning_rate": 3.4260273747667635e-05, + "loss": 2.5978, + "step": 3780 + }, + { + "epoch": 1.9, + "learning_rate": 3.422363919393082e-05, + "loss": 2.7375, + "step": 3785 + }, + { + "epoch": 1.9, + "learning_rate": 3.418698169861567e-05, + "loss": 2.4161, + "step": 3790 + }, + { + "epoch": 1.9, + "learning_rate": 3.415030135289884e-05, + "loss": 2.3959, + "step": 3795 + }, + { + "epoch": 1.91, + "learning_rate": 3.4113598248013875e-05, + "loss": 2.9642, + "step": 3800 + }, + { + "epoch": 1.91, + "learning_rate": 3.40768724752509e-05, + "loss": 2.7313, + "step": 3805 + }, + { + "epoch": 1.91, + "learning_rate": 3.404012412595639e-05, + "loss": 2.6311, + "step": 3810 + }, + { + "epoch": 1.91, + "learning_rate": 3.400335329153304e-05, + "loss": 2.6268, + "step": 3815 + }, + { + "epoch": 1.92, + "learning_rate": 3.396656006343939e-05, + "loss": 2.5252, + "step": 3820 + }, + { + "epoch": 1.92, + "learning_rate": 3.392974453318975e-05, + "loss": 2.5863, + "step": 3825 + }, + { + "epoch": 1.92, + "learning_rate": 3.3892906792353886e-05, + 
"loss": 2.4848, + "step": 3830 + }, + { + "epoch": 1.92, + "learning_rate": 3.3856046932556775e-05, + "loss": 2.5517, + "step": 3835 + }, + { + "epoch": 1.93, + "learning_rate": 3.3819165045478426e-05, + "loss": 2.6679, + "step": 3840 + }, + { + "epoch": 1.93, + "learning_rate": 3.3782261222853654e-05, + "loss": 2.4474, + "step": 3845 + }, + { + "epoch": 1.93, + "learning_rate": 3.37453355564718e-05, + "loss": 2.5378, + "step": 3850 + }, + { + "epoch": 1.93, + "learning_rate": 3.3708388138176584e-05, + "loss": 2.8168, + "step": 3855 + }, + { + "epoch": 1.94, + "learning_rate": 3.367141905986578e-05, + "loss": 2.6556, + "step": 3860 + }, + { + "epoch": 1.94, + "learning_rate": 3.3634428413491054e-05, + "loss": 2.5122, + "step": 3865 + }, + { + "epoch": 1.94, + "learning_rate": 3.359741629105773e-05, + "loss": 2.7044, + "step": 3870 + }, + { + "epoch": 1.94, + "learning_rate": 3.3560382784624524e-05, + "loss": 2.6193, + "step": 3875 + }, + { + "epoch": 1.95, + "learning_rate": 3.352332798630336e-05, + "loss": 3.0051, + "step": 3880 + }, + { + "epoch": 1.95, + "learning_rate": 3.3486251988259134e-05, + "loss": 2.5351, + "step": 3885 + }, + { + "epoch": 1.95, + "learning_rate": 3.344915488270942e-05, + "loss": 2.7273, + "step": 3890 + }, + { + "epoch": 1.95, + "learning_rate": 3.341203676192433e-05, + "loss": 2.3833, + "step": 3895 + }, + { + "epoch": 1.96, + "learning_rate": 3.3374897718226236e-05, + "loss": 2.5073, + "step": 3900 + }, + { + "epoch": 1.96, + "learning_rate": 3.333773784398957e-05, + "loss": 2.7216, + "step": 3905 + }, + { + "epoch": 1.96, + "learning_rate": 3.330055723164055e-05, + "loss": 2.6666, + "step": 3910 + }, + { + "epoch": 1.96, + "learning_rate": 3.326335597365698e-05, + "loss": 2.4959, + "step": 3915 + }, + { + "epoch": 1.97, + "learning_rate": 3.322613416256802e-05, + "loss": 2.8799, + "step": 3920 + }, + { + "epoch": 1.97, + "learning_rate": 3.3188891890953956e-05, + "loss": 2.6041, + "step": 3925 + }, + { + "epoch": 1.97, + 
"learning_rate": 3.315162925144595e-05, + "loss": 2.7423, + "step": 3930 + }, + { + "epoch": 1.97, + "learning_rate": 3.3114346336725834e-05, + "loss": 2.5765, + "step": 3935 + }, + { + "epoch": 1.98, + "learning_rate": 3.3077043239525874e-05, + "loss": 2.152, + "step": 3940 + }, + { + "epoch": 1.98, + "learning_rate": 3.303972005262852e-05, + "loss": 2.6474, + "step": 3945 + }, + { + "epoch": 1.98, + "learning_rate": 3.3002376868866206e-05, + "loss": 2.473, + "step": 3950 + }, + { + "epoch": 1.98, + "learning_rate": 3.296501378112109e-05, + "loss": 2.6779, + "step": 3955 + }, + { + "epoch": 1.99, + "learning_rate": 3.292763088232485e-05, + "loss": 2.7439, + "step": 3960 + }, + { + "epoch": 1.99, + "learning_rate": 3.289022826545844e-05, + "loss": 2.8874, + "step": 3965 + }, + { + "epoch": 1.99, + "learning_rate": 3.2852806023551834e-05, + "loss": 2.7191, + "step": 3970 + }, + { + "epoch": 1.99, + "learning_rate": 3.281536424968383e-05, + "loss": 2.563, + "step": 3975 + }, + { + "epoch": 2.0, + "learning_rate": 3.2777903036981836e-05, + "loss": 2.7884, + "step": 3980 + }, + { + "epoch": 2.0, + "learning_rate": 3.274042247862158e-05, + "loss": 2.5221, + "step": 3985 + }, + { + "epoch": 2.0, + "learning_rate": 3.270292266782689e-05, + "loss": 2.462, + "step": 3990 + }, + { + "epoch": 2.0, + "learning_rate": 3.266540369786953e-05, + "loss": 2.5059, + "step": 3995 + }, + { + "epoch": 2.01, + "learning_rate": 3.262786566206888e-05, + "loss": 2.688, + "step": 4000 + }, + { + "epoch": 2.01, + "learning_rate": 3.259030865379174e-05, + "loss": 2.7194, + "step": 4005 + }, + { + "epoch": 2.01, + "learning_rate": 3.255273276645214e-05, + "loss": 2.4145, + "step": 4010 + }, + { + "epoch": 2.01, + "learning_rate": 3.251513809351101e-05, + "loss": 2.5469, + "step": 4015 + }, + { + "epoch": 2.02, + "learning_rate": 3.247752472847605e-05, + "loss": 2.7916, + "step": 4020 + }, + { + "epoch": 2.02, + "learning_rate": 3.243989276490143e-05, + "loss": 2.5973, + "step": 4025 + }, + { + 
"epoch": 2.02, + "learning_rate": 3.240224229638758e-05, + "loss": 2.6732, + "step": 4030 + }, + { + "epoch": 2.02, + "learning_rate": 3.236457341658097e-05, + "loss": 2.2159, + "step": 4035 + }, + { + "epoch": 2.03, + "learning_rate": 3.232688621917386e-05, + "loss": 2.6783, + "step": 4040 + }, + { + "epoch": 2.03, + "learning_rate": 3.2289180797904055e-05, + "loss": 2.8193, + "step": 4045 + }, + { + "epoch": 2.03, + "learning_rate": 3.22514572465547e-05, + "loss": 2.3885, + "step": 4050 + }, + { + "epoch": 2.03, + "learning_rate": 3.221371565895404e-05, + "loss": 2.6288, + "step": 4055 + }, + { + "epoch": 2.04, + "learning_rate": 3.217595612897516e-05, + "loss": 2.5372, + "step": 4060 + }, + { + "epoch": 2.04, + "learning_rate": 3.21381787505358e-05, + "loss": 2.7257, + "step": 4065 + }, + { + "epoch": 2.04, + "learning_rate": 3.210038361759807e-05, + "loss": 2.6329, + "step": 4070 + }, + { + "epoch": 2.04, + "learning_rate": 3.206257082416825e-05, + "loss": 2.6518, + "step": 4075 + }, + { + "epoch": 2.05, + "learning_rate": 3.2024740464296544e-05, + "loss": 2.4218, + "step": 4080 + }, + { + "epoch": 2.05, + "learning_rate": 3.198689263207686e-05, + "loss": 2.7266, + "step": 4085 + }, + { + "epoch": 2.05, + "learning_rate": 3.1949027421646546e-05, + "loss": 2.4466, + "step": 4090 + }, + { + "epoch": 2.05, + "learning_rate": 3.1911144927186185e-05, + "loss": 2.6014, + "step": 4095 + }, + { + "epoch": 2.06, + "learning_rate": 3.1873245242919354e-05, + "loss": 2.7072, + "step": 4100 + }, + { + "epoch": 2.06, + "learning_rate": 3.183532846311236e-05, + "loss": 2.5211, + "step": 4105 + }, + { + "epoch": 2.06, + "learning_rate": 3.179739468207406e-05, + "loss": 2.5787, + "step": 4110 + }, + { + "epoch": 2.06, + "learning_rate": 3.175944399415559e-05, + "loss": 2.5141, + "step": 4115 + }, + { + "epoch": 2.07, + "learning_rate": 3.1721476493750134e-05, + "loss": 2.6807, + "step": 4120 + }, + { + "epoch": 2.07, + "learning_rate": 3.1683492275292695e-05, + "loss": 2.5611, 
+ "step": 4125 + }, + { + "epoch": 2.07, + "learning_rate": 3.164549143325985e-05, + "loss": 2.5864, + "step": 4130 + }, + { + "epoch": 2.08, + "learning_rate": 3.160747406216953e-05, + "loss": 2.7378, + "step": 4135 + }, + { + "epoch": 2.08, + "learning_rate": 3.156944025658079e-05, + "loss": 2.7334, + "step": 4140 + }, + { + "epoch": 2.08, + "learning_rate": 3.153139011109354e-05, + "loss": 2.4465, + "step": 4145 + }, + { + "epoch": 2.08, + "learning_rate": 3.149332372034834e-05, + "loss": 2.5482, + "step": 4150 + }, + { + "epoch": 2.09, + "learning_rate": 3.145524117902617e-05, + "loss": 2.7633, + "step": 4155 + }, + { + "epoch": 2.09, + "learning_rate": 3.141714258184816e-05, + "loss": 2.6796, + "step": 4160 + }, + { + "epoch": 2.09, + "learning_rate": 3.137902802357538e-05, + "loss": 2.6622, + "step": 4165 + }, + { + "epoch": 2.09, + "learning_rate": 3.134089759900861e-05, + "loss": 2.9277, + "step": 4170 + }, + { + "epoch": 2.1, + "learning_rate": 3.130275140298808e-05, + "loss": 2.351, + "step": 4175 + }, + { + "epoch": 2.1, + "learning_rate": 3.1264589530393263e-05, + "loss": 2.654, + "step": 4180 + }, + { + "epoch": 2.1, + "learning_rate": 3.12264120761426e-05, + "loss": 2.6229, + "step": 4185 + }, + { + "epoch": 2.1, + "learning_rate": 3.118821913519333e-05, + "loss": 2.6415, + "step": 4190 + }, + { + "epoch": 2.11, + "learning_rate": 3.115001080254115e-05, + "loss": 2.6029, + "step": 4195 + }, + { + "epoch": 2.11, + "learning_rate": 3.1111787173220095e-05, + "loss": 2.6775, + "step": 4200 + }, + { + "epoch": 2.11, + "learning_rate": 3.107354834230223e-05, + "loss": 2.7393, + "step": 4205 + }, + { + "epoch": 2.11, + "learning_rate": 3.1035294404897396e-05, + "loss": 2.5886, + "step": 4210 + }, + { + "epoch": 2.12, + "learning_rate": 3.099702545615307e-05, + "loss": 2.3812, + "step": 4215 + }, + { + "epoch": 2.12, + "learning_rate": 3.0958741591254026e-05, + "loss": 3.1271, + "step": 4220 + }, + { + "epoch": 2.12, + "learning_rate": 3.0920442905422145e-05, 
+ "loss": 2.7758, + "step": 4225 + }, + { + "epoch": 2.12, + "learning_rate": 3.0882129493916167e-05, + "loss": 2.7682, + "step": 4230 + }, + { + "epoch": 2.13, + "learning_rate": 3.0843801452031466e-05, + "loss": 2.2826, + "step": 4235 + }, + { + "epoch": 2.13, + "learning_rate": 3.0805458875099804e-05, + "loss": 2.5934, + "step": 4240 + }, + { + "epoch": 2.13, + "learning_rate": 3.0767101858489103e-05, + "loss": 2.7693, + "step": 4245 + }, + { + "epoch": 2.13, + "learning_rate": 3.072873049760319e-05, + "loss": 2.6955, + "step": 4250 + }, + { + "epoch": 2.14, + "learning_rate": 3.0690344887881565e-05, + "loss": 2.7344, + "step": 4255 + }, + { + "epoch": 2.14, + "learning_rate": 3.0651945124799185e-05, + "loss": 2.6215, + "step": 4260 + }, + { + "epoch": 2.14, + "learning_rate": 3.061353130386619e-05, + "loss": 2.5794, + "step": 4265 + }, + { + "epoch": 2.14, + "learning_rate": 3.05751035206277e-05, + "loss": 2.671, + "step": 4270 + }, + { + "epoch": 2.15, + "learning_rate": 3.0536661870663576e-05, + "loss": 2.7265, + "step": 4275 + }, + { + "epoch": 2.15, + "learning_rate": 3.0498206449588136e-05, + "loss": 2.9018, + "step": 4280 + }, + { + "epoch": 2.15, + "learning_rate": 3.045973735304996e-05, + "loss": 2.4823, + "step": 4285 + }, + { + "epoch": 2.15, + "learning_rate": 3.0421254676731664e-05, + "loss": 2.5454, + "step": 4290 + }, + { + "epoch": 2.16, + "learning_rate": 3.0382758516349623e-05, + "loss": 2.7888, + "step": 4295 + }, + { + "epoch": 2.16, + "learning_rate": 3.0344248967653754e-05, + "loss": 2.596, + "step": 4300 + }, + { + "epoch": 2.16, + "learning_rate": 3.030572612642727e-05, + "loss": 2.7905, + "step": 4305 + }, + { + "epoch": 2.16, + "learning_rate": 3.0267190088486452e-05, + "loss": 2.5676, + "step": 4310 + }, + { + "epoch": 2.17, + "learning_rate": 3.0228640949680388e-05, + "loss": 2.5836, + "step": 4315 + }, + { + "epoch": 2.17, + "learning_rate": 3.0190078805890786e-05, + "loss": 2.7138, + "step": 4320 + }, + { + "epoch": 2.17, + 
"learning_rate": 3.015150375303168e-05, + "loss": 2.4747, + "step": 4325 + }, + { + "epoch": 2.17, + "learning_rate": 3.011291588704919e-05, + "loss": 2.5188, + "step": 4330 + }, + { + "epoch": 2.18, + "learning_rate": 3.0074315303921353e-05, + "loss": 2.2952, + "step": 4335 + }, + { + "epoch": 2.18, + "learning_rate": 3.0035702099657787e-05, + "loss": 2.4875, + "step": 4340 + }, + { + "epoch": 2.18, + "learning_rate": 2.9997076370299544e-05, + "loss": 2.5085, + "step": 4345 + }, + { + "epoch": 2.18, + "learning_rate": 2.9958438211918805e-05, + "loss": 2.7452, + "step": 4350 + }, + { + "epoch": 2.19, + "learning_rate": 2.9919787720618676e-05, + "loss": 2.334, + "step": 4355 + }, + { + "epoch": 2.19, + "learning_rate": 2.9881124992532933e-05, + "loss": 2.6904, + "step": 4360 + }, + { + "epoch": 2.19, + "learning_rate": 2.984245012382579e-05, + "loss": 2.5706, + "step": 4365 + }, + { + "epoch": 2.19, + "learning_rate": 2.980376321069165e-05, + "loss": 2.6657, + "step": 4370 + }, + { + "epoch": 2.2, + "learning_rate": 2.976506434935489e-05, + "loss": 2.4117, + "step": 4375 + }, + { + "epoch": 2.2, + "learning_rate": 2.9726353636069582e-05, + "loss": 2.4446, + "step": 4380 + }, + { + "epoch": 2.2, + "learning_rate": 2.968763116711931e-05, + "loss": 2.5681, + "step": 4385 + }, + { + "epoch": 2.2, + "learning_rate": 2.964889703881686e-05, + "loss": 2.7244, + "step": 4390 + }, + { + "epoch": 2.21, + "learning_rate": 2.9610151347504044e-05, + "loss": 2.577, + "step": 4395 + }, + { + "epoch": 2.21, + "learning_rate": 2.957139418955143e-05, + "loss": 2.6176, + "step": 4400 + }, + { + "epoch": 2.21, + "learning_rate": 2.9532625661358105e-05, + "loss": 2.5708, + "step": 4405 + }, + { + "epoch": 2.21, + "learning_rate": 2.949384585935143e-05, + "loss": 2.4873, + "step": 4410 + }, + { + "epoch": 2.22, + "learning_rate": 2.9455054879986797e-05, + "loss": 2.5953, + "step": 4415 + }, + { + "epoch": 2.22, + "learning_rate": 2.941625281974743e-05, + "loss": 2.5578, + "step": 4420 + 
}, + { + "epoch": 2.22, + "learning_rate": 2.93774397751441e-05, + "loss": 2.5782, + "step": 4425 + }, + { + "epoch": 2.22, + "learning_rate": 2.9338615842714883e-05, + "loss": 2.8227, + "step": 4430 + }, + { + "epoch": 2.23, + "learning_rate": 2.9299781119024956e-05, + "loss": 2.4317, + "step": 4435 + }, + { + "epoch": 2.23, + "learning_rate": 2.926093570066633e-05, + "loss": 2.7054, + "step": 4440 + }, + { + "epoch": 2.23, + "learning_rate": 2.9222079684257608e-05, + "loss": 2.7207, + "step": 4445 + }, + { + "epoch": 2.23, + "learning_rate": 2.9183213166443778e-05, + "loss": 2.6732, + "step": 4450 + }, + { + "epoch": 2.24, + "learning_rate": 2.9144336243895927e-05, + "loss": 2.4768, + "step": 4455 + }, + { + "epoch": 2.24, + "learning_rate": 2.910544901331101e-05, + "loss": 2.5146, + "step": 4460 + }, + { + "epoch": 2.24, + "learning_rate": 2.9066551571411645e-05, + "loss": 2.4941, + "step": 4465 + }, + { + "epoch": 2.24, + "learning_rate": 2.902764401494584e-05, + "loss": 2.1425, + "step": 4470 + }, + { + "epoch": 2.25, + "learning_rate": 2.898872644068676e-05, + "loss": 2.6335, + "step": 4475 + }, + { + "epoch": 2.25, + "learning_rate": 2.8949798945432483e-05, + "loss": 2.4093, + "step": 4480 + }, + { + "epoch": 2.25, + "learning_rate": 2.8910861626005776e-05, + "loss": 2.4006, + "step": 4485 + }, + { + "epoch": 2.25, + "learning_rate": 2.8871914579253824e-05, + "loss": 2.4762, + "step": 4490 + }, + { + "epoch": 2.26, + "learning_rate": 2.8832957902048014e-05, + "loss": 2.6309, + "step": 4495 + }, + { + "epoch": 2.26, + "learning_rate": 2.8793991691283684e-05, + "loss": 2.7734, + "step": 4500 + }, + { + "epoch": 2.26, + "learning_rate": 2.87550160438799e-05, + "loss": 2.5444, + "step": 4505 + }, + { + "epoch": 2.26, + "learning_rate": 2.871603105677917e-05, + "loss": 2.5277, + "step": 4510 + }, + { + "epoch": 2.27, + "learning_rate": 2.8677036826947264e-05, + "loss": 2.5775, + "step": 4515 + }, + { + "epoch": 2.27, + "learning_rate": 2.863803345137291e-05, + 
"loss": 2.4524, + "step": 4520 + }, + { + "epoch": 2.27, + "learning_rate": 2.8599021027067608e-05, + "loss": 2.572, + "step": 4525 + }, + { + "epoch": 2.27, + "learning_rate": 2.855999965106536e-05, + "loss": 2.8112, + "step": 4530 + }, + { + "epoch": 2.28, + "learning_rate": 2.8520969420422427e-05, + "loss": 2.65, + "step": 4535 + }, + { + "epoch": 2.28, + "learning_rate": 2.8481930432217096e-05, + "loss": 2.4675, + "step": 4540 + }, + { + "epoch": 2.28, + "learning_rate": 2.844288278354943e-05, + "loss": 2.668, + "step": 4545 + }, + { + "epoch": 2.28, + "learning_rate": 2.8403826571541046e-05, + "loss": 2.7135, + "step": 4550 + }, + { + "epoch": 2.29, + "learning_rate": 2.8364761893334858e-05, + "loss": 2.5212, + "step": 4555 + }, + { + "epoch": 2.29, + "learning_rate": 2.8325688846094817e-05, + "loss": 2.698, + "step": 4560 + }, + { + "epoch": 2.29, + "learning_rate": 2.828660752700572e-05, + "loss": 2.6517, + "step": 4565 + }, + { + "epoch": 2.29, + "learning_rate": 2.8247518033272924e-05, + "loss": 2.6687, + "step": 4570 + }, + { + "epoch": 2.3, + "learning_rate": 2.8208420462122105e-05, + "loss": 2.4934, + "step": 4575 + }, + { + "epoch": 2.3, + "learning_rate": 2.816931491079906e-05, + "loss": 2.3762, + "step": 4580 + }, + { + "epoch": 2.3, + "learning_rate": 2.8130201476569413e-05, + "loss": 2.6873, + "step": 4585 + }, + { + "epoch": 2.3, + "learning_rate": 2.8091080256718398e-05, + "loss": 2.6981, + "step": 4590 + }, + { + "epoch": 2.31, + "learning_rate": 2.805195134855061e-05, + "loss": 2.6814, + "step": 4595 + }, + { + "epoch": 2.31, + "learning_rate": 2.8012814849389785e-05, + "loss": 2.6641, + "step": 4600 + }, + { + "epoch": 2.31, + "learning_rate": 2.797367085657852e-05, + "loss": 2.488, + "step": 4605 + }, + { + "epoch": 2.31, + "learning_rate": 2.793451946747806e-05, + "loss": 2.4358, + "step": 4610 + }, + { + "epoch": 2.32, + "learning_rate": 2.7895360779468044e-05, + "loss": 2.7458, + "step": 4615 + }, + { + "epoch": 2.32, + "learning_rate": 
2.785619488994627e-05, + "loss": 2.7558, + "step": 4620 + }, + { + "epoch": 2.32, + "learning_rate": 2.7817021896328427e-05, + "loss": 2.6906, + "step": 4625 + }, + { + "epoch": 2.32, + "learning_rate": 2.7777841896047914e-05, + "loss": 2.5384, + "step": 4630 + }, + { + "epoch": 2.33, + "learning_rate": 2.7738654986555523e-05, + "loss": 2.8354, + "step": 4635 + }, + { + "epoch": 2.33, + "learning_rate": 2.7699461265319242e-05, + "loss": 2.6352, + "step": 4640 + }, + { + "epoch": 2.33, + "learning_rate": 2.7660260829824003e-05, + "loss": 2.5602, + "step": 4645 + }, + { + "epoch": 2.33, + "learning_rate": 2.7621053777571425e-05, + "loss": 2.7109, + "step": 4650 + }, + { + "epoch": 2.34, + "learning_rate": 2.7581840206079616e-05, + "loss": 2.7317, + "step": 4655 + }, + { + "epoch": 2.34, + "learning_rate": 2.754262021288287e-05, + "loss": 2.6874, + "step": 4660 + }, + { + "epoch": 2.34, + "learning_rate": 2.750339389553146e-05, + "loss": 2.5854, + "step": 4665 + }, + { + "epoch": 2.34, + "learning_rate": 2.7464161351591393e-05, + "loss": 2.4774, + "step": 4670 + }, + { + "epoch": 2.35, + "learning_rate": 2.7424922678644172e-05, + "loss": 2.3301, + "step": 4675 + }, + { + "epoch": 2.35, + "learning_rate": 2.7385677974286517e-05, + "loss": 2.8886, + "step": 4680 + }, + { + "epoch": 2.35, + "learning_rate": 2.7346427336130175e-05, + "loss": 2.4401, + "step": 4685 + }, + { + "epoch": 2.35, + "learning_rate": 2.7307170861801644e-05, + "loss": 2.4788, + "step": 4690 + }, + { + "epoch": 2.36, + "learning_rate": 2.7267908648941938e-05, + "loss": 2.4079, + "step": 4695 + }, + { + "epoch": 2.36, + "learning_rate": 2.7228640795206344e-05, + "loss": 2.3763, + "step": 4700 + }, + { + "epoch": 2.36, + "learning_rate": 2.718936739826417e-05, + "loss": 2.4751, + "step": 4705 + }, + { + "epoch": 2.36, + "learning_rate": 2.7150088555798537e-05, + "loss": 2.5827, + "step": 4710 + }, + { + "epoch": 2.37, + "learning_rate": 2.7110804365506083e-05, + "loss": 2.8181, + "step": 4715 + }, + { 
+ "epoch": 2.37, + "learning_rate": 2.7071514925096762e-05, + "loss": 2.5951, + "step": 4720 + }, + { + "epoch": 2.37, + "learning_rate": 2.703222033229359e-05, + "loss": 2.8812, + "step": 4725 + }, + { + "epoch": 2.37, + "learning_rate": 2.6992920684832374e-05, + "loss": 2.8793, + "step": 4730 + }, + { + "epoch": 2.38, + "learning_rate": 2.6953616080461526e-05, + "loss": 2.7234, + "step": 4735 + }, + { + "epoch": 2.38, + "learning_rate": 2.6914306616941764e-05, + "loss": 2.566, + "step": 4740 + }, + { + "epoch": 2.38, + "learning_rate": 2.68749923920459e-05, + "loss": 2.7924, + "step": 4745 + }, + { + "epoch": 2.38, + "learning_rate": 2.683567350355859e-05, + "loss": 2.5467, + "step": 4750 + }, + { + "epoch": 2.39, + "learning_rate": 2.679635004927608e-05, + "loss": 2.8032, + "step": 4755 + }, + { + "epoch": 2.39, + "learning_rate": 2.6757022127006e-05, + "loss": 2.4976, + "step": 4760 + }, + { + "epoch": 2.39, + "learning_rate": 2.6717689834567055e-05, + "loss": 2.7968, + "step": 4765 + }, + { + "epoch": 2.39, + "learning_rate": 2.6678353269788854e-05, + "loss": 2.4099, + "step": 4770 + }, + { + "epoch": 2.4, + "learning_rate": 2.66390125305116e-05, + "loss": 2.4246, + "step": 4775 + }, + { + "epoch": 2.4, + "learning_rate": 2.659966771458589e-05, + "loss": 2.7296, + "step": 4780 + }, + { + "epoch": 2.4, + "learning_rate": 2.656031891987249e-05, + "loss": 2.5013, + "step": 4785 + }, + { + "epoch": 2.4, + "learning_rate": 2.6520966244242024e-05, + "loss": 2.4029, + "step": 4790 + }, + { + "epoch": 2.41, + "learning_rate": 2.648160978557479e-05, + "loss": 2.405, + "step": 4795 + }, + { + "epoch": 2.41, + "learning_rate": 2.644224964176048e-05, + "loss": 2.5121, + "step": 4800 + }, + { + "epoch": 2.41, + "learning_rate": 2.6402885910697966e-05, + "loss": 2.7865, + "step": 4805 + }, + { + "epoch": 2.41, + "learning_rate": 2.6363518690295035e-05, + "loss": 2.3725, + "step": 4810 + }, + { + "epoch": 2.42, + "learning_rate": 2.632414807846816e-05, + "loss": 2.612, + 
"step": 4815 + }, + { + "epoch": 2.42, + "learning_rate": 2.6284774173142233e-05, + "loss": 2.5412, + "step": 4820 + }, + { + "epoch": 2.42, + "learning_rate": 2.624539707225036e-05, + "loss": 2.4828, + "step": 4825 + }, + { + "epoch": 2.42, + "learning_rate": 2.6206016873733574e-05, + "loss": 2.6912, + "step": 4830 + }, + { + "epoch": 2.43, + "learning_rate": 2.6166633675540635e-05, + "loss": 2.9305, + "step": 4835 + }, + { + "epoch": 2.43, + "learning_rate": 2.612724757562775e-05, + "loss": 2.4224, + "step": 4840 + }, + { + "epoch": 2.43, + "learning_rate": 2.608785867195834e-05, + "loss": 2.448, + "step": 4845 + }, + { + "epoch": 2.43, + "learning_rate": 2.60484670625028e-05, + "loss": 2.356, + "step": 4850 + }, + { + "epoch": 2.44, + "learning_rate": 2.600907284523827e-05, + "loss": 2.3987, + "step": 4855 + }, + { + "epoch": 2.44, + "learning_rate": 2.5969676118148358e-05, + "loss": 1.9985, + "step": 4860 + }, + { + "epoch": 2.44, + "learning_rate": 2.5930276979222927e-05, + "loss": 2.7017, + "step": 4865 + }, + { + "epoch": 2.44, + "learning_rate": 2.5890875526457836e-05, + "loss": 2.5223, + "step": 4870 + }, + { + "epoch": 2.45, + "learning_rate": 2.5851471857854697e-05, + "loss": 2.5558, + "step": 4875 + }, + { + "epoch": 2.45, + "learning_rate": 2.5812066071420632e-05, + "loss": 2.3048, + "step": 4880 + }, + { + "epoch": 2.45, + "learning_rate": 2.5772658265168025e-05, + "loss": 2.6089, + "step": 4885 + }, + { + "epoch": 2.45, + "learning_rate": 2.5733248537114306e-05, + "loss": 2.6711, + "step": 4890 + }, + { + "epoch": 2.46, + "learning_rate": 2.569383698528166e-05, + "loss": 2.6202, + "step": 4895 + }, + { + "epoch": 2.46, + "learning_rate": 2.5654423707696833e-05, + "loss": 2.6619, + "step": 4900 + }, + { + "epoch": 2.46, + "learning_rate": 2.5615008802390834e-05, + "loss": 2.6442, + "step": 4905 + }, + { + "epoch": 2.46, + "learning_rate": 2.5575592367398733e-05, + "loss": 2.7031, + "step": 4910 + }, + { + "epoch": 2.47, + "learning_rate": 
2.5536174500759417e-05, + "loss": 2.5043, + "step": 4915 + }, + { + "epoch": 2.47, + "learning_rate": 2.5496755300515323e-05, + "loss": 2.3442, + "step": 4920 + }, + { + "epoch": 2.47, + "learning_rate": 2.5457334864712206e-05, + "loss": 2.7005, + "step": 4925 + }, + { + "epoch": 2.47, + "learning_rate": 2.5417913291398892e-05, + "loss": 2.8544, + "step": 4930 + }, + { + "epoch": 2.48, + "learning_rate": 2.5378490678627043e-05, + "loss": 2.3004, + "step": 4935 + }, + { + "epoch": 2.48, + "learning_rate": 2.5339067124450887e-05, + "loss": 2.527, + "step": 4940 + }, + { + "epoch": 2.48, + "learning_rate": 2.5299642726927035e-05, + "loss": 2.689, + "step": 4945 + }, + { + "epoch": 2.48, + "learning_rate": 2.526021758411415e-05, + "loss": 2.7824, + "step": 4950 + }, + { + "epoch": 2.49, + "learning_rate": 2.5220791794072774e-05, + "loss": 2.5121, + "step": 4955 + }, + { + "epoch": 2.49, + "learning_rate": 2.518136545486504e-05, + "loss": 2.3836, + "step": 4960 + }, + { + "epoch": 2.49, + "learning_rate": 2.5141938664554482e-05, + "loss": 2.6548, + "step": 4965 + }, + { + "epoch": 2.49, + "learning_rate": 2.5102511521205718e-05, + "loss": 2.5556, + "step": 4970 + }, + { + "epoch": 2.5, + "learning_rate": 2.5063084122884267e-05, + "loss": 2.4746, + "step": 4975 + }, + { + "epoch": 2.5, + "learning_rate": 2.5023656567656272e-05, + "loss": 2.5669, + "step": 4980 + }, + { + "epoch": 2.5, + "learning_rate": 2.498422895358827e-05, + "loss": 2.4125, + "step": 4985 + }, + { + "epoch": 2.5, + "learning_rate": 2.4944801378746935e-05, + "loss": 2.6531, + "step": 4990 + }, + { + "epoch": 2.51, + "learning_rate": 2.4905373941198864e-05, + "loss": 2.7697, + "step": 4995 + }, + { + "epoch": 2.51, + "learning_rate": 2.48659467390103e-05, + "loss": 2.5167, + "step": 5000 + }, + { + "epoch": 2.51, + "learning_rate": 2.4826519870246897e-05, + "loss": 2.6029, + "step": 5005 + }, + { + "epoch": 2.51, + "learning_rate": 2.478709343297349e-05, + "loss": 2.3878, + "step": 5010 + }, + { + 
"epoch": 2.52, + "learning_rate": 2.4747667525253825e-05, + "loss": 2.6694, + "step": 5015 + }, + { + "epoch": 2.52, + "learning_rate": 2.470824224515034e-05, + "loss": 2.606, + "step": 5020 + }, + { + "epoch": 2.52, + "learning_rate": 2.466881769072392e-05, + "loss": 2.6548, + "step": 5025 + }, + { + "epoch": 2.52, + "learning_rate": 2.4629393960033633e-05, + "loss": 2.2612, + "step": 5030 + }, + { + "epoch": 2.53, + "learning_rate": 2.4589971151136503e-05, + "loss": 2.6507, + "step": 5035 + }, + { + "epoch": 2.53, + "learning_rate": 2.4550549362087258e-05, + "loss": 2.4926, + "step": 5040 + }, + { + "epoch": 2.53, + "learning_rate": 2.45111286909381e-05, + "loss": 2.4079, + "step": 5045 + }, + { + "epoch": 2.53, + "learning_rate": 2.447170923573844e-05, + "loss": 2.5999, + "step": 5050 + }, + { + "epoch": 2.54, + "learning_rate": 2.443229109453467e-05, + "loss": 2.4915, + "step": 5055 + }, + { + "epoch": 2.54, + "learning_rate": 2.43928743653699e-05, + "loss": 2.4626, + "step": 5060 + }, + { + "epoch": 2.54, + "learning_rate": 2.4353459146283743e-05, + "loss": 2.405, + "step": 5065 + }, + { + "epoch": 2.54, + "learning_rate": 2.431404553531206e-05, + "loss": 2.857, + "step": 5070 + }, + { + "epoch": 2.55, + "learning_rate": 2.42746336304867e-05, + "loss": 2.8089, + "step": 5075 + }, + { + "epoch": 2.55, + "learning_rate": 2.423522352983527e-05, + "loss": 2.5966, + "step": 5080 + }, + { + "epoch": 2.55, + "learning_rate": 2.41958153313809e-05, + "loss": 2.4951, + "step": 5085 + }, + { + "epoch": 2.55, + "learning_rate": 2.4156409133141967e-05, + "loss": 2.519, + "step": 5090 + }, + { + "epoch": 2.56, + "learning_rate": 2.4117005033131894e-05, + "loss": 2.4516, + "step": 5095 + }, + { + "epoch": 2.56, + "learning_rate": 2.40776031293589e-05, + "loss": 2.4225, + "step": 5100 + }, + { + "epoch": 2.56, + "learning_rate": 2.4038203519825676e-05, + "loss": 2.47, + "step": 5105 + }, + { + "epoch": 2.56, + "learning_rate": 2.399880630252928e-05, + "loss": 2.4996, + 
"step": 5110 + }, + { + "epoch": 2.57, + "learning_rate": 2.3959411575460777e-05, + "loss": 2.4737, + "step": 5115 + }, + { + "epoch": 2.57, + "learning_rate": 2.392001943660506e-05, + "loss": 2.636, + "step": 5120 + }, + { + "epoch": 2.57, + "learning_rate": 2.3880629983940572e-05, + "loss": 2.5773, + "step": 5125 + }, + { + "epoch": 2.57, + "learning_rate": 2.3841243315439077e-05, + "loss": 2.7459, + "step": 5130 + }, + { + "epoch": 2.58, + "learning_rate": 2.3801859529065416e-05, + "loss": 2.1073, + "step": 5135 + }, + { + "epoch": 2.58, + "learning_rate": 2.3762478722777264e-05, + "loss": 2.3601, + "step": 5140 + }, + { + "epoch": 2.58, + "learning_rate": 2.3723100994524873e-05, + "loss": 2.5435, + "step": 5145 + }, + { + "epoch": 2.58, + "learning_rate": 2.3683726442250853e-05, + "loss": 2.3054, + "step": 5150 + }, + { + "epoch": 2.59, + "learning_rate": 2.3644355163889908e-05, + "loss": 2.6588, + "step": 5155 + }, + { + "epoch": 2.59, + "learning_rate": 2.3604987257368596e-05, + "loss": 2.6469, + "step": 5160 + }, + { + "epoch": 2.59, + "learning_rate": 2.3565622820605096e-05, + "loss": 2.4276, + "step": 5165 + }, + { + "epoch": 2.59, + "learning_rate": 2.3526261951508947e-05, + "loss": 2.8147, + "step": 5170 + }, + { + "epoch": 2.6, + "learning_rate": 2.3486904747980817e-05, + "loss": 2.5942, + "step": 5175 + }, + { + "epoch": 2.6, + "learning_rate": 2.344755130791227e-05, + "loss": 2.7043, + "step": 5180 + }, + { + "epoch": 2.6, + "learning_rate": 2.340820172918549e-05, + "loss": 2.7551, + "step": 5185 + }, + { + "epoch": 2.6, + "learning_rate": 2.336885610967308e-05, + "loss": 2.4544, + "step": 5190 + }, + { + "epoch": 2.61, + "learning_rate": 2.3329514547237757e-05, + "loss": 2.5914, + "step": 5195 + }, + { + "epoch": 2.61, + "learning_rate": 2.3290177139732186e-05, + "loss": 2.5679, + "step": 5200 + }, + { + "epoch": 2.61, + "learning_rate": 2.325084398499868e-05, + "loss": 2.5207, + "step": 5205 + }, + { + "epoch": 2.61, + "learning_rate": 
2.3211515180868972e-05, + "loss": 2.4572, + "step": 5210 + }, + { + "epoch": 2.62, + "learning_rate": 2.3172190825163987e-05, + "loss": 2.6281, + "step": 5215 + }, + { + "epoch": 2.62, + "learning_rate": 2.3132871015693566e-05, + "loss": 2.5488, + "step": 5220 + }, + { + "epoch": 2.62, + "learning_rate": 2.309355585025627e-05, + "loss": 2.74, + "step": 5225 + }, + { + "epoch": 2.62, + "learning_rate": 2.3054245426639078e-05, + "loss": 2.5737, + "step": 5230 + }, + { + "epoch": 2.63, + "learning_rate": 2.3014939842617197e-05, + "loss": 2.5987, + "step": 5235 + }, + { + "epoch": 2.63, + "learning_rate": 2.297563919595379e-05, + "loss": 2.5215, + "step": 5240 + }, + { + "epoch": 2.63, + "learning_rate": 2.293634358439973e-05, + "loss": 2.4514, + "step": 5245 + }, + { + "epoch": 2.63, + "learning_rate": 2.289705310569338e-05, + "loss": 2.8411, + "step": 5250 + }, + { + "epoch": 2.64, + "learning_rate": 2.2857767857560337e-05, + "loss": 2.9463, + "step": 5255 + }, + { + "epoch": 2.64, + "learning_rate": 2.281848793771318e-05, + "loss": 2.6361, + "step": 5260 + }, + { + "epoch": 2.64, + "learning_rate": 2.2779213443851233e-05, + "loss": 2.4919, + "step": 5265 + }, + { + "epoch": 2.64, + "learning_rate": 2.2739944473660332e-05, + "loss": 2.3893, + "step": 5270 + }, + { + "epoch": 2.65, + "learning_rate": 2.2700681124812574e-05, + "loss": 2.6469, + "step": 5275 + }, + { + "epoch": 2.65, + "learning_rate": 2.2661423494966074e-05, + "loss": 2.7062, + "step": 5280 + }, + { + "epoch": 2.65, + "learning_rate": 2.2622171681764706e-05, + "loss": 2.6681, + "step": 5285 + }, + { + "epoch": 2.65, + "learning_rate": 2.2582925782837898e-05, + "loss": 2.5944, + "step": 5290 + }, + { + "epoch": 2.66, + "learning_rate": 2.254368589580036e-05, + "loss": 2.6699, + "step": 5295 + }, + { + "epoch": 2.66, + "learning_rate": 2.250445211825185e-05, + "loss": 2.7243, + "step": 5300 + }, + { + "epoch": 2.66, + "learning_rate": 2.2465224547776934e-05, + "loss": 2.5652, + "step": 5305 + }, + { + 
"epoch": 2.66, + "learning_rate": 2.2426003281944725e-05, + "loss": 2.4402, + "step": 5310 + }, + { + "epoch": 2.67, + "learning_rate": 2.238678841830867e-05, + "loss": 2.5059, + "step": 5315 + }, + { + "epoch": 2.67, + "learning_rate": 2.234758005440628e-05, + "loss": 2.5083, + "step": 5320 + }, + { + "epoch": 2.67, + "learning_rate": 2.2308378287758906e-05, + "loss": 2.3585, + "step": 5325 + }, + { + "epoch": 2.67, + "learning_rate": 2.22691832158715e-05, + "loss": 2.8297, + "step": 5330 + }, + { + "epoch": 2.68, + "learning_rate": 2.2229994936232346e-05, + "loss": 2.4266, + "step": 5335 + }, + { + "epoch": 2.68, + "learning_rate": 2.219081354631284e-05, + "loss": 2.3729, + "step": 5340 + }, + { + "epoch": 2.68, + "learning_rate": 2.2151639143567236e-05, + "loss": 2.4558, + "step": 5345 + }, + { + "epoch": 2.68, + "learning_rate": 2.211247182543242e-05, + "loss": 2.5797, + "step": 5350 + }, + { + "epoch": 2.69, + "learning_rate": 2.2073311689327648e-05, + "loss": 2.4579, + "step": 5355 + }, + { + "epoch": 2.69, + "learning_rate": 2.203415883265432e-05, + "loss": 2.5511, + "step": 5360 + }, + { + "epoch": 2.69, + "learning_rate": 2.1995013352795725e-05, + "loss": 2.4825, + "step": 5365 + }, + { + "epoch": 2.69, + "learning_rate": 2.1955875347116808e-05, + "loss": 2.4933, + "step": 5370 + }, + { + "epoch": 2.7, + "learning_rate": 2.191674491296391e-05, + "loss": 2.434, + "step": 5375 + }, + { + "epoch": 2.7, + "learning_rate": 2.187762214766456e-05, + "loss": 2.5205, + "step": 5380 + }, + { + "epoch": 2.7, + "learning_rate": 2.1838507148527197e-05, + "loss": 2.5461, + "step": 5385 + }, + { + "epoch": 2.7, + "learning_rate": 2.179940001284095e-05, + "loss": 2.4265, + "step": 5390 + }, + { + "epoch": 2.71, + "learning_rate": 2.176030083787539e-05, + "loss": 2.6166, + "step": 5395 + }, + { + "epoch": 2.71, + "learning_rate": 2.1721209720880277e-05, + "loss": 2.5101, + "step": 5400 + }, + { + "epoch": 2.71, + "learning_rate": 2.168212675908536e-05, + "loss": 2.4824, + 
"step": 5405 + }, + { + "epoch": 2.71, + "learning_rate": 2.1643052049700066e-05, + "loss": 2.347, + "step": 5410 + }, + { + "epoch": 2.72, + "learning_rate": 2.1603985689913317e-05, + "loss": 2.5476, + "step": 5415 + }, + { + "epoch": 2.72, + "learning_rate": 2.1564927776893264e-05, + "loss": 2.456, + "step": 5420 + }, + { + "epoch": 2.72, + "learning_rate": 2.152587840778704e-05, + "loss": 2.4415, + "step": 5425 + }, + { + "epoch": 2.72, + "learning_rate": 2.1486837679720535e-05, + "loss": 2.7548, + "step": 5430 + }, + { + "epoch": 2.73, + "learning_rate": 2.144780568979816e-05, + "loss": 2.3779, + "step": 5435 + }, + { + "epoch": 2.73, + "learning_rate": 2.1408782535102566e-05, + "loss": 2.5284, + "step": 5440 + }, + { + "epoch": 2.73, + "learning_rate": 2.136976831269444e-05, + "loss": 2.6886, + "step": 5445 + }, + { + "epoch": 2.73, + "learning_rate": 2.133076311961226e-05, + "loss": 2.5138, + "step": 5450 + }, + { + "epoch": 2.74, + "learning_rate": 2.1291767052872035e-05, + "loss": 2.664, + "step": 5455 + }, + { + "epoch": 2.74, + "learning_rate": 2.1252780209467073e-05, + "loss": 2.4325, + "step": 5460 + }, + { + "epoch": 2.74, + "learning_rate": 2.121380268636775e-05, + "loss": 2.5471, + "step": 5465 + }, + { + "epoch": 2.74, + "learning_rate": 2.1174834580521253e-05, + "loss": 2.4363, + "step": 5470 + }, + { + "epoch": 2.75, + "learning_rate": 2.113587598885135e-05, + "loss": 2.6272, + "step": 5475 + }, + { + "epoch": 2.75, + "learning_rate": 2.109692700825814e-05, + "loss": 2.5261, + "step": 5480 + }, + { + "epoch": 2.75, + "learning_rate": 2.105798773561783e-05, + "loss": 2.9262, + "step": 5485 + }, + { + "epoch": 2.75, + "learning_rate": 2.101905826778246e-05, + "loss": 2.3964, + "step": 5490 + }, + { + "epoch": 2.76, + "learning_rate": 2.0980138701579704e-05, + "loss": 2.6239, + "step": 5495 + }, + { + "epoch": 2.76, + "learning_rate": 2.0941229133812593e-05, + "loss": 2.53, + "step": 5500 + }, + { + "epoch": 2.76, + "learning_rate": 
2.0902329661259293e-05, + "loss": 2.7035, + "step": 5505 + }, + { + "epoch": 2.77, + "learning_rate": 2.0863440380672856e-05, + "loss": 2.5909, + "step": 5510 + }, + { + "epoch": 2.77, + "learning_rate": 2.0824561388781005e-05, + "loss": 2.1592, + "step": 5515 + }, + { + "epoch": 2.77, + "learning_rate": 2.078569278228585e-05, + "loss": 2.5515, + "step": 5520 + }, + { + "epoch": 2.77, + "learning_rate": 2.0746834657863672e-05, + "loss": 2.6217, + "step": 5525 + }, + { + "epoch": 2.78, + "learning_rate": 2.0707987112164692e-05, + "loss": 2.6302, + "step": 5530 + }, + { + "epoch": 2.78, + "learning_rate": 2.0669150241812807e-05, + "loss": 2.3984, + "step": 5535 + }, + { + "epoch": 2.78, + "learning_rate": 2.0630324143405372e-05, + "loss": 2.6425, + "step": 5540 + }, + { + "epoch": 2.78, + "learning_rate": 2.0591508913512954e-05, + "loss": 2.6817, + "step": 5545 + }, + { + "epoch": 2.79, + "learning_rate": 2.055270464867906e-05, + "loss": 2.3904, + "step": 5550 + }, + { + "epoch": 2.79, + "learning_rate": 2.0513911445419936e-05, + "loss": 2.6625, + "step": 5555 + }, + { + "epoch": 2.79, + "learning_rate": 2.0475129400224337e-05, + "loss": 2.6876, + "step": 5560 + }, + { + "epoch": 2.79, + "learning_rate": 2.043635860955325e-05, + "loss": 2.4981, + "step": 5565 + }, + { + "epoch": 2.8, + "learning_rate": 2.039759916983966e-05, + "loss": 2.3809, + "step": 5570 + }, + { + "epoch": 2.8, + "learning_rate": 2.0358851177488326e-05, + "loss": 2.4396, + "step": 5575 + }, + { + "epoch": 2.8, + "learning_rate": 2.0320114728875538e-05, + "loss": 2.526, + "step": 5580 + }, + { + "epoch": 2.8, + "learning_rate": 2.028138992034887e-05, + "loss": 2.6806, + "step": 5585 + }, + { + "epoch": 2.81, + "learning_rate": 2.0242676848226948e-05, + "loss": 2.5842, + "step": 5590 + }, + { + "epoch": 2.81, + "learning_rate": 2.02039756087992e-05, + "loss": 2.4016, + "step": 5595 + }, + { + "epoch": 2.81, + "learning_rate": 2.0165286298325638e-05, + "loss": 2.5254, + "step": 5600 + }, + { + 
"epoch": 2.81, + "learning_rate": 2.0126609013036575e-05, + "loss": 2.4995, + "step": 5605 + }, + { + "epoch": 2.82, + "learning_rate": 2.0087943849132446e-05, + "loss": 2.473, + "step": 5610 + }, + { + "epoch": 2.82, + "learning_rate": 2.004929090278351e-05, + "loss": 2.668, + "step": 5615 + }, + { + "epoch": 2.82, + "learning_rate": 2.001065027012966e-05, + "loss": 2.693, + "step": 5620 + }, + { + "epoch": 2.82, + "learning_rate": 1.9972022047280154e-05, + "loss": 2.5113, + "step": 5625 + }, + { + "epoch": 2.83, + "learning_rate": 1.9933406330313374e-05, + "loss": 2.7322, + "step": 5630 + }, + { + "epoch": 2.83, + "learning_rate": 1.989480321527661e-05, + "loss": 2.6501, + "step": 5635 + }, + { + "epoch": 2.83, + "learning_rate": 1.9856212798185798e-05, + "loss": 2.546, + "step": 5640 + }, + { + "epoch": 2.83, + "learning_rate": 1.9817635175025295e-05, + "loss": 2.8634, + "step": 5645 + }, + { + "epoch": 2.84, + "learning_rate": 1.9779070441747638e-05, + "loss": 2.2475, + "step": 5650 + }, + { + "epoch": 2.84, + "learning_rate": 1.97405186942733e-05, + "loss": 2.5021, + "step": 5655 + }, + { + "epoch": 2.84, + "learning_rate": 1.9701980028490452e-05, + "loss": 2.4607, + "step": 5660 + }, + { + "epoch": 2.84, + "learning_rate": 1.9663454540254744e-05, + "loss": 2.4587, + "step": 5665 + }, + { + "epoch": 2.85, + "learning_rate": 1.9624942325389032e-05, + "loss": 2.6975, + "step": 5670 + }, + { + "epoch": 2.85, + "learning_rate": 1.9586443479683164e-05, + "loss": 2.728, + "step": 5675 + }, + { + "epoch": 2.85, + "learning_rate": 1.9547958098893734e-05, + "loss": 2.6458, + "step": 5680 + }, + { + "epoch": 2.85, + "learning_rate": 1.9509486278743847e-05, + "loss": 2.7608, + "step": 5685 + }, + { + "epoch": 2.86, + "learning_rate": 1.9471028114922873e-05, + "loss": 2.7753, + "step": 5690 + }, + { + "epoch": 2.86, + "learning_rate": 1.9432583703086235e-05, + "loss": 2.3438, + "step": 5695 + }, + { + "epoch": 2.86, + "learning_rate": 1.9394153138855127e-05, + "loss": 
2.5513, + "step": 5700 + }, + { + "epoch": 2.86, + "learning_rate": 1.9355736517816313e-05, + "loss": 2.6064, + "step": 5705 + }, + { + "epoch": 2.87, + "learning_rate": 1.9317333935521872e-05, + "loss": 2.6884, + "step": 5710 + }, + { + "epoch": 2.87, + "learning_rate": 1.927894548748897e-05, + "loss": 2.479, + "step": 5715 + }, + { + "epoch": 2.87, + "learning_rate": 1.9240571269199607e-05, + "loss": 2.741, + "step": 5720 + }, + { + "epoch": 2.87, + "learning_rate": 1.9202211376100427e-05, + "loss": 2.6736, + "step": 5725 + }, + { + "epoch": 2.88, + "learning_rate": 1.9163865903602374e-05, + "loss": 2.5357, + "step": 5730 + }, + { + "epoch": 2.88, + "learning_rate": 1.9125534947080574e-05, + "loss": 2.4667, + "step": 5735 + }, + { + "epoch": 2.88, + "learning_rate": 1.908721860187406e-05, + "loss": 2.3793, + "step": 5740 + }, + { + "epoch": 2.88, + "learning_rate": 1.904891696328548e-05, + "loss": 2.6379, + "step": 5745 + }, + { + "epoch": 2.89, + "learning_rate": 1.901063012658093e-05, + "loss": 2.5932, + "step": 5750 + }, + { + "epoch": 2.89, + "learning_rate": 1.897235818698969e-05, + "loss": 2.3479, + "step": 5755 + }, + { + "epoch": 2.89, + "learning_rate": 1.8934101239703973e-05, + "loss": 2.7664, + "step": 5760 + }, + { + "epoch": 2.89, + "learning_rate": 1.889585937987871e-05, + "loss": 2.6163, + "step": 5765 + }, + { + "epoch": 2.9, + "learning_rate": 1.885763270263131e-05, + "loss": 2.644, + "step": 5770 + }, + { + "epoch": 2.9, + "learning_rate": 1.881942130304142e-05, + "loss": 2.6584, + "step": 5775 + }, + { + "epoch": 2.9, + "learning_rate": 1.8781225276150675e-05, + "loss": 2.7552, + "step": 5780 + }, + { + "epoch": 2.9, + "learning_rate": 1.874304471696248e-05, + "loss": 2.3176, + "step": 5785 + }, + { + "epoch": 2.91, + "learning_rate": 1.8704879720441773e-05, + "loss": 2.9294, + "step": 5790 + }, + { + "epoch": 2.91, + "learning_rate": 1.8666730381514774e-05, + "loss": 2.5388, + "step": 5795 + }, + { + "epoch": 2.91, + "learning_rate": 
1.8628596795068776e-05, + "loss": 2.5343, + "step": 5800 + }, + { + "epoch": 2.91, + "learning_rate": 1.859047905595187e-05, + "loss": 2.5239, + "step": 5805 + }, + { + "epoch": 2.92, + "learning_rate": 1.8552377258972747e-05, + "loss": 2.382, + "step": 5810 + }, + { + "epoch": 2.92, + "learning_rate": 1.851429149890044e-05, + "loss": 2.7965, + "step": 5815 + }, + { + "epoch": 2.92, + "learning_rate": 1.8476221870464083e-05, + "loss": 2.738, + "step": 5820 + }, + { + "epoch": 2.92, + "learning_rate": 1.84381684683527e-05, + "loss": 2.7431, + "step": 5825 + }, + { + "epoch": 2.93, + "learning_rate": 1.8400131387214964e-05, + "loss": 2.6551, + "step": 5830 + }, + { + "epoch": 2.93, + "learning_rate": 1.8362110721658927e-05, + "loss": 2.5836, + "step": 5835 + }, + { + "epoch": 2.93, + "learning_rate": 1.832410656625183e-05, + "loss": 2.3278, + "step": 5840 + }, + { + "epoch": 2.93, + "learning_rate": 1.8286119015519852e-05, + "loss": 2.5348, + "step": 5845 + }, + { + "epoch": 2.94, + "learning_rate": 1.8248148163947866e-05, + "loss": 2.388, + "step": 5850 + }, + { + "epoch": 2.94, + "learning_rate": 1.8210194105979205e-05, + "loss": 2.5839, + "step": 5855 + }, + { + "epoch": 2.94, + "learning_rate": 1.817225693601543e-05, + "loss": 2.7362, + "step": 5860 + }, + { + "epoch": 2.94, + "learning_rate": 1.8134336748416115e-05, + "loss": 2.2506, + "step": 5865 + }, + { + "epoch": 2.95, + "learning_rate": 1.8096433637498574e-05, + "loss": 2.6163, + "step": 5870 + }, + { + "epoch": 2.95, + "learning_rate": 1.8058547697537655e-05, + "loss": 2.6588, + "step": 5875 + }, + { + "epoch": 2.95, + "learning_rate": 1.802067902276551e-05, + "loss": 2.6955, + "step": 5880 + }, + { + "epoch": 2.95, + "learning_rate": 1.7982827707371326e-05, + "loss": 2.5438, + "step": 5885 + }, + { + "epoch": 2.96, + "learning_rate": 1.7944993845501118e-05, + "loss": 2.6858, + "step": 5890 + }, + { + "epoch": 2.96, + "learning_rate": 1.7907177531257507e-05, + "loss": 2.5499, + "step": 5895 + }, + { + 
"epoch": 2.96, + "learning_rate": 1.7869378858699452e-05, + "loss": 2.572, + "step": 5900 + }, + { + "epoch": 2.96, + "learning_rate": 1.783159792184203e-05, + "loss": 2.6499, + "step": 5905 + }, + { + "epoch": 2.97, + "learning_rate": 1.779383481465622e-05, + "loss": 2.6526, + "step": 5910 + }, + { + "epoch": 2.97, + "learning_rate": 1.775608963106863e-05, + "loss": 2.5916, + "step": 5915 + }, + { + "epoch": 2.97, + "learning_rate": 1.7718362464961314e-05, + "loss": 2.7463, + "step": 5920 + }, + { + "epoch": 2.97, + "learning_rate": 1.76806534101715e-05, + "loss": 2.6052, + "step": 5925 + }, + { + "epoch": 2.98, + "learning_rate": 1.764296256049137e-05, + "loss": 2.4542, + "step": 5930 + }, + { + "epoch": 2.98, + "learning_rate": 1.760529000966782e-05, + "loss": 2.5414, + "step": 5935 + }, + { + "epoch": 2.98, + "learning_rate": 1.7567635851402238e-05, + "loss": 2.6181, + "step": 5940 + }, + { + "epoch": 2.98, + "learning_rate": 1.753000017935026e-05, + "loss": 2.4822, + "step": 5945 + }, + { + "epoch": 2.99, + "learning_rate": 1.7492383087121546e-05, + "loss": 2.6495, + "step": 5950 + }, + { + "epoch": 2.99, + "learning_rate": 1.7454784668279546e-05, + "loss": 2.455, + "step": 5955 + }, + { + "epoch": 2.99, + "learning_rate": 1.7417205016341258e-05, + "loss": 2.3959, + "step": 5960 + }, + { + "epoch": 2.99, + "learning_rate": 1.7379644224777004e-05, + "loss": 2.5139, + "step": 5965 + }, + { + "epoch": 3.0, + "learning_rate": 1.7342102387010194e-05, + "loss": 2.6985, + "step": 5970 + }, + { + "epoch": 3.0, + "learning_rate": 1.7304579596417104e-05, + "loss": 2.6598, + "step": 5975 + }, + { + "epoch": 3.0, + "learning_rate": 1.726707594632661e-05, + "loss": 2.6638, + "step": 5980 + }, + { + "epoch": 3.0, + "learning_rate": 1.7229591530020022e-05, + "loss": 2.446, + "step": 5985 + }, + { + "epoch": 3.01, + "learning_rate": 1.7192126440730784e-05, + "loss": 2.5736, + "step": 5990 + }, + { + "epoch": 3.01, + "learning_rate": 1.7154680771644242e-05, + "loss": 2.4385, + 
"step": 5995 + }, + { + "epoch": 3.01, + "learning_rate": 1.7117254615897497e-05, + "loss": 2.5651, + "step": 6000 + }, + { + "epoch": 3.01, + "learning_rate": 1.707984806657908e-05, + "loss": 2.5767, + "step": 6005 + }, + { + "epoch": 3.02, + "learning_rate": 1.7042461216728756e-05, + "loss": 2.5527, + "step": 6010 + }, + { + "epoch": 3.02, + "learning_rate": 1.7005094159337307e-05, + "loss": 2.4275, + "step": 6015 + }, + { + "epoch": 3.02, + "learning_rate": 1.6967746987346272e-05, + "loss": 2.5136, + "step": 6020 + }, + { + "epoch": 3.02, + "learning_rate": 1.6930419793647735e-05, + "loss": 2.5035, + "step": 6025 + }, + { + "epoch": 3.03, + "learning_rate": 1.6893112671084094e-05, + "loss": 2.3627, + "step": 6030 + }, + { + "epoch": 3.03, + "learning_rate": 1.6855825712447822e-05, + "loss": 2.5038, + "step": 6035 + }, + { + "epoch": 3.03, + "learning_rate": 1.6818559010481226e-05, + "loss": 2.3229, + "step": 6040 + }, + { + "epoch": 3.03, + "learning_rate": 1.6781312657876254e-05, + "loss": 2.3298, + "step": 6045 + }, + { + "epoch": 3.04, + "learning_rate": 1.6744086747274224e-05, + "loss": 2.5965, + "step": 6050 + }, + { + "epoch": 3.04, + "learning_rate": 1.67068813712656e-05, + "loss": 2.6745, + "step": 6055 + }, + { + "epoch": 3.04, + "learning_rate": 1.6669696622389797e-05, + "loss": 2.5358, + "step": 6060 + }, + { + "epoch": 3.04, + "learning_rate": 1.6632532593134907e-05, + "loss": 2.1572, + "step": 6065 + }, + { + "epoch": 3.05, + "learning_rate": 1.6595389375937488e-05, + "loss": 2.3413, + "step": 6070 + }, + { + "epoch": 3.05, + "learning_rate": 1.6558267063182342e-05, + "loss": 2.4968, + "step": 6075 + }, + { + "epoch": 3.05, + "learning_rate": 1.6521165747202276e-05, + "loss": 2.528, + "step": 6080 + }, + { + "epoch": 3.05, + "learning_rate": 1.6484085520277847e-05, + "loss": 2.4744, + "step": 6085 + }, + { + "epoch": 3.06, + "learning_rate": 1.6447026474637194e-05, + "loss": 2.5993, + "step": 6090 + }, + { + "epoch": 3.06, + "learning_rate": 
1.640998870245575e-05, + "loss": 2.5334, + "step": 6095 + }, + { + "epoch": 3.06, + "learning_rate": 1.637297229585604e-05, + "loss": 2.3431, + "step": 6100 + }, + { + "epoch": 3.06, + "learning_rate": 1.633597734690746e-05, + "loss": 2.3361, + "step": 6105 + }, + { + "epoch": 3.07, + "learning_rate": 1.6299003947626017e-05, + "loss": 2.4223, + "step": 6110 + }, + { + "epoch": 3.07, + "learning_rate": 1.6262052189974125e-05, + "loss": 2.6129, + "step": 6115 + }, + { + "epoch": 3.07, + "learning_rate": 1.622512216586038e-05, + "loss": 2.5798, + "step": 6120 + }, + { + "epoch": 3.07, + "learning_rate": 1.61882139671393e-05, + "loss": 2.7164, + "step": 6125 + }, + { + "epoch": 3.08, + "learning_rate": 1.6151327685611127e-05, + "loss": 2.627, + "step": 6130 + }, + { + "epoch": 3.08, + "learning_rate": 1.6114463413021612e-05, + "loss": 2.5533, + "step": 6135 + }, + { + "epoch": 3.08, + "learning_rate": 1.6077621241061725e-05, + "loss": 2.4149, + "step": 6140 + }, + { + "epoch": 3.08, + "learning_rate": 1.6040801261367493e-05, + "loss": 2.5167, + "step": 6145 + }, + { + "epoch": 3.09, + "learning_rate": 1.6004003565519734e-05, + "loss": 2.3775, + "step": 6150 + }, + { + "epoch": 3.09, + "learning_rate": 1.596722824504385e-05, + "loss": 2.7508, + "step": 6155 + }, + { + "epoch": 3.09, + "learning_rate": 1.5930475391409562e-05, + "loss": 2.2924, + "step": 6160 + }, + { + "epoch": 3.09, + "learning_rate": 1.5893745096030754e-05, + "loss": 2.6994, + "step": 6165 + }, + { + "epoch": 3.1, + "learning_rate": 1.5857037450265176e-05, + "loss": 2.4325, + "step": 6170 + }, + { + "epoch": 3.1, + "learning_rate": 1.5820352545414232e-05, + "loss": 2.7048, + "step": 6175 + }, + { + "epoch": 3.1, + "learning_rate": 1.5783690472722785e-05, + "loss": 2.5557, + "step": 6180 + }, + { + "epoch": 3.1, + "learning_rate": 1.5747051323378903e-05, + "loss": 2.5968, + "step": 6185 + }, + { + "epoch": 3.11, + "learning_rate": 1.5710435188513627e-05, + "loss": 2.8964, + "step": 6190 + }, + { + 
"epoch": 3.11, + "learning_rate": 1.5673842159200768e-05, + "loss": 2.4664, + "step": 6195 + }, + { + "epoch": 3.11, + "learning_rate": 1.5637272326456666e-05, + "loss": 2.6002, + "step": 6200 + }, + { + "epoch": 3.11, + "learning_rate": 1.560072578123995e-05, + "loss": 2.1335, + "step": 6205 + }, + { + "epoch": 3.12, + "learning_rate": 1.5564202614451352e-05, + "loss": 2.4466, + "step": 6210 + }, + { + "epoch": 3.12, + "learning_rate": 1.5527702916933436e-05, + "loss": 2.4236, + "step": 6215 + }, + { + "epoch": 3.12, + "learning_rate": 1.54912267794704e-05, + "loss": 2.6926, + "step": 6220 + }, + { + "epoch": 3.12, + "learning_rate": 1.5454774292787837e-05, + "loss": 2.6268, + "step": 6225 + }, + { + "epoch": 3.13, + "learning_rate": 1.541834554755252e-05, + "loss": 2.5849, + "step": 6230 + }, + { + "epoch": 3.13, + "learning_rate": 1.5381940634372165e-05, + "loss": 2.4988, + "step": 6235 + }, + { + "epoch": 3.13, + "learning_rate": 1.534555964379522e-05, + "loss": 2.8093, + "step": 6240 + }, + { + "epoch": 3.13, + "learning_rate": 1.5309202666310622e-05, + "loss": 2.6214, + "step": 6245 + }, + { + "epoch": 3.14, + "learning_rate": 1.5272869792347595e-05, + "loss": 2.4958, + "step": 6250 + }, + { + "epoch": 3.14, + "learning_rate": 1.5236561112275394e-05, + "loss": 2.5731, + "step": 6255 + }, + { + "epoch": 3.14, + "learning_rate": 1.5200276716403103e-05, + "loss": 2.4501, + "step": 6260 + }, + { + "epoch": 3.14, + "learning_rate": 1.5164016694979411e-05, + "loss": 2.3793, + "step": 6265 + }, + { + "epoch": 3.15, + "learning_rate": 1.5127781138192374e-05, + "loss": 2.4751, + "step": 6270 + }, + { + "epoch": 3.15, + "learning_rate": 1.5091570136169206e-05, + "loss": 2.2213, + "step": 6275 + }, + { + "epoch": 3.15, + "learning_rate": 1.505538377897604e-05, + "loss": 2.4721, + "step": 6280 + }, + { + "epoch": 3.15, + "learning_rate": 1.5019222156617712e-05, + "loss": 2.5355, + "step": 6285 + }, + { + "epoch": 3.16, + "learning_rate": 1.4983085359037547e-05, + "loss": 
2.6066, + "step": 6290 + }, + { + "epoch": 3.16, + "learning_rate": 1.4946973476117105e-05, + "loss": 2.5482, + "step": 6295 + }, + { + "epoch": 3.16, + "learning_rate": 1.4910886597675994e-05, + "loss": 2.6717, + "step": 6300 + }, + { + "epoch": 3.16, + "learning_rate": 1.4874824813471616e-05, + "loss": 2.5616, + "step": 6305 + }, + { + "epoch": 3.17, + "learning_rate": 1.4838788213198965e-05, + "loss": 2.5877, + "step": 6310 + }, + { + "epoch": 3.17, + "learning_rate": 1.48027768864904e-05, + "loss": 2.4685, + "step": 6315 + }, + { + "epoch": 3.17, + "learning_rate": 1.4766790922915405e-05, + "loss": 2.459, + "step": 6320 + }, + { + "epoch": 3.17, + "learning_rate": 1.4730830411980393e-05, + "loss": 2.4626, + "step": 6325 + }, + { + "epoch": 3.18, + "learning_rate": 1.469489544312846e-05, + "loss": 2.4486, + "step": 6330 + }, + { + "epoch": 3.18, + "learning_rate": 1.4658986105739175e-05, + "loss": 2.6828, + "step": 6335 + }, + { + "epoch": 3.18, + "learning_rate": 1.4623102489128353e-05, + "loss": 2.702, + "step": 6340 + }, + { + "epoch": 3.18, + "learning_rate": 1.4587244682547857e-05, + "loss": 2.8563, + "step": 6345 + }, + { + "epoch": 3.19, + "learning_rate": 1.4551412775185308e-05, + "loss": 2.5647, + "step": 6350 + }, + { + "epoch": 3.19, + "learning_rate": 1.4515606856163949e-05, + "loss": 2.5023, + "step": 6355 + }, + { + "epoch": 3.19, + "learning_rate": 1.4479827014542363e-05, + "loss": 2.347, + "step": 6360 + }, + { + "epoch": 3.19, + "learning_rate": 1.4444073339314284e-05, + "loss": 2.6892, + "step": 6365 + }, + { + "epoch": 3.2, + "learning_rate": 1.4408345919408359e-05, + "loss": 2.5874, + "step": 6370 + }, + { + "epoch": 3.2, + "learning_rate": 1.4372644843687922e-05, + "loss": 2.3453, + "step": 6375 + }, + { + "epoch": 3.2, + "learning_rate": 1.4336970200950794e-05, + "loss": 2.4236, + "step": 6380 + }, + { + "epoch": 3.2, + "learning_rate": 1.4301322079929053e-05, + "loss": 2.6329, + "step": 6385 + }, + { + "epoch": 3.21, + "learning_rate": 
1.4265700569288792e-05, + "loss": 2.7761, + "step": 6390 + }, + { + "epoch": 3.21, + "learning_rate": 1.4230105757629936e-05, + "loss": 2.6791, + "step": 6395 + }, + { + "epoch": 3.21, + "learning_rate": 1.4194537733485994e-05, + "loss": 2.6064, + "step": 6400 + }, + { + "epoch": 3.21, + "learning_rate": 1.4158996585323841e-05, + "loss": 2.3809, + "step": 6405 + }, + { + "epoch": 3.22, + "learning_rate": 1.4123482401543531e-05, + "loss": 2.5205, + "step": 6410 + }, + { + "epoch": 3.22, + "learning_rate": 1.4087995270478021e-05, + "loss": 2.524, + "step": 6415 + }, + { + "epoch": 3.22, + "learning_rate": 1.4052535280392999e-05, + "loss": 2.2721, + "step": 6420 + }, + { + "epoch": 3.22, + "learning_rate": 1.401710251948663e-05, + "loss": 2.5879, + "step": 6425 + }, + { + "epoch": 3.23, + "learning_rate": 1.3981697075889372e-05, + "loss": 2.6147, + "step": 6430 + }, + { + "epoch": 3.23, + "learning_rate": 1.394631903766373e-05, + "loss": 2.5308, + "step": 6435 + }, + { + "epoch": 3.23, + "learning_rate": 1.3910968492804028e-05, + "loss": 2.4739, + "step": 6440 + }, + { + "epoch": 3.23, + "learning_rate": 1.3875645529236234e-05, + "loss": 2.4483, + "step": 6445 + }, + { + "epoch": 3.24, + "learning_rate": 1.3840350234817686e-05, + "loss": 2.6367, + "step": 6450 + }, + { + "epoch": 3.24, + "learning_rate": 1.3805082697336943e-05, + "loss": 2.6567, + "step": 6455 + }, + { + "epoch": 3.24, + "learning_rate": 1.3769843004513489e-05, + "loss": 2.52, + "step": 6460 + }, + { + "epoch": 3.24, + "learning_rate": 1.3734631243997561e-05, + "loss": 2.6544, + "step": 6465 + }, + { + "epoch": 3.25, + "learning_rate": 1.3699447503369925e-05, + "loss": 2.3696, + "step": 6470 + }, + { + "epoch": 3.25, + "learning_rate": 1.3664291870141649e-05, + "loss": 2.4517, + "step": 6475 + }, + { + "epoch": 3.25, + "learning_rate": 1.3629164431753894e-05, + "loss": 2.6313, + "step": 6480 + }, + { + "epoch": 3.25, + "learning_rate": 1.3594065275577692e-05, + "loss": 2.4032, + "step": 6485 + }, + { 
+ "epoch": 3.26, + "learning_rate": 1.3558994488913731e-05, + "loss": 2.7063, + "step": 6490 + }, + { + "epoch": 3.26, + "learning_rate": 1.3523952158992136e-05, + "loss": 2.6109, + "step": 6495 + }, + { + "epoch": 3.26, + "learning_rate": 1.3488938372972257e-05, + "loss": 2.633, + "step": 6500 + }, + { + "epoch": 3.26, + "learning_rate": 1.3453953217942436e-05, + "loss": 2.5565, + "step": 6505 + }, + { + "epoch": 3.27, + "learning_rate": 1.3418996780919804e-05, + "loss": 2.5866, + "step": 6510 + }, + { + "epoch": 3.27, + "learning_rate": 1.3384069148850087e-05, + "loss": 2.5992, + "step": 6515 + }, + { + "epoch": 3.27, + "learning_rate": 1.3349170408607342e-05, + "loss": 2.4388, + "step": 6520 + }, + { + "epoch": 3.27, + "learning_rate": 1.3314300646993771e-05, + "loss": 2.2734, + "step": 6525 + }, + { + "epoch": 3.28, + "learning_rate": 1.3279459950739489e-05, + "loss": 2.7683, + "step": 6530 + }, + { + "epoch": 3.28, + "learning_rate": 1.3244648406502331e-05, + "loss": 2.3653, + "step": 6535 + }, + { + "epoch": 3.28, + "learning_rate": 1.3209866100867613e-05, + "loss": 2.6401, + "step": 6540 + }, + { + "epoch": 3.28, + "learning_rate": 1.3175113120347943e-05, + "loss": 2.5218, + "step": 6545 + }, + { + "epoch": 3.29, + "learning_rate": 1.3140389551382975e-05, + "loss": 2.4681, + "step": 6550 + }, + { + "epoch": 3.29, + "learning_rate": 1.3105695480339206e-05, + "loss": 2.4681, + "step": 6555 + }, + { + "epoch": 3.29, + "learning_rate": 1.3071030993509788e-05, + "loss": 2.5743, + "step": 6560 + }, + { + "epoch": 3.29, + "learning_rate": 1.303639617711427e-05, + "loss": 2.5423, + "step": 6565 + }, + { + "epoch": 3.3, + "learning_rate": 1.3001791117298395e-05, + "loss": 2.4267, + "step": 6570 + }, + { + "epoch": 3.3, + "learning_rate": 1.2967215900133911e-05, + "loss": 2.5537, + "step": 6575 + }, + { + "epoch": 3.3, + "learning_rate": 1.2932670611618336e-05, + "loss": 2.5451, + "step": 6580 + }, + { + "epoch": 3.3, + "learning_rate": 1.2898155337674744e-05, + 
"loss": 2.4048, + "step": 6585 + }, + { + "epoch": 3.31, + "learning_rate": 1.2863670164151551e-05, + "loss": 2.6769, + "step": 6590 + }, + { + "epoch": 3.31, + "learning_rate": 1.2829215176822316e-05, + "loss": 2.2118, + "step": 6595 + }, + { + "epoch": 3.31, + "learning_rate": 1.2794790461385508e-05, + "loss": 2.2912, + "step": 6600 + }, + { + "epoch": 3.31, + "learning_rate": 1.2760396103464309e-05, + "loss": 2.3978, + "step": 6605 + }, + { + "epoch": 3.32, + "learning_rate": 1.2726032188606388e-05, + "loss": 2.4801, + "step": 6610 + }, + { + "epoch": 3.32, + "learning_rate": 1.2691698802283697e-05, + "loss": 2.5522, + "step": 6615 + }, + { + "epoch": 3.32, + "learning_rate": 1.2657396029892258e-05, + "loss": 2.6728, + "step": 6620 + }, + { + "epoch": 3.32, + "learning_rate": 1.2623123956751943e-05, + "loss": 2.2937, + "step": 6625 + }, + { + "epoch": 3.33, + "learning_rate": 1.258888266810627e-05, + "loss": 2.5459, + "step": 6630 + }, + { + "epoch": 3.33, + "learning_rate": 1.2554672249122187e-05, + "loss": 2.6329, + "step": 6635 + }, + { + "epoch": 3.33, + "learning_rate": 1.2520492784889865e-05, + "loss": 2.4845, + "step": 6640 + }, + { + "epoch": 3.33, + "learning_rate": 1.2486344360422475e-05, + "loss": 2.5023, + "step": 6645 + }, + { + "epoch": 3.34, + "learning_rate": 1.2452227060655993e-05, + "loss": 2.5674, + "step": 6650 + }, + { + "epoch": 3.34, + "learning_rate": 1.2418140970448975e-05, + "loss": 2.4996, + "step": 6655 + }, + { + "epoch": 3.34, + "learning_rate": 1.2384086174582336e-05, + "loss": 2.4704, + "step": 6660 + }, + { + "epoch": 3.34, + "learning_rate": 1.2350062757759193e-05, + "loss": 2.4928, + "step": 6665 + }, + { + "epoch": 3.35, + "learning_rate": 1.2316070804604576e-05, + "loss": 2.5498, + "step": 6670 + }, + { + "epoch": 3.35, + "learning_rate": 1.228211039966528e-05, + "loss": 2.5641, + "step": 6675 + }, + { + "epoch": 3.35, + "learning_rate": 1.2248181627409619e-05, + "loss": 2.5725, + "step": 6680 + }, + { + "epoch": 3.35, + 
"learning_rate": 1.221428457222723e-05, + "loss": 2.5827, + "step": 6685 + }, + { + "epoch": 3.36, + "learning_rate": 1.2180419318428868e-05, + "loss": 2.3591, + "step": 6690 + }, + { + "epoch": 3.36, + "learning_rate": 1.2146585950246186e-05, + "loss": 2.5772, + "step": 6695 + }, + { + "epoch": 3.36, + "learning_rate": 1.2112784551831533e-05, + "loss": 2.4008, + "step": 6700 + }, + { + "epoch": 3.36, + "learning_rate": 1.2079015207257724e-05, + "loss": 2.3334, + "step": 6705 + }, + { + "epoch": 3.37, + "learning_rate": 1.2045278000517857e-05, + "loss": 2.8023, + "step": 6710 + }, + { + "epoch": 3.37, + "learning_rate": 1.2011573015525118e-05, + "loss": 2.6145, + "step": 6715 + }, + { + "epoch": 3.37, + "learning_rate": 1.1977900336112519e-05, + "loss": 2.6568, + "step": 6720 + }, + { + "epoch": 3.37, + "learning_rate": 1.1944260046032735e-05, + "loss": 2.1771, + "step": 6725 + }, + { + "epoch": 3.38, + "learning_rate": 1.1910652228957872e-05, + "loss": 2.4932, + "step": 6730 + }, + { + "epoch": 3.38, + "learning_rate": 1.187707696847927e-05, + "loss": 2.3883, + "step": 6735 + }, + { + "epoch": 3.38, + "learning_rate": 1.1843534348107294e-05, + "loss": 2.7792, + "step": 6740 + }, + { + "epoch": 3.38, + "learning_rate": 1.1810024451271125e-05, + "loss": 2.5825, + "step": 6745 + }, + { + "epoch": 3.39, + "learning_rate": 1.1776547361318551e-05, + "loss": 2.406, + "step": 6750 + }, + { + "epoch": 3.39, + "learning_rate": 1.1743103161515762e-05, + "loss": 2.5823, + "step": 6755 + }, + { + "epoch": 3.39, + "learning_rate": 1.1709691935047137e-05, + "loss": 2.7587, + "step": 6760 + }, + { + "epoch": 3.39, + "learning_rate": 1.1676313765015038e-05, + "loss": 2.5183, + "step": 6765 + }, + { + "epoch": 3.4, + "learning_rate": 1.1642968734439633e-05, + "loss": 2.6452, + "step": 6770 + }, + { + "epoch": 3.4, + "learning_rate": 1.1609656926258634e-05, + "loss": 2.4641, + "step": 6775 + }, + { + "epoch": 3.4, + "learning_rate": 1.1576378423327131e-05, + "loss": 2.6462, + 
"step": 6780 + }, + { + "epoch": 3.4, + "learning_rate": 1.1543133308417378e-05, + "loss": 2.5271, + "step": 6785 + }, + { + "epoch": 3.41, + "learning_rate": 1.1509921664218587e-05, + "loss": 2.4245, + "step": 6790 + }, + { + "epoch": 3.41, + "learning_rate": 1.14767435733367e-05, + "loss": 2.3622, + "step": 6795 + }, + { + "epoch": 3.41, + "learning_rate": 1.1443599118294227e-05, + "loss": 2.5564, + "step": 6800 + }, + { + "epoch": 3.41, + "learning_rate": 1.1410488381530005e-05, + "loss": 2.342, + "step": 6805 + }, + { + "epoch": 3.42, + "learning_rate": 1.1377411445399006e-05, + "loss": 2.4976, + "step": 6810 + }, + { + "epoch": 3.42, + "learning_rate": 1.1344368392172125e-05, + "loss": 2.4792, + "step": 6815 + }, + { + "epoch": 3.42, + "learning_rate": 1.1311359304036013e-05, + "loss": 2.4829, + "step": 6820 + }, + { + "epoch": 3.42, + "learning_rate": 1.1278384263092797e-05, + "loss": 2.3949, + "step": 6825 + }, + { + "epoch": 3.43, + "learning_rate": 1.124544335135995e-05, + "loss": 2.5555, + "step": 6830 + }, + { + "epoch": 3.43, + "learning_rate": 1.1212536650770041e-05, + "loss": 2.5479, + "step": 6835 + }, + { + "epoch": 3.43, + "learning_rate": 1.1179664243170554e-05, + "loss": 2.5333, + "step": 6840 + }, + { + "epoch": 3.43, + "learning_rate": 1.1146826210323677e-05, + "loss": 2.0832, + "step": 6845 + }, + { + "epoch": 3.44, + "learning_rate": 1.1114022633906096e-05, + "loss": 2.7639, + "step": 6850 + }, + { + "epoch": 3.44, + "learning_rate": 1.10812535955088e-05, + "loss": 2.663, + "step": 6855 + }, + { + "epoch": 3.44, + "learning_rate": 1.104851917663687e-05, + "loss": 2.485, + "step": 6860 + }, + { + "epoch": 3.44, + "learning_rate": 1.1015819458709279e-05, + "loss": 2.3004, + "step": 6865 + }, + { + "epoch": 3.45, + "learning_rate": 1.0983154523058687e-05, + "loss": 2.3924, + "step": 6870 + }, + { + "epoch": 3.45, + "learning_rate": 1.095052445093124e-05, + "loss": 2.4694, + "step": 6875 + }, + { + "epoch": 3.45, + "learning_rate": 
1.0917929323486398e-05, + "loss": 2.5255, + "step": 6880 + }, + { + "epoch": 3.46, + "learning_rate": 1.0885369221796657e-05, + "loss": 2.211, + "step": 6885 + }, + { + "epoch": 3.46, + "learning_rate": 1.0852844226847425e-05, + "loss": 2.5446, + "step": 6890 + }, + { + "epoch": 3.46, + "learning_rate": 1.0820354419536786e-05, + "loss": 2.778, + "step": 6895 + }, + { + "epoch": 3.46, + "learning_rate": 1.0787899880675298e-05, + "loss": 2.5628, + "step": 6900 + }, + { + "epoch": 3.47, + "learning_rate": 1.0755480690985803e-05, + "loss": 2.5333, + "step": 6905 + }, + { + "epoch": 3.47, + "learning_rate": 1.0723096931103218e-05, + "loss": 2.7511, + "step": 6910 + }, + { + "epoch": 3.47, + "learning_rate": 1.0690748681574336e-05, + "loss": 2.2807, + "step": 6915 + }, + { + "epoch": 3.47, + "learning_rate": 1.0658436022857617e-05, + "loss": 2.5652, + "step": 6920 + }, + { + "epoch": 3.48, + "learning_rate": 1.062615903532303e-05, + "loss": 2.7855, + "step": 6925 + }, + { + "epoch": 3.48, + "learning_rate": 1.0593917799251785e-05, + "loss": 2.5029, + "step": 6930 + }, + { + "epoch": 3.48, + "learning_rate": 1.0561712394836184e-05, + "loss": 2.3403, + "step": 6935 + }, + { + "epoch": 3.48, + "learning_rate": 1.0529542902179406e-05, + "loss": 2.748, + "step": 6940 + }, + { + "epoch": 3.49, + "learning_rate": 1.0497409401295303e-05, + "loss": 2.4717, + "step": 6945 + }, + { + "epoch": 3.49, + "learning_rate": 1.0465311972108214e-05, + "loss": 2.6532, + "step": 6950 + }, + { + "epoch": 3.49, + "learning_rate": 1.043325069445275e-05, + "loss": 2.3954, + "step": 6955 + }, + { + "epoch": 3.49, + "learning_rate": 1.0401225648073612e-05, + "loss": 2.4491, + "step": 6960 + }, + { + "epoch": 3.5, + "learning_rate": 1.0369236912625377e-05, + "loss": 2.8167, + "step": 6965 + }, + { + "epoch": 3.5, + "learning_rate": 1.0337284567672314e-05, + "loss": 2.4416, + "step": 6970 + }, + { + "epoch": 3.5, + "learning_rate": 1.0305368692688174e-05, + "loss": 2.4095, + "step": 6975 + }, + { + 
"epoch": 3.5, + "learning_rate": 1.0273489367056002e-05, + "loss": 2.6135, + "step": 6980 + }, + { + "epoch": 3.51, + "learning_rate": 1.0241646670067932e-05, + "loss": 2.7131, + "step": 6985 + }, + { + "epoch": 3.51, + "learning_rate": 1.0209840680924993e-05, + "loss": 2.45, + "step": 6990 + }, + { + "epoch": 3.51, + "learning_rate": 1.0178071478736914e-05, + "loss": 2.4902, + "step": 6995 + }, + { + "epoch": 3.51, + "learning_rate": 1.0146339142521926e-05, + "loss": 2.572, + "step": 7000 + }, + { + "epoch": 3.52, + "learning_rate": 1.0114643751206562e-05, + "loss": 2.3915, + "step": 7005 + }, + { + "epoch": 3.52, + "learning_rate": 1.0082985383625468e-05, + "loss": 2.3651, + "step": 7010 + }, + { + "epoch": 3.52, + "learning_rate": 1.0051364118521197e-05, + "loss": 2.0744, + "step": 7015 + }, + { + "epoch": 3.52, + "learning_rate": 1.0019780034544022e-05, + "loss": 2.3651, + "step": 7020 + }, + { + "epoch": 3.53, + "learning_rate": 9.988233210251723e-06, + "loss": 2.423, + "step": 7025 + }, + { + "epoch": 3.53, + "learning_rate": 9.956723724109441e-06, + "loss": 2.7706, + "step": 7030 + }, + { + "epoch": 3.53, + "learning_rate": 9.925251654489415e-06, + "loss": 2.4113, + "step": 7035 + }, + { + "epoch": 3.53, + "learning_rate": 9.893817079670825e-06, + "loss": 2.2758, + "step": 7040 + }, + { + "epoch": 3.54, + "learning_rate": 9.8624200778396e-06, + "loss": 2.4551, + "step": 7045 + }, + { + "epoch": 3.54, + "learning_rate": 9.831060727088215e-06, + "loss": 2.3271, + "step": 7050 + }, + { + "epoch": 3.54, + "learning_rate": 9.799739105415483e-06, + "loss": 2.348, + "step": 7055 + }, + { + "epoch": 3.54, + "learning_rate": 9.768455290726402e-06, + "loss": 2.803, + "step": 7060 + }, + { + "epoch": 3.55, + "learning_rate": 9.737209360831895e-06, + "loss": 2.4977, + "step": 7065 + }, + { + "epoch": 3.55, + "learning_rate": 9.70600139344868e-06, + "loss": 2.6904, + "step": 7070 + }, + { + "epoch": 3.55, + "learning_rate": 9.67483146619907e-06, + "loss": 2.4839, + 
"step": 7075 + }, + { + "epoch": 3.55, + "learning_rate": 9.64369965661073e-06, + "loss": 2.6168, + "step": 7080 + }, + { + "epoch": 3.56, + "learning_rate": 9.612606042116535e-06, + "loss": 2.3343, + "step": 7085 + }, + { + "epoch": 3.56, + "learning_rate": 9.581550700054345e-06, + "loss": 2.4697, + "step": 7090 + }, + { + "epoch": 3.56, + "learning_rate": 9.550533707666842e-06, + "loss": 2.7164, + "step": 7095 + }, + { + "epoch": 3.56, + "learning_rate": 9.519555142101311e-06, + "loss": 2.5116, + "step": 7100 + }, + { + "epoch": 3.57, + "learning_rate": 9.488615080409468e-06, + "loss": 2.4768, + "step": 7105 + }, + { + "epoch": 3.57, + "learning_rate": 9.457713599547252e-06, + "loss": 2.4756, + "step": 7110 + }, + { + "epoch": 3.57, + "learning_rate": 9.426850776374646e-06, + "loss": 2.4257, + "step": 7115 + }, + { + "epoch": 3.57, + "learning_rate": 9.396026687655483e-06, + "loss": 2.5385, + "step": 7120 + }, + { + "epoch": 3.58, + "learning_rate": 9.365241410057246e-06, + "loss": 2.5497, + "step": 7125 + }, + { + "epoch": 3.58, + "learning_rate": 9.334495020150885e-06, + "loss": 2.5848, + "step": 7130 + }, + { + "epoch": 3.58, + "learning_rate": 9.303787594410648e-06, + "loss": 2.5811, + "step": 7135 + }, + { + "epoch": 3.58, + "learning_rate": 9.273119209213841e-06, + "loss": 2.2504, + "step": 7140 + }, + { + "epoch": 3.59, + "learning_rate": 9.242489940840684e-06, + "loss": 2.4348, + "step": 7145 + }, + { + "epoch": 3.59, + "learning_rate": 9.211899865474086e-06, + "loss": 2.6538, + "step": 7150 + }, + { + "epoch": 3.59, + "learning_rate": 9.181349059199484e-06, + "loss": 2.9365, + "step": 7155 + }, + { + "epoch": 3.59, + "learning_rate": 9.150837598004648e-06, + "loss": 2.4267, + "step": 7160 + }, + { + "epoch": 3.6, + "learning_rate": 9.120365557779472e-06, + "loss": 2.3872, + "step": 7165 + }, + { + "epoch": 3.6, + "learning_rate": 9.089933014315818e-06, + "loss": 2.5116, + "step": 7170 + }, + { + "epoch": 3.6, + "learning_rate": 9.059540043307293e-06, + 
"loss": 2.3202, + "step": 7175 + }, + { + "epoch": 3.6, + "learning_rate": 9.029186720349078e-06, + "loss": 2.7859, + "step": 7180 + }, + { + "epoch": 3.61, + "learning_rate": 8.998873120937762e-06, + "loss": 2.6064, + "step": 7185 + }, + { + "epoch": 3.61, + "learning_rate": 8.968599320471102e-06, + "loss": 2.8572, + "step": 7190 + }, + { + "epoch": 3.61, + "learning_rate": 8.938365394247877e-06, + "loss": 2.4965, + "step": 7195 + }, + { + "epoch": 3.61, + "learning_rate": 8.908171417467692e-06, + "loss": 2.7261, + "step": 7200 + }, + { + "epoch": 3.62, + "learning_rate": 8.878017465230778e-06, + "loss": 2.6582, + "step": 7205 + }, + { + "epoch": 3.62, + "learning_rate": 8.847903612537826e-06, + "loss": 2.3756, + "step": 7210 + }, + { + "epoch": 3.62, + "learning_rate": 8.817829934289775e-06, + "loss": 2.5582, + "step": 7215 + }, + { + "epoch": 3.62, + "learning_rate": 8.787796505287657e-06, + "loss": 2.6091, + "step": 7220 + }, + { + "epoch": 3.63, + "learning_rate": 8.757803400232379e-06, + "loss": 2.5523, + "step": 7225 + }, + { + "epoch": 3.63, + "learning_rate": 8.727850693724558e-06, + "loss": 2.6721, + "step": 7230 + }, + { + "epoch": 3.63, + "learning_rate": 8.697938460264326e-06, + "loss": 2.6035, + "step": 7235 + }, + { + "epoch": 3.63, + "learning_rate": 8.668066774251158e-06, + "loss": 2.4755, + "step": 7240 + }, + { + "epoch": 3.64, + "learning_rate": 8.638235709983664e-06, + "loss": 2.6591, + "step": 7245 + }, + { + "epoch": 3.64, + "learning_rate": 8.608445341659423e-06, + "loss": 2.3781, + "step": 7250 + }, + { + "epoch": 3.64, + "learning_rate": 8.578695743374798e-06, + "loss": 2.5149, + "step": 7255 + }, + { + "epoch": 3.64, + "learning_rate": 8.548986989124737e-06, + "loss": 2.6264, + "step": 7260 + }, + { + "epoch": 3.65, + "learning_rate": 8.519319152802601e-06, + "loss": 2.638, + "step": 7265 + }, + { + "epoch": 3.65, + "learning_rate": 8.489692308199981e-06, + "loss": 2.4959, + "step": 7270 + }, + { + "epoch": 3.65, + "learning_rate": 
8.460106529006511e-06, + "loss": 2.3365, + "step": 7275 + }, + { + "epoch": 3.65, + "learning_rate": 8.430561888809676e-06, + "loss": 2.3178, + "step": 7280 + }, + { + "epoch": 3.66, + "learning_rate": 8.401058461094643e-06, + "loss": 2.5691, + "step": 7285 + }, + { + "epoch": 3.66, + "learning_rate": 8.371596319244087e-06, + "loss": 2.4521, + "step": 7290 + }, + { + "epoch": 3.66, + "learning_rate": 8.342175536537975e-06, + "loss": 2.6887, + "step": 7295 + }, + { + "epoch": 3.66, + "learning_rate": 8.312796186153405e-06, + "loss": 2.2551, + "step": 7300 + }, + { + "epoch": 3.67, + "learning_rate": 8.283458341164432e-06, + "loss": 2.5463, + "step": 7305 + }, + { + "epoch": 3.67, + "learning_rate": 8.254162074541868e-06, + "loss": 2.6583, + "step": 7310 + }, + { + "epoch": 3.67, + "learning_rate": 8.224907459153114e-06, + "loss": 2.5084, + "step": 7315 + }, + { + "epoch": 3.67, + "learning_rate": 8.195694567761968e-06, + "loss": 2.3259, + "step": 7320 + }, + { + "epoch": 3.68, + "learning_rate": 8.166523473028465e-06, + "loss": 2.3955, + "step": 7325 + }, + { + "epoch": 3.68, + "learning_rate": 8.137394247508644e-06, + "loss": 2.5088, + "step": 7330 + }, + { + "epoch": 3.68, + "learning_rate": 8.108306963654452e-06, + "loss": 2.5981, + "step": 7335 + }, + { + "epoch": 3.68, + "learning_rate": 8.079261693813487e-06, + "loss": 2.6233, + "step": 7340 + }, + { + "epoch": 3.69, + "learning_rate": 8.05025851022885e-06, + "loss": 2.4973, + "step": 7345 + }, + { + "epoch": 3.69, + "learning_rate": 8.02129748503897e-06, + "loss": 2.5353, + "step": 7350 + }, + { + "epoch": 3.69, + "learning_rate": 7.992378690277416e-06, + "loss": 2.5229, + "step": 7355 + }, + { + "epoch": 3.69, + "learning_rate": 7.96350219787271e-06, + "loss": 2.2312, + "step": 7360 + }, + { + "epoch": 3.7, + "learning_rate": 7.93466807964817e-06, + "loss": 2.6011, + "step": 7365 + }, + { + "epoch": 3.7, + "learning_rate": 7.905876407321711e-06, + "loss": 2.4813, + "step": 7370 + }, + { + "epoch": 3.7, + 
"learning_rate": 7.87712725250567e-06, + "loss": 2.4722, + "step": 7375 + }, + { + "epoch": 3.7, + "learning_rate": 7.848420686706643e-06, + "loss": 2.6481, + "step": 7380 + }, + { + "epoch": 3.71, + "learning_rate": 7.819756781325285e-06, + "loss": 2.5964, + "step": 7385 + }, + { + "epoch": 3.71, + "learning_rate": 7.791135607656147e-06, + "loss": 2.4698, + "step": 7390 + }, + { + "epoch": 3.71, + "learning_rate": 7.762557236887507e-06, + "loss": 2.6941, + "step": 7395 + }, + { + "epoch": 3.71, + "learning_rate": 7.734021740101168e-06, + "loss": 2.6679, + "step": 7400 + }, + { + "epoch": 3.72, + "learning_rate": 7.705529188272295e-06, + "loss": 2.7456, + "step": 7405 + }, + { + "epoch": 3.72, + "learning_rate": 7.67707965226924e-06, + "loss": 2.3179, + "step": 7410 + }, + { + "epoch": 3.72, + "learning_rate": 7.64867320285337e-06, + "loss": 2.5265, + "step": 7415 + }, + { + "epoch": 3.72, + "learning_rate": 7.620309910678866e-06, + "loss": 2.4766, + "step": 7420 + }, + { + "epoch": 3.73, + "learning_rate": 7.59198984629258e-06, + "loss": 2.5339, + "step": 7425 + }, + { + "epoch": 3.73, + "learning_rate": 7.56371308013385e-06, + "loss": 2.5665, + "step": 7430 + }, + { + "epoch": 3.73, + "learning_rate": 7.535479682534302e-06, + "loss": 2.6048, + "step": 7435 + }, + { + "epoch": 3.73, + "learning_rate": 7.50728972371772e-06, + "loss": 2.5792, + "step": 7440 + }, + { + "epoch": 3.74, + "learning_rate": 7.479143273799818e-06, + "loss": 2.6327, + "step": 7445 + }, + { + "epoch": 3.74, + "learning_rate": 7.451040402788109e-06, + "loss": 2.4764, + "step": 7450 + }, + { + "epoch": 3.74, + "learning_rate": 7.4229811805817065e-06, + "loss": 2.5359, + "step": 7455 + }, + { + "epoch": 3.74, + "learning_rate": 7.394965676971158e-06, + "loss": 2.3672, + "step": 7460 + }, + { + "epoch": 3.75, + "learning_rate": 7.3669939616382744e-06, + "loss": 2.2471, + "step": 7465 + }, + { + "epoch": 3.75, + "learning_rate": 7.33906610415595e-06, + "loss": 2.2825, + "step": 7470 + }, + { + 
"epoch": 3.75, + "learning_rate": 7.311182173987999e-06, + "loss": 2.8013, + "step": 7475 + }, + { + "epoch": 3.75, + "learning_rate": 7.283342240488972e-06, + "loss": 2.6741, + "step": 7480 + }, + { + "epoch": 3.76, + "learning_rate": 7.25554637290399e-06, + "loss": 2.4988, + "step": 7485 + }, + { + "epoch": 3.76, + "learning_rate": 7.227794640368573e-06, + "loss": 2.6571, + "step": 7490 + }, + { + "epoch": 3.76, + "learning_rate": 7.2000871119084575e-06, + "loss": 2.7367, + "step": 7495 + }, + { + "epoch": 3.76, + "learning_rate": 7.172423856439459e-06, + "loss": 2.6667, + "step": 7500 + }, + { + "epoch": 3.77, + "learning_rate": 7.144804942767231e-06, + "loss": 2.6674, + "step": 7505 + }, + { + "epoch": 3.77, + "learning_rate": 7.117230439587172e-06, + "loss": 2.8285, + "step": 7510 + }, + { + "epoch": 3.77, + "learning_rate": 7.0952028586902694e-06, + "loss": 2.512, + "step": 7515 + }, + { + "epoch": 3.77, + "learning_rate": 7.067708467155793e-06, + "loss": 2.6181, + "step": 7520 + }, + { + "epoch": 3.78, + "learning_rate": 7.040258677872366e-06, + "loss": 2.5267, + "step": 7525 + }, + { + "epoch": 3.78, + "learning_rate": 7.012853559114737e-06, + "loss": 2.4466, + "step": 7530 + }, + { + "epoch": 3.78, + "learning_rate": 6.985493179046529e-06, + "loss": 2.1915, + "step": 7535 + }, + { + "epoch": 3.78, + "learning_rate": 6.958177605720082e-06, + "loss": 2.427, + "step": 7540 + }, + { + "epoch": 3.79, + "learning_rate": 6.930906907076301e-06, + "loss": 2.3777, + "step": 7545 + }, + { + "epoch": 3.79, + "learning_rate": 6.9036811509444715e-06, + "loss": 2.3888, + "step": 7550 + }, + { + "epoch": 3.79, + "learning_rate": 6.8765004050421075e-06, + "loss": 2.6918, + "step": 7555 + }, + { + "epoch": 3.79, + "learning_rate": 6.849364736974745e-06, + "loss": 2.4888, + "step": 7560 + }, + { + "epoch": 3.8, + "learning_rate": 6.822274214235819e-06, + "loss": 2.5234, + "step": 7565 + }, + { + "epoch": 3.8, + "learning_rate": 6.7952289042064655e-06, + "loss": 2.5397, + 
"step": 7570 + }, + { + "epoch": 3.8, + "learning_rate": 6.768228874155388e-06, + "loss": 2.6419, + "step": 7575 + }, + { + "epoch": 3.8, + "learning_rate": 6.741274191238642e-06, + "loss": 2.5339, + "step": 7580 + }, + { + "epoch": 3.81, + "learning_rate": 6.7143649224995056e-06, + "loss": 2.4222, + "step": 7585 + }, + { + "epoch": 3.81, + "learning_rate": 6.68750113486829e-06, + "loss": 2.4719, + "step": 7590 + }, + { + "epoch": 3.81, + "learning_rate": 6.660682895162191e-06, + "loss": 2.6034, + "step": 7595 + }, + { + "epoch": 3.81, + "learning_rate": 6.6339102700851144e-06, + "loss": 2.4438, + "step": 7600 + }, + { + "epoch": 3.82, + "learning_rate": 6.607183326227509e-06, + "loss": 2.6244, + "step": 7605 + }, + { + "epoch": 3.82, + "learning_rate": 6.580502130066201e-06, + "loss": 2.4553, + "step": 7610 + }, + { + "epoch": 3.82, + "learning_rate": 6.5538667479642376e-06, + "loss": 2.7125, + "step": 7615 + }, + { + "epoch": 3.82, + "learning_rate": 6.527277246170702e-06, + "loss": 2.5128, + "step": 7620 + }, + { + "epoch": 3.83, + "learning_rate": 6.500733690820571e-06, + "loss": 2.3843, + "step": 7625 + }, + { + "epoch": 3.83, + "learning_rate": 6.474236147934529e-06, + "loss": 2.5529, + "step": 7630 + }, + { + "epoch": 3.83, + "learning_rate": 6.4477846834188425e-06, + "loss": 2.6161, + "step": 7635 + }, + { + "epoch": 3.83, + "learning_rate": 6.421379363065142e-06, + "loss": 2.4355, + "step": 7640 + }, + { + "epoch": 3.84, + "learning_rate": 6.395020252550302e-06, + "loss": 2.3781, + "step": 7645 + }, + { + "epoch": 3.84, + "learning_rate": 6.368707417436237e-06, + "loss": 2.3661, + "step": 7650 + }, + { + "epoch": 3.84, + "learning_rate": 6.34244092316979e-06, + "loss": 2.3473, + "step": 7655 + }, + { + "epoch": 3.84, + "learning_rate": 6.316220835082528e-06, + "loss": 2.6448, + "step": 7660 + }, + { + "epoch": 3.85, + "learning_rate": 6.290047218390605e-06, + "loss": 2.5152, + "step": 7665 + }, + { + "epoch": 3.85, + "learning_rate": 
6.2639201381945705e-06, + "loss": 2.317, + "step": 7670 + }, + { + "epoch": 3.85, + "learning_rate": 6.237839659479239e-06, + "loss": 2.4129, + "step": 7675 + }, + { + "epoch": 3.85, + "learning_rate": 6.2118058471135195e-06, + "loss": 2.196, + "step": 7680 + }, + { + "epoch": 3.86, + "learning_rate": 6.185818765850238e-06, + "loss": 2.6682, + "step": 7685 + }, + { + "epoch": 3.86, + "learning_rate": 6.159878480325995e-06, + "loss": 2.4091, + "step": 7690 + }, + { + "epoch": 3.86, + "learning_rate": 6.133985055060992e-06, + "loss": 2.7428, + "step": 7695 + }, + { + "epoch": 3.86, + "learning_rate": 6.108138554458881e-06, + "loss": 2.4382, + "step": 7700 + }, + { + "epoch": 3.87, + "learning_rate": 6.082339042806601e-06, + "loss": 2.5848, + "step": 7705 + }, + { + "epoch": 3.87, + "learning_rate": 6.056586584274218e-06, + "loss": 2.2978, + "step": 7710 + }, + { + "epoch": 3.87, + "learning_rate": 6.030881242914757e-06, + "loss": 2.6048, + "step": 7715 + }, + { + "epoch": 3.87, + "learning_rate": 6.005223082664063e-06, + "loss": 2.4201, + "step": 7720 + }, + { + "epoch": 3.88, + "learning_rate": 5.9796121673406174e-06, + "loss": 2.4145, + "step": 7725 + }, + { + "epoch": 3.88, + "learning_rate": 5.954048560645398e-06, + "loss": 2.3145, + "step": 7730 + }, + { + "epoch": 3.88, + "learning_rate": 5.928532326161712e-06, + "loss": 2.6399, + "step": 7735 + }, + { + "epoch": 3.88, + "learning_rate": 5.9030635273550404e-06, + "loss": 2.6007, + "step": 7740 + }, + { + "epoch": 3.89, + "learning_rate": 5.8776422275728774e-06, + "loss": 2.3568, + "step": 7745 + }, + { + "epoch": 3.89, + "learning_rate": 5.8522684900445765e-06, + "loss": 2.63, + "step": 7750 + }, + { + "epoch": 3.89, + "learning_rate": 5.826942377881195e-06, + "loss": 2.3783, + "step": 7755 + }, + { + "epoch": 3.89, + "learning_rate": 5.8016639540753234e-06, + "loss": 2.6271, + "step": 7760 + }, + { + "epoch": 3.9, + "learning_rate": 5.776433281500951e-06, + "loss": 2.4406, + "step": 7765 + }, + { + "epoch": 
3.9, + "learning_rate": 5.75125042291329e-06, + "loss": 2.4872, + "step": 7770 + }, + { + "epoch": 3.9, + "learning_rate": 5.726115440948626e-06, + "loss": 2.3784, + "step": 7775 + }, + { + "epoch": 3.9, + "learning_rate": 5.70102839812417e-06, + "loss": 2.3692, + "step": 7780 + }, + { + "epoch": 3.91, + "learning_rate": 5.675989356837879e-06, + "loss": 2.2512, + "step": 7785 + }, + { + "epoch": 3.91, + "learning_rate": 5.6509983793683525e-06, + "loss": 2.3779, + "step": 7790 + }, + { + "epoch": 3.91, + "learning_rate": 5.626055527874605e-06, + "loss": 2.59, + "step": 7795 + }, + { + "epoch": 3.91, + "learning_rate": 5.601160864395971e-06, + "loss": 2.3533, + "step": 7800 + }, + { + "epoch": 3.92, + "learning_rate": 5.576314450851922e-06, + "loss": 2.8845, + "step": 7805 + }, + { + "epoch": 3.92, + "learning_rate": 5.5515163490419155e-06, + "loss": 2.6464, + "step": 7810 + }, + { + "epoch": 3.92, + "learning_rate": 5.526766620645258e-06, + "loss": 2.5061, + "step": 7815 + }, + { + "epoch": 3.92, + "learning_rate": 5.5020653272209235e-06, + "loss": 2.3723, + "step": 7820 + }, + { + "epoch": 3.93, + "learning_rate": 5.477412530207435e-06, + "loss": 2.5008, + "step": 7825 + }, + { + "epoch": 3.93, + "learning_rate": 5.452808290922656e-06, + "loss": 2.7899, + "step": 7830 + }, + { + "epoch": 3.93, + "learning_rate": 5.428252670563721e-06, + "loss": 2.6914, + "step": 7835 + }, + { + "epoch": 3.93, + "learning_rate": 5.403745730206811e-06, + "loss": 2.7228, + "step": 7840 + }, + { + "epoch": 3.94, + "learning_rate": 5.379287530807023e-06, + "loss": 2.4227, + "step": 7845 + }, + { + "epoch": 3.94, + "learning_rate": 5.354878133198237e-06, + "loss": 2.5355, + "step": 7850 + }, + { + "epoch": 3.94, + "learning_rate": 5.33051759809294e-06, + "loss": 2.4151, + "step": 7855 + }, + { + "epoch": 3.94, + "learning_rate": 5.3062059860820915e-06, + "loss": 2.5943, + "step": 7860 + }, + { + "epoch": 3.95, + "learning_rate": 5.281943357634961e-06, + "loss": 2.5721, + "step": 7865 + 
}, + { + "epoch": 3.95, + "learning_rate": 5.257729773098988e-06, + "loss": 2.2355, + "step": 7870 + }, + { + "epoch": 3.95, + "learning_rate": 5.233565292699624e-06, + "loss": 2.6952, + "step": 7875 + }, + { + "epoch": 3.95, + "learning_rate": 5.209449976540187e-06, + "loss": 2.5381, + "step": 7880 + }, + { + "epoch": 3.96, + "learning_rate": 5.1853838846017135e-06, + "loss": 2.4474, + "step": 7885 + }, + { + "epoch": 3.96, + "learning_rate": 5.161367076742796e-06, + "loss": 2.4288, + "step": 7890 + }, + { + "epoch": 3.96, + "learning_rate": 5.1373996126994646e-06, + "loss": 2.5151, + "step": 7895 + }, + { + "epoch": 3.96, + "learning_rate": 5.113481552085001e-06, + "loss": 2.4231, + "step": 7900 + }, + { + "epoch": 3.97, + "learning_rate": 5.089612954389814e-06, + "loss": 2.5372, + "step": 7905 + }, + { + "epoch": 3.97, + "learning_rate": 5.06579387898129e-06, + "loss": 2.5082, + "step": 7910 + }, + { + "epoch": 3.97, + "learning_rate": 5.042024385103624e-06, + "loss": 2.3919, + "step": 7915 + }, + { + "epoch": 3.97, + "learning_rate": 5.018304531877704e-06, + "loss": 2.4821, + "step": 7920 + }, + { + "epoch": 3.98, + "learning_rate": 4.9946343783009495e-06, + "loss": 2.5556, + "step": 7925 + }, + { + "epoch": 3.98, + "learning_rate": 4.971013983247158e-06, + "loss": 2.4618, + "step": 7930 + }, + { + "epoch": 3.98, + "learning_rate": 4.947443405466357e-06, + "loss": 2.6289, + "step": 7935 + }, + { + "epoch": 3.98, + "learning_rate": 4.923922703584691e-06, + "loss": 2.652, + "step": 7940 + }, + { + "epoch": 3.99, + "learning_rate": 4.9004519361042275e-06, + "loss": 2.5852, + "step": 7945 + }, + { + "epoch": 3.99, + "learning_rate": 4.877031161402843e-06, + "loss": 2.3628, + "step": 7950 + }, + { + "epoch": 3.99, + "learning_rate": 4.853660437734062e-06, + "loss": 2.5503, + "step": 7955 + }, + { + "epoch": 3.99, + "learning_rate": 4.8303398232269255e-06, + "loss": 2.6815, + "step": 7960 + }, + { + "epoch": 4.0, + "learning_rate": 4.807069375885842e-06, + "loss": 
2.6943, + "step": 7965 + }, + { + "epoch": 4.0, + "learning_rate": 4.783849153590436e-06, + "loss": 2.3604, + "step": 7970 + }, + { + "epoch": 4.0, + "learning_rate": 4.760679214095409e-06, + "loss": 2.4289, + "step": 7975 + }, + { + "epoch": 4.0, + "learning_rate": 4.737559615030402e-06, + "loss": 2.3585, + "step": 7980 + }, + { + "epoch": 4.01, + "learning_rate": 4.714490413899839e-06, + "loss": 2.3335, + "step": 7985 + }, + { + "epoch": 4.01, + "learning_rate": 4.6914716680828e-06, + "loss": 2.5913, + "step": 7990 + }, + { + "epoch": 4.01, + "learning_rate": 4.668503434832852e-06, + "loss": 2.6069, + "step": 7995 + }, + { + "epoch": 4.01, + "learning_rate": 4.645585771277961e-06, + "loss": 2.332, + "step": 8000 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 4.234937815546921e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-8000/training_args.bin b/checkpoint-8000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-8000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475 diff --git a/checkpoint-8500/README.md b/checkpoint-8500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-8500/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More 
Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-8500/adapter_config.json b/checkpoint-8500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-8500/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-8500/adapter_model.safetensors b/checkpoint-8500/adapter_model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..5e9fdc16a0b5547a2a568d303a9a8c40c1fc8974 --- /dev/null +++ b/checkpoint-8500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:400d83b14980ded54e4563e99268f553a81acaf5d9f73ea2bbd2d035b2b49d68 +size 7807744 diff --git a/checkpoint-8500/optimizer.pt b/checkpoint-8500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..50381bf8b4eb20c13a37a3866fd7372802b84cbf --- /dev/null +++ b/checkpoint-8500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f7f40381ea4b4b9606d748d99ca2b81de0aca9ade184d1d2d420674b66c9910 +size 15644485 diff --git a/checkpoint-8500/rng_state.pth b/checkpoint-8500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e3029c945c38c11f23713f69cdc6e756a999c7f8 --- /dev/null +++ b/checkpoint-8500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf26c88952b6472bbc4f5f4a4ee25d8c61ec6c8dbbae309ce005701e2a2cd9ec +size 14575 diff --git a/checkpoint-8500/scheduler.pt b/checkpoint-8500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b35f1c462b3468badd5dc884c5742516e2bd0f0d --- /dev/null +++ b/checkpoint-8500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a19abc73d672246da97a27313e98adff9b054d43458aef2cfa3f6c1c071f4612 +size 627 diff --git a/checkpoint-8500/special_tokens_map.json b/checkpoint-8500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-8500/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false 
+ } + ] +} diff --git a/checkpoint-8500/tokenization_chatglm.py b/checkpoint-8500/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-8500/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) 
+ return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + 
def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. 
+ + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). 
+ return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-8500/tokenizer.model b/checkpoint-8500/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-8500/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-8500/tokenizer_config.json b/checkpoint-8500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ 
b/checkpoint-8500/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-8500/trainer_state.json b/checkpoint-8500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d865eaff150d56f68991daa85b22d29e8ad1801c --- /dev/null +++ b/checkpoint-8500/trainer_state.json @@ -0,0 +1,10221 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.265462300840547, + "eval_steps": 500, + "global_step": 8500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + 
"step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 
130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + 
"step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + 
"step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + 
"step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 
2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + "step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 
2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + "step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + 
"loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + "step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + 
"loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, + "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 
4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 4.8759297640025235e-05, + "loss": 2.9469, + "step": 1005 + }, + { + "epoch": 0.51, + "learning_rate": 4.874700152955661e-05, + "loss": 2.9745, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 4.8734646354147936e-05, + "loss": 2.9341, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 4.8722232144529754e-05, + "loss": 3.0511, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 4.870975893157941e-05, + "loss": 2.5396, + "step": 1025 + }, + { + "epoch": 0.52, + 
"learning_rate": 4.8697226746321004e-05, + "loss": 2.6699, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 4.868463561992532e-05, + "loss": 2.4887, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 4.867198558370977e-05, + "loss": 2.4773, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 4.865927666913825e-05, + "loss": 2.7612, + "step": 1045 + }, + { + "epoch": 0.53, + "learning_rate": 4.864650890782113e-05, + "loss": 2.9116, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 4.863368233151514e-05, + "loss": 2.8205, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 4.862079697212329e-05, + "loss": 2.6711, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 4.8607852861694804e-05, + "loss": 3.1138, + "step": 1065 + }, + { + "epoch": 0.54, + "learning_rate": 4.859485003242503e-05, + "loss": 2.5603, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 4.858178851665539e-05, + "loss": 2.7981, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 4.856866834687323e-05, + "loss": 2.507, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 4.855548955571183e-05, + "loss": 3.0315, + "step": 1085 + }, + { + "epoch": 0.55, + "learning_rate": 4.8542252175950244e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 4.852895624051326e-05, + "loss": 2.6794, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 4.851560178247132e-05, + "loss": 2.8278, + "step": 1100 + }, + { + "epoch": 0.55, + "learning_rate": 4.850218883504041e-05, + "loss": 2.6993, + "step": 1105 + }, + { + "epoch": 0.56, + "learning_rate": 4.8488717431582005e-05, + "loss": 2.875, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 4.8475187605602974e-05, + "loss": 2.6057, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 4.84615993907555e-05, + "loss": 2.847, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 4.844795282083697e-05, + "loss": 2.7652, + "step": 1125 + }, + { 
+ "epoch": 0.57, + "learning_rate": 4.843424792978997e-05, + "loss": 2.8128, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 4.842048475170209e-05, + "loss": 2.7077, + "step": 1135 + }, + { + "epoch": 0.57, + "learning_rate": 4.840666332080592e-05, + "loss": 2.7081, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 4.8392783671478934e-05, + "loss": 2.6479, + "step": 1145 + }, + { + "epoch": 0.58, + "learning_rate": 4.837884583824342e-05, + "loss": 2.667, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 4.836484985576638e-05, + "loss": 2.7514, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 4.835079575885944e-05, + "loss": 2.5058, + "step": 1160 + }, + { + "epoch": 0.58, + "learning_rate": 4.833668358247876e-05, + "loss": 2.6183, + "step": 1165 + }, + { + "epoch": 0.59, + "learning_rate": 4.8322513361725006e-05, + "loss": 2.9959, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 4.830828513184317e-05, + "loss": 2.5714, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 4.8293998928222536e-05, + "loss": 2.965, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 4.827965478639661e-05, + "loss": 2.2106, + "step": 1185 + }, + { + "epoch": 0.6, + "learning_rate": 4.8265252742042965e-05, + "loss": 2.7456, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 4.8250792830983225e-05, + "loss": 2.5694, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 4.8236275089182936e-05, + "loss": 2.6826, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 4.8221699552751465e-05, + "loss": 2.878, + "step": 1205 + }, + { + "epoch": 0.61, + "learning_rate": 4.820706625794196e-05, + "loss": 2.8191, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 4.81923752411512e-05, + "loss": 2.739, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 4.8177626538919565e-05, + "loss": 3.0544, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 4.8162820187930875e-05, + "loss": 2.8393, + 
"step": 1225 + }, + { + "epoch": 0.62, + "learning_rate": 4.814795622501237e-05, + "loss": 2.4457, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 4.813303468713456e-05, + "loss": 2.8575, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 4.8118055611411197e-05, + "loss": 2.8307, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 4.810301903509909e-05, + "loss": 2.7951, + "step": 1245 + }, + { + "epoch": 0.63, + "learning_rate": 4.8087924995598125e-05, + "loss": 2.8456, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 4.807277353045106e-05, + "loss": 2.3564, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 4.8057564677343524e-05, + "loss": 2.5076, + "step": 1260 + }, + { + "epoch": 0.63, + "learning_rate": 4.8042298474103884e-05, + "loss": 2.605, + "step": 1265 + }, + { + "epoch": 0.64, + "learning_rate": 4.8026974958703116e-05, + "loss": 2.4782, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 4.8011594169254784e-05, + "loss": 2.7193, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 4.799615614401488e-05, + "loss": 2.8284, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 4.798066092138178e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.65, + "learning_rate": 4.796510853989612e-05, + "loss": 2.7396, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 4.794949903824069e-05, + "loss": 2.7948, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 4.793383245524035e-05, + "loss": 2.7818, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 4.791810882986197e-05, + "loss": 2.7334, + "step": 1305 + }, + { + "epoch": 0.66, + "learning_rate": 4.7902328201214256e-05, + "loss": 2.4824, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 4.7886490608547727e-05, + "loss": 2.6131, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 4.7870596091254584e-05, + "loss": 2.7778, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 
4.7854644688868594e-05, + "loss": 2.5263, + "step": 1325 + }, + { + "epoch": 0.67, + "learning_rate": 4.783863644106502e-05, + "loss": 2.7825, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 4.782257138766053e-05, + "loss": 2.7902, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 4.7806449568613066e-05, + "loss": 2.8333, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 4.779027102402177e-05, + "loss": 2.856, + "step": 1345 + }, + { + "epoch": 0.68, + "learning_rate": 4.777403579412686e-05, + "loss": 2.8021, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 4.775774391930956e-05, + "loss": 2.6639, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741395440091976e-05, + "loss": 2.5226, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.772499039713702e-05, + "loss": 2.6803, + "step": 1365 + }, + { + "epoch": 0.69, + "learning_rate": 4.7708528831248274e-05, + "loss": 2.4608, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.769201078336991e-05, + "loss": 2.786, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.7675436294586586e-05, + "loss": 2.8294, + "step": 1380 + }, + { + "epoch": 0.7, + "learning_rate": 4.7658805406123356e-05, + "loss": 2.6776, + "step": 1385 + }, + { + "epoch": 0.7, + "learning_rate": 4.7642118159345544e-05, + "loss": 2.8003, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.762537459575865e-05, + "loss": 2.7796, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.7608574757008245e-05, + "loss": 2.9156, + "step": 1400 + }, + { + "epoch": 0.71, + "learning_rate": 4.7591718684879883e-05, + "loss": 3.0521, + "step": 1405 + }, + { + "epoch": 0.71, + "learning_rate": 4.7574806421298976e-05, + "loss": 2.6469, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.755783800833071e-05, + "loss": 2.8512, + "step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.754081348817991e-05, + "loss": 2.66, + "step": 1420 + }, + { + "epoch": 
0.72, + "learning_rate": 4.752373290319096e-05, + "loss": 2.6625, + "step": 1425 + }, + { + "epoch": 0.72, + "learning_rate": 4.7506596295847716e-05, + "loss": 2.6711, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 4.7489403708773346e-05, + "loss": 2.8951, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 4.747215518473026e-05, + "loss": 2.7375, + "step": 1440 + }, + { + "epoch": 0.73, + "learning_rate": 4.745485076662e-05, + "loss": 2.8037, + "step": 1445 + }, + { + "epoch": 0.73, + "learning_rate": 4.743749049748315e-05, + "loss": 2.5375, + "step": 1450 + }, + { + "epoch": 0.73, + "learning_rate": 4.742007442049918e-05, + "loss": 2.7664, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 4.7402602578986374e-05, + "loss": 2.9644, + "step": 1460 + }, + { + "epoch": 0.74, + "learning_rate": 4.738507501640175e-05, + "loss": 2.8212, + "step": 1465 + }, + { + "epoch": 0.74, + "learning_rate": 4.736749177634087e-05, + "loss": 2.7201, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 4.734985290253782e-05, + "loss": 2.7087, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 4.7332158438865035e-05, + "loss": 2.5502, + "step": 1480 + }, + { + "epoch": 0.75, + "learning_rate": 4.731440842933322e-05, + "loss": 2.7607, + "step": 1485 + }, + { + "epoch": 0.75, + "learning_rate": 4.729660291809126e-05, + "loss": 2.5601, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 4.727874194942606e-05, + "loss": 2.5562, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 4.7260825567762486e-05, + "loss": 2.5539, + "step": 1500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7242853817663204e-05, + "loss": 2.6405, + "step": 1505 + }, + { + "epoch": 0.76, + "learning_rate": 4.72248267438286e-05, + "loss": 2.9237, + "step": 1510 + }, + { + "epoch": 0.76, + "learning_rate": 4.72067443910967e-05, + "loss": 3.0453, + "step": 1515 + }, + { + "epoch": 0.76, + "learning_rate": 4.718860680444297e-05, + "loss": 2.7176, + "step": 
1520 + }, + { + "epoch": 0.77, + "learning_rate": 4.71704140289803e-05, + "loss": 2.6713, + "step": 1525 + }, + { + "epoch": 0.77, + "learning_rate": 4.715216610995883e-05, + "loss": 2.7284, + "step": 1530 + }, + { + "epoch": 0.77, + "learning_rate": 4.713386309276585e-05, + "loss": 2.5022, + "step": 1535 + }, + { + "epoch": 0.77, + "learning_rate": 4.7115505022925706e-05, + "loss": 2.8323, + "step": 1540 + }, + { + "epoch": 0.78, + "learning_rate": 4.7097091946099666e-05, + "loss": 2.7005, + "step": 1545 + }, + { + "epoch": 0.78, + "learning_rate": 4.7078623908085825e-05, + "loss": 2.8142, + "step": 1550 + }, + { + "epoch": 0.78, + "learning_rate": 4.7060100954818974e-05, + "loss": 2.8505, + "step": 1555 + }, + { + "epoch": 0.78, + "learning_rate": 4.70415231323705e-05, + "loss": 2.5892, + "step": 1560 + }, + { + "epoch": 0.79, + "learning_rate": 4.7022890486948236e-05, + "loss": 2.8951, + "step": 1565 + }, + { + "epoch": 0.79, + "learning_rate": 4.700420306489641e-05, + "loss": 2.7551, + "step": 1570 + }, + { + "epoch": 0.79, + "learning_rate": 4.698546091269547e-05, + "loss": 2.6814, + "step": 1575 + }, + { + "epoch": 0.79, + "learning_rate": 4.696666407696201e-05, + "loss": 2.8698, + "step": 1580 + }, + { + "epoch": 0.8, + "learning_rate": 4.694781260444862e-05, + "loss": 2.8389, + "step": 1585 + }, + { + "epoch": 0.8, + "learning_rate": 4.6928906542043786e-05, + "loss": 2.8353, + "step": 1590 + }, + { + "epoch": 0.8, + "learning_rate": 4.690994593677179e-05, + "loss": 2.6565, + "step": 1595 + }, + { + "epoch": 0.8, + "learning_rate": 4.689093083579256e-05, + "loss": 2.6958, + "step": 1600 + }, + { + "epoch": 0.81, + "learning_rate": 4.687186128640157e-05, + "loss": 2.6768, + "step": 1605 + }, + { + "epoch": 0.81, + "learning_rate": 4.685273733602975e-05, + "loss": 2.734, + "step": 1610 + }, + { + "epoch": 0.81, + "learning_rate": 4.6833559032243284e-05, + "loss": 2.5348, + "step": 1615 + }, + { + "epoch": 0.81, + "learning_rate": 4.6814326422743594e-05, + 
"loss": 2.7775, + "step": 1620 + }, + { + "epoch": 0.82, + "learning_rate": 4.679503955536715e-05, + "loss": 2.6578, + "step": 1625 + }, + { + "epoch": 0.82, + "learning_rate": 4.6775698478085393e-05, + "loss": 2.8792, + "step": 1630 + }, + { + "epoch": 0.82, + "learning_rate": 4.675630323900458e-05, + "loss": 2.7883, + "step": 1635 + }, + { + "epoch": 0.82, + "learning_rate": 4.67368538863657e-05, + "loss": 2.8824, + "step": 1640 + }, + { + "epoch": 0.83, + "learning_rate": 4.671735046854433e-05, + "loss": 2.6775, + "step": 1645 + }, + { + "epoch": 0.83, + "learning_rate": 4.669779303405051e-05, + "loss": 2.5658, + "step": 1650 + }, + { + "epoch": 0.83, + "learning_rate": 4.667818163152864e-05, + "loss": 2.5262, + "step": 1655 + }, + { + "epoch": 0.83, + "learning_rate": 4.665851630975736e-05, + "loss": 2.5749, + "step": 1660 + }, + { + "epoch": 0.84, + "learning_rate": 4.6638797117649424e-05, + "loss": 2.8718, + "step": 1665 + }, + { + "epoch": 0.84, + "learning_rate": 4.661902410425155e-05, + "loss": 2.8039, + "step": 1670 + }, + { + "epoch": 0.84, + "learning_rate": 4.659919731874435e-05, + "loss": 2.8089, + "step": 1675 + }, + { + "epoch": 0.84, + "learning_rate": 4.6579316810442174e-05, + "loss": 2.8144, + "step": 1680 + }, + { + "epoch": 0.85, + "learning_rate": 4.6559382628792995e-05, + "loss": 2.5963, + "step": 1685 + }, + { + "epoch": 0.85, + "learning_rate": 4.653939482337828e-05, + "loss": 2.5755, + "step": 1690 + }, + { + "epoch": 0.85, + "learning_rate": 4.651935344391286e-05, + "loss": 2.7631, + "step": 1695 + }, + { + "epoch": 0.85, + "learning_rate": 4.649925854024486e-05, + "loss": 2.8117, + "step": 1700 + }, + { + "epoch": 0.86, + "learning_rate": 4.647911016235549e-05, + "loss": 2.7352, + "step": 1705 + }, + { + "epoch": 0.86, + "learning_rate": 4.6458908360358985e-05, + "loss": 2.5572, + "step": 1710 + }, + { + "epoch": 0.86, + "learning_rate": 4.643865318450246e-05, + "loss": 2.8372, + "step": 1715 + }, + { + "epoch": 0.86, + "learning_rate": 
4.6418344685165774e-05, + "loss": 2.7892, + "step": 1720 + }, + { + "epoch": 0.87, + "learning_rate": 4.639798291286143e-05, + "loss": 2.7832, + "step": 1725 + }, + { + "epoch": 0.87, + "learning_rate": 4.637756791823442e-05, + "loss": 2.5401, + "step": 1730 + }, + { + "epoch": 0.87, + "learning_rate": 4.635709975206213e-05, + "loss": 2.9286, + "step": 1735 + }, + { + "epoch": 0.87, + "learning_rate": 4.633657846525417e-05, + "loss": 2.6474, + "step": 1740 + }, + { + "epoch": 0.88, + "learning_rate": 4.6316004108852305e-05, + "loss": 2.5047, + "step": 1745 + }, + { + "epoch": 0.88, + "learning_rate": 4.629537673403029e-05, + "loss": 2.8374, + "step": 1750 + }, + { + "epoch": 0.88, + "learning_rate": 4.627469639209373e-05, + "loss": 2.634, + "step": 1755 + }, + { + "epoch": 0.88, + "learning_rate": 4.6253963134480006e-05, + "loss": 2.6884, + "step": 1760 + }, + { + "epoch": 0.89, + "learning_rate": 4.623317701275809e-05, + "loss": 2.6334, + "step": 1765 + }, + { + "epoch": 0.89, + "learning_rate": 4.621233807862844e-05, + "loss": 2.764, + "step": 1770 + }, + { + "epoch": 0.89, + "learning_rate": 4.6191446383922886e-05, + "loss": 3.0864, + "step": 1775 + }, + { + "epoch": 0.89, + "learning_rate": 4.617050198060448e-05, + "loss": 2.7964, + "step": 1780 + }, + { + "epoch": 0.9, + "learning_rate": 4.6149504920767376e-05, + "loss": 2.4538, + "step": 1785 + }, + { + "epoch": 0.9, + "learning_rate": 4.6128455256636706e-05, + "loss": 2.7352, + "step": 1790 + }, + { + "epoch": 0.9, + "learning_rate": 4.6107353040568416e-05, + "loss": 2.6893, + "step": 1795 + }, + { + "epoch": 0.9, + "learning_rate": 4.6086198325049185e-05, + "loss": 2.8609, + "step": 1800 + }, + { + "epoch": 0.91, + "learning_rate": 4.6064991162696275e-05, + "loss": 2.5285, + "step": 1805 + }, + { + "epoch": 0.91, + "learning_rate": 4.604373160625739e-05, + "loss": 2.5707, + "step": 1810 + }, + { + "epoch": 0.91, + "learning_rate": 4.602241970861053e-05, + "loss": 2.7198, + "step": 1815 + }, + { + "epoch": 
0.91, + "learning_rate": 4.6001055522763926e-05, + "loss": 2.855, + "step": 1820 + }, + { + "epoch": 0.92, + "learning_rate": 4.597963910185582e-05, + "loss": 2.4175, + "step": 1825 + }, + { + "epoch": 0.92, + "learning_rate": 4.595817049915441e-05, + "loss": 2.5444, + "step": 1830 + }, + { + "epoch": 0.92, + "learning_rate": 4.5936649768057646e-05, + "loss": 2.6893, + "step": 1835 + }, + { + "epoch": 0.92, + "learning_rate": 4.591507696209318e-05, + "loss": 2.6725, + "step": 1840 + }, + { + "epoch": 0.93, + "learning_rate": 4.589345213491817e-05, + "loss": 2.834, + "step": 1845 + }, + { + "epoch": 0.93, + "learning_rate": 4.587177534031914e-05, + "loss": 2.8259, + "step": 1850 + }, + { + "epoch": 0.93, + "learning_rate": 4.585004663221188e-05, + "loss": 2.6146, + "step": 1855 + }, + { + "epoch": 0.93, + "learning_rate": 4.582826606464134e-05, + "loss": 2.4673, + "step": 1860 + }, + { + "epoch": 0.94, + "learning_rate": 4.5806433691781416e-05, + "loss": 2.9178, + "step": 1865 + }, + { + "epoch": 0.94, + "learning_rate": 4.578454956793487e-05, + "loss": 2.9224, + "step": 1870 + }, + { + "epoch": 0.94, + "learning_rate": 4.576261374753318e-05, + "loss": 2.7987, + "step": 1875 + }, + { + "epoch": 0.94, + "learning_rate": 4.574062628513642e-05, + "loss": 2.3848, + "step": 1880 + }, + { + "epoch": 0.95, + "learning_rate": 4.57185872354331e-05, + "loss": 2.6054, + "step": 1885 + }, + { + "epoch": 0.95, + "learning_rate": 4.569649665324003e-05, + "loss": 2.6743, + "step": 1890 + }, + { + "epoch": 0.95, + "learning_rate": 4.567435459350222e-05, + "loss": 2.9375, + "step": 1895 + }, + { + "epoch": 0.95, + "learning_rate": 4.565216111129269e-05, + "loss": 2.9372, + "step": 1900 + }, + { + "epoch": 0.96, + "learning_rate": 4.562991626181239e-05, + "loss": 2.669, + "step": 1905 + }, + { + "epoch": 0.96, + "learning_rate": 4.560762010039001e-05, + "loss": 2.7644, + "step": 1910 + }, + { + "epoch": 0.96, + "learning_rate": 4.558527268248187e-05, + "loss": 2.6886, + "step": 1915 
+ }, + { + "epoch": 0.96, + "learning_rate": 4.55628740636718e-05, + "loss": 2.8618, + "step": 1920 + }, + { + "epoch": 0.97, + "learning_rate": 4.554042429967095e-05, + "loss": 2.8411, + "step": 1925 + }, + { + "epoch": 0.97, + "learning_rate": 4.55179234463177e-05, + "loss": 2.6047, + "step": 1930 + }, + { + "epoch": 0.97, + "learning_rate": 4.5495371559577496e-05, + "loss": 2.8675, + "step": 1935 + }, + { + "epoch": 0.97, + "learning_rate": 4.547276869554271e-05, + "loss": 2.751, + "step": 1940 + }, + { + "epoch": 0.98, + "learning_rate": 4.545011491043253e-05, + "loss": 2.8638, + "step": 1945 + }, + { + "epoch": 0.98, + "learning_rate": 4.5427410260592775e-05, + "loss": 2.5092, + "step": 1950 + }, + { + "epoch": 0.98, + "learning_rate": 4.540465480249579e-05, + "loss": 2.5059, + "step": 1955 + }, + { + "epoch": 0.98, + "learning_rate": 4.5381848592740285e-05, + "loss": 2.9433, + "step": 1960 + }, + { + "epoch": 0.99, + "learning_rate": 4.535899168805121e-05, + "loss": 2.6959, + "step": 1965 + }, + { + "epoch": 0.99, + "learning_rate": 4.533608414527961e-05, + "loss": 2.552, + "step": 1970 + }, + { + "epoch": 0.99, + "learning_rate": 4.5313126021402465e-05, + "loss": 2.7169, + "step": 1975 + }, + { + "epoch": 0.99, + "learning_rate": 4.529011737352258e-05, + "loss": 2.5672, + "step": 1980 + }, + { + "epoch": 1.0, + "learning_rate": 4.526705825886841e-05, + "loss": 2.6434, + "step": 1985 + }, + { + "epoch": 1.0, + "learning_rate": 4.5243948734793947e-05, + "loss": 2.8062, + "step": 1990 + }, + { + "epoch": 1.0, + "learning_rate": 4.5220788858778556e-05, + "loss": 2.6929, + "step": 1995 + }, + { + "epoch": 1.0, + "learning_rate": 4.519757868842684e-05, + "loss": 2.5837, + "step": 2000 + }, + { + "epoch": 1.01, + "learning_rate": 4.517431828146852e-05, + "loss": 2.821, + "step": 2005 + }, + { + "epoch": 1.01, + "learning_rate": 4.515100769575824e-05, + "loss": 2.7913, + "step": 2010 + }, + { + "epoch": 1.01, + "learning_rate": 4.512764698927545e-05, + "loss": 
2.699, + "step": 2015 + }, + { + "epoch": 1.01, + "learning_rate": 4.5104236220124286e-05, + "loss": 2.6574, + "step": 2020 + }, + { + "epoch": 1.02, + "learning_rate": 4.508077544653338e-05, + "loss": 2.5909, + "step": 2025 + }, + { + "epoch": 1.02, + "learning_rate": 4.5057264726855765e-05, + "loss": 2.5695, + "step": 2030 + }, + { + "epoch": 1.02, + "learning_rate": 4.5033704119568675e-05, + "loss": 2.4937, + "step": 2035 + }, + { + "epoch": 1.02, + "learning_rate": 4.501009368327344e-05, + "loss": 2.7253, + "step": 2040 + }, + { + "epoch": 1.03, + "learning_rate": 4.4986433476695334e-05, + "loss": 2.8681, + "step": 2045 + }, + { + "epoch": 1.03, + "learning_rate": 4.496272355868341e-05, + "loss": 2.74, + "step": 2050 + }, + { + "epoch": 1.03, + "learning_rate": 4.4938963988210365e-05, + "loss": 2.7439, + "step": 2055 + }, + { + "epoch": 1.03, + "learning_rate": 4.491515482437242e-05, + "loss": 2.9539, + "step": 2060 + }, + { + "epoch": 1.04, + "learning_rate": 4.4891296126389104e-05, + "loss": 2.8165, + "step": 2065 + }, + { + "epoch": 1.04, + "learning_rate": 4.48673879536032e-05, + "loss": 2.601, + "step": 2070 + }, + { + "epoch": 1.04, + "learning_rate": 4.484343036548051e-05, + "loss": 2.5189, + "step": 2075 + }, + { + "epoch": 1.04, + "learning_rate": 4.481942342160976e-05, + "loss": 2.6276, + "step": 2080 + }, + { + "epoch": 1.05, + "learning_rate": 4.479536718170243e-05, + "loss": 2.5027, + "step": 2085 + }, + { + "epoch": 1.05, + "learning_rate": 4.477126170559262e-05, + "loss": 2.8654, + "step": 2090 + }, + { + "epoch": 1.05, + "learning_rate": 4.474710705323688e-05, + "loss": 2.8511, + "step": 2095 + }, + { + "epoch": 1.05, + "learning_rate": 4.47229032847141e-05, + "loss": 2.6129, + "step": 2100 + }, + { + "epoch": 1.06, + "learning_rate": 4.469865046022531e-05, + "loss": 2.8964, + "step": 2105 + }, + { + "epoch": 1.06, + "learning_rate": 4.4674348640093554e-05, + "loss": 2.7502, + "step": 2110 + }, + { + "epoch": 1.06, + "learning_rate": 
4.4649997884763765e-05, + "loss": 2.591, + "step": 2115 + }, + { + "epoch": 1.06, + "learning_rate": 4.462559825480257e-05, + "loss": 2.7982, + "step": 2120 + }, + { + "epoch": 1.07, + "learning_rate": 4.460114981089815e-05, + "loss": 2.576, + "step": 2125 + }, + { + "epoch": 1.07, + "learning_rate": 4.457665261386014e-05, + "loss": 2.692, + "step": 2130 + }, + { + "epoch": 1.07, + "learning_rate": 4.455210672461938e-05, + "loss": 2.5818, + "step": 2135 + }, + { + "epoch": 1.07, + "learning_rate": 4.452751220422787e-05, + "loss": 2.6792, + "step": 2140 + }, + { + "epoch": 1.08, + "learning_rate": 4.450286911385856e-05, + "loss": 2.8352, + "step": 2145 + }, + { + "epoch": 1.08, + "learning_rate": 4.4478177514805166e-05, + "loss": 2.5787, + "step": 2150 + }, + { + "epoch": 1.08, + "learning_rate": 4.4453437468482103e-05, + "loss": 2.655, + "step": 2155 + }, + { + "epoch": 1.08, + "learning_rate": 4.442864903642428e-05, + "loss": 2.7664, + "step": 2160 + }, + { + "epoch": 1.09, + "learning_rate": 4.440381228028692e-05, + "loss": 2.7231, + "step": 2165 + }, + { + "epoch": 1.09, + "learning_rate": 4.437892726184548e-05, + "loss": 2.416, + "step": 2170 + }, + { + "epoch": 1.09, + "learning_rate": 4.4353994042995446e-05, + "loss": 2.4619, + "step": 2175 + }, + { + "epoch": 1.09, + "learning_rate": 4.4329012685752183e-05, + "loss": 2.7816, + "step": 2180 + }, + { + "epoch": 1.1, + "learning_rate": 4.430398325225078e-05, + "loss": 2.8678, + "step": 2185 + }, + { + "epoch": 1.1, + "learning_rate": 4.427890580474594e-05, + "loss": 2.6007, + "step": 2190 + }, + { + "epoch": 1.1, + "learning_rate": 4.4253780405611754e-05, + "loss": 2.8195, + "step": 2195 + }, + { + "epoch": 1.1, + "learning_rate": 4.42286071173416e-05, + "loss": 2.5117, + "step": 2200 + }, + { + "epoch": 1.11, + "learning_rate": 4.4203386002547956e-05, + "loss": 2.5527, + "step": 2205 + }, + { + "epoch": 1.11, + "learning_rate": 4.417811712396226e-05, + "loss": 2.662, + "step": 2210 + }, + { + "epoch": 1.11, + 
"learning_rate": 4.415280054443477e-05, + "loss": 2.7563, + "step": 2215 + }, + { + "epoch": 1.11, + "learning_rate": 4.4127436326934354e-05, + "loss": 2.5118, + "step": 2220 + }, + { + "epoch": 1.12, + "learning_rate": 4.41020245345484e-05, + "loss": 2.7208, + "step": 2225 + }, + { + "epoch": 1.12, + "learning_rate": 4.4076565230482607e-05, + "loss": 2.649, + "step": 2230 + }, + { + "epoch": 1.12, + "learning_rate": 4.4051058478060856e-05, + "loss": 2.652, + "step": 2235 + }, + { + "epoch": 1.12, + "learning_rate": 4.402550434072505e-05, + "loss": 2.6885, + "step": 2240 + }, + { + "epoch": 1.13, + "learning_rate": 4.3999902882034935e-05, + "loss": 2.8545, + "step": 2245 + }, + { + "epoch": 1.13, + "learning_rate": 4.397425416566797e-05, + "loss": 2.9435, + "step": 2250 + }, + { + "epoch": 1.13, + "learning_rate": 4.3948558255419146e-05, + "loss": 2.6555, + "step": 2255 + }, + { + "epoch": 1.13, + "learning_rate": 4.392281521520085e-05, + "loss": 2.6108, + "step": 2260 + }, + { + "epoch": 1.14, + "learning_rate": 4.389702510904269e-05, + "loss": 2.6256, + "step": 2265 + }, + { + "epoch": 1.14, + "learning_rate": 4.387118800109133e-05, + "loss": 2.7996, + "step": 2270 + }, + { + "epoch": 1.14, + "learning_rate": 4.384530395561035e-05, + "loss": 2.6649, + "step": 2275 + }, + { + "epoch": 1.14, + "learning_rate": 4.381937303698006e-05, + "loss": 3.0073, + "step": 2280 + }, + { + "epoch": 1.15, + "learning_rate": 4.379339530969738e-05, + "loss": 2.8493, + "step": 2285 + }, + { + "epoch": 1.15, + "learning_rate": 4.3767370838375635e-05, + "loss": 2.5572, + "step": 2290 + }, + { + "epoch": 1.15, + "learning_rate": 4.374129968774443e-05, + "loss": 2.3837, + "step": 2295 + }, + { + "epoch": 1.15, + "learning_rate": 4.371518192264946e-05, + "loss": 2.4604, + "step": 2300 + }, + { + "epoch": 1.16, + "learning_rate": 4.3689017608052374e-05, + "loss": 2.8866, + "step": 2305 + }, + { + "epoch": 1.16, + "learning_rate": 4.3662806809030585e-05, + "loss": 2.8943, + "step": 2310 + 
}, + { + "epoch": 1.16, + "learning_rate": 4.3636549590777144e-05, + "loss": 2.7629, + "step": 2315 + }, + { + "epoch": 1.16, + "learning_rate": 4.361024601860054e-05, + "loss": 2.8629, + "step": 2320 + }, + { + "epoch": 1.17, + "learning_rate": 4.3583896157924574e-05, + "loss": 2.7952, + "step": 2325 + }, + { + "epoch": 1.17, + "learning_rate": 4.355750007428817e-05, + "loss": 2.2057, + "step": 2330 + }, + { + "epoch": 1.17, + "learning_rate": 4.3531057833345216e-05, + "loss": 2.9143, + "step": 2335 + }, + { + "epoch": 1.17, + "learning_rate": 4.3504569500864424e-05, + "loss": 2.5354, + "step": 2340 + }, + { + "epoch": 1.18, + "learning_rate": 4.347803514272911e-05, + "loss": 2.7451, + "step": 2345 + }, + { + "epoch": 1.18, + "learning_rate": 4.3451454824937113e-05, + "loss": 3.0827, + "step": 2350 + }, + { + "epoch": 1.18, + "learning_rate": 4.3424828613600555e-05, + "loss": 2.5842, + "step": 2355 + }, + { + "epoch": 1.18, + "learning_rate": 4.339815657494572e-05, + "loss": 2.4492, + "step": 2360 + }, + { + "epoch": 1.19, + "learning_rate": 4.3371438775312865e-05, + "loss": 2.5067, + "step": 2365 + }, + { + "epoch": 1.19, + "learning_rate": 4.334467528115608e-05, + "loss": 2.5902, + "step": 2370 + }, + { + "epoch": 1.19, + "learning_rate": 4.33178661590431e-05, + "loss": 2.8618, + "step": 2375 + }, + { + "epoch": 1.19, + "learning_rate": 4.329101147565515e-05, + "loss": 2.6831, + "step": 2380 + }, + { + "epoch": 1.2, + "learning_rate": 4.3264111297786794e-05, + "loss": 2.7531, + "step": 2385 + }, + { + "epoch": 1.2, + "learning_rate": 4.323716569234572e-05, + "loss": 2.819, + "step": 2390 + }, + { + "epoch": 1.2, + "learning_rate": 4.321017472635263e-05, + "loss": 2.6319, + "step": 2395 + }, + { + "epoch": 1.2, + "learning_rate": 4.318313846694105e-05, + "loss": 2.3078, + "step": 2400 + }, + { + "epoch": 1.21, + "learning_rate": 4.315605698135714e-05, + "loss": 2.7962, + "step": 2405 + }, + { + "epoch": 1.21, + "learning_rate": 4.312893033695958e-05, + "loss": 
2.8479, + "step": 2410 + }, + { + "epoch": 1.21, + "learning_rate": 4.310175860121936e-05, + "loss": 2.6289, + "step": 2415 + }, + { + "epoch": 1.21, + "learning_rate": 4.30745418417196e-05, + "loss": 2.7003, + "step": 2420 + }, + { + "epoch": 1.22, + "learning_rate": 4.304728012615543e-05, + "loss": 2.4947, + "step": 2425 + }, + { + "epoch": 1.22, + "learning_rate": 4.3019973522333815e-05, + "loss": 2.6911, + "step": 2430 + }, + { + "epoch": 1.22, + "learning_rate": 4.2992622098173334e-05, + "loss": 2.6931, + "step": 2435 + }, + { + "epoch": 1.22, + "learning_rate": 4.296522592170406e-05, + "loss": 2.774, + "step": 2440 + }, + { + "epoch": 1.23, + "learning_rate": 4.293778506106737e-05, + "loss": 2.759, + "step": 2445 + }, + { + "epoch": 1.23, + "learning_rate": 4.29102995845158e-05, + "loss": 2.5543, + "step": 2450 + }, + { + "epoch": 1.23, + "learning_rate": 4.288276956041284e-05, + "loss": 2.7702, + "step": 2455 + }, + { + "epoch": 1.23, + "learning_rate": 4.285519505723278e-05, + "loss": 2.835, + "step": 2460 + }, + { + "epoch": 1.24, + "learning_rate": 4.282757614356055e-05, + "loss": 2.6516, + "step": 2465 + }, + { + "epoch": 1.24, + "learning_rate": 4.2799912888091544e-05, + "loss": 2.7392, + "step": 2470 + }, + { + "epoch": 1.24, + "learning_rate": 4.277220535963143e-05, + "loss": 2.6522, + "step": 2475 + }, + { + "epoch": 1.24, + "learning_rate": 4.274445362709601e-05, + "loss": 2.6514, + "step": 2480 + }, + { + "epoch": 1.25, + "learning_rate": 4.271665775951104e-05, + "loss": 2.7507, + "step": 2485 + }, + { + "epoch": 1.25, + "learning_rate": 4.2688817826012005e-05, + "loss": 3.0499, + "step": 2490 + }, + { + "epoch": 1.25, + "learning_rate": 4.2660933895844055e-05, + "loss": 2.6927, + "step": 2495 + }, + { + "epoch": 1.25, + "learning_rate": 4.2633006038361736e-05, + "loss": 2.8456, + "step": 2500 + }, + { + "epoch": 1.26, + "learning_rate": 4.2605034323028844e-05, + "loss": 2.671, + "step": 2505 + }, + { + "epoch": 1.26, + "learning_rate": 
4.2577018819418296e-05, + "loss": 2.6543, + "step": 2510 + }, + { + "epoch": 1.26, + "learning_rate": 4.254895959721189e-05, + "loss": 2.566, + "step": 2515 + }, + { + "epoch": 1.26, + "learning_rate": 4.252085672620019e-05, + "loss": 2.4943, + "step": 2520 + }, + { + "epoch": 1.27, + "learning_rate": 4.249271027628228e-05, + "loss": 2.5115, + "step": 2525 + }, + { + "epoch": 1.27, + "learning_rate": 4.24645203174657e-05, + "loss": 2.3859, + "step": 2530 + }, + { + "epoch": 1.27, + "learning_rate": 4.243628691986617e-05, + "loss": 2.8148, + "step": 2535 + }, + { + "epoch": 1.27, + "learning_rate": 4.240801015370743e-05, + "loss": 2.5597, + "step": 2540 + }, + { + "epoch": 1.28, + "learning_rate": 4.2379690089321145e-05, + "loss": 2.805, + "step": 2545 + }, + { + "epoch": 1.28, + "learning_rate": 4.235132679714664e-05, + "loss": 2.7308, + "step": 2550 + }, + { + "epoch": 1.28, + "learning_rate": 4.232292034773076e-05, + "loss": 2.675, + "step": 2555 + }, + { + "epoch": 1.28, + "learning_rate": 4.2294470811727704e-05, + "loss": 2.6359, + "step": 2560 + }, + { + "epoch": 1.29, + "learning_rate": 4.226597825989883e-05, + "loss": 2.4288, + "step": 2565 + }, + { + "epoch": 1.29, + "learning_rate": 4.223744276311249e-05, + "loss": 2.642, + "step": 2570 + }, + { + "epoch": 1.29, + "learning_rate": 4.220886439234385e-05, + "loss": 2.7375, + "step": 2575 + }, + { + "epoch": 1.29, + "learning_rate": 4.218024321867472e-05, + "loss": 2.6114, + "step": 2580 + }, + { + "epoch": 1.3, + "learning_rate": 4.2151579313293364e-05, + "loss": 2.4941, + "step": 2585 + }, + { + "epoch": 1.3, + "learning_rate": 4.212287274749434e-05, + "loss": 2.6086, + "step": 2590 + }, + { + "epoch": 1.3, + "learning_rate": 4.2094123592678295e-05, + "loss": 2.7854, + "step": 2595 + }, + { + "epoch": 1.3, + "learning_rate": 4.206533192035184e-05, + "loss": 2.8291, + "step": 2600 + }, + { + "epoch": 1.31, + "learning_rate": 4.2036497802127294e-05, + "loss": 2.6846, + "step": 2605 + }, + { + "epoch": 1.31, + 
"learning_rate": 4.200762130972259e-05, + "loss": 2.6667, + "step": 2610 + }, + { + "epoch": 1.31, + "learning_rate": 4.197870251496103e-05, + "loss": 2.7895, + "step": 2615 + }, + { + "epoch": 1.31, + "learning_rate": 4.1949741489771155e-05, + "loss": 2.6958, + "step": 2620 + }, + { + "epoch": 1.32, + "learning_rate": 4.192073830618652e-05, + "loss": 2.8536, + "step": 2625 + }, + { + "epoch": 1.32, + "learning_rate": 4.189169303634555e-05, + "loss": 2.5218, + "step": 2630 + }, + { + "epoch": 1.32, + "learning_rate": 4.186260575249136e-05, + "loss": 2.6141, + "step": 2635 + }, + { + "epoch": 1.32, + "learning_rate": 4.1833476526971546e-05, + "loss": 2.6231, + "step": 2640 + }, + { + "epoch": 1.33, + "learning_rate": 4.1804305432238036e-05, + "loss": 2.8229, + "step": 2645 + }, + { + "epoch": 1.33, + "learning_rate": 4.1775092540846885e-05, + "loss": 2.4763, + "step": 2650 + }, + { + "epoch": 1.33, + "learning_rate": 4.174583792545813e-05, + "loss": 2.5162, + "step": 2655 + }, + { + "epoch": 1.33, + "learning_rate": 4.1716541658835574e-05, + "loss": 2.7301, + "step": 2660 + }, + { + "epoch": 1.34, + "learning_rate": 4.16872038138466e-05, + "loss": 2.6055, + "step": 2665 + }, + { + "epoch": 1.34, + "learning_rate": 4.165782446346203e-05, + "loss": 2.6529, + "step": 2670 + }, + { + "epoch": 1.34, + "learning_rate": 4.162840368075591e-05, + "loss": 2.7934, + "step": 2675 + }, + { + "epoch": 1.34, + "learning_rate": 4.159894153890536e-05, + "loss": 2.9108, + "step": 2680 + }, + { + "epoch": 1.35, + "learning_rate": 4.1569438111190326e-05, + "loss": 2.6543, + "step": 2685 + }, + { + "epoch": 1.35, + "learning_rate": 4.1539893470993496e-05, + "loss": 2.6038, + "step": 2690 + }, + { + "epoch": 1.35, + "learning_rate": 4.151030769180002e-05, + "loss": 2.5741, + "step": 2695 + }, + { + "epoch": 1.35, + "learning_rate": 4.14806808471974e-05, + "loss": 2.8525, + "step": 2700 + }, + { + "epoch": 1.36, + "learning_rate": 4.1451013010875275e-05, + "loss": 2.5151, + "step": 2705 + 
}, + { + "epoch": 1.36, + "learning_rate": 4.1421304256625206e-05, + "loss": 2.5666, + "step": 2710 + }, + { + "epoch": 1.36, + "learning_rate": 4.139155465834058e-05, + "loss": 2.7287, + "step": 2715 + }, + { + "epoch": 1.36, + "learning_rate": 4.136176429001634e-05, + "loss": 2.8867, + "step": 2720 + }, + { + "epoch": 1.37, + "learning_rate": 4.133193322574885e-05, + "loss": 2.6398, + "step": 2725 + }, + { + "epoch": 1.37, + "learning_rate": 4.130206153973568e-05, + "loss": 2.699, + "step": 2730 + }, + { + "epoch": 1.37, + "learning_rate": 4.1272149306275447e-05, + "loss": 2.5248, + "step": 2735 + }, + { + "epoch": 1.37, + "learning_rate": 4.124219659976762e-05, + "loss": 2.8947, + "step": 2740 + }, + { + "epoch": 1.38, + "learning_rate": 4.1212203494712344e-05, + "loss": 2.6473, + "step": 2745 + }, + { + "epoch": 1.38, + "learning_rate": 4.1182170065710226e-05, + "loss": 2.3734, + "step": 2750 + }, + { + "epoch": 1.38, + "learning_rate": 4.115209638746218e-05, + "loss": 2.685, + "step": 2755 + }, + { + "epoch": 1.39, + "learning_rate": 4.1121982534769224e-05, + "loss": 2.9712, + "step": 2760 + }, + { + "epoch": 1.39, + "learning_rate": 4.109182858253231e-05, + "loss": 2.6578, + "step": 2765 + }, + { + "epoch": 1.39, + "learning_rate": 4.106163460575212e-05, + "loss": 2.6842, + "step": 2770 + }, + { + "epoch": 1.39, + "learning_rate": 4.10314006795289e-05, + "loss": 2.511, + "step": 2775 + }, + { + "epoch": 1.4, + "learning_rate": 4.100112687906224e-05, + "loss": 2.8426, + "step": 2780 + }, + { + "epoch": 1.4, + "learning_rate": 4.097081327965092e-05, + "loss": 2.8462, + "step": 2785 + }, + { + "epoch": 1.4, + "learning_rate": 4.094045995669271e-05, + "loss": 2.7552, + "step": 2790 + }, + { + "epoch": 1.4, + "learning_rate": 4.091006698568419e-05, + "loss": 2.3802, + "step": 2795 + }, + { + "epoch": 1.41, + "learning_rate": 4.087963444222054e-05, + "loss": 2.3967, + "step": 2800 + }, + { + "epoch": 1.41, + "learning_rate": 4.084916240199537e-05, + "loss": 2.7945, 
+ "step": 2805 + }, + { + "epoch": 1.41, + "learning_rate": 4.0818650940800524e-05, + "loss": 2.5304, + "step": 2810 + }, + { + "epoch": 1.41, + "learning_rate": 4.0788100134525925e-05, + "loss": 2.6779, + "step": 2815 + }, + { + "epoch": 1.42, + "learning_rate": 4.075751005915932e-05, + "loss": 2.6202, + "step": 2820 + }, + { + "epoch": 1.42, + "learning_rate": 4.072688079078616e-05, + "loss": 2.5915, + "step": 2825 + }, + { + "epoch": 1.42, + "learning_rate": 4.069621240558935e-05, + "loss": 2.5139, + "step": 2830 + }, + { + "epoch": 1.42, + "learning_rate": 4.066550497984911e-05, + "loss": 2.5506, + "step": 2835 + }, + { + "epoch": 1.43, + "learning_rate": 4.063475858994276e-05, + "loss": 2.7154, + "step": 2840 + }, + { + "epoch": 1.43, + "learning_rate": 4.060397331234452e-05, + "loss": 2.6309, + "step": 2845 + }, + { + "epoch": 1.43, + "learning_rate": 4.0573149223625365e-05, + "loss": 2.6922, + "step": 2850 + }, + { + "epoch": 1.43, + "learning_rate": 4.054228640045276e-05, + "loss": 2.703, + "step": 2855 + }, + { + "epoch": 1.44, + "learning_rate": 4.051138491959053e-05, + "loss": 2.3475, + "step": 2860 + }, + { + "epoch": 1.44, + "learning_rate": 4.048044485789869e-05, + "loss": 2.7186, + "step": 2865 + }, + { + "epoch": 1.44, + "learning_rate": 4.044946629233316e-05, + "loss": 2.4667, + "step": 2870 + }, + { + "epoch": 1.44, + "learning_rate": 4.041844929994566e-05, + "loss": 2.562, + "step": 2875 + }, + { + "epoch": 1.45, + "learning_rate": 4.038739395788347e-05, + "loss": 2.4116, + "step": 2880 + }, + { + "epoch": 1.45, + "learning_rate": 4.0356300343389276e-05, + "loss": 2.6133, + "step": 2885 + }, + { + "epoch": 1.45, + "learning_rate": 4.032516853380094e-05, + "loss": 2.6893, + "step": 2890 + }, + { + "epoch": 1.45, + "learning_rate": 4.029399860655132e-05, + "loss": 2.8196, + "step": 2895 + }, + { + "epoch": 1.46, + "learning_rate": 4.026279063916811e-05, + "loss": 2.8933, + "step": 2900 + }, + { + "epoch": 1.46, + "learning_rate": 
4.023154470927361e-05, + "loss": 3.0289, + "step": 2905 + }, + { + "epoch": 1.46, + "learning_rate": 4.0200260894584516e-05, + "loss": 2.5586, + "step": 2910 + }, + { + "epoch": 1.46, + "learning_rate": 4.016893927291179e-05, + "loss": 2.785, + "step": 2915 + }, + { + "epoch": 1.47, + "learning_rate": 4.0137579922160395e-05, + "loss": 2.5293, + "step": 2920 + }, + { + "epoch": 1.47, + "learning_rate": 4.010618292032917e-05, + "loss": 2.6696, + "step": 2925 + }, + { + "epoch": 1.47, + "learning_rate": 4.007474834551058e-05, + "loss": 2.7003, + "step": 2930 + }, + { + "epoch": 1.47, + "learning_rate": 4.004327627589056e-05, + "loss": 2.6005, + "step": 2935 + }, + { + "epoch": 1.48, + "learning_rate": 4.001176678974828e-05, + "loss": 2.7198, + "step": 2940 + }, + { + "epoch": 1.48, + "learning_rate": 3.998021996545599e-05, + "loss": 2.5329, + "step": 2945 + }, + { + "epoch": 1.48, + "learning_rate": 3.994863588147881e-05, + "loss": 2.7051, + "step": 2950 + }, + { + "epoch": 1.48, + "learning_rate": 3.9917014616374535e-05, + "loss": 2.9277, + "step": 2955 + }, + { + "epoch": 1.49, + "learning_rate": 3.988535624879344e-05, + "loss": 2.3729, + "step": 2960 + }, + { + "epoch": 1.49, + "learning_rate": 3.985366085747808e-05, + "loss": 2.5122, + "step": 2965 + }, + { + "epoch": 1.49, + "learning_rate": 3.982192852126309e-05, + "loss": 2.4413, + "step": 2970 + }, + { + "epoch": 1.49, + "learning_rate": 3.979015931907501e-05, + "loss": 2.6104, + "step": 2975 + }, + { + "epoch": 1.5, + "learning_rate": 3.975835332993207e-05, + "loss": 2.8855, + "step": 2980 + }, + { + "epoch": 1.5, + "learning_rate": 3.9726510632944e-05, + "loss": 2.7487, + "step": 2985 + }, + { + "epoch": 1.5, + "learning_rate": 3.969463130731183e-05, + "loss": 2.7085, + "step": 2990 + }, + { + "epoch": 1.5, + "learning_rate": 3.966271543232769e-05, + "loss": 2.6464, + "step": 2995 + }, + { + "epoch": 1.51, + "learning_rate": 3.9630763087374625e-05, + "loss": 2.4105, + "step": 3000 + }, + { + "epoch": 1.51, + 
"learning_rate": 3.9598774351926394e-05, + "loss": 2.7296, + "step": 3005 + }, + { + "epoch": 1.51, + "learning_rate": 3.956674930554725e-05, + "loss": 2.5319, + "step": 3010 + }, + { + "epoch": 1.51, + "learning_rate": 3.9534688027891785e-05, + "loss": 2.5544, + "step": 3015 + }, + { + "epoch": 1.52, + "learning_rate": 3.9502590598704696e-05, + "loss": 2.7981, + "step": 3020 + }, + { + "epoch": 1.52, + "learning_rate": 3.94704570978206e-05, + "loss": 2.6186, + "step": 3025 + }, + { + "epoch": 1.52, + "learning_rate": 3.943828760516382e-05, + "loss": 2.6443, + "step": 3030 + }, + { + "epoch": 1.52, + "learning_rate": 3.940608220074822e-05, + "loss": 2.4564, + "step": 3035 + }, + { + "epoch": 1.53, + "learning_rate": 3.9373840964676976e-05, + "loss": 2.8394, + "step": 3040 + }, + { + "epoch": 1.53, + "learning_rate": 3.934156397714238e-05, + "loss": 2.3683, + "step": 3045 + }, + { + "epoch": 1.53, + "learning_rate": 3.930925131842567e-05, + "loss": 2.875, + "step": 3050 + }, + { + "epoch": 1.53, + "learning_rate": 3.9276903068896784e-05, + "loss": 2.6381, + "step": 3055 + }, + { + "epoch": 1.54, + "learning_rate": 3.9244519309014206e-05, + "loss": 2.8077, + "step": 3060 + }, + { + "epoch": 1.54, + "learning_rate": 3.9212100119324706e-05, + "loss": 2.6208, + "step": 3065 + }, + { + "epoch": 1.54, + "learning_rate": 3.917964558046322e-05, + "loss": 2.9, + "step": 3070 + }, + { + "epoch": 1.54, + "learning_rate": 3.9147155773152586e-05, + "loss": 2.5678, + "step": 3075 + }, + { + "epoch": 1.55, + "learning_rate": 3.911463077820336e-05, + "loss": 2.7059, + "step": 3080 + }, + { + "epoch": 1.55, + "learning_rate": 3.90820706765136e-05, + "loss": 2.7433, + "step": 3085 + }, + { + "epoch": 1.55, + "learning_rate": 3.9049475549068757e-05, + "loss": 2.5916, + "step": 3090 + }, + { + "epoch": 1.55, + "learning_rate": 3.901684547694132e-05, + "loss": 2.6978, + "step": 3095 + }, + { + "epoch": 1.56, + "learning_rate": 3.8984180541290724e-05, + "loss": 2.8709, + "step": 3100 + 
}, + { + "epoch": 1.56, + "learning_rate": 3.895148082336313e-05, + "loss": 2.8652, + "step": 3105 + }, + { + "epoch": 1.56, + "learning_rate": 3.89187464044912e-05, + "loss": 3.0331, + "step": 3110 + }, + { + "epoch": 1.56, + "learning_rate": 3.8885977366093905e-05, + "loss": 2.7334, + "step": 3115 + }, + { + "epoch": 1.57, + "learning_rate": 3.885317378967633e-05, + "loss": 2.6026, + "step": 3120 + }, + { + "epoch": 1.57, + "learning_rate": 3.882033575682945e-05, + "loss": 2.831, + "step": 3125 + }, + { + "epoch": 1.57, + "learning_rate": 3.878746334922996e-05, + "loss": 2.5585, + "step": 3130 + }, + { + "epoch": 1.57, + "learning_rate": 3.875455664864005e-05, + "loss": 2.762, + "step": 3135 + }, + { + "epoch": 1.58, + "learning_rate": 3.8721615736907205e-05, + "loss": 2.769, + "step": 3140 + }, + { + "epoch": 1.58, + "learning_rate": 3.868864069596399e-05, + "loss": 2.6496, + "step": 3145 + }, + { + "epoch": 1.58, + "learning_rate": 3.8655631607827876e-05, + "loss": 2.486, + "step": 3150 + }, + { + "epoch": 1.58, + "learning_rate": 3.8622588554601e-05, + "loss": 2.8387, + "step": 3155 + }, + { + "epoch": 1.59, + "learning_rate": 3.858951161847001e-05, + "loss": 2.816, + "step": 3160 + }, + { + "epoch": 1.59, + "learning_rate": 3.855640088170579e-05, + "loss": 2.8128, + "step": 3165 + }, + { + "epoch": 1.59, + "learning_rate": 3.8523256426663313e-05, + "loss": 2.5875, + "step": 3170 + }, + { + "epoch": 1.59, + "learning_rate": 3.8490078335781423e-05, + "loss": 2.5853, + "step": 3175 + }, + { + "epoch": 1.6, + "learning_rate": 3.845686669158263e-05, + "loss": 2.7638, + "step": 3180 + }, + { + "epoch": 1.6, + "learning_rate": 3.842362157667287e-05, + "loss": 2.8281, + "step": 3185 + }, + { + "epoch": 1.6, + "learning_rate": 3.839700144139754e-05, + "loss": 2.3574, + "step": 3190 + }, + { + "epoch": 1.6, + "learning_rate": 3.836369628764067e-05, + "loss": 2.5533, + "step": 3195 + }, + { + "epoch": 1.61, + "learning_rate": 3.833035789491177e-05, + "loss": 2.4513, + 
"step": 3200 + }, + { + "epoch": 1.61, + "learning_rate": 3.8296986346132036e-05, + "loss": 2.567, + "step": 3205 + }, + { + "epoch": 1.61, + "learning_rate": 3.826358172430516e-05, + "loss": 2.6501, + "step": 3210 + }, + { + "epoch": 1.61, + "learning_rate": 3.823014411251708e-05, + "loss": 2.6927, + "step": 3215 + }, + { + "epoch": 1.62, + "learning_rate": 3.819667359393578e-05, + "loss": 2.6094, + "step": 3220 + }, + { + "epoch": 1.62, + "learning_rate": 3.816317025181111e-05, + "loss": 2.6776, + "step": 3225 + }, + { + "epoch": 1.62, + "learning_rate": 3.8129634169474563e-05, + "loss": 2.5833, + "step": 3230 + }, + { + "epoch": 1.62, + "learning_rate": 3.809606543033905e-05, + "loss": 2.6483, + "step": 3235 + }, + { + "epoch": 1.63, + "learning_rate": 3.8062464117898724e-05, + "loss": 2.4814, + "step": 3240 + }, + { + "epoch": 1.63, + "learning_rate": 3.802883031572874e-05, + "loss": 2.5217, + "step": 3245 + }, + { + "epoch": 1.63, + "learning_rate": 3.799516410748506e-05, + "loss": 2.5127, + "step": 3250 + }, + { + "epoch": 1.63, + "learning_rate": 3.796146557690428e-05, + "loss": 2.9325, + "step": 3255 + }, + { + "epoch": 1.64, + "learning_rate": 3.792773480780335e-05, + "loss": 2.4567, + "step": 3260 + }, + { + "epoch": 1.64, + "learning_rate": 3.789397188407944e-05, + "loss": 2.6045, + "step": 3265 + }, + { + "epoch": 1.64, + "learning_rate": 3.786017688970967e-05, + "loss": 2.5031, + "step": 3270 + }, + { + "epoch": 1.64, + "learning_rate": 3.782634990875094e-05, + "loss": 2.7692, + "step": 3275 + }, + { + "epoch": 1.65, + "learning_rate": 3.779249102533972e-05, + "loss": 2.6893, + "step": 3280 + }, + { + "epoch": 1.65, + "learning_rate": 3.7758600323691806e-05, + "loss": 2.604, + "step": 3285 + }, + { + "epoch": 1.65, + "learning_rate": 3.7724677888102145e-05, + "loss": 2.6633, + "step": 3290 + }, + { + "epoch": 1.65, + "learning_rate": 3.769072380294463e-05, + "loss": 2.4804, + "step": 3295 + }, + { + "epoch": 1.66, + "learning_rate": 
3.765673815267184e-05, + "loss": 2.5951, + "step": 3300 + }, + { + "epoch": 1.66, + "learning_rate": 3.76227210218149e-05, + "loss": 2.598, + "step": 3305 + }, + { + "epoch": 1.66, + "learning_rate": 3.758867249498321e-05, + "loss": 2.8652, + "step": 3310 + }, + { + "epoch": 1.66, + "learning_rate": 3.7554592656864285e-05, + "loss": 2.5312, + "step": 3315 + }, + { + "epoch": 1.67, + "learning_rate": 3.752048159222349e-05, + "loss": 2.7432, + "step": 3320 + }, + { + "epoch": 1.67, + "learning_rate": 3.748633938590388e-05, + "loss": 2.64, + "step": 3325 + }, + { + "epoch": 1.67, + "learning_rate": 3.745216612282596e-05, + "loss": 2.751, + "step": 3330 + }, + { + "epoch": 1.67, + "learning_rate": 3.741796188798747e-05, + "loss": 2.7636, + "step": 3335 + }, + { + "epoch": 1.68, + "learning_rate": 3.738372676646321e-05, + "loss": 2.8103, + "step": 3340 + }, + { + "epoch": 1.68, + "learning_rate": 3.7349460843404796e-05, + "loss": 2.6672, + "step": 3345 + }, + { + "epoch": 1.68, + "learning_rate": 3.731516420404043e-05, + "loss": 2.5272, + "step": 3350 + }, + { + "epoch": 1.68, + "learning_rate": 3.728083693367474e-05, + "loss": 2.6674, + "step": 3355 + }, + { + "epoch": 1.69, + "learning_rate": 3.724647911768854e-05, + "loss": 2.708, + "step": 3360 + }, + { + "epoch": 1.69, + "learning_rate": 3.72120908415386e-05, + "loss": 2.3902, + "step": 3365 + }, + { + "epoch": 1.69, + "learning_rate": 3.7177672190757476e-05, + "loss": 2.8144, + "step": 3370 + }, + { + "epoch": 1.69, + "learning_rate": 3.714322325095325e-05, + "loss": 2.8525, + "step": 3375 + }, + { + "epoch": 1.7, + "learning_rate": 3.7108744107809364e-05, + "loss": 2.4671, + "step": 3380 + }, + { + "epoch": 1.7, + "learning_rate": 3.7074234847084363e-05, + "loss": 2.3723, + "step": 3385 + }, + { + "epoch": 1.7, + "learning_rate": 3.703969555461173e-05, + "loss": 2.8825, + "step": 3390 + }, + { + "epoch": 1.7, + "learning_rate": 3.700512631629961e-05, + "loss": 2.768, + "step": 3395 + }, + { + "epoch": 1.71, + 
"learning_rate": 3.6970527218130644e-05, + "loss": 2.4601, + "step": 3400 + }, + { + "epoch": 1.71, + "learning_rate": 3.693589834616176e-05, + "loss": 2.7482, + "step": 3405 + }, + { + "epoch": 1.71, + "learning_rate": 3.6901239786523914e-05, + "loss": 2.4536, + "step": 3410 + }, + { + "epoch": 1.71, + "learning_rate": 3.686655162542192e-05, + "loss": 2.571, + "step": 3415 + }, + { + "epoch": 1.72, + "learning_rate": 3.683183394913422e-05, + "loss": 2.5796, + "step": 3420 + }, + { + "epoch": 1.72, + "learning_rate": 3.6797086844012654e-05, + "loss": 2.7312, + "step": 3425 + }, + { + "epoch": 1.72, + "learning_rate": 3.676231039648227e-05, + "loss": 2.5584, + "step": 3430 + }, + { + "epoch": 1.72, + "learning_rate": 3.67275046930411e-05, + "loss": 2.6298, + "step": 3435 + }, + { + "epoch": 1.73, + "learning_rate": 3.669266982025993e-05, + "loss": 2.628, + "step": 3440 + }, + { + "epoch": 1.73, + "learning_rate": 3.6657805864782116e-05, + "loss": 2.5943, + "step": 3445 + }, + { + "epoch": 1.73, + "learning_rate": 3.662291291332333e-05, + "loss": 2.6121, + "step": 3450 + }, + { + "epoch": 1.73, + "learning_rate": 3.658799105267138e-05, + "loss": 2.6501, + "step": 3455 + }, + { + "epoch": 1.74, + "learning_rate": 3.655304036968597e-05, + "loss": 2.5708, + "step": 3460 + }, + { + "epoch": 1.74, + "learning_rate": 3.651806095129849e-05, + "loss": 2.8039, + "step": 3465 + }, + { + "epoch": 1.74, + "learning_rate": 3.648305288451183e-05, + "loss": 2.8159, + "step": 3470 + }, + { + "epoch": 1.74, + "learning_rate": 3.64480162564001e-05, + "loss": 2.6506, + "step": 3475 + }, + { + "epoch": 1.75, + "learning_rate": 3.641295115410847e-05, + "loss": 2.8982, + "step": 3480 + }, + { + "epoch": 1.75, + "learning_rate": 3.637785766485291e-05, + "loss": 2.8018, + "step": 3485 + }, + { + "epoch": 1.75, + "learning_rate": 3.634273587592003e-05, + "loss": 2.5278, + "step": 3490 + }, + { + "epoch": 1.75, + "learning_rate": 3.630758587466681e-05, + "loss": 2.8937, + "step": 3495 + }, + 
{ + "epoch": 1.76, + "learning_rate": 3.6272407748520394e-05, + "loss": 2.1922, + "step": 3500 + }, + { + "epoch": 1.76, + "learning_rate": 3.623720158497789e-05, + "loss": 2.4288, + "step": 3505 + }, + { + "epoch": 1.76, + "learning_rate": 3.620196747160613e-05, + "loss": 2.9423, + "step": 3510 + }, + { + "epoch": 1.76, + "learning_rate": 3.61667054960415e-05, + "loss": 2.5559, + "step": 3515 + }, + { + "epoch": 1.77, + "learning_rate": 3.613141574598965e-05, + "loss": 2.7228, + "step": 3520 + }, + { + "epoch": 1.77, + "learning_rate": 3.6096098309225325e-05, + "loss": 2.6933, + "step": 3525 + }, + { + "epoch": 1.77, + "learning_rate": 3.606075327359212e-05, + "loss": 2.5865, + "step": 3530 + }, + { + "epoch": 1.77, + "learning_rate": 3.602538072700229e-05, + "loss": 2.5846, + "step": 3535 + }, + { + "epoch": 1.78, + "learning_rate": 3.598998075743653e-05, + "loss": 2.8847, + "step": 3540 + }, + { + "epoch": 1.78, + "learning_rate": 3.595455345294372e-05, + "loss": 2.5386, + "step": 3545 + }, + { + "epoch": 1.78, + "learning_rate": 3.5919098901640735e-05, + "loss": 2.5932, + "step": 3550 + }, + { + "epoch": 1.78, + "learning_rate": 3.588361719171223e-05, + "loss": 2.4481, + "step": 3555 + }, + { + "epoch": 1.79, + "learning_rate": 3.584810841141039e-05, + "loss": 2.5773, + "step": 3560 + }, + { + "epoch": 1.79, + "learning_rate": 3.581257264905476e-05, + "loss": 2.6305, + "step": 3565 + }, + { + "epoch": 1.79, + "learning_rate": 3.5777009993031955e-05, + "loss": 2.7, + "step": 3570 + }, + { + "epoch": 1.79, + "learning_rate": 3.574142053179553e-05, + "loss": 2.4408, + "step": 3575 + }, + { + "epoch": 1.8, + "learning_rate": 3.5705804353865677e-05, + "loss": 2.7292, + "step": 3580 + }, + { + "epoch": 1.8, + "learning_rate": 3.567016154782906e-05, + "loss": 2.4371, + "step": 3585 + }, + { + "epoch": 1.8, + "learning_rate": 3.563449220233855e-05, + "loss": 2.7977, + "step": 3590 + }, + { + "epoch": 1.8, + "learning_rate": 3.559879640611305e-05, + "loss": 2.7357, + 
"step": 3595 + }, + { + "epoch": 1.81, + "learning_rate": 3.5563074247937244e-05, + "loss": 2.6255, + "step": 3600 + }, + { + "epoch": 1.81, + "learning_rate": 3.552732581666139e-05, + "loss": 2.8134, + "step": 3605 + }, + { + "epoch": 1.81, + "learning_rate": 3.549155120120109e-05, + "loss": 2.702, + "step": 3610 + }, + { + "epoch": 1.81, + "learning_rate": 3.545575049053707e-05, + "loss": 2.4854, + "step": 3615 + }, + { + "epoch": 1.82, + "learning_rate": 3.541992377371497e-05, + "loss": 2.7596, + "step": 3620 + }, + { + "epoch": 1.82, + "learning_rate": 3.53840711398451e-05, + "loss": 2.6891, + "step": 3625 + }, + { + "epoch": 1.82, + "learning_rate": 3.5348192678102254e-05, + "loss": 2.5619, + "step": 3630 + }, + { + "epoch": 1.82, + "learning_rate": 3.531228847772544e-05, + "loss": 2.5719, + "step": 3635 + }, + { + "epoch": 1.83, + "learning_rate": 3.52763586280177e-05, + "loss": 2.6297, + "step": 3640 + }, + { + "epoch": 1.83, + "learning_rate": 3.524040321834589e-05, + "loss": 2.5114, + "step": 3645 + }, + { + "epoch": 1.83, + "learning_rate": 3.52044223381404e-05, + "loss": 2.8536, + "step": 3650 + }, + { + "epoch": 1.83, + "learning_rate": 3.516841607689501e-05, + "loss": 2.5271, + "step": 3655 + }, + { + "epoch": 1.84, + "learning_rate": 3.51323845241666e-05, + "loss": 2.5654, + "step": 3660 + }, + { + "epoch": 1.84, + "learning_rate": 3.509632776957497e-05, + "loss": 2.6269, + "step": 3665 + }, + { + "epoch": 1.84, + "learning_rate": 3.50602459028026e-05, + "loss": 2.4401, + "step": 3670 + }, + { + "epoch": 1.84, + "learning_rate": 3.502413901359445e-05, + "loss": 2.4368, + "step": 3675 + }, + { + "epoch": 1.85, + "learning_rate": 3.498800719175768e-05, + "loss": 2.6087, + "step": 3680 + }, + { + "epoch": 1.85, + "learning_rate": 3.4951850527161495e-05, + "loss": 2.5927, + "step": 3685 + }, + { + "epoch": 1.85, + "learning_rate": 3.4915669109736876e-05, + "loss": 2.8352, + "step": 3690 + }, + { + "epoch": 1.85, + "learning_rate": 3.487946302947637e-05, + 
"loss": 2.8485, + "step": 3695 + }, + { + "epoch": 1.86, + "learning_rate": 3.484323237643389e-05, + "loss": 2.6611, + "step": 3700 + }, + { + "epoch": 1.86, + "learning_rate": 3.4806977240724423e-05, + "loss": 2.5251, + "step": 3705 + }, + { + "epoch": 1.86, + "learning_rate": 3.47706977125239e-05, + "loss": 2.6043, + "step": 3710 + }, + { + "epoch": 1.86, + "learning_rate": 3.473439388206887e-05, + "loss": 2.5911, + "step": 3715 + }, + { + "epoch": 1.87, + "learning_rate": 3.469806583965639e-05, + "loss": 2.7092, + "step": 3720 + }, + { + "epoch": 1.87, + "learning_rate": 3.466171367564368e-05, + "loss": 2.4572, + "step": 3725 + }, + { + "epoch": 1.87, + "learning_rate": 3.462533748044801e-05, + "loss": 2.464, + "step": 3730 + }, + { + "epoch": 1.87, + "learning_rate": 3.458893734454636e-05, + "loss": 2.8654, + "step": 3735 + }, + { + "epoch": 1.88, + "learning_rate": 3.455251335847531e-05, + "loss": 2.752, + "step": 3740 + }, + { + "epoch": 1.88, + "learning_rate": 3.451606561283074e-05, + "loss": 2.59, + "step": 3745 + }, + { + "epoch": 1.88, + "learning_rate": 3.447959419826763e-05, + "loss": 2.4732, + "step": 3750 + }, + { + "epoch": 1.88, + "learning_rate": 3.444309920549983e-05, + "loss": 2.7611, + "step": 3755 + }, + { + "epoch": 1.89, + "learning_rate": 3.440658072529983e-05, + "loss": 2.5923, + "step": 3760 + }, + { + "epoch": 1.89, + "learning_rate": 3.437003884849854e-05, + "loss": 2.5405, + "step": 3765 + }, + { + "epoch": 1.89, + "learning_rate": 3.433347366598506e-05, + "loss": 2.6368, + "step": 3770 + }, + { + "epoch": 1.89, + "learning_rate": 3.429688526870649e-05, + "loss": 2.5877, + "step": 3775 + }, + { + "epoch": 1.9, + "learning_rate": 3.4260273747667635e-05, + "loss": 2.5978, + "step": 3780 + }, + { + "epoch": 1.9, + "learning_rate": 3.422363919393082e-05, + "loss": 2.7375, + "step": 3785 + }, + { + "epoch": 1.9, + "learning_rate": 3.418698169861567e-05, + "loss": 2.4161, + "step": 3790 + }, + { + "epoch": 1.9, + "learning_rate": 
3.415030135289884e-05, + "loss": 2.3959, + "step": 3795 + }, + { + "epoch": 1.91, + "learning_rate": 3.4113598248013875e-05, + "loss": 2.9642, + "step": 3800 + }, + { + "epoch": 1.91, + "learning_rate": 3.40768724752509e-05, + "loss": 2.7313, + "step": 3805 + }, + { + "epoch": 1.91, + "learning_rate": 3.404012412595639e-05, + "loss": 2.6311, + "step": 3810 + }, + { + "epoch": 1.91, + "learning_rate": 3.400335329153304e-05, + "loss": 2.6268, + "step": 3815 + }, + { + "epoch": 1.92, + "learning_rate": 3.396656006343939e-05, + "loss": 2.5252, + "step": 3820 + }, + { + "epoch": 1.92, + "learning_rate": 3.392974453318975e-05, + "loss": 2.5863, + "step": 3825 + }, + { + "epoch": 1.92, + "learning_rate": 3.3892906792353886e-05, + "loss": 2.4848, + "step": 3830 + }, + { + "epoch": 1.92, + "learning_rate": 3.3856046932556775e-05, + "loss": 2.5517, + "step": 3835 + }, + { + "epoch": 1.93, + "learning_rate": 3.3819165045478426e-05, + "loss": 2.6679, + "step": 3840 + }, + { + "epoch": 1.93, + "learning_rate": 3.3782261222853654e-05, + "loss": 2.4474, + "step": 3845 + }, + { + "epoch": 1.93, + "learning_rate": 3.37453355564718e-05, + "loss": 2.5378, + "step": 3850 + }, + { + "epoch": 1.93, + "learning_rate": 3.3708388138176584e-05, + "loss": 2.8168, + "step": 3855 + }, + { + "epoch": 1.94, + "learning_rate": 3.367141905986578e-05, + "loss": 2.6556, + "step": 3860 + }, + { + "epoch": 1.94, + "learning_rate": 3.3634428413491054e-05, + "loss": 2.5122, + "step": 3865 + }, + { + "epoch": 1.94, + "learning_rate": 3.359741629105773e-05, + "loss": 2.7044, + "step": 3870 + }, + { + "epoch": 1.94, + "learning_rate": 3.3560382784624524e-05, + "loss": 2.6193, + "step": 3875 + }, + { + "epoch": 1.95, + "learning_rate": 3.352332798630336e-05, + "loss": 3.0051, + "step": 3880 + }, + { + "epoch": 1.95, + "learning_rate": 3.3486251988259134e-05, + "loss": 2.5351, + "step": 3885 + }, + { + "epoch": 1.95, + "learning_rate": 3.344915488270942e-05, + "loss": 2.7273, + "step": 3890 + }, + { + 
"epoch": 1.95, + "learning_rate": 3.341203676192433e-05, + "loss": 2.3833, + "step": 3895 + }, + { + "epoch": 1.96, + "learning_rate": 3.3374897718226236e-05, + "loss": 2.5073, + "step": 3900 + }, + { + "epoch": 1.96, + "learning_rate": 3.333773784398957e-05, + "loss": 2.7216, + "step": 3905 + }, + { + "epoch": 1.96, + "learning_rate": 3.330055723164055e-05, + "loss": 2.6666, + "step": 3910 + }, + { + "epoch": 1.96, + "learning_rate": 3.326335597365698e-05, + "loss": 2.4959, + "step": 3915 + }, + { + "epoch": 1.97, + "learning_rate": 3.322613416256802e-05, + "loss": 2.8799, + "step": 3920 + }, + { + "epoch": 1.97, + "learning_rate": 3.3188891890953956e-05, + "loss": 2.6041, + "step": 3925 + }, + { + "epoch": 1.97, + "learning_rate": 3.315162925144595e-05, + "loss": 2.7423, + "step": 3930 + }, + { + "epoch": 1.97, + "learning_rate": 3.3114346336725834e-05, + "loss": 2.5765, + "step": 3935 + }, + { + "epoch": 1.98, + "learning_rate": 3.3077043239525874e-05, + "loss": 2.152, + "step": 3940 + }, + { + "epoch": 1.98, + "learning_rate": 3.303972005262852e-05, + "loss": 2.6474, + "step": 3945 + }, + { + "epoch": 1.98, + "learning_rate": 3.3002376868866206e-05, + "loss": 2.473, + "step": 3950 + }, + { + "epoch": 1.98, + "learning_rate": 3.296501378112109e-05, + "loss": 2.6779, + "step": 3955 + }, + { + "epoch": 1.99, + "learning_rate": 3.292763088232485e-05, + "loss": 2.7439, + "step": 3960 + }, + { + "epoch": 1.99, + "learning_rate": 3.289022826545844e-05, + "loss": 2.8874, + "step": 3965 + }, + { + "epoch": 1.99, + "learning_rate": 3.2852806023551834e-05, + "loss": 2.7191, + "step": 3970 + }, + { + "epoch": 1.99, + "learning_rate": 3.281536424968383e-05, + "loss": 2.563, + "step": 3975 + }, + { + "epoch": 2.0, + "learning_rate": 3.2777903036981836e-05, + "loss": 2.7884, + "step": 3980 + }, + { + "epoch": 2.0, + "learning_rate": 3.274042247862158e-05, + "loss": 2.5221, + "step": 3985 + }, + { + "epoch": 2.0, + "learning_rate": 3.270292266782689e-05, + "loss": 2.462, + 
"step": 3990 + }, + { + "epoch": 2.0, + "learning_rate": 3.266540369786953e-05, + "loss": 2.5059, + "step": 3995 + }, + { + "epoch": 2.01, + "learning_rate": 3.262786566206888e-05, + "loss": 2.688, + "step": 4000 + }, + { + "epoch": 2.01, + "learning_rate": 3.259030865379174e-05, + "loss": 2.7194, + "step": 4005 + }, + { + "epoch": 2.01, + "learning_rate": 3.255273276645214e-05, + "loss": 2.4145, + "step": 4010 + }, + { + "epoch": 2.01, + "learning_rate": 3.251513809351101e-05, + "loss": 2.5469, + "step": 4015 + }, + { + "epoch": 2.02, + "learning_rate": 3.247752472847605e-05, + "loss": 2.7916, + "step": 4020 + }, + { + "epoch": 2.02, + "learning_rate": 3.243989276490143e-05, + "loss": 2.5973, + "step": 4025 + }, + { + "epoch": 2.02, + "learning_rate": 3.240224229638758e-05, + "loss": 2.6732, + "step": 4030 + }, + { + "epoch": 2.02, + "learning_rate": 3.236457341658097e-05, + "loss": 2.2159, + "step": 4035 + }, + { + "epoch": 2.03, + "learning_rate": 3.232688621917386e-05, + "loss": 2.6783, + "step": 4040 + }, + { + "epoch": 2.03, + "learning_rate": 3.2289180797904055e-05, + "loss": 2.8193, + "step": 4045 + }, + { + "epoch": 2.03, + "learning_rate": 3.22514572465547e-05, + "loss": 2.3885, + "step": 4050 + }, + { + "epoch": 2.03, + "learning_rate": 3.221371565895404e-05, + "loss": 2.6288, + "step": 4055 + }, + { + "epoch": 2.04, + "learning_rate": 3.217595612897516e-05, + "loss": 2.5372, + "step": 4060 + }, + { + "epoch": 2.04, + "learning_rate": 3.21381787505358e-05, + "loss": 2.7257, + "step": 4065 + }, + { + "epoch": 2.04, + "learning_rate": 3.210038361759807e-05, + "loss": 2.6329, + "step": 4070 + }, + { + "epoch": 2.04, + "learning_rate": 3.206257082416825e-05, + "loss": 2.6518, + "step": 4075 + }, + { + "epoch": 2.05, + "learning_rate": 3.2024740464296544e-05, + "loss": 2.4218, + "step": 4080 + }, + { + "epoch": 2.05, + "learning_rate": 3.198689263207686e-05, + "loss": 2.7266, + "step": 4085 + }, + { + "epoch": 2.05, + "learning_rate": 3.1949027421646546e-05, 
+ "loss": 2.4466, + "step": 4090 + }, + { + "epoch": 2.05, + "learning_rate": 3.1911144927186185e-05, + "loss": 2.6014, + "step": 4095 + }, + { + "epoch": 2.06, + "learning_rate": 3.1873245242919354e-05, + "loss": 2.7072, + "step": 4100 + }, + { + "epoch": 2.06, + "learning_rate": 3.183532846311236e-05, + "loss": 2.5211, + "step": 4105 + }, + { + "epoch": 2.06, + "learning_rate": 3.179739468207406e-05, + "loss": 2.5787, + "step": 4110 + }, + { + "epoch": 2.06, + "learning_rate": 3.175944399415559e-05, + "loss": 2.5141, + "step": 4115 + }, + { + "epoch": 2.07, + "learning_rate": 3.1721476493750134e-05, + "loss": 2.6807, + "step": 4120 + }, + { + "epoch": 2.07, + "learning_rate": 3.1683492275292695e-05, + "loss": 2.5611, + "step": 4125 + }, + { + "epoch": 2.07, + "learning_rate": 3.164549143325985e-05, + "loss": 2.5864, + "step": 4130 + }, + { + "epoch": 2.08, + "learning_rate": 3.160747406216953e-05, + "loss": 2.7378, + "step": 4135 + }, + { + "epoch": 2.08, + "learning_rate": 3.156944025658079e-05, + "loss": 2.7334, + "step": 4140 + }, + { + "epoch": 2.08, + "learning_rate": 3.153139011109354e-05, + "loss": 2.4465, + "step": 4145 + }, + { + "epoch": 2.08, + "learning_rate": 3.149332372034834e-05, + "loss": 2.5482, + "step": 4150 + }, + { + "epoch": 2.09, + "learning_rate": 3.145524117902617e-05, + "loss": 2.7633, + "step": 4155 + }, + { + "epoch": 2.09, + "learning_rate": 3.141714258184816e-05, + "loss": 2.6796, + "step": 4160 + }, + { + "epoch": 2.09, + "learning_rate": 3.137902802357538e-05, + "loss": 2.6622, + "step": 4165 + }, + { + "epoch": 2.09, + "learning_rate": 3.134089759900861e-05, + "loss": 2.9277, + "step": 4170 + }, + { + "epoch": 2.1, + "learning_rate": 3.130275140298808e-05, + "loss": 2.351, + "step": 4175 + }, + { + "epoch": 2.1, + "learning_rate": 3.1264589530393263e-05, + "loss": 2.654, + "step": 4180 + }, + { + "epoch": 2.1, + "learning_rate": 3.12264120761426e-05, + "loss": 2.6229, + "step": 4185 + }, + { + "epoch": 2.1, + "learning_rate": 
3.118821913519333e-05, + "loss": 2.6415, + "step": 4190 + }, + { + "epoch": 2.11, + "learning_rate": 3.115001080254115e-05, + "loss": 2.6029, + "step": 4195 + }, + { + "epoch": 2.11, + "learning_rate": 3.1111787173220095e-05, + "loss": 2.6775, + "step": 4200 + }, + { + "epoch": 2.11, + "learning_rate": 3.107354834230223e-05, + "loss": 2.7393, + "step": 4205 + }, + { + "epoch": 2.11, + "learning_rate": 3.1035294404897396e-05, + "loss": 2.5886, + "step": 4210 + }, + { + "epoch": 2.12, + "learning_rate": 3.099702545615307e-05, + "loss": 2.3812, + "step": 4215 + }, + { + "epoch": 2.12, + "learning_rate": 3.0958741591254026e-05, + "loss": 3.1271, + "step": 4220 + }, + { + "epoch": 2.12, + "learning_rate": 3.0920442905422145e-05, + "loss": 2.7758, + "step": 4225 + }, + { + "epoch": 2.12, + "learning_rate": 3.0882129493916167e-05, + "loss": 2.7682, + "step": 4230 + }, + { + "epoch": 2.13, + "learning_rate": 3.0843801452031466e-05, + "loss": 2.2826, + "step": 4235 + }, + { + "epoch": 2.13, + "learning_rate": 3.0805458875099804e-05, + "loss": 2.5934, + "step": 4240 + }, + { + "epoch": 2.13, + "learning_rate": 3.0767101858489103e-05, + "loss": 2.7693, + "step": 4245 + }, + { + "epoch": 2.13, + "learning_rate": 3.072873049760319e-05, + "loss": 2.6955, + "step": 4250 + }, + { + "epoch": 2.14, + "learning_rate": 3.0690344887881565e-05, + "loss": 2.7344, + "step": 4255 + }, + { + "epoch": 2.14, + "learning_rate": 3.0651945124799185e-05, + "loss": 2.6215, + "step": 4260 + }, + { + "epoch": 2.14, + "learning_rate": 3.061353130386619e-05, + "loss": 2.5794, + "step": 4265 + }, + { + "epoch": 2.14, + "learning_rate": 3.05751035206277e-05, + "loss": 2.671, + "step": 4270 + }, + { + "epoch": 2.15, + "learning_rate": 3.0536661870663576e-05, + "loss": 2.7265, + "step": 4275 + }, + { + "epoch": 2.15, + "learning_rate": 3.0498206449588136e-05, + "loss": 2.9018, + "step": 4280 + }, + { + "epoch": 2.15, + "learning_rate": 3.045973735304996e-05, + "loss": 2.4823, + "step": 4285 + }, + { + 
"epoch": 2.15, + "learning_rate": 3.0421254676731664e-05, + "loss": 2.5454, + "step": 4290 + }, + { + "epoch": 2.16, + "learning_rate": 3.0382758516349623e-05, + "loss": 2.7888, + "step": 4295 + }, + { + "epoch": 2.16, + "learning_rate": 3.0344248967653754e-05, + "loss": 2.596, + "step": 4300 + }, + { + "epoch": 2.16, + "learning_rate": 3.030572612642727e-05, + "loss": 2.7905, + "step": 4305 + }, + { + "epoch": 2.16, + "learning_rate": 3.0267190088486452e-05, + "loss": 2.5676, + "step": 4310 + }, + { + "epoch": 2.17, + "learning_rate": 3.0228640949680388e-05, + "loss": 2.5836, + "step": 4315 + }, + { + "epoch": 2.17, + "learning_rate": 3.0190078805890786e-05, + "loss": 2.7138, + "step": 4320 + }, + { + "epoch": 2.17, + "learning_rate": 3.015150375303168e-05, + "loss": 2.4747, + "step": 4325 + }, + { + "epoch": 2.17, + "learning_rate": 3.011291588704919e-05, + "loss": 2.5188, + "step": 4330 + }, + { + "epoch": 2.18, + "learning_rate": 3.0074315303921353e-05, + "loss": 2.2952, + "step": 4335 + }, + { + "epoch": 2.18, + "learning_rate": 3.0035702099657787e-05, + "loss": 2.4875, + "step": 4340 + }, + { + "epoch": 2.18, + "learning_rate": 2.9997076370299544e-05, + "loss": 2.5085, + "step": 4345 + }, + { + "epoch": 2.18, + "learning_rate": 2.9958438211918805e-05, + "loss": 2.7452, + "step": 4350 + }, + { + "epoch": 2.19, + "learning_rate": 2.9919787720618676e-05, + "loss": 2.334, + "step": 4355 + }, + { + "epoch": 2.19, + "learning_rate": 2.9881124992532933e-05, + "loss": 2.6904, + "step": 4360 + }, + { + "epoch": 2.19, + "learning_rate": 2.984245012382579e-05, + "loss": 2.5706, + "step": 4365 + }, + { + "epoch": 2.19, + "learning_rate": 2.980376321069165e-05, + "loss": 2.6657, + "step": 4370 + }, + { + "epoch": 2.2, + "learning_rate": 2.976506434935489e-05, + "loss": 2.4117, + "step": 4375 + }, + { + "epoch": 2.2, + "learning_rate": 2.9726353636069582e-05, + "loss": 2.4446, + "step": 4380 + }, + { + "epoch": 2.2, + "learning_rate": 2.968763116711931e-05, + "loss": 
2.5681, + "step": 4385 + }, + { + "epoch": 2.2, + "learning_rate": 2.964889703881686e-05, + "loss": 2.7244, + "step": 4390 + }, + { + "epoch": 2.21, + "learning_rate": 2.9610151347504044e-05, + "loss": 2.577, + "step": 4395 + }, + { + "epoch": 2.21, + "learning_rate": 2.957139418955143e-05, + "loss": 2.6176, + "step": 4400 + }, + { + "epoch": 2.21, + "learning_rate": 2.9532625661358105e-05, + "loss": 2.5708, + "step": 4405 + }, + { + "epoch": 2.21, + "learning_rate": 2.949384585935143e-05, + "loss": 2.4873, + "step": 4410 + }, + { + "epoch": 2.22, + "learning_rate": 2.9455054879986797e-05, + "loss": 2.5953, + "step": 4415 + }, + { + "epoch": 2.22, + "learning_rate": 2.941625281974743e-05, + "loss": 2.5578, + "step": 4420 + }, + { + "epoch": 2.22, + "learning_rate": 2.93774397751441e-05, + "loss": 2.5782, + "step": 4425 + }, + { + "epoch": 2.22, + "learning_rate": 2.9338615842714883e-05, + "loss": 2.8227, + "step": 4430 + }, + { + "epoch": 2.23, + "learning_rate": 2.9299781119024956e-05, + "loss": 2.4317, + "step": 4435 + }, + { + "epoch": 2.23, + "learning_rate": 2.926093570066633e-05, + "loss": 2.7054, + "step": 4440 + }, + { + "epoch": 2.23, + "learning_rate": 2.9222079684257608e-05, + "loss": 2.7207, + "step": 4445 + }, + { + "epoch": 2.23, + "learning_rate": 2.9183213166443778e-05, + "loss": 2.6732, + "step": 4450 + }, + { + "epoch": 2.24, + "learning_rate": 2.9144336243895927e-05, + "loss": 2.4768, + "step": 4455 + }, + { + "epoch": 2.24, + "learning_rate": 2.910544901331101e-05, + "loss": 2.5146, + "step": 4460 + }, + { + "epoch": 2.24, + "learning_rate": 2.9066551571411645e-05, + "loss": 2.4941, + "step": 4465 + }, + { + "epoch": 2.24, + "learning_rate": 2.902764401494584e-05, + "loss": 2.1425, + "step": 4470 + }, + { + "epoch": 2.25, + "learning_rate": 2.898872644068676e-05, + "loss": 2.6335, + "step": 4475 + }, + { + "epoch": 2.25, + "learning_rate": 2.8949798945432483e-05, + "loss": 2.4093, + "step": 4480 + }, + { + "epoch": 2.25, + "learning_rate": 
2.8910861626005776e-05, + "loss": 2.4006, + "step": 4485 + }, + { + "epoch": 2.25, + "learning_rate": 2.8871914579253824e-05, + "loss": 2.4762, + "step": 4490 + }, + { + "epoch": 2.26, + "learning_rate": 2.8832957902048014e-05, + "loss": 2.6309, + "step": 4495 + }, + { + "epoch": 2.26, + "learning_rate": 2.8793991691283684e-05, + "loss": 2.7734, + "step": 4500 + }, + { + "epoch": 2.26, + "learning_rate": 2.87550160438799e-05, + "loss": 2.5444, + "step": 4505 + }, + { + "epoch": 2.26, + "learning_rate": 2.871603105677917e-05, + "loss": 2.5277, + "step": 4510 + }, + { + "epoch": 2.27, + "learning_rate": 2.8677036826947264e-05, + "loss": 2.5775, + "step": 4515 + }, + { + "epoch": 2.27, + "learning_rate": 2.863803345137291e-05, + "loss": 2.4524, + "step": 4520 + }, + { + "epoch": 2.27, + "learning_rate": 2.8599021027067608e-05, + "loss": 2.572, + "step": 4525 + }, + { + "epoch": 2.27, + "learning_rate": 2.855999965106536e-05, + "loss": 2.8112, + "step": 4530 + }, + { + "epoch": 2.28, + "learning_rate": 2.8520969420422427e-05, + "loss": 2.65, + "step": 4535 + }, + { + "epoch": 2.28, + "learning_rate": 2.8481930432217096e-05, + "loss": 2.4675, + "step": 4540 + }, + { + "epoch": 2.28, + "learning_rate": 2.844288278354943e-05, + "loss": 2.668, + "step": 4545 + }, + { + "epoch": 2.28, + "learning_rate": 2.8403826571541046e-05, + "loss": 2.7135, + "step": 4550 + }, + { + "epoch": 2.29, + "learning_rate": 2.8364761893334858e-05, + "loss": 2.5212, + "step": 4555 + }, + { + "epoch": 2.29, + "learning_rate": 2.8325688846094817e-05, + "loss": 2.698, + "step": 4560 + }, + { + "epoch": 2.29, + "learning_rate": 2.828660752700572e-05, + "loss": 2.6517, + "step": 4565 + }, + { + "epoch": 2.29, + "learning_rate": 2.8247518033272924e-05, + "loss": 2.6687, + "step": 4570 + }, + { + "epoch": 2.3, + "learning_rate": 2.8208420462122105e-05, + "loss": 2.4934, + "step": 4575 + }, + { + "epoch": 2.3, + "learning_rate": 2.816931491079906e-05, + "loss": 2.3762, + "step": 4580 + }, + { + "epoch": 
2.3, + "learning_rate": 2.8130201476569413e-05, + "loss": 2.6873, + "step": 4585 + }, + { + "epoch": 2.3, + "learning_rate": 2.8091080256718398e-05, + "loss": 2.6981, + "step": 4590 + }, + { + "epoch": 2.31, + "learning_rate": 2.805195134855061e-05, + "loss": 2.6814, + "step": 4595 + }, + { + "epoch": 2.31, + "learning_rate": 2.8012814849389785e-05, + "loss": 2.6641, + "step": 4600 + }, + { + "epoch": 2.31, + "learning_rate": 2.797367085657852e-05, + "loss": 2.488, + "step": 4605 + }, + { + "epoch": 2.31, + "learning_rate": 2.793451946747806e-05, + "loss": 2.4358, + "step": 4610 + }, + { + "epoch": 2.32, + "learning_rate": 2.7895360779468044e-05, + "loss": 2.7458, + "step": 4615 + }, + { + "epoch": 2.32, + "learning_rate": 2.785619488994627e-05, + "loss": 2.7558, + "step": 4620 + }, + { + "epoch": 2.32, + "learning_rate": 2.7817021896328427e-05, + "loss": 2.6906, + "step": 4625 + }, + { + "epoch": 2.32, + "learning_rate": 2.7777841896047914e-05, + "loss": 2.5384, + "step": 4630 + }, + { + "epoch": 2.33, + "learning_rate": 2.7738654986555523e-05, + "loss": 2.8354, + "step": 4635 + }, + { + "epoch": 2.33, + "learning_rate": 2.7699461265319242e-05, + "loss": 2.6352, + "step": 4640 + }, + { + "epoch": 2.33, + "learning_rate": 2.7660260829824003e-05, + "loss": 2.5602, + "step": 4645 + }, + { + "epoch": 2.33, + "learning_rate": 2.7621053777571425e-05, + "loss": 2.7109, + "step": 4650 + }, + { + "epoch": 2.34, + "learning_rate": 2.7581840206079616e-05, + "loss": 2.7317, + "step": 4655 + }, + { + "epoch": 2.34, + "learning_rate": 2.754262021288287e-05, + "loss": 2.6874, + "step": 4660 + }, + { + "epoch": 2.34, + "learning_rate": 2.750339389553146e-05, + "loss": 2.5854, + "step": 4665 + }, + { + "epoch": 2.34, + "learning_rate": 2.7464161351591393e-05, + "loss": 2.4774, + "step": 4670 + }, + { + "epoch": 2.35, + "learning_rate": 2.7424922678644172e-05, + "loss": 2.3301, + "step": 4675 + }, + { + "epoch": 2.35, + "learning_rate": 2.7385677974286517e-05, + "loss": 2.8886, + 
"step": 4680 + }, + { + "epoch": 2.35, + "learning_rate": 2.7346427336130175e-05, + "loss": 2.4401, + "step": 4685 + }, + { + "epoch": 2.35, + "learning_rate": 2.7307170861801644e-05, + "loss": 2.4788, + "step": 4690 + }, + { + "epoch": 2.36, + "learning_rate": 2.7267908648941938e-05, + "loss": 2.4079, + "step": 4695 + }, + { + "epoch": 2.36, + "learning_rate": 2.7228640795206344e-05, + "loss": 2.3763, + "step": 4700 + }, + { + "epoch": 2.36, + "learning_rate": 2.718936739826417e-05, + "loss": 2.4751, + "step": 4705 + }, + { + "epoch": 2.36, + "learning_rate": 2.7150088555798537e-05, + "loss": 2.5827, + "step": 4710 + }, + { + "epoch": 2.37, + "learning_rate": 2.7110804365506083e-05, + "loss": 2.8181, + "step": 4715 + }, + { + "epoch": 2.37, + "learning_rate": 2.7071514925096762e-05, + "loss": 2.5951, + "step": 4720 + }, + { + "epoch": 2.37, + "learning_rate": 2.703222033229359e-05, + "loss": 2.8812, + "step": 4725 + }, + { + "epoch": 2.37, + "learning_rate": 2.6992920684832374e-05, + "loss": 2.8793, + "step": 4730 + }, + { + "epoch": 2.38, + "learning_rate": 2.6953616080461526e-05, + "loss": 2.7234, + "step": 4735 + }, + { + "epoch": 2.38, + "learning_rate": 2.6914306616941764e-05, + "loss": 2.566, + "step": 4740 + }, + { + "epoch": 2.38, + "learning_rate": 2.68749923920459e-05, + "loss": 2.7924, + "step": 4745 + }, + { + "epoch": 2.38, + "learning_rate": 2.683567350355859e-05, + "loss": 2.5467, + "step": 4750 + }, + { + "epoch": 2.39, + "learning_rate": 2.679635004927608e-05, + "loss": 2.8032, + "step": 4755 + }, + { + "epoch": 2.39, + "learning_rate": 2.6757022127006e-05, + "loss": 2.4976, + "step": 4760 + }, + { + "epoch": 2.39, + "learning_rate": 2.6717689834567055e-05, + "loss": 2.7968, + "step": 4765 + }, + { + "epoch": 2.39, + "learning_rate": 2.6678353269788854e-05, + "loss": 2.4099, + "step": 4770 + }, + { + "epoch": 2.4, + "learning_rate": 2.66390125305116e-05, + "loss": 2.4246, + "step": 4775 + }, + { + "epoch": 2.4, + "learning_rate": 
2.659966771458589e-05, + "loss": 2.7296, + "step": 4780 + }, + { + "epoch": 2.4, + "learning_rate": 2.656031891987249e-05, + "loss": 2.5013, + "step": 4785 + }, + { + "epoch": 2.4, + "learning_rate": 2.6520966244242024e-05, + "loss": 2.4029, + "step": 4790 + }, + { + "epoch": 2.41, + "learning_rate": 2.648160978557479e-05, + "loss": 2.405, + "step": 4795 + }, + { + "epoch": 2.41, + "learning_rate": 2.644224964176048e-05, + "loss": 2.5121, + "step": 4800 + }, + { + "epoch": 2.41, + "learning_rate": 2.6402885910697966e-05, + "loss": 2.7865, + "step": 4805 + }, + { + "epoch": 2.41, + "learning_rate": 2.6363518690295035e-05, + "loss": 2.3725, + "step": 4810 + }, + { + "epoch": 2.42, + "learning_rate": 2.632414807846816e-05, + "loss": 2.612, + "step": 4815 + }, + { + "epoch": 2.42, + "learning_rate": 2.6284774173142233e-05, + "loss": 2.5412, + "step": 4820 + }, + { + "epoch": 2.42, + "learning_rate": 2.624539707225036e-05, + "loss": 2.4828, + "step": 4825 + }, + { + "epoch": 2.42, + "learning_rate": 2.6206016873733574e-05, + "loss": 2.6912, + "step": 4830 + }, + { + "epoch": 2.43, + "learning_rate": 2.6166633675540635e-05, + "loss": 2.9305, + "step": 4835 + }, + { + "epoch": 2.43, + "learning_rate": 2.612724757562775e-05, + "loss": 2.4224, + "step": 4840 + }, + { + "epoch": 2.43, + "learning_rate": 2.608785867195834e-05, + "loss": 2.448, + "step": 4845 + }, + { + "epoch": 2.43, + "learning_rate": 2.60484670625028e-05, + "loss": 2.356, + "step": 4850 + }, + { + "epoch": 2.44, + "learning_rate": 2.600907284523827e-05, + "loss": 2.3987, + "step": 4855 + }, + { + "epoch": 2.44, + "learning_rate": 2.5969676118148358e-05, + "loss": 1.9985, + "step": 4860 + }, + { + "epoch": 2.44, + "learning_rate": 2.5930276979222927e-05, + "loss": 2.7017, + "step": 4865 + }, + { + "epoch": 2.44, + "learning_rate": 2.5890875526457836e-05, + "loss": 2.5223, + "step": 4870 + }, + { + "epoch": 2.45, + "learning_rate": 2.5851471857854697e-05, + "loss": 2.5558, + "step": 4875 + }, + { + "epoch": 
2.45, + "learning_rate": 2.5812066071420632e-05, + "loss": 2.3048, + "step": 4880 + }, + { + "epoch": 2.45, + "learning_rate": 2.5772658265168025e-05, + "loss": 2.6089, + "step": 4885 + }, + { + "epoch": 2.45, + "learning_rate": 2.5733248537114306e-05, + "loss": 2.6711, + "step": 4890 + }, + { + "epoch": 2.46, + "learning_rate": 2.569383698528166e-05, + "loss": 2.6202, + "step": 4895 + }, + { + "epoch": 2.46, + "learning_rate": 2.5654423707696833e-05, + "loss": 2.6619, + "step": 4900 + }, + { + "epoch": 2.46, + "learning_rate": 2.5615008802390834e-05, + "loss": 2.6442, + "step": 4905 + }, + { + "epoch": 2.46, + "learning_rate": 2.5575592367398733e-05, + "loss": 2.7031, + "step": 4910 + }, + { + "epoch": 2.47, + "learning_rate": 2.5536174500759417e-05, + "loss": 2.5043, + "step": 4915 + }, + { + "epoch": 2.47, + "learning_rate": 2.5496755300515323e-05, + "loss": 2.3442, + "step": 4920 + }, + { + "epoch": 2.47, + "learning_rate": 2.5457334864712206e-05, + "loss": 2.7005, + "step": 4925 + }, + { + "epoch": 2.47, + "learning_rate": 2.5417913291398892e-05, + "loss": 2.8544, + "step": 4930 + }, + { + "epoch": 2.48, + "learning_rate": 2.5378490678627043e-05, + "loss": 2.3004, + "step": 4935 + }, + { + "epoch": 2.48, + "learning_rate": 2.5339067124450887e-05, + "loss": 2.527, + "step": 4940 + }, + { + "epoch": 2.48, + "learning_rate": 2.5299642726927035e-05, + "loss": 2.689, + "step": 4945 + }, + { + "epoch": 2.48, + "learning_rate": 2.526021758411415e-05, + "loss": 2.7824, + "step": 4950 + }, + { + "epoch": 2.49, + "learning_rate": 2.5220791794072774e-05, + "loss": 2.5121, + "step": 4955 + }, + { + "epoch": 2.49, + "learning_rate": 2.518136545486504e-05, + "loss": 2.3836, + "step": 4960 + }, + { + "epoch": 2.49, + "learning_rate": 2.5141938664554482e-05, + "loss": 2.6548, + "step": 4965 + }, + { + "epoch": 2.49, + "learning_rate": 2.5102511521205718e-05, + "loss": 2.5556, + "step": 4970 + }, + { + "epoch": 2.5, + "learning_rate": 2.5063084122884267e-05, + "loss": 2.4746, 
+ "step": 4975 + }, + { + "epoch": 2.5, + "learning_rate": 2.5023656567656272e-05, + "loss": 2.5669, + "step": 4980 + }, + { + "epoch": 2.5, + "learning_rate": 2.498422895358827e-05, + "loss": 2.4125, + "step": 4985 + }, + { + "epoch": 2.5, + "learning_rate": 2.4944801378746935e-05, + "loss": 2.6531, + "step": 4990 + }, + { + "epoch": 2.51, + "learning_rate": 2.4905373941198864e-05, + "loss": 2.7697, + "step": 4995 + }, + { + "epoch": 2.51, + "learning_rate": 2.48659467390103e-05, + "loss": 2.5167, + "step": 5000 + }, + { + "epoch": 2.51, + "learning_rate": 2.4826519870246897e-05, + "loss": 2.6029, + "step": 5005 + }, + { + "epoch": 2.51, + "learning_rate": 2.478709343297349e-05, + "loss": 2.3878, + "step": 5010 + }, + { + "epoch": 2.52, + "learning_rate": 2.4747667525253825e-05, + "loss": 2.6694, + "step": 5015 + }, + { + "epoch": 2.52, + "learning_rate": 2.470824224515034e-05, + "loss": 2.606, + "step": 5020 + }, + { + "epoch": 2.52, + "learning_rate": 2.466881769072392e-05, + "loss": 2.6548, + "step": 5025 + }, + { + "epoch": 2.52, + "learning_rate": 2.4629393960033633e-05, + "loss": 2.2612, + "step": 5030 + }, + { + "epoch": 2.53, + "learning_rate": 2.4589971151136503e-05, + "loss": 2.6507, + "step": 5035 + }, + { + "epoch": 2.53, + "learning_rate": 2.4550549362087258e-05, + "loss": 2.4926, + "step": 5040 + }, + { + "epoch": 2.53, + "learning_rate": 2.45111286909381e-05, + "loss": 2.4079, + "step": 5045 + }, + { + "epoch": 2.53, + "learning_rate": 2.447170923573844e-05, + "loss": 2.5999, + "step": 5050 + }, + { + "epoch": 2.54, + "learning_rate": 2.443229109453467e-05, + "loss": 2.4915, + "step": 5055 + }, + { + "epoch": 2.54, + "learning_rate": 2.43928743653699e-05, + "loss": 2.4626, + "step": 5060 + }, + { + "epoch": 2.54, + "learning_rate": 2.4353459146283743e-05, + "loss": 2.405, + "step": 5065 + }, + { + "epoch": 2.54, + "learning_rate": 2.431404553531206e-05, + "loss": 2.857, + "step": 5070 + }, + { + "epoch": 2.55, + "learning_rate": 
2.42746336304867e-05, + "loss": 2.8089, + "step": 5075 + }, + { + "epoch": 2.55, + "learning_rate": 2.423522352983527e-05, + "loss": 2.5966, + "step": 5080 + }, + { + "epoch": 2.55, + "learning_rate": 2.41958153313809e-05, + "loss": 2.4951, + "step": 5085 + }, + { + "epoch": 2.55, + "learning_rate": 2.4156409133141967e-05, + "loss": 2.519, + "step": 5090 + }, + { + "epoch": 2.56, + "learning_rate": 2.4117005033131894e-05, + "loss": 2.4516, + "step": 5095 + }, + { + "epoch": 2.56, + "learning_rate": 2.40776031293589e-05, + "loss": 2.4225, + "step": 5100 + }, + { + "epoch": 2.56, + "learning_rate": 2.4038203519825676e-05, + "loss": 2.47, + "step": 5105 + }, + { + "epoch": 2.56, + "learning_rate": 2.399880630252928e-05, + "loss": 2.4996, + "step": 5110 + }, + { + "epoch": 2.57, + "learning_rate": 2.3959411575460777e-05, + "loss": 2.4737, + "step": 5115 + }, + { + "epoch": 2.57, + "learning_rate": 2.392001943660506e-05, + "loss": 2.636, + "step": 5120 + }, + { + "epoch": 2.57, + "learning_rate": 2.3880629983940572e-05, + "loss": 2.5773, + "step": 5125 + }, + { + "epoch": 2.57, + "learning_rate": 2.3841243315439077e-05, + "loss": 2.7459, + "step": 5130 + }, + { + "epoch": 2.58, + "learning_rate": 2.3801859529065416e-05, + "loss": 2.1073, + "step": 5135 + }, + { + "epoch": 2.58, + "learning_rate": 2.3762478722777264e-05, + "loss": 2.3601, + "step": 5140 + }, + { + "epoch": 2.58, + "learning_rate": 2.3723100994524873e-05, + "loss": 2.5435, + "step": 5145 + }, + { + "epoch": 2.58, + "learning_rate": 2.3683726442250853e-05, + "loss": 2.3054, + "step": 5150 + }, + { + "epoch": 2.59, + "learning_rate": 2.3644355163889908e-05, + "loss": 2.6588, + "step": 5155 + }, + { + "epoch": 2.59, + "learning_rate": 2.3604987257368596e-05, + "loss": 2.6469, + "step": 5160 + }, + { + "epoch": 2.59, + "learning_rate": 2.3565622820605096e-05, + "loss": 2.4276, + "step": 5165 + }, + { + "epoch": 2.59, + "learning_rate": 2.3526261951508947e-05, + "loss": 2.8147, + "step": 5170 + }, + { + 
"epoch": 2.6, + "learning_rate": 2.3486904747980817e-05, + "loss": 2.5942, + "step": 5175 + }, + { + "epoch": 2.6, + "learning_rate": 2.344755130791227e-05, + "loss": 2.7043, + "step": 5180 + }, + { + "epoch": 2.6, + "learning_rate": 2.340820172918549e-05, + "loss": 2.7551, + "step": 5185 + }, + { + "epoch": 2.6, + "learning_rate": 2.336885610967308e-05, + "loss": 2.4544, + "step": 5190 + }, + { + "epoch": 2.61, + "learning_rate": 2.3329514547237757e-05, + "loss": 2.5914, + "step": 5195 + }, + { + "epoch": 2.61, + "learning_rate": 2.3290177139732186e-05, + "loss": 2.5679, + "step": 5200 + }, + { + "epoch": 2.61, + "learning_rate": 2.325084398499868e-05, + "loss": 2.5207, + "step": 5205 + }, + { + "epoch": 2.61, + "learning_rate": 2.3211515180868972e-05, + "loss": 2.4572, + "step": 5210 + }, + { + "epoch": 2.62, + "learning_rate": 2.3172190825163987e-05, + "loss": 2.6281, + "step": 5215 + }, + { + "epoch": 2.62, + "learning_rate": 2.3132871015693566e-05, + "loss": 2.5488, + "step": 5220 + }, + { + "epoch": 2.62, + "learning_rate": 2.309355585025627e-05, + "loss": 2.74, + "step": 5225 + }, + { + "epoch": 2.62, + "learning_rate": 2.3054245426639078e-05, + "loss": 2.5737, + "step": 5230 + }, + { + "epoch": 2.63, + "learning_rate": 2.3014939842617197e-05, + "loss": 2.5987, + "step": 5235 + }, + { + "epoch": 2.63, + "learning_rate": 2.297563919595379e-05, + "loss": 2.5215, + "step": 5240 + }, + { + "epoch": 2.63, + "learning_rate": 2.293634358439973e-05, + "loss": 2.4514, + "step": 5245 + }, + { + "epoch": 2.63, + "learning_rate": 2.289705310569338e-05, + "loss": 2.8411, + "step": 5250 + }, + { + "epoch": 2.64, + "learning_rate": 2.2857767857560337e-05, + "loss": 2.9463, + "step": 5255 + }, + { + "epoch": 2.64, + "learning_rate": 2.281848793771318e-05, + "loss": 2.6361, + "step": 5260 + }, + { + "epoch": 2.64, + "learning_rate": 2.2779213443851233e-05, + "loss": 2.4919, + "step": 5265 + }, + { + "epoch": 2.64, + "learning_rate": 2.2739944473660332e-05, + "loss": 2.3893, 
+ "step": 5270 + }, + { + "epoch": 2.65, + "learning_rate": 2.2700681124812574e-05, + "loss": 2.6469, + "step": 5275 + }, + { + "epoch": 2.65, + "learning_rate": 2.2661423494966074e-05, + "loss": 2.7062, + "step": 5280 + }, + { + "epoch": 2.65, + "learning_rate": 2.2622171681764706e-05, + "loss": 2.6681, + "step": 5285 + }, + { + "epoch": 2.65, + "learning_rate": 2.2582925782837898e-05, + "loss": 2.5944, + "step": 5290 + }, + { + "epoch": 2.66, + "learning_rate": 2.254368589580036e-05, + "loss": 2.6699, + "step": 5295 + }, + { + "epoch": 2.66, + "learning_rate": 2.250445211825185e-05, + "loss": 2.7243, + "step": 5300 + }, + { + "epoch": 2.66, + "learning_rate": 2.2465224547776934e-05, + "loss": 2.5652, + "step": 5305 + }, + { + "epoch": 2.66, + "learning_rate": 2.2426003281944725e-05, + "loss": 2.4402, + "step": 5310 + }, + { + "epoch": 2.67, + "learning_rate": 2.238678841830867e-05, + "loss": 2.5059, + "step": 5315 + }, + { + "epoch": 2.67, + "learning_rate": 2.234758005440628e-05, + "loss": 2.5083, + "step": 5320 + }, + { + "epoch": 2.67, + "learning_rate": 2.2308378287758906e-05, + "loss": 2.3585, + "step": 5325 + }, + { + "epoch": 2.67, + "learning_rate": 2.22691832158715e-05, + "loss": 2.8297, + "step": 5330 + }, + { + "epoch": 2.68, + "learning_rate": 2.2229994936232346e-05, + "loss": 2.4266, + "step": 5335 + }, + { + "epoch": 2.68, + "learning_rate": 2.219081354631284e-05, + "loss": 2.3729, + "step": 5340 + }, + { + "epoch": 2.68, + "learning_rate": 2.2151639143567236e-05, + "loss": 2.4558, + "step": 5345 + }, + { + "epoch": 2.68, + "learning_rate": 2.211247182543242e-05, + "loss": 2.5797, + "step": 5350 + }, + { + "epoch": 2.69, + "learning_rate": 2.2073311689327648e-05, + "loss": 2.4579, + "step": 5355 + }, + { + "epoch": 2.69, + "learning_rate": 2.203415883265432e-05, + "loss": 2.5511, + "step": 5360 + }, + { + "epoch": 2.69, + "learning_rate": 2.1995013352795725e-05, + "loss": 2.4825, + "step": 5365 + }, + { + "epoch": 2.69, + "learning_rate": 
2.1955875347116808e-05, + "loss": 2.4933, + "step": 5370 + }, + { + "epoch": 2.7, + "learning_rate": 2.191674491296391e-05, + "loss": 2.434, + "step": 5375 + }, + { + "epoch": 2.7, + "learning_rate": 2.187762214766456e-05, + "loss": 2.5205, + "step": 5380 + }, + { + "epoch": 2.7, + "learning_rate": 2.1838507148527197e-05, + "loss": 2.5461, + "step": 5385 + }, + { + "epoch": 2.7, + "learning_rate": 2.179940001284095e-05, + "loss": 2.4265, + "step": 5390 + }, + { + "epoch": 2.71, + "learning_rate": 2.176030083787539e-05, + "loss": 2.6166, + "step": 5395 + }, + { + "epoch": 2.71, + "learning_rate": 2.1721209720880277e-05, + "loss": 2.5101, + "step": 5400 + }, + { + "epoch": 2.71, + "learning_rate": 2.168212675908536e-05, + "loss": 2.4824, + "step": 5405 + }, + { + "epoch": 2.71, + "learning_rate": 2.1643052049700066e-05, + "loss": 2.347, + "step": 5410 + }, + { + "epoch": 2.72, + "learning_rate": 2.1603985689913317e-05, + "loss": 2.5476, + "step": 5415 + }, + { + "epoch": 2.72, + "learning_rate": 2.1564927776893264e-05, + "loss": 2.456, + "step": 5420 + }, + { + "epoch": 2.72, + "learning_rate": 2.152587840778704e-05, + "loss": 2.4415, + "step": 5425 + }, + { + "epoch": 2.72, + "learning_rate": 2.1486837679720535e-05, + "loss": 2.7548, + "step": 5430 + }, + { + "epoch": 2.73, + "learning_rate": 2.144780568979816e-05, + "loss": 2.3779, + "step": 5435 + }, + { + "epoch": 2.73, + "learning_rate": 2.1408782535102566e-05, + "loss": 2.5284, + "step": 5440 + }, + { + "epoch": 2.73, + "learning_rate": 2.136976831269444e-05, + "loss": 2.6886, + "step": 5445 + }, + { + "epoch": 2.73, + "learning_rate": 2.133076311961226e-05, + "loss": 2.5138, + "step": 5450 + }, + { + "epoch": 2.74, + "learning_rate": 2.1291767052872035e-05, + "loss": 2.664, + "step": 5455 + }, + { + "epoch": 2.74, + "learning_rate": 2.1252780209467073e-05, + "loss": 2.4325, + "step": 5460 + }, + { + "epoch": 2.74, + "learning_rate": 2.121380268636775e-05, + "loss": 2.5471, + "step": 5465 + }, + { + "epoch": 
2.74, + "learning_rate": 2.1174834580521253e-05, + "loss": 2.4363, + "step": 5470 + }, + { + "epoch": 2.75, + "learning_rate": 2.113587598885135e-05, + "loss": 2.6272, + "step": 5475 + }, + { + "epoch": 2.75, + "learning_rate": 2.109692700825814e-05, + "loss": 2.5261, + "step": 5480 + }, + { + "epoch": 2.75, + "learning_rate": 2.105798773561783e-05, + "loss": 2.9262, + "step": 5485 + }, + { + "epoch": 2.75, + "learning_rate": 2.101905826778246e-05, + "loss": 2.3964, + "step": 5490 + }, + { + "epoch": 2.76, + "learning_rate": 2.0980138701579704e-05, + "loss": 2.6239, + "step": 5495 + }, + { + "epoch": 2.76, + "learning_rate": 2.0941229133812593e-05, + "loss": 2.53, + "step": 5500 + }, + { + "epoch": 2.76, + "learning_rate": 2.0902329661259293e-05, + "loss": 2.7035, + "step": 5505 + }, + { + "epoch": 2.77, + "learning_rate": 2.0863440380672856e-05, + "loss": 2.5909, + "step": 5510 + }, + { + "epoch": 2.77, + "learning_rate": 2.0824561388781005e-05, + "loss": 2.1592, + "step": 5515 + }, + { + "epoch": 2.77, + "learning_rate": 2.078569278228585e-05, + "loss": 2.5515, + "step": 5520 + }, + { + "epoch": 2.77, + "learning_rate": 2.0746834657863672e-05, + "loss": 2.6217, + "step": 5525 + }, + { + "epoch": 2.78, + "learning_rate": 2.0707987112164692e-05, + "loss": 2.6302, + "step": 5530 + }, + { + "epoch": 2.78, + "learning_rate": 2.0669150241812807e-05, + "loss": 2.3984, + "step": 5535 + }, + { + "epoch": 2.78, + "learning_rate": 2.0630324143405372e-05, + "loss": 2.6425, + "step": 5540 + }, + { + "epoch": 2.78, + "learning_rate": 2.0591508913512954e-05, + "loss": 2.6817, + "step": 5545 + }, + { + "epoch": 2.79, + "learning_rate": 2.055270464867906e-05, + "loss": 2.3904, + "step": 5550 + }, + { + "epoch": 2.79, + "learning_rate": 2.0513911445419936e-05, + "loss": 2.6625, + "step": 5555 + }, + { + "epoch": 2.79, + "learning_rate": 2.0475129400224337e-05, + "loss": 2.6876, + "step": 5560 + }, + { + "epoch": 2.79, + "learning_rate": 2.043635860955325e-05, + "loss": 2.4981, + 
"step": 5565 + }, + { + "epoch": 2.8, + "learning_rate": 2.039759916983966e-05, + "loss": 2.3809, + "step": 5570 + }, + { + "epoch": 2.8, + "learning_rate": 2.0358851177488326e-05, + "loss": 2.4396, + "step": 5575 + }, + { + "epoch": 2.8, + "learning_rate": 2.0320114728875538e-05, + "loss": 2.526, + "step": 5580 + }, + { + "epoch": 2.8, + "learning_rate": 2.028138992034887e-05, + "loss": 2.6806, + "step": 5585 + }, + { + "epoch": 2.81, + "learning_rate": 2.0242676848226948e-05, + "loss": 2.5842, + "step": 5590 + }, + { + "epoch": 2.81, + "learning_rate": 2.02039756087992e-05, + "loss": 2.4016, + "step": 5595 + }, + { + "epoch": 2.81, + "learning_rate": 2.0165286298325638e-05, + "loss": 2.5254, + "step": 5600 + }, + { + "epoch": 2.81, + "learning_rate": 2.0126609013036575e-05, + "loss": 2.4995, + "step": 5605 + }, + { + "epoch": 2.82, + "learning_rate": 2.0087943849132446e-05, + "loss": 2.473, + "step": 5610 + }, + { + "epoch": 2.82, + "learning_rate": 2.004929090278351e-05, + "loss": 2.668, + "step": 5615 + }, + { + "epoch": 2.82, + "learning_rate": 2.001065027012966e-05, + "loss": 2.693, + "step": 5620 + }, + { + "epoch": 2.82, + "learning_rate": 1.9972022047280154e-05, + "loss": 2.5113, + "step": 5625 + }, + { + "epoch": 2.83, + "learning_rate": 1.9933406330313374e-05, + "loss": 2.7322, + "step": 5630 + }, + { + "epoch": 2.83, + "learning_rate": 1.989480321527661e-05, + "loss": 2.6501, + "step": 5635 + }, + { + "epoch": 2.83, + "learning_rate": 1.9856212798185798e-05, + "loss": 2.546, + "step": 5640 + }, + { + "epoch": 2.83, + "learning_rate": 1.9817635175025295e-05, + "loss": 2.8634, + "step": 5645 + }, + { + "epoch": 2.84, + "learning_rate": 1.9779070441747638e-05, + "loss": 2.2475, + "step": 5650 + }, + { + "epoch": 2.84, + "learning_rate": 1.97405186942733e-05, + "loss": 2.5021, + "step": 5655 + }, + { + "epoch": 2.84, + "learning_rate": 1.9701980028490452e-05, + "loss": 2.4607, + "step": 5660 + }, + { + "epoch": 2.84, + "learning_rate": 
1.9663454540254744e-05, + "loss": 2.4587, + "step": 5665 + }, + { + "epoch": 2.85, + "learning_rate": 1.9624942325389032e-05, + "loss": 2.6975, + "step": 5670 + }, + { + "epoch": 2.85, + "learning_rate": 1.9586443479683164e-05, + "loss": 2.728, + "step": 5675 + }, + { + "epoch": 2.85, + "learning_rate": 1.9547958098893734e-05, + "loss": 2.6458, + "step": 5680 + }, + { + "epoch": 2.85, + "learning_rate": 1.9509486278743847e-05, + "loss": 2.7608, + "step": 5685 + }, + { + "epoch": 2.86, + "learning_rate": 1.9471028114922873e-05, + "loss": 2.7753, + "step": 5690 + }, + { + "epoch": 2.86, + "learning_rate": 1.9432583703086235e-05, + "loss": 2.3438, + "step": 5695 + }, + { + "epoch": 2.86, + "learning_rate": 1.9394153138855127e-05, + "loss": 2.5513, + "step": 5700 + }, + { + "epoch": 2.86, + "learning_rate": 1.9355736517816313e-05, + "loss": 2.6064, + "step": 5705 + }, + { + "epoch": 2.87, + "learning_rate": 1.9317333935521872e-05, + "loss": 2.6884, + "step": 5710 + }, + { + "epoch": 2.87, + "learning_rate": 1.927894548748897e-05, + "loss": 2.479, + "step": 5715 + }, + { + "epoch": 2.87, + "learning_rate": 1.9240571269199607e-05, + "loss": 2.741, + "step": 5720 + }, + { + "epoch": 2.87, + "learning_rate": 1.9202211376100427e-05, + "loss": 2.6736, + "step": 5725 + }, + { + "epoch": 2.88, + "learning_rate": 1.9163865903602374e-05, + "loss": 2.5357, + "step": 5730 + }, + { + "epoch": 2.88, + "learning_rate": 1.9125534947080574e-05, + "loss": 2.4667, + "step": 5735 + }, + { + "epoch": 2.88, + "learning_rate": 1.908721860187406e-05, + "loss": 2.3793, + "step": 5740 + }, + { + "epoch": 2.88, + "learning_rate": 1.904891696328548e-05, + "loss": 2.6379, + "step": 5745 + }, + { + "epoch": 2.89, + "learning_rate": 1.901063012658093e-05, + "loss": 2.5932, + "step": 5750 + }, + { + "epoch": 2.89, + "learning_rate": 1.897235818698969e-05, + "loss": 2.3479, + "step": 5755 + }, + { + "epoch": 2.89, + "learning_rate": 1.8934101239703973e-05, + "loss": 2.7664, + "step": 5760 + }, + { + 
"epoch": 2.89, + "learning_rate": 1.889585937987871e-05, + "loss": 2.6163, + "step": 5765 + }, + { + "epoch": 2.9, + "learning_rate": 1.885763270263131e-05, + "loss": 2.644, + "step": 5770 + }, + { + "epoch": 2.9, + "learning_rate": 1.881942130304142e-05, + "loss": 2.6584, + "step": 5775 + }, + { + "epoch": 2.9, + "learning_rate": 1.8781225276150675e-05, + "loss": 2.7552, + "step": 5780 + }, + { + "epoch": 2.9, + "learning_rate": 1.874304471696248e-05, + "loss": 2.3176, + "step": 5785 + }, + { + "epoch": 2.91, + "learning_rate": 1.8704879720441773e-05, + "loss": 2.9294, + "step": 5790 + }, + { + "epoch": 2.91, + "learning_rate": 1.8666730381514774e-05, + "loss": 2.5388, + "step": 5795 + }, + { + "epoch": 2.91, + "learning_rate": 1.8628596795068776e-05, + "loss": 2.5343, + "step": 5800 + }, + { + "epoch": 2.91, + "learning_rate": 1.859047905595187e-05, + "loss": 2.5239, + "step": 5805 + }, + { + "epoch": 2.92, + "learning_rate": 1.8552377258972747e-05, + "loss": 2.382, + "step": 5810 + }, + { + "epoch": 2.92, + "learning_rate": 1.851429149890044e-05, + "loss": 2.7965, + "step": 5815 + }, + { + "epoch": 2.92, + "learning_rate": 1.8476221870464083e-05, + "loss": 2.738, + "step": 5820 + }, + { + "epoch": 2.92, + "learning_rate": 1.84381684683527e-05, + "loss": 2.7431, + "step": 5825 + }, + { + "epoch": 2.93, + "learning_rate": 1.8400131387214964e-05, + "loss": 2.6551, + "step": 5830 + }, + { + "epoch": 2.93, + "learning_rate": 1.8362110721658927e-05, + "loss": 2.5836, + "step": 5835 + }, + { + "epoch": 2.93, + "learning_rate": 1.832410656625183e-05, + "loss": 2.3278, + "step": 5840 + }, + { + "epoch": 2.93, + "learning_rate": 1.8286119015519852e-05, + "loss": 2.5348, + "step": 5845 + }, + { + "epoch": 2.94, + "learning_rate": 1.8248148163947866e-05, + "loss": 2.388, + "step": 5850 + }, + { + "epoch": 2.94, + "learning_rate": 1.8210194105979205e-05, + "loss": 2.5839, + "step": 5855 + }, + { + "epoch": 2.94, + "learning_rate": 1.817225693601543e-05, + "loss": 2.7362, + 
"step": 5860 + }, + { + "epoch": 2.94, + "learning_rate": 1.8134336748416115e-05, + "loss": 2.2506, + "step": 5865 + }, + { + "epoch": 2.95, + "learning_rate": 1.8096433637498574e-05, + "loss": 2.6163, + "step": 5870 + }, + { + "epoch": 2.95, + "learning_rate": 1.8058547697537655e-05, + "loss": 2.6588, + "step": 5875 + }, + { + "epoch": 2.95, + "learning_rate": 1.802067902276551e-05, + "loss": 2.6955, + "step": 5880 + }, + { + "epoch": 2.95, + "learning_rate": 1.7982827707371326e-05, + "loss": 2.5438, + "step": 5885 + }, + { + "epoch": 2.96, + "learning_rate": 1.7944993845501118e-05, + "loss": 2.6858, + "step": 5890 + }, + { + "epoch": 2.96, + "learning_rate": 1.7907177531257507e-05, + "loss": 2.5499, + "step": 5895 + }, + { + "epoch": 2.96, + "learning_rate": 1.7869378858699452e-05, + "loss": 2.572, + "step": 5900 + }, + { + "epoch": 2.96, + "learning_rate": 1.783159792184203e-05, + "loss": 2.6499, + "step": 5905 + }, + { + "epoch": 2.97, + "learning_rate": 1.779383481465622e-05, + "loss": 2.6526, + "step": 5910 + }, + { + "epoch": 2.97, + "learning_rate": 1.775608963106863e-05, + "loss": 2.5916, + "step": 5915 + }, + { + "epoch": 2.97, + "learning_rate": 1.7718362464961314e-05, + "loss": 2.7463, + "step": 5920 + }, + { + "epoch": 2.97, + "learning_rate": 1.76806534101715e-05, + "loss": 2.6052, + "step": 5925 + }, + { + "epoch": 2.98, + "learning_rate": 1.764296256049137e-05, + "loss": 2.4542, + "step": 5930 + }, + { + "epoch": 2.98, + "learning_rate": 1.760529000966782e-05, + "loss": 2.5414, + "step": 5935 + }, + { + "epoch": 2.98, + "learning_rate": 1.7567635851402238e-05, + "loss": 2.6181, + "step": 5940 + }, + { + "epoch": 2.98, + "learning_rate": 1.753000017935026e-05, + "loss": 2.4822, + "step": 5945 + }, + { + "epoch": 2.99, + "learning_rate": 1.7492383087121546e-05, + "loss": 2.6495, + "step": 5950 + }, + { + "epoch": 2.99, + "learning_rate": 1.7454784668279546e-05, + "loss": 2.455, + "step": 5955 + }, + { + "epoch": 2.99, + "learning_rate": 
1.7417205016341258e-05, + "loss": 2.3959, + "step": 5960 + }, + { + "epoch": 2.99, + "learning_rate": 1.7379644224777004e-05, + "loss": 2.5139, + "step": 5965 + }, + { + "epoch": 3.0, + "learning_rate": 1.7342102387010194e-05, + "loss": 2.6985, + "step": 5970 + }, + { + "epoch": 3.0, + "learning_rate": 1.7304579596417104e-05, + "loss": 2.6598, + "step": 5975 + }, + { + "epoch": 3.0, + "learning_rate": 1.726707594632661e-05, + "loss": 2.6638, + "step": 5980 + }, + { + "epoch": 3.0, + "learning_rate": 1.7229591530020022e-05, + "loss": 2.446, + "step": 5985 + }, + { + "epoch": 3.01, + "learning_rate": 1.7192126440730784e-05, + "loss": 2.5736, + "step": 5990 + }, + { + "epoch": 3.01, + "learning_rate": 1.7154680771644242e-05, + "loss": 2.4385, + "step": 5995 + }, + { + "epoch": 3.01, + "learning_rate": 1.7117254615897497e-05, + "loss": 2.5651, + "step": 6000 + }, + { + "epoch": 3.01, + "learning_rate": 1.707984806657908e-05, + "loss": 2.5767, + "step": 6005 + }, + { + "epoch": 3.02, + "learning_rate": 1.7042461216728756e-05, + "loss": 2.5527, + "step": 6010 + }, + { + "epoch": 3.02, + "learning_rate": 1.7005094159337307e-05, + "loss": 2.4275, + "step": 6015 + }, + { + "epoch": 3.02, + "learning_rate": 1.6967746987346272e-05, + "loss": 2.5136, + "step": 6020 + }, + { + "epoch": 3.02, + "learning_rate": 1.6930419793647735e-05, + "loss": 2.5035, + "step": 6025 + }, + { + "epoch": 3.03, + "learning_rate": 1.6893112671084094e-05, + "loss": 2.3627, + "step": 6030 + }, + { + "epoch": 3.03, + "learning_rate": 1.6855825712447822e-05, + "loss": 2.5038, + "step": 6035 + }, + { + "epoch": 3.03, + "learning_rate": 1.6818559010481226e-05, + "loss": 2.3229, + "step": 6040 + }, + { + "epoch": 3.03, + "learning_rate": 1.6781312657876254e-05, + "loss": 2.3298, + "step": 6045 + }, + { + "epoch": 3.04, + "learning_rate": 1.6744086747274224e-05, + "loss": 2.5965, + "step": 6050 + }, + { + "epoch": 3.04, + "learning_rate": 1.67068813712656e-05, + "loss": 2.6745, + "step": 6055 + }, + { + 
"epoch": 3.04, + "learning_rate": 1.6669696622389797e-05, + "loss": 2.5358, + "step": 6060 + }, + { + "epoch": 3.04, + "learning_rate": 1.6632532593134907e-05, + "loss": 2.1572, + "step": 6065 + }, + { + "epoch": 3.05, + "learning_rate": 1.6595389375937488e-05, + "loss": 2.3413, + "step": 6070 + }, + { + "epoch": 3.05, + "learning_rate": 1.6558267063182342e-05, + "loss": 2.4968, + "step": 6075 + }, + { + "epoch": 3.05, + "learning_rate": 1.6521165747202276e-05, + "loss": 2.528, + "step": 6080 + }, + { + "epoch": 3.05, + "learning_rate": 1.6484085520277847e-05, + "loss": 2.4744, + "step": 6085 + }, + { + "epoch": 3.06, + "learning_rate": 1.6447026474637194e-05, + "loss": 2.5993, + "step": 6090 + }, + { + "epoch": 3.06, + "learning_rate": 1.640998870245575e-05, + "loss": 2.5334, + "step": 6095 + }, + { + "epoch": 3.06, + "learning_rate": 1.637297229585604e-05, + "loss": 2.3431, + "step": 6100 + }, + { + "epoch": 3.06, + "learning_rate": 1.633597734690746e-05, + "loss": 2.3361, + "step": 6105 + }, + { + "epoch": 3.07, + "learning_rate": 1.6299003947626017e-05, + "loss": 2.4223, + "step": 6110 + }, + { + "epoch": 3.07, + "learning_rate": 1.6262052189974125e-05, + "loss": 2.6129, + "step": 6115 + }, + { + "epoch": 3.07, + "learning_rate": 1.622512216586038e-05, + "loss": 2.5798, + "step": 6120 + }, + { + "epoch": 3.07, + "learning_rate": 1.61882139671393e-05, + "loss": 2.7164, + "step": 6125 + }, + { + "epoch": 3.08, + "learning_rate": 1.6151327685611127e-05, + "loss": 2.627, + "step": 6130 + }, + { + "epoch": 3.08, + "learning_rate": 1.6114463413021612e-05, + "loss": 2.5533, + "step": 6135 + }, + { + "epoch": 3.08, + "learning_rate": 1.6077621241061725e-05, + "loss": 2.4149, + "step": 6140 + }, + { + "epoch": 3.08, + "learning_rate": 1.6040801261367493e-05, + "loss": 2.5167, + "step": 6145 + }, + { + "epoch": 3.09, + "learning_rate": 1.6004003565519734e-05, + "loss": 2.3775, + "step": 6150 + }, + { + "epoch": 3.09, + "learning_rate": 1.596722824504385e-05, + "loss": 
2.7508, + "step": 6155 + }, + { + "epoch": 3.09, + "learning_rate": 1.5930475391409562e-05, + "loss": 2.2924, + "step": 6160 + }, + { + "epoch": 3.09, + "learning_rate": 1.5893745096030754e-05, + "loss": 2.6994, + "step": 6165 + }, + { + "epoch": 3.1, + "learning_rate": 1.5857037450265176e-05, + "loss": 2.4325, + "step": 6170 + }, + { + "epoch": 3.1, + "learning_rate": 1.5820352545414232e-05, + "loss": 2.7048, + "step": 6175 + }, + { + "epoch": 3.1, + "learning_rate": 1.5783690472722785e-05, + "loss": 2.5557, + "step": 6180 + }, + { + "epoch": 3.1, + "learning_rate": 1.5747051323378903e-05, + "loss": 2.5968, + "step": 6185 + }, + { + "epoch": 3.11, + "learning_rate": 1.5710435188513627e-05, + "loss": 2.8964, + "step": 6190 + }, + { + "epoch": 3.11, + "learning_rate": 1.5673842159200768e-05, + "loss": 2.4664, + "step": 6195 + }, + { + "epoch": 3.11, + "learning_rate": 1.5637272326456666e-05, + "loss": 2.6002, + "step": 6200 + }, + { + "epoch": 3.11, + "learning_rate": 1.560072578123995e-05, + "loss": 2.1335, + "step": 6205 + }, + { + "epoch": 3.12, + "learning_rate": 1.5564202614451352e-05, + "loss": 2.4466, + "step": 6210 + }, + { + "epoch": 3.12, + "learning_rate": 1.5527702916933436e-05, + "loss": 2.4236, + "step": 6215 + }, + { + "epoch": 3.12, + "learning_rate": 1.54912267794704e-05, + "loss": 2.6926, + "step": 6220 + }, + { + "epoch": 3.12, + "learning_rate": 1.5454774292787837e-05, + "loss": 2.6268, + "step": 6225 + }, + { + "epoch": 3.13, + "learning_rate": 1.541834554755252e-05, + "loss": 2.5849, + "step": 6230 + }, + { + "epoch": 3.13, + "learning_rate": 1.5381940634372165e-05, + "loss": 2.4988, + "step": 6235 + }, + { + "epoch": 3.13, + "learning_rate": 1.534555964379522e-05, + "loss": 2.8093, + "step": 6240 + }, + { + "epoch": 3.13, + "learning_rate": 1.5309202666310622e-05, + "loss": 2.6214, + "step": 6245 + }, + { + "epoch": 3.14, + "learning_rate": 1.5272869792347595e-05, + "loss": 2.4958, + "step": 6250 + }, + { + "epoch": 3.14, + "learning_rate": 
1.5236561112275394e-05, + "loss": 2.5731, + "step": 6255 + }, + { + "epoch": 3.14, + "learning_rate": 1.5200276716403103e-05, + "loss": 2.4501, + "step": 6260 + }, + { + "epoch": 3.14, + "learning_rate": 1.5164016694979411e-05, + "loss": 2.3793, + "step": 6265 + }, + { + "epoch": 3.15, + "learning_rate": 1.5127781138192374e-05, + "loss": 2.4751, + "step": 6270 + }, + { + "epoch": 3.15, + "learning_rate": 1.5091570136169206e-05, + "loss": 2.2213, + "step": 6275 + }, + { + "epoch": 3.15, + "learning_rate": 1.505538377897604e-05, + "loss": 2.4721, + "step": 6280 + }, + { + "epoch": 3.15, + "learning_rate": 1.5019222156617712e-05, + "loss": 2.5355, + "step": 6285 + }, + { + "epoch": 3.16, + "learning_rate": 1.4983085359037547e-05, + "loss": 2.6066, + "step": 6290 + }, + { + "epoch": 3.16, + "learning_rate": 1.4946973476117105e-05, + "loss": 2.5482, + "step": 6295 + }, + { + "epoch": 3.16, + "learning_rate": 1.4910886597675994e-05, + "loss": 2.6717, + "step": 6300 + }, + { + "epoch": 3.16, + "learning_rate": 1.4874824813471616e-05, + "loss": 2.5616, + "step": 6305 + }, + { + "epoch": 3.17, + "learning_rate": 1.4838788213198965e-05, + "loss": 2.5877, + "step": 6310 + }, + { + "epoch": 3.17, + "learning_rate": 1.48027768864904e-05, + "loss": 2.4685, + "step": 6315 + }, + { + "epoch": 3.17, + "learning_rate": 1.4766790922915405e-05, + "loss": 2.459, + "step": 6320 + }, + { + "epoch": 3.17, + "learning_rate": 1.4730830411980393e-05, + "loss": 2.4626, + "step": 6325 + }, + { + "epoch": 3.18, + "learning_rate": 1.469489544312846e-05, + "loss": 2.4486, + "step": 6330 + }, + { + "epoch": 3.18, + "learning_rate": 1.4658986105739175e-05, + "loss": 2.6828, + "step": 6335 + }, + { + "epoch": 3.18, + "learning_rate": 1.4623102489128353e-05, + "loss": 2.702, + "step": 6340 + }, + { + "epoch": 3.18, + "learning_rate": 1.4587244682547857e-05, + "loss": 2.8563, + "step": 6345 + }, + { + "epoch": 3.19, + "learning_rate": 1.4551412775185308e-05, + "loss": 2.5647, + "step": 6350 + }, + { + 
"epoch": 3.19, + "learning_rate": 1.4515606856163949e-05, + "loss": 2.5023, + "step": 6355 + }, + { + "epoch": 3.19, + "learning_rate": 1.4479827014542363e-05, + "loss": 2.347, + "step": 6360 + }, + { + "epoch": 3.19, + "learning_rate": 1.4444073339314284e-05, + "loss": 2.6892, + "step": 6365 + }, + { + "epoch": 3.2, + "learning_rate": 1.4408345919408359e-05, + "loss": 2.5874, + "step": 6370 + }, + { + "epoch": 3.2, + "learning_rate": 1.4372644843687922e-05, + "loss": 2.3453, + "step": 6375 + }, + { + "epoch": 3.2, + "learning_rate": 1.4336970200950794e-05, + "loss": 2.4236, + "step": 6380 + }, + { + "epoch": 3.2, + "learning_rate": 1.4301322079929053e-05, + "loss": 2.6329, + "step": 6385 + }, + { + "epoch": 3.21, + "learning_rate": 1.4265700569288792e-05, + "loss": 2.7761, + "step": 6390 + }, + { + "epoch": 3.21, + "learning_rate": 1.4230105757629936e-05, + "loss": 2.6791, + "step": 6395 + }, + { + "epoch": 3.21, + "learning_rate": 1.4194537733485994e-05, + "loss": 2.6064, + "step": 6400 + }, + { + "epoch": 3.21, + "learning_rate": 1.4158996585323841e-05, + "loss": 2.3809, + "step": 6405 + }, + { + "epoch": 3.22, + "learning_rate": 1.4123482401543531e-05, + "loss": 2.5205, + "step": 6410 + }, + { + "epoch": 3.22, + "learning_rate": 1.4087995270478021e-05, + "loss": 2.524, + "step": 6415 + }, + { + "epoch": 3.22, + "learning_rate": 1.4052535280392999e-05, + "loss": 2.2721, + "step": 6420 + }, + { + "epoch": 3.22, + "learning_rate": 1.401710251948663e-05, + "loss": 2.5879, + "step": 6425 + }, + { + "epoch": 3.23, + "learning_rate": 1.3981697075889372e-05, + "loss": 2.6147, + "step": 6430 + }, + { + "epoch": 3.23, + "learning_rate": 1.394631903766373e-05, + "loss": 2.5308, + "step": 6435 + }, + { + "epoch": 3.23, + "learning_rate": 1.3910968492804028e-05, + "loss": 2.4739, + "step": 6440 + }, + { + "epoch": 3.23, + "learning_rate": 1.3875645529236234e-05, + "loss": 2.4483, + "step": 6445 + }, + { + "epoch": 3.24, + "learning_rate": 1.3840350234817686e-05, + "loss": 
2.6367, + "step": 6450 + }, + { + "epoch": 3.24, + "learning_rate": 1.3805082697336943e-05, + "loss": 2.6567, + "step": 6455 + }, + { + "epoch": 3.24, + "learning_rate": 1.3769843004513489e-05, + "loss": 2.52, + "step": 6460 + }, + { + "epoch": 3.24, + "learning_rate": 1.3734631243997561e-05, + "loss": 2.6544, + "step": 6465 + }, + { + "epoch": 3.25, + "learning_rate": 1.3699447503369925e-05, + "loss": 2.3696, + "step": 6470 + }, + { + "epoch": 3.25, + "learning_rate": 1.3664291870141649e-05, + "loss": 2.4517, + "step": 6475 + }, + { + "epoch": 3.25, + "learning_rate": 1.3629164431753894e-05, + "loss": 2.6313, + "step": 6480 + }, + { + "epoch": 3.25, + "learning_rate": 1.3594065275577692e-05, + "loss": 2.4032, + "step": 6485 + }, + { + "epoch": 3.26, + "learning_rate": 1.3558994488913731e-05, + "loss": 2.7063, + "step": 6490 + }, + { + "epoch": 3.26, + "learning_rate": 1.3523952158992136e-05, + "loss": 2.6109, + "step": 6495 + }, + { + "epoch": 3.26, + "learning_rate": 1.3488938372972257e-05, + "loss": 2.633, + "step": 6500 + }, + { + "epoch": 3.26, + "learning_rate": 1.3453953217942436e-05, + "loss": 2.5565, + "step": 6505 + }, + { + "epoch": 3.27, + "learning_rate": 1.3418996780919804e-05, + "loss": 2.5866, + "step": 6510 + }, + { + "epoch": 3.27, + "learning_rate": 1.3384069148850087e-05, + "loss": 2.5992, + "step": 6515 + }, + { + "epoch": 3.27, + "learning_rate": 1.3349170408607342e-05, + "loss": 2.4388, + "step": 6520 + }, + { + "epoch": 3.27, + "learning_rate": 1.3314300646993771e-05, + "loss": 2.2734, + "step": 6525 + }, + { + "epoch": 3.28, + "learning_rate": 1.3279459950739489e-05, + "loss": 2.7683, + "step": 6530 + }, + { + "epoch": 3.28, + "learning_rate": 1.3244648406502331e-05, + "loss": 2.3653, + "step": 6535 + }, + { + "epoch": 3.28, + "learning_rate": 1.3209866100867613e-05, + "loss": 2.6401, + "step": 6540 + }, + { + "epoch": 3.28, + "learning_rate": 1.3175113120347943e-05, + "loss": 2.5218, + "step": 6545 + }, + { + "epoch": 3.29, + 
"learning_rate": 1.3140389551382975e-05, + "loss": 2.4681, + "step": 6550 + }, + { + "epoch": 3.29, + "learning_rate": 1.3105695480339206e-05, + "loss": 2.4681, + "step": 6555 + }, + { + "epoch": 3.29, + "learning_rate": 1.3071030993509788e-05, + "loss": 2.5743, + "step": 6560 + }, + { + "epoch": 3.29, + "learning_rate": 1.303639617711427e-05, + "loss": 2.5423, + "step": 6565 + }, + { + "epoch": 3.3, + "learning_rate": 1.3001791117298395e-05, + "loss": 2.4267, + "step": 6570 + }, + { + "epoch": 3.3, + "learning_rate": 1.2967215900133911e-05, + "loss": 2.5537, + "step": 6575 + }, + { + "epoch": 3.3, + "learning_rate": 1.2932670611618336e-05, + "loss": 2.5451, + "step": 6580 + }, + { + "epoch": 3.3, + "learning_rate": 1.2898155337674744e-05, + "loss": 2.4048, + "step": 6585 + }, + { + "epoch": 3.31, + "learning_rate": 1.2863670164151551e-05, + "loss": 2.6769, + "step": 6590 + }, + { + "epoch": 3.31, + "learning_rate": 1.2829215176822316e-05, + "loss": 2.2118, + "step": 6595 + }, + { + "epoch": 3.31, + "learning_rate": 1.2794790461385508e-05, + "loss": 2.2912, + "step": 6600 + }, + { + "epoch": 3.31, + "learning_rate": 1.2760396103464309e-05, + "loss": 2.3978, + "step": 6605 + }, + { + "epoch": 3.32, + "learning_rate": 1.2726032188606388e-05, + "loss": 2.4801, + "step": 6610 + }, + { + "epoch": 3.32, + "learning_rate": 1.2691698802283697e-05, + "loss": 2.5522, + "step": 6615 + }, + { + "epoch": 3.32, + "learning_rate": 1.2657396029892258e-05, + "loss": 2.6728, + "step": 6620 + }, + { + "epoch": 3.32, + "learning_rate": 1.2623123956751943e-05, + "loss": 2.2937, + "step": 6625 + }, + { + "epoch": 3.33, + "learning_rate": 1.258888266810627e-05, + "loss": 2.5459, + "step": 6630 + }, + { + "epoch": 3.33, + "learning_rate": 1.2554672249122187e-05, + "loss": 2.6329, + "step": 6635 + }, + { + "epoch": 3.33, + "learning_rate": 1.2520492784889865e-05, + "loss": 2.4845, + "step": 6640 + }, + { + "epoch": 3.33, + "learning_rate": 1.2486344360422475e-05, + "loss": 2.5023, + 
"step": 6645 + }, + { + "epoch": 3.34, + "learning_rate": 1.2452227060655993e-05, + "loss": 2.5674, + "step": 6650 + }, + { + "epoch": 3.34, + "learning_rate": 1.2418140970448975e-05, + "loss": 2.4996, + "step": 6655 + }, + { + "epoch": 3.34, + "learning_rate": 1.2384086174582336e-05, + "loss": 2.4704, + "step": 6660 + }, + { + "epoch": 3.34, + "learning_rate": 1.2350062757759193e-05, + "loss": 2.4928, + "step": 6665 + }, + { + "epoch": 3.35, + "learning_rate": 1.2316070804604576e-05, + "loss": 2.5498, + "step": 6670 + }, + { + "epoch": 3.35, + "learning_rate": 1.228211039966528e-05, + "loss": 2.5641, + "step": 6675 + }, + { + "epoch": 3.35, + "learning_rate": 1.2248181627409619e-05, + "loss": 2.5725, + "step": 6680 + }, + { + "epoch": 3.35, + "learning_rate": 1.221428457222723e-05, + "loss": 2.5827, + "step": 6685 + }, + { + "epoch": 3.36, + "learning_rate": 1.2180419318428868e-05, + "loss": 2.3591, + "step": 6690 + }, + { + "epoch": 3.36, + "learning_rate": 1.2146585950246186e-05, + "loss": 2.5772, + "step": 6695 + }, + { + "epoch": 3.36, + "learning_rate": 1.2112784551831533e-05, + "loss": 2.4008, + "step": 6700 + }, + { + "epoch": 3.36, + "learning_rate": 1.2079015207257724e-05, + "loss": 2.3334, + "step": 6705 + }, + { + "epoch": 3.37, + "learning_rate": 1.2045278000517857e-05, + "loss": 2.8023, + "step": 6710 + }, + { + "epoch": 3.37, + "learning_rate": 1.2011573015525118e-05, + "loss": 2.6145, + "step": 6715 + }, + { + "epoch": 3.37, + "learning_rate": 1.1977900336112519e-05, + "loss": 2.6568, + "step": 6720 + }, + { + "epoch": 3.37, + "learning_rate": 1.1944260046032735e-05, + "loss": 2.1771, + "step": 6725 + }, + { + "epoch": 3.38, + "learning_rate": 1.1910652228957872e-05, + "loss": 2.4932, + "step": 6730 + }, + { + "epoch": 3.38, + "learning_rate": 1.187707696847927e-05, + "loss": 2.3883, + "step": 6735 + }, + { + "epoch": 3.38, + "learning_rate": 1.1843534348107294e-05, + "loss": 2.7792, + "step": 6740 + }, + { + "epoch": 3.38, + "learning_rate": 
1.1810024451271125e-05, + "loss": 2.5825, + "step": 6745 + }, + { + "epoch": 3.39, + "learning_rate": 1.1776547361318551e-05, + "loss": 2.406, + "step": 6750 + }, + { + "epoch": 3.39, + "learning_rate": 1.1743103161515762e-05, + "loss": 2.5823, + "step": 6755 + }, + { + "epoch": 3.39, + "learning_rate": 1.1709691935047137e-05, + "loss": 2.7587, + "step": 6760 + }, + { + "epoch": 3.39, + "learning_rate": 1.1676313765015038e-05, + "loss": 2.5183, + "step": 6765 + }, + { + "epoch": 3.4, + "learning_rate": 1.1642968734439633e-05, + "loss": 2.6452, + "step": 6770 + }, + { + "epoch": 3.4, + "learning_rate": 1.1609656926258634e-05, + "loss": 2.4641, + "step": 6775 + }, + { + "epoch": 3.4, + "learning_rate": 1.1576378423327131e-05, + "loss": 2.6462, + "step": 6780 + }, + { + "epoch": 3.4, + "learning_rate": 1.1543133308417378e-05, + "loss": 2.5271, + "step": 6785 + }, + { + "epoch": 3.41, + "learning_rate": 1.1509921664218587e-05, + "loss": 2.4245, + "step": 6790 + }, + { + "epoch": 3.41, + "learning_rate": 1.14767435733367e-05, + "loss": 2.3622, + "step": 6795 + }, + { + "epoch": 3.41, + "learning_rate": 1.1443599118294227e-05, + "loss": 2.5564, + "step": 6800 + }, + { + "epoch": 3.41, + "learning_rate": 1.1410488381530005e-05, + "loss": 2.342, + "step": 6805 + }, + { + "epoch": 3.42, + "learning_rate": 1.1377411445399006e-05, + "loss": 2.4976, + "step": 6810 + }, + { + "epoch": 3.42, + "learning_rate": 1.1344368392172125e-05, + "loss": 2.4792, + "step": 6815 + }, + { + "epoch": 3.42, + "learning_rate": 1.1311359304036013e-05, + "loss": 2.4829, + "step": 6820 + }, + { + "epoch": 3.42, + "learning_rate": 1.1278384263092797e-05, + "loss": 2.3949, + "step": 6825 + }, + { + "epoch": 3.43, + "learning_rate": 1.124544335135995e-05, + "loss": 2.5555, + "step": 6830 + }, + { + "epoch": 3.43, + "learning_rate": 1.1212536650770041e-05, + "loss": 2.5479, + "step": 6835 + }, + { + "epoch": 3.43, + "learning_rate": 1.1179664243170554e-05, + "loss": 2.5333, + "step": 6840 + }, + { + 
"epoch": 3.43, + "learning_rate": 1.1146826210323677e-05, + "loss": 2.0832, + "step": 6845 + }, + { + "epoch": 3.44, + "learning_rate": 1.1114022633906096e-05, + "loss": 2.7639, + "step": 6850 + }, + { + "epoch": 3.44, + "learning_rate": 1.10812535955088e-05, + "loss": 2.663, + "step": 6855 + }, + { + "epoch": 3.44, + "learning_rate": 1.104851917663687e-05, + "loss": 2.485, + "step": 6860 + }, + { + "epoch": 3.44, + "learning_rate": 1.1015819458709279e-05, + "loss": 2.3004, + "step": 6865 + }, + { + "epoch": 3.45, + "learning_rate": 1.0983154523058687e-05, + "loss": 2.3924, + "step": 6870 + }, + { + "epoch": 3.45, + "learning_rate": 1.095052445093124e-05, + "loss": 2.4694, + "step": 6875 + }, + { + "epoch": 3.45, + "learning_rate": 1.0917929323486398e-05, + "loss": 2.5255, + "step": 6880 + }, + { + "epoch": 3.46, + "learning_rate": 1.0885369221796657e-05, + "loss": 2.211, + "step": 6885 + }, + { + "epoch": 3.46, + "learning_rate": 1.0852844226847425e-05, + "loss": 2.5446, + "step": 6890 + }, + { + "epoch": 3.46, + "learning_rate": 1.0820354419536786e-05, + "loss": 2.778, + "step": 6895 + }, + { + "epoch": 3.46, + "learning_rate": 1.0787899880675298e-05, + "loss": 2.5628, + "step": 6900 + }, + { + "epoch": 3.47, + "learning_rate": 1.0755480690985803e-05, + "loss": 2.5333, + "step": 6905 + }, + { + "epoch": 3.47, + "learning_rate": 1.0723096931103218e-05, + "loss": 2.7511, + "step": 6910 + }, + { + "epoch": 3.47, + "learning_rate": 1.0690748681574336e-05, + "loss": 2.2807, + "step": 6915 + }, + { + "epoch": 3.47, + "learning_rate": 1.0658436022857617e-05, + "loss": 2.5652, + "step": 6920 + }, + { + "epoch": 3.48, + "learning_rate": 1.062615903532303e-05, + "loss": 2.7855, + "step": 6925 + }, + { + "epoch": 3.48, + "learning_rate": 1.0593917799251785e-05, + "loss": 2.5029, + "step": 6930 + }, + { + "epoch": 3.48, + "learning_rate": 1.0561712394836184e-05, + "loss": 2.3403, + "step": 6935 + }, + { + "epoch": 3.48, + "learning_rate": 1.0529542902179406e-05, + "loss": 
2.748, + "step": 6940 + }, + { + "epoch": 3.49, + "learning_rate": 1.0497409401295303e-05, + "loss": 2.4717, + "step": 6945 + }, + { + "epoch": 3.49, + "learning_rate": 1.0465311972108214e-05, + "loss": 2.6532, + "step": 6950 + }, + { + "epoch": 3.49, + "learning_rate": 1.043325069445275e-05, + "loss": 2.3954, + "step": 6955 + }, + { + "epoch": 3.49, + "learning_rate": 1.0401225648073612e-05, + "loss": 2.4491, + "step": 6960 + }, + { + "epoch": 3.5, + "learning_rate": 1.0369236912625377e-05, + "loss": 2.8167, + "step": 6965 + }, + { + "epoch": 3.5, + "learning_rate": 1.0337284567672314e-05, + "loss": 2.4416, + "step": 6970 + }, + { + "epoch": 3.5, + "learning_rate": 1.0305368692688174e-05, + "loss": 2.4095, + "step": 6975 + }, + { + "epoch": 3.5, + "learning_rate": 1.0273489367056002e-05, + "loss": 2.6135, + "step": 6980 + }, + { + "epoch": 3.51, + "learning_rate": 1.0241646670067932e-05, + "loss": 2.7131, + "step": 6985 + }, + { + "epoch": 3.51, + "learning_rate": 1.0209840680924993e-05, + "loss": 2.45, + "step": 6990 + }, + { + "epoch": 3.51, + "learning_rate": 1.0178071478736914e-05, + "loss": 2.4902, + "step": 6995 + }, + { + "epoch": 3.51, + "learning_rate": 1.0146339142521926e-05, + "loss": 2.572, + "step": 7000 + }, + { + "epoch": 3.52, + "learning_rate": 1.0114643751206562e-05, + "loss": 2.3915, + "step": 7005 + }, + { + "epoch": 3.52, + "learning_rate": 1.0082985383625468e-05, + "loss": 2.3651, + "step": 7010 + }, + { + "epoch": 3.52, + "learning_rate": 1.0051364118521197e-05, + "loss": 2.0744, + "step": 7015 + }, + { + "epoch": 3.52, + "learning_rate": 1.0019780034544022e-05, + "loss": 2.3651, + "step": 7020 + }, + { + "epoch": 3.53, + "learning_rate": 9.988233210251723e-06, + "loss": 2.423, + "step": 7025 + }, + { + "epoch": 3.53, + "learning_rate": 9.956723724109441e-06, + "loss": 2.7706, + "step": 7030 + }, + { + "epoch": 3.53, + "learning_rate": 9.925251654489415e-06, + "loss": 2.4113, + "step": 7035 + }, + { + "epoch": 3.53, + "learning_rate": 
9.893817079670825e-06, + "loss": 2.2758, + "step": 7040 + }, + { + "epoch": 3.54, + "learning_rate": 9.8624200778396e-06, + "loss": 2.4551, + "step": 7045 + }, + { + "epoch": 3.54, + "learning_rate": 9.831060727088215e-06, + "loss": 2.3271, + "step": 7050 + }, + { + "epoch": 3.54, + "learning_rate": 9.799739105415483e-06, + "loss": 2.348, + "step": 7055 + }, + { + "epoch": 3.54, + "learning_rate": 9.768455290726402e-06, + "loss": 2.803, + "step": 7060 + }, + { + "epoch": 3.55, + "learning_rate": 9.737209360831895e-06, + "loss": 2.4977, + "step": 7065 + }, + { + "epoch": 3.55, + "learning_rate": 9.70600139344868e-06, + "loss": 2.6904, + "step": 7070 + }, + { + "epoch": 3.55, + "learning_rate": 9.67483146619907e-06, + "loss": 2.4839, + "step": 7075 + }, + { + "epoch": 3.55, + "learning_rate": 9.64369965661073e-06, + "loss": 2.6168, + "step": 7080 + }, + { + "epoch": 3.56, + "learning_rate": 9.612606042116535e-06, + "loss": 2.3343, + "step": 7085 + }, + { + "epoch": 3.56, + "learning_rate": 9.581550700054345e-06, + "loss": 2.4697, + "step": 7090 + }, + { + "epoch": 3.56, + "learning_rate": 9.550533707666842e-06, + "loss": 2.7164, + "step": 7095 + }, + { + "epoch": 3.56, + "learning_rate": 9.519555142101311e-06, + "loss": 2.5116, + "step": 7100 + }, + { + "epoch": 3.57, + "learning_rate": 9.488615080409468e-06, + "loss": 2.4768, + "step": 7105 + }, + { + "epoch": 3.57, + "learning_rate": 9.457713599547252e-06, + "loss": 2.4756, + "step": 7110 + }, + { + "epoch": 3.57, + "learning_rate": 9.426850776374646e-06, + "loss": 2.4257, + "step": 7115 + }, + { + "epoch": 3.57, + "learning_rate": 9.396026687655483e-06, + "loss": 2.5385, + "step": 7120 + }, + { + "epoch": 3.58, + "learning_rate": 9.365241410057246e-06, + "loss": 2.5497, + "step": 7125 + }, + { + "epoch": 3.58, + "learning_rate": 9.334495020150885e-06, + "loss": 2.5848, + "step": 7130 + }, + { + "epoch": 3.58, + "learning_rate": 9.303787594410648e-06, + "loss": 2.5811, + "step": 7135 + }, + { + "epoch": 3.58, + 
"learning_rate": 9.273119209213841e-06, + "loss": 2.2504, + "step": 7140 + }, + { + "epoch": 3.59, + "learning_rate": 9.242489940840684e-06, + "loss": 2.4348, + "step": 7145 + }, + { + "epoch": 3.59, + "learning_rate": 9.211899865474086e-06, + "loss": 2.6538, + "step": 7150 + }, + { + "epoch": 3.59, + "learning_rate": 9.181349059199484e-06, + "loss": 2.9365, + "step": 7155 + }, + { + "epoch": 3.59, + "learning_rate": 9.150837598004648e-06, + "loss": 2.4267, + "step": 7160 + }, + { + "epoch": 3.6, + "learning_rate": 9.120365557779472e-06, + "loss": 2.3872, + "step": 7165 + }, + { + "epoch": 3.6, + "learning_rate": 9.089933014315818e-06, + "loss": 2.5116, + "step": 7170 + }, + { + "epoch": 3.6, + "learning_rate": 9.059540043307293e-06, + "loss": 2.3202, + "step": 7175 + }, + { + "epoch": 3.6, + "learning_rate": 9.029186720349078e-06, + "loss": 2.7859, + "step": 7180 + }, + { + "epoch": 3.61, + "learning_rate": 8.998873120937762e-06, + "loss": 2.6064, + "step": 7185 + }, + { + "epoch": 3.61, + "learning_rate": 8.968599320471102e-06, + "loss": 2.8572, + "step": 7190 + }, + { + "epoch": 3.61, + "learning_rate": 8.938365394247877e-06, + "loss": 2.4965, + "step": 7195 + }, + { + "epoch": 3.61, + "learning_rate": 8.908171417467692e-06, + "loss": 2.7261, + "step": 7200 + }, + { + "epoch": 3.62, + "learning_rate": 8.878017465230778e-06, + "loss": 2.6582, + "step": 7205 + }, + { + "epoch": 3.62, + "learning_rate": 8.847903612537826e-06, + "loss": 2.3756, + "step": 7210 + }, + { + "epoch": 3.62, + "learning_rate": 8.817829934289775e-06, + "loss": 2.5582, + "step": 7215 + }, + { + "epoch": 3.62, + "learning_rate": 8.787796505287657e-06, + "loss": 2.6091, + "step": 7220 + }, + { + "epoch": 3.63, + "learning_rate": 8.757803400232379e-06, + "loss": 2.5523, + "step": 7225 + }, + { + "epoch": 3.63, + "learning_rate": 8.727850693724558e-06, + "loss": 2.6721, + "step": 7230 + }, + { + "epoch": 3.63, + "learning_rate": 8.697938460264326e-06, + "loss": 2.6035, + "step": 7235 + }, + { + 
"epoch": 3.63, + "learning_rate": 8.668066774251158e-06, + "loss": 2.4755, + "step": 7240 + }, + { + "epoch": 3.64, + "learning_rate": 8.638235709983664e-06, + "loss": 2.6591, + "step": 7245 + }, + { + "epoch": 3.64, + "learning_rate": 8.608445341659423e-06, + "loss": 2.3781, + "step": 7250 + }, + { + "epoch": 3.64, + "learning_rate": 8.578695743374798e-06, + "loss": 2.5149, + "step": 7255 + }, + { + "epoch": 3.64, + "learning_rate": 8.548986989124737e-06, + "loss": 2.6264, + "step": 7260 + }, + { + "epoch": 3.65, + "learning_rate": 8.519319152802601e-06, + "loss": 2.638, + "step": 7265 + }, + { + "epoch": 3.65, + "learning_rate": 8.489692308199981e-06, + "loss": 2.4959, + "step": 7270 + }, + { + "epoch": 3.65, + "learning_rate": 8.460106529006511e-06, + "loss": 2.3365, + "step": 7275 + }, + { + "epoch": 3.65, + "learning_rate": 8.430561888809676e-06, + "loss": 2.3178, + "step": 7280 + }, + { + "epoch": 3.66, + "learning_rate": 8.401058461094643e-06, + "loss": 2.5691, + "step": 7285 + }, + { + "epoch": 3.66, + "learning_rate": 8.371596319244087e-06, + "loss": 2.4521, + "step": 7290 + }, + { + "epoch": 3.66, + "learning_rate": 8.342175536537975e-06, + "loss": 2.6887, + "step": 7295 + }, + { + "epoch": 3.66, + "learning_rate": 8.312796186153405e-06, + "loss": 2.2551, + "step": 7300 + }, + { + "epoch": 3.67, + "learning_rate": 8.283458341164432e-06, + "loss": 2.5463, + "step": 7305 + }, + { + "epoch": 3.67, + "learning_rate": 8.254162074541868e-06, + "loss": 2.6583, + "step": 7310 + }, + { + "epoch": 3.67, + "learning_rate": 8.224907459153114e-06, + "loss": 2.5084, + "step": 7315 + }, + { + "epoch": 3.67, + "learning_rate": 8.195694567761968e-06, + "loss": 2.3259, + "step": 7320 + }, + { + "epoch": 3.68, + "learning_rate": 8.166523473028465e-06, + "loss": 2.3955, + "step": 7325 + }, + { + "epoch": 3.68, + "learning_rate": 8.137394247508644e-06, + "loss": 2.5088, + "step": 7330 + }, + { + "epoch": 3.68, + "learning_rate": 8.108306963654452e-06, + "loss": 2.5981, + 
"step": 7335 + }, + { + "epoch": 3.68, + "learning_rate": 8.079261693813487e-06, + "loss": 2.6233, + "step": 7340 + }, + { + "epoch": 3.69, + "learning_rate": 8.05025851022885e-06, + "loss": 2.4973, + "step": 7345 + }, + { + "epoch": 3.69, + "learning_rate": 8.02129748503897e-06, + "loss": 2.5353, + "step": 7350 + }, + { + "epoch": 3.69, + "learning_rate": 7.992378690277416e-06, + "loss": 2.5229, + "step": 7355 + }, + { + "epoch": 3.69, + "learning_rate": 7.96350219787271e-06, + "loss": 2.2312, + "step": 7360 + }, + { + "epoch": 3.7, + "learning_rate": 7.93466807964817e-06, + "loss": 2.6011, + "step": 7365 + }, + { + "epoch": 3.7, + "learning_rate": 7.905876407321711e-06, + "loss": 2.4813, + "step": 7370 + }, + { + "epoch": 3.7, + "learning_rate": 7.87712725250567e-06, + "loss": 2.4722, + "step": 7375 + }, + { + "epoch": 3.7, + "learning_rate": 7.848420686706643e-06, + "loss": 2.6481, + "step": 7380 + }, + { + "epoch": 3.71, + "learning_rate": 7.819756781325285e-06, + "loss": 2.5964, + "step": 7385 + }, + { + "epoch": 3.71, + "learning_rate": 7.791135607656147e-06, + "loss": 2.4698, + "step": 7390 + }, + { + "epoch": 3.71, + "learning_rate": 7.762557236887507e-06, + "loss": 2.6941, + "step": 7395 + }, + { + "epoch": 3.71, + "learning_rate": 7.734021740101168e-06, + "loss": 2.6679, + "step": 7400 + }, + { + "epoch": 3.72, + "learning_rate": 7.705529188272295e-06, + "loss": 2.7456, + "step": 7405 + }, + { + "epoch": 3.72, + "learning_rate": 7.67707965226924e-06, + "loss": 2.3179, + "step": 7410 + }, + { + "epoch": 3.72, + "learning_rate": 7.64867320285337e-06, + "loss": 2.5265, + "step": 7415 + }, + { + "epoch": 3.72, + "learning_rate": 7.620309910678866e-06, + "loss": 2.4766, + "step": 7420 + }, + { + "epoch": 3.73, + "learning_rate": 7.59198984629258e-06, + "loss": 2.5339, + "step": 7425 + }, + { + "epoch": 3.73, + "learning_rate": 7.56371308013385e-06, + "loss": 2.5665, + "step": 7430 + }, + { + "epoch": 3.73, + "learning_rate": 7.535479682534302e-06, + "loss": 
2.6048, + "step": 7435 + }, + { + "epoch": 3.73, + "learning_rate": 7.50728972371772e-06, + "loss": 2.5792, + "step": 7440 + }, + { + "epoch": 3.74, + "learning_rate": 7.479143273799818e-06, + "loss": 2.6327, + "step": 7445 + }, + { + "epoch": 3.74, + "learning_rate": 7.451040402788109e-06, + "loss": 2.4764, + "step": 7450 + }, + { + "epoch": 3.74, + "learning_rate": 7.4229811805817065e-06, + "loss": 2.5359, + "step": 7455 + }, + { + "epoch": 3.74, + "learning_rate": 7.394965676971158e-06, + "loss": 2.3672, + "step": 7460 + }, + { + "epoch": 3.75, + "learning_rate": 7.3669939616382744e-06, + "loss": 2.2471, + "step": 7465 + }, + { + "epoch": 3.75, + "learning_rate": 7.33906610415595e-06, + "loss": 2.2825, + "step": 7470 + }, + { + "epoch": 3.75, + "learning_rate": 7.311182173987999e-06, + "loss": 2.8013, + "step": 7475 + }, + { + "epoch": 3.75, + "learning_rate": 7.283342240488972e-06, + "loss": 2.6741, + "step": 7480 + }, + { + "epoch": 3.76, + "learning_rate": 7.25554637290399e-06, + "loss": 2.4988, + "step": 7485 + }, + { + "epoch": 3.76, + "learning_rate": 7.227794640368573e-06, + "loss": 2.6571, + "step": 7490 + }, + { + "epoch": 3.76, + "learning_rate": 7.2000871119084575e-06, + "loss": 2.7367, + "step": 7495 + }, + { + "epoch": 3.76, + "learning_rate": 7.172423856439459e-06, + "loss": 2.6667, + "step": 7500 + }, + { + "epoch": 3.77, + "learning_rate": 7.144804942767231e-06, + "loss": 2.6674, + "step": 7505 + }, + { + "epoch": 3.77, + "learning_rate": 7.117230439587172e-06, + "loss": 2.8285, + "step": 7510 + }, + { + "epoch": 3.77, + "learning_rate": 7.0952028586902694e-06, + "loss": 2.512, + "step": 7515 + }, + { + "epoch": 3.77, + "learning_rate": 7.067708467155793e-06, + "loss": 2.6181, + "step": 7520 + }, + { + "epoch": 3.78, + "learning_rate": 7.040258677872366e-06, + "loss": 2.5267, + "step": 7525 + }, + { + "epoch": 3.78, + "learning_rate": 7.012853559114737e-06, + "loss": 2.4466, + "step": 7530 + }, + { + "epoch": 3.78, + "learning_rate": 
6.985493179046529e-06, + "loss": 2.1915, + "step": 7535 + }, + { + "epoch": 3.78, + "learning_rate": 6.958177605720082e-06, + "loss": 2.427, + "step": 7540 + }, + { + "epoch": 3.79, + "learning_rate": 6.930906907076301e-06, + "loss": 2.3777, + "step": 7545 + }, + { + "epoch": 3.79, + "learning_rate": 6.9036811509444715e-06, + "loss": 2.3888, + "step": 7550 + }, + { + "epoch": 3.79, + "learning_rate": 6.8765004050421075e-06, + "loss": 2.6918, + "step": 7555 + }, + { + "epoch": 3.79, + "learning_rate": 6.849364736974745e-06, + "loss": 2.4888, + "step": 7560 + }, + { + "epoch": 3.8, + "learning_rate": 6.822274214235819e-06, + "loss": 2.5234, + "step": 7565 + }, + { + "epoch": 3.8, + "learning_rate": 6.7952289042064655e-06, + "loss": 2.5397, + "step": 7570 + }, + { + "epoch": 3.8, + "learning_rate": 6.768228874155388e-06, + "loss": 2.6419, + "step": 7575 + }, + { + "epoch": 3.8, + "learning_rate": 6.741274191238642e-06, + "loss": 2.5339, + "step": 7580 + }, + { + "epoch": 3.81, + "learning_rate": 6.7143649224995056e-06, + "loss": 2.4222, + "step": 7585 + }, + { + "epoch": 3.81, + "learning_rate": 6.68750113486829e-06, + "loss": 2.4719, + "step": 7590 + }, + { + "epoch": 3.81, + "learning_rate": 6.660682895162191e-06, + "loss": 2.6034, + "step": 7595 + }, + { + "epoch": 3.81, + "learning_rate": 6.6339102700851144e-06, + "loss": 2.4438, + "step": 7600 + }, + { + "epoch": 3.82, + "learning_rate": 6.607183326227509e-06, + "loss": 2.6244, + "step": 7605 + }, + { + "epoch": 3.82, + "learning_rate": 6.580502130066201e-06, + "loss": 2.4553, + "step": 7610 + }, + { + "epoch": 3.82, + "learning_rate": 6.5538667479642376e-06, + "loss": 2.7125, + "step": 7615 + }, + { + "epoch": 3.82, + "learning_rate": 6.527277246170702e-06, + "loss": 2.5128, + "step": 7620 + }, + { + "epoch": 3.83, + "learning_rate": 6.500733690820571e-06, + "loss": 2.3843, + "step": 7625 + }, + { + "epoch": 3.83, + "learning_rate": 6.474236147934529e-06, + "loss": 2.5529, + "step": 7630 + }, + { + "epoch": 
3.83, + "learning_rate": 6.4477846834188425e-06, + "loss": 2.6161, + "step": 7635 + }, + { + "epoch": 3.83, + "learning_rate": 6.421379363065142e-06, + "loss": 2.4355, + "step": 7640 + }, + { + "epoch": 3.84, + "learning_rate": 6.395020252550302e-06, + "loss": 2.3781, + "step": 7645 + }, + { + "epoch": 3.84, + "learning_rate": 6.368707417436237e-06, + "loss": 2.3661, + "step": 7650 + }, + { + "epoch": 3.84, + "learning_rate": 6.34244092316979e-06, + "loss": 2.3473, + "step": 7655 + }, + { + "epoch": 3.84, + "learning_rate": 6.316220835082528e-06, + "loss": 2.6448, + "step": 7660 + }, + { + "epoch": 3.85, + "learning_rate": 6.290047218390605e-06, + "loss": 2.5152, + "step": 7665 + }, + { + "epoch": 3.85, + "learning_rate": 6.2639201381945705e-06, + "loss": 2.317, + "step": 7670 + }, + { + "epoch": 3.85, + "learning_rate": 6.237839659479239e-06, + "loss": 2.4129, + "step": 7675 + }, + { + "epoch": 3.85, + "learning_rate": 6.2118058471135195e-06, + "loss": 2.196, + "step": 7680 + }, + { + "epoch": 3.86, + "learning_rate": 6.185818765850238e-06, + "loss": 2.6682, + "step": 7685 + }, + { + "epoch": 3.86, + "learning_rate": 6.159878480325995e-06, + "loss": 2.4091, + "step": 7690 + }, + { + "epoch": 3.86, + "learning_rate": 6.133985055060992e-06, + "loss": 2.7428, + "step": 7695 + }, + { + "epoch": 3.86, + "learning_rate": 6.108138554458881e-06, + "loss": 2.4382, + "step": 7700 + }, + { + "epoch": 3.87, + "learning_rate": 6.082339042806601e-06, + "loss": 2.5848, + "step": 7705 + }, + { + "epoch": 3.87, + "learning_rate": 6.056586584274218e-06, + "loss": 2.2978, + "step": 7710 + }, + { + "epoch": 3.87, + "learning_rate": 6.030881242914757e-06, + "loss": 2.6048, + "step": 7715 + }, + { + "epoch": 3.87, + "learning_rate": 6.005223082664063e-06, + "loss": 2.4201, + "step": 7720 + }, + { + "epoch": 3.88, + "learning_rate": 5.9796121673406174e-06, + "loss": 2.4145, + "step": 7725 + }, + { + "epoch": 3.88, + "learning_rate": 5.954048560645398e-06, + "loss": 2.3145, + "step": 
7730 + }, + { + "epoch": 3.88, + "learning_rate": 5.928532326161712e-06, + "loss": 2.6399, + "step": 7735 + }, + { + "epoch": 3.88, + "learning_rate": 5.9030635273550404e-06, + "loss": 2.6007, + "step": 7740 + }, + { + "epoch": 3.89, + "learning_rate": 5.8776422275728774e-06, + "loss": 2.3568, + "step": 7745 + }, + { + "epoch": 3.89, + "learning_rate": 5.8522684900445765e-06, + "loss": 2.63, + "step": 7750 + }, + { + "epoch": 3.89, + "learning_rate": 5.826942377881195e-06, + "loss": 2.3783, + "step": 7755 + }, + { + "epoch": 3.89, + "learning_rate": 5.8016639540753234e-06, + "loss": 2.6271, + "step": 7760 + }, + { + "epoch": 3.9, + "learning_rate": 5.776433281500951e-06, + "loss": 2.4406, + "step": 7765 + }, + { + "epoch": 3.9, + "learning_rate": 5.75125042291329e-06, + "loss": 2.4872, + "step": 7770 + }, + { + "epoch": 3.9, + "learning_rate": 5.726115440948626e-06, + "loss": 2.3784, + "step": 7775 + }, + { + "epoch": 3.9, + "learning_rate": 5.70102839812417e-06, + "loss": 2.3692, + "step": 7780 + }, + { + "epoch": 3.91, + "learning_rate": 5.675989356837879e-06, + "loss": 2.2512, + "step": 7785 + }, + { + "epoch": 3.91, + "learning_rate": 5.6509983793683525e-06, + "loss": 2.3779, + "step": 7790 + }, + { + "epoch": 3.91, + "learning_rate": 5.626055527874605e-06, + "loss": 2.59, + "step": 7795 + }, + { + "epoch": 3.91, + "learning_rate": 5.601160864395971e-06, + "loss": 2.3533, + "step": 7800 + }, + { + "epoch": 3.92, + "learning_rate": 5.576314450851922e-06, + "loss": 2.8845, + "step": 7805 + }, + { + "epoch": 3.92, + "learning_rate": 5.5515163490419155e-06, + "loss": 2.6464, + "step": 7810 + }, + { + "epoch": 3.92, + "learning_rate": 5.526766620645258e-06, + "loss": 2.5061, + "step": 7815 + }, + { + "epoch": 3.92, + "learning_rate": 5.5020653272209235e-06, + "loss": 2.3723, + "step": 7820 + }, + { + "epoch": 3.93, + "learning_rate": 5.477412530207435e-06, + "loss": 2.5008, + "step": 7825 + }, + { + "epoch": 3.93, + "learning_rate": 5.452808290922656e-06, + "loss": 
2.7899, + "step": 7830 + }, + { + "epoch": 3.93, + "learning_rate": 5.428252670563721e-06, + "loss": 2.6914, + "step": 7835 + }, + { + "epoch": 3.93, + "learning_rate": 5.403745730206811e-06, + "loss": 2.7228, + "step": 7840 + }, + { + "epoch": 3.94, + "learning_rate": 5.379287530807023e-06, + "loss": 2.4227, + "step": 7845 + }, + { + "epoch": 3.94, + "learning_rate": 5.354878133198237e-06, + "loss": 2.5355, + "step": 7850 + }, + { + "epoch": 3.94, + "learning_rate": 5.33051759809294e-06, + "loss": 2.4151, + "step": 7855 + }, + { + "epoch": 3.94, + "learning_rate": 5.3062059860820915e-06, + "loss": 2.5943, + "step": 7860 + }, + { + "epoch": 3.95, + "learning_rate": 5.281943357634961e-06, + "loss": 2.5721, + "step": 7865 + }, + { + "epoch": 3.95, + "learning_rate": 5.257729773098988e-06, + "loss": 2.2355, + "step": 7870 + }, + { + "epoch": 3.95, + "learning_rate": 5.233565292699624e-06, + "loss": 2.6952, + "step": 7875 + }, + { + "epoch": 3.95, + "learning_rate": 5.209449976540187e-06, + "loss": 2.5381, + "step": 7880 + }, + { + "epoch": 3.96, + "learning_rate": 5.1853838846017135e-06, + "loss": 2.4474, + "step": 7885 + }, + { + "epoch": 3.96, + "learning_rate": 5.161367076742796e-06, + "loss": 2.4288, + "step": 7890 + }, + { + "epoch": 3.96, + "learning_rate": 5.1373996126994646e-06, + "loss": 2.5151, + "step": 7895 + }, + { + "epoch": 3.96, + "learning_rate": 5.113481552085001e-06, + "loss": 2.4231, + "step": 7900 + }, + { + "epoch": 3.97, + "learning_rate": 5.089612954389814e-06, + "loss": 2.5372, + "step": 7905 + }, + { + "epoch": 3.97, + "learning_rate": 5.06579387898129e-06, + "loss": 2.5082, + "step": 7910 + }, + { + "epoch": 3.97, + "learning_rate": 5.042024385103624e-06, + "loss": 2.3919, + "step": 7915 + }, + { + "epoch": 3.97, + "learning_rate": 5.018304531877704e-06, + "loss": 2.4821, + "step": 7920 + }, + { + "epoch": 3.98, + "learning_rate": 4.9946343783009495e-06, + "loss": 2.5556, + "step": 7925 + }, + { + "epoch": 3.98, + "learning_rate": 
4.971013983247158e-06, + "loss": 2.4618, + "step": 7930 + }, + { + "epoch": 3.98, + "learning_rate": 4.947443405466357e-06, + "loss": 2.6289, + "step": 7935 + }, + { + "epoch": 3.98, + "learning_rate": 4.923922703584691e-06, + "loss": 2.652, + "step": 7940 + }, + { + "epoch": 3.99, + "learning_rate": 4.9004519361042275e-06, + "loss": 2.5852, + "step": 7945 + }, + { + "epoch": 3.99, + "learning_rate": 4.877031161402843e-06, + "loss": 2.3628, + "step": 7950 + }, + { + "epoch": 3.99, + "learning_rate": 4.853660437734062e-06, + "loss": 2.5503, + "step": 7955 + }, + { + "epoch": 3.99, + "learning_rate": 4.8303398232269255e-06, + "loss": 2.6815, + "step": 7960 + }, + { + "epoch": 4.0, + "learning_rate": 4.807069375885842e-06, + "loss": 2.6943, + "step": 7965 + }, + { + "epoch": 4.0, + "learning_rate": 4.783849153590436e-06, + "loss": 2.3604, + "step": 7970 + }, + { + "epoch": 4.0, + "learning_rate": 4.760679214095409e-06, + "loss": 2.4289, + "step": 7975 + }, + { + "epoch": 4.0, + "learning_rate": 4.737559615030402e-06, + "loss": 2.3585, + "step": 7980 + }, + { + "epoch": 4.01, + "learning_rate": 4.714490413899839e-06, + "loss": 2.3335, + "step": 7985 + }, + { + "epoch": 4.01, + "learning_rate": 4.6914716680828e-06, + "loss": 2.5913, + "step": 7990 + }, + { + "epoch": 4.01, + "learning_rate": 4.668503434832852e-06, + "loss": 2.6069, + "step": 7995 + }, + { + "epoch": 4.01, + "learning_rate": 4.645585771277961e-06, + "loss": 2.332, + "step": 8000 + }, + { + "epoch": 4.02, + "learning_rate": 4.6227187344202675e-06, + "loss": 2.6009, + "step": 8005 + }, + { + "epoch": 4.02, + "learning_rate": 4.599902381136021e-06, + "loss": 2.4393, + "step": 8010 + }, + { + "epoch": 4.02, + "learning_rate": 4.577136768175391e-06, + "loss": 2.3213, + "step": 8015 + }, + { + "epoch": 4.02, + "learning_rate": 4.5544219521623576e-06, + "loss": 2.3454, + "step": 8020 + }, + { + "epoch": 4.03, + "learning_rate": 4.531757989594543e-06, + "loss": 2.6343, + "step": 8025 + }, + { + "epoch": 4.03, + 
"learning_rate": 4.5091449368430935e-06, + "loss": 2.6261, + "step": 8030 + }, + { + "epoch": 4.03, + "learning_rate": 4.486582850152523e-06, + "loss": 2.2482, + "step": 8035 + }, + { + "epoch": 4.03, + "learning_rate": 4.464071785640575e-06, + "loss": 2.5495, + "step": 8040 + }, + { + "epoch": 4.04, + "learning_rate": 4.4416117992981066e-06, + "loss": 2.1255, + "step": 8045 + }, + { + "epoch": 4.04, + "learning_rate": 4.419202946988912e-06, + "loss": 2.4457, + "step": 8050 + }, + { + "epoch": 4.04, + "learning_rate": 4.396845284449608e-06, + "loss": 2.4967, + "step": 8055 + }, + { + "epoch": 4.04, + "learning_rate": 4.374538867289488e-06, + "loss": 2.489, + "step": 8060 + }, + { + "epoch": 4.05, + "learning_rate": 4.352283750990388e-06, + "loss": 2.7544, + "step": 8065 + }, + { + "epoch": 4.05, + "learning_rate": 4.330079990906541e-06, + "loss": 2.2603, + "step": 8070 + }, + { + "epoch": 4.05, + "learning_rate": 4.307927642264448e-06, + "loss": 2.509, + "step": 8075 + }, + { + "epoch": 4.05, + "learning_rate": 4.285826760162731e-06, + "loss": 2.501, + "step": 8080 + }, + { + "epoch": 4.06, + "learning_rate": 4.2637773995720086e-06, + "loss": 2.5447, + "step": 8085 + }, + { + "epoch": 4.06, + "learning_rate": 4.241779615334748e-06, + "loss": 2.384, + "step": 8090 + }, + { + "epoch": 4.06, + "learning_rate": 4.219833462165132e-06, + "loss": 2.556, + "step": 8095 + }, + { + "epoch": 4.06, + "learning_rate": 4.197938994648923e-06, + "loss": 2.1611, + "step": 8100 + }, + { + "epoch": 4.07, + "learning_rate": 4.176096267243332e-06, + "loss": 2.7722, + "step": 8105 + }, + { + "epoch": 4.07, + "learning_rate": 4.154305334276876e-06, + "loss": 2.4201, + "step": 8110 + }, + { + "epoch": 4.07, + "learning_rate": 4.1325662499492435e-06, + "loss": 2.0671, + "step": 8115 + }, + { + "epoch": 4.07, + "learning_rate": 4.110879068331169e-06, + "loss": 2.4085, + "step": 8120 + }, + { + "epoch": 4.08, + "learning_rate": 4.089243843364285e-06, + "loss": 2.2808, + "step": 8125 + }, + { 
+ "epoch": 4.08, + "learning_rate": 4.0676606288609945e-06, + "loss": 2.7176, + "step": 8130 + }, + { + "epoch": 4.08, + "learning_rate": 4.0461294785043426e-06, + "loss": 2.7093, + "step": 8135 + }, + { + "epoch": 4.08, + "learning_rate": 4.024650445847872e-06, + "loss": 2.7265, + "step": 8140 + }, + { + "epoch": 4.09, + "learning_rate": 4.003223584315491e-06, + "loss": 2.5104, + "step": 8145 + }, + { + "epoch": 4.09, + "learning_rate": 3.981848947201364e-06, + "loss": 2.5089, + "step": 8150 + }, + { + "epoch": 4.09, + "learning_rate": 3.960526587669741e-06, + "loss": 2.1579, + "step": 8155 + }, + { + "epoch": 4.09, + "learning_rate": 3.939256558754848e-06, + "loss": 2.5799, + "step": 8160 + }, + { + "epoch": 4.1, + "learning_rate": 3.918038913360753e-06, + "loss": 2.5948, + "step": 8165 + }, + { + "epoch": 4.1, + "learning_rate": 3.896873704261231e-06, + "loss": 2.4415, + "step": 8170 + }, + { + "epoch": 4.1, + "learning_rate": 3.875760984099641e-06, + "loss": 2.3362, + "step": 8175 + }, + { + "epoch": 4.1, + "learning_rate": 3.854700805388786e-06, + "loss": 2.0676, + "step": 8180 + }, + { + "epoch": 4.11, + "learning_rate": 3.83369322051077e-06, + "loss": 2.4296, + "step": 8185 + }, + { + "epoch": 4.11, + "learning_rate": 3.8127382817169018e-06, + "loss": 2.3855, + "step": 8190 + }, + { + "epoch": 4.11, + "learning_rate": 3.791836041127533e-06, + "loss": 2.4406, + "step": 8195 + }, + { + "epoch": 4.11, + "learning_rate": 3.770986550731956e-06, + "loss": 2.759, + "step": 8200 + }, + { + "epoch": 4.12, + "learning_rate": 3.750189862388248e-06, + "loss": 2.5292, + "step": 8205 + }, + { + "epoch": 4.12, + "learning_rate": 3.729446027823155e-06, + "loss": 2.6489, + "step": 8210 + }, + { + "epoch": 4.12, + "learning_rate": 3.7087550986319636e-06, + "loss": 2.4049, + "step": 8215 + }, + { + "epoch": 4.12, + "learning_rate": 3.68811712627837e-06, + "loss": 2.6333, + "step": 8220 + }, + { + "epoch": 4.13, + "learning_rate": 3.667532162094353e-06, + "loss": 2.6278, + 
"step": 8225 + }, + { + "epoch": 4.13, + "learning_rate": 3.6470002572800507e-06, + "loss": 2.3188, + "step": 8230 + }, + { + "epoch": 4.13, + "learning_rate": 3.6265214629036233e-06, + "loss": 2.6722, + "step": 8235 + }, + { + "epoch": 4.13, + "learning_rate": 3.606095829901132e-06, + "loss": 2.7608, + "step": 8240 + }, + { + "epoch": 4.14, + "learning_rate": 3.5857234090764163e-06, + "loss": 2.6491, + "step": 8245 + }, + { + "epoch": 4.14, + "learning_rate": 3.5654042511009596e-06, + "loss": 2.433, + "step": 8250 + }, + { + "epoch": 4.14, + "learning_rate": 3.5451384065137593e-06, + "loss": 2.4662, + "step": 8255 + }, + { + "epoch": 4.15, + "learning_rate": 3.524925925721234e-06, + "loss": 2.5401, + "step": 8260 + }, + { + "epoch": 4.15, + "learning_rate": 3.504766858997044e-06, + "loss": 2.3117, + "step": 8265 + }, + { + "epoch": 4.15, + "learning_rate": 3.4846612564820193e-06, + "loss": 2.4978, + "step": 8270 + }, + { + "epoch": 4.15, + "learning_rate": 3.464609168183977e-06, + "loss": 2.5536, + "step": 8275 + }, + { + "epoch": 4.16, + "learning_rate": 3.4446106439776664e-06, + "loss": 2.6747, + "step": 8280 + }, + { + "epoch": 4.16, + "learning_rate": 3.42466573360459e-06, + "loss": 2.4949, + "step": 8285 + }, + { + "epoch": 4.16, + "learning_rate": 3.4047744866729016e-06, + "loss": 2.4502, + "step": 8290 + }, + { + "epoch": 4.16, + "learning_rate": 3.3849369526572834e-06, + "loss": 2.3337, + "step": 8295 + }, + { + "epoch": 4.17, + "learning_rate": 3.365153180898814e-06, + "loss": 2.4335, + "step": 8300 + }, + { + "epoch": 4.17, + "learning_rate": 3.3454232206048626e-06, + "loss": 2.3121, + "step": 8305 + }, + { + "epoch": 4.17, + "learning_rate": 3.3257471208489435e-06, + "loss": 2.4353, + "step": 8310 + }, + { + "epoch": 4.17, + "learning_rate": 3.306124930570609e-06, + "loss": 2.3915, + "step": 8315 + }, + { + "epoch": 4.18, + "learning_rate": 3.286556698575327e-06, + "loss": 2.2757, + "step": 8320 + }, + { + "epoch": 4.18, + "learning_rate": 
3.2670424735343597e-06, + "loss": 2.6466, + "step": 8325 + }, + { + "epoch": 4.18, + "learning_rate": 3.2475823039846283e-06, + "loss": 2.3025, + "step": 8330 + }, + { + "epoch": 4.18, + "learning_rate": 3.2281762383286203e-06, + "loss": 2.427, + "step": 8335 + }, + { + "epoch": 4.19, + "learning_rate": 3.208824324834239e-06, + "loss": 2.3778, + "step": 8340 + }, + { + "epoch": 4.19, + "learning_rate": 3.1895266116347023e-06, + "loss": 2.3377, + "step": 8345 + }, + { + "epoch": 4.19, + "learning_rate": 3.170283146728423e-06, + "loss": 2.4054, + "step": 8350 + }, + { + "epoch": 4.19, + "learning_rate": 3.1510939779788777e-06, + "loss": 2.4784, + "step": 8355 + }, + { + "epoch": 4.2, + "learning_rate": 3.131959153114494e-06, + "loss": 2.641, + "step": 8360 + }, + { + "epoch": 4.2, + "learning_rate": 3.1128787197285376e-06, + "loss": 2.2387, + "step": 8365 + }, + { + "epoch": 4.2, + "learning_rate": 3.093852725278987e-06, + "loss": 2.3799, + "step": 8370 + }, + { + "epoch": 4.2, + "learning_rate": 3.07488121708841e-06, + "loss": 2.5895, + "step": 8375 + }, + { + "epoch": 4.21, + "learning_rate": 3.0559642423438616e-06, + "loss": 2.5519, + "step": 8380 + }, + { + "epoch": 4.21, + "learning_rate": 3.037101848096757e-06, + "loss": 2.3567, + "step": 8385 + }, + { + "epoch": 4.21, + "learning_rate": 3.0182940812627488e-06, + "loss": 2.376, + "step": 8390 + }, + { + "epoch": 4.21, + "learning_rate": 2.9995409886216267e-06, + "loss": 2.5246, + "step": 8395 + }, + { + "epoch": 4.22, + "learning_rate": 2.980842616817184e-06, + "loss": 2.5203, + "step": 8400 + }, + { + "epoch": 4.22, + "learning_rate": 2.9621990123571057e-06, + "loss": 2.5897, + "step": 8405 + }, + { + "epoch": 4.22, + "learning_rate": 2.9436102216128747e-06, + "loss": 2.2612, + "step": 8410 + }, + { + "epoch": 4.22, + "learning_rate": 2.92507629081962e-06, + "loss": 2.4448, + "step": 8415 + }, + { + "epoch": 4.23, + "learning_rate": 2.906597266076022e-06, + "loss": 2.8432, + "step": 8420 + }, + { + "epoch": 
4.23, + "learning_rate": 2.888173193344204e-06, + "loss": 2.8288, + "step": 8425 + }, + { + "epoch": 4.23, + "learning_rate": 2.8698041184496012e-06, + "loss": 2.5462, + "step": 8430 + }, + { + "epoch": 4.23, + "learning_rate": 2.8514900870808576e-06, + "loss": 2.5016, + "step": 8435 + }, + { + "epoch": 4.24, + "learning_rate": 2.833231144789711e-06, + "loss": 2.5918, + "step": 8440 + }, + { + "epoch": 4.24, + "learning_rate": 2.815027336990883e-06, + "loss": 2.57, + "step": 8445 + }, + { + "epoch": 4.24, + "learning_rate": 2.796878708961939e-06, + "loss": 2.6271, + "step": 8450 + }, + { + "epoch": 4.24, + "learning_rate": 2.778785305843232e-06, + "loss": 2.4207, + "step": 8455 + }, + { + "epoch": 4.25, + "learning_rate": 2.7607471726377316e-06, + "loss": 2.5623, + "step": 8460 + }, + { + "epoch": 4.25, + "learning_rate": 2.74276435421095e-06, + "loss": 2.3429, + "step": 8465 + }, + { + "epoch": 4.25, + "learning_rate": 2.7248368952908053e-06, + "loss": 2.4522, + "step": 8470 + }, + { + "epoch": 4.25, + "learning_rate": 2.7069648404675326e-06, + "loss": 2.5546, + "step": 8475 + }, + { + "epoch": 4.26, + "learning_rate": 2.689148234193556e-06, + "loss": 2.9038, + "step": 8480 + }, + { + "epoch": 4.26, + "learning_rate": 2.6713871207833892e-06, + "loss": 2.5678, + "step": 8485 + }, + { + "epoch": 4.26, + "learning_rate": 2.6536815444135194e-06, + "loss": 2.6918, + "step": 8490 + }, + { + "epoch": 4.26, + "learning_rate": 2.636031549122295e-06, + "loss": 2.5969, + "step": 8495 + }, + { + "epoch": 4.27, + "learning_rate": 2.6184371788098268e-06, + "loss": 2.5104, + "step": 8500 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 4.5006379520271974e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-8500/training_args.bin b/checkpoint-8500/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-8500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475 diff --git a/checkpoint-9000/README.md b/checkpoint-9000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db85c509aab3b2b4dc43e3e1a95f02f6a288d246 --- /dev/null +++ b/checkpoint-9000/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: ../chatglm3-6b-base +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-9000/adapter_config.json b/checkpoint-9000/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..84772af5a908c08db81ec34a3261fe08581e8855 --- /dev/null +++ b/checkpoint-9000/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../chatglm3-6b-base", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-9000/adapter_model.safetensors b/checkpoint-9000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4529120132a381e448439712c4895d684c3631de --- /dev/null +++ b/checkpoint-9000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c25fbc141b317057794ddd8680ffc9327e534cedb9612dfaa84ee72d2786bf76 +size 7807744 diff --git a/checkpoint-9000/optimizer.pt b/checkpoint-9000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b22ab228ff4f6cca4285dadc2037725c0d26997 --- /dev/null +++ b/checkpoint-9000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a9c49feda1e94b3ef1f5fcc105306cba971d2b8b674f5edf63f99c293a3ccaf +size 15644485 diff --git a/checkpoint-9000/rng_state.pth b/checkpoint-9000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9999fd39c5ae6992c22b09c2e98328fffb951f5d --- /dev/null +++ b/checkpoint-9000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e13b9a969415bcc1cf89226a211298a1fd67f87b6a922e265ce0f35573be4b6 +size 14575 diff --git 
a/checkpoint-9000/scheduler.pt b/checkpoint-9000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0d60a95d81ed042f7cbbb0863d61bddaa8eb361 --- /dev/null +++ b/checkpoint-9000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39d8e7c9c7884c8be31774ddc781cd5e444e9dcad2f6415440e81a2a72a101c5 +size 627 diff --git a/checkpoint-9000/special_tokens_map.json b/checkpoint-9000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..dd02cd16ef3e1cfed3ce0f8cd09b983412317a48 --- /dev/null +++ b/checkpoint-9000/special_tokens_map.json @@ -0,0 +1,18 @@ +{ + "additional_special_tokens": [ + { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ] +} diff --git a/checkpoint-9000/tokenization_chatglm.py b/checkpoint-9000/tokenization_chatglm.py new file mode 100644 index 0000000000000000000000000000000000000000..50e44b05e4b3e54d2f1c3f0cab8247ea53a7d4e5 --- /dev/null +++ b/checkpoint-9000/tokenization_chatglm.py @@ -0,0 +1,300 @@ +import json +import os +import re +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == 
self.sp_model.get_piece_size() + + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.role_special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) + t.append(s[match.start():match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) + return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + self.encode_special_tokens = encode_special_tokens + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + encode_special_tokens=encode_special_tokens, + **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab 
+ + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + 
json.dumps(item["tools"], indent=4, ensure_ascii=False) + input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. 
+ + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. 
+ if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/checkpoint-9000/tokenizer.model b/checkpoint-9000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-9000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-9000/tokenizer_config.json b/checkpoint-9000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e543dcb5c184576e9e88e2c48b586290d71953 --- /dev/null +++ b/checkpoint-9000/tokenizer_config.json @@ -0,0 +1,41 @@ +{ + "added_tokens_decoder": { + "64795": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "64797": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|user|>", + "<|observation|>" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "encode_special_tokens": false, + "eos_token": "", + "model_max_length": 
1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/checkpoint-9000/trainer_state.json b/checkpoint-9000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..590a8b0b79857b5e1bf93587c9fb373b00f7989f --- /dev/null +++ b/checkpoint-9000/trainer_state.json @@ -0,0 +1,10821 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.516371847948815, + "eval_steps": 500, + "global_step": 9000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9999980101927616e-05, + "loss": 3.2533, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.99999204077421e-05, + "loss": 3.2279, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 4.999978982687695e-05, + "loss": 3.193, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 4.9999597065062966e-05, + "loss": 3.2863, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.999934212277958e-05, + "loss": 3.1724, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 4.999902500066093e-05, + "loss": 3.1088, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 4.999864569949576e-05, + "loss": 3.3241, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 4.99982042202275e-05, + "loss": 3.1904, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 4.999770056395421e-05, + "loss": 2.9254, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 4.999713473192863e-05, + "loss": 3.1845, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 4.999650672555812e-05, + "loss": 2.8475, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995816546404695e-05, + "loss": 3.0486, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 4.9995064196185014e-05, + "loss": 2.6464, + "step": 
65 + }, + { + "epoch": 0.04, + "learning_rate": 4.9994249676770364e-05, + "loss": 3.0446, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.999337299018667e-05, + "loss": 3.375, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 4.999243413861447e-05, + "loss": 2.8787, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.999143312438893e-05, + "loss": 3.014, + "step": 85 + }, + { + "epoch": 0.05, + "learning_rate": 4.999036994999985e-05, + "loss": 2.8288, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.9989244618091596e-05, + "loss": 3.1879, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 4.998805713146317e-05, + "loss": 2.9749, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 4.9986807493068165e-05, + "loss": 2.6304, + "step": 105 + }, + { + "epoch": 0.06, + "learning_rate": 4.998549570601475e-05, + "loss": 2.943, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 4.998412177356568e-05, + "loss": 2.7595, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 4.9982685699138275e-05, + "loss": 2.7377, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 4.9981187486304423e-05, + "loss": 2.9878, + "step": 125 + }, + { + "epoch": 0.07, + "learning_rate": 4.997962713879058e-05, + "loss": 2.7882, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 4.997800466047772e-05, + "loss": 2.7802, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 4.997632005540138e-05, + "loss": 3.0129, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 4.997457332775159e-05, + "loss": 2.9444, + "step": 145 + }, + { + "epoch": 0.08, + "learning_rate": 4.997276448187294e-05, + "loss": 2.9664, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 4.9970893522264476e-05, + "loss": 2.7367, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 4.996896045357977e-05, + "loss": 2.7012, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 4.9966965280626856e-05, + "loss": 2.8493, + "step": 
165 + }, + { + "epoch": 0.09, + "learning_rate": 4.996490800836825e-05, + "loss": 2.9274, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 4.996278864192092e-05, + "loss": 2.8093, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 4.9960607186556286e-05, + "loss": 3.1782, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 4.995836364770018e-05, + "loss": 2.7507, + "step": 185 + }, + { + "epoch": 0.1, + "learning_rate": 4.995605803093287e-05, + "loss": 2.6724, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 4.9953690341989026e-05, + "loss": 3.0258, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 4.9951260586757694e-05, + "loss": 3.1134, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 4.9948768771282314e-05, + "loss": 2.9937, + "step": 205 + }, + { + "epoch": 0.11, + "learning_rate": 4.9946214901760665e-05, + "loss": 2.7394, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 4.99435989845449e-05, + "loss": 2.9696, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 4.994092102614146e-05, + "loss": 2.753, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 4.993818103321113e-05, + "loss": 3.0759, + "step": 225 + }, + { + "epoch": 0.12, + "learning_rate": 4.9935379012568985e-05, + "loss": 2.7512, + "step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 4.993251497118438e-05, + "loss": 2.8656, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 4.992958891618091e-05, + "loss": 2.6628, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 4.992660085483645e-05, + "loss": 2.8012, + "step": 245 + }, + { + "epoch": 0.13, + "learning_rate": 4.992355079458307e-05, + "loss": 2.8141, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 4.992043874300706e-05, + "loss": 2.9083, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 4.991726470784891e-05, + "loss": 2.5846, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 4.991402869700325e-05, + "loss": 2.8088, + "step": 
265 + }, + { + "epoch": 0.14, + "learning_rate": 4.991073071851889e-05, + "loss": 2.9979, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 4.990737078059875e-05, + "loss": 3.0171, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 4.990394889159986e-05, + "loss": 2.9278, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 4.9900465060033364e-05, + "loss": 2.7998, + "step": 285 + }, + { + "epoch": 0.15, + "learning_rate": 4.989691929456443e-05, + "loss": 2.721, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9893311604012306e-05, + "loss": 3.0291, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 4.988964199735024e-05, + "loss": 2.9777, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 4.988591048370552e-05, + "loss": 2.318, + "step": 305 + }, + { + "epoch": 0.16, + "learning_rate": 4.988211707235936e-05, + "loss": 3.0332, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 4.987826177274697e-05, + "loss": 2.4888, + "step": 315 + }, + { + "epoch": 0.16, + "learning_rate": 4.987434459445748e-05, + "loss": 2.9259, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 4.987036554723391e-05, + "loss": 2.9568, + "step": 325 + }, + { + "epoch": 0.17, + "learning_rate": 4.98663246409732e-05, + "loss": 2.8708, + "step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 4.986222188572611e-05, + "loss": 2.7848, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 4.985805729169728e-05, + "loss": 2.6178, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 4.985383086924511e-05, + "loss": 2.8326, + "step": 345 + }, + { + "epoch": 0.18, + "learning_rate": 4.984954262888182e-05, + "loss": 2.8845, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 4.9845192581273365e-05, + "loss": 2.6151, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 4.984078073723944e-05, + "loss": 2.8474, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9836307107753455e-05, + "loss": 2.808, + "step": 
365 + }, + { + "epoch": 0.19, + "learning_rate": 4.983177170394248e-05, + "loss": 2.8491, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 4.9827174537087226e-05, + "loss": 2.7764, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 4.982251561862205e-05, + "loss": 2.7582, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 4.981779496013489e-05, + "loss": 2.5379, + "step": 385 + }, + { + "epoch": 0.2, + "learning_rate": 4.981301257336723e-05, + "loss": 2.9937, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 4.980816847021412e-05, + "loss": 2.8574, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 4.980326266272409e-05, + "loss": 2.9369, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 4.979829516309915e-05, + "loss": 2.836, + "step": 405 + }, + { + "epoch": 0.21, + "learning_rate": 4.979326598369477e-05, + "loss": 2.9369, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 4.9788175137019814e-05, + "loss": 2.6667, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 4.9783022635736534e-05, + "loss": 2.999, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 4.977780849266054e-05, + "loss": 2.907, + "step": 425 + }, + { + "epoch": 0.22, + "learning_rate": 4.9772532720760744e-05, + "loss": 2.8028, + "step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 4.976719533315937e-05, + "loss": 2.7999, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 4.976179634313187e-05, + "loss": 2.8918, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 4.9756335764106944e-05, + "loss": 2.7926, + "step": 445 + }, + { + "epoch": 0.23, + "learning_rate": 4.975081360966646e-05, + "loss": 2.6709, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745229893545436e-05, + "loss": 2.8248, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 4.973958462963203e-05, + "loss": 3.1146, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 4.973387783196747e-05, + "loss": 2.9991, + "step": 
465 + }, + { + "epoch": 0.24, + "learning_rate": 4.972810951474605e-05, + "loss": 2.7726, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 4.972227969231505e-05, + "loss": 2.9025, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 4.971638837917475e-05, + "loss": 2.6521, + "step": 480 + }, + { + "epoch": 0.24, + "learning_rate": 4.971043558997839e-05, + "loss": 2.9511, + "step": 485 + }, + { + "epoch": 0.25, + "learning_rate": 4.9704421339532075e-05, + "loss": 2.6938, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 4.969834564279482e-05, + "loss": 2.8533, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 4.9692208514878444e-05, + "loss": 2.6411, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 4.968600997104758e-05, + "loss": 2.6486, + "step": 505 + }, + { + "epoch": 0.26, + "learning_rate": 4.967975002671961e-05, + "loss": 2.8561, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 4.967342869746463e-05, + "loss": 2.6984, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 4.9667045999005424e-05, + "loss": 2.91, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 4.966060194721742e-05, + "loss": 2.7205, + "step": 525 + }, + { + "epoch": 0.27, + "learning_rate": 4.965409655812865e-05, + "loss": 2.7634, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 4.9647529847919684e-05, + "loss": 2.9647, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 4.964090183292364e-05, + "loss": 2.6357, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 4.963421252962609e-05, + "loss": 3.0285, + "step": 545 + }, + { + "epoch": 0.28, + "learning_rate": 4.96274619546651e-05, + "loss": 2.9895, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 4.962065012483106e-05, + "loss": 2.7286, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 4.961377705706677e-05, + "loss": 3.1337, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 4.960684276846733e-05, + "loss": 2.8268, + 
"step": 565 + }, + { + "epoch": 0.29, + "learning_rate": 4.959984727628011e-05, + "loss": 2.5851, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 4.959279059790471e-05, + "loss": 2.9359, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 4.958567275089291e-05, + "loss": 2.8842, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 4.957849375294864e-05, + "loss": 2.6186, + "step": 585 + }, + { + "epoch": 0.3, + "learning_rate": 4.957125362192794e-05, + "loss": 3.0116, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 4.956395237583887e-05, + "loss": 2.7045, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 4.9556590032841526e-05, + "loss": 2.8766, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 4.954916661124797e-05, + "loss": 2.7858, + "step": 605 + }, + { + "epoch": 0.31, + "learning_rate": 4.954168212952216e-05, + "loss": 2.7379, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 4.953413660627995e-05, + "loss": 2.475, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526530060289e-05, + "loss": 2.8357, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 4.951886251046876e-05, + "loss": 2.9284, + "step": 625 + }, + { + "epoch": 0.32, + "learning_rate": 4.951113397589042e-05, + "loss": 2.8144, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 4.9503344475776846e-05, + "loss": 2.6845, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 4.9495494029502535e-05, + "loss": 2.8937, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 4.9487582656593575e-05, + "loss": 3.0325, + "step": 645 + }, + { + "epoch": 0.33, + "learning_rate": 4.94796103767276e-05, + "loss": 2.7058, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 4.9471577209733746e-05, + "loss": 2.6162, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 4.946348317559257e-05, + "loss": 3.0129, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 4.945532829443603e-05, + "loss": 2.7587, + 
"step": 665 + }, + { + "epoch": 0.34, + "learning_rate": 4.944711258654742e-05, + "loss": 2.8915, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 4.943883607236135e-05, + "loss": 2.7343, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 4.943049877246364e-05, + "loss": 2.881, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 4.942210070759131e-05, + "loss": 2.488, + "step": 685 + }, + { + "epoch": 0.35, + "learning_rate": 4.941364189863253e-05, + "loss": 2.896, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 4.940512236662654e-05, + "loss": 2.7757, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 4.9396542132763634e-05, + "loss": 2.6271, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 4.938790121838506e-05, + "loss": 2.6804, + "step": 705 + }, + { + "epoch": 0.36, + "learning_rate": 4.937919964498302e-05, + "loss": 2.7313, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 4.937043743420058e-05, + "loss": 2.9277, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 4.9361614607831605e-05, + "loss": 2.6366, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 4.935273118782078e-05, + "loss": 3.0556, + "step": 725 + }, + { + "epoch": 0.37, + "learning_rate": 4.934378719626345e-05, + "loss": 3.0182, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 4.933478265540564e-05, + "loss": 2.824, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 4.932571758764398e-05, + "loss": 2.636, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 4.931659201552563e-05, + "loss": 2.7025, + "step": 745 + }, + { + "epoch": 0.38, + "learning_rate": 4.930740596174827e-05, + "loss": 2.8919, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 4.9298159449159965e-05, + "loss": 2.8434, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 4.928885250075921e-05, + "loss": 2.9448, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 4.927948513969478e-05, + "loss": 2.7312, + 
"step": 765 + }, + { + "epoch": 0.39, + "learning_rate": 4.927005738926573e-05, + "loss": 2.8688, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 4.926056927292132e-05, + "loss": 2.8639, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 4.925102081426095e-05, + "loss": 2.7809, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 4.9241412037034115e-05, + "loss": 3.0111, + "step": 785 + }, + { + "epoch": 0.4, + "learning_rate": 4.9231742965140314e-05, + "loss": 2.7252, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 4.922201362262905e-05, + "loss": 2.9717, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 4.92122240336997e-05, + "loss": 2.7191, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 4.920237422270153e-05, + "loss": 2.9346, + "step": 805 + }, + { + "epoch": 0.41, + "learning_rate": 4.9192464214133536e-05, + "loss": 2.7967, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 4.9182494032644496e-05, + "loss": 2.7326, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 4.917246370303284e-05, + "loss": 2.8933, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 4.9162373250246575e-05, + "loss": 2.7939, + "step": 825 + }, + { + "epoch": 0.42, + "learning_rate": 4.9152222699383273e-05, + "loss": 2.7807, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 4.9142012075689994e-05, + "loss": 2.6996, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 4.913174140456319e-05, + "loss": 2.7569, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 4.912141071154869e-05, + "loss": 2.9347, + "step": 845 + }, + { + "epoch": 0.43, + "learning_rate": 4.911102002234159e-05, + "loss": 2.7251, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 4.910056936278623e-05, + "loss": 3.1228, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 4.90900587588761e-05, + "loss": 2.9902, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 4.9079488236753803e-05, + "loss": 2.5095, 
+ "step": 865 + }, + { + "epoch": 0.44, + "learning_rate": 4.906885782271095e-05, + "loss": 2.7523, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 4.905816754318814e-05, + "loss": 2.6656, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 4.9047417424774874e-05, + "loss": 2.8454, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 4.903660749420946e-05, + "loss": 2.8194, + "step": 885 + }, + { + "epoch": 0.45, + "learning_rate": 4.9025737778379025e-05, + "loss": 2.8421, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 4.9014808304319326e-05, + "loss": 2.9656, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 4.900381909921482e-05, + "loss": 2.7484, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 4.899277019039849e-05, + "loss": 2.9044, + "step": 905 + }, + { + "epoch": 0.46, + "learning_rate": 4.898166160535186e-05, + "loss": 2.8016, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 4.8970493371704826e-05, + "loss": 2.6278, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 4.895926551723569e-05, + "loss": 2.7214, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 4.8947978069871036e-05, + "loss": 2.7679, + "step": 925 + }, + { + "epoch": 0.47, + "learning_rate": 4.8936631057685654e-05, + "loss": 2.7196, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 4.8925224508902514e-05, + "loss": 2.7866, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 4.8913758451892644e-05, + "loss": 2.72, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 4.89022329151751e-05, + "loss": 2.752, + "step": 945 + }, + { + "epoch": 0.48, + "learning_rate": 4.8890647927416887e-05, + "loss": 3.0487, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 4.8879003517432857e-05, + "loss": 2.8928, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 4.886729971418568e-05, + "loss": 2.7793, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 4.8855536546785726e-05, + "loss": 
2.537, + "step": 965 + }, + { + "epoch": 0.49, + "learning_rate": 4.884371404449105e-05, + "loss": 2.8345, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 4.8831832236707284e-05, + "loss": 2.9673, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 4.8819891152987546e-05, + "loss": 2.8295, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 4.880789082303241e-05, + "loss": 3.0753, + "step": 985 + }, + { + "epoch": 0.5, + "learning_rate": 4.879583127668979e-05, + "loss": 2.6748, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 4.878371254395492e-05, + "loss": 2.8115, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 4.877153465497022e-05, + "loss": 2.7039, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 4.8759297640025235e-05, + "loss": 2.9469, + "step": 1005 + }, + { + "epoch": 0.51, + "learning_rate": 4.874700152955661e-05, + "loss": 2.9745, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 4.8734646354147936e-05, + "loss": 2.9341, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 4.8722232144529754e-05, + "loss": 3.0511, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 4.870975893157941e-05, + "loss": 2.5396, + "step": 1025 + }, + { + "epoch": 0.52, + "learning_rate": 4.8697226746321004e-05, + "loss": 2.6699, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 4.868463561992532e-05, + "loss": 2.4887, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 4.867198558370977e-05, + "loss": 2.4773, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 4.865927666913825e-05, + "loss": 2.7612, + "step": 1045 + }, + { + "epoch": 0.53, + "learning_rate": 4.864650890782113e-05, + "loss": 2.9116, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 4.863368233151514e-05, + "loss": 2.8205, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 4.862079697212329e-05, + "loss": 2.6711, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 
4.8607852861694804e-05, + "loss": 3.1138, + "step": 1065 + }, + { + "epoch": 0.54, + "learning_rate": 4.859485003242503e-05, + "loss": 2.5603, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 4.858178851665539e-05, + "loss": 2.7981, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 4.856866834687323e-05, + "loss": 2.507, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 4.855548955571183e-05, + "loss": 3.0315, + "step": 1085 + }, + { + "epoch": 0.55, + "learning_rate": 4.8542252175950244e-05, + "loss": 2.75, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 4.852895624051326e-05, + "loss": 2.6794, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 4.851560178247132e-05, + "loss": 2.8278, + "step": 1100 + }, + { + "epoch": 0.55, + "learning_rate": 4.850218883504041e-05, + "loss": 2.6993, + "step": 1105 + }, + { + "epoch": 0.56, + "learning_rate": 4.8488717431582005e-05, + "loss": 2.875, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 4.8475187605602974e-05, + "loss": 2.6057, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 4.84615993907555e-05, + "loss": 2.847, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 4.844795282083697e-05, + "loss": 2.7652, + "step": 1125 + }, + { + "epoch": 0.57, + "learning_rate": 4.843424792978997e-05, + "loss": 2.8128, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 4.842048475170209e-05, + "loss": 2.7077, + "step": 1135 + }, + { + "epoch": 0.57, + "learning_rate": 4.840666332080592e-05, + "loss": 2.7081, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 4.8392783671478934e-05, + "loss": 2.6479, + "step": 1145 + }, + { + "epoch": 0.58, + "learning_rate": 4.837884583824342e-05, + "loss": 2.667, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 4.836484985576638e-05, + "loss": 2.7514, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 4.835079575885944e-05, + "loss": 2.5058, + "step": 1160 + }, + { + "epoch": 0.58, 
+ "learning_rate": 4.833668358247876e-05, + "loss": 2.6183, + "step": 1165 + }, + { + "epoch": 0.59, + "learning_rate": 4.8322513361725006e-05, + "loss": 2.9959, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 4.830828513184317e-05, + "loss": 2.5714, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 4.8293998928222536e-05, + "loss": 2.965, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 4.827965478639661e-05, + "loss": 2.2106, + "step": 1185 + }, + { + "epoch": 0.6, + "learning_rate": 4.8265252742042965e-05, + "loss": 2.7456, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 4.8250792830983225e-05, + "loss": 2.5694, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 4.8236275089182936e-05, + "loss": 2.6826, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 4.8221699552751465e-05, + "loss": 2.878, + "step": 1205 + }, + { + "epoch": 0.61, + "learning_rate": 4.820706625794196e-05, + "loss": 2.8191, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 4.81923752411512e-05, + "loss": 2.739, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 4.8177626538919565e-05, + "loss": 3.0544, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 4.8162820187930875e-05, + "loss": 2.8393, + "step": 1225 + }, + { + "epoch": 0.62, + "learning_rate": 4.814795622501237e-05, + "loss": 2.4457, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 4.813303468713456e-05, + "loss": 2.8575, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 4.8118055611411197e-05, + "loss": 2.8307, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 4.810301903509909e-05, + "loss": 2.7951, + "step": 1245 + }, + { + "epoch": 0.63, + "learning_rate": 4.8087924995598125e-05, + "loss": 2.8456, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 4.807277353045106e-05, + "loss": 2.3564, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 4.8057564677343524e-05, + "loss": 2.5076, + "step": 1260 + 
}, + { + "epoch": 0.63, + "learning_rate": 4.8042298474103884e-05, + "loss": 2.605, + "step": 1265 + }, + { + "epoch": 0.64, + "learning_rate": 4.8026974958703116e-05, + "loss": 2.4782, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 4.8011594169254784e-05, + "loss": 2.7193, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 4.799615614401488e-05, + "loss": 2.8284, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 4.798066092138178e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.65, + "learning_rate": 4.796510853989612e-05, + "loss": 2.7396, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 4.794949903824069e-05, + "loss": 2.7948, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 4.793383245524035e-05, + "loss": 2.7818, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 4.791810882986197e-05, + "loss": 2.7334, + "step": 1305 + }, + { + "epoch": 0.66, + "learning_rate": 4.7902328201214256e-05, + "loss": 2.4824, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 4.7886490608547727e-05, + "loss": 2.6131, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 4.7870596091254584e-05, + "loss": 2.7778, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 4.7854644688868594e-05, + "loss": 2.5263, + "step": 1325 + }, + { + "epoch": 0.67, + "learning_rate": 4.783863644106502e-05, + "loss": 2.7825, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 4.782257138766053e-05, + "loss": 2.7902, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 4.7806449568613066e-05, + "loss": 2.8333, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 4.779027102402177e-05, + "loss": 2.856, + "step": 1345 + }, + { + "epoch": 0.68, + "learning_rate": 4.777403579412686e-05, + "loss": 2.8021, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 4.775774391930956e-05, + "loss": 2.6639, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741395440091976e-05, + 
"loss": 2.5226, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.772499039713702e-05, + "loss": 2.6803, + "step": 1365 + }, + { + "epoch": 0.69, + "learning_rate": 4.7708528831248274e-05, + "loss": 2.4608, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.769201078336991e-05, + "loss": 2.786, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.7675436294586586e-05, + "loss": 2.8294, + "step": 1380 + }, + { + "epoch": 0.7, + "learning_rate": 4.7658805406123356e-05, + "loss": 2.6776, + "step": 1385 + }, + { + "epoch": 0.7, + "learning_rate": 4.7642118159345544e-05, + "loss": 2.8003, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.762537459575865e-05, + "loss": 2.7796, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.7608574757008245e-05, + "loss": 2.9156, + "step": 1400 + }, + { + "epoch": 0.71, + "learning_rate": 4.7591718684879883e-05, + "loss": 3.0521, + "step": 1405 + }, + { + "epoch": 0.71, + "learning_rate": 4.7574806421298976e-05, + "loss": 2.6469, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.755783800833071e-05, + "loss": 2.8512, + "step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.754081348817991e-05, + "loss": 2.66, + "step": 1420 + }, + { + "epoch": 0.72, + "learning_rate": 4.752373290319096e-05, + "loss": 2.6625, + "step": 1425 + }, + { + "epoch": 0.72, + "learning_rate": 4.7506596295847716e-05, + "loss": 2.6711, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 4.7489403708773346e-05, + "loss": 2.8951, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 4.747215518473026e-05, + "loss": 2.7375, + "step": 1440 + }, + { + "epoch": 0.73, + "learning_rate": 4.745485076662e-05, + "loss": 2.8037, + "step": 1445 + }, + { + "epoch": 0.73, + "learning_rate": 4.743749049748315e-05, + "loss": 2.5375, + "step": 1450 + }, + { + "epoch": 0.73, + "learning_rate": 4.742007442049918e-05, + "loss": 2.7664, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 
4.7402602578986374e-05, + "loss": 2.9644, + "step": 1460 + }, + { + "epoch": 0.74, + "learning_rate": 4.738507501640175e-05, + "loss": 2.8212, + "step": 1465 + }, + { + "epoch": 0.74, + "learning_rate": 4.736749177634087e-05, + "loss": 2.7201, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 4.734985290253782e-05, + "loss": 2.7087, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 4.7332158438865035e-05, + "loss": 2.5502, + "step": 1480 + }, + { + "epoch": 0.75, + "learning_rate": 4.731440842933322e-05, + "loss": 2.7607, + "step": 1485 + }, + { + "epoch": 0.75, + "learning_rate": 4.729660291809126e-05, + "loss": 2.5601, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 4.727874194942606e-05, + "loss": 2.5562, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 4.7260825567762486e-05, + "loss": 2.5539, + "step": 1500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7242853817663204e-05, + "loss": 2.6405, + "step": 1505 + }, + { + "epoch": 0.76, + "learning_rate": 4.72248267438286e-05, + "loss": 2.9237, + "step": 1510 + }, + { + "epoch": 0.76, + "learning_rate": 4.72067443910967e-05, + "loss": 3.0453, + "step": 1515 + }, + { + "epoch": 0.76, + "learning_rate": 4.718860680444297e-05, + "loss": 2.7176, + "step": 1520 + }, + { + "epoch": 0.77, + "learning_rate": 4.71704140289803e-05, + "loss": 2.6713, + "step": 1525 + }, + { + "epoch": 0.77, + "learning_rate": 4.715216610995883e-05, + "loss": 2.7284, + "step": 1530 + }, + { + "epoch": 0.77, + "learning_rate": 4.713386309276585e-05, + "loss": 2.5022, + "step": 1535 + }, + { + "epoch": 0.77, + "learning_rate": 4.7115505022925706e-05, + "loss": 2.8323, + "step": 1540 + }, + { + "epoch": 0.78, + "learning_rate": 4.7097091946099666e-05, + "loss": 2.7005, + "step": 1545 + }, + { + "epoch": 0.78, + "learning_rate": 4.7078623908085825e-05, + "loss": 2.8142, + "step": 1550 + }, + { + "epoch": 0.78, + "learning_rate": 4.7060100954818974e-05, + "loss": 2.8505, + "step": 1555 + }, + { + "epoch": 
0.78, + "learning_rate": 4.70415231323705e-05, + "loss": 2.5892, + "step": 1560 + }, + { + "epoch": 0.79, + "learning_rate": 4.7022890486948236e-05, + "loss": 2.8951, + "step": 1565 + }, + { + "epoch": 0.79, + "learning_rate": 4.700420306489641e-05, + "loss": 2.7551, + "step": 1570 + }, + { + "epoch": 0.79, + "learning_rate": 4.698546091269547e-05, + "loss": 2.6814, + "step": 1575 + }, + { + "epoch": 0.79, + "learning_rate": 4.696666407696201e-05, + "loss": 2.8698, + "step": 1580 + }, + { + "epoch": 0.8, + "learning_rate": 4.694781260444862e-05, + "loss": 2.8389, + "step": 1585 + }, + { + "epoch": 0.8, + "learning_rate": 4.6928906542043786e-05, + "loss": 2.8353, + "step": 1590 + }, + { + "epoch": 0.8, + "learning_rate": 4.690994593677179e-05, + "loss": 2.6565, + "step": 1595 + }, + { + "epoch": 0.8, + "learning_rate": 4.689093083579256e-05, + "loss": 2.6958, + "step": 1600 + }, + { + "epoch": 0.81, + "learning_rate": 4.687186128640157e-05, + "loss": 2.6768, + "step": 1605 + }, + { + "epoch": 0.81, + "learning_rate": 4.685273733602975e-05, + "loss": 2.734, + "step": 1610 + }, + { + "epoch": 0.81, + "learning_rate": 4.6833559032243284e-05, + "loss": 2.5348, + "step": 1615 + }, + { + "epoch": 0.81, + "learning_rate": 4.6814326422743594e-05, + "loss": 2.7775, + "step": 1620 + }, + { + "epoch": 0.82, + "learning_rate": 4.679503955536715e-05, + "loss": 2.6578, + "step": 1625 + }, + { + "epoch": 0.82, + "learning_rate": 4.6775698478085393e-05, + "loss": 2.8792, + "step": 1630 + }, + { + "epoch": 0.82, + "learning_rate": 4.675630323900458e-05, + "loss": 2.7883, + "step": 1635 + }, + { + "epoch": 0.82, + "learning_rate": 4.67368538863657e-05, + "loss": 2.8824, + "step": 1640 + }, + { + "epoch": 0.83, + "learning_rate": 4.671735046854433e-05, + "loss": 2.6775, + "step": 1645 + }, + { + "epoch": 0.83, + "learning_rate": 4.669779303405051e-05, + "loss": 2.5658, + "step": 1650 + }, + { + "epoch": 0.83, + "learning_rate": 4.667818163152864e-05, + "loss": 2.5262, + "step": 1655 + 
}, + { + "epoch": 0.83, + "learning_rate": 4.665851630975736e-05, + "loss": 2.5749, + "step": 1660 + }, + { + "epoch": 0.84, + "learning_rate": 4.6638797117649424e-05, + "loss": 2.8718, + "step": 1665 + }, + { + "epoch": 0.84, + "learning_rate": 4.661902410425155e-05, + "loss": 2.8039, + "step": 1670 + }, + { + "epoch": 0.84, + "learning_rate": 4.659919731874435e-05, + "loss": 2.8089, + "step": 1675 + }, + { + "epoch": 0.84, + "learning_rate": 4.6579316810442174e-05, + "loss": 2.8144, + "step": 1680 + }, + { + "epoch": 0.85, + "learning_rate": 4.6559382628792995e-05, + "loss": 2.5963, + "step": 1685 + }, + { + "epoch": 0.85, + "learning_rate": 4.653939482337828e-05, + "loss": 2.5755, + "step": 1690 + }, + { + "epoch": 0.85, + "learning_rate": 4.651935344391286e-05, + "loss": 2.7631, + "step": 1695 + }, + { + "epoch": 0.85, + "learning_rate": 4.649925854024486e-05, + "loss": 2.8117, + "step": 1700 + }, + { + "epoch": 0.86, + "learning_rate": 4.647911016235549e-05, + "loss": 2.7352, + "step": 1705 + }, + { + "epoch": 0.86, + "learning_rate": 4.6458908360358985e-05, + "loss": 2.5572, + "step": 1710 + }, + { + "epoch": 0.86, + "learning_rate": 4.643865318450246e-05, + "loss": 2.8372, + "step": 1715 + }, + { + "epoch": 0.86, + "learning_rate": 4.6418344685165774e-05, + "loss": 2.7892, + "step": 1720 + }, + { + "epoch": 0.87, + "learning_rate": 4.639798291286143e-05, + "loss": 2.7832, + "step": 1725 + }, + { + "epoch": 0.87, + "learning_rate": 4.637756791823442e-05, + "loss": 2.5401, + "step": 1730 + }, + { + "epoch": 0.87, + "learning_rate": 4.635709975206213e-05, + "loss": 2.9286, + "step": 1735 + }, + { + "epoch": 0.87, + "learning_rate": 4.633657846525417e-05, + "loss": 2.6474, + "step": 1740 + }, + { + "epoch": 0.88, + "learning_rate": 4.6316004108852305e-05, + "loss": 2.5047, + "step": 1745 + }, + { + "epoch": 0.88, + "learning_rate": 4.629537673403029e-05, + "loss": 2.8374, + "step": 1750 + }, + { + "epoch": 0.88, + "learning_rate": 4.627469639209373e-05, + 
"loss": 2.634, + "step": 1755 + }, + { + "epoch": 0.88, + "learning_rate": 4.6253963134480006e-05, + "loss": 2.6884, + "step": 1760 + }, + { + "epoch": 0.89, + "learning_rate": 4.623317701275809e-05, + "loss": 2.6334, + "step": 1765 + }, + { + "epoch": 0.89, + "learning_rate": 4.621233807862844e-05, + "loss": 2.764, + "step": 1770 + }, + { + "epoch": 0.89, + "learning_rate": 4.6191446383922886e-05, + "loss": 3.0864, + "step": 1775 + }, + { + "epoch": 0.89, + "learning_rate": 4.617050198060448e-05, + "loss": 2.7964, + "step": 1780 + }, + { + "epoch": 0.9, + "learning_rate": 4.6149504920767376e-05, + "loss": 2.4538, + "step": 1785 + }, + { + "epoch": 0.9, + "learning_rate": 4.6128455256636706e-05, + "loss": 2.7352, + "step": 1790 + }, + { + "epoch": 0.9, + "learning_rate": 4.6107353040568416e-05, + "loss": 2.6893, + "step": 1795 + }, + { + "epoch": 0.9, + "learning_rate": 4.6086198325049185e-05, + "loss": 2.8609, + "step": 1800 + }, + { + "epoch": 0.91, + "learning_rate": 4.6064991162696275e-05, + "loss": 2.5285, + "step": 1805 + }, + { + "epoch": 0.91, + "learning_rate": 4.604373160625739e-05, + "loss": 2.5707, + "step": 1810 + }, + { + "epoch": 0.91, + "learning_rate": 4.602241970861053e-05, + "loss": 2.7198, + "step": 1815 + }, + { + "epoch": 0.91, + "learning_rate": 4.6001055522763926e-05, + "loss": 2.855, + "step": 1820 + }, + { + "epoch": 0.92, + "learning_rate": 4.597963910185582e-05, + "loss": 2.4175, + "step": 1825 + }, + { + "epoch": 0.92, + "learning_rate": 4.595817049915441e-05, + "loss": 2.5444, + "step": 1830 + }, + { + "epoch": 0.92, + "learning_rate": 4.5936649768057646e-05, + "loss": 2.6893, + "step": 1835 + }, + { + "epoch": 0.92, + "learning_rate": 4.591507696209318e-05, + "loss": 2.6725, + "step": 1840 + }, + { + "epoch": 0.93, + "learning_rate": 4.589345213491817e-05, + "loss": 2.834, + "step": 1845 + }, + { + "epoch": 0.93, + "learning_rate": 4.587177534031914e-05, + "loss": 2.8259, + "step": 1850 + }, + { + "epoch": 0.93, + "learning_rate": 
4.585004663221188e-05, + "loss": 2.6146, + "step": 1855 + }, + { + "epoch": 0.93, + "learning_rate": 4.582826606464134e-05, + "loss": 2.4673, + "step": 1860 + }, + { + "epoch": 0.94, + "learning_rate": 4.5806433691781416e-05, + "loss": 2.9178, + "step": 1865 + }, + { + "epoch": 0.94, + "learning_rate": 4.578454956793487e-05, + "loss": 2.9224, + "step": 1870 + }, + { + "epoch": 0.94, + "learning_rate": 4.576261374753318e-05, + "loss": 2.7987, + "step": 1875 + }, + { + "epoch": 0.94, + "learning_rate": 4.574062628513642e-05, + "loss": 2.3848, + "step": 1880 + }, + { + "epoch": 0.95, + "learning_rate": 4.57185872354331e-05, + "loss": 2.6054, + "step": 1885 + }, + { + "epoch": 0.95, + "learning_rate": 4.569649665324003e-05, + "loss": 2.6743, + "step": 1890 + }, + { + "epoch": 0.95, + "learning_rate": 4.567435459350222e-05, + "loss": 2.9375, + "step": 1895 + }, + { + "epoch": 0.95, + "learning_rate": 4.565216111129269e-05, + "loss": 2.9372, + "step": 1900 + }, + { + "epoch": 0.96, + "learning_rate": 4.562991626181239e-05, + "loss": 2.669, + "step": 1905 + }, + { + "epoch": 0.96, + "learning_rate": 4.560762010039001e-05, + "loss": 2.7644, + "step": 1910 + }, + { + "epoch": 0.96, + "learning_rate": 4.558527268248187e-05, + "loss": 2.6886, + "step": 1915 + }, + { + "epoch": 0.96, + "learning_rate": 4.55628740636718e-05, + "loss": 2.8618, + "step": 1920 + }, + { + "epoch": 0.97, + "learning_rate": 4.554042429967095e-05, + "loss": 2.8411, + "step": 1925 + }, + { + "epoch": 0.97, + "learning_rate": 4.55179234463177e-05, + "loss": 2.6047, + "step": 1930 + }, + { + "epoch": 0.97, + "learning_rate": 4.5495371559577496e-05, + "loss": 2.8675, + "step": 1935 + }, + { + "epoch": 0.97, + "learning_rate": 4.547276869554271e-05, + "loss": 2.751, + "step": 1940 + }, + { + "epoch": 0.98, + "learning_rate": 4.545011491043253e-05, + "loss": 2.8638, + "step": 1945 + }, + { + "epoch": 0.98, + "learning_rate": 4.5427410260592775e-05, + "loss": 2.5092, + "step": 1950 + }, + { + "epoch": 0.98, 
+ "learning_rate": 4.540465480249579e-05, + "loss": 2.5059, + "step": 1955 + }, + { + "epoch": 0.98, + "learning_rate": 4.5381848592740285e-05, + "loss": 2.9433, + "step": 1960 + }, + { + "epoch": 0.99, + "learning_rate": 4.535899168805121e-05, + "loss": 2.6959, + "step": 1965 + }, + { + "epoch": 0.99, + "learning_rate": 4.533608414527961e-05, + "loss": 2.552, + "step": 1970 + }, + { + "epoch": 0.99, + "learning_rate": 4.5313126021402465e-05, + "loss": 2.7169, + "step": 1975 + }, + { + "epoch": 0.99, + "learning_rate": 4.529011737352258e-05, + "loss": 2.5672, + "step": 1980 + }, + { + "epoch": 1.0, + "learning_rate": 4.526705825886841e-05, + "loss": 2.6434, + "step": 1985 + }, + { + "epoch": 1.0, + "learning_rate": 4.5243948734793947e-05, + "loss": 2.8062, + "step": 1990 + }, + { + "epoch": 1.0, + "learning_rate": 4.5220788858778556e-05, + "loss": 2.6929, + "step": 1995 + }, + { + "epoch": 1.0, + "learning_rate": 4.519757868842684e-05, + "loss": 2.5837, + "step": 2000 + }, + { + "epoch": 1.01, + "learning_rate": 4.517431828146852e-05, + "loss": 2.821, + "step": 2005 + }, + { + "epoch": 1.01, + "learning_rate": 4.515100769575824e-05, + "loss": 2.7913, + "step": 2010 + }, + { + "epoch": 1.01, + "learning_rate": 4.512764698927545e-05, + "loss": 2.699, + "step": 2015 + }, + { + "epoch": 1.01, + "learning_rate": 4.5104236220124286e-05, + "loss": 2.6574, + "step": 2020 + }, + { + "epoch": 1.02, + "learning_rate": 4.508077544653338e-05, + "loss": 2.5909, + "step": 2025 + }, + { + "epoch": 1.02, + "learning_rate": 4.5057264726855765e-05, + "loss": 2.5695, + "step": 2030 + }, + { + "epoch": 1.02, + "learning_rate": 4.5033704119568675e-05, + "loss": 2.4937, + "step": 2035 + }, + { + "epoch": 1.02, + "learning_rate": 4.501009368327344e-05, + "loss": 2.7253, + "step": 2040 + }, + { + "epoch": 1.03, + "learning_rate": 4.4986433476695334e-05, + "loss": 2.8681, + "step": 2045 + }, + { + "epoch": 1.03, + "learning_rate": 4.496272355868341e-05, + "loss": 2.74, + "step": 2050 + }, + 
{ + "epoch": 1.03, + "learning_rate": 4.4938963988210365e-05, + "loss": 2.7439, + "step": 2055 + }, + { + "epoch": 1.03, + "learning_rate": 4.491515482437242e-05, + "loss": 2.9539, + "step": 2060 + }, + { + "epoch": 1.04, + "learning_rate": 4.4891296126389104e-05, + "loss": 2.8165, + "step": 2065 + }, + { + "epoch": 1.04, + "learning_rate": 4.48673879536032e-05, + "loss": 2.601, + "step": 2070 + }, + { + "epoch": 1.04, + "learning_rate": 4.484343036548051e-05, + "loss": 2.5189, + "step": 2075 + }, + { + "epoch": 1.04, + "learning_rate": 4.481942342160976e-05, + "loss": 2.6276, + "step": 2080 + }, + { + "epoch": 1.05, + "learning_rate": 4.479536718170243e-05, + "loss": 2.5027, + "step": 2085 + }, + { + "epoch": 1.05, + "learning_rate": 4.477126170559262e-05, + "loss": 2.8654, + "step": 2090 + }, + { + "epoch": 1.05, + "learning_rate": 4.474710705323688e-05, + "loss": 2.8511, + "step": 2095 + }, + { + "epoch": 1.05, + "learning_rate": 4.47229032847141e-05, + "loss": 2.6129, + "step": 2100 + }, + { + "epoch": 1.06, + "learning_rate": 4.469865046022531e-05, + "loss": 2.8964, + "step": 2105 + }, + { + "epoch": 1.06, + "learning_rate": 4.4674348640093554e-05, + "loss": 2.7502, + "step": 2110 + }, + { + "epoch": 1.06, + "learning_rate": 4.4649997884763765e-05, + "loss": 2.591, + "step": 2115 + }, + { + "epoch": 1.06, + "learning_rate": 4.462559825480257e-05, + "loss": 2.7982, + "step": 2120 + }, + { + "epoch": 1.07, + "learning_rate": 4.460114981089815e-05, + "loss": 2.576, + "step": 2125 + }, + { + "epoch": 1.07, + "learning_rate": 4.457665261386014e-05, + "loss": 2.692, + "step": 2130 + }, + { + "epoch": 1.07, + "learning_rate": 4.455210672461938e-05, + "loss": 2.5818, + "step": 2135 + }, + { + "epoch": 1.07, + "learning_rate": 4.452751220422787e-05, + "loss": 2.6792, + "step": 2140 + }, + { + "epoch": 1.08, + "learning_rate": 4.450286911385856e-05, + "loss": 2.8352, + "step": 2145 + }, + { + "epoch": 1.08, + "learning_rate": 4.4478177514805166e-05, + "loss": 2.5787, + 
"step": 2150 + }, + { + "epoch": 1.08, + "learning_rate": 4.4453437468482103e-05, + "loss": 2.655, + "step": 2155 + }, + { + "epoch": 1.08, + "learning_rate": 4.442864903642428e-05, + "loss": 2.7664, + "step": 2160 + }, + { + "epoch": 1.09, + "learning_rate": 4.440381228028692e-05, + "loss": 2.7231, + "step": 2165 + }, + { + "epoch": 1.09, + "learning_rate": 4.437892726184548e-05, + "loss": 2.416, + "step": 2170 + }, + { + "epoch": 1.09, + "learning_rate": 4.4353994042995446e-05, + "loss": 2.4619, + "step": 2175 + }, + { + "epoch": 1.09, + "learning_rate": 4.4329012685752183e-05, + "loss": 2.7816, + "step": 2180 + }, + { + "epoch": 1.1, + "learning_rate": 4.430398325225078e-05, + "loss": 2.8678, + "step": 2185 + }, + { + "epoch": 1.1, + "learning_rate": 4.427890580474594e-05, + "loss": 2.6007, + "step": 2190 + }, + { + "epoch": 1.1, + "learning_rate": 4.4253780405611754e-05, + "loss": 2.8195, + "step": 2195 + }, + { + "epoch": 1.1, + "learning_rate": 4.42286071173416e-05, + "loss": 2.5117, + "step": 2200 + }, + { + "epoch": 1.11, + "learning_rate": 4.4203386002547956e-05, + "loss": 2.5527, + "step": 2205 + }, + { + "epoch": 1.11, + "learning_rate": 4.417811712396226e-05, + "loss": 2.662, + "step": 2210 + }, + { + "epoch": 1.11, + "learning_rate": 4.415280054443477e-05, + "loss": 2.7563, + "step": 2215 + }, + { + "epoch": 1.11, + "learning_rate": 4.4127436326934354e-05, + "loss": 2.5118, + "step": 2220 + }, + { + "epoch": 1.12, + "learning_rate": 4.41020245345484e-05, + "loss": 2.7208, + "step": 2225 + }, + { + "epoch": 1.12, + "learning_rate": 4.4076565230482607e-05, + "loss": 2.649, + "step": 2230 + }, + { + "epoch": 1.12, + "learning_rate": 4.4051058478060856e-05, + "loss": 2.652, + "step": 2235 + }, + { + "epoch": 1.12, + "learning_rate": 4.402550434072505e-05, + "loss": 2.6885, + "step": 2240 + }, + { + "epoch": 1.13, + "learning_rate": 4.3999902882034935e-05, + "loss": 2.8545, + "step": 2245 + }, + { + "epoch": 1.13, + "learning_rate": 4.397425416566797e-05, + 
"loss": 2.9435, + "step": 2250 + }, + { + "epoch": 1.13, + "learning_rate": 4.3948558255419146e-05, + "loss": 2.6555, + "step": 2255 + }, + { + "epoch": 1.13, + "learning_rate": 4.392281521520085e-05, + "loss": 2.6108, + "step": 2260 + }, + { + "epoch": 1.14, + "learning_rate": 4.389702510904269e-05, + "loss": 2.6256, + "step": 2265 + }, + { + "epoch": 1.14, + "learning_rate": 4.387118800109133e-05, + "loss": 2.7996, + "step": 2270 + }, + { + "epoch": 1.14, + "learning_rate": 4.384530395561035e-05, + "loss": 2.6649, + "step": 2275 + }, + { + "epoch": 1.14, + "learning_rate": 4.381937303698006e-05, + "loss": 3.0073, + "step": 2280 + }, + { + "epoch": 1.15, + "learning_rate": 4.379339530969738e-05, + "loss": 2.8493, + "step": 2285 + }, + { + "epoch": 1.15, + "learning_rate": 4.3767370838375635e-05, + "loss": 2.5572, + "step": 2290 + }, + { + "epoch": 1.15, + "learning_rate": 4.374129968774443e-05, + "loss": 2.3837, + "step": 2295 + }, + { + "epoch": 1.15, + "learning_rate": 4.371518192264946e-05, + "loss": 2.4604, + "step": 2300 + }, + { + "epoch": 1.16, + "learning_rate": 4.3689017608052374e-05, + "loss": 2.8866, + "step": 2305 + }, + { + "epoch": 1.16, + "learning_rate": 4.3662806809030585e-05, + "loss": 2.8943, + "step": 2310 + }, + { + "epoch": 1.16, + "learning_rate": 4.3636549590777144e-05, + "loss": 2.7629, + "step": 2315 + }, + { + "epoch": 1.16, + "learning_rate": 4.361024601860054e-05, + "loss": 2.8629, + "step": 2320 + }, + { + "epoch": 1.17, + "learning_rate": 4.3583896157924574e-05, + "loss": 2.7952, + "step": 2325 + }, + { + "epoch": 1.17, + "learning_rate": 4.355750007428817e-05, + "loss": 2.2057, + "step": 2330 + }, + { + "epoch": 1.17, + "learning_rate": 4.3531057833345216e-05, + "loss": 2.9143, + "step": 2335 + }, + { + "epoch": 1.17, + "learning_rate": 4.3504569500864424e-05, + "loss": 2.5354, + "step": 2340 + }, + { + "epoch": 1.18, + "learning_rate": 4.347803514272911e-05, + "loss": 2.7451, + "step": 2345 + }, + { + "epoch": 1.18, + 
"learning_rate": 4.3451454824937113e-05, + "loss": 3.0827, + "step": 2350 + }, + { + "epoch": 1.18, + "learning_rate": 4.3424828613600555e-05, + "loss": 2.5842, + "step": 2355 + }, + { + "epoch": 1.18, + "learning_rate": 4.339815657494572e-05, + "loss": 2.4492, + "step": 2360 + }, + { + "epoch": 1.19, + "learning_rate": 4.3371438775312865e-05, + "loss": 2.5067, + "step": 2365 + }, + { + "epoch": 1.19, + "learning_rate": 4.334467528115608e-05, + "loss": 2.5902, + "step": 2370 + }, + { + "epoch": 1.19, + "learning_rate": 4.33178661590431e-05, + "loss": 2.8618, + "step": 2375 + }, + { + "epoch": 1.19, + "learning_rate": 4.329101147565515e-05, + "loss": 2.6831, + "step": 2380 + }, + { + "epoch": 1.2, + "learning_rate": 4.3264111297786794e-05, + "loss": 2.7531, + "step": 2385 + }, + { + "epoch": 1.2, + "learning_rate": 4.323716569234572e-05, + "loss": 2.819, + "step": 2390 + }, + { + "epoch": 1.2, + "learning_rate": 4.321017472635263e-05, + "loss": 2.6319, + "step": 2395 + }, + { + "epoch": 1.2, + "learning_rate": 4.318313846694105e-05, + "loss": 2.3078, + "step": 2400 + }, + { + "epoch": 1.21, + "learning_rate": 4.315605698135714e-05, + "loss": 2.7962, + "step": 2405 + }, + { + "epoch": 1.21, + "learning_rate": 4.312893033695958e-05, + "loss": 2.8479, + "step": 2410 + }, + { + "epoch": 1.21, + "learning_rate": 4.310175860121936e-05, + "loss": 2.6289, + "step": 2415 + }, + { + "epoch": 1.21, + "learning_rate": 4.30745418417196e-05, + "loss": 2.7003, + "step": 2420 + }, + { + "epoch": 1.22, + "learning_rate": 4.304728012615543e-05, + "loss": 2.4947, + "step": 2425 + }, + { + "epoch": 1.22, + "learning_rate": 4.3019973522333815e-05, + "loss": 2.6911, + "step": 2430 + }, + { + "epoch": 1.22, + "learning_rate": 4.2992622098173334e-05, + "loss": 2.6931, + "step": 2435 + }, + { + "epoch": 1.22, + "learning_rate": 4.296522592170406e-05, + "loss": 2.774, + "step": 2440 + }, + { + "epoch": 1.23, + "learning_rate": 4.293778506106737e-05, + "loss": 2.759, + "step": 2445 + }, + { + 
"epoch": 1.23, + "learning_rate": 4.29102995845158e-05, + "loss": 2.5543, + "step": 2450 + }, + { + "epoch": 1.23, + "learning_rate": 4.288276956041284e-05, + "loss": 2.7702, + "step": 2455 + }, + { + "epoch": 1.23, + "learning_rate": 4.285519505723278e-05, + "loss": 2.835, + "step": 2460 + }, + { + "epoch": 1.24, + "learning_rate": 4.282757614356055e-05, + "loss": 2.6516, + "step": 2465 + }, + { + "epoch": 1.24, + "learning_rate": 4.2799912888091544e-05, + "loss": 2.7392, + "step": 2470 + }, + { + "epoch": 1.24, + "learning_rate": 4.277220535963143e-05, + "loss": 2.6522, + "step": 2475 + }, + { + "epoch": 1.24, + "learning_rate": 4.274445362709601e-05, + "loss": 2.6514, + "step": 2480 + }, + { + "epoch": 1.25, + "learning_rate": 4.271665775951104e-05, + "loss": 2.7507, + "step": 2485 + }, + { + "epoch": 1.25, + "learning_rate": 4.2688817826012005e-05, + "loss": 3.0499, + "step": 2490 + }, + { + "epoch": 1.25, + "learning_rate": 4.2660933895844055e-05, + "loss": 2.6927, + "step": 2495 + }, + { + "epoch": 1.25, + "learning_rate": 4.2633006038361736e-05, + "loss": 2.8456, + "step": 2500 + }, + { + "epoch": 1.26, + "learning_rate": 4.2605034323028844e-05, + "loss": 2.671, + "step": 2505 + }, + { + "epoch": 1.26, + "learning_rate": 4.2577018819418296e-05, + "loss": 2.6543, + "step": 2510 + }, + { + "epoch": 1.26, + "learning_rate": 4.254895959721189e-05, + "loss": 2.566, + "step": 2515 + }, + { + "epoch": 1.26, + "learning_rate": 4.252085672620019e-05, + "loss": 2.4943, + "step": 2520 + }, + { + "epoch": 1.27, + "learning_rate": 4.249271027628228e-05, + "loss": 2.5115, + "step": 2525 + }, + { + "epoch": 1.27, + "learning_rate": 4.24645203174657e-05, + "loss": 2.3859, + "step": 2530 + }, + { + "epoch": 1.27, + "learning_rate": 4.243628691986617e-05, + "loss": 2.8148, + "step": 2535 + }, + { + "epoch": 1.27, + "learning_rate": 4.240801015370743e-05, + "loss": 2.5597, + "step": 2540 + }, + { + "epoch": 1.28, + "learning_rate": 4.2379690089321145e-05, + "loss": 2.805, + 
"step": 2545 + }, + { + "epoch": 1.28, + "learning_rate": 4.235132679714664e-05, + "loss": 2.7308, + "step": 2550 + }, + { + "epoch": 1.28, + "learning_rate": 4.232292034773076e-05, + "loss": 2.675, + "step": 2555 + }, + { + "epoch": 1.28, + "learning_rate": 4.2294470811727704e-05, + "loss": 2.6359, + "step": 2560 + }, + { + "epoch": 1.29, + "learning_rate": 4.226597825989883e-05, + "loss": 2.4288, + "step": 2565 + }, + { + "epoch": 1.29, + "learning_rate": 4.223744276311249e-05, + "loss": 2.642, + "step": 2570 + }, + { + "epoch": 1.29, + "learning_rate": 4.220886439234385e-05, + "loss": 2.7375, + "step": 2575 + }, + { + "epoch": 1.29, + "learning_rate": 4.218024321867472e-05, + "loss": 2.6114, + "step": 2580 + }, + { + "epoch": 1.3, + "learning_rate": 4.2151579313293364e-05, + "loss": 2.4941, + "step": 2585 + }, + { + "epoch": 1.3, + "learning_rate": 4.212287274749434e-05, + "loss": 2.6086, + "step": 2590 + }, + { + "epoch": 1.3, + "learning_rate": 4.2094123592678295e-05, + "loss": 2.7854, + "step": 2595 + }, + { + "epoch": 1.3, + "learning_rate": 4.206533192035184e-05, + "loss": 2.8291, + "step": 2600 + }, + { + "epoch": 1.31, + "learning_rate": 4.2036497802127294e-05, + "loss": 2.6846, + "step": 2605 + }, + { + "epoch": 1.31, + "learning_rate": 4.200762130972259e-05, + "loss": 2.6667, + "step": 2610 + }, + { + "epoch": 1.31, + "learning_rate": 4.197870251496103e-05, + "loss": 2.7895, + "step": 2615 + }, + { + "epoch": 1.31, + "learning_rate": 4.1949741489771155e-05, + "loss": 2.6958, + "step": 2620 + }, + { + "epoch": 1.32, + "learning_rate": 4.192073830618652e-05, + "loss": 2.8536, + "step": 2625 + }, + { + "epoch": 1.32, + "learning_rate": 4.189169303634555e-05, + "loss": 2.5218, + "step": 2630 + }, + { + "epoch": 1.32, + "learning_rate": 4.186260575249136e-05, + "loss": 2.6141, + "step": 2635 + }, + { + "epoch": 1.32, + "learning_rate": 4.1833476526971546e-05, + "loss": 2.6231, + "step": 2640 + }, + { + "epoch": 1.33, + "learning_rate": 
4.1804305432238036e-05, + "loss": 2.8229, + "step": 2645 + }, + { + "epoch": 1.33, + "learning_rate": 4.1775092540846885e-05, + "loss": 2.4763, + "step": 2650 + }, + { + "epoch": 1.33, + "learning_rate": 4.174583792545813e-05, + "loss": 2.5162, + "step": 2655 + }, + { + "epoch": 1.33, + "learning_rate": 4.1716541658835574e-05, + "loss": 2.7301, + "step": 2660 + }, + { + "epoch": 1.34, + "learning_rate": 4.16872038138466e-05, + "loss": 2.6055, + "step": 2665 + }, + { + "epoch": 1.34, + "learning_rate": 4.165782446346203e-05, + "loss": 2.6529, + "step": 2670 + }, + { + "epoch": 1.34, + "learning_rate": 4.162840368075591e-05, + "loss": 2.7934, + "step": 2675 + }, + { + "epoch": 1.34, + "learning_rate": 4.159894153890536e-05, + "loss": 2.9108, + "step": 2680 + }, + { + "epoch": 1.35, + "learning_rate": 4.1569438111190326e-05, + "loss": 2.6543, + "step": 2685 + }, + { + "epoch": 1.35, + "learning_rate": 4.1539893470993496e-05, + "loss": 2.6038, + "step": 2690 + }, + { + "epoch": 1.35, + "learning_rate": 4.151030769180002e-05, + "loss": 2.5741, + "step": 2695 + }, + { + "epoch": 1.35, + "learning_rate": 4.14806808471974e-05, + "loss": 2.8525, + "step": 2700 + }, + { + "epoch": 1.36, + "learning_rate": 4.1451013010875275e-05, + "loss": 2.5151, + "step": 2705 + }, + { + "epoch": 1.36, + "learning_rate": 4.1421304256625206e-05, + "loss": 2.5666, + "step": 2710 + }, + { + "epoch": 1.36, + "learning_rate": 4.139155465834058e-05, + "loss": 2.7287, + "step": 2715 + }, + { + "epoch": 1.36, + "learning_rate": 4.136176429001634e-05, + "loss": 2.8867, + "step": 2720 + }, + { + "epoch": 1.37, + "learning_rate": 4.133193322574885e-05, + "loss": 2.6398, + "step": 2725 + }, + { + "epoch": 1.37, + "learning_rate": 4.130206153973568e-05, + "loss": 2.699, + "step": 2730 + }, + { + "epoch": 1.37, + "learning_rate": 4.1272149306275447e-05, + "loss": 2.5248, + "step": 2735 + }, + { + "epoch": 1.37, + "learning_rate": 4.124219659976762e-05, + "loss": 2.8947, + "step": 2740 + }, + { + "epoch": 
1.38, + "learning_rate": 4.1212203494712344e-05, + "loss": 2.6473, + "step": 2745 + }, + { + "epoch": 1.38, + "learning_rate": 4.1182170065710226e-05, + "loss": 2.3734, + "step": 2750 + }, + { + "epoch": 1.38, + "learning_rate": 4.115209638746218e-05, + "loss": 2.685, + "step": 2755 + }, + { + "epoch": 1.39, + "learning_rate": 4.1121982534769224e-05, + "loss": 2.9712, + "step": 2760 + }, + { + "epoch": 1.39, + "learning_rate": 4.109182858253231e-05, + "loss": 2.6578, + "step": 2765 + }, + { + "epoch": 1.39, + "learning_rate": 4.106163460575212e-05, + "loss": 2.6842, + "step": 2770 + }, + { + "epoch": 1.39, + "learning_rate": 4.10314006795289e-05, + "loss": 2.511, + "step": 2775 + }, + { + "epoch": 1.4, + "learning_rate": 4.100112687906224e-05, + "loss": 2.8426, + "step": 2780 + }, + { + "epoch": 1.4, + "learning_rate": 4.097081327965092e-05, + "loss": 2.8462, + "step": 2785 + }, + { + "epoch": 1.4, + "learning_rate": 4.094045995669271e-05, + "loss": 2.7552, + "step": 2790 + }, + { + "epoch": 1.4, + "learning_rate": 4.091006698568419e-05, + "loss": 2.3802, + "step": 2795 + }, + { + "epoch": 1.41, + "learning_rate": 4.087963444222054e-05, + "loss": 2.3967, + "step": 2800 + }, + { + "epoch": 1.41, + "learning_rate": 4.084916240199537e-05, + "loss": 2.7945, + "step": 2805 + }, + { + "epoch": 1.41, + "learning_rate": 4.0818650940800524e-05, + "loss": 2.5304, + "step": 2810 + }, + { + "epoch": 1.41, + "learning_rate": 4.0788100134525925e-05, + "loss": 2.6779, + "step": 2815 + }, + { + "epoch": 1.42, + "learning_rate": 4.075751005915932e-05, + "loss": 2.6202, + "step": 2820 + }, + { + "epoch": 1.42, + "learning_rate": 4.072688079078616e-05, + "loss": 2.5915, + "step": 2825 + }, + { + "epoch": 1.42, + "learning_rate": 4.069621240558935e-05, + "loss": 2.5139, + "step": 2830 + }, + { + "epoch": 1.42, + "learning_rate": 4.066550497984911e-05, + "loss": 2.5506, + "step": 2835 + }, + { + "epoch": 1.43, + "learning_rate": 4.063475858994276e-05, + "loss": 2.7154, + "step": 2840 + 
}, + { + "epoch": 1.43, + "learning_rate": 4.060397331234452e-05, + "loss": 2.6309, + "step": 2845 + }, + { + "epoch": 1.43, + "learning_rate": 4.0573149223625365e-05, + "loss": 2.6922, + "step": 2850 + }, + { + "epoch": 1.43, + "learning_rate": 4.054228640045276e-05, + "loss": 2.703, + "step": 2855 + }, + { + "epoch": 1.44, + "learning_rate": 4.051138491959053e-05, + "loss": 2.3475, + "step": 2860 + }, + { + "epoch": 1.44, + "learning_rate": 4.048044485789869e-05, + "loss": 2.7186, + "step": 2865 + }, + { + "epoch": 1.44, + "learning_rate": 4.044946629233316e-05, + "loss": 2.4667, + "step": 2870 + }, + { + "epoch": 1.44, + "learning_rate": 4.041844929994566e-05, + "loss": 2.562, + "step": 2875 + }, + { + "epoch": 1.45, + "learning_rate": 4.038739395788347e-05, + "loss": 2.4116, + "step": 2880 + }, + { + "epoch": 1.45, + "learning_rate": 4.0356300343389276e-05, + "loss": 2.6133, + "step": 2885 + }, + { + "epoch": 1.45, + "learning_rate": 4.032516853380094e-05, + "loss": 2.6893, + "step": 2890 + }, + { + "epoch": 1.45, + "learning_rate": 4.029399860655132e-05, + "loss": 2.8196, + "step": 2895 + }, + { + "epoch": 1.46, + "learning_rate": 4.026279063916811e-05, + "loss": 2.8933, + "step": 2900 + }, + { + "epoch": 1.46, + "learning_rate": 4.023154470927361e-05, + "loss": 3.0289, + "step": 2905 + }, + { + "epoch": 1.46, + "learning_rate": 4.0200260894584516e-05, + "loss": 2.5586, + "step": 2910 + }, + { + "epoch": 1.46, + "learning_rate": 4.016893927291179e-05, + "loss": 2.785, + "step": 2915 + }, + { + "epoch": 1.47, + "learning_rate": 4.0137579922160395e-05, + "loss": 2.5293, + "step": 2920 + }, + { + "epoch": 1.47, + "learning_rate": 4.010618292032917e-05, + "loss": 2.6696, + "step": 2925 + }, + { + "epoch": 1.47, + "learning_rate": 4.007474834551058e-05, + "loss": 2.7003, + "step": 2930 + }, + { + "epoch": 1.47, + "learning_rate": 4.004327627589056e-05, + "loss": 2.6005, + "step": 2935 + }, + { + "epoch": 1.48, + "learning_rate": 4.001176678974828e-05, + "loss": 
2.7198, + "step": 2940 + }, + { + "epoch": 1.48, + "learning_rate": 3.998021996545599e-05, + "loss": 2.5329, + "step": 2945 + }, + { + "epoch": 1.48, + "learning_rate": 3.994863588147881e-05, + "loss": 2.7051, + "step": 2950 + }, + { + "epoch": 1.48, + "learning_rate": 3.9917014616374535e-05, + "loss": 2.9277, + "step": 2955 + }, + { + "epoch": 1.49, + "learning_rate": 3.988535624879344e-05, + "loss": 2.3729, + "step": 2960 + }, + { + "epoch": 1.49, + "learning_rate": 3.985366085747808e-05, + "loss": 2.5122, + "step": 2965 + }, + { + "epoch": 1.49, + "learning_rate": 3.982192852126309e-05, + "loss": 2.4413, + "step": 2970 + }, + { + "epoch": 1.49, + "learning_rate": 3.979015931907501e-05, + "loss": 2.6104, + "step": 2975 + }, + { + "epoch": 1.5, + "learning_rate": 3.975835332993207e-05, + "loss": 2.8855, + "step": 2980 + }, + { + "epoch": 1.5, + "learning_rate": 3.9726510632944e-05, + "loss": 2.7487, + "step": 2985 + }, + { + "epoch": 1.5, + "learning_rate": 3.969463130731183e-05, + "loss": 2.7085, + "step": 2990 + }, + { + "epoch": 1.5, + "learning_rate": 3.966271543232769e-05, + "loss": 2.6464, + "step": 2995 + }, + { + "epoch": 1.51, + "learning_rate": 3.9630763087374625e-05, + "loss": 2.4105, + "step": 3000 + }, + { + "epoch": 1.51, + "learning_rate": 3.9598774351926394e-05, + "loss": 2.7296, + "step": 3005 + }, + { + "epoch": 1.51, + "learning_rate": 3.956674930554725e-05, + "loss": 2.5319, + "step": 3010 + }, + { + "epoch": 1.51, + "learning_rate": 3.9534688027891785e-05, + "loss": 2.5544, + "step": 3015 + }, + { + "epoch": 1.52, + "learning_rate": 3.9502590598704696e-05, + "loss": 2.7981, + "step": 3020 + }, + { + "epoch": 1.52, + "learning_rate": 3.94704570978206e-05, + "loss": 2.6186, + "step": 3025 + }, + { + "epoch": 1.52, + "learning_rate": 3.943828760516382e-05, + "loss": 2.6443, + "step": 3030 + }, + { + "epoch": 1.52, + "learning_rate": 3.940608220074822e-05, + "loss": 2.4564, + "step": 3035 + }, + { + "epoch": 1.53, + "learning_rate": 
3.9373840964676976e-05, + "loss": 2.8394, + "step": 3040 + }, + { + "epoch": 1.53, + "learning_rate": 3.934156397714238e-05, + "loss": 2.3683, + "step": 3045 + }, + { + "epoch": 1.53, + "learning_rate": 3.930925131842567e-05, + "loss": 2.875, + "step": 3050 + }, + { + "epoch": 1.53, + "learning_rate": 3.9276903068896784e-05, + "loss": 2.6381, + "step": 3055 + }, + { + "epoch": 1.54, + "learning_rate": 3.9244519309014206e-05, + "loss": 2.8077, + "step": 3060 + }, + { + "epoch": 1.54, + "learning_rate": 3.9212100119324706e-05, + "loss": 2.6208, + "step": 3065 + }, + { + "epoch": 1.54, + "learning_rate": 3.917964558046322e-05, + "loss": 2.9, + "step": 3070 + }, + { + "epoch": 1.54, + "learning_rate": 3.9147155773152586e-05, + "loss": 2.5678, + "step": 3075 + }, + { + "epoch": 1.55, + "learning_rate": 3.911463077820336e-05, + "loss": 2.7059, + "step": 3080 + }, + { + "epoch": 1.55, + "learning_rate": 3.90820706765136e-05, + "loss": 2.7433, + "step": 3085 + }, + { + "epoch": 1.55, + "learning_rate": 3.9049475549068757e-05, + "loss": 2.5916, + "step": 3090 + }, + { + "epoch": 1.55, + "learning_rate": 3.901684547694132e-05, + "loss": 2.6978, + "step": 3095 + }, + { + "epoch": 1.56, + "learning_rate": 3.8984180541290724e-05, + "loss": 2.8709, + "step": 3100 + }, + { + "epoch": 1.56, + "learning_rate": 3.895148082336313e-05, + "loss": 2.8652, + "step": 3105 + }, + { + "epoch": 1.56, + "learning_rate": 3.89187464044912e-05, + "loss": 3.0331, + "step": 3110 + }, + { + "epoch": 1.56, + "learning_rate": 3.8885977366093905e-05, + "loss": 2.7334, + "step": 3115 + }, + { + "epoch": 1.57, + "learning_rate": 3.885317378967633e-05, + "loss": 2.6026, + "step": 3120 + }, + { + "epoch": 1.57, + "learning_rate": 3.882033575682945e-05, + "loss": 2.831, + "step": 3125 + }, + { + "epoch": 1.57, + "learning_rate": 3.878746334922996e-05, + "loss": 2.5585, + "step": 3130 + }, + { + "epoch": 1.57, + "learning_rate": 3.875455664864005e-05, + "loss": 2.762, + "step": 3135 + }, + { + "epoch": 
1.58, + "learning_rate": 3.8721615736907205e-05, + "loss": 2.769, + "step": 3140 + }, + { + "epoch": 1.58, + "learning_rate": 3.868864069596399e-05, + "loss": 2.6496, + "step": 3145 + }, + { + "epoch": 1.58, + "learning_rate": 3.8655631607827876e-05, + "loss": 2.486, + "step": 3150 + }, + { + "epoch": 1.58, + "learning_rate": 3.8622588554601e-05, + "loss": 2.8387, + "step": 3155 + }, + { + "epoch": 1.59, + "learning_rate": 3.858951161847001e-05, + "loss": 2.816, + "step": 3160 + }, + { + "epoch": 1.59, + "learning_rate": 3.855640088170579e-05, + "loss": 2.8128, + "step": 3165 + }, + { + "epoch": 1.59, + "learning_rate": 3.8523256426663313e-05, + "loss": 2.5875, + "step": 3170 + }, + { + "epoch": 1.59, + "learning_rate": 3.8490078335781423e-05, + "loss": 2.5853, + "step": 3175 + }, + { + "epoch": 1.6, + "learning_rate": 3.845686669158263e-05, + "loss": 2.7638, + "step": 3180 + }, + { + "epoch": 1.6, + "learning_rate": 3.842362157667287e-05, + "loss": 2.8281, + "step": 3185 + }, + { + "epoch": 1.6, + "learning_rate": 3.839700144139754e-05, + "loss": 2.3574, + "step": 3190 + }, + { + "epoch": 1.6, + "learning_rate": 3.836369628764067e-05, + "loss": 2.5533, + "step": 3195 + }, + { + "epoch": 1.61, + "learning_rate": 3.833035789491177e-05, + "loss": 2.4513, + "step": 3200 + }, + { + "epoch": 1.61, + "learning_rate": 3.8296986346132036e-05, + "loss": 2.567, + "step": 3205 + }, + { + "epoch": 1.61, + "learning_rate": 3.826358172430516e-05, + "loss": 2.6501, + "step": 3210 + }, + { + "epoch": 1.61, + "learning_rate": 3.823014411251708e-05, + "loss": 2.6927, + "step": 3215 + }, + { + "epoch": 1.62, + "learning_rate": 3.819667359393578e-05, + "loss": 2.6094, + "step": 3220 + }, + { + "epoch": 1.62, + "learning_rate": 3.816317025181111e-05, + "loss": 2.6776, + "step": 3225 + }, + { + "epoch": 1.62, + "learning_rate": 3.8129634169474563e-05, + "loss": 2.5833, + "step": 3230 + }, + { + "epoch": 1.62, + "learning_rate": 3.809606543033905e-05, + "loss": 2.6483, + "step": 3235 + 
}, + { + "epoch": 1.63, + "learning_rate": 3.8062464117898724e-05, + "loss": 2.4814, + "step": 3240 + }, + { + "epoch": 1.63, + "learning_rate": 3.802883031572874e-05, + "loss": 2.5217, + "step": 3245 + }, + { + "epoch": 1.63, + "learning_rate": 3.799516410748506e-05, + "loss": 2.5127, + "step": 3250 + }, + { + "epoch": 1.63, + "learning_rate": 3.796146557690428e-05, + "loss": 2.9325, + "step": 3255 + }, + { + "epoch": 1.64, + "learning_rate": 3.792773480780335e-05, + "loss": 2.4567, + "step": 3260 + }, + { + "epoch": 1.64, + "learning_rate": 3.789397188407944e-05, + "loss": 2.6045, + "step": 3265 + }, + { + "epoch": 1.64, + "learning_rate": 3.786017688970967e-05, + "loss": 2.5031, + "step": 3270 + }, + { + "epoch": 1.64, + "learning_rate": 3.782634990875094e-05, + "loss": 2.7692, + "step": 3275 + }, + { + "epoch": 1.65, + "learning_rate": 3.779249102533972e-05, + "loss": 2.6893, + "step": 3280 + }, + { + "epoch": 1.65, + "learning_rate": 3.7758600323691806e-05, + "loss": 2.604, + "step": 3285 + }, + { + "epoch": 1.65, + "learning_rate": 3.7724677888102145e-05, + "loss": 2.6633, + "step": 3290 + }, + { + "epoch": 1.65, + "learning_rate": 3.769072380294463e-05, + "loss": 2.4804, + "step": 3295 + }, + { + "epoch": 1.66, + "learning_rate": 3.765673815267184e-05, + "loss": 2.5951, + "step": 3300 + }, + { + "epoch": 1.66, + "learning_rate": 3.76227210218149e-05, + "loss": 2.598, + "step": 3305 + }, + { + "epoch": 1.66, + "learning_rate": 3.758867249498321e-05, + "loss": 2.8652, + "step": 3310 + }, + { + "epoch": 1.66, + "learning_rate": 3.7554592656864285e-05, + "loss": 2.5312, + "step": 3315 + }, + { + "epoch": 1.67, + "learning_rate": 3.752048159222349e-05, + "loss": 2.7432, + "step": 3320 + }, + { + "epoch": 1.67, + "learning_rate": 3.748633938590388e-05, + "loss": 2.64, + "step": 3325 + }, + { + "epoch": 1.67, + "learning_rate": 3.745216612282596e-05, + "loss": 2.751, + "step": 3330 + }, + { + "epoch": 1.67, + "learning_rate": 3.741796188798747e-05, + "loss": 
2.7636, + "step": 3335 + }, + { + "epoch": 1.68, + "learning_rate": 3.738372676646321e-05, + "loss": 2.8103, + "step": 3340 + }, + { + "epoch": 1.68, + "learning_rate": 3.7349460843404796e-05, + "loss": 2.6672, + "step": 3345 + }, + { + "epoch": 1.68, + "learning_rate": 3.731516420404043e-05, + "loss": 2.5272, + "step": 3350 + }, + { + "epoch": 1.68, + "learning_rate": 3.728083693367474e-05, + "loss": 2.6674, + "step": 3355 + }, + { + "epoch": 1.69, + "learning_rate": 3.724647911768854e-05, + "loss": 2.708, + "step": 3360 + }, + { + "epoch": 1.69, + "learning_rate": 3.72120908415386e-05, + "loss": 2.3902, + "step": 3365 + }, + { + "epoch": 1.69, + "learning_rate": 3.7177672190757476e-05, + "loss": 2.8144, + "step": 3370 + }, + { + "epoch": 1.69, + "learning_rate": 3.714322325095325e-05, + "loss": 2.8525, + "step": 3375 + }, + { + "epoch": 1.7, + "learning_rate": 3.7108744107809364e-05, + "loss": 2.4671, + "step": 3380 + }, + { + "epoch": 1.7, + "learning_rate": 3.7074234847084363e-05, + "loss": 2.3723, + "step": 3385 + }, + { + "epoch": 1.7, + "learning_rate": 3.703969555461173e-05, + "loss": 2.8825, + "step": 3390 + }, + { + "epoch": 1.7, + "learning_rate": 3.700512631629961e-05, + "loss": 2.768, + "step": 3395 + }, + { + "epoch": 1.71, + "learning_rate": 3.6970527218130644e-05, + "loss": 2.4601, + "step": 3400 + }, + { + "epoch": 1.71, + "learning_rate": 3.693589834616176e-05, + "loss": 2.7482, + "step": 3405 + }, + { + "epoch": 1.71, + "learning_rate": 3.6901239786523914e-05, + "loss": 2.4536, + "step": 3410 + }, + { + "epoch": 1.71, + "learning_rate": 3.686655162542192e-05, + "loss": 2.571, + "step": 3415 + }, + { + "epoch": 1.72, + "learning_rate": 3.683183394913422e-05, + "loss": 2.5796, + "step": 3420 + }, + { + "epoch": 1.72, + "learning_rate": 3.6797086844012654e-05, + "loss": 2.7312, + "step": 3425 + }, + { + "epoch": 1.72, + "learning_rate": 3.676231039648227e-05, + "loss": 2.5584, + "step": 3430 + }, + { + "epoch": 1.72, + "learning_rate": 
3.67275046930411e-05, + "loss": 2.6298, + "step": 3435 + }, + { + "epoch": 1.73, + "learning_rate": 3.669266982025993e-05, + "loss": 2.628, + "step": 3440 + }, + { + "epoch": 1.73, + "learning_rate": 3.6657805864782116e-05, + "loss": 2.5943, + "step": 3445 + }, + { + "epoch": 1.73, + "learning_rate": 3.662291291332333e-05, + "loss": 2.6121, + "step": 3450 + }, + { + "epoch": 1.73, + "learning_rate": 3.658799105267138e-05, + "loss": 2.6501, + "step": 3455 + }, + { + "epoch": 1.74, + "learning_rate": 3.655304036968597e-05, + "loss": 2.5708, + "step": 3460 + }, + { + "epoch": 1.74, + "learning_rate": 3.651806095129849e-05, + "loss": 2.8039, + "step": 3465 + }, + { + "epoch": 1.74, + "learning_rate": 3.648305288451183e-05, + "loss": 2.8159, + "step": 3470 + }, + { + "epoch": 1.74, + "learning_rate": 3.64480162564001e-05, + "loss": 2.6506, + "step": 3475 + }, + { + "epoch": 1.75, + "learning_rate": 3.641295115410847e-05, + "loss": 2.8982, + "step": 3480 + }, + { + "epoch": 1.75, + "learning_rate": 3.637785766485291e-05, + "loss": 2.8018, + "step": 3485 + }, + { + "epoch": 1.75, + "learning_rate": 3.634273587592003e-05, + "loss": 2.5278, + "step": 3490 + }, + { + "epoch": 1.75, + "learning_rate": 3.630758587466681e-05, + "loss": 2.8937, + "step": 3495 + }, + { + "epoch": 1.76, + "learning_rate": 3.6272407748520394e-05, + "loss": 2.1922, + "step": 3500 + }, + { + "epoch": 1.76, + "learning_rate": 3.623720158497789e-05, + "loss": 2.4288, + "step": 3505 + }, + { + "epoch": 1.76, + "learning_rate": 3.620196747160613e-05, + "loss": 2.9423, + "step": 3510 + }, + { + "epoch": 1.76, + "learning_rate": 3.61667054960415e-05, + "loss": 2.5559, + "step": 3515 + }, + { + "epoch": 1.77, + "learning_rate": 3.613141574598965e-05, + "loss": 2.7228, + "step": 3520 + }, + { + "epoch": 1.77, + "learning_rate": 3.6096098309225325e-05, + "loss": 2.6933, + "step": 3525 + }, + { + "epoch": 1.77, + "learning_rate": 3.606075327359212e-05, + "loss": 2.5865, + "step": 3530 + }, + { + "epoch": 1.77, 
+ "learning_rate": 3.602538072700229e-05, + "loss": 2.5846, + "step": 3535 + }, + { + "epoch": 1.78, + "learning_rate": 3.598998075743653e-05, + "loss": 2.8847, + "step": 3540 + }, + { + "epoch": 1.78, + "learning_rate": 3.595455345294372e-05, + "loss": 2.5386, + "step": 3545 + }, + { + "epoch": 1.78, + "learning_rate": 3.5919098901640735e-05, + "loss": 2.5932, + "step": 3550 + }, + { + "epoch": 1.78, + "learning_rate": 3.588361719171223e-05, + "loss": 2.4481, + "step": 3555 + }, + { + "epoch": 1.79, + "learning_rate": 3.584810841141039e-05, + "loss": 2.5773, + "step": 3560 + }, + { + "epoch": 1.79, + "learning_rate": 3.581257264905476e-05, + "loss": 2.6305, + "step": 3565 + }, + { + "epoch": 1.79, + "learning_rate": 3.5777009993031955e-05, + "loss": 2.7, + "step": 3570 + }, + { + "epoch": 1.79, + "learning_rate": 3.574142053179553e-05, + "loss": 2.4408, + "step": 3575 + }, + { + "epoch": 1.8, + "learning_rate": 3.5705804353865677e-05, + "loss": 2.7292, + "step": 3580 + }, + { + "epoch": 1.8, + "learning_rate": 3.567016154782906e-05, + "loss": 2.4371, + "step": 3585 + }, + { + "epoch": 1.8, + "learning_rate": 3.563449220233855e-05, + "loss": 2.7977, + "step": 3590 + }, + { + "epoch": 1.8, + "learning_rate": 3.559879640611305e-05, + "loss": 2.7357, + "step": 3595 + }, + { + "epoch": 1.81, + "learning_rate": 3.5563074247937244e-05, + "loss": 2.6255, + "step": 3600 + }, + { + "epoch": 1.81, + "learning_rate": 3.552732581666139e-05, + "loss": 2.8134, + "step": 3605 + }, + { + "epoch": 1.81, + "learning_rate": 3.549155120120109e-05, + "loss": 2.702, + "step": 3610 + }, + { + "epoch": 1.81, + "learning_rate": 3.545575049053707e-05, + "loss": 2.4854, + "step": 3615 + }, + { + "epoch": 1.82, + "learning_rate": 3.541992377371497e-05, + "loss": 2.7596, + "step": 3620 + }, + { + "epoch": 1.82, + "learning_rate": 3.53840711398451e-05, + "loss": 2.6891, + "step": 3625 + }, + { + "epoch": 1.82, + "learning_rate": 3.5348192678102254e-05, + "loss": 2.5619, + "step": 3630 + }, + { 
+ "epoch": 1.82, + "learning_rate": 3.531228847772544e-05, + "loss": 2.5719, + "step": 3635 + }, + { + "epoch": 1.83, + "learning_rate": 3.52763586280177e-05, + "loss": 2.6297, + "step": 3640 + }, + { + "epoch": 1.83, + "learning_rate": 3.524040321834589e-05, + "loss": 2.5114, + "step": 3645 + }, + { + "epoch": 1.83, + "learning_rate": 3.52044223381404e-05, + "loss": 2.8536, + "step": 3650 + }, + { + "epoch": 1.83, + "learning_rate": 3.516841607689501e-05, + "loss": 2.5271, + "step": 3655 + }, + { + "epoch": 1.84, + "learning_rate": 3.51323845241666e-05, + "loss": 2.5654, + "step": 3660 + }, + { + "epoch": 1.84, + "learning_rate": 3.509632776957497e-05, + "loss": 2.6269, + "step": 3665 + }, + { + "epoch": 1.84, + "learning_rate": 3.50602459028026e-05, + "loss": 2.4401, + "step": 3670 + }, + { + "epoch": 1.84, + "learning_rate": 3.502413901359445e-05, + "loss": 2.4368, + "step": 3675 + }, + { + "epoch": 1.85, + "learning_rate": 3.498800719175768e-05, + "loss": 2.6087, + "step": 3680 + }, + { + "epoch": 1.85, + "learning_rate": 3.4951850527161495e-05, + "loss": 2.5927, + "step": 3685 + }, + { + "epoch": 1.85, + "learning_rate": 3.4915669109736876e-05, + "loss": 2.8352, + "step": 3690 + }, + { + "epoch": 1.85, + "learning_rate": 3.487946302947637e-05, + "loss": 2.8485, + "step": 3695 + }, + { + "epoch": 1.86, + "learning_rate": 3.484323237643389e-05, + "loss": 2.6611, + "step": 3700 + }, + { + "epoch": 1.86, + "learning_rate": 3.4806977240724423e-05, + "loss": 2.5251, + "step": 3705 + }, + { + "epoch": 1.86, + "learning_rate": 3.47706977125239e-05, + "loss": 2.6043, + "step": 3710 + }, + { + "epoch": 1.86, + "learning_rate": 3.473439388206887e-05, + "loss": 2.5911, + "step": 3715 + }, + { + "epoch": 1.87, + "learning_rate": 3.469806583965639e-05, + "loss": 2.7092, + "step": 3720 + }, + { + "epoch": 1.87, + "learning_rate": 3.466171367564368e-05, + "loss": 2.4572, + "step": 3725 + }, + { + "epoch": 1.87, + "learning_rate": 3.462533748044801e-05, + "loss": 2.464, + 
"step": 3730 + }, + { + "epoch": 1.87, + "learning_rate": 3.458893734454636e-05, + "loss": 2.8654, + "step": 3735 + }, + { + "epoch": 1.88, + "learning_rate": 3.455251335847531e-05, + "loss": 2.752, + "step": 3740 + }, + { + "epoch": 1.88, + "learning_rate": 3.451606561283074e-05, + "loss": 2.59, + "step": 3745 + }, + { + "epoch": 1.88, + "learning_rate": 3.447959419826763e-05, + "loss": 2.4732, + "step": 3750 + }, + { + "epoch": 1.88, + "learning_rate": 3.444309920549983e-05, + "loss": 2.7611, + "step": 3755 + }, + { + "epoch": 1.89, + "learning_rate": 3.440658072529983e-05, + "loss": 2.5923, + "step": 3760 + }, + { + "epoch": 1.89, + "learning_rate": 3.437003884849854e-05, + "loss": 2.5405, + "step": 3765 + }, + { + "epoch": 1.89, + "learning_rate": 3.433347366598506e-05, + "loss": 2.6368, + "step": 3770 + }, + { + "epoch": 1.89, + "learning_rate": 3.429688526870649e-05, + "loss": 2.5877, + "step": 3775 + }, + { + "epoch": 1.9, + "learning_rate": 3.4260273747667635e-05, + "loss": 2.5978, + "step": 3780 + }, + { + "epoch": 1.9, + "learning_rate": 3.422363919393082e-05, + "loss": 2.7375, + "step": 3785 + }, + { + "epoch": 1.9, + "learning_rate": 3.418698169861567e-05, + "loss": 2.4161, + "step": 3790 + }, + { + "epoch": 1.9, + "learning_rate": 3.415030135289884e-05, + "loss": 2.3959, + "step": 3795 + }, + { + "epoch": 1.91, + "learning_rate": 3.4113598248013875e-05, + "loss": 2.9642, + "step": 3800 + }, + { + "epoch": 1.91, + "learning_rate": 3.40768724752509e-05, + "loss": 2.7313, + "step": 3805 + }, + { + "epoch": 1.91, + "learning_rate": 3.404012412595639e-05, + "loss": 2.6311, + "step": 3810 + }, + { + "epoch": 1.91, + "learning_rate": 3.400335329153304e-05, + "loss": 2.6268, + "step": 3815 + }, + { + "epoch": 1.92, + "learning_rate": 3.396656006343939e-05, + "loss": 2.5252, + "step": 3820 + }, + { + "epoch": 1.92, + "learning_rate": 3.392974453318975e-05, + "loss": 2.5863, + "step": 3825 + }, + { + "epoch": 1.92, + "learning_rate": 3.3892906792353886e-05, + 
"loss": 2.4848, + "step": 3830 + }, + { + "epoch": 1.92, + "learning_rate": 3.3856046932556775e-05, + "loss": 2.5517, + "step": 3835 + }, + { + "epoch": 1.93, + "learning_rate": 3.3819165045478426e-05, + "loss": 2.6679, + "step": 3840 + }, + { + "epoch": 1.93, + "learning_rate": 3.3782261222853654e-05, + "loss": 2.4474, + "step": 3845 + }, + { + "epoch": 1.93, + "learning_rate": 3.37453355564718e-05, + "loss": 2.5378, + "step": 3850 + }, + { + "epoch": 1.93, + "learning_rate": 3.3708388138176584e-05, + "loss": 2.8168, + "step": 3855 + }, + { + "epoch": 1.94, + "learning_rate": 3.367141905986578e-05, + "loss": 2.6556, + "step": 3860 + }, + { + "epoch": 1.94, + "learning_rate": 3.3634428413491054e-05, + "loss": 2.5122, + "step": 3865 + }, + { + "epoch": 1.94, + "learning_rate": 3.359741629105773e-05, + "loss": 2.7044, + "step": 3870 + }, + { + "epoch": 1.94, + "learning_rate": 3.3560382784624524e-05, + "loss": 2.6193, + "step": 3875 + }, + { + "epoch": 1.95, + "learning_rate": 3.352332798630336e-05, + "loss": 3.0051, + "step": 3880 + }, + { + "epoch": 1.95, + "learning_rate": 3.3486251988259134e-05, + "loss": 2.5351, + "step": 3885 + }, + { + "epoch": 1.95, + "learning_rate": 3.344915488270942e-05, + "loss": 2.7273, + "step": 3890 + }, + { + "epoch": 1.95, + "learning_rate": 3.341203676192433e-05, + "loss": 2.3833, + "step": 3895 + }, + { + "epoch": 1.96, + "learning_rate": 3.3374897718226236e-05, + "loss": 2.5073, + "step": 3900 + }, + { + "epoch": 1.96, + "learning_rate": 3.333773784398957e-05, + "loss": 2.7216, + "step": 3905 + }, + { + "epoch": 1.96, + "learning_rate": 3.330055723164055e-05, + "loss": 2.6666, + "step": 3910 + }, + { + "epoch": 1.96, + "learning_rate": 3.326335597365698e-05, + "loss": 2.4959, + "step": 3915 + }, + { + "epoch": 1.97, + "learning_rate": 3.322613416256802e-05, + "loss": 2.8799, + "step": 3920 + }, + { + "epoch": 1.97, + "learning_rate": 3.3188891890953956e-05, + "loss": 2.6041, + "step": 3925 + }, + { + "epoch": 1.97, + 
"learning_rate": 3.315162925144595e-05, + "loss": 2.7423, + "step": 3930 + }, + { + "epoch": 1.97, + "learning_rate": 3.3114346336725834e-05, + "loss": 2.5765, + "step": 3935 + }, + { + "epoch": 1.98, + "learning_rate": 3.3077043239525874e-05, + "loss": 2.152, + "step": 3940 + }, + { + "epoch": 1.98, + "learning_rate": 3.303972005262852e-05, + "loss": 2.6474, + "step": 3945 + }, + { + "epoch": 1.98, + "learning_rate": 3.3002376868866206e-05, + "loss": 2.473, + "step": 3950 + }, + { + "epoch": 1.98, + "learning_rate": 3.296501378112109e-05, + "loss": 2.6779, + "step": 3955 + }, + { + "epoch": 1.99, + "learning_rate": 3.292763088232485e-05, + "loss": 2.7439, + "step": 3960 + }, + { + "epoch": 1.99, + "learning_rate": 3.289022826545844e-05, + "loss": 2.8874, + "step": 3965 + }, + { + "epoch": 1.99, + "learning_rate": 3.2852806023551834e-05, + "loss": 2.7191, + "step": 3970 + }, + { + "epoch": 1.99, + "learning_rate": 3.281536424968383e-05, + "loss": 2.563, + "step": 3975 + }, + { + "epoch": 2.0, + "learning_rate": 3.2777903036981836e-05, + "loss": 2.7884, + "step": 3980 + }, + { + "epoch": 2.0, + "learning_rate": 3.274042247862158e-05, + "loss": 2.5221, + "step": 3985 + }, + { + "epoch": 2.0, + "learning_rate": 3.270292266782689e-05, + "loss": 2.462, + "step": 3990 + }, + { + "epoch": 2.0, + "learning_rate": 3.266540369786953e-05, + "loss": 2.5059, + "step": 3995 + }, + { + "epoch": 2.01, + "learning_rate": 3.262786566206888e-05, + "loss": 2.688, + "step": 4000 + }, + { + "epoch": 2.01, + "learning_rate": 3.259030865379174e-05, + "loss": 2.7194, + "step": 4005 + }, + { + "epoch": 2.01, + "learning_rate": 3.255273276645214e-05, + "loss": 2.4145, + "step": 4010 + }, + { + "epoch": 2.01, + "learning_rate": 3.251513809351101e-05, + "loss": 2.5469, + "step": 4015 + }, + { + "epoch": 2.02, + "learning_rate": 3.247752472847605e-05, + "loss": 2.7916, + "step": 4020 + }, + { + "epoch": 2.02, + "learning_rate": 3.243989276490143e-05, + "loss": 2.5973, + "step": 4025 + }, + { + 
"epoch": 2.02, + "learning_rate": 3.240224229638758e-05, + "loss": 2.6732, + "step": 4030 + }, + { + "epoch": 2.02, + "learning_rate": 3.236457341658097e-05, + "loss": 2.2159, + "step": 4035 + }, + { + "epoch": 2.03, + "learning_rate": 3.232688621917386e-05, + "loss": 2.6783, + "step": 4040 + }, + { + "epoch": 2.03, + "learning_rate": 3.2289180797904055e-05, + "loss": 2.8193, + "step": 4045 + }, + { + "epoch": 2.03, + "learning_rate": 3.22514572465547e-05, + "loss": 2.3885, + "step": 4050 + }, + { + "epoch": 2.03, + "learning_rate": 3.221371565895404e-05, + "loss": 2.6288, + "step": 4055 + }, + { + "epoch": 2.04, + "learning_rate": 3.217595612897516e-05, + "loss": 2.5372, + "step": 4060 + }, + { + "epoch": 2.04, + "learning_rate": 3.21381787505358e-05, + "loss": 2.7257, + "step": 4065 + }, + { + "epoch": 2.04, + "learning_rate": 3.210038361759807e-05, + "loss": 2.6329, + "step": 4070 + }, + { + "epoch": 2.04, + "learning_rate": 3.206257082416825e-05, + "loss": 2.6518, + "step": 4075 + }, + { + "epoch": 2.05, + "learning_rate": 3.2024740464296544e-05, + "loss": 2.4218, + "step": 4080 + }, + { + "epoch": 2.05, + "learning_rate": 3.198689263207686e-05, + "loss": 2.7266, + "step": 4085 + }, + { + "epoch": 2.05, + "learning_rate": 3.1949027421646546e-05, + "loss": 2.4466, + "step": 4090 + }, + { + "epoch": 2.05, + "learning_rate": 3.1911144927186185e-05, + "loss": 2.6014, + "step": 4095 + }, + { + "epoch": 2.06, + "learning_rate": 3.1873245242919354e-05, + "loss": 2.7072, + "step": 4100 + }, + { + "epoch": 2.06, + "learning_rate": 3.183532846311236e-05, + "loss": 2.5211, + "step": 4105 + }, + { + "epoch": 2.06, + "learning_rate": 3.179739468207406e-05, + "loss": 2.5787, + "step": 4110 + }, + { + "epoch": 2.06, + "learning_rate": 3.175944399415559e-05, + "loss": 2.5141, + "step": 4115 + }, + { + "epoch": 2.07, + "learning_rate": 3.1721476493750134e-05, + "loss": 2.6807, + "step": 4120 + }, + { + "epoch": 2.07, + "learning_rate": 3.1683492275292695e-05, + "loss": 2.5611, 
+ "step": 4125 + }, + { + "epoch": 2.07, + "learning_rate": 3.164549143325985e-05, + "loss": 2.5864, + "step": 4130 + }, + { + "epoch": 2.08, + "learning_rate": 3.160747406216953e-05, + "loss": 2.7378, + "step": 4135 + }, + { + "epoch": 2.08, + "learning_rate": 3.156944025658079e-05, + "loss": 2.7334, + "step": 4140 + }, + { + "epoch": 2.08, + "learning_rate": 3.153139011109354e-05, + "loss": 2.4465, + "step": 4145 + }, + { + "epoch": 2.08, + "learning_rate": 3.149332372034834e-05, + "loss": 2.5482, + "step": 4150 + }, + { + "epoch": 2.09, + "learning_rate": 3.145524117902617e-05, + "loss": 2.7633, + "step": 4155 + }, + { + "epoch": 2.09, + "learning_rate": 3.141714258184816e-05, + "loss": 2.6796, + "step": 4160 + }, + { + "epoch": 2.09, + "learning_rate": 3.137902802357538e-05, + "loss": 2.6622, + "step": 4165 + }, + { + "epoch": 2.09, + "learning_rate": 3.134089759900861e-05, + "loss": 2.9277, + "step": 4170 + }, + { + "epoch": 2.1, + "learning_rate": 3.130275140298808e-05, + "loss": 2.351, + "step": 4175 + }, + { + "epoch": 2.1, + "learning_rate": 3.1264589530393263e-05, + "loss": 2.654, + "step": 4180 + }, + { + "epoch": 2.1, + "learning_rate": 3.12264120761426e-05, + "loss": 2.6229, + "step": 4185 + }, + { + "epoch": 2.1, + "learning_rate": 3.118821913519333e-05, + "loss": 2.6415, + "step": 4190 + }, + { + "epoch": 2.11, + "learning_rate": 3.115001080254115e-05, + "loss": 2.6029, + "step": 4195 + }, + { + "epoch": 2.11, + "learning_rate": 3.1111787173220095e-05, + "loss": 2.6775, + "step": 4200 + }, + { + "epoch": 2.11, + "learning_rate": 3.107354834230223e-05, + "loss": 2.7393, + "step": 4205 + }, + { + "epoch": 2.11, + "learning_rate": 3.1035294404897396e-05, + "loss": 2.5886, + "step": 4210 + }, + { + "epoch": 2.12, + "learning_rate": 3.099702545615307e-05, + "loss": 2.3812, + "step": 4215 + }, + { + "epoch": 2.12, + "learning_rate": 3.0958741591254026e-05, + "loss": 3.1271, + "step": 4220 + }, + { + "epoch": 2.12, + "learning_rate": 3.0920442905422145e-05, 
+ "loss": 2.7758, + "step": 4225 + }, + { + "epoch": 2.12, + "learning_rate": 3.0882129493916167e-05, + "loss": 2.7682, + "step": 4230 + }, + { + "epoch": 2.13, + "learning_rate": 3.0843801452031466e-05, + "loss": 2.2826, + "step": 4235 + }, + { + "epoch": 2.13, + "learning_rate": 3.0805458875099804e-05, + "loss": 2.5934, + "step": 4240 + }, + { + "epoch": 2.13, + "learning_rate": 3.0767101858489103e-05, + "loss": 2.7693, + "step": 4245 + }, + { + "epoch": 2.13, + "learning_rate": 3.072873049760319e-05, + "loss": 2.6955, + "step": 4250 + }, + { + "epoch": 2.14, + "learning_rate": 3.0690344887881565e-05, + "loss": 2.7344, + "step": 4255 + }, + { + "epoch": 2.14, + "learning_rate": 3.0651945124799185e-05, + "loss": 2.6215, + "step": 4260 + }, + { + "epoch": 2.14, + "learning_rate": 3.061353130386619e-05, + "loss": 2.5794, + "step": 4265 + }, + { + "epoch": 2.14, + "learning_rate": 3.05751035206277e-05, + "loss": 2.671, + "step": 4270 + }, + { + "epoch": 2.15, + "learning_rate": 3.0536661870663576e-05, + "loss": 2.7265, + "step": 4275 + }, + { + "epoch": 2.15, + "learning_rate": 3.0498206449588136e-05, + "loss": 2.9018, + "step": 4280 + }, + { + "epoch": 2.15, + "learning_rate": 3.045973735304996e-05, + "loss": 2.4823, + "step": 4285 + }, + { + "epoch": 2.15, + "learning_rate": 3.0421254676731664e-05, + "loss": 2.5454, + "step": 4290 + }, + { + "epoch": 2.16, + "learning_rate": 3.0382758516349623e-05, + "loss": 2.7888, + "step": 4295 + }, + { + "epoch": 2.16, + "learning_rate": 3.0344248967653754e-05, + "loss": 2.596, + "step": 4300 + }, + { + "epoch": 2.16, + "learning_rate": 3.030572612642727e-05, + "loss": 2.7905, + "step": 4305 + }, + { + "epoch": 2.16, + "learning_rate": 3.0267190088486452e-05, + "loss": 2.5676, + "step": 4310 + }, + { + "epoch": 2.17, + "learning_rate": 3.0228640949680388e-05, + "loss": 2.5836, + "step": 4315 + }, + { + "epoch": 2.17, + "learning_rate": 3.0190078805890786e-05, + "loss": 2.7138, + "step": 4320 + }, + { + "epoch": 2.17, + 
"learning_rate": 3.015150375303168e-05, + "loss": 2.4747, + "step": 4325 + }, + { + "epoch": 2.17, + "learning_rate": 3.011291588704919e-05, + "loss": 2.5188, + "step": 4330 + }, + { + "epoch": 2.18, + "learning_rate": 3.0074315303921353e-05, + "loss": 2.2952, + "step": 4335 + }, + { + "epoch": 2.18, + "learning_rate": 3.0035702099657787e-05, + "loss": 2.4875, + "step": 4340 + }, + { + "epoch": 2.18, + "learning_rate": 2.9997076370299544e-05, + "loss": 2.5085, + "step": 4345 + }, + { + "epoch": 2.18, + "learning_rate": 2.9958438211918805e-05, + "loss": 2.7452, + "step": 4350 + }, + { + "epoch": 2.19, + "learning_rate": 2.9919787720618676e-05, + "loss": 2.334, + "step": 4355 + }, + { + "epoch": 2.19, + "learning_rate": 2.9881124992532933e-05, + "loss": 2.6904, + "step": 4360 + }, + { + "epoch": 2.19, + "learning_rate": 2.984245012382579e-05, + "loss": 2.5706, + "step": 4365 + }, + { + "epoch": 2.19, + "learning_rate": 2.980376321069165e-05, + "loss": 2.6657, + "step": 4370 + }, + { + "epoch": 2.2, + "learning_rate": 2.976506434935489e-05, + "loss": 2.4117, + "step": 4375 + }, + { + "epoch": 2.2, + "learning_rate": 2.9726353636069582e-05, + "loss": 2.4446, + "step": 4380 + }, + { + "epoch": 2.2, + "learning_rate": 2.968763116711931e-05, + "loss": 2.5681, + "step": 4385 + }, + { + "epoch": 2.2, + "learning_rate": 2.964889703881686e-05, + "loss": 2.7244, + "step": 4390 + }, + { + "epoch": 2.21, + "learning_rate": 2.9610151347504044e-05, + "loss": 2.577, + "step": 4395 + }, + { + "epoch": 2.21, + "learning_rate": 2.957139418955143e-05, + "loss": 2.6176, + "step": 4400 + }, + { + "epoch": 2.21, + "learning_rate": 2.9532625661358105e-05, + "loss": 2.5708, + "step": 4405 + }, + { + "epoch": 2.21, + "learning_rate": 2.949384585935143e-05, + "loss": 2.4873, + "step": 4410 + }, + { + "epoch": 2.22, + "learning_rate": 2.9455054879986797e-05, + "loss": 2.5953, + "step": 4415 + }, + { + "epoch": 2.22, + "learning_rate": 2.941625281974743e-05, + "loss": 2.5578, + "step": 4420 + 
}, + { + "epoch": 2.22, + "learning_rate": 2.93774397751441e-05, + "loss": 2.5782, + "step": 4425 + }, + { + "epoch": 2.22, + "learning_rate": 2.9338615842714883e-05, + "loss": 2.8227, + "step": 4430 + }, + { + "epoch": 2.23, + "learning_rate": 2.9299781119024956e-05, + "loss": 2.4317, + "step": 4435 + }, + { + "epoch": 2.23, + "learning_rate": 2.926093570066633e-05, + "loss": 2.7054, + "step": 4440 + }, + { + "epoch": 2.23, + "learning_rate": 2.9222079684257608e-05, + "loss": 2.7207, + "step": 4445 + }, + { + "epoch": 2.23, + "learning_rate": 2.9183213166443778e-05, + "loss": 2.6732, + "step": 4450 + }, + { + "epoch": 2.24, + "learning_rate": 2.9144336243895927e-05, + "loss": 2.4768, + "step": 4455 + }, + { + "epoch": 2.24, + "learning_rate": 2.910544901331101e-05, + "loss": 2.5146, + "step": 4460 + }, + { + "epoch": 2.24, + "learning_rate": 2.9066551571411645e-05, + "loss": 2.4941, + "step": 4465 + }, + { + "epoch": 2.24, + "learning_rate": 2.902764401494584e-05, + "loss": 2.1425, + "step": 4470 + }, + { + "epoch": 2.25, + "learning_rate": 2.898872644068676e-05, + "loss": 2.6335, + "step": 4475 + }, + { + "epoch": 2.25, + "learning_rate": 2.8949798945432483e-05, + "loss": 2.4093, + "step": 4480 + }, + { + "epoch": 2.25, + "learning_rate": 2.8910861626005776e-05, + "loss": 2.4006, + "step": 4485 + }, + { + "epoch": 2.25, + "learning_rate": 2.8871914579253824e-05, + "loss": 2.4762, + "step": 4490 + }, + { + "epoch": 2.26, + "learning_rate": 2.8832957902048014e-05, + "loss": 2.6309, + "step": 4495 + }, + { + "epoch": 2.26, + "learning_rate": 2.8793991691283684e-05, + "loss": 2.7734, + "step": 4500 + }, + { + "epoch": 2.26, + "learning_rate": 2.87550160438799e-05, + "loss": 2.5444, + "step": 4505 + }, + { + "epoch": 2.26, + "learning_rate": 2.871603105677917e-05, + "loss": 2.5277, + "step": 4510 + }, + { + "epoch": 2.27, + "learning_rate": 2.8677036826947264e-05, + "loss": 2.5775, + "step": 4515 + }, + { + "epoch": 2.27, + "learning_rate": 2.863803345137291e-05, + 
"loss": 2.4524, + "step": 4520 + }, + { + "epoch": 2.27, + "learning_rate": 2.8599021027067608e-05, + "loss": 2.572, + "step": 4525 + }, + { + "epoch": 2.27, + "learning_rate": 2.855999965106536e-05, + "loss": 2.8112, + "step": 4530 + }, + { + "epoch": 2.28, + "learning_rate": 2.8520969420422427e-05, + "loss": 2.65, + "step": 4535 + }, + { + "epoch": 2.28, + "learning_rate": 2.8481930432217096e-05, + "loss": 2.4675, + "step": 4540 + }, + { + "epoch": 2.28, + "learning_rate": 2.844288278354943e-05, + "loss": 2.668, + "step": 4545 + }, + { + "epoch": 2.28, + "learning_rate": 2.8403826571541046e-05, + "loss": 2.7135, + "step": 4550 + }, + { + "epoch": 2.29, + "learning_rate": 2.8364761893334858e-05, + "loss": 2.5212, + "step": 4555 + }, + { + "epoch": 2.29, + "learning_rate": 2.8325688846094817e-05, + "loss": 2.698, + "step": 4560 + }, + { + "epoch": 2.29, + "learning_rate": 2.828660752700572e-05, + "loss": 2.6517, + "step": 4565 + }, + { + "epoch": 2.29, + "learning_rate": 2.8247518033272924e-05, + "loss": 2.6687, + "step": 4570 + }, + { + "epoch": 2.3, + "learning_rate": 2.8208420462122105e-05, + "loss": 2.4934, + "step": 4575 + }, + { + "epoch": 2.3, + "learning_rate": 2.816931491079906e-05, + "loss": 2.3762, + "step": 4580 + }, + { + "epoch": 2.3, + "learning_rate": 2.8130201476569413e-05, + "loss": 2.6873, + "step": 4585 + }, + { + "epoch": 2.3, + "learning_rate": 2.8091080256718398e-05, + "loss": 2.6981, + "step": 4590 + }, + { + "epoch": 2.31, + "learning_rate": 2.805195134855061e-05, + "loss": 2.6814, + "step": 4595 + }, + { + "epoch": 2.31, + "learning_rate": 2.8012814849389785e-05, + "loss": 2.6641, + "step": 4600 + }, + { + "epoch": 2.31, + "learning_rate": 2.797367085657852e-05, + "loss": 2.488, + "step": 4605 + }, + { + "epoch": 2.31, + "learning_rate": 2.793451946747806e-05, + "loss": 2.4358, + "step": 4610 + }, + { + "epoch": 2.32, + "learning_rate": 2.7895360779468044e-05, + "loss": 2.7458, + "step": 4615 + }, + { + "epoch": 2.32, + "learning_rate": 
2.785619488994627e-05, + "loss": 2.7558, + "step": 4620 + }, + { + "epoch": 2.32, + "learning_rate": 2.7817021896328427e-05, + "loss": 2.6906, + "step": 4625 + }, + { + "epoch": 2.32, + "learning_rate": 2.7777841896047914e-05, + "loss": 2.5384, + "step": 4630 + }, + { + "epoch": 2.33, + "learning_rate": 2.7738654986555523e-05, + "loss": 2.8354, + "step": 4635 + }, + { + "epoch": 2.33, + "learning_rate": 2.7699461265319242e-05, + "loss": 2.6352, + "step": 4640 + }, + { + "epoch": 2.33, + "learning_rate": 2.7660260829824003e-05, + "loss": 2.5602, + "step": 4645 + }, + { + "epoch": 2.33, + "learning_rate": 2.7621053777571425e-05, + "loss": 2.7109, + "step": 4650 + }, + { + "epoch": 2.34, + "learning_rate": 2.7581840206079616e-05, + "loss": 2.7317, + "step": 4655 + }, + { + "epoch": 2.34, + "learning_rate": 2.754262021288287e-05, + "loss": 2.6874, + "step": 4660 + }, + { + "epoch": 2.34, + "learning_rate": 2.750339389553146e-05, + "loss": 2.5854, + "step": 4665 + }, + { + "epoch": 2.34, + "learning_rate": 2.7464161351591393e-05, + "loss": 2.4774, + "step": 4670 + }, + { + "epoch": 2.35, + "learning_rate": 2.7424922678644172e-05, + "loss": 2.3301, + "step": 4675 + }, + { + "epoch": 2.35, + "learning_rate": 2.7385677974286517e-05, + "loss": 2.8886, + "step": 4680 + }, + { + "epoch": 2.35, + "learning_rate": 2.7346427336130175e-05, + "loss": 2.4401, + "step": 4685 + }, + { + "epoch": 2.35, + "learning_rate": 2.7307170861801644e-05, + "loss": 2.4788, + "step": 4690 + }, + { + "epoch": 2.36, + "learning_rate": 2.7267908648941938e-05, + "loss": 2.4079, + "step": 4695 + }, + { + "epoch": 2.36, + "learning_rate": 2.7228640795206344e-05, + "loss": 2.3763, + "step": 4700 + }, + { + "epoch": 2.36, + "learning_rate": 2.718936739826417e-05, + "loss": 2.4751, + "step": 4705 + }, + { + "epoch": 2.36, + "learning_rate": 2.7150088555798537e-05, + "loss": 2.5827, + "step": 4710 + }, + { + "epoch": 2.37, + "learning_rate": 2.7110804365506083e-05, + "loss": 2.8181, + "step": 4715 + }, + { 
+ "epoch": 2.37, + "learning_rate": 2.7071514925096762e-05, + "loss": 2.5951, + "step": 4720 + }, + { + "epoch": 2.37, + "learning_rate": 2.703222033229359e-05, + "loss": 2.8812, + "step": 4725 + }, + { + "epoch": 2.37, + "learning_rate": 2.6992920684832374e-05, + "loss": 2.8793, + "step": 4730 + }, + { + "epoch": 2.38, + "learning_rate": 2.6953616080461526e-05, + "loss": 2.7234, + "step": 4735 + }, + { + "epoch": 2.38, + "learning_rate": 2.6914306616941764e-05, + "loss": 2.566, + "step": 4740 + }, + { + "epoch": 2.38, + "learning_rate": 2.68749923920459e-05, + "loss": 2.7924, + "step": 4745 + }, + { + "epoch": 2.38, + "learning_rate": 2.683567350355859e-05, + "loss": 2.5467, + "step": 4750 + }, + { + "epoch": 2.39, + "learning_rate": 2.679635004927608e-05, + "loss": 2.8032, + "step": 4755 + }, + { + "epoch": 2.39, + "learning_rate": 2.6757022127006e-05, + "loss": 2.4976, + "step": 4760 + }, + { + "epoch": 2.39, + "learning_rate": 2.6717689834567055e-05, + "loss": 2.7968, + "step": 4765 + }, + { + "epoch": 2.39, + "learning_rate": 2.6678353269788854e-05, + "loss": 2.4099, + "step": 4770 + }, + { + "epoch": 2.4, + "learning_rate": 2.66390125305116e-05, + "loss": 2.4246, + "step": 4775 + }, + { + "epoch": 2.4, + "learning_rate": 2.659966771458589e-05, + "loss": 2.7296, + "step": 4780 + }, + { + "epoch": 2.4, + "learning_rate": 2.656031891987249e-05, + "loss": 2.5013, + "step": 4785 + }, + { + "epoch": 2.4, + "learning_rate": 2.6520966244242024e-05, + "loss": 2.4029, + "step": 4790 + }, + { + "epoch": 2.41, + "learning_rate": 2.648160978557479e-05, + "loss": 2.405, + "step": 4795 + }, + { + "epoch": 2.41, + "learning_rate": 2.644224964176048e-05, + "loss": 2.5121, + "step": 4800 + }, + { + "epoch": 2.41, + "learning_rate": 2.6402885910697966e-05, + "loss": 2.7865, + "step": 4805 + }, + { + "epoch": 2.41, + "learning_rate": 2.6363518690295035e-05, + "loss": 2.3725, + "step": 4810 + }, + { + "epoch": 2.42, + "learning_rate": 2.632414807846816e-05, + "loss": 2.612, + 
"step": 4815 + }, + { + "epoch": 2.42, + "learning_rate": 2.6284774173142233e-05, + "loss": 2.5412, + "step": 4820 + }, + { + "epoch": 2.42, + "learning_rate": 2.624539707225036e-05, + "loss": 2.4828, + "step": 4825 + }, + { + "epoch": 2.42, + "learning_rate": 2.6206016873733574e-05, + "loss": 2.6912, + "step": 4830 + }, + { + "epoch": 2.43, + "learning_rate": 2.6166633675540635e-05, + "loss": 2.9305, + "step": 4835 + }, + { + "epoch": 2.43, + "learning_rate": 2.612724757562775e-05, + "loss": 2.4224, + "step": 4840 + }, + { + "epoch": 2.43, + "learning_rate": 2.608785867195834e-05, + "loss": 2.448, + "step": 4845 + }, + { + "epoch": 2.43, + "learning_rate": 2.60484670625028e-05, + "loss": 2.356, + "step": 4850 + }, + { + "epoch": 2.44, + "learning_rate": 2.600907284523827e-05, + "loss": 2.3987, + "step": 4855 + }, + { + "epoch": 2.44, + "learning_rate": 2.5969676118148358e-05, + "loss": 1.9985, + "step": 4860 + }, + { + "epoch": 2.44, + "learning_rate": 2.5930276979222927e-05, + "loss": 2.7017, + "step": 4865 + }, + { + "epoch": 2.44, + "learning_rate": 2.5890875526457836e-05, + "loss": 2.5223, + "step": 4870 + }, + { + "epoch": 2.45, + "learning_rate": 2.5851471857854697e-05, + "loss": 2.5558, + "step": 4875 + }, + { + "epoch": 2.45, + "learning_rate": 2.5812066071420632e-05, + "loss": 2.3048, + "step": 4880 + }, + { + "epoch": 2.45, + "learning_rate": 2.5772658265168025e-05, + "loss": 2.6089, + "step": 4885 + }, + { + "epoch": 2.45, + "learning_rate": 2.5733248537114306e-05, + "loss": 2.6711, + "step": 4890 + }, + { + "epoch": 2.46, + "learning_rate": 2.569383698528166e-05, + "loss": 2.6202, + "step": 4895 + }, + { + "epoch": 2.46, + "learning_rate": 2.5654423707696833e-05, + "loss": 2.6619, + "step": 4900 + }, + { + "epoch": 2.46, + "learning_rate": 2.5615008802390834e-05, + "loss": 2.6442, + "step": 4905 + }, + { + "epoch": 2.46, + "learning_rate": 2.5575592367398733e-05, + "loss": 2.7031, + "step": 4910 + }, + { + "epoch": 2.47, + "learning_rate": 
2.5536174500759417e-05, + "loss": 2.5043, + "step": 4915 + }, + { + "epoch": 2.47, + "learning_rate": 2.5496755300515323e-05, + "loss": 2.3442, + "step": 4920 + }, + { + "epoch": 2.47, + "learning_rate": 2.5457334864712206e-05, + "loss": 2.7005, + "step": 4925 + }, + { + "epoch": 2.47, + "learning_rate": 2.5417913291398892e-05, + "loss": 2.8544, + "step": 4930 + }, + { + "epoch": 2.48, + "learning_rate": 2.5378490678627043e-05, + "loss": 2.3004, + "step": 4935 + }, + { + "epoch": 2.48, + "learning_rate": 2.5339067124450887e-05, + "loss": 2.527, + "step": 4940 + }, + { + "epoch": 2.48, + "learning_rate": 2.5299642726927035e-05, + "loss": 2.689, + "step": 4945 + }, + { + "epoch": 2.48, + "learning_rate": 2.526021758411415e-05, + "loss": 2.7824, + "step": 4950 + }, + { + "epoch": 2.49, + "learning_rate": 2.5220791794072774e-05, + "loss": 2.5121, + "step": 4955 + }, + { + "epoch": 2.49, + "learning_rate": 2.518136545486504e-05, + "loss": 2.3836, + "step": 4960 + }, + { + "epoch": 2.49, + "learning_rate": 2.5141938664554482e-05, + "loss": 2.6548, + "step": 4965 + }, + { + "epoch": 2.49, + "learning_rate": 2.5102511521205718e-05, + "loss": 2.5556, + "step": 4970 + }, + { + "epoch": 2.5, + "learning_rate": 2.5063084122884267e-05, + "loss": 2.4746, + "step": 4975 + }, + { + "epoch": 2.5, + "learning_rate": 2.5023656567656272e-05, + "loss": 2.5669, + "step": 4980 + }, + { + "epoch": 2.5, + "learning_rate": 2.498422895358827e-05, + "loss": 2.4125, + "step": 4985 + }, + { + "epoch": 2.5, + "learning_rate": 2.4944801378746935e-05, + "loss": 2.6531, + "step": 4990 + }, + { + "epoch": 2.51, + "learning_rate": 2.4905373941198864e-05, + "loss": 2.7697, + "step": 4995 + }, + { + "epoch": 2.51, + "learning_rate": 2.48659467390103e-05, + "loss": 2.5167, + "step": 5000 + }, + { + "epoch": 2.51, + "learning_rate": 2.4826519870246897e-05, + "loss": 2.6029, + "step": 5005 + }, + { + "epoch": 2.51, + "learning_rate": 2.478709343297349e-05, + "loss": 2.3878, + "step": 5010 + }, + { + 
"epoch": 2.52, + "learning_rate": 2.4747667525253825e-05, + "loss": 2.6694, + "step": 5015 + }, + { + "epoch": 2.52, + "learning_rate": 2.470824224515034e-05, + "loss": 2.606, + "step": 5020 + }, + { + "epoch": 2.52, + "learning_rate": 2.466881769072392e-05, + "loss": 2.6548, + "step": 5025 + }, + { + "epoch": 2.52, + "learning_rate": 2.4629393960033633e-05, + "loss": 2.2612, + "step": 5030 + }, + { + "epoch": 2.53, + "learning_rate": 2.4589971151136503e-05, + "loss": 2.6507, + "step": 5035 + }, + { + "epoch": 2.53, + "learning_rate": 2.4550549362087258e-05, + "loss": 2.4926, + "step": 5040 + }, + { + "epoch": 2.53, + "learning_rate": 2.45111286909381e-05, + "loss": 2.4079, + "step": 5045 + }, + { + "epoch": 2.53, + "learning_rate": 2.447170923573844e-05, + "loss": 2.5999, + "step": 5050 + }, + { + "epoch": 2.54, + "learning_rate": 2.443229109453467e-05, + "loss": 2.4915, + "step": 5055 + }, + { + "epoch": 2.54, + "learning_rate": 2.43928743653699e-05, + "loss": 2.4626, + "step": 5060 + }, + { + "epoch": 2.54, + "learning_rate": 2.4353459146283743e-05, + "loss": 2.405, + "step": 5065 + }, + { + "epoch": 2.54, + "learning_rate": 2.431404553531206e-05, + "loss": 2.857, + "step": 5070 + }, + { + "epoch": 2.55, + "learning_rate": 2.42746336304867e-05, + "loss": 2.8089, + "step": 5075 + }, + { + "epoch": 2.55, + "learning_rate": 2.423522352983527e-05, + "loss": 2.5966, + "step": 5080 + }, + { + "epoch": 2.55, + "learning_rate": 2.41958153313809e-05, + "loss": 2.4951, + "step": 5085 + }, + { + "epoch": 2.55, + "learning_rate": 2.4156409133141967e-05, + "loss": 2.519, + "step": 5090 + }, + { + "epoch": 2.56, + "learning_rate": 2.4117005033131894e-05, + "loss": 2.4516, + "step": 5095 + }, + { + "epoch": 2.56, + "learning_rate": 2.40776031293589e-05, + "loss": 2.4225, + "step": 5100 + }, + { + "epoch": 2.56, + "learning_rate": 2.4038203519825676e-05, + "loss": 2.47, + "step": 5105 + }, + { + "epoch": 2.56, + "learning_rate": 2.399880630252928e-05, + "loss": 2.4996, + 
"step": 5110 + }, + { + "epoch": 2.57, + "learning_rate": 2.3959411575460777e-05, + "loss": 2.4737, + "step": 5115 + }, + { + "epoch": 2.57, + "learning_rate": 2.392001943660506e-05, + "loss": 2.636, + "step": 5120 + }, + { + "epoch": 2.57, + "learning_rate": 2.3880629983940572e-05, + "loss": 2.5773, + "step": 5125 + }, + { + "epoch": 2.57, + "learning_rate": 2.3841243315439077e-05, + "loss": 2.7459, + "step": 5130 + }, + { + "epoch": 2.58, + "learning_rate": 2.3801859529065416e-05, + "loss": 2.1073, + "step": 5135 + }, + { + "epoch": 2.58, + "learning_rate": 2.3762478722777264e-05, + "loss": 2.3601, + "step": 5140 + }, + { + "epoch": 2.58, + "learning_rate": 2.3723100994524873e-05, + "loss": 2.5435, + "step": 5145 + }, + { + "epoch": 2.58, + "learning_rate": 2.3683726442250853e-05, + "loss": 2.3054, + "step": 5150 + }, + { + "epoch": 2.59, + "learning_rate": 2.3644355163889908e-05, + "loss": 2.6588, + "step": 5155 + }, + { + "epoch": 2.59, + "learning_rate": 2.3604987257368596e-05, + "loss": 2.6469, + "step": 5160 + }, + { + "epoch": 2.59, + "learning_rate": 2.3565622820605096e-05, + "loss": 2.4276, + "step": 5165 + }, + { + "epoch": 2.59, + "learning_rate": 2.3526261951508947e-05, + "loss": 2.8147, + "step": 5170 + }, + { + "epoch": 2.6, + "learning_rate": 2.3486904747980817e-05, + "loss": 2.5942, + "step": 5175 + }, + { + "epoch": 2.6, + "learning_rate": 2.344755130791227e-05, + "loss": 2.7043, + "step": 5180 + }, + { + "epoch": 2.6, + "learning_rate": 2.340820172918549e-05, + "loss": 2.7551, + "step": 5185 + }, + { + "epoch": 2.6, + "learning_rate": 2.336885610967308e-05, + "loss": 2.4544, + "step": 5190 + }, + { + "epoch": 2.61, + "learning_rate": 2.3329514547237757e-05, + "loss": 2.5914, + "step": 5195 + }, + { + "epoch": 2.61, + "learning_rate": 2.3290177139732186e-05, + "loss": 2.5679, + "step": 5200 + }, + { + "epoch": 2.61, + "learning_rate": 2.325084398499868e-05, + "loss": 2.5207, + "step": 5205 + }, + { + "epoch": 2.61, + "learning_rate": 
2.3211515180868972e-05, + "loss": 2.4572, + "step": 5210 + }, + { + "epoch": 2.62, + "learning_rate": 2.3172190825163987e-05, + "loss": 2.6281, + "step": 5215 + }, + { + "epoch": 2.62, + "learning_rate": 2.3132871015693566e-05, + "loss": 2.5488, + "step": 5220 + }, + { + "epoch": 2.62, + "learning_rate": 2.309355585025627e-05, + "loss": 2.74, + "step": 5225 + }, + { + "epoch": 2.62, + "learning_rate": 2.3054245426639078e-05, + "loss": 2.5737, + "step": 5230 + }, + { + "epoch": 2.63, + "learning_rate": 2.3014939842617197e-05, + "loss": 2.5987, + "step": 5235 + }, + { + "epoch": 2.63, + "learning_rate": 2.297563919595379e-05, + "loss": 2.5215, + "step": 5240 + }, + { + "epoch": 2.63, + "learning_rate": 2.293634358439973e-05, + "loss": 2.4514, + "step": 5245 + }, + { + "epoch": 2.63, + "learning_rate": 2.289705310569338e-05, + "loss": 2.8411, + "step": 5250 + }, + { + "epoch": 2.64, + "learning_rate": 2.2857767857560337e-05, + "loss": 2.9463, + "step": 5255 + }, + { + "epoch": 2.64, + "learning_rate": 2.281848793771318e-05, + "loss": 2.6361, + "step": 5260 + }, + { + "epoch": 2.64, + "learning_rate": 2.2779213443851233e-05, + "loss": 2.4919, + "step": 5265 + }, + { + "epoch": 2.64, + "learning_rate": 2.2739944473660332e-05, + "loss": 2.3893, + "step": 5270 + }, + { + "epoch": 2.65, + "learning_rate": 2.2700681124812574e-05, + "loss": 2.6469, + "step": 5275 + }, + { + "epoch": 2.65, + "learning_rate": 2.2661423494966074e-05, + "loss": 2.7062, + "step": 5280 + }, + { + "epoch": 2.65, + "learning_rate": 2.2622171681764706e-05, + "loss": 2.6681, + "step": 5285 + }, + { + "epoch": 2.65, + "learning_rate": 2.2582925782837898e-05, + "loss": 2.5944, + "step": 5290 + }, + { + "epoch": 2.66, + "learning_rate": 2.254368589580036e-05, + "loss": 2.6699, + "step": 5295 + }, + { + "epoch": 2.66, + "learning_rate": 2.250445211825185e-05, + "loss": 2.7243, + "step": 5300 + }, + { + "epoch": 2.66, + "learning_rate": 2.2465224547776934e-05, + "loss": 2.5652, + "step": 5305 + }, + { + 
"epoch": 2.66, + "learning_rate": 2.2426003281944725e-05, + "loss": 2.4402, + "step": 5310 + }, + { + "epoch": 2.67, + "learning_rate": 2.238678841830867e-05, + "loss": 2.5059, + "step": 5315 + }, + { + "epoch": 2.67, + "learning_rate": 2.234758005440628e-05, + "loss": 2.5083, + "step": 5320 + }, + { + "epoch": 2.67, + "learning_rate": 2.2308378287758906e-05, + "loss": 2.3585, + "step": 5325 + }, + { + "epoch": 2.67, + "learning_rate": 2.22691832158715e-05, + "loss": 2.8297, + "step": 5330 + }, + { + "epoch": 2.68, + "learning_rate": 2.2229994936232346e-05, + "loss": 2.4266, + "step": 5335 + }, + { + "epoch": 2.68, + "learning_rate": 2.219081354631284e-05, + "loss": 2.3729, + "step": 5340 + }, + { + "epoch": 2.68, + "learning_rate": 2.2151639143567236e-05, + "loss": 2.4558, + "step": 5345 + }, + { + "epoch": 2.68, + "learning_rate": 2.211247182543242e-05, + "loss": 2.5797, + "step": 5350 + }, + { + "epoch": 2.69, + "learning_rate": 2.2073311689327648e-05, + "loss": 2.4579, + "step": 5355 + }, + { + "epoch": 2.69, + "learning_rate": 2.203415883265432e-05, + "loss": 2.5511, + "step": 5360 + }, + { + "epoch": 2.69, + "learning_rate": 2.1995013352795725e-05, + "loss": 2.4825, + "step": 5365 + }, + { + "epoch": 2.69, + "learning_rate": 2.1955875347116808e-05, + "loss": 2.4933, + "step": 5370 + }, + { + "epoch": 2.7, + "learning_rate": 2.191674491296391e-05, + "loss": 2.434, + "step": 5375 + }, + { + "epoch": 2.7, + "learning_rate": 2.187762214766456e-05, + "loss": 2.5205, + "step": 5380 + }, + { + "epoch": 2.7, + "learning_rate": 2.1838507148527197e-05, + "loss": 2.5461, + "step": 5385 + }, + { + "epoch": 2.7, + "learning_rate": 2.179940001284095e-05, + "loss": 2.4265, + "step": 5390 + }, + { + "epoch": 2.71, + "learning_rate": 2.176030083787539e-05, + "loss": 2.6166, + "step": 5395 + }, + { + "epoch": 2.71, + "learning_rate": 2.1721209720880277e-05, + "loss": 2.5101, + "step": 5400 + }, + { + "epoch": 2.71, + "learning_rate": 2.168212675908536e-05, + "loss": 2.4824, + 
"step": 5405 + }, + { + "epoch": 2.71, + "learning_rate": 2.1643052049700066e-05, + "loss": 2.347, + "step": 5410 + }, + { + "epoch": 2.72, + "learning_rate": 2.1603985689913317e-05, + "loss": 2.5476, + "step": 5415 + }, + { + "epoch": 2.72, + "learning_rate": 2.1564927776893264e-05, + "loss": 2.456, + "step": 5420 + }, + { + "epoch": 2.72, + "learning_rate": 2.152587840778704e-05, + "loss": 2.4415, + "step": 5425 + }, + { + "epoch": 2.72, + "learning_rate": 2.1486837679720535e-05, + "loss": 2.7548, + "step": 5430 + }, + { + "epoch": 2.73, + "learning_rate": 2.144780568979816e-05, + "loss": 2.3779, + "step": 5435 + }, + { + "epoch": 2.73, + "learning_rate": 2.1408782535102566e-05, + "loss": 2.5284, + "step": 5440 + }, + { + "epoch": 2.73, + "learning_rate": 2.136976831269444e-05, + "loss": 2.6886, + "step": 5445 + }, + { + "epoch": 2.73, + "learning_rate": 2.133076311961226e-05, + "loss": 2.5138, + "step": 5450 + }, + { + "epoch": 2.74, + "learning_rate": 2.1291767052872035e-05, + "loss": 2.664, + "step": 5455 + }, + { + "epoch": 2.74, + "learning_rate": 2.1252780209467073e-05, + "loss": 2.4325, + "step": 5460 + }, + { + "epoch": 2.74, + "learning_rate": 2.121380268636775e-05, + "loss": 2.5471, + "step": 5465 + }, + { + "epoch": 2.74, + "learning_rate": 2.1174834580521253e-05, + "loss": 2.4363, + "step": 5470 + }, + { + "epoch": 2.75, + "learning_rate": 2.113587598885135e-05, + "loss": 2.6272, + "step": 5475 + }, + { + "epoch": 2.75, + "learning_rate": 2.109692700825814e-05, + "loss": 2.5261, + "step": 5480 + }, + { + "epoch": 2.75, + "learning_rate": 2.105798773561783e-05, + "loss": 2.9262, + "step": 5485 + }, + { + "epoch": 2.75, + "learning_rate": 2.101905826778246e-05, + "loss": 2.3964, + "step": 5490 + }, + { + "epoch": 2.76, + "learning_rate": 2.0980138701579704e-05, + "loss": 2.6239, + "step": 5495 + }, + { + "epoch": 2.76, + "learning_rate": 2.0941229133812593e-05, + "loss": 2.53, + "step": 5500 + }, + { + "epoch": 2.76, + "learning_rate": 
2.0902329661259293e-05, + "loss": 2.7035, + "step": 5505 + }, + { + "epoch": 2.77, + "learning_rate": 2.0863440380672856e-05, + "loss": 2.5909, + "step": 5510 + }, + { + "epoch": 2.77, + "learning_rate": 2.0824561388781005e-05, + "loss": 2.1592, + "step": 5515 + }, + { + "epoch": 2.77, + "learning_rate": 2.078569278228585e-05, + "loss": 2.5515, + "step": 5520 + }, + { + "epoch": 2.77, + "learning_rate": 2.0746834657863672e-05, + "loss": 2.6217, + "step": 5525 + }, + { + "epoch": 2.78, + "learning_rate": 2.0707987112164692e-05, + "loss": 2.6302, + "step": 5530 + }, + { + "epoch": 2.78, + "learning_rate": 2.0669150241812807e-05, + "loss": 2.3984, + "step": 5535 + }, + { + "epoch": 2.78, + "learning_rate": 2.0630324143405372e-05, + "loss": 2.6425, + "step": 5540 + }, + { + "epoch": 2.78, + "learning_rate": 2.0591508913512954e-05, + "loss": 2.6817, + "step": 5545 + }, + { + "epoch": 2.79, + "learning_rate": 2.055270464867906e-05, + "loss": 2.3904, + "step": 5550 + }, + { + "epoch": 2.79, + "learning_rate": 2.0513911445419936e-05, + "loss": 2.6625, + "step": 5555 + }, + { + "epoch": 2.79, + "learning_rate": 2.0475129400224337e-05, + "loss": 2.6876, + "step": 5560 + }, + { + "epoch": 2.79, + "learning_rate": 2.043635860955325e-05, + "loss": 2.4981, + "step": 5565 + }, + { + "epoch": 2.8, + "learning_rate": 2.039759916983966e-05, + "loss": 2.3809, + "step": 5570 + }, + { + "epoch": 2.8, + "learning_rate": 2.0358851177488326e-05, + "loss": 2.4396, + "step": 5575 + }, + { + "epoch": 2.8, + "learning_rate": 2.0320114728875538e-05, + "loss": 2.526, + "step": 5580 + }, + { + "epoch": 2.8, + "learning_rate": 2.028138992034887e-05, + "loss": 2.6806, + "step": 5585 + }, + { + "epoch": 2.81, + "learning_rate": 2.0242676848226948e-05, + "loss": 2.5842, + "step": 5590 + }, + { + "epoch": 2.81, + "learning_rate": 2.02039756087992e-05, + "loss": 2.4016, + "step": 5595 + }, + { + "epoch": 2.81, + "learning_rate": 2.0165286298325638e-05, + "loss": 2.5254, + "step": 5600 + }, + { + 
"epoch": 2.81, + "learning_rate": 2.0126609013036575e-05, + "loss": 2.4995, + "step": 5605 + }, + { + "epoch": 2.82, + "learning_rate": 2.0087943849132446e-05, + "loss": 2.473, + "step": 5610 + }, + { + "epoch": 2.82, + "learning_rate": 2.004929090278351e-05, + "loss": 2.668, + "step": 5615 + }, + { + "epoch": 2.82, + "learning_rate": 2.001065027012966e-05, + "loss": 2.693, + "step": 5620 + }, + { + "epoch": 2.82, + "learning_rate": 1.9972022047280154e-05, + "loss": 2.5113, + "step": 5625 + }, + { + "epoch": 2.83, + "learning_rate": 1.9933406330313374e-05, + "loss": 2.7322, + "step": 5630 + }, + { + "epoch": 2.83, + "learning_rate": 1.989480321527661e-05, + "loss": 2.6501, + "step": 5635 + }, + { + "epoch": 2.83, + "learning_rate": 1.9856212798185798e-05, + "loss": 2.546, + "step": 5640 + }, + { + "epoch": 2.83, + "learning_rate": 1.9817635175025295e-05, + "loss": 2.8634, + "step": 5645 + }, + { + "epoch": 2.84, + "learning_rate": 1.9779070441747638e-05, + "loss": 2.2475, + "step": 5650 + }, + { + "epoch": 2.84, + "learning_rate": 1.97405186942733e-05, + "loss": 2.5021, + "step": 5655 + }, + { + "epoch": 2.84, + "learning_rate": 1.9701980028490452e-05, + "loss": 2.4607, + "step": 5660 + }, + { + "epoch": 2.84, + "learning_rate": 1.9663454540254744e-05, + "loss": 2.4587, + "step": 5665 + }, + { + "epoch": 2.85, + "learning_rate": 1.9624942325389032e-05, + "loss": 2.6975, + "step": 5670 + }, + { + "epoch": 2.85, + "learning_rate": 1.9586443479683164e-05, + "loss": 2.728, + "step": 5675 + }, + { + "epoch": 2.85, + "learning_rate": 1.9547958098893734e-05, + "loss": 2.6458, + "step": 5680 + }, + { + "epoch": 2.85, + "learning_rate": 1.9509486278743847e-05, + "loss": 2.7608, + "step": 5685 + }, + { + "epoch": 2.86, + "learning_rate": 1.9471028114922873e-05, + "loss": 2.7753, + "step": 5690 + }, + { + "epoch": 2.86, + "learning_rate": 1.9432583703086235e-05, + "loss": 2.3438, + "step": 5695 + }, + { + "epoch": 2.86, + "learning_rate": 1.9394153138855127e-05, + "loss": 
2.5513, + "step": 5700 + }, + { + "epoch": 2.86, + "learning_rate": 1.9355736517816313e-05, + "loss": 2.6064, + "step": 5705 + }, + { + "epoch": 2.87, + "learning_rate": 1.9317333935521872e-05, + "loss": 2.6884, + "step": 5710 + }, + { + "epoch": 2.87, + "learning_rate": 1.927894548748897e-05, + "loss": 2.479, + "step": 5715 + }, + { + "epoch": 2.87, + "learning_rate": 1.9240571269199607e-05, + "loss": 2.741, + "step": 5720 + }, + { + "epoch": 2.87, + "learning_rate": 1.9202211376100427e-05, + "loss": 2.6736, + "step": 5725 + }, + { + "epoch": 2.88, + "learning_rate": 1.9163865903602374e-05, + "loss": 2.5357, + "step": 5730 + }, + { + "epoch": 2.88, + "learning_rate": 1.9125534947080574e-05, + "loss": 2.4667, + "step": 5735 + }, + { + "epoch": 2.88, + "learning_rate": 1.908721860187406e-05, + "loss": 2.3793, + "step": 5740 + }, + { + "epoch": 2.88, + "learning_rate": 1.904891696328548e-05, + "loss": 2.6379, + "step": 5745 + }, + { + "epoch": 2.89, + "learning_rate": 1.901063012658093e-05, + "loss": 2.5932, + "step": 5750 + }, + { + "epoch": 2.89, + "learning_rate": 1.897235818698969e-05, + "loss": 2.3479, + "step": 5755 + }, + { + "epoch": 2.89, + "learning_rate": 1.8934101239703973e-05, + "loss": 2.7664, + "step": 5760 + }, + { + "epoch": 2.89, + "learning_rate": 1.889585937987871e-05, + "loss": 2.6163, + "step": 5765 + }, + { + "epoch": 2.9, + "learning_rate": 1.885763270263131e-05, + "loss": 2.644, + "step": 5770 + }, + { + "epoch": 2.9, + "learning_rate": 1.881942130304142e-05, + "loss": 2.6584, + "step": 5775 + }, + { + "epoch": 2.9, + "learning_rate": 1.8781225276150675e-05, + "loss": 2.7552, + "step": 5780 + }, + { + "epoch": 2.9, + "learning_rate": 1.874304471696248e-05, + "loss": 2.3176, + "step": 5785 + }, + { + "epoch": 2.91, + "learning_rate": 1.8704879720441773e-05, + "loss": 2.9294, + "step": 5790 + }, + { + "epoch": 2.91, + "learning_rate": 1.8666730381514774e-05, + "loss": 2.5388, + "step": 5795 + }, + { + "epoch": 2.91, + "learning_rate": 
1.8628596795068776e-05, + "loss": 2.5343, + "step": 5800 + }, + { + "epoch": 2.91, + "learning_rate": 1.859047905595187e-05, + "loss": 2.5239, + "step": 5805 + }, + { + "epoch": 2.92, + "learning_rate": 1.8552377258972747e-05, + "loss": 2.382, + "step": 5810 + }, + { + "epoch": 2.92, + "learning_rate": 1.851429149890044e-05, + "loss": 2.7965, + "step": 5815 + }, + { + "epoch": 2.92, + "learning_rate": 1.8476221870464083e-05, + "loss": 2.738, + "step": 5820 + }, + { + "epoch": 2.92, + "learning_rate": 1.84381684683527e-05, + "loss": 2.7431, + "step": 5825 + }, + { + "epoch": 2.93, + "learning_rate": 1.8400131387214964e-05, + "loss": 2.6551, + "step": 5830 + }, + { + "epoch": 2.93, + "learning_rate": 1.8362110721658927e-05, + "loss": 2.5836, + "step": 5835 + }, + { + "epoch": 2.93, + "learning_rate": 1.832410656625183e-05, + "loss": 2.3278, + "step": 5840 + }, + { + "epoch": 2.93, + "learning_rate": 1.8286119015519852e-05, + "loss": 2.5348, + "step": 5845 + }, + { + "epoch": 2.94, + "learning_rate": 1.8248148163947866e-05, + "loss": 2.388, + "step": 5850 + }, + { + "epoch": 2.94, + "learning_rate": 1.8210194105979205e-05, + "loss": 2.5839, + "step": 5855 + }, + { + "epoch": 2.94, + "learning_rate": 1.817225693601543e-05, + "loss": 2.7362, + "step": 5860 + }, + { + "epoch": 2.94, + "learning_rate": 1.8134336748416115e-05, + "loss": 2.2506, + "step": 5865 + }, + { + "epoch": 2.95, + "learning_rate": 1.8096433637498574e-05, + "loss": 2.6163, + "step": 5870 + }, + { + "epoch": 2.95, + "learning_rate": 1.8058547697537655e-05, + "loss": 2.6588, + "step": 5875 + }, + { + "epoch": 2.95, + "learning_rate": 1.802067902276551e-05, + "loss": 2.6955, + "step": 5880 + }, + { + "epoch": 2.95, + "learning_rate": 1.7982827707371326e-05, + "loss": 2.5438, + "step": 5885 + }, + { + "epoch": 2.96, + "learning_rate": 1.7944993845501118e-05, + "loss": 2.6858, + "step": 5890 + }, + { + "epoch": 2.96, + "learning_rate": 1.7907177531257507e-05, + "loss": 2.5499, + "step": 5895 + }, + { + 
"epoch": 2.96, + "learning_rate": 1.7869378858699452e-05, + "loss": 2.572, + "step": 5900 + }, + { + "epoch": 2.96, + "learning_rate": 1.783159792184203e-05, + "loss": 2.6499, + "step": 5905 + }, + { + "epoch": 2.97, + "learning_rate": 1.779383481465622e-05, + "loss": 2.6526, + "step": 5910 + }, + { + "epoch": 2.97, + "learning_rate": 1.775608963106863e-05, + "loss": 2.5916, + "step": 5915 + }, + { + "epoch": 2.97, + "learning_rate": 1.7718362464961314e-05, + "loss": 2.7463, + "step": 5920 + }, + { + "epoch": 2.97, + "learning_rate": 1.76806534101715e-05, + "loss": 2.6052, + "step": 5925 + }, + { + "epoch": 2.98, + "learning_rate": 1.764296256049137e-05, + "loss": 2.4542, + "step": 5930 + }, + { + "epoch": 2.98, + "learning_rate": 1.760529000966782e-05, + "loss": 2.5414, + "step": 5935 + }, + { + "epoch": 2.98, + "learning_rate": 1.7567635851402238e-05, + "loss": 2.6181, + "step": 5940 + }, + { + "epoch": 2.98, + "learning_rate": 1.753000017935026e-05, + "loss": 2.4822, + "step": 5945 + }, + { + "epoch": 2.99, + "learning_rate": 1.7492383087121546e-05, + "loss": 2.6495, + "step": 5950 + }, + { + "epoch": 2.99, + "learning_rate": 1.7454784668279546e-05, + "loss": 2.455, + "step": 5955 + }, + { + "epoch": 2.99, + "learning_rate": 1.7417205016341258e-05, + "loss": 2.3959, + "step": 5960 + }, + { + "epoch": 2.99, + "learning_rate": 1.7379644224777004e-05, + "loss": 2.5139, + "step": 5965 + }, + { + "epoch": 3.0, + "learning_rate": 1.7342102387010194e-05, + "loss": 2.6985, + "step": 5970 + }, + { + "epoch": 3.0, + "learning_rate": 1.7304579596417104e-05, + "loss": 2.6598, + "step": 5975 + }, + { + "epoch": 3.0, + "learning_rate": 1.726707594632661e-05, + "loss": 2.6638, + "step": 5980 + }, + { + "epoch": 3.0, + "learning_rate": 1.7229591530020022e-05, + "loss": 2.446, + "step": 5985 + }, + { + "epoch": 3.01, + "learning_rate": 1.7192126440730784e-05, + "loss": 2.5736, + "step": 5990 + }, + { + "epoch": 3.01, + "learning_rate": 1.7154680771644242e-05, + "loss": 2.4385, + 
"step": 5995 + }, + { + "epoch": 3.01, + "learning_rate": 1.7117254615897497e-05, + "loss": 2.5651, + "step": 6000 + }, + { + "epoch": 3.01, + "learning_rate": 1.707984806657908e-05, + "loss": 2.5767, + "step": 6005 + }, + { + "epoch": 3.02, + "learning_rate": 1.7042461216728756e-05, + "loss": 2.5527, + "step": 6010 + }, + { + "epoch": 3.02, + "learning_rate": 1.7005094159337307e-05, + "loss": 2.4275, + "step": 6015 + }, + { + "epoch": 3.02, + "learning_rate": 1.6967746987346272e-05, + "loss": 2.5136, + "step": 6020 + }, + { + "epoch": 3.02, + "learning_rate": 1.6930419793647735e-05, + "loss": 2.5035, + "step": 6025 + }, + { + "epoch": 3.03, + "learning_rate": 1.6893112671084094e-05, + "loss": 2.3627, + "step": 6030 + }, + { + "epoch": 3.03, + "learning_rate": 1.6855825712447822e-05, + "loss": 2.5038, + "step": 6035 + }, + { + "epoch": 3.03, + "learning_rate": 1.6818559010481226e-05, + "loss": 2.3229, + "step": 6040 + }, + { + "epoch": 3.03, + "learning_rate": 1.6781312657876254e-05, + "loss": 2.3298, + "step": 6045 + }, + { + "epoch": 3.04, + "learning_rate": 1.6744086747274224e-05, + "loss": 2.5965, + "step": 6050 + }, + { + "epoch": 3.04, + "learning_rate": 1.67068813712656e-05, + "loss": 2.6745, + "step": 6055 + }, + { + "epoch": 3.04, + "learning_rate": 1.6669696622389797e-05, + "loss": 2.5358, + "step": 6060 + }, + { + "epoch": 3.04, + "learning_rate": 1.6632532593134907e-05, + "loss": 2.1572, + "step": 6065 + }, + { + "epoch": 3.05, + "learning_rate": 1.6595389375937488e-05, + "loss": 2.3413, + "step": 6070 + }, + { + "epoch": 3.05, + "learning_rate": 1.6558267063182342e-05, + "loss": 2.4968, + "step": 6075 + }, + { + "epoch": 3.05, + "learning_rate": 1.6521165747202276e-05, + "loss": 2.528, + "step": 6080 + }, + { + "epoch": 3.05, + "learning_rate": 1.6484085520277847e-05, + "loss": 2.4744, + "step": 6085 + }, + { + "epoch": 3.06, + "learning_rate": 1.6447026474637194e-05, + "loss": 2.5993, + "step": 6090 + }, + { + "epoch": 3.06, + "learning_rate": 
1.640998870245575e-05, + "loss": 2.5334, + "step": 6095 + }, + { + "epoch": 3.06, + "learning_rate": 1.637297229585604e-05, + "loss": 2.3431, + "step": 6100 + }, + { + "epoch": 3.06, + "learning_rate": 1.633597734690746e-05, + "loss": 2.3361, + "step": 6105 + }, + { + "epoch": 3.07, + "learning_rate": 1.6299003947626017e-05, + "loss": 2.4223, + "step": 6110 + }, + { + "epoch": 3.07, + "learning_rate": 1.6262052189974125e-05, + "loss": 2.6129, + "step": 6115 + }, + { + "epoch": 3.07, + "learning_rate": 1.622512216586038e-05, + "loss": 2.5798, + "step": 6120 + }, + { + "epoch": 3.07, + "learning_rate": 1.61882139671393e-05, + "loss": 2.7164, + "step": 6125 + }, + { + "epoch": 3.08, + "learning_rate": 1.6151327685611127e-05, + "loss": 2.627, + "step": 6130 + }, + { + "epoch": 3.08, + "learning_rate": 1.6114463413021612e-05, + "loss": 2.5533, + "step": 6135 + }, + { + "epoch": 3.08, + "learning_rate": 1.6077621241061725e-05, + "loss": 2.4149, + "step": 6140 + }, + { + "epoch": 3.08, + "learning_rate": 1.6040801261367493e-05, + "loss": 2.5167, + "step": 6145 + }, + { + "epoch": 3.09, + "learning_rate": 1.6004003565519734e-05, + "loss": 2.3775, + "step": 6150 + }, + { + "epoch": 3.09, + "learning_rate": 1.596722824504385e-05, + "loss": 2.7508, + "step": 6155 + }, + { + "epoch": 3.09, + "learning_rate": 1.5930475391409562e-05, + "loss": 2.2924, + "step": 6160 + }, + { + "epoch": 3.09, + "learning_rate": 1.5893745096030754e-05, + "loss": 2.6994, + "step": 6165 + }, + { + "epoch": 3.1, + "learning_rate": 1.5857037450265176e-05, + "loss": 2.4325, + "step": 6170 + }, + { + "epoch": 3.1, + "learning_rate": 1.5820352545414232e-05, + "loss": 2.7048, + "step": 6175 + }, + { + "epoch": 3.1, + "learning_rate": 1.5783690472722785e-05, + "loss": 2.5557, + "step": 6180 + }, + { + "epoch": 3.1, + "learning_rate": 1.5747051323378903e-05, + "loss": 2.5968, + "step": 6185 + }, + { + "epoch": 3.11, + "learning_rate": 1.5710435188513627e-05, + "loss": 2.8964, + "step": 6190 + }, + { + 
"epoch": 3.11, + "learning_rate": 1.5673842159200768e-05, + "loss": 2.4664, + "step": 6195 + }, + { + "epoch": 3.11, + "learning_rate": 1.5637272326456666e-05, + "loss": 2.6002, + "step": 6200 + }, + { + "epoch": 3.11, + "learning_rate": 1.560072578123995e-05, + "loss": 2.1335, + "step": 6205 + }, + { + "epoch": 3.12, + "learning_rate": 1.5564202614451352e-05, + "loss": 2.4466, + "step": 6210 + }, + { + "epoch": 3.12, + "learning_rate": 1.5527702916933436e-05, + "loss": 2.4236, + "step": 6215 + }, + { + "epoch": 3.12, + "learning_rate": 1.54912267794704e-05, + "loss": 2.6926, + "step": 6220 + }, + { + "epoch": 3.12, + "learning_rate": 1.5454774292787837e-05, + "loss": 2.6268, + "step": 6225 + }, + { + "epoch": 3.13, + "learning_rate": 1.541834554755252e-05, + "loss": 2.5849, + "step": 6230 + }, + { + "epoch": 3.13, + "learning_rate": 1.5381940634372165e-05, + "loss": 2.4988, + "step": 6235 + }, + { + "epoch": 3.13, + "learning_rate": 1.534555964379522e-05, + "loss": 2.8093, + "step": 6240 + }, + { + "epoch": 3.13, + "learning_rate": 1.5309202666310622e-05, + "loss": 2.6214, + "step": 6245 + }, + { + "epoch": 3.14, + "learning_rate": 1.5272869792347595e-05, + "loss": 2.4958, + "step": 6250 + }, + { + "epoch": 3.14, + "learning_rate": 1.5236561112275394e-05, + "loss": 2.5731, + "step": 6255 + }, + { + "epoch": 3.14, + "learning_rate": 1.5200276716403103e-05, + "loss": 2.4501, + "step": 6260 + }, + { + "epoch": 3.14, + "learning_rate": 1.5164016694979411e-05, + "loss": 2.3793, + "step": 6265 + }, + { + "epoch": 3.15, + "learning_rate": 1.5127781138192374e-05, + "loss": 2.4751, + "step": 6270 + }, + { + "epoch": 3.15, + "learning_rate": 1.5091570136169206e-05, + "loss": 2.2213, + "step": 6275 + }, + { + "epoch": 3.15, + "learning_rate": 1.505538377897604e-05, + "loss": 2.4721, + "step": 6280 + }, + { + "epoch": 3.15, + "learning_rate": 1.5019222156617712e-05, + "loss": 2.5355, + "step": 6285 + }, + { + "epoch": 3.16, + "learning_rate": 1.4983085359037547e-05, + "loss": 
2.6066, + "step": 6290 + }, + { + "epoch": 3.16, + "learning_rate": 1.4946973476117105e-05, + "loss": 2.5482, + "step": 6295 + }, + { + "epoch": 3.16, + "learning_rate": 1.4910886597675994e-05, + "loss": 2.6717, + "step": 6300 + }, + { + "epoch": 3.16, + "learning_rate": 1.4874824813471616e-05, + "loss": 2.5616, + "step": 6305 + }, + { + "epoch": 3.17, + "learning_rate": 1.4838788213198965e-05, + "loss": 2.5877, + "step": 6310 + }, + { + "epoch": 3.17, + "learning_rate": 1.48027768864904e-05, + "loss": 2.4685, + "step": 6315 + }, + { + "epoch": 3.17, + "learning_rate": 1.4766790922915405e-05, + "loss": 2.459, + "step": 6320 + }, + { + "epoch": 3.17, + "learning_rate": 1.4730830411980393e-05, + "loss": 2.4626, + "step": 6325 + }, + { + "epoch": 3.18, + "learning_rate": 1.469489544312846e-05, + "loss": 2.4486, + "step": 6330 + }, + { + "epoch": 3.18, + "learning_rate": 1.4658986105739175e-05, + "loss": 2.6828, + "step": 6335 + }, + { + "epoch": 3.18, + "learning_rate": 1.4623102489128353e-05, + "loss": 2.702, + "step": 6340 + }, + { + "epoch": 3.18, + "learning_rate": 1.4587244682547857e-05, + "loss": 2.8563, + "step": 6345 + }, + { + "epoch": 3.19, + "learning_rate": 1.4551412775185308e-05, + "loss": 2.5647, + "step": 6350 + }, + { + "epoch": 3.19, + "learning_rate": 1.4515606856163949e-05, + "loss": 2.5023, + "step": 6355 + }, + { + "epoch": 3.19, + "learning_rate": 1.4479827014542363e-05, + "loss": 2.347, + "step": 6360 + }, + { + "epoch": 3.19, + "learning_rate": 1.4444073339314284e-05, + "loss": 2.6892, + "step": 6365 + }, + { + "epoch": 3.2, + "learning_rate": 1.4408345919408359e-05, + "loss": 2.5874, + "step": 6370 + }, + { + "epoch": 3.2, + "learning_rate": 1.4372644843687922e-05, + "loss": 2.3453, + "step": 6375 + }, + { + "epoch": 3.2, + "learning_rate": 1.4336970200950794e-05, + "loss": 2.4236, + "step": 6380 + }, + { + "epoch": 3.2, + "learning_rate": 1.4301322079929053e-05, + "loss": 2.6329, + "step": 6385 + }, + { + "epoch": 3.21, + "learning_rate": 
1.4265700569288792e-05, + "loss": 2.7761, + "step": 6390 + }, + { + "epoch": 3.21, + "learning_rate": 1.4230105757629936e-05, + "loss": 2.6791, + "step": 6395 + }, + { + "epoch": 3.21, + "learning_rate": 1.4194537733485994e-05, + "loss": 2.6064, + "step": 6400 + }, + { + "epoch": 3.21, + "learning_rate": 1.4158996585323841e-05, + "loss": 2.3809, + "step": 6405 + }, + { + "epoch": 3.22, + "learning_rate": 1.4123482401543531e-05, + "loss": 2.5205, + "step": 6410 + }, + { + "epoch": 3.22, + "learning_rate": 1.4087995270478021e-05, + "loss": 2.524, + "step": 6415 + }, + { + "epoch": 3.22, + "learning_rate": 1.4052535280392999e-05, + "loss": 2.2721, + "step": 6420 + }, + { + "epoch": 3.22, + "learning_rate": 1.401710251948663e-05, + "loss": 2.5879, + "step": 6425 + }, + { + "epoch": 3.23, + "learning_rate": 1.3981697075889372e-05, + "loss": 2.6147, + "step": 6430 + }, + { + "epoch": 3.23, + "learning_rate": 1.394631903766373e-05, + "loss": 2.5308, + "step": 6435 + }, + { + "epoch": 3.23, + "learning_rate": 1.3910968492804028e-05, + "loss": 2.4739, + "step": 6440 + }, + { + "epoch": 3.23, + "learning_rate": 1.3875645529236234e-05, + "loss": 2.4483, + "step": 6445 + }, + { + "epoch": 3.24, + "learning_rate": 1.3840350234817686e-05, + "loss": 2.6367, + "step": 6450 + }, + { + "epoch": 3.24, + "learning_rate": 1.3805082697336943e-05, + "loss": 2.6567, + "step": 6455 + }, + { + "epoch": 3.24, + "learning_rate": 1.3769843004513489e-05, + "loss": 2.52, + "step": 6460 + }, + { + "epoch": 3.24, + "learning_rate": 1.3734631243997561e-05, + "loss": 2.6544, + "step": 6465 + }, + { + "epoch": 3.25, + "learning_rate": 1.3699447503369925e-05, + "loss": 2.3696, + "step": 6470 + }, + { + "epoch": 3.25, + "learning_rate": 1.3664291870141649e-05, + "loss": 2.4517, + "step": 6475 + }, + { + "epoch": 3.25, + "learning_rate": 1.3629164431753894e-05, + "loss": 2.6313, + "step": 6480 + }, + { + "epoch": 3.25, + "learning_rate": 1.3594065275577692e-05, + "loss": 2.4032, + "step": 6485 + }, + { 
+ "epoch": 3.26, + "learning_rate": 1.3558994488913731e-05, + "loss": 2.7063, + "step": 6490 + }, + { + "epoch": 3.26, + "learning_rate": 1.3523952158992136e-05, + "loss": 2.6109, + "step": 6495 + }, + { + "epoch": 3.26, + "learning_rate": 1.3488938372972257e-05, + "loss": 2.633, + "step": 6500 + }, + { + "epoch": 3.26, + "learning_rate": 1.3453953217942436e-05, + "loss": 2.5565, + "step": 6505 + }, + { + "epoch": 3.27, + "learning_rate": 1.3418996780919804e-05, + "loss": 2.5866, + "step": 6510 + }, + { + "epoch": 3.27, + "learning_rate": 1.3384069148850087e-05, + "loss": 2.5992, + "step": 6515 + }, + { + "epoch": 3.27, + "learning_rate": 1.3349170408607342e-05, + "loss": 2.4388, + "step": 6520 + }, + { + "epoch": 3.27, + "learning_rate": 1.3314300646993771e-05, + "loss": 2.2734, + "step": 6525 + }, + { + "epoch": 3.28, + "learning_rate": 1.3279459950739489e-05, + "loss": 2.7683, + "step": 6530 + }, + { + "epoch": 3.28, + "learning_rate": 1.3244648406502331e-05, + "loss": 2.3653, + "step": 6535 + }, + { + "epoch": 3.28, + "learning_rate": 1.3209866100867613e-05, + "loss": 2.6401, + "step": 6540 + }, + { + "epoch": 3.28, + "learning_rate": 1.3175113120347943e-05, + "loss": 2.5218, + "step": 6545 + }, + { + "epoch": 3.29, + "learning_rate": 1.3140389551382975e-05, + "loss": 2.4681, + "step": 6550 + }, + { + "epoch": 3.29, + "learning_rate": 1.3105695480339206e-05, + "loss": 2.4681, + "step": 6555 + }, + { + "epoch": 3.29, + "learning_rate": 1.3071030993509788e-05, + "loss": 2.5743, + "step": 6560 + }, + { + "epoch": 3.29, + "learning_rate": 1.303639617711427e-05, + "loss": 2.5423, + "step": 6565 + }, + { + "epoch": 3.3, + "learning_rate": 1.3001791117298395e-05, + "loss": 2.4267, + "step": 6570 + }, + { + "epoch": 3.3, + "learning_rate": 1.2967215900133911e-05, + "loss": 2.5537, + "step": 6575 + }, + { + "epoch": 3.3, + "learning_rate": 1.2932670611618336e-05, + "loss": 2.5451, + "step": 6580 + }, + { + "epoch": 3.3, + "learning_rate": 1.2898155337674744e-05, + 
"loss": 2.4048, + "step": 6585 + }, + { + "epoch": 3.31, + "learning_rate": 1.2863670164151551e-05, + "loss": 2.6769, + "step": 6590 + }, + { + "epoch": 3.31, + "learning_rate": 1.2829215176822316e-05, + "loss": 2.2118, + "step": 6595 + }, + { + "epoch": 3.31, + "learning_rate": 1.2794790461385508e-05, + "loss": 2.2912, + "step": 6600 + }, + { + "epoch": 3.31, + "learning_rate": 1.2760396103464309e-05, + "loss": 2.3978, + "step": 6605 + }, + { + "epoch": 3.32, + "learning_rate": 1.2726032188606388e-05, + "loss": 2.4801, + "step": 6610 + }, + { + "epoch": 3.32, + "learning_rate": 1.2691698802283697e-05, + "loss": 2.5522, + "step": 6615 + }, + { + "epoch": 3.32, + "learning_rate": 1.2657396029892258e-05, + "loss": 2.6728, + "step": 6620 + }, + { + "epoch": 3.32, + "learning_rate": 1.2623123956751943e-05, + "loss": 2.2937, + "step": 6625 + }, + { + "epoch": 3.33, + "learning_rate": 1.258888266810627e-05, + "loss": 2.5459, + "step": 6630 + }, + { + "epoch": 3.33, + "learning_rate": 1.2554672249122187e-05, + "loss": 2.6329, + "step": 6635 + }, + { + "epoch": 3.33, + "learning_rate": 1.2520492784889865e-05, + "loss": 2.4845, + "step": 6640 + }, + { + "epoch": 3.33, + "learning_rate": 1.2486344360422475e-05, + "loss": 2.5023, + "step": 6645 + }, + { + "epoch": 3.34, + "learning_rate": 1.2452227060655993e-05, + "loss": 2.5674, + "step": 6650 + }, + { + "epoch": 3.34, + "learning_rate": 1.2418140970448975e-05, + "loss": 2.4996, + "step": 6655 + }, + { + "epoch": 3.34, + "learning_rate": 1.2384086174582336e-05, + "loss": 2.4704, + "step": 6660 + }, + { + "epoch": 3.34, + "learning_rate": 1.2350062757759193e-05, + "loss": 2.4928, + "step": 6665 + }, + { + "epoch": 3.35, + "learning_rate": 1.2316070804604576e-05, + "loss": 2.5498, + "step": 6670 + }, + { + "epoch": 3.35, + "learning_rate": 1.228211039966528e-05, + "loss": 2.5641, + "step": 6675 + }, + { + "epoch": 3.35, + "learning_rate": 1.2248181627409619e-05, + "loss": 2.5725, + "step": 6680 + }, + { + "epoch": 3.35, + 
"learning_rate": 1.221428457222723e-05, + "loss": 2.5827, + "step": 6685 + }, + { + "epoch": 3.36, + "learning_rate": 1.2180419318428868e-05, + "loss": 2.3591, + "step": 6690 + }, + { + "epoch": 3.36, + "learning_rate": 1.2146585950246186e-05, + "loss": 2.5772, + "step": 6695 + }, + { + "epoch": 3.36, + "learning_rate": 1.2112784551831533e-05, + "loss": 2.4008, + "step": 6700 + }, + { + "epoch": 3.36, + "learning_rate": 1.2079015207257724e-05, + "loss": 2.3334, + "step": 6705 + }, + { + "epoch": 3.37, + "learning_rate": 1.2045278000517857e-05, + "loss": 2.8023, + "step": 6710 + }, + { + "epoch": 3.37, + "learning_rate": 1.2011573015525118e-05, + "loss": 2.6145, + "step": 6715 + }, + { + "epoch": 3.37, + "learning_rate": 1.1977900336112519e-05, + "loss": 2.6568, + "step": 6720 + }, + { + "epoch": 3.37, + "learning_rate": 1.1944260046032735e-05, + "loss": 2.1771, + "step": 6725 + }, + { + "epoch": 3.38, + "learning_rate": 1.1910652228957872e-05, + "loss": 2.4932, + "step": 6730 + }, + { + "epoch": 3.38, + "learning_rate": 1.187707696847927e-05, + "loss": 2.3883, + "step": 6735 + }, + { + "epoch": 3.38, + "learning_rate": 1.1843534348107294e-05, + "loss": 2.7792, + "step": 6740 + }, + { + "epoch": 3.38, + "learning_rate": 1.1810024451271125e-05, + "loss": 2.5825, + "step": 6745 + }, + { + "epoch": 3.39, + "learning_rate": 1.1776547361318551e-05, + "loss": 2.406, + "step": 6750 + }, + { + "epoch": 3.39, + "learning_rate": 1.1743103161515762e-05, + "loss": 2.5823, + "step": 6755 + }, + { + "epoch": 3.39, + "learning_rate": 1.1709691935047137e-05, + "loss": 2.7587, + "step": 6760 + }, + { + "epoch": 3.39, + "learning_rate": 1.1676313765015038e-05, + "loss": 2.5183, + "step": 6765 + }, + { + "epoch": 3.4, + "learning_rate": 1.1642968734439633e-05, + "loss": 2.6452, + "step": 6770 + }, + { + "epoch": 3.4, + "learning_rate": 1.1609656926258634e-05, + "loss": 2.4641, + "step": 6775 + }, + { + "epoch": 3.4, + "learning_rate": 1.1576378423327131e-05, + "loss": 2.6462, + 
"step": 6780 + }, + { + "epoch": 3.4, + "learning_rate": 1.1543133308417378e-05, + "loss": 2.5271, + "step": 6785 + }, + { + "epoch": 3.41, + "learning_rate": 1.1509921664218587e-05, + "loss": 2.4245, + "step": 6790 + }, + { + "epoch": 3.41, + "learning_rate": 1.14767435733367e-05, + "loss": 2.3622, + "step": 6795 + }, + { + "epoch": 3.41, + "learning_rate": 1.1443599118294227e-05, + "loss": 2.5564, + "step": 6800 + }, + { + "epoch": 3.41, + "learning_rate": 1.1410488381530005e-05, + "loss": 2.342, + "step": 6805 + }, + { + "epoch": 3.42, + "learning_rate": 1.1377411445399006e-05, + "loss": 2.4976, + "step": 6810 + }, + { + "epoch": 3.42, + "learning_rate": 1.1344368392172125e-05, + "loss": 2.4792, + "step": 6815 + }, + { + "epoch": 3.42, + "learning_rate": 1.1311359304036013e-05, + "loss": 2.4829, + "step": 6820 + }, + { + "epoch": 3.42, + "learning_rate": 1.1278384263092797e-05, + "loss": 2.3949, + "step": 6825 + }, + { + "epoch": 3.43, + "learning_rate": 1.124544335135995e-05, + "loss": 2.5555, + "step": 6830 + }, + { + "epoch": 3.43, + "learning_rate": 1.1212536650770041e-05, + "loss": 2.5479, + "step": 6835 + }, + { + "epoch": 3.43, + "learning_rate": 1.1179664243170554e-05, + "loss": 2.5333, + "step": 6840 + }, + { + "epoch": 3.43, + "learning_rate": 1.1146826210323677e-05, + "loss": 2.0832, + "step": 6845 + }, + { + "epoch": 3.44, + "learning_rate": 1.1114022633906096e-05, + "loss": 2.7639, + "step": 6850 + }, + { + "epoch": 3.44, + "learning_rate": 1.10812535955088e-05, + "loss": 2.663, + "step": 6855 + }, + { + "epoch": 3.44, + "learning_rate": 1.104851917663687e-05, + "loss": 2.485, + "step": 6860 + }, + { + "epoch": 3.44, + "learning_rate": 1.1015819458709279e-05, + "loss": 2.3004, + "step": 6865 + }, + { + "epoch": 3.45, + "learning_rate": 1.0983154523058687e-05, + "loss": 2.3924, + "step": 6870 + }, + { + "epoch": 3.45, + "learning_rate": 1.095052445093124e-05, + "loss": 2.4694, + "step": 6875 + }, + { + "epoch": 3.45, + "learning_rate": 
1.0917929323486398e-05, + "loss": 2.5255, + "step": 6880 + }, + { + "epoch": 3.46, + "learning_rate": 1.0885369221796657e-05, + "loss": 2.211, + "step": 6885 + }, + { + "epoch": 3.46, + "learning_rate": 1.0852844226847425e-05, + "loss": 2.5446, + "step": 6890 + }, + { + "epoch": 3.46, + "learning_rate": 1.0820354419536786e-05, + "loss": 2.778, + "step": 6895 + }, + { + "epoch": 3.46, + "learning_rate": 1.0787899880675298e-05, + "loss": 2.5628, + "step": 6900 + }, + { + "epoch": 3.47, + "learning_rate": 1.0755480690985803e-05, + "loss": 2.5333, + "step": 6905 + }, + { + "epoch": 3.47, + "learning_rate": 1.0723096931103218e-05, + "loss": 2.7511, + "step": 6910 + }, + { + "epoch": 3.47, + "learning_rate": 1.0690748681574336e-05, + "loss": 2.2807, + "step": 6915 + }, + { + "epoch": 3.47, + "learning_rate": 1.0658436022857617e-05, + "loss": 2.5652, + "step": 6920 + }, + { + "epoch": 3.48, + "learning_rate": 1.062615903532303e-05, + "loss": 2.7855, + "step": 6925 + }, + { + "epoch": 3.48, + "learning_rate": 1.0593917799251785e-05, + "loss": 2.5029, + "step": 6930 + }, + { + "epoch": 3.48, + "learning_rate": 1.0561712394836184e-05, + "loss": 2.3403, + "step": 6935 + }, + { + "epoch": 3.48, + "learning_rate": 1.0529542902179406e-05, + "loss": 2.748, + "step": 6940 + }, + { + "epoch": 3.49, + "learning_rate": 1.0497409401295303e-05, + "loss": 2.4717, + "step": 6945 + }, + { + "epoch": 3.49, + "learning_rate": 1.0465311972108214e-05, + "loss": 2.6532, + "step": 6950 + }, + { + "epoch": 3.49, + "learning_rate": 1.043325069445275e-05, + "loss": 2.3954, + "step": 6955 + }, + { + "epoch": 3.49, + "learning_rate": 1.0401225648073612e-05, + "loss": 2.4491, + "step": 6960 + }, + { + "epoch": 3.5, + "learning_rate": 1.0369236912625377e-05, + "loss": 2.8167, + "step": 6965 + }, + { + "epoch": 3.5, + "learning_rate": 1.0337284567672314e-05, + "loss": 2.4416, + "step": 6970 + }, + { + "epoch": 3.5, + "learning_rate": 1.0305368692688174e-05, + "loss": 2.4095, + "step": 6975 + }, + { + 
"epoch": 3.5, + "learning_rate": 1.0273489367056002e-05, + "loss": 2.6135, + "step": 6980 + }, + { + "epoch": 3.51, + "learning_rate": 1.0241646670067932e-05, + "loss": 2.7131, + "step": 6985 + }, + { + "epoch": 3.51, + "learning_rate": 1.0209840680924993e-05, + "loss": 2.45, + "step": 6990 + }, + { + "epoch": 3.51, + "learning_rate": 1.0178071478736914e-05, + "loss": 2.4902, + "step": 6995 + }, + { + "epoch": 3.51, + "learning_rate": 1.0146339142521926e-05, + "loss": 2.572, + "step": 7000 + }, + { + "epoch": 3.52, + "learning_rate": 1.0114643751206562e-05, + "loss": 2.3915, + "step": 7005 + }, + { + "epoch": 3.52, + "learning_rate": 1.0082985383625468e-05, + "loss": 2.3651, + "step": 7010 + }, + { + "epoch": 3.52, + "learning_rate": 1.0051364118521197e-05, + "loss": 2.0744, + "step": 7015 + }, + { + "epoch": 3.52, + "learning_rate": 1.0019780034544022e-05, + "loss": 2.3651, + "step": 7020 + }, + { + "epoch": 3.53, + "learning_rate": 9.988233210251723e-06, + "loss": 2.423, + "step": 7025 + }, + { + "epoch": 3.53, + "learning_rate": 9.956723724109441e-06, + "loss": 2.7706, + "step": 7030 + }, + { + "epoch": 3.53, + "learning_rate": 9.925251654489415e-06, + "loss": 2.4113, + "step": 7035 + }, + { + "epoch": 3.53, + "learning_rate": 9.893817079670825e-06, + "loss": 2.2758, + "step": 7040 + }, + { + "epoch": 3.54, + "learning_rate": 9.8624200778396e-06, + "loss": 2.4551, + "step": 7045 + }, + { + "epoch": 3.54, + "learning_rate": 9.831060727088215e-06, + "loss": 2.3271, + "step": 7050 + }, + { + "epoch": 3.54, + "learning_rate": 9.799739105415483e-06, + "loss": 2.348, + "step": 7055 + }, + { + "epoch": 3.54, + "learning_rate": 9.768455290726402e-06, + "loss": 2.803, + "step": 7060 + }, + { + "epoch": 3.55, + "learning_rate": 9.737209360831895e-06, + "loss": 2.4977, + "step": 7065 + }, + { + "epoch": 3.55, + "learning_rate": 9.70600139344868e-06, + "loss": 2.6904, + "step": 7070 + }, + { + "epoch": 3.55, + "learning_rate": 9.67483146619907e-06, + "loss": 2.4839, + 
"step": 7075 + }, + { + "epoch": 3.55, + "learning_rate": 9.64369965661073e-06, + "loss": 2.6168, + "step": 7080 + }, + { + "epoch": 3.56, + "learning_rate": 9.612606042116535e-06, + "loss": 2.3343, + "step": 7085 + }, + { + "epoch": 3.56, + "learning_rate": 9.581550700054345e-06, + "loss": 2.4697, + "step": 7090 + }, + { + "epoch": 3.56, + "learning_rate": 9.550533707666842e-06, + "loss": 2.7164, + "step": 7095 + }, + { + "epoch": 3.56, + "learning_rate": 9.519555142101311e-06, + "loss": 2.5116, + "step": 7100 + }, + { + "epoch": 3.57, + "learning_rate": 9.488615080409468e-06, + "loss": 2.4768, + "step": 7105 + }, + { + "epoch": 3.57, + "learning_rate": 9.457713599547252e-06, + "loss": 2.4756, + "step": 7110 + }, + { + "epoch": 3.57, + "learning_rate": 9.426850776374646e-06, + "loss": 2.4257, + "step": 7115 + }, + { + "epoch": 3.57, + "learning_rate": 9.396026687655483e-06, + "loss": 2.5385, + "step": 7120 + }, + { + "epoch": 3.58, + "learning_rate": 9.365241410057246e-06, + "loss": 2.5497, + "step": 7125 + }, + { + "epoch": 3.58, + "learning_rate": 9.334495020150885e-06, + "loss": 2.5848, + "step": 7130 + }, + { + "epoch": 3.58, + "learning_rate": 9.303787594410648e-06, + "loss": 2.5811, + "step": 7135 + }, + { + "epoch": 3.58, + "learning_rate": 9.273119209213841e-06, + "loss": 2.2504, + "step": 7140 + }, + { + "epoch": 3.59, + "learning_rate": 9.242489940840684e-06, + "loss": 2.4348, + "step": 7145 + }, + { + "epoch": 3.59, + "learning_rate": 9.211899865474086e-06, + "loss": 2.6538, + "step": 7150 + }, + { + "epoch": 3.59, + "learning_rate": 9.181349059199484e-06, + "loss": 2.9365, + "step": 7155 + }, + { + "epoch": 3.59, + "learning_rate": 9.150837598004648e-06, + "loss": 2.4267, + "step": 7160 + }, + { + "epoch": 3.6, + "learning_rate": 9.120365557779472e-06, + "loss": 2.3872, + "step": 7165 + }, + { + "epoch": 3.6, + "learning_rate": 9.089933014315818e-06, + "loss": 2.5116, + "step": 7170 + }, + { + "epoch": 3.6, + "learning_rate": 9.059540043307293e-06, + 
"loss": 2.3202, + "step": 7175 + }, + { + "epoch": 3.6, + "learning_rate": 9.029186720349078e-06, + "loss": 2.7859, + "step": 7180 + }, + { + "epoch": 3.61, + "learning_rate": 8.998873120937762e-06, + "loss": 2.6064, + "step": 7185 + }, + { + "epoch": 3.61, + "learning_rate": 8.968599320471102e-06, + "loss": 2.8572, + "step": 7190 + }, + { + "epoch": 3.61, + "learning_rate": 8.938365394247877e-06, + "loss": 2.4965, + "step": 7195 + }, + { + "epoch": 3.61, + "learning_rate": 8.908171417467692e-06, + "loss": 2.7261, + "step": 7200 + }, + { + "epoch": 3.62, + "learning_rate": 8.878017465230778e-06, + "loss": 2.6582, + "step": 7205 + }, + { + "epoch": 3.62, + "learning_rate": 8.847903612537826e-06, + "loss": 2.3756, + "step": 7210 + }, + { + "epoch": 3.62, + "learning_rate": 8.817829934289775e-06, + "loss": 2.5582, + "step": 7215 + }, + { + "epoch": 3.62, + "learning_rate": 8.787796505287657e-06, + "loss": 2.6091, + "step": 7220 + }, + { + "epoch": 3.63, + "learning_rate": 8.757803400232379e-06, + "loss": 2.5523, + "step": 7225 + }, + { + "epoch": 3.63, + "learning_rate": 8.727850693724558e-06, + "loss": 2.6721, + "step": 7230 + }, + { + "epoch": 3.63, + "learning_rate": 8.697938460264326e-06, + "loss": 2.6035, + "step": 7235 + }, + { + "epoch": 3.63, + "learning_rate": 8.668066774251158e-06, + "loss": 2.4755, + "step": 7240 + }, + { + "epoch": 3.64, + "learning_rate": 8.638235709983664e-06, + "loss": 2.6591, + "step": 7245 + }, + { + "epoch": 3.64, + "learning_rate": 8.608445341659423e-06, + "loss": 2.3781, + "step": 7250 + }, + { + "epoch": 3.64, + "learning_rate": 8.578695743374798e-06, + "loss": 2.5149, + "step": 7255 + }, + { + "epoch": 3.64, + "learning_rate": 8.548986989124737e-06, + "loss": 2.6264, + "step": 7260 + }, + { + "epoch": 3.65, + "learning_rate": 8.519319152802601e-06, + "loss": 2.638, + "step": 7265 + }, + { + "epoch": 3.65, + "learning_rate": 8.489692308199981e-06, + "loss": 2.4959, + "step": 7270 + }, + { + "epoch": 3.65, + "learning_rate": 
8.460106529006511e-06, + "loss": 2.3365, + "step": 7275 + }, + { + "epoch": 3.65, + "learning_rate": 8.430561888809676e-06, + "loss": 2.3178, + "step": 7280 + }, + { + "epoch": 3.66, + "learning_rate": 8.401058461094643e-06, + "loss": 2.5691, + "step": 7285 + }, + { + "epoch": 3.66, + "learning_rate": 8.371596319244087e-06, + "loss": 2.4521, + "step": 7290 + }, + { + "epoch": 3.66, + "learning_rate": 8.342175536537975e-06, + "loss": 2.6887, + "step": 7295 + }, + { + "epoch": 3.66, + "learning_rate": 8.312796186153405e-06, + "loss": 2.2551, + "step": 7300 + }, + { + "epoch": 3.67, + "learning_rate": 8.283458341164432e-06, + "loss": 2.5463, + "step": 7305 + }, + { + "epoch": 3.67, + "learning_rate": 8.254162074541868e-06, + "loss": 2.6583, + "step": 7310 + }, + { + "epoch": 3.67, + "learning_rate": 8.224907459153114e-06, + "loss": 2.5084, + "step": 7315 + }, + { + "epoch": 3.67, + "learning_rate": 8.195694567761968e-06, + "loss": 2.3259, + "step": 7320 + }, + { + "epoch": 3.68, + "learning_rate": 8.166523473028465e-06, + "loss": 2.3955, + "step": 7325 + }, + { + "epoch": 3.68, + "learning_rate": 8.137394247508644e-06, + "loss": 2.5088, + "step": 7330 + }, + { + "epoch": 3.68, + "learning_rate": 8.108306963654452e-06, + "loss": 2.5981, + "step": 7335 + }, + { + "epoch": 3.68, + "learning_rate": 8.079261693813487e-06, + "loss": 2.6233, + "step": 7340 + }, + { + "epoch": 3.69, + "learning_rate": 8.05025851022885e-06, + "loss": 2.4973, + "step": 7345 + }, + { + "epoch": 3.69, + "learning_rate": 8.02129748503897e-06, + "loss": 2.5353, + "step": 7350 + }, + { + "epoch": 3.69, + "learning_rate": 7.992378690277416e-06, + "loss": 2.5229, + "step": 7355 + }, + { + "epoch": 3.69, + "learning_rate": 7.96350219787271e-06, + "loss": 2.2312, + "step": 7360 + }, + { + "epoch": 3.7, + "learning_rate": 7.93466807964817e-06, + "loss": 2.6011, + "step": 7365 + }, + { + "epoch": 3.7, + "learning_rate": 7.905876407321711e-06, + "loss": 2.4813, + "step": 7370 + }, + { + "epoch": 3.7, + 
"learning_rate": 7.87712725250567e-06, + "loss": 2.4722, + "step": 7375 + }, + { + "epoch": 3.7, + "learning_rate": 7.848420686706643e-06, + "loss": 2.6481, + "step": 7380 + }, + { + "epoch": 3.71, + "learning_rate": 7.819756781325285e-06, + "loss": 2.5964, + "step": 7385 + }, + { + "epoch": 3.71, + "learning_rate": 7.791135607656147e-06, + "loss": 2.4698, + "step": 7390 + }, + { + "epoch": 3.71, + "learning_rate": 7.762557236887507e-06, + "loss": 2.6941, + "step": 7395 + }, + { + "epoch": 3.71, + "learning_rate": 7.734021740101168e-06, + "loss": 2.6679, + "step": 7400 + }, + { + "epoch": 3.72, + "learning_rate": 7.705529188272295e-06, + "loss": 2.7456, + "step": 7405 + }, + { + "epoch": 3.72, + "learning_rate": 7.67707965226924e-06, + "loss": 2.3179, + "step": 7410 + }, + { + "epoch": 3.72, + "learning_rate": 7.64867320285337e-06, + "loss": 2.5265, + "step": 7415 + }, + { + "epoch": 3.72, + "learning_rate": 7.620309910678866e-06, + "loss": 2.4766, + "step": 7420 + }, + { + "epoch": 3.73, + "learning_rate": 7.59198984629258e-06, + "loss": 2.5339, + "step": 7425 + }, + { + "epoch": 3.73, + "learning_rate": 7.56371308013385e-06, + "loss": 2.5665, + "step": 7430 + }, + { + "epoch": 3.73, + "learning_rate": 7.535479682534302e-06, + "loss": 2.6048, + "step": 7435 + }, + { + "epoch": 3.73, + "learning_rate": 7.50728972371772e-06, + "loss": 2.5792, + "step": 7440 + }, + { + "epoch": 3.74, + "learning_rate": 7.479143273799818e-06, + "loss": 2.6327, + "step": 7445 + }, + { + "epoch": 3.74, + "learning_rate": 7.451040402788109e-06, + "loss": 2.4764, + "step": 7450 + }, + { + "epoch": 3.74, + "learning_rate": 7.4229811805817065e-06, + "loss": 2.5359, + "step": 7455 + }, + { + "epoch": 3.74, + "learning_rate": 7.394965676971158e-06, + "loss": 2.3672, + "step": 7460 + }, + { + "epoch": 3.75, + "learning_rate": 7.3669939616382744e-06, + "loss": 2.2471, + "step": 7465 + }, + { + "epoch": 3.75, + "learning_rate": 7.33906610415595e-06, + "loss": 2.2825, + "step": 7470 + }, + { + 
"epoch": 3.75, + "learning_rate": 7.311182173987999e-06, + "loss": 2.8013, + "step": 7475 + }, + { + "epoch": 3.75, + "learning_rate": 7.283342240488972e-06, + "loss": 2.6741, + "step": 7480 + }, + { + "epoch": 3.76, + "learning_rate": 7.25554637290399e-06, + "loss": 2.4988, + "step": 7485 + }, + { + "epoch": 3.76, + "learning_rate": 7.227794640368573e-06, + "loss": 2.6571, + "step": 7490 + }, + { + "epoch": 3.76, + "learning_rate": 7.2000871119084575e-06, + "loss": 2.7367, + "step": 7495 + }, + { + "epoch": 3.76, + "learning_rate": 7.172423856439459e-06, + "loss": 2.6667, + "step": 7500 + }, + { + "epoch": 3.77, + "learning_rate": 7.144804942767231e-06, + "loss": 2.6674, + "step": 7505 + }, + { + "epoch": 3.77, + "learning_rate": 7.117230439587172e-06, + "loss": 2.8285, + "step": 7510 + }, + { + "epoch": 3.77, + "learning_rate": 7.0952028586902694e-06, + "loss": 2.512, + "step": 7515 + }, + { + "epoch": 3.77, + "learning_rate": 7.067708467155793e-06, + "loss": 2.6181, + "step": 7520 + }, + { + "epoch": 3.78, + "learning_rate": 7.040258677872366e-06, + "loss": 2.5267, + "step": 7525 + }, + { + "epoch": 3.78, + "learning_rate": 7.012853559114737e-06, + "loss": 2.4466, + "step": 7530 + }, + { + "epoch": 3.78, + "learning_rate": 6.985493179046529e-06, + "loss": 2.1915, + "step": 7535 + }, + { + "epoch": 3.78, + "learning_rate": 6.958177605720082e-06, + "loss": 2.427, + "step": 7540 + }, + { + "epoch": 3.79, + "learning_rate": 6.930906907076301e-06, + "loss": 2.3777, + "step": 7545 + }, + { + "epoch": 3.79, + "learning_rate": 6.9036811509444715e-06, + "loss": 2.3888, + "step": 7550 + }, + { + "epoch": 3.79, + "learning_rate": 6.8765004050421075e-06, + "loss": 2.6918, + "step": 7555 + }, + { + "epoch": 3.79, + "learning_rate": 6.849364736974745e-06, + "loss": 2.4888, + "step": 7560 + }, + { + "epoch": 3.8, + "learning_rate": 6.822274214235819e-06, + "loss": 2.5234, + "step": 7565 + }, + { + "epoch": 3.8, + "learning_rate": 6.7952289042064655e-06, + "loss": 2.5397, + 
"step": 7570 + }, + { + "epoch": 3.8, + "learning_rate": 6.768228874155388e-06, + "loss": 2.6419, + "step": 7575 + }, + { + "epoch": 3.8, + "learning_rate": 6.741274191238642e-06, + "loss": 2.5339, + "step": 7580 + }, + { + "epoch": 3.81, + "learning_rate": 6.7143649224995056e-06, + "loss": 2.4222, + "step": 7585 + }, + { + "epoch": 3.81, + "learning_rate": 6.68750113486829e-06, + "loss": 2.4719, + "step": 7590 + }, + { + "epoch": 3.81, + "learning_rate": 6.660682895162191e-06, + "loss": 2.6034, + "step": 7595 + }, + { + "epoch": 3.81, + "learning_rate": 6.6339102700851144e-06, + "loss": 2.4438, + "step": 7600 + }, + { + "epoch": 3.82, + "learning_rate": 6.607183326227509e-06, + "loss": 2.6244, + "step": 7605 + }, + { + "epoch": 3.82, + "learning_rate": 6.580502130066201e-06, + "loss": 2.4553, + "step": 7610 + }, + { + "epoch": 3.82, + "learning_rate": 6.5538667479642376e-06, + "loss": 2.7125, + "step": 7615 + }, + { + "epoch": 3.82, + "learning_rate": 6.527277246170702e-06, + "loss": 2.5128, + "step": 7620 + }, + { + "epoch": 3.83, + "learning_rate": 6.500733690820571e-06, + "loss": 2.3843, + "step": 7625 + }, + { + "epoch": 3.83, + "learning_rate": 6.474236147934529e-06, + "loss": 2.5529, + "step": 7630 + }, + { + "epoch": 3.83, + "learning_rate": 6.4477846834188425e-06, + "loss": 2.6161, + "step": 7635 + }, + { + "epoch": 3.83, + "learning_rate": 6.421379363065142e-06, + "loss": 2.4355, + "step": 7640 + }, + { + "epoch": 3.84, + "learning_rate": 6.395020252550302e-06, + "loss": 2.3781, + "step": 7645 + }, + { + "epoch": 3.84, + "learning_rate": 6.368707417436237e-06, + "loss": 2.3661, + "step": 7650 + }, + { + "epoch": 3.84, + "learning_rate": 6.34244092316979e-06, + "loss": 2.3473, + "step": 7655 + }, + { + "epoch": 3.84, + "learning_rate": 6.316220835082528e-06, + "loss": 2.6448, + "step": 7660 + }, + { + "epoch": 3.85, + "learning_rate": 6.290047218390605e-06, + "loss": 2.5152, + "step": 7665 + }, + { + "epoch": 3.85, + "learning_rate": 
6.2639201381945705e-06, + "loss": 2.317, + "step": 7670 + }, + { + "epoch": 3.85, + "learning_rate": 6.237839659479239e-06, + "loss": 2.4129, + "step": 7675 + }, + { + "epoch": 3.85, + "learning_rate": 6.2118058471135195e-06, + "loss": 2.196, + "step": 7680 + }, + { + "epoch": 3.86, + "learning_rate": 6.185818765850238e-06, + "loss": 2.6682, + "step": 7685 + }, + { + "epoch": 3.86, + "learning_rate": 6.159878480325995e-06, + "loss": 2.4091, + "step": 7690 + }, + { + "epoch": 3.86, + "learning_rate": 6.133985055060992e-06, + "loss": 2.7428, + "step": 7695 + }, + { + "epoch": 3.86, + "learning_rate": 6.108138554458881e-06, + "loss": 2.4382, + "step": 7700 + }, + { + "epoch": 3.87, + "learning_rate": 6.082339042806601e-06, + "loss": 2.5848, + "step": 7705 + }, + { + "epoch": 3.87, + "learning_rate": 6.056586584274218e-06, + "loss": 2.2978, + "step": 7710 + }, + { + "epoch": 3.87, + "learning_rate": 6.030881242914757e-06, + "loss": 2.6048, + "step": 7715 + }, + { + "epoch": 3.87, + "learning_rate": 6.005223082664063e-06, + "loss": 2.4201, + "step": 7720 + }, + { + "epoch": 3.88, + "learning_rate": 5.9796121673406174e-06, + "loss": 2.4145, + "step": 7725 + }, + { + "epoch": 3.88, + "learning_rate": 5.954048560645398e-06, + "loss": 2.3145, + "step": 7730 + }, + { + "epoch": 3.88, + "learning_rate": 5.928532326161712e-06, + "loss": 2.6399, + "step": 7735 + }, + { + "epoch": 3.88, + "learning_rate": 5.9030635273550404e-06, + "loss": 2.6007, + "step": 7740 + }, + { + "epoch": 3.89, + "learning_rate": 5.8776422275728774e-06, + "loss": 2.3568, + "step": 7745 + }, + { + "epoch": 3.89, + "learning_rate": 5.8522684900445765e-06, + "loss": 2.63, + "step": 7750 + }, + { + "epoch": 3.89, + "learning_rate": 5.826942377881195e-06, + "loss": 2.3783, + "step": 7755 + }, + { + "epoch": 3.89, + "learning_rate": 5.8016639540753234e-06, + "loss": 2.6271, + "step": 7760 + }, + { + "epoch": 3.9, + "learning_rate": 5.776433281500951e-06, + "loss": 2.4406, + "step": 7765 + }, + { + "epoch": 
3.9, + "learning_rate": 5.75125042291329e-06, + "loss": 2.4872, + "step": 7770 + }, + { + "epoch": 3.9, + "learning_rate": 5.726115440948626e-06, + "loss": 2.3784, + "step": 7775 + }, + { + "epoch": 3.9, + "learning_rate": 5.70102839812417e-06, + "loss": 2.3692, + "step": 7780 + }, + { + "epoch": 3.91, + "learning_rate": 5.675989356837879e-06, + "loss": 2.2512, + "step": 7785 + }, + { + "epoch": 3.91, + "learning_rate": 5.6509983793683525e-06, + "loss": 2.3779, + "step": 7790 + }, + { + "epoch": 3.91, + "learning_rate": 5.626055527874605e-06, + "loss": 2.59, + "step": 7795 + }, + { + "epoch": 3.91, + "learning_rate": 5.601160864395971e-06, + "loss": 2.3533, + "step": 7800 + }, + { + "epoch": 3.92, + "learning_rate": 5.576314450851922e-06, + "loss": 2.8845, + "step": 7805 + }, + { + "epoch": 3.92, + "learning_rate": 5.5515163490419155e-06, + "loss": 2.6464, + "step": 7810 + }, + { + "epoch": 3.92, + "learning_rate": 5.526766620645258e-06, + "loss": 2.5061, + "step": 7815 + }, + { + "epoch": 3.92, + "learning_rate": 5.5020653272209235e-06, + "loss": 2.3723, + "step": 7820 + }, + { + "epoch": 3.93, + "learning_rate": 5.477412530207435e-06, + "loss": 2.5008, + "step": 7825 + }, + { + "epoch": 3.93, + "learning_rate": 5.452808290922656e-06, + "loss": 2.7899, + "step": 7830 + }, + { + "epoch": 3.93, + "learning_rate": 5.428252670563721e-06, + "loss": 2.6914, + "step": 7835 + }, + { + "epoch": 3.93, + "learning_rate": 5.403745730206811e-06, + "loss": 2.7228, + "step": 7840 + }, + { + "epoch": 3.94, + "learning_rate": 5.379287530807023e-06, + "loss": 2.4227, + "step": 7845 + }, + { + "epoch": 3.94, + "learning_rate": 5.354878133198237e-06, + "loss": 2.5355, + "step": 7850 + }, + { + "epoch": 3.94, + "learning_rate": 5.33051759809294e-06, + "loss": 2.4151, + "step": 7855 + }, + { + "epoch": 3.94, + "learning_rate": 5.3062059860820915e-06, + "loss": 2.5943, + "step": 7860 + }, + { + "epoch": 3.95, + "learning_rate": 5.281943357634961e-06, + "loss": 2.5721, + "step": 7865 + 
}, + { + "epoch": 3.95, + "learning_rate": 5.257729773098988e-06, + "loss": 2.2355, + "step": 7870 + }, + { + "epoch": 3.95, + "learning_rate": 5.233565292699624e-06, + "loss": 2.6952, + "step": 7875 + }, + { + "epoch": 3.95, + "learning_rate": 5.209449976540187e-06, + "loss": 2.5381, + "step": 7880 + }, + { + "epoch": 3.96, + "learning_rate": 5.1853838846017135e-06, + "loss": 2.4474, + "step": 7885 + }, + { + "epoch": 3.96, + "learning_rate": 5.161367076742796e-06, + "loss": 2.4288, + "step": 7890 + }, + { + "epoch": 3.96, + "learning_rate": 5.1373996126994646e-06, + "loss": 2.5151, + "step": 7895 + }, + { + "epoch": 3.96, + "learning_rate": 5.113481552085001e-06, + "loss": 2.4231, + "step": 7900 + }, + { + "epoch": 3.97, + "learning_rate": 5.089612954389814e-06, + "loss": 2.5372, + "step": 7905 + }, + { + "epoch": 3.97, + "learning_rate": 5.06579387898129e-06, + "loss": 2.5082, + "step": 7910 + }, + { + "epoch": 3.97, + "learning_rate": 5.042024385103624e-06, + "loss": 2.3919, + "step": 7915 + }, + { + "epoch": 3.97, + "learning_rate": 5.018304531877704e-06, + "loss": 2.4821, + "step": 7920 + }, + { + "epoch": 3.98, + "learning_rate": 4.9946343783009495e-06, + "loss": 2.5556, + "step": 7925 + }, + { + "epoch": 3.98, + "learning_rate": 4.971013983247158e-06, + "loss": 2.4618, + "step": 7930 + }, + { + "epoch": 3.98, + "learning_rate": 4.947443405466357e-06, + "loss": 2.6289, + "step": 7935 + }, + { + "epoch": 3.98, + "learning_rate": 4.923922703584691e-06, + "loss": 2.652, + "step": 7940 + }, + { + "epoch": 3.99, + "learning_rate": 4.9004519361042275e-06, + "loss": 2.5852, + "step": 7945 + }, + { + "epoch": 3.99, + "learning_rate": 4.877031161402843e-06, + "loss": 2.3628, + "step": 7950 + }, + { + "epoch": 3.99, + "learning_rate": 4.853660437734062e-06, + "loss": 2.5503, + "step": 7955 + }, + { + "epoch": 3.99, + "learning_rate": 4.8303398232269255e-06, + "loss": 2.6815, + "step": 7960 + }, + { + "epoch": 4.0, + "learning_rate": 4.807069375885842e-06, + "loss": 
2.6943, + "step": 7965 + }, + { + "epoch": 4.0, + "learning_rate": 4.783849153590436e-06, + "loss": 2.3604, + "step": 7970 + }, + { + "epoch": 4.0, + "learning_rate": 4.760679214095409e-06, + "loss": 2.4289, + "step": 7975 + }, + { + "epoch": 4.0, + "learning_rate": 4.737559615030402e-06, + "loss": 2.3585, + "step": 7980 + }, + { + "epoch": 4.01, + "learning_rate": 4.714490413899839e-06, + "loss": 2.3335, + "step": 7985 + }, + { + "epoch": 4.01, + "learning_rate": 4.6914716680828e-06, + "loss": 2.5913, + "step": 7990 + }, + { + "epoch": 4.01, + "learning_rate": 4.668503434832852e-06, + "loss": 2.6069, + "step": 7995 + }, + { + "epoch": 4.01, + "learning_rate": 4.645585771277961e-06, + "loss": 2.332, + "step": 8000 + }, + { + "epoch": 4.02, + "learning_rate": 4.6227187344202675e-06, + "loss": 2.6009, + "step": 8005 + }, + { + "epoch": 4.02, + "learning_rate": 4.599902381136021e-06, + "loss": 2.4393, + "step": 8010 + }, + { + "epoch": 4.02, + "learning_rate": 4.577136768175391e-06, + "loss": 2.3213, + "step": 8015 + }, + { + "epoch": 4.02, + "learning_rate": 4.5544219521623576e-06, + "loss": 2.3454, + "step": 8020 + }, + { + "epoch": 4.03, + "learning_rate": 4.531757989594543e-06, + "loss": 2.6343, + "step": 8025 + }, + { + "epoch": 4.03, + "learning_rate": 4.5091449368430935e-06, + "loss": 2.6261, + "step": 8030 + }, + { + "epoch": 4.03, + "learning_rate": 4.486582850152523e-06, + "loss": 2.2482, + "step": 8035 + }, + { + "epoch": 4.03, + "learning_rate": 4.464071785640575e-06, + "loss": 2.5495, + "step": 8040 + }, + { + "epoch": 4.04, + "learning_rate": 4.4416117992981066e-06, + "loss": 2.1255, + "step": 8045 + }, + { + "epoch": 4.04, + "learning_rate": 4.419202946988912e-06, + "loss": 2.4457, + "step": 8050 + }, + { + "epoch": 4.04, + "learning_rate": 4.396845284449608e-06, + "loss": 2.4967, + "step": 8055 + }, + { + "epoch": 4.04, + "learning_rate": 4.374538867289488e-06, + "loss": 2.489, + "step": 8060 + }, + { + "epoch": 4.05, + "learning_rate": 
4.352283750990388e-06, + "loss": 2.7544, + "step": 8065 + }, + { + "epoch": 4.05, + "learning_rate": 4.330079990906541e-06, + "loss": 2.2603, + "step": 8070 + }, + { + "epoch": 4.05, + "learning_rate": 4.307927642264448e-06, + "loss": 2.509, + "step": 8075 + }, + { + "epoch": 4.05, + "learning_rate": 4.285826760162731e-06, + "loss": 2.501, + "step": 8080 + }, + { + "epoch": 4.06, + "learning_rate": 4.2637773995720086e-06, + "loss": 2.5447, + "step": 8085 + }, + { + "epoch": 4.06, + "learning_rate": 4.241779615334748e-06, + "loss": 2.384, + "step": 8090 + }, + { + "epoch": 4.06, + "learning_rate": 4.219833462165132e-06, + "loss": 2.556, + "step": 8095 + }, + { + "epoch": 4.06, + "learning_rate": 4.197938994648923e-06, + "loss": 2.1611, + "step": 8100 + }, + { + "epoch": 4.07, + "learning_rate": 4.176096267243332e-06, + "loss": 2.7722, + "step": 8105 + }, + { + "epoch": 4.07, + "learning_rate": 4.154305334276876e-06, + "loss": 2.4201, + "step": 8110 + }, + { + "epoch": 4.07, + "learning_rate": 4.1325662499492435e-06, + "loss": 2.0671, + "step": 8115 + }, + { + "epoch": 4.07, + "learning_rate": 4.110879068331169e-06, + "loss": 2.4085, + "step": 8120 + }, + { + "epoch": 4.08, + "learning_rate": 4.089243843364285e-06, + "loss": 2.2808, + "step": 8125 + }, + { + "epoch": 4.08, + "learning_rate": 4.0676606288609945e-06, + "loss": 2.7176, + "step": 8130 + }, + { + "epoch": 4.08, + "learning_rate": 4.0461294785043426e-06, + "loss": 2.7093, + "step": 8135 + }, + { + "epoch": 4.08, + "learning_rate": 4.024650445847872e-06, + "loss": 2.7265, + "step": 8140 + }, + { + "epoch": 4.09, + "learning_rate": 4.003223584315491e-06, + "loss": 2.5104, + "step": 8145 + }, + { + "epoch": 4.09, + "learning_rate": 3.981848947201364e-06, + "loss": 2.5089, + "step": 8150 + }, + { + "epoch": 4.09, + "learning_rate": 3.960526587669741e-06, + "loss": 2.1579, + "step": 8155 + }, + { + "epoch": 4.09, + "learning_rate": 3.939256558754848e-06, + "loss": 2.5799, + "step": 8160 + }, + { + "epoch": 4.1, 
+ "learning_rate": 3.918038913360753e-06, + "loss": 2.5948, + "step": 8165 + }, + { + "epoch": 4.1, + "learning_rate": 3.896873704261231e-06, + "loss": 2.4415, + "step": 8170 + }, + { + "epoch": 4.1, + "learning_rate": 3.875760984099641e-06, + "loss": 2.3362, + "step": 8175 + }, + { + "epoch": 4.1, + "learning_rate": 3.854700805388786e-06, + "loss": 2.0676, + "step": 8180 + }, + { + "epoch": 4.11, + "learning_rate": 3.83369322051077e-06, + "loss": 2.4296, + "step": 8185 + }, + { + "epoch": 4.11, + "learning_rate": 3.8127382817169018e-06, + "loss": 2.3855, + "step": 8190 + }, + { + "epoch": 4.11, + "learning_rate": 3.791836041127533e-06, + "loss": 2.4406, + "step": 8195 + }, + { + "epoch": 4.11, + "learning_rate": 3.770986550731956e-06, + "loss": 2.759, + "step": 8200 + }, + { + "epoch": 4.12, + "learning_rate": 3.750189862388248e-06, + "loss": 2.5292, + "step": 8205 + }, + { + "epoch": 4.12, + "learning_rate": 3.729446027823155e-06, + "loss": 2.6489, + "step": 8210 + }, + { + "epoch": 4.12, + "learning_rate": 3.7087550986319636e-06, + "loss": 2.4049, + "step": 8215 + }, + { + "epoch": 4.12, + "learning_rate": 3.68811712627837e-06, + "loss": 2.6333, + "step": 8220 + }, + { + "epoch": 4.13, + "learning_rate": 3.667532162094353e-06, + "loss": 2.6278, + "step": 8225 + }, + { + "epoch": 4.13, + "learning_rate": 3.6470002572800507e-06, + "loss": 2.3188, + "step": 8230 + }, + { + "epoch": 4.13, + "learning_rate": 3.6265214629036233e-06, + "loss": 2.6722, + "step": 8235 + }, + { + "epoch": 4.13, + "learning_rate": 3.606095829901132e-06, + "loss": 2.7608, + "step": 8240 + }, + { + "epoch": 4.14, + "learning_rate": 3.5857234090764163e-06, + "loss": 2.6491, + "step": 8245 + }, + { + "epoch": 4.14, + "learning_rate": 3.5654042511009596e-06, + "loss": 2.433, + "step": 8250 + }, + { + "epoch": 4.14, + "learning_rate": 3.5451384065137593e-06, + "loss": 2.4662, + "step": 8255 + }, + { + "epoch": 4.15, + "learning_rate": 3.524925925721234e-06, + "loss": 2.5401, + "step": 8260 + }, 
+ { + "epoch": 4.15, + "learning_rate": 3.504766858997044e-06, + "loss": 2.3117, + "step": 8265 + }, + { + "epoch": 4.15, + "learning_rate": 3.4846612564820193e-06, + "loss": 2.4978, + "step": 8270 + }, + { + "epoch": 4.15, + "learning_rate": 3.464609168183977e-06, + "loss": 2.5536, + "step": 8275 + }, + { + "epoch": 4.16, + "learning_rate": 3.4446106439776664e-06, + "loss": 2.6747, + "step": 8280 + }, + { + "epoch": 4.16, + "learning_rate": 3.42466573360459e-06, + "loss": 2.4949, + "step": 8285 + }, + { + "epoch": 4.16, + "learning_rate": 3.4047744866729016e-06, + "loss": 2.4502, + "step": 8290 + }, + { + "epoch": 4.16, + "learning_rate": 3.3849369526572834e-06, + "loss": 2.3337, + "step": 8295 + }, + { + "epoch": 4.17, + "learning_rate": 3.365153180898814e-06, + "loss": 2.4335, + "step": 8300 + }, + { + "epoch": 4.17, + "learning_rate": 3.3454232206048626e-06, + "loss": 2.3121, + "step": 8305 + }, + { + "epoch": 4.17, + "learning_rate": 3.3257471208489435e-06, + "loss": 2.4353, + "step": 8310 + }, + { + "epoch": 4.17, + "learning_rate": 3.306124930570609e-06, + "loss": 2.3915, + "step": 8315 + }, + { + "epoch": 4.18, + "learning_rate": 3.286556698575327e-06, + "loss": 2.2757, + "step": 8320 + }, + { + "epoch": 4.18, + "learning_rate": 3.2670424735343597e-06, + "loss": 2.6466, + "step": 8325 + }, + { + "epoch": 4.18, + "learning_rate": 3.2475823039846283e-06, + "loss": 2.3025, + "step": 8330 + }, + { + "epoch": 4.18, + "learning_rate": 3.2281762383286203e-06, + "loss": 2.427, + "step": 8335 + }, + { + "epoch": 4.19, + "learning_rate": 3.208824324834239e-06, + "loss": 2.3778, + "step": 8340 + }, + { + "epoch": 4.19, + "learning_rate": 3.1895266116347023e-06, + "loss": 2.3377, + "step": 8345 + }, + { + "epoch": 4.19, + "learning_rate": 3.170283146728423e-06, + "loss": 2.4054, + "step": 8350 + }, + { + "epoch": 4.19, + "learning_rate": 3.1510939779788777e-06, + "loss": 2.4784, + "step": 8355 + }, + { + "epoch": 4.2, + "learning_rate": 3.131959153114494e-06, + "loss": 
2.641, + "step": 8360 + }, + { + "epoch": 4.2, + "learning_rate": 3.1128787197285376e-06, + "loss": 2.2387, + "step": 8365 + }, + { + "epoch": 4.2, + "learning_rate": 3.093852725278987e-06, + "loss": 2.3799, + "step": 8370 + }, + { + "epoch": 4.2, + "learning_rate": 3.07488121708841e-06, + "loss": 2.5895, + "step": 8375 + }, + { + "epoch": 4.21, + "learning_rate": 3.0559642423438616e-06, + "loss": 2.5519, + "step": 8380 + }, + { + "epoch": 4.21, + "learning_rate": 3.037101848096757e-06, + "loss": 2.3567, + "step": 8385 + }, + { + "epoch": 4.21, + "learning_rate": 3.0182940812627488e-06, + "loss": 2.376, + "step": 8390 + }, + { + "epoch": 4.21, + "learning_rate": 2.9995409886216267e-06, + "loss": 2.5246, + "step": 8395 + }, + { + "epoch": 4.22, + "learning_rate": 2.980842616817184e-06, + "loss": 2.5203, + "step": 8400 + }, + { + "epoch": 4.22, + "learning_rate": 2.9621990123571057e-06, + "loss": 2.5897, + "step": 8405 + }, + { + "epoch": 4.22, + "learning_rate": 2.9436102216128747e-06, + "loss": 2.2612, + "step": 8410 + }, + { + "epoch": 4.22, + "learning_rate": 2.92507629081962e-06, + "loss": 2.4448, + "step": 8415 + }, + { + "epoch": 4.23, + "learning_rate": 2.906597266076022e-06, + "loss": 2.8432, + "step": 8420 + }, + { + "epoch": 4.23, + "learning_rate": 2.888173193344204e-06, + "loss": 2.8288, + "step": 8425 + }, + { + "epoch": 4.23, + "learning_rate": 2.8698041184496012e-06, + "loss": 2.5462, + "step": 8430 + }, + { + "epoch": 4.23, + "learning_rate": 2.8514900870808576e-06, + "loss": 2.5016, + "step": 8435 + }, + { + "epoch": 4.24, + "learning_rate": 2.833231144789711e-06, + "loss": 2.5918, + "step": 8440 + }, + { + "epoch": 4.24, + "learning_rate": 2.815027336990883e-06, + "loss": 2.57, + "step": 8445 + }, + { + "epoch": 4.24, + "learning_rate": 2.796878708961939e-06, + "loss": 2.6271, + "step": 8450 + }, + { + "epoch": 4.24, + "learning_rate": 2.778785305843232e-06, + "loss": 2.4207, + "step": 8455 + }, + { + "epoch": 4.25, + "learning_rate": 
2.7607471726377316e-06, + "loss": 2.5623, + "step": 8460 + }, + { + "epoch": 4.25, + "learning_rate": 2.74276435421095e-06, + "loss": 2.3429, + "step": 8465 + }, + { + "epoch": 4.25, + "learning_rate": 2.7248368952908053e-06, + "loss": 2.4522, + "step": 8470 + }, + { + "epoch": 4.25, + "learning_rate": 2.7069648404675326e-06, + "loss": 2.5546, + "step": 8475 + }, + { + "epoch": 4.26, + "learning_rate": 2.689148234193556e-06, + "loss": 2.9038, + "step": 8480 + }, + { + "epoch": 4.26, + "learning_rate": 2.6713871207833892e-06, + "loss": 2.5678, + "step": 8485 + }, + { + "epoch": 4.26, + "learning_rate": 2.6536815444135194e-06, + "loss": 2.6918, + "step": 8490 + }, + { + "epoch": 4.26, + "learning_rate": 2.636031549122295e-06, + "loss": 2.5969, + "step": 8495 + }, + { + "epoch": 4.27, + "learning_rate": 2.6184371788098268e-06, + "loss": 2.5104, + "step": 8500 + }, + { + "epoch": 4.27, + "learning_rate": 2.6008984772378674e-06, + "loss": 2.5368, + "step": 8505 + }, + { + "epoch": 4.27, + "learning_rate": 2.583415488029706e-06, + "loss": 2.6573, + "step": 8510 + }, + { + "epoch": 4.27, + "learning_rate": 2.5659882546700654e-06, + "loss": 2.1869, + "step": 8515 + }, + { + "epoch": 4.28, + "learning_rate": 2.548616820504984e-06, + "loss": 2.4897, + "step": 8520 + }, + { + "epoch": 4.28, + "learning_rate": 2.531301228741717e-06, + "loss": 2.3068, + "step": 8525 + }, + { + "epoch": 4.28, + "learning_rate": 2.5140415224486234e-06, + "loss": 2.4327, + "step": 8530 + }, + { + "epoch": 4.28, + "learning_rate": 2.4968377445550667e-06, + "loss": 2.5269, + "step": 8535 + }, + { + "epoch": 4.29, + "learning_rate": 2.4796899378512816e-06, + "loss": 2.3768, + "step": 8540 + }, + { + "epoch": 4.29, + "learning_rate": 2.4625981449883125e-06, + "loss": 2.4458, + "step": 8545 + }, + { + "epoch": 4.29, + "learning_rate": 2.4455624084778692e-06, + "loss": 2.4402, + "step": 8550 + }, + { + "epoch": 4.29, + "learning_rate": 2.4285827706922363e-06, + "loss": 2.5712, + "step": 8555 + }, + { + 
"epoch": 4.3, + "learning_rate": 2.41165927386417e-06, + "loss": 2.4516, + "step": 8560 + }, + { + "epoch": 4.3, + "learning_rate": 2.394791960086795e-06, + "loss": 2.4824, + "step": 8565 + }, + { + "epoch": 4.3, + "learning_rate": 2.377980871313479e-06, + "loss": 2.4295, + "step": 8570 + }, + { + "epoch": 4.3, + "learning_rate": 2.3612260493577536e-06, + "loss": 2.5701, + "step": 8575 + }, + { + "epoch": 4.31, + "learning_rate": 2.3445275358931986e-06, + "loss": 2.5912, + "step": 8580 + }, + { + "epoch": 4.31, + "learning_rate": 2.3278853724533393e-06, + "loss": 2.4209, + "step": 8585 + }, + { + "epoch": 4.31, + "learning_rate": 2.3112996004315464e-06, + "loss": 2.3497, + "step": 8590 + }, + { + "epoch": 4.31, + "learning_rate": 2.2947702610809226e-06, + "loss": 2.385, + "step": 8595 + }, + { + "epoch": 4.32, + "learning_rate": 2.27829739551422e-06, + "loss": 2.5355, + "step": 8600 + }, + { + "epoch": 4.32, + "learning_rate": 2.2618810447037147e-06, + "loss": 2.6385, + "step": 8605 + }, + { + "epoch": 4.32, + "learning_rate": 2.245521249481125e-06, + "loss": 2.2516, + "step": 8610 + }, + { + "epoch": 4.32, + "learning_rate": 2.2292180505374905e-06, + "loss": 2.6147, + "step": 8615 + }, + { + "epoch": 4.33, + "learning_rate": 2.2129714884230935e-06, + "loss": 2.4443, + "step": 8620 + }, + { + "epoch": 4.33, + "learning_rate": 2.196781603547343e-06, + "loss": 2.2799, + "step": 8625 + }, + { + "epoch": 4.33, + "learning_rate": 2.1806484361786645e-06, + "loss": 2.3063, + "step": 8630 + }, + { + "epoch": 4.33, + "learning_rate": 2.1645720264444254e-06, + "loss": 2.7512, + "step": 8635 + }, + { + "epoch": 4.34, + "learning_rate": 2.1485524143308176e-06, + "loss": 2.4819, + "step": 8640 + }, + { + "epoch": 4.34, + "learning_rate": 2.132589639682761e-06, + "loss": 2.3846, + "step": 8645 + }, + { + "epoch": 4.34, + "learning_rate": 2.116683742203815e-06, + "loss": 2.5348, + "step": 8650 + }, + { + "epoch": 4.34, + "learning_rate": 2.1008347614560583e-06, + "loss": 2.6769, 
+ "step": 8655 + }, + { + "epoch": 4.35, + "learning_rate": 2.08504273686001e-06, + "loss": 2.6296, + "step": 8660 + }, + { + "epoch": 4.35, + "learning_rate": 2.0693077076945204e-06, + "loss": 2.5037, + "step": 8665 + }, + { + "epoch": 4.35, + "learning_rate": 2.0536297130966886e-06, + "loss": 2.5987, + "step": 8670 + }, + { + "epoch": 4.35, + "learning_rate": 2.038008792061738e-06, + "loss": 2.1922, + "step": 8675 + }, + { + "epoch": 4.36, + "learning_rate": 2.0224449834429475e-06, + "loss": 2.3884, + "step": 8680 + }, + { + "epoch": 4.36, + "learning_rate": 2.006938325951538e-06, + "loss": 2.3341, + "step": 8685 + }, + { + "epoch": 4.36, + "learning_rate": 1.991488858156576e-06, + "loss": 2.5546, + "step": 8690 + }, + { + "epoch": 4.36, + "learning_rate": 1.976096618484893e-06, + "loss": 2.6646, + "step": 8695 + }, + { + "epoch": 4.37, + "learning_rate": 1.960761645220968e-06, + "loss": 2.3178, + "step": 8700 + }, + { + "epoch": 4.37, + "learning_rate": 1.945483976506854e-06, + "loss": 2.4587, + "step": 8705 + }, + { + "epoch": 4.37, + "learning_rate": 1.930263650342065e-06, + "loss": 2.47, + "step": 8710 + }, + { + "epoch": 4.37, + "learning_rate": 1.915100704583492e-06, + "loss": 2.5696, + "step": 8715 + }, + { + "epoch": 4.38, + "learning_rate": 1.89999517694531e-06, + "loss": 2.3695, + "step": 8720 + }, + { + "epoch": 4.38, + "learning_rate": 1.8849471049988716e-06, + "loss": 2.7141, + "step": 8725 + }, + { + "epoch": 4.38, + "learning_rate": 1.8699565261726282e-06, + "loss": 2.2626, + "step": 8730 + }, + { + "epoch": 4.38, + "learning_rate": 1.8550234777520327e-06, + "loss": 2.5917, + "step": 8735 + }, + { + "epoch": 4.39, + "learning_rate": 1.8401479968794404e-06, + "loss": 2.5758, + "step": 8740 + }, + { + "epoch": 4.39, + "learning_rate": 1.8253301205540223e-06, + "loss": 2.3528, + "step": 8745 + }, + { + "epoch": 4.39, + "learning_rate": 1.8105698856316744e-06, + "loss": 2.7399, + "step": 8750 + }, + { + "epoch": 4.39, + "learning_rate": 
1.7958673288249183e-06, + "loss": 2.5856, + "step": 8755 + }, + { + "epoch": 4.4, + "learning_rate": 1.7812224867028243e-06, + "loss": 2.4043, + "step": 8760 + }, + { + "epoch": 4.4, + "learning_rate": 1.7666353956908993e-06, + "loss": 2.7552, + "step": 8765 + }, + { + "epoch": 4.4, + "learning_rate": 1.7521060920710152e-06, + "loss": 2.4064, + "step": 8770 + }, + { + "epoch": 4.4, + "learning_rate": 1.7376346119813203e-06, + "loss": 2.3987, + "step": 8775 + }, + { + "epoch": 4.41, + "learning_rate": 1.7232209914161245e-06, + "loss": 2.2554, + "step": 8780 + }, + { + "epoch": 4.41, + "learning_rate": 1.7088652662258313e-06, + "loss": 2.4382, + "step": 8785 + }, + { + "epoch": 4.41, + "learning_rate": 1.694567472116851e-06, + "loss": 2.3647, + "step": 8790 + }, + { + "epoch": 4.41, + "learning_rate": 1.680327644651497e-06, + "loss": 2.3458, + "step": 8795 + }, + { + "epoch": 4.42, + "learning_rate": 1.666145819247908e-06, + "loss": 2.4937, + "step": 8800 + }, + { + "epoch": 4.42, + "learning_rate": 1.6520220311799477e-06, + "loss": 2.4006, + "step": 8805 + }, + { + "epoch": 4.42, + "learning_rate": 1.6379563155771382e-06, + "loss": 2.6438, + "step": 8810 + }, + { + "epoch": 4.42, + "learning_rate": 1.6239487074245524e-06, + "loss": 2.5403, + "step": 8815 + }, + { + "epoch": 4.43, + "learning_rate": 1.609999241562743e-06, + "loss": 2.296, + "step": 8820 + }, + { + "epoch": 4.43, + "learning_rate": 1.5961079526876422e-06, + "loss": 2.5552, + "step": 8825 + }, + { + "epoch": 4.43, + "learning_rate": 1.5822748753504757e-06, + "loss": 2.4254, + "step": 8830 + }, + { + "epoch": 4.43, + "learning_rate": 1.5685000439576935e-06, + "loss": 2.3813, + "step": 8835 + }, + { + "epoch": 4.44, + "learning_rate": 1.5547834927708643e-06, + "loss": 2.4311, + "step": 8840 + }, + { + "epoch": 4.44, + "learning_rate": 1.541125255906603e-06, + "loss": 2.7451, + "step": 8845 + }, + { + "epoch": 4.44, + "learning_rate": 1.5275253673364798e-06, + "loss": 2.5164, + "step": 8850 + }, + { + 
"epoch": 4.44, + "learning_rate": 1.5139838608869362e-06, + "loss": 2.3238, + "step": 8855 + }, + { + "epoch": 4.45, + "learning_rate": 1.500500770239205e-06, + "loss": 2.589, + "step": 8860 + }, + { + "epoch": 4.45, + "learning_rate": 1.487076128929224e-06, + "loss": 2.4534, + "step": 8865 + }, + { + "epoch": 4.45, + "learning_rate": 1.4737099703475532e-06, + "loss": 2.2319, + "step": 8870 + }, + { + "epoch": 4.45, + "learning_rate": 1.4604023277392797e-06, + "loss": 2.5679, + "step": 8875 + }, + { + "epoch": 4.46, + "learning_rate": 1.4471532342039684e-06, + "loss": 2.5104, + "step": 8880 + }, + { + "epoch": 4.46, + "learning_rate": 1.4339627226955392e-06, + "loss": 2.2751, + "step": 8885 + }, + { + "epoch": 4.46, + "learning_rate": 1.4208308260222097e-06, + "loss": 2.6708, + "step": 8890 + }, + { + "epoch": 4.46, + "learning_rate": 1.4077575768464025e-06, + "loss": 2.4125, + "step": 8895 + }, + { + "epoch": 4.47, + "learning_rate": 1.3947430076846762e-06, + "loss": 2.2262, + "step": 8900 + }, + { + "epoch": 4.47, + "learning_rate": 1.381787150907629e-06, + "loss": 2.3416, + "step": 8905 + }, + { + "epoch": 4.47, + "learning_rate": 1.3688900387398367e-06, + "loss": 2.7089, + "step": 8910 + }, + { + "epoch": 4.47, + "learning_rate": 1.3560517032597503e-06, + "loss": 2.7203, + "step": 8915 + }, + { + "epoch": 4.48, + "learning_rate": 1.3432721763996348e-06, + "loss": 2.4658, + "step": 8920 + }, + { + "epoch": 4.48, + "learning_rate": 1.3305514899454835e-06, + "loss": 2.4292, + "step": 8925 + }, + { + "epoch": 4.48, + "learning_rate": 1.3178896755369374e-06, + "loss": 2.4312, + "step": 8930 + }, + { + "epoch": 4.48, + "learning_rate": 1.3052867646672045e-06, + "loss": 2.5501, + "step": 8935 + }, + { + "epoch": 4.49, + "learning_rate": 1.292742788682988e-06, + "loss": 2.4859, + "step": 8940 + }, + { + "epoch": 4.49, + "learning_rate": 1.2802577787843995e-06, + "loss": 2.4551, + "step": 8945 + }, + { + "epoch": 4.49, + "learning_rate": 1.2678317660248962e-06, + 
"loss": 2.4382, + "step": 8950 + }, + { + "epoch": 4.49, + "learning_rate": 1.2554647813111858e-06, + "loss": 2.3653, + "step": 8955 + }, + { + "epoch": 4.5, + "learning_rate": 1.2431568554031603e-06, + "loss": 2.532, + "step": 8960 + }, + { + "epoch": 4.5, + "learning_rate": 1.230908018913815e-06, + "loss": 2.6281, + "step": 8965 + }, + { + "epoch": 4.5, + "learning_rate": 1.2187183023091825e-06, + "loss": 2.5645, + "step": 8970 + }, + { + "epoch": 4.5, + "learning_rate": 1.2065877359082405e-06, + "loss": 2.6149, + "step": 8975 + }, + { + "epoch": 4.51, + "learning_rate": 1.1945163498828488e-06, + "loss": 2.6147, + "step": 8980 + }, + { + "epoch": 4.51, + "learning_rate": 1.1825041742576676e-06, + "loss": 2.2286, + "step": 8985 + }, + { + "epoch": 4.51, + "learning_rate": 1.1705512389100865e-06, + "loss": 2.4426, + "step": 8990 + }, + { + "epoch": 4.51, + "learning_rate": 1.1586575735701544e-06, + "loss": 2.3299, + "step": 8995 + }, + { + "epoch": 4.52, + "learning_rate": 1.146823207820491e-06, + "loss": 2.2959, + "step": 9000 + } + ], + "logging_steps": 5, + "max_steps": 9960, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 4.7652159131163034e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-9000/training_args.bin b/checkpoint-9000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0044b546a35784411b4a5d133649574611e7334 --- /dev/null +++ b/checkpoint-9000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939ee45e9035366dc4952c5158e7d3a0d3426acdbfb5014d53ad1e260b19a19f +size 4475