"""Continue LoRA fine-tuning of an 8-bit LLaMA-7B model (finetune.py).

The script loads the base model in int8, attaches a previously trained LoRA
adapter, tokenizes prompt/response pairs from a JSON dataset and trains with
the Hugging Face ``Trainer``. ``state_dict`` is overridden further down so
that the saved state is the PEFT adapter state dict rather than the full
model.
"""

import os
import sys

import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from peft import PeftModel
import wandb


# Fail early with an actionable message when the installed transformers build
# has no LLaMA support. An explicit raise (unlike ``assert``) also fires under
# ``python -O``.
if "LlamaTokenizer" not in transformers._import_structure["models.llama"]:
    raise ImportError(
        "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
    )
from transformers import LlamaForCausalLM, LlamaTokenizer
from peft import (
    prepare_model_for_int8_training,
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
)


# --- Hyper-parameters --------------------------------------------------------
# NOTE(review): these values were presumably tuned for the author's GPUs;
# lower MICRO_BATCH_SIZE if the run does not fit in memory.
MICRO_BATCH_SIZE = 64  # per-device batch size of one forward/backward pass
BATCH_SIZE = 128  # effective batch size after gradient accumulation
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 2
LEARNING_RATE = 3e-4
# Every example is padded/truncated to exactly this many tokens.
# NOTE(review): upstream claimed 256 covers ~96% of the Alpaca data; that has
# not been re-checked for the dataset used here.
CUTOFF_LEN = 256
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
VAL_SET_SIZE = 2000  # examples held out for evaluation; 0 disables evaluation
TARGET_MODULES = [
    "q_proj",
    "v_proj",
]
# The script previously also assigned "alpaca_data.json" here, but that value
# was overwritten immediately and never used.
DATA_PATH = "belle_open_source_1M.train.json"
OUTPUT_DIR = "lora-alpaca"
BASE_MODEL_PATH = "/ndk/ai-repos/train-llama/models/7b_hf"
RESUME_ADAPTER_PATH = "./lora-alpaca-cn-remote"  # adapter to continue training

# --- Device placement / distributed setup -----------------------------------
device_map = "auto"
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if ddp:
    # Under DDP each process owns one GPU, selected by LOCAL_RANK.
    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
    # Spread the accumulation over the processes, but never go below 1:
    # with more processes than accumulation steps the integer division would
    # yield 0, which is not a valid Trainer setting.
    GRADIENT_ACCUMULATION_STEPS = max(
        1, GRADIENT_ACCUMULATION_STEPS // world_size
    )

# --- Model and tokenizer -----------------------------------------------------
model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    load_in_8bit=True,
    device_map=device_map,
)
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL_PATH, add_eos_token=True)

model = prepare_model_for_int8_training(model)

# LoRA settings for a run that starts from scratch. They are NOT used when an
# existing adapter is loaded below (the adapter directory carries its own
# config); to start fresh, use ``get_peft_model(model, config)`` instead of
# ``PeftModel.from_pretrained``.
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)

# Resume from a previously trained adapter. ``is_trainable=True`` is required
# on PEFT releases that support it: otherwise the adapter is loaded frozen
# (inference mode) and training would not update any weights.
model = PeftModel.from_pretrained(
    model,
    RESUME_ADAPTER_PATH,
    torch_dtype=torch.float16,
    is_trainable=True,
)


tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token
data = load_dataset("json", data_files=DATA_PATH)


def generate_prompt(data_point):
    """Render one dataset record as the full training text.

    Args:
        data_point: Mapping with an ``"input"`` (instruction) entry and a
            ``"target"`` (expected response) entry.

    Returns:
        The Chinese instruction template with both fields filled in.
    """
    # Template, roughly: "Below is an instruction that describes a task.
    # Write a response that appropriately completes the request.
    # ### Input: ... ### Output: ..."
    return f"""以下是描述任务的说明。 编写适当地完成请求的响应。
### 输入:
{data_point["input"]}

### 输出:
{data_point["target"]}"""


def tokenize(prompt):
    """Tokenize ``prompt`` into exactly ``CUTOFF_LEN`` ids plus attention mask.

    The text is padded/truncated to ``CUTOFF_LEN + 1`` tokens and the last
    position is then dropped, so every example has the same length.

    Returns:
        Dict with ``"input_ids"`` and ``"attention_mask"`` lists.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN + 1,
        padding="max_length",
    )
    return {
        "input_ids": result["input_ids"][:-1],
        "attention_mask": result["attention_mask"][:-1],
    }


def generate_and_tokenize_prompt(data_point):
    """Build the prompt for ``data_point`` and tokenize it (for ``Dataset.map``)."""
    prompt = generate_prompt(data_point)
    return tokenize(prompt)


# --- Dataset preparation -----------------------------------------------------
if VAL_SET_SIZE > 0:
    # Fixed seed keeps the train/validation membership stable across runs.
    train_val = data["train"].train_test_split(
        test_size=VAL_SET_SIZE, shuffle=True, seed=42
    )
    train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt)
    val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt)
else:
    train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
    val_data = None


wandb.init(project="llama-lora")

# --- Training ----------------------------------------------------------------
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=100,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        fp16=True,
        logging_steps=20,
        evaluation_strategy="steps" if VAL_SET_SIZE > 0 else "no",
        save_strategy="steps",
        eval_steps=200 if VAL_SET_SIZE > 0 else None,
        save_steps=200,
        output_dir=OUTPUT_DIR,
        save_total_limit=3,
        # Picking the best checkpoint needs an evaluation metric.
        load_best_model_at_end=VAL_SET_SIZE > 0,
        ddp_find_unused_parameters=False if ddp else None,
    ),
    # mlm=False selects causal-LM collation (labels derived from input_ids).
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False

# Rebind ``state_dict`` on this instance so that anything asking the model for
# its state receives the PEFT adapter state dict computed from the original
# full state dict.
old_state_dict = model.state_dict
model.state_dict = (
    lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
).__get__(model, type(model))

if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)

trainer.train()

model.save_pretrained(OUTPUT_DIR)

print("\n If there's a warning about missing keys above, please disregard :)")
https://git-lfs.github.com/spec/v1 +oid sha256:2948f6764ef3ff5e051d05bbb765f7104f8add4f66f9bd68bc14c3a100b6478d +size 16822989 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/optimizer.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1590628e5d0dd434a7842afd73d36dc2e842db9 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d129add58d64413cc002d22d05022822a46a7898208682a5f2510de08324fcb +size 33629893 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/pytorch_model.bin b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..9ebba33f0108bba8420a0779dcdf1cf61f5b4b8f --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f0f5cec0e1f2454b1f8855ee479675477cb79999a84192db56aa3a57a1cb468 +size 16822989 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/rng_state_0.pth b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..fcbf9f87f48dddbb9b271600cdd8d03af2075cf4 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:208b8872c6017de913817448486f5d4838856d2ab3b06abdd852908b8a981947 +size 14583 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/rng_state_1.pth 
b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..95dcf5a394033eb69144fe8c59bb02e68fa58f44 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d06cca3f3250b22a4e1c303e21fb8d0c7c535b0e993559911d99dcdb93c41c3 +size 14583 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/scaler.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c6ad2d18ea05850dc3d1546345863a9e8627425 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9611887fb095d16dea29537243a8f5ead09cf63a435218000ff5a105864011d6 +size 557 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/scheduler.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ddf00d328aab2026a28b2b41ccecff6e5f1057e --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66a6506cbaa6f775027714b991992b3b6d3e97768b6aaa19b631f657d42b353b +size 627 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/trainer_state.json b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..45715acfd37a2732196e62c950f35b6e5b4fd6ab --- /dev/null +++ 
b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/trainer_state.json @@ -0,0 +1,4164 @@ +{ + "best_metric": 0.6372544765472412, + "best_model_checkpoint": "lora-alpaca-cn/checkpoint-12200", + "epoch": 2.884160756501182, + "global_step": 12200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7735, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.1358, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 0.9749, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00023999999999999998, + "loss": 0.9316, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 0.9072, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002995234312946783, + "loss": 0.8963, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002990468625893566, + "loss": 0.8853, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002985702938840349, + "loss": 0.8709, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029809372517871323, + "loss": 0.8555, + "step": 180 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029761715647339156, + "loss": 0.8584, + "step": 200 + }, + { + "epoch": 0.05, + "eval_loss": 0.8360834717750549, + "eval_runtime": 49.29, + "eval_samples_per_second": 40.576, + "eval_steps_per_second": 2.536, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002971405877680699, + "loss": 0.859, + "step": 220 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029666401906274816, + "loss": 0.8511, + "step": 240 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002961874503574265, + "loss": 0.8401, + "step": 260 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002957108816521048, + "loss": 0.8357, + "step": 280 + }, + { + "epoch": 0.07, + "learning_rate": 
0.00029523431294678314, + "loss": 0.8413, + "step": 300 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029475774424146147, + "loss": 0.8283, + "step": 320 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002942811755361398, + "loss": 0.8202, + "step": 340 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029380460683081807, + "loss": 0.8222, + "step": 360 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002933280381254964, + "loss": 0.8178, + "step": 380 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002928514694201747, + "loss": 0.8177, + "step": 400 + }, + { + "epoch": 0.09, + "eval_loss": 0.7966175079345703, + "eval_runtime": 49.1752, + "eval_samples_per_second": 40.671, + "eval_steps_per_second": 2.542, + "step": 400 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029237490071485305, + "loss": 0.8057, + "step": 420 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002918983320095314, + "loss": 0.811, + "step": 440 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029142176330420965, + "loss": 0.8056, + "step": 460 + }, + { + "epoch": 0.11, + "learning_rate": 0.000290945194598888, + "loss": 0.7993, + "step": 480 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002904686258935663, + "loss": 0.7982, + "step": 500 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002899920571882446, + "loss": 0.8023, + "step": 520 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028951548848292296, + "loss": 0.7968, + "step": 540 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028903891977760123, + "loss": 0.8029, + "step": 560 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028856235107227956, + "loss": 0.7892, + "step": 580 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002880857823669579, + "loss": 0.7946, + "step": 600 + }, + { + "epoch": 0.14, + "eval_loss": 0.7735009789466858, + "eval_runtime": 49.3305, + "eval_samples_per_second": 40.543, + "eval_steps_per_second": 2.534, + "step": 600 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028760921366163616, + "loss": 0.782, + 
"step": 620 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002871326449563145, + "loss": 0.7799, + "step": 640 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002866560762509928, + "loss": 0.7782, + "step": 660 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028617950754567114, + "loss": 0.7785, + "step": 680 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028570293884034947, + "loss": 0.785, + "step": 700 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002852263701350278, + "loss": 0.7754, + "step": 720 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028474980142970607, + "loss": 0.7804, + "step": 740 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002842732327243844, + "loss": 0.7696, + "step": 760 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002837966640190627, + "loss": 0.7692, + "step": 780 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028332009531374105, + "loss": 0.7752, + "step": 800 + }, + { + "epoch": 0.19, + "eval_loss": 0.7564254403114319, + "eval_runtime": 49.106, + "eval_samples_per_second": 40.728, + "eval_steps_per_second": 2.546, + "step": 800 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002828435266084194, + "loss": 0.7698, + "step": 820 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028236695790309765, + "loss": 0.7699, + "step": 840 + }, + { + "epoch": 0.2, + "learning_rate": 0.000281890389197776, + "loss": 0.7718, + "step": 860 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002814138204924543, + "loss": 0.7644, + "step": 880 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028093725178713263, + "loss": 0.7659, + "step": 900 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028046068308181096, + "loss": 0.7641, + "step": 920 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027998411437648923, + "loss": 0.7535, + "step": 940 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027950754567116756, + "loss": 0.7672, + "step": 960 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002790309769658459, + "loss": 0.7563, + "step": 980 + }, + { + 
"epoch": 0.24, + "learning_rate": 0.0002785544082605242, + "loss": 0.752, + "step": 1000 + }, + { + "epoch": 0.24, + "eval_loss": 0.7433652281761169, + "eval_runtime": 48.9945, + "eval_samples_per_second": 40.821, + "eval_steps_per_second": 2.551, + "step": 1000 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027807783955520254, + "loss": 0.755, + "step": 1020 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027760127084988087, + "loss": 0.7563, + "step": 1040 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027712470214455914, + "loss": 0.7475, + "step": 1060 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027664813343923747, + "loss": 0.7599, + "step": 1080 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002761715647339158, + "loss": 0.7533, + "step": 1100 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027569499602859407, + "loss": 0.7488, + "step": 1120 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027521842732327245, + "loss": 0.753, + "step": 1140 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002747418586179507, + "loss": 0.7435, + "step": 1160 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027426528991262905, + "loss": 0.7457, + "step": 1180 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002737887212073074, + "loss": 0.742, + "step": 1200 + }, + { + "epoch": 0.28, + "eval_loss": 0.7321739792823792, + "eval_runtime": 48.8876, + "eval_samples_per_second": 40.91, + "eval_steps_per_second": 2.557, + "step": 1200 + }, + { + "epoch": 0.29, + "learning_rate": 0.00027331215250198565, + "loss": 0.7474, + "step": 1220 + }, + { + "epoch": 0.29, + "learning_rate": 0.000272835583796664, + "loss": 0.7456, + "step": 1240 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002723590150913423, + "loss": 0.7406, + "step": 1260 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027188244638602063, + "loss": 0.7448, + "step": 1280 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027140587768069896, + "loss": 0.7445, + "step": 1300 + }, + { + "epoch": 0.31, + "learning_rate": 
0.00027092930897537723, + "loss": 0.7349, + "step": 1320 + }, + { + "epoch": 0.32, + "learning_rate": 0.00027045274027005556, + "loss": 0.7395, + "step": 1340 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002699761715647339, + "loss": 0.7382, + "step": 1360 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002694996028594122, + "loss": 0.7357, + "step": 1380 + }, + { + "epoch": 0.33, + "learning_rate": 0.00026902303415409054, + "loss": 0.7409, + "step": 1400 + }, + { + "epoch": 0.33, + "eval_loss": 0.7235888242721558, + "eval_runtime": 49.2145, + "eval_samples_per_second": 40.638, + "eval_steps_per_second": 2.54, + "step": 1400 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026854646544876887, + "loss": 0.7376, + "step": 1420 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026806989674344714, + "loss": 0.7298, + "step": 1440 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026759332803812547, + "loss": 0.7379, + "step": 1460 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002671167593328038, + "loss": 0.7354, + "step": 1480 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002666401906274821, + "loss": 0.7341, + "step": 1500 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026616362192216045, + "loss": 0.7352, + "step": 1520 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002656870532168387, + "loss": 0.7321, + "step": 1540 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026521048451151705, + "loss": 0.7285, + "step": 1560 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002647339158061954, + "loss": 0.73, + "step": 1580 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026425734710087365, + "loss": 0.7304, + "step": 1600 + }, + { + "epoch": 0.38, + "eval_loss": 0.716058611869812, + "eval_runtime": 48.9201, + "eval_samples_per_second": 40.883, + "eval_steps_per_second": 2.555, + "step": 1600 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026378077839555203, + "loss": 0.7314, + "step": 1620 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002633042096902303, + 
"loss": 0.7315, + "step": 1640 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026282764098490863, + "loss": 0.7239, + "step": 1660 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026235107227958696, + "loss": 0.73, + "step": 1680 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026187450357426523, + "loss": 0.7243, + "step": 1700 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026139793486894356, + "loss": 0.7199, + "step": 1720 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002609213661636219, + "loss": 0.7216, + "step": 1740 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002604447974583002, + "loss": 0.7358, + "step": 1760 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025996822875297854, + "loss": 0.7313, + "step": 1780 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025949166004765687, + "loss": 0.7236, + "step": 1800 + }, + { + "epoch": 0.43, + "eval_loss": 0.7097632884979248, + "eval_runtime": 49.4908, + "eval_samples_per_second": 40.412, + "eval_steps_per_second": 2.526, + "step": 1800 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025901509134233514, + "loss": 0.7282, + "step": 1820 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025853852263701347, + "loss": 0.7187, + "step": 1840 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002580619539316918, + "loss": 0.7303, + "step": 1860 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002575853852263701, + "loss": 0.724, + "step": 1880 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025710881652104845, + "loss": 0.7248, + "step": 1900 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002566322478157267, + "loss": 0.7195, + "step": 1920 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025615567911040505, + "loss": 0.7269, + "step": 1940 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002556791104050834, + "loss": 0.7209, + "step": 1960 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002552025416997617, + "loss": 0.7282, + "step": 1980 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025472597299444003, + 
"loss": 0.7195, + "step": 2000 + }, + { + "epoch": 0.47, + "eval_loss": 0.7037709355354309, + "eval_runtime": 49.7167, + "eval_samples_per_second": 40.228, + "eval_steps_per_second": 2.514, + "step": 2000 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002542494042891183, + "loss": 0.7229, + "step": 2020 + }, + { + "epoch": 0.48, + "learning_rate": 0.00025377283558379664, + "loss": 0.718, + "step": 2040 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025329626687847496, + "loss": 0.7223, + "step": 2060 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025281969817315324, + "loss": 0.7209, + "step": 2080 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002523431294678316, + "loss": 0.7151, + "step": 2100 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002518665607625099, + "loss": 0.7141, + "step": 2120 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002513899920571882, + "loss": 0.7084, + "step": 2140 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025091342335186654, + "loss": 0.7075, + "step": 2160 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025043685464654487, + "loss": 0.7133, + "step": 2180 + }, + { + "epoch": 0.52, + "learning_rate": 0.00024996028594122314, + "loss": 0.7092, + "step": 2200 + }, + { + "epoch": 0.52, + "eval_loss": 0.6989386677742004, + "eval_runtime": 49.2344, + "eval_samples_per_second": 40.622, + "eval_steps_per_second": 2.539, + "step": 2200 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002494837172359015, + "loss": 0.7178, + "step": 2220 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002490071485305798, + "loss": 0.7188, + "step": 2240 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002485305798252581, + "loss": 0.7161, + "step": 2260 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024805401111993645, + "loss": 0.7078, + "step": 2280 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002475774424146147, + "loss": 0.7, + "step": 2300 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024710087370929305, + "loss": 0.718, + "step": 2320 + 
}, + { + "epoch": 0.55, + "learning_rate": 0.0002466243050039714, + "loss": 0.7059, + "step": 2340 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002461477362986497, + "loss": 0.712, + "step": 2360 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024567116759332804, + "loss": 0.7116, + "step": 2380 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002451945988880063, + "loss": 0.6986, + "step": 2400 + }, + { + "epoch": 0.57, + "eval_loss": 0.6939737796783447, + "eval_runtime": 49.459, + "eval_samples_per_second": 40.438, + "eval_steps_per_second": 2.527, + "step": 2400 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024471803018268464, + "loss": 0.7168, + "step": 2420 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024424146147736296, + "loss": 0.7141, + "step": 2440 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024376489277204126, + "loss": 0.7095, + "step": 2460 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024328832406671962, + "loss": 0.7091, + "step": 2480 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024281175536139792, + "loss": 0.7015, + "step": 2500 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024233518665607622, + "loss": 0.7109, + "step": 2520 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024185861795075455, + "loss": 0.7086, + "step": 2540 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024138204924543285, + "loss": 0.7118, + "step": 2560 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024090548054011117, + "loss": 0.7033, + "step": 2580 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002404289118347895, + "loss": 0.7128, + "step": 2600 + }, + { + "epoch": 0.61, + "eval_loss": 0.6901652812957764, + "eval_runtime": 49.5038, + "eval_samples_per_second": 40.401, + "eval_steps_per_second": 2.525, + "step": 2600 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023995234312946783, + "loss": 0.6968, + "step": 2620 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023947577442414613, + "loss": 0.7109, + "step": 2640 + }, + { + "epoch": 0.63, + 
"learning_rate": 0.00023899920571882443, + "loss": 0.7048, + "step": 2660 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023852263701350276, + "loss": 0.7012, + "step": 2680 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023804606830818106, + "loss": 0.7065, + "step": 2700 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002375694996028594, + "loss": 0.7009, + "step": 2720 + }, + { + "epoch": 0.65, + "learning_rate": 0.0002370929308975377, + "loss": 0.7035, + "step": 2740 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023661636219221604, + "loss": 0.6973, + "step": 2760 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023613979348689434, + "loss": 0.7075, + "step": 2780 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023566322478157264, + "loss": 0.6952, + "step": 2800 + }, + { + "epoch": 0.66, + "eval_loss": 0.6865400671958923, + "eval_runtime": 49.2814, + "eval_samples_per_second": 40.583, + "eval_steps_per_second": 2.536, + "step": 2800 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023518665607625097, + "loss": 0.6979, + "step": 2820 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002347100873709293, + "loss": 0.6973, + "step": 2840 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023423351866560762, + "loss": 0.7033, + "step": 2860 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023375694996028592, + "loss": 0.6964, + "step": 2880 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023328038125496422, + "loss": 0.7052, + "step": 2900 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023280381254964255, + "loss": 0.6999, + "step": 2920 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023232724384432085, + "loss": 0.6963, + "step": 2940 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002318506751389992, + "loss": 0.7025, + "step": 2960 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002313741064336775, + "loss": 0.704, + "step": 2980 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023089753772835583, + "loss": 0.6926, + "step": 3000 + }, + { + "epoch": 0.71, 
+ "eval_loss": 0.6828380227088928, + "eval_runtime": 49.5667, + "eval_samples_per_second": 40.35, + "eval_steps_per_second": 2.522, + "step": 3000 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023042096902303413, + "loss": 0.698, + "step": 3020 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022994440031771243, + "loss": 0.6893, + "step": 3040 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022946783161239076, + "loss": 0.6938, + "step": 3060 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022899126290706908, + "loss": 0.6974, + "step": 3080 + }, + { + "epoch": 0.73, + "learning_rate": 0.0002285146942017474, + "loss": 0.6922, + "step": 3100 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002280381254964257, + "loss": 0.7073, + "step": 3120 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022756155679110404, + "loss": 0.6895, + "step": 3140 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022708498808578234, + "loss": 0.7012, + "step": 3160 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022660841938046064, + "loss": 0.6985, + "step": 3180 + }, + { + "epoch": 0.76, + "learning_rate": 0.000226131850675139, + "loss": 0.6901, + "step": 3200 + }, + { + "epoch": 0.76, + "eval_loss": 0.6807068586349487, + "eval_runtime": 49.2421, + "eval_samples_per_second": 40.616, + "eval_steps_per_second": 2.538, + "step": 3200 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002256552819698173, + "loss": 0.697, + "step": 3220 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022517871326449562, + "loss": 0.7002, + "step": 3240 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022470214455917392, + "loss": 0.6918, + "step": 3260 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022422557585385225, + "loss": 0.6999, + "step": 3280 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022374900714853055, + "loss": 0.6961, + "step": 3300 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002232724384432089, + "loss": 0.6888, + "step": 3320 + }, + { + "epoch": 0.79, + "learning_rate": 
0.0002227958697378872, + "loss": 0.695, + "step": 3340 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002223193010325655, + "loss": 0.6861, + "step": 3360 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022184273232724383, + "loss": 0.6864, + "step": 3380 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022136616362192213, + "loss": 0.6917, + "step": 3400 + }, + { + "epoch": 0.8, + "eval_loss": 0.6773961782455444, + "eval_runtime": 49.3961, + "eval_samples_per_second": 40.489, + "eval_steps_per_second": 2.531, + "step": 3400 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022088959491660043, + "loss": 0.679, + "step": 3420 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022041302621127879, + "loss": 0.6915, + "step": 3440 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021993645750595709, + "loss": 0.6937, + "step": 3460 + }, + { + "epoch": 0.82, + "learning_rate": 0.0002194598888006354, + "loss": 0.6831, + "step": 3480 + }, + { + "epoch": 0.83, + "learning_rate": 0.0002189833200953137, + "loss": 0.6875, + "step": 3500 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021850675138999204, + "loss": 0.6916, + "step": 3520 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021803018268467034, + "loss": 0.6896, + "step": 3540 + }, + { + "epoch": 0.84, + "learning_rate": 0.0002175536139793487, + "loss": 0.6986, + "step": 3560 + }, + { + "epoch": 0.85, + "learning_rate": 0.000217077045274027, + "loss": 0.693, + "step": 3580 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002166004765687053, + "loss": 0.6893, + "step": 3600 + }, + { + "epoch": 0.85, + "eval_loss": 0.6753410696983337, + "eval_runtime": 49.3307, + "eval_samples_per_second": 40.543, + "eval_steps_per_second": 2.534, + "step": 3600 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021612390786338362, + "loss": 0.6872, + "step": 3620 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021564733915806192, + "loss": 0.6862, + "step": 3640 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021517077045274025, + "loss": 
0.6943, + "step": 3660 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021469420174741858, + "loss": 0.6896, + "step": 3680 + }, + { + "epoch": 0.87, + "learning_rate": 0.0002142176330420969, + "loss": 0.6912, + "step": 3700 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002137410643367752, + "loss": 0.6859, + "step": 3720 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002132644956314535, + "loss": 0.6791, + "step": 3740 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021278792692613183, + "loss": 0.6882, + "step": 3760 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021231135822081013, + "loss": 0.6823, + "step": 3780 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002118347895154885, + "loss": 0.6831, + "step": 3800 + }, + { + "epoch": 0.9, + "eval_loss": 0.6738302707672119, + "eval_runtime": 49.4648, + "eval_samples_per_second": 40.433, + "eval_steps_per_second": 2.527, + "step": 3800 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002113582208101668, + "loss": 0.6818, + "step": 3820 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002108816521048451, + "loss": 0.6912, + "step": 3840 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021040508339952341, + "loss": 0.6884, + "step": 3860 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020992851469420171, + "loss": 0.6888, + "step": 3880 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020945194598888004, + "loss": 0.6822, + "step": 3900 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020897537728355837, + "loss": 0.6879, + "step": 3920 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002084988085782367, + "loss": 0.6771, + "step": 3940 + }, + { + "epoch": 0.94, + "learning_rate": 0.000208022239872915, + "loss": 0.684, + "step": 3960 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002075456711675933, + "loss": 0.6878, + "step": 3980 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020706910246227162, + "loss": 0.6913, + "step": 4000 + }, + { + "epoch": 0.95, + "eval_loss": 0.6711302995681763, + "eval_runtime": 
49.117, + "eval_samples_per_second": 40.719, + "eval_steps_per_second": 2.545, + "step": 4000 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020659253375694992, + "loss": 0.683, + "step": 4020 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020611596505162828, + "loss": 0.6833, + "step": 4040 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020563939634630658, + "loss": 0.6793, + "step": 4060 + }, + { + "epoch": 0.96, + "learning_rate": 0.0002051628276409849, + "loss": 0.6843, + "step": 4080 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002046862589356632, + "loss": 0.6822, + "step": 4100 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002042096902303415, + "loss": 0.6856, + "step": 4120 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020373312152501983, + "loss": 0.6809, + "step": 4140 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020325655281969816, + "loss": 0.6843, + "step": 4160 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002027799841143765, + "loss": 0.6754, + "step": 4180 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002023034154090548, + "loss": 0.6823, + "step": 4200 + }, + { + "epoch": 0.99, + "eval_loss": 0.6697036027908325, + "eval_runtime": 49.3237, + "eval_samples_per_second": 40.548, + "eval_steps_per_second": 2.534, + "step": 4200 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020182684670373312, + "loss": 0.6861, + "step": 4220 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020135027799841142, + "loss": 0.6806, + "step": 4240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020087370929308972, + "loss": 0.6823, + "step": 4260 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020039714058776807, + "loss": 0.6805, + "step": 4280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019992057188244637, + "loss": 0.6813, + "step": 4300 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001994440031771247, + "loss": 0.675, + "step": 4320 + }, + { + "epoch": 1.03, + "learning_rate": 0.000198967434471803, + "loss": 0.6728, + "step": 4340 + }, + { 
+ "epoch": 1.03, + "learning_rate": 0.0001984908657664813, + "loss": 0.6676, + "step": 4360 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019801429706115963, + "loss": 0.6729, + "step": 4380 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019753772835583795, + "loss": 0.685, + "step": 4400 + }, + { + "epoch": 1.04, + "eval_loss": 0.6667952537536621, + "eval_runtime": 49.3899, + "eval_samples_per_second": 40.494, + "eval_steps_per_second": 2.531, + "step": 4400 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019706115965051628, + "loss": 0.6786, + "step": 4420 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019658459094519458, + "loss": 0.6738, + "step": 4440 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001961080222398729, + "loss": 0.686, + "step": 4460 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001956314535345512, + "loss": 0.6818, + "step": 4480 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001951548848292295, + "loss": 0.6741, + "step": 4500 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019467831612390786, + "loss": 0.6756, + "step": 4520 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019420174741858616, + "loss": 0.6851, + "step": 4540 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001937251787132645, + "loss": 0.6784, + "step": 4560 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001932486100079428, + "loss": 0.6785, + "step": 4580 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019277204130262112, + "loss": 0.678, + "step": 4600 + }, + { + "epoch": 1.09, + "eval_loss": 0.6655837297439575, + "eval_runtime": 49.5019, + "eval_samples_per_second": 40.403, + "eval_steps_per_second": 2.525, + "step": 4600 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019229547259729942, + "loss": 0.6782, + "step": 4620 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019181890389197777, + "loss": 0.6683, + "step": 4640 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019134233518665607, + "loss": 0.6783, + "step": 4660 + }, + { + "epoch": 1.11, + 
"learning_rate": 0.00019086576648133437, + "loss": 0.675, + "step": 4680 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001903891977760127, + "loss": 0.6691, + "step": 4700 + }, + { + "epoch": 1.12, + "learning_rate": 0.000189912629070691, + "loss": 0.6726, + "step": 4720 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018943606036536933, + "loss": 0.68, + "step": 4740 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018895949166004763, + "loss": 0.6694, + "step": 4760 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018848292295472598, + "loss": 0.6686, + "step": 4780 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018800635424940428, + "loss": 0.6766, + "step": 4800 + }, + { + "epoch": 1.13, + "eval_loss": 0.6646501421928406, + "eval_runtime": 49.3188, + "eval_samples_per_second": 40.552, + "eval_steps_per_second": 2.535, + "step": 4800 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018752978554408258, + "loss": 0.6724, + "step": 4820 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001870532168387609, + "loss": 0.6801, + "step": 4840 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001865766481334392, + "loss": 0.6698, + "step": 4860 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001861000794281175, + "loss": 0.6723, + "step": 4880 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018562351072279586, + "loss": 0.6693, + "step": 4900 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018514694201747416, + "loss": 0.6716, + "step": 4920 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001846703733121525, + "loss": 0.674, + "step": 4940 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001841938046068308, + "loss": 0.6702, + "step": 4960 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018371723590150912, + "loss": 0.6716, + "step": 4980 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018324066719618742, + "loss": 0.672, + "step": 5000 + }, + { + "epoch": 1.18, + "eval_loss": 0.662735104560852, + "eval_runtime": 49.275, + "eval_samples_per_second": 40.589, + 
"eval_steps_per_second": 2.537, + "step": 5000 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018276409849086577, + "loss": 0.6701, + "step": 5020 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018228752978554407, + "loss": 0.6663, + "step": 5040 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018181096108022237, + "loss": 0.6651, + "step": 5060 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001813343923749007, + "loss": 0.6708, + "step": 5080 + }, + { + "epoch": 1.21, + "learning_rate": 0.000180857823669579, + "loss": 0.6697, + "step": 5100 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018038125496425733, + "loss": 0.662, + "step": 5120 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017990468625893566, + "loss": 0.669, + "step": 5140 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017942811755361398, + "loss": 0.6649, + "step": 5160 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017895154884829228, + "loss": 0.668, + "step": 5180 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017847498014297058, + "loss": 0.6796, + "step": 5200 + }, + { + "epoch": 1.23, + "eval_loss": 0.6609957218170166, + "eval_runtime": 49.2394, + "eval_samples_per_second": 40.618, + "eval_steps_per_second": 2.539, + "step": 5200 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001779984114376489, + "loss": 0.6745, + "step": 5220 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001775218427323272, + "loss": 0.6646, + "step": 5240 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017704527402700556, + "loss": 0.6682, + "step": 5260 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017656870532168386, + "loss": 0.6713, + "step": 5280 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017609213661636216, + "loss": 0.6618, + "step": 5300 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001756155679110405, + "loss": 0.6703, + "step": 5320 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001751389992057188, + "loss": 0.6652, + "step": 5340 + }, + { + "epoch": 1.27, + "learning_rate": 
0.00017466243050039712, + "loss": 0.6698, + "step": 5360 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017418586179507545, + "loss": 0.6728, + "step": 5380 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017370929308975377, + "loss": 0.6752, + "step": 5400 + }, + { + "epoch": 1.28, + "eval_loss": 0.6592395901679993, + "eval_runtime": 49.3426, + "eval_samples_per_second": 40.533, + "eval_steps_per_second": 2.533, + "step": 5400 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017323272438443207, + "loss": 0.6653, + "step": 5420 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017275615567911037, + "loss": 0.669, + "step": 5440 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001722795869737887, + "loss": 0.6698, + "step": 5460 + }, + { + "epoch": 1.3, + "learning_rate": 0.000171803018268467, + "loss": 0.6742, + "step": 5480 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017132644956314536, + "loss": 0.6596, + "step": 5500 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017084988085782366, + "loss": 0.6699, + "step": 5520 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017037331215250198, + "loss": 0.664, + "step": 5540 + }, + { + "epoch": 1.31, + "learning_rate": 0.00016989674344718028, + "loss": 0.6673, + "step": 5560 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016942017474185858, + "loss": 0.6684, + "step": 5580 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001689436060365369, + "loss": 0.6769, + "step": 5600 + }, + { + "epoch": 1.32, + "eval_loss": 0.6582754850387573, + "eval_runtime": 49.469, + "eval_samples_per_second": 40.429, + "eval_steps_per_second": 2.527, + "step": 5600 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016846703733121524, + "loss": 0.6633, + "step": 5620 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016799046862589357, + "loss": 0.6679, + "step": 5640 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016751389992057187, + "loss": 0.6601, + "step": 5660 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001670373312152502, + 
"loss": 0.6731, + "step": 5680 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001665607625099285, + "loss": 0.6638, + "step": 5700 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001660841938046068, + "loss": 0.6693, + "step": 5720 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016560762509928515, + "loss": 0.6642, + "step": 5740 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016513105639396345, + "loss": 0.6649, + "step": 5760 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016465448768864178, + "loss": 0.663, + "step": 5780 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016417791898332008, + "loss": 0.6629, + "step": 5800 + }, + { + "epoch": 1.37, + "eval_loss": 0.6574136018753052, + "eval_runtime": 49.3019, + "eval_samples_per_second": 40.566, + "eval_steps_per_second": 2.535, + "step": 5800 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016370135027799838, + "loss": 0.6605, + "step": 5820 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001632247815726767, + "loss": 0.6707, + "step": 5840 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016274821286735503, + "loss": 0.6695, + "step": 5860 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016227164416203336, + "loss": 0.6647, + "step": 5880 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016179507545671166, + "loss": 0.6657, + "step": 5900 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016131850675138999, + "loss": 0.6656, + "step": 5920 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016084193804606829, + "loss": 0.6676, + "step": 5940 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016036536934074659, + "loss": 0.6678, + "step": 5960 + }, + { + "epoch": 1.41, + "learning_rate": 0.00015988880063542494, + "loss": 0.6639, + "step": 5980 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015941223193010324, + "loss": 0.6645, + "step": 6000 + }, + { + "epoch": 1.42, + "eval_loss": 0.656126081943512, + "eval_runtime": 49.5095, + "eval_samples_per_second": 40.396, + "eval_steps_per_second": 2.525, + "step": 
6000 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015893566322478157, + "loss": 0.6672, + "step": 6020 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015845909451945987, + "loss": 0.6678, + "step": 6040 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001579825258141382, + "loss": 0.6676, + "step": 6060 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001575059571088165, + "loss": 0.6717, + "step": 6080 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015702938840349485, + "loss": 0.671, + "step": 6100 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015655281969817315, + "loss": 0.6611, + "step": 6120 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015607625099285145, + "loss": 0.6606, + "step": 6140 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015559968228752978, + "loss": 0.6647, + "step": 6160 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015512311358220808, + "loss": 0.6652, + "step": 6180 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001546465448768864, + "loss": 0.6629, + "step": 6200 + }, + { + "epoch": 1.47, + "eval_loss": 0.6549723148345947, + "eval_runtime": 49.4871, + "eval_samples_per_second": 40.415, + "eval_steps_per_second": 2.526, + "step": 6200 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015416997617156473, + "loss": 0.6685, + "step": 6220 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015369340746624306, + "loss": 0.6578, + "step": 6240 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015321683876092136, + "loss": 0.6587, + "step": 6260 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015274027005559966, + "loss": 0.6655, + "step": 6280 + }, + { + "epoch": 1.49, + "learning_rate": 0.000152263701350278, + "loss": 0.6662, + "step": 6300 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001517871326449563, + "loss": 0.6648, + "step": 6320 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015131056393963464, + "loss": 0.6638, + "step": 6340 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015083399523431294, + "loss": 0.6614, + 
"step": 6360 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015035742652899124, + "loss": 0.6552, + "step": 6380 + }, + { + "epoch": 1.51, + "learning_rate": 0.00014988085782366957, + "loss": 0.6753, + "step": 6400 + }, + { + "epoch": 1.51, + "eval_loss": 0.6544620990753174, + "eval_runtime": 49.3242, + "eval_samples_per_second": 40.548, + "eval_steps_per_second": 2.534, + "step": 6400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014940428911834787, + "loss": 0.6588, + "step": 6420 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001489277204130262, + "loss": 0.6609, + "step": 6440 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014845115170770452, + "loss": 0.6565, + "step": 6460 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014797458300238282, + "loss": 0.6589, + "step": 6480 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014749801429706115, + "loss": 0.6585, + "step": 6500 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014702144559173945, + "loss": 0.6737, + "step": 6520 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014654487688641778, + "loss": 0.6554, + "step": 6540 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001460683081810961, + "loss": 0.6603, + "step": 6560 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001455917394757744, + "loss": 0.6647, + "step": 6580 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014511517077045273, + "loss": 0.6632, + "step": 6600 + }, + { + "epoch": 1.56, + "eval_loss": 0.6527110934257507, + "eval_runtime": 49.2622, + "eval_samples_per_second": 40.599, + "eval_steps_per_second": 2.537, + "step": 6600 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014463860206513106, + "loss": 0.6705, + "step": 6620 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014416203335980936, + "loss": 0.6703, + "step": 6640 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014368546465448766, + "loss": 0.6602, + "step": 6660 + }, + { + "epoch": 1.58, + "learning_rate": 0.000143208895949166, + "loss": 0.6639, + "step": 6680 + }, + { + 
"epoch": 1.58, + "learning_rate": 0.00014273232724384432, + "loss": 0.6645, + "step": 6700 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014225575853852262, + "loss": 0.6655, + "step": 6720 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014177918983320094, + "loss": 0.664, + "step": 6740 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014130262112787927, + "loss": 0.6656, + "step": 6760 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014082605242255757, + "loss": 0.6658, + "step": 6780 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001403494837172359, + "loss": 0.6641, + "step": 6800 + }, + { + "epoch": 1.61, + "eval_loss": 0.6513609886169434, + "eval_runtime": 49.4424, + "eval_samples_per_second": 40.451, + "eval_steps_per_second": 2.528, + "step": 6800 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001398729150119142, + "loss": 0.6599, + "step": 6820 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013939634630659252, + "loss": 0.6552, + "step": 6840 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013891977760127085, + "loss": 0.6616, + "step": 6860 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013844320889594915, + "loss": 0.6635, + "step": 6880 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013796664019062745, + "loss": 0.6608, + "step": 6900 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013749007148530578, + "loss": 0.6596, + "step": 6920 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001370135027799841, + "loss": 0.6589, + "step": 6940 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001365369340746624, + "loss": 0.6627, + "step": 6960 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013606036536934073, + "loss": 0.6606, + "step": 6980 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013558379666401906, + "loss": 0.6658, + "step": 7000 + }, + { + "epoch": 1.65, + "eval_loss": 0.6510519981384277, + "eval_runtime": 49.5012, + "eval_samples_per_second": 40.403, + "eval_steps_per_second": 2.525, + "step": 7000 + }, + { + "epoch": 1.66, + 
"learning_rate": 0.00013510722795869736, + "loss": 0.6571, + "step": 7020 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001346306592533757, + "loss": 0.6607, + "step": 7040 + }, + { + "epoch": 1.67, + "learning_rate": 0.000134154090548054, + "loss": 0.6562, + "step": 7060 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013367752184273232, + "loss": 0.6582, + "step": 7080 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013320095313741064, + "loss": 0.6635, + "step": 7100 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013272438443208894, + "loss": 0.6682, + "step": 7120 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013224781572676727, + "loss": 0.6633, + "step": 7140 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001317712470214456, + "loss": 0.6671, + "step": 7160 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001312946783161239, + "loss": 0.6645, + "step": 7180 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001308181096108022, + "loss": 0.6699, + "step": 7200 + }, + { + "epoch": 1.7, + "eval_loss": 0.6502068042755127, + "eval_runtime": 49.4619, + "eval_samples_per_second": 40.435, + "eval_steps_per_second": 2.527, + "step": 7200 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013034154090548053, + "loss": 0.6617, + "step": 7220 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012986497220015885, + "loss": 0.6639, + "step": 7240 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012938840349483715, + "loss": 0.6634, + "step": 7260 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012891183478951548, + "loss": 0.663, + "step": 7280 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012843526608419378, + "loss": 0.6653, + "step": 7300 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001279586973788721, + "loss": 0.6555, + "step": 7320 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012748212867355044, + "loss": 0.6653, + "step": 7340 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012700555996822874, + "loss": 0.6573, + "step": 7360 + }, + { + "epoch": 1.74, + 
"learning_rate": 0.00012652899126290706, + "loss": 0.658, + "step": 7380 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001260524225575854, + "loss": 0.6562, + "step": 7400 + }, + { + "epoch": 1.75, + "eval_loss": 0.6491650342941284, + "eval_runtime": 49.2463, + "eval_samples_per_second": 40.612, + "eval_steps_per_second": 2.538, + "step": 7400 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001255758538522637, + "loss": 0.6592, + "step": 7420 + }, + { + "epoch": 1.76, + "learning_rate": 0.000125099285146942, + "loss": 0.6587, + "step": 7440 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012462271644162032, + "loss": 0.6616, + "step": 7460 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012414614773629865, + "loss": 0.655, + "step": 7480 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012366957903097695, + "loss": 0.6591, + "step": 7500 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012319301032565527, + "loss": 0.6545, + "step": 7520 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001227164416203336, + "loss": 0.6673, + "step": 7540 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001222398729150119, + "loss": 0.6626, + "step": 7560 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012176330420969023, + "loss": 0.6663, + "step": 7580 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012128673550436854, + "loss": 0.6643, + "step": 7600 + }, + { + "epoch": 1.8, + "eval_loss": 0.6482685804367065, + "eval_runtime": 49.3591, + "eval_samples_per_second": 40.519, + "eval_steps_per_second": 2.532, + "step": 7600 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012081016679904685, + "loss": 0.6623, + "step": 7620 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012033359809372518, + "loss": 0.6636, + "step": 7640 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011985702938840348, + "loss": 0.6598, + "step": 7660 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001193804606830818, + "loss": 0.6521, + "step": 7680 + }, + { + "epoch": 1.82, + "learning_rate": 
0.00011890389197776012, + "loss": 0.664, + "step": 7700 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011842732327243844, + "loss": 0.6529, + "step": 7720 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011795075456711675, + "loss": 0.6622, + "step": 7740 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011747418586179508, + "loss": 0.6608, + "step": 7760 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011699761715647338, + "loss": 0.6556, + "step": 7780 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011652104845115169, + "loss": 0.6643, + "step": 7800 + }, + { + "epoch": 1.84, + "eval_loss": 0.6474015116691589, + "eval_runtime": 49.3608, + "eval_samples_per_second": 40.518, + "eval_steps_per_second": 2.532, + "step": 7800 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011604447974583002, + "loss": 0.6541, + "step": 7820 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011556791104050833, + "loss": 0.6614, + "step": 7840 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011509134233518665, + "loss": 0.6499, + "step": 7860 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011461477362986497, + "loss": 0.6563, + "step": 7880 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011413820492454327, + "loss": 0.6589, + "step": 7900 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011366163621922159, + "loss": 0.6544, + "step": 7920 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011318506751389992, + "loss": 0.6606, + "step": 7940 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011270849880857823, + "loss": 0.657, + "step": 7960 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011223193010325654, + "loss": 0.6608, + "step": 7980 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011175536139793487, + "loss": 0.6595, + "step": 8000 + }, + { + "epoch": 1.89, + "eval_loss": 0.6469079256057739, + "eval_runtime": 49.3012, + "eval_samples_per_second": 40.567, + "eval_steps_per_second": 2.535, + "step": 8000 + }, + { + "epoch": 1.9, + "learning_rate": 
0.00011127879269261318, + "loss": 0.6563, + "step": 8020 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011080222398729148, + "loss": 0.6602, + "step": 8040 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011032565528196981, + "loss": 0.6603, + "step": 8060 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010984908657664812, + "loss": 0.6495, + "step": 8080 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010937251787132644, + "loss": 0.6551, + "step": 8100 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010891977760127084, + "loss": 0.6497, + "step": 8120 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010844320889594917, + "loss": 0.6652, + "step": 8140 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010796664019062747, + "loss": 0.6497, + "step": 8160 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010749007148530578, + "loss": 0.6554, + "step": 8180 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010701350277998411, + "loss": 0.6563, + "step": 8200 + }, + { + "epoch": 1.94, + "eval_loss": 0.645990252494812, + "eval_runtime": 49.3957, + "eval_samples_per_second": 40.489, + "eval_steps_per_second": 2.531, + "step": 8200 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010653693407466242, + "loss": 0.6572, + "step": 8220 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010606036536934074, + "loss": 0.6563, + "step": 8240 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010558379666401906, + "loss": 0.6535, + "step": 8260 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010510722795869736, + "loss": 0.655, + "step": 8280 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010463065925337568, + "loss": 0.6554, + "step": 8300 + }, + { + "epoch": 1.97, + "learning_rate": 0.000104154090548054, + "loss": 0.6559, + "step": 8320 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010367752184273232, + "loss": 0.6522, + "step": 8340 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010320095313741063, + "loss": 0.6568, + "step": 8360 + }, + { + "epoch": 1.98, + 
"learning_rate": 0.00010272438443208896, + "loss": 0.6566, + "step": 8380 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010224781572676727, + "loss": 0.6496, + "step": 8400 + }, + { + "epoch": 1.99, + "eval_loss": 0.6457875967025757, + "eval_runtime": 49.0201, + "eval_samples_per_second": 40.8, + "eval_steps_per_second": 2.55, + "step": 8400 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010177124702144557, + "loss": 0.66, + "step": 8420 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001012946783161239, + "loss": 0.6457, + "step": 8440 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001008419380460683, + "loss": 0.6349, + "step": 8460 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010036536934074662, + "loss": 0.6545, + "step": 8480 + }, + { + "epoch": 2.01, + "learning_rate": 9.988880063542493e-05, + "loss": 0.6515, + "step": 8500 + }, + { + "epoch": 2.01, + "learning_rate": 9.941223193010326e-05, + "loss": 0.6459, + "step": 8520 + }, + { + "epoch": 2.02, + "learning_rate": 9.893566322478156e-05, + "loss": 0.6494, + "step": 8540 + }, + { + "epoch": 2.02, + "learning_rate": 9.845909451945987e-05, + "loss": 0.6608, + "step": 8560 + }, + { + "epoch": 2.03, + "learning_rate": 9.79825258141382e-05, + "loss": 0.6485, + "step": 8580 + }, + { + "epoch": 2.03, + "learning_rate": 9.750595710881651e-05, + "loss": 0.6461, + "step": 8600 + }, + { + "epoch": 2.03, + "eval_loss": 0.6450995802879333, + "eval_runtime": 49.2592, + "eval_samples_per_second": 40.602, + "eval_steps_per_second": 2.538, + "step": 8600 + }, + { + "epoch": 2.04, + "learning_rate": 9.702938840349483e-05, + "loss": 0.6523, + "step": 8620 + }, + { + "epoch": 2.04, + "learning_rate": 9.655281969817315e-05, + "loss": 0.6565, + "step": 8640 + }, + { + "epoch": 2.05, + "learning_rate": 9.607625099285145e-05, + "loss": 0.6541, + "step": 8660 + }, + { + "epoch": 2.05, + "learning_rate": 9.559968228752977e-05, + "loss": 0.6585, + "step": 8680 + }, + { + "epoch": 2.06, + "learning_rate": 9.51231135822081e-05, + 
"loss": 0.6531, + "step": 8700 + }, + { + "epoch": 2.06, + "learning_rate": 9.464654487688641e-05, + "loss": 0.6579, + "step": 8720 + }, + { + "epoch": 2.07, + "learning_rate": 9.416997617156472e-05, + "loss": 0.6438, + "step": 8740 + }, + { + "epoch": 2.07, + "learning_rate": 9.369340746624305e-05, + "loss": 0.6516, + "step": 8760 + }, + { + "epoch": 2.08, + "learning_rate": 9.321683876092136e-05, + "loss": 0.6576, + "step": 8780 + }, + { + "epoch": 2.08, + "learning_rate": 9.274027005559966e-05, + "loss": 0.6506, + "step": 8800 + }, + { + "epoch": 2.08, + "eval_loss": 0.6444578170776367, + "eval_runtime": 49.0631, + "eval_samples_per_second": 40.764, + "eval_steps_per_second": 2.548, + "step": 8800 + }, + { + "epoch": 2.09, + "learning_rate": 9.226370135027799e-05, + "loss": 0.6484, + "step": 8820 + }, + { + "epoch": 2.09, + "learning_rate": 9.17871326449563e-05, + "loss": 0.6566, + "step": 8840 + }, + { + "epoch": 2.09, + "learning_rate": 9.131056393963462e-05, + "loss": 0.6547, + "step": 8860 + }, + { + "epoch": 2.1, + "learning_rate": 9.083399523431295e-05, + "loss": 0.6532, + "step": 8880 + }, + { + "epoch": 2.1, + "learning_rate": 9.035742652899126e-05, + "loss": 0.6532, + "step": 8900 + }, + { + "epoch": 2.11, + "learning_rate": 8.988085782366956e-05, + "loss": 0.6479, + "step": 8920 + }, + { + "epoch": 2.11, + "learning_rate": 8.940428911834789e-05, + "loss": 0.6548, + "step": 8940 + }, + { + "epoch": 2.12, + "learning_rate": 8.89277204130262e-05, + "loss": 0.647, + "step": 8960 + }, + { + "epoch": 2.12, + "learning_rate": 8.845115170770452e-05, + "loss": 0.6478, + "step": 8980 + }, + { + "epoch": 2.13, + "learning_rate": 8.797458300238284e-05, + "loss": 0.6553, + "step": 9000 + }, + { + "epoch": 2.13, + "eval_loss": 0.6433074474334717, + "eval_runtime": 49.3831, + "eval_samples_per_second": 40.5, + "eval_steps_per_second": 2.531, + "step": 9000 + }, + { + "epoch": 2.13, + "learning_rate": 8.749801429706116e-05, + "loss": 0.6443, + "step": 9020 + }, + { + 
"epoch": 2.14, + "learning_rate": 8.702144559173947e-05, + "loss": 0.6518, + "step": 9040 + }, + { + "epoch": 2.14, + "learning_rate": 8.65448768864178e-05, + "loss": 0.6578, + "step": 9060 + }, + { + "epoch": 2.15, + "learning_rate": 8.60683081810961e-05, + "loss": 0.6472, + "step": 9080 + }, + { + "epoch": 2.15, + "learning_rate": 8.559173947577441e-05, + "loss": 0.6471, + "step": 9100 + }, + { + "epoch": 2.16, + "learning_rate": 8.511517077045274e-05, + "loss": 0.6482, + "step": 9120 + }, + { + "epoch": 2.16, + "learning_rate": 8.463860206513105e-05, + "loss": 0.6522, + "step": 9140 + }, + { + "epoch": 2.17, + "learning_rate": 8.416203335980937e-05, + "loss": 0.6584, + "step": 9160 + }, + { + "epoch": 2.17, + "learning_rate": 8.368546465448769e-05, + "loss": 0.6596, + "step": 9180 + }, + { + "epoch": 2.17, + "learning_rate": 8.320889594916599e-05, + "loss": 0.6581, + "step": 9200 + }, + { + "epoch": 2.17, + "eval_loss": 0.6426697969436646, + "eval_runtime": 49.0935, + "eval_samples_per_second": 40.739, + "eval_steps_per_second": 2.546, + "step": 9200 + }, + { + "epoch": 2.18, + "learning_rate": 8.273232724384431e-05, + "loss": 0.6441, + "step": 9220 + }, + { + "epoch": 2.18, + "learning_rate": 8.225575853852263e-05, + "loss": 0.6509, + "step": 9240 + }, + { + "epoch": 2.19, + "learning_rate": 8.177918983320095e-05, + "loss": 0.6409, + "step": 9260 + }, + { + "epoch": 2.19, + "learning_rate": 8.130262112787926e-05, + "loss": 0.6475, + "step": 9280 + }, + { + "epoch": 2.2, + "learning_rate": 8.082605242255759e-05, + "loss": 0.6597, + "step": 9300 + }, + { + "epoch": 2.2, + "learning_rate": 8.03494837172359e-05, + "loss": 0.6544, + "step": 9320 + }, + { + "epoch": 2.21, + "learning_rate": 7.98729150119142e-05, + "loss": 0.6528, + "step": 9340 + }, + { + "epoch": 2.21, + "learning_rate": 7.939634630659253e-05, + "loss": 0.644, + "step": 9360 + }, + { + "epoch": 2.22, + "learning_rate": 7.891977760127084e-05, + "loss": 0.6552, + "step": 9380 + }, + { + "epoch": 2.22, 
+ "learning_rate": 7.844320889594916e-05, + "loss": 0.6548, + "step": 9400 + }, + { + "epoch": 2.22, + "eval_loss": 0.6423606276512146, + "eval_runtime": 49.6466, + "eval_samples_per_second": 40.285, + "eval_steps_per_second": 2.518, + "step": 9400 + }, + { + "epoch": 2.23, + "learning_rate": 7.796664019062748e-05, + "loss": 0.6568, + "step": 9420 + }, + { + "epoch": 2.23, + "learning_rate": 7.74900714853058e-05, + "loss": 0.6539, + "step": 9440 + }, + { + "epoch": 2.24, + "learning_rate": 7.70135027799841e-05, + "loss": 0.6468, + "step": 9460 + }, + { + "epoch": 2.24, + "learning_rate": 7.653693407466243e-05, + "loss": 0.6425, + "step": 9480 + }, + { + "epoch": 2.25, + "learning_rate": 7.606036536934074e-05, + "loss": 0.6523, + "step": 9500 + }, + { + "epoch": 2.25, + "learning_rate": 7.558379666401905e-05, + "loss": 0.6468, + "step": 9520 + }, + { + "epoch": 2.26, + "learning_rate": 7.510722795869738e-05, + "loss": 0.6518, + "step": 9540 + }, + { + "epoch": 2.26, + "learning_rate": 7.46306592533757e-05, + "loss": 0.6534, + "step": 9560 + }, + { + "epoch": 2.26, + "learning_rate": 7.415409054805401e-05, + "loss": 0.6471, + "step": 9580 + }, + { + "epoch": 2.27, + "learning_rate": 7.367752184273232e-05, + "loss": 0.6465, + "step": 9600 + }, + { + "epoch": 2.27, + "eval_loss": 0.6418060064315796, + "eval_runtime": 49.4954, + "eval_samples_per_second": 40.408, + "eval_steps_per_second": 2.525, + "step": 9600 + }, + { + "epoch": 2.27, + "learning_rate": 7.320095313741064e-05, + "loss": 0.6577, + "step": 9620 + }, + { + "epoch": 2.28, + "learning_rate": 7.272438443208895e-05, + "loss": 0.6453, + "step": 9640 + }, + { + "epoch": 2.28, + "learning_rate": 7.224781572676726e-05, + "loss": 0.6489, + "step": 9660 + }, + { + "epoch": 2.29, + "learning_rate": 7.177124702144559e-05, + "loss": 0.6466, + "step": 9680 + }, + { + "epoch": 2.29, + "learning_rate": 7.12946783161239e-05, + "loss": 0.6493, + "step": 9700 + }, + { + "epoch": 2.3, + "learning_rate": 
7.081810961080222e-05, + "loss": 0.6537, + "step": 9720 + }, + { + "epoch": 2.3, + "learning_rate": 7.034154090548053e-05, + "loss": 0.6486, + "step": 9740 + }, + { + "epoch": 2.31, + "learning_rate": 6.986497220015885e-05, + "loss": 0.65, + "step": 9760 + }, + { + "epoch": 2.31, + "learning_rate": 6.938840349483717e-05, + "loss": 0.6387, + "step": 9780 + }, + { + "epoch": 2.32, + "learning_rate": 6.891183478951549e-05, + "loss": 0.6464, + "step": 9800 + }, + { + "epoch": 2.32, + "eval_loss": 0.6412256360054016, + "eval_runtime": 49.3752, + "eval_samples_per_second": 40.506, + "eval_steps_per_second": 2.532, + "step": 9800 + }, + { + "epoch": 2.32, + "learning_rate": 6.84352660841938e-05, + "loss": 0.6475, + "step": 9820 + }, + { + "epoch": 2.33, + "learning_rate": 6.795869737887211e-05, + "loss": 0.6543, + "step": 9840 + }, + { + "epoch": 2.33, + "learning_rate": 6.748212867355043e-05, + "loss": 0.6545, + "step": 9860 + }, + { + "epoch": 2.34, + "learning_rate": 6.700555996822874e-05, + "loss": 0.6468, + "step": 9880 + }, + { + "epoch": 2.34, + "learning_rate": 6.652899126290707e-05, + "loss": 0.651, + "step": 9900 + }, + { + "epoch": 2.35, + "learning_rate": 6.605242255758538e-05, + "loss": 0.641, + "step": 9920 + }, + { + "epoch": 2.35, + "learning_rate": 6.55758538522637e-05, + "loss": 0.657, + "step": 9940 + }, + { + "epoch": 2.35, + "learning_rate": 6.509928514694201e-05, + "loss": 0.6481, + "step": 9960 + }, + { + "epoch": 2.36, + "learning_rate": 6.462271644162034e-05, + "loss": 0.6496, + "step": 9980 + }, + { + "epoch": 2.36, + "learning_rate": 6.414614773629864e-05, + "loss": 0.6451, + "step": 10000 + }, + { + "epoch": 2.36, + "eval_loss": 0.6414454579353333, + "eval_runtime": 49.395, + "eval_samples_per_second": 40.49, + "eval_steps_per_second": 2.531, + "step": 10000 + }, + { + "epoch": 2.37, + "learning_rate": 6.366957903097696e-05, + "loss": 0.6555, + "step": 10020 + }, + { + "epoch": 2.37, + "learning_rate": 6.319301032565528e-05, + "loss": 0.6494, + 
"step": 10040 + }, + { + "epoch": 2.38, + "learning_rate": 6.271644162033359e-05, + "loss": 0.6487, + "step": 10060 + }, + { + "epoch": 2.38, + "learning_rate": 6.22398729150119e-05, + "loss": 0.6544, + "step": 10080 + }, + { + "epoch": 2.39, + "learning_rate": 6.176330420969023e-05, + "loss": 0.6468, + "step": 10100 + }, + { + "epoch": 2.39, + "learning_rate": 6.128673550436853e-05, + "loss": 0.6441, + "step": 10120 + }, + { + "epoch": 2.4, + "learning_rate": 6.081016679904686e-05, + "loss": 0.6478, + "step": 10140 + }, + { + "epoch": 2.4, + "learning_rate": 6.033359809372518e-05, + "loss": 0.6539, + "step": 10160 + }, + { + "epoch": 2.41, + "learning_rate": 5.985702938840349e-05, + "loss": 0.6486, + "step": 10180 + }, + { + "epoch": 2.41, + "learning_rate": 5.938046068308181e-05, + "loss": 0.6467, + "step": 10200 + }, + { + "epoch": 2.41, + "eval_loss": 0.6406835913658142, + "eval_runtime": 49.5084, + "eval_samples_per_second": 40.397, + "eval_steps_per_second": 2.525, + "step": 10200 + }, + { + "epoch": 2.42, + "learning_rate": 5.890389197776013e-05, + "loss": 0.6399, + "step": 10220 + }, + { + "epoch": 2.42, + "learning_rate": 5.8427323272438435e-05, + "loss": 0.6519, + "step": 10240 + }, + { + "epoch": 2.43, + "learning_rate": 5.7950754567116756e-05, + "loss": 0.6465, + "step": 10260 + }, + { + "epoch": 2.43, + "learning_rate": 5.7474185861795076e-05, + "loss": 0.6479, + "step": 10280 + }, + { + "epoch": 2.43, + "learning_rate": 5.6997617156473383e-05, + "loss": 0.6462, + "step": 10300 + }, + { + "epoch": 2.44, + "learning_rate": 5.6521048451151704e-05, + "loss": 0.6451, + "step": 10320 + }, + { + "epoch": 2.44, + "learning_rate": 5.604447974583002e-05, + "loss": 0.6453, + "step": 10340 + }, + { + "epoch": 2.45, + "learning_rate": 5.556791104050833e-05, + "loss": 0.6543, + "step": 10360 + }, + { + "epoch": 2.45, + "learning_rate": 5.509134233518665e-05, + "loss": 0.6428, + "step": 10380 + }, + { + "epoch": 2.46, + "learning_rate": 5.4614773629864966e-05, + 
"loss": 0.6491, + "step": 10400 + }, + { + "epoch": 2.46, + "eval_loss": 0.6400973796844482, + "eval_runtime": 49.3411, + "eval_samples_per_second": 40.534, + "eval_steps_per_second": 2.533, + "step": 10400 + }, + { + "epoch": 2.46, + "learning_rate": 5.413820492454328e-05, + "loss": 0.649, + "step": 10420 + }, + { + "epoch": 2.47, + "learning_rate": 5.36616362192216e-05, + "loss": 0.6494, + "step": 10440 + }, + { + "epoch": 2.47, + "learning_rate": 5.3185067513899913e-05, + "loss": 0.6431, + "step": 10460 + }, + { + "epoch": 2.48, + "learning_rate": 5.2708498808578234e-05, + "loss": 0.6478, + "step": 10480 + }, + { + "epoch": 2.48, + "learning_rate": 5.223193010325655e-05, + "loss": 0.6416, + "step": 10500 + }, + { + "epoch": 2.49, + "learning_rate": 5.175536139793486e-05, + "loss": 0.6507, + "step": 10520 + }, + { + "epoch": 2.49, + "learning_rate": 5.127879269261318e-05, + "loss": 0.6448, + "step": 10540 + }, + { + "epoch": 2.5, + "learning_rate": 5.0802223987291496e-05, + "loss": 0.6455, + "step": 10560 + }, + { + "epoch": 2.5, + "learning_rate": 5.032565528196981e-05, + "loss": 0.6437, + "step": 10580 + }, + { + "epoch": 2.51, + "learning_rate": 4.984908657664813e-05, + "loss": 0.6488, + "step": 10600 + }, + { + "epoch": 2.51, + "eval_loss": 0.6400858163833618, + "eval_runtime": 49.8084, + "eval_samples_per_second": 40.154, + "eval_steps_per_second": 2.51, + "step": 10600 + }, + { + "epoch": 2.51, + "learning_rate": 4.937251787132645e-05, + "loss": 0.6436, + "step": 10620 + }, + { + "epoch": 2.52, + "learning_rate": 4.889594916600476e-05, + "loss": 0.6446, + "step": 10640 + }, + { + "epoch": 2.52, + "learning_rate": 4.841938046068308e-05, + "loss": 0.6488, + "step": 10660 + }, + { + "epoch": 2.52, + "learning_rate": 4.79428117553614e-05, + "loss": 0.6485, + "step": 10680 + }, + { + "epoch": 2.53, + "learning_rate": 4.7466243050039705e-05, + "loss": 0.6524, + "step": 10700 + }, + { + "epoch": 2.53, + "learning_rate": 4.6989674344718026e-05, + "loss": 0.6376, + 
"step": 10720 + }, + { + "epoch": 2.54, + "learning_rate": 4.6513105639396346e-05, + "loss": 0.649, + "step": 10740 + }, + { + "epoch": 2.54, + "learning_rate": 4.603653693407465e-05, + "loss": 0.6444, + "step": 10760 + }, + { + "epoch": 2.55, + "learning_rate": 4.5559968228752974e-05, + "loss": 0.6407, + "step": 10780 + }, + { + "epoch": 2.55, + "learning_rate": 4.5083399523431294e-05, + "loss": 0.6448, + "step": 10800 + }, + { + "epoch": 2.55, + "eval_loss": 0.6392157077789307, + "eval_runtime": 49.7963, + "eval_samples_per_second": 40.164, + "eval_steps_per_second": 2.51, + "step": 10800 + }, + { + "epoch": 2.56, + "learning_rate": 4.46068308181096e-05, + "loss": 0.6454, + "step": 10820 + }, + { + "epoch": 2.56, + "learning_rate": 4.413026211278792e-05, + "loss": 0.6544, + "step": 10840 + }, + { + "epoch": 2.57, + "learning_rate": 4.365369340746624e-05, + "loss": 0.6478, + "step": 10860 + }, + { + "epoch": 2.57, + "learning_rate": 4.3177124702144556e-05, + "loss": 0.6434, + "step": 10880 + }, + { + "epoch": 2.58, + "learning_rate": 4.270055599682287e-05, + "loss": 0.6482, + "step": 10900 + }, + { + "epoch": 2.58, + "learning_rate": 4.222398729150119e-05, + "loss": 0.6403, + "step": 10920 + }, + { + "epoch": 2.59, + "learning_rate": 4.1747418586179504e-05, + "loss": 0.6501, + "step": 10940 + }, + { + "epoch": 2.59, + "learning_rate": 4.127084988085782e-05, + "loss": 0.6507, + "step": 10960 + }, + { + "epoch": 2.6, + "learning_rate": 4.079428117553614e-05, + "loss": 0.6496, + "step": 10980 + }, + { + "epoch": 2.6, + "learning_rate": 4.031771247021445e-05, + "loss": 0.6544, + "step": 11000 + }, + { + "epoch": 2.6, + "eval_loss": 0.6390016078948975, + "eval_runtime": 49.6306, + "eval_samples_per_second": 40.298, + "eval_steps_per_second": 2.519, + "step": 11000 + }, + { + "epoch": 2.61, + "learning_rate": 3.984114376489277e-05, + "loss": 0.6405, + "step": 11020 + }, + { + "epoch": 2.61, + "learning_rate": 3.9364575059571086e-05, + "loss": 0.6429, + "step": 11040 + 
}, + { + "epoch": 2.61, + "learning_rate": 3.88880063542494e-05, + "loss": 0.6403, + "step": 11060 + }, + { + "epoch": 2.62, + "learning_rate": 3.841143764892772e-05, + "loss": 0.6338, + "step": 11080 + }, + { + "epoch": 2.62, + "learning_rate": 3.7934868943606034e-05, + "loss": 0.6417, + "step": 11100 + }, + { + "epoch": 2.63, + "learning_rate": 3.7458300238284354e-05, + "loss": 0.6463, + "step": 11120 + }, + { + "epoch": 2.63, + "learning_rate": 3.698173153296267e-05, + "loss": 0.6498, + "step": 11140 + }, + { + "epoch": 2.64, + "learning_rate": 3.650516282764098e-05, + "loss": 0.6415, + "step": 11160 + }, + { + "epoch": 2.64, + "learning_rate": 3.6028594122319296e-05, + "loss": 0.645, + "step": 11180 + }, + { + "epoch": 2.65, + "learning_rate": 3.5552025416997616e-05, + "loss": 0.6467, + "step": 11200 + }, + { + "epoch": 2.65, + "eval_loss": 0.6387213468551636, + "eval_runtime": 49.1775, + "eval_samples_per_second": 40.669, + "eval_steps_per_second": 2.542, + "step": 11200 + }, + { + "epoch": 2.65, + "learning_rate": 3.507545671167593e-05, + "loss": 0.6515, + "step": 11220 + }, + { + "epoch": 2.66, + "learning_rate": 3.4598888006354244e-05, + "loss": 0.65, + "step": 11240 + }, + { + "epoch": 2.66, + "learning_rate": 3.4122319301032564e-05, + "loss": 0.6512, + "step": 11260 + }, + { + "epoch": 2.67, + "learning_rate": 3.364575059571088e-05, + "loss": 0.6443, + "step": 11280 + }, + { + "epoch": 2.67, + "learning_rate": 3.316918189038919e-05, + "loss": 0.6483, + "step": 11300 + }, + { + "epoch": 2.68, + "learning_rate": 3.269261318506751e-05, + "loss": 0.6455, + "step": 11320 + }, + { + "epoch": 2.68, + "learning_rate": 3.2216044479745826e-05, + "loss": 0.6461, + "step": 11340 + }, + { + "epoch": 2.69, + "learning_rate": 3.173947577442414e-05, + "loss": 0.6505, + "step": 11360 + }, + { + "epoch": 2.69, + "learning_rate": 3.126290706910246e-05, + "loss": 0.6517, + "step": 11380 + }, + { + "epoch": 2.7, + "learning_rate": 3.0786338363780774e-05, + "loss": 0.6406, + 
"step": 11400 + }, + { + "epoch": 2.7, + "eval_loss": 0.6380326151847839, + "eval_runtime": 49.4129, + "eval_samples_per_second": 40.475, + "eval_steps_per_second": 2.53, + "step": 11400 + }, + { + "epoch": 2.7, + "learning_rate": 3.030976965845909e-05, + "loss": 0.647, + "step": 11420 + }, + { + "epoch": 2.7, + "learning_rate": 2.9833200953137408e-05, + "loss": 0.6495, + "step": 11440 + }, + { + "epoch": 2.71, + "learning_rate": 2.9356632247815725e-05, + "loss": 0.6448, + "step": 11460 + }, + { + "epoch": 2.71, + "learning_rate": 2.888006354249404e-05, + "loss": 0.6447, + "step": 11480 + }, + { + "epoch": 2.72, + "learning_rate": 2.840349483717236e-05, + "loss": 0.6527, + "step": 11500 + }, + { + "epoch": 2.72, + "learning_rate": 2.7926926131850673e-05, + "loss": 0.6406, + "step": 11520 + }, + { + "epoch": 2.73, + "learning_rate": 2.7450357426528987e-05, + "loss": 0.6443, + "step": 11540 + }, + { + "epoch": 2.73, + "learning_rate": 2.6973788721207307e-05, + "loss": 0.6351, + "step": 11560 + }, + { + "epoch": 2.74, + "learning_rate": 2.649722001588562e-05, + "loss": 0.6417, + "step": 11580 + }, + { + "epoch": 2.74, + "learning_rate": 2.6020651310563938e-05, + "loss": 0.6356, + "step": 11600 + }, + { + "epoch": 2.74, + "eval_loss": 0.6381237506866455, + "eval_runtime": 49.5534, + "eval_samples_per_second": 40.36, + "eval_steps_per_second": 2.523, + "step": 11600 + }, + { + "epoch": 2.75, + "learning_rate": 2.5544082605242255e-05, + "loss": 0.6412, + "step": 11620 + }, + { + "epoch": 2.75, + "learning_rate": 2.506751389992057e-05, + "loss": 0.6418, + "step": 11640 + }, + { + "epoch": 2.76, + "learning_rate": 2.4590945194598886e-05, + "loss": 0.6426, + "step": 11660 + }, + { + "epoch": 2.76, + "learning_rate": 2.4114376489277203e-05, + "loss": 0.6461, + "step": 11680 + }, + { + "epoch": 2.77, + "learning_rate": 2.363780778395552e-05, + "loss": 0.6475, + "step": 11700 + }, + { + "epoch": 2.77, + "learning_rate": 2.3161239078633834e-05, + "loss": 0.6431, + "step": 11720 
+ }, + { + "epoch": 2.78, + "learning_rate": 2.2684670373312148e-05, + "loss": 0.6416, + "step": 11740 + }, + { + "epoch": 2.78, + "learning_rate": 2.2208101667990468e-05, + "loss": 0.6495, + "step": 11760 + }, + { + "epoch": 2.78, + "learning_rate": 2.1731532962668782e-05, + "loss": 0.6404, + "step": 11780 + }, + { + "epoch": 2.79, + "learning_rate": 2.1254964257347096e-05, + "loss": 0.6434, + "step": 11800 + }, + { + "epoch": 2.79, + "eval_loss": 0.6377163529396057, + "eval_runtime": 49.328, + "eval_samples_per_second": 40.545, + "eval_steps_per_second": 2.534, + "step": 11800 + }, + { + "epoch": 2.79, + "learning_rate": 2.0778395552025416e-05, + "loss": 0.6437, + "step": 11820 + }, + { + "epoch": 2.8, + "learning_rate": 2.030182684670373e-05, + "loss": 0.6393, + "step": 11840 + }, + { + "epoch": 2.8, + "learning_rate": 1.9825258141382047e-05, + "loss": 0.6412, + "step": 11860 + }, + { + "epoch": 2.81, + "learning_rate": 1.9348689436060364e-05, + "loss": 0.6494, + "step": 11880 + }, + { + "epoch": 2.81, + "learning_rate": 1.887212073073868e-05, + "loss": 0.6481, + "step": 11900 + }, + { + "epoch": 2.82, + "learning_rate": 1.8395552025416998e-05, + "loss": 0.6407, + "step": 11920 + }, + { + "epoch": 2.82, + "learning_rate": 1.7918983320095312e-05, + "loss": 0.6422, + "step": 11940 + }, + { + "epoch": 2.83, + "learning_rate": 1.744241461477363e-05, + "loss": 0.6487, + "step": 11960 + }, + { + "epoch": 2.83, + "learning_rate": 1.6965845909451946e-05, + "loss": 0.6478, + "step": 11980 + }, + { + "epoch": 2.84, + "learning_rate": 1.648927720413026e-05, + "loss": 0.6451, + "step": 12000 + }, + { + "epoch": 2.84, + "eval_loss": 0.6374698281288147, + "eval_runtime": 49.9107, + "eval_samples_per_second": 40.072, + "eval_steps_per_second": 2.504, + "step": 12000 + }, + { + "epoch": 2.84, + "learning_rate": 1.6012708498808577e-05, + "loss": 0.6454, + "step": 12020 + }, + { + "epoch": 2.85, + "learning_rate": 1.5536139793486894e-05, + "loss": 0.6399, + "step": 12040 + }, + { 
+ "epoch": 2.85, + "learning_rate": 1.5059571088165208e-05, + "loss": 0.6479, + "step": 12060 + }, + { + "epoch": 2.86, + "learning_rate": 1.4583002382843525e-05, + "loss": 0.6412, + "step": 12080 + }, + { + "epoch": 2.86, + "learning_rate": 1.4106433677521842e-05, + "loss": 0.65, + "step": 12100 + }, + { + "epoch": 2.87, + "learning_rate": 1.3629864972200157e-05, + "loss": 0.6461, + "step": 12120 + }, + { + "epoch": 2.87, + "learning_rate": 1.3153296266878475e-05, + "loss": 0.6434, + "step": 12140 + }, + { + "epoch": 2.87, + "learning_rate": 1.2676727561556788e-05, + "loss": 0.6463, + "step": 12160 + }, + { + "epoch": 2.88, + "learning_rate": 1.2200158856235105e-05, + "loss": 0.6399, + "step": 12180 + }, + { + "epoch": 2.88, + "learning_rate": 1.1723590150913422e-05, + "loss": 0.6446, + "step": 12200 + }, + { + "epoch": 2.88, + "eval_loss": 0.6372544765472412, + "eval_runtime": 49.6265, + "eval_samples_per_second": 40.301, + "eval_steps_per_second": 2.519, + "step": 12200 + } + ], + "max_steps": 12690, + "num_train_epochs": 3, + "total_flos": 1.5855992729583485e+19, + "trial_name": null, + "trial_params": null +} diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/training_args.bin b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cfd5ca55a1cd7f462c1d326faacf15d022e29425 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a070370e87c048b60fc888b8736a0166eb94eeb3a75f5f78918edab715d0fb1c +size 3579 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/optimizer.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/optimizer.pt new file mode 100644 index 
0000000000000000000000000000000000000000..22f99b778ee1efa6b0e253b22f5d74b01361f5f2 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a44d25fe2c7be3065324779281d2f127b5a642d67f1e6454f19700b42493bdd7 +size 33629893 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/pytorch_model.bin b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..1e8786c2012b035b51275f68c9c5d212a4f89f77 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f507c244cdac0a2a8d4da2999644d1b91a384aa62f543cdb6df0f2076c7edb1f +size 16822989 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/rng_state_0.pth b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..680525e83dec9c5f3669b1d2808220ae27dbb7a6 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ab02d3b0b43bf78678642d9cd5f53765f64014c5978ba11838067362d9bc2ad +size 14583 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/rng_state_1.pth b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..b8febb61159a9a774a73937b563fbefa0d2fc920 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:64dd573126e6ce344168cb0149b372652148f70e76d3a1dc6ae6ddf416540e48 +size 14583 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/scaler.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..05fcdc0e071bea529703b44ef99bd30d54ce9866 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ebe8baa4484dffe2c79d27911cedebee271659822ff874c04a10526e4d03d6f +size 557 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/scheduler.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d40401c787464e89ec31f448fd400427293dfaf --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54bc440119a51d3d80c80b0036e812daff5b98ed0f83403347a0707e66b60f65 +size 627 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/trainer_state.json b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2df335ab61e023311d53d99a9f69e05581d875d3 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/trainer_state.json @@ -0,0 +1,4232 @@ +{ + "best_metric": 0.6368712186813354, + "best_model_checkpoint": "lora-alpaca-cn/checkpoint-12400", + "epoch": 2.9314420803782504, + "global_step": 12400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 
1.7735, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.1358, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 0.9749, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00023999999999999998, + "loss": 0.9316, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 0.9072, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002995234312946783, + "loss": 0.8963, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002990468625893566, + "loss": 0.8853, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002985702938840349, + "loss": 0.8709, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029809372517871323, + "loss": 0.8555, + "step": 180 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029761715647339156, + "loss": 0.8584, + "step": 200 + }, + { + "epoch": 0.05, + "eval_loss": 0.8360834717750549, + "eval_runtime": 49.29, + "eval_samples_per_second": 40.576, + "eval_steps_per_second": 2.536, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002971405877680699, + "loss": 0.859, + "step": 220 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029666401906274816, + "loss": 0.8511, + "step": 240 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002961874503574265, + "loss": 0.8401, + "step": 260 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002957108816521048, + "loss": 0.8357, + "step": 280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029523431294678314, + "loss": 0.8413, + "step": 300 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029475774424146147, + "loss": 0.8283, + "step": 320 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002942811755361398, + "loss": 0.8202, + "step": 340 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029380460683081807, + "loss": 0.8222, + "step": 360 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002933280381254964, + "loss": 0.8178, + "step": 380 + }, + { + 
"epoch": 0.09, + "learning_rate": 0.0002928514694201747, + "loss": 0.8177, + "step": 400 + }, + { + "epoch": 0.09, + "eval_loss": 0.7966175079345703, + "eval_runtime": 49.1752, + "eval_samples_per_second": 40.671, + "eval_steps_per_second": 2.542, + "step": 400 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029237490071485305, + "loss": 0.8057, + "step": 420 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002918983320095314, + "loss": 0.811, + "step": 440 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029142176330420965, + "loss": 0.8056, + "step": 460 + }, + { + "epoch": 0.11, + "learning_rate": 0.000290945194598888, + "loss": 0.7993, + "step": 480 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002904686258935663, + "loss": 0.7982, + "step": 500 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002899920571882446, + "loss": 0.8023, + "step": 520 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028951548848292296, + "loss": 0.7968, + "step": 540 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028903891977760123, + "loss": 0.8029, + "step": 560 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028856235107227956, + "loss": 0.7892, + "step": 580 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002880857823669579, + "loss": 0.7946, + "step": 600 + }, + { + "epoch": 0.14, + "eval_loss": 0.7735009789466858, + "eval_runtime": 49.3305, + "eval_samples_per_second": 40.543, + "eval_steps_per_second": 2.534, + "step": 600 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028760921366163616, + "loss": 0.782, + "step": 620 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002871326449563145, + "loss": 0.7799, + "step": 640 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002866560762509928, + "loss": 0.7782, + "step": 660 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028617950754567114, + "loss": 0.7785, + "step": 680 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028570293884034947, + "loss": 0.785, + "step": 700 + }, + { + "epoch": 0.17, + "learning_rate": 
0.0002852263701350278, + "loss": 0.7754, + "step": 720 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028474980142970607, + "loss": 0.7804, + "step": 740 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002842732327243844, + "loss": 0.7696, + "step": 760 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002837966640190627, + "loss": 0.7692, + "step": 780 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028332009531374105, + "loss": 0.7752, + "step": 800 + }, + { + "epoch": 0.19, + "eval_loss": 0.7564254403114319, + "eval_runtime": 49.106, + "eval_samples_per_second": 40.728, + "eval_steps_per_second": 2.546, + "step": 800 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002828435266084194, + "loss": 0.7698, + "step": 820 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028236695790309765, + "loss": 0.7699, + "step": 840 + }, + { + "epoch": 0.2, + "learning_rate": 0.000281890389197776, + "loss": 0.7718, + "step": 860 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002814138204924543, + "loss": 0.7644, + "step": 880 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028093725178713263, + "loss": 0.7659, + "step": 900 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028046068308181096, + "loss": 0.7641, + "step": 920 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027998411437648923, + "loss": 0.7535, + "step": 940 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027950754567116756, + "loss": 0.7672, + "step": 960 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002790309769658459, + "loss": 0.7563, + "step": 980 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002785544082605242, + "loss": 0.752, + "step": 1000 + }, + { + "epoch": 0.24, + "eval_loss": 0.7433652281761169, + "eval_runtime": 48.9945, + "eval_samples_per_second": 40.821, + "eval_steps_per_second": 2.551, + "step": 1000 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027807783955520254, + "loss": 0.755, + "step": 1020 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027760127084988087, + "loss": 0.7563, + 
"step": 1040 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027712470214455914, + "loss": 0.7475, + "step": 1060 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027664813343923747, + "loss": 0.7599, + "step": 1080 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002761715647339158, + "loss": 0.7533, + "step": 1100 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027569499602859407, + "loss": 0.7488, + "step": 1120 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027521842732327245, + "loss": 0.753, + "step": 1140 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002747418586179507, + "loss": 0.7435, + "step": 1160 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027426528991262905, + "loss": 0.7457, + "step": 1180 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002737887212073074, + "loss": 0.742, + "step": 1200 + }, + { + "epoch": 0.28, + "eval_loss": 0.7321739792823792, + "eval_runtime": 48.8876, + "eval_samples_per_second": 40.91, + "eval_steps_per_second": 2.557, + "step": 1200 + }, + { + "epoch": 0.29, + "learning_rate": 0.00027331215250198565, + "loss": 0.7474, + "step": 1220 + }, + { + "epoch": 0.29, + "learning_rate": 0.000272835583796664, + "loss": 0.7456, + "step": 1240 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002723590150913423, + "loss": 0.7406, + "step": 1260 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027188244638602063, + "loss": 0.7448, + "step": 1280 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027140587768069896, + "loss": 0.7445, + "step": 1300 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027092930897537723, + "loss": 0.7349, + "step": 1320 + }, + { + "epoch": 0.32, + "learning_rate": 0.00027045274027005556, + "loss": 0.7395, + "step": 1340 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002699761715647339, + "loss": 0.7382, + "step": 1360 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002694996028594122, + "loss": 0.7357, + "step": 1380 + }, + { + "epoch": 0.33, + "learning_rate": 0.00026902303415409054, + "loss": 0.7409, + 
"step": 1400 + }, + { + "epoch": 0.33, + "eval_loss": 0.7235888242721558, + "eval_runtime": 49.2145, + "eval_samples_per_second": 40.638, + "eval_steps_per_second": 2.54, + "step": 1400 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026854646544876887, + "loss": 0.7376, + "step": 1420 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026806989674344714, + "loss": 0.7298, + "step": 1440 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026759332803812547, + "loss": 0.7379, + "step": 1460 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002671167593328038, + "loss": 0.7354, + "step": 1480 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002666401906274821, + "loss": 0.7341, + "step": 1500 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026616362192216045, + "loss": 0.7352, + "step": 1520 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002656870532168387, + "loss": 0.7321, + "step": 1540 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026521048451151705, + "loss": 0.7285, + "step": 1560 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002647339158061954, + "loss": 0.73, + "step": 1580 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026425734710087365, + "loss": 0.7304, + "step": 1600 + }, + { + "epoch": 0.38, + "eval_loss": 0.716058611869812, + "eval_runtime": 48.9201, + "eval_samples_per_second": 40.883, + "eval_steps_per_second": 2.555, + "step": 1600 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026378077839555203, + "loss": 0.7314, + "step": 1620 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002633042096902303, + "loss": 0.7315, + "step": 1640 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026282764098490863, + "loss": 0.7239, + "step": 1660 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026235107227958696, + "loss": 0.73, + "step": 1680 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026187450357426523, + "loss": 0.7243, + "step": 1700 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026139793486894356, + "loss": 0.7199, + "step": 1720 + }, + { + "epoch": 
0.41, + "learning_rate": 0.0002609213661636219, + "loss": 0.7216, + "step": 1740 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002604447974583002, + "loss": 0.7358, + "step": 1760 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025996822875297854, + "loss": 0.7313, + "step": 1780 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025949166004765687, + "loss": 0.7236, + "step": 1800 + }, + { + "epoch": 0.43, + "eval_loss": 0.7097632884979248, + "eval_runtime": 49.4908, + "eval_samples_per_second": 40.412, + "eval_steps_per_second": 2.526, + "step": 1800 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025901509134233514, + "loss": 0.7282, + "step": 1820 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025853852263701347, + "loss": 0.7187, + "step": 1840 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002580619539316918, + "loss": 0.7303, + "step": 1860 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002575853852263701, + "loss": 0.724, + "step": 1880 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025710881652104845, + "loss": 0.7248, + "step": 1900 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002566322478157267, + "loss": 0.7195, + "step": 1920 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025615567911040505, + "loss": 0.7269, + "step": 1940 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002556791104050834, + "loss": 0.7209, + "step": 1960 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002552025416997617, + "loss": 0.7282, + "step": 1980 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025472597299444003, + "loss": 0.7195, + "step": 2000 + }, + { + "epoch": 0.47, + "eval_loss": 0.7037709355354309, + "eval_runtime": 49.7167, + "eval_samples_per_second": 40.228, + "eval_steps_per_second": 2.514, + "step": 2000 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002542494042891183, + "loss": 0.7229, + "step": 2020 + }, + { + "epoch": 0.48, + "learning_rate": 0.00025377283558379664, + "loss": 0.718, + "step": 2040 + }, + { + "epoch": 0.49, + "learning_rate": 
0.00025329626687847496, + "loss": 0.7223, + "step": 2060 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025281969817315324, + "loss": 0.7209, + "step": 2080 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002523431294678316, + "loss": 0.7151, + "step": 2100 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002518665607625099, + "loss": 0.7141, + "step": 2120 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002513899920571882, + "loss": 0.7084, + "step": 2140 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025091342335186654, + "loss": 0.7075, + "step": 2160 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025043685464654487, + "loss": 0.7133, + "step": 2180 + }, + { + "epoch": 0.52, + "learning_rate": 0.00024996028594122314, + "loss": 0.7092, + "step": 2200 + }, + { + "epoch": 0.52, + "eval_loss": 0.6989386677742004, + "eval_runtime": 49.2344, + "eval_samples_per_second": 40.622, + "eval_steps_per_second": 2.539, + "step": 2200 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002494837172359015, + "loss": 0.7178, + "step": 2220 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002490071485305798, + "loss": 0.7188, + "step": 2240 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002485305798252581, + "loss": 0.7161, + "step": 2260 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024805401111993645, + "loss": 0.7078, + "step": 2280 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002475774424146147, + "loss": 0.7, + "step": 2300 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024710087370929305, + "loss": 0.718, + "step": 2320 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002466243050039714, + "loss": 0.7059, + "step": 2340 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002461477362986497, + "loss": 0.712, + "step": 2360 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024567116759332804, + "loss": 0.7116, + "step": 2380 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002451945988880063, + "loss": 0.6986, + "step": 2400 + }, + { + "epoch": 0.57, + "eval_loss": 
0.6939737796783447, + "eval_runtime": 49.459, + "eval_samples_per_second": 40.438, + "eval_steps_per_second": 2.527, + "step": 2400 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024471803018268464, + "loss": 0.7168, + "step": 2420 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024424146147736296, + "loss": 0.7141, + "step": 2440 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024376489277204126, + "loss": 0.7095, + "step": 2460 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024328832406671962, + "loss": 0.7091, + "step": 2480 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024281175536139792, + "loss": 0.7015, + "step": 2500 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024233518665607622, + "loss": 0.7109, + "step": 2520 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024185861795075455, + "loss": 0.7086, + "step": 2540 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024138204924543285, + "loss": 0.7118, + "step": 2560 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024090548054011117, + "loss": 0.7033, + "step": 2580 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002404289118347895, + "loss": 0.7128, + "step": 2600 + }, + { + "epoch": 0.61, + "eval_loss": 0.6901652812957764, + "eval_runtime": 49.5038, + "eval_samples_per_second": 40.401, + "eval_steps_per_second": 2.525, + "step": 2600 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023995234312946783, + "loss": 0.6968, + "step": 2620 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023947577442414613, + "loss": 0.7109, + "step": 2640 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023899920571882443, + "loss": 0.7048, + "step": 2660 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023852263701350276, + "loss": 0.7012, + "step": 2680 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023804606830818106, + "loss": 0.7065, + "step": 2700 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002375694996028594, + "loss": 0.7009, + "step": 2720 + }, + { + "epoch": 0.65, + "learning_rate": 
0.0002370929308975377, + "loss": 0.7035, + "step": 2740 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023661636219221604, + "loss": 0.6973, + "step": 2760 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023613979348689434, + "loss": 0.7075, + "step": 2780 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023566322478157264, + "loss": 0.6952, + "step": 2800 + }, + { + "epoch": 0.66, + "eval_loss": 0.6865400671958923, + "eval_runtime": 49.2814, + "eval_samples_per_second": 40.583, + "eval_steps_per_second": 2.536, + "step": 2800 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023518665607625097, + "loss": 0.6979, + "step": 2820 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002347100873709293, + "loss": 0.6973, + "step": 2840 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023423351866560762, + "loss": 0.7033, + "step": 2860 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023375694996028592, + "loss": 0.6964, + "step": 2880 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023328038125496422, + "loss": 0.7052, + "step": 2900 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023280381254964255, + "loss": 0.6999, + "step": 2920 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023232724384432085, + "loss": 0.6963, + "step": 2940 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002318506751389992, + "loss": 0.7025, + "step": 2960 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002313741064336775, + "loss": 0.704, + "step": 2980 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023089753772835583, + "loss": 0.6926, + "step": 3000 + }, + { + "epoch": 0.71, + "eval_loss": 0.6828380227088928, + "eval_runtime": 49.5667, + "eval_samples_per_second": 40.35, + "eval_steps_per_second": 2.522, + "step": 3000 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023042096902303413, + "loss": 0.698, + "step": 3020 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022994440031771243, + "loss": 0.6893, + "step": 3040 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022946783161239076, + 
"loss": 0.6938, + "step": 3060 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022899126290706908, + "loss": 0.6974, + "step": 3080 + }, + { + "epoch": 0.73, + "learning_rate": 0.0002285146942017474, + "loss": 0.6922, + "step": 3100 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002280381254964257, + "loss": 0.7073, + "step": 3120 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022756155679110404, + "loss": 0.6895, + "step": 3140 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022708498808578234, + "loss": 0.7012, + "step": 3160 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022660841938046064, + "loss": 0.6985, + "step": 3180 + }, + { + "epoch": 0.76, + "learning_rate": 0.000226131850675139, + "loss": 0.6901, + "step": 3200 + }, + { + "epoch": 0.76, + "eval_loss": 0.6807068586349487, + "eval_runtime": 49.2421, + "eval_samples_per_second": 40.616, + "eval_steps_per_second": 2.538, + "step": 3200 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002256552819698173, + "loss": 0.697, + "step": 3220 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022517871326449562, + "loss": 0.7002, + "step": 3240 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022470214455917392, + "loss": 0.6918, + "step": 3260 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022422557585385225, + "loss": 0.6999, + "step": 3280 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022374900714853055, + "loss": 0.6961, + "step": 3300 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002232724384432089, + "loss": 0.6888, + "step": 3320 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002227958697378872, + "loss": 0.695, + "step": 3340 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002223193010325655, + "loss": 0.6861, + "step": 3360 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022184273232724383, + "loss": 0.6864, + "step": 3380 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022136616362192213, + "loss": 0.6917, + "step": 3400 + }, + { + "epoch": 0.8, + "eval_loss": 0.6773961782455444, + 
"eval_runtime": 49.3961, + "eval_samples_per_second": 40.489, + "eval_steps_per_second": 2.531, + "step": 3400 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022088959491660043, + "loss": 0.679, + "step": 3420 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022041302621127879, + "loss": 0.6915, + "step": 3440 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021993645750595709, + "loss": 0.6937, + "step": 3460 + }, + { + "epoch": 0.82, + "learning_rate": 0.0002194598888006354, + "loss": 0.6831, + "step": 3480 + }, + { + "epoch": 0.83, + "learning_rate": 0.0002189833200953137, + "loss": 0.6875, + "step": 3500 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021850675138999204, + "loss": 0.6916, + "step": 3520 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021803018268467034, + "loss": 0.6896, + "step": 3540 + }, + { + "epoch": 0.84, + "learning_rate": 0.0002175536139793487, + "loss": 0.6986, + "step": 3560 + }, + { + "epoch": 0.85, + "learning_rate": 0.000217077045274027, + "loss": 0.693, + "step": 3580 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002166004765687053, + "loss": 0.6893, + "step": 3600 + }, + { + "epoch": 0.85, + "eval_loss": 0.6753410696983337, + "eval_runtime": 49.3307, + "eval_samples_per_second": 40.543, + "eval_steps_per_second": 2.534, + "step": 3600 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021612390786338362, + "loss": 0.6872, + "step": 3620 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021564733915806192, + "loss": 0.6862, + "step": 3640 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021517077045274025, + "loss": 0.6943, + "step": 3660 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021469420174741858, + "loss": 0.6896, + "step": 3680 + }, + { + "epoch": 0.87, + "learning_rate": 0.0002142176330420969, + "loss": 0.6912, + "step": 3700 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002137410643367752, + "loss": 0.6859, + "step": 3720 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002132644956314535, + "loss": 0.6791, + 
"step": 3740 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021278792692613183, + "loss": 0.6882, + "step": 3760 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021231135822081013, + "loss": 0.6823, + "step": 3780 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002118347895154885, + "loss": 0.6831, + "step": 3800 + }, + { + "epoch": 0.9, + "eval_loss": 0.6738302707672119, + "eval_runtime": 49.4648, + "eval_samples_per_second": 40.433, + "eval_steps_per_second": 2.527, + "step": 3800 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002113582208101668, + "loss": 0.6818, + "step": 3820 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002108816521048451, + "loss": 0.6912, + "step": 3840 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021040508339952341, + "loss": 0.6884, + "step": 3860 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020992851469420171, + "loss": 0.6888, + "step": 3880 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020945194598888004, + "loss": 0.6822, + "step": 3900 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020897537728355837, + "loss": 0.6879, + "step": 3920 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002084988085782367, + "loss": 0.6771, + "step": 3940 + }, + { + "epoch": 0.94, + "learning_rate": 0.000208022239872915, + "loss": 0.684, + "step": 3960 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002075456711675933, + "loss": 0.6878, + "step": 3980 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020706910246227162, + "loss": 0.6913, + "step": 4000 + }, + { + "epoch": 0.95, + "eval_loss": 0.6711302995681763, + "eval_runtime": 49.117, + "eval_samples_per_second": 40.719, + "eval_steps_per_second": 2.545, + "step": 4000 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020659253375694992, + "loss": 0.683, + "step": 4020 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020611596505162828, + "loss": 0.6833, + "step": 4040 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020563939634630658, + "loss": 0.6793, + "step": 4060 + }, + { + "epoch": 
0.96, + "learning_rate": 0.0002051628276409849, + "loss": 0.6843, + "step": 4080 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002046862589356632, + "loss": 0.6822, + "step": 4100 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002042096902303415, + "loss": 0.6856, + "step": 4120 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020373312152501983, + "loss": 0.6809, + "step": 4140 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020325655281969816, + "loss": 0.6843, + "step": 4160 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002027799841143765, + "loss": 0.6754, + "step": 4180 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002023034154090548, + "loss": 0.6823, + "step": 4200 + }, + { + "epoch": 0.99, + "eval_loss": 0.6697036027908325, + "eval_runtime": 49.3237, + "eval_samples_per_second": 40.548, + "eval_steps_per_second": 2.534, + "step": 4200 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020182684670373312, + "loss": 0.6861, + "step": 4220 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020135027799841142, + "loss": 0.6806, + "step": 4240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020087370929308972, + "loss": 0.6823, + "step": 4260 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020039714058776807, + "loss": 0.6805, + "step": 4280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019992057188244637, + "loss": 0.6813, + "step": 4300 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001994440031771247, + "loss": 0.675, + "step": 4320 + }, + { + "epoch": 1.03, + "learning_rate": 0.000198967434471803, + "loss": 0.6728, + "step": 4340 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001984908657664813, + "loss": 0.6676, + "step": 4360 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019801429706115963, + "loss": 0.6729, + "step": 4380 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019753772835583795, + "loss": 0.685, + "step": 4400 + }, + { + "epoch": 1.04, + "eval_loss": 0.6667952537536621, + "eval_runtime": 49.3899, + "eval_samples_per_second": 40.494, 
+ "eval_steps_per_second": 2.531, + "step": 4400 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019706115965051628, + "loss": 0.6786, + "step": 4420 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019658459094519458, + "loss": 0.6738, + "step": 4440 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001961080222398729, + "loss": 0.686, + "step": 4460 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001956314535345512, + "loss": 0.6818, + "step": 4480 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001951548848292295, + "loss": 0.6741, + "step": 4500 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019467831612390786, + "loss": 0.6756, + "step": 4520 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019420174741858616, + "loss": 0.6851, + "step": 4540 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001937251787132645, + "loss": 0.6784, + "step": 4560 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001932486100079428, + "loss": 0.6785, + "step": 4580 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019277204130262112, + "loss": 0.678, + "step": 4600 + }, + { + "epoch": 1.09, + "eval_loss": 0.6655837297439575, + "eval_runtime": 49.5019, + "eval_samples_per_second": 40.403, + "eval_steps_per_second": 2.525, + "step": 4600 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019229547259729942, + "loss": 0.6782, + "step": 4620 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019181890389197777, + "loss": 0.6683, + "step": 4640 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019134233518665607, + "loss": 0.6783, + "step": 4660 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019086576648133437, + "loss": 0.675, + "step": 4680 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001903891977760127, + "loss": 0.6691, + "step": 4700 + }, + { + "epoch": 1.12, + "learning_rate": 0.000189912629070691, + "loss": 0.6726, + "step": 4720 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018943606036536933, + "loss": 0.68, + "step": 4740 + }, + { + "epoch": 1.13, + "learning_rate": 
0.00018895949166004763, + "loss": 0.6694, + "step": 4760 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018848292295472598, + "loss": 0.6686, + "step": 4780 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018800635424940428, + "loss": 0.6766, + "step": 4800 + }, + { + "epoch": 1.13, + "eval_loss": 0.6646501421928406, + "eval_runtime": 49.3188, + "eval_samples_per_second": 40.552, + "eval_steps_per_second": 2.535, + "step": 4800 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018752978554408258, + "loss": 0.6724, + "step": 4820 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001870532168387609, + "loss": 0.6801, + "step": 4840 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001865766481334392, + "loss": 0.6698, + "step": 4860 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001861000794281175, + "loss": 0.6723, + "step": 4880 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018562351072279586, + "loss": 0.6693, + "step": 4900 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018514694201747416, + "loss": 0.6716, + "step": 4920 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001846703733121525, + "loss": 0.674, + "step": 4940 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001841938046068308, + "loss": 0.6702, + "step": 4960 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018371723590150912, + "loss": 0.6716, + "step": 4980 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018324066719618742, + "loss": 0.672, + "step": 5000 + }, + { + "epoch": 1.18, + "eval_loss": 0.662735104560852, + "eval_runtime": 49.275, + "eval_samples_per_second": 40.589, + "eval_steps_per_second": 2.537, + "step": 5000 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018276409849086577, + "loss": 0.6701, + "step": 5020 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018228752978554407, + "loss": 0.6663, + "step": 5040 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018181096108022237, + "loss": 0.6651, + "step": 5060 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001813343923749007, + 
"loss": 0.6708, + "step": 5080 + }, + { + "epoch": 1.21, + "learning_rate": 0.000180857823669579, + "loss": 0.6697, + "step": 5100 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018038125496425733, + "loss": 0.662, + "step": 5120 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017990468625893566, + "loss": 0.669, + "step": 5140 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017942811755361398, + "loss": 0.6649, + "step": 5160 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017895154884829228, + "loss": 0.668, + "step": 5180 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017847498014297058, + "loss": 0.6796, + "step": 5200 + }, + { + "epoch": 1.23, + "eval_loss": 0.6609957218170166, + "eval_runtime": 49.2394, + "eval_samples_per_second": 40.618, + "eval_steps_per_second": 2.539, + "step": 5200 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001779984114376489, + "loss": 0.6745, + "step": 5220 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001775218427323272, + "loss": 0.6646, + "step": 5240 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017704527402700556, + "loss": 0.6682, + "step": 5260 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017656870532168386, + "loss": 0.6713, + "step": 5280 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017609213661636216, + "loss": 0.6618, + "step": 5300 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001756155679110405, + "loss": 0.6703, + "step": 5320 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001751389992057188, + "loss": 0.6652, + "step": 5340 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017466243050039712, + "loss": 0.6698, + "step": 5360 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017418586179507545, + "loss": 0.6728, + "step": 5380 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017370929308975377, + "loss": 0.6752, + "step": 5400 + }, + { + "epoch": 1.28, + "eval_loss": 0.6592395901679993, + "eval_runtime": 49.3426, + "eval_samples_per_second": 40.533, + "eval_steps_per_second": 2.533, + "step": 
5400 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017323272438443207, + "loss": 0.6653, + "step": 5420 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017275615567911037, + "loss": 0.669, + "step": 5440 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001722795869737887, + "loss": 0.6698, + "step": 5460 + }, + { + "epoch": 1.3, + "learning_rate": 0.000171803018268467, + "loss": 0.6742, + "step": 5480 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017132644956314536, + "loss": 0.6596, + "step": 5500 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017084988085782366, + "loss": 0.6699, + "step": 5520 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017037331215250198, + "loss": 0.664, + "step": 5540 + }, + { + "epoch": 1.31, + "learning_rate": 0.00016989674344718028, + "loss": 0.6673, + "step": 5560 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016942017474185858, + "loss": 0.6684, + "step": 5580 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001689436060365369, + "loss": 0.6769, + "step": 5600 + }, + { + "epoch": 1.32, + "eval_loss": 0.6582754850387573, + "eval_runtime": 49.469, + "eval_samples_per_second": 40.429, + "eval_steps_per_second": 2.527, + "step": 5600 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016846703733121524, + "loss": 0.6633, + "step": 5620 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016799046862589357, + "loss": 0.6679, + "step": 5640 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016751389992057187, + "loss": 0.6601, + "step": 5660 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001670373312152502, + "loss": 0.6731, + "step": 5680 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001665607625099285, + "loss": 0.6638, + "step": 5700 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001660841938046068, + "loss": 0.6693, + "step": 5720 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016560762509928515, + "loss": 0.6642, + "step": 5740 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016513105639396345, + "loss": 0.6649, + "step": 
5760 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016465448768864178, + "loss": 0.663, + "step": 5780 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016417791898332008, + "loss": 0.6629, + "step": 5800 + }, + { + "epoch": 1.37, + "eval_loss": 0.6574136018753052, + "eval_runtime": 49.3019, + "eval_samples_per_second": 40.566, + "eval_steps_per_second": 2.535, + "step": 5800 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016370135027799838, + "loss": 0.6605, + "step": 5820 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001632247815726767, + "loss": 0.6707, + "step": 5840 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016274821286735503, + "loss": 0.6695, + "step": 5860 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016227164416203336, + "loss": 0.6647, + "step": 5880 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016179507545671166, + "loss": 0.6657, + "step": 5900 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016131850675138999, + "loss": 0.6656, + "step": 5920 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016084193804606829, + "loss": 0.6676, + "step": 5940 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016036536934074659, + "loss": 0.6678, + "step": 5960 + }, + { + "epoch": 1.41, + "learning_rate": 0.00015988880063542494, + "loss": 0.6639, + "step": 5980 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015941223193010324, + "loss": 0.6645, + "step": 6000 + }, + { + "epoch": 1.42, + "eval_loss": 0.656126081943512, + "eval_runtime": 49.5095, + "eval_samples_per_second": 40.396, + "eval_steps_per_second": 2.525, + "step": 6000 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015893566322478157, + "loss": 0.6672, + "step": 6020 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015845909451945987, + "loss": 0.6678, + "step": 6040 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001579825258141382, + "loss": 0.6676, + "step": 6060 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001575059571088165, + "loss": 0.6717, + "step": 6080 + }, + { + "epoch": 
1.44, + "learning_rate": 0.00015702938840349485, + "loss": 0.671, + "step": 6100 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015655281969817315, + "loss": 0.6611, + "step": 6120 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015607625099285145, + "loss": 0.6606, + "step": 6140 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015559968228752978, + "loss": 0.6647, + "step": 6160 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015512311358220808, + "loss": 0.6652, + "step": 6180 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001546465448768864, + "loss": 0.6629, + "step": 6200 + }, + { + "epoch": 1.47, + "eval_loss": 0.6549723148345947, + "eval_runtime": 49.4871, + "eval_samples_per_second": 40.415, + "eval_steps_per_second": 2.526, + "step": 6200 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015416997617156473, + "loss": 0.6685, + "step": 6220 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015369340746624306, + "loss": 0.6578, + "step": 6240 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015321683876092136, + "loss": 0.6587, + "step": 6260 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015274027005559966, + "loss": 0.6655, + "step": 6280 + }, + { + "epoch": 1.49, + "learning_rate": 0.000152263701350278, + "loss": 0.6662, + "step": 6300 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001517871326449563, + "loss": 0.6648, + "step": 6320 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015131056393963464, + "loss": 0.6638, + "step": 6340 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015083399523431294, + "loss": 0.6614, + "step": 6360 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015035742652899124, + "loss": 0.6552, + "step": 6380 + }, + { + "epoch": 1.51, + "learning_rate": 0.00014988085782366957, + "loss": 0.6753, + "step": 6400 + }, + { + "epoch": 1.51, + "eval_loss": 0.6544620990753174, + "eval_runtime": 49.3242, + "eval_samples_per_second": 40.548, + "eval_steps_per_second": 2.534, + "step": 6400 + }, + { + "epoch": 1.52, + "learning_rate": 
0.00014940428911834787, + "loss": 0.6588, + "step": 6420 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001489277204130262, + "loss": 0.6609, + "step": 6440 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014845115170770452, + "loss": 0.6565, + "step": 6460 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014797458300238282, + "loss": 0.6589, + "step": 6480 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014749801429706115, + "loss": 0.6585, + "step": 6500 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014702144559173945, + "loss": 0.6737, + "step": 6520 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014654487688641778, + "loss": 0.6554, + "step": 6540 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001460683081810961, + "loss": 0.6603, + "step": 6560 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001455917394757744, + "loss": 0.6647, + "step": 6580 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014511517077045273, + "loss": 0.6632, + "step": 6600 + }, + { + "epoch": 1.56, + "eval_loss": 0.6527110934257507, + "eval_runtime": 49.2622, + "eval_samples_per_second": 40.599, + "eval_steps_per_second": 2.537, + "step": 6600 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014463860206513106, + "loss": 0.6705, + "step": 6620 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014416203335980936, + "loss": 0.6703, + "step": 6640 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014368546465448766, + "loss": 0.6602, + "step": 6660 + }, + { + "epoch": 1.58, + "learning_rate": 0.000143208895949166, + "loss": 0.6639, + "step": 6680 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014273232724384432, + "loss": 0.6645, + "step": 6700 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014225575853852262, + "loss": 0.6655, + "step": 6720 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014177918983320094, + "loss": 0.664, + "step": 6740 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014130262112787927, + "loss": 0.6656, + "step": 6760 + }, + { + "epoch": 1.6, + 
"learning_rate": 0.00014082605242255757, + "loss": 0.6658, + "step": 6780 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001403494837172359, + "loss": 0.6641, + "step": 6800 + }, + { + "epoch": 1.61, + "eval_loss": 0.6513609886169434, + "eval_runtime": 49.4424, + "eval_samples_per_second": 40.451, + "eval_steps_per_second": 2.528, + "step": 6800 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001398729150119142, + "loss": 0.6599, + "step": 6820 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013939634630659252, + "loss": 0.6552, + "step": 6840 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013891977760127085, + "loss": 0.6616, + "step": 6860 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013844320889594915, + "loss": 0.6635, + "step": 6880 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013796664019062745, + "loss": 0.6608, + "step": 6900 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013749007148530578, + "loss": 0.6596, + "step": 6920 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001370135027799841, + "loss": 0.6589, + "step": 6940 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001365369340746624, + "loss": 0.6627, + "step": 6960 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013606036536934073, + "loss": 0.6606, + "step": 6980 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013558379666401906, + "loss": 0.6658, + "step": 7000 + }, + { + "epoch": 1.65, + "eval_loss": 0.6510519981384277, + "eval_runtime": 49.5012, + "eval_samples_per_second": 40.403, + "eval_steps_per_second": 2.525, + "step": 7000 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013510722795869736, + "loss": 0.6571, + "step": 7020 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001346306592533757, + "loss": 0.6607, + "step": 7040 + }, + { + "epoch": 1.67, + "learning_rate": 0.000134154090548054, + "loss": 0.6562, + "step": 7060 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013367752184273232, + "loss": 0.6582, + "step": 7080 + }, + { + "epoch": 1.68, + "learning_rate": 
0.00013320095313741064, + "loss": 0.6635, + "step": 7100 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013272438443208894, + "loss": 0.6682, + "step": 7120 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013224781572676727, + "loss": 0.6633, + "step": 7140 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001317712470214456, + "loss": 0.6671, + "step": 7160 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001312946783161239, + "loss": 0.6645, + "step": 7180 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001308181096108022, + "loss": 0.6699, + "step": 7200 + }, + { + "epoch": 1.7, + "eval_loss": 0.6502068042755127, + "eval_runtime": 49.4619, + "eval_samples_per_second": 40.435, + "eval_steps_per_second": 2.527, + "step": 7200 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013034154090548053, + "loss": 0.6617, + "step": 7220 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012986497220015885, + "loss": 0.6639, + "step": 7240 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012938840349483715, + "loss": 0.6634, + "step": 7260 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012891183478951548, + "loss": 0.663, + "step": 7280 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012843526608419378, + "loss": 0.6653, + "step": 7300 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001279586973788721, + "loss": 0.6555, + "step": 7320 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012748212867355044, + "loss": 0.6653, + "step": 7340 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012700555996822874, + "loss": 0.6573, + "step": 7360 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012652899126290706, + "loss": 0.658, + "step": 7380 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001260524225575854, + "loss": 0.6562, + "step": 7400 + }, + { + "epoch": 1.75, + "eval_loss": 0.6491650342941284, + "eval_runtime": 49.2463, + "eval_samples_per_second": 40.612, + "eval_steps_per_second": 2.538, + "step": 7400 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001255758538522637, + 
"loss": 0.6592, + "step": 7420 + }, + { + "epoch": 1.76, + "learning_rate": 0.000125099285146942, + "loss": 0.6587, + "step": 7440 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012462271644162032, + "loss": 0.6616, + "step": 7460 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012414614773629865, + "loss": 0.655, + "step": 7480 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012366957903097695, + "loss": 0.6591, + "step": 7500 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012319301032565527, + "loss": 0.6545, + "step": 7520 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001227164416203336, + "loss": 0.6673, + "step": 7540 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001222398729150119, + "loss": 0.6626, + "step": 7560 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012176330420969023, + "loss": 0.6663, + "step": 7580 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012128673550436854, + "loss": 0.6643, + "step": 7600 + }, + { + "epoch": 1.8, + "eval_loss": 0.6482685804367065, + "eval_runtime": 49.3591, + "eval_samples_per_second": 40.519, + "eval_steps_per_second": 2.532, + "step": 7600 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012081016679904685, + "loss": 0.6623, + "step": 7620 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012033359809372518, + "loss": 0.6636, + "step": 7640 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011985702938840348, + "loss": 0.6598, + "step": 7660 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001193804606830818, + "loss": 0.6521, + "step": 7680 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011890389197776012, + "loss": 0.664, + "step": 7700 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011842732327243844, + "loss": 0.6529, + "step": 7720 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011795075456711675, + "loss": 0.6622, + "step": 7740 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011747418586179508, + "loss": 0.6608, + "step": 7760 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011699761715647338, + 
"loss": 0.6556, + "step": 7780 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011652104845115169, + "loss": 0.6643, + "step": 7800 + }, + { + "epoch": 1.84, + "eval_loss": 0.6474015116691589, + "eval_runtime": 49.3608, + "eval_samples_per_second": 40.518, + "eval_steps_per_second": 2.532, + "step": 7800 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011604447974583002, + "loss": 0.6541, + "step": 7820 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011556791104050833, + "loss": 0.6614, + "step": 7840 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011509134233518665, + "loss": 0.6499, + "step": 7860 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011461477362986497, + "loss": 0.6563, + "step": 7880 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011413820492454327, + "loss": 0.6589, + "step": 7900 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011366163621922159, + "loss": 0.6544, + "step": 7920 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011318506751389992, + "loss": 0.6606, + "step": 7940 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011270849880857823, + "loss": 0.657, + "step": 7960 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011223193010325654, + "loss": 0.6608, + "step": 7980 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011175536139793487, + "loss": 0.6595, + "step": 8000 + }, + { + "epoch": 1.89, + "eval_loss": 0.6469079256057739, + "eval_runtime": 49.3012, + "eval_samples_per_second": 40.567, + "eval_steps_per_second": 2.535, + "step": 8000 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011127879269261318, + "loss": 0.6563, + "step": 8020 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011080222398729148, + "loss": 0.6602, + "step": 8040 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011032565528196981, + "loss": 0.6603, + "step": 8060 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010984908657664812, + "loss": 0.6495, + "step": 8080 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010937251787132644, + "loss": 0.6551, + 
"step": 8100 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010891977760127084, + "loss": 0.6497, + "step": 8120 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010844320889594917, + "loss": 0.6652, + "step": 8140 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010796664019062747, + "loss": 0.6497, + "step": 8160 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010749007148530578, + "loss": 0.6554, + "step": 8180 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010701350277998411, + "loss": 0.6563, + "step": 8200 + }, + { + "epoch": 1.94, + "eval_loss": 0.645990252494812, + "eval_runtime": 49.3957, + "eval_samples_per_second": 40.489, + "eval_steps_per_second": 2.531, + "step": 8200 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010653693407466242, + "loss": 0.6572, + "step": 8220 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010606036536934074, + "loss": 0.6563, + "step": 8240 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010558379666401906, + "loss": 0.6535, + "step": 8260 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010510722795869736, + "loss": 0.655, + "step": 8280 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010463065925337568, + "loss": 0.6554, + "step": 8300 + }, + { + "epoch": 1.97, + "learning_rate": 0.000104154090548054, + "loss": 0.6559, + "step": 8320 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010367752184273232, + "loss": 0.6522, + "step": 8340 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010320095313741063, + "loss": 0.6568, + "step": 8360 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010272438443208896, + "loss": 0.6566, + "step": 8380 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010224781572676727, + "loss": 0.6496, + "step": 8400 + }, + { + "epoch": 1.99, + "eval_loss": 0.6457875967025757, + "eval_runtime": 49.0201, + "eval_samples_per_second": 40.8, + "eval_steps_per_second": 2.55, + "step": 8400 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010177124702144557, + "loss": 0.66, + "step": 8420 + }, + { + 
"epoch": 2.0, + "learning_rate": 0.0001012946783161239, + "loss": 0.6457, + "step": 8440 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001008419380460683, + "loss": 0.6349, + "step": 8460 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010036536934074662, + "loss": 0.6545, + "step": 8480 + }, + { + "epoch": 2.01, + "learning_rate": 9.988880063542493e-05, + "loss": 0.6515, + "step": 8500 + }, + { + "epoch": 2.01, + "learning_rate": 9.941223193010326e-05, + "loss": 0.6459, + "step": 8520 + }, + { + "epoch": 2.02, + "learning_rate": 9.893566322478156e-05, + "loss": 0.6494, + "step": 8540 + }, + { + "epoch": 2.02, + "learning_rate": 9.845909451945987e-05, + "loss": 0.6608, + "step": 8560 + }, + { + "epoch": 2.03, + "learning_rate": 9.79825258141382e-05, + "loss": 0.6485, + "step": 8580 + }, + { + "epoch": 2.03, + "learning_rate": 9.750595710881651e-05, + "loss": 0.6461, + "step": 8600 + }, + { + "epoch": 2.03, + "eval_loss": 0.6450995802879333, + "eval_runtime": 49.2592, + "eval_samples_per_second": 40.602, + "eval_steps_per_second": 2.538, + "step": 8600 + }, + { + "epoch": 2.04, + "learning_rate": 9.702938840349483e-05, + "loss": 0.6523, + "step": 8620 + }, + { + "epoch": 2.04, + "learning_rate": 9.655281969817315e-05, + "loss": 0.6565, + "step": 8640 + }, + { + "epoch": 2.05, + "learning_rate": 9.607625099285145e-05, + "loss": 0.6541, + "step": 8660 + }, + { + "epoch": 2.05, + "learning_rate": 9.559968228752977e-05, + "loss": 0.6585, + "step": 8680 + }, + { + "epoch": 2.06, + "learning_rate": 9.51231135822081e-05, + "loss": 0.6531, + "step": 8700 + }, + { + "epoch": 2.06, + "learning_rate": 9.464654487688641e-05, + "loss": 0.6579, + "step": 8720 + }, + { + "epoch": 2.07, + "learning_rate": 9.416997617156472e-05, + "loss": 0.6438, + "step": 8740 + }, + { + "epoch": 2.07, + "learning_rate": 9.369340746624305e-05, + "loss": 0.6516, + "step": 8760 + }, + { + "epoch": 2.08, + "learning_rate": 9.321683876092136e-05, + "loss": 0.6576, + "step": 8780 + }, + { + "epoch": 
2.08, + "learning_rate": 9.274027005559966e-05, + "loss": 0.6506, + "step": 8800 + }, + { + "epoch": 2.08, + "eval_loss": 0.6444578170776367, + "eval_runtime": 49.0631, + "eval_samples_per_second": 40.764, + "eval_steps_per_second": 2.548, + "step": 8800 + }, + { + "epoch": 2.09, + "learning_rate": 9.226370135027799e-05, + "loss": 0.6484, + "step": 8820 + }, + { + "epoch": 2.09, + "learning_rate": 9.17871326449563e-05, + "loss": 0.6566, + "step": 8840 + }, + { + "epoch": 2.09, + "learning_rate": 9.131056393963462e-05, + "loss": 0.6547, + "step": 8860 + }, + { + "epoch": 2.1, + "learning_rate": 9.083399523431295e-05, + "loss": 0.6532, + "step": 8880 + }, + { + "epoch": 2.1, + "learning_rate": 9.035742652899126e-05, + "loss": 0.6532, + "step": 8900 + }, + { + "epoch": 2.11, + "learning_rate": 8.988085782366956e-05, + "loss": 0.6479, + "step": 8920 + }, + { + "epoch": 2.11, + "learning_rate": 8.940428911834789e-05, + "loss": 0.6548, + "step": 8940 + }, + { + "epoch": 2.12, + "learning_rate": 8.89277204130262e-05, + "loss": 0.647, + "step": 8960 + }, + { + "epoch": 2.12, + "learning_rate": 8.845115170770452e-05, + "loss": 0.6478, + "step": 8980 + }, + { + "epoch": 2.13, + "learning_rate": 8.797458300238284e-05, + "loss": 0.6553, + "step": 9000 + }, + { + "epoch": 2.13, + "eval_loss": 0.6433074474334717, + "eval_runtime": 49.3831, + "eval_samples_per_second": 40.5, + "eval_steps_per_second": 2.531, + "step": 9000 + }, + { + "epoch": 2.13, + "learning_rate": 8.749801429706116e-05, + "loss": 0.6443, + "step": 9020 + }, + { + "epoch": 2.14, + "learning_rate": 8.702144559173947e-05, + "loss": 0.6518, + "step": 9040 + }, + { + "epoch": 2.14, + "learning_rate": 8.65448768864178e-05, + "loss": 0.6578, + "step": 9060 + }, + { + "epoch": 2.15, + "learning_rate": 8.60683081810961e-05, + "loss": 0.6472, + "step": 9080 + }, + { + "epoch": 2.15, + "learning_rate": 8.559173947577441e-05, + "loss": 0.6471, + "step": 9100 + }, + { + "epoch": 2.16, + "learning_rate": 
8.511517077045274e-05, + "loss": 0.6482, + "step": 9120 + }, + { + "epoch": 2.16, + "learning_rate": 8.463860206513105e-05, + "loss": 0.6522, + "step": 9140 + }, + { + "epoch": 2.17, + "learning_rate": 8.416203335980937e-05, + "loss": 0.6584, + "step": 9160 + }, + { + "epoch": 2.17, + "learning_rate": 8.368546465448769e-05, + "loss": 0.6596, + "step": 9180 + }, + { + "epoch": 2.17, + "learning_rate": 8.320889594916599e-05, + "loss": 0.6581, + "step": 9200 + }, + { + "epoch": 2.17, + "eval_loss": 0.6426697969436646, + "eval_runtime": 49.0935, + "eval_samples_per_second": 40.739, + "eval_steps_per_second": 2.546, + "step": 9200 + }, + { + "epoch": 2.18, + "learning_rate": 8.273232724384431e-05, + "loss": 0.6441, + "step": 9220 + }, + { + "epoch": 2.18, + "learning_rate": 8.225575853852263e-05, + "loss": 0.6509, + "step": 9240 + }, + { + "epoch": 2.19, + "learning_rate": 8.177918983320095e-05, + "loss": 0.6409, + "step": 9260 + }, + { + "epoch": 2.19, + "learning_rate": 8.130262112787926e-05, + "loss": 0.6475, + "step": 9280 + }, + { + "epoch": 2.2, + "learning_rate": 8.082605242255759e-05, + "loss": 0.6597, + "step": 9300 + }, + { + "epoch": 2.2, + "learning_rate": 8.03494837172359e-05, + "loss": 0.6544, + "step": 9320 + }, + { + "epoch": 2.21, + "learning_rate": 7.98729150119142e-05, + "loss": 0.6528, + "step": 9340 + }, + { + "epoch": 2.21, + "learning_rate": 7.939634630659253e-05, + "loss": 0.644, + "step": 9360 + }, + { + "epoch": 2.22, + "learning_rate": 7.891977760127084e-05, + "loss": 0.6552, + "step": 9380 + }, + { + "epoch": 2.22, + "learning_rate": 7.844320889594916e-05, + "loss": 0.6548, + "step": 9400 + }, + { + "epoch": 2.22, + "eval_loss": 0.6423606276512146, + "eval_runtime": 49.6466, + "eval_samples_per_second": 40.285, + "eval_steps_per_second": 2.518, + "step": 9400 + }, + { + "epoch": 2.23, + "learning_rate": 7.796664019062748e-05, + "loss": 0.6568, + "step": 9420 + }, + { + "epoch": 2.23, + "learning_rate": 7.74900714853058e-05, + "loss": 0.6539, 
+ "step": 9440 + }, + { + "epoch": 2.24, + "learning_rate": 7.70135027799841e-05, + "loss": 0.6468, + "step": 9460 + }, + { + "epoch": 2.24, + "learning_rate": 7.653693407466243e-05, + "loss": 0.6425, + "step": 9480 + }, + { + "epoch": 2.25, + "learning_rate": 7.606036536934074e-05, + "loss": 0.6523, + "step": 9500 + }, + { + "epoch": 2.25, + "learning_rate": 7.558379666401905e-05, + "loss": 0.6468, + "step": 9520 + }, + { + "epoch": 2.26, + "learning_rate": 7.510722795869738e-05, + "loss": 0.6518, + "step": 9540 + }, + { + "epoch": 2.26, + "learning_rate": 7.46306592533757e-05, + "loss": 0.6534, + "step": 9560 + }, + { + "epoch": 2.26, + "learning_rate": 7.415409054805401e-05, + "loss": 0.6471, + "step": 9580 + }, + { + "epoch": 2.27, + "learning_rate": 7.367752184273232e-05, + "loss": 0.6465, + "step": 9600 + }, + { + "epoch": 2.27, + "eval_loss": 0.6418060064315796, + "eval_runtime": 49.4954, + "eval_samples_per_second": 40.408, + "eval_steps_per_second": 2.525, + "step": 9600 + }, + { + "epoch": 2.27, + "learning_rate": 7.320095313741064e-05, + "loss": 0.6577, + "step": 9620 + }, + { + "epoch": 2.28, + "learning_rate": 7.272438443208895e-05, + "loss": 0.6453, + "step": 9640 + }, + { + "epoch": 2.28, + "learning_rate": 7.224781572676726e-05, + "loss": 0.6489, + "step": 9660 + }, + { + "epoch": 2.29, + "learning_rate": 7.177124702144559e-05, + "loss": 0.6466, + "step": 9680 + }, + { + "epoch": 2.29, + "learning_rate": 7.12946783161239e-05, + "loss": 0.6493, + "step": 9700 + }, + { + "epoch": 2.3, + "learning_rate": 7.081810961080222e-05, + "loss": 0.6537, + "step": 9720 + }, + { + "epoch": 2.3, + "learning_rate": 7.034154090548053e-05, + "loss": 0.6486, + "step": 9740 + }, + { + "epoch": 2.31, + "learning_rate": 6.986497220015885e-05, + "loss": 0.65, + "step": 9760 + }, + { + "epoch": 2.31, + "learning_rate": 6.938840349483717e-05, + "loss": 0.6387, + "step": 9780 + }, + { + "epoch": 2.32, + "learning_rate": 6.891183478951549e-05, + "loss": 0.6464, + "step": 9800 
+ }, + { + "epoch": 2.32, + "eval_loss": 0.6412256360054016, + "eval_runtime": 49.3752, + "eval_samples_per_second": 40.506, + "eval_steps_per_second": 2.532, + "step": 9800 + }, + { + "epoch": 2.32, + "learning_rate": 6.84352660841938e-05, + "loss": 0.6475, + "step": 9820 + }, + { + "epoch": 2.33, + "learning_rate": 6.795869737887211e-05, + "loss": 0.6543, + "step": 9840 + }, + { + "epoch": 2.33, + "learning_rate": 6.748212867355043e-05, + "loss": 0.6545, + "step": 9860 + }, + { + "epoch": 2.34, + "learning_rate": 6.700555996822874e-05, + "loss": 0.6468, + "step": 9880 + }, + { + "epoch": 2.34, + "learning_rate": 6.652899126290707e-05, + "loss": 0.651, + "step": 9900 + }, + { + "epoch": 2.35, + "learning_rate": 6.605242255758538e-05, + "loss": 0.641, + "step": 9920 + }, + { + "epoch": 2.35, + "learning_rate": 6.55758538522637e-05, + "loss": 0.657, + "step": 9940 + }, + { + "epoch": 2.35, + "learning_rate": 6.509928514694201e-05, + "loss": 0.6481, + "step": 9960 + }, + { + "epoch": 2.36, + "learning_rate": 6.462271644162034e-05, + "loss": 0.6496, + "step": 9980 + }, + { + "epoch": 2.36, + "learning_rate": 6.414614773629864e-05, + "loss": 0.6451, + "step": 10000 + }, + { + "epoch": 2.36, + "eval_loss": 0.6414454579353333, + "eval_runtime": 49.395, + "eval_samples_per_second": 40.49, + "eval_steps_per_second": 2.531, + "step": 10000 + }, + { + "epoch": 2.37, + "learning_rate": 6.366957903097696e-05, + "loss": 0.6555, + "step": 10020 + }, + { + "epoch": 2.37, + "learning_rate": 6.319301032565528e-05, + "loss": 0.6494, + "step": 10040 + }, + { + "epoch": 2.38, + "learning_rate": 6.271644162033359e-05, + "loss": 0.6487, + "step": 10060 + }, + { + "epoch": 2.38, + "learning_rate": 6.22398729150119e-05, + "loss": 0.6544, + "step": 10080 + }, + { + "epoch": 2.39, + "learning_rate": 6.176330420969023e-05, + "loss": 0.6468, + "step": 10100 + }, + { + "epoch": 2.39, + "learning_rate": 6.128673550436853e-05, + "loss": 0.6441, + "step": 10120 + }, + { + "epoch": 2.4, + 
"learning_rate": 6.081016679904686e-05, + "loss": 0.6478, + "step": 10140 + }, + { + "epoch": 2.4, + "learning_rate": 6.033359809372518e-05, + "loss": 0.6539, + "step": 10160 + }, + { + "epoch": 2.41, + "learning_rate": 5.985702938840349e-05, + "loss": 0.6486, + "step": 10180 + }, + { + "epoch": 2.41, + "learning_rate": 5.938046068308181e-05, + "loss": 0.6467, + "step": 10200 + }, + { + "epoch": 2.41, + "eval_loss": 0.6406835913658142, + "eval_runtime": 49.5084, + "eval_samples_per_second": 40.397, + "eval_steps_per_second": 2.525, + "step": 10200 + }, + { + "epoch": 2.42, + "learning_rate": 5.890389197776013e-05, + "loss": 0.6399, + "step": 10220 + }, + { + "epoch": 2.42, + "learning_rate": 5.8427323272438435e-05, + "loss": 0.6519, + "step": 10240 + }, + { + "epoch": 2.43, + "learning_rate": 5.7950754567116756e-05, + "loss": 0.6465, + "step": 10260 + }, + { + "epoch": 2.43, + "learning_rate": 5.7474185861795076e-05, + "loss": 0.6479, + "step": 10280 + }, + { + "epoch": 2.43, + "learning_rate": 5.6997617156473383e-05, + "loss": 0.6462, + "step": 10300 + }, + { + "epoch": 2.44, + "learning_rate": 5.6521048451151704e-05, + "loss": 0.6451, + "step": 10320 + }, + { + "epoch": 2.44, + "learning_rate": 5.604447974583002e-05, + "loss": 0.6453, + "step": 10340 + }, + { + "epoch": 2.45, + "learning_rate": 5.556791104050833e-05, + "loss": 0.6543, + "step": 10360 + }, + { + "epoch": 2.45, + "learning_rate": 5.509134233518665e-05, + "loss": 0.6428, + "step": 10380 + }, + { + "epoch": 2.46, + "learning_rate": 5.4614773629864966e-05, + "loss": 0.6491, + "step": 10400 + }, + { + "epoch": 2.46, + "eval_loss": 0.6400973796844482, + "eval_runtime": 49.3411, + "eval_samples_per_second": 40.534, + "eval_steps_per_second": 2.533, + "step": 10400 + }, + { + "epoch": 2.46, + "learning_rate": 5.413820492454328e-05, + "loss": 0.649, + "step": 10420 + }, + { + "epoch": 2.47, + "learning_rate": 5.36616362192216e-05, + "loss": 0.6494, + "step": 10440 + }, + { + "epoch": 2.47, + 
"learning_rate": 5.3185067513899913e-05, + "loss": 0.6431, + "step": 10460 + }, + { + "epoch": 2.48, + "learning_rate": 5.2708498808578234e-05, + "loss": 0.6478, + "step": 10480 + }, + { + "epoch": 2.48, + "learning_rate": 5.223193010325655e-05, + "loss": 0.6416, + "step": 10500 + }, + { + "epoch": 2.49, + "learning_rate": 5.175536139793486e-05, + "loss": 0.6507, + "step": 10520 + }, + { + "epoch": 2.49, + "learning_rate": 5.127879269261318e-05, + "loss": 0.6448, + "step": 10540 + }, + { + "epoch": 2.5, + "learning_rate": 5.0802223987291496e-05, + "loss": 0.6455, + "step": 10560 + }, + { + "epoch": 2.5, + "learning_rate": 5.032565528196981e-05, + "loss": 0.6437, + "step": 10580 + }, + { + "epoch": 2.51, + "learning_rate": 4.984908657664813e-05, + "loss": 0.6488, + "step": 10600 + }, + { + "epoch": 2.51, + "eval_loss": 0.6400858163833618, + "eval_runtime": 49.8084, + "eval_samples_per_second": 40.154, + "eval_steps_per_second": 2.51, + "step": 10600 + }, + { + "epoch": 2.51, + "learning_rate": 4.937251787132645e-05, + "loss": 0.6436, + "step": 10620 + }, + { + "epoch": 2.52, + "learning_rate": 4.889594916600476e-05, + "loss": 0.6446, + "step": 10640 + }, + { + "epoch": 2.52, + "learning_rate": 4.841938046068308e-05, + "loss": 0.6488, + "step": 10660 + }, + { + "epoch": 2.52, + "learning_rate": 4.79428117553614e-05, + "loss": 0.6485, + "step": 10680 + }, + { + "epoch": 2.53, + "learning_rate": 4.7466243050039705e-05, + "loss": 0.6524, + "step": 10700 + }, + { + "epoch": 2.53, + "learning_rate": 4.6989674344718026e-05, + "loss": 0.6376, + "step": 10720 + }, + { + "epoch": 2.54, + "learning_rate": 4.6513105639396346e-05, + "loss": 0.649, + "step": 10740 + }, + { + "epoch": 2.54, + "learning_rate": 4.603653693407465e-05, + "loss": 0.6444, + "step": 10760 + }, + { + "epoch": 2.55, + "learning_rate": 4.5559968228752974e-05, + "loss": 0.6407, + "step": 10780 + }, + { + "epoch": 2.55, + "learning_rate": 4.5083399523431294e-05, + "loss": 0.6448, + "step": 10800 + }, + { + 
"epoch": 2.55, + "eval_loss": 0.6392157077789307, + "eval_runtime": 49.7963, + "eval_samples_per_second": 40.164, + "eval_steps_per_second": 2.51, + "step": 10800 + }, + { + "epoch": 2.56, + "learning_rate": 4.46068308181096e-05, + "loss": 0.6454, + "step": 10820 + }, + { + "epoch": 2.56, + "learning_rate": 4.413026211278792e-05, + "loss": 0.6544, + "step": 10840 + }, + { + "epoch": 2.57, + "learning_rate": 4.365369340746624e-05, + "loss": 0.6478, + "step": 10860 + }, + { + "epoch": 2.57, + "learning_rate": 4.3177124702144556e-05, + "loss": 0.6434, + "step": 10880 + }, + { + "epoch": 2.58, + "learning_rate": 4.270055599682287e-05, + "loss": 0.6482, + "step": 10900 + }, + { + "epoch": 2.58, + "learning_rate": 4.222398729150119e-05, + "loss": 0.6403, + "step": 10920 + }, + { + "epoch": 2.59, + "learning_rate": 4.1747418586179504e-05, + "loss": 0.6501, + "step": 10940 + }, + { + "epoch": 2.59, + "learning_rate": 4.127084988085782e-05, + "loss": 0.6507, + "step": 10960 + }, + { + "epoch": 2.6, + "learning_rate": 4.079428117553614e-05, + "loss": 0.6496, + "step": 10980 + }, + { + "epoch": 2.6, + "learning_rate": 4.031771247021445e-05, + "loss": 0.6544, + "step": 11000 + }, + { + "epoch": 2.6, + "eval_loss": 0.6390016078948975, + "eval_runtime": 49.6306, + "eval_samples_per_second": 40.298, + "eval_steps_per_second": 2.519, + "step": 11000 + }, + { + "epoch": 2.61, + "learning_rate": 3.984114376489277e-05, + "loss": 0.6405, + "step": 11020 + }, + { + "epoch": 2.61, + "learning_rate": 3.9364575059571086e-05, + "loss": 0.6429, + "step": 11040 + }, + { + "epoch": 2.61, + "learning_rate": 3.88880063542494e-05, + "loss": 0.6403, + "step": 11060 + }, + { + "epoch": 2.62, + "learning_rate": 3.841143764892772e-05, + "loss": 0.6338, + "step": 11080 + }, + { + "epoch": 2.62, + "learning_rate": 3.7934868943606034e-05, + "loss": 0.6417, + "step": 11100 + }, + { + "epoch": 2.63, + "learning_rate": 3.7458300238284354e-05, + "loss": 0.6463, + "step": 11120 + }, + { + "epoch": 2.63, + 
"learning_rate": 3.698173153296267e-05, + "loss": 0.6498, + "step": 11140 + }, + { + "epoch": 2.64, + "learning_rate": 3.650516282764098e-05, + "loss": 0.6415, + "step": 11160 + }, + { + "epoch": 2.64, + "learning_rate": 3.6028594122319296e-05, + "loss": 0.645, + "step": 11180 + }, + { + "epoch": 2.65, + "learning_rate": 3.5552025416997616e-05, + "loss": 0.6467, + "step": 11200 + }, + { + "epoch": 2.65, + "eval_loss": 0.6387213468551636, + "eval_runtime": 49.1775, + "eval_samples_per_second": 40.669, + "eval_steps_per_second": 2.542, + "step": 11200 + }, + { + "epoch": 2.65, + "learning_rate": 3.507545671167593e-05, + "loss": 0.6515, + "step": 11220 + }, + { + "epoch": 2.66, + "learning_rate": 3.4598888006354244e-05, + "loss": 0.65, + "step": 11240 + }, + { + "epoch": 2.66, + "learning_rate": 3.4122319301032564e-05, + "loss": 0.6512, + "step": 11260 + }, + { + "epoch": 2.67, + "learning_rate": 3.364575059571088e-05, + "loss": 0.6443, + "step": 11280 + }, + { + "epoch": 2.67, + "learning_rate": 3.316918189038919e-05, + "loss": 0.6483, + "step": 11300 + }, + { + "epoch": 2.68, + "learning_rate": 3.269261318506751e-05, + "loss": 0.6455, + "step": 11320 + }, + { + "epoch": 2.68, + "learning_rate": 3.2216044479745826e-05, + "loss": 0.6461, + "step": 11340 + }, + { + "epoch": 2.69, + "learning_rate": 3.173947577442414e-05, + "loss": 0.6505, + "step": 11360 + }, + { + "epoch": 2.69, + "learning_rate": 3.126290706910246e-05, + "loss": 0.6517, + "step": 11380 + }, + { + "epoch": 2.7, + "learning_rate": 3.0786338363780774e-05, + "loss": 0.6406, + "step": 11400 + }, + { + "epoch": 2.7, + "eval_loss": 0.6380326151847839, + "eval_runtime": 49.4129, + "eval_samples_per_second": 40.475, + "eval_steps_per_second": 2.53, + "step": 11400 + }, + { + "epoch": 2.7, + "learning_rate": 3.030976965845909e-05, + "loss": 0.647, + "step": 11420 + }, + { + "epoch": 2.7, + "learning_rate": 2.9833200953137408e-05, + "loss": 0.6495, + "step": 11440 + }, + { + "epoch": 2.71, + "learning_rate": 
2.9356632247815725e-05, + "loss": 0.6448, + "step": 11460 + }, + { + "epoch": 2.71, + "learning_rate": 2.888006354249404e-05, + "loss": 0.6447, + "step": 11480 + }, + { + "epoch": 2.72, + "learning_rate": 2.840349483717236e-05, + "loss": 0.6527, + "step": 11500 + }, + { + "epoch": 2.72, + "learning_rate": 2.7926926131850673e-05, + "loss": 0.6406, + "step": 11520 + }, + { + "epoch": 2.73, + "learning_rate": 2.7450357426528987e-05, + "loss": 0.6443, + "step": 11540 + }, + { + "epoch": 2.73, + "learning_rate": 2.6973788721207307e-05, + "loss": 0.6351, + "step": 11560 + }, + { + "epoch": 2.74, + "learning_rate": 2.649722001588562e-05, + "loss": 0.6417, + "step": 11580 + }, + { + "epoch": 2.74, + "learning_rate": 2.6020651310563938e-05, + "loss": 0.6356, + "step": 11600 + }, + { + "epoch": 2.74, + "eval_loss": 0.6381237506866455, + "eval_runtime": 49.5534, + "eval_samples_per_second": 40.36, + "eval_steps_per_second": 2.523, + "step": 11600 + }, + { + "epoch": 2.75, + "learning_rate": 2.5544082605242255e-05, + "loss": 0.6412, + "step": 11620 + }, + { + "epoch": 2.75, + "learning_rate": 2.506751389992057e-05, + "loss": 0.6418, + "step": 11640 + }, + { + "epoch": 2.76, + "learning_rate": 2.4590945194598886e-05, + "loss": 0.6426, + "step": 11660 + }, + { + "epoch": 2.76, + "learning_rate": 2.4114376489277203e-05, + "loss": 0.6461, + "step": 11680 + }, + { + "epoch": 2.77, + "learning_rate": 2.363780778395552e-05, + "loss": 0.6475, + "step": 11700 + }, + { + "epoch": 2.77, + "learning_rate": 2.3161239078633834e-05, + "loss": 0.6431, + "step": 11720 + }, + { + "epoch": 2.78, + "learning_rate": 2.2684670373312148e-05, + "loss": 0.6416, + "step": 11740 + }, + { + "epoch": 2.78, + "learning_rate": 2.2208101667990468e-05, + "loss": 0.6495, + "step": 11760 + }, + { + "epoch": 2.78, + "learning_rate": 2.1731532962668782e-05, + "loss": 0.6404, + "step": 11780 + }, + { + "epoch": 2.79, + "learning_rate": 2.1254964257347096e-05, + "loss": 0.6434, + "step": 11800 + }, + { + "epoch": 
2.79, + "eval_loss": 0.6377163529396057, + "eval_runtime": 49.328, + "eval_samples_per_second": 40.545, + "eval_steps_per_second": 2.534, + "step": 11800 + }, + { + "epoch": 2.79, + "learning_rate": 2.0778395552025416e-05, + "loss": 0.6437, + "step": 11820 + }, + { + "epoch": 2.8, + "learning_rate": 2.030182684670373e-05, + "loss": 0.6393, + "step": 11840 + }, + { + "epoch": 2.8, + "learning_rate": 1.9825258141382047e-05, + "loss": 0.6412, + "step": 11860 + }, + { + "epoch": 2.81, + "learning_rate": 1.9348689436060364e-05, + "loss": 0.6494, + "step": 11880 + }, + { + "epoch": 2.81, + "learning_rate": 1.887212073073868e-05, + "loss": 0.6481, + "step": 11900 + }, + { + "epoch": 2.82, + "learning_rate": 1.8395552025416998e-05, + "loss": 0.6407, + "step": 11920 + }, + { + "epoch": 2.82, + "learning_rate": 1.7918983320095312e-05, + "loss": 0.6422, + "step": 11940 + }, + { + "epoch": 2.83, + "learning_rate": 1.744241461477363e-05, + "loss": 0.6487, + "step": 11960 + }, + { + "epoch": 2.83, + "learning_rate": 1.6965845909451946e-05, + "loss": 0.6478, + "step": 11980 + }, + { + "epoch": 2.84, + "learning_rate": 1.648927720413026e-05, + "loss": 0.6451, + "step": 12000 + }, + { + "epoch": 2.84, + "eval_loss": 0.6374698281288147, + "eval_runtime": 49.9107, + "eval_samples_per_second": 40.072, + "eval_steps_per_second": 2.504, + "step": 12000 + }, + { + "epoch": 2.84, + "learning_rate": 1.6012708498808577e-05, + "loss": 0.6454, + "step": 12020 + }, + { + "epoch": 2.85, + "learning_rate": 1.5536139793486894e-05, + "loss": 0.6399, + "step": 12040 + }, + { + "epoch": 2.85, + "learning_rate": 1.5059571088165208e-05, + "loss": 0.6479, + "step": 12060 + }, + { + "epoch": 2.86, + "learning_rate": 1.4583002382843525e-05, + "loss": 0.6412, + "step": 12080 + }, + { + "epoch": 2.86, + "learning_rate": 1.4106433677521842e-05, + "loss": 0.65, + "step": 12100 + }, + { + "epoch": 2.87, + "learning_rate": 1.3629864972200157e-05, + "loss": 0.6461, + "step": 12120 + }, + { + "epoch": 2.87, + 
"learning_rate": 1.3153296266878475e-05, + "loss": 0.6434, + "step": 12140 + }, + { + "epoch": 2.87, + "learning_rate": 1.2676727561556788e-05, + "loss": 0.6463, + "step": 12160 + }, + { + "epoch": 2.88, + "learning_rate": 1.2200158856235105e-05, + "loss": 0.6399, + "step": 12180 + }, + { + "epoch": 2.88, + "learning_rate": 1.1723590150913422e-05, + "loss": 0.6446, + "step": 12200 + }, + { + "epoch": 2.88, + "eval_loss": 0.6372544765472412, + "eval_runtime": 49.6265, + "eval_samples_per_second": 40.301, + "eval_steps_per_second": 2.519, + "step": 12200 + }, + { + "epoch": 2.89, + "learning_rate": 1.1247021445591738e-05, + "loss": 0.6411, + "step": 12220 + }, + { + "epoch": 2.89, + "learning_rate": 1.0770452740270055e-05, + "loss": 0.6523, + "step": 12240 + }, + { + "epoch": 2.9, + "learning_rate": 1.0293884034948372e-05, + "loss": 0.6456, + "step": 12260 + }, + { + "epoch": 2.9, + "learning_rate": 9.817315329626686e-06, + "loss": 0.6394, + "step": 12280 + }, + { + "epoch": 2.91, + "learning_rate": 9.340746624305003e-06, + "loss": 0.6466, + "step": 12300 + }, + { + "epoch": 2.91, + "learning_rate": 8.864177918983318e-06, + "loss": 0.6415, + "step": 12320 + }, + { + "epoch": 2.92, + "learning_rate": 8.387609213661635e-06, + "loss": 0.6349, + "step": 12340 + }, + { + "epoch": 2.92, + "learning_rate": 7.911040508339953e-06, + "loss": 0.6415, + "step": 12360 + }, + { + "epoch": 2.93, + "learning_rate": 7.434471803018268e-06, + "loss": 0.6484, + "step": 12380 + }, + { + "epoch": 2.93, + "learning_rate": 6.957903097696583e-06, + "loss": 0.6522, + "step": 12400 + }, + { + "epoch": 2.93, + "eval_loss": 0.6368712186813354, + "eval_runtime": 49.0693, + "eval_samples_per_second": 40.759, + "eval_steps_per_second": 2.547, + "step": 12400 + } + ], + "max_steps": 12690, + "num_train_epochs": 3, + "total_flos": 1.6115968989795975e+19, + "trial_name": null, + "trial_params": null +} diff --git 
a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/training_args.bin b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cfd5ca55a1cd7f462c1d326faacf15d022e29425 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a070370e87c048b60fc888b8736a0166eb94eeb3a75f5f78918edab715d0fb1c +size 3579 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/optimizer.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..743218e3c9362a41814999be1dd6aa178b7f534e --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adc421ca0cff089fc9d5410754618990eeffd7211650fedbd4ea6c12ce50487d +size 33629893 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/pytorch_model.bin b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ee3619f29958aceb56045bec594d035fb6f984c5 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07cb1c0cd175ab3c89d7c1d0757096e6296fb26b15857a8055e81cea1ea2cc09 +size 16822989 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/rng_state_0.pth b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..325273bc4deb72be35ffc05ac7d39c6edbf69322 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fea007e68f1c9bcee110c38f61b818ac2bc4fd40e2301190a645e34c33a0f16 +size 14583 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/rng_state_1.pth b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5856df977fdf7a5e9e28f5b8941c8b292413cf0d --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3f8942b136a5483f01aef18b20bc8e5593a6ffc1c98f568387b2998ae5919ea +size 14583 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/scaler.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a51f329bd75e2531b9fa8a50da50949bfecf7780 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ef71700e98ec366ab46243409e3aae93c95cdbd7ffb98f89074ebd1b9ec3b15 +size 557 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/scheduler.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5667c2362a4287b4ae245c40855c4e15a7fb91e6 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:785a83cf92471c0356ef5051741195c9790867768c86ba3d8a21ba52906bbf2f +size 627 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/trainer_state.json b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2ca7fe610b4e2cf03b6d138e49727d481250c80f --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/trainer_state.json @@ -0,0 +1,4300 @@ +{ + "best_metric": 0.636846125125885, + "best_model_checkpoint": "lora-alpaca-cn/checkpoint-12600", + "epoch": 2.978723404255319, + "global_step": 12600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7735, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.1358, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 0.9749, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00023999999999999998, + "loss": 0.9316, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 0.9072, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002995234312946783, + "loss": 0.8963, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002990468625893566, + "loss": 0.8853, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002985702938840349, + "loss": 0.8709, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029809372517871323, + "loss": 0.8555, + "step": 180 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029761715647339156, + "loss": 0.8584, + "step": 200 + }, + { + "epoch": 0.05, + "eval_loss": 0.8360834717750549, + "eval_runtime": 49.29, + "eval_samples_per_second": 40.576, + "eval_steps_per_second": 2.536, + "step": 200 + }, + { + "epoch": 0.05, + 
"learning_rate": 0.0002971405877680699, + "loss": 0.859, + "step": 220 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029666401906274816, + "loss": 0.8511, + "step": 240 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002961874503574265, + "loss": 0.8401, + "step": 260 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002957108816521048, + "loss": 0.8357, + "step": 280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029523431294678314, + "loss": 0.8413, + "step": 300 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029475774424146147, + "loss": 0.8283, + "step": 320 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002942811755361398, + "loss": 0.8202, + "step": 340 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029380460683081807, + "loss": 0.8222, + "step": 360 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002933280381254964, + "loss": 0.8178, + "step": 380 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002928514694201747, + "loss": 0.8177, + "step": 400 + }, + { + "epoch": 0.09, + "eval_loss": 0.7966175079345703, + "eval_runtime": 49.1752, + "eval_samples_per_second": 40.671, + "eval_steps_per_second": 2.542, + "step": 400 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029237490071485305, + "loss": 0.8057, + "step": 420 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002918983320095314, + "loss": 0.811, + "step": 440 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029142176330420965, + "loss": 0.8056, + "step": 460 + }, + { + "epoch": 0.11, + "learning_rate": 0.000290945194598888, + "loss": 0.7993, + "step": 480 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002904686258935663, + "loss": 0.7982, + "step": 500 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002899920571882446, + "loss": 0.8023, + "step": 520 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028951548848292296, + "loss": 0.7968, + "step": 540 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028903891977760123, + "loss": 0.8029, + "step": 560 + }, + { + "epoch": 0.14, + "learning_rate": 
0.00028856235107227956, + "loss": 0.7892, + "step": 580 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002880857823669579, + "loss": 0.7946, + "step": 600 + }, + { + "epoch": 0.14, + "eval_loss": 0.7735009789466858, + "eval_runtime": 49.3305, + "eval_samples_per_second": 40.543, + "eval_steps_per_second": 2.534, + "step": 600 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028760921366163616, + "loss": 0.782, + "step": 620 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002871326449563145, + "loss": 0.7799, + "step": 640 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002866560762509928, + "loss": 0.7782, + "step": 660 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028617950754567114, + "loss": 0.7785, + "step": 680 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028570293884034947, + "loss": 0.785, + "step": 700 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002852263701350278, + "loss": 0.7754, + "step": 720 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028474980142970607, + "loss": 0.7804, + "step": 740 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002842732327243844, + "loss": 0.7696, + "step": 760 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002837966640190627, + "loss": 0.7692, + "step": 780 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028332009531374105, + "loss": 0.7752, + "step": 800 + }, + { + "epoch": 0.19, + "eval_loss": 0.7564254403114319, + "eval_runtime": 49.106, + "eval_samples_per_second": 40.728, + "eval_steps_per_second": 2.546, + "step": 800 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002828435266084194, + "loss": 0.7698, + "step": 820 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028236695790309765, + "loss": 0.7699, + "step": 840 + }, + { + "epoch": 0.2, + "learning_rate": 0.000281890389197776, + "loss": 0.7718, + "step": 860 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002814138204924543, + "loss": 0.7644, + "step": 880 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028093725178713263, + "loss": 0.7659, + "step": 
900 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028046068308181096, + "loss": 0.7641, + "step": 920 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027998411437648923, + "loss": 0.7535, + "step": 940 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027950754567116756, + "loss": 0.7672, + "step": 960 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002790309769658459, + "loss": 0.7563, + "step": 980 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002785544082605242, + "loss": 0.752, + "step": 1000 + }, + { + "epoch": 0.24, + "eval_loss": 0.7433652281761169, + "eval_runtime": 48.9945, + "eval_samples_per_second": 40.821, + "eval_steps_per_second": 2.551, + "step": 1000 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027807783955520254, + "loss": 0.755, + "step": 1020 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027760127084988087, + "loss": 0.7563, + "step": 1040 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027712470214455914, + "loss": 0.7475, + "step": 1060 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027664813343923747, + "loss": 0.7599, + "step": 1080 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002761715647339158, + "loss": 0.7533, + "step": 1100 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027569499602859407, + "loss": 0.7488, + "step": 1120 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027521842732327245, + "loss": 0.753, + "step": 1140 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002747418586179507, + "loss": 0.7435, + "step": 1160 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027426528991262905, + "loss": 0.7457, + "step": 1180 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002737887212073074, + "loss": 0.742, + "step": 1200 + }, + { + "epoch": 0.28, + "eval_loss": 0.7321739792823792, + "eval_runtime": 48.8876, + "eval_samples_per_second": 40.91, + "eval_steps_per_second": 2.557, + "step": 1200 + }, + { + "epoch": 0.29, + "learning_rate": 0.00027331215250198565, + "loss": 0.7474, + "step": 1220 + }, + { + "epoch": 0.29, + 
"learning_rate": 0.000272835583796664, + "loss": 0.7456, + "step": 1240 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002723590150913423, + "loss": 0.7406, + "step": 1260 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027188244638602063, + "loss": 0.7448, + "step": 1280 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027140587768069896, + "loss": 0.7445, + "step": 1300 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027092930897537723, + "loss": 0.7349, + "step": 1320 + }, + { + "epoch": 0.32, + "learning_rate": 0.00027045274027005556, + "loss": 0.7395, + "step": 1340 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002699761715647339, + "loss": 0.7382, + "step": 1360 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002694996028594122, + "loss": 0.7357, + "step": 1380 + }, + { + "epoch": 0.33, + "learning_rate": 0.00026902303415409054, + "loss": 0.7409, + "step": 1400 + }, + { + "epoch": 0.33, + "eval_loss": 0.7235888242721558, + "eval_runtime": 49.2145, + "eval_samples_per_second": 40.638, + "eval_steps_per_second": 2.54, + "step": 1400 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026854646544876887, + "loss": 0.7376, + "step": 1420 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026806989674344714, + "loss": 0.7298, + "step": 1440 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026759332803812547, + "loss": 0.7379, + "step": 1460 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002671167593328038, + "loss": 0.7354, + "step": 1480 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002666401906274821, + "loss": 0.7341, + "step": 1500 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026616362192216045, + "loss": 0.7352, + "step": 1520 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002656870532168387, + "loss": 0.7321, + "step": 1540 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026521048451151705, + "loss": 0.7285, + "step": 1560 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002647339158061954, + "loss": 0.73, + "step": 1580 + }, + { + "epoch": 0.38, + 
"learning_rate": 0.00026425734710087365, + "loss": 0.7304, + "step": 1600 + }, + { + "epoch": 0.38, + "eval_loss": 0.716058611869812, + "eval_runtime": 48.9201, + "eval_samples_per_second": 40.883, + "eval_steps_per_second": 2.555, + "step": 1600 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026378077839555203, + "loss": 0.7314, + "step": 1620 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002633042096902303, + "loss": 0.7315, + "step": 1640 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026282764098490863, + "loss": 0.7239, + "step": 1660 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026235107227958696, + "loss": 0.73, + "step": 1680 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026187450357426523, + "loss": 0.7243, + "step": 1700 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026139793486894356, + "loss": 0.7199, + "step": 1720 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002609213661636219, + "loss": 0.7216, + "step": 1740 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002604447974583002, + "loss": 0.7358, + "step": 1760 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025996822875297854, + "loss": 0.7313, + "step": 1780 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025949166004765687, + "loss": 0.7236, + "step": 1800 + }, + { + "epoch": 0.43, + "eval_loss": 0.7097632884979248, + "eval_runtime": 49.4908, + "eval_samples_per_second": 40.412, + "eval_steps_per_second": 2.526, + "step": 1800 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025901509134233514, + "loss": 0.7282, + "step": 1820 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025853852263701347, + "loss": 0.7187, + "step": 1840 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002580619539316918, + "loss": 0.7303, + "step": 1860 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002575853852263701, + "loss": 0.724, + "step": 1880 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025710881652104845, + "loss": 0.7248, + "step": 1900 + }, + { + "epoch": 0.45, + "learning_rate": 
0.0002566322478157267, + "loss": 0.7195, + "step": 1920 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025615567911040505, + "loss": 0.7269, + "step": 1940 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002556791104050834, + "loss": 0.7209, + "step": 1960 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002552025416997617, + "loss": 0.7282, + "step": 1980 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025472597299444003, + "loss": 0.7195, + "step": 2000 + }, + { + "epoch": 0.47, + "eval_loss": 0.7037709355354309, + "eval_runtime": 49.7167, + "eval_samples_per_second": 40.228, + "eval_steps_per_second": 2.514, + "step": 2000 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002542494042891183, + "loss": 0.7229, + "step": 2020 + }, + { + "epoch": 0.48, + "learning_rate": 0.00025377283558379664, + "loss": 0.718, + "step": 2040 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025329626687847496, + "loss": 0.7223, + "step": 2060 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025281969817315324, + "loss": 0.7209, + "step": 2080 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002523431294678316, + "loss": 0.7151, + "step": 2100 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002518665607625099, + "loss": 0.7141, + "step": 2120 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002513899920571882, + "loss": 0.7084, + "step": 2140 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025091342335186654, + "loss": 0.7075, + "step": 2160 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025043685464654487, + "loss": 0.7133, + "step": 2180 + }, + { + "epoch": 0.52, + "learning_rate": 0.00024996028594122314, + "loss": 0.7092, + "step": 2200 + }, + { + "epoch": 0.52, + "eval_loss": 0.6989386677742004, + "eval_runtime": 49.2344, + "eval_samples_per_second": 40.622, + "eval_steps_per_second": 2.539, + "step": 2200 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002494837172359015, + "loss": 0.7178, + "step": 2220 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002490071485305798, + 
"loss": 0.7188, + "step": 2240 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002485305798252581, + "loss": 0.7161, + "step": 2260 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024805401111993645, + "loss": 0.7078, + "step": 2280 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002475774424146147, + "loss": 0.7, + "step": 2300 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024710087370929305, + "loss": 0.718, + "step": 2320 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002466243050039714, + "loss": 0.7059, + "step": 2340 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002461477362986497, + "loss": 0.712, + "step": 2360 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024567116759332804, + "loss": 0.7116, + "step": 2380 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002451945988880063, + "loss": 0.6986, + "step": 2400 + }, + { + "epoch": 0.57, + "eval_loss": 0.6939737796783447, + "eval_runtime": 49.459, + "eval_samples_per_second": 40.438, + "eval_steps_per_second": 2.527, + "step": 2400 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024471803018268464, + "loss": 0.7168, + "step": 2420 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024424146147736296, + "loss": 0.7141, + "step": 2440 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024376489277204126, + "loss": 0.7095, + "step": 2460 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024328832406671962, + "loss": 0.7091, + "step": 2480 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024281175536139792, + "loss": 0.7015, + "step": 2500 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024233518665607622, + "loss": 0.7109, + "step": 2520 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024185861795075455, + "loss": 0.7086, + "step": 2540 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024138204924543285, + "loss": 0.7118, + "step": 2560 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024090548054011117, + "loss": 0.7033, + "step": 2580 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002404289118347895, + 
"loss": 0.7128, + "step": 2600 + }, + { + "epoch": 0.61, + "eval_loss": 0.6901652812957764, + "eval_runtime": 49.5038, + "eval_samples_per_second": 40.401, + "eval_steps_per_second": 2.525, + "step": 2600 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023995234312946783, + "loss": 0.6968, + "step": 2620 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023947577442414613, + "loss": 0.7109, + "step": 2640 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023899920571882443, + "loss": 0.7048, + "step": 2660 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023852263701350276, + "loss": 0.7012, + "step": 2680 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023804606830818106, + "loss": 0.7065, + "step": 2700 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002375694996028594, + "loss": 0.7009, + "step": 2720 + }, + { + "epoch": 0.65, + "learning_rate": 0.0002370929308975377, + "loss": 0.7035, + "step": 2740 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023661636219221604, + "loss": 0.6973, + "step": 2760 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023613979348689434, + "loss": 0.7075, + "step": 2780 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023566322478157264, + "loss": 0.6952, + "step": 2800 + }, + { + "epoch": 0.66, + "eval_loss": 0.6865400671958923, + "eval_runtime": 49.2814, + "eval_samples_per_second": 40.583, + "eval_steps_per_second": 2.536, + "step": 2800 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023518665607625097, + "loss": 0.6979, + "step": 2820 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002347100873709293, + "loss": 0.6973, + "step": 2840 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023423351866560762, + "loss": 0.7033, + "step": 2860 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023375694996028592, + "loss": 0.6964, + "step": 2880 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023328038125496422, + "loss": 0.7052, + "step": 2900 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023280381254964255, + "loss": 0.6999, + 
"step": 2920 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023232724384432085, + "loss": 0.6963, + "step": 2940 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002318506751389992, + "loss": 0.7025, + "step": 2960 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002313741064336775, + "loss": 0.704, + "step": 2980 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023089753772835583, + "loss": 0.6926, + "step": 3000 + }, + { + "epoch": 0.71, + "eval_loss": 0.6828380227088928, + "eval_runtime": 49.5667, + "eval_samples_per_second": 40.35, + "eval_steps_per_second": 2.522, + "step": 3000 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023042096902303413, + "loss": 0.698, + "step": 3020 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022994440031771243, + "loss": 0.6893, + "step": 3040 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022946783161239076, + "loss": 0.6938, + "step": 3060 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022899126290706908, + "loss": 0.6974, + "step": 3080 + }, + { + "epoch": 0.73, + "learning_rate": 0.0002285146942017474, + "loss": 0.6922, + "step": 3100 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002280381254964257, + "loss": 0.7073, + "step": 3120 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022756155679110404, + "loss": 0.6895, + "step": 3140 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022708498808578234, + "loss": 0.7012, + "step": 3160 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022660841938046064, + "loss": 0.6985, + "step": 3180 + }, + { + "epoch": 0.76, + "learning_rate": 0.000226131850675139, + "loss": 0.6901, + "step": 3200 + }, + { + "epoch": 0.76, + "eval_loss": 0.6807068586349487, + "eval_runtime": 49.2421, + "eval_samples_per_second": 40.616, + "eval_steps_per_second": 2.538, + "step": 3200 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002256552819698173, + "loss": 0.697, + "step": 3220 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022517871326449562, + "loss": 0.7002, + "step": 3240 + }, + { + "epoch": 
0.77, + "learning_rate": 0.00022470214455917392, + "loss": 0.6918, + "step": 3260 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022422557585385225, + "loss": 0.6999, + "step": 3280 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022374900714853055, + "loss": 0.6961, + "step": 3300 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002232724384432089, + "loss": 0.6888, + "step": 3320 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002227958697378872, + "loss": 0.695, + "step": 3340 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002223193010325655, + "loss": 0.6861, + "step": 3360 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022184273232724383, + "loss": 0.6864, + "step": 3380 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022136616362192213, + "loss": 0.6917, + "step": 3400 + }, + { + "epoch": 0.8, + "eval_loss": 0.6773961782455444, + "eval_runtime": 49.3961, + "eval_samples_per_second": 40.489, + "eval_steps_per_second": 2.531, + "step": 3400 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022088959491660043, + "loss": 0.679, + "step": 3420 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022041302621127879, + "loss": 0.6915, + "step": 3440 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021993645750595709, + "loss": 0.6937, + "step": 3460 + }, + { + "epoch": 0.82, + "learning_rate": 0.0002194598888006354, + "loss": 0.6831, + "step": 3480 + }, + { + "epoch": 0.83, + "learning_rate": 0.0002189833200953137, + "loss": 0.6875, + "step": 3500 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021850675138999204, + "loss": 0.6916, + "step": 3520 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021803018268467034, + "loss": 0.6896, + "step": 3540 + }, + { + "epoch": 0.84, + "learning_rate": 0.0002175536139793487, + "loss": 0.6986, + "step": 3560 + }, + { + "epoch": 0.85, + "learning_rate": 0.000217077045274027, + "loss": 0.693, + "step": 3580 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002166004765687053, + "loss": 0.6893, + "step": 3600 + }, + { + "epoch": 
0.85, + "eval_loss": 0.6753410696983337, + "eval_runtime": 49.3307, + "eval_samples_per_second": 40.543, + "eval_steps_per_second": 2.534, + "step": 3600 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021612390786338362, + "loss": 0.6872, + "step": 3620 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021564733915806192, + "loss": 0.6862, + "step": 3640 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021517077045274025, + "loss": 0.6943, + "step": 3660 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021469420174741858, + "loss": 0.6896, + "step": 3680 + }, + { + "epoch": 0.87, + "learning_rate": 0.0002142176330420969, + "loss": 0.6912, + "step": 3700 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002137410643367752, + "loss": 0.6859, + "step": 3720 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002132644956314535, + "loss": 0.6791, + "step": 3740 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021278792692613183, + "loss": 0.6882, + "step": 3760 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021231135822081013, + "loss": 0.6823, + "step": 3780 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002118347895154885, + "loss": 0.6831, + "step": 3800 + }, + { + "epoch": 0.9, + "eval_loss": 0.6738302707672119, + "eval_runtime": 49.4648, + "eval_samples_per_second": 40.433, + "eval_steps_per_second": 2.527, + "step": 3800 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002113582208101668, + "loss": 0.6818, + "step": 3820 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002108816521048451, + "loss": 0.6912, + "step": 3840 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021040508339952341, + "loss": 0.6884, + "step": 3860 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020992851469420171, + "loss": 0.6888, + "step": 3880 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020945194598888004, + "loss": 0.6822, + "step": 3900 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020897537728355837, + "loss": 0.6879, + "step": 3920 + }, + { + "epoch": 0.93, + "learning_rate": 
0.0002084988085782367, + "loss": 0.6771, + "step": 3940 + }, + { + "epoch": 0.94, + "learning_rate": 0.000208022239872915, + "loss": 0.684, + "step": 3960 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002075456711675933, + "loss": 0.6878, + "step": 3980 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020706910246227162, + "loss": 0.6913, + "step": 4000 + }, + { + "epoch": 0.95, + "eval_loss": 0.6711302995681763, + "eval_runtime": 49.117, + "eval_samples_per_second": 40.719, + "eval_steps_per_second": 2.545, + "step": 4000 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020659253375694992, + "loss": 0.683, + "step": 4020 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020611596505162828, + "loss": 0.6833, + "step": 4040 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020563939634630658, + "loss": 0.6793, + "step": 4060 + }, + { + "epoch": 0.96, + "learning_rate": 0.0002051628276409849, + "loss": 0.6843, + "step": 4080 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002046862589356632, + "loss": 0.6822, + "step": 4100 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002042096902303415, + "loss": 0.6856, + "step": 4120 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020373312152501983, + "loss": 0.6809, + "step": 4140 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020325655281969816, + "loss": 0.6843, + "step": 4160 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002027799841143765, + "loss": 0.6754, + "step": 4180 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002023034154090548, + "loss": 0.6823, + "step": 4200 + }, + { + "epoch": 0.99, + "eval_loss": 0.6697036027908325, + "eval_runtime": 49.3237, + "eval_samples_per_second": 40.548, + "eval_steps_per_second": 2.534, + "step": 4200 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020182684670373312, + "loss": 0.6861, + "step": 4220 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020135027799841142, + "loss": 0.6806, + "step": 4240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020087370929308972, + "loss": 
0.6823, + "step": 4260 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020039714058776807, + "loss": 0.6805, + "step": 4280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019992057188244637, + "loss": 0.6813, + "step": 4300 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001994440031771247, + "loss": 0.675, + "step": 4320 + }, + { + "epoch": 1.03, + "learning_rate": 0.000198967434471803, + "loss": 0.6728, + "step": 4340 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001984908657664813, + "loss": 0.6676, + "step": 4360 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019801429706115963, + "loss": 0.6729, + "step": 4380 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019753772835583795, + "loss": 0.685, + "step": 4400 + }, + { + "epoch": 1.04, + "eval_loss": 0.6667952537536621, + "eval_runtime": 49.3899, + "eval_samples_per_second": 40.494, + "eval_steps_per_second": 2.531, + "step": 4400 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019706115965051628, + "loss": 0.6786, + "step": 4420 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019658459094519458, + "loss": 0.6738, + "step": 4440 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001961080222398729, + "loss": 0.686, + "step": 4460 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001956314535345512, + "loss": 0.6818, + "step": 4480 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001951548848292295, + "loss": 0.6741, + "step": 4500 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019467831612390786, + "loss": 0.6756, + "step": 4520 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019420174741858616, + "loss": 0.6851, + "step": 4540 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001937251787132645, + "loss": 0.6784, + "step": 4560 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001932486100079428, + "loss": 0.6785, + "step": 4580 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019277204130262112, + "loss": 0.678, + "step": 4600 + }, + { + "epoch": 1.09, + "eval_loss": 0.6655837297439575, + "eval_runtime": 
49.5019, + "eval_samples_per_second": 40.403, + "eval_steps_per_second": 2.525, + "step": 4600 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019229547259729942, + "loss": 0.6782, + "step": 4620 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019181890389197777, + "loss": 0.6683, + "step": 4640 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019134233518665607, + "loss": 0.6783, + "step": 4660 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019086576648133437, + "loss": 0.675, + "step": 4680 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001903891977760127, + "loss": 0.6691, + "step": 4700 + }, + { + "epoch": 1.12, + "learning_rate": 0.000189912629070691, + "loss": 0.6726, + "step": 4720 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018943606036536933, + "loss": 0.68, + "step": 4740 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018895949166004763, + "loss": 0.6694, + "step": 4760 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018848292295472598, + "loss": 0.6686, + "step": 4780 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018800635424940428, + "loss": 0.6766, + "step": 4800 + }, + { + "epoch": 1.13, + "eval_loss": 0.6646501421928406, + "eval_runtime": 49.3188, + "eval_samples_per_second": 40.552, + "eval_steps_per_second": 2.535, + "step": 4800 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018752978554408258, + "loss": 0.6724, + "step": 4820 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001870532168387609, + "loss": 0.6801, + "step": 4840 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001865766481334392, + "loss": 0.6698, + "step": 4860 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001861000794281175, + "loss": 0.6723, + "step": 4880 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018562351072279586, + "loss": 0.6693, + "step": 4900 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018514694201747416, + "loss": 0.6716, + "step": 4920 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001846703733121525, + "loss": 0.674, + "step": 4940 + }, + { 
+ "epoch": 1.17, + "learning_rate": 0.0001841938046068308, + "loss": 0.6702, + "step": 4960 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018371723590150912, + "loss": 0.6716, + "step": 4980 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018324066719618742, + "loss": 0.672, + "step": 5000 + }, + { + "epoch": 1.18, + "eval_loss": 0.662735104560852, + "eval_runtime": 49.275, + "eval_samples_per_second": 40.589, + "eval_steps_per_second": 2.537, + "step": 5000 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018276409849086577, + "loss": 0.6701, + "step": 5020 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018228752978554407, + "loss": 0.6663, + "step": 5040 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018181096108022237, + "loss": 0.6651, + "step": 5060 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001813343923749007, + "loss": 0.6708, + "step": 5080 + }, + { + "epoch": 1.21, + "learning_rate": 0.000180857823669579, + "loss": 0.6697, + "step": 5100 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018038125496425733, + "loss": 0.662, + "step": 5120 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017990468625893566, + "loss": 0.669, + "step": 5140 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017942811755361398, + "loss": 0.6649, + "step": 5160 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017895154884829228, + "loss": 0.668, + "step": 5180 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017847498014297058, + "loss": 0.6796, + "step": 5200 + }, + { + "epoch": 1.23, + "eval_loss": 0.6609957218170166, + "eval_runtime": 49.2394, + "eval_samples_per_second": 40.618, + "eval_steps_per_second": 2.539, + "step": 5200 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001779984114376489, + "loss": 0.6745, + "step": 5220 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001775218427323272, + "loss": 0.6646, + "step": 5240 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017704527402700556, + "loss": 0.6682, + "step": 5260 + }, + { + "epoch": 1.25, + 
"learning_rate": 0.00017656870532168386, + "loss": 0.6713, + "step": 5280 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017609213661636216, + "loss": 0.6618, + "step": 5300 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001756155679110405, + "loss": 0.6703, + "step": 5320 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001751389992057188, + "loss": 0.6652, + "step": 5340 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017466243050039712, + "loss": 0.6698, + "step": 5360 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017418586179507545, + "loss": 0.6728, + "step": 5380 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017370929308975377, + "loss": 0.6752, + "step": 5400 + }, + { + "epoch": 1.28, + "eval_loss": 0.6592395901679993, + "eval_runtime": 49.3426, + "eval_samples_per_second": 40.533, + "eval_steps_per_second": 2.533, + "step": 5400 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017323272438443207, + "loss": 0.6653, + "step": 5420 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017275615567911037, + "loss": 0.669, + "step": 5440 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001722795869737887, + "loss": 0.6698, + "step": 5460 + }, + { + "epoch": 1.3, + "learning_rate": 0.000171803018268467, + "loss": 0.6742, + "step": 5480 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017132644956314536, + "loss": 0.6596, + "step": 5500 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017084988085782366, + "loss": 0.6699, + "step": 5520 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017037331215250198, + "loss": 0.664, + "step": 5540 + }, + { + "epoch": 1.31, + "learning_rate": 0.00016989674344718028, + "loss": 0.6673, + "step": 5560 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016942017474185858, + "loss": 0.6684, + "step": 5580 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001689436060365369, + "loss": 0.6769, + "step": 5600 + }, + { + "epoch": 1.32, + "eval_loss": 0.6582754850387573, + "eval_runtime": 49.469, + "eval_samples_per_second": 40.429, + 
"eval_steps_per_second": 2.527, + "step": 5600 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016846703733121524, + "loss": 0.6633, + "step": 5620 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016799046862589357, + "loss": 0.6679, + "step": 5640 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016751389992057187, + "loss": 0.6601, + "step": 5660 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001670373312152502, + "loss": 0.6731, + "step": 5680 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001665607625099285, + "loss": 0.6638, + "step": 5700 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001660841938046068, + "loss": 0.6693, + "step": 5720 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016560762509928515, + "loss": 0.6642, + "step": 5740 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016513105639396345, + "loss": 0.6649, + "step": 5760 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016465448768864178, + "loss": 0.663, + "step": 5780 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016417791898332008, + "loss": 0.6629, + "step": 5800 + }, + { + "epoch": 1.37, + "eval_loss": 0.6574136018753052, + "eval_runtime": 49.3019, + "eval_samples_per_second": 40.566, + "eval_steps_per_second": 2.535, + "step": 5800 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016370135027799838, + "loss": 0.6605, + "step": 5820 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001632247815726767, + "loss": 0.6707, + "step": 5840 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016274821286735503, + "loss": 0.6695, + "step": 5860 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016227164416203336, + "loss": 0.6647, + "step": 5880 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016179507545671166, + "loss": 0.6657, + "step": 5900 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016131850675138999, + "loss": 0.6656, + "step": 5920 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016084193804606829, + "loss": 0.6676, + "step": 5940 + }, + { + "epoch": 1.41, + "learning_rate": 
0.00016036536934074659, + "loss": 0.6678, + "step": 5960 + }, + { + "epoch": 1.41, + "learning_rate": 0.00015988880063542494, + "loss": 0.6639, + "step": 5980 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015941223193010324, + "loss": 0.6645, + "step": 6000 + }, + { + "epoch": 1.42, + "eval_loss": 0.656126081943512, + "eval_runtime": 49.5095, + "eval_samples_per_second": 40.396, + "eval_steps_per_second": 2.525, + "step": 6000 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015893566322478157, + "loss": 0.6672, + "step": 6020 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015845909451945987, + "loss": 0.6678, + "step": 6040 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001579825258141382, + "loss": 0.6676, + "step": 6060 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001575059571088165, + "loss": 0.6717, + "step": 6080 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015702938840349485, + "loss": 0.671, + "step": 6100 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015655281969817315, + "loss": 0.6611, + "step": 6120 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015607625099285145, + "loss": 0.6606, + "step": 6140 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015559968228752978, + "loss": 0.6647, + "step": 6160 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015512311358220808, + "loss": 0.6652, + "step": 6180 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001546465448768864, + "loss": 0.6629, + "step": 6200 + }, + { + "epoch": 1.47, + "eval_loss": 0.6549723148345947, + "eval_runtime": 49.4871, + "eval_samples_per_second": 40.415, + "eval_steps_per_second": 2.526, + "step": 6200 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015416997617156473, + "loss": 0.6685, + "step": 6220 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015369340746624306, + "loss": 0.6578, + "step": 6240 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015321683876092136, + "loss": 0.6587, + "step": 6260 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015274027005559966, 
+ "loss": 0.6655, + "step": 6280 + }, + { + "epoch": 1.49, + "learning_rate": 0.000152263701350278, + "loss": 0.6662, + "step": 6300 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001517871326449563, + "loss": 0.6648, + "step": 6320 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015131056393963464, + "loss": 0.6638, + "step": 6340 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015083399523431294, + "loss": 0.6614, + "step": 6360 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015035742652899124, + "loss": 0.6552, + "step": 6380 + }, + { + "epoch": 1.51, + "learning_rate": 0.00014988085782366957, + "loss": 0.6753, + "step": 6400 + }, + { + "epoch": 1.51, + "eval_loss": 0.6544620990753174, + "eval_runtime": 49.3242, + "eval_samples_per_second": 40.548, + "eval_steps_per_second": 2.534, + "step": 6400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014940428911834787, + "loss": 0.6588, + "step": 6420 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001489277204130262, + "loss": 0.6609, + "step": 6440 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014845115170770452, + "loss": 0.6565, + "step": 6460 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014797458300238282, + "loss": 0.6589, + "step": 6480 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014749801429706115, + "loss": 0.6585, + "step": 6500 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014702144559173945, + "loss": 0.6737, + "step": 6520 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014654487688641778, + "loss": 0.6554, + "step": 6540 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001460683081810961, + "loss": 0.6603, + "step": 6560 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001455917394757744, + "loss": 0.6647, + "step": 6580 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014511517077045273, + "loss": 0.6632, + "step": 6600 + }, + { + "epoch": 1.56, + "eval_loss": 0.6527110934257507, + "eval_runtime": 49.2622, + "eval_samples_per_second": 40.599, + "eval_steps_per_second": 2.537, + "step": 
6600 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014463860206513106, + "loss": 0.6705, + "step": 6620 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014416203335980936, + "loss": 0.6703, + "step": 6640 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014368546465448766, + "loss": 0.6602, + "step": 6660 + }, + { + "epoch": 1.58, + "learning_rate": 0.000143208895949166, + "loss": 0.6639, + "step": 6680 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014273232724384432, + "loss": 0.6645, + "step": 6700 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014225575853852262, + "loss": 0.6655, + "step": 6720 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014177918983320094, + "loss": 0.664, + "step": 6740 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014130262112787927, + "loss": 0.6656, + "step": 6760 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014082605242255757, + "loss": 0.6658, + "step": 6780 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001403494837172359, + "loss": 0.6641, + "step": 6800 + }, + { + "epoch": 1.61, + "eval_loss": 0.6513609886169434, + "eval_runtime": 49.4424, + "eval_samples_per_second": 40.451, + "eval_steps_per_second": 2.528, + "step": 6800 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001398729150119142, + "loss": 0.6599, + "step": 6820 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013939634630659252, + "loss": 0.6552, + "step": 6840 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013891977760127085, + "loss": 0.6616, + "step": 6860 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013844320889594915, + "loss": 0.6635, + "step": 6880 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013796664019062745, + "loss": 0.6608, + "step": 6900 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013749007148530578, + "loss": 0.6596, + "step": 6920 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001370135027799841, + "loss": 0.6589, + "step": 6940 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001365369340746624, + "loss": 0.6627, + 
"step": 6960 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013606036536934073, + "loss": 0.6606, + "step": 6980 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013558379666401906, + "loss": 0.6658, + "step": 7000 + }, + { + "epoch": 1.65, + "eval_loss": 0.6510519981384277, + "eval_runtime": 49.5012, + "eval_samples_per_second": 40.403, + "eval_steps_per_second": 2.525, + "step": 7000 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013510722795869736, + "loss": 0.6571, + "step": 7020 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001346306592533757, + "loss": 0.6607, + "step": 7040 + }, + { + "epoch": 1.67, + "learning_rate": 0.000134154090548054, + "loss": 0.6562, + "step": 7060 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013367752184273232, + "loss": 0.6582, + "step": 7080 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013320095313741064, + "loss": 0.6635, + "step": 7100 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013272438443208894, + "loss": 0.6682, + "step": 7120 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013224781572676727, + "loss": 0.6633, + "step": 7140 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001317712470214456, + "loss": 0.6671, + "step": 7160 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001312946783161239, + "loss": 0.6645, + "step": 7180 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001308181096108022, + "loss": 0.6699, + "step": 7200 + }, + { + "epoch": 1.7, + "eval_loss": 0.6502068042755127, + "eval_runtime": 49.4619, + "eval_samples_per_second": 40.435, + "eval_steps_per_second": 2.527, + "step": 7200 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013034154090548053, + "loss": 0.6617, + "step": 7220 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012986497220015885, + "loss": 0.6639, + "step": 7240 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012938840349483715, + "loss": 0.6634, + "step": 7260 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012891183478951548, + "loss": 0.663, + "step": 7280 + }, + { + 
"epoch": 1.73, + "learning_rate": 0.00012843526608419378, + "loss": 0.6653, + "step": 7300 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001279586973788721, + "loss": 0.6555, + "step": 7320 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012748212867355044, + "loss": 0.6653, + "step": 7340 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012700555996822874, + "loss": 0.6573, + "step": 7360 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012652899126290706, + "loss": 0.658, + "step": 7380 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001260524225575854, + "loss": 0.6562, + "step": 7400 + }, + { + "epoch": 1.75, + "eval_loss": 0.6491650342941284, + "eval_runtime": 49.2463, + "eval_samples_per_second": 40.612, + "eval_steps_per_second": 2.538, + "step": 7400 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001255758538522637, + "loss": 0.6592, + "step": 7420 + }, + { + "epoch": 1.76, + "learning_rate": 0.000125099285146942, + "loss": 0.6587, + "step": 7440 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012462271644162032, + "loss": 0.6616, + "step": 7460 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012414614773629865, + "loss": 0.655, + "step": 7480 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012366957903097695, + "loss": 0.6591, + "step": 7500 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012319301032565527, + "loss": 0.6545, + "step": 7520 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001227164416203336, + "loss": 0.6673, + "step": 7540 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001222398729150119, + "loss": 0.6626, + "step": 7560 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012176330420969023, + "loss": 0.6663, + "step": 7580 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012128673550436854, + "loss": 0.6643, + "step": 7600 + }, + { + "epoch": 1.8, + "eval_loss": 0.6482685804367065, + "eval_runtime": 49.3591, + "eval_samples_per_second": 40.519, + "eval_steps_per_second": 2.532, + "step": 7600 + }, + { + "epoch": 1.8, + 
"learning_rate": 0.00012081016679904685, + "loss": 0.6623, + "step": 7620 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012033359809372518, + "loss": 0.6636, + "step": 7640 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011985702938840348, + "loss": 0.6598, + "step": 7660 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001193804606830818, + "loss": 0.6521, + "step": 7680 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011890389197776012, + "loss": 0.664, + "step": 7700 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011842732327243844, + "loss": 0.6529, + "step": 7720 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011795075456711675, + "loss": 0.6622, + "step": 7740 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011747418586179508, + "loss": 0.6608, + "step": 7760 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011699761715647338, + "loss": 0.6556, + "step": 7780 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011652104845115169, + "loss": 0.6643, + "step": 7800 + }, + { + "epoch": 1.84, + "eval_loss": 0.6474015116691589, + "eval_runtime": 49.3608, + "eval_samples_per_second": 40.518, + "eval_steps_per_second": 2.532, + "step": 7800 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011604447974583002, + "loss": 0.6541, + "step": 7820 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011556791104050833, + "loss": 0.6614, + "step": 7840 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011509134233518665, + "loss": 0.6499, + "step": 7860 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011461477362986497, + "loss": 0.6563, + "step": 7880 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011413820492454327, + "loss": 0.6589, + "step": 7900 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011366163621922159, + "loss": 0.6544, + "step": 7920 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011318506751389992, + "loss": 0.6606, + "step": 7940 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011270849880857823, + "loss": 0.657, + "step": 7960 + }, + { + "epoch": 
1.89, + "learning_rate": 0.00011223193010325654, + "loss": 0.6608, + "step": 7980 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011175536139793487, + "loss": 0.6595, + "step": 8000 + }, + { + "epoch": 1.89, + "eval_loss": 0.6469079256057739, + "eval_runtime": 49.3012, + "eval_samples_per_second": 40.567, + "eval_steps_per_second": 2.535, + "step": 8000 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011127879269261318, + "loss": 0.6563, + "step": 8020 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011080222398729148, + "loss": 0.6602, + "step": 8040 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011032565528196981, + "loss": 0.6603, + "step": 8060 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010984908657664812, + "loss": 0.6495, + "step": 8080 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010937251787132644, + "loss": 0.6551, + "step": 8100 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010891977760127084, + "loss": 0.6497, + "step": 8120 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010844320889594917, + "loss": 0.6652, + "step": 8140 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010796664019062747, + "loss": 0.6497, + "step": 8160 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010749007148530578, + "loss": 0.6554, + "step": 8180 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010701350277998411, + "loss": 0.6563, + "step": 8200 + }, + { + "epoch": 1.94, + "eval_loss": 0.645990252494812, + "eval_runtime": 49.3957, + "eval_samples_per_second": 40.489, + "eval_steps_per_second": 2.531, + "step": 8200 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010653693407466242, + "loss": 0.6572, + "step": 8220 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010606036536934074, + "loss": 0.6563, + "step": 8240 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010558379666401906, + "loss": 0.6535, + "step": 8260 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010510722795869736, + "loss": 0.655, + "step": 8280 + }, + { + "epoch": 1.96, + 
"learning_rate": 0.00010463065925337568, + "loss": 0.6554, + "step": 8300 + }, + { + "epoch": 1.97, + "learning_rate": 0.000104154090548054, + "loss": 0.6559, + "step": 8320 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010367752184273232, + "loss": 0.6522, + "step": 8340 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010320095313741063, + "loss": 0.6568, + "step": 8360 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010272438443208896, + "loss": 0.6566, + "step": 8380 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010224781572676727, + "loss": 0.6496, + "step": 8400 + }, + { + "epoch": 1.99, + "eval_loss": 0.6457875967025757, + "eval_runtime": 49.0201, + "eval_samples_per_second": 40.8, + "eval_steps_per_second": 2.55, + "step": 8400 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010177124702144557, + "loss": 0.66, + "step": 8420 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001012946783161239, + "loss": 0.6457, + "step": 8440 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001008419380460683, + "loss": 0.6349, + "step": 8460 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010036536934074662, + "loss": 0.6545, + "step": 8480 + }, + { + "epoch": 2.01, + "learning_rate": 9.988880063542493e-05, + "loss": 0.6515, + "step": 8500 + }, + { + "epoch": 2.01, + "learning_rate": 9.941223193010326e-05, + "loss": 0.6459, + "step": 8520 + }, + { + "epoch": 2.02, + "learning_rate": 9.893566322478156e-05, + "loss": 0.6494, + "step": 8540 + }, + { + "epoch": 2.02, + "learning_rate": 9.845909451945987e-05, + "loss": 0.6608, + "step": 8560 + }, + { + "epoch": 2.03, + "learning_rate": 9.79825258141382e-05, + "loss": 0.6485, + "step": 8580 + }, + { + "epoch": 2.03, + "learning_rate": 9.750595710881651e-05, + "loss": 0.6461, + "step": 8600 + }, + { + "epoch": 2.03, + "eval_loss": 0.6450995802879333, + "eval_runtime": 49.2592, + "eval_samples_per_second": 40.602, + "eval_steps_per_second": 2.538, + "step": 8600 + }, + { + "epoch": 2.04, + "learning_rate": 
9.702938840349483e-05, + "loss": 0.6523, + "step": 8620 + }, + { + "epoch": 2.04, + "learning_rate": 9.655281969817315e-05, + "loss": 0.6565, + "step": 8640 + }, + { + "epoch": 2.05, + "learning_rate": 9.607625099285145e-05, + "loss": 0.6541, + "step": 8660 + }, + { + "epoch": 2.05, + "learning_rate": 9.559968228752977e-05, + "loss": 0.6585, + "step": 8680 + }, + { + "epoch": 2.06, + "learning_rate": 9.51231135822081e-05, + "loss": 0.6531, + "step": 8700 + }, + { + "epoch": 2.06, + "learning_rate": 9.464654487688641e-05, + "loss": 0.6579, + "step": 8720 + }, + { + "epoch": 2.07, + "learning_rate": 9.416997617156472e-05, + "loss": 0.6438, + "step": 8740 + }, + { + "epoch": 2.07, + "learning_rate": 9.369340746624305e-05, + "loss": 0.6516, + "step": 8760 + }, + { + "epoch": 2.08, + "learning_rate": 9.321683876092136e-05, + "loss": 0.6576, + "step": 8780 + }, + { + "epoch": 2.08, + "learning_rate": 9.274027005559966e-05, + "loss": 0.6506, + "step": 8800 + }, + { + "epoch": 2.08, + "eval_loss": 0.6444578170776367, + "eval_runtime": 49.0631, + "eval_samples_per_second": 40.764, + "eval_steps_per_second": 2.548, + "step": 8800 + }, + { + "epoch": 2.09, + "learning_rate": 9.226370135027799e-05, + "loss": 0.6484, + "step": 8820 + }, + { + "epoch": 2.09, + "learning_rate": 9.17871326449563e-05, + "loss": 0.6566, + "step": 8840 + }, + { + "epoch": 2.09, + "learning_rate": 9.131056393963462e-05, + "loss": 0.6547, + "step": 8860 + }, + { + "epoch": 2.1, + "learning_rate": 9.083399523431295e-05, + "loss": 0.6532, + "step": 8880 + }, + { + "epoch": 2.1, + "learning_rate": 9.035742652899126e-05, + "loss": 0.6532, + "step": 8900 + }, + { + "epoch": 2.11, + "learning_rate": 8.988085782366956e-05, + "loss": 0.6479, + "step": 8920 + }, + { + "epoch": 2.11, + "learning_rate": 8.940428911834789e-05, + "loss": 0.6548, + "step": 8940 + }, + { + "epoch": 2.12, + "learning_rate": 8.89277204130262e-05, + "loss": 0.647, + "step": 8960 + }, + { + "epoch": 2.12, + "learning_rate": 
8.845115170770452e-05, + "loss": 0.6478, + "step": 8980 + }, + { + "epoch": 2.13, + "learning_rate": 8.797458300238284e-05, + "loss": 0.6553, + "step": 9000 + }, + { + "epoch": 2.13, + "eval_loss": 0.6433074474334717, + "eval_runtime": 49.3831, + "eval_samples_per_second": 40.5, + "eval_steps_per_second": 2.531, + "step": 9000 + }, + { + "epoch": 2.13, + "learning_rate": 8.749801429706116e-05, + "loss": 0.6443, + "step": 9020 + }, + { + "epoch": 2.14, + "learning_rate": 8.702144559173947e-05, + "loss": 0.6518, + "step": 9040 + }, + { + "epoch": 2.14, + "learning_rate": 8.65448768864178e-05, + "loss": 0.6578, + "step": 9060 + }, + { + "epoch": 2.15, + "learning_rate": 8.60683081810961e-05, + "loss": 0.6472, + "step": 9080 + }, + { + "epoch": 2.15, + "learning_rate": 8.559173947577441e-05, + "loss": 0.6471, + "step": 9100 + }, + { + "epoch": 2.16, + "learning_rate": 8.511517077045274e-05, + "loss": 0.6482, + "step": 9120 + }, + { + "epoch": 2.16, + "learning_rate": 8.463860206513105e-05, + "loss": 0.6522, + "step": 9140 + }, + { + "epoch": 2.17, + "learning_rate": 8.416203335980937e-05, + "loss": 0.6584, + "step": 9160 + }, + { + "epoch": 2.17, + "learning_rate": 8.368546465448769e-05, + "loss": 0.6596, + "step": 9180 + }, + { + "epoch": 2.17, + "learning_rate": 8.320889594916599e-05, + "loss": 0.6581, + "step": 9200 + }, + { + "epoch": 2.17, + "eval_loss": 0.6426697969436646, + "eval_runtime": 49.0935, + "eval_samples_per_second": 40.739, + "eval_steps_per_second": 2.546, + "step": 9200 + }, + { + "epoch": 2.18, + "learning_rate": 8.273232724384431e-05, + "loss": 0.6441, + "step": 9220 + }, + { + "epoch": 2.18, + "learning_rate": 8.225575853852263e-05, + "loss": 0.6509, + "step": 9240 + }, + { + "epoch": 2.19, + "learning_rate": 8.177918983320095e-05, + "loss": 0.6409, + "step": 9260 + }, + { + "epoch": 2.19, + "learning_rate": 8.130262112787926e-05, + "loss": 0.6475, + "step": 9280 + }, + { + "epoch": 2.2, + "learning_rate": 8.082605242255759e-05, + "loss": 0.6597, 
+ "step": 9300 + }, + { + "epoch": 2.2, + "learning_rate": 8.03494837172359e-05, + "loss": 0.6544, + "step": 9320 + }, + { + "epoch": 2.21, + "learning_rate": 7.98729150119142e-05, + "loss": 0.6528, + "step": 9340 + }, + { + "epoch": 2.21, + "learning_rate": 7.939634630659253e-05, + "loss": 0.644, + "step": 9360 + }, + { + "epoch": 2.22, + "learning_rate": 7.891977760127084e-05, + "loss": 0.6552, + "step": 9380 + }, + { + "epoch": 2.22, + "learning_rate": 7.844320889594916e-05, + "loss": 0.6548, + "step": 9400 + }, + { + "epoch": 2.22, + "eval_loss": 0.6423606276512146, + "eval_runtime": 49.6466, + "eval_samples_per_second": 40.285, + "eval_steps_per_second": 2.518, + "step": 9400 + }, + { + "epoch": 2.23, + "learning_rate": 7.796664019062748e-05, + "loss": 0.6568, + "step": 9420 + }, + { + "epoch": 2.23, + "learning_rate": 7.74900714853058e-05, + "loss": 0.6539, + "step": 9440 + }, + { + "epoch": 2.24, + "learning_rate": 7.70135027799841e-05, + "loss": 0.6468, + "step": 9460 + }, + { + "epoch": 2.24, + "learning_rate": 7.653693407466243e-05, + "loss": 0.6425, + "step": 9480 + }, + { + "epoch": 2.25, + "learning_rate": 7.606036536934074e-05, + "loss": 0.6523, + "step": 9500 + }, + { + "epoch": 2.25, + "learning_rate": 7.558379666401905e-05, + "loss": 0.6468, + "step": 9520 + }, + { + "epoch": 2.26, + "learning_rate": 7.510722795869738e-05, + "loss": 0.6518, + "step": 9540 + }, + { + "epoch": 2.26, + "learning_rate": 7.46306592533757e-05, + "loss": 0.6534, + "step": 9560 + }, + { + "epoch": 2.26, + "learning_rate": 7.415409054805401e-05, + "loss": 0.6471, + "step": 9580 + }, + { + "epoch": 2.27, + "learning_rate": 7.367752184273232e-05, + "loss": 0.6465, + "step": 9600 + }, + { + "epoch": 2.27, + "eval_loss": 0.6418060064315796, + "eval_runtime": 49.4954, + "eval_samples_per_second": 40.408, + "eval_steps_per_second": 2.525, + "step": 9600 + }, + { + "epoch": 2.27, + "learning_rate": 7.320095313741064e-05, + "loss": 0.6577, + "step": 9620 + }, + { + "epoch": 2.28, + 
"learning_rate": 7.272438443208895e-05, + "loss": 0.6453, + "step": 9640 + }, + { + "epoch": 2.28, + "learning_rate": 7.224781572676726e-05, + "loss": 0.6489, + "step": 9660 + }, + { + "epoch": 2.29, + "learning_rate": 7.177124702144559e-05, + "loss": 0.6466, + "step": 9680 + }, + { + "epoch": 2.29, + "learning_rate": 7.12946783161239e-05, + "loss": 0.6493, + "step": 9700 + }, + { + "epoch": 2.3, + "learning_rate": 7.081810961080222e-05, + "loss": 0.6537, + "step": 9720 + }, + { + "epoch": 2.3, + "learning_rate": 7.034154090548053e-05, + "loss": 0.6486, + "step": 9740 + }, + { + "epoch": 2.31, + "learning_rate": 6.986497220015885e-05, + "loss": 0.65, + "step": 9760 + }, + { + "epoch": 2.31, + "learning_rate": 6.938840349483717e-05, + "loss": 0.6387, + "step": 9780 + }, + { + "epoch": 2.32, + "learning_rate": 6.891183478951549e-05, + "loss": 0.6464, + "step": 9800 + }, + { + "epoch": 2.32, + "eval_loss": 0.6412256360054016, + "eval_runtime": 49.3752, + "eval_samples_per_second": 40.506, + "eval_steps_per_second": 2.532, + "step": 9800 + }, + { + "epoch": 2.32, + "learning_rate": 6.84352660841938e-05, + "loss": 0.6475, + "step": 9820 + }, + { + "epoch": 2.33, + "learning_rate": 6.795869737887211e-05, + "loss": 0.6543, + "step": 9840 + }, + { + "epoch": 2.33, + "learning_rate": 6.748212867355043e-05, + "loss": 0.6545, + "step": 9860 + }, + { + "epoch": 2.34, + "learning_rate": 6.700555996822874e-05, + "loss": 0.6468, + "step": 9880 + }, + { + "epoch": 2.34, + "learning_rate": 6.652899126290707e-05, + "loss": 0.651, + "step": 9900 + }, + { + "epoch": 2.35, + "learning_rate": 6.605242255758538e-05, + "loss": 0.641, + "step": 9920 + }, + { + "epoch": 2.35, + "learning_rate": 6.55758538522637e-05, + "loss": 0.657, + "step": 9940 + }, + { + "epoch": 2.35, + "learning_rate": 6.509928514694201e-05, + "loss": 0.6481, + "step": 9960 + }, + { + "epoch": 2.36, + "learning_rate": 6.462271644162034e-05, + "loss": 0.6496, + "step": 9980 + }, + { + "epoch": 2.36, + "learning_rate": 
6.414614773629864e-05, + "loss": 0.6451, + "step": 10000 + }, + { + "epoch": 2.36, + "eval_loss": 0.6414454579353333, + "eval_runtime": 49.395, + "eval_samples_per_second": 40.49, + "eval_steps_per_second": 2.531, + "step": 10000 + }, + { + "epoch": 2.37, + "learning_rate": 6.366957903097696e-05, + "loss": 0.6555, + "step": 10020 + }, + { + "epoch": 2.37, + "learning_rate": 6.319301032565528e-05, + "loss": 0.6494, + "step": 10040 + }, + { + "epoch": 2.38, + "learning_rate": 6.271644162033359e-05, + "loss": 0.6487, + "step": 10060 + }, + { + "epoch": 2.38, + "learning_rate": 6.22398729150119e-05, + "loss": 0.6544, + "step": 10080 + }, + { + "epoch": 2.39, + "learning_rate": 6.176330420969023e-05, + "loss": 0.6468, + "step": 10100 + }, + { + "epoch": 2.39, + "learning_rate": 6.128673550436853e-05, + "loss": 0.6441, + "step": 10120 + }, + { + "epoch": 2.4, + "learning_rate": 6.081016679904686e-05, + "loss": 0.6478, + "step": 10140 + }, + { + "epoch": 2.4, + "learning_rate": 6.033359809372518e-05, + "loss": 0.6539, + "step": 10160 + }, + { + "epoch": 2.41, + "learning_rate": 5.985702938840349e-05, + "loss": 0.6486, + "step": 10180 + }, + { + "epoch": 2.41, + "learning_rate": 5.938046068308181e-05, + "loss": 0.6467, + "step": 10200 + }, + { + "epoch": 2.41, + "eval_loss": 0.6406835913658142, + "eval_runtime": 49.5084, + "eval_samples_per_second": 40.397, + "eval_steps_per_second": 2.525, + "step": 10200 + }, + { + "epoch": 2.42, + "learning_rate": 5.890389197776013e-05, + "loss": 0.6399, + "step": 10220 + }, + { + "epoch": 2.42, + "learning_rate": 5.8427323272438435e-05, + "loss": 0.6519, + "step": 10240 + }, + { + "epoch": 2.43, + "learning_rate": 5.7950754567116756e-05, + "loss": 0.6465, + "step": 10260 + }, + { + "epoch": 2.43, + "learning_rate": 5.7474185861795076e-05, + "loss": 0.6479, + "step": 10280 + }, + { + "epoch": 2.43, + "learning_rate": 5.6997617156473383e-05, + "loss": 0.6462, + "step": 10300 + }, + { + "epoch": 2.44, + "learning_rate": 
5.6521048451151704e-05, + "loss": 0.6451, + "step": 10320 + }, + { + "epoch": 2.44, + "learning_rate": 5.604447974583002e-05, + "loss": 0.6453, + "step": 10340 + }, + { + "epoch": 2.45, + "learning_rate": 5.556791104050833e-05, + "loss": 0.6543, + "step": 10360 + }, + { + "epoch": 2.45, + "learning_rate": 5.509134233518665e-05, + "loss": 0.6428, + "step": 10380 + }, + { + "epoch": 2.46, + "learning_rate": 5.4614773629864966e-05, + "loss": 0.6491, + "step": 10400 + }, + { + "epoch": 2.46, + "eval_loss": 0.6400973796844482, + "eval_runtime": 49.3411, + "eval_samples_per_second": 40.534, + "eval_steps_per_second": 2.533, + "step": 10400 + }, + { + "epoch": 2.46, + "learning_rate": 5.413820492454328e-05, + "loss": 0.649, + "step": 10420 + }, + { + "epoch": 2.47, + "learning_rate": 5.36616362192216e-05, + "loss": 0.6494, + "step": 10440 + }, + { + "epoch": 2.47, + "learning_rate": 5.3185067513899913e-05, + "loss": 0.6431, + "step": 10460 + }, + { + "epoch": 2.48, + "learning_rate": 5.2708498808578234e-05, + "loss": 0.6478, + "step": 10480 + }, + { + "epoch": 2.48, + "learning_rate": 5.223193010325655e-05, + "loss": 0.6416, + "step": 10500 + }, + { + "epoch": 2.49, + "learning_rate": 5.175536139793486e-05, + "loss": 0.6507, + "step": 10520 + }, + { + "epoch": 2.49, + "learning_rate": 5.127879269261318e-05, + "loss": 0.6448, + "step": 10540 + }, + { + "epoch": 2.5, + "learning_rate": 5.0802223987291496e-05, + "loss": 0.6455, + "step": 10560 + }, + { + "epoch": 2.5, + "learning_rate": 5.032565528196981e-05, + "loss": 0.6437, + "step": 10580 + }, + { + "epoch": 2.51, + "learning_rate": 4.984908657664813e-05, + "loss": 0.6488, + "step": 10600 + }, + { + "epoch": 2.51, + "eval_loss": 0.6400858163833618, + "eval_runtime": 49.8084, + "eval_samples_per_second": 40.154, + "eval_steps_per_second": 2.51, + "step": 10600 + }, + { + "epoch": 2.51, + "learning_rate": 4.937251787132645e-05, + "loss": 0.6436, + "step": 10620 + }, + { + "epoch": 2.52, + "learning_rate": 
4.889594916600476e-05, + "loss": 0.6446, + "step": 10640 + }, + { + "epoch": 2.52, + "learning_rate": 4.841938046068308e-05, + "loss": 0.6488, + "step": 10660 + }, + { + "epoch": 2.52, + "learning_rate": 4.79428117553614e-05, + "loss": 0.6485, + "step": 10680 + }, + { + "epoch": 2.53, + "learning_rate": 4.7466243050039705e-05, + "loss": 0.6524, + "step": 10700 + }, + { + "epoch": 2.53, + "learning_rate": 4.6989674344718026e-05, + "loss": 0.6376, + "step": 10720 + }, + { + "epoch": 2.54, + "learning_rate": 4.6513105639396346e-05, + "loss": 0.649, + "step": 10740 + }, + { + "epoch": 2.54, + "learning_rate": 4.603653693407465e-05, + "loss": 0.6444, + "step": 10760 + }, + { + "epoch": 2.55, + "learning_rate": 4.5559968228752974e-05, + "loss": 0.6407, + "step": 10780 + }, + { + "epoch": 2.55, + "learning_rate": 4.5083399523431294e-05, + "loss": 0.6448, + "step": 10800 + }, + { + "epoch": 2.55, + "eval_loss": 0.6392157077789307, + "eval_runtime": 49.7963, + "eval_samples_per_second": 40.164, + "eval_steps_per_second": 2.51, + "step": 10800 + }, + { + "epoch": 2.56, + "learning_rate": 4.46068308181096e-05, + "loss": 0.6454, + "step": 10820 + }, + { + "epoch": 2.56, + "learning_rate": 4.413026211278792e-05, + "loss": 0.6544, + "step": 10840 + }, + { + "epoch": 2.57, + "learning_rate": 4.365369340746624e-05, + "loss": 0.6478, + "step": 10860 + }, + { + "epoch": 2.57, + "learning_rate": 4.3177124702144556e-05, + "loss": 0.6434, + "step": 10880 + }, + { + "epoch": 2.58, + "learning_rate": 4.270055599682287e-05, + "loss": 0.6482, + "step": 10900 + }, + { + "epoch": 2.58, + "learning_rate": 4.222398729150119e-05, + "loss": 0.6403, + "step": 10920 + }, + { + "epoch": 2.59, + "learning_rate": 4.1747418586179504e-05, + "loss": 0.6501, + "step": 10940 + }, + { + "epoch": 2.59, + "learning_rate": 4.127084988085782e-05, + "loss": 0.6507, + "step": 10960 + }, + { + "epoch": 2.6, + "learning_rate": 4.079428117553614e-05, + "loss": 0.6496, + "step": 10980 + }, + { + "epoch": 2.6, + 
"learning_rate": 4.031771247021445e-05, + "loss": 0.6544, + "step": 11000 + }, + { + "epoch": 2.6, + "eval_loss": 0.6390016078948975, + "eval_runtime": 49.6306, + "eval_samples_per_second": 40.298, + "eval_steps_per_second": 2.519, + "step": 11000 + }, + { + "epoch": 2.61, + "learning_rate": 3.984114376489277e-05, + "loss": 0.6405, + "step": 11020 + }, + { + "epoch": 2.61, + "learning_rate": 3.9364575059571086e-05, + "loss": 0.6429, + "step": 11040 + }, + { + "epoch": 2.61, + "learning_rate": 3.88880063542494e-05, + "loss": 0.6403, + "step": 11060 + }, + { + "epoch": 2.62, + "learning_rate": 3.841143764892772e-05, + "loss": 0.6338, + "step": 11080 + }, + { + "epoch": 2.62, + "learning_rate": 3.7934868943606034e-05, + "loss": 0.6417, + "step": 11100 + }, + { + "epoch": 2.63, + "learning_rate": 3.7458300238284354e-05, + "loss": 0.6463, + "step": 11120 + }, + { + "epoch": 2.63, + "learning_rate": 3.698173153296267e-05, + "loss": 0.6498, + "step": 11140 + }, + { + "epoch": 2.64, + "learning_rate": 3.650516282764098e-05, + "loss": 0.6415, + "step": 11160 + }, + { + "epoch": 2.64, + "learning_rate": 3.6028594122319296e-05, + "loss": 0.645, + "step": 11180 + }, + { + "epoch": 2.65, + "learning_rate": 3.5552025416997616e-05, + "loss": 0.6467, + "step": 11200 + }, + { + "epoch": 2.65, + "eval_loss": 0.6387213468551636, + "eval_runtime": 49.1775, + "eval_samples_per_second": 40.669, + "eval_steps_per_second": 2.542, + "step": 11200 + }, + { + "epoch": 2.65, + "learning_rate": 3.507545671167593e-05, + "loss": 0.6515, + "step": 11220 + }, + { + "epoch": 2.66, + "learning_rate": 3.4598888006354244e-05, + "loss": 0.65, + "step": 11240 + }, + { + "epoch": 2.66, + "learning_rate": 3.4122319301032564e-05, + "loss": 0.6512, + "step": 11260 + }, + { + "epoch": 2.67, + "learning_rate": 3.364575059571088e-05, + "loss": 0.6443, + "step": 11280 + }, + { + "epoch": 2.67, + "learning_rate": 3.316918189038919e-05, + "loss": 0.6483, + "step": 11300 + }, + { + "epoch": 2.68, + 
"learning_rate": 3.269261318506751e-05, + "loss": 0.6455, + "step": 11320 + }, + { + "epoch": 2.68, + "learning_rate": 3.2216044479745826e-05, + "loss": 0.6461, + "step": 11340 + }, + { + "epoch": 2.69, + "learning_rate": 3.173947577442414e-05, + "loss": 0.6505, + "step": 11360 + }, + { + "epoch": 2.69, + "learning_rate": 3.126290706910246e-05, + "loss": 0.6517, + "step": 11380 + }, + { + "epoch": 2.7, + "learning_rate": 3.0786338363780774e-05, + "loss": 0.6406, + "step": 11400 + }, + { + "epoch": 2.7, + "eval_loss": 0.6380326151847839, + "eval_runtime": 49.4129, + "eval_samples_per_second": 40.475, + "eval_steps_per_second": 2.53, + "step": 11400 + }, + { + "epoch": 2.7, + "learning_rate": 3.030976965845909e-05, + "loss": 0.647, + "step": 11420 + }, + { + "epoch": 2.7, + "learning_rate": 2.9833200953137408e-05, + "loss": 0.6495, + "step": 11440 + }, + { + "epoch": 2.71, + "learning_rate": 2.9356632247815725e-05, + "loss": 0.6448, + "step": 11460 + }, + { + "epoch": 2.71, + "learning_rate": 2.888006354249404e-05, + "loss": 0.6447, + "step": 11480 + }, + { + "epoch": 2.72, + "learning_rate": 2.840349483717236e-05, + "loss": 0.6527, + "step": 11500 + }, + { + "epoch": 2.72, + "learning_rate": 2.7926926131850673e-05, + "loss": 0.6406, + "step": 11520 + }, + { + "epoch": 2.73, + "learning_rate": 2.7450357426528987e-05, + "loss": 0.6443, + "step": 11540 + }, + { + "epoch": 2.73, + "learning_rate": 2.6973788721207307e-05, + "loss": 0.6351, + "step": 11560 + }, + { + "epoch": 2.74, + "learning_rate": 2.649722001588562e-05, + "loss": 0.6417, + "step": 11580 + }, + { + "epoch": 2.74, + "learning_rate": 2.6020651310563938e-05, + "loss": 0.6356, + "step": 11600 + }, + { + "epoch": 2.74, + "eval_loss": 0.6381237506866455, + "eval_runtime": 49.5534, + "eval_samples_per_second": 40.36, + "eval_steps_per_second": 2.523, + "step": 11600 + }, + { + "epoch": 2.75, + "learning_rate": 2.5544082605242255e-05, + "loss": 0.6412, + "step": 11620 + }, + { + "epoch": 2.75, + 
"learning_rate": 2.506751389992057e-05, + "loss": 0.6418, + "step": 11640 + }, + { + "epoch": 2.76, + "learning_rate": 2.4590945194598886e-05, + "loss": 0.6426, + "step": 11660 + }, + { + "epoch": 2.76, + "learning_rate": 2.4114376489277203e-05, + "loss": 0.6461, + "step": 11680 + }, + { + "epoch": 2.77, + "learning_rate": 2.363780778395552e-05, + "loss": 0.6475, + "step": 11700 + }, + { + "epoch": 2.77, + "learning_rate": 2.3161239078633834e-05, + "loss": 0.6431, + "step": 11720 + }, + { + "epoch": 2.78, + "learning_rate": 2.2684670373312148e-05, + "loss": 0.6416, + "step": 11740 + }, + { + "epoch": 2.78, + "learning_rate": 2.2208101667990468e-05, + "loss": 0.6495, + "step": 11760 + }, + { + "epoch": 2.78, + "learning_rate": 2.1731532962668782e-05, + "loss": 0.6404, + "step": 11780 + }, + { + "epoch": 2.79, + "learning_rate": 2.1254964257347096e-05, + "loss": 0.6434, + "step": 11800 + }, + { + "epoch": 2.79, + "eval_loss": 0.6377163529396057, + "eval_runtime": 49.328, + "eval_samples_per_second": 40.545, + "eval_steps_per_second": 2.534, + "step": 11800 + }, + { + "epoch": 2.79, + "learning_rate": 2.0778395552025416e-05, + "loss": 0.6437, + "step": 11820 + }, + { + "epoch": 2.8, + "learning_rate": 2.030182684670373e-05, + "loss": 0.6393, + "step": 11840 + }, + { + "epoch": 2.8, + "learning_rate": 1.9825258141382047e-05, + "loss": 0.6412, + "step": 11860 + }, + { + "epoch": 2.81, + "learning_rate": 1.9348689436060364e-05, + "loss": 0.6494, + "step": 11880 + }, + { + "epoch": 2.81, + "learning_rate": 1.887212073073868e-05, + "loss": 0.6481, + "step": 11900 + }, + { + "epoch": 2.82, + "learning_rate": 1.8395552025416998e-05, + "loss": 0.6407, + "step": 11920 + }, + { + "epoch": 2.82, + "learning_rate": 1.7918983320095312e-05, + "loss": 0.6422, + "step": 11940 + }, + { + "epoch": 2.83, + "learning_rate": 1.744241461477363e-05, + "loss": 0.6487, + "step": 11960 + }, + { + "epoch": 2.83, + "learning_rate": 1.6965845909451946e-05, + "loss": 0.6478, + "step": 11980 + }, + 
{ + "epoch": 2.84, + "learning_rate": 1.648927720413026e-05, + "loss": 0.6451, + "step": 12000 + }, + { + "epoch": 2.84, + "eval_loss": 0.6374698281288147, + "eval_runtime": 49.9107, + "eval_samples_per_second": 40.072, + "eval_steps_per_second": 2.504, + "step": 12000 + }, + { + "epoch": 2.84, + "learning_rate": 1.6012708498808577e-05, + "loss": 0.6454, + "step": 12020 + }, + { + "epoch": 2.85, + "learning_rate": 1.5536139793486894e-05, + "loss": 0.6399, + "step": 12040 + }, + { + "epoch": 2.85, + "learning_rate": 1.5059571088165208e-05, + "loss": 0.6479, + "step": 12060 + }, + { + "epoch": 2.86, + "learning_rate": 1.4583002382843525e-05, + "loss": 0.6412, + "step": 12080 + }, + { + "epoch": 2.86, + "learning_rate": 1.4106433677521842e-05, + "loss": 0.65, + "step": 12100 + }, + { + "epoch": 2.87, + "learning_rate": 1.3629864972200157e-05, + "loss": 0.6461, + "step": 12120 + }, + { + "epoch": 2.87, + "learning_rate": 1.3153296266878475e-05, + "loss": 0.6434, + "step": 12140 + }, + { + "epoch": 2.87, + "learning_rate": 1.2676727561556788e-05, + "loss": 0.6463, + "step": 12160 + }, + { + "epoch": 2.88, + "learning_rate": 1.2200158856235105e-05, + "loss": 0.6399, + "step": 12180 + }, + { + "epoch": 2.88, + "learning_rate": 1.1723590150913422e-05, + "loss": 0.6446, + "step": 12200 + }, + { + "epoch": 2.88, + "eval_loss": 0.6372544765472412, + "eval_runtime": 49.6265, + "eval_samples_per_second": 40.301, + "eval_steps_per_second": 2.519, + "step": 12200 + }, + { + "epoch": 2.89, + "learning_rate": 1.1247021445591738e-05, + "loss": 0.6411, + "step": 12220 + }, + { + "epoch": 2.89, + "learning_rate": 1.0770452740270055e-05, + "loss": 0.6523, + "step": 12240 + }, + { + "epoch": 2.9, + "learning_rate": 1.0293884034948372e-05, + "loss": 0.6456, + "step": 12260 + }, + { + "epoch": 2.9, + "learning_rate": 9.817315329626686e-06, + "loss": 0.6394, + "step": 12280 + }, + { + "epoch": 2.91, + "learning_rate": 9.340746624305003e-06, + "loss": 0.6466, + "step": 12300 + }, + { + 
"epoch": 2.91, + "learning_rate": 8.864177918983318e-06, + "loss": 0.6415, + "step": 12320 + }, + { + "epoch": 2.92, + "learning_rate": 8.387609213661635e-06, + "loss": 0.6349, + "step": 12340 + }, + { + "epoch": 2.92, + "learning_rate": 7.911040508339953e-06, + "loss": 0.6415, + "step": 12360 + }, + { + "epoch": 2.93, + "learning_rate": 7.434471803018268e-06, + "loss": 0.6484, + "step": 12380 + }, + { + "epoch": 2.93, + "learning_rate": 6.957903097696583e-06, + "loss": 0.6522, + "step": 12400 + }, + { + "epoch": 2.93, + "eval_loss": 0.6368712186813354, + "eval_runtime": 49.0693, + "eval_samples_per_second": 40.759, + "eval_steps_per_second": 2.547, + "step": 12400 + }, + { + "epoch": 2.94, + "learning_rate": 6.4813343923749005e-06, + "loss": 0.643, + "step": 12420 + }, + { + "epoch": 2.94, + "learning_rate": 6.004765687053216e-06, + "loss": 0.6515, + "step": 12440 + }, + { + "epoch": 2.95, + "learning_rate": 5.528196981731532e-06, + "loss": 0.6512, + "step": 12460 + }, + { + "epoch": 2.95, + "learning_rate": 5.051628276409849e-06, + "loss": 0.6483, + "step": 12480 + }, + { + "epoch": 2.96, + "learning_rate": 4.575059571088165e-06, + "loss": 0.6342, + "step": 12500 + }, + { + "epoch": 2.96, + "learning_rate": 4.098490865766481e-06, + "loss": 0.6445, + "step": 12520 + }, + { + "epoch": 2.96, + "learning_rate": 3.6219221604447972e-06, + "loss": 0.6419, + "step": 12540 + }, + { + "epoch": 2.97, + "learning_rate": 3.145353455123113e-06, + "loss": 0.642, + "step": 12560 + }, + { + "epoch": 2.97, + "learning_rate": 2.6687847498014293e-06, + "loss": 0.6541, + "step": 12580 + }, + { + "epoch": 2.98, + "learning_rate": 2.1922160444797456e-06, + "loss": 0.6334, + "step": 12600 + }, + { + "epoch": 2.98, + "eval_loss": 0.636846125125885, + "eval_runtime": 49.1332, + "eval_samples_per_second": 40.706, + "eval_steps_per_second": 2.544, + "step": 12600 + } + ], + "max_steps": 12690, + "num_train_epochs": 3, + "total_flos": 1.6375945250008465e+19, + "trial_name": null, + 
"trial_params": null +} diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/training_args.bin b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cfd5ca55a1cd7f462c1d326faacf15d022e29425 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/checkpoint-12600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a070370e87c048b60fc888b8736a0166eb94eeb3a75f5f78918edab715d0fb1c +size 3579 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/runs/Mar24_15-35-50_autodl-container-a629119d3c-e4df2c26/1679643354.1908646/events.out.tfevents.1679643354.autodl-container-a629119d3c-e4df2c26.49450.1 b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/runs/Mar24_15-35-50_autodl-container-a629119d3c-e4df2c26/1679643354.1908646/events.out.tfevents.1679643354.autodl-container-a629119d3c-e4df2c26.49450.1 new file mode 100644 index 0000000000000000000000000000000000000000..abade5249021c12559712c2a3237d0f414bdc777 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/runs/Mar24_15-35-50_autodl-container-a629119d3c-e4df2c26/1679643354.1908646/events.out.tfevents.1679643354.autodl-container-a629119d3c-e4df2c26.49450.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:477b50c5556946ba35aa59cfb080f2a1bd4cac9e4a43341795e805da0cb424a6 +size 5809 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/runs/Mar24_15-35-50_autodl-container-a629119d3c-e4df2c26/events.out.tfevents.1679643354.autodl-container-a629119d3c-e4df2c26.49450.0 b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/runs/Mar24_15-35-50_autodl-container-a629119d3c-e4df2c26/events.out.tfevents.1679643354.autodl-container-a629119d3c-e4df2c26.49450.0 new file mode 100644 index 
0000000000000000000000000000000000000000..a6068389678d74801501efff46deabd2eff8ec17 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/runs/Mar24_15-35-50_autodl-container-a629119d3c-e4df2c26/events.out.tfevents.1679643354.autodl-container-a629119d3c-e4df2c26.49450.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4957dd8c37c2b3f2c2b15760d760ef448dcefee8854fc3a61e4c52f7534aa406 +size 3824 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/runs/Mar24_15-45-35_autodl-container-a629119d3c-e4df2c26/1679643935.915997/events.out.tfevents.1679643935.autodl-container-a629119d3c-e4df2c26.51017.1 b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/runs/Mar24_15-45-35_autodl-container-a629119d3c-e4df2c26/1679643935.915997/events.out.tfevents.1679643935.autodl-container-a629119d3c-e4df2c26.51017.1 new file mode 100644 index 0000000000000000000000000000000000000000..c79cf5fccad2139be2ce44835fdd55f74811b1c0 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/runs/Mar24_15-45-35_autodl-container-a629119d3c-e4df2c26/1679643935.915997/events.out.tfevents.1679643935.autodl-container-a629119d3c-e4df2c26.51017.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76cc90b5b36269f261f73b916664154bf922abeb27a09f26454bb838f0eb2025 +size 5809 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/runs/Mar24_15-45-35_autodl-container-a629119d3c-e4df2c26/events.out.tfevents.1679643935.autodl-container-a629119d3c-e4df2c26.51017.0 b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/runs/Mar24_15-45-35_autodl-container-a629119d3c-e4df2c26/events.out.tfevents.1679643935.autodl-container-a629119d3c-e4df2c26.51017.0 new file mode 100644 index 0000000000000000000000000000000000000000..8664a4b51c26927f4daa63fe85a9e1bf34776e11 --- /dev/null +++ 
b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-0.5m/runs/Mar24_15-45-35_autodl-container-a629119d3c-e4df2c26/events.out.tfevents.1679643935.autodl-container-a629119d3c-e4df2c26.51017.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b7d32319c70ab9cd6397d6b0232d2623e3f4d4a8f70cb92200eb2b7e70054f2 +size 120771 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/adapter_config.json b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..33c5303265e3866eb29318b856ec56744ffbad33 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/adapter_config.json @@ -0,0 +1,18 @@ +{ + "base_model_name_or_path": "/root/autodl-tmp/llama_hf", + "bias": "none", + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/adapter_model.bin b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..f1ea95deb1c29edde0b241dc4c08179e36d7d9ec --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a3b43e50220a5244e4f3cdf199c4a63673c716a9ff2d013c0bb1ac7f7c8d916 +size 16822989 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/optimizer.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea6abf5f6df459b76882e8c4a38a5ac0b1f46fad --- /dev/null +++ 
b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7a1e464507a0eb6d65330833b5e6a2f0222aa6a42dc1b421a82bc6b179f11d7 +size 33629893 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/pytorch_model.bin b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..9d414f73d7a12e9ddb21bf9863c661abd645adc8 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62816331a190d7783fed48f4fe5472a7cf8d37afe2179e5a1f8f0a8168b1dc96 +size 16822989 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/rng_state.pth b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0543a18c22ece2a1e3ad7cfee11d5718cfb57aef --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbf4148eed691c08b2b9f0a833d08075b124d9054abbfc2f5e7747c2ce5c051f +size 14575 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/scaler.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a30a038c910afe06dd3e035596c0ede67576138 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e7309791e544e8cb732592406919407855bab183563e2fcbaad5f6249d386dd +size 557 diff --git 
a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/scheduler.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed3df990c9693376d868e17a1f06929b605d8ed9 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3bbb8e74ed1b7ca44a8f95d5cdb8d457c0318daf33eba316ef6fde8a11ee92c +size 627 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/trainer_state.json b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d1a84ba4799337035b3010801b206e90f678c10a --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/trainer_state.json @@ -0,0 +1,5116 @@ +{ + "best_metric": 0.6879639625549316, + "best_model_checkpoint": "lora-alpaca/checkpoint-15000", + "epoch": 1.9238168526356292, + "global_step": 15000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 0.8988, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 0.7184, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 0.7227, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 0.00023999999999999998, + "loss": 0.7244, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003, + "loss": 0.7225, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002996127533238673, + "loss": 0.7183, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002992255066477346, + "loss": 0.7246, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 
0.00029883825997160186, + "loss": 0.7334, + "step": 160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002984510132954692, + "loss": 0.7224, + "step": 180 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002980637666193365, + "loss": 0.7212, + "step": 200 + }, + { + "epoch": 0.03, + "eval_loss": 0.7327759861946106, + "eval_runtime": 178.5232, + "eval_samples_per_second": 11.203, + "eval_steps_per_second": 1.4, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002976765199432038, + "loss": 0.7298, + "step": 220 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002972892732670711, + "loss": 0.7275, + "step": 240 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002969020265909384, + "loss": 0.7265, + "step": 260 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002965147799148057, + "loss": 0.7285, + "step": 280 + }, + { + "epoch": 0.04, + "learning_rate": 0.000296127533238673, + "loss": 0.7218, + "step": 300 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002957402865625403, + "loss": 0.715, + "step": 320 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029535303988640764, + "loss": 0.7347, + "step": 340 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002949657932102749, + "loss": 0.7228, + "step": 360 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029457854653414225, + "loss": 0.7198, + "step": 380 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029419129985800953, + "loss": 0.7196, + "step": 400 + }, + { + "epoch": 0.05, + "eval_loss": 0.7325090765953064, + "eval_runtime": 178.6018, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 1.4, + "step": 400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002938040531818768, + "loss": 0.7233, + "step": 420 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029341680650574414, + "loss": 0.7272, + "step": 440 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002930295598296114, + "loss": 0.7272, + "step": 460 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029264231315347876, + "loss": 0.7281, + "step": 
480 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029225506647734603, + "loss": 0.7289, + "step": 500 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002918678198012133, + "loss": 0.7215, + "step": 520 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029148057312508065, + "loss": 0.7234, + "step": 540 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002910933264489479, + "loss": 0.7229, + "step": 560 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029070607977281526, + "loss": 0.7277, + "step": 580 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002903188330966826, + "loss": 0.7275, + "step": 600 + }, + { + "epoch": 0.08, + "eval_loss": 0.7319443821907043, + "eval_runtime": 178.6932, + "eval_samples_per_second": 11.192, + "eval_steps_per_second": 1.399, + "step": 600 + }, + { + "epoch": 0.08, + "learning_rate": 0.00028993158642054987, + "loss": 0.7195, + "step": 620 + }, + { + "epoch": 0.08, + "learning_rate": 0.00028954433974441715, + "loss": 0.723, + "step": 640 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002891570930682845, + "loss": 0.7337, + "step": 660 + }, + { + "epoch": 0.09, + "learning_rate": 0.00028876984639215176, + "loss": 0.7249, + "step": 680 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002883825997160191, + "loss": 0.7374, + "step": 700 + }, + { + "epoch": 0.09, + "learning_rate": 0.00028799535303988637, + "loss": 0.7144, + "step": 720 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002876081063637537, + "loss": 0.7213, + "step": 740 + }, + { + "epoch": 0.1, + "learning_rate": 0.000287220859687621, + "loss": 0.7215, + "step": 760 + }, + { + "epoch": 0.1, + "learning_rate": 0.00028683361301148826, + "loss": 0.7275, + "step": 780 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002864463663353556, + "loss": 0.7247, + "step": 800 + }, + { + "epoch": 0.1, + "eval_loss": 0.730565071105957, + "eval_runtime": 178.4749, + "eval_samples_per_second": 11.206, + "eval_steps_per_second": 1.401, + "step": 800 + }, + { + "epoch": 0.11, + "learning_rate": 
0.0002860591196592229, + "loss": 0.717, + "step": 820 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002856718729830902, + "loss": 0.7263, + "step": 840 + }, + { + "epoch": 0.11, + "learning_rate": 0.00028528462630695754, + "loss": 0.7188, + "step": 860 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002848973796308248, + "loss": 0.7241, + "step": 880 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002845101329546921, + "loss": 0.7257, + "step": 900 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028412288627855943, + "loss": 0.7315, + "step": 920 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002837356396024267, + "loss": 0.7239, + "step": 940 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028334839292629404, + "loss": 0.7219, + "step": 960 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002829611462501613, + "loss": 0.7257, + "step": 980 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002825738995740286, + "loss": 0.7287, + "step": 1000 + }, + { + "epoch": 0.13, + "eval_loss": 0.7289888858795166, + "eval_runtime": 178.5088, + "eval_samples_per_second": 11.204, + "eval_steps_per_second": 1.4, + "step": 1000 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028218665289789593, + "loss": 0.7173, + "step": 1020 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028179940622176326, + "loss": 0.7185, + "step": 1040 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028141215954563054, + "loss": 0.7204, + "step": 1060 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002810249128694979, + "loss": 0.7301, + "step": 1080 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028063766619336515, + "loss": 0.7254, + "step": 1100 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028025041951723243, + "loss": 0.7212, + "step": 1120 + }, + { + "epoch": 0.15, + "learning_rate": 0.00027986317284109976, + "loss": 0.7265, + "step": 1140 + }, + { + "epoch": 0.15, + "learning_rate": 0.00027947592616496704, + "loss": 0.7201, + "step": 1160 + }, + { + "epoch": 0.15, + "learning_rate": 
0.0002790886794888344, + "loss": 0.7273, + "step": 1180 + }, + { + "epoch": 0.15, + "learning_rate": 0.00027870143281270165, + "loss": 0.72, + "step": 1200 + }, + { + "epoch": 0.15, + "eval_loss": 0.727615237236023, + "eval_runtime": 178.7045, + "eval_samples_per_second": 11.192, + "eval_steps_per_second": 1.399, + "step": 1200 + }, + { + "epoch": 0.16, + "learning_rate": 0.000278314186136569, + "loss": 0.7307, + "step": 1220 + }, + { + "epoch": 0.16, + "learning_rate": 0.00027792693946043627, + "loss": 0.7164, + "step": 1240 + }, + { + "epoch": 0.16, + "learning_rate": 0.00027753969278430354, + "loss": 0.7163, + "step": 1260 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002771524461081709, + "loss": 0.7127, + "step": 1280 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002767651994320382, + "loss": 0.7123, + "step": 1300 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002763779527559055, + "loss": 0.728, + "step": 1320 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002759907060797728, + "loss": 0.7263, + "step": 1340 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002756034594036401, + "loss": 0.7188, + "step": 1360 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002752162127275074, + "loss": 0.7142, + "step": 1380 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002748289660513747, + "loss": 0.7215, + "step": 1400 + }, + { + "epoch": 0.18, + "eval_loss": 0.7257346510887146, + "eval_runtime": 178.9403, + "eval_samples_per_second": 11.177, + "eval_steps_per_second": 1.397, + "step": 1400 + }, + { + "epoch": 0.18, + "learning_rate": 0.000274441719375242, + "loss": 0.7262, + "step": 1420 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002740544726991093, + "loss": 0.7185, + "step": 1440 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002736672260229766, + "loss": 0.7093, + "step": 1460 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002732799793468439, + "loss": 0.7183, + "step": 1480 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002728927326707112, + "loss": 
0.7006, + "step": 1500 + }, + { + "epoch": 0.19, + "learning_rate": 0.00027250548599457855, + "loss": 0.7168, + "step": 1520 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002721182393184458, + "loss": 0.7225, + "step": 1540 + }, + { + "epoch": 0.2, + "learning_rate": 0.00027173099264231316, + "loss": 0.7248, + "step": 1560 + }, + { + "epoch": 0.2, + "learning_rate": 0.00027134374596618044, + "loss": 0.7215, + "step": 1580 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002709564992900477, + "loss": 0.7179, + "step": 1600 + }, + { + "epoch": 0.21, + "eval_loss": 0.7245064377784729, + "eval_runtime": 177.8967, + "eval_samples_per_second": 11.242, + "eval_steps_per_second": 1.405, + "step": 1600 + }, + { + "epoch": 0.21, + "learning_rate": 0.00027056925261391505, + "loss": 0.7208, + "step": 1620 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002701820059377823, + "loss": 0.7213, + "step": 1640 + }, + { + "epoch": 0.21, + "learning_rate": 0.00026979475926164966, + "loss": 0.7221, + "step": 1660 + }, + { + "epoch": 0.22, + "learning_rate": 0.00026940751258551694, + "loss": 0.7207, + "step": 1680 + }, + { + "epoch": 0.22, + "learning_rate": 0.00026902026590938427, + "loss": 0.7218, + "step": 1700 + }, + { + "epoch": 0.22, + "learning_rate": 0.00026863301923325155, + "loss": 0.718, + "step": 1720 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002682457725571189, + "loss": 0.7235, + "step": 1740 + }, + { + "epoch": 0.23, + "learning_rate": 0.00026785852588098616, + "loss": 0.7138, + "step": 1760 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002674712792048535, + "loss": 0.7178, + "step": 1780 + }, + { + "epoch": 0.23, + "learning_rate": 0.00026708403252872077, + "loss": 0.7127, + "step": 1800 + }, + { + "epoch": 0.23, + "eval_loss": 0.7234225869178772, + "eval_runtime": 177.3731, + "eval_samples_per_second": 11.276, + "eval_steps_per_second": 1.409, + "step": 1800 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002666967858525881, + "loss": 0.7214, + "step": 1820 + }, 
+ { + "epoch": 0.24, + "learning_rate": 0.0002663095391764554, + "loss": 0.7164, + "step": 1840 + }, + { + "epoch": 0.24, + "learning_rate": 0.00026592229250032266, + "loss": 0.7132, + "step": 1860 + }, + { + "epoch": 0.24, + "learning_rate": 0.00026553504582419, + "loss": 0.718, + "step": 1880 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002651477991480573, + "loss": 0.7088, + "step": 1900 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002647605524719246, + "loss": 0.7145, + "step": 1920 + }, + { + "epoch": 0.25, + "learning_rate": 0.00026437330579579194, + "loss": 0.7166, + "step": 1940 + }, + { + "epoch": 0.25, + "learning_rate": 0.00026398605911965916, + "loss": 0.7141, + "step": 1960 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002635988124435265, + "loss": 0.708, + "step": 1980 + }, + { + "epoch": 0.26, + "learning_rate": 0.00026321156576739383, + "loss": 0.7162, + "step": 2000 + }, + { + "epoch": 0.26, + "eval_loss": 0.7224385142326355, + "eval_runtime": 177.124, + "eval_samples_per_second": 11.292, + "eval_steps_per_second": 1.411, + "step": 2000 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002628243190912611, + "loss": 0.7091, + "step": 2020 + }, + { + "epoch": 0.26, + "learning_rate": 0.00026243707241512844, + "loss": 0.7133, + "step": 2040 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002620498257389957, + "loss": 0.717, + "step": 2060 + }, + { + "epoch": 0.27, + "learning_rate": 0.000261662579062863, + "loss": 0.7246, + "step": 2080 + }, + { + "epoch": 0.27, + "learning_rate": 0.00026127533238673033, + "loss": 0.7169, + "step": 2100 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002608880857105976, + "loss": 0.7121, + "step": 2120 + }, + { + "epoch": 0.27, + "learning_rate": 0.00026050083903446494, + "loss": 0.719, + "step": 2140 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002601135923583322, + "loss": 0.7236, + "step": 2160 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025972634568219955, + "loss": 0.7154, + "step": 2180 + }, + { + 
"epoch": 0.28, + "learning_rate": 0.00025933909900606683, + "loss": 0.7148, + "step": 2200 + }, + { + "epoch": 0.28, + "eval_loss": 0.7211937308311462, + "eval_runtime": 177.2195, + "eval_samples_per_second": 11.285, + "eval_steps_per_second": 1.411, + "step": 2200 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025895185232993416, + "loss": 0.7193, + "step": 2220 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025856460565380144, + "loss": 0.7046, + "step": 2240 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002581773589776688, + "loss": 0.7231, + "step": 2260 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025779011230153605, + "loss": 0.719, + "step": 2280 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025740286562540333, + "loss": 0.707, + "step": 2300 + }, + { + "epoch": 0.3, + "learning_rate": 0.00025701561894927067, + "loss": 0.7176, + "step": 2320 + }, + { + "epoch": 0.3, + "learning_rate": 0.00025662837227313794, + "loss": 0.7205, + "step": 2340 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002562411255970053, + "loss": 0.7162, + "step": 2360 + }, + { + "epoch": 0.31, + "learning_rate": 0.00025585387892087256, + "loss": 0.7223, + "step": 2380 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002554666322447399, + "loss": 0.7158, + "step": 2400 + }, + { + "epoch": 0.31, + "eval_loss": 0.7201904654502869, + "eval_runtime": 177.1891, + "eval_samples_per_second": 11.287, + "eval_steps_per_second": 1.411, + "step": 2400 + }, + { + "epoch": 0.31, + "learning_rate": 0.00025507938556860717, + "loss": 0.7155, + "step": 2420 + }, + { + "epoch": 0.31, + "learning_rate": 0.00025469213889247445, + "loss": 0.7141, + "step": 2440 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002543048922163418, + "loss": 0.7206, + "step": 2460 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002539176455402091, + "loss": 0.7147, + "step": 2480 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002535303988640764, + "loss": 0.7116, + "step": 2500 + }, + { + "epoch": 0.32, + 
"learning_rate": 0.0002531431521879437, + "loss": 0.7161, + "step": 2520 + }, + { + "epoch": 0.33, + "learning_rate": 0.000252755905511811, + "loss": 0.7173, + "step": 2540 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002523686588356783, + "loss": 0.7159, + "step": 2560 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002519814121595456, + "loss": 0.7182, + "step": 2580 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002515941654834129, + "loss": 0.723, + "step": 2600 + }, + { + "epoch": 0.33, + "eval_loss": 0.7193037867546082, + "eval_runtime": 177.0591, + "eval_samples_per_second": 11.296, + "eval_steps_per_second": 1.412, + "step": 2600 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002512069188072802, + "loss": 0.717, + "step": 2620 + }, + { + "epoch": 0.34, + "learning_rate": 0.00025081967213114756, + "loss": 0.7164, + "step": 2640 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002504324254550148, + "loss": 0.7173, + "step": 2660 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002500451787788821, + "loss": 0.7154, + "step": 2680 + }, + { + "epoch": 0.35, + "learning_rate": 0.00024965793210274945, + "loss": 0.7101, + "step": 2700 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002492706854266167, + "loss": 0.7088, + "step": 2720 + }, + { + "epoch": 0.35, + "learning_rate": 0.00024888343875048406, + "loss": 0.715, + "step": 2740 + }, + { + "epoch": 0.35, + "learning_rate": 0.00024849619207435134, + "loss": 0.7165, + "step": 2760 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002481089453982186, + "loss": 0.7094, + "step": 2780 + }, + { + "epoch": 0.36, + "learning_rate": 0.00024772169872208595, + "loss": 0.7116, + "step": 2800 + }, + { + "epoch": 0.36, + "eval_loss": 0.7178795337677002, + "eval_runtime": 177.1788, + "eval_samples_per_second": 11.288, + "eval_steps_per_second": 1.411, + "step": 2800 + }, + { + "epoch": 0.36, + "learning_rate": 0.00024733445204595323, + "loss": 0.7121, + "step": 2820 + }, + { + "epoch": 0.36, + "learning_rate": 
0.00024694720536982056, + "loss": 0.7197, + "step": 2840 + }, + { + "epoch": 0.37, + "learning_rate": 0.00024655995869368784, + "loss": 0.7133, + "step": 2860 + }, + { + "epoch": 0.37, + "learning_rate": 0.00024617271201755517, + "loss": 0.7124, + "step": 2880 + }, + { + "epoch": 0.37, + "learning_rate": 0.00024578546534142245, + "loss": 0.7081, + "step": 2900 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002453982186652898, + "loss": 0.7105, + "step": 2920 + }, + { + "epoch": 0.38, + "learning_rate": 0.00024501097198915706, + "loss": 0.7005, + "step": 2940 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002446237253130244, + "loss": 0.7111, + "step": 2960 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002442364786368917, + "loss": 0.7035, + "step": 2980 + }, + { + "epoch": 0.38, + "learning_rate": 0.000243849231960759, + "loss": 0.7125, + "step": 3000 + }, + { + "epoch": 0.38, + "eval_loss": 0.7173203229904175, + "eval_runtime": 176.9402, + "eval_samples_per_second": 11.303, + "eval_steps_per_second": 1.413, + "step": 3000 + }, + { + "epoch": 0.39, + "learning_rate": 0.00024346198528462626, + "loss": 0.7143, + "step": 3020 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002430747386084936, + "loss": 0.7121, + "step": 3040 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002426874919323609, + "loss": 0.7093, + "step": 3060 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002423002452562282, + "loss": 0.711, + "step": 3080 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002419129985800955, + "loss": 0.7239, + "step": 3100 + }, + { + "epoch": 0.4, + "learning_rate": 0.00024152575190396281, + "loss": 0.7183, + "step": 3120 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002411385052278301, + "loss": 0.7056, + "step": 3140 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002407512585516974, + "loss": 0.7109, + "step": 3160 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002403640118755647, + "loss": 0.7183, + "step": 3180 + }, + { + "epoch": 0.41, + "learning_rate": 
0.000239976765199432, + "loss": 0.7135, + "step": 3200 + }, + { + "epoch": 0.41, + "eval_loss": 0.7156603932380676, + "eval_runtime": 178.5954, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 1.4, + "step": 3200 + }, + { + "epoch": 0.41, + "learning_rate": 0.00023958951852329932, + "loss": 0.7022, + "step": 3220 + }, + { + "epoch": 0.42, + "learning_rate": 0.00023920227184716665, + "loss": 0.7155, + "step": 3240 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002388150251710339, + "loss": 0.7072, + "step": 3260 + }, + { + "epoch": 0.42, + "learning_rate": 0.00023842777849490123, + "loss": 0.7151, + "step": 3280 + }, + { + "epoch": 0.42, + "learning_rate": 0.00023804053181876854, + "loss": 0.7044, + "step": 3300 + }, + { + "epoch": 0.43, + "learning_rate": 0.00023765328514263584, + "loss": 0.7141, + "step": 3320 + }, + { + "epoch": 0.43, + "learning_rate": 0.00023726603846650315, + "loss": 0.7033, + "step": 3340 + }, + { + "epoch": 0.43, + "learning_rate": 0.00023687879179037046, + "loss": 0.7137, + "step": 3360 + }, + { + "epoch": 0.43, + "learning_rate": 0.00023649154511423773, + "loss": 0.7042, + "step": 3380 + }, + { + "epoch": 0.44, + "learning_rate": 0.00023610429843810504, + "loss": 0.7156, + "step": 3400 + }, + { + "epoch": 0.44, + "eval_loss": 0.7149476408958435, + "eval_runtime": 178.5798, + "eval_samples_per_second": 11.199, + "eval_steps_per_second": 1.4, + "step": 3400 + }, + { + "epoch": 0.44, + "learning_rate": 0.00023571705176197235, + "loss": 0.7045, + "step": 3420 + }, + { + "epoch": 0.44, + "learning_rate": 0.00023532980508583965, + "loss": 0.7021, + "step": 3440 + }, + { + "epoch": 0.44, + "learning_rate": 0.00023494255840970698, + "loss": 0.7092, + "step": 3460 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002345553117335743, + "loss": 0.7213, + "step": 3480 + }, + { + "epoch": 0.45, + "learning_rate": 0.00023416806505744157, + "loss": 0.7046, + "step": 3500 + }, + { + "epoch": 0.45, + "learning_rate": 0.00023378081838130887, + 
"loss": 0.7076, + "step": 3520 + }, + { + "epoch": 0.45, + "learning_rate": 0.00023339357170517618, + "loss": 0.7107, + "step": 3540 + }, + { + "epoch": 0.46, + "learning_rate": 0.00023300632502904349, + "loss": 0.7087, + "step": 3560 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002326190783529108, + "loss": 0.7005, + "step": 3580 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002322318316767781, + "loss": 0.7064, + "step": 3600 + }, + { + "epoch": 0.46, + "eval_loss": 0.7140311002731323, + "eval_runtime": 179.0415, + "eval_samples_per_second": 11.171, + "eval_steps_per_second": 1.396, + "step": 3600 + }, + { + "epoch": 0.46, + "learning_rate": 0.00023184458500064538, + "loss": 0.714, + "step": 3620 + }, + { + "epoch": 0.47, + "learning_rate": 0.00023145733832451268, + "loss": 0.7102, + "step": 3640 + }, + { + "epoch": 0.47, + "learning_rate": 0.00023107009164838, + "loss": 0.7202, + "step": 3660 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002306828449722473, + "loss": 0.7016, + "step": 3680 + }, + { + "epoch": 0.47, + "learning_rate": 0.00023029559829611463, + "loss": 0.7126, + "step": 3700 + }, + { + "epoch": 0.48, + "learning_rate": 0.00022990835161998193, + "loss": 0.7055, + "step": 3720 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002295211049438492, + "loss": 0.7118, + "step": 3740 + }, + { + "epoch": 0.48, + "learning_rate": 0.00022913385826771652, + "loss": 0.707, + "step": 3760 + }, + { + "epoch": 0.48, + "learning_rate": 0.00022874661159158382, + "loss": 0.7119, + "step": 3780 + }, + { + "epoch": 0.49, + "learning_rate": 0.00022835936491545113, + "loss": 0.7023, + "step": 3800 + }, + { + "epoch": 0.49, + "eval_loss": 0.7134947776794434, + "eval_runtime": 179.7115, + "eval_samples_per_second": 11.129, + "eval_steps_per_second": 1.391, + "step": 3800 + }, + { + "epoch": 0.49, + "learning_rate": 0.00022797211823931843, + "loss": 0.6967, + "step": 3820 + }, + { + "epoch": 0.49, + "learning_rate": 0.00022758487156318574, + "loss": 0.7172, + "step": 
3840 + }, + { + "epoch": 0.5, + "learning_rate": 0.00022719762488705302, + "loss": 0.7137, + "step": 3860 + }, + { + "epoch": 0.5, + "learning_rate": 0.00022681037821092032, + "loss": 0.7164, + "step": 3880 + }, + { + "epoch": 0.5, + "learning_rate": 0.00022642313153478763, + "loss": 0.7099, + "step": 3900 + }, + { + "epoch": 0.5, + "learning_rate": 0.00022603588485865494, + "loss": 0.7119, + "step": 3920 + }, + { + "epoch": 0.51, + "learning_rate": 0.00022564863818252227, + "loss": 0.7098, + "step": 3940 + }, + { + "epoch": 0.51, + "learning_rate": 0.00022526139150638957, + "loss": 0.7067, + "step": 3960 + }, + { + "epoch": 0.51, + "learning_rate": 0.00022487414483025685, + "loss": 0.705, + "step": 3980 + }, + { + "epoch": 0.51, + "learning_rate": 0.00022448689815412416, + "loss": 0.7125, + "step": 4000 + }, + { + "epoch": 0.51, + "eval_loss": 0.7128713130950928, + "eval_runtime": 178.8128, + "eval_samples_per_second": 11.185, + "eval_steps_per_second": 1.398, + "step": 4000 + }, + { + "epoch": 0.52, + "learning_rate": 0.00022409965147799146, + "loss": 0.7098, + "step": 4020 + }, + { + "epoch": 0.52, + "learning_rate": 0.00022371240480185877, + "loss": 0.7081, + "step": 4040 + }, + { + "epoch": 0.52, + "learning_rate": 0.00022332515812572608, + "loss": 0.6982, + "step": 4060 + }, + { + "epoch": 0.52, + "learning_rate": 0.00022293791144959338, + "loss": 0.7122, + "step": 4080 + }, + { + "epoch": 0.53, + "learning_rate": 0.00022255066477346066, + "loss": 0.6974, + "step": 4100 + }, + { + "epoch": 0.53, + "learning_rate": 0.00022216341809732797, + "loss": 0.7018, + "step": 4120 + }, + { + "epoch": 0.53, + "learning_rate": 0.00022177617142119527, + "loss": 0.7075, + "step": 4140 + }, + { + "epoch": 0.53, + "learning_rate": 0.00022138892474506258, + "loss": 0.7013, + "step": 4160 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002210016780689299, + "loss": 0.7103, + "step": 4180 + }, + { + "epoch": 0.54, + "learning_rate": 0.00022061443139279722, + "loss": 0.7005, + 
"step": 4200 + }, + { + "epoch": 0.54, + "eval_loss": 0.7116231918334961, + "eval_runtime": 179.0816, + "eval_samples_per_second": 11.168, + "eval_steps_per_second": 1.396, + "step": 4200 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002202271847166645, + "loss": 0.7071, + "step": 4220 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002198399380405318, + "loss": 0.7114, + "step": 4240 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002194526913643991, + "loss": 0.705, + "step": 4260 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002190654446882664, + "loss": 0.706, + "step": 4280 + }, + { + "epoch": 0.55, + "learning_rate": 0.00021867819801213372, + "loss": 0.7016, + "step": 4300 + }, + { + "epoch": 0.55, + "learning_rate": 0.00021829095133600102, + "loss": 0.7084, + "step": 4320 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002179037046598683, + "loss": 0.7187, + "step": 4340 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002175164579837356, + "loss": 0.7044, + "step": 4360 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002171292113076029, + "loss": 0.7068, + "step": 4380 + }, + { + "epoch": 0.56, + "learning_rate": 0.00021674196463147025, + "loss": 0.7082, + "step": 4400 + }, + { + "epoch": 0.56, + "eval_loss": 0.7112064957618713, + "eval_runtime": 178.8847, + "eval_samples_per_second": 11.18, + "eval_steps_per_second": 1.398, + "step": 4400 + }, + { + "epoch": 0.57, + "learning_rate": 0.00021635471795533755, + "loss": 0.705, + "step": 4420 + }, + { + "epoch": 0.57, + "learning_rate": 0.00021596747127920486, + "loss": 0.704, + "step": 4440 + }, + { + "epoch": 0.57, + "learning_rate": 0.00021558022460307214, + "loss": 0.7071, + "step": 4460 + }, + { + "epoch": 0.57, + "learning_rate": 0.00021519297792693944, + "loss": 0.708, + "step": 4480 + }, + { + "epoch": 0.58, + "learning_rate": 0.00021480573125080675, + "loss": 0.705, + "step": 4500 + }, + { + "epoch": 0.58, + "learning_rate": 0.00021441848457467405, + "loss": 0.7061, + "step": 4520 + }, + { + 
"epoch": 0.58, + "learning_rate": 0.00021403123789854136, + "loss": 0.7074, + "step": 4540 + }, + { + "epoch": 0.58, + "learning_rate": 0.00021364399122240864, + "loss": 0.7148, + "step": 4560 + }, + { + "epoch": 0.59, + "learning_rate": 0.00021325674454627594, + "loss": 0.7091, + "step": 4580 + }, + { + "epoch": 0.59, + "learning_rate": 0.00021286949787014325, + "loss": 0.7103, + "step": 4600 + }, + { + "epoch": 0.59, + "eval_loss": 0.7104864716529846, + "eval_runtime": 178.4457, + "eval_samples_per_second": 11.208, + "eval_steps_per_second": 1.401, + "step": 4600 + }, + { + "epoch": 0.59, + "learning_rate": 0.00021248225119401055, + "loss": 0.706, + "step": 4620 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002120950045178779, + "loss": 0.6966, + "step": 4640 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002117077578417452, + "loss": 0.6991, + "step": 4660 + }, + { + "epoch": 0.6, + "learning_rate": 0.00021132051116561247, + "loss": 0.7039, + "step": 4680 + }, + { + "epoch": 0.6, + "learning_rate": 0.00021093326448947978, + "loss": 0.7059, + "step": 4700 + }, + { + "epoch": 0.61, + "learning_rate": 0.00021054601781334708, + "loss": 0.7122, + "step": 4720 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002101587711372144, + "loss": 0.7099, + "step": 4740 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002097715244610817, + "loss": 0.6998, + "step": 4760 + }, + { + "epoch": 0.61, + "learning_rate": 0.000209384277784949, + "loss": 0.7048, + "step": 4780 + }, + { + "epoch": 0.62, + "learning_rate": 0.00020899703110881628, + "loss": 0.7077, + "step": 4800 + }, + { + "epoch": 0.62, + "eval_loss": 0.7102417945861816, + "eval_runtime": 178.4169, + "eval_samples_per_second": 11.21, + "eval_steps_per_second": 1.401, + "step": 4800 + }, + { + "epoch": 0.62, + "learning_rate": 0.00020860978443268359, + "loss": 0.7172, + "step": 4820 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002082225377565509, + "loss": 0.7084, + "step": 4840 + }, + { + "epoch": 0.62, + 
"learning_rate": 0.0002078352910804182, + "loss": 0.7058, + "step": 4860 + }, + { + "epoch": 0.63, + "learning_rate": 0.00020744804440428553, + "loss": 0.6988, + "step": 4880 + }, + { + "epoch": 0.63, + "learning_rate": 0.00020706079772815284, + "loss": 0.7008, + "step": 4900 + }, + { + "epoch": 0.63, + "learning_rate": 0.00020667355105202011, + "loss": 0.6986, + "step": 4920 + }, + { + "epoch": 0.63, + "learning_rate": 0.00020628630437588742, + "loss": 0.7042, + "step": 4940 + }, + { + "epoch": 0.64, + "learning_rate": 0.00020589905769975473, + "loss": 0.7139, + "step": 4960 + }, + { + "epoch": 0.64, + "learning_rate": 0.00020551181102362203, + "loss": 0.7094, + "step": 4980 + }, + { + "epoch": 0.64, + "learning_rate": 0.00020512456434748934, + "loss": 0.7059, + "step": 5000 + }, + { + "epoch": 0.64, + "eval_loss": 0.7092374563217163, + "eval_runtime": 177.3439, + "eval_samples_per_second": 11.278, + "eval_steps_per_second": 1.41, + "step": 5000 + }, + { + "epoch": 0.64, + "learning_rate": 0.00020473731767135664, + "loss": 0.7042, + "step": 5020 + }, + { + "epoch": 0.65, + "learning_rate": 0.00020435007099522392, + "loss": 0.6964, + "step": 5040 + }, + { + "epoch": 0.65, + "learning_rate": 0.00020396282431909123, + "loss": 0.7041, + "step": 5060 + }, + { + "epoch": 0.65, + "learning_rate": 0.00020357557764295853, + "loss": 0.6972, + "step": 5080 + }, + { + "epoch": 0.65, + "learning_rate": 0.00020318833096682587, + "loss": 0.7011, + "step": 5100 + }, + { + "epoch": 0.66, + "learning_rate": 0.00020280108429069317, + "loss": 0.7073, + "step": 5120 + }, + { + "epoch": 0.66, + "learning_rate": 0.00020241383761456048, + "loss": 0.706, + "step": 5140 + }, + { + "epoch": 0.66, + "learning_rate": 0.00020202659093842776, + "loss": 0.6949, + "step": 5160 + }, + { + "epoch": 0.66, + "learning_rate": 0.00020163934426229506, + "loss": 0.703, + "step": 5180 + }, + { + "epoch": 0.67, + "learning_rate": 0.00020125209758616237, + "loss": 0.7058, + "step": 5200 + }, + { + "epoch": 
0.67, + "eval_loss": 0.7084789276123047, + "eval_runtime": 177.3051, + "eval_samples_per_second": 11.28, + "eval_steps_per_second": 1.41, + "step": 5200 + }, + { + "epoch": 0.67, + "learning_rate": 0.00020086485091002967, + "loss": 0.7032, + "step": 5220 + }, + { + "epoch": 0.67, + "learning_rate": 0.00020047760423389698, + "loss": 0.7045, + "step": 5240 + }, + { + "epoch": 0.67, + "learning_rate": 0.00020009035755776428, + "loss": 0.7069, + "step": 5260 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019970311088163156, + "loss": 0.6961, + "step": 5280 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019931586420549887, + "loss": 0.6981, + "step": 5300 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019892861752936617, + "loss": 0.701, + "step": 5320 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001985413708532335, + "loss": 0.6979, + "step": 5340 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001981541241771008, + "loss": 0.6975, + "step": 5360 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019776687750096812, + "loss": 0.6923, + "step": 5380 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001973796308248354, + "loss": 0.7089, + "step": 5400 + }, + { + "epoch": 0.69, + "eval_loss": 0.7074704170227051, + "eval_runtime": 177.1889, + "eval_samples_per_second": 11.287, + "eval_steps_per_second": 1.411, + "step": 5400 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001969923841487027, + "loss": 0.6986, + "step": 5420 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019660513747257, + "loss": 0.6991, + "step": 5440 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019621789079643731, + "loss": 0.6946, + "step": 5460 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019583064412030462, + "loss": 0.7057, + "step": 5480 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019544339744417193, + "loss": 0.694, + "step": 5500 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001950561507680392, + "loss": 0.7046, + "step": 5520 + }, + { + "epoch": 0.71, + "learning_rate": 
0.0001946689040919065, + "loss": 0.6998, + "step": 5540 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019428165741577382, + "loss": 0.6995, + "step": 5560 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019389441073964115, + "loss": 0.7081, + "step": 5580 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019350716406350845, + "loss": 0.6974, + "step": 5600 + }, + { + "epoch": 0.72, + "eval_loss": 0.7067714333534241, + "eval_runtime": 177.0363, + "eval_samples_per_second": 11.297, + "eval_steps_per_second": 1.412, + "step": 5600 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019311991738737576, + "loss": 0.7019, + "step": 5620 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019273267071124304, + "loss": 0.7003, + "step": 5640 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019234542403511034, + "loss": 0.6966, + "step": 5660 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019195817735897765, + "loss": 0.7055, + "step": 5680 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019157093068284496, + "loss": 0.7069, + "step": 5700 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019118368400671226, + "loss": 0.6981, + "step": 5720 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019079643733057957, + "loss": 0.7005, + "step": 5740 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019040919065444685, + "loss": 0.7033, + "step": 5760 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019002194397831415, + "loss": 0.7009, + "step": 5780 + }, + { + "epoch": 0.74, + "learning_rate": 0.00018963469730218146, + "loss": 0.7001, + "step": 5800 + }, + { + "epoch": 0.74, + "eval_loss": 0.7066617608070374, + "eval_runtime": 177.2571, + "eval_samples_per_second": 11.283, + "eval_steps_per_second": 1.41, + "step": 5800 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001892474506260488, + "loss": 0.7048, + "step": 5820 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001888602039499161, + "loss": 0.698, + "step": 5840 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001884729572737834, 
+ "loss": 0.7035, + "step": 5860 + }, + { + "epoch": 0.75, + "learning_rate": 0.00018808571059765068, + "loss": 0.6997, + "step": 5880 + }, + { + "epoch": 0.76, + "learning_rate": 0.00018769846392151799, + "loss": 0.7053, + "step": 5900 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001873112172453853, + "loss": 0.6951, + "step": 5920 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001869239705692526, + "loss": 0.701, + "step": 5940 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001865367238931199, + "loss": 0.7032, + "step": 5960 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001861494772169872, + "loss": 0.7005, + "step": 5980 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001857622305408545, + "loss": 0.7013, + "step": 6000 + }, + { + "epoch": 0.77, + "eval_loss": 0.705744743347168, + "eval_runtime": 177.421, + "eval_samples_per_second": 11.273, + "eval_steps_per_second": 1.409, + "step": 6000 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001853749838647218, + "loss": 0.6923, + "step": 6020 + }, + { + "epoch": 0.77, + "learning_rate": 0.00018498773718858913, + "loss": 0.701, + "step": 6040 + }, + { + "epoch": 0.78, + "learning_rate": 0.00018460049051245643, + "loss": 0.7027, + "step": 6060 + }, + { + "epoch": 0.78, + "learning_rate": 0.00018421324383632374, + "loss": 0.6973, + "step": 6080 + }, + { + "epoch": 0.78, + "learning_rate": 0.00018382599716019104, + "loss": 0.6982, + "step": 6100 + }, + { + "epoch": 0.78, + "learning_rate": 0.00018343875048405832, + "loss": 0.7024, + "step": 6120 + }, + { + "epoch": 0.79, + "learning_rate": 0.00018305150380792563, + "loss": 0.6996, + "step": 6140 + }, + { + "epoch": 0.79, + "learning_rate": 0.00018266425713179293, + "loss": 0.7063, + "step": 6160 + }, + { + "epoch": 0.79, + "learning_rate": 0.00018227701045566024, + "loss": 0.7005, + "step": 6180 + }, + { + "epoch": 0.8, + "learning_rate": 0.00018188976377952755, + "loss": 0.6913, + "step": 6200 + }, + { + "epoch": 0.8, + "eval_loss": 0.7049428224563599, + 
"eval_runtime": 180.0706, + "eval_samples_per_second": 11.107, + "eval_steps_per_second": 1.388, + "step": 6200 + }, + { + "epoch": 0.8, + "learning_rate": 0.00018150251710339485, + "loss": 0.6998, + "step": 6220 + }, + { + "epoch": 0.8, + "learning_rate": 0.00018111527042726213, + "loss": 0.7044, + "step": 6240 + }, + { + "epoch": 0.8, + "learning_rate": 0.00018072802375112944, + "loss": 0.6988, + "step": 6260 + }, + { + "epoch": 0.81, + "learning_rate": 0.00018034077707499677, + "loss": 0.6979, + "step": 6280 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017995353039886407, + "loss": 0.7048, + "step": 6300 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017956628372273138, + "loss": 0.6941, + "step": 6320 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017917903704659869, + "loss": 0.6963, + "step": 6340 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017879179037046596, + "loss": 0.6954, + "step": 6360 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017840454369433327, + "loss": 0.6953, + "step": 6380 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017801729701820058, + "loss": 0.693, + "step": 6400 + }, + { + "epoch": 0.82, + "eval_loss": 0.7036707997322083, + "eval_runtime": 178.6236, + "eval_samples_per_second": 11.197, + "eval_steps_per_second": 1.4, + "step": 6400 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017763005034206788, + "loss": 0.7041, + "step": 6420 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001772428036659352, + "loss": 0.6908, + "step": 6440 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001768555569898025, + "loss": 0.6961, + "step": 6460 + }, + { + "epoch": 0.83, + "learning_rate": 0.00017646831031366977, + "loss": 0.6967, + "step": 6480 + }, + { + "epoch": 0.83, + "learning_rate": 0.00017608106363753708, + "loss": 0.7019, + "step": 6500 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001756938169614044, + "loss": 0.7036, + "step": 6520 + }, + { + "epoch": 0.84, + "learning_rate": 0.00017530657028527172, + "loss": 0.6941, + 
"step": 6540 + }, + { + "epoch": 0.84, + "learning_rate": 0.00017491932360913902, + "loss": 0.6995, + "step": 6560 + }, + { + "epoch": 0.84, + "learning_rate": 0.00017453207693300633, + "loss": 0.6962, + "step": 6580 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001741448302568736, + "loss": 0.6963, + "step": 6600 + }, + { + "epoch": 0.85, + "eval_loss": 0.7036789655685425, + "eval_runtime": 177.1424, + "eval_samples_per_second": 11.29, + "eval_steps_per_second": 1.411, + "step": 6600 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001737575835807409, + "loss": 0.7009, + "step": 6620 + }, + { + "epoch": 0.85, + "learning_rate": 0.00017337033690460822, + "loss": 0.6964, + "step": 6640 + }, + { + "epoch": 0.85, + "learning_rate": 0.00017298309022847552, + "loss": 0.6974, + "step": 6660 + }, + { + "epoch": 0.86, + "learning_rate": 0.00017259584355234283, + "loss": 0.6964, + "step": 6680 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001722085968762101, + "loss": 0.6966, + "step": 6700 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001718213502000774, + "loss": 0.7016, + "step": 6720 + }, + { + "epoch": 0.86, + "learning_rate": 0.00017143410352394475, + "loss": 0.6996, + "step": 6740 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017104685684781205, + "loss": 0.6985, + "step": 6760 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017065961017167936, + "loss": 0.7, + "step": 6780 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017027236349554666, + "loss": 0.6846, + "step": 6800 + }, + { + "epoch": 0.87, + "eval_loss": 0.7028091549873352, + "eval_runtime": 177.0434, + "eval_samples_per_second": 11.297, + "eval_steps_per_second": 1.412, + "step": 6800 + }, + { + "epoch": 0.87, + "learning_rate": 0.00016988511681941394, + "loss": 0.6994, + "step": 6820 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016949787014328125, + "loss": 0.6995, + "step": 6840 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016911062346714855, + "loss": 0.6949, + "step": 6860 + }, + { + 
"epoch": 0.88, + "learning_rate": 0.00016872337679101586, + "loss": 0.6903, + "step": 6880 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016833613011488316, + "loss": 0.6983, + "step": 6900 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016794888343875047, + "loss": 0.6979, + "step": 6920 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016756163676261775, + "loss": 0.6963, + "step": 6940 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016717439008648505, + "loss": 0.6963, + "step": 6960 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001667871434103524, + "loss": 0.7109, + "step": 6980 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001663998967342197, + "loss": 0.6996, + "step": 7000 + }, + { + "epoch": 0.9, + "eval_loss": 0.7027884721755981, + "eval_runtime": 177.0464, + "eval_samples_per_second": 11.296, + "eval_steps_per_second": 1.412, + "step": 7000 + }, + { + "epoch": 0.9, + "learning_rate": 0.000166012650058087, + "loss": 0.6953, + "step": 7020 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001656254033819543, + "loss": 0.701, + "step": 7040 + }, + { + "epoch": 0.91, + "learning_rate": 0.00016523815670582158, + "loss": 0.6941, + "step": 7060 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001648509100296889, + "loss": 0.6946, + "step": 7080 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001644636633535562, + "loss": 0.6905, + "step": 7100 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001640764166774235, + "loss": 0.6938, + "step": 7120 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001636891700012908, + "loss": 0.6964, + "step": 7140 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001633019233251581, + "loss": 0.6979, + "step": 7160 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001629146766490254, + "loss": 0.6909, + "step": 7180 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001625274299728927, + "loss": 0.7017, + "step": 7200 + }, + { + "epoch": 0.92, + "eval_loss": 0.7013801336288452, + "eval_runtime": 177.3532, + "eval_samples_per_second": 
11.277, + "eval_steps_per_second": 1.41, + "step": 7200 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016214018329676003, + "loss": 0.6986, + "step": 7220 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016175293662062734, + "loss": 0.6985, + "step": 7240 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016136568994449464, + "loss": 0.694, + "step": 7260 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016097844326836195, + "loss": 0.7124, + "step": 7280 + }, + { + "epoch": 0.94, + "learning_rate": 0.00016059119659222923, + "loss": 0.6936, + "step": 7300 + }, + { + "epoch": 0.94, + "learning_rate": 0.00016020394991609653, + "loss": 0.6904, + "step": 7320 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015981670323996384, + "loss": 0.6994, + "step": 7340 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015942945656383114, + "loss": 0.7046, + "step": 7360 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015904220988769845, + "loss": 0.6999, + "step": 7380 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015865496321156575, + "loss": 0.6952, + "step": 7400 + }, + { + "epoch": 0.95, + "eval_loss": 0.7015686631202698, + "eval_runtime": 178.4607, + "eval_samples_per_second": 11.207, + "eval_steps_per_second": 1.401, + "step": 7400 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015826771653543303, + "loss": 0.7001, + "step": 7420 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015788046985930034, + "loss": 0.6948, + "step": 7440 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015749322318316767, + "loss": 0.702, + "step": 7460 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015710597650703498, + "loss": 0.695, + "step": 7480 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015671872983090228, + "loss": 0.7053, + "step": 7500 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001563314831547696, + "loss": 0.7041, + "step": 7520 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015594423647863687, + "loss": 0.6975, + "step": 7540 + }, + { + "epoch": 0.97, + 
"learning_rate": 0.00015555698980250417, + "loss": 0.6914, + "step": 7560 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015516974312637148, + "loss": 0.6961, + "step": 7580 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015478249645023878, + "loss": 0.6968, + "step": 7600 + }, + { + "epoch": 0.97, + "eval_loss": 0.7004283666610718, + "eval_runtime": 178.8057, + "eval_samples_per_second": 11.185, + "eval_steps_per_second": 1.398, + "step": 7600 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001543952497741061, + "loss": 0.6847, + "step": 7620 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015400800309797342, + "loss": 0.6954, + "step": 7640 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015362075642184067, + "loss": 0.6967, + "step": 7660 + }, + { + "epoch": 0.98, + "learning_rate": 0.000153233509745708, + "loss": 0.6941, + "step": 7680 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001528462630695753, + "loss": 0.6928, + "step": 7700 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015245901639344262, + "loss": 0.7035, + "step": 7720 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015207176971730992, + "loss": 0.6918, + "step": 7740 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015168452304117723, + "loss": 0.6996, + "step": 7760 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001512972763650445, + "loss": 0.6975, + "step": 7780 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015091002968891181, + "loss": 0.7022, + "step": 7800 + }, + { + "epoch": 1.0, + "eval_loss": 0.6998333930969238, + "eval_runtime": 178.2222, + "eval_samples_per_second": 11.222, + "eval_steps_per_second": 1.403, + "step": 7800 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015052278301277912, + "loss": 0.6854, + "step": 7820 + }, + { + "epoch": 1.01, + "learning_rate": 0.00015013553633664643, + "loss": 0.6911, + "step": 7840 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014974828966051373, + "loss": 0.6846, + "step": 7860 + }, + { + "epoch": 1.01, + "learning_rate": 
0.00014936104298438104, + "loss": 0.6859, + "step": 7880 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014897379630824834, + "loss": 0.6802, + "step": 7900 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014858654963211565, + "loss": 0.6891, + "step": 7920 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014819930295598295, + "loss": 0.6833, + "step": 7940 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014781205627985026, + "loss": 0.6866, + "step": 7960 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014742480960371754, + "loss": 0.6863, + "step": 7980 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014703756292758484, + "loss": 0.6898, + "step": 8000 + }, + { + "epoch": 1.03, + "eval_loss": 0.699661135673523, + "eval_runtime": 177.9496, + "eval_samples_per_second": 11.239, + "eval_steps_per_second": 1.405, + "step": 8000 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014665031625145218, + "loss": 0.6881, + "step": 8020 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014626306957531946, + "loss": 0.6894, + "step": 8040 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014587582289918676, + "loss": 0.685, + "step": 8060 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014548857622305407, + "loss": 0.6837, + "step": 8080 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014510132954692137, + "loss": 0.6944, + "step": 8100 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014471408287078868, + "loss": 0.6883, + "step": 8120 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014432683619465598, + "loss": 0.6874, + "step": 8140 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001439395895185233, + "loss": 0.6867, + "step": 8160 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001435523428423906, + "loss": 0.6875, + "step": 8180 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001431650961662579, + "loss": 0.7005, + "step": 8200 + }, + { + "epoch": 1.05, + "eval_loss": 0.699004590511322, + "eval_runtime": 177.1183, + "eval_samples_per_second": 11.292, + 
"eval_steps_per_second": 1.411, + "step": 8200 + }, + { + "epoch": 1.05, + "learning_rate": 0.00014277784949012518, + "loss": 0.6968, + "step": 8220 + }, + { + "epoch": 1.06, + "learning_rate": 0.00014239060281399249, + "loss": 0.6884, + "step": 8240 + }, + { + "epoch": 1.06, + "learning_rate": 0.00014200335613785982, + "loss": 0.6808, + "step": 8260 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001416161094617271, + "loss": 0.6851, + "step": 8280 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001412288627855944, + "loss": 0.6917, + "step": 8300 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001408416161094617, + "loss": 0.6944, + "step": 8320 + }, + { + "epoch": 1.07, + "learning_rate": 0.00014045436943332901, + "loss": 0.6851, + "step": 8340 + }, + { + "epoch": 1.07, + "learning_rate": 0.00014006712275719632, + "loss": 0.6829, + "step": 8360 + }, + { + "epoch": 1.07, + "learning_rate": 0.00013967987608106363, + "loss": 0.6872, + "step": 8380 + }, + { + "epoch": 1.08, + "learning_rate": 0.00013929262940493093, + "loss": 0.6909, + "step": 8400 + }, + { + "epoch": 1.08, + "eval_loss": 0.6990391612052917, + "eval_runtime": 177.2945, + "eval_samples_per_second": 11.281, + "eval_steps_per_second": 1.41, + "step": 8400 + }, + { + "epoch": 1.08, + "learning_rate": 0.00013890538272879824, + "loss": 0.6924, + "step": 8420 + }, + { + "epoch": 1.08, + "learning_rate": 0.00013851813605266554, + "loss": 0.6747, + "step": 8440 + }, + { + "epoch": 1.09, + "learning_rate": 0.00013813088937653282, + "loss": 0.6932, + "step": 8460 + }, + { + "epoch": 1.09, + "learning_rate": 0.00013774364270040016, + "loss": 0.6892, + "step": 8480 + }, + { + "epoch": 1.09, + "learning_rate": 0.00013735639602426746, + "loss": 0.6868, + "step": 8500 + }, + { + "epoch": 1.09, + "learning_rate": 0.00013696914934813474, + "loss": 0.6898, + "step": 8520 + }, + { + "epoch": 1.1, + "learning_rate": 0.00013658190267200205, + "loss": 0.6896, + "step": 8540 + }, + { + "epoch": 1.1, + "learning_rate": 
0.00013619465599586935, + "loss": 0.6848, + "step": 8560 + }, + { + "epoch": 1.1, + "learning_rate": 0.00013580740931973666, + "loss": 0.6801, + "step": 8580 + }, + { + "epoch": 1.1, + "learning_rate": 0.00013542016264360396, + "loss": 0.6927, + "step": 8600 + }, + { + "epoch": 1.1, + "eval_loss": 0.6984953880310059, + "eval_runtime": 177.3114, + "eval_samples_per_second": 11.28, + "eval_steps_per_second": 1.41, + "step": 8600 + }, + { + "epoch": 1.11, + "learning_rate": 0.00013503291596747127, + "loss": 0.688, + "step": 8620 + }, + { + "epoch": 1.11, + "learning_rate": 0.00013464566929133857, + "loss": 0.693, + "step": 8640 + }, + { + "epoch": 1.11, + "learning_rate": 0.00013425842261520588, + "loss": 0.6815, + "step": 8660 + }, + { + "epoch": 1.11, + "learning_rate": 0.00013387117593907319, + "loss": 0.6828, + "step": 8680 + }, + { + "epoch": 1.12, + "learning_rate": 0.00013348392926294046, + "loss": 0.6834, + "step": 8700 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001330966825868078, + "loss": 0.6839, + "step": 8720 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001327094359106751, + "loss": 0.6838, + "step": 8740 + }, + { + "epoch": 1.12, + "learning_rate": 0.00013232218923454238, + "loss": 0.6847, + "step": 8760 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001319349425584097, + "loss": 0.6894, + "step": 8780 + }, + { + "epoch": 1.13, + "learning_rate": 0.000131547695882277, + "loss": 0.6922, + "step": 8800 + }, + { + "epoch": 1.13, + "eval_loss": 0.698199450969696, + "eval_runtime": 177.0428, + "eval_samples_per_second": 11.297, + "eval_steps_per_second": 1.412, + "step": 8800 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001311604492061443, + "loss": 0.6904, + "step": 8820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001307732025300116, + "loss": 0.6854, + "step": 8840 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001303859558538789, + "loss": 0.6877, + "step": 8860 + }, + { + "epoch": 1.14, + "learning_rate": 0.00012999870917774622, + "loss": 
0.6857, + "step": 8880 + }, + { + "epoch": 1.14, + "learning_rate": 0.00012961146250161352, + "loss": 0.6856, + "step": 8900 + }, + { + "epoch": 1.14, + "learning_rate": 0.00012922421582548083, + "loss": 0.687, + "step": 8920 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001288369691493481, + "loss": 0.6905, + "step": 8940 + }, + { + "epoch": 1.15, + "learning_rate": 0.00012844972247321544, + "loss": 0.6865, + "step": 8960 + }, + { + "epoch": 1.15, + "learning_rate": 0.00012806247579708274, + "loss": 0.6829, + "step": 8980 + }, + { + "epoch": 1.15, + "learning_rate": 0.00012767522912095002, + "loss": 0.696, + "step": 9000 + }, + { + "epoch": 1.15, + "eval_loss": 0.6971157789230347, + "eval_runtime": 177.1296, + "eval_samples_per_second": 11.291, + "eval_steps_per_second": 1.411, + "step": 9000 + }, + { + "epoch": 1.16, + "learning_rate": 0.00012728798244481733, + "loss": 0.6825, + "step": 9020 + }, + { + "epoch": 1.16, + "learning_rate": 0.00012690073576868463, + "loss": 0.6844, + "step": 9040 + }, + { + "epoch": 1.16, + "learning_rate": 0.00012651348909255194, + "loss": 0.6853, + "step": 9060 + }, + { + "epoch": 1.16, + "learning_rate": 0.00012612624241641925, + "loss": 0.6889, + "step": 9080 + }, + { + "epoch": 1.17, + "learning_rate": 0.00012573899574028655, + "loss": 0.6848, + "step": 9100 + }, + { + "epoch": 1.17, + "learning_rate": 0.00012535174906415386, + "loss": 0.6953, + "step": 9120 + }, + { + "epoch": 1.17, + "learning_rate": 0.00012496450238802116, + "loss": 0.6944, + "step": 9140 + }, + { + "epoch": 1.17, + "learning_rate": 0.00012457725571188847, + "loss": 0.6893, + "step": 9160 + }, + { + "epoch": 1.18, + "learning_rate": 0.00012419000903575575, + "loss": 0.6831, + "step": 9180 + }, + { + "epoch": 1.18, + "learning_rate": 0.00012380276235962308, + "loss": 0.683, + "step": 9200 + }, + { + "epoch": 1.18, + "eval_loss": 0.6971254944801331, + "eval_runtime": 177.2118, + "eval_samples_per_second": 11.286, + "eval_steps_per_second": 1.411, + "step": 
9200 + }, + { + "epoch": 1.18, + "learning_rate": 0.00012341551568349039, + "loss": 0.6782, + "step": 9220 + }, + { + "epoch": 1.19, + "learning_rate": 0.00012302826900735766, + "loss": 0.6962, + "step": 9240 + }, + { + "epoch": 1.19, + "learning_rate": 0.00012264102233122497, + "loss": 0.6808, + "step": 9260 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001222537756550923, + "loss": 0.6931, + "step": 9280 + }, + { + "epoch": 1.19, + "learning_rate": 0.00012186652897895958, + "loss": 0.6878, + "step": 9300 + }, + { + "epoch": 1.2, + "learning_rate": 0.00012147928230282689, + "loss": 0.6855, + "step": 9320 + }, + { + "epoch": 1.2, + "learning_rate": 0.00012109203562669421, + "loss": 0.6865, + "step": 9340 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001207047889505615, + "loss": 0.6945, + "step": 9360 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001203175422744288, + "loss": 0.6862, + "step": 9380 + }, + { + "epoch": 1.21, + "learning_rate": 0.00011993029559829611, + "loss": 0.6932, + "step": 9400 + }, + { + "epoch": 1.21, + "eval_loss": 0.6963634490966797, + "eval_runtime": 177.2999, + "eval_samples_per_second": 11.28, + "eval_steps_per_second": 1.41, + "step": 9400 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001195430489221634, + "loss": 0.6793, + "step": 9420 + }, + { + "epoch": 1.21, + "learning_rate": 0.00011915580224603071, + "loss": 0.6827, + "step": 9440 + }, + { + "epoch": 1.21, + "learning_rate": 0.00011876855556989803, + "loss": 0.6836, + "step": 9460 + }, + { + "epoch": 1.22, + "learning_rate": 0.00011838130889376532, + "loss": 0.6876, + "step": 9480 + }, + { + "epoch": 1.22, + "learning_rate": 0.00011799406221763263, + "loss": 0.6949, + "step": 9500 + }, + { + "epoch": 1.22, + "learning_rate": 0.00011760681554149993, + "loss": 0.6809, + "step": 9520 + }, + { + "epoch": 1.22, + "learning_rate": 0.00011721956886536722, + "loss": 0.6923, + "step": 9540 + }, + { + "epoch": 1.23, + "learning_rate": 0.00011683232218923453, + "loss": 0.6975, + 
"step": 9560 + }, + { + "epoch": 1.23, + "learning_rate": 0.00011644507551310185, + "loss": 0.6981, + "step": 9580 + }, + { + "epoch": 1.23, + "learning_rate": 0.00011605782883696914, + "loss": 0.6843, + "step": 9600 + }, + { + "epoch": 1.23, + "eval_loss": 0.6958213448524475, + "eval_runtime": 177.1541, + "eval_samples_per_second": 11.29, + "eval_steps_per_second": 1.411, + "step": 9600 + }, + { + "epoch": 1.23, + "learning_rate": 0.00011567058216083645, + "loss": 0.6904, + "step": 9620 + }, + { + "epoch": 1.24, + "learning_rate": 0.00011528333548470375, + "loss": 0.6761, + "step": 9640 + }, + { + "epoch": 1.24, + "learning_rate": 0.00011489608880857104, + "loss": 0.6933, + "step": 9660 + }, + { + "epoch": 1.24, + "learning_rate": 0.00011450884213243835, + "loss": 0.6913, + "step": 9680 + }, + { + "epoch": 1.24, + "learning_rate": 0.00011412159545630567, + "loss": 0.6958, + "step": 9700 + }, + { + "epoch": 1.25, + "learning_rate": 0.00011373434878017296, + "loss": 0.6902, + "step": 9720 + }, + { + "epoch": 1.25, + "learning_rate": 0.00011334710210404027, + "loss": 0.6796, + "step": 9740 + }, + { + "epoch": 1.25, + "learning_rate": 0.00011295985542790757, + "loss": 0.6906, + "step": 9760 + }, + { + "epoch": 1.25, + "learning_rate": 0.00011257260875177487, + "loss": 0.6882, + "step": 9780 + }, + { + "epoch": 1.26, + "learning_rate": 0.00011218536207564217, + "loss": 0.6856, + "step": 9800 + }, + { + "epoch": 1.26, + "eval_loss": 0.6958709359169006, + "eval_runtime": 177.1197, + "eval_samples_per_second": 11.292, + "eval_steps_per_second": 1.411, + "step": 9800 + }, + { + "epoch": 1.26, + "learning_rate": 0.00011179811539950949, + "loss": 0.681, + "step": 9820 + }, + { + "epoch": 1.26, + "learning_rate": 0.00011141086872337678, + "loss": 0.6824, + "step": 9840 + }, + { + "epoch": 1.26, + "learning_rate": 0.00011102362204724409, + "loss": 0.6921, + "step": 9860 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001106363753711114, + "loss": 0.6862, + "step": 9880 + }, + 
{ + "epoch": 1.27, + "learning_rate": 0.00011024912869497869, + "loss": 0.6869, + "step": 9900 + }, + { + "epoch": 1.27, + "learning_rate": 0.00010986188201884599, + "loss": 0.6867, + "step": 9920 + }, + { + "epoch": 1.27, + "learning_rate": 0.00010947463534271328, + "loss": 0.6885, + "step": 9940 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001090873886665806, + "loss": 0.6801, + "step": 9960 + }, + { + "epoch": 1.28, + "learning_rate": 0.00010870014199044791, + "loss": 0.684, + "step": 9980 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001083128953143152, + "loss": 0.6885, + "step": 10000 + }, + { + "epoch": 1.28, + "eval_loss": 0.6948391795158386, + "eval_runtime": 184.1668, + "eval_samples_per_second": 10.86, + "eval_steps_per_second": 1.357, + "step": 10000 + }, + { + "epoch": 1.29, + "learning_rate": 0.00010792564863818251, + "loss": 0.685, + "step": 10020 + }, + { + "epoch": 1.29, + "learning_rate": 0.00010753840196204981, + "loss": 0.6849, + "step": 10040 + }, + { + "epoch": 1.29, + "learning_rate": 0.00010715115528591712, + "loss": 0.6901, + "step": 10060 + }, + { + "epoch": 1.29, + "learning_rate": 0.00010678327094359106, + "loss": 0.6926, + "step": 10080 + }, + { + "epoch": 1.3, + "learning_rate": 0.00010639602426745837, + "loss": 0.6863, + "step": 10100 + }, + { + "epoch": 1.3, + "learning_rate": 0.00010600877759132567, + "loss": 0.6846, + "step": 10120 + }, + { + "epoch": 1.3, + "learning_rate": 0.00010562153091519296, + "loss": 0.6859, + "step": 10140 + }, + { + "epoch": 1.3, + "learning_rate": 0.00010523428423906027, + "loss": 0.6887, + "step": 10160 + }, + { + "epoch": 1.31, + "learning_rate": 0.00010484703756292759, + "loss": 0.689, + "step": 10180 + }, + { + "epoch": 1.31, + "learning_rate": 0.00010445979088679488, + "loss": 0.6795, + "step": 10200 + }, + { + "epoch": 1.31, + "eval_loss": 0.6949850916862488, + "eval_runtime": 177.1947, + "eval_samples_per_second": 11.287, + "eval_steps_per_second": 1.411, + "step": 10200 + }, + { + "epoch": 
1.31, + "learning_rate": 0.00010407254421066219, + "loss": 0.6874, + "step": 10220 + }, + { + "epoch": 1.31, + "learning_rate": 0.00010368529753452949, + "loss": 0.6875, + "step": 10240 + }, + { + "epoch": 1.32, + "learning_rate": 0.00010329805085839678, + "loss": 0.6842, + "step": 10260 + }, + { + "epoch": 1.32, + "learning_rate": 0.00010291080418226409, + "loss": 0.6842, + "step": 10280 + }, + { + "epoch": 1.32, + "learning_rate": 0.00010252355750613141, + "loss": 0.6834, + "step": 10300 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001021363108299987, + "loss": 0.6894, + "step": 10320 + }, + { + "epoch": 1.33, + "learning_rate": 0.00010174906415386601, + "loss": 0.6814, + "step": 10340 + }, + { + "epoch": 1.33, + "learning_rate": 0.00010136181747773331, + "loss": 0.6731, + "step": 10360 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001009745708016006, + "loss": 0.6955, + "step": 10380 + }, + { + "epoch": 1.33, + "learning_rate": 0.00010058732412546791, + "loss": 0.6884, + "step": 10400 + }, + { + "epoch": 1.33, + "eval_loss": 0.6947998404502869, + "eval_runtime": 183.6797, + "eval_samples_per_second": 10.889, + "eval_steps_per_second": 1.361, + "step": 10400 + }, + { + "epoch": 1.34, + "learning_rate": 0.00010020007744933523, + "loss": 0.6844, + "step": 10420 + }, + { + "epoch": 1.34, + "learning_rate": 9.981283077320252e-05, + "loss": 0.6795, + "step": 10440 + }, + { + "epoch": 1.34, + "learning_rate": 9.942558409706983e-05, + "loss": 0.6783, + "step": 10460 + }, + { + "epoch": 1.34, + "learning_rate": 9.903833742093713e-05, + "loss": 0.6951, + "step": 10480 + }, + { + "epoch": 1.35, + "learning_rate": 9.865109074480443e-05, + "loss": 0.6891, + "step": 10500 + }, + { + "epoch": 1.35, + "learning_rate": 9.826384406867173e-05, + "loss": 0.6899, + "step": 10520 + }, + { + "epoch": 1.35, + "learning_rate": 9.787659739253905e-05, + "loss": 0.6905, + "step": 10540 + }, + { + "epoch": 1.35, + "learning_rate": 9.748935071640634e-05, + "loss": 0.6786, + "step": 
10560 + }, + { + "epoch": 1.36, + "learning_rate": 9.710210404027365e-05, + "loss": 0.6884, + "step": 10580 + }, + { + "epoch": 1.36, + "learning_rate": 9.671485736414096e-05, + "loss": 0.6757, + "step": 10600 + }, + { + "epoch": 1.36, + "eval_loss": 0.6941512227058411, + "eval_runtime": 179.4312, + "eval_samples_per_second": 11.146, + "eval_steps_per_second": 1.393, + "step": 10600 + }, + { + "epoch": 1.36, + "learning_rate": 9.632761068800825e-05, + "loss": 0.6844, + "step": 10620 + }, + { + "epoch": 1.36, + "learning_rate": 9.594036401187555e-05, + "loss": 0.6829, + "step": 10640 + }, + { + "epoch": 1.37, + "learning_rate": 9.555311733574287e-05, + "loss": 0.7007, + "step": 10660 + }, + { + "epoch": 1.37, + "learning_rate": 9.516587065961016e-05, + "loss": 0.6989, + "step": 10680 + }, + { + "epoch": 1.37, + "learning_rate": 9.477862398347747e-05, + "loss": 0.6783, + "step": 10700 + }, + { + "epoch": 1.37, + "learning_rate": 9.439137730734476e-05, + "loss": 0.6783, + "step": 10720 + }, + { + "epoch": 1.38, + "learning_rate": 9.400413063121207e-05, + "loss": 0.6879, + "step": 10740 + }, + { + "epoch": 1.38, + "learning_rate": 9.361688395507937e-05, + "loss": 0.6801, + "step": 10760 + }, + { + "epoch": 1.38, + "learning_rate": 9.322963727894667e-05, + "loss": 0.6786, + "step": 10780 + }, + { + "epoch": 1.39, + "learning_rate": 9.284239060281399e-05, + "loss": 0.6882, + "step": 10800 + }, + { + "epoch": 1.39, + "eval_loss": 0.6937060356140137, + "eval_runtime": 178.6714, + "eval_samples_per_second": 11.194, + "eval_steps_per_second": 1.399, + "step": 10800 + }, + { + "epoch": 1.39, + "learning_rate": 9.245514392668129e-05, + "loss": 0.6796, + "step": 10820 + }, + { + "epoch": 1.39, + "learning_rate": 9.206789725054858e-05, + "loss": 0.6886, + "step": 10840 + }, + { + "epoch": 1.39, + "learning_rate": 9.168065057441589e-05, + "loss": 0.683, + "step": 10860 + }, + { + "epoch": 1.4, + "learning_rate": 9.129340389828321e-05, + "loss": 0.6786, + "step": 10880 + }, + { + 
"epoch": 1.4, + "learning_rate": 9.09061572221505e-05, + "loss": 0.6843, + "step": 10900 + }, + { + "epoch": 1.4, + "learning_rate": 9.05189105460178e-05, + "loss": 0.6795, + "step": 10920 + }, + { + "epoch": 1.4, + "learning_rate": 9.013166386988511e-05, + "loss": 0.6813, + "step": 10940 + }, + { + "epoch": 1.41, + "learning_rate": 8.97444171937524e-05, + "loss": 0.687, + "step": 10960 + }, + { + "epoch": 1.41, + "learning_rate": 8.935717051761971e-05, + "loss": 0.6872, + "step": 10980 + }, + { + "epoch": 1.41, + "learning_rate": 8.896992384148703e-05, + "loss": 0.6746, + "step": 11000 + }, + { + "epoch": 1.41, + "eval_loss": 0.6936533451080322, + "eval_runtime": 178.4177, + "eval_samples_per_second": 11.21, + "eval_steps_per_second": 1.401, + "step": 11000 + }, + { + "epoch": 1.41, + "learning_rate": 8.858267716535432e-05, + "loss": 0.6747, + "step": 11020 + }, + { + "epoch": 1.42, + "learning_rate": 8.819543048922163e-05, + "loss": 0.6975, + "step": 11040 + }, + { + "epoch": 1.42, + "learning_rate": 8.780818381308893e-05, + "loss": 0.6806, + "step": 11060 + }, + { + "epoch": 1.42, + "learning_rate": 8.742093713695623e-05, + "loss": 0.6789, + "step": 11080 + }, + { + "epoch": 1.42, + "learning_rate": 8.703369046082353e-05, + "loss": 0.6851, + "step": 11100 + }, + { + "epoch": 1.43, + "learning_rate": 8.664644378469085e-05, + "loss": 0.6836, + "step": 11120 + }, + { + "epoch": 1.43, + "learning_rate": 8.625919710855814e-05, + "loss": 0.6891, + "step": 11140 + }, + { + "epoch": 1.43, + "learning_rate": 8.587195043242545e-05, + "loss": 0.6898, + "step": 11160 + }, + { + "epoch": 1.43, + "learning_rate": 8.548470375629275e-05, + "loss": 0.6798, + "step": 11180 + }, + { + "epoch": 1.44, + "learning_rate": 8.509745708016005e-05, + "loss": 0.6822, + "step": 11200 + }, + { + "epoch": 1.44, + "eval_loss": 0.6932746767997742, + "eval_runtime": 178.4279, + "eval_samples_per_second": 11.209, + "eval_steps_per_second": 1.401, + "step": 11200 + }, + { + "epoch": 1.44, + 
"learning_rate": 8.471021040402735e-05, + "loss": 0.6776, + "step": 11220 + }, + { + "epoch": 1.44, + "learning_rate": 8.432296372789467e-05, + "loss": 0.6835, + "step": 11240 + }, + { + "epoch": 1.44, + "learning_rate": 8.393571705176196e-05, + "loss": 0.6852, + "step": 11260 + }, + { + "epoch": 1.45, + "learning_rate": 8.354847037562927e-05, + "loss": 0.686, + "step": 11280 + }, + { + "epoch": 1.45, + "learning_rate": 8.316122369949657e-05, + "loss": 0.6884, + "step": 11300 + }, + { + "epoch": 1.45, + "learning_rate": 8.277397702336387e-05, + "loss": 0.6811, + "step": 11320 + }, + { + "epoch": 1.45, + "learning_rate": 8.238673034723117e-05, + "loss": 0.6751, + "step": 11340 + }, + { + "epoch": 1.46, + "learning_rate": 8.199948367109849e-05, + "loss": 0.6837, + "step": 11360 + }, + { + "epoch": 1.46, + "learning_rate": 8.161223699496578e-05, + "loss": 0.6839, + "step": 11380 + }, + { + "epoch": 1.46, + "learning_rate": 8.122499031883309e-05, + "loss": 0.6804, + "step": 11400 + }, + { + "epoch": 1.46, + "eval_loss": 0.6925450563430786, + "eval_runtime": 177.1737, + "eval_samples_per_second": 11.288, + "eval_steps_per_second": 1.411, + "step": 11400 + }, + { + "epoch": 1.46, + "learning_rate": 8.08377436427004e-05, + "loss": 0.6885, + "step": 11420 + }, + { + "epoch": 1.47, + "learning_rate": 8.045049696656769e-05, + "loss": 0.6907, + "step": 11440 + }, + { + "epoch": 1.47, + "learning_rate": 8.0063250290435e-05, + "loss": 0.6868, + "step": 11460 + }, + { + "epoch": 1.47, + "learning_rate": 7.967600361430231e-05, + "loss": 0.6945, + "step": 11480 + }, + { + "epoch": 1.47, + "learning_rate": 7.92887569381696e-05, + "loss": 0.6851, + "step": 11500 + }, + { + "epoch": 1.48, + "learning_rate": 7.890151026203691e-05, + "loss": 0.6878, + "step": 11520 + }, + { + "epoch": 1.48, + "learning_rate": 7.851426358590422e-05, + "loss": 0.6955, + "step": 11540 + }, + { + "epoch": 1.48, + "learning_rate": 7.812701690977151e-05, + "loss": 0.6783, + "step": 11560 + }, + { + "epoch": 
1.49, + "learning_rate": 7.773977023363881e-05, + "loss": 0.6857, + "step": 11580 + }, + { + "epoch": 1.49, + "learning_rate": 7.735252355750613e-05, + "loss": 0.6828, + "step": 11600 + }, + { + "epoch": 1.49, + "eval_loss": 0.6924574971199036, + "eval_runtime": 177.3841, + "eval_samples_per_second": 11.275, + "eval_steps_per_second": 1.409, + "step": 11600 + }, + { + "epoch": 1.49, + "learning_rate": 7.696527688137343e-05, + "loss": 0.6796, + "step": 11620 + }, + { + "epoch": 1.49, + "learning_rate": 7.657803020524073e-05, + "loss": 0.6823, + "step": 11640 + }, + { + "epoch": 1.5, + "learning_rate": 7.619078352910804e-05, + "loss": 0.6854, + "step": 11660 + }, + { + "epoch": 1.5, + "learning_rate": 7.580353685297533e-05, + "loss": 0.6797, + "step": 11680 + }, + { + "epoch": 1.5, + "learning_rate": 7.541629017684265e-05, + "loss": 0.6811, + "step": 11700 + }, + { + "epoch": 1.5, + "learning_rate": 7.502904350070995e-05, + "loss": 0.6775, + "step": 11720 + }, + { + "epoch": 1.51, + "learning_rate": 7.464179682457725e-05, + "loss": 0.687, + "step": 11740 + }, + { + "epoch": 1.51, + "learning_rate": 7.425455014844455e-05, + "loss": 0.6859, + "step": 11760 + }, + { + "epoch": 1.51, + "learning_rate": 7.386730347231186e-05, + "loss": 0.683, + "step": 11780 + }, + { + "epoch": 1.51, + "learning_rate": 7.348005679617916e-05, + "loss": 0.6812, + "step": 11800 + }, + { + "epoch": 1.51, + "eval_loss": 0.692126452922821, + "eval_runtime": 182.938, + "eval_samples_per_second": 10.933, + "eval_steps_per_second": 1.367, + "step": 11800 + }, + { + "epoch": 1.52, + "learning_rate": 7.309281012004647e-05, + "loss": 0.688, + "step": 11820 + }, + { + "epoch": 1.52, + "learning_rate": 7.270556344391376e-05, + "loss": 0.6774, + "step": 11840 + }, + { + "epoch": 1.52, + "learning_rate": 7.231831676778107e-05, + "loss": 0.6806, + "step": 11860 + }, + { + "epoch": 1.52, + "learning_rate": 7.193107009164837e-05, + "loss": 0.6756, + "step": 11880 + }, + { + "epoch": 1.53, + "learning_rate": 
7.154382341551568e-05, + "loss": 0.6856, + "step": 11900 + }, + { + "epoch": 1.53, + "learning_rate": 7.115657673938298e-05, + "loss": 0.6822, + "step": 11920 + }, + { + "epoch": 1.53, + "learning_rate": 7.076933006325029e-05, + "loss": 0.6769, + "step": 11940 + }, + { + "epoch": 1.53, + "learning_rate": 7.038208338711758e-05, + "loss": 0.6759, + "step": 11960 + }, + { + "epoch": 1.54, + "learning_rate": 6.999483671098489e-05, + "loss": 0.6854, + "step": 11980 + }, + { + "epoch": 1.54, + "learning_rate": 6.96075900348522e-05, + "loss": 0.6855, + "step": 12000 + }, + { + "epoch": 1.54, + "eval_loss": 0.6914573907852173, + "eval_runtime": 177.6919, + "eval_samples_per_second": 11.255, + "eval_steps_per_second": 1.407, + "step": 12000 + }, + { + "epoch": 1.54, + "learning_rate": 6.92203433587195e-05, + "loss": 0.6816, + "step": 12020 + }, + { + "epoch": 1.54, + "learning_rate": 6.88330966825868e-05, + "loss": 0.6801, + "step": 12040 + }, + { + "epoch": 1.55, + "learning_rate": 6.844585000645411e-05, + "loss": 0.6818, + "step": 12060 + }, + { + "epoch": 1.55, + "learning_rate": 6.80586033303214e-05, + "loss": 0.6808, + "step": 12080 + }, + { + "epoch": 1.55, + "learning_rate": 6.767135665418872e-05, + "loss": 0.6849, + "step": 12100 + }, + { + "epoch": 1.55, + "learning_rate": 6.728410997805601e-05, + "loss": 0.6902, + "step": 12120 + }, + { + "epoch": 1.56, + "learning_rate": 6.689686330192332e-05, + "loss": 0.6795, + "step": 12140 + }, + { + "epoch": 1.56, + "learning_rate": 6.652897895959726e-05, + "loss": 0.6783, + "step": 12160 + }, + { + "epoch": 1.56, + "learning_rate": 6.614173228346455e-05, + "loss": 0.6839, + "step": 12180 + }, + { + "epoch": 1.56, + "learning_rate": 6.575448560733186e-05, + "loss": 0.6875, + "step": 12200 + }, + { + "epoch": 1.56, + "eval_loss": 0.6915743947029114, + "eval_runtime": 177.7146, + "eval_samples_per_second": 11.254, + "eval_steps_per_second": 1.407, + "step": 12200 + }, + { + "epoch": 1.57, + "learning_rate": 
6.536723893119917e-05, + "loss": 0.687, + "step": 12220 + }, + { + "epoch": 1.57, + "learning_rate": 6.497999225506647e-05, + "loss": 0.683, + "step": 12240 + }, + { + "epoch": 1.57, + "learning_rate": 6.459274557893378e-05, + "loss": 0.6796, + "step": 12260 + }, + { + "epoch": 1.57, + "learning_rate": 6.420549890280108e-05, + "loss": 0.6757, + "step": 12280 + }, + { + "epoch": 1.58, + "learning_rate": 6.381825222666838e-05, + "loss": 0.6901, + "step": 12300 + }, + { + "epoch": 1.58, + "learning_rate": 6.343100555053568e-05, + "loss": 0.6734, + "step": 12320 + }, + { + "epoch": 1.58, + "learning_rate": 6.304375887440299e-05, + "loss": 0.6863, + "step": 12340 + }, + { + "epoch": 1.59, + "learning_rate": 6.265651219827029e-05, + "loss": 0.6769, + "step": 12360 + }, + { + "epoch": 1.59, + "learning_rate": 6.22692655221376e-05, + "loss": 0.678, + "step": 12380 + }, + { + "epoch": 1.59, + "learning_rate": 6.18820188460049e-05, + "loss": 0.6934, + "step": 12400 + }, + { + "epoch": 1.59, + "eval_loss": 0.6910441517829895, + "eval_runtime": 177.091, + "eval_samples_per_second": 11.294, + "eval_steps_per_second": 1.412, + "step": 12400 + }, + { + "epoch": 1.59, + "learning_rate": 6.14947721698722e-05, + "loss": 0.6925, + "step": 12420 + }, + { + "epoch": 1.6, + "learning_rate": 6.11075254937395e-05, + "loss": 0.6797, + "step": 12440 + }, + { + "epoch": 1.6, + "learning_rate": 6.0720278817606814e-05, + "loss": 0.6839, + "step": 12460 + }, + { + "epoch": 1.6, + "learning_rate": 6.0333032141474113e-05, + "loss": 0.6825, + "step": 12480 + }, + { + "epoch": 1.6, + "learning_rate": 5.994578546534141e-05, + "loss": 0.6816, + "step": 12500 + }, + { + "epoch": 1.61, + "learning_rate": 5.9558538789208725e-05, + "loss": 0.6844, + "step": 12520 + }, + { + "epoch": 1.61, + "learning_rate": 5.9171292113076024e-05, + "loss": 0.6833, + "step": 12540 + }, + { + "epoch": 1.61, + "learning_rate": 5.878404543694332e-05, + "loss": 0.6827, + "step": 12560 + }, + { + "epoch": 1.61, + 
"learning_rate": 5.8396798760810635e-05, + "loss": 0.6802, + "step": 12580 + }, + { + "epoch": 1.62, + "learning_rate": 5.8009552084677934e-05, + "loss": 0.6837, + "step": 12600 + }, + { + "epoch": 1.62, + "eval_loss": 0.690994381904602, + "eval_runtime": 177.3691, + "eval_samples_per_second": 11.276, + "eval_steps_per_second": 1.409, + "step": 12600 + }, + { + "epoch": 1.62, + "learning_rate": 5.762230540854523e-05, + "loss": 0.6781, + "step": 12620 + }, + { + "epoch": 1.62, + "learning_rate": 5.7235058732412546e-05, + "loss": 0.6816, + "step": 12640 + }, + { + "epoch": 1.62, + "learning_rate": 5.6847812056279845e-05, + "loss": 0.6823, + "step": 12660 + }, + { + "epoch": 1.63, + "learning_rate": 5.6460565380147144e-05, + "loss": 0.6824, + "step": 12680 + }, + { + "epoch": 1.63, + "learning_rate": 5.6073318704014456e-05, + "loss": 0.6786, + "step": 12700 + }, + { + "epoch": 1.63, + "learning_rate": 5.5686072027881755e-05, + "loss": 0.6822, + "step": 12720 + }, + { + "epoch": 1.63, + "learning_rate": 5.5298825351749054e-05, + "loss": 0.6792, + "step": 12740 + }, + { + "epoch": 1.64, + "learning_rate": 5.491157867561637e-05, + "loss": 0.6797, + "step": 12760 + }, + { + "epoch": 1.64, + "learning_rate": 5.4524331999483666e-05, + "loss": 0.6814, + "step": 12780 + }, + { + "epoch": 1.64, + "learning_rate": 5.4137085323350965e-05, + "loss": 0.6827, + "step": 12800 + }, + { + "epoch": 1.64, + "eval_loss": 0.6906899809837341, + "eval_runtime": 177.4169, + "eval_samples_per_second": 11.273, + "eval_steps_per_second": 1.409, + "step": 12800 + }, + { + "epoch": 1.64, + "learning_rate": 5.374983864721828e-05, + "loss": 0.6776, + "step": 12820 + }, + { + "epoch": 1.65, + "learning_rate": 5.3362591971085576e-05, + "loss": 0.6877, + "step": 12840 + }, + { + "epoch": 1.65, + "learning_rate": 5.297534529495288e-05, + "loss": 0.6786, + "step": 12860 + }, + { + "epoch": 1.65, + "learning_rate": 5.258809861882019e-05, + "loss": 0.6853, + "step": 12880 + }, + { + "epoch": 1.65, + 
"learning_rate": 5.2200851942687486e-05, + "loss": 0.6843, + "step": 12900 + }, + { + "epoch": 1.66, + "learning_rate": 5.181360526655479e-05, + "loss": 0.6872, + "step": 12920 + }, + { + "epoch": 1.66, + "learning_rate": 5.14263585904221e-05, + "loss": 0.6862, + "step": 12940 + }, + { + "epoch": 1.66, + "learning_rate": 5.10391119142894e-05, + "loss": 0.6804, + "step": 12960 + }, + { + "epoch": 1.66, + "learning_rate": 5.06518652381567e-05, + "loss": 0.6786, + "step": 12980 + }, + { + "epoch": 1.67, + "learning_rate": 5.026461856202401e-05, + "loss": 0.6839, + "step": 13000 + }, + { + "epoch": 1.67, + "eval_loss": 0.6902768015861511, + "eval_runtime": 177.1355, + "eval_samples_per_second": 11.291, + "eval_steps_per_second": 1.411, + "step": 13000 + }, + { + "epoch": 1.67, + "learning_rate": 4.987737188589131e-05, + "loss": 0.682, + "step": 13020 + }, + { + "epoch": 1.67, + "learning_rate": 4.949012520975861e-05, + "loss": 0.6835, + "step": 13040 + }, + { + "epoch": 1.68, + "learning_rate": 4.910287853362592e-05, + "loss": 0.6822, + "step": 13060 + }, + { + "epoch": 1.68, + "learning_rate": 4.871563185749322e-05, + "loss": 0.6827, + "step": 13080 + }, + { + "epoch": 1.68, + "learning_rate": 4.8328385181360523e-05, + "loss": 0.6744, + "step": 13100 + }, + { + "epoch": 1.68, + "learning_rate": 4.794113850522782e-05, + "loss": 0.6817, + "step": 13120 + }, + { + "epoch": 1.69, + "learning_rate": 4.755389182909513e-05, + "loss": 0.6765, + "step": 13140 + }, + { + "epoch": 1.69, + "learning_rate": 4.7166645152962434e-05, + "loss": 0.682, + "step": 13160 + }, + { + "epoch": 1.69, + "learning_rate": 4.677939847682973e-05, + "loss": 0.6849, + "step": 13180 + }, + { + "epoch": 1.69, + "learning_rate": 4.639215180069704e-05, + "loss": 0.686, + "step": 13200 + }, + { + "epoch": 1.69, + "eval_loss": 0.6899891495704651, + "eval_runtime": 183.0495, + "eval_samples_per_second": 10.926, + "eval_steps_per_second": 1.366, + "step": 13200 + }, + { + "epoch": 1.7, + "learning_rate": 
4.6004905124564344e-05, + "loss": 0.6787, + "step": 13220 + }, + { + "epoch": 1.7, + "learning_rate": 4.561765844843164e-05, + "loss": 0.6786, + "step": 13240 + }, + { + "epoch": 1.7, + "learning_rate": 4.523041177229895e-05, + "loss": 0.6913, + "step": 13260 + }, + { + "epoch": 1.7, + "learning_rate": 4.4843165096166255e-05, + "loss": 0.6721, + "step": 13280 + }, + { + "epoch": 1.71, + "learning_rate": 4.4455918420033554e-05, + "loss": 0.6783, + "step": 13300 + }, + { + "epoch": 1.71, + "learning_rate": 4.4068671743900866e-05, + "loss": 0.687, + "step": 13320 + }, + { + "epoch": 1.71, + "learning_rate": 4.3681425067768165e-05, + "loss": 0.6815, + "step": 13340 + }, + { + "epoch": 1.71, + "learning_rate": 4.3294178391635464e-05, + "loss": 0.6786, + "step": 13360 + }, + { + "epoch": 1.72, + "learning_rate": 4.290693171550278e-05, + "loss": 0.6845, + "step": 13380 + }, + { + "epoch": 1.72, + "learning_rate": 4.2519685039370076e-05, + "loss": 0.6817, + "step": 13400 + }, + { + "epoch": 1.72, + "eval_loss": 0.6897545456886292, + "eval_runtime": 177.4673, + "eval_samples_per_second": 11.27, + "eval_steps_per_second": 1.409, + "step": 13400 + }, + { + "epoch": 1.72, + "learning_rate": 4.2132438363237375e-05, + "loss": 0.6783, + "step": 13420 + }, + { + "epoch": 1.72, + "learning_rate": 4.174519168710469e-05, + "loss": 0.6775, + "step": 13440 + }, + { + "epoch": 1.73, + "learning_rate": 4.1357945010971986e-05, + "loss": 0.6752, + "step": 13460 + }, + { + "epoch": 1.73, + "learning_rate": 4.0970698334839285e-05, + "loss": 0.6732, + "step": 13480 + }, + { + "epoch": 1.73, + "learning_rate": 4.05834516587066e-05, + "loss": 0.6854, + "step": 13500 + }, + { + "epoch": 1.73, + "learning_rate": 4.0196204982573896e-05, + "loss": 0.6814, + "step": 13520 + }, + { + "epoch": 1.74, + "learning_rate": 3.9808958306441195e-05, + "loss": 0.6832, + "step": 13540 + }, + { + "epoch": 1.74, + "learning_rate": 3.942171163030851e-05, + "loss": 0.6764, + "step": 13560 + }, + { + "epoch": 1.74, 
+ "learning_rate": 3.903446495417581e-05, + "loss": 0.6797, + "step": 13580 + }, + { + "epoch": 1.74, + "learning_rate": 3.8647218278043106e-05, + "loss": 0.6786, + "step": 13600 + }, + { + "epoch": 1.74, + "eval_loss": 0.6894007325172424, + "eval_runtime": 177.6931, + "eval_samples_per_second": 11.255, + "eval_steps_per_second": 1.407, + "step": 13600 + }, + { + "epoch": 1.75, + "learning_rate": 3.825997160191042e-05, + "loss": 0.6878, + "step": 13620 + }, + { + "epoch": 1.75, + "learning_rate": 3.787272492577772e-05, + "loss": 0.6847, + "step": 13640 + }, + { + "epoch": 1.75, + "learning_rate": 3.748547824964502e-05, + "loss": 0.6773, + "step": 13660 + }, + { + "epoch": 1.75, + "learning_rate": 3.709823157351232e-05, + "loss": 0.6723, + "step": 13680 + }, + { + "epoch": 1.76, + "learning_rate": 3.671098489737963e-05, + "loss": 0.6849, + "step": 13700 + }, + { + "epoch": 1.76, + "learning_rate": 3.6323738221246933e-05, + "loss": 0.6788, + "step": 13720 + }, + { + "epoch": 1.76, + "learning_rate": 3.593649154511423e-05, + "loss": 0.6851, + "step": 13740 + }, + { + "epoch": 1.76, + "learning_rate": 3.554924486898154e-05, + "loss": 0.6842, + "step": 13760 + }, + { + "epoch": 1.77, + "learning_rate": 3.5161998192848844e-05, + "loss": 0.6763, + "step": 13780 + }, + { + "epoch": 1.77, + "learning_rate": 3.477475151671614e-05, + "loss": 0.6795, + "step": 13800 + }, + { + "epoch": 1.77, + "eval_loss": 0.6892591714859009, + "eval_runtime": 178.329, + "eval_samples_per_second": 11.215, + "eval_steps_per_second": 1.402, + "step": 13800 + }, + { + "epoch": 1.77, + "learning_rate": 3.438750484058345e-05, + "loss": 0.6804, + "step": 13820 + }, + { + "epoch": 1.78, + "learning_rate": 3.4000258164450754e-05, + "loss": 0.6873, + "step": 13840 + }, + { + "epoch": 1.78, + "learning_rate": 3.361301148831805e-05, + "loss": 0.6783, + "step": 13860 + }, + { + "epoch": 1.78, + "learning_rate": 3.322576481218536e-05, + "loss": 0.6843, + "step": 13880 + }, + { + "epoch": 1.78, + 
"learning_rate": 3.2838518136052665e-05, + "loss": 0.6755, + "step": 13900 + }, + { + "epoch": 1.79, + "learning_rate": 3.2451271459919964e-05, + "loss": 0.684, + "step": 13920 + }, + { + "epoch": 1.79, + "learning_rate": 3.206402478378727e-05, + "loss": 0.6828, + "step": 13940 + }, + { + "epoch": 1.79, + "learning_rate": 3.167677810765457e-05, + "loss": 0.6771, + "step": 13960 + }, + { + "epoch": 1.79, + "learning_rate": 3.1289531431521874e-05, + "loss": 0.6818, + "step": 13980 + }, + { + "epoch": 1.8, + "learning_rate": 3.090228475538918e-05, + "loss": 0.6751, + "step": 14000 + }, + { + "epoch": 1.8, + "eval_loss": 0.6889638304710388, + "eval_runtime": 178.8668, + "eval_samples_per_second": 11.182, + "eval_steps_per_second": 1.398, + "step": 14000 + }, + { + "epoch": 1.8, + "learning_rate": 3.0515038079256482e-05, + "loss": 0.6794, + "step": 14020 + }, + { + "epoch": 1.8, + "learning_rate": 3.0127791403123788e-05, + "loss": 0.6652, + "step": 14040 + }, + { + "epoch": 1.8, + "learning_rate": 2.974054472699109e-05, + "loss": 0.6746, + "step": 14060 + }, + { + "epoch": 1.81, + "learning_rate": 2.9353298050858393e-05, + "loss": 0.6847, + "step": 14080 + }, + { + "epoch": 1.81, + "learning_rate": 2.89660513747257e-05, + "loss": 0.6733, + "step": 14100 + }, + { + "epoch": 1.81, + "learning_rate": 2.8578804698593004e-05, + "loss": 0.6794, + "step": 14120 + }, + { + "epoch": 1.81, + "learning_rate": 2.8191558022460303e-05, + "loss": 0.6748, + "step": 14140 + }, + { + "epoch": 1.82, + "learning_rate": 2.780431134632761e-05, + "loss": 0.6668, + "step": 14160 + }, + { + "epoch": 1.82, + "learning_rate": 2.7417064670194915e-05, + "loss": 0.6845, + "step": 14180 + }, + { + "epoch": 1.82, + "learning_rate": 2.7029817994062214e-05, + "loss": 0.6819, + "step": 14200 + }, + { + "epoch": 1.82, + "eval_loss": 0.68879234790802, + "eval_runtime": 178.3854, + "eval_samples_per_second": 11.212, + "eval_steps_per_second": 1.401, + "step": 14200 + }, + { + "epoch": 1.82, + 
"learning_rate": 2.6661933651736152e-05, + "loss": 0.6857, + "step": 14220 + }, + { + "epoch": 1.83, + "learning_rate": 2.6274686975603458e-05, + "loss": 0.6837, + "step": 14240 + }, + { + "epoch": 1.83, + "learning_rate": 2.588744029947076e-05, + "loss": 0.679, + "step": 14260 + }, + { + "epoch": 1.83, + "learning_rate": 2.5500193623338062e-05, + "loss": 0.6809, + "step": 14280 + }, + { + "epoch": 1.83, + "learning_rate": 2.5112946947205368e-05, + "loss": 0.683, + "step": 14300 + }, + { + "epoch": 1.84, + "learning_rate": 2.4725700271072674e-05, + "loss": 0.6787, + "step": 14320 + }, + { + "epoch": 1.84, + "learning_rate": 2.4338453594939973e-05, + "loss": 0.6842, + "step": 14340 + }, + { + "epoch": 1.84, + "learning_rate": 2.395120691880728e-05, + "loss": 0.682, + "step": 14360 + }, + { + "epoch": 1.84, + "learning_rate": 2.3563960242674584e-05, + "loss": 0.6751, + "step": 14380 + }, + { + "epoch": 1.85, + "learning_rate": 2.3176713566541883e-05, + "loss": 0.682, + "step": 14400 + }, + { + "epoch": 1.85, + "eval_loss": 0.6884602308273315, + "eval_runtime": 180.582, + "eval_samples_per_second": 11.075, + "eval_steps_per_second": 1.384, + "step": 14400 + }, + { + "epoch": 1.85, + "learning_rate": 2.278946689040919e-05, + "loss": 0.6728, + "step": 14420 + }, + { + "epoch": 1.85, + "learning_rate": 2.2402220214276495e-05, + "loss": 0.6839, + "step": 14440 + }, + { + "epoch": 1.85, + "learning_rate": 2.2014973538143794e-05, + "loss": 0.6828, + "step": 14460 + }, + { + "epoch": 1.86, + "learning_rate": 2.16277268620111e-05, + "loss": 0.6752, + "step": 14480 + }, + { + "epoch": 1.86, + "learning_rate": 2.1240480185878405e-05, + "loss": 0.682, + "step": 14500 + }, + { + "epoch": 1.86, + "learning_rate": 2.0853233509745704e-05, + "loss": 0.6802, + "step": 14520 + }, + { + "epoch": 1.86, + "learning_rate": 2.046598683361301e-05, + "loss": 0.6809, + "step": 14540 + }, + { + "epoch": 1.87, + "learning_rate": 2.0078740157480316e-05, + "loss": 0.6802, + "step": 14560 + }, + { 
+ "epoch": 1.87, + "learning_rate": 1.9691493481347615e-05, + "loss": 0.6763, + "step": 14580 + }, + { + "epoch": 1.87, + "learning_rate": 1.930424680521492e-05, + "loss": 0.6876, + "step": 14600 + }, + { + "epoch": 1.87, + "eval_loss": 0.6882807612419128, + "eval_runtime": 177.6338, + "eval_samples_per_second": 11.259, + "eval_steps_per_second": 1.407, + "step": 14600 + }, + { + "epoch": 1.88, + "learning_rate": 1.8917000129082226e-05, + "loss": 0.6789, + "step": 14620 + }, + { + "epoch": 1.88, + "learning_rate": 1.8529753452949525e-05, + "loss": 0.6772, + "step": 14640 + }, + { + "epoch": 1.88, + "learning_rate": 1.814250677681683e-05, + "loss": 0.6783, + "step": 14660 + }, + { + "epoch": 1.88, + "learning_rate": 1.7755260100684133e-05, + "loss": 0.6829, + "step": 14680 + }, + { + "epoch": 1.89, + "learning_rate": 1.736801342455144e-05, + "loss": 0.6809, + "step": 14700 + }, + { + "epoch": 1.89, + "learning_rate": 1.698076674841874e-05, + "loss": 0.6766, + "step": 14720 + }, + { + "epoch": 1.89, + "learning_rate": 1.6593520072286043e-05, + "loss": 0.6813, + "step": 14740 + }, + { + "epoch": 1.89, + "learning_rate": 1.620627339615335e-05, + "loss": 0.6793, + "step": 14760 + }, + { + "epoch": 1.9, + "learning_rate": 1.581902672002065e-05, + "loss": 0.6736, + "step": 14780 + }, + { + "epoch": 1.9, + "learning_rate": 1.5431780043887957e-05, + "loss": 0.6842, + "step": 14800 + }, + { + "epoch": 1.9, + "eval_loss": 0.6880614757537842, + "eval_runtime": 177.7541, + "eval_samples_per_second": 11.251, + "eval_steps_per_second": 1.406, + "step": 14800 + }, + { + "epoch": 1.9, + "learning_rate": 1.5044533367755258e-05, + "loss": 0.682, + "step": 14820 + }, + { + "epoch": 1.9, + "learning_rate": 1.4657286691622562e-05, + "loss": 0.6776, + "step": 14840 + }, + { + "epoch": 1.91, + "learning_rate": 1.4270040015489866e-05, + "loss": 0.6776, + "step": 14860 + }, + { + "epoch": 1.91, + "learning_rate": 1.3882793339357168e-05, + "loss": 0.6717, + "step": 14880 + }, + { + "epoch": 
1.91, + "learning_rate": 1.3495546663224472e-05, + "loss": 0.6871, + "step": 14900 + }, + { + "epoch": 1.91, + "learning_rate": 1.3108299987091776e-05, + "loss": 0.6872, + "step": 14920 + }, + { + "epoch": 1.92, + "learning_rate": 1.272105331095908e-05, + "loss": 0.6833, + "step": 14940 + }, + { + "epoch": 1.92, + "learning_rate": 1.2333806634826383e-05, + "loss": 0.6926, + "step": 14960 + }, + { + "epoch": 1.92, + "learning_rate": 1.1946559958693687e-05, + "loss": 0.6741, + "step": 14980 + }, + { + "epoch": 1.92, + "learning_rate": 1.1559313282560991e-05, + "loss": 0.6756, + "step": 15000 + }, + { + "epoch": 1.92, + "eval_loss": 0.6879639625549316, + "eval_runtime": 177.6222, + "eval_samples_per_second": 11.26, + "eval_steps_per_second": 1.407, + "step": 15000 + } + ], + "max_steps": 15594, + "num_train_epochs": 2, + "total_flos": 1.949805622547448e+19, + "trial_name": null, + "trial_params": null +} diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/training_args.bin b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4820a8207e6b86d4107eb87f94c763093a3c7f88 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53d327ea9f712be818d41a24603cd835992a4e9e3612a85caf2415ab699d6a50 +size 3579 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/optimizer.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb39891543c3e6a5a6b4e38a7b4f1d1088c0a160 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c8c200bd8422cf044a90c223c0d176459a14cf9687f0e41a37cfe2ab3b8d6a67 +size 33629893 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/pytorch_model.bin b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..dad33e13ec371db4c18c4d4cc59378aa9ac79c45 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4dcdffef65339b11ffc55279a0ad63ce04e3c09e7247c7f54d1a6bc29d6fff2 +size 16822989 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/rng_state.pth b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ee89a2fe430e3da87b790c66c7d931992232f9e9 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52704287125685e13be49c11d3c17bf338c00ed9fdbcdd6972348fc8e6b6dc29 +size 14575 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/scaler.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4836eb9ff1268f82e6a2468ea7621dc08bb56fcb --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d7c2e69e5b3f7cb43530e1cc8336651f8c7f2d08e0e496d6eb190f9cd9c465b +size 557 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/scheduler.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/scheduler.pt new file mode 100644 index 
0000000000000000000000000000000000000000..d673173e267dd27a9efcee8eaa4bb03e6133ad68 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3630575f35c6cb2516d6d68a466f2304033a5a8ec817a07f3f5722bbb0a874c2 +size 627 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/trainer_state.json b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f312a82078e313bb7d92428468ff37741af98a99 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/trainer_state.json @@ -0,0 +1,5184 @@ +{ + "best_metric": 0.6878132224082947, + "best_model_checkpoint": "lora-alpaca/checkpoint-15200", + "epoch": 1.9494677440041042, + "global_step": 15200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 0.8988, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 0.7184, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 0.7227, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 0.00023999999999999998, + "loss": 0.7244, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003, + "loss": 0.7225, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002996127533238673, + "loss": 0.7183, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002992255066477346, + "loss": 0.7246, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029883825997160186, + "loss": 0.7334, + "step": 160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002984510132954692, + "loss": 0.7224, + "step": 180 + }, + { + "epoch": 0.03, + "learning_rate": 
0.0002980637666193365, + "loss": 0.7212, + "step": 200 + }, + { + "epoch": 0.03, + "eval_loss": 0.7327759861946106, + "eval_runtime": 178.5232, + "eval_samples_per_second": 11.203, + "eval_steps_per_second": 1.4, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002976765199432038, + "loss": 0.7298, + "step": 220 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002972892732670711, + "loss": 0.7275, + "step": 240 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002969020265909384, + "loss": 0.7265, + "step": 260 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002965147799148057, + "loss": 0.7285, + "step": 280 + }, + { + "epoch": 0.04, + "learning_rate": 0.000296127533238673, + "loss": 0.7218, + "step": 300 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002957402865625403, + "loss": 0.715, + "step": 320 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029535303988640764, + "loss": 0.7347, + "step": 340 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002949657932102749, + "loss": 0.7228, + "step": 360 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029457854653414225, + "loss": 0.7198, + "step": 380 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029419129985800953, + "loss": 0.7196, + "step": 400 + }, + { + "epoch": 0.05, + "eval_loss": 0.7325090765953064, + "eval_runtime": 178.6018, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 1.4, + "step": 400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002938040531818768, + "loss": 0.7233, + "step": 420 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029341680650574414, + "loss": 0.7272, + "step": 440 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002930295598296114, + "loss": 0.7272, + "step": 460 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029264231315347876, + "loss": 0.7281, + "step": 480 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029225506647734603, + "loss": 0.7289, + "step": 500 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002918678198012133, + "loss": 0.7215, + "step": 
520 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029148057312508065, + "loss": 0.7234, + "step": 540 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002910933264489479, + "loss": 0.7229, + "step": 560 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029070607977281526, + "loss": 0.7277, + "step": 580 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002903188330966826, + "loss": 0.7275, + "step": 600 + }, + { + "epoch": 0.08, + "eval_loss": 0.7319443821907043, + "eval_runtime": 178.6932, + "eval_samples_per_second": 11.192, + "eval_steps_per_second": 1.399, + "step": 600 + }, + { + "epoch": 0.08, + "learning_rate": 0.00028993158642054987, + "loss": 0.7195, + "step": 620 + }, + { + "epoch": 0.08, + "learning_rate": 0.00028954433974441715, + "loss": 0.723, + "step": 640 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002891570930682845, + "loss": 0.7337, + "step": 660 + }, + { + "epoch": 0.09, + "learning_rate": 0.00028876984639215176, + "loss": 0.7249, + "step": 680 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002883825997160191, + "loss": 0.7374, + "step": 700 + }, + { + "epoch": 0.09, + "learning_rate": 0.00028799535303988637, + "loss": 0.7144, + "step": 720 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002876081063637537, + "loss": 0.7213, + "step": 740 + }, + { + "epoch": 0.1, + "learning_rate": 0.000287220859687621, + "loss": 0.7215, + "step": 760 + }, + { + "epoch": 0.1, + "learning_rate": 0.00028683361301148826, + "loss": 0.7275, + "step": 780 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002864463663353556, + "loss": 0.7247, + "step": 800 + }, + { + "epoch": 0.1, + "eval_loss": 0.730565071105957, + "eval_runtime": 178.4749, + "eval_samples_per_second": 11.206, + "eval_steps_per_second": 1.401, + "step": 800 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002860591196592229, + "loss": 0.717, + "step": 820 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002856718729830902, + "loss": 0.7263, + "step": 840 + }, + { + "epoch": 0.11, + "learning_rate": 
0.00028528462630695754, + "loss": 0.7188, + "step": 860 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002848973796308248, + "loss": 0.7241, + "step": 880 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002845101329546921, + "loss": 0.7257, + "step": 900 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028412288627855943, + "loss": 0.7315, + "step": 920 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002837356396024267, + "loss": 0.7239, + "step": 940 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028334839292629404, + "loss": 0.7219, + "step": 960 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002829611462501613, + "loss": 0.7257, + "step": 980 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002825738995740286, + "loss": 0.7287, + "step": 1000 + }, + { + "epoch": 0.13, + "eval_loss": 0.7289888858795166, + "eval_runtime": 178.5088, + "eval_samples_per_second": 11.204, + "eval_steps_per_second": 1.4, + "step": 1000 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028218665289789593, + "loss": 0.7173, + "step": 1020 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028179940622176326, + "loss": 0.7185, + "step": 1040 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028141215954563054, + "loss": 0.7204, + "step": 1060 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002810249128694979, + "loss": 0.7301, + "step": 1080 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028063766619336515, + "loss": 0.7254, + "step": 1100 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028025041951723243, + "loss": 0.7212, + "step": 1120 + }, + { + "epoch": 0.15, + "learning_rate": 0.00027986317284109976, + "loss": 0.7265, + "step": 1140 + }, + { + "epoch": 0.15, + "learning_rate": 0.00027947592616496704, + "loss": 0.7201, + "step": 1160 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002790886794888344, + "loss": 0.7273, + "step": 1180 + }, + { + "epoch": 0.15, + "learning_rate": 0.00027870143281270165, + "loss": 0.72, + "step": 1200 + }, + { + "epoch": 0.15, + "eval_loss": 
0.727615237236023, + "eval_runtime": 178.7045, + "eval_samples_per_second": 11.192, + "eval_steps_per_second": 1.399, + "step": 1200 + }, + { + "epoch": 0.16, + "learning_rate": 0.000278314186136569, + "loss": 0.7307, + "step": 1220 + }, + { + "epoch": 0.16, + "learning_rate": 0.00027792693946043627, + "loss": 0.7164, + "step": 1240 + }, + { + "epoch": 0.16, + "learning_rate": 0.00027753969278430354, + "loss": 0.7163, + "step": 1260 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002771524461081709, + "loss": 0.7127, + "step": 1280 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002767651994320382, + "loss": 0.7123, + "step": 1300 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002763779527559055, + "loss": 0.728, + "step": 1320 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002759907060797728, + "loss": 0.7263, + "step": 1340 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002756034594036401, + "loss": 0.7188, + "step": 1360 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002752162127275074, + "loss": 0.7142, + "step": 1380 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002748289660513747, + "loss": 0.7215, + "step": 1400 + }, + { + "epoch": 0.18, + "eval_loss": 0.7257346510887146, + "eval_runtime": 178.9403, + "eval_samples_per_second": 11.177, + "eval_steps_per_second": 1.397, + "step": 1400 + }, + { + "epoch": 0.18, + "learning_rate": 0.000274441719375242, + "loss": 0.7262, + "step": 1420 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002740544726991093, + "loss": 0.7185, + "step": 1440 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002736672260229766, + "loss": 0.7093, + "step": 1460 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002732799793468439, + "loss": 0.7183, + "step": 1480 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002728927326707112, + "loss": 0.7006, + "step": 1500 + }, + { + "epoch": 0.19, + "learning_rate": 0.00027250548599457855, + "loss": 0.7168, + "step": 1520 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002721182393184458, + 
"loss": 0.7225, + "step": 1540 + }, + { + "epoch": 0.2, + "learning_rate": 0.00027173099264231316, + "loss": 0.7248, + "step": 1560 + }, + { + "epoch": 0.2, + "learning_rate": 0.00027134374596618044, + "loss": 0.7215, + "step": 1580 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002709564992900477, + "loss": 0.7179, + "step": 1600 + }, + { + "epoch": 0.21, + "eval_loss": 0.7245064377784729, + "eval_runtime": 177.8967, + "eval_samples_per_second": 11.242, + "eval_steps_per_second": 1.405, + "step": 1600 + }, + { + "epoch": 0.21, + "learning_rate": 0.00027056925261391505, + "loss": 0.7208, + "step": 1620 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002701820059377823, + "loss": 0.7213, + "step": 1640 + }, + { + "epoch": 0.21, + "learning_rate": 0.00026979475926164966, + "loss": 0.7221, + "step": 1660 + }, + { + "epoch": 0.22, + "learning_rate": 0.00026940751258551694, + "loss": 0.7207, + "step": 1680 + }, + { + "epoch": 0.22, + "learning_rate": 0.00026902026590938427, + "loss": 0.7218, + "step": 1700 + }, + { + "epoch": 0.22, + "learning_rate": 0.00026863301923325155, + "loss": 0.718, + "step": 1720 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002682457725571189, + "loss": 0.7235, + "step": 1740 + }, + { + "epoch": 0.23, + "learning_rate": 0.00026785852588098616, + "loss": 0.7138, + "step": 1760 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002674712792048535, + "loss": 0.7178, + "step": 1780 + }, + { + "epoch": 0.23, + "learning_rate": 0.00026708403252872077, + "loss": 0.7127, + "step": 1800 + }, + { + "epoch": 0.23, + "eval_loss": 0.7234225869178772, + "eval_runtime": 177.3731, + "eval_samples_per_second": 11.276, + "eval_steps_per_second": 1.409, + "step": 1800 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002666967858525881, + "loss": 0.7214, + "step": 1820 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002663095391764554, + "loss": 0.7164, + "step": 1840 + }, + { + "epoch": 0.24, + "learning_rate": 0.00026592229250032266, + "loss": 0.7132, + "step": 
1860 + }, + { + "epoch": 0.24, + "learning_rate": 0.00026553504582419, + "loss": 0.718, + "step": 1880 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002651477991480573, + "loss": 0.7088, + "step": 1900 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002647605524719246, + "loss": 0.7145, + "step": 1920 + }, + { + "epoch": 0.25, + "learning_rate": 0.00026437330579579194, + "loss": 0.7166, + "step": 1940 + }, + { + "epoch": 0.25, + "learning_rate": 0.00026398605911965916, + "loss": 0.7141, + "step": 1960 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002635988124435265, + "loss": 0.708, + "step": 1980 + }, + { + "epoch": 0.26, + "learning_rate": 0.00026321156576739383, + "loss": 0.7162, + "step": 2000 + }, + { + "epoch": 0.26, + "eval_loss": 0.7224385142326355, + "eval_runtime": 177.124, + "eval_samples_per_second": 11.292, + "eval_steps_per_second": 1.411, + "step": 2000 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002628243190912611, + "loss": 0.7091, + "step": 2020 + }, + { + "epoch": 0.26, + "learning_rate": 0.00026243707241512844, + "loss": 0.7133, + "step": 2040 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002620498257389957, + "loss": 0.717, + "step": 2060 + }, + { + "epoch": 0.27, + "learning_rate": 0.000261662579062863, + "loss": 0.7246, + "step": 2080 + }, + { + "epoch": 0.27, + "learning_rate": 0.00026127533238673033, + "loss": 0.7169, + "step": 2100 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002608880857105976, + "loss": 0.7121, + "step": 2120 + }, + { + "epoch": 0.27, + "learning_rate": 0.00026050083903446494, + "loss": 0.719, + "step": 2140 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002601135923583322, + "loss": 0.7236, + "step": 2160 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025972634568219955, + "loss": 0.7154, + "step": 2180 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025933909900606683, + "loss": 0.7148, + "step": 2200 + }, + { + "epoch": 0.28, + "eval_loss": 0.7211937308311462, + "eval_runtime": 177.2195, + 
"eval_samples_per_second": 11.285, + "eval_steps_per_second": 1.411, + "step": 2200 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025895185232993416, + "loss": 0.7193, + "step": 2220 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025856460565380144, + "loss": 0.7046, + "step": 2240 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002581773589776688, + "loss": 0.7231, + "step": 2260 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025779011230153605, + "loss": 0.719, + "step": 2280 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025740286562540333, + "loss": 0.707, + "step": 2300 + }, + { + "epoch": 0.3, + "learning_rate": 0.00025701561894927067, + "loss": 0.7176, + "step": 2320 + }, + { + "epoch": 0.3, + "learning_rate": 0.00025662837227313794, + "loss": 0.7205, + "step": 2340 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002562411255970053, + "loss": 0.7162, + "step": 2360 + }, + { + "epoch": 0.31, + "learning_rate": 0.00025585387892087256, + "loss": 0.7223, + "step": 2380 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002554666322447399, + "loss": 0.7158, + "step": 2400 + }, + { + "epoch": 0.31, + "eval_loss": 0.7201904654502869, + "eval_runtime": 177.1891, + "eval_samples_per_second": 11.287, + "eval_steps_per_second": 1.411, + "step": 2400 + }, + { + "epoch": 0.31, + "learning_rate": 0.00025507938556860717, + "loss": 0.7155, + "step": 2420 + }, + { + "epoch": 0.31, + "learning_rate": 0.00025469213889247445, + "loss": 0.7141, + "step": 2440 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002543048922163418, + "loss": 0.7206, + "step": 2460 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002539176455402091, + "loss": 0.7147, + "step": 2480 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002535303988640764, + "loss": 0.7116, + "step": 2500 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002531431521879437, + "loss": 0.7161, + "step": 2520 + }, + { + "epoch": 0.33, + "learning_rate": 0.000252755905511811, + "loss": 0.7173, + "step": 2540 + }, + { + "epoch": 
0.33, + "learning_rate": 0.0002523686588356783, + "loss": 0.7159, + "step": 2560 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002519814121595456, + "loss": 0.7182, + "step": 2580 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002515941654834129, + "loss": 0.723, + "step": 2600 + }, + { + "epoch": 0.33, + "eval_loss": 0.7193037867546082, + "eval_runtime": 177.0591, + "eval_samples_per_second": 11.296, + "eval_steps_per_second": 1.412, + "step": 2600 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002512069188072802, + "loss": 0.717, + "step": 2620 + }, + { + "epoch": 0.34, + "learning_rate": 0.00025081967213114756, + "loss": 0.7164, + "step": 2640 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002504324254550148, + "loss": 0.7173, + "step": 2660 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002500451787788821, + "loss": 0.7154, + "step": 2680 + }, + { + "epoch": 0.35, + "learning_rate": 0.00024965793210274945, + "loss": 0.7101, + "step": 2700 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002492706854266167, + "loss": 0.7088, + "step": 2720 + }, + { + "epoch": 0.35, + "learning_rate": 0.00024888343875048406, + "loss": 0.715, + "step": 2740 + }, + { + "epoch": 0.35, + "learning_rate": 0.00024849619207435134, + "loss": 0.7165, + "step": 2760 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002481089453982186, + "loss": 0.7094, + "step": 2780 + }, + { + "epoch": 0.36, + "learning_rate": 0.00024772169872208595, + "loss": 0.7116, + "step": 2800 + }, + { + "epoch": 0.36, + "eval_loss": 0.7178795337677002, + "eval_runtime": 177.1788, + "eval_samples_per_second": 11.288, + "eval_steps_per_second": 1.411, + "step": 2800 + }, + { + "epoch": 0.36, + "learning_rate": 0.00024733445204595323, + "loss": 0.7121, + "step": 2820 + }, + { + "epoch": 0.36, + "learning_rate": 0.00024694720536982056, + "loss": 0.7197, + "step": 2840 + }, + { + "epoch": 0.37, + "learning_rate": 0.00024655995869368784, + "loss": 0.7133, + "step": 2860 + }, + { + "epoch": 0.37, + "learning_rate": 
0.00024617271201755517, + "loss": 0.7124, + "step": 2880 + }, + { + "epoch": 0.37, + "learning_rate": 0.00024578546534142245, + "loss": 0.7081, + "step": 2900 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002453982186652898, + "loss": 0.7105, + "step": 2920 + }, + { + "epoch": 0.38, + "learning_rate": 0.00024501097198915706, + "loss": 0.7005, + "step": 2940 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002446237253130244, + "loss": 0.7111, + "step": 2960 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002442364786368917, + "loss": 0.7035, + "step": 2980 + }, + { + "epoch": 0.38, + "learning_rate": 0.000243849231960759, + "loss": 0.7125, + "step": 3000 + }, + { + "epoch": 0.38, + "eval_loss": 0.7173203229904175, + "eval_runtime": 176.9402, + "eval_samples_per_second": 11.303, + "eval_steps_per_second": 1.413, + "step": 3000 + }, + { + "epoch": 0.39, + "learning_rate": 0.00024346198528462626, + "loss": 0.7143, + "step": 3020 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002430747386084936, + "loss": 0.7121, + "step": 3040 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002426874919323609, + "loss": 0.7093, + "step": 3060 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002423002452562282, + "loss": 0.711, + "step": 3080 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002419129985800955, + "loss": 0.7239, + "step": 3100 + }, + { + "epoch": 0.4, + "learning_rate": 0.00024152575190396281, + "loss": 0.7183, + "step": 3120 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002411385052278301, + "loss": 0.7056, + "step": 3140 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002407512585516974, + "loss": 0.7109, + "step": 3160 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002403640118755647, + "loss": 0.7183, + "step": 3180 + }, + { + "epoch": 0.41, + "learning_rate": 0.000239976765199432, + "loss": 0.7135, + "step": 3200 + }, + { + "epoch": 0.41, + "eval_loss": 0.7156603932380676, + "eval_runtime": 178.5954, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 
1.4, + "step": 3200 + }, + { + "epoch": 0.41, + "learning_rate": 0.00023958951852329932, + "loss": 0.7022, + "step": 3220 + }, + { + "epoch": 0.42, + "learning_rate": 0.00023920227184716665, + "loss": 0.7155, + "step": 3240 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002388150251710339, + "loss": 0.7072, + "step": 3260 + }, + { + "epoch": 0.42, + "learning_rate": 0.00023842777849490123, + "loss": 0.7151, + "step": 3280 + }, + { + "epoch": 0.42, + "learning_rate": 0.00023804053181876854, + "loss": 0.7044, + "step": 3300 + }, + { + "epoch": 0.43, + "learning_rate": 0.00023765328514263584, + "loss": 0.7141, + "step": 3320 + }, + { + "epoch": 0.43, + "learning_rate": 0.00023726603846650315, + "loss": 0.7033, + "step": 3340 + }, + { + "epoch": 0.43, + "learning_rate": 0.00023687879179037046, + "loss": 0.7137, + "step": 3360 + }, + { + "epoch": 0.43, + "learning_rate": 0.00023649154511423773, + "loss": 0.7042, + "step": 3380 + }, + { + "epoch": 0.44, + "learning_rate": 0.00023610429843810504, + "loss": 0.7156, + "step": 3400 + }, + { + "epoch": 0.44, + "eval_loss": 0.7149476408958435, + "eval_runtime": 178.5798, + "eval_samples_per_second": 11.199, + "eval_steps_per_second": 1.4, + "step": 3400 + }, + { + "epoch": 0.44, + "learning_rate": 0.00023571705176197235, + "loss": 0.7045, + "step": 3420 + }, + { + "epoch": 0.44, + "learning_rate": 0.00023532980508583965, + "loss": 0.7021, + "step": 3440 + }, + { + "epoch": 0.44, + "learning_rate": 0.00023494255840970698, + "loss": 0.7092, + "step": 3460 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002345553117335743, + "loss": 0.7213, + "step": 3480 + }, + { + "epoch": 0.45, + "learning_rate": 0.00023416806505744157, + "loss": 0.7046, + "step": 3500 + }, + { + "epoch": 0.45, + "learning_rate": 0.00023378081838130887, + "loss": 0.7076, + "step": 3520 + }, + { + "epoch": 0.45, + "learning_rate": 0.00023339357170517618, + "loss": 0.7107, + "step": 3540 + }, + { + "epoch": 0.46, + "learning_rate": 0.00023300632502904349, + 
"loss": 0.7087, + "step": 3560 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002326190783529108, + "loss": 0.7005, + "step": 3580 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002322318316767781, + "loss": 0.7064, + "step": 3600 + }, + { + "epoch": 0.46, + "eval_loss": 0.7140311002731323, + "eval_runtime": 179.0415, + "eval_samples_per_second": 11.171, + "eval_steps_per_second": 1.396, + "step": 3600 + }, + { + "epoch": 0.46, + "learning_rate": 0.00023184458500064538, + "loss": 0.714, + "step": 3620 + }, + { + "epoch": 0.47, + "learning_rate": 0.00023145733832451268, + "loss": 0.7102, + "step": 3640 + }, + { + "epoch": 0.47, + "learning_rate": 0.00023107009164838, + "loss": 0.7202, + "step": 3660 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002306828449722473, + "loss": 0.7016, + "step": 3680 + }, + { + "epoch": 0.47, + "learning_rate": 0.00023029559829611463, + "loss": 0.7126, + "step": 3700 + }, + { + "epoch": 0.48, + "learning_rate": 0.00022990835161998193, + "loss": 0.7055, + "step": 3720 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002295211049438492, + "loss": 0.7118, + "step": 3740 + }, + { + "epoch": 0.48, + "learning_rate": 0.00022913385826771652, + "loss": 0.707, + "step": 3760 + }, + { + "epoch": 0.48, + "learning_rate": 0.00022874661159158382, + "loss": 0.7119, + "step": 3780 + }, + { + "epoch": 0.49, + "learning_rate": 0.00022835936491545113, + "loss": 0.7023, + "step": 3800 + }, + { + "epoch": 0.49, + "eval_loss": 0.7134947776794434, + "eval_runtime": 179.7115, + "eval_samples_per_second": 11.129, + "eval_steps_per_second": 1.391, + "step": 3800 + }, + { + "epoch": 0.49, + "learning_rate": 0.00022797211823931843, + "loss": 0.6967, + "step": 3820 + }, + { + "epoch": 0.49, + "learning_rate": 0.00022758487156318574, + "loss": 0.7172, + "step": 3840 + }, + { + "epoch": 0.5, + "learning_rate": 0.00022719762488705302, + "loss": 0.7137, + "step": 3860 + }, + { + "epoch": 0.5, + "learning_rate": 0.00022681037821092032, + "loss": 0.7164, + "step": 
3880 + }, + { + "epoch": 0.5, + "learning_rate": 0.00022642313153478763, + "loss": 0.7099, + "step": 3900 + }, + { + "epoch": 0.5, + "learning_rate": 0.00022603588485865494, + "loss": 0.7119, + "step": 3920 + }, + { + "epoch": 0.51, + "learning_rate": 0.00022564863818252227, + "loss": 0.7098, + "step": 3940 + }, + { + "epoch": 0.51, + "learning_rate": 0.00022526139150638957, + "loss": 0.7067, + "step": 3960 + }, + { + "epoch": 0.51, + "learning_rate": 0.00022487414483025685, + "loss": 0.705, + "step": 3980 + }, + { + "epoch": 0.51, + "learning_rate": 0.00022448689815412416, + "loss": 0.7125, + "step": 4000 + }, + { + "epoch": 0.51, + "eval_loss": 0.7128713130950928, + "eval_runtime": 178.8128, + "eval_samples_per_second": 11.185, + "eval_steps_per_second": 1.398, + "step": 4000 + }, + { + "epoch": 0.52, + "learning_rate": 0.00022409965147799146, + "loss": 0.7098, + "step": 4020 + }, + { + "epoch": 0.52, + "learning_rate": 0.00022371240480185877, + "loss": 0.7081, + "step": 4040 + }, + { + "epoch": 0.52, + "learning_rate": 0.00022332515812572608, + "loss": 0.6982, + "step": 4060 + }, + { + "epoch": 0.52, + "learning_rate": 0.00022293791144959338, + "loss": 0.7122, + "step": 4080 + }, + { + "epoch": 0.53, + "learning_rate": 0.00022255066477346066, + "loss": 0.6974, + "step": 4100 + }, + { + "epoch": 0.53, + "learning_rate": 0.00022216341809732797, + "loss": 0.7018, + "step": 4120 + }, + { + "epoch": 0.53, + "learning_rate": 0.00022177617142119527, + "loss": 0.7075, + "step": 4140 + }, + { + "epoch": 0.53, + "learning_rate": 0.00022138892474506258, + "loss": 0.7013, + "step": 4160 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002210016780689299, + "loss": 0.7103, + "step": 4180 + }, + { + "epoch": 0.54, + "learning_rate": 0.00022061443139279722, + "loss": 0.7005, + "step": 4200 + }, + { + "epoch": 0.54, + "eval_loss": 0.7116231918334961, + "eval_runtime": 179.0816, + "eval_samples_per_second": 11.168, + "eval_steps_per_second": 1.396, + "step": 4200 + }, + { + 
"epoch": 0.54, + "learning_rate": 0.0002202271847166645, + "loss": 0.7071, + "step": 4220 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002198399380405318, + "loss": 0.7114, + "step": 4240 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002194526913643991, + "loss": 0.705, + "step": 4260 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002190654446882664, + "loss": 0.706, + "step": 4280 + }, + { + "epoch": 0.55, + "learning_rate": 0.00021867819801213372, + "loss": 0.7016, + "step": 4300 + }, + { + "epoch": 0.55, + "learning_rate": 0.00021829095133600102, + "loss": 0.7084, + "step": 4320 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002179037046598683, + "loss": 0.7187, + "step": 4340 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002175164579837356, + "loss": 0.7044, + "step": 4360 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002171292113076029, + "loss": 0.7068, + "step": 4380 + }, + { + "epoch": 0.56, + "learning_rate": 0.00021674196463147025, + "loss": 0.7082, + "step": 4400 + }, + { + "epoch": 0.56, + "eval_loss": 0.7112064957618713, + "eval_runtime": 178.8847, + "eval_samples_per_second": 11.18, + "eval_steps_per_second": 1.398, + "step": 4400 + }, + { + "epoch": 0.57, + "learning_rate": 0.00021635471795533755, + "loss": 0.705, + "step": 4420 + }, + { + "epoch": 0.57, + "learning_rate": 0.00021596747127920486, + "loss": 0.704, + "step": 4440 + }, + { + "epoch": 0.57, + "learning_rate": 0.00021558022460307214, + "loss": 0.7071, + "step": 4460 + }, + { + "epoch": 0.57, + "learning_rate": 0.00021519297792693944, + "loss": 0.708, + "step": 4480 + }, + { + "epoch": 0.58, + "learning_rate": 0.00021480573125080675, + "loss": 0.705, + "step": 4500 + }, + { + "epoch": 0.58, + "learning_rate": 0.00021441848457467405, + "loss": 0.7061, + "step": 4520 + }, + { + "epoch": 0.58, + "learning_rate": 0.00021403123789854136, + "loss": 0.7074, + "step": 4540 + }, + { + "epoch": 0.58, + "learning_rate": 0.00021364399122240864, + "loss": 0.7148, + "step": 4560 + }, + { + 
"epoch": 0.59, + "learning_rate": 0.00021325674454627594, + "loss": 0.7091, + "step": 4580 + }, + { + "epoch": 0.59, + "learning_rate": 0.00021286949787014325, + "loss": 0.7103, + "step": 4600 + }, + { + "epoch": 0.59, + "eval_loss": 0.7104864716529846, + "eval_runtime": 178.4457, + "eval_samples_per_second": 11.208, + "eval_steps_per_second": 1.401, + "step": 4600 + }, + { + "epoch": 0.59, + "learning_rate": 0.00021248225119401055, + "loss": 0.706, + "step": 4620 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002120950045178779, + "loss": 0.6966, + "step": 4640 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002117077578417452, + "loss": 0.6991, + "step": 4660 + }, + { + "epoch": 0.6, + "learning_rate": 0.00021132051116561247, + "loss": 0.7039, + "step": 4680 + }, + { + "epoch": 0.6, + "learning_rate": 0.00021093326448947978, + "loss": 0.7059, + "step": 4700 + }, + { + "epoch": 0.61, + "learning_rate": 0.00021054601781334708, + "loss": 0.7122, + "step": 4720 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002101587711372144, + "loss": 0.7099, + "step": 4740 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002097715244610817, + "loss": 0.6998, + "step": 4760 + }, + { + "epoch": 0.61, + "learning_rate": 0.000209384277784949, + "loss": 0.7048, + "step": 4780 + }, + { + "epoch": 0.62, + "learning_rate": 0.00020899703110881628, + "loss": 0.7077, + "step": 4800 + }, + { + "epoch": 0.62, + "eval_loss": 0.7102417945861816, + "eval_runtime": 178.4169, + "eval_samples_per_second": 11.21, + "eval_steps_per_second": 1.401, + "step": 4800 + }, + { + "epoch": 0.62, + "learning_rate": 0.00020860978443268359, + "loss": 0.7172, + "step": 4820 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002082225377565509, + "loss": 0.7084, + "step": 4840 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002078352910804182, + "loss": 0.7058, + "step": 4860 + }, + { + "epoch": 0.63, + "learning_rate": 0.00020744804440428553, + "loss": 0.6988, + "step": 4880 + }, + { + "epoch": 0.63, + 
"learning_rate": 0.00020706079772815284, + "loss": 0.7008, + "step": 4900 + }, + { + "epoch": 0.63, + "learning_rate": 0.00020667355105202011, + "loss": 0.6986, + "step": 4920 + }, + { + "epoch": 0.63, + "learning_rate": 0.00020628630437588742, + "loss": 0.7042, + "step": 4940 + }, + { + "epoch": 0.64, + "learning_rate": 0.00020589905769975473, + "loss": 0.7139, + "step": 4960 + }, + { + "epoch": 0.64, + "learning_rate": 0.00020551181102362203, + "loss": 0.7094, + "step": 4980 + }, + { + "epoch": 0.64, + "learning_rate": 0.00020512456434748934, + "loss": 0.7059, + "step": 5000 + }, + { + "epoch": 0.64, + "eval_loss": 0.7092374563217163, + "eval_runtime": 177.3439, + "eval_samples_per_second": 11.278, + "eval_steps_per_second": 1.41, + "step": 5000 + }, + { + "epoch": 0.64, + "learning_rate": 0.00020473731767135664, + "loss": 0.7042, + "step": 5020 + }, + { + "epoch": 0.65, + "learning_rate": 0.00020435007099522392, + "loss": 0.6964, + "step": 5040 + }, + { + "epoch": 0.65, + "learning_rate": 0.00020396282431909123, + "loss": 0.7041, + "step": 5060 + }, + { + "epoch": 0.65, + "learning_rate": 0.00020357557764295853, + "loss": 0.6972, + "step": 5080 + }, + { + "epoch": 0.65, + "learning_rate": 0.00020318833096682587, + "loss": 0.7011, + "step": 5100 + }, + { + "epoch": 0.66, + "learning_rate": 0.00020280108429069317, + "loss": 0.7073, + "step": 5120 + }, + { + "epoch": 0.66, + "learning_rate": 0.00020241383761456048, + "loss": 0.706, + "step": 5140 + }, + { + "epoch": 0.66, + "learning_rate": 0.00020202659093842776, + "loss": 0.6949, + "step": 5160 + }, + { + "epoch": 0.66, + "learning_rate": 0.00020163934426229506, + "loss": 0.703, + "step": 5180 + }, + { + "epoch": 0.67, + "learning_rate": 0.00020125209758616237, + "loss": 0.7058, + "step": 5200 + }, + { + "epoch": 0.67, + "eval_loss": 0.7084789276123047, + "eval_runtime": 177.3051, + "eval_samples_per_second": 11.28, + "eval_steps_per_second": 1.41, + "step": 5200 + }, + { + "epoch": 0.67, + "learning_rate": 
0.00020086485091002967, + "loss": 0.7032, + "step": 5220 + }, + { + "epoch": 0.67, + "learning_rate": 0.00020047760423389698, + "loss": 0.7045, + "step": 5240 + }, + { + "epoch": 0.67, + "learning_rate": 0.00020009035755776428, + "loss": 0.7069, + "step": 5260 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019970311088163156, + "loss": 0.6961, + "step": 5280 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019931586420549887, + "loss": 0.6981, + "step": 5300 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019892861752936617, + "loss": 0.701, + "step": 5320 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001985413708532335, + "loss": 0.6979, + "step": 5340 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001981541241771008, + "loss": 0.6975, + "step": 5360 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019776687750096812, + "loss": 0.6923, + "step": 5380 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001973796308248354, + "loss": 0.7089, + "step": 5400 + }, + { + "epoch": 0.69, + "eval_loss": 0.7074704170227051, + "eval_runtime": 177.1889, + "eval_samples_per_second": 11.287, + "eval_steps_per_second": 1.411, + "step": 5400 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001969923841487027, + "loss": 0.6986, + "step": 5420 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019660513747257, + "loss": 0.6991, + "step": 5440 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019621789079643731, + "loss": 0.6946, + "step": 5460 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019583064412030462, + "loss": 0.7057, + "step": 5480 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019544339744417193, + "loss": 0.694, + "step": 5500 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001950561507680392, + "loss": 0.7046, + "step": 5520 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001946689040919065, + "loss": 0.6998, + "step": 5540 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019428165741577382, + "loss": 0.6995, + "step": 5560 + }, + { + "epoch": 0.72, + "learning_rate": 
0.00019389441073964115, + "loss": 0.7081, + "step": 5580 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019350716406350845, + "loss": 0.6974, + "step": 5600 + }, + { + "epoch": 0.72, + "eval_loss": 0.7067714333534241, + "eval_runtime": 177.0363, + "eval_samples_per_second": 11.297, + "eval_steps_per_second": 1.412, + "step": 5600 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019311991738737576, + "loss": 0.7019, + "step": 5620 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019273267071124304, + "loss": 0.7003, + "step": 5640 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019234542403511034, + "loss": 0.6966, + "step": 5660 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019195817735897765, + "loss": 0.7055, + "step": 5680 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019157093068284496, + "loss": 0.7069, + "step": 5700 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019118368400671226, + "loss": 0.6981, + "step": 5720 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019079643733057957, + "loss": 0.7005, + "step": 5740 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019040919065444685, + "loss": 0.7033, + "step": 5760 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019002194397831415, + "loss": 0.7009, + "step": 5780 + }, + { + "epoch": 0.74, + "learning_rate": 0.00018963469730218146, + "loss": 0.7001, + "step": 5800 + }, + { + "epoch": 0.74, + "eval_loss": 0.7066617608070374, + "eval_runtime": 177.2571, + "eval_samples_per_second": 11.283, + "eval_steps_per_second": 1.41, + "step": 5800 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001892474506260488, + "loss": 0.7048, + "step": 5820 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001888602039499161, + "loss": 0.698, + "step": 5840 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001884729572737834, + "loss": 0.7035, + "step": 5860 + }, + { + "epoch": 0.75, + "learning_rate": 0.00018808571059765068, + "loss": 0.6997, + "step": 5880 + }, + { + "epoch": 0.76, + "learning_rate": 
0.00018769846392151799, + "loss": 0.7053, + "step": 5900 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001873112172453853, + "loss": 0.6951, + "step": 5920 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001869239705692526, + "loss": 0.701, + "step": 5940 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001865367238931199, + "loss": 0.7032, + "step": 5960 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001861494772169872, + "loss": 0.7005, + "step": 5980 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001857622305408545, + "loss": 0.7013, + "step": 6000 + }, + { + "epoch": 0.77, + "eval_loss": 0.705744743347168, + "eval_runtime": 177.421, + "eval_samples_per_second": 11.273, + "eval_steps_per_second": 1.409, + "step": 6000 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001853749838647218, + "loss": 0.6923, + "step": 6020 + }, + { + "epoch": 0.77, + "learning_rate": 0.00018498773718858913, + "loss": 0.701, + "step": 6040 + }, + { + "epoch": 0.78, + "learning_rate": 0.00018460049051245643, + "loss": 0.7027, + "step": 6060 + }, + { + "epoch": 0.78, + "learning_rate": 0.00018421324383632374, + "loss": 0.6973, + "step": 6080 + }, + { + "epoch": 0.78, + "learning_rate": 0.00018382599716019104, + "loss": 0.6982, + "step": 6100 + }, + { + "epoch": 0.78, + "learning_rate": 0.00018343875048405832, + "loss": 0.7024, + "step": 6120 + }, + { + "epoch": 0.79, + "learning_rate": 0.00018305150380792563, + "loss": 0.6996, + "step": 6140 + }, + { + "epoch": 0.79, + "learning_rate": 0.00018266425713179293, + "loss": 0.7063, + "step": 6160 + }, + { + "epoch": 0.79, + "learning_rate": 0.00018227701045566024, + "loss": 0.7005, + "step": 6180 + }, + { + "epoch": 0.8, + "learning_rate": 0.00018188976377952755, + "loss": 0.6913, + "step": 6200 + }, + { + "epoch": 0.8, + "eval_loss": 0.7049428224563599, + "eval_runtime": 180.0706, + "eval_samples_per_second": 11.107, + "eval_steps_per_second": 1.388, + "step": 6200 + }, + { + "epoch": 0.8, + "learning_rate": 0.00018150251710339485, + 
"loss": 0.6998, + "step": 6220 + }, + { + "epoch": 0.8, + "learning_rate": 0.00018111527042726213, + "loss": 0.7044, + "step": 6240 + }, + { + "epoch": 0.8, + "learning_rate": 0.00018072802375112944, + "loss": 0.6988, + "step": 6260 + }, + { + "epoch": 0.81, + "learning_rate": 0.00018034077707499677, + "loss": 0.6979, + "step": 6280 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017995353039886407, + "loss": 0.7048, + "step": 6300 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017956628372273138, + "loss": 0.6941, + "step": 6320 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017917903704659869, + "loss": 0.6963, + "step": 6340 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017879179037046596, + "loss": 0.6954, + "step": 6360 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017840454369433327, + "loss": 0.6953, + "step": 6380 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017801729701820058, + "loss": 0.693, + "step": 6400 + }, + { + "epoch": 0.82, + "eval_loss": 0.7036707997322083, + "eval_runtime": 178.6236, + "eval_samples_per_second": 11.197, + "eval_steps_per_second": 1.4, + "step": 6400 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017763005034206788, + "loss": 0.7041, + "step": 6420 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001772428036659352, + "loss": 0.6908, + "step": 6440 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001768555569898025, + "loss": 0.6961, + "step": 6460 + }, + { + "epoch": 0.83, + "learning_rate": 0.00017646831031366977, + "loss": 0.6967, + "step": 6480 + }, + { + "epoch": 0.83, + "learning_rate": 0.00017608106363753708, + "loss": 0.7019, + "step": 6500 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001756938169614044, + "loss": 0.7036, + "step": 6520 + }, + { + "epoch": 0.84, + "learning_rate": 0.00017530657028527172, + "loss": 0.6941, + "step": 6540 + }, + { + "epoch": 0.84, + "learning_rate": 0.00017491932360913902, + "loss": 0.6995, + "step": 6560 + }, + { + "epoch": 0.84, + "learning_rate": 
0.00017453207693300633, + "loss": 0.6962, + "step": 6580 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001741448302568736, + "loss": 0.6963, + "step": 6600 + }, + { + "epoch": 0.85, + "eval_loss": 0.7036789655685425, + "eval_runtime": 177.1424, + "eval_samples_per_second": 11.29, + "eval_steps_per_second": 1.411, + "step": 6600 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001737575835807409, + "loss": 0.7009, + "step": 6620 + }, + { + "epoch": 0.85, + "learning_rate": 0.00017337033690460822, + "loss": 0.6964, + "step": 6640 + }, + { + "epoch": 0.85, + "learning_rate": 0.00017298309022847552, + "loss": 0.6974, + "step": 6660 + }, + { + "epoch": 0.86, + "learning_rate": 0.00017259584355234283, + "loss": 0.6964, + "step": 6680 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001722085968762101, + "loss": 0.6966, + "step": 6700 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001718213502000774, + "loss": 0.7016, + "step": 6720 + }, + { + "epoch": 0.86, + "learning_rate": 0.00017143410352394475, + "loss": 0.6996, + "step": 6740 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017104685684781205, + "loss": 0.6985, + "step": 6760 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017065961017167936, + "loss": 0.7, + "step": 6780 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017027236349554666, + "loss": 0.6846, + "step": 6800 + }, + { + "epoch": 0.87, + "eval_loss": 0.7028091549873352, + "eval_runtime": 177.0434, + "eval_samples_per_second": 11.297, + "eval_steps_per_second": 1.412, + "step": 6800 + }, + { + "epoch": 0.87, + "learning_rate": 0.00016988511681941394, + "loss": 0.6994, + "step": 6820 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016949787014328125, + "loss": 0.6995, + "step": 6840 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016911062346714855, + "loss": 0.6949, + "step": 6860 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016872337679101586, + "loss": 0.6903, + "step": 6880 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016833613011488316, + 
"loss": 0.6983, + "step": 6900 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016794888343875047, + "loss": 0.6979, + "step": 6920 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016756163676261775, + "loss": 0.6963, + "step": 6940 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016717439008648505, + "loss": 0.6963, + "step": 6960 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001667871434103524, + "loss": 0.7109, + "step": 6980 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001663998967342197, + "loss": 0.6996, + "step": 7000 + }, + { + "epoch": 0.9, + "eval_loss": 0.7027884721755981, + "eval_runtime": 177.0464, + "eval_samples_per_second": 11.296, + "eval_steps_per_second": 1.412, + "step": 7000 + }, + { + "epoch": 0.9, + "learning_rate": 0.000166012650058087, + "loss": 0.6953, + "step": 7020 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001656254033819543, + "loss": 0.701, + "step": 7040 + }, + { + "epoch": 0.91, + "learning_rate": 0.00016523815670582158, + "loss": 0.6941, + "step": 7060 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001648509100296889, + "loss": 0.6946, + "step": 7080 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001644636633535562, + "loss": 0.6905, + "step": 7100 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001640764166774235, + "loss": 0.6938, + "step": 7120 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001636891700012908, + "loss": 0.6964, + "step": 7140 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001633019233251581, + "loss": 0.6979, + "step": 7160 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001629146766490254, + "loss": 0.6909, + "step": 7180 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001625274299728927, + "loss": 0.7017, + "step": 7200 + }, + { + "epoch": 0.92, + "eval_loss": 0.7013801336288452, + "eval_runtime": 177.3532, + "eval_samples_per_second": 11.277, + "eval_steps_per_second": 1.41, + "step": 7200 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016214018329676003, + "loss": 0.6986, + "step": 7220 + }, 
+ { + "epoch": 0.93, + "learning_rate": 0.00016175293662062734, + "loss": 0.6985, + "step": 7240 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016136568994449464, + "loss": 0.694, + "step": 7260 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016097844326836195, + "loss": 0.7124, + "step": 7280 + }, + { + "epoch": 0.94, + "learning_rate": 0.00016059119659222923, + "loss": 0.6936, + "step": 7300 + }, + { + "epoch": 0.94, + "learning_rate": 0.00016020394991609653, + "loss": 0.6904, + "step": 7320 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015981670323996384, + "loss": 0.6994, + "step": 7340 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015942945656383114, + "loss": 0.7046, + "step": 7360 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015904220988769845, + "loss": 0.6999, + "step": 7380 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015865496321156575, + "loss": 0.6952, + "step": 7400 + }, + { + "epoch": 0.95, + "eval_loss": 0.7015686631202698, + "eval_runtime": 178.4607, + "eval_samples_per_second": 11.207, + "eval_steps_per_second": 1.401, + "step": 7400 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015826771653543303, + "loss": 0.7001, + "step": 7420 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015788046985930034, + "loss": 0.6948, + "step": 7440 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015749322318316767, + "loss": 0.702, + "step": 7460 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015710597650703498, + "loss": 0.695, + "step": 7480 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015671872983090228, + "loss": 0.7053, + "step": 7500 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001563314831547696, + "loss": 0.7041, + "step": 7520 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015594423647863687, + "loss": 0.6975, + "step": 7540 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015555698980250417, + "loss": 0.6914, + "step": 7560 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015516974312637148, + "loss": 0.6961, + "step": 
7580 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015478249645023878, + "loss": 0.6968, + "step": 7600 + }, + { + "epoch": 0.97, + "eval_loss": 0.7004283666610718, + "eval_runtime": 178.8057, + "eval_samples_per_second": 11.185, + "eval_steps_per_second": 1.398, + "step": 7600 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001543952497741061, + "loss": 0.6847, + "step": 7620 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015400800309797342, + "loss": 0.6954, + "step": 7640 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015362075642184067, + "loss": 0.6967, + "step": 7660 + }, + { + "epoch": 0.98, + "learning_rate": 0.000153233509745708, + "loss": 0.6941, + "step": 7680 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001528462630695753, + "loss": 0.6928, + "step": 7700 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015245901639344262, + "loss": 0.7035, + "step": 7720 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015207176971730992, + "loss": 0.6918, + "step": 7740 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015168452304117723, + "loss": 0.6996, + "step": 7760 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001512972763650445, + "loss": 0.6975, + "step": 7780 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015091002968891181, + "loss": 0.7022, + "step": 7800 + }, + { + "epoch": 1.0, + "eval_loss": 0.6998333930969238, + "eval_runtime": 178.2222, + "eval_samples_per_second": 11.222, + "eval_steps_per_second": 1.403, + "step": 7800 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015052278301277912, + "loss": 0.6854, + "step": 7820 + }, + { + "epoch": 1.01, + "learning_rate": 0.00015013553633664643, + "loss": 0.6911, + "step": 7840 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014974828966051373, + "loss": 0.6846, + "step": 7860 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014936104298438104, + "loss": 0.6859, + "step": 7880 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014897379630824834, + "loss": 0.6802, + "step": 7900 + }, + { + "epoch": 
1.02, + "learning_rate": 0.00014858654963211565, + "loss": 0.6891, + "step": 7920 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014819930295598295, + "loss": 0.6833, + "step": 7940 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014781205627985026, + "loss": 0.6866, + "step": 7960 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014742480960371754, + "loss": 0.6863, + "step": 7980 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014703756292758484, + "loss": 0.6898, + "step": 8000 + }, + { + "epoch": 1.03, + "eval_loss": 0.699661135673523, + "eval_runtime": 177.9496, + "eval_samples_per_second": 11.239, + "eval_steps_per_second": 1.405, + "step": 8000 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014665031625145218, + "loss": 0.6881, + "step": 8020 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014626306957531946, + "loss": 0.6894, + "step": 8040 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014587582289918676, + "loss": 0.685, + "step": 8060 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014548857622305407, + "loss": 0.6837, + "step": 8080 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014510132954692137, + "loss": 0.6944, + "step": 8100 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014471408287078868, + "loss": 0.6883, + "step": 8120 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014432683619465598, + "loss": 0.6874, + "step": 8140 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001439395895185233, + "loss": 0.6867, + "step": 8160 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001435523428423906, + "loss": 0.6875, + "step": 8180 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001431650961662579, + "loss": 0.7005, + "step": 8200 + }, + { + "epoch": 1.05, + "eval_loss": 0.699004590511322, + "eval_runtime": 177.1183, + "eval_samples_per_second": 11.292, + "eval_steps_per_second": 1.411, + "step": 8200 + }, + { + "epoch": 1.05, + "learning_rate": 0.00014277784949012518, + "loss": 0.6968, + "step": 8220 + }, + { + "epoch": 1.06, + 
"learning_rate": 0.00014239060281399249, + "loss": 0.6884, + "step": 8240 + }, + { + "epoch": 1.06, + "learning_rate": 0.00014200335613785982, + "loss": 0.6808, + "step": 8260 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001416161094617271, + "loss": 0.6851, + "step": 8280 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001412288627855944, + "loss": 0.6917, + "step": 8300 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001408416161094617, + "loss": 0.6944, + "step": 8320 + }, + { + "epoch": 1.07, + "learning_rate": 0.00014045436943332901, + "loss": 0.6851, + "step": 8340 + }, + { + "epoch": 1.07, + "learning_rate": 0.00014006712275719632, + "loss": 0.6829, + "step": 8360 + }, + { + "epoch": 1.07, + "learning_rate": 0.00013967987608106363, + "loss": 0.6872, + "step": 8380 + }, + { + "epoch": 1.08, + "learning_rate": 0.00013929262940493093, + "loss": 0.6909, + "step": 8400 + }, + { + "epoch": 1.08, + "eval_loss": 0.6990391612052917, + "eval_runtime": 177.2945, + "eval_samples_per_second": 11.281, + "eval_steps_per_second": 1.41, + "step": 8400 + }, + { + "epoch": 1.08, + "learning_rate": 0.00013890538272879824, + "loss": 0.6924, + "step": 8420 + }, + { + "epoch": 1.08, + "learning_rate": 0.00013851813605266554, + "loss": 0.6747, + "step": 8440 + }, + { + "epoch": 1.09, + "learning_rate": 0.00013813088937653282, + "loss": 0.6932, + "step": 8460 + }, + { + "epoch": 1.09, + "learning_rate": 0.00013774364270040016, + "loss": 0.6892, + "step": 8480 + }, + { + "epoch": 1.09, + "learning_rate": 0.00013735639602426746, + "loss": 0.6868, + "step": 8500 + }, + { + "epoch": 1.09, + "learning_rate": 0.00013696914934813474, + "loss": 0.6898, + "step": 8520 + }, + { + "epoch": 1.1, + "learning_rate": 0.00013658190267200205, + "loss": 0.6896, + "step": 8540 + }, + { + "epoch": 1.1, + "learning_rate": 0.00013619465599586935, + "loss": 0.6848, + "step": 8560 + }, + { + "epoch": 1.1, + "learning_rate": 0.00013580740931973666, + "loss": 0.6801, + "step": 8580 + }, + { + "epoch": 
1.1, + "learning_rate": 0.00013542016264360396, + "loss": 0.6927, + "step": 8600 + }, + { + "epoch": 1.1, + "eval_loss": 0.6984953880310059, + "eval_runtime": 177.3114, + "eval_samples_per_second": 11.28, + "eval_steps_per_second": 1.41, + "step": 8600 + }, + { + "epoch": 1.11, + "learning_rate": 0.00013503291596747127, + "loss": 0.688, + "step": 8620 + }, + { + "epoch": 1.11, + "learning_rate": 0.00013464566929133857, + "loss": 0.693, + "step": 8640 + }, + { + "epoch": 1.11, + "learning_rate": 0.00013425842261520588, + "loss": 0.6815, + "step": 8660 + }, + { + "epoch": 1.11, + "learning_rate": 0.00013387117593907319, + "loss": 0.6828, + "step": 8680 + }, + { + "epoch": 1.12, + "learning_rate": 0.00013348392926294046, + "loss": 0.6834, + "step": 8700 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001330966825868078, + "loss": 0.6839, + "step": 8720 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001327094359106751, + "loss": 0.6838, + "step": 8740 + }, + { + "epoch": 1.12, + "learning_rate": 0.00013232218923454238, + "loss": 0.6847, + "step": 8760 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001319349425584097, + "loss": 0.6894, + "step": 8780 + }, + { + "epoch": 1.13, + "learning_rate": 0.000131547695882277, + "loss": 0.6922, + "step": 8800 + }, + { + "epoch": 1.13, + "eval_loss": 0.698199450969696, + "eval_runtime": 177.0428, + "eval_samples_per_second": 11.297, + "eval_steps_per_second": 1.412, + "step": 8800 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001311604492061443, + "loss": 0.6904, + "step": 8820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001307732025300116, + "loss": 0.6854, + "step": 8840 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001303859558538789, + "loss": 0.6877, + "step": 8860 + }, + { + "epoch": 1.14, + "learning_rate": 0.00012999870917774622, + "loss": 0.6857, + "step": 8880 + }, + { + "epoch": 1.14, + "learning_rate": 0.00012961146250161352, + "loss": 0.6856, + "step": 8900 + }, + { + "epoch": 1.14, + "learning_rate": 
0.00012922421582548083, + "loss": 0.687, + "step": 8920 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001288369691493481, + "loss": 0.6905, + "step": 8940 + }, + { + "epoch": 1.15, + "learning_rate": 0.00012844972247321544, + "loss": 0.6865, + "step": 8960 + }, + { + "epoch": 1.15, + "learning_rate": 0.00012806247579708274, + "loss": 0.6829, + "step": 8980 + }, + { + "epoch": 1.15, + "learning_rate": 0.00012767522912095002, + "loss": 0.696, + "step": 9000 + }, + { + "epoch": 1.15, + "eval_loss": 0.6971157789230347, + "eval_runtime": 177.1296, + "eval_samples_per_second": 11.291, + "eval_steps_per_second": 1.411, + "step": 9000 + }, + { + "epoch": 1.16, + "learning_rate": 0.00012728798244481733, + "loss": 0.6825, + "step": 9020 + }, + { + "epoch": 1.16, + "learning_rate": 0.00012690073576868463, + "loss": 0.6844, + "step": 9040 + }, + { + "epoch": 1.16, + "learning_rate": 0.00012651348909255194, + "loss": 0.6853, + "step": 9060 + }, + { + "epoch": 1.16, + "learning_rate": 0.00012612624241641925, + "loss": 0.6889, + "step": 9080 + }, + { + "epoch": 1.17, + "learning_rate": 0.00012573899574028655, + "loss": 0.6848, + "step": 9100 + }, + { + "epoch": 1.17, + "learning_rate": 0.00012535174906415386, + "loss": 0.6953, + "step": 9120 + }, + { + "epoch": 1.17, + "learning_rate": 0.00012496450238802116, + "loss": 0.6944, + "step": 9140 + }, + { + "epoch": 1.17, + "learning_rate": 0.00012457725571188847, + "loss": 0.6893, + "step": 9160 + }, + { + "epoch": 1.18, + "learning_rate": 0.00012419000903575575, + "loss": 0.6831, + "step": 9180 + }, + { + "epoch": 1.18, + "learning_rate": 0.00012380276235962308, + "loss": 0.683, + "step": 9200 + }, + { + "epoch": 1.18, + "eval_loss": 0.6971254944801331, + "eval_runtime": 177.2118, + "eval_samples_per_second": 11.286, + "eval_steps_per_second": 1.411, + "step": 9200 + }, + { + "epoch": 1.18, + "learning_rate": 0.00012341551568349039, + "loss": 0.6782, + "step": 9220 + }, + { + "epoch": 1.19, + "learning_rate": 
0.00012302826900735766, + "loss": 0.6962, + "step": 9240 + }, + { + "epoch": 1.19, + "learning_rate": 0.00012264102233122497, + "loss": 0.6808, + "step": 9260 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001222537756550923, + "loss": 0.6931, + "step": 9280 + }, + { + "epoch": 1.19, + "learning_rate": 0.00012186652897895958, + "loss": 0.6878, + "step": 9300 + }, + { + "epoch": 1.2, + "learning_rate": 0.00012147928230282689, + "loss": 0.6855, + "step": 9320 + }, + { + "epoch": 1.2, + "learning_rate": 0.00012109203562669421, + "loss": 0.6865, + "step": 9340 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001207047889505615, + "loss": 0.6945, + "step": 9360 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001203175422744288, + "loss": 0.6862, + "step": 9380 + }, + { + "epoch": 1.21, + "learning_rate": 0.00011993029559829611, + "loss": 0.6932, + "step": 9400 + }, + { + "epoch": 1.21, + "eval_loss": 0.6963634490966797, + "eval_runtime": 177.2999, + "eval_samples_per_second": 11.28, + "eval_steps_per_second": 1.41, + "step": 9400 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001195430489221634, + "loss": 0.6793, + "step": 9420 + }, + { + "epoch": 1.21, + "learning_rate": 0.00011915580224603071, + "loss": 0.6827, + "step": 9440 + }, + { + "epoch": 1.21, + "learning_rate": 0.00011876855556989803, + "loss": 0.6836, + "step": 9460 + }, + { + "epoch": 1.22, + "learning_rate": 0.00011838130889376532, + "loss": 0.6876, + "step": 9480 + }, + { + "epoch": 1.22, + "learning_rate": 0.00011799406221763263, + "loss": 0.6949, + "step": 9500 + }, + { + "epoch": 1.22, + "learning_rate": 0.00011760681554149993, + "loss": 0.6809, + "step": 9520 + }, + { + "epoch": 1.22, + "learning_rate": 0.00011721956886536722, + "loss": 0.6923, + "step": 9540 + }, + { + "epoch": 1.23, + "learning_rate": 0.00011683232218923453, + "loss": 0.6975, + "step": 9560 + }, + { + "epoch": 1.23, + "learning_rate": 0.00011644507551310185, + "loss": 0.6981, + "step": 9580 + }, + { + "epoch": 1.23, + 
"learning_rate": 0.00011605782883696914, + "loss": 0.6843, + "step": 9600 + }, + { + "epoch": 1.23, + "eval_loss": 0.6958213448524475, + "eval_runtime": 177.1541, + "eval_samples_per_second": 11.29, + "eval_steps_per_second": 1.411, + "step": 9600 + }, + { + "epoch": 1.23, + "learning_rate": 0.00011567058216083645, + "loss": 0.6904, + "step": 9620 + }, + { + "epoch": 1.24, + "learning_rate": 0.00011528333548470375, + "loss": 0.6761, + "step": 9640 + }, + { + "epoch": 1.24, + "learning_rate": 0.00011489608880857104, + "loss": 0.6933, + "step": 9660 + }, + { + "epoch": 1.24, + "learning_rate": 0.00011450884213243835, + "loss": 0.6913, + "step": 9680 + }, + { + "epoch": 1.24, + "learning_rate": 0.00011412159545630567, + "loss": 0.6958, + "step": 9700 + }, + { + "epoch": 1.25, + "learning_rate": 0.00011373434878017296, + "loss": 0.6902, + "step": 9720 + }, + { + "epoch": 1.25, + "learning_rate": 0.00011334710210404027, + "loss": 0.6796, + "step": 9740 + }, + { + "epoch": 1.25, + "learning_rate": 0.00011295985542790757, + "loss": 0.6906, + "step": 9760 + }, + { + "epoch": 1.25, + "learning_rate": 0.00011257260875177487, + "loss": 0.6882, + "step": 9780 + }, + { + "epoch": 1.26, + "learning_rate": 0.00011218536207564217, + "loss": 0.6856, + "step": 9800 + }, + { + "epoch": 1.26, + "eval_loss": 0.6958709359169006, + "eval_runtime": 177.1197, + "eval_samples_per_second": 11.292, + "eval_steps_per_second": 1.411, + "step": 9800 + }, + { + "epoch": 1.26, + "learning_rate": 0.00011179811539950949, + "loss": 0.681, + "step": 9820 + }, + { + "epoch": 1.26, + "learning_rate": 0.00011141086872337678, + "loss": 0.6824, + "step": 9840 + }, + { + "epoch": 1.26, + "learning_rate": 0.00011102362204724409, + "loss": 0.6921, + "step": 9860 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001106363753711114, + "loss": 0.6862, + "step": 9880 + }, + { + "epoch": 1.27, + "learning_rate": 0.00011024912869497869, + "loss": 0.6869, + "step": 9900 + }, + { + "epoch": 1.27, + "learning_rate": 
0.00010986188201884599, + "loss": 0.6867, + "step": 9920 + }, + { + "epoch": 1.27, + "learning_rate": 0.00010947463534271328, + "loss": 0.6885, + "step": 9940 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001090873886665806, + "loss": 0.6801, + "step": 9960 + }, + { + "epoch": 1.28, + "learning_rate": 0.00010870014199044791, + "loss": 0.684, + "step": 9980 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001083128953143152, + "loss": 0.6885, + "step": 10000 + }, + { + "epoch": 1.28, + "eval_loss": 0.6948391795158386, + "eval_runtime": 184.1668, + "eval_samples_per_second": 10.86, + "eval_steps_per_second": 1.357, + "step": 10000 + }, + { + "epoch": 1.29, + "learning_rate": 0.00010792564863818251, + "loss": 0.685, + "step": 10020 + }, + { + "epoch": 1.29, + "learning_rate": 0.00010753840196204981, + "loss": 0.6849, + "step": 10040 + }, + { + "epoch": 1.29, + "learning_rate": 0.00010715115528591712, + "loss": 0.6901, + "step": 10060 + }, + { + "epoch": 1.29, + "learning_rate": 0.00010678327094359106, + "loss": 0.6926, + "step": 10080 + }, + { + "epoch": 1.3, + "learning_rate": 0.00010639602426745837, + "loss": 0.6863, + "step": 10100 + }, + { + "epoch": 1.3, + "learning_rate": 0.00010600877759132567, + "loss": 0.6846, + "step": 10120 + }, + { + "epoch": 1.3, + "learning_rate": 0.00010562153091519296, + "loss": 0.6859, + "step": 10140 + }, + { + "epoch": 1.3, + "learning_rate": 0.00010523428423906027, + "loss": 0.6887, + "step": 10160 + }, + { + "epoch": 1.31, + "learning_rate": 0.00010484703756292759, + "loss": 0.689, + "step": 10180 + }, + { + "epoch": 1.31, + "learning_rate": 0.00010445979088679488, + "loss": 0.6795, + "step": 10200 + }, + { + "epoch": 1.31, + "eval_loss": 0.6949850916862488, + "eval_runtime": 177.1947, + "eval_samples_per_second": 11.287, + "eval_steps_per_second": 1.411, + "step": 10200 + }, + { + "epoch": 1.31, + "learning_rate": 0.00010407254421066219, + "loss": 0.6874, + "step": 10220 + }, + { + "epoch": 1.31, + "learning_rate": 
0.00010368529753452949, + "loss": 0.6875, + "step": 10240 + }, + { + "epoch": 1.32, + "learning_rate": 0.00010329805085839678, + "loss": 0.6842, + "step": 10260 + }, + { + "epoch": 1.32, + "learning_rate": 0.00010291080418226409, + "loss": 0.6842, + "step": 10280 + }, + { + "epoch": 1.32, + "learning_rate": 0.00010252355750613141, + "loss": 0.6834, + "step": 10300 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001021363108299987, + "loss": 0.6894, + "step": 10320 + }, + { + "epoch": 1.33, + "learning_rate": 0.00010174906415386601, + "loss": 0.6814, + "step": 10340 + }, + { + "epoch": 1.33, + "learning_rate": 0.00010136181747773331, + "loss": 0.6731, + "step": 10360 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001009745708016006, + "loss": 0.6955, + "step": 10380 + }, + { + "epoch": 1.33, + "learning_rate": 0.00010058732412546791, + "loss": 0.6884, + "step": 10400 + }, + { + "epoch": 1.33, + "eval_loss": 0.6947998404502869, + "eval_runtime": 183.6797, + "eval_samples_per_second": 10.889, + "eval_steps_per_second": 1.361, + "step": 10400 + }, + { + "epoch": 1.34, + "learning_rate": 0.00010020007744933523, + "loss": 0.6844, + "step": 10420 + }, + { + "epoch": 1.34, + "learning_rate": 9.981283077320252e-05, + "loss": 0.6795, + "step": 10440 + }, + { + "epoch": 1.34, + "learning_rate": 9.942558409706983e-05, + "loss": 0.6783, + "step": 10460 + }, + { + "epoch": 1.34, + "learning_rate": 9.903833742093713e-05, + "loss": 0.6951, + "step": 10480 + }, + { + "epoch": 1.35, + "learning_rate": 9.865109074480443e-05, + "loss": 0.6891, + "step": 10500 + }, + { + "epoch": 1.35, + "learning_rate": 9.826384406867173e-05, + "loss": 0.6899, + "step": 10520 + }, + { + "epoch": 1.35, + "learning_rate": 9.787659739253905e-05, + "loss": 0.6905, + "step": 10540 + }, + { + "epoch": 1.35, + "learning_rate": 9.748935071640634e-05, + "loss": 0.6786, + "step": 10560 + }, + { + "epoch": 1.36, + "learning_rate": 9.710210404027365e-05, + "loss": 0.6884, + "step": 10580 + }, + { + "epoch": 
1.36, + "learning_rate": 9.671485736414096e-05, + "loss": 0.6757, + "step": 10600 + }, + { + "epoch": 1.36, + "eval_loss": 0.6941512227058411, + "eval_runtime": 179.4312, + "eval_samples_per_second": 11.146, + "eval_steps_per_second": 1.393, + "step": 10600 + }, + { + "epoch": 1.36, + "learning_rate": 9.632761068800825e-05, + "loss": 0.6844, + "step": 10620 + }, + { + "epoch": 1.36, + "learning_rate": 9.594036401187555e-05, + "loss": 0.6829, + "step": 10640 + }, + { + "epoch": 1.37, + "learning_rate": 9.555311733574287e-05, + "loss": 0.7007, + "step": 10660 + }, + { + "epoch": 1.37, + "learning_rate": 9.516587065961016e-05, + "loss": 0.6989, + "step": 10680 + }, + { + "epoch": 1.37, + "learning_rate": 9.477862398347747e-05, + "loss": 0.6783, + "step": 10700 + }, + { + "epoch": 1.37, + "learning_rate": 9.439137730734476e-05, + "loss": 0.6783, + "step": 10720 + }, + { + "epoch": 1.38, + "learning_rate": 9.400413063121207e-05, + "loss": 0.6879, + "step": 10740 + }, + { + "epoch": 1.38, + "learning_rate": 9.361688395507937e-05, + "loss": 0.6801, + "step": 10760 + }, + { + "epoch": 1.38, + "learning_rate": 9.322963727894667e-05, + "loss": 0.6786, + "step": 10780 + }, + { + "epoch": 1.39, + "learning_rate": 9.284239060281399e-05, + "loss": 0.6882, + "step": 10800 + }, + { + "epoch": 1.39, + "eval_loss": 0.6937060356140137, + "eval_runtime": 178.6714, + "eval_samples_per_second": 11.194, + "eval_steps_per_second": 1.399, + "step": 10800 + }, + { + "epoch": 1.39, + "learning_rate": 9.245514392668129e-05, + "loss": 0.6796, + "step": 10820 + }, + { + "epoch": 1.39, + "learning_rate": 9.206789725054858e-05, + "loss": 0.6886, + "step": 10840 + }, + { + "epoch": 1.39, + "learning_rate": 9.168065057441589e-05, + "loss": 0.683, + "step": 10860 + }, + { + "epoch": 1.4, + "learning_rate": 9.129340389828321e-05, + "loss": 0.6786, + "step": 10880 + }, + { + "epoch": 1.4, + "learning_rate": 9.09061572221505e-05, + "loss": 0.6843, + "step": 10900 + }, + { + "epoch": 1.4, + 
"learning_rate": 9.05189105460178e-05, + "loss": 0.6795, + "step": 10920 + }, + { + "epoch": 1.4, + "learning_rate": 9.013166386988511e-05, + "loss": 0.6813, + "step": 10940 + }, + { + "epoch": 1.41, + "learning_rate": 8.97444171937524e-05, + "loss": 0.687, + "step": 10960 + }, + { + "epoch": 1.41, + "learning_rate": 8.935717051761971e-05, + "loss": 0.6872, + "step": 10980 + }, + { + "epoch": 1.41, + "learning_rate": 8.896992384148703e-05, + "loss": 0.6746, + "step": 11000 + }, + { + "epoch": 1.41, + "eval_loss": 0.6936533451080322, + "eval_runtime": 178.4177, + "eval_samples_per_second": 11.21, + "eval_steps_per_second": 1.401, + "step": 11000 + }, + { + "epoch": 1.41, + "learning_rate": 8.858267716535432e-05, + "loss": 0.6747, + "step": 11020 + }, + { + "epoch": 1.42, + "learning_rate": 8.819543048922163e-05, + "loss": 0.6975, + "step": 11040 + }, + { + "epoch": 1.42, + "learning_rate": 8.780818381308893e-05, + "loss": 0.6806, + "step": 11060 + }, + { + "epoch": 1.42, + "learning_rate": 8.742093713695623e-05, + "loss": 0.6789, + "step": 11080 + }, + { + "epoch": 1.42, + "learning_rate": 8.703369046082353e-05, + "loss": 0.6851, + "step": 11100 + }, + { + "epoch": 1.43, + "learning_rate": 8.664644378469085e-05, + "loss": 0.6836, + "step": 11120 + }, + { + "epoch": 1.43, + "learning_rate": 8.625919710855814e-05, + "loss": 0.6891, + "step": 11140 + }, + { + "epoch": 1.43, + "learning_rate": 8.587195043242545e-05, + "loss": 0.6898, + "step": 11160 + }, + { + "epoch": 1.43, + "learning_rate": 8.548470375629275e-05, + "loss": 0.6798, + "step": 11180 + }, + { + "epoch": 1.44, + "learning_rate": 8.509745708016005e-05, + "loss": 0.6822, + "step": 11200 + }, + { + "epoch": 1.44, + "eval_loss": 0.6932746767997742, + "eval_runtime": 178.4279, + "eval_samples_per_second": 11.209, + "eval_steps_per_second": 1.401, + "step": 11200 + }, + { + "epoch": 1.44, + "learning_rate": 8.471021040402735e-05, + "loss": 0.6776, + "step": 11220 + }, + { + "epoch": 1.44, + "learning_rate": 
8.432296372789467e-05, + "loss": 0.6835, + "step": 11240 + }, + { + "epoch": 1.44, + "learning_rate": 8.393571705176196e-05, + "loss": 0.6852, + "step": 11260 + }, + { + "epoch": 1.45, + "learning_rate": 8.354847037562927e-05, + "loss": 0.686, + "step": 11280 + }, + { + "epoch": 1.45, + "learning_rate": 8.316122369949657e-05, + "loss": 0.6884, + "step": 11300 + }, + { + "epoch": 1.45, + "learning_rate": 8.277397702336387e-05, + "loss": 0.6811, + "step": 11320 + }, + { + "epoch": 1.45, + "learning_rate": 8.238673034723117e-05, + "loss": 0.6751, + "step": 11340 + }, + { + "epoch": 1.46, + "learning_rate": 8.199948367109849e-05, + "loss": 0.6837, + "step": 11360 + }, + { + "epoch": 1.46, + "learning_rate": 8.161223699496578e-05, + "loss": 0.6839, + "step": 11380 + }, + { + "epoch": 1.46, + "learning_rate": 8.122499031883309e-05, + "loss": 0.6804, + "step": 11400 + }, + { + "epoch": 1.46, + "eval_loss": 0.6925450563430786, + "eval_runtime": 177.1737, + "eval_samples_per_second": 11.288, + "eval_steps_per_second": 1.411, + "step": 11400 + }, + { + "epoch": 1.46, + "learning_rate": 8.08377436427004e-05, + "loss": 0.6885, + "step": 11420 + }, + { + "epoch": 1.47, + "learning_rate": 8.045049696656769e-05, + "loss": 0.6907, + "step": 11440 + }, + { + "epoch": 1.47, + "learning_rate": 8.0063250290435e-05, + "loss": 0.6868, + "step": 11460 + }, + { + "epoch": 1.47, + "learning_rate": 7.967600361430231e-05, + "loss": 0.6945, + "step": 11480 + }, + { + "epoch": 1.47, + "learning_rate": 7.92887569381696e-05, + "loss": 0.6851, + "step": 11500 + }, + { + "epoch": 1.48, + "learning_rate": 7.890151026203691e-05, + "loss": 0.6878, + "step": 11520 + }, + { + "epoch": 1.48, + "learning_rate": 7.851426358590422e-05, + "loss": 0.6955, + "step": 11540 + }, + { + "epoch": 1.48, + "learning_rate": 7.812701690977151e-05, + "loss": 0.6783, + "step": 11560 + }, + { + "epoch": 1.49, + "learning_rate": 7.773977023363881e-05, + "loss": 0.6857, + "step": 11580 + }, + { + "epoch": 1.49, + 
"learning_rate": 7.735252355750613e-05, + "loss": 0.6828, + "step": 11600 + }, + { + "epoch": 1.49, + "eval_loss": 0.6924574971199036, + "eval_runtime": 177.3841, + "eval_samples_per_second": 11.275, + "eval_steps_per_second": 1.409, + "step": 11600 + }, + { + "epoch": 1.49, + "learning_rate": 7.696527688137343e-05, + "loss": 0.6796, + "step": 11620 + }, + { + "epoch": 1.49, + "learning_rate": 7.657803020524073e-05, + "loss": 0.6823, + "step": 11640 + }, + { + "epoch": 1.5, + "learning_rate": 7.619078352910804e-05, + "loss": 0.6854, + "step": 11660 + }, + { + "epoch": 1.5, + "learning_rate": 7.580353685297533e-05, + "loss": 0.6797, + "step": 11680 + }, + { + "epoch": 1.5, + "learning_rate": 7.541629017684265e-05, + "loss": 0.6811, + "step": 11700 + }, + { + "epoch": 1.5, + "learning_rate": 7.502904350070995e-05, + "loss": 0.6775, + "step": 11720 + }, + { + "epoch": 1.51, + "learning_rate": 7.464179682457725e-05, + "loss": 0.687, + "step": 11740 + }, + { + "epoch": 1.51, + "learning_rate": 7.425455014844455e-05, + "loss": 0.6859, + "step": 11760 + }, + { + "epoch": 1.51, + "learning_rate": 7.386730347231186e-05, + "loss": 0.683, + "step": 11780 + }, + { + "epoch": 1.51, + "learning_rate": 7.348005679617916e-05, + "loss": 0.6812, + "step": 11800 + }, + { + "epoch": 1.51, + "eval_loss": 0.692126452922821, + "eval_runtime": 182.938, + "eval_samples_per_second": 10.933, + "eval_steps_per_second": 1.367, + "step": 11800 + }, + { + "epoch": 1.52, + "learning_rate": 7.309281012004647e-05, + "loss": 0.688, + "step": 11820 + }, + { + "epoch": 1.52, + "learning_rate": 7.270556344391376e-05, + "loss": 0.6774, + "step": 11840 + }, + { + "epoch": 1.52, + "learning_rate": 7.231831676778107e-05, + "loss": 0.6806, + "step": 11860 + }, + { + "epoch": 1.52, + "learning_rate": 7.193107009164837e-05, + "loss": 0.6756, + "step": 11880 + }, + { + "epoch": 1.53, + "learning_rate": 7.154382341551568e-05, + "loss": 0.6856, + "step": 11900 + }, + { + "epoch": 1.53, + "learning_rate": 
7.115657673938298e-05, + "loss": 0.6822, + "step": 11920 + }, + { + "epoch": 1.53, + "learning_rate": 7.076933006325029e-05, + "loss": 0.6769, + "step": 11940 + }, + { + "epoch": 1.53, + "learning_rate": 7.038208338711758e-05, + "loss": 0.6759, + "step": 11960 + }, + { + "epoch": 1.54, + "learning_rate": 6.999483671098489e-05, + "loss": 0.6854, + "step": 11980 + }, + { + "epoch": 1.54, + "learning_rate": 6.96075900348522e-05, + "loss": 0.6855, + "step": 12000 + }, + { + "epoch": 1.54, + "eval_loss": 0.6914573907852173, + "eval_runtime": 177.6919, + "eval_samples_per_second": 11.255, + "eval_steps_per_second": 1.407, + "step": 12000 + }, + { + "epoch": 1.54, + "learning_rate": 6.92203433587195e-05, + "loss": 0.6816, + "step": 12020 + }, + { + "epoch": 1.54, + "learning_rate": 6.88330966825868e-05, + "loss": 0.6801, + "step": 12040 + }, + { + "epoch": 1.55, + "learning_rate": 6.844585000645411e-05, + "loss": 0.6818, + "step": 12060 + }, + { + "epoch": 1.55, + "learning_rate": 6.80586033303214e-05, + "loss": 0.6808, + "step": 12080 + }, + { + "epoch": 1.55, + "learning_rate": 6.767135665418872e-05, + "loss": 0.6849, + "step": 12100 + }, + { + "epoch": 1.55, + "learning_rate": 6.728410997805601e-05, + "loss": 0.6902, + "step": 12120 + }, + { + "epoch": 1.56, + "learning_rate": 6.689686330192332e-05, + "loss": 0.6795, + "step": 12140 + }, + { + "epoch": 1.56, + "learning_rate": 6.652897895959726e-05, + "loss": 0.6783, + "step": 12160 + }, + { + "epoch": 1.56, + "learning_rate": 6.614173228346455e-05, + "loss": 0.6839, + "step": 12180 + }, + { + "epoch": 1.56, + "learning_rate": 6.575448560733186e-05, + "loss": 0.6875, + "step": 12200 + }, + { + "epoch": 1.56, + "eval_loss": 0.6915743947029114, + "eval_runtime": 177.7146, + "eval_samples_per_second": 11.254, + "eval_steps_per_second": 1.407, + "step": 12200 + }, + { + "epoch": 1.57, + "learning_rate": 6.536723893119917e-05, + "loss": 0.687, + "step": 12220 + }, + { + "epoch": 1.57, + "learning_rate": 
6.497999225506647e-05, + "loss": 0.683, + "step": 12240 + }, + { + "epoch": 1.57, + "learning_rate": 6.459274557893378e-05, + "loss": 0.6796, + "step": 12260 + }, + { + "epoch": 1.57, + "learning_rate": 6.420549890280108e-05, + "loss": 0.6757, + "step": 12280 + }, + { + "epoch": 1.58, + "learning_rate": 6.381825222666838e-05, + "loss": 0.6901, + "step": 12300 + }, + { + "epoch": 1.58, + "learning_rate": 6.343100555053568e-05, + "loss": 0.6734, + "step": 12320 + }, + { + "epoch": 1.58, + "learning_rate": 6.304375887440299e-05, + "loss": 0.6863, + "step": 12340 + }, + { + "epoch": 1.59, + "learning_rate": 6.265651219827029e-05, + "loss": 0.6769, + "step": 12360 + }, + { + "epoch": 1.59, + "learning_rate": 6.22692655221376e-05, + "loss": 0.678, + "step": 12380 + }, + { + "epoch": 1.59, + "learning_rate": 6.18820188460049e-05, + "loss": 0.6934, + "step": 12400 + }, + { + "epoch": 1.59, + "eval_loss": 0.6910441517829895, + "eval_runtime": 177.091, + "eval_samples_per_second": 11.294, + "eval_steps_per_second": 1.412, + "step": 12400 + }, + { + "epoch": 1.59, + "learning_rate": 6.14947721698722e-05, + "loss": 0.6925, + "step": 12420 + }, + { + "epoch": 1.6, + "learning_rate": 6.11075254937395e-05, + "loss": 0.6797, + "step": 12440 + }, + { + "epoch": 1.6, + "learning_rate": 6.0720278817606814e-05, + "loss": 0.6839, + "step": 12460 + }, + { + "epoch": 1.6, + "learning_rate": 6.0333032141474113e-05, + "loss": 0.6825, + "step": 12480 + }, + { + "epoch": 1.6, + "learning_rate": 5.994578546534141e-05, + "loss": 0.6816, + "step": 12500 + }, + { + "epoch": 1.61, + "learning_rate": 5.9558538789208725e-05, + "loss": 0.6844, + "step": 12520 + }, + { + "epoch": 1.61, + "learning_rate": 5.9171292113076024e-05, + "loss": 0.6833, + "step": 12540 + }, + { + "epoch": 1.61, + "learning_rate": 5.878404543694332e-05, + "loss": 0.6827, + "step": 12560 + }, + { + "epoch": 1.61, + "learning_rate": 5.8396798760810635e-05, + "loss": 0.6802, + "step": 12580 + }, + { + "epoch": 1.62, + 
"learning_rate": 5.8009552084677934e-05, + "loss": 0.6837, + "step": 12600 + }, + { + "epoch": 1.62, + "eval_loss": 0.690994381904602, + "eval_runtime": 177.3691, + "eval_samples_per_second": 11.276, + "eval_steps_per_second": 1.409, + "step": 12600 + }, + { + "epoch": 1.62, + "learning_rate": 5.762230540854523e-05, + "loss": 0.6781, + "step": 12620 + }, + { + "epoch": 1.62, + "learning_rate": 5.7235058732412546e-05, + "loss": 0.6816, + "step": 12640 + }, + { + "epoch": 1.62, + "learning_rate": 5.6847812056279845e-05, + "loss": 0.6823, + "step": 12660 + }, + { + "epoch": 1.63, + "learning_rate": 5.6460565380147144e-05, + "loss": 0.6824, + "step": 12680 + }, + { + "epoch": 1.63, + "learning_rate": 5.6073318704014456e-05, + "loss": 0.6786, + "step": 12700 + }, + { + "epoch": 1.63, + "learning_rate": 5.5686072027881755e-05, + "loss": 0.6822, + "step": 12720 + }, + { + "epoch": 1.63, + "learning_rate": 5.5298825351749054e-05, + "loss": 0.6792, + "step": 12740 + }, + { + "epoch": 1.64, + "learning_rate": 5.491157867561637e-05, + "loss": 0.6797, + "step": 12760 + }, + { + "epoch": 1.64, + "learning_rate": 5.4524331999483666e-05, + "loss": 0.6814, + "step": 12780 + }, + { + "epoch": 1.64, + "learning_rate": 5.4137085323350965e-05, + "loss": 0.6827, + "step": 12800 + }, + { + "epoch": 1.64, + "eval_loss": 0.6906899809837341, + "eval_runtime": 177.4169, + "eval_samples_per_second": 11.273, + "eval_steps_per_second": 1.409, + "step": 12800 + }, + { + "epoch": 1.64, + "learning_rate": 5.374983864721828e-05, + "loss": 0.6776, + "step": 12820 + }, + { + "epoch": 1.65, + "learning_rate": 5.3362591971085576e-05, + "loss": 0.6877, + "step": 12840 + }, + { + "epoch": 1.65, + "learning_rate": 5.297534529495288e-05, + "loss": 0.6786, + "step": 12860 + }, + { + "epoch": 1.65, + "learning_rate": 5.258809861882019e-05, + "loss": 0.6853, + "step": 12880 + }, + { + "epoch": 1.65, + "learning_rate": 5.2200851942687486e-05, + "loss": 0.6843, + "step": 12900 + }, + { + "epoch": 1.66, + 
"learning_rate": 5.181360526655479e-05, + "loss": 0.6872, + "step": 12920 + }, + { + "epoch": 1.66, + "learning_rate": 5.14263585904221e-05, + "loss": 0.6862, + "step": 12940 + }, + { + "epoch": 1.66, + "learning_rate": 5.10391119142894e-05, + "loss": 0.6804, + "step": 12960 + }, + { + "epoch": 1.66, + "learning_rate": 5.06518652381567e-05, + "loss": 0.6786, + "step": 12980 + }, + { + "epoch": 1.67, + "learning_rate": 5.026461856202401e-05, + "loss": 0.6839, + "step": 13000 + }, + { + "epoch": 1.67, + "eval_loss": 0.6902768015861511, + "eval_runtime": 177.1355, + "eval_samples_per_second": 11.291, + "eval_steps_per_second": 1.411, + "step": 13000 + }, + { + "epoch": 1.67, + "learning_rate": 4.987737188589131e-05, + "loss": 0.682, + "step": 13020 + }, + { + "epoch": 1.67, + "learning_rate": 4.949012520975861e-05, + "loss": 0.6835, + "step": 13040 + }, + { + "epoch": 1.68, + "learning_rate": 4.910287853362592e-05, + "loss": 0.6822, + "step": 13060 + }, + { + "epoch": 1.68, + "learning_rate": 4.871563185749322e-05, + "loss": 0.6827, + "step": 13080 + }, + { + "epoch": 1.68, + "learning_rate": 4.8328385181360523e-05, + "loss": 0.6744, + "step": 13100 + }, + { + "epoch": 1.68, + "learning_rate": 4.794113850522782e-05, + "loss": 0.6817, + "step": 13120 + }, + { + "epoch": 1.69, + "learning_rate": 4.755389182909513e-05, + "loss": 0.6765, + "step": 13140 + }, + { + "epoch": 1.69, + "learning_rate": 4.7166645152962434e-05, + "loss": 0.682, + "step": 13160 + }, + { + "epoch": 1.69, + "learning_rate": 4.677939847682973e-05, + "loss": 0.6849, + "step": 13180 + }, + { + "epoch": 1.69, + "learning_rate": 4.639215180069704e-05, + "loss": 0.686, + "step": 13200 + }, + { + "epoch": 1.69, + "eval_loss": 0.6899891495704651, + "eval_runtime": 183.0495, + "eval_samples_per_second": 10.926, + "eval_steps_per_second": 1.366, + "step": 13200 + }, + { + "epoch": 1.7, + "learning_rate": 4.6004905124564344e-05, + "loss": 0.6787, + "step": 13220 + }, + { + "epoch": 1.7, + "learning_rate": 
4.561765844843164e-05, + "loss": 0.6786, + "step": 13240 + }, + { + "epoch": 1.7, + "learning_rate": 4.523041177229895e-05, + "loss": 0.6913, + "step": 13260 + }, + { + "epoch": 1.7, + "learning_rate": 4.4843165096166255e-05, + "loss": 0.6721, + "step": 13280 + }, + { + "epoch": 1.71, + "learning_rate": 4.4455918420033554e-05, + "loss": 0.6783, + "step": 13300 + }, + { + "epoch": 1.71, + "learning_rate": 4.4068671743900866e-05, + "loss": 0.687, + "step": 13320 + }, + { + "epoch": 1.71, + "learning_rate": 4.3681425067768165e-05, + "loss": 0.6815, + "step": 13340 + }, + { + "epoch": 1.71, + "learning_rate": 4.3294178391635464e-05, + "loss": 0.6786, + "step": 13360 + }, + { + "epoch": 1.72, + "learning_rate": 4.290693171550278e-05, + "loss": 0.6845, + "step": 13380 + }, + { + "epoch": 1.72, + "learning_rate": 4.2519685039370076e-05, + "loss": 0.6817, + "step": 13400 + }, + { + "epoch": 1.72, + "eval_loss": 0.6897545456886292, + "eval_runtime": 177.4673, + "eval_samples_per_second": 11.27, + "eval_steps_per_second": 1.409, + "step": 13400 + }, + { + "epoch": 1.72, + "learning_rate": 4.2132438363237375e-05, + "loss": 0.6783, + "step": 13420 + }, + { + "epoch": 1.72, + "learning_rate": 4.174519168710469e-05, + "loss": 0.6775, + "step": 13440 + }, + { + "epoch": 1.73, + "learning_rate": 4.1357945010971986e-05, + "loss": 0.6752, + "step": 13460 + }, + { + "epoch": 1.73, + "learning_rate": 4.0970698334839285e-05, + "loss": 0.6732, + "step": 13480 + }, + { + "epoch": 1.73, + "learning_rate": 4.05834516587066e-05, + "loss": 0.6854, + "step": 13500 + }, + { + "epoch": 1.73, + "learning_rate": 4.0196204982573896e-05, + "loss": 0.6814, + "step": 13520 + }, + { + "epoch": 1.74, + "learning_rate": 3.9808958306441195e-05, + "loss": 0.6832, + "step": 13540 + }, + { + "epoch": 1.74, + "learning_rate": 3.942171163030851e-05, + "loss": 0.6764, + "step": 13560 + }, + { + "epoch": 1.74, + "learning_rate": 3.903446495417581e-05, + "loss": 0.6797, + "step": 13580 + }, + { + "epoch": 1.74, 
+ "learning_rate": 3.8647218278043106e-05, + "loss": 0.6786, + "step": 13600 + }, + { + "epoch": 1.74, + "eval_loss": 0.6894007325172424, + "eval_runtime": 177.6931, + "eval_samples_per_second": 11.255, + "eval_steps_per_second": 1.407, + "step": 13600 + }, + { + "epoch": 1.75, + "learning_rate": 3.825997160191042e-05, + "loss": 0.6878, + "step": 13620 + }, + { + "epoch": 1.75, + "learning_rate": 3.787272492577772e-05, + "loss": 0.6847, + "step": 13640 + }, + { + "epoch": 1.75, + "learning_rate": 3.748547824964502e-05, + "loss": 0.6773, + "step": 13660 + }, + { + "epoch": 1.75, + "learning_rate": 3.709823157351232e-05, + "loss": 0.6723, + "step": 13680 + }, + { + "epoch": 1.76, + "learning_rate": 3.671098489737963e-05, + "loss": 0.6849, + "step": 13700 + }, + { + "epoch": 1.76, + "learning_rate": 3.6323738221246933e-05, + "loss": 0.6788, + "step": 13720 + }, + { + "epoch": 1.76, + "learning_rate": 3.593649154511423e-05, + "loss": 0.6851, + "step": 13740 + }, + { + "epoch": 1.76, + "learning_rate": 3.554924486898154e-05, + "loss": 0.6842, + "step": 13760 + }, + { + "epoch": 1.77, + "learning_rate": 3.5161998192848844e-05, + "loss": 0.6763, + "step": 13780 + }, + { + "epoch": 1.77, + "learning_rate": 3.477475151671614e-05, + "loss": 0.6795, + "step": 13800 + }, + { + "epoch": 1.77, + "eval_loss": 0.6892591714859009, + "eval_runtime": 178.329, + "eval_samples_per_second": 11.215, + "eval_steps_per_second": 1.402, + "step": 13800 + }, + { + "epoch": 1.77, + "learning_rate": 3.438750484058345e-05, + "loss": 0.6804, + "step": 13820 + }, + { + "epoch": 1.78, + "learning_rate": 3.4000258164450754e-05, + "loss": 0.6873, + "step": 13840 + }, + { + "epoch": 1.78, + "learning_rate": 3.361301148831805e-05, + "loss": 0.6783, + "step": 13860 + }, + { + "epoch": 1.78, + "learning_rate": 3.322576481218536e-05, + "loss": 0.6843, + "step": 13880 + }, + { + "epoch": 1.78, + "learning_rate": 3.2838518136052665e-05, + "loss": 0.6755, + "step": 13900 + }, + { + "epoch": 1.79, + 
"learning_rate": 3.2451271459919964e-05, + "loss": 0.684, + "step": 13920 + }, + { + "epoch": 1.79, + "learning_rate": 3.206402478378727e-05, + "loss": 0.6828, + "step": 13940 + }, + { + "epoch": 1.79, + "learning_rate": 3.167677810765457e-05, + "loss": 0.6771, + "step": 13960 + }, + { + "epoch": 1.79, + "learning_rate": 3.1289531431521874e-05, + "loss": 0.6818, + "step": 13980 + }, + { + "epoch": 1.8, + "learning_rate": 3.090228475538918e-05, + "loss": 0.6751, + "step": 14000 + }, + { + "epoch": 1.8, + "eval_loss": 0.6889638304710388, + "eval_runtime": 178.8668, + "eval_samples_per_second": 11.182, + "eval_steps_per_second": 1.398, + "step": 14000 + }, + { + "epoch": 1.8, + "learning_rate": 3.0515038079256482e-05, + "loss": 0.6794, + "step": 14020 + }, + { + "epoch": 1.8, + "learning_rate": 3.0127791403123788e-05, + "loss": 0.6652, + "step": 14040 + }, + { + "epoch": 1.8, + "learning_rate": 2.974054472699109e-05, + "loss": 0.6746, + "step": 14060 + }, + { + "epoch": 1.81, + "learning_rate": 2.9353298050858393e-05, + "loss": 0.6847, + "step": 14080 + }, + { + "epoch": 1.81, + "learning_rate": 2.89660513747257e-05, + "loss": 0.6733, + "step": 14100 + }, + { + "epoch": 1.81, + "learning_rate": 2.8578804698593004e-05, + "loss": 0.6794, + "step": 14120 + }, + { + "epoch": 1.81, + "learning_rate": 2.8191558022460303e-05, + "loss": 0.6748, + "step": 14140 + }, + { + "epoch": 1.82, + "learning_rate": 2.780431134632761e-05, + "loss": 0.6668, + "step": 14160 + }, + { + "epoch": 1.82, + "learning_rate": 2.7417064670194915e-05, + "loss": 0.6845, + "step": 14180 + }, + { + "epoch": 1.82, + "learning_rate": 2.7029817994062214e-05, + "loss": 0.6819, + "step": 14200 + }, + { + "epoch": 1.82, + "eval_loss": 0.68879234790802, + "eval_runtime": 178.3854, + "eval_samples_per_second": 11.212, + "eval_steps_per_second": 1.401, + "step": 14200 + }, + { + "epoch": 1.82, + "learning_rate": 2.6661933651736152e-05, + "loss": 0.6857, + "step": 14220 + }, + { + "epoch": 1.83, + 
"learning_rate": 2.6274686975603458e-05, + "loss": 0.6837, + "step": 14240 + }, + { + "epoch": 1.83, + "learning_rate": 2.588744029947076e-05, + "loss": 0.679, + "step": 14260 + }, + { + "epoch": 1.83, + "learning_rate": 2.5500193623338062e-05, + "loss": 0.6809, + "step": 14280 + }, + { + "epoch": 1.83, + "learning_rate": 2.5112946947205368e-05, + "loss": 0.683, + "step": 14300 + }, + { + "epoch": 1.84, + "learning_rate": 2.4725700271072674e-05, + "loss": 0.6787, + "step": 14320 + }, + { + "epoch": 1.84, + "learning_rate": 2.4338453594939973e-05, + "loss": 0.6842, + "step": 14340 + }, + { + "epoch": 1.84, + "learning_rate": 2.395120691880728e-05, + "loss": 0.682, + "step": 14360 + }, + { + "epoch": 1.84, + "learning_rate": 2.3563960242674584e-05, + "loss": 0.6751, + "step": 14380 + }, + { + "epoch": 1.85, + "learning_rate": 2.3176713566541883e-05, + "loss": 0.682, + "step": 14400 + }, + { + "epoch": 1.85, + "eval_loss": 0.6884602308273315, + "eval_runtime": 180.582, + "eval_samples_per_second": 11.075, + "eval_steps_per_second": 1.384, + "step": 14400 + }, + { + "epoch": 1.85, + "learning_rate": 2.278946689040919e-05, + "loss": 0.6728, + "step": 14420 + }, + { + "epoch": 1.85, + "learning_rate": 2.2402220214276495e-05, + "loss": 0.6839, + "step": 14440 + }, + { + "epoch": 1.85, + "learning_rate": 2.2014973538143794e-05, + "loss": 0.6828, + "step": 14460 + }, + { + "epoch": 1.86, + "learning_rate": 2.16277268620111e-05, + "loss": 0.6752, + "step": 14480 + }, + { + "epoch": 1.86, + "learning_rate": 2.1240480185878405e-05, + "loss": 0.682, + "step": 14500 + }, + { + "epoch": 1.86, + "learning_rate": 2.0853233509745704e-05, + "loss": 0.6802, + "step": 14520 + }, + { + "epoch": 1.86, + "learning_rate": 2.046598683361301e-05, + "loss": 0.6809, + "step": 14540 + }, + { + "epoch": 1.87, + "learning_rate": 2.0078740157480316e-05, + "loss": 0.6802, + "step": 14560 + }, + { + "epoch": 1.87, + "learning_rate": 1.9691493481347615e-05, + "loss": 0.6763, + "step": 14580 + }, + { 
+ "epoch": 1.87, + "learning_rate": 1.930424680521492e-05, + "loss": 0.6876, + "step": 14600 + }, + { + "epoch": 1.87, + "eval_loss": 0.6882807612419128, + "eval_runtime": 177.6338, + "eval_samples_per_second": 11.259, + "eval_steps_per_second": 1.407, + "step": 14600 + }, + { + "epoch": 1.88, + "learning_rate": 1.8917000129082226e-05, + "loss": 0.6789, + "step": 14620 + }, + { + "epoch": 1.88, + "learning_rate": 1.8529753452949525e-05, + "loss": 0.6772, + "step": 14640 + }, + { + "epoch": 1.88, + "learning_rate": 1.814250677681683e-05, + "loss": 0.6783, + "step": 14660 + }, + { + "epoch": 1.88, + "learning_rate": 1.7755260100684133e-05, + "loss": 0.6829, + "step": 14680 + }, + { + "epoch": 1.89, + "learning_rate": 1.736801342455144e-05, + "loss": 0.6809, + "step": 14700 + }, + { + "epoch": 1.89, + "learning_rate": 1.698076674841874e-05, + "loss": 0.6766, + "step": 14720 + }, + { + "epoch": 1.89, + "learning_rate": 1.6593520072286043e-05, + "loss": 0.6813, + "step": 14740 + }, + { + "epoch": 1.89, + "learning_rate": 1.620627339615335e-05, + "loss": 0.6793, + "step": 14760 + }, + { + "epoch": 1.9, + "learning_rate": 1.581902672002065e-05, + "loss": 0.6736, + "step": 14780 + }, + { + "epoch": 1.9, + "learning_rate": 1.5431780043887957e-05, + "loss": 0.6842, + "step": 14800 + }, + { + "epoch": 1.9, + "eval_loss": 0.6880614757537842, + "eval_runtime": 177.7541, + "eval_samples_per_second": 11.251, + "eval_steps_per_second": 1.406, + "step": 14800 + }, + { + "epoch": 1.9, + "learning_rate": 1.5044533367755258e-05, + "loss": 0.682, + "step": 14820 + }, + { + "epoch": 1.9, + "learning_rate": 1.4657286691622562e-05, + "loss": 0.6776, + "step": 14840 + }, + { + "epoch": 1.91, + "learning_rate": 1.4270040015489866e-05, + "loss": 0.6776, + "step": 14860 + }, + { + "epoch": 1.91, + "learning_rate": 1.3882793339357168e-05, + "loss": 0.6717, + "step": 14880 + }, + { + "epoch": 1.91, + "learning_rate": 1.3495546663224472e-05, + "loss": 0.6871, + "step": 14900 + }, + { + "epoch": 
1.91, + "learning_rate": 1.3108299987091776e-05, + "loss": 0.6872, + "step": 14920 + }, + { + "epoch": 1.92, + "learning_rate": 1.272105331095908e-05, + "loss": 0.6833, + "step": 14940 + }, + { + "epoch": 1.92, + "learning_rate": 1.2333806634826383e-05, + "loss": 0.6926, + "step": 14960 + }, + { + "epoch": 1.92, + "learning_rate": 1.1946559958693687e-05, + "loss": 0.6741, + "step": 14980 + }, + { + "epoch": 1.92, + "learning_rate": 1.1559313282560991e-05, + "loss": 0.6756, + "step": 15000 + }, + { + "epoch": 1.92, + "eval_loss": 0.6879639625549316, + "eval_runtime": 177.6222, + "eval_samples_per_second": 11.26, + "eval_steps_per_second": 1.407, + "step": 15000 + }, + { + "epoch": 1.93, + "learning_rate": 1.1172066606428293e-05, + "loss": 0.6803, + "step": 15020 + }, + { + "epoch": 1.93, + "learning_rate": 1.0784819930295599e-05, + "loss": 0.6753, + "step": 15040 + }, + { + "epoch": 1.93, + "learning_rate": 1.0397573254162901e-05, + "loss": 0.6749, + "step": 15060 + }, + { + "epoch": 1.93, + "learning_rate": 1.0010326578030204e-05, + "loss": 0.6792, + "step": 15080 + }, + { + "epoch": 1.94, + "learning_rate": 9.623079901897508e-06, + "loss": 0.6798, + "step": 15100 + }, + { + "epoch": 1.94, + "learning_rate": 9.235833225764812e-06, + "loss": 0.6838, + "step": 15120 + }, + { + "epoch": 1.94, + "learning_rate": 8.848586549632114e-06, + "loss": 0.6785, + "step": 15140 + }, + { + "epoch": 1.94, + "learning_rate": 8.461339873499418e-06, + "loss": 0.6858, + "step": 15160 + }, + { + "epoch": 1.95, + "learning_rate": 8.074093197366722e-06, + "loss": 0.6768, + "step": 15180 + }, + { + "epoch": 1.95, + "learning_rate": 7.686846521234025e-06, + "loss": 0.6891, + "step": 15200 + }, + { + "epoch": 1.95, + "eval_loss": 0.6878132224082947, + "eval_runtime": 177.6408, + "eval_samples_per_second": 11.259, + "eval_steps_per_second": 1.407, + "step": 15200 + } + ], + "max_steps": 15594, + "num_train_epochs": 2, + "total_flos": 1.975803247494955e+19, + "trial_name": null, + 
"trial_params": null +} diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/training_args.bin b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4820a8207e6b86d4107eb87f94c763093a3c7f88 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53d327ea9f712be818d41a24603cd835992a4e9e3612a85caf2415ab699d6a50 +size 3579 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/optimizer.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9e8026a873794c21021630494ca96ccaf5d1ad2 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcc458a10fe82dcee9e9432f212482eba7f9f47f4dbb32c60e5694dcb32c4700 +size 33629893 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/pytorch_model.bin b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..6cc46b154b8db0c0dfec398c00539f0bb72e0b13 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfeed223c297fd668bb441bed81e81acc8a3a709064382564a31530dc6cf16ad +size 16822989 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/rng_state.pth b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/rng_state.pth new file mode 100644 index 
0000000000000000000000000000000000000000..16c7cad4d4fcc4fe10514d511064775f4bb9a27f --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b8e5f28f76ff581dd7c3383f28cad3b8c2550aa95227479150b8b3923f20b65 +size 14575 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/scaler.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba4bf721dc96783dc4ab0b3e625823b76be9faa7 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a9cb88a802f1d63caf0e00686fa31352eea172dec8bde9402f7e84b125392b5 +size 557 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/scheduler.pt b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..924953532240e2cf316ec6c6aedd9dbfaab51850 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:830f71d8c3f7256fb80a514414120d9d9ac9b5dd13eedf2bd68d1a37a260efc8 +size 627 diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/trainer_state.json b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f180e82adf9b4c6ea03f71c7a5224fe3c8ebc992 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/trainer_state.json @@ -0,0 +1,5252 @@ +{ + "best_metric": 0.6877534985542297, + "best_model_checkpoint": "lora-alpaca/checkpoint-15400", + "epoch": 
1.9751186353725791, + "global_step": 15400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 0.8988, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 0.7184, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 0.7227, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 0.00023999999999999998, + "loss": 0.7244, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003, + "loss": 0.7225, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002996127533238673, + "loss": 0.7183, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002992255066477346, + "loss": 0.7246, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029883825997160186, + "loss": 0.7334, + "step": 160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002984510132954692, + "loss": 0.7224, + "step": 180 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002980637666193365, + "loss": 0.7212, + "step": 200 + }, + { + "epoch": 0.03, + "eval_loss": 0.7327759861946106, + "eval_runtime": 178.5232, + "eval_samples_per_second": 11.203, + "eval_steps_per_second": 1.4, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002976765199432038, + "loss": 0.7298, + "step": 220 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002972892732670711, + "loss": 0.7275, + "step": 240 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002969020265909384, + "loss": 0.7265, + "step": 260 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002965147799148057, + "loss": 0.7285, + "step": 280 + }, + { + "epoch": 0.04, + "learning_rate": 0.000296127533238673, + "loss": 0.7218, + "step": 300 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002957402865625403, + "loss": 0.715, + "step": 320 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029535303988640764, + "loss": 0.7347, 
+ "step": 340 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002949657932102749, + "loss": 0.7228, + "step": 360 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029457854653414225, + "loss": 0.7198, + "step": 380 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029419129985800953, + "loss": 0.7196, + "step": 400 + }, + { + "epoch": 0.05, + "eval_loss": 0.7325090765953064, + "eval_runtime": 178.6018, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 1.4, + "step": 400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002938040531818768, + "loss": 0.7233, + "step": 420 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029341680650574414, + "loss": 0.7272, + "step": 440 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002930295598296114, + "loss": 0.7272, + "step": 460 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029264231315347876, + "loss": 0.7281, + "step": 480 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029225506647734603, + "loss": 0.7289, + "step": 500 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002918678198012133, + "loss": 0.7215, + "step": 520 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029148057312508065, + "loss": 0.7234, + "step": 540 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002910933264489479, + "loss": 0.7229, + "step": 560 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029070607977281526, + "loss": 0.7277, + "step": 580 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002903188330966826, + "loss": 0.7275, + "step": 600 + }, + { + "epoch": 0.08, + "eval_loss": 0.7319443821907043, + "eval_runtime": 178.6932, + "eval_samples_per_second": 11.192, + "eval_steps_per_second": 1.399, + "step": 600 + }, + { + "epoch": 0.08, + "learning_rate": 0.00028993158642054987, + "loss": 0.7195, + "step": 620 + }, + { + "epoch": 0.08, + "learning_rate": 0.00028954433974441715, + "loss": 0.723, + "step": 640 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002891570930682845, + "loss": 0.7337, + "step": 660 + }, + { + "epoch": 0.09, + 
"learning_rate": 0.00028876984639215176, + "loss": 0.7249, + "step": 680 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002883825997160191, + "loss": 0.7374, + "step": 700 + }, + { + "epoch": 0.09, + "learning_rate": 0.00028799535303988637, + "loss": 0.7144, + "step": 720 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002876081063637537, + "loss": 0.7213, + "step": 740 + }, + { + "epoch": 0.1, + "learning_rate": 0.000287220859687621, + "loss": 0.7215, + "step": 760 + }, + { + "epoch": 0.1, + "learning_rate": 0.00028683361301148826, + "loss": 0.7275, + "step": 780 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002864463663353556, + "loss": 0.7247, + "step": 800 + }, + { + "epoch": 0.1, + "eval_loss": 0.730565071105957, + "eval_runtime": 178.4749, + "eval_samples_per_second": 11.206, + "eval_steps_per_second": 1.401, + "step": 800 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002860591196592229, + "loss": 0.717, + "step": 820 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002856718729830902, + "loss": 0.7263, + "step": 840 + }, + { + "epoch": 0.11, + "learning_rate": 0.00028528462630695754, + "loss": 0.7188, + "step": 860 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002848973796308248, + "loss": 0.7241, + "step": 880 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002845101329546921, + "loss": 0.7257, + "step": 900 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028412288627855943, + "loss": 0.7315, + "step": 920 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002837356396024267, + "loss": 0.7239, + "step": 940 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028334839292629404, + "loss": 0.7219, + "step": 960 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002829611462501613, + "loss": 0.7257, + "step": 980 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002825738995740286, + "loss": 0.7287, + "step": 1000 + }, + { + "epoch": 0.13, + "eval_loss": 0.7289888858795166, + "eval_runtime": 178.5088, + "eval_samples_per_second": 11.204, + "eval_steps_per_second": 
1.4, + "step": 1000 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028218665289789593, + "loss": 0.7173, + "step": 1020 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028179940622176326, + "loss": 0.7185, + "step": 1040 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028141215954563054, + "loss": 0.7204, + "step": 1060 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002810249128694979, + "loss": 0.7301, + "step": 1080 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028063766619336515, + "loss": 0.7254, + "step": 1100 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028025041951723243, + "loss": 0.7212, + "step": 1120 + }, + { + "epoch": 0.15, + "learning_rate": 0.00027986317284109976, + "loss": 0.7265, + "step": 1140 + }, + { + "epoch": 0.15, + "learning_rate": 0.00027947592616496704, + "loss": 0.7201, + "step": 1160 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002790886794888344, + "loss": 0.7273, + "step": 1180 + }, + { + "epoch": 0.15, + "learning_rate": 0.00027870143281270165, + "loss": 0.72, + "step": 1200 + }, + { + "epoch": 0.15, + "eval_loss": 0.727615237236023, + "eval_runtime": 178.7045, + "eval_samples_per_second": 11.192, + "eval_steps_per_second": 1.399, + "step": 1200 + }, + { + "epoch": 0.16, + "learning_rate": 0.000278314186136569, + "loss": 0.7307, + "step": 1220 + }, + { + "epoch": 0.16, + "learning_rate": 0.00027792693946043627, + "loss": 0.7164, + "step": 1240 + }, + { + "epoch": 0.16, + "learning_rate": 0.00027753969278430354, + "loss": 0.7163, + "step": 1260 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002771524461081709, + "loss": 0.7127, + "step": 1280 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002767651994320382, + "loss": 0.7123, + "step": 1300 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002763779527559055, + "loss": 0.728, + "step": 1320 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002759907060797728, + "loss": 0.7263, + "step": 1340 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002756034594036401, + "loss": 
0.7188, + "step": 1360 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002752162127275074, + "loss": 0.7142, + "step": 1380 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002748289660513747, + "loss": 0.7215, + "step": 1400 + }, + { + "epoch": 0.18, + "eval_loss": 0.7257346510887146, + "eval_runtime": 178.9403, + "eval_samples_per_second": 11.177, + "eval_steps_per_second": 1.397, + "step": 1400 + }, + { + "epoch": 0.18, + "learning_rate": 0.000274441719375242, + "loss": 0.7262, + "step": 1420 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002740544726991093, + "loss": 0.7185, + "step": 1440 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002736672260229766, + "loss": 0.7093, + "step": 1460 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002732799793468439, + "loss": 0.7183, + "step": 1480 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002728927326707112, + "loss": 0.7006, + "step": 1500 + }, + { + "epoch": 0.19, + "learning_rate": 0.00027250548599457855, + "loss": 0.7168, + "step": 1520 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002721182393184458, + "loss": 0.7225, + "step": 1540 + }, + { + "epoch": 0.2, + "learning_rate": 0.00027173099264231316, + "loss": 0.7248, + "step": 1560 + }, + { + "epoch": 0.2, + "learning_rate": 0.00027134374596618044, + "loss": 0.7215, + "step": 1580 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002709564992900477, + "loss": 0.7179, + "step": 1600 + }, + { + "epoch": 0.21, + "eval_loss": 0.7245064377784729, + "eval_runtime": 177.8967, + "eval_samples_per_second": 11.242, + "eval_steps_per_second": 1.405, + "step": 1600 + }, + { + "epoch": 0.21, + "learning_rate": 0.00027056925261391505, + "loss": 0.7208, + "step": 1620 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002701820059377823, + "loss": 0.7213, + "step": 1640 + }, + { + "epoch": 0.21, + "learning_rate": 0.00026979475926164966, + "loss": 0.7221, + "step": 1660 + }, + { + "epoch": 0.22, + "learning_rate": 0.00026940751258551694, + "loss": 0.7207, + "step": 1680 + }, + { 
+ "epoch": 0.22, + "learning_rate": 0.00026902026590938427, + "loss": 0.7218, + "step": 1700 + }, + { + "epoch": 0.22, + "learning_rate": 0.00026863301923325155, + "loss": 0.718, + "step": 1720 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002682457725571189, + "loss": 0.7235, + "step": 1740 + }, + { + "epoch": 0.23, + "learning_rate": 0.00026785852588098616, + "loss": 0.7138, + "step": 1760 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002674712792048535, + "loss": 0.7178, + "step": 1780 + }, + { + "epoch": 0.23, + "learning_rate": 0.00026708403252872077, + "loss": 0.7127, + "step": 1800 + }, + { + "epoch": 0.23, + "eval_loss": 0.7234225869178772, + "eval_runtime": 177.3731, + "eval_samples_per_second": 11.276, + "eval_steps_per_second": 1.409, + "step": 1800 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002666967858525881, + "loss": 0.7214, + "step": 1820 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002663095391764554, + "loss": 0.7164, + "step": 1840 + }, + { + "epoch": 0.24, + "learning_rate": 0.00026592229250032266, + "loss": 0.7132, + "step": 1860 + }, + { + "epoch": 0.24, + "learning_rate": 0.00026553504582419, + "loss": 0.718, + "step": 1880 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002651477991480573, + "loss": 0.7088, + "step": 1900 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002647605524719246, + "loss": 0.7145, + "step": 1920 + }, + { + "epoch": 0.25, + "learning_rate": 0.00026437330579579194, + "loss": 0.7166, + "step": 1940 + }, + { + "epoch": 0.25, + "learning_rate": 0.00026398605911965916, + "loss": 0.7141, + "step": 1960 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002635988124435265, + "loss": 0.708, + "step": 1980 + }, + { + "epoch": 0.26, + "learning_rate": 0.00026321156576739383, + "loss": 0.7162, + "step": 2000 + }, + { + "epoch": 0.26, + "eval_loss": 0.7224385142326355, + "eval_runtime": 177.124, + "eval_samples_per_second": 11.292, + "eval_steps_per_second": 1.411, + "step": 2000 + }, + { + "epoch": 0.26, + 
"learning_rate": 0.0002628243190912611, + "loss": 0.7091, + "step": 2020 + }, + { + "epoch": 0.26, + "learning_rate": 0.00026243707241512844, + "loss": 0.7133, + "step": 2040 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002620498257389957, + "loss": 0.717, + "step": 2060 + }, + { + "epoch": 0.27, + "learning_rate": 0.000261662579062863, + "loss": 0.7246, + "step": 2080 + }, + { + "epoch": 0.27, + "learning_rate": 0.00026127533238673033, + "loss": 0.7169, + "step": 2100 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002608880857105976, + "loss": 0.7121, + "step": 2120 + }, + { + "epoch": 0.27, + "learning_rate": 0.00026050083903446494, + "loss": 0.719, + "step": 2140 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002601135923583322, + "loss": 0.7236, + "step": 2160 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025972634568219955, + "loss": 0.7154, + "step": 2180 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025933909900606683, + "loss": 0.7148, + "step": 2200 + }, + { + "epoch": 0.28, + "eval_loss": 0.7211937308311462, + "eval_runtime": 177.2195, + "eval_samples_per_second": 11.285, + "eval_steps_per_second": 1.411, + "step": 2200 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025895185232993416, + "loss": 0.7193, + "step": 2220 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025856460565380144, + "loss": 0.7046, + "step": 2240 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002581773589776688, + "loss": 0.7231, + "step": 2260 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025779011230153605, + "loss": 0.719, + "step": 2280 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025740286562540333, + "loss": 0.707, + "step": 2300 + }, + { + "epoch": 0.3, + "learning_rate": 0.00025701561894927067, + "loss": 0.7176, + "step": 2320 + }, + { + "epoch": 0.3, + "learning_rate": 0.00025662837227313794, + "loss": 0.7205, + "step": 2340 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002562411255970053, + "loss": 0.7162, + "step": 2360 + }, + { + "epoch": 0.31, + 
"learning_rate": 0.00025585387892087256, + "loss": 0.7223, + "step": 2380 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002554666322447399, + "loss": 0.7158, + "step": 2400 + }, + { + "epoch": 0.31, + "eval_loss": 0.7201904654502869, + "eval_runtime": 177.1891, + "eval_samples_per_second": 11.287, + "eval_steps_per_second": 1.411, + "step": 2400 + }, + { + "epoch": 0.31, + "learning_rate": 0.00025507938556860717, + "loss": 0.7155, + "step": 2420 + }, + { + "epoch": 0.31, + "learning_rate": 0.00025469213889247445, + "loss": 0.7141, + "step": 2440 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002543048922163418, + "loss": 0.7206, + "step": 2460 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002539176455402091, + "loss": 0.7147, + "step": 2480 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002535303988640764, + "loss": 0.7116, + "step": 2500 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002531431521879437, + "loss": 0.7161, + "step": 2520 + }, + { + "epoch": 0.33, + "learning_rate": 0.000252755905511811, + "loss": 0.7173, + "step": 2540 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002523686588356783, + "loss": 0.7159, + "step": 2560 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002519814121595456, + "loss": 0.7182, + "step": 2580 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002515941654834129, + "loss": 0.723, + "step": 2600 + }, + { + "epoch": 0.33, + "eval_loss": 0.7193037867546082, + "eval_runtime": 177.0591, + "eval_samples_per_second": 11.296, + "eval_steps_per_second": 1.412, + "step": 2600 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002512069188072802, + "loss": 0.717, + "step": 2620 + }, + { + "epoch": 0.34, + "learning_rate": 0.00025081967213114756, + "loss": 0.7164, + "step": 2640 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002504324254550148, + "loss": 0.7173, + "step": 2660 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002500451787788821, + "loss": 0.7154, + "step": 2680 + }, + { + "epoch": 0.35, + "learning_rate": 
0.00024965793210274945, + "loss": 0.7101, + "step": 2700 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002492706854266167, + "loss": 0.7088, + "step": 2720 + }, + { + "epoch": 0.35, + "learning_rate": 0.00024888343875048406, + "loss": 0.715, + "step": 2740 + }, + { + "epoch": 0.35, + "learning_rate": 0.00024849619207435134, + "loss": 0.7165, + "step": 2760 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002481089453982186, + "loss": 0.7094, + "step": 2780 + }, + { + "epoch": 0.36, + "learning_rate": 0.00024772169872208595, + "loss": 0.7116, + "step": 2800 + }, + { + "epoch": 0.36, + "eval_loss": 0.7178795337677002, + "eval_runtime": 177.1788, + "eval_samples_per_second": 11.288, + "eval_steps_per_second": 1.411, + "step": 2800 + }, + { + "epoch": 0.36, + "learning_rate": 0.00024733445204595323, + "loss": 0.7121, + "step": 2820 + }, + { + "epoch": 0.36, + "learning_rate": 0.00024694720536982056, + "loss": 0.7197, + "step": 2840 + }, + { + "epoch": 0.37, + "learning_rate": 0.00024655995869368784, + "loss": 0.7133, + "step": 2860 + }, + { + "epoch": 0.37, + "learning_rate": 0.00024617271201755517, + "loss": 0.7124, + "step": 2880 + }, + { + "epoch": 0.37, + "learning_rate": 0.00024578546534142245, + "loss": 0.7081, + "step": 2900 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002453982186652898, + "loss": 0.7105, + "step": 2920 + }, + { + "epoch": 0.38, + "learning_rate": 0.00024501097198915706, + "loss": 0.7005, + "step": 2940 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002446237253130244, + "loss": 0.7111, + "step": 2960 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002442364786368917, + "loss": 0.7035, + "step": 2980 + }, + { + "epoch": 0.38, + "learning_rate": 0.000243849231960759, + "loss": 0.7125, + "step": 3000 + }, + { + "epoch": 0.38, + "eval_loss": 0.7173203229904175, + "eval_runtime": 176.9402, + "eval_samples_per_second": 11.303, + "eval_steps_per_second": 1.413, + "step": 3000 + }, + { + "epoch": 0.39, + "learning_rate": 0.00024346198528462626, + 
"loss": 0.7143, + "step": 3020 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002430747386084936, + "loss": 0.7121, + "step": 3040 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002426874919323609, + "loss": 0.7093, + "step": 3060 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002423002452562282, + "loss": 0.711, + "step": 3080 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002419129985800955, + "loss": 0.7239, + "step": 3100 + }, + { + "epoch": 0.4, + "learning_rate": 0.00024152575190396281, + "loss": 0.7183, + "step": 3120 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002411385052278301, + "loss": 0.7056, + "step": 3140 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002407512585516974, + "loss": 0.7109, + "step": 3160 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002403640118755647, + "loss": 0.7183, + "step": 3180 + }, + { + "epoch": 0.41, + "learning_rate": 0.000239976765199432, + "loss": 0.7135, + "step": 3200 + }, + { + "epoch": 0.41, + "eval_loss": 0.7156603932380676, + "eval_runtime": 178.5954, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 1.4, + "step": 3200 + }, + { + "epoch": 0.41, + "learning_rate": 0.00023958951852329932, + "loss": 0.7022, + "step": 3220 + }, + { + "epoch": 0.42, + "learning_rate": 0.00023920227184716665, + "loss": 0.7155, + "step": 3240 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002388150251710339, + "loss": 0.7072, + "step": 3260 + }, + { + "epoch": 0.42, + "learning_rate": 0.00023842777849490123, + "loss": 0.7151, + "step": 3280 + }, + { + "epoch": 0.42, + "learning_rate": 0.00023804053181876854, + "loss": 0.7044, + "step": 3300 + }, + { + "epoch": 0.43, + "learning_rate": 0.00023765328514263584, + "loss": 0.7141, + "step": 3320 + }, + { + "epoch": 0.43, + "learning_rate": 0.00023726603846650315, + "loss": 0.7033, + "step": 3340 + }, + { + "epoch": 0.43, + "learning_rate": 0.00023687879179037046, + "loss": 0.7137, + "step": 3360 + }, + { + "epoch": 0.43, + "learning_rate": 0.00023649154511423773, + 
"loss": 0.7042, + "step": 3380 + }, + { + "epoch": 0.44, + "learning_rate": 0.00023610429843810504, + "loss": 0.7156, + "step": 3400 + }, + { + "epoch": 0.44, + "eval_loss": 0.7149476408958435, + "eval_runtime": 178.5798, + "eval_samples_per_second": 11.199, + "eval_steps_per_second": 1.4, + "step": 3400 + }, + { + "epoch": 0.44, + "learning_rate": 0.00023571705176197235, + "loss": 0.7045, + "step": 3420 + }, + { + "epoch": 0.44, + "learning_rate": 0.00023532980508583965, + "loss": 0.7021, + "step": 3440 + }, + { + "epoch": 0.44, + "learning_rate": 0.00023494255840970698, + "loss": 0.7092, + "step": 3460 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002345553117335743, + "loss": 0.7213, + "step": 3480 + }, + { + "epoch": 0.45, + "learning_rate": 0.00023416806505744157, + "loss": 0.7046, + "step": 3500 + }, + { + "epoch": 0.45, + "learning_rate": 0.00023378081838130887, + "loss": 0.7076, + "step": 3520 + }, + { + "epoch": 0.45, + "learning_rate": 0.00023339357170517618, + "loss": 0.7107, + "step": 3540 + }, + { + "epoch": 0.46, + "learning_rate": 0.00023300632502904349, + "loss": 0.7087, + "step": 3560 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002326190783529108, + "loss": 0.7005, + "step": 3580 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002322318316767781, + "loss": 0.7064, + "step": 3600 + }, + { + "epoch": 0.46, + "eval_loss": 0.7140311002731323, + "eval_runtime": 179.0415, + "eval_samples_per_second": 11.171, + "eval_steps_per_second": 1.396, + "step": 3600 + }, + { + "epoch": 0.46, + "learning_rate": 0.00023184458500064538, + "loss": 0.714, + "step": 3620 + }, + { + "epoch": 0.47, + "learning_rate": 0.00023145733832451268, + "loss": 0.7102, + "step": 3640 + }, + { + "epoch": 0.47, + "learning_rate": 0.00023107009164838, + "loss": 0.7202, + "step": 3660 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002306828449722473, + "loss": 0.7016, + "step": 3680 + }, + { + "epoch": 0.47, + "learning_rate": 0.00023029559829611463, + "loss": 0.7126, + "step": 
3700 + }, + { + "epoch": 0.48, + "learning_rate": 0.00022990835161998193, + "loss": 0.7055, + "step": 3720 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002295211049438492, + "loss": 0.7118, + "step": 3740 + }, + { + "epoch": 0.48, + "learning_rate": 0.00022913385826771652, + "loss": 0.707, + "step": 3760 + }, + { + "epoch": 0.48, + "learning_rate": 0.00022874661159158382, + "loss": 0.7119, + "step": 3780 + }, + { + "epoch": 0.49, + "learning_rate": 0.00022835936491545113, + "loss": 0.7023, + "step": 3800 + }, + { + "epoch": 0.49, + "eval_loss": 0.7134947776794434, + "eval_runtime": 179.7115, + "eval_samples_per_second": 11.129, + "eval_steps_per_second": 1.391, + "step": 3800 + }, + { + "epoch": 0.49, + "learning_rate": 0.00022797211823931843, + "loss": 0.6967, + "step": 3820 + }, + { + "epoch": 0.49, + "learning_rate": 0.00022758487156318574, + "loss": 0.7172, + "step": 3840 + }, + { + "epoch": 0.5, + "learning_rate": 0.00022719762488705302, + "loss": 0.7137, + "step": 3860 + }, + { + "epoch": 0.5, + "learning_rate": 0.00022681037821092032, + "loss": 0.7164, + "step": 3880 + }, + { + "epoch": 0.5, + "learning_rate": 0.00022642313153478763, + "loss": 0.7099, + "step": 3900 + }, + { + "epoch": 0.5, + "learning_rate": 0.00022603588485865494, + "loss": 0.7119, + "step": 3920 + }, + { + "epoch": 0.51, + "learning_rate": 0.00022564863818252227, + "loss": 0.7098, + "step": 3940 + }, + { + "epoch": 0.51, + "learning_rate": 0.00022526139150638957, + "loss": 0.7067, + "step": 3960 + }, + { + "epoch": 0.51, + "learning_rate": 0.00022487414483025685, + "loss": 0.705, + "step": 3980 + }, + { + "epoch": 0.51, + "learning_rate": 0.00022448689815412416, + "loss": 0.7125, + "step": 4000 + }, + { + "epoch": 0.51, + "eval_loss": 0.7128713130950928, + "eval_runtime": 178.8128, + "eval_samples_per_second": 11.185, + "eval_steps_per_second": 1.398, + "step": 4000 + }, + { + "epoch": 0.52, + "learning_rate": 0.00022409965147799146, + "loss": 0.7098, + "step": 4020 + }, + { + "epoch": 
0.52, + "learning_rate": 0.00022371240480185877, + "loss": 0.7081, + "step": 4040 + }, + { + "epoch": 0.52, + "learning_rate": 0.00022332515812572608, + "loss": 0.6982, + "step": 4060 + }, + { + "epoch": 0.52, + "learning_rate": 0.00022293791144959338, + "loss": 0.7122, + "step": 4080 + }, + { + "epoch": 0.53, + "learning_rate": 0.00022255066477346066, + "loss": 0.6974, + "step": 4100 + }, + { + "epoch": 0.53, + "learning_rate": 0.00022216341809732797, + "loss": 0.7018, + "step": 4120 + }, + { + "epoch": 0.53, + "learning_rate": 0.00022177617142119527, + "loss": 0.7075, + "step": 4140 + }, + { + "epoch": 0.53, + "learning_rate": 0.00022138892474506258, + "loss": 0.7013, + "step": 4160 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002210016780689299, + "loss": 0.7103, + "step": 4180 + }, + { + "epoch": 0.54, + "learning_rate": 0.00022061443139279722, + "loss": 0.7005, + "step": 4200 + }, + { + "epoch": 0.54, + "eval_loss": 0.7116231918334961, + "eval_runtime": 179.0816, + "eval_samples_per_second": 11.168, + "eval_steps_per_second": 1.396, + "step": 4200 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002202271847166645, + "loss": 0.7071, + "step": 4220 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002198399380405318, + "loss": 0.7114, + "step": 4240 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002194526913643991, + "loss": 0.705, + "step": 4260 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002190654446882664, + "loss": 0.706, + "step": 4280 + }, + { + "epoch": 0.55, + "learning_rate": 0.00021867819801213372, + "loss": 0.7016, + "step": 4300 + }, + { + "epoch": 0.55, + "learning_rate": 0.00021829095133600102, + "loss": 0.7084, + "step": 4320 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002179037046598683, + "loss": 0.7187, + "step": 4340 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002175164579837356, + "loss": 0.7044, + "step": 4360 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002171292113076029, + "loss": 0.7068, + "step": 4380 + }, + { + 
"epoch": 0.56, + "learning_rate": 0.00021674196463147025, + "loss": 0.7082, + "step": 4400 + }, + { + "epoch": 0.56, + "eval_loss": 0.7112064957618713, + "eval_runtime": 178.8847, + "eval_samples_per_second": 11.18, + "eval_steps_per_second": 1.398, + "step": 4400 + }, + { + "epoch": 0.57, + "learning_rate": 0.00021635471795533755, + "loss": 0.705, + "step": 4420 + }, + { + "epoch": 0.57, + "learning_rate": 0.00021596747127920486, + "loss": 0.704, + "step": 4440 + }, + { + "epoch": 0.57, + "learning_rate": 0.00021558022460307214, + "loss": 0.7071, + "step": 4460 + }, + { + "epoch": 0.57, + "learning_rate": 0.00021519297792693944, + "loss": 0.708, + "step": 4480 + }, + { + "epoch": 0.58, + "learning_rate": 0.00021480573125080675, + "loss": 0.705, + "step": 4500 + }, + { + "epoch": 0.58, + "learning_rate": 0.00021441848457467405, + "loss": 0.7061, + "step": 4520 + }, + { + "epoch": 0.58, + "learning_rate": 0.00021403123789854136, + "loss": 0.7074, + "step": 4540 + }, + { + "epoch": 0.58, + "learning_rate": 0.00021364399122240864, + "loss": 0.7148, + "step": 4560 + }, + { + "epoch": 0.59, + "learning_rate": 0.00021325674454627594, + "loss": 0.7091, + "step": 4580 + }, + { + "epoch": 0.59, + "learning_rate": 0.00021286949787014325, + "loss": 0.7103, + "step": 4600 + }, + { + "epoch": 0.59, + "eval_loss": 0.7104864716529846, + "eval_runtime": 178.4457, + "eval_samples_per_second": 11.208, + "eval_steps_per_second": 1.401, + "step": 4600 + }, + { + "epoch": 0.59, + "learning_rate": 0.00021248225119401055, + "loss": 0.706, + "step": 4620 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002120950045178779, + "loss": 0.6966, + "step": 4640 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002117077578417452, + "loss": 0.6991, + "step": 4660 + }, + { + "epoch": 0.6, + "learning_rate": 0.00021132051116561247, + "loss": 0.7039, + "step": 4680 + }, + { + "epoch": 0.6, + "learning_rate": 0.00021093326448947978, + "loss": 0.7059, + "step": 4700 + }, + { + "epoch": 0.61, + 
"learning_rate": 0.00021054601781334708, + "loss": 0.7122, + "step": 4720 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002101587711372144, + "loss": 0.7099, + "step": 4740 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002097715244610817, + "loss": 0.6998, + "step": 4760 + }, + { + "epoch": 0.61, + "learning_rate": 0.000209384277784949, + "loss": 0.7048, + "step": 4780 + }, + { + "epoch": 0.62, + "learning_rate": 0.00020899703110881628, + "loss": 0.7077, + "step": 4800 + }, + { + "epoch": 0.62, + "eval_loss": 0.7102417945861816, + "eval_runtime": 178.4169, + "eval_samples_per_second": 11.21, + "eval_steps_per_second": 1.401, + "step": 4800 + }, + { + "epoch": 0.62, + "learning_rate": 0.00020860978443268359, + "loss": 0.7172, + "step": 4820 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002082225377565509, + "loss": 0.7084, + "step": 4840 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002078352910804182, + "loss": 0.7058, + "step": 4860 + }, + { + "epoch": 0.63, + "learning_rate": 0.00020744804440428553, + "loss": 0.6988, + "step": 4880 + }, + { + "epoch": 0.63, + "learning_rate": 0.00020706079772815284, + "loss": 0.7008, + "step": 4900 + }, + { + "epoch": 0.63, + "learning_rate": 0.00020667355105202011, + "loss": 0.6986, + "step": 4920 + }, + { + "epoch": 0.63, + "learning_rate": 0.00020628630437588742, + "loss": 0.7042, + "step": 4940 + }, + { + "epoch": 0.64, + "learning_rate": 0.00020589905769975473, + "loss": 0.7139, + "step": 4960 + }, + { + "epoch": 0.64, + "learning_rate": 0.00020551181102362203, + "loss": 0.7094, + "step": 4980 + }, + { + "epoch": 0.64, + "learning_rate": 0.00020512456434748934, + "loss": 0.7059, + "step": 5000 + }, + { + "epoch": 0.64, + "eval_loss": 0.7092374563217163, + "eval_runtime": 177.3439, + "eval_samples_per_second": 11.278, + "eval_steps_per_second": 1.41, + "step": 5000 + }, + { + "epoch": 0.64, + "learning_rate": 0.00020473731767135664, + "loss": 0.7042, + "step": 5020 + }, + { + "epoch": 0.65, + "learning_rate": 
0.00020435007099522392, + "loss": 0.6964, + "step": 5040 + }, + { + "epoch": 0.65, + "learning_rate": 0.00020396282431909123, + "loss": 0.7041, + "step": 5060 + }, + { + "epoch": 0.65, + "learning_rate": 0.00020357557764295853, + "loss": 0.6972, + "step": 5080 + }, + { + "epoch": 0.65, + "learning_rate": 0.00020318833096682587, + "loss": 0.7011, + "step": 5100 + }, + { + "epoch": 0.66, + "learning_rate": 0.00020280108429069317, + "loss": 0.7073, + "step": 5120 + }, + { + "epoch": 0.66, + "learning_rate": 0.00020241383761456048, + "loss": 0.706, + "step": 5140 + }, + { + "epoch": 0.66, + "learning_rate": 0.00020202659093842776, + "loss": 0.6949, + "step": 5160 + }, + { + "epoch": 0.66, + "learning_rate": 0.00020163934426229506, + "loss": 0.703, + "step": 5180 + }, + { + "epoch": 0.67, + "learning_rate": 0.00020125209758616237, + "loss": 0.7058, + "step": 5200 + }, + { + "epoch": 0.67, + "eval_loss": 0.7084789276123047, + "eval_runtime": 177.3051, + "eval_samples_per_second": 11.28, + "eval_steps_per_second": 1.41, + "step": 5200 + }, + { + "epoch": 0.67, + "learning_rate": 0.00020086485091002967, + "loss": 0.7032, + "step": 5220 + }, + { + "epoch": 0.67, + "learning_rate": 0.00020047760423389698, + "loss": 0.7045, + "step": 5240 + }, + { + "epoch": 0.67, + "learning_rate": 0.00020009035755776428, + "loss": 0.7069, + "step": 5260 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019970311088163156, + "loss": 0.6961, + "step": 5280 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019931586420549887, + "loss": 0.6981, + "step": 5300 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019892861752936617, + "loss": 0.701, + "step": 5320 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001985413708532335, + "loss": 0.6979, + "step": 5340 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001981541241771008, + "loss": 0.6975, + "step": 5360 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019776687750096812, + "loss": 0.6923, + "step": 5380 + }, + { + "epoch": 0.69, + 
"learning_rate": 0.0001973796308248354, + "loss": 0.7089, + "step": 5400 + }, + { + "epoch": 0.69, + "eval_loss": 0.7074704170227051, + "eval_runtime": 177.1889, + "eval_samples_per_second": 11.287, + "eval_steps_per_second": 1.411, + "step": 5400 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001969923841487027, + "loss": 0.6986, + "step": 5420 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019660513747257, + "loss": 0.6991, + "step": 5440 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019621789079643731, + "loss": 0.6946, + "step": 5460 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019583064412030462, + "loss": 0.7057, + "step": 5480 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019544339744417193, + "loss": 0.694, + "step": 5500 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001950561507680392, + "loss": 0.7046, + "step": 5520 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001946689040919065, + "loss": 0.6998, + "step": 5540 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019428165741577382, + "loss": 0.6995, + "step": 5560 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019389441073964115, + "loss": 0.7081, + "step": 5580 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019350716406350845, + "loss": 0.6974, + "step": 5600 + }, + { + "epoch": 0.72, + "eval_loss": 0.7067714333534241, + "eval_runtime": 177.0363, + "eval_samples_per_second": 11.297, + "eval_steps_per_second": 1.412, + "step": 5600 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019311991738737576, + "loss": 0.7019, + "step": 5620 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019273267071124304, + "loss": 0.7003, + "step": 5640 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019234542403511034, + "loss": 0.6966, + "step": 5660 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019195817735897765, + "loss": 0.7055, + "step": 5680 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019157093068284496, + "loss": 0.7069, + "step": 5700 + }, + { + "epoch": 0.73, + "learning_rate": 
0.00019118368400671226, + "loss": 0.6981, + "step": 5720 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019079643733057957, + "loss": 0.7005, + "step": 5740 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019040919065444685, + "loss": 0.7033, + "step": 5760 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019002194397831415, + "loss": 0.7009, + "step": 5780 + }, + { + "epoch": 0.74, + "learning_rate": 0.00018963469730218146, + "loss": 0.7001, + "step": 5800 + }, + { + "epoch": 0.74, + "eval_loss": 0.7066617608070374, + "eval_runtime": 177.2571, + "eval_samples_per_second": 11.283, + "eval_steps_per_second": 1.41, + "step": 5800 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001892474506260488, + "loss": 0.7048, + "step": 5820 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001888602039499161, + "loss": 0.698, + "step": 5840 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001884729572737834, + "loss": 0.7035, + "step": 5860 + }, + { + "epoch": 0.75, + "learning_rate": 0.00018808571059765068, + "loss": 0.6997, + "step": 5880 + }, + { + "epoch": 0.76, + "learning_rate": 0.00018769846392151799, + "loss": 0.7053, + "step": 5900 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001873112172453853, + "loss": 0.6951, + "step": 5920 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001869239705692526, + "loss": 0.701, + "step": 5940 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001865367238931199, + "loss": 0.7032, + "step": 5960 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001861494772169872, + "loss": 0.7005, + "step": 5980 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001857622305408545, + "loss": 0.7013, + "step": 6000 + }, + { + "epoch": 0.77, + "eval_loss": 0.705744743347168, + "eval_runtime": 177.421, + "eval_samples_per_second": 11.273, + "eval_steps_per_second": 1.409, + "step": 6000 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001853749838647218, + "loss": 0.6923, + "step": 6020 + }, + { + "epoch": 0.77, + "learning_rate": 0.00018498773718858913, + 
"loss": 0.701, + "step": 6040 + }, + { + "epoch": 0.78, + "learning_rate": 0.00018460049051245643, + "loss": 0.7027, + "step": 6060 + }, + { + "epoch": 0.78, + "learning_rate": 0.00018421324383632374, + "loss": 0.6973, + "step": 6080 + }, + { + "epoch": 0.78, + "learning_rate": 0.00018382599716019104, + "loss": 0.6982, + "step": 6100 + }, + { + "epoch": 0.78, + "learning_rate": 0.00018343875048405832, + "loss": 0.7024, + "step": 6120 + }, + { + "epoch": 0.79, + "learning_rate": 0.00018305150380792563, + "loss": 0.6996, + "step": 6140 + }, + { + "epoch": 0.79, + "learning_rate": 0.00018266425713179293, + "loss": 0.7063, + "step": 6160 + }, + { + "epoch": 0.79, + "learning_rate": 0.00018227701045566024, + "loss": 0.7005, + "step": 6180 + }, + { + "epoch": 0.8, + "learning_rate": 0.00018188976377952755, + "loss": 0.6913, + "step": 6200 + }, + { + "epoch": 0.8, + "eval_loss": 0.7049428224563599, + "eval_runtime": 180.0706, + "eval_samples_per_second": 11.107, + "eval_steps_per_second": 1.388, + "step": 6200 + }, + { + "epoch": 0.8, + "learning_rate": 0.00018150251710339485, + "loss": 0.6998, + "step": 6220 + }, + { + "epoch": 0.8, + "learning_rate": 0.00018111527042726213, + "loss": 0.7044, + "step": 6240 + }, + { + "epoch": 0.8, + "learning_rate": 0.00018072802375112944, + "loss": 0.6988, + "step": 6260 + }, + { + "epoch": 0.81, + "learning_rate": 0.00018034077707499677, + "loss": 0.6979, + "step": 6280 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017995353039886407, + "loss": 0.7048, + "step": 6300 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017956628372273138, + "loss": 0.6941, + "step": 6320 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017917903704659869, + "loss": 0.6963, + "step": 6340 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017879179037046596, + "loss": 0.6954, + "step": 6360 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017840454369433327, + "loss": 0.6953, + "step": 6380 + }, + { + "epoch": 0.82, + "learning_rate": 
0.00017801729701820058, + "loss": 0.693, + "step": 6400 + }, + { + "epoch": 0.82, + "eval_loss": 0.7036707997322083, + "eval_runtime": 178.6236, + "eval_samples_per_second": 11.197, + "eval_steps_per_second": 1.4, + "step": 6400 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017763005034206788, + "loss": 0.7041, + "step": 6420 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001772428036659352, + "loss": 0.6908, + "step": 6440 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001768555569898025, + "loss": 0.6961, + "step": 6460 + }, + { + "epoch": 0.83, + "learning_rate": 0.00017646831031366977, + "loss": 0.6967, + "step": 6480 + }, + { + "epoch": 0.83, + "learning_rate": 0.00017608106363753708, + "loss": 0.7019, + "step": 6500 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001756938169614044, + "loss": 0.7036, + "step": 6520 + }, + { + "epoch": 0.84, + "learning_rate": 0.00017530657028527172, + "loss": 0.6941, + "step": 6540 + }, + { + "epoch": 0.84, + "learning_rate": 0.00017491932360913902, + "loss": 0.6995, + "step": 6560 + }, + { + "epoch": 0.84, + "learning_rate": 0.00017453207693300633, + "loss": 0.6962, + "step": 6580 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001741448302568736, + "loss": 0.6963, + "step": 6600 + }, + { + "epoch": 0.85, + "eval_loss": 0.7036789655685425, + "eval_runtime": 177.1424, + "eval_samples_per_second": 11.29, + "eval_steps_per_second": 1.411, + "step": 6600 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001737575835807409, + "loss": 0.7009, + "step": 6620 + }, + { + "epoch": 0.85, + "learning_rate": 0.00017337033690460822, + "loss": 0.6964, + "step": 6640 + }, + { + "epoch": 0.85, + "learning_rate": 0.00017298309022847552, + "loss": 0.6974, + "step": 6660 + }, + { + "epoch": 0.86, + "learning_rate": 0.00017259584355234283, + "loss": 0.6964, + "step": 6680 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001722085968762101, + "loss": 0.6966, + "step": 6700 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001718213502000774, + 
"loss": 0.7016, + "step": 6720 + }, + { + "epoch": 0.86, + "learning_rate": 0.00017143410352394475, + "loss": 0.6996, + "step": 6740 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017104685684781205, + "loss": 0.6985, + "step": 6760 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017065961017167936, + "loss": 0.7, + "step": 6780 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017027236349554666, + "loss": 0.6846, + "step": 6800 + }, + { + "epoch": 0.87, + "eval_loss": 0.7028091549873352, + "eval_runtime": 177.0434, + "eval_samples_per_second": 11.297, + "eval_steps_per_second": 1.412, + "step": 6800 + }, + { + "epoch": 0.87, + "learning_rate": 0.00016988511681941394, + "loss": 0.6994, + "step": 6820 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016949787014328125, + "loss": 0.6995, + "step": 6840 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016911062346714855, + "loss": 0.6949, + "step": 6860 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016872337679101586, + "loss": 0.6903, + "step": 6880 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016833613011488316, + "loss": 0.6983, + "step": 6900 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016794888343875047, + "loss": 0.6979, + "step": 6920 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016756163676261775, + "loss": 0.6963, + "step": 6940 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016717439008648505, + "loss": 0.6963, + "step": 6960 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001667871434103524, + "loss": 0.7109, + "step": 6980 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001663998967342197, + "loss": 0.6996, + "step": 7000 + }, + { + "epoch": 0.9, + "eval_loss": 0.7027884721755981, + "eval_runtime": 177.0464, + "eval_samples_per_second": 11.296, + "eval_steps_per_second": 1.412, + "step": 7000 + }, + { + "epoch": 0.9, + "learning_rate": 0.000166012650058087, + "loss": 0.6953, + "step": 7020 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001656254033819543, + "loss": 0.701, + "step": 7040 
+ }, + { + "epoch": 0.91, + "learning_rate": 0.00016523815670582158, + "loss": 0.6941, + "step": 7060 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001648509100296889, + "loss": 0.6946, + "step": 7080 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001644636633535562, + "loss": 0.6905, + "step": 7100 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001640764166774235, + "loss": 0.6938, + "step": 7120 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001636891700012908, + "loss": 0.6964, + "step": 7140 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001633019233251581, + "loss": 0.6979, + "step": 7160 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001629146766490254, + "loss": 0.6909, + "step": 7180 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001625274299728927, + "loss": 0.7017, + "step": 7200 + }, + { + "epoch": 0.92, + "eval_loss": 0.7013801336288452, + "eval_runtime": 177.3532, + "eval_samples_per_second": 11.277, + "eval_steps_per_second": 1.41, + "step": 7200 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016214018329676003, + "loss": 0.6986, + "step": 7220 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016175293662062734, + "loss": 0.6985, + "step": 7240 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016136568994449464, + "loss": 0.694, + "step": 7260 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016097844326836195, + "loss": 0.7124, + "step": 7280 + }, + { + "epoch": 0.94, + "learning_rate": 0.00016059119659222923, + "loss": 0.6936, + "step": 7300 + }, + { + "epoch": 0.94, + "learning_rate": 0.00016020394991609653, + "loss": 0.6904, + "step": 7320 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015981670323996384, + "loss": 0.6994, + "step": 7340 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015942945656383114, + "loss": 0.7046, + "step": 7360 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015904220988769845, + "loss": 0.6999, + "step": 7380 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015865496321156575, + "loss": 0.6952, + "step": 
7400 + }, + { + "epoch": 0.95, + "eval_loss": 0.7015686631202698, + "eval_runtime": 178.4607, + "eval_samples_per_second": 11.207, + "eval_steps_per_second": 1.401, + "step": 7400 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015826771653543303, + "loss": 0.7001, + "step": 7420 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015788046985930034, + "loss": 0.6948, + "step": 7440 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015749322318316767, + "loss": 0.702, + "step": 7460 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015710597650703498, + "loss": 0.695, + "step": 7480 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015671872983090228, + "loss": 0.7053, + "step": 7500 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001563314831547696, + "loss": 0.7041, + "step": 7520 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015594423647863687, + "loss": 0.6975, + "step": 7540 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015555698980250417, + "loss": 0.6914, + "step": 7560 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015516974312637148, + "loss": 0.6961, + "step": 7580 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015478249645023878, + "loss": 0.6968, + "step": 7600 + }, + { + "epoch": 0.97, + "eval_loss": 0.7004283666610718, + "eval_runtime": 178.8057, + "eval_samples_per_second": 11.185, + "eval_steps_per_second": 1.398, + "step": 7600 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001543952497741061, + "loss": 0.6847, + "step": 7620 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015400800309797342, + "loss": 0.6954, + "step": 7640 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015362075642184067, + "loss": 0.6967, + "step": 7660 + }, + { + "epoch": 0.98, + "learning_rate": 0.000153233509745708, + "loss": 0.6941, + "step": 7680 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001528462630695753, + "loss": 0.6928, + "step": 7700 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015245901639344262, + "loss": 0.7035, + "step": 7720 + }, + { + "epoch": 
0.99, + "learning_rate": 0.00015207176971730992, + "loss": 0.6918, + "step": 7740 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015168452304117723, + "loss": 0.6996, + "step": 7760 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001512972763650445, + "loss": 0.6975, + "step": 7780 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015091002968891181, + "loss": 0.7022, + "step": 7800 + }, + { + "epoch": 1.0, + "eval_loss": 0.6998333930969238, + "eval_runtime": 178.2222, + "eval_samples_per_second": 11.222, + "eval_steps_per_second": 1.403, + "step": 7800 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015052278301277912, + "loss": 0.6854, + "step": 7820 + }, + { + "epoch": 1.01, + "learning_rate": 0.00015013553633664643, + "loss": 0.6911, + "step": 7840 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014974828966051373, + "loss": 0.6846, + "step": 7860 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014936104298438104, + "loss": 0.6859, + "step": 7880 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014897379630824834, + "loss": 0.6802, + "step": 7900 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014858654963211565, + "loss": 0.6891, + "step": 7920 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014819930295598295, + "loss": 0.6833, + "step": 7940 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014781205627985026, + "loss": 0.6866, + "step": 7960 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014742480960371754, + "loss": 0.6863, + "step": 7980 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014703756292758484, + "loss": 0.6898, + "step": 8000 + }, + { + "epoch": 1.03, + "eval_loss": 0.699661135673523, + "eval_runtime": 177.9496, + "eval_samples_per_second": 11.239, + "eval_steps_per_second": 1.405, + "step": 8000 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014665031625145218, + "loss": 0.6881, + "step": 8020 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014626306957531946, + "loss": 0.6894, + "step": 8040 + }, + { + "epoch": 1.03, + "learning_rate": 
0.00014587582289918676, + "loss": 0.685, + "step": 8060 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014548857622305407, + "loss": 0.6837, + "step": 8080 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014510132954692137, + "loss": 0.6944, + "step": 8100 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014471408287078868, + "loss": 0.6883, + "step": 8120 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014432683619465598, + "loss": 0.6874, + "step": 8140 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001439395895185233, + "loss": 0.6867, + "step": 8160 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001435523428423906, + "loss": 0.6875, + "step": 8180 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001431650961662579, + "loss": 0.7005, + "step": 8200 + }, + { + "epoch": 1.05, + "eval_loss": 0.699004590511322, + "eval_runtime": 177.1183, + "eval_samples_per_second": 11.292, + "eval_steps_per_second": 1.411, + "step": 8200 + }, + { + "epoch": 1.05, + "learning_rate": 0.00014277784949012518, + "loss": 0.6968, + "step": 8220 + }, + { + "epoch": 1.06, + "learning_rate": 0.00014239060281399249, + "loss": 0.6884, + "step": 8240 + }, + { + "epoch": 1.06, + "learning_rate": 0.00014200335613785982, + "loss": 0.6808, + "step": 8260 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001416161094617271, + "loss": 0.6851, + "step": 8280 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001412288627855944, + "loss": 0.6917, + "step": 8300 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001408416161094617, + "loss": 0.6944, + "step": 8320 + }, + { + "epoch": 1.07, + "learning_rate": 0.00014045436943332901, + "loss": 0.6851, + "step": 8340 + }, + { + "epoch": 1.07, + "learning_rate": 0.00014006712275719632, + "loss": 0.6829, + "step": 8360 + }, + { + "epoch": 1.07, + "learning_rate": 0.00013967987608106363, + "loss": 0.6872, + "step": 8380 + }, + { + "epoch": 1.08, + "learning_rate": 0.00013929262940493093, + "loss": 0.6909, + "step": 8400 + }, + { + "epoch": 1.08, + "eval_loss": 
0.6990391612052917, + "eval_runtime": 177.2945, + "eval_samples_per_second": 11.281, + "eval_steps_per_second": 1.41, + "step": 8400 + }, + { + "epoch": 1.08, + "learning_rate": 0.00013890538272879824, + "loss": 0.6924, + "step": 8420 + }, + { + "epoch": 1.08, + "learning_rate": 0.00013851813605266554, + "loss": 0.6747, + "step": 8440 + }, + { + "epoch": 1.09, + "learning_rate": 0.00013813088937653282, + "loss": 0.6932, + "step": 8460 + }, + { + "epoch": 1.09, + "learning_rate": 0.00013774364270040016, + "loss": 0.6892, + "step": 8480 + }, + { + "epoch": 1.09, + "learning_rate": 0.00013735639602426746, + "loss": 0.6868, + "step": 8500 + }, + { + "epoch": 1.09, + "learning_rate": 0.00013696914934813474, + "loss": 0.6898, + "step": 8520 + }, + { + "epoch": 1.1, + "learning_rate": 0.00013658190267200205, + "loss": 0.6896, + "step": 8540 + }, + { + "epoch": 1.1, + "learning_rate": 0.00013619465599586935, + "loss": 0.6848, + "step": 8560 + }, + { + "epoch": 1.1, + "learning_rate": 0.00013580740931973666, + "loss": 0.6801, + "step": 8580 + }, + { + "epoch": 1.1, + "learning_rate": 0.00013542016264360396, + "loss": 0.6927, + "step": 8600 + }, + { + "epoch": 1.1, + "eval_loss": 0.6984953880310059, + "eval_runtime": 177.3114, + "eval_samples_per_second": 11.28, + "eval_steps_per_second": 1.41, + "step": 8600 + }, + { + "epoch": 1.11, + "learning_rate": 0.00013503291596747127, + "loss": 0.688, + "step": 8620 + }, + { + "epoch": 1.11, + "learning_rate": 0.00013464566929133857, + "loss": 0.693, + "step": 8640 + }, + { + "epoch": 1.11, + "learning_rate": 0.00013425842261520588, + "loss": 0.6815, + "step": 8660 + }, + { + "epoch": 1.11, + "learning_rate": 0.00013387117593907319, + "loss": 0.6828, + "step": 8680 + }, + { + "epoch": 1.12, + "learning_rate": 0.00013348392926294046, + "loss": 0.6834, + "step": 8700 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001330966825868078, + "loss": 0.6839, + "step": 8720 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001327094359106751, + 
"loss": 0.6838, + "step": 8740 + }, + { + "epoch": 1.12, + "learning_rate": 0.00013232218923454238, + "loss": 0.6847, + "step": 8760 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001319349425584097, + "loss": 0.6894, + "step": 8780 + }, + { + "epoch": 1.13, + "learning_rate": 0.000131547695882277, + "loss": 0.6922, + "step": 8800 + }, + { + "epoch": 1.13, + "eval_loss": 0.698199450969696, + "eval_runtime": 177.0428, + "eval_samples_per_second": 11.297, + "eval_steps_per_second": 1.412, + "step": 8800 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001311604492061443, + "loss": 0.6904, + "step": 8820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001307732025300116, + "loss": 0.6854, + "step": 8840 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001303859558538789, + "loss": 0.6877, + "step": 8860 + }, + { + "epoch": 1.14, + "learning_rate": 0.00012999870917774622, + "loss": 0.6857, + "step": 8880 + }, + { + "epoch": 1.14, + "learning_rate": 0.00012961146250161352, + "loss": 0.6856, + "step": 8900 + }, + { + "epoch": 1.14, + "learning_rate": 0.00012922421582548083, + "loss": 0.687, + "step": 8920 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001288369691493481, + "loss": 0.6905, + "step": 8940 + }, + { + "epoch": 1.15, + "learning_rate": 0.00012844972247321544, + "loss": 0.6865, + "step": 8960 + }, + { + "epoch": 1.15, + "learning_rate": 0.00012806247579708274, + "loss": 0.6829, + "step": 8980 + }, + { + "epoch": 1.15, + "learning_rate": 0.00012767522912095002, + "loss": 0.696, + "step": 9000 + }, + { + "epoch": 1.15, + "eval_loss": 0.6971157789230347, + "eval_runtime": 177.1296, + "eval_samples_per_second": 11.291, + "eval_steps_per_second": 1.411, + "step": 9000 + }, + { + "epoch": 1.16, + "learning_rate": 0.00012728798244481733, + "loss": 0.6825, + "step": 9020 + }, + { + "epoch": 1.16, + "learning_rate": 0.00012690073576868463, + "loss": 0.6844, + "step": 9040 + }, + { + "epoch": 1.16, + "learning_rate": 0.00012651348909255194, + "loss": 0.6853, + "step": 
9060 + }, + { + "epoch": 1.16, + "learning_rate": 0.00012612624241641925, + "loss": 0.6889, + "step": 9080 + }, + { + "epoch": 1.17, + "learning_rate": 0.00012573899574028655, + "loss": 0.6848, + "step": 9100 + }, + { + "epoch": 1.17, + "learning_rate": 0.00012535174906415386, + "loss": 0.6953, + "step": 9120 + }, + { + "epoch": 1.17, + "learning_rate": 0.00012496450238802116, + "loss": 0.6944, + "step": 9140 + }, + { + "epoch": 1.17, + "learning_rate": 0.00012457725571188847, + "loss": 0.6893, + "step": 9160 + }, + { + "epoch": 1.18, + "learning_rate": 0.00012419000903575575, + "loss": 0.6831, + "step": 9180 + }, + { + "epoch": 1.18, + "learning_rate": 0.00012380276235962308, + "loss": 0.683, + "step": 9200 + }, + { + "epoch": 1.18, + "eval_loss": 0.6971254944801331, + "eval_runtime": 177.2118, + "eval_samples_per_second": 11.286, + "eval_steps_per_second": 1.411, + "step": 9200 + }, + { + "epoch": 1.18, + "learning_rate": 0.00012341551568349039, + "loss": 0.6782, + "step": 9220 + }, + { + "epoch": 1.19, + "learning_rate": 0.00012302826900735766, + "loss": 0.6962, + "step": 9240 + }, + { + "epoch": 1.19, + "learning_rate": 0.00012264102233122497, + "loss": 0.6808, + "step": 9260 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001222537756550923, + "loss": 0.6931, + "step": 9280 + }, + { + "epoch": 1.19, + "learning_rate": 0.00012186652897895958, + "loss": 0.6878, + "step": 9300 + }, + { + "epoch": 1.2, + "learning_rate": 0.00012147928230282689, + "loss": 0.6855, + "step": 9320 + }, + { + "epoch": 1.2, + "learning_rate": 0.00012109203562669421, + "loss": 0.6865, + "step": 9340 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001207047889505615, + "loss": 0.6945, + "step": 9360 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001203175422744288, + "loss": 0.6862, + "step": 9380 + }, + { + "epoch": 1.21, + "learning_rate": 0.00011993029559829611, + "loss": 0.6932, + "step": 9400 + }, + { + "epoch": 1.21, + "eval_loss": 0.6963634490966797, + "eval_runtime": 177.2999, + 
"eval_samples_per_second": 11.28, + "eval_steps_per_second": 1.41, + "step": 9400 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001195430489221634, + "loss": 0.6793, + "step": 9420 + }, + { + "epoch": 1.21, + "learning_rate": 0.00011915580224603071, + "loss": 0.6827, + "step": 9440 + }, + { + "epoch": 1.21, + "learning_rate": 0.00011876855556989803, + "loss": 0.6836, + "step": 9460 + }, + { + "epoch": 1.22, + "learning_rate": 0.00011838130889376532, + "loss": 0.6876, + "step": 9480 + }, + { + "epoch": 1.22, + "learning_rate": 0.00011799406221763263, + "loss": 0.6949, + "step": 9500 + }, + { + "epoch": 1.22, + "learning_rate": 0.00011760681554149993, + "loss": 0.6809, + "step": 9520 + }, + { + "epoch": 1.22, + "learning_rate": 0.00011721956886536722, + "loss": 0.6923, + "step": 9540 + }, + { + "epoch": 1.23, + "learning_rate": 0.00011683232218923453, + "loss": 0.6975, + "step": 9560 + }, + { + "epoch": 1.23, + "learning_rate": 0.00011644507551310185, + "loss": 0.6981, + "step": 9580 + }, + { + "epoch": 1.23, + "learning_rate": 0.00011605782883696914, + "loss": 0.6843, + "step": 9600 + }, + { + "epoch": 1.23, + "eval_loss": 0.6958213448524475, + "eval_runtime": 177.1541, + "eval_samples_per_second": 11.29, + "eval_steps_per_second": 1.411, + "step": 9600 + }, + { + "epoch": 1.23, + "learning_rate": 0.00011567058216083645, + "loss": 0.6904, + "step": 9620 + }, + { + "epoch": 1.24, + "learning_rate": 0.00011528333548470375, + "loss": 0.6761, + "step": 9640 + }, + { + "epoch": 1.24, + "learning_rate": 0.00011489608880857104, + "loss": 0.6933, + "step": 9660 + }, + { + "epoch": 1.24, + "learning_rate": 0.00011450884213243835, + "loss": 0.6913, + "step": 9680 + }, + { + "epoch": 1.24, + "learning_rate": 0.00011412159545630567, + "loss": 0.6958, + "step": 9700 + }, + { + "epoch": 1.25, + "learning_rate": 0.00011373434878017296, + "loss": 0.6902, + "step": 9720 + }, + { + "epoch": 1.25, + "learning_rate": 0.00011334710210404027, + "loss": 0.6796, + "step": 9740 + }, + { + 
"epoch": 1.25, + "learning_rate": 0.00011295985542790757, + "loss": 0.6906, + "step": 9760 + }, + { + "epoch": 1.25, + "learning_rate": 0.00011257260875177487, + "loss": 0.6882, + "step": 9780 + }, + { + "epoch": 1.26, + "learning_rate": 0.00011218536207564217, + "loss": 0.6856, + "step": 9800 + }, + { + "epoch": 1.26, + "eval_loss": 0.6958709359169006, + "eval_runtime": 177.1197, + "eval_samples_per_second": 11.292, + "eval_steps_per_second": 1.411, + "step": 9800 + }, + { + "epoch": 1.26, + "learning_rate": 0.00011179811539950949, + "loss": 0.681, + "step": 9820 + }, + { + "epoch": 1.26, + "learning_rate": 0.00011141086872337678, + "loss": 0.6824, + "step": 9840 + }, + { + "epoch": 1.26, + "learning_rate": 0.00011102362204724409, + "loss": 0.6921, + "step": 9860 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001106363753711114, + "loss": 0.6862, + "step": 9880 + }, + { + "epoch": 1.27, + "learning_rate": 0.00011024912869497869, + "loss": 0.6869, + "step": 9900 + }, + { + "epoch": 1.27, + "learning_rate": 0.00010986188201884599, + "loss": 0.6867, + "step": 9920 + }, + { + "epoch": 1.27, + "learning_rate": 0.00010947463534271328, + "loss": 0.6885, + "step": 9940 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001090873886665806, + "loss": 0.6801, + "step": 9960 + }, + { + "epoch": 1.28, + "learning_rate": 0.00010870014199044791, + "loss": 0.684, + "step": 9980 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001083128953143152, + "loss": 0.6885, + "step": 10000 + }, + { + "epoch": 1.28, + "eval_loss": 0.6948391795158386, + "eval_runtime": 184.1668, + "eval_samples_per_second": 10.86, + "eval_steps_per_second": 1.357, + "step": 10000 + }, + { + "epoch": 1.29, + "learning_rate": 0.00010792564863818251, + "loss": 0.685, + "step": 10020 + }, + { + "epoch": 1.29, + "learning_rate": 0.00010753840196204981, + "loss": 0.6849, + "step": 10040 + }, + { + "epoch": 1.29, + "learning_rate": 0.00010715115528591712, + "loss": 0.6901, + "step": 10060 + }, + { + "epoch": 1.29, + 
"learning_rate": 0.00010678327094359106, + "loss": 0.6926, + "step": 10080 + }, + { + "epoch": 1.3, + "learning_rate": 0.00010639602426745837, + "loss": 0.6863, + "step": 10100 + }, + { + "epoch": 1.3, + "learning_rate": 0.00010600877759132567, + "loss": 0.6846, + "step": 10120 + }, + { + "epoch": 1.3, + "learning_rate": 0.00010562153091519296, + "loss": 0.6859, + "step": 10140 + }, + { + "epoch": 1.3, + "learning_rate": 0.00010523428423906027, + "loss": 0.6887, + "step": 10160 + }, + { + "epoch": 1.31, + "learning_rate": 0.00010484703756292759, + "loss": 0.689, + "step": 10180 + }, + { + "epoch": 1.31, + "learning_rate": 0.00010445979088679488, + "loss": 0.6795, + "step": 10200 + }, + { + "epoch": 1.31, + "eval_loss": 0.6949850916862488, + "eval_runtime": 177.1947, + "eval_samples_per_second": 11.287, + "eval_steps_per_second": 1.411, + "step": 10200 + }, + { + "epoch": 1.31, + "learning_rate": 0.00010407254421066219, + "loss": 0.6874, + "step": 10220 + }, + { + "epoch": 1.31, + "learning_rate": 0.00010368529753452949, + "loss": 0.6875, + "step": 10240 + }, + { + "epoch": 1.32, + "learning_rate": 0.00010329805085839678, + "loss": 0.6842, + "step": 10260 + }, + { + "epoch": 1.32, + "learning_rate": 0.00010291080418226409, + "loss": 0.6842, + "step": 10280 + }, + { + "epoch": 1.32, + "learning_rate": 0.00010252355750613141, + "loss": 0.6834, + "step": 10300 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001021363108299987, + "loss": 0.6894, + "step": 10320 + }, + { + "epoch": 1.33, + "learning_rate": 0.00010174906415386601, + "loss": 0.6814, + "step": 10340 + }, + { + "epoch": 1.33, + "learning_rate": 0.00010136181747773331, + "loss": 0.6731, + "step": 10360 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001009745708016006, + "loss": 0.6955, + "step": 10380 + }, + { + "epoch": 1.33, + "learning_rate": 0.00010058732412546791, + "loss": 0.6884, + "step": 10400 + }, + { + "epoch": 1.33, + "eval_loss": 0.6947998404502869, + "eval_runtime": 183.6797, + 
"eval_samples_per_second": 10.889, + "eval_steps_per_second": 1.361, + "step": 10400 + }, + { + "epoch": 1.34, + "learning_rate": 0.00010020007744933523, + "loss": 0.6844, + "step": 10420 + }, + { + "epoch": 1.34, + "learning_rate": 9.981283077320252e-05, + "loss": 0.6795, + "step": 10440 + }, + { + "epoch": 1.34, + "learning_rate": 9.942558409706983e-05, + "loss": 0.6783, + "step": 10460 + }, + { + "epoch": 1.34, + "learning_rate": 9.903833742093713e-05, + "loss": 0.6951, + "step": 10480 + }, + { + "epoch": 1.35, + "learning_rate": 9.865109074480443e-05, + "loss": 0.6891, + "step": 10500 + }, + { + "epoch": 1.35, + "learning_rate": 9.826384406867173e-05, + "loss": 0.6899, + "step": 10520 + }, + { + "epoch": 1.35, + "learning_rate": 9.787659739253905e-05, + "loss": 0.6905, + "step": 10540 + }, + { + "epoch": 1.35, + "learning_rate": 9.748935071640634e-05, + "loss": 0.6786, + "step": 10560 + }, + { + "epoch": 1.36, + "learning_rate": 9.710210404027365e-05, + "loss": 0.6884, + "step": 10580 + }, + { + "epoch": 1.36, + "learning_rate": 9.671485736414096e-05, + "loss": 0.6757, + "step": 10600 + }, + { + "epoch": 1.36, + "eval_loss": 0.6941512227058411, + "eval_runtime": 179.4312, + "eval_samples_per_second": 11.146, + "eval_steps_per_second": 1.393, + "step": 10600 + }, + { + "epoch": 1.36, + "learning_rate": 9.632761068800825e-05, + "loss": 0.6844, + "step": 10620 + }, + { + "epoch": 1.36, + "learning_rate": 9.594036401187555e-05, + "loss": 0.6829, + "step": 10640 + }, + { + "epoch": 1.37, + "learning_rate": 9.555311733574287e-05, + "loss": 0.7007, + "step": 10660 + }, + { + "epoch": 1.37, + "learning_rate": 9.516587065961016e-05, + "loss": 0.6989, + "step": 10680 + }, + { + "epoch": 1.37, + "learning_rate": 9.477862398347747e-05, + "loss": 0.6783, + "step": 10700 + }, + { + "epoch": 1.37, + "learning_rate": 9.439137730734476e-05, + "loss": 0.6783, + "step": 10720 + }, + { + "epoch": 1.38, + "learning_rate": 9.400413063121207e-05, + "loss": 0.6879, + "step": 10740 + 
}, + { + "epoch": 1.38, + "learning_rate": 9.361688395507937e-05, + "loss": 0.6801, + "step": 10760 + }, + { + "epoch": 1.38, + "learning_rate": 9.322963727894667e-05, + "loss": 0.6786, + "step": 10780 + }, + { + "epoch": 1.39, + "learning_rate": 9.284239060281399e-05, + "loss": 0.6882, + "step": 10800 + }, + { + "epoch": 1.39, + "eval_loss": 0.6937060356140137, + "eval_runtime": 178.6714, + "eval_samples_per_second": 11.194, + "eval_steps_per_second": 1.399, + "step": 10800 + }, + { + "epoch": 1.39, + "learning_rate": 9.245514392668129e-05, + "loss": 0.6796, + "step": 10820 + }, + { + "epoch": 1.39, + "learning_rate": 9.206789725054858e-05, + "loss": 0.6886, + "step": 10840 + }, + { + "epoch": 1.39, + "learning_rate": 9.168065057441589e-05, + "loss": 0.683, + "step": 10860 + }, + { + "epoch": 1.4, + "learning_rate": 9.129340389828321e-05, + "loss": 0.6786, + "step": 10880 + }, + { + "epoch": 1.4, + "learning_rate": 9.09061572221505e-05, + "loss": 0.6843, + "step": 10900 + }, + { + "epoch": 1.4, + "learning_rate": 9.05189105460178e-05, + "loss": 0.6795, + "step": 10920 + }, + { + "epoch": 1.4, + "learning_rate": 9.013166386988511e-05, + "loss": 0.6813, + "step": 10940 + }, + { + "epoch": 1.41, + "learning_rate": 8.97444171937524e-05, + "loss": 0.687, + "step": 10960 + }, + { + "epoch": 1.41, + "learning_rate": 8.935717051761971e-05, + "loss": 0.6872, + "step": 10980 + }, + { + "epoch": 1.41, + "learning_rate": 8.896992384148703e-05, + "loss": 0.6746, + "step": 11000 + }, + { + "epoch": 1.41, + "eval_loss": 0.6936533451080322, + "eval_runtime": 178.4177, + "eval_samples_per_second": 11.21, + "eval_steps_per_second": 1.401, + "step": 11000 + }, + { + "epoch": 1.41, + "learning_rate": 8.858267716535432e-05, + "loss": 0.6747, + "step": 11020 + }, + { + "epoch": 1.42, + "learning_rate": 8.819543048922163e-05, + "loss": 0.6975, + "step": 11040 + }, + { + "epoch": 1.42, + "learning_rate": 8.780818381308893e-05, + "loss": 0.6806, + "step": 11060 + }, + { + "epoch": 1.42, + 
"learning_rate": 8.742093713695623e-05, + "loss": 0.6789, + "step": 11080 + }, + { + "epoch": 1.42, + "learning_rate": 8.703369046082353e-05, + "loss": 0.6851, + "step": 11100 + }, + { + "epoch": 1.43, + "learning_rate": 8.664644378469085e-05, + "loss": 0.6836, + "step": 11120 + }, + { + "epoch": 1.43, + "learning_rate": 8.625919710855814e-05, + "loss": 0.6891, + "step": 11140 + }, + { + "epoch": 1.43, + "learning_rate": 8.587195043242545e-05, + "loss": 0.6898, + "step": 11160 + }, + { + "epoch": 1.43, + "learning_rate": 8.548470375629275e-05, + "loss": 0.6798, + "step": 11180 + }, + { + "epoch": 1.44, + "learning_rate": 8.509745708016005e-05, + "loss": 0.6822, + "step": 11200 + }, + { + "epoch": 1.44, + "eval_loss": 0.6932746767997742, + "eval_runtime": 178.4279, + "eval_samples_per_second": 11.209, + "eval_steps_per_second": 1.401, + "step": 11200 + }, + { + "epoch": 1.44, + "learning_rate": 8.471021040402735e-05, + "loss": 0.6776, + "step": 11220 + }, + { + "epoch": 1.44, + "learning_rate": 8.432296372789467e-05, + "loss": 0.6835, + "step": 11240 + }, + { + "epoch": 1.44, + "learning_rate": 8.393571705176196e-05, + "loss": 0.6852, + "step": 11260 + }, + { + "epoch": 1.45, + "learning_rate": 8.354847037562927e-05, + "loss": 0.686, + "step": 11280 + }, + { + "epoch": 1.45, + "learning_rate": 8.316122369949657e-05, + "loss": 0.6884, + "step": 11300 + }, + { + "epoch": 1.45, + "learning_rate": 8.277397702336387e-05, + "loss": 0.6811, + "step": 11320 + }, + { + "epoch": 1.45, + "learning_rate": 8.238673034723117e-05, + "loss": 0.6751, + "step": 11340 + }, + { + "epoch": 1.46, + "learning_rate": 8.199948367109849e-05, + "loss": 0.6837, + "step": 11360 + }, + { + "epoch": 1.46, + "learning_rate": 8.161223699496578e-05, + "loss": 0.6839, + "step": 11380 + }, + { + "epoch": 1.46, + "learning_rate": 8.122499031883309e-05, + "loss": 0.6804, + "step": 11400 + }, + { + "epoch": 1.46, + "eval_loss": 0.6925450563430786, + "eval_runtime": 177.1737, + "eval_samples_per_second": 
11.288, + "eval_steps_per_second": 1.411, + "step": 11400 + }, + { + "epoch": 1.46, + "learning_rate": 8.08377436427004e-05, + "loss": 0.6885, + "step": 11420 + }, + { + "epoch": 1.47, + "learning_rate": 8.045049696656769e-05, + "loss": 0.6907, + "step": 11440 + }, + { + "epoch": 1.47, + "learning_rate": 8.0063250290435e-05, + "loss": 0.6868, + "step": 11460 + }, + { + "epoch": 1.47, + "learning_rate": 7.967600361430231e-05, + "loss": 0.6945, + "step": 11480 + }, + { + "epoch": 1.47, + "learning_rate": 7.92887569381696e-05, + "loss": 0.6851, + "step": 11500 + }, + { + "epoch": 1.48, + "learning_rate": 7.890151026203691e-05, + "loss": 0.6878, + "step": 11520 + }, + { + "epoch": 1.48, + "learning_rate": 7.851426358590422e-05, + "loss": 0.6955, + "step": 11540 + }, + { + "epoch": 1.48, + "learning_rate": 7.812701690977151e-05, + "loss": 0.6783, + "step": 11560 + }, + { + "epoch": 1.49, + "learning_rate": 7.773977023363881e-05, + "loss": 0.6857, + "step": 11580 + }, + { + "epoch": 1.49, + "learning_rate": 7.735252355750613e-05, + "loss": 0.6828, + "step": 11600 + }, + { + "epoch": 1.49, + "eval_loss": 0.6924574971199036, + "eval_runtime": 177.3841, + "eval_samples_per_second": 11.275, + "eval_steps_per_second": 1.409, + "step": 11600 + }, + { + "epoch": 1.49, + "learning_rate": 7.696527688137343e-05, + "loss": 0.6796, + "step": 11620 + }, + { + "epoch": 1.49, + "learning_rate": 7.657803020524073e-05, + "loss": 0.6823, + "step": 11640 + }, + { + "epoch": 1.5, + "learning_rate": 7.619078352910804e-05, + "loss": 0.6854, + "step": 11660 + }, + { + "epoch": 1.5, + "learning_rate": 7.580353685297533e-05, + "loss": 0.6797, + "step": 11680 + }, + { + "epoch": 1.5, + "learning_rate": 7.541629017684265e-05, + "loss": 0.6811, + "step": 11700 + }, + { + "epoch": 1.5, + "learning_rate": 7.502904350070995e-05, + "loss": 0.6775, + "step": 11720 + }, + { + "epoch": 1.51, + "learning_rate": 7.464179682457725e-05, + "loss": 0.687, + "step": 11740 + }, + { + "epoch": 1.51, + 
"learning_rate": 7.425455014844455e-05, + "loss": 0.6859, + "step": 11760 + }, + { + "epoch": 1.51, + "learning_rate": 7.386730347231186e-05, + "loss": 0.683, + "step": 11780 + }, + { + "epoch": 1.51, + "learning_rate": 7.348005679617916e-05, + "loss": 0.6812, + "step": 11800 + }, + { + "epoch": 1.51, + "eval_loss": 0.692126452922821, + "eval_runtime": 182.938, + "eval_samples_per_second": 10.933, + "eval_steps_per_second": 1.367, + "step": 11800 + }, + { + "epoch": 1.52, + "learning_rate": 7.309281012004647e-05, + "loss": 0.688, + "step": 11820 + }, + { + "epoch": 1.52, + "learning_rate": 7.270556344391376e-05, + "loss": 0.6774, + "step": 11840 + }, + { + "epoch": 1.52, + "learning_rate": 7.231831676778107e-05, + "loss": 0.6806, + "step": 11860 + }, + { + "epoch": 1.52, + "learning_rate": 7.193107009164837e-05, + "loss": 0.6756, + "step": 11880 + }, + { + "epoch": 1.53, + "learning_rate": 7.154382341551568e-05, + "loss": 0.6856, + "step": 11900 + }, + { + "epoch": 1.53, + "learning_rate": 7.115657673938298e-05, + "loss": 0.6822, + "step": 11920 + }, + { + "epoch": 1.53, + "learning_rate": 7.076933006325029e-05, + "loss": 0.6769, + "step": 11940 + }, + { + "epoch": 1.53, + "learning_rate": 7.038208338711758e-05, + "loss": 0.6759, + "step": 11960 + }, + { + "epoch": 1.54, + "learning_rate": 6.999483671098489e-05, + "loss": 0.6854, + "step": 11980 + }, + { + "epoch": 1.54, + "learning_rate": 6.96075900348522e-05, + "loss": 0.6855, + "step": 12000 + }, + { + "epoch": 1.54, + "eval_loss": 0.6914573907852173, + "eval_runtime": 177.6919, + "eval_samples_per_second": 11.255, + "eval_steps_per_second": 1.407, + "step": 12000 + }, + { + "epoch": 1.54, + "learning_rate": 6.92203433587195e-05, + "loss": 0.6816, + "step": 12020 + }, + { + "epoch": 1.54, + "learning_rate": 6.88330966825868e-05, + "loss": 0.6801, + "step": 12040 + }, + { + "epoch": 1.55, + "learning_rate": 6.844585000645411e-05, + "loss": 0.6818, + "step": 12060 + }, + { + "epoch": 1.55, + "learning_rate": 
6.80586033303214e-05, + "loss": 0.6808, + "step": 12080 + }, + { + "epoch": 1.55, + "learning_rate": 6.767135665418872e-05, + "loss": 0.6849, + "step": 12100 + }, + { + "epoch": 1.55, + "learning_rate": 6.728410997805601e-05, + "loss": 0.6902, + "step": 12120 + }, + { + "epoch": 1.56, + "learning_rate": 6.689686330192332e-05, + "loss": 0.6795, + "step": 12140 + }, + { + "epoch": 1.56, + "learning_rate": 6.652897895959726e-05, + "loss": 0.6783, + "step": 12160 + }, + { + "epoch": 1.56, + "learning_rate": 6.614173228346455e-05, + "loss": 0.6839, + "step": 12180 + }, + { + "epoch": 1.56, + "learning_rate": 6.575448560733186e-05, + "loss": 0.6875, + "step": 12200 + }, + { + "epoch": 1.56, + "eval_loss": 0.6915743947029114, + "eval_runtime": 177.7146, + "eval_samples_per_second": 11.254, + "eval_steps_per_second": 1.407, + "step": 12200 + }, + { + "epoch": 1.57, + "learning_rate": 6.536723893119917e-05, + "loss": 0.687, + "step": 12220 + }, + { + "epoch": 1.57, + "learning_rate": 6.497999225506647e-05, + "loss": 0.683, + "step": 12240 + }, + { + "epoch": 1.57, + "learning_rate": 6.459274557893378e-05, + "loss": 0.6796, + "step": 12260 + }, + { + "epoch": 1.57, + "learning_rate": 6.420549890280108e-05, + "loss": 0.6757, + "step": 12280 + }, + { + "epoch": 1.58, + "learning_rate": 6.381825222666838e-05, + "loss": 0.6901, + "step": 12300 + }, + { + "epoch": 1.58, + "learning_rate": 6.343100555053568e-05, + "loss": 0.6734, + "step": 12320 + }, + { + "epoch": 1.58, + "learning_rate": 6.304375887440299e-05, + "loss": 0.6863, + "step": 12340 + }, + { + "epoch": 1.59, + "learning_rate": 6.265651219827029e-05, + "loss": 0.6769, + "step": 12360 + }, + { + "epoch": 1.59, + "learning_rate": 6.22692655221376e-05, + "loss": 0.678, + "step": 12380 + }, + { + "epoch": 1.59, + "learning_rate": 6.18820188460049e-05, + "loss": 0.6934, + "step": 12400 + }, + { + "epoch": 1.59, + "eval_loss": 0.6910441517829895, + "eval_runtime": 177.091, + "eval_samples_per_second": 11.294, + 
"eval_steps_per_second": 1.412, + "step": 12400 + }, + { + "epoch": 1.59, + "learning_rate": 6.14947721698722e-05, + "loss": 0.6925, + "step": 12420 + }, + { + "epoch": 1.6, + "learning_rate": 6.11075254937395e-05, + "loss": 0.6797, + "step": 12440 + }, + { + "epoch": 1.6, + "learning_rate": 6.0720278817606814e-05, + "loss": 0.6839, + "step": 12460 + }, + { + "epoch": 1.6, + "learning_rate": 6.0333032141474113e-05, + "loss": 0.6825, + "step": 12480 + }, + { + "epoch": 1.6, + "learning_rate": 5.994578546534141e-05, + "loss": 0.6816, + "step": 12500 + }, + { + "epoch": 1.61, + "learning_rate": 5.9558538789208725e-05, + "loss": 0.6844, + "step": 12520 + }, + { + "epoch": 1.61, + "learning_rate": 5.9171292113076024e-05, + "loss": 0.6833, + "step": 12540 + }, + { + "epoch": 1.61, + "learning_rate": 5.878404543694332e-05, + "loss": 0.6827, + "step": 12560 + }, + { + "epoch": 1.61, + "learning_rate": 5.8396798760810635e-05, + "loss": 0.6802, + "step": 12580 + }, + { + "epoch": 1.62, + "learning_rate": 5.8009552084677934e-05, + "loss": 0.6837, + "step": 12600 + }, + { + "epoch": 1.62, + "eval_loss": 0.690994381904602, + "eval_runtime": 177.3691, + "eval_samples_per_second": 11.276, + "eval_steps_per_second": 1.409, + "step": 12600 + }, + { + "epoch": 1.62, + "learning_rate": 5.762230540854523e-05, + "loss": 0.6781, + "step": 12620 + }, + { + "epoch": 1.62, + "learning_rate": 5.7235058732412546e-05, + "loss": 0.6816, + "step": 12640 + }, + { + "epoch": 1.62, + "learning_rate": 5.6847812056279845e-05, + "loss": 0.6823, + "step": 12660 + }, + { + "epoch": 1.63, + "learning_rate": 5.6460565380147144e-05, + "loss": 0.6824, + "step": 12680 + }, + { + "epoch": 1.63, + "learning_rate": 5.6073318704014456e-05, + "loss": 0.6786, + "step": 12700 + }, + { + "epoch": 1.63, + "learning_rate": 5.5686072027881755e-05, + "loss": 0.6822, + "step": 12720 + }, + { + "epoch": 1.63, + "learning_rate": 5.5298825351749054e-05, + "loss": 0.6792, + "step": 12740 + }, + { + "epoch": 1.64, + 
"learning_rate": 5.491157867561637e-05, + "loss": 0.6797, + "step": 12760 + }, + { + "epoch": 1.64, + "learning_rate": 5.4524331999483666e-05, + "loss": 0.6814, + "step": 12780 + }, + { + "epoch": 1.64, + "learning_rate": 5.4137085323350965e-05, + "loss": 0.6827, + "step": 12800 + }, + { + "epoch": 1.64, + "eval_loss": 0.6906899809837341, + "eval_runtime": 177.4169, + "eval_samples_per_second": 11.273, + "eval_steps_per_second": 1.409, + "step": 12800 + }, + { + "epoch": 1.64, + "learning_rate": 5.374983864721828e-05, + "loss": 0.6776, + "step": 12820 + }, + { + "epoch": 1.65, + "learning_rate": 5.3362591971085576e-05, + "loss": 0.6877, + "step": 12840 + }, + { + "epoch": 1.65, + "learning_rate": 5.297534529495288e-05, + "loss": 0.6786, + "step": 12860 + }, + { + "epoch": 1.65, + "learning_rate": 5.258809861882019e-05, + "loss": 0.6853, + "step": 12880 + }, + { + "epoch": 1.65, + "learning_rate": 5.2200851942687486e-05, + "loss": 0.6843, + "step": 12900 + }, + { + "epoch": 1.66, + "learning_rate": 5.181360526655479e-05, + "loss": 0.6872, + "step": 12920 + }, + { + "epoch": 1.66, + "learning_rate": 5.14263585904221e-05, + "loss": 0.6862, + "step": 12940 + }, + { + "epoch": 1.66, + "learning_rate": 5.10391119142894e-05, + "loss": 0.6804, + "step": 12960 + }, + { + "epoch": 1.66, + "learning_rate": 5.06518652381567e-05, + "loss": 0.6786, + "step": 12980 + }, + { + "epoch": 1.67, + "learning_rate": 5.026461856202401e-05, + "loss": 0.6839, + "step": 13000 + }, + { + "epoch": 1.67, + "eval_loss": 0.6902768015861511, + "eval_runtime": 177.1355, + "eval_samples_per_second": 11.291, + "eval_steps_per_second": 1.411, + "step": 13000 + }, + { + "epoch": 1.67, + "learning_rate": 4.987737188589131e-05, + "loss": 0.682, + "step": 13020 + }, + { + "epoch": 1.67, + "learning_rate": 4.949012520975861e-05, + "loss": 0.6835, + "step": 13040 + }, + { + "epoch": 1.68, + "learning_rate": 4.910287853362592e-05, + "loss": 0.6822, + "step": 13060 + }, + { + "epoch": 1.68, + 
"learning_rate": 4.871563185749322e-05, + "loss": 0.6827, + "step": 13080 + }, + { + "epoch": 1.68, + "learning_rate": 4.8328385181360523e-05, + "loss": 0.6744, + "step": 13100 + }, + { + "epoch": 1.68, + "learning_rate": 4.794113850522782e-05, + "loss": 0.6817, + "step": 13120 + }, + { + "epoch": 1.69, + "learning_rate": 4.755389182909513e-05, + "loss": 0.6765, + "step": 13140 + }, + { + "epoch": 1.69, + "learning_rate": 4.7166645152962434e-05, + "loss": 0.682, + "step": 13160 + }, + { + "epoch": 1.69, + "learning_rate": 4.677939847682973e-05, + "loss": 0.6849, + "step": 13180 + }, + { + "epoch": 1.69, + "learning_rate": 4.639215180069704e-05, + "loss": 0.686, + "step": 13200 + }, + { + "epoch": 1.69, + "eval_loss": 0.6899891495704651, + "eval_runtime": 183.0495, + "eval_samples_per_second": 10.926, + "eval_steps_per_second": 1.366, + "step": 13200 + }, + { + "epoch": 1.7, + "learning_rate": 4.6004905124564344e-05, + "loss": 0.6787, + "step": 13220 + }, + { + "epoch": 1.7, + "learning_rate": 4.561765844843164e-05, + "loss": 0.6786, + "step": 13240 + }, + { + "epoch": 1.7, + "learning_rate": 4.523041177229895e-05, + "loss": 0.6913, + "step": 13260 + }, + { + "epoch": 1.7, + "learning_rate": 4.4843165096166255e-05, + "loss": 0.6721, + "step": 13280 + }, + { + "epoch": 1.71, + "learning_rate": 4.4455918420033554e-05, + "loss": 0.6783, + "step": 13300 + }, + { + "epoch": 1.71, + "learning_rate": 4.4068671743900866e-05, + "loss": 0.687, + "step": 13320 + }, + { + "epoch": 1.71, + "learning_rate": 4.3681425067768165e-05, + "loss": 0.6815, + "step": 13340 + }, + { + "epoch": 1.71, + "learning_rate": 4.3294178391635464e-05, + "loss": 0.6786, + "step": 13360 + }, + { + "epoch": 1.72, + "learning_rate": 4.290693171550278e-05, + "loss": 0.6845, + "step": 13380 + }, + { + "epoch": 1.72, + "learning_rate": 4.2519685039370076e-05, + "loss": 0.6817, + "step": 13400 + }, + { + "epoch": 1.72, + "eval_loss": 0.6897545456886292, + "eval_runtime": 177.4673, + 
"eval_samples_per_second": 11.27, + "eval_steps_per_second": 1.409, + "step": 13400 + }, + { + "epoch": 1.72, + "learning_rate": 4.2132438363237375e-05, + "loss": 0.6783, + "step": 13420 + }, + { + "epoch": 1.72, + "learning_rate": 4.174519168710469e-05, + "loss": 0.6775, + "step": 13440 + }, + { + "epoch": 1.73, + "learning_rate": 4.1357945010971986e-05, + "loss": 0.6752, + "step": 13460 + }, + { + "epoch": 1.73, + "learning_rate": 4.0970698334839285e-05, + "loss": 0.6732, + "step": 13480 + }, + { + "epoch": 1.73, + "learning_rate": 4.05834516587066e-05, + "loss": 0.6854, + "step": 13500 + }, + { + "epoch": 1.73, + "learning_rate": 4.0196204982573896e-05, + "loss": 0.6814, + "step": 13520 + }, + { + "epoch": 1.74, + "learning_rate": 3.9808958306441195e-05, + "loss": 0.6832, + "step": 13540 + }, + { + "epoch": 1.74, + "learning_rate": 3.942171163030851e-05, + "loss": 0.6764, + "step": 13560 + }, + { + "epoch": 1.74, + "learning_rate": 3.903446495417581e-05, + "loss": 0.6797, + "step": 13580 + }, + { + "epoch": 1.74, + "learning_rate": 3.8647218278043106e-05, + "loss": 0.6786, + "step": 13600 + }, + { + "epoch": 1.74, + "eval_loss": 0.6894007325172424, + "eval_runtime": 177.6931, + "eval_samples_per_second": 11.255, + "eval_steps_per_second": 1.407, + "step": 13600 + }, + { + "epoch": 1.75, + "learning_rate": 3.825997160191042e-05, + "loss": 0.6878, + "step": 13620 + }, + { + "epoch": 1.75, + "learning_rate": 3.787272492577772e-05, + "loss": 0.6847, + "step": 13640 + }, + { + "epoch": 1.75, + "learning_rate": 3.748547824964502e-05, + "loss": 0.6773, + "step": 13660 + }, + { + "epoch": 1.75, + "learning_rate": 3.709823157351232e-05, + "loss": 0.6723, + "step": 13680 + }, + { + "epoch": 1.76, + "learning_rate": 3.671098489737963e-05, + "loss": 0.6849, + "step": 13700 + }, + { + "epoch": 1.76, + "learning_rate": 3.6323738221246933e-05, + "loss": 0.6788, + "step": 13720 + }, + { + "epoch": 1.76, + "learning_rate": 3.593649154511423e-05, + "loss": 0.6851, + "step": 13740 
+ }, + { + "epoch": 1.76, + "learning_rate": 3.554924486898154e-05, + "loss": 0.6842, + "step": 13760 + }, + { + "epoch": 1.77, + "learning_rate": 3.5161998192848844e-05, + "loss": 0.6763, + "step": 13780 + }, + { + "epoch": 1.77, + "learning_rate": 3.477475151671614e-05, + "loss": 0.6795, + "step": 13800 + }, + { + "epoch": 1.77, + "eval_loss": 0.6892591714859009, + "eval_runtime": 178.329, + "eval_samples_per_second": 11.215, + "eval_steps_per_second": 1.402, + "step": 13800 + }, + { + "epoch": 1.77, + "learning_rate": 3.438750484058345e-05, + "loss": 0.6804, + "step": 13820 + }, + { + "epoch": 1.78, + "learning_rate": 3.4000258164450754e-05, + "loss": 0.6873, + "step": 13840 + }, + { + "epoch": 1.78, + "learning_rate": 3.361301148831805e-05, + "loss": 0.6783, + "step": 13860 + }, + { + "epoch": 1.78, + "learning_rate": 3.322576481218536e-05, + "loss": 0.6843, + "step": 13880 + }, + { + "epoch": 1.78, + "learning_rate": 3.2838518136052665e-05, + "loss": 0.6755, + "step": 13900 + }, + { + "epoch": 1.79, + "learning_rate": 3.2451271459919964e-05, + "loss": 0.684, + "step": 13920 + }, + { + "epoch": 1.79, + "learning_rate": 3.206402478378727e-05, + "loss": 0.6828, + "step": 13940 + }, + { + "epoch": 1.79, + "learning_rate": 3.167677810765457e-05, + "loss": 0.6771, + "step": 13960 + }, + { + "epoch": 1.79, + "learning_rate": 3.1289531431521874e-05, + "loss": 0.6818, + "step": 13980 + }, + { + "epoch": 1.8, + "learning_rate": 3.090228475538918e-05, + "loss": 0.6751, + "step": 14000 + }, + { + "epoch": 1.8, + "eval_loss": 0.6889638304710388, + "eval_runtime": 178.8668, + "eval_samples_per_second": 11.182, + "eval_steps_per_second": 1.398, + "step": 14000 + }, + { + "epoch": 1.8, + "learning_rate": 3.0515038079256482e-05, + "loss": 0.6794, + "step": 14020 + }, + { + "epoch": 1.8, + "learning_rate": 3.0127791403123788e-05, + "loss": 0.6652, + "step": 14040 + }, + { + "epoch": 1.8, + "learning_rate": 2.974054472699109e-05, + "loss": 0.6746, + "step": 14060 + }, + { + 
"epoch": 1.81, + "learning_rate": 2.9353298050858393e-05, + "loss": 0.6847, + "step": 14080 + }, + { + "epoch": 1.81, + "learning_rate": 2.89660513747257e-05, + "loss": 0.6733, + "step": 14100 + }, + { + "epoch": 1.81, + "learning_rate": 2.8578804698593004e-05, + "loss": 0.6794, + "step": 14120 + }, + { + "epoch": 1.81, + "learning_rate": 2.8191558022460303e-05, + "loss": 0.6748, + "step": 14140 + }, + { + "epoch": 1.82, + "learning_rate": 2.780431134632761e-05, + "loss": 0.6668, + "step": 14160 + }, + { + "epoch": 1.82, + "learning_rate": 2.7417064670194915e-05, + "loss": 0.6845, + "step": 14180 + }, + { + "epoch": 1.82, + "learning_rate": 2.7029817994062214e-05, + "loss": 0.6819, + "step": 14200 + }, + { + "epoch": 1.82, + "eval_loss": 0.68879234790802, + "eval_runtime": 178.3854, + "eval_samples_per_second": 11.212, + "eval_steps_per_second": 1.401, + "step": 14200 + }, + { + "epoch": 1.82, + "learning_rate": 2.6661933651736152e-05, + "loss": 0.6857, + "step": 14220 + }, + { + "epoch": 1.83, + "learning_rate": 2.6274686975603458e-05, + "loss": 0.6837, + "step": 14240 + }, + { + "epoch": 1.83, + "learning_rate": 2.588744029947076e-05, + "loss": 0.679, + "step": 14260 + }, + { + "epoch": 1.83, + "learning_rate": 2.5500193623338062e-05, + "loss": 0.6809, + "step": 14280 + }, + { + "epoch": 1.83, + "learning_rate": 2.5112946947205368e-05, + "loss": 0.683, + "step": 14300 + }, + { + "epoch": 1.84, + "learning_rate": 2.4725700271072674e-05, + "loss": 0.6787, + "step": 14320 + }, + { + "epoch": 1.84, + "learning_rate": 2.4338453594939973e-05, + "loss": 0.6842, + "step": 14340 + }, + { + "epoch": 1.84, + "learning_rate": 2.395120691880728e-05, + "loss": 0.682, + "step": 14360 + }, + { + "epoch": 1.84, + "learning_rate": 2.3563960242674584e-05, + "loss": 0.6751, + "step": 14380 + }, + { + "epoch": 1.85, + "learning_rate": 2.3176713566541883e-05, + "loss": 0.682, + "step": 14400 + }, + { + "epoch": 1.85, + "eval_loss": 0.6884602308273315, + "eval_runtime": 180.582, + 
"eval_samples_per_second": 11.075, + "eval_steps_per_second": 1.384, + "step": 14400 + }, + { + "epoch": 1.85, + "learning_rate": 2.278946689040919e-05, + "loss": 0.6728, + "step": 14420 + }, + { + "epoch": 1.85, + "learning_rate": 2.2402220214276495e-05, + "loss": 0.6839, + "step": 14440 + }, + { + "epoch": 1.85, + "learning_rate": 2.2014973538143794e-05, + "loss": 0.6828, + "step": 14460 + }, + { + "epoch": 1.86, + "learning_rate": 2.16277268620111e-05, + "loss": 0.6752, + "step": 14480 + }, + { + "epoch": 1.86, + "learning_rate": 2.1240480185878405e-05, + "loss": 0.682, + "step": 14500 + }, + { + "epoch": 1.86, + "learning_rate": 2.0853233509745704e-05, + "loss": 0.6802, + "step": 14520 + }, + { + "epoch": 1.86, + "learning_rate": 2.046598683361301e-05, + "loss": 0.6809, + "step": 14540 + }, + { + "epoch": 1.87, + "learning_rate": 2.0078740157480316e-05, + "loss": 0.6802, + "step": 14560 + }, + { + "epoch": 1.87, + "learning_rate": 1.9691493481347615e-05, + "loss": 0.6763, + "step": 14580 + }, + { + "epoch": 1.87, + "learning_rate": 1.930424680521492e-05, + "loss": 0.6876, + "step": 14600 + }, + { + "epoch": 1.87, + "eval_loss": 0.6882807612419128, + "eval_runtime": 177.6338, + "eval_samples_per_second": 11.259, + "eval_steps_per_second": 1.407, + "step": 14600 + }, + { + "epoch": 1.88, + "learning_rate": 1.8917000129082226e-05, + "loss": 0.6789, + "step": 14620 + }, + { + "epoch": 1.88, + "learning_rate": 1.8529753452949525e-05, + "loss": 0.6772, + "step": 14640 + }, + { + "epoch": 1.88, + "learning_rate": 1.814250677681683e-05, + "loss": 0.6783, + "step": 14660 + }, + { + "epoch": 1.88, + "learning_rate": 1.7755260100684133e-05, + "loss": 0.6829, + "step": 14680 + }, + { + "epoch": 1.89, + "learning_rate": 1.736801342455144e-05, + "loss": 0.6809, + "step": 14700 + }, + { + "epoch": 1.89, + "learning_rate": 1.698076674841874e-05, + "loss": 0.6766, + "step": 14720 + }, + { + "epoch": 1.89, + "learning_rate": 1.6593520072286043e-05, + "loss": 0.6813, + "step": 
14740 + }, + { + "epoch": 1.89, + "learning_rate": 1.620627339615335e-05, + "loss": 0.6793, + "step": 14760 + }, + { + "epoch": 1.9, + "learning_rate": 1.581902672002065e-05, + "loss": 0.6736, + "step": 14780 + }, + { + "epoch": 1.9, + "learning_rate": 1.5431780043887957e-05, + "loss": 0.6842, + "step": 14800 + }, + { + "epoch": 1.9, + "eval_loss": 0.6880614757537842, + "eval_runtime": 177.7541, + "eval_samples_per_second": 11.251, + "eval_steps_per_second": 1.406, + "step": 14800 + }, + { + "epoch": 1.9, + "learning_rate": 1.5044533367755258e-05, + "loss": 0.682, + "step": 14820 + }, + { + "epoch": 1.9, + "learning_rate": 1.4657286691622562e-05, + "loss": 0.6776, + "step": 14840 + }, + { + "epoch": 1.91, + "learning_rate": 1.4270040015489866e-05, + "loss": 0.6776, + "step": 14860 + }, + { + "epoch": 1.91, + "learning_rate": 1.3882793339357168e-05, + "loss": 0.6717, + "step": 14880 + }, + { + "epoch": 1.91, + "learning_rate": 1.3495546663224472e-05, + "loss": 0.6871, + "step": 14900 + }, + { + "epoch": 1.91, + "learning_rate": 1.3108299987091776e-05, + "loss": 0.6872, + "step": 14920 + }, + { + "epoch": 1.92, + "learning_rate": 1.272105331095908e-05, + "loss": 0.6833, + "step": 14940 + }, + { + "epoch": 1.92, + "learning_rate": 1.2333806634826383e-05, + "loss": 0.6926, + "step": 14960 + }, + { + "epoch": 1.92, + "learning_rate": 1.1946559958693687e-05, + "loss": 0.6741, + "step": 14980 + }, + { + "epoch": 1.92, + "learning_rate": 1.1559313282560991e-05, + "loss": 0.6756, + "step": 15000 + }, + { + "epoch": 1.92, + "eval_loss": 0.6879639625549316, + "eval_runtime": 177.6222, + "eval_samples_per_second": 11.26, + "eval_steps_per_second": 1.407, + "step": 15000 + }, + { + "epoch": 1.93, + "learning_rate": 1.1172066606428293e-05, + "loss": 0.6803, + "step": 15020 + }, + { + "epoch": 1.93, + "learning_rate": 1.0784819930295599e-05, + "loss": 0.6753, + "step": 15040 + }, + { + "epoch": 1.93, + "learning_rate": 1.0397573254162901e-05, + "loss": 0.6749, + "step": 15060 + 
}, + { + "epoch": 1.93, + "learning_rate": 1.0010326578030204e-05, + "loss": 0.6792, + "step": 15080 + }, + { + "epoch": 1.94, + "learning_rate": 9.623079901897508e-06, + "loss": 0.6798, + "step": 15100 + }, + { + "epoch": 1.94, + "learning_rate": 9.235833225764812e-06, + "loss": 0.6838, + "step": 15120 + }, + { + "epoch": 1.94, + "learning_rate": 8.848586549632114e-06, + "loss": 0.6785, + "step": 15140 + }, + { + "epoch": 1.94, + "learning_rate": 8.461339873499418e-06, + "loss": 0.6858, + "step": 15160 + }, + { + "epoch": 1.95, + "learning_rate": 8.074093197366722e-06, + "loss": 0.6768, + "step": 15180 + }, + { + "epoch": 1.95, + "learning_rate": 7.686846521234025e-06, + "loss": 0.6891, + "step": 15200 + }, + { + "epoch": 1.95, + "eval_loss": 0.6878132224082947, + "eval_runtime": 177.6408, + "eval_samples_per_second": 11.259, + "eval_steps_per_second": 1.407, + "step": 15200 + }, + { + "epoch": 1.95, + "learning_rate": 7.299599845101329e-06, + "loss": 0.6757, + "step": 15220 + }, + { + "epoch": 1.95, + "learning_rate": 6.912353168968632e-06, + "loss": 0.6834, + "step": 15240 + }, + { + "epoch": 1.96, + "learning_rate": 6.525106492835936e-06, + "loss": 0.6777, + "step": 15260 + }, + { + "epoch": 1.96, + "learning_rate": 6.13785981670324e-06, + "loss": 0.6833, + "step": 15280 + }, + { + "epoch": 1.96, + "learning_rate": 5.750613140570543e-06, + "loss": 0.6764, + "step": 15300 + }, + { + "epoch": 1.96, + "learning_rate": 5.363366464437847e-06, + "loss": 0.6732, + "step": 15320 + }, + { + "epoch": 1.97, + "learning_rate": 4.9761197883051494e-06, + "loss": 0.6804, + "step": 15340 + }, + { + "epoch": 1.97, + "learning_rate": 4.5888731121724535e-06, + "loss": 0.6904, + "step": 15360 + }, + { + "epoch": 1.97, + "learning_rate": 4.201626436039757e-06, + "loss": 0.6798, + "step": 15380 + }, + { + "epoch": 1.98, + "learning_rate": 3.8143797599070603e-06, + "loss": 0.6728, + "step": 15400 + }, + { + "epoch": 1.98, + "eval_loss": 0.6877534985542297, + "eval_runtime": 177.4956, 
+ "eval_samples_per_second": 11.268, + "eval_steps_per_second": 1.408, + "step": 15400 + } + ], + "max_steps": 15594, + "num_train_epochs": 2, + "total_flos": 2.0018008724424622e+19, + "trial_name": null, + "trial_params": null +} diff --git a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/training_args.bin b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4820a8207e6b86d4107eb87f94c763093a3c7f88 --- /dev/null +++ b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53d327ea9f712be818d41a24603cd835992a4e9e3612a85caf2415ab699d6a50 +size 3579