manu committed
Commit 36f92c4
1 parent: 87585a6

Upload folder using huggingface_hub

README.md ADDED
@@ -0,0 +1,148 @@
1
+ ---
2
+ license: mit
3
+ base_model: croissantllm/CroissantLLMBase
4
+ tags:
5
+ - generated_from_trainer
6
+ model-index:
7
+ - name: out
8
+ results: []
9
+ ---
10
+
11
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
+ should probably proofread and complete it, then remove this comment. -->
13
+
14
+ [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
15
+ <details><summary>See axolotl config</summary>
16
+
17
+ axolotl version: `0.4.0`
18
+ ```yaml
19
+ base_model: croissantllm/CroissantLLMBase
20
+ model_type: LlamaForCausalLM
21
+ tokenizer_type: LlamaTokenizerFast
22
+ is_llama_derived_model: true
23
+
24
+ load_in_8bit: false
25
+ load_in_4bit: false
26
+ strict: false
27
+
28
+ datasets:
29
+ - path: manu/mmlu_auxiliary_train_formatted_2
30
+ split: train
31
+ type: completion
32
+
33
+ dataset_prepared_path: last_run_prepared
34
+ val_set_size: 0.05
35
+ output_dir: ./out
36
+
37
+ sequence_len: 2048
38
+ sample_packing: true
39
+ pad_to_sequence_len: true
40
+
41
+ adapter:
42
+ lora_model_dir:
43
+ lora_r:
44
+ lora_alpha:
45
+ lora_dropout:
46
+ lora_target_linear:
47
+ lora_fan_in_fan_out:
48
+
49
+ wandb_project:
50
+ wandb_entity:
51
+ wandb_watch:
52
+ wandb_name:
53
+ wandb_log_model:
54
+
55
+ gradient_accumulation_steps: 4
56
+ micro_batch_size: 16
57
+ num_epochs: 2
58
+ optimizer: adamw_bnb_8bit
59
+ lr_scheduler: cosine
60
+ learning_rate: 0.0002
61
+
62
+ train_on_inputs: false
63
+ group_by_length: false
64
+ bf16: auto
65
+ fp16:
66
+ tf32: false
67
+
68
+ gradient_checkpointing: true
69
+ early_stopping_patience:
70
+ resume_from_checkpoint:
71
+ local_rank:
72
+ logging_steps: 1
73
+ xformers_attention:
74
+ flash_attention: true
75
+ flash_attn_cross_entropy: false
76
+ flash_attn_rms_norm: true
77
+ flash_attn_fuse_qkv: false
78
+ flash_attn_fuse_mlp: true
79
+
80
+ warmup_steps: 50
81
+ evals_per_epoch: 4
82
+ eval_table_size:
83
+ saves_per_epoch: 1
84
+ debug:
85
+ deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
86
+ weight_decay: 0.1
87
+ fsdp:
88
+ fsdp_config:
89
+ special_tokens:
90
+
91
+ ```
92
+
93
+ </details><br>
94
+
95
+ # out
96
+
97
+ This model is a fine-tuned version of [croissantllm/CroissantLLMBase](https://huggingface.co/croissantllm/CroissantLLMBase) on the manu/mmlu_auxiliary_train_formatted_2 dataset.
98
+ It achieves the following results on the evaluation set:
99
+ - Loss: 0.7378
100
+
101
+ ## Model description
102
+
103
+ More information needed
104
+
105
+ ## Intended uses & limitations
106
+
107
+ More information needed
108
+
109
+ ## Training and evaluation data
110
+
111
+ More information needed
112
+
113
+ ## Training procedure
114
+
115
+ ### Training hyperparameters
116
+
117
+ The following hyperparameters were used during training:
118
+ - learning_rate: 0.0002
119
+ - train_batch_size: 16
120
+ - eval_batch_size: 16
121
+ - seed: 42
122
+ - gradient_accumulation_steps: 4
123
+ - total_train_batch_size: 64
124
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
125
+ - lr_scheduler_type: cosine
126
+ - lr_scheduler_warmup_steps: 50
127
+ - num_epochs: 2
128
+
129
+ ### Training results
130
+
131
+ | Training Loss | Epoch | Step | Validation Loss |
132
+ |:-------------:|:-----:|:----:|:---------------:|
133
+ | 2.5429 | 0.0 | 1 | 2.5242 |
134
+ | 2.2283 | 0.25 | 69 | 2.2514 |
135
+ | 1.9539 | 0.5 | 138 | 2.0381 |
136
+ | 1.6608 | 0.75 | 207 | 1.6872 |
137
+ | 1.3767 | 1.0 | 276 | 1.3323 |
138
+ | 0.7872 | 1.23 | 345 | 1.0583 |
139
+ | 0.5873 | 1.48 | 414 | 0.8251 |
140
+ | 0.5154 | 1.73 | 483 | 0.7378 |
141
+
142
+
143
+ ### Framework versions
144
+
145
+ - Transformers 4.38.0.dev0
146
+ - Pytorch 2.1.2+cu121
147
+ - Datasets 2.16.1
148
+ - Tokenizers 0.15.0
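
The card above stops at framework versions, so a minimal loading sketch follows. It assumes the fine-tuned weights sit in the training output directory `./out` named in the axolotl config (swap in the Hub repo id if loading the pushed model); the prompt and sampling settings are illustrative assumptions, not values from the training run.

```python
# Minimal sketch: load the fine-tuned CroissantLLM checkpoint from the axolotl
# output directory and run a short completion. The path, prompt, and sampling
# settings below are assumptions for illustration only.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "./out"  # output_dir from the config; replace with the Hub repo id if applicable

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.bfloat16)

prompt = "Question: What is the capital of France?\nAnswer:"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```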
checkpoint-275/config.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "_name_or_path": "croissantllm/CroissantLLMBase",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 1,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5504,
14
+ "max_position_embeddings": 2048,
15
+ "model_type": "llama",
16
+ "num_attention_heads": 16,
17
+ "num_hidden_layers": 24,
18
+ "num_key_value_heads": 16,
19
+ "pretraining_tp": 1,
20
+ "rms_norm_eps": 1e-05,
21
+ "rope_scaling": null,
22
+ "rope_theta": 10000.0,
23
+ "tie_word_embeddings": false,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.38.0.dev0",
26
+ "use_cache": false,
27
+ "vocab_size": 32000
28
+ }
checkpoint-275/generation_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "do_sample": true,
5
+ "eos_token_id": 2,
6
+ "transformers_version": "4.38.0.dev0"
7
+ }
checkpoint-275/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08eb85b18ec83cbf0d6725ad7811485e5ef17519885afea57a68358bbb98860b
3
+ size 2690869336
checkpoint-275/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e1eea1fbf256bf99f9781eb5e79f2e3019f0c0e22d62a74d8dbe91ebb7d3b75
3
+ size 2696906170
checkpoint-275/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48ee9b73399c28d7e668360bf1d5a4d11095c4738bf96c13f7bb6fbff59f8ccb
3
+ size 14244
checkpoint-275/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e5869532b4cacf643b3611a6255483a77a20e53a0a06d58c89740d9446395ea
3
+ size 1064
checkpoint-275/trainer_state.json ADDED
@@ -0,0 +1,1703 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 69,
6
+ "global_step": 275,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 4.000000000000001e-06,
14
+ "loss": 2.5429,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.0,
19
+ "eval_loss": 2.5242362022399902,
20
+ "eval_runtime": 44.0094,
21
+ "eval_samples_per_second": 25.676,
22
+ "eval_steps_per_second": 1.613,
23
+ "step": 1
24
+ },
25
+ {
26
+ "epoch": 0.01,
27
+ "learning_rate": 8.000000000000001e-06,
28
+ "loss": 2.5186,
29
+ "step": 2
30
+ },
31
+ {
32
+ "epoch": 0.01,
33
+ "learning_rate": 1.2e-05,
34
+ "loss": 2.5088,
35
+ "step": 3
36
+ },
37
+ {
38
+ "epoch": 0.01,
39
+ "learning_rate": 1.6000000000000003e-05,
40
+ "loss": 2.5423,
41
+ "step": 4
42
+ },
43
+ {
44
+ "epoch": 0.02,
45
+ "learning_rate": 2e-05,
46
+ "loss": 2.5137,
47
+ "step": 5
48
+ },
49
+ {
50
+ "epoch": 0.02,
51
+ "learning_rate": 2.4e-05,
52
+ "loss": 2.4817,
53
+ "step": 6
54
+ },
55
+ {
56
+ "epoch": 0.03,
57
+ "learning_rate": 2.8000000000000003e-05,
58
+ "loss": 2.4889,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 0.03,
63
+ "learning_rate": 3.2000000000000005e-05,
64
+ "loss": 2.4334,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.03,
69
+ "learning_rate": 3.6e-05,
70
+ "loss": 2.4707,
71
+ "step": 9
72
+ },
73
+ {
74
+ "epoch": 0.04,
75
+ "learning_rate": 4e-05,
76
+ "loss": 2.4251,
77
+ "step": 10
78
+ },
79
+ {
80
+ "epoch": 0.04,
81
+ "learning_rate": 4.4000000000000006e-05,
82
+ "loss": 2.4167,
83
+ "step": 11
84
+ },
85
+ {
86
+ "epoch": 0.04,
87
+ "learning_rate": 4.8e-05,
88
+ "loss": 2.4235,
89
+ "step": 12
90
+ },
91
+ {
92
+ "epoch": 0.05,
93
+ "learning_rate": 5.2000000000000004e-05,
94
+ "loss": 2.411,
95
+ "step": 13
96
+ },
97
+ {
98
+ "epoch": 0.05,
99
+ "learning_rate": 5.6000000000000006e-05,
100
+ "loss": 2.4181,
101
+ "step": 14
102
+ },
103
+ {
104
+ "epoch": 0.05,
105
+ "learning_rate": 6e-05,
106
+ "loss": 2.3847,
107
+ "step": 15
108
+ },
109
+ {
110
+ "epoch": 0.06,
111
+ "learning_rate": 6.400000000000001e-05,
112
+ "loss": 2.407,
113
+ "step": 16
114
+ },
115
+ {
116
+ "epoch": 0.06,
117
+ "learning_rate": 6.800000000000001e-05,
118
+ "loss": 2.3764,
119
+ "step": 17
120
+ },
121
+ {
122
+ "epoch": 0.07,
123
+ "learning_rate": 7.2e-05,
124
+ "loss": 2.3801,
125
+ "step": 18
126
+ },
127
+ {
128
+ "epoch": 0.07,
129
+ "learning_rate": 7.6e-05,
130
+ "loss": 2.387,
131
+ "step": 19
132
+ },
133
+ {
134
+ "epoch": 0.07,
135
+ "learning_rate": 8e-05,
136
+ "loss": 2.3782,
137
+ "step": 20
138
+ },
139
+ {
140
+ "epoch": 0.08,
141
+ "learning_rate": 8.4e-05,
142
+ "loss": 2.3684,
143
+ "step": 21
144
+ },
145
+ {
146
+ "epoch": 0.08,
147
+ "learning_rate": 8.800000000000001e-05,
148
+ "loss": 2.381,
149
+ "step": 22
150
+ },
151
+ {
152
+ "epoch": 0.08,
153
+ "learning_rate": 9.200000000000001e-05,
154
+ "loss": 2.381,
155
+ "step": 23
156
+ },
157
+ {
158
+ "epoch": 0.09,
159
+ "learning_rate": 9.6e-05,
160
+ "loss": 2.34,
161
+ "step": 24
162
+ },
163
+ {
164
+ "epoch": 0.09,
165
+ "learning_rate": 0.0001,
166
+ "loss": 2.3467,
167
+ "step": 25
168
+ },
169
+ {
170
+ "epoch": 0.09,
171
+ "learning_rate": 0.00010400000000000001,
172
+ "loss": 2.335,
173
+ "step": 26
174
+ },
175
+ {
176
+ "epoch": 0.1,
177
+ "learning_rate": 0.00010800000000000001,
178
+ "loss": 2.3741,
179
+ "step": 27
180
+ },
181
+ {
182
+ "epoch": 0.1,
183
+ "learning_rate": 0.00011200000000000001,
184
+ "loss": 2.3716,
185
+ "step": 28
186
+ },
187
+ {
188
+ "epoch": 0.11,
189
+ "learning_rate": 0.000116,
190
+ "loss": 2.3168,
191
+ "step": 29
192
+ },
193
+ {
194
+ "epoch": 0.11,
195
+ "learning_rate": 0.00012,
196
+ "loss": 2.3438,
197
+ "step": 30
198
+ },
199
+ {
200
+ "epoch": 0.11,
201
+ "learning_rate": 0.000124,
202
+ "loss": 2.3625,
203
+ "step": 31
204
+ },
205
+ {
206
+ "epoch": 0.12,
207
+ "learning_rate": 0.00012800000000000002,
208
+ "loss": 2.3581,
209
+ "step": 32
210
+ },
211
+ {
212
+ "epoch": 0.12,
213
+ "learning_rate": 0.000132,
214
+ "loss": 2.3283,
215
+ "step": 33
216
+ },
217
+ {
218
+ "epoch": 0.12,
219
+ "learning_rate": 0.00013600000000000003,
220
+ "loss": 2.3691,
221
+ "step": 34
222
+ },
223
+ {
224
+ "epoch": 0.13,
225
+ "learning_rate": 0.00014,
226
+ "loss": 2.3538,
227
+ "step": 35
228
+ },
229
+ {
230
+ "epoch": 0.13,
231
+ "learning_rate": 0.000144,
232
+ "loss": 2.337,
233
+ "step": 36
234
+ },
235
+ {
236
+ "epoch": 0.13,
237
+ "learning_rate": 0.000148,
238
+ "loss": 2.2913,
239
+ "step": 37
240
+ },
241
+ {
242
+ "epoch": 0.14,
243
+ "learning_rate": 0.000152,
244
+ "loss": 2.2872,
245
+ "step": 38
246
+ },
247
+ {
248
+ "epoch": 0.14,
249
+ "learning_rate": 0.00015600000000000002,
250
+ "loss": 2.319,
251
+ "step": 39
252
+ },
253
+ {
254
+ "epoch": 0.15,
255
+ "learning_rate": 0.00016,
256
+ "loss": 2.3445,
257
+ "step": 40
258
+ },
259
+ {
260
+ "epoch": 0.15,
261
+ "learning_rate": 0.000164,
262
+ "loss": 2.3293,
263
+ "step": 41
264
+ },
265
+ {
266
+ "epoch": 0.15,
267
+ "learning_rate": 0.000168,
268
+ "loss": 2.3142,
269
+ "step": 42
270
+ },
271
+ {
272
+ "epoch": 0.16,
273
+ "learning_rate": 0.000172,
274
+ "loss": 2.3172,
275
+ "step": 43
276
+ },
277
+ {
278
+ "epoch": 0.16,
279
+ "learning_rate": 0.00017600000000000002,
280
+ "loss": 2.3179,
281
+ "step": 44
282
+ },
283
+ {
284
+ "epoch": 0.16,
285
+ "learning_rate": 0.00018,
286
+ "loss": 2.3018,
287
+ "step": 45
288
+ },
289
+ {
290
+ "epoch": 0.17,
291
+ "learning_rate": 0.00018400000000000003,
292
+ "loss": 2.3266,
293
+ "step": 46
294
+ },
295
+ {
296
+ "epoch": 0.17,
297
+ "learning_rate": 0.000188,
298
+ "loss": 2.3131,
299
+ "step": 47
300
+ },
301
+ {
302
+ "epoch": 0.17,
303
+ "learning_rate": 0.000192,
304
+ "loss": 2.3235,
305
+ "step": 48
306
+ },
307
+ {
308
+ "epoch": 0.18,
309
+ "learning_rate": 0.000196,
310
+ "loss": 2.3281,
311
+ "step": 49
312
+ },
313
+ {
314
+ "epoch": 0.18,
315
+ "learning_rate": 0.0002,
316
+ "loss": 2.3332,
317
+ "step": 50
318
+ },
319
+ {
320
+ "epoch": 0.19,
321
+ "learning_rate": 0.0001999980260856137,
322
+ "loss": 2.3148,
323
+ "step": 51
324
+ },
325
+ {
326
+ "epoch": 0.19,
327
+ "learning_rate": 0.00019999210442038162,
328
+ "loss": 2.2979,
329
+ "step": 52
330
+ },
331
+ {
332
+ "epoch": 0.19,
333
+ "learning_rate": 0.0001999822352380809,
334
+ "loss": 2.3257,
335
+ "step": 53
336
+ },
337
+ {
338
+ "epoch": 0.2,
339
+ "learning_rate": 0.00019996841892833,
340
+ "loss": 2.2947,
341
+ "step": 54
342
+ },
343
+ {
344
+ "epoch": 0.2,
345
+ "learning_rate": 0.00019995065603657316,
346
+ "loss": 2.3266,
347
+ "step": 55
348
+ },
349
+ {
350
+ "epoch": 0.2,
351
+ "learning_rate": 0.00019992894726405893,
352
+ "loss": 2.3385,
353
+ "step": 56
354
+ },
355
+ {
356
+ "epoch": 0.21,
357
+ "learning_rate": 0.0001999032934678125,
358
+ "loss": 2.3289,
359
+ "step": 57
360
+ },
361
+ {
362
+ "epoch": 0.21,
363
+ "learning_rate": 0.00019987369566060176,
364
+ "loss": 2.2741,
365
+ "step": 58
366
+ },
367
+ {
368
+ "epoch": 0.21,
369
+ "learning_rate": 0.00019984015501089752,
370
+ "loss": 2.2867,
371
+ "step": 59
372
+ },
373
+ {
374
+ "epoch": 0.22,
375
+ "learning_rate": 0.00019980267284282717,
376
+ "loss": 2.2836,
377
+ "step": 60
378
+ },
379
+ {
380
+ "epoch": 0.22,
381
+ "learning_rate": 0.00019976125063612252,
382
+ "loss": 2.3015,
383
+ "step": 61
384
+ },
385
+ {
386
+ "epoch": 0.23,
387
+ "learning_rate": 0.0001997158900260614,
388
+ "loss": 2.3233,
389
+ "step": 62
390
+ },
391
+ {
392
+ "epoch": 0.23,
393
+ "learning_rate": 0.00019966659280340297,
394
+ "loss": 2.2758,
395
+ "step": 63
396
+ },
397
+ {
398
+ "epoch": 0.23,
399
+ "learning_rate": 0.00019961336091431727,
400
+ "loss": 2.2709,
401
+ "step": 64
402
+ },
403
+ {
404
+ "epoch": 0.24,
405
+ "learning_rate": 0.00019955619646030802,
406
+ "loss": 2.2797,
407
+ "step": 65
408
+ },
409
+ {
410
+ "epoch": 0.24,
411
+ "learning_rate": 0.00019949510169813003,
412
+ "loss": 2.304,
413
+ "step": 66
414
+ },
415
+ {
416
+ "epoch": 0.24,
417
+ "learning_rate": 0.0001994300790396999,
418
+ "loss": 2.276,
419
+ "step": 67
420
+ },
421
+ {
422
+ "epoch": 0.25,
423
+ "learning_rate": 0.00019936113105200085,
424
+ "loss": 2.2041,
425
+ "step": 68
426
+ },
427
+ {
428
+ "epoch": 0.25,
429
+ "learning_rate": 0.00019928826045698136,
430
+ "loss": 2.2283,
431
+ "step": 69
432
+ },
433
+ {
434
+ "epoch": 0.25,
435
+ "eval_loss": 2.2513792514801025,
436
+ "eval_runtime": 44.1028,
437
+ "eval_samples_per_second": 25.622,
438
+ "eval_steps_per_second": 1.61,
439
+ "step": 69
440
+ },
441
+ {
442
+ "epoch": 0.25,
443
+ "learning_rate": 0.0001992114701314478,
444
+ "loss": 2.2609,
445
+ "step": 70
446
+ },
447
+ {
448
+ "epoch": 0.26,
449
+ "learning_rate": 0.00019913076310695068,
450
+ "loss": 2.2827,
451
+ "step": 71
452
+ },
453
+ {
454
+ "epoch": 0.26,
455
+ "learning_rate": 0.00019904614256966512,
456
+ "loss": 2.209,
457
+ "step": 72
458
+ },
459
+ {
460
+ "epoch": 0.27,
461
+ "learning_rate": 0.0001989576118602651,
462
+ "loss": 2.2145,
463
+ "step": 73
464
+ },
465
+ {
466
+ "epoch": 0.27,
467
+ "learning_rate": 0.0001988651744737914,
468
+ "loss": 2.2473,
469
+ "step": 74
470
+ },
471
+ {
472
+ "epoch": 0.27,
473
+ "learning_rate": 0.00019876883405951377,
474
+ "loss": 2.2453,
475
+ "step": 75
476
+ },
477
+ {
478
+ "epoch": 0.28,
479
+ "learning_rate": 0.0001986685944207868,
480
+ "loss": 2.211,
481
+ "step": 76
482
+ },
483
+ {
484
+ "epoch": 0.28,
485
+ "learning_rate": 0.00019856445951489982,
486
+ "loss": 2.2569,
487
+ "step": 77
488
+ },
489
+ {
490
+ "epoch": 0.28,
491
+ "learning_rate": 0.00019845643345292054,
492
+ "loss": 2.2212,
493
+ "step": 78
494
+ },
495
+ {
496
+ "epoch": 0.29,
497
+ "learning_rate": 0.00019834452049953297,
498
+ "loss": 2.1689,
499
+ "step": 79
500
+ },
501
+ {
502
+ "epoch": 0.29,
503
+ "learning_rate": 0.0001982287250728689,
504
+ "loss": 2.2705,
505
+ "step": 80
506
+ },
507
+ {
508
+ "epoch": 0.29,
509
+ "learning_rate": 0.0001981090517443334,
510
+ "loss": 2.1772,
511
+ "step": 81
512
+ },
513
+ {
514
+ "epoch": 0.3,
515
+ "learning_rate": 0.0001979855052384247,
516
+ "loss": 2.1771,
517
+ "step": 82
518
+ },
519
+ {
520
+ "epoch": 0.3,
521
+ "learning_rate": 0.00019785809043254722,
522
+ "loss": 2.2058,
523
+ "step": 83
524
+ },
525
+ {
526
+ "epoch": 0.31,
527
+ "learning_rate": 0.00019772681235681936,
528
+ "loss": 2.1685,
529
+ "step": 84
530
+ },
531
+ {
532
+ "epoch": 0.31,
533
+ "learning_rate": 0.00019759167619387476,
534
+ "loss": 2.1862,
535
+ "step": 85
536
+ },
537
+ {
538
+ "epoch": 0.31,
539
+ "learning_rate": 0.00019745268727865774,
540
+ "loss": 2.1707,
541
+ "step": 86
542
+ },
543
+ {
544
+ "epoch": 0.32,
545
+ "learning_rate": 0.00019730985109821266,
546
+ "loss": 2.1632,
547
+ "step": 87
548
+ },
549
+ {
550
+ "epoch": 0.32,
551
+ "learning_rate": 0.0001971631732914674,
552
+ "loss": 2.1792,
553
+ "step": 88
554
+ },
555
+ {
556
+ "epoch": 0.32,
557
+ "learning_rate": 0.0001970126596490106,
558
+ "loss": 2.1885,
559
+ "step": 89
560
+ },
561
+ {
562
+ "epoch": 0.33,
563
+ "learning_rate": 0.0001968583161128631,
564
+ "loss": 2.2058,
565
+ "step": 90
566
+ },
567
+ {
568
+ "epoch": 0.33,
569
+ "learning_rate": 0.00019670014877624353,
570
+ "loss": 2.1616,
571
+ "step": 91
572
+ },
573
+ {
574
+ "epoch": 0.33,
575
+ "learning_rate": 0.0001965381638833274,
576
+ "loss": 2.1764,
577
+ "step": 92
578
+ },
579
+ {
580
+ "epoch": 0.34,
581
+ "learning_rate": 0.000196372367829001,
582
+ "loss": 2.1524,
583
+ "step": 93
584
+ },
585
+ {
586
+ "epoch": 0.34,
587
+ "learning_rate": 0.0001962027671586086,
588
+ "loss": 2.1466,
589
+ "step": 94
590
+ },
591
+ {
592
+ "epoch": 0.35,
593
+ "learning_rate": 0.0001960293685676943,
594
+ "loss": 2.1066,
595
+ "step": 95
596
+ },
597
+ {
598
+ "epoch": 0.35,
599
+ "learning_rate": 0.0001958521789017376,
600
+ "loss": 2.1546,
601
+ "step": 96
602
+ },
603
+ {
604
+ "epoch": 0.35,
605
+ "learning_rate": 0.00019567120515588308,
606
+ "loss": 2.1259,
607
+ "step": 97
608
+ },
609
+ {
610
+ "epoch": 0.36,
611
+ "learning_rate": 0.00019548645447466431,
612
+ "loss": 2.1833,
613
+ "step": 98
614
+ },
615
+ {
616
+ "epoch": 0.36,
617
+ "learning_rate": 0.00019529793415172192,
618
+ "loss": 2.1349,
619
+ "step": 99
620
+ },
621
+ {
622
+ "epoch": 0.36,
623
+ "learning_rate": 0.00019510565162951537,
624
+ "loss": 2.1688,
625
+ "step": 100
626
+ },
627
+ {
628
+ "epoch": 0.37,
629
+ "learning_rate": 0.00019490961449902946,
630
+ "loss": 2.1241,
631
+ "step": 101
632
+ },
633
+ {
634
+ "epoch": 0.37,
635
+ "learning_rate": 0.00019470983049947444,
636
+ "loss": 2.0855,
637
+ "step": 102
638
+ },
639
+ {
640
+ "epoch": 0.37,
641
+ "learning_rate": 0.00019450630751798048,
642
+ "loss": 2.1153,
643
+ "step": 103
644
+ },
645
+ {
646
+ "epoch": 0.38,
647
+ "learning_rate": 0.00019429905358928646,
648
+ "loss": 2.134,
649
+ "step": 104
650
+ },
651
+ {
652
+ "epoch": 0.38,
653
+ "learning_rate": 0.00019408807689542257,
654
+ "loss": 2.097,
655
+ "step": 105
656
+ },
657
+ {
658
+ "epoch": 0.39,
659
+ "learning_rate": 0.00019387338576538744,
660
+ "loss": 2.0723,
661
+ "step": 106
662
+ },
663
+ {
664
+ "epoch": 0.39,
665
+ "learning_rate": 0.00019365498867481923,
666
+ "loss": 2.1019,
667
+ "step": 107
668
+ },
669
+ {
670
+ "epoch": 0.39,
671
+ "learning_rate": 0.00019343289424566122,
672
+ "loss": 2.1268,
673
+ "step": 108
674
+ },
675
+ {
676
+ "epoch": 0.4,
677
+ "learning_rate": 0.0001932071112458211,
678
+ "loss": 2.0742,
679
+ "step": 109
680
+ },
681
+ {
682
+ "epoch": 0.4,
683
+ "learning_rate": 0.00019297764858882514,
684
+ "loss": 2.0667,
685
+ "step": 110
686
+ },
687
+ {
688
+ "epoch": 0.4,
689
+ "learning_rate": 0.00019274451533346615,
690
+ "loss": 2.066,
691
+ "step": 111
692
+ },
693
+ {
694
+ "epoch": 0.41,
695
+ "learning_rate": 0.0001925077206834458,
696
+ "loss": 2.0845,
697
+ "step": 112
698
+ },
699
+ {
700
+ "epoch": 0.41,
701
+ "learning_rate": 0.0001922672739870115,
702
+ "loss": 2.112,
703
+ "step": 113
704
+ },
705
+ {
706
+ "epoch": 0.41,
707
+ "learning_rate": 0.00019202318473658705,
708
+ "loss": 2.1757,
709
+ "step": 114
710
+ },
711
+ {
712
+ "epoch": 0.42,
713
+ "learning_rate": 0.00019177546256839812,
714
+ "loss": 2.12,
715
+ "step": 115
716
+ },
717
+ {
718
+ "epoch": 0.42,
719
+ "learning_rate": 0.00019152411726209176,
720
+ "loss": 2.0802,
721
+ "step": 116
722
+ },
723
+ {
724
+ "epoch": 0.43,
725
+ "learning_rate": 0.0001912691587403503,
726
+ "loss": 2.0203,
727
+ "step": 117
728
+ },
729
+ {
730
+ "epoch": 0.43,
731
+ "learning_rate": 0.00019101059706849957,
732
+ "loss": 2.078,
733
+ "step": 118
734
+ },
735
+ {
736
+ "epoch": 0.43,
737
+ "learning_rate": 0.0001907484424541117,
738
+ "loss": 2.0773,
739
+ "step": 119
740
+ },
741
+ {
742
+ "epoch": 0.44,
743
+ "learning_rate": 0.00019048270524660196,
744
+ "loss": 2.0601,
745
+ "step": 120
746
+ },
747
+ {
748
+ "epoch": 0.44,
749
+ "learning_rate": 0.00019021339593682028,
750
+ "loss": 2.0711,
751
+ "step": 121
752
+ },
753
+ {
754
+ "epoch": 0.44,
755
+ "learning_rate": 0.0001899405251566371,
756
+ "loss": 2.0607,
757
+ "step": 122
758
+ },
759
+ {
760
+ "epoch": 0.45,
761
+ "learning_rate": 0.00018966410367852362,
762
+ "loss": 2.0505,
763
+ "step": 123
764
+ },
765
+ {
766
+ "epoch": 0.45,
767
+ "learning_rate": 0.0001893841424151264,
768
+ "loss": 2.0371,
769
+ "step": 124
770
+ },
771
+ {
772
+ "epoch": 0.45,
773
+ "learning_rate": 0.0001891006524188368,
774
+ "loss": 2.0507,
775
+ "step": 125
776
+ },
777
+ {
778
+ "epoch": 0.46,
779
+ "learning_rate": 0.00018881364488135448,
780
+ "loss": 2.0541,
781
+ "step": 126
782
+ },
783
+ {
784
+ "epoch": 0.46,
785
+ "learning_rate": 0.00018852313113324552,
786
+ "loss": 2.0362,
787
+ "step": 127
788
+ },
789
+ {
790
+ "epoch": 0.47,
791
+ "learning_rate": 0.00018822912264349534,
792
+ "loss": 2.0668,
793
+ "step": 128
794
+ },
795
+ {
796
+ "epoch": 0.47,
797
+ "learning_rate": 0.00018793163101905563,
798
+ "loss": 3.3779,
799
+ "step": 129
800
+ },
801
+ {
802
+ "epoch": 0.47,
803
+ "learning_rate": 0.00018763066800438636,
804
+ "loss": 2.0519,
805
+ "step": 130
806
+ },
807
+ {
808
+ "epoch": 0.48,
809
+ "learning_rate": 0.00018732624548099204,
810
+ "loss": 2.0925,
811
+ "step": 131
812
+ },
813
+ {
814
+ "epoch": 0.48,
815
+ "learning_rate": 0.0001870183754669526,
816
+ "loss": 2.0931,
817
+ "step": 132
818
+ },
819
+ {
820
+ "epoch": 0.48,
821
+ "learning_rate": 0.000186707070116449,
822
+ "loss": 2.0339,
823
+ "step": 133
824
+ },
825
+ {
826
+ "epoch": 0.49,
827
+ "learning_rate": 0.00018639234171928353,
828
+ "loss": 1.9732,
829
+ "step": 134
830
+ },
831
+ {
832
+ "epoch": 0.49,
833
+ "learning_rate": 0.0001860742027003944,
834
+ "loss": 2.0093,
835
+ "step": 135
836
+ },
837
+ {
838
+ "epoch": 0.49,
839
+ "learning_rate": 0.00018575266561936523,
840
+ "loss": 1.9884,
841
+ "step": 136
842
+ },
843
+ {
844
+ "epoch": 0.5,
845
+ "learning_rate": 0.0001854277431699295,
846
+ "loss": 1.9744,
847
+ "step": 137
848
+ },
849
+ {
850
+ "epoch": 0.5,
851
+ "learning_rate": 0.00018509944817946922,
852
+ "loss": 1.9539,
853
+ "step": 138
854
+ },
855
+ {
856
+ "epoch": 0.5,
857
+ "eval_loss": 2.0380747318267822,
858
+ "eval_runtime": 44.126,
859
+ "eval_samples_per_second": 25.608,
860
+ "eval_steps_per_second": 1.609,
861
+ "step": 138
862
+ },
863
+ {
864
+ "epoch": 0.51,
865
+ "learning_rate": 0.00018476779360850832,
866
+ "loss": 2.0191,
867
+ "step": 139
868
+ },
869
+ {
870
+ "epoch": 0.51,
871
+ "learning_rate": 0.00018443279255020152,
872
+ "loss": 2.0294,
873
+ "step": 140
874
+ },
875
+ {
876
+ "epoch": 0.51,
877
+ "learning_rate": 0.00018409445822981693,
878
+ "loss": 2.021,
879
+ "step": 141
880
+ },
881
+ {
882
+ "epoch": 0.52,
883
+ "learning_rate": 0.0001837528040042142,
884
+ "loss": 2.0358,
885
+ "step": 142
886
+ },
887
+ {
888
+ "epoch": 0.52,
889
+ "learning_rate": 0.00018340784336131713,
890
+ "loss": 1.9937,
891
+ "step": 143
892
+ },
893
+ {
894
+ "epoch": 0.52,
895
+ "learning_rate": 0.00018305958991958127,
896
+ "loss": 2.0256,
897
+ "step": 144
898
+ },
899
+ {
900
+ "epoch": 0.53,
901
+ "learning_rate": 0.00018270805742745617,
902
+ "loss": 2.0468,
903
+ "step": 145
904
+ },
905
+ {
906
+ "epoch": 0.53,
907
+ "learning_rate": 0.00018235325976284275,
908
+ "loss": 1.9277,
909
+ "step": 146
910
+ },
911
+ {
912
+ "epoch": 0.53,
913
+ "learning_rate": 0.00018199521093254523,
914
+ "loss": 2.047,
915
+ "step": 147
916
+ },
917
+ {
918
+ "epoch": 0.54,
919
+ "learning_rate": 0.00018163392507171842,
920
+ "loss": 2.0202,
921
+ "step": 148
922
+ },
923
+ {
924
+ "epoch": 0.54,
925
+ "learning_rate": 0.0001812694164433094,
926
+ "loss": 1.9753,
927
+ "step": 149
928
+ },
929
+ {
930
+ "epoch": 0.55,
931
+ "learning_rate": 0.00018090169943749476,
932
+ "loss": 1.9769,
933
+ "step": 150
934
+ },
935
+ {
936
+ "epoch": 0.55,
937
+ "learning_rate": 0.0001805307885711122,
938
+ "loss": 1.9002,
939
+ "step": 151
940
+ },
941
+ {
942
+ "epoch": 0.55,
943
+ "learning_rate": 0.00018015669848708767,
944
+ "loss": 1.8904,
945
+ "step": 152
946
+ },
947
+ {
948
+ "epoch": 0.56,
949
+ "learning_rate": 0.0001797794439538571,
950
+ "loss": 1.8454,
951
+ "step": 153
952
+ },
953
+ {
954
+ "epoch": 0.56,
955
+ "learning_rate": 0.00017939903986478355,
956
+ "loss": 1.9393,
957
+ "step": 154
958
+ },
959
+ {
960
+ "epoch": 0.56,
961
+ "learning_rate": 0.00017901550123756906,
962
+ "loss": 1.9692,
963
+ "step": 155
964
+ },
965
+ {
966
+ "epoch": 0.57,
967
+ "learning_rate": 0.00017862884321366188,
968
+ "loss": 1.8821,
969
+ "step": 156
970
+ },
971
+ {
972
+ "epoch": 0.57,
973
+ "learning_rate": 0.0001782390810576588,
974
+ "loss": 1.9025,
975
+ "step": 157
976
+ },
977
+ {
978
+ "epoch": 0.57,
979
+ "learning_rate": 0.00017784623015670238,
980
+ "loss": 1.8977,
981
+ "step": 158
982
+ },
983
+ {
984
+ "epoch": 0.58,
985
+ "learning_rate": 0.00017745030601987337,
986
+ "loss": 1.9179,
987
+ "step": 159
988
+ },
989
+ {
990
+ "epoch": 0.58,
991
+ "learning_rate": 0.00017705132427757895,
992
+ "loss": 1.8895,
993
+ "step": 160
994
+ },
995
+ {
996
+ "epoch": 0.59,
997
+ "learning_rate": 0.00017664930068093498,
998
+ "loss": 1.8531,
999
+ "step": 161
1000
+ },
1001
+ {
1002
+ "epoch": 0.59,
1003
+ "learning_rate": 0.0001762442511011448,
1004
+ "loss": 1.9298,
1005
+ "step": 162
1006
+ },
1007
+ {
1008
+ "epoch": 0.59,
1009
+ "learning_rate": 0.0001758361915288722,
1010
+ "loss": 1.862,
1011
+ "step": 163
1012
+ },
1013
+ {
1014
+ "epoch": 0.6,
1015
+ "learning_rate": 0.00017542513807361037,
1016
+ "loss": 1.9015,
1017
+ "step": 164
1018
+ },
1019
+ {
1020
+ "epoch": 0.6,
1021
+ "learning_rate": 0.00017501110696304596,
1022
+ "loss": 1.8509,
1023
+ "step": 165
1024
+ },
1025
+ {
1026
+ "epoch": 0.6,
1027
+ "learning_rate": 0.00017459411454241822,
1028
+ "loss": 1.8403,
1029
+ "step": 166
1030
+ },
1031
+ {
1032
+ "epoch": 0.61,
1033
+ "learning_rate": 0.00017417417727387394,
1034
+ "loss": 1.9279,
1035
+ "step": 167
1036
+ },
1037
+ {
1038
+ "epoch": 0.61,
1039
+ "learning_rate": 0.0001737513117358174,
1040
+ "loss": 1.7867,
1041
+ "step": 168
1042
+ },
1043
+ {
1044
+ "epoch": 0.61,
1045
+ "learning_rate": 0.00017332553462225602,
1046
+ "loss": 1.8755,
1047
+ "step": 169
1048
+ },
1049
+ {
1050
+ "epoch": 0.62,
1051
+ "learning_rate": 0.00017289686274214118,
1052
+ "loss": 1.8639,
1053
+ "step": 170
1054
+ },
1055
+ {
1056
+ "epoch": 0.62,
1057
+ "learning_rate": 0.0001724653130187047,
1058
+ "loss": 1.8795,
1059
+ "step": 171
1060
+ },
1061
+ {
1062
+ "epoch": 0.63,
1063
+ "learning_rate": 0.0001720309024887907,
1064
+ "loss": 1.8081,
1065
+ "step": 172
1066
+ },
1067
+ {
1068
+ "epoch": 0.63,
1069
+ "learning_rate": 0.00017159364830218312,
1070
+ "loss": 1.8725,
1071
+ "step": 173
1072
+ },
1073
+ {
1074
+ "epoch": 0.63,
1075
+ "learning_rate": 0.00017115356772092857,
1076
+ "loss": 1.8081,
1077
+ "step": 174
1078
+ },
1079
+ {
1080
+ "epoch": 0.64,
1081
+ "learning_rate": 0.00017071067811865476,
1082
+ "loss": 1.8939,
1083
+ "step": 175
1084
+ },
1085
+ {
1086
+ "epoch": 0.64,
1087
+ "learning_rate": 0.00017026499697988493,
1088
+ "loss": 1.819,
1089
+ "step": 176
1090
+ },
1091
+ {
1092
+ "epoch": 0.64,
1093
+ "learning_rate": 0.00016981654189934727,
1094
+ "loss": 1.8051,
1095
+ "step": 177
1096
+ },
1097
+ {
1098
+ "epoch": 0.65,
1099
+ "learning_rate": 0.0001693653305812805,
1100
+ "loss": 1.8504,
1101
+ "step": 178
1102
+ },
1103
+ {
1104
+ "epoch": 0.65,
1105
+ "learning_rate": 0.00016891138083873487,
1106
+ "loss": 1.7902,
1107
+ "step": 179
1108
+ },
1109
+ {
1110
+ "epoch": 0.65,
1111
+ "learning_rate": 0.00016845471059286887,
1112
+ "loss": 1.787,
1113
+ "step": 180
1114
+ },
1115
+ {
1116
+ "epoch": 0.66,
1117
+ "learning_rate": 0.00016799533787224192,
1118
+ "loss": 1.8533,
1119
+ "step": 181
1120
+ },
1121
+ {
1122
+ "epoch": 0.66,
1123
+ "learning_rate": 0.00016753328081210245,
1124
+ "loss": 1.831,
1125
+ "step": 182
1126
+ },
1127
+ {
1128
+ "epoch": 0.67,
1129
+ "learning_rate": 0.000167068557653672,
1130
+ "loss": 1.8204,
1131
+ "step": 183
1132
+ },
1133
+ {
1134
+ "epoch": 0.67,
1135
+ "learning_rate": 0.00016660118674342517,
1136
+ "loss": 1.8043,
1137
+ "step": 184
1138
+ },
1139
+ {
1140
+ "epoch": 0.67,
1141
+ "learning_rate": 0.00016613118653236518,
1142
+ "loss": 1.778,
1143
+ "step": 185
1144
+ },
1145
+ {
1146
+ "epoch": 0.68,
1147
+ "learning_rate": 0.00016565857557529566,
1148
+ "loss": 1.7868,
1149
+ "step": 186
1150
+ },
1151
+ {
1152
+ "epoch": 0.68,
1153
+ "learning_rate": 0.0001651833725300879,
1154
+ "loss": 1.8423,
1155
+ "step": 187
1156
+ },
1157
+ {
1158
+ "epoch": 0.68,
1159
+ "learning_rate": 0.00016470559615694446,
1160
+ "loss": 1.8157,
1161
+ "step": 188
1162
+ },
1163
+ {
1164
+ "epoch": 0.69,
1165
+ "learning_rate": 0.00016422526531765846,
1166
+ "loss": 1.7796,
1167
+ "step": 189
1168
+ },
1169
+ {
1170
+ "epoch": 0.69,
1171
+ "learning_rate": 0.000163742398974869,
1172
+ "loss": 1.705,
1173
+ "step": 190
1174
+ },
1175
+ {
1176
+ "epoch": 0.69,
1177
+ "learning_rate": 0.00016325701619131246,
1178
+ "loss": 1.7537,
1179
+ "step": 191
1180
+ },
1181
+ {
1182
+ "epoch": 0.7,
1183
+ "learning_rate": 0.00016276913612907007,
1184
+ "loss": 1.7629,
1185
+ "step": 192
1186
+ },
1187
+ {
1188
+ "epoch": 0.7,
1189
+ "learning_rate": 0.00016227877804881127,
1190
+ "loss": 1.7667,
1191
+ "step": 193
1192
+ },
1193
+ {
1194
+ "epoch": 0.71,
1195
+ "learning_rate": 0.00016178596130903344,
1196
+ "loss": 1.7376,
1197
+ "step": 194
1198
+ },
1199
+ {
1200
+ "epoch": 0.71,
1201
+ "learning_rate": 0.00016129070536529766,
1202
+ "loss": 1.7776,
1203
+ "step": 195
1204
+ },
1205
+ {
1206
+ "epoch": 0.71,
1207
+ "learning_rate": 0.00016079302976946055,
1208
+ "loss": 1.6759,
1209
+ "step": 196
1210
+ },
1211
+ {
1212
+ "epoch": 0.72,
1213
+ "learning_rate": 0.00016029295416890248,
1214
+ "loss": 1.7559,
1215
+ "step": 197
1216
+ },
1217
+ {
1218
+ "epoch": 0.72,
1219
+ "learning_rate": 0.0001597904983057519,
1220
+ "loss": 1.76,
1221
+ "step": 198
1222
+ },
1223
+ {
1224
+ "epoch": 0.72,
1225
+ "learning_rate": 0.00015928568201610595,
1226
+ "loss": 1.7412,
1227
+ "step": 199
1228
+ },
1229
+ {
1230
+ "epoch": 0.73,
1231
+ "learning_rate": 0.00015877852522924732,
1232
+ "loss": 1.6685,
1233
+ "step": 200
1234
+ },
1235
+ {
1236
+ "epoch": 0.73,
1237
+ "learning_rate": 0.00015826904796685762,
1238
+ "loss": 1.7587,
1239
+ "step": 201
1240
+ },
1241
+ {
1242
+ "epoch": 0.73,
1243
+ "learning_rate": 0.00015775727034222675,
1244
+ "loss": 1.7417,
1245
+ "step": 202
1246
+ },
1247
+ {
1248
+ "epoch": 0.74,
1249
+ "learning_rate": 0.0001572432125594591,
1250
+ "loss": 1.6513,
1251
+ "step": 203
1252
+ },
1253
+ {
1254
+ "epoch": 0.74,
1255
+ "learning_rate": 0.00015672689491267567,
1256
+ "loss": 1.6968,
1257
+ "step": 204
1258
+ },
1259
+ {
1260
+ "epoch": 0.75,
1261
+ "learning_rate": 0.00015620833778521307,
1262
+ "loss": 1.6509,
1263
+ "step": 205
1264
+ },
1265
+ {
1266
+ "epoch": 0.75,
1267
+ "learning_rate": 0.00015568756164881882,
1268
+ "loss": 1.6245,
1269
+ "step": 206
1270
+ },
1271
+ {
1272
+ "epoch": 0.75,
1273
+ "learning_rate": 0.00015516458706284303,
1274
+ "loss": 1.6608,
1275
+ "step": 207
1276
+ },
1277
+ {
1278
+ "epoch": 0.75,
1279
+ "eval_loss": 1.6871771812438965,
1280
+ "eval_runtime": 44.071,
1281
+ "eval_samples_per_second": 25.64,
1282
+ "eval_steps_per_second": 1.611,
1283
+ "step": 207
1284
+ },
1285
+ {
1286
+ "epoch": 0.76,
1287
+ "learning_rate": 0.00015463943467342693,
1288
+ "loss": 1.6572,
1289
+ "step": 208
1290
+ },
1291
+ {
1292
+ "epoch": 0.76,
1293
+ "learning_rate": 0.00015411212521268758,
1294
+ "loss": 1.5938,
1295
+ "step": 209
1296
+ },
1297
+ {
1298
+ "epoch": 0.76,
1299
+ "learning_rate": 0.00015358267949789966,
1300
+ "loss": 1.6421,
1301
+ "step": 210
1302
+ },
1303
+ {
1304
+ "epoch": 0.77,
1305
+ "learning_rate": 0.0001530511184306734,
1306
+ "loss": 1.7423,
1307
+ "step": 211
1308
+ },
1309
+ {
1310
+ "epoch": 0.77,
1311
+ "learning_rate": 0.0001525174629961296,
1312
+ "loss": 1.6636,
1313
+ "step": 212
1314
+ },
1315
+ {
1316
+ "epoch": 0.77,
1317
+ "learning_rate": 0.00015198173426207094,
1318
+ "loss": 1.7339,
1319
+ "step": 213
1320
+ },
1321
+ {
1322
+ "epoch": 0.78,
1323
+ "learning_rate": 0.00015144395337815064,
1324
+ "loss": 1.6729,
1325
+ "step": 214
1326
+ },
1327
+ {
1328
+ "epoch": 0.78,
1329
+ "learning_rate": 0.00015090414157503714,
1330
+ "loss": 1.6939,
1331
+ "step": 215
1332
+ },
1333
+ {
1334
+ "epoch": 0.79,
1335
+ "learning_rate": 0.0001503623201635761,
1336
+ "loss": 1.6394,
1337
+ "step": 216
1338
+ },
1339
+ {
1340
+ "epoch": 0.79,
1341
+ "learning_rate": 0.0001498185105339491,
1342
+ "loss": 1.6196,
1343
+ "step": 217
1344
+ },
1345
+ {
1346
+ "epoch": 0.79,
1347
+ "learning_rate": 0.00014927273415482915,
1348
+ "loss": 1.6304,
1349
+ "step": 218
1350
+ },
1351
+ {
1352
+ "epoch": 0.8,
1353
+ "learning_rate": 0.00014872501257253323,
1354
+ "loss": 1.6124,
1355
+ "step": 219
1356
+ },
1357
+ {
1358
+ "epoch": 0.8,
1359
+ "learning_rate": 0.00014817536741017152,
1360
+ "loss": 1.6011,
1361
+ "step": 220
1362
+ },
1363
+ {
1364
+ "epoch": 0.8,
1365
+ "learning_rate": 0.0001476238203667939,
1366
+ "loss": 1.5667,
1367
+ "step": 221
1368
+ },
1369
+ {
1370
+ "epoch": 0.81,
1371
+ "learning_rate": 0.0001470703932165333,
1372
+ "loss": 1.6536,
1373
+ "step": 222
1374
+ },
1375
+ {
1376
+ "epoch": 0.81,
1377
+ "learning_rate": 0.00014651510780774583,
1378
+ "loss": 1.5861,
1379
+ "step": 223
1380
+ },
1381
+ {
1382
+ "epoch": 0.81,
1383
+ "learning_rate": 0.00014595798606214882,
1384
+ "loss": 1.5339,
1385
+ "step": 224
1386
+ },
1387
+ {
1388
+ "epoch": 0.82,
1389
+ "learning_rate": 0.00014539904997395468,
1390
+ "loss": 1.6058,
1391
+ "step": 225
1392
+ },
1393
+ {
1394
+ "epoch": 0.82,
1395
+ "learning_rate": 0.00014483832160900326,
1396
+ "loss": 1.605,
1397
+ "step": 226
1398
+ },
1399
+ {
1400
+ "epoch": 0.83,
1401
+ "learning_rate": 0.0001442758231038902,
1402
+ "loss": 1.5731,
1403
+ "step": 227
1404
+ },
1405
+ {
1406
+ "epoch": 0.83,
1407
+ "learning_rate": 0.0001437115766650933,
1408
+ "loss": 1.5876,
1409
+ "step": 228
1410
+ },
1411
+ {
1412
+ "epoch": 0.83,
1413
+ "learning_rate": 0.0001431456045680959,
1414
+ "loss": 1.6278,
1415
+ "step": 229
1416
+ },
1417
+ {
1418
+ "epoch": 0.84,
1419
+ "learning_rate": 0.00014257792915650728,
1420
+ "loss": 1.5877,
1421
+ "step": 230
1422
+ },
1423
+ {
1424
+ "epoch": 0.84,
1425
+ "learning_rate": 0.00014200857284118066,
1426
+ "loss": 1.5985,
1427
+ "step": 231
1428
+ },
1429
+ {
1430
+ "epoch": 0.84,
1431
+ "learning_rate": 0.00014143755809932845,
1432
+ "loss": 1.5359,
1433
+ "step": 232
1434
+ },
1435
+ {
1436
+ "epoch": 0.85,
1437
+ "learning_rate": 0.00014086490747363493,
1438
+ "loss": 1.4969,
1439
+ "step": 233
1440
+ },
1441
+ {
1442
+ "epoch": 0.85,
1443
+ "learning_rate": 0.00014029064357136628,
1444
+ "loss": 1.5615,
1445
+ "step": 234
1446
+ },
1447
+ {
1448
+ "epoch": 0.85,
1449
+ "learning_rate": 0.00013971478906347806,
1450
+ "loss": 1.5356,
1451
+ "step": 235
1452
+ },
1453
+ {
1454
+ "epoch": 0.86,
1455
+ "learning_rate": 0.00013913736668372026,
1456
+ "loss": 1.5306,
1457
+ "step": 236
1458
+ },
1459
+ {
1460
+ "epoch": 0.86,
1461
+ "learning_rate": 0.00013855839922773968,
1462
+ "loss": 1.4575,
1463
+ "step": 237
1464
+ },
1465
+ {
1466
+ "epoch": 0.87,
1467
+ "learning_rate": 0.00013797790955218014,
1468
+ "loss": 1.5216,
1469
+ "step": 238
1470
+ },
1471
+ {
1472
+ "epoch": 0.87,
1473
+ "learning_rate": 0.00013739592057378003,
1474
+ "loss": 1.5153,
1475
+ "step": 239
1476
+ },
1477
+ {
1478
+ "epoch": 0.87,
1479
+ "learning_rate": 0.00013681245526846783,
1480
+ "loss": 1.4623,
1481
+ "step": 240
1482
+ },
1483
+ {
1484
+ "epoch": 0.88,
1485
+ "learning_rate": 0.00013622753667045457,
1486
+ "loss": 1.4995,
1487
+ "step": 241
1488
+ },
1489
+ {
1490
+ "epoch": 0.88,
1491
+ "learning_rate": 0.00013564118787132506,
1492
+ "loss": 1.5363,
1493
+ "step": 242
1494
+ },
1495
+ {
1496
+ "epoch": 0.88,
1497
+ "learning_rate": 0.0001350534320191259,
1498
+ "loss": 1.425,
1499
+ "step": 243
1500
+ },
1501
+ {
1502
+ "epoch": 0.89,
1503
+ "learning_rate": 0.0001344642923174517,
1504
+ "loss": 1.4145,
1505
+ "step": 244
1506
+ },
1507
+ {
1508
+ "epoch": 0.89,
1509
+ "learning_rate": 0.00013387379202452917,
1510
+ "loss": 1.4994,
1511
+ "step": 245
1512
+ },
1513
+ {
1514
+ "epoch": 0.89,
1515
+ "learning_rate": 0.00013328195445229868,
1516
+ "loss": 1.4758,
1517
+ "step": 246
1518
+ },
1519
+ {
1520
+ "epoch": 0.9,
1521
+ "learning_rate": 0.00013268880296549425,
1522
+ "loss": 1.5288,
1523
+ "step": 247
1524
+ },
1525
+ {
1526
+ "epoch": 0.9,
1527
+ "learning_rate": 0.00013209436098072095,
1528
+ "loss": 1.4809,
1529
+ "step": 248
1530
+ },
1531
+ {
1532
+ "epoch": 0.91,
1533
+ "learning_rate": 0.0001314986519655305,
1534
+ "loss": 1.4916,
1535
+ "step": 249
1536
+ },
1537
+ {
1538
+ "epoch": 0.91,
1539
+ "learning_rate": 0.00013090169943749476,
1540
+ "loss": 1.5181,
1541
+ "step": 250
1542
+ },
1543
+ {
1544
+ "epoch": 0.91,
1545
+ "learning_rate": 0.00013030352696327742,
1546
+ "loss": 1.4987,
1547
+ "step": 251
1548
+ },
1549
+ {
1550
+ "epoch": 0.92,
1551
+ "learning_rate": 0.0001297041581577035,
1552
+ "loss": 1.4526,
1553
+ "step": 252
1554
+ },
1555
+ {
1556
+ "epoch": 0.92,
1557
+ "learning_rate": 0.00012910361668282719,
1558
+ "loss": 1.4375,
1559
+ "step": 253
1560
+ },
1561
+ {
1562
+ "epoch": 0.92,
1563
+ "learning_rate": 0.0001285019262469976,
1564
+ "loss": 1.4934,
1565
+ "step": 254
1566
+ },
1567
+ {
1568
+ "epoch": 0.93,
1569
+ "learning_rate": 0.00012789911060392294,
1570
+ "loss": 1.443,
1571
+ "step": 255
1572
+ },
1573
+ {
1574
+ "epoch": 0.93,
1575
+ "learning_rate": 0.00012729519355173254,
1576
+ "loss": 1.359,
1577
+ "step": 256
1578
+ },
1579
+ {
1580
+ "epoch": 0.93,
1581
+ "learning_rate": 0.00012669019893203759,
1582
+ "loss": 1.4471,
1583
+ "step": 257
1584
+ },
1585
+ {
1586
+ "epoch": 0.94,
1587
+ "learning_rate": 0.00012608415062898972,
1588
+ "loss": 1.4615,
1589
+ "step": 258
1590
+ },
1591
+ {
1592
+ "epoch": 0.94,
1593
+ "learning_rate": 0.00012547707256833823,
1594
+ "loss": 1.4249,
1595
+ "step": 259
1596
+ },
1597
+ {
1598
+ "epoch": 0.95,
1599
+ "learning_rate": 0.0001248689887164855,
1600
+ "loss": 1.4157,
1601
+ "step": 260
1602
+ },
1603
+ {
1604
+ "epoch": 0.95,
1605
+ "learning_rate": 0.00012425992307954075,
1606
+ "loss": 1.4213,
1607
+ "step": 261
1608
+ },
1609
+ {
1610
+ "epoch": 0.95,
1611
+ "learning_rate": 0.00012364989970237248,
1612
+ "loss": 1.3552,
1613
+ "step": 262
1614
+ },
1615
+ {
1616
+ "epoch": 0.96,
1617
+ "learning_rate": 0.00012303894266765908,
1618
+ "loss": 1.3715,
1619
+ "step": 263
1620
+ },
1621
+ {
1622
+ "epoch": 0.96,
1623
+ "learning_rate": 0.00012242707609493814,
1624
+ "loss": 1.3484,
1625
+ "step": 264
1626
+ },
1627
+ {
1628
+ "epoch": 0.96,
1629
+ "learning_rate": 0.00012181432413965428,
1630
+ "loss": 1.4268,
1631
+ "step": 265
1632
+ },
1633
+ {
1634
+ "epoch": 0.97,
1635
+ "learning_rate": 0.00012120071099220549,
1636
+ "loss": 1.3612,
1637
+ "step": 266
1638
+ },
1639
+ {
1640
+ "epoch": 0.97,
1641
+ "learning_rate": 0.00012058626087698814,
1642
+ "loss": 1.341,
1643
+ "step": 267
1644
+ },
1645
+ {
1646
+ "epoch": 0.97,
1647
+ "learning_rate": 0.00011997099805144069,
1648
+ "loss": 1.3221,
1649
+ "step": 268
1650
+ },
1651
+ {
1652
+ "epoch": 0.98,
1653
+ "learning_rate": 0.00011935494680508606,
1654
+ "loss": 1.3872,
1655
+ "step": 269
1656
+ },
1657
+ {
1658
+ "epoch": 0.98,
1659
+ "learning_rate": 0.00011873813145857249,
1660
+ "loss": 1.4136,
1661
+ "step": 270
1662
+ },
1663
+ {
1664
+ "epoch": 0.99,
1665
+ "learning_rate": 0.00011812057636271374,
1666
+ "loss": 1.3503,
1667
+ "step": 271
1668
+ },
1669
+ {
1670
+ "epoch": 0.99,
1671
+ "learning_rate": 0.00011750230589752762,
1672
+ "loss": 1.4157,
1673
+ "step": 272
1674
+ },
1675
+ {
1676
+ "epoch": 0.99,
1677
+ "learning_rate": 0.00011688334447127338,
1678
+ "loss": 1.3828,
1679
+ "step": 273
1680
+ },
1681
+ {
1682
+ "epoch": 1.0,
1683
+ "learning_rate": 0.00011626371651948838,
1684
+ "loss": 1.3542,
1685
+ "step": 274
1686
+ },
1687
+ {
1688
+ "epoch": 1.0,
1689
+ "learning_rate": 0.0001156434465040231,
1690
+ "loss": 1.3105,
1691
+ "step": 275
1692
+ }
1693
+ ],
1694
+ "logging_steps": 1,
1695
+ "max_steps": 550,
1696
+ "num_input_tokens_seen": 0,
1697
+ "num_train_epochs": 2,
1698
+ "save_steps": 275,
1699
+ "total_flos": 2.76799703482368e+17,
1700
+ "train_batch_size": 16,
1701
+ "trial_name": null,
1702
+ "trial_params": null
1703
+ }
checkpoint-275/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30ae15e3615b945ca842f662b92f1098ed197416220d63f43b2d6384dbcab105
3
+ size 5304
checkpoint-550/config.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "_name_or_path": "croissantllm/CroissantLLMBase",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 1,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5504,
14
+ "max_position_embeddings": 2048,
15
+ "model_type": "llama",
16
+ "num_attention_heads": 16,
17
+ "num_hidden_layers": 24,
18
+ "num_key_value_heads": 16,
19
+ "pretraining_tp": 1,
20
+ "rms_norm_eps": 1e-05,
21
+ "rope_scaling": null,
22
+ "rope_theta": 10000.0,
23
+ "tie_word_embeddings": false,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.38.0.dev0",
26
+ "use_cache": false,
27
+ "vocab_size": 32000
28
+ }
checkpoint-550/generation_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "do_sample": true,
5
+ "eos_token_id": 2,
6
+ "transformers_version": "4.38.0.dev0"
7
+ }
checkpoint-550/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6184eb5a15659bac69d92ded0737bcbee212fd60c74d96f1f39ff02ffad0532
3
+ size 2690869336
checkpoint-550/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa34ffe27c5dd37c36d42ef1a8fdb58f03ff7bfa5a48bf21361eae2a7885075c
3
+ size 2696906170
checkpoint-550/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52cca5856c568bc52c683b690919168fa27bfbdfefc6e0a62355afa6011157c3
3
+ size 14244
checkpoint-550/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43d9476704a2b5fc8a9f2cd4f2b882e6f0387d6e8e661a51cbf47cb22502edbe
3
+ size 1064
checkpoint-550/trainer_state.json ADDED
@@ -0,0 +1,3385 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.978181818181818,
5
+ "eval_steps": 69,
6
+ "global_step": 550,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 4.000000000000001e-06,
14
+ "loss": 2.5429,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.0,
19
+ "eval_loss": 2.5242362022399902,
20
+ "eval_runtime": 44.0094,
21
+ "eval_samples_per_second": 25.676,
22
+ "eval_steps_per_second": 1.613,
23
+ "step": 1
24
+ },
25
+ {
26
+ "epoch": 0.01,
27
+ "learning_rate": 8.000000000000001e-06,
28
+ "loss": 2.5186,
29
+ "step": 2
30
+ },
31
+ {
32
+ "epoch": 0.01,
33
+ "learning_rate": 1.2e-05,
34
+ "loss": 2.5088,
35
+ "step": 3
36
+ },
37
+ {
38
+ "epoch": 0.01,
39
+ "learning_rate": 1.6000000000000003e-05,
40
+ "loss": 2.5423,
41
+ "step": 4
42
+ },
43
+ {
44
+ "epoch": 0.02,
45
+ "learning_rate": 2e-05,
46
+ "loss": 2.5137,
47
+ "step": 5
48
+ },
49
+ {
50
+ "epoch": 0.02,
51
+ "learning_rate": 2.4e-05,
52
+ "loss": 2.4817,
53
+ "step": 6
54
+ },
55
+ {
56
+ "epoch": 0.03,
57
+ "learning_rate": 2.8000000000000003e-05,
58
+ "loss": 2.4889,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 0.03,
63
+ "learning_rate": 3.2000000000000005e-05,
64
+ "loss": 2.4334,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.03,
69
+ "learning_rate": 3.6e-05,
70
+ "loss": 2.4707,
71
+ "step": 9
72
+ },
73
+ {
74
+ "epoch": 0.04,
75
+ "learning_rate": 4e-05,
76
+ "loss": 2.4251,
77
+ "step": 10
78
+ },
79
+ {
80
+ "epoch": 0.04,
81
+ "learning_rate": 4.4000000000000006e-05,
82
+ "loss": 2.4167,
83
+ "step": 11
84
+ },
85
+ {
86
+ "epoch": 0.04,
87
+ "learning_rate": 4.8e-05,
88
+ "loss": 2.4235,
89
+ "step": 12
90
+ },
91
+ {
92
+ "epoch": 0.05,
93
+ "learning_rate": 5.2000000000000004e-05,
94
+ "loss": 2.411,
95
+ "step": 13
96
+ },
97
+ {
98
+ "epoch": 0.05,
99
+ "learning_rate": 5.6000000000000006e-05,
100
+ "loss": 2.4181,
101
+ "step": 14
102
+ },
103
+ {
104
+ "epoch": 0.05,
105
+ "learning_rate": 6e-05,
106
+ "loss": 2.3847,
107
+ "step": 15
108
+ },
109
+ {
110
+ "epoch": 0.06,
111
+ "learning_rate": 6.400000000000001e-05,
112
+ "loss": 2.407,
113
+ "step": 16
114
+ },
115
+ {
116
+ "epoch": 0.06,
117
+ "learning_rate": 6.800000000000001e-05,
118
+ "loss": 2.3764,
119
+ "step": 17
120
+ },
121
+ {
122
+ "epoch": 0.07,
123
+ "learning_rate": 7.2e-05,
124
+ "loss": 2.3801,
125
+ "step": 18
126
+ },
127
+ {
128
+ "epoch": 0.07,
129
+ "learning_rate": 7.6e-05,
130
+ "loss": 2.387,
131
+ "step": 19
132
+ },
133
+ {
134
+ "epoch": 0.07,
135
+ "learning_rate": 8e-05,
136
+ "loss": 2.3782,
137
+ "step": 20
138
+ },
139
+ {
140
+ "epoch": 0.08,
141
+ "learning_rate": 8.4e-05,
142
+ "loss": 2.3684,
143
+ "step": 21
144
+ },
145
+ {
146
+ "epoch": 0.08,
147
+ "learning_rate": 8.800000000000001e-05,
148
+ "loss": 2.381,
149
+ "step": 22
150
+ },
151
+ {
152
+ "epoch": 0.08,
153
+ "learning_rate": 9.200000000000001e-05,
154
+ "loss": 2.381,
155
+ "step": 23
156
+ },
157
+ {
158
+ "epoch": 0.09,
159
+ "learning_rate": 9.6e-05,
160
+ "loss": 2.34,
161
+ "step": 24
162
+ },
163
+ {
164
+ "epoch": 0.09,
165
+ "learning_rate": 0.0001,
166
+ "loss": 2.3467,
167
+ "step": 25
168
+ },
169
+ {
170
+ "epoch": 0.09,
171
+ "learning_rate": 0.00010400000000000001,
172
+ "loss": 2.335,
173
+ "step": 26
174
+ },
175
+ {
176
+ "epoch": 0.1,
177
+ "learning_rate": 0.00010800000000000001,
178
+ "loss": 2.3741,
179
+ "step": 27
180
+ },
181
+ {
182
+ "epoch": 0.1,
183
+ "learning_rate": 0.00011200000000000001,
184
+ "loss": 2.3716,
185
+ "step": 28
186
+ },
187
+ {
188
+ "epoch": 0.11,
189
+ "learning_rate": 0.000116,
190
+ "loss": 2.3168,
191
+ "step": 29
192
+ },
193
+ {
194
+ "epoch": 0.11,
195
+ "learning_rate": 0.00012,
196
+ "loss": 2.3438,
197
+ "step": 30
198
+ },
199
+ {
200
+ "epoch": 0.11,
201
+ "learning_rate": 0.000124,
202
+ "loss": 2.3625,
203
+ "step": 31
204
+ },
205
+ {
206
+ "epoch": 0.12,
207
+ "learning_rate": 0.00012800000000000002,
208
+ "loss": 2.3581,
209
+ "step": 32
210
+ },
211
+ {
212
+ "epoch": 0.12,
213
+ "learning_rate": 0.000132,
214
+ "loss": 2.3283,
215
+ "step": 33
216
+ },
217
+ {
218
+ "epoch": 0.12,
219
+ "learning_rate": 0.00013600000000000003,
220
+ "loss": 2.3691,
221
+ "step": 34
222
+ },
223
+ {
224
+ "epoch": 0.13,
225
+ "learning_rate": 0.00014,
226
+ "loss": 2.3538,
227
+ "step": 35
228
+ },
229
+ {
230
+ "epoch": 0.13,
231
+ "learning_rate": 0.000144,
232
+ "loss": 2.337,
233
+ "step": 36
234
+ },
235
+ {
236
+ "epoch": 0.13,
237
+ "learning_rate": 0.000148,
238
+ "loss": 2.2913,
239
+ "step": 37
240
+ },
241
+ {
242
+ "epoch": 0.14,
243
+ "learning_rate": 0.000152,
244
+ "loss": 2.2872,
245
+ "step": 38
246
+ },
247
+ {
248
+ "epoch": 0.14,
249
+ "learning_rate": 0.00015600000000000002,
250
+ "loss": 2.319,
251
+ "step": 39
252
+ },
253
+ {
254
+ "epoch": 0.15,
255
+ "learning_rate": 0.00016,
256
+ "loss": 2.3445,
257
+ "step": 40
258
+ },
259
+ {
260
+ "epoch": 0.15,
261
+ "learning_rate": 0.000164,
262
+ "loss": 2.3293,
263
+ "step": 41
264
+ },
265
+ {
266
+ "epoch": 0.15,
267
+ "learning_rate": 0.000168,
268
+ "loss": 2.3142,
269
+ "step": 42
270
+ },
271
+ {
272
+ "epoch": 0.16,
273
+ "learning_rate": 0.000172,
274
+ "loss": 2.3172,
275
+ "step": 43
276
+ },
277
+ {
278
+ "epoch": 0.16,
279
+ "learning_rate": 0.00017600000000000002,
280
+ "loss": 2.3179,
281
+ "step": 44
282
+ },
283
+ {
284
+ "epoch": 0.16,
285
+ "learning_rate": 0.00018,
286
+ "loss": 2.3018,
287
+ "step": 45
288
+ },
289
+ {
290
+ "epoch": 0.17,
291
+ "learning_rate": 0.00018400000000000003,
292
+ "loss": 2.3266,
293
+ "step": 46
294
+ },
295
+ {
296
+ "epoch": 0.17,
297
+ "learning_rate": 0.000188,
298
+ "loss": 2.3131,
299
+ "step": 47
300
+ },
301
+ {
302
+ "epoch": 0.17,
303
+ "learning_rate": 0.000192,
304
+ "loss": 2.3235,
305
+ "step": 48
306
+ },
307
+ {
308
+ "epoch": 0.18,
309
+ "learning_rate": 0.000196,
310
+ "loss": 2.3281,
311
+ "step": 49
312
+ },
313
+ {
314
+ "epoch": 0.18,
315
+ "learning_rate": 0.0002,
316
+ "loss": 2.3332,
317
+ "step": 50
318
+ },
319
+ {
320
+ "epoch": 0.19,
321
+ "learning_rate": 0.0001999980260856137,
322
+ "loss": 2.3148,
323
+ "step": 51
324
+ },
325
+ {
326
+ "epoch": 0.19,
327
+ "learning_rate": 0.00019999210442038162,
328
+ "loss": 2.2979,
329
+ "step": 52
330
+ },
331
+ {
332
+ "epoch": 0.19,
333
+ "learning_rate": 0.0001999822352380809,
334
+ "loss": 2.3257,
335
+ "step": 53
336
+ },
337
+ {
338
+ "epoch": 0.2,
339
+ "learning_rate": 0.00019996841892833,
340
+ "loss": 2.2947,
341
+ "step": 54
342
+ },
343
+ {
344
+ "epoch": 0.2,
345
+ "learning_rate": 0.00019995065603657316,
346
+ "loss": 2.3266,
347
+ "step": 55
348
+ },
349
+ {
350
+ "epoch": 0.2,
351
+ "learning_rate": 0.00019992894726405893,
352
+ "loss": 2.3385,
353
+ "step": 56
354
+ },
355
+ {
356
+ "epoch": 0.21,
357
+ "learning_rate": 0.0001999032934678125,
358
+ "loss": 2.3289,
359
+ "step": 57
360
+ },
361
+ {
362
+ "epoch": 0.21,
363
+ "learning_rate": 0.00019987369566060176,
364
+ "loss": 2.2741,
365
+ "step": 58
366
+ },
367
+ {
368
+ "epoch": 0.21,
369
+ "learning_rate": 0.00019984015501089752,
370
+ "loss": 2.2867,
371
+ "step": 59
372
+ },
373
+ {
374
+ "epoch": 0.22,
375
+ "learning_rate": 0.00019980267284282717,
376
+ "loss": 2.2836,
377
+ "step": 60
378
+ },
379
+ {
380
+ "epoch": 0.22,
381
+ "learning_rate": 0.00019976125063612252,
382
+ "loss": 2.3015,
383
+ "step": 61
384
+ },
385
+ {
386
+ "epoch": 0.23,
387
+ "learning_rate": 0.0001997158900260614,
388
+ "loss": 2.3233,
389
+ "step": 62
390
+ },
391
+ {
392
+ "epoch": 0.23,
393
+ "learning_rate": 0.00019966659280340297,
394
+ "loss": 2.2758,
395
+ "step": 63
396
+ },
397
+ {
398
+ "epoch": 0.23,
399
+ "learning_rate": 0.00019961336091431727,
400
+ "loss": 2.2709,
401
+ "step": 64
402
+ },
403
+ {
404
+ "epoch": 0.24,
405
+ "learning_rate": 0.00019955619646030802,
406
+ "loss": 2.2797,
407
+ "step": 65
408
+ },
409
+ {
410
+ "epoch": 0.24,
411
+ "learning_rate": 0.00019949510169813003,
412
+ "loss": 2.304,
413
+ "step": 66
414
+ },
415
+ {
416
+ "epoch": 0.24,
417
+ "learning_rate": 0.0001994300790396999,
418
+ "loss": 2.276,
419
+ "step": 67
420
+ },
421
+ {
422
+ "epoch": 0.25,
423
+ "learning_rate": 0.00019936113105200085,
424
+ "loss": 2.2041,
425
+ "step": 68
426
+ },
427
+ {
428
+ "epoch": 0.25,
429
+ "learning_rate": 0.00019928826045698136,
430
+ "loss": 2.2283,
431
+ "step": 69
432
+ },
433
+ {
434
+ "epoch": 0.25,
435
+ "eval_loss": 2.2513792514801025,
436
+ "eval_runtime": 44.1028,
437
+ "eval_samples_per_second": 25.622,
438
+ "eval_steps_per_second": 1.61,
439
+ "step": 69
440
+ },
441
+ {
442
+ "epoch": 0.25,
443
+ "learning_rate": 0.0001992114701314478,
444
+ "loss": 2.2609,
445
+ "step": 70
446
+ },
447
+ {
448
+ "epoch": 0.26,
449
+ "learning_rate": 0.00019913076310695068,
450
+ "loss": 2.2827,
451
+ "step": 71
452
+ },
453
+ {
454
+ "epoch": 0.26,
455
+ "learning_rate": 0.00019904614256966512,
456
+ "loss": 2.209,
457
+ "step": 72
458
+ },
459
+ {
460
+ "epoch": 0.27,
461
+ "learning_rate": 0.0001989576118602651,
462
+ "loss": 2.2145,
463
+ "step": 73
464
+ },
465
+ {
466
+ "epoch": 0.27,
467
+ "learning_rate": 0.0001988651744737914,
468
+ "loss": 2.2473,
469
+ "step": 74
470
+ },
471
+ {
472
+ "epoch": 0.27,
473
+ "learning_rate": 0.00019876883405951377,
474
+ "loss": 2.2453,
475
+ "step": 75
476
+ },
477
+ {
478
+ "epoch": 0.28,
479
+ "learning_rate": 0.0001986685944207868,
480
+ "loss": 2.211,
481
+ "step": 76
482
+ },
483
+ {
484
+ "epoch": 0.28,
485
+ "learning_rate": 0.00019856445951489982,
486
+ "loss": 2.2569,
487
+ "step": 77
488
+ },
489
+ {
490
+ "epoch": 0.28,
491
+ "learning_rate": 0.00019845643345292054,
492
+ "loss": 2.2212,
493
+ "step": 78
494
+ },
495
+ {
496
+ "epoch": 0.29,
497
+ "learning_rate": 0.00019834452049953297,
498
+ "loss": 2.1689,
499
+ "step": 79
500
+ },
501
+ {
502
+ "epoch": 0.29,
503
+ "learning_rate": 0.0001982287250728689,
504
+ "loss": 2.2705,
505
+ "step": 80
506
+ },
507
+ {
508
+ "epoch": 0.29,
509
+ "learning_rate": 0.0001981090517443334,
510
+ "loss": 2.1772,
511
+ "step": 81
512
+ },
513
+ {
514
+ "epoch": 0.3,
515
+ "learning_rate": 0.0001979855052384247,
516
+ "loss": 2.1771,
517
+ "step": 82
518
+ },
519
+ {
520
+ "epoch": 0.3,
521
+ "learning_rate": 0.00019785809043254722,
522
+ "loss": 2.2058,
523
+ "step": 83
524
+ },
525
+ {
526
+ "epoch": 0.31,
527
+ "learning_rate": 0.00019772681235681936,
528
+ "loss": 2.1685,
529
+ "step": 84
530
+ },
531
+ {
532
+ "epoch": 0.31,
533
+ "learning_rate": 0.00019759167619387476,
534
+ "loss": 2.1862,
535
+ "step": 85
536
+ },
537
+ {
538
+ "epoch": 0.31,
539
+ "learning_rate": 0.00019745268727865774,
540
+ "loss": 2.1707,
541
+ "step": 86
542
+ },
543
+ {
544
+ "epoch": 0.32,
545
+ "learning_rate": 0.00019730985109821266,
546
+ "loss": 2.1632,
547
+ "step": 87
548
+ },
549
+ {
550
+ "epoch": 0.32,
551
+ "learning_rate": 0.0001971631732914674,
552
+ "loss": 2.1792,
553
+ "step": 88
554
+ },
555
+ {
556
+ "epoch": 0.32,
557
+ "learning_rate": 0.0001970126596490106,
558
+ "loss": 2.1885,
559
+ "step": 89
560
+ },
561
+ {
562
+ "epoch": 0.33,
563
+ "learning_rate": 0.0001968583161128631,
564
+ "loss": 2.2058,
565
+ "step": 90
566
+ },
567
+ {
568
+ "epoch": 0.33,
569
+ "learning_rate": 0.00019670014877624353,
570
+ "loss": 2.1616,
571
+ "step": 91
572
+ },
573
+ {
574
+ "epoch": 0.33,
575
+ "learning_rate": 0.0001965381638833274,
576
+ "loss": 2.1764,
577
+ "step": 92
578
+ },
579
+ {
580
+ "epoch": 0.34,
581
+ "learning_rate": 0.000196372367829001,
582
+ "loss": 2.1524,
583
+ "step": 93
584
+ },
585
+ {
586
+ "epoch": 0.34,
587
+ "learning_rate": 0.0001962027671586086,
588
+ "loss": 2.1466,
589
+ "step": 94
590
+ },
591
+ {
592
+ "epoch": 0.35,
593
+ "learning_rate": 0.0001960293685676943,
594
+ "loss": 2.1066,
595
+ "step": 95
596
+ },
597
+ {
598
+ "epoch": 0.35,
599
+ "learning_rate": 0.0001958521789017376,
600
+ "loss": 2.1546,
601
+ "step": 96
602
+ },
603
+ {
604
+ "epoch": 0.35,
605
+ "learning_rate": 0.00019567120515588308,
606
+ "loss": 2.1259,
607
+ "step": 97
608
+ },
609
+ {
610
+ "epoch": 0.36,
611
+ "learning_rate": 0.00019548645447466431,
612
+ "loss": 2.1833,
613
+ "step": 98
614
+ },
615
+ {
616
+ "epoch": 0.36,
617
+ "learning_rate": 0.00019529793415172192,
618
+ "loss": 2.1349,
619
+ "step": 99
620
+ },
621
+ {
622
+ "epoch": 0.36,
623
+ "learning_rate": 0.00019510565162951537,
624
+ "loss": 2.1688,
625
+ "step": 100
626
+ },
627
+ {
628
+ "epoch": 0.37,
629
+ "learning_rate": 0.00019490961449902946,
630
+ "loss": 2.1241,
631
+ "step": 101
632
+ },
633
+ {
634
+ "epoch": 0.37,
635
+ "learning_rate": 0.00019470983049947444,
636
+ "loss": 2.0855,
637
+ "step": 102
638
+ },
639
+ {
640
+ "epoch": 0.37,
641
+ "learning_rate": 0.00019450630751798048,
642
+ "loss": 2.1153,
643
+ "step": 103
644
+ },
645
+ {
646
+ "epoch": 0.38,
647
+ "learning_rate": 0.00019429905358928646,
648
+ "loss": 2.134,
649
+ "step": 104
650
+ },
651
+ {
652
+ "epoch": 0.38,
653
+ "learning_rate": 0.00019408807689542257,
654
+ "loss": 2.097,
655
+ "step": 105
656
+ },
657
+ {
658
+ "epoch": 0.39,
659
+ "learning_rate": 0.00019387338576538744,
660
+ "loss": 2.0723,
661
+ "step": 106
662
+ },
663
+ {
664
+ "epoch": 0.39,
665
+ "learning_rate": 0.00019365498867481923,
666
+ "loss": 2.1019,
667
+ "step": 107
668
+ },
669
+ {
670
+ "epoch": 0.39,
671
+ "learning_rate": 0.00019343289424566122,
672
+ "loss": 2.1268,
673
+ "step": 108
674
+ },
675
+ {
676
+ "epoch": 0.4,
677
+ "learning_rate": 0.0001932071112458211,
678
+ "loss": 2.0742,
679
+ "step": 109
680
+ },
681
+ {
682
+ "epoch": 0.4,
683
+ "learning_rate": 0.00019297764858882514,
684
+ "loss": 2.0667,
685
+ "step": 110
686
+ },
687
+ {
688
+ "epoch": 0.4,
689
+ "learning_rate": 0.00019274451533346615,
690
+ "loss": 2.066,
691
+ "step": 111
692
+ },
693
+ {
694
+ "epoch": 0.41,
695
+ "learning_rate": 0.0001925077206834458,
696
+ "loss": 2.0845,
697
+ "step": 112
698
+ },
699
+ {
700
+ "epoch": 0.41,
701
+ "learning_rate": 0.0001922672739870115,
702
+ "loss": 2.112,
703
+ "step": 113
704
+ },
705
+ {
706
+ "epoch": 0.41,
707
+ "learning_rate": 0.00019202318473658705,
708
+ "loss": 2.1757,
709
+ "step": 114
710
+ },
711
+ {
712
+ "epoch": 0.42,
713
+ "learning_rate": 0.00019177546256839812,
714
+ "loss": 2.12,
715
+ "step": 115
716
+ },
717
+ {
718
+ "epoch": 0.42,
719
+ "learning_rate": 0.00019152411726209176,
720
+ "loss": 2.0802,
721
+ "step": 116
722
+ },
723
+ {
724
+ "epoch": 0.43,
725
+ "learning_rate": 0.0001912691587403503,
726
+ "loss": 2.0203,
727
+ "step": 117
728
+ },
729
+ {
730
+ "epoch": 0.43,
731
+ "learning_rate": 0.00019101059706849957,
732
+ "loss": 2.078,
733
+ "step": 118
734
+ },
735
+ {
736
+ "epoch": 0.43,
737
+ "learning_rate": 0.0001907484424541117,
738
+ "loss": 2.0773,
739
+ "step": 119
740
+ },
741
+ {
742
+ "epoch": 0.44,
743
+ "learning_rate": 0.00019048270524660196,
744
+ "loss": 2.0601,
745
+ "step": 120
746
+ },
747
+ {
748
+ "epoch": 0.44,
749
+ "learning_rate": 0.00019021339593682028,
750
+ "loss": 2.0711,
751
+ "step": 121
752
+ },
753
+ {
754
+ "epoch": 0.44,
755
+ "learning_rate": 0.0001899405251566371,
756
+ "loss": 2.0607,
757
+ "step": 122
758
+ },
759
+ {
760
+ "epoch": 0.45,
761
+ "learning_rate": 0.00018966410367852362,
762
+ "loss": 2.0505,
763
+ "step": 123
764
+ },
765
+ {
766
+ "epoch": 0.45,
767
+ "learning_rate": 0.0001893841424151264,
768
+ "loss": 2.0371,
769
+ "step": 124
770
+ },
771
+ {
772
+ "epoch": 0.45,
773
+ "learning_rate": 0.0001891006524188368,
774
+ "loss": 2.0507,
775
+ "step": 125
776
+ },
777
+ {
778
+ "epoch": 0.46,
779
+ "learning_rate": 0.00018881364488135448,
780
+ "loss": 2.0541,
781
+ "step": 126
782
+ },
783
+ {
784
+ "epoch": 0.46,
785
+ "learning_rate": 0.00018852313113324552,
786
+ "loss": 2.0362,
787
+ "step": 127
788
+ },
789
+ {
790
+ "epoch": 0.47,
791
+ "learning_rate": 0.00018822912264349534,
792
+ "loss": 2.0668,
793
+ "step": 128
794
+ },
795
+ {
796
+ "epoch": 0.47,
797
+ "learning_rate": 0.00018793163101905563,
798
+ "loss": 3.3779,
799
+ "step": 129
800
+ },
801
+ {
802
+ "epoch": 0.47,
803
+ "learning_rate": 0.00018763066800438636,
804
+ "loss": 2.0519,
805
+ "step": 130
806
+ },
807
+ {
808
+ "epoch": 0.48,
809
+ "learning_rate": 0.00018732624548099204,
810
+ "loss": 2.0925,
811
+ "step": 131
812
+ },
813
+ {
814
+ "epoch": 0.48,
815
+ "learning_rate": 0.0001870183754669526,
816
+ "loss": 2.0931,
817
+ "step": 132
818
+ },
819
+ {
820
+ "epoch": 0.48,
821
+ "learning_rate": 0.000186707070116449,
822
+ "loss": 2.0339,
823
+ "step": 133
824
+ },
825
+ {
826
+ "epoch": 0.49,
827
+ "learning_rate": 0.00018639234171928353,
828
+ "loss": 1.9732,
829
+ "step": 134
830
+ },
831
+ {
832
+ "epoch": 0.49,
833
+ "learning_rate": 0.0001860742027003944,
834
+ "loss": 2.0093,
835
+ "step": 135
836
+ },
837
+ {
838
+ "epoch": 0.49,
839
+ "learning_rate": 0.00018575266561936523,
840
+ "loss": 1.9884,
841
+ "step": 136
842
+ },
843
+ {
844
+ "epoch": 0.5,
845
+ "learning_rate": 0.0001854277431699295,
846
+ "loss": 1.9744,
847
+ "step": 137
848
+ },
849
+ {
850
+ "epoch": 0.5,
851
+ "learning_rate": 0.00018509944817946922,
852
+ "loss": 1.9539,
853
+ "step": 138
854
+ },
855
+ {
856
+ "epoch": 0.5,
857
+ "eval_loss": 2.0380747318267822,
858
+ "eval_runtime": 44.126,
859
+ "eval_samples_per_second": 25.608,
860
+ "eval_steps_per_second": 1.609,
861
+ "step": 138
862
+ },
863
+ {
864
+ "epoch": 0.51,
865
+ "learning_rate": 0.00018476779360850832,
866
+ "loss": 2.0191,
867
+ "step": 139
868
+ },
869
+ {
870
+ "epoch": 0.51,
871
+ "learning_rate": 0.00018443279255020152,
872
+ "loss": 2.0294,
873
+ "step": 140
874
+ },
875
+ {
876
+ "epoch": 0.51,
877
+ "learning_rate": 0.00018409445822981693,
878
+ "loss": 2.021,
879
+ "step": 141
880
+ },
881
+ {
882
+ "epoch": 0.52,
883
+ "learning_rate": 0.0001837528040042142,
884
+ "loss": 2.0358,
885
+ "step": 142
886
+ },
887
+ {
888
+ "epoch": 0.52,
889
+ "learning_rate": 0.00018340784336131713,
890
+ "loss": 1.9937,
891
+ "step": 143
892
+ },
893
+ {
894
+ "epoch": 0.52,
895
+ "learning_rate": 0.00018305958991958127,
896
+ "loss": 2.0256,
897
+ "step": 144
898
+ },
899
+ {
900
+ "epoch": 0.53,
901
+ "learning_rate": 0.00018270805742745617,
902
+ "loss": 2.0468,
903
+ "step": 145
904
+ },
905
+ {
906
+ "epoch": 0.53,
907
+ "learning_rate": 0.00018235325976284275,
908
+ "loss": 1.9277,
909
+ "step": 146
910
+ },
911
+ {
912
+ "epoch": 0.53,
913
+ "learning_rate": 0.00018199521093254523,
914
+ "loss": 2.047,
915
+ "step": 147
916
+ },
917
+ {
918
+ "epoch": 0.54,
919
+ "learning_rate": 0.00018163392507171842,
920
+ "loss": 2.0202,
921
+ "step": 148
922
+ },
923
+ {
924
+ "epoch": 0.54,
925
+ "learning_rate": 0.0001812694164433094,
926
+ "loss": 1.9753,
927
+ "step": 149
928
+ },
929
+ {
930
+ "epoch": 0.55,
931
+ "learning_rate": 0.00018090169943749476,
932
+ "loss": 1.9769,
933
+ "step": 150
934
+ },
935
+ {
936
+ "epoch": 0.55,
937
+ "learning_rate": 0.0001805307885711122,
938
+ "loss": 1.9002,
939
+ "step": 151
940
+ },
941
+ {
942
+ "epoch": 0.55,
943
+ "learning_rate": 0.00018015669848708767,
944
+ "loss": 1.8904,
945
+ "step": 152
946
+ },
947
+ {
948
+ "epoch": 0.56,
949
+ "learning_rate": 0.0001797794439538571,
950
+ "loss": 1.8454,
951
+ "step": 153
952
+ },
953
+ {
954
+ "epoch": 0.56,
955
+ "learning_rate": 0.00017939903986478355,
956
+ "loss": 1.9393,
957
+ "step": 154
958
+ },
959
+ {
960
+ "epoch": 0.56,
961
+ "learning_rate": 0.00017901550123756906,
962
+ "loss": 1.9692,
963
+ "step": 155
964
+ },
965
+ {
966
+ "epoch": 0.57,
967
+ "learning_rate": 0.00017862884321366188,
968
+ "loss": 1.8821,
969
+ "step": 156
970
+ },
971
+ {
972
+ "epoch": 0.57,
973
+ "learning_rate": 0.0001782390810576588,
974
+ "loss": 1.9025,
975
+ "step": 157
976
+ },
977
+ {
978
+ "epoch": 0.57,
979
+ "learning_rate": 0.00017784623015670238,
980
+ "loss": 1.8977,
981
+ "step": 158
982
+ },
983
+ {
984
+ "epoch": 0.58,
985
+ "learning_rate": 0.00017745030601987337,
986
+ "loss": 1.9179,
987
+ "step": 159
988
+ },
989
+ {
990
+ "epoch": 0.58,
991
+ "learning_rate": 0.00017705132427757895,
992
+ "loss": 1.8895,
993
+ "step": 160
994
+ },
995
+ {
996
+ "epoch": 0.59,
997
+ "learning_rate": 0.00017664930068093498,
998
+ "loss": 1.8531,
999
+ "step": 161
1000
+ },
1001
+ {
1002
+ "epoch": 0.59,
1003
+ "learning_rate": 0.0001762442511011448,
1004
+ "loss": 1.9298,
1005
+ "step": 162
1006
+ },
1007
+ {
1008
+ "epoch": 0.59,
1009
+ "learning_rate": 0.0001758361915288722,
1010
+ "loss": 1.862,
1011
+ "step": 163
1012
+ },
1013
+ {
1014
+ "epoch": 0.6,
1015
+ "learning_rate": 0.00017542513807361037,
1016
+ "loss": 1.9015,
1017
+ "step": 164
1018
+ },
1019
+ {
1020
+ "epoch": 0.6,
1021
+ "learning_rate": 0.00017501110696304596,
1022
+ "loss": 1.8509,
1023
+ "step": 165
1024
+ },
1025
+ {
1026
+ "epoch": 0.6,
1027
+ "learning_rate": 0.00017459411454241822,
1028
+ "loss": 1.8403,
1029
+ "step": 166
1030
+ },
1031
+ {
1032
+ "epoch": 0.61,
1033
+ "learning_rate": 0.00017417417727387394,
1034
+ "loss": 1.9279,
1035
+ "step": 167
1036
+ },
1037
+ {
1038
+ "epoch": 0.61,
1039
+ "learning_rate": 0.0001737513117358174,
1040
+ "loss": 1.7867,
1041
+ "step": 168
1042
+ },
1043
+ {
1044
+ "epoch": 0.61,
1045
+ "learning_rate": 0.00017332553462225602,
1046
+ "loss": 1.8755,
1047
+ "step": 169
1048
+ },
1049
+ {
1050
+ "epoch": 0.62,
1051
+ "learning_rate": 0.00017289686274214118,
1052
+ "loss": 1.8639,
1053
+ "step": 170
1054
+ },
1055
+ {
1056
+ "epoch": 0.62,
1057
+ "learning_rate": 0.0001724653130187047,
1058
+ "loss": 1.8795,
1059
+ "step": 171
1060
+ },
1061
+ {
1062
+ "epoch": 0.63,
1063
+ "learning_rate": 0.0001720309024887907,
1064
+ "loss": 1.8081,
1065
+ "step": 172
1066
+ },
1067
+ {
1068
+ "epoch": 0.63,
1069
+ "learning_rate": 0.00017159364830218312,
1070
+ "loss": 1.8725,
1071
+ "step": 173
1072
+ },
1073
+ {
1074
+ "epoch": 0.63,
1075
+ "learning_rate": 0.00017115356772092857,
1076
+ "loss": 1.8081,
1077
+ "step": 174
1078
+ },
1079
+ {
1080
+ "epoch": 0.64,
1081
+ "learning_rate": 0.00017071067811865476,
1082
+ "loss": 1.8939,
1083
+ "step": 175
1084
+ },
1085
+ {
1086
+ "epoch": 0.64,
1087
+ "learning_rate": 0.00017026499697988493,
1088
+ "loss": 1.819,
1089
+ "step": 176
1090
+ },
1091
+ {
1092
+ "epoch": 0.64,
1093
+ "learning_rate": 0.00016981654189934727,
1094
+ "loss": 1.8051,
1095
+ "step": 177
1096
+ },
1097
+ {
1098
+ "epoch": 0.65,
1099
+ "learning_rate": 0.0001693653305812805,
1100
+ "loss": 1.8504,
1101
+ "step": 178
1102
+ },
1103
+ {
1104
+ "epoch": 0.65,
1105
+ "learning_rate": 0.00016891138083873487,
1106
+ "loss": 1.7902,
1107
+ "step": 179
1108
+ },
1109
+ {
1110
+ "epoch": 0.65,
1111
+ "learning_rate": 0.00016845471059286887,
1112
+ "loss": 1.787,
1113
+ "step": 180
1114
+ },
1115
+ {
1116
+ "epoch": 0.66,
1117
+ "learning_rate": 0.00016799533787224192,
1118
+ "loss": 1.8533,
1119
+ "step": 181
1120
+ },
1121
+ {
1122
+ "epoch": 0.66,
1123
+ "learning_rate": 0.00016753328081210245,
1124
+ "loss": 1.831,
1125
+ "step": 182
1126
+ },
1127
+ {
1128
+ "epoch": 0.67,
1129
+ "learning_rate": 0.000167068557653672,
1130
+ "loss": 1.8204,
1131
+ "step": 183
1132
+ },
1133
+ {
1134
+ "epoch": 0.67,
1135
+ "learning_rate": 0.00016660118674342517,
1136
+ "loss": 1.8043,
1137
+ "step": 184
1138
+ },
1139
+ {
1140
+ "epoch": 0.67,
1141
+ "learning_rate": 0.00016613118653236518,
1142
+ "loss": 1.778,
1143
+ "step": 185
1144
+ },
1145
+ {
1146
+ "epoch": 0.68,
1147
+ "learning_rate": 0.00016565857557529566,
1148
+ "loss": 1.7868,
1149
+ "step": 186
1150
+ },
1151
+ {
1152
+ "epoch": 0.68,
1153
+ "learning_rate": 0.0001651833725300879,
1154
+ "loss": 1.8423,
1155
+ "step": 187
1156
+ },
1157
+ {
1158
+ "epoch": 0.68,
1159
+ "learning_rate": 0.00016470559615694446,
1160
+ "loss": 1.8157,
1161
+ "step": 188
1162
+ },
1163
+ {
1164
+ "epoch": 0.69,
1165
+ "learning_rate": 0.00016422526531765846,
1166
+ "loss": 1.7796,
1167
+ "step": 189
1168
+ },
1169
+ {
1170
+ "epoch": 0.69,
1171
+ "learning_rate": 0.000163742398974869,
1172
+ "loss": 1.705,
1173
+ "step": 190
1174
+ },
1175
+ {
1176
+ "epoch": 0.69,
1177
+ "learning_rate": 0.00016325701619131246,
1178
+ "loss": 1.7537,
1179
+ "step": 191
1180
+ },
1181
+ {
1182
+ "epoch": 0.7,
1183
+ "learning_rate": 0.00016276913612907007,
1184
+ "loss": 1.7629,
1185
+ "step": 192
1186
+ },
1187
+ {
1188
+ "epoch": 0.7,
1189
+ "learning_rate": 0.00016227877804881127,
1190
+ "loss": 1.7667,
1191
+ "step": 193
1192
+ },
1193
+ {
1194
+ "epoch": 0.71,
1195
+ "learning_rate": 0.00016178596130903344,
1196
+ "loss": 1.7376,
1197
+ "step": 194
1198
+ },
1199
+ {
1200
+ "epoch": 0.71,
1201
+ "learning_rate": 0.00016129070536529766,
1202
+ "loss": 1.7776,
1203
+ "step": 195
1204
+ },
1205
+ {
1206
+ "epoch": 0.71,
1207
+ "learning_rate": 0.00016079302976946055,
1208
+ "loss": 1.6759,
1209
+ "step": 196
1210
+ },
1211
+ {
1212
+ "epoch": 0.72,
1213
+ "learning_rate": 0.00016029295416890248,
1214
+ "loss": 1.7559,
1215
+ "step": 197
1216
+ },
1217
+ {
1218
+ "epoch": 0.72,
1219
+ "learning_rate": 0.0001597904983057519,
1220
+ "loss": 1.76,
1221
+ "step": 198
1222
+ },
1223
+ {
1224
+ "epoch": 0.72,
1225
+ "learning_rate": 0.00015928568201610595,
1226
+ "loss": 1.7412,
1227
+ "step": 199
1228
+ },
1229
+ {
1230
+ "epoch": 0.73,
1231
+ "learning_rate": 0.00015877852522924732,
1232
+ "loss": 1.6685,
1233
+ "step": 200
1234
+ },
1235
+ {
1236
+ "epoch": 0.73,
1237
+ "learning_rate": 0.00015826904796685762,
1238
+ "loss": 1.7587,
1239
+ "step": 201
1240
+ },
1241
+ {
1242
+ "epoch": 0.73,
1243
+ "learning_rate": 0.00015775727034222675,
1244
+ "loss": 1.7417,
1245
+ "step": 202
1246
+ },
1247
+ {
1248
+ "epoch": 0.74,
1249
+ "learning_rate": 0.0001572432125594591,
1250
+ "loss": 1.6513,
1251
+ "step": 203
1252
+ },
1253
+ {
1254
+ "epoch": 0.74,
1255
+ "learning_rate": 0.00015672689491267567,
1256
+ "loss": 1.6968,
1257
+ "step": 204
1258
+ },
1259
+ {
1260
+ "epoch": 0.75,
1261
+ "learning_rate": 0.00015620833778521307,
1262
+ "loss": 1.6509,
1263
+ "step": 205
1264
+ },
1265
+ {
1266
+ "epoch": 0.75,
1267
+ "learning_rate": 0.00015568756164881882,
1268
+ "loss": 1.6245,
1269
+ "step": 206
1270
+ },
1271
+ {
1272
+ "epoch": 0.75,
1273
+ "learning_rate": 0.00015516458706284303,
1274
+ "loss": 1.6608,
1275
+ "step": 207
1276
+ },
1277
+ {
1278
+ "epoch": 0.75,
1279
+ "eval_loss": 1.6871771812438965,
1280
+ "eval_runtime": 44.071,
1281
+ "eval_samples_per_second": 25.64,
1282
+ "eval_steps_per_second": 1.611,
1283
+ "step": 207
1284
+ },
1285
+ {
1286
+ "epoch": 0.76,
1287
+ "learning_rate": 0.00015463943467342693,
1288
+ "loss": 1.6572,
1289
+ "step": 208
1290
+ },
1291
+ {
1292
+ "epoch": 0.76,
1293
+ "learning_rate": 0.00015411212521268758,
1294
+ "loss": 1.5938,
1295
+ "step": 209
1296
+ },
1297
+ {
1298
+ "epoch": 0.76,
1299
+ "learning_rate": 0.00015358267949789966,
1300
+ "loss": 1.6421,
1301
+ "step": 210
1302
+ },
1303
+ {
1304
+ "epoch": 0.77,
1305
+ "learning_rate": 0.0001530511184306734,
1306
+ "loss": 1.7423,
1307
+ "step": 211
1308
+ },
1309
+ {
1310
+ "epoch": 0.77,
1311
+ "learning_rate": 0.0001525174629961296,
1312
+ "loss": 1.6636,
1313
+ "step": 212
1314
+ },
1315
+ {
1316
+ "epoch": 0.77,
1317
+ "learning_rate": 0.00015198173426207094,
1318
+ "loss": 1.7339,
1319
+ "step": 213
1320
+ },
1321
+ {
1322
+ "epoch": 0.78,
1323
+ "learning_rate": 0.00015144395337815064,
1324
+ "loss": 1.6729,
1325
+ "step": 214
1326
+ },
1327
+ {
1328
+ "epoch": 0.78,
1329
+ "learning_rate": 0.00015090414157503714,
1330
+ "loss": 1.6939,
1331
+ "step": 215
1332
+ },
1333
+ {
1334
+ "epoch": 0.79,
1335
+ "learning_rate": 0.0001503623201635761,
1336
+ "loss": 1.6394,
1337
+ "step": 216
1338
+ },
1339
+ {
1340
+ "epoch": 0.79,
1341
+ "learning_rate": 0.0001498185105339491,
1342
+ "loss": 1.6196,
1343
+ "step": 217
1344
+ },
1345
+ {
1346
+ "epoch": 0.79,
1347
+ "learning_rate": 0.00014927273415482915,
1348
+ "loss": 1.6304,
1349
+ "step": 218
1350
+ },
1351
+ {
1352
+ "epoch": 0.8,
1353
+ "learning_rate": 0.00014872501257253323,
1354
+ "loss": 1.6124,
1355
+ "step": 219
1356
+ },
1357
+ {
1358
+ "epoch": 0.8,
1359
+ "learning_rate": 0.00014817536741017152,
1360
+ "loss": 1.6011,
1361
+ "step": 220
1362
+ },
1363
+ {
1364
+ "epoch": 0.8,
1365
+ "learning_rate": 0.0001476238203667939,
1366
+ "loss": 1.5667,
1367
+ "step": 221
1368
+ },
1369
+ {
1370
+ "epoch": 0.81,
1371
+ "learning_rate": 0.0001470703932165333,
1372
+ "loss": 1.6536,
1373
+ "step": 222
1374
+ },
1375
+ {
1376
+ "epoch": 0.81,
1377
+ "learning_rate": 0.00014651510780774583,
1378
+ "loss": 1.5861,
1379
+ "step": 223
1380
+ },
1381
+ {
1382
+ "epoch": 0.81,
1383
+ "learning_rate": 0.00014595798606214882,
1384
+ "loss": 1.5339,
1385
+ "step": 224
1386
+ },
1387
+ {
1388
+ "epoch": 0.82,
1389
+ "learning_rate": 0.00014539904997395468,
1390
+ "loss": 1.6058,
1391
+ "step": 225
1392
+ },
1393
+ {
1394
+ "epoch": 0.82,
1395
+ "learning_rate": 0.00014483832160900326,
1396
+ "loss": 1.605,
1397
+ "step": 226
1398
+ },
1399
+ {
1400
+ "epoch": 0.83,
1401
+ "learning_rate": 0.0001442758231038902,
1402
+ "loss": 1.5731,
1403
+ "step": 227
1404
+ },
1405
+ {
1406
+ "epoch": 0.83,
1407
+ "learning_rate": 0.0001437115766650933,
1408
+ "loss": 1.5876,
1409
+ "step": 228
1410
+ },
1411
+ {
1412
+ "epoch": 0.83,
1413
+ "learning_rate": 0.0001431456045680959,
1414
+ "loss": 1.6278,
1415
+ "step": 229
1416
+ },
1417
+ {
1418
+ "epoch": 0.84,
1419
+ "learning_rate": 0.00014257792915650728,
1420
+ "loss": 1.5877,
1421
+ "step": 230
1422
+ },
1423
+ {
1424
+ "epoch": 0.84,
1425
+ "learning_rate": 0.00014200857284118066,
1426
+ "loss": 1.5985,
1427
+ "step": 231
1428
+ },
1429
+ {
1430
+ "epoch": 0.84,
1431
+ "learning_rate": 0.00014143755809932845,
1432
+ "loss": 1.5359,
1433
+ "step": 232
1434
+ },
1435
+ {
1436
+ "epoch": 0.85,
1437
+ "learning_rate": 0.00014086490747363493,
1438
+ "loss": 1.4969,
1439
+ "step": 233
1440
+ },
1441
+ {
1442
+ "epoch": 0.85,
1443
+ "learning_rate": 0.00014029064357136628,
1444
+ "loss": 1.5615,
1445
+ "step": 234
1446
+ },
1447
+ {
1448
+ "epoch": 0.85,
1449
+ "learning_rate": 0.00013971478906347806,
1450
+ "loss": 1.5356,
1451
+ "step": 235
1452
+ },
1453
+ {
1454
+ "epoch": 0.86,
1455
+ "learning_rate": 0.00013913736668372026,
1456
+ "loss": 1.5306,
1457
+ "step": 236
1458
+ },
1459
+ {
1460
+ "epoch": 0.86,
1461
+ "learning_rate": 0.00013855839922773968,
1462
+ "loss": 1.4575,
1463
+ "step": 237
1464
+ },
1465
+ {
1466
+ "epoch": 0.87,
1467
+ "learning_rate": 0.00013797790955218014,
1468
+ "loss": 1.5216,
1469
+ "step": 238
1470
+ },
1471
+ {
1472
+ "epoch": 0.87,
1473
+ "learning_rate": 0.00013739592057378003,
1474
+ "loss": 1.5153,
1475
+ "step": 239
1476
+ },
1477
+ {
1478
+ "epoch": 0.87,
1479
+ "learning_rate": 0.00013681245526846783,
1480
+ "loss": 1.4623,
1481
+ "step": 240
1482
+ },
1483
+ {
1484
+ "epoch": 0.88,
1485
+ "learning_rate": 0.00013622753667045457,
1486
+ "loss": 1.4995,
1487
+ "step": 241
1488
+ },
1489
+ {
1490
+ "epoch": 0.88,
1491
+ "learning_rate": 0.00013564118787132506,
1492
+ "loss": 1.5363,
1493
+ "step": 242
1494
+ },
1495
+ {
1496
+ "epoch": 0.88,
1497
+ "learning_rate": 0.0001350534320191259,
1498
+ "loss": 1.425,
1499
+ "step": 243
1500
+ },
1501
+ {
1502
+ "epoch": 0.89,
1503
+ "learning_rate": 0.0001344642923174517,
1504
+ "loss": 1.4145,
1505
+ "step": 244
1506
+ },
1507
+ {
1508
+ "epoch": 0.89,
1509
+ "learning_rate": 0.00013387379202452917,
1510
+ "loss": 1.4994,
1511
+ "step": 245
1512
+ },
1513
+ {
1514
+ "epoch": 0.89,
1515
+ "learning_rate": 0.00013328195445229868,
1516
+ "loss": 1.4758,
1517
+ "step": 246
1518
+ },
1519
+ {
1520
+ "epoch": 0.9,
1521
+ "learning_rate": 0.00013268880296549425,
1522
+ "loss": 1.5288,
1523
+ "step": 247
1524
+ },
1525
+ {
1526
+ "epoch": 0.9,
1527
+ "learning_rate": 0.00013209436098072095,
1528
+ "loss": 1.4809,
1529
+ "step": 248
1530
+ },
1531
+ {
1532
+ "epoch": 0.91,
1533
+ "learning_rate": 0.0001314986519655305,
1534
+ "loss": 1.4916,
1535
+ "step": 249
1536
+ },
1537
+ {
1538
+ "epoch": 0.91,
1539
+ "learning_rate": 0.00013090169943749476,
1540
+ "loss": 1.5181,
1541
+ "step": 250
1542
+ },
1543
+ {
1544
+ "epoch": 0.91,
1545
+ "learning_rate": 0.00013030352696327742,
1546
+ "loss": 1.4987,
1547
+ "step": 251
1548
+ },
1549
+ {
1550
+ "epoch": 0.92,
1551
+ "learning_rate": 0.0001297041581577035,
1552
+ "loss": 1.4526,
1553
+ "step": 252
1554
+ },
1555
+ {
1556
+ "epoch": 0.92,
1557
+ "learning_rate": 0.00012910361668282719,
1558
+ "loss": 1.4375,
1559
+ "step": 253
1560
+ },
1561
+ {
1562
+ "epoch": 0.92,
1563
+ "learning_rate": 0.0001285019262469976,
1564
+ "loss": 1.4934,
1565
+ "step": 254
1566
+ },
1567
+ {
1568
+ "epoch": 0.93,
1569
+ "learning_rate": 0.00012789911060392294,
1570
+ "loss": 1.443,
1571
+ "step": 255
1572
+ },
1573
+ {
1574
+ "epoch": 0.93,
1575
+ "learning_rate": 0.00012729519355173254,
1576
+ "loss": 1.359,
1577
+ "step": 256
1578
+ },
1579
+ {
1580
+ "epoch": 0.93,
1581
+ "learning_rate": 0.00012669019893203759,
1582
+ "loss": 1.4471,
1583
+ "step": 257
1584
+ },
1585
+ {
1586
+ "epoch": 0.94,
1587
+ "learning_rate": 0.00012608415062898972,
1588
+ "loss": 1.4615,
1589
+ "step": 258
1590
+ },
1591
+ {
1592
+ "epoch": 0.94,
1593
+ "learning_rate": 0.00012547707256833823,
1594
+ "loss": 1.4249,
1595
+ "step": 259
1596
+ },
1597
+ {
1598
+ "epoch": 0.95,
1599
+ "learning_rate": 0.0001248689887164855,
1600
+ "loss": 1.4157,
1601
+ "step": 260
1602
+ },
1603
+ {
1604
+ "epoch": 0.95,
1605
+ "learning_rate": 0.00012425992307954075,
1606
+ "loss": 1.4213,
1607
+ "step": 261
1608
+ },
1609
+ {
1610
+ "epoch": 0.95,
1611
+ "learning_rate": 0.00012364989970237248,
1612
+ "loss": 1.3552,
1613
+ "step": 262
1614
+ },
1615
+ {
1616
+ "epoch": 0.96,
1617
+ "learning_rate": 0.00012303894266765908,
1618
+ "loss": 1.3715,
1619
+ "step": 263
1620
+ },
1621
+ {
1622
+ "epoch": 0.96,
1623
+ "learning_rate": 0.00012242707609493814,
1624
+ "loss": 1.3484,
1625
+ "step": 264
1626
+ },
1627
+ {
1628
+ "epoch": 0.96,
1629
+ "learning_rate": 0.00012181432413965428,
1630
+ "loss": 1.4268,
1631
+ "step": 265
1632
+ },
1633
+ {
1634
+ "epoch": 0.97,
1635
+ "learning_rate": 0.00012120071099220549,
1636
+ "loss": 1.3612,
1637
+ "step": 266
1638
+ },
1639
+ {
1640
+ "epoch": 0.97,
1641
+ "learning_rate": 0.00012058626087698814,
1642
+ "loss": 1.341,
1643
+ "step": 267
1644
+ },
1645
+ {
1646
+ "epoch": 0.97,
1647
+ "learning_rate": 0.00011997099805144069,
1648
+ "loss": 1.3221,
1649
+ "step": 268
1650
+ },
1651
+ {
1652
+ "epoch": 0.98,
1653
+ "learning_rate": 0.00011935494680508606,
1654
+ "loss": 1.3872,
1655
+ "step": 269
1656
+ },
1657
+ {
1658
+ "epoch": 0.98,
1659
+ "learning_rate": 0.00011873813145857249,
1660
+ "loss": 1.4136,
1661
+ "step": 270
1662
+ },
1663
+ {
1664
+ "epoch": 0.99,
1665
+ "learning_rate": 0.00011812057636271374,
1666
+ "loss": 1.3503,
1667
+ "step": 271
1668
+ },
1669
+ {
1670
+ "epoch": 0.99,
1671
+ "learning_rate": 0.00011750230589752762,
1672
+ "loss": 1.4157,
1673
+ "step": 272
1674
+ },
1675
+ {
1676
+ "epoch": 0.99,
1677
+ "learning_rate": 0.00011688334447127338,
1678
+ "loss": 1.3828,
1679
+ "step": 273
1680
+ },
1681
+ {
1682
+ "epoch": 1.0,
1683
+ "learning_rate": 0.00011626371651948838,
1684
+ "loss": 1.3542,
1685
+ "step": 274
1686
+ },
1687
+ {
1688
+ "epoch": 1.0,
1689
+ "learning_rate": 0.0001156434465040231,
1690
+ "loss": 1.3105,
1691
+ "step": 275
1692
+ },
1693
+ {
1694
+ "epoch": 1.0,
1695
+ "learning_rate": 0.00011502255891207572,
1696
+ "loss": 1.3767,
1697
+ "step": 276
1698
+ },
1699
+ {
1700
+ "epoch": 1.0,
1701
+ "eval_loss": 1.3322992324829102,
1702
+ "eval_runtime": 43.9906,
1703
+ "eval_samples_per_second": 25.687,
1704
+ "eval_steps_per_second": 1.614,
1705
+ "step": 276
1706
+ },
1707
+ {
1708
+ "epoch": 1.01,
1709
+ "learning_rate": 0.00011440107825522521,
1710
+ "loss": 1.329,
1711
+ "step": 277
1712
+ },
1713
+ {
1714
+ "epoch": 1.01,
1715
+ "learning_rate": 0.0001137790290684638,
1716
+ "loss": 1.3425,
1717
+ "step": 278
1718
+ },
1719
+ {
1720
+ "epoch": 1.01,
1721
+ "learning_rate": 0.00011315643590922827,
1722
+ "loss": 1.3341,
1723
+ "step": 279
1724
+ },
1725
+ {
1726
+ "epoch": 1.02,
1727
+ "learning_rate": 0.00011253332335643043,
1728
+ "loss": 1.2998,
1729
+ "step": 280
1730
+ },
1731
+ {
1732
+ "epoch": 1.02,
1733
+ "learning_rate": 0.00011190971600948699,
1734
+ "loss": 1.2836,
1735
+ "step": 281
1736
+ },
1737
+ {
1738
+ "epoch": 1.0,
1739
+ "learning_rate": 0.00011128563848734816,
1740
+ "loss": 0.9234,
1741
+ "step": 282
1742
+ },
1743
+ {
1744
+ "epoch": 1.01,
1745
+ "learning_rate": 0.000110661115427526,
1746
+ "loss": 1.0205,
1747
+ "step": 283
1748
+ },
1749
+ {
1750
+ "epoch": 1.01,
1751
+ "learning_rate": 0.00011003617148512149,
1752
+ "loss": 0.8919,
1753
+ "step": 284
1754
+ },
1755
+ {
1756
+ "epoch": 1.01,
1757
+ "learning_rate": 0.00010941083133185146,
1758
+ "loss": 0.9155,
1759
+ "step": 285
1760
+ },
1761
+ {
1762
+ "epoch": 1.02,
1763
+ "learning_rate": 0.00010878511965507434,
1764
+ "loss": 0.9399,
1765
+ "step": 286
1766
+ },
1767
+ {
1768
+ "epoch": 1.02,
1769
+ "learning_rate": 0.00010815906115681578,
1770
+ "loss": 0.9605,
1771
+ "step": 287
1772
+ },
1773
+ {
1774
+ "epoch": 1.03,
1775
+ "learning_rate": 0.00010753268055279329,
1776
+ "loss": 0.9216,
1777
+ "step": 288
1778
+ },
1779
+ {
1780
+ "epoch": 1.03,
1781
+ "learning_rate": 0.00010690600257144061,
1782
+ "loss": 0.8771,
1783
+ "step": 289
1784
+ },
1785
+ {
1786
+ "epoch": 1.03,
1787
+ "learning_rate": 0.00010627905195293135,
1788
+ "loss": 0.8391,
1789
+ "step": 290
1790
+ },
1791
+ {
1792
+ "epoch": 1.04,
1793
+ "learning_rate": 0.00010565185344820247,
1794
+ "loss": 0.8604,
1795
+ "step": 291
1796
+ },
1797
+ {
1798
+ "epoch": 1.04,
1799
+ "learning_rate": 0.00010502443181797697,
1800
+ "loss": 0.911,
1801
+ "step": 292
1802
+ },
1803
+ {
1804
+ "epoch": 1.04,
1805
+ "learning_rate": 0.0001043968118317865,
1806
+ "loss": 0.9754,
1807
+ "step": 293
1808
+ },
1809
+ {
1810
+ "epoch": 1.05,
1811
+ "learning_rate": 0.00010376901826699348,
1812
+ "loss": 0.9006,
1813
+ "step": 294
1814
+ },
1815
+ {
1816
+ "epoch": 1.05,
1817
+ "learning_rate": 0.00010314107590781284,
1818
+ "loss": 0.8617,
1819
+ "step": 295
1820
+ },
1821
+ {
1822
+ "epoch": 1.05,
1823
+ "learning_rate": 0.00010251300954433376,
1824
+ "loss": 0.9077,
1825
+ "step": 296
1826
+ },
1827
+ {
1828
+ "epoch": 1.06,
1829
+ "learning_rate": 0.00010188484397154084,
1830
+ "loss": 0.8979,
1831
+ "step": 297
1832
+ },
1833
+ {
1834
+ "epoch": 1.06,
1835
+ "learning_rate": 0.00010125660398833528,
1836
+ "loss": 0.8798,
1837
+ "step": 298
1838
+ },
1839
+ {
1840
+ "epoch": 1.07,
1841
+ "learning_rate": 0.00010062831439655591,
1842
+ "loss": 0.8271,
1843
+ "step": 299
1844
+ },
1845
+ {
1846
+ "epoch": 1.07,
1847
+ "learning_rate": 0.0001,
1848
+ "loss": 0.8504,
1849
+ "step": 300
1850
+ },
1851
+ {
1852
+ "epoch": 1.07,
1853
+ "learning_rate": 9.937168560344412e-05,
1854
+ "loss": 0.8761,
1855
+ "step": 301
1856
+ },
1857
+ {
1858
+ "epoch": 1.08,
1859
+ "learning_rate": 9.874339601166473e-05,
1860
+ "loss": 0.9453,
1861
+ "step": 302
1862
+ },
1863
+ {
1864
+ "epoch": 1.08,
1865
+ "learning_rate": 9.81151560284592e-05,
1866
+ "loss": 0.828,
1867
+ "step": 303
1868
+ },
1869
+ {
1870
+ "epoch": 1.08,
1871
+ "learning_rate": 9.748699045566626e-05,
1872
+ "loss": 0.7736,
1873
+ "step": 304
1874
+ },
1875
+ {
1876
+ "epoch": 1.09,
1877
+ "learning_rate": 9.685892409218717e-05,
1878
+ "loss": 0.8621,
1879
+ "step": 305
1880
+ },
1881
+ {
1882
+ "epoch": 1.09,
1883
+ "learning_rate": 9.623098173300654e-05,
1884
+ "loss": 0.8606,
1885
+ "step": 306
1886
+ },
1887
+ {
1888
+ "epoch": 1.09,
1889
+ "learning_rate": 9.560318816821353e-05,
1890
+ "loss": 0.8186,
1891
+ "step": 307
1892
+ },
1893
+ {
1894
+ "epoch": 1.1,
1895
+ "learning_rate": 9.497556818202306e-05,
1896
+ "loss": 0.8447,
1897
+ "step": 308
1898
+ },
1899
+ {
1900
+ "epoch": 1.1,
1901
+ "learning_rate": 9.434814655179755e-05,
1902
+ "loss": 0.8451,
1903
+ "step": 309
1904
+ },
1905
+ {
1906
+ "epoch": 1.11,
1907
+ "learning_rate": 9.372094804706867e-05,
1908
+ "loss": 0.8788,
1909
+ "step": 310
1910
+ },
1911
+ {
1912
+ "epoch": 1.11,
1913
+ "learning_rate": 9.309399742855942e-05,
1914
+ "loss": 0.8211,
1915
+ "step": 311
1916
+ },
1917
+ {
1918
+ "epoch": 1.11,
1919
+ "learning_rate": 9.246731944720675e-05,
1920
+ "loss": 0.8384,
1921
+ "step": 312
1922
+ },
1923
+ {
1924
+ "epoch": 1.12,
1925
+ "learning_rate": 9.184093884318425e-05,
1926
+ "loss": 0.8284,
1927
+ "step": 313
1928
+ },
1929
+ {
1930
+ "epoch": 1.12,
1931
+ "learning_rate": 9.121488034492569e-05,
1932
+ "loss": 0.8377,
1933
+ "step": 314
1934
+ },
1935
+ {
1936
+ "epoch": 1.12,
1937
+ "learning_rate": 9.058916866814858e-05,
1938
+ "loss": 0.8355,
1939
+ "step": 315
1940
+ },
1941
+ {
1942
+ "epoch": 1.13,
1943
+ "learning_rate": 8.99638285148785e-05,
1944
+ "loss": 0.8995,
1945
+ "step": 316
1946
+ },
1947
+ {
1948
+ "epoch": 1.13,
1949
+ "learning_rate": 8.933888457247402e-05,
1950
+ "loss": 0.8128,
1951
+ "step": 317
1952
+ },
1953
+ {
1954
+ "epoch": 1.13,
1955
+ "learning_rate": 8.871436151265184e-05,
1956
+ "loss": 0.8239,
1957
+ "step": 318
1958
+ },
1959
+ {
1960
+ "epoch": 1.14,
1961
+ "learning_rate": 8.809028399051302e-05,
1962
+ "loss": 0.8796,
1963
+ "step": 319
1964
+ },
1965
+ {
1966
+ "epoch": 1.14,
1967
+ "learning_rate": 8.746667664356956e-05,
1968
+ "loss": 0.8583,
1969
+ "step": 320
1970
+ },
1971
+ {
1972
+ "epoch": 1.15,
1973
+ "learning_rate": 8.684356409077176e-05,
1974
+ "loss": 0.8328,
1975
+ "step": 321
1976
+ },
1977
+ {
1978
+ "epoch": 1.15,
1979
+ "learning_rate": 8.62209709315362e-05,
1980
+ "loss": 0.8706,
1981
+ "step": 322
1982
+ },
1983
+ {
1984
+ "epoch": 1.15,
1985
+ "learning_rate": 8.559892174477479e-05,
1986
+ "loss": 0.8346,
1987
+ "step": 323
1988
+ },
1989
+ {
1990
+ "epoch": 1.16,
1991
+ "learning_rate": 8.497744108792429e-05,
1992
+ "loss": 0.8461,
1993
+ "step": 324
1994
+ },
1995
+ {
1996
+ "epoch": 1.16,
1997
+ "learning_rate": 8.435655349597689e-05,
1998
+ "loss": 0.8005,
1999
+ "step": 325
2000
+ },
2001
+ {
2002
+ "epoch": 1.16,
2003
+ "learning_rate": 8.373628348051165e-05,
2004
+ "loss": 0.8283,
2005
+ "step": 326
2006
+ },
2007
+ {
2008
+ "epoch": 1.17,
2009
+ "learning_rate": 8.311665552872662e-05,
2010
+ "loss": 0.8153,
2011
+ "step": 327
2012
+ },
2013
+ {
2014
+ "epoch": 1.17,
2015
+ "learning_rate": 8.249769410247239e-05,
2016
+ "loss": 0.819,
2017
+ "step": 328
2018
+ },
2019
+ {
2020
+ "epoch": 1.17,
2021
+ "learning_rate": 8.187942363728625e-05,
2022
+ "loss": 0.7948,
2023
+ "step": 329
2024
+ },
2025
+ {
2026
+ "epoch": 1.18,
2027
+ "learning_rate": 8.126186854142752e-05,
2028
+ "loss": 0.8707,
2029
+ "step": 330
2030
+ },
2031
+ {
2032
+ "epoch": 1.18,
2033
+ "learning_rate": 8.064505319491398e-05,
2034
+ "loss": 0.7892,
2035
+ "step": 331
2036
+ },
2037
+ {
2038
+ "epoch": 1.19,
2039
+ "learning_rate": 8.002900194855932e-05,
2040
+ "loss": 0.7967,
2041
+ "step": 332
2042
+ },
2043
+ {
2044
+ "epoch": 1.19,
2045
+ "learning_rate": 7.941373912301189e-05,
2046
+ "loss": 0.8121,
2047
+ "step": 333
2048
+ },
2049
+ {
2050
+ "epoch": 1.19,
2051
+ "learning_rate": 7.879928900779456e-05,
2052
+ "loss": 0.8048,
2053
+ "step": 334
2054
+ },
2055
+ {
2056
+ "epoch": 1.2,
2057
+ "learning_rate": 7.818567586034577e-05,
2058
+ "loss": 0.8349,
2059
+ "step": 335
2060
+ },
2061
+ {
2062
+ "epoch": 1.2,
2063
+ "learning_rate": 7.75729239050619e-05,
2064
+ "loss": 0.8283,
2065
+ "step": 336
2066
+ },
2067
+ {
2068
+ "epoch": 1.2,
2069
+ "learning_rate": 7.696105733234098e-05,
2070
+ "loss": 0.7967,
2071
+ "step": 337
2072
+ },
2073
+ {
2074
+ "epoch": 1.21,
2075
+ "learning_rate": 7.635010029762756e-05,
2076
+ "loss": 0.7849,
2077
+ "step": 338
2078
+ },
2079
+ {
2080
+ "epoch": 1.21,
2081
+ "learning_rate": 7.574007692045928e-05,
2082
+ "loss": 0.7551,
2083
+ "step": 339
2084
+ },
2085
+ {
2086
+ "epoch": 1.21,
2087
+ "learning_rate": 7.513101128351454e-05,
2088
+ "loss": 0.7867,
2089
+ "step": 340
2090
+ },
2091
+ {
2092
+ "epoch": 1.22,
2093
+ "learning_rate": 7.45229274316618e-05,
2094
+ "loss": 0.803,
2095
+ "step": 341
2096
+ },
2097
+ {
2098
+ "epoch": 1.22,
2099
+ "learning_rate": 7.391584937101033e-05,
2100
+ "loss": 0.7521,
2101
+ "step": 342
2102
+ },
2103
+ {
2104
+ "epoch": 1.23,
2105
+ "learning_rate": 7.330980106796246e-05,
2106
+ "loss": 0.763,
2107
+ "step": 343
2108
+ },
2109
+ {
2110
+ "epoch": 1.23,
2111
+ "learning_rate": 7.270480644826749e-05,
2112
+ "loss": 0.7401,
2113
+ "step": 344
2114
+ },
2115
+ {
2116
+ "epoch": 1.23,
2117
+ "learning_rate": 7.210088939607708e-05,
2118
+ "loss": 0.7872,
2119
+ "step": 345
2120
+ },
2121
+ {
2122
+ "epoch": 1.23,
2123
+ "eval_loss": 1.0583446025848389,
2124
+ "eval_runtime": 44.1159,
2125
+ "eval_samples_per_second": 25.614,
2126
+ "eval_steps_per_second": 1.609,
2127
+ "step": 345
2128
+ },
2129
+ {
2130
+ "epoch": 1.24,
2131
+ "learning_rate": 7.149807375300239e-05,
2132
+ "loss": 0.815,
2133
+ "step": 346
2134
+ },
2135
+ {
2136
+ "epoch": 1.24,
2137
+ "learning_rate": 7.089638331717284e-05,
2138
+ "loss": 0.7244,
2139
+ "step": 347
2140
+ },
2141
+ {
2142
+ "epoch": 1.24,
2143
+ "learning_rate": 7.029584184229653e-05,
2144
+ "loss": 0.7499,
2145
+ "step": 348
2146
+ },
2147
+ {
2148
+ "epoch": 1.25,
2149
+ "learning_rate": 6.969647303672262e-05,
2150
+ "loss": 0.6976,
2151
+ "step": 349
2152
+ },
2153
+ {
2154
+ "epoch": 1.25,
2155
+ "learning_rate": 6.909830056250527e-05,
2156
+ "loss": 0.787,
2157
+ "step": 350
2158
+ },
2159
+ {
2160
+ "epoch": 1.25,
2161
+ "learning_rate": 6.850134803446954e-05,
2162
+ "loss": 0.8019,
2163
+ "step": 351
2164
+ },
2165
+ {
2166
+ "epoch": 1.26,
2167
+ "learning_rate": 6.790563901927907e-05,
2168
+ "loss": 0.7323,
2169
+ "step": 352
2170
+ },
2171
+ {
2172
+ "epoch": 1.26,
2173
+ "learning_rate": 6.731119703450577e-05,
2174
+ "loss": 0.7873,
2175
+ "step": 353
2176
+ },
2177
+ {
2178
+ "epoch": 1.27,
2179
+ "learning_rate": 6.671804554770135e-05,
2180
+ "loss": 0.8039,
2181
+ "step": 354
2182
+ },
2183
+ {
2184
+ "epoch": 1.27,
2185
+ "learning_rate": 6.612620797547087e-05,
2186
+ "loss": 0.7192,
2187
+ "step": 355
2188
+ },
2189
+ {
2190
+ "epoch": 1.27,
2191
+ "learning_rate": 6.55357076825483e-05,
2192
+ "loss": 0.7871,
2193
+ "step": 356
2194
+ },
2195
+ {
2196
+ "epoch": 1.28,
2197
+ "learning_rate": 6.494656798087412e-05,
2198
+ "loss": 0.6948,
2199
+ "step": 357
2200
+ },
2201
+ {
2202
+ "epoch": 1.28,
2203
+ "learning_rate": 6.435881212867493e-05,
2204
+ "loss": 0.7302,
2205
+ "step": 358
2206
+ },
2207
+ {
2208
+ "epoch": 1.28,
2209
+ "learning_rate": 6.377246332954544e-05,
2210
+ "loss": 0.7555,
2211
+ "step": 359
2212
+ },
2213
+ {
2214
+ "epoch": 1.29,
2215
+ "learning_rate": 6.318754473153221e-05,
2216
+ "loss": 0.7155,
2217
+ "step": 360
2218
+ },
2219
+ {
2220
+ "epoch": 1.29,
2221
+ "learning_rate": 6.260407942621998e-05,
2222
+ "loss": 0.7348,
2223
+ "step": 361
2224
+ },
2225
+ {
2226
+ "epoch": 1.29,
2227
+ "learning_rate": 6.20220904478199e-05,
2228
+ "loss": 0.7378,
2229
+ "step": 362
2230
+ },
2231
+ {
2232
+ "epoch": 1.3,
2233
+ "learning_rate": 6.144160077226036e-05,
2234
+ "loss": 0.6851,
2235
+ "step": 363
2236
+ },
2237
+ {
2238
+ "epoch": 1.3,
2239
+ "learning_rate": 6.086263331627976e-05,
2240
+ "loss": 0.7479,
2241
+ "step": 364
2242
+ },
2243
+ {
2244
+ "epoch": 1.31,
2245
+ "learning_rate": 6.0285210936521955e-05,
2246
+ "loss": 0.6409,
2247
+ "step": 365
2248
+ },
2249
+ {
2250
+ "epoch": 1.31,
2251
+ "learning_rate": 5.9709356428633746e-05,
2252
+ "loss": 0.697,
2253
+ "step": 366
2254
+ },
2255
+ {
2256
+ "epoch": 1.31,
2257
+ "learning_rate": 5.913509252636511e-05,
2258
+ "loss": 0.7208,
2259
+ "step": 367
2260
+ },
2261
+ {
2262
+ "epoch": 1.32,
2263
+ "learning_rate": 5.856244190067159e-05,
2264
+ "loss": 0.7362,
2265
+ "step": 368
2266
+ },
2267
+ {
2268
+ "epoch": 1.32,
2269
+ "learning_rate": 5.799142715881938e-05,
2270
+ "loss": 0.7458,
2271
+ "step": 369
2272
+ },
2273
+ {
2274
+ "epoch": 1.32,
2275
+ "learning_rate": 5.7422070843492734e-05,
2276
+ "loss": 0.715,
2277
+ "step": 370
2278
+ },
2279
+ {
2280
+ "epoch": 1.33,
2281
+ "learning_rate": 5.6854395431904094e-05,
2282
+ "loss": 0.7802,
2283
+ "step": 371
2284
+ },
2285
+ {
2286
+ "epoch": 1.33,
2287
+ "learning_rate": 5.6288423334906735e-05,
2288
+ "loss": 0.7155,
2289
+ "step": 372
2290
+ },
2291
+ {
2292
+ "epoch": 1.33,
2293
+ "learning_rate": 5.572417689610987e-05,
2294
+ "loss": 0.7218,
2295
+ "step": 373
2296
+ },
2297
+ {
2298
+ "epoch": 1.34,
2299
+ "learning_rate": 5.5161678390996796e-05,
2300
+ "loss": 0.6944,
2301
+ "step": 374
2302
+ },
2303
+ {
2304
+ "epoch": 1.34,
2305
+ "learning_rate": 5.4600950026045326e-05,
2306
+ "loss": 0.6461,
2307
+ "step": 375
2308
+ },
2309
+ {
2310
+ "epoch": 1.35,
2311
+ "learning_rate": 5.404201393785122e-05,
2312
+ "loss": 0.6931,
2313
+ "step": 376
2314
+ },
2315
+ {
2316
+ "epoch": 1.35,
2317
+ "learning_rate": 5.348489219225416e-05,
2318
+ "loss": 0.6618,
2319
+ "step": 377
2320
+ },
2321
+ {
2322
+ "epoch": 1.35,
2323
+ "learning_rate": 5.292960678346675e-05,
2324
+ "loss": 0.6212,
2325
+ "step": 378
2326
+ },
2327
+ {
2328
+ "epoch": 1.36,
2329
+ "learning_rate": 5.237617963320608e-05,
2330
+ "loss": 0.7231,
2331
+ "step": 379
2332
+ },
2333
+ {
2334
+ "epoch": 1.36,
2335
+ "learning_rate": 5.182463258982846e-05,
2336
+ "loss": 0.6582,
2337
+ "step": 380
2338
+ },
2339
+ {
2340
+ "epoch": 1.36,
2341
+ "learning_rate": 5.127498742746675e-05,
2342
+ "loss": 0.6574,
2343
+ "step": 381
2344
+ },
2345
+ {
2346
+ "epoch": 1.37,
2347
+ "learning_rate": 5.072726584517086e-05,
2348
+ "loss": 0.6529,
2349
+ "step": 382
2350
+ },
2351
+ {
2352
+ "epoch": 1.37,
2353
+ "learning_rate": 5.018148946605092e-05,
2354
+ "loss": 0.6178,
2355
+ "step": 383
2356
+ },
2357
+ {
2358
+ "epoch": 1.37,
2359
+ "learning_rate": 4.9637679836423924e-05,
2360
+ "loss": 0.7012,
2361
+ "step": 384
2362
+ },
2363
+ {
2364
+ "epoch": 1.38,
2365
+ "learning_rate": 4.909585842496287e-05,
2366
+ "loss": 0.7101,
2367
+ "step": 385
2368
+ },
2369
+ {
2370
+ "epoch": 1.38,
2371
+ "learning_rate": 4.8556046621849346e-05,
2372
+ "loss": 0.6351,
2373
+ "step": 386
2374
+ },
2375
+ {
2376
+ "epoch": 1.39,
2377
+ "learning_rate": 4.8018265737929044e-05,
2378
+ "loss": 0.6254,
2379
+ "step": 387
2380
+ },
2381
+ {
2382
+ "epoch": 1.39,
2383
+ "learning_rate": 4.748253700387042e-05,
2384
+ "loss": 0.6286,
2385
+ "step": 388
2386
+ },
2387
+ {
2388
+ "epoch": 1.39,
2389
+ "learning_rate": 4.694888156932658e-05,
2390
+ "loss": 0.6273,
2391
+ "step": 389
2392
+ },
2393
+ {
2394
+ "epoch": 1.4,
2395
+ "learning_rate": 4.6417320502100316e-05,
2396
+ "loss": 0.5979,
2397
+ "step": 390
2398
+ },
2399
+ {
2400
+ "epoch": 1.4,
2401
+ "learning_rate": 4.588787478731242e-05,
2402
+ "loss": 0.6387,
2403
+ "step": 391
2404
+ },
2405
+ {
2406
+ "epoch": 1.4,
2407
+ "learning_rate": 4.5360565326573104e-05,
2408
+ "loss": 0.5849,
2409
+ "step": 392
2410
+ },
2411
+ {
2412
+ "epoch": 1.41,
2413
+ "learning_rate": 4.483541293715698e-05,
2414
+ "loss": 0.6265,
2415
+ "step": 393
2416
+ },
2417
+ {
2418
+ "epoch": 1.41,
2419
+ "learning_rate": 4.431243835118124e-05,
2420
+ "loss": 0.7038,
2421
+ "step": 394
2422
+ },
2423
+ {
2424
+ "epoch": 1.41,
2425
+ "learning_rate": 4.379166221478697e-05,
2426
+ "loss": 0.5836,
2427
+ "step": 395
2428
+ },
2429
+ {
2430
+ "epoch": 1.42,
2431
+ "learning_rate": 4.327310508732437e-05,
2432
+ "loss": 0.6163,
2433
+ "step": 396
2434
+ },
2435
+ {
2436
+ "epoch": 1.42,
2437
+ "learning_rate": 4.2756787440540936e-05,
2438
+ "loss": 0.6606,
2439
+ "step": 397
2440
+ },
2441
+ {
2442
+ "epoch": 1.43,
2443
+ "learning_rate": 4.224272965777326e-05,
2444
+ "loss": 0.6412,
2445
+ "step": 398
2446
+ },
2447
+ {
2448
+ "epoch": 1.43,
2449
+ "learning_rate": 4.173095203314241e-05,
2450
+ "loss": 0.6556,
2451
+ "step": 399
2452
+ },
2453
+ {
2454
+ "epoch": 1.43,
2455
+ "learning_rate": 4.12214747707527e-05,
2456
+ "loss": 0.6199,
2457
+ "step": 400
2458
+ },
2459
+ {
2460
+ "epoch": 1.44,
2461
+ "learning_rate": 4.071431798389408e-05,
2462
+ "loss": 0.6253,
2463
+ "step": 401
2464
+ },
2465
+ {
2466
+ "epoch": 1.44,
2467
+ "learning_rate": 4.020950169424815e-05,
2468
+ "loss": 0.6303,
2469
+ "step": 402
2470
+ },
2471
+ {
2472
+ "epoch": 1.44,
2473
+ "learning_rate": 3.9707045831097555e-05,
2474
+ "loss": 0.6379,
2475
+ "step": 403
2476
+ },
2477
+ {
2478
+ "epoch": 1.45,
2479
+ "learning_rate": 3.920697023053949e-05,
2480
+ "loss": 0.589,
2481
+ "step": 404
2482
+ },
2483
+ {
2484
+ "epoch": 1.45,
2485
+ "learning_rate": 3.8709294634702376e-05,
2486
+ "loss": 0.5801,
2487
+ "step": 405
2488
+ },
2489
+ {
2490
+ "epoch": 1.45,
2491
+ "learning_rate": 3.821403869096658e-05,
2492
+ "loss": 0.5913,
2493
+ "step": 406
2494
+ },
2495
+ {
2496
+ "epoch": 1.46,
2497
+ "learning_rate": 3.7721221951188765e-05,
2498
+ "loss": 0.6082,
2499
+ "step": 407
2500
+ },
2501
+ {
2502
+ "epoch": 1.46,
2503
+ "learning_rate": 3.7230863870929964e-05,
2504
+ "loss": 0.6199,
2505
+ "step": 408
2506
+ },
2507
+ {
2508
+ "epoch": 1.47,
2509
+ "learning_rate": 3.674298380868756e-05,
2510
+ "loss": 0.5785,
2511
+ "step": 409
2512
+ },
2513
+ {
2514
+ "epoch": 1.47,
2515
+ "learning_rate": 3.6257601025131026e-05,
2516
+ "loss": 0.588,
2517
+ "step": 410
2518
+ },
2519
+ {
2520
+ "epoch": 1.47,
2521
+ "learning_rate": 3.577473468234156e-05,
2522
+ "loss": 0.5988,
2523
+ "step": 411
2524
+ },
2525
+ {
2526
+ "epoch": 1.48,
2527
+ "learning_rate": 3.52944038430556e-05,
2528
+ "loss": 0.5986,
2529
+ "step": 412
2530
+ },
2531
+ {
2532
+ "epoch": 1.48,
2533
+ "learning_rate": 3.481662746991214e-05,
2534
+ "loss": 0.6181,
2535
+ "step": 413
2536
+ },
2537
+ {
2538
+ "epoch": 1.48,
2539
+ "learning_rate": 3.4341424424704375e-05,
2540
+ "loss": 0.5873,
2541
+ "step": 414
2542
+ },
2543
+ {
2544
+ "epoch": 1.48,
2545
+ "eval_loss": 0.8250847458839417,
2546
+ "eval_runtime": 44.0494,
2547
+ "eval_samples_per_second": 25.653,
2548
+ "eval_steps_per_second": 1.612,
2549
+ "step": 414
2550
+ },
2551
+ {
2552
+ "epoch": 1.49,
2553
+ "learning_rate": 3.386881346763483e-05,
2554
+ "loss": 0.599,
2555
+ "step": 415
2556
+ },
2557
+ {
2558
+ "epoch": 1.49,
2559
+ "learning_rate": 3.339881325657484e-05,
2560
+ "loss": 0.5636,
2561
+ "step": 416
2562
+ },
2563
+ {
2564
+ "epoch": 1.49,
2565
+ "learning_rate": 3.2931442346328004e-05,
2566
+ "loss": 0.5657,
2567
+ "step": 417
2568
+ },
2569
+ {
2570
+ "epoch": 1.5,
2571
+ "learning_rate": 3.246671918789755e-05,
2572
+ "loss": 0.6404,
2573
+ "step": 418
2574
+ },
2575
+ {
2576
+ "epoch": 1.5,
2577
+ "learning_rate": 3.200466212775808e-05,
2578
+ "loss": 0.6046,
2579
+ "step": 419
2580
+ },
2581
+ {
2582
+ "epoch": 1.51,
2583
+ "learning_rate": 3.154528940713113e-05,
2584
+ "loss": 0.5863,
2585
+ "step": 420
2586
+ },
2587
+ {
2588
+ "epoch": 1.51,
2589
+ "learning_rate": 3.108861916126518e-05,
2590
+ "loss": 0.5241,
2591
+ "step": 421
2592
+ },
2593
+ {
2594
+ "epoch": 1.51,
2595
+ "learning_rate": 3.063466941871952e-05,
2596
+ "loss": 0.5882,
2597
+ "step": 422
2598
+ },
2599
+ {
2600
+ "epoch": 1.52,
2601
+ "learning_rate": 3.018345810065275e-05,
2602
+ "loss": 0.5735,
2603
+ "step": 423
2604
+ },
2605
+ {
2606
+ "epoch": 1.52,
2607
+ "learning_rate": 2.9735003020115092e-05,
2608
+ "loss": 0.6128,
2609
+ "step": 424
2610
+ },
2611
+ {
2612
+ "epoch": 1.52,
2613
+ "learning_rate": 2.9289321881345254e-05,
2614
+ "loss": 0.5456,
2615
+ "step": 425
2616
+ },
2617
+ {
2618
+ "epoch": 1.53,
2619
+ "learning_rate": 2.8846432279071467e-05,
2620
+ "loss": 0.5692,
2621
+ "step": 426
2622
+ },
2623
+ {
2624
+ "epoch": 1.53,
2625
+ "learning_rate": 2.840635169781688e-05,
2626
+ "loss": 0.5997,
2627
+ "step": 427
2628
+ },
2629
+ {
2630
+ "epoch": 1.53,
2631
+ "learning_rate": 2.7969097511209308e-05,
2632
+ "loss": 0.5698,
2633
+ "step": 428
2634
+ },
2635
+ {
2636
+ "epoch": 1.54,
2637
+ "learning_rate": 2.753468698129533e-05,
2638
+ "loss": 0.5583,
2639
+ "step": 429
2640
+ },
2641
+ {
2642
+ "epoch": 1.54,
2643
+ "learning_rate": 2.7103137257858868e-05,
2644
+ "loss": 0.5595,
2645
+ "step": 430
2646
+ },
2647
+ {
2648
+ "epoch": 1.55,
2649
+ "learning_rate": 2.6674465377744017e-05,
2650
+ "loss": 0.5426,
2651
+ "step": 431
2652
+ },
2653
+ {
2654
+ "epoch": 1.55,
2655
+ "learning_rate": 2.624868826418262e-05,
2656
+ "loss": 0.5478,
2657
+ "step": 432
2658
+ },
2659
+ {
2660
+ "epoch": 1.55,
2661
+ "learning_rate": 2.582582272612609e-05,
2662
+ "loss": 0.5857,
2663
+ "step": 433
2664
+ },
2665
+ {
2666
+ "epoch": 1.56,
2667
+ "learning_rate": 2.540588545758179e-05,
2668
+ "loss": 0.5367,
2669
+ "step": 434
2670
+ },
2671
+ {
2672
+ "epoch": 1.56,
2673
+ "learning_rate": 2.4988893036954043e-05,
2674
+ "loss": 0.5333,
2675
+ "step": 435
2676
+ },
2677
+ {
2678
+ "epoch": 1.56,
2679
+ "learning_rate": 2.4574861926389615e-05,
2680
+ "loss": 0.5319,
2681
+ "step": 436
2682
+ },
2683
+ {
2684
+ "epoch": 1.57,
2685
+ "learning_rate": 2.4163808471127812e-05,
2686
+ "loss": 0.59,
2687
+ "step": 437
2688
+ },
2689
+ {
2690
+ "epoch": 1.57,
2691
+ "learning_rate": 2.37557488988552e-05,
2692
+ "loss": 0.5218,
2693
+ "step": 438
2694
+ },
2695
+ {
2696
+ "epoch": 1.57,
2697
+ "learning_rate": 2.3350699319065026e-05,
2698
+ "loss": 0.5501,
2699
+ "step": 439
2700
+ },
2701
+ {
2702
+ "epoch": 1.58,
2703
+ "learning_rate": 2.2948675722421086e-05,
2704
+ "loss": 0.5955,
2705
+ "step": 440
2706
+ },
2707
+ {
2708
+ "epoch": 1.58,
2709
+ "learning_rate": 2.254969398012663e-05,
2710
+ "loss": 0.5356,
2711
+ "step": 441
2712
+ },
2713
+ {
2714
+ "epoch": 1.59,
2715
+ "learning_rate": 2.2153769843297667e-05,
2716
+ "loss": 0.5743,
2717
+ "step": 442
2718
+ },
2719
+ {
2720
+ "epoch": 1.59,
2721
+ "learning_rate": 2.1760918942341192e-05,
2722
+ "loss": 0.5952,
2723
+ "step": 443
2724
+ },
2725
+ {
2726
+ "epoch": 1.59,
2727
+ "learning_rate": 2.137115678633811e-05,
2728
+ "loss": 0.5604,
2729
+ "step": 444
2730
+ },
2731
+ {
2732
+ "epoch": 1.6,
2733
+ "learning_rate": 2.098449876243096e-05,
2734
+ "loss": 0.5339,
2735
+ "step": 445
2736
+ },
2737
+ {
2738
+ "epoch": 1.6,
2739
+ "learning_rate": 2.0600960135216462e-05,
2740
+ "loss": 0.5411,
2741
+ "step": 446
2742
+ },
2743
+ {
2744
+ "epoch": 1.6,
2745
+ "learning_rate": 2.0220556046142893e-05,
2746
+ "loss": 0.554,
2747
+ "step": 447
2748
+ },
2749
+ {
2750
+ "epoch": 1.61,
2751
+ "learning_rate": 1.9843301512912327e-05,
2752
+ "loss": 0.5875,
2753
+ "step": 448
2754
+ },
2755
+ {
2756
+ "epoch": 1.61,
2757
+ "learning_rate": 1.946921142888781e-05,
2758
+ "loss": 0.5284,
2759
+ "step": 449
2760
+ },
2761
+ {
2762
+ "epoch": 1.61,
2763
+ "learning_rate": 1.9098300562505266e-05,
2764
+ "loss": 0.5418,
2765
+ "step": 450
2766
+ },
2767
+ {
2768
+ "epoch": 1.62,
2769
+ "learning_rate": 1.8730583556690605e-05,
2770
+ "loss": 0.5146,
2771
+ "step": 451
2772
+ },
2773
+ {
2774
+ "epoch": 1.62,
2775
+ "learning_rate": 1.8366074928281607e-05,
2776
+ "loss": 0.5023,
2777
+ "step": 452
2778
+ },
2779
+ {
2780
+ "epoch": 1.63,
2781
+ "learning_rate": 1.8004789067454764e-05,
2782
+ "loss": 0.5407,
2783
+ "step": 453
2784
+ },
2785
+ {
2786
+ "epoch": 1.63,
2787
+ "learning_rate": 1.7646740237157256e-05,
2788
+ "loss": 0.5389,
2789
+ "step": 454
2790
+ },
2791
+ {
2792
+ "epoch": 1.63,
2793
+ "learning_rate": 1.7291942572543807e-05,
2794
+ "loss": 0.5224,
2795
+ "step": 455
2796
+ },
2797
+ {
2798
+ "epoch": 1.64,
2799
+ "learning_rate": 1.6940410080418723e-05,
2800
+ "loss": 0.503,
2801
+ "step": 456
2802
+ },
2803
+ {
2804
+ "epoch": 1.64,
2805
+ "learning_rate": 1.6592156638682886e-05,
2806
+ "loss": 0.5115,
2807
+ "step": 457
2808
+ },
2809
+ {
2810
+ "epoch": 1.64,
2811
+ "learning_rate": 1.6247195995785837e-05,
2812
+ "loss": 0.5031,
2813
+ "step": 458
2814
+ },
2815
+ {
2816
+ "epoch": 1.65,
2817
+ "learning_rate": 1.5905541770183096e-05,
2818
+ "loss": 0.5371,
2819
+ "step": 459
2820
+ },
2821
+ {
2822
+ "epoch": 1.65,
2823
+ "learning_rate": 1.5567207449798515e-05,
2824
+ "loss": 0.5685,
2825
+ "step": 460
2826
+ },
2827
+ {
2828
+ "epoch": 1.65,
2829
+ "learning_rate": 1.5232206391491699e-05,
2830
+ "loss": 0.5498,
2831
+ "step": 461
2832
+ },
2833
+ {
2834
+ "epoch": 1.66,
2835
+ "learning_rate": 1.4900551820530828e-05,
2836
+ "loss": 0.5875,
2837
+ "step": 462
2838
+ },
2839
+ {
2840
+ "epoch": 1.66,
2841
+ "learning_rate": 1.4572256830070497e-05,
2842
+ "loss": 0.499,
2843
+ "step": 463
2844
+ },
2845
+ {
2846
+ "epoch": 1.67,
2847
+ "learning_rate": 1.4247334380634792e-05,
2848
+ "loss": 0.5447,
2849
+ "step": 464
2850
+ },
2851
+ {
2852
+ "epoch": 1.67,
2853
+ "learning_rate": 1.3925797299605647e-05,
2854
+ "loss": 0.4996,
2855
+ "step": 465
2856
+ },
2857
+ {
2858
+ "epoch": 1.67,
2859
+ "learning_rate": 1.3607658280716473e-05,
2860
+ "loss": 0.5528,
2861
+ "step": 466
2862
+ },
2863
+ {
2864
+ "epoch": 1.68,
2865
+ "learning_rate": 1.3292929883550998e-05,
2866
+ "loss": 0.5656,
2867
+ "step": 467
2868
+ },
2869
+ {
2870
+ "epoch": 1.68,
2871
+ "learning_rate": 1.2981624533047432e-05,
2872
+ "loss": 0.5276,
2873
+ "step": 468
2874
+ },
2875
+ {
2876
+ "epoch": 1.68,
2877
+ "learning_rate": 1.2673754519008008e-05,
2878
+ "loss": 0.5376,
2879
+ "step": 469
2880
+ },
2881
+ {
2882
+ "epoch": 1.69,
2883
+ "learning_rate": 1.2369331995613665e-05,
2884
+ "loss": 0.525,
2885
+ "step": 470
2886
+ },
2887
+ {
2888
+ "epoch": 1.69,
2889
+ "learning_rate": 1.206836898094439e-05,
2890
+ "loss": 0.5625,
2891
+ "step": 471
2892
+ },
2893
+ {
2894
+ "epoch": 1.69,
2895
+ "learning_rate": 1.1770877356504683e-05,
2896
+ "loss": 0.512,
2897
+ "step": 472
2898
+ },
2899
+ {
2900
+ "epoch": 1.7,
2901
+ "learning_rate": 1.1476868866754486e-05,
2902
+ "loss": 0.5057,
2903
+ "step": 473
2904
+ },
2905
+ {
2906
+ "epoch": 1.7,
2907
+ "learning_rate": 1.1186355118645554e-05,
2908
+ "loss": 0.5255,
2909
+ "step": 474
2910
+ },
2911
+ {
2912
+ "epoch": 1.71,
2913
+ "learning_rate": 1.0899347581163221e-05,
2914
+ "loss": 0.5742,
2915
+ "step": 475
2916
+ },
2917
+ {
2918
+ "epoch": 1.71,
2919
+ "learning_rate": 1.0615857584873623e-05,
2920
+ "loss": 0.5368,
2921
+ "step": 476
2922
+ },
2923
+ {
2924
+ "epoch": 1.71,
2925
+ "learning_rate": 1.0335896321476413e-05,
2926
+ "loss": 0.5122,
2927
+ "step": 477
2928
+ },
2929
+ {
2930
+ "epoch": 1.72,
2931
+ "learning_rate": 1.0059474843362892e-05,
2932
+ "loss": 0.5292,
2933
+ "step": 478
2934
+ },
2935
+ {
2936
+ "epoch": 1.72,
2937
+ "learning_rate": 9.786604063179728e-06,
2938
+ "loss": 0.4903,
2939
+ "step": 479
2940
+ },
2941
+ {
2942
+ "epoch": 1.72,
2943
+ "learning_rate": 9.517294753398064e-06,
2944
+ "loss": 0.5403,
2945
+ "step": 480
2946
+ },
2947
+ {
2948
+ "epoch": 1.73,
2949
+ "learning_rate": 9.251557545888312e-06,
2950
+ "loss": 0.5243,
2951
+ "step": 481
2952
+ },
2953
+ {
2954
+ "epoch": 1.73,
2955
+ "learning_rate": 8.989402931500434e-06,
2956
+ "loss": 0.5179,
2957
+ "step": 482
2958
+ },
2959
+ {
2960
+ "epoch": 1.73,
2961
+ "learning_rate": 8.730841259649725e-06,
2962
+ "loss": 0.5154,
2963
+ "step": 483
2964
+ },
2965
+ {
2966
+ "epoch": 1.73,
2967
+ "eval_loss": 0.7377763986587524,
2968
+ "eval_runtime": 44.2257,
2969
+ "eval_samples_per_second": 25.551,
2970
+ "eval_steps_per_second": 1.605,
2971
+ "step": 483
2972
+ },
2973
+ {
2974
+ "epoch": 1.74,
2975
+ "learning_rate": 8.475882737908248e-06,
2976
+ "loss": 0.5378,
2977
+ "step": 484
2978
+ },
2979
+ {
2980
+ "epoch": 1.74,
2981
+ "learning_rate": 8.224537431601886e-06,
2982
+ "loss": 0.5589,
2983
+ "step": 485
2984
+ },
2985
+ {
2986
+ "epoch": 1.75,
2987
+ "learning_rate": 7.976815263412963e-06,
2988
+ "loss": 0.5938,
2989
+ "step": 486
2990
+ },
2991
+ {
2992
+ "epoch": 1.75,
2993
+ "learning_rate": 7.73272601298851e-06,
2994
+ "loss": 0.5143,
2995
+ "step": 487
2996
+ },
2997
+ {
2998
+ "epoch": 1.75,
2999
+ "learning_rate": 7.492279316554207e-06,
3000
+ "loss": 0.5098,
3001
+ "step": 488
3002
+ },
3003
+ {
3004
+ "epoch": 1.76,
3005
+ "learning_rate": 7.255484666533874e-06,
3006
+ "loss": 0.5097,
3007
+ "step": 489
3008
+ },
3009
+ {
3010
+ "epoch": 1.76,
3011
+ "learning_rate": 7.022351411174866e-06,
3012
+ "loss": 0.5456,
3013
+ "step": 490
3014
+ },
3015
+ {
3016
+ "epoch": 1.76,
3017
+ "learning_rate": 6.7928887541789055e-06,
3018
+ "loss": 0.539,
3019
+ "step": 491
3020
+ },
3021
+ {
3022
+ "epoch": 1.77,
3023
+ "learning_rate": 6.5671057543387985e-06,
3024
+ "loss": 0.471,
3025
+ "step": 492
3026
+ },
3027
+ {
3028
+ "epoch": 1.77,
3029
+ "learning_rate": 6.345011325180772e-06,
3030
+ "loss": 0.506,
3031
+ "step": 493
3032
+ },
3033
+ {
3034
+ "epoch": 1.77,
3035
+ "learning_rate": 6.126614234612593e-06,
3036
+ "loss": 0.5465,
3037
+ "step": 494
3038
+ },
3039
+ {
3040
+ "epoch": 1.78,
3041
+ "learning_rate": 5.911923104577455e-06,
3042
+ "loss": 0.4872,
3043
+ "step": 495
3044
+ },
3045
+ {
3046
+ "epoch": 1.78,
3047
+ "learning_rate": 5.700946410713548e-06,
3048
+ "loss": 0.5338,
3049
+ "step": 496
3050
+ },
3051
+ {
3052
+ "epoch": 1.79,
3053
+ "learning_rate": 5.49369248201953e-06,
3054
+ "loss": 0.5305,
3055
+ "step": 497
3056
+ },
3057
+ {
3058
+ "epoch": 1.79,
3059
+ "learning_rate": 5.290169500525577e-06,
3060
+ "loss": 0.4829,
3061
+ "step": 498
3062
+ },
3063
+ {
3064
+ "epoch": 1.79,
3065
+ "learning_rate": 5.0903855009705514e-06,
3066
+ "loss": 0.5006,
3067
+ "step": 499
3068
+ },
3069
+ {
3070
+ "epoch": 1.8,
3071
+ "learning_rate": 4.8943483704846475e-06,
3072
+ "loss": 0.5341,
3073
+ "step": 500
3074
+ },
3075
+ {
3076
+ "epoch": 1.8,
3077
+ "learning_rate": 4.702065848278126e-06,
3078
+ "loss": 0.5244,
3079
+ "step": 501
3080
+ },
3081
+ {
3082
+ "epoch": 1.8,
3083
+ "learning_rate": 4.513545525335705e-06,
3084
+ "loss": 0.5144,
3085
+ "step": 502
3086
+ },
3087
+ {
3088
+ "epoch": 1.81,
3089
+ "learning_rate": 4.328794844116946e-06,
3090
+ "loss": 0.5192,
3091
+ "step": 503
3092
+ },
3093
+ {
3094
+ "epoch": 1.81,
3095
+ "learning_rate": 4.147821098262405e-06,
3096
+ "loss": 0.4968,
3097
+ "step": 504
3098
+ },
3099
+ {
3100
+ "epoch": 1.81,
3101
+ "learning_rate": 3.970631432305694e-06,
3102
+ "loss": 0.5383,
3103
+ "step": 505
3104
+ },
3105
+ {
3106
+ "epoch": 1.82,
3107
+ "learning_rate": 3.797232841391407e-06,
3108
+ "loss": 0.5149,
3109
+ "step": 506
3110
+ },
3111
+ {
3112
+ "epoch": 1.82,
3113
+ "learning_rate": 3.627632170999029e-06,
3114
+ "loss": 0.5104,
3115
+ "step": 507
3116
+ },
3117
+ {
3118
+ "epoch": 1.83,
3119
+ "learning_rate": 3.461836116672612e-06,
3120
+ "loss": 0.5108,
3121
+ "step": 508
3122
+ },
3123
+ {
3124
+ "epoch": 1.83,
3125
+ "learning_rate": 3.2998512237565005e-06,
3126
+ "loss": 0.526,
3127
+ "step": 509
3128
+ },
3129
+ {
3130
+ "epoch": 1.83,
3131
+ "learning_rate": 3.1416838871368924e-06,
3132
+ "loss": 0.5518,
3133
+ "step": 510
3134
+ },
3135
+ {
3136
+ "epoch": 1.84,
3137
+ "learning_rate": 2.9873403509894203e-06,
3138
+ "loss": 0.5257,
3139
+ "step": 511
3140
+ },
3141
+ {
3142
+ "epoch": 1.84,
3143
+ "learning_rate": 2.836826708532603e-06,
3144
+ "loss": 0.5527,
3145
+ "step": 512
3146
+ },
3147
+ {
3148
+ "epoch": 1.84,
3149
+ "learning_rate": 2.690148901787337e-06,
3150
+ "loss": 0.5411,
3151
+ "step": 513
3152
+ },
3153
+ {
3154
+ "epoch": 1.85,
3155
+ "learning_rate": 2.5473127213422763e-06,
3156
+ "loss": 0.4848,
3157
+ "step": 514
3158
+ },
3159
+ {
3160
+ "epoch": 1.85,
3161
+ "learning_rate": 2.4083238061252567e-06,
3162
+ "loss": 0.5216,
3163
+ "step": 515
3164
+ },
3165
+ {
3166
+ "epoch": 1.85,
3167
+ "learning_rate": 2.273187643180652e-06,
3168
+ "loss": 0.526,
3169
+ "step": 516
3170
+ },
3171
+ {
3172
+ "epoch": 1.86,
3173
+ "learning_rate": 2.141909567452793e-06,
3174
+ "loss": 0.5351,
3175
+ "step": 517
3176
+ },
3177
+ {
3178
+ "epoch": 1.86,
3179
+ "learning_rate": 2.014494761575314e-06,
3180
+ "loss": 0.5087,
3181
+ "step": 518
3182
+ },
3183
+ {
3184
+ "epoch": 1.87,
3185
+ "learning_rate": 1.8909482556666024e-06,
3186
+ "loss": 0.4533,
3187
+ "step": 519
3188
+ },
3189
+ {
3190
+ "epoch": 1.87,
3191
+ "learning_rate": 1.771274927131139e-06,
3192
+ "loss": 0.5023,
3193
+ "step": 520
3194
+ },
3195
+ {
3196
+ "epoch": 1.87,
3197
+ "learning_rate": 1.6554795004670388e-06,
3198
+ "loss": 0.5154,
3199
+ "step": 521
3200
+ },
3201
+ {
3202
+ "epoch": 1.88,
3203
+ "learning_rate": 1.543566547079467e-06,
3204
+ "loss": 0.5292,
3205
+ "step": 522
3206
+ },
3207
+ {
3208
+ "epoch": 1.88,
3209
+ "learning_rate": 1.4355404851001952e-06,
3210
+ "loss": 0.5136,
3211
+ "step": 523
3212
+ },
3213
+ {
3214
+ "epoch": 1.88,
3215
+ "learning_rate": 1.3314055792131964e-06,
3216
+ "loss": 0.5698,
3217
+ "step": 524
3218
+ },
3219
+ {
3220
+ "epoch": 1.89,
3221
+ "learning_rate": 1.231165940486234e-06,
3222
+ "loss": 0.5363,
3223
+ "step": 525
3224
+ },
3225
+ {
3226
+ "epoch": 1.89,
3227
+ "learning_rate": 1.134825526208605e-06,
3228
+ "loss": 0.4928,
3229
+ "step": 526
3230
+ },
3231
+ {
3232
+ "epoch": 1.89,
3233
+ "learning_rate": 1.0423881397349068e-06,
3234
+ "loss": 0.5235,
3235
+ "step": 527
3236
+ },
3237
+ {
3238
+ "epoch": 1.9,
3239
+ "learning_rate": 9.538574303348813e-07,
3240
+ "loss": 0.5244,
3241
+ "step": 528
3242
+ },
3243
+ {
3244
+ "epoch": 1.9,
3245
+ "learning_rate": 8.692368930493521e-07,
3246
+ "loss": 0.5381,
3247
+ "step": 529
3248
+ },
3249
+ {
3250
+ "epoch": 1.91,
3251
+ "learning_rate": 7.885298685522235e-07,
3252
+ "loss": 0.5538,
3253
+ "step": 530
3254
+ },
3255
+ {
3256
+ "epoch": 1.91,
3257
+ "learning_rate": 7.117395430186414e-07,
3258
+ "loss": 0.5859,
3259
+ "step": 531
3260
+ },
3261
+ {
3262
+ "epoch": 1.91,
3263
+ "learning_rate": 6.388689479991605e-07,
3264
+ "loss": 0.541,
3265
+ "step": 532
3266
+ },
3267
+ {
3268
+ "epoch": 1.92,
3269
+ "learning_rate": 5.699209603001076e-07,
3270
+ "loss": 0.5342,
3271
+ "step": 533
3272
+ },
3273
+ {
3274
+ "epoch": 1.92,
3275
+ "learning_rate": 5.048983018699827e-07,
3276
+ "loss": 0.5255,
3277
+ "step": 534
3278
+ },
3279
+ {
3280
+ "epoch": 1.92,
3281
+ "learning_rate": 4.438035396920004e-07,
3282
+ "loss": 0.4907,
3283
+ "step": 535
3284
+ },
3285
+ {
3286
+ "epoch": 1.93,
3287
+ "learning_rate": 3.866390856827495e-07,
3288
+ "loss": 0.5084,
3289
+ "step": 536
3290
+ },
3291
+ {
3292
+ "epoch": 1.93,
3293
+ "learning_rate": 3.3340719659701313e-07,
3294
+ "loss": 0.5013,
3295
+ "step": 537
3296
+ },
3297
+ {
3298
+ "epoch": 1.93,
3299
+ "learning_rate": 2.841099739386066e-07,
3300
+ "loss": 0.5997,
3301
+ "step": 538
3302
+ },
3303
+ {
3304
+ "epoch": 1.94,
3305
+ "learning_rate": 2.387493638774774e-07,
3306
+ "loss": 0.5665,
3307
+ "step": 539
3308
+ },
3309
+ {
3310
+ "epoch": 1.94,
3311
+ "learning_rate": 1.973271571728441e-07,
3312
+ "loss": 0.5652,
3313
+ "step": 540
3314
+ },
3315
+ {
3316
+ "epoch": 1.95,
3317
+ "learning_rate": 1.598449891024978e-07,
3318
+ "loss": 0.5198,
3319
+ "step": 541
3320
+ },
3321
+ {
3322
+ "epoch": 1.95,
3323
+ "learning_rate": 1.2630433939825327e-07,
3324
+ "loss": 0.5551,
3325
+ "step": 542
3326
+ },
3327
+ {
3328
+ "epoch": 1.95,
3329
+ "learning_rate": 9.670653218752934e-08,
3330
+ "loss": 0.4814,
3331
+ "step": 543
3332
+ },
3333
+ {
3334
+ "epoch": 1.96,
3335
+ "learning_rate": 7.105273594107953e-08,
3336
+ "loss": 0.5496,
3337
+ "step": 544
3338
+ },
3339
+ {
3340
+ "epoch": 1.96,
3341
+ "learning_rate": 4.934396342684e-08,
3342
+ "loss": 0.5146,
3343
+ "step": 545
3344
+ },
3345
+ {
3346
+ "epoch": 1.96,
3347
+ "learning_rate": 3.1581071670006015e-08,
3348
+ "loss": 0.5035,
3349
+ "step": 546
3350
+ },
3351
+ {
3352
+ "epoch": 1.97,
3353
+ "learning_rate": 1.7764761919103477e-08,
3354
+ "loss": 0.4571,
3355
+ "step": 547
3356
+ },
3357
+ {
3358
+ "epoch": 1.97,
3359
+ "learning_rate": 7.895579618388827e-09,
3360
+ "loss": 0.4909,
3361
+ "step": 548
3362
+ },
3363
+ {
3364
+ "epoch": 1.97,
3365
+ "learning_rate": 1.973914386288467e-09,
3366
+ "loss": 0.5427,
3367
+ "step": 549
3368
+ },
3369
+ {
3370
+ "epoch": 1.98,
3371
+ "learning_rate": 0.0,
3372
+ "loss": 0.5284,
3373
+ "step": 550
3374
+ }
3375
+ ],
3376
+ "logging_steps": 1,
3377
+ "max_steps": 550,
3378
+ "num_input_tokens_seen": 0,
3379
+ "num_train_epochs": 2,
3380
+ "save_steps": 275,
3381
+ "total_flos": 5.533792253824205e+17,
3382
+ "train_batch_size": 16,
3383
+ "trial_name": null,
3384
+ "trial_params": null
3385
+ }
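The block above is the tail of the checkpoint's `trainer_state.json` (the log written by the HF `Trainer`): a `log_history` list with one entry per optimizer step (epoch, learning rate, training loss), periodic eval entries, and run-level metadata such as `max_steps` and `total_flos`. Below is a minimal sketch for pulling the loss curves back out of a local copy of the file; the path is an assumption about where the checkpoint was downloaded, everything else follows the keys visible in the file.

```python
# Minimal sketch: recover the loss curves from the trainer_state.json shown above.
# The local path is an assumption about where checkpoint-550 was downloaded.
import json

with open("checkpoint-550/trainer_state.json") as f:
    state = json.load(f)

train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"logged {len(train)} training steps, final training loss {train[-1][1]:.4f}")
for step, loss in evals:
    print(f"eval at step {step}: loss {loss:.4f}")  # e.g. 0.7378 at step 483
```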
checkpoint-550/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:30ae15e3615b945ca842f662b92f1098ed197416220d63f43b2d6384dbcab105
+ size 5304
config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "_name_or_path": "croissantllm/CroissantLLMBase",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": 5504,
+   "max_position_embeddings": 2048,
+   "model_type": "llama",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "num_key_value_heads": 16,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.38.0.dev0",
+   "use_cache": false,
+   "vocab_size": 32000
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "do_sample": true,
+   "eos_token_id": 2,
+   "transformers_version": "4.38.0.dev0"
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:59f190ff8dc877edb80ca05a84c25873448fd1a94fe369431f6f301e37c665e2
+ size 2690915926
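With the `config.json`, `generation_config.json`, and `pytorch_model.bin` weights added above, the checkpoint loads as a regular `LlamaForCausalLM`. A minimal sketch, assuming the files have been downloaded to a local directory; the directory name and prompt are placeholders, not part of this upload.

```python
# Minimal sketch: load the fine-tuned checkpoint with transformers.
# "path/to/this-checkpoint" is a placeholder for a local copy of the files
# added in this commit; bfloat16 matches the torch_dtype declared in config.json.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "path/to/this-checkpoint"  # placeholder local path
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.bfloat16)

prompt = "Question: Which planet is known as the Red Planet?\nAnswer:"
inputs = tokenizer(prompt, return_tensors="pt")
# generation_config.json sets do_sample=true, so outputs will vary between runs
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```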
runs/Feb08_14-38-32_ruche-gpu12.cluster/events.out.tfevents.1707399515.ruche-gpu12.cluster.26480.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9309542902f666e297064e336f6a1eaf08bc5e133a85aa2e4386f8d6b36fa8f3
+ size 6241
runs/Feb08_14-45-27_ruche-gpu12.cluster/events.out.tfevents.1707399931.ruche-gpu12.cluster.27626.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a2d7110338f9604fc09c841a6b5cedb7e9dd07f80b5e42006a4f2090cee5922e
+ size 12863
runs/Feb08_15-00-36_ruche-gpu12.cluster/events.out.tfevents.1707400839.ruche-gpu12.cluster.29774.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:56e84b4378a58fc205bdddc7234c75c42515c30907b4bfe5904de6dbe6da4656
+ size 16222
runs/Feb08_15-17-32_ruche-gpu12.cluster/events.out.tfevents.1707401856.ruche-gpu12.cluster.664.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f2b95356bfd3ed9ca23d358ed2fc506d96506e95d8c04c303e04aad329b01a35
+ size 93502
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "</s>",
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,849 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<pad>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<extra_id_0>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "5": {
46
+ "content": "<extra_id_1>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "6": {
54
+ "content": "<extra_id_2>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "7": {
62
+ "content": "<extra_id_3>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "8": {
70
+ "content": "<extra_id_4>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "9": {
78
+ "content": "<extra_id_5>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "10": {
86
+ "content": "<extra_id_6>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "11": {
94
+ "content": "<extra_id_7>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "12": {
102
+ "content": "<extra_id_8>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "13": {
110
+ "content": "<extra_id_9>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "14": {
118
+ "content": "<extra_id_10>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "15": {
126
+ "content": "<extra_id_11>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": true
132
+ },
133
+ "16": {
134
+ "content": "<extra_id_12>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": true
140
+ },
141
+ "17": {
142
+ "content": "<extra_id_13>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": true
148
+ },
149
+ "18": {
150
+ "content": "<extra_id_14>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": true
156
+ },
157
+ "19": {
158
+ "content": "<extra_id_15>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": true
164
+ },
165
+ "20": {
166
+ "content": "<extra_id_16>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": true
172
+ },
173
+ "21": {
174
+ "content": "<extra_id_17>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": true
180
+ },
181
+ "22": {
182
+ "content": "<extra_id_18>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "23": {
190
+ "content": "<extra_id_19>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "24": {
198
+ "content": "<extra_id_20>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "25": {
206
+ "content": "<extra_id_21>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "26": {
214
+ "content": "<extra_id_22>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "27": {
222
+ "content": "<extra_id_23>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "28": {
230
+ "content": "<extra_id_24>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "29": {
238
+ "content": "<extra_id_25>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "30": {
246
+ "content": "<extra_id_26>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "31": {
254
+ "content": "<extra_id_27>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "32": {
262
+ "content": "<extra_id_28>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "33": {
270
+ "content": "<extra_id_29>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "34": {
278
+ "content": "<extra_id_30>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "35": {
286
+ "content": "<extra_id_31>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "36": {
294
+ "content": "<extra_id_32>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "37": {
302
+ "content": "<extra_id_33>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "38": {
310
+ "content": "<extra_id_34>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "39": {
318
+ "content": "<extra_id_35>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "40": {
326
+ "content": "<extra_id_36>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "41": {
334
+ "content": "<extra_id_37>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "42": {
342
+ "content": "<extra_id_38>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "43": {
350
+ "content": "<extra_id_39>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "44": {
358
+ "content": "<extra_id_40>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "45": {
366
+ "content": "<extra_id_41>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": true
372
+ },
373
+ "46": {
374
+ "content": "<extra_id_42>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": true
380
+ },
381
+ "47": {
382
+ "content": "<extra_id_43>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": true
388
+ },
389
+ "48": {
390
+ "content": "<extra_id_44>",
391
+ "lstrip": false,
392
+ "normalized": false,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": true
396
+ },
397
+ "49": {
398
+ "content": "<extra_id_45>",
399
+ "lstrip": false,
400
+ "normalized": false,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": true
404
+ },
405
+ "50": {
406
+ "content": "<extra_id_46>",
407
+ "lstrip": false,
408
+ "normalized": false,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": true
412
+ },
413
+ "51": {
414
+ "content": "<extra_id_47>",
415
+ "lstrip": false,
416
+ "normalized": false,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": true
420
+ },
421
+ "52": {
422
+ "content": "<extra_id_48>",
423
+ "lstrip": false,
424
+ "normalized": false,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": true
428
+ },
429
+ "53": {
430
+ "content": "<extra_id_49>",
431
+ "lstrip": false,
432
+ "normalized": false,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": true
436
+ },
437
+ "54": {
438
+ "content": "<extra_id_50>",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": true
444
+ },
445
+ "55": {
446
+ "content": "<extra_id_51>",
447
+ "lstrip": false,
448
+ "normalized": false,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": true
452
+ },
453
+ "56": {
454
+ "content": "<extra_id_52>",
455
+ "lstrip": false,
456
+ "normalized": false,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": true
460
+ },
461
+ "57": {
462
+ "content": "<extra_id_53>",
463
+ "lstrip": false,
464
+ "normalized": false,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": true
468
+ },
469
+ "58": {
470
+ "content": "<extra_id_54>",
471
+ "lstrip": false,
472
+ "normalized": false,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": true
476
+ },
477
+ "59": {
478
+ "content": "<extra_id_55>",
479
+ "lstrip": false,
480
+ "normalized": false,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": true
484
+ },
485
+ "60": {
486
+ "content": "<extra_id_56>",
487
+ "lstrip": false,
488
+ "normalized": false,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": true
492
+ },
493
+ "61": {
494
+ "content": "<extra_id_57>",
495
+ "lstrip": false,
496
+ "normalized": false,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": true
500
+ },
501
+ "62": {
502
+ "content": "<extra_id_58>",
503
+ "lstrip": false,
504
+ "normalized": false,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": true
508
+ },
509
+ "63": {
510
+ "content": "<extra_id_59>",
511
+ "lstrip": false,
512
+ "normalized": false,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": true
516
+ },
517
+ "64": {
518
+ "content": "<extra_id_60>",
519
+ "lstrip": false,
520
+ "normalized": false,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": true
524
+ },
525
+ "65": {
526
+ "content": "<extra_id_61>",
527
+ "lstrip": false,
528
+ "normalized": false,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": true
532
+ },
533
+ "66": {
534
+ "content": "<extra_id_62>",
535
+ "lstrip": false,
536
+ "normalized": false,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": true
540
+ },
541
+ "67": {
542
+ "content": "<extra_id_63>",
543
+ "lstrip": false,
544
+ "normalized": false,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": true
548
+ },
549
+ "68": {
550
+ "content": "<extra_id_64>",
551
+ "lstrip": false,
552
+ "normalized": false,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": true
556
+ },
557
+ "69": {
558
+ "content": "<extra_id_65>",
559
+ "lstrip": false,
560
+ "normalized": false,
561
+ "rstrip": false,
562
+ "single_word": false,
563
+ "special": true
564
+ },
565
+ "70": {
566
+ "content": "<extra_id_66>",
567
+ "lstrip": false,
568
+ "normalized": false,
569
+ "rstrip": false,
570
+ "single_word": false,
571
+ "special": true
572
+ },
573
+ "71": {
574
+ "content": "<extra_id_67>",
575
+ "lstrip": false,
576
+ "normalized": false,
577
+ "rstrip": false,
578
+ "single_word": false,
579
+ "special": true
580
+ },
581
+ "72": {
582
+ "content": "<extra_id_68>",
583
+ "lstrip": false,
584
+ "normalized": false,
585
+ "rstrip": false,
586
+ "single_word": false,
587
+ "special": true
588
+ },
589
+ "73": {
590
+ "content": "<extra_id_69>",
591
+ "lstrip": false,
592
+ "normalized": false,
593
+ "rstrip": false,
594
+ "single_word": false,
595
+ "special": true
596
+ },
597
+ "74": {
598
+ "content": "<extra_id_70>",
599
+ "lstrip": false,
600
+ "normalized": false,
601
+ "rstrip": false,
602
+ "single_word": false,
603
+ "special": true
604
+ },
605
+ "75": {
606
+ "content": "<extra_id_71>",
607
+ "lstrip": false,
608
+ "normalized": false,
609
+ "rstrip": false,
610
+ "single_word": false,
611
+ "special": true
612
+ },
613
+ "76": {
614
+ "content": "<extra_id_72>",
615
+ "lstrip": false,
616
+ "normalized": false,
617
+ "rstrip": false,
618
+ "single_word": false,
619
+ "special": true
620
+ },
621
+ "77": {
622
+ "content": "<extra_id_73>",
623
+ "lstrip": false,
624
+ "normalized": false,
625
+ "rstrip": false,
626
+ "single_word": false,
627
+ "special": true
628
+ },
629
+ "78": {
630
+ "content": "<extra_id_74>",
631
+ "lstrip": false,
632
+ "normalized": false,
633
+ "rstrip": false,
634
+ "single_word": false,
635
+ "special": true
636
+ },
637
+ "79": {
638
+ "content": "<extra_id_75>",
639
+ "lstrip": false,
640
+ "normalized": false,
641
+ "rstrip": false,
642
+ "single_word": false,
643
+ "special": true
644
+ },
645
+ "80": {
646
+ "content": "<extra_id_76>",
647
+ "lstrip": false,
648
+ "normalized": false,
649
+ "rstrip": false,
650
+ "single_word": false,
651
+ "special": true
652
+ },
653
+ "81": {
654
+ "content": "<extra_id_77>",
655
+ "lstrip": false,
656
+ "normalized": false,
657
+ "rstrip": false,
658
+ "single_word": false,
659
+ "special": true
660
+ },
661
+ "82": {
662
+ "content": "<extra_id_78>",
663
+ "lstrip": false,
664
+ "normalized": false,
665
+ "rstrip": false,
666
+ "single_word": false,
667
+ "special": true
668
+ },
669
+ "83": {
670
+ "content": "<extra_id_79>",
671
+ "lstrip": false,
672
+ "normalized": false,
673
+ "rstrip": false,
674
+ "single_word": false,
675
+ "special": true
676
+ },
677
+ "84": {
678
+ "content": "<extra_id_80>",
679
+ "lstrip": false,
680
+ "normalized": false,
681
+ "rstrip": false,
682
+ "single_word": false,
683
+ "special": true
684
+ },
685
+ "85": {
686
+ "content": "<extra_id_81>",
687
+ "lstrip": false,
688
+ "normalized": false,
689
+ "rstrip": false,
690
+ "single_word": false,
691
+ "special": true
692
+ },
693
+ "86": {
694
+ "content": "<extra_id_82>",
695
+ "lstrip": false,
696
+ "normalized": false,
697
+ "rstrip": false,
698
+ "single_word": false,
699
+ "special": true
700
+ },
701
+ "87": {
702
+ "content": "<extra_id_83>",
703
+ "lstrip": false,
704
+ "normalized": false,
705
+ "rstrip": false,
706
+ "single_word": false,
707
+ "special": true
708
+ },
709
+ "88": {
710
+ "content": "<extra_id_84>",
711
+ "lstrip": false,
712
+ "normalized": false,
713
+ "rstrip": false,
714
+ "single_word": false,
715
+ "special": true
716
+ },
717
+ "89": {
718
+ "content": "<extra_id_85>",
719
+ "lstrip": false,
720
+ "normalized": false,
721
+ "rstrip": false,
722
+ "single_word": false,
723
+ "special": true
724
+ },
725
+ "90": {
726
+ "content": "<extra_id_86>",
727
+ "lstrip": false,
728
+ "normalized": false,
729
+ "rstrip": false,
730
+ "single_word": false,
731
+ "special": true
732
+ },
733
+ "91": {
734
+ "content": "<extra_id_87>",
735
+ "lstrip": false,
736
+ "normalized": false,
737
+ "rstrip": false,
738
+ "single_word": false,
739
+ "special": true
740
+ },
741
+ "92": {
742
+ "content": "<extra_id_88>",
743
+ "lstrip": false,
744
+ "normalized": false,
745
+ "rstrip": false,
746
+ "single_word": false,
747
+ "special": true
748
+ },
749
+ "93": {
750
+ "content": "<extra_id_89>",
751
+ "lstrip": false,
752
+ "normalized": false,
753
+ "rstrip": false,
754
+ "single_word": false,
755
+ "special": true
756
+ },
757
+ "94": {
758
+ "content": "<extra_id_90>",
759
+ "lstrip": false,
760
+ "normalized": false,
761
+ "rstrip": false,
762
+ "single_word": false,
763
+ "special": true
764
+ },
765
+ "95": {
766
+ "content": "<extra_id_91>",
767
+ "lstrip": false,
768
+ "normalized": false,
769
+ "rstrip": false,
770
+ "single_word": false,
771
+ "special": true
772
+ },
773
+ "96": {
774
+ "content": "<extra_id_92>",
775
+ "lstrip": false,
776
+ "normalized": false,
777
+ "rstrip": false,
778
+ "single_word": false,
779
+ "special": true
780
+ },
781
+ "97": {
782
+ "content": "<extra_id_93>",
783
+ "lstrip": false,
784
+ "normalized": false,
785
+ "rstrip": false,
786
+ "single_word": false,
787
+ "special": true
788
+ },
789
+ "98": {
790
+ "content": "<extra_id_94>",
791
+ "lstrip": false,
792
+ "normalized": false,
793
+ "rstrip": false,
794
+ "single_word": false,
795
+ "special": true
796
+ },
797
+ "99": {
798
+ "content": "<extra_id_95>",
799
+ "lstrip": false,
800
+ "normalized": false,
801
+ "rstrip": false,
802
+ "single_word": false,
803
+ "special": true
804
+ },
805
+ "100": {
806
+ "content": "<extra_id_96>",
807
+ "lstrip": false,
808
+ "normalized": false,
809
+ "rstrip": false,
810
+ "single_word": false,
811
+ "special": true
812
+ },
813
+ "101": {
814
+ "content": "<extra_id_97>",
815
+ "lstrip": false,
816
+ "normalized": false,
817
+ "rstrip": false,
818
+ "single_word": false,
819
+ "special": true
820
+ },
821
+ "102": {
822
+ "content": "<extra_id_98>",
823
+ "lstrip": false,
824
+ "normalized": false,
825
+ "rstrip": false,
826
+ "single_word": false,
827
+ "special": true
828
+ },
829
+ "103": {
830
+ "content": "<extra_id_99>",
831
+ "lstrip": false,
832
+ "normalized": false,
833
+ "rstrip": false,
834
+ "single_word": false,
835
+ "special": true
836
+ }
837
+ },
838
+ "additional_special_tokens": [],
839
+ "bos_token": "<s>",
840
+ "clean_up_tokenization_spaces": false,
841
+ "eos_token": "</s>",
842
+ "model_max_length": 1000000000000000019884624838656,
843
+ "pad_token": "</s>",
844
+ "tokenizer_class": "LlamaTokenizer",
845
+ "trust_remote_code": false,
846
+ "unk_token": "<unk>",
847
+ "use_default_system_prompt": true,
848
+ "use_fast": true
849
+ }
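A quick sanity check that a tokenizer loaded from these files behaves as `tokenizer_config.json` and `special_tokens_map.json` declare: BOS is prepended, EOS is not appended, `</s>` doubles as the pad token, and the `<extra_id_*>` sentinels keep their low ids. A minimal sketch; the local path is a placeholder.

```python
# Minimal sketch: verify the tokenizer settings declared above.
# "path/to/this-checkpoint" is a placeholder for a local copy of these files.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/this-checkpoint")

ids = tok("Bonjour le monde").input_ids
assert ids[0] == tok.bos_token_id             # add_bos_token: true
assert ids[-1] != tok.eos_token_id            # add_eos_token: false
assert tok.pad_token == "</s>" == tok.eos_token
assert tok.convert_tokens_to_ids("<extra_id_0>") == 4  # per added_tokens_decoder
print("special-token configuration matches the files above")
```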