shujatoor committed on
Commit 10e70c5
1 Parent(s): 91fe168

End of training

README.md ADDED
@@ -0,0 +1,71 @@
+ ---
+ license: mit
+ library_name: peft
+ tags:
+ - trl
+ - sft
+ - generated_from_trainer
+ base_model: microsoft/Phi-3-mini-4k-instruct
+ datasets:
+ - generator
+ model-index:
+ - name: checkpoint_update
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # checkpoint_update
+
+ This model is a fine-tuned version of [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) on the generator dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.9356
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0002
+ - train_batch_size: 1
+ - eval_batch_size: 1
+ - seed: 0
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.2
+ - num_epochs: 5
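A hedged sketch of how the hyperparameters listed above map onto a `transformers.TrainingArguments` object. The logging, evaluation, and save intervals are taken from the `trainer_state.json` further down; the `output_dir` and `optim` values are assumptions, and the dataset preparation, LoRA setup, and TRL `SFTTrainer` call are omitted.

```python
# Hedged sketch only: mirrors the reported hyperparameters, not the author's actual script.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="checkpoint_update",     # assumed
    learning_rate=2e-4,                 # learning_rate: 0.0002
    per_device_train_batch_size=1,      # train_batch_size: 1
    per_device_eval_batch_size=1,       # eval_batch_size: 1
    seed=0,
    optim="adamw_torch",                # Adam-style, betas=(0.9, 0.999), eps=1e-8 (Trainer defaults)
    lr_scheduler_type="cosine",
    warmup_ratio=0.2,
    num_train_epochs=5,
    logging_steps=20,                   # from trainer_state.json
    evaluation_strategy="steps",
    eval_steps=500,                     # from trainer_state.json
    save_steps=100,                     # from trainer_state.json
)
```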
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:------:|:----:|:---------------:|
+ | 1.1904 | 0.5618 | 500 | 1.0617 |
+ | 0.765 | 1.1236 | 1000 | 0.9442 |
+ | 0.782 | 1.6854 | 1500 | 0.8690 |
+ | 0.5591 | 2.2472 | 2000 | 0.8647 |
+ | 0.5669 | 2.8090 | 2500 | 0.8296 |
+ | 0.4205 | 3.3708 | 3000 | 0.8820 |
+ | 0.3812 | 3.9326 | 3500 | 0.8859 |
+ | 0.3323 | 4.4944 | 4000 | 0.9360 |
+
+
+ ### Framework versions
+
+ - PEFT 0.10.1.dev0
+ - Transformers 4.41.0.dev0
+ - Pytorch 2.2.1+cu121
+ - Datasets 2.19.0
+ - Tokenizers 0.19.1
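For readers of this card, a minimal, hedged example of loading the adapter from this commit on top of the base model for inference. The repo id `shujatoor/checkpoint_update` is an assumption inferred from the author and model name on this page, and the dtype/device settings are illustrative.

```python
# Hedged sketch: load the LoRA adapter on top of the Phi-3 base model.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "microsoft/Phi-3-mini-4k-instruct"
adapter_id = "shujatoor/checkpoint_update"   # assumed repo id; adjust if different

tokenizer = AutoTokenizer.from_pretrained(adapter_id)  # tokenizer files ship in this commit
base = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16, device_map="auto")
model = PeftModel.from_pretrained(base, adapter_id)

# The chat template (see tokenizer_config.json below) already appends <|assistant|> after the user turn.
messages = [{"role": "user", "content": "Summarize what a LoRA adapter is."}]
input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```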
adapter_config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "down_proj",
+ "qkv_proj",
+ "gate_up_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+ }
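The same adapter configuration expressed as a `peft.LoraConfig`, shown here only as a hedged convenience for readers; the values are taken directly from the JSON above.

```python
# Hedged sketch: peft.LoraConfig equivalent to adapter_config.json above.
from peft import LoraConfig

lora_config = LoraConfig(
    r=16,                       # LoRA rank
    lora_alpha=32,              # scaling factor (alpha / r = 2.0)
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # Phi-3 fuses the attention projections into qkv_proj and the MLP input into gate_up_proj,
    # so these four names cover every linear projection in each decoder block.
    target_modules=["o_proj", "down_proj", "qkv_proj", "gate_up_proj"],
)
```

With `r=16` on this base model, the adapter weights come to roughly 50 MB, consistent with the `adapter_model.safetensors` size recorded below.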
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:845b0545b68b4d44ef779d00ef1c452c0d2a81c3d9c81962460104323fad4cbd
+ size 50366280
added_tokens.json ADDED
@@ -0,0 +1,13 @@
+ {
+ "<|assistant|>": 32001,
+ "<|endoftext|>": 32000,
+ "<|end|>": 32007,
+ "<|placeholder1|>": 32002,
+ "<|placeholder2|>": 32003,
+ "<|placeholder3|>": 32004,
+ "<|placeholder4|>": 32005,
+ "<|placeholder5|>": 32008,
+ "<|placeholder6|>": 32009,
+ "<|system|>": 32006,
+ "<|user|>": 32010
+ }
all_results.json ADDED
@@ -0,0 +1,13 @@
+ {
+ "epoch": 5.0,
+ "eval_loss": 0.9356008172035217,
+ "eval_runtime": 207.2602,
+ "eval_samples": 506,
+ "eval_samples_per_second": 1.848,
+ "eval_steps_per_second": 1.848,
+ "total_flos": 1.024663401529344e+17,
+ "train_loss": 0.6764815047617708,
+ "train_runtime": 9686.8536,
+ "train_samples_per_second": 0.459,
+ "train_steps_per_second": 0.459
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 5.0,
+ "eval_loss": 0.9356008172035217,
+ "eval_runtime": 207.2602,
+ "eval_samples": 506,
+ "eval_samples_per_second": 1.848,
+ "eval_steps_per_second": 1.848
+ }
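Since `eval_loss` is a mean token-level cross-entropy in nats, the corresponding perplexity can be read off directly; a tiny hedged check:

```python
# Hedged sketch: perplexity implied by the reported eval_loss.
import math

eval_loss = 0.9356008172035217
print(round(math.exp(eval_loss), 3))  # ≈ 2.549
```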
runs/May16_13-51-04_imran-Precision-Tower-7910/events.out.tfevents.1715849498.imran-Precision-Tower-7910.3233262.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c38f5ee0cc01916c4adab2c740cffc492855f3a9cb1c9b5355e3a95836daa633
+ size 6400
runs/May16_13-54-13_imran-Precision-Tower-7910/events.out.tfevents.1715849706.imran-Precision-Tower-7910.3233683.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4e07ee582fc3c3adc049ededfc5123be9cf9037f1b99667a6f0b3dc88c3396b
+ size 9565
runs/May16_14-14-10_imran-Precision-Tower-7910/events.out.tfevents.1715850899.imran-Precision-Tower-7910.3234996.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ab38a24e98ccf5ff44589b747bd06023f692af602f1111d8b7fe3b35fed2f345
+ size 54498
runs/May16_14-14-10_imran-Precision-Tower-7910/events.out.tfevents.1715860793.imran-Precision-Tower-7910.3234996.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:277c4063efc10d9e723846f5becdddecca74a4705bd93f3adf159c8c2e71b8e3
+ size 359
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<unk>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,129 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false,
+ "special": false
+ },
+ "32000": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32001": {
+ "content": "<|assistant|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false,
+ "special": true
+ },
+ "32002": {
+ "content": "<|placeholder1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false,
+ "special": true
+ },
+ "32003": {
+ "content": "<|placeholder2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false,
+ "special": true
+ },
+ "32004": {
+ "content": "<|placeholder3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false,
+ "special": true
+ },
+ "32005": {
+ "content": "<|placeholder4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false,
+ "special": true
+ },
+ "32006": {
+ "content": "<|system|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false,
+ "special": true
+ },
+ "32007": {
+ "content": "<|end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false,
+ "special": true
+ },
+ "32008": {
+ "content": "<|placeholder5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false,
+ "special": true
+ },
+ "32009": {
+ "content": "<|placeholder6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false,
+ "special": true
+ },
+ "32010": {
+ "content": "<|user|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": true,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "model_max_length": 1024,
+ "pad_token": "<unk>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
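A hedged illustration of what the `chat_template` above produces. The repo id is again an assumption, and the expected string in the comment is derived by hand from the template.

```python
# Hedged sketch: render a short conversation with the chat template defined above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("shujatoor/checkpoint_update")  # assumed repo id

messages = [
    {"role": "user", "content": "What is PEFT?"},
    {"role": "assistant", "content": "Parameter-efficient fine-tuning."},
]
text = tokenizer.apply_chat_template(messages, tokenize=False)
print(text)
# Expected, per the template:
# <s><|user|>
# What is PEFT?<|end|>
# <|assistant|>
# Parameter-efficient fine-tuning.<|end|>
```

Note that `model_max_length` is set to 1024 here, shorter than the base model's 4k context, which presumably reflects the sequence length used for this fine-tune.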
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 5.0,
+ "total_flos": 1.024663401529344e+17,
+ "train_loss": 0.6764815047617708,
+ "train_runtime": 9686.8536,
+ "train_samples_per_second": 0.459,
+ "train_steps_per_second": 0.459
+ }
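A small hedged sanity check relating the throughput figures above to the step count reported in `trainer_state.json` below; the per-second values are rounded, so the product is only approximate.

```python
# Hedged sketch: rounded throughput times runtime should land near the 4450 optimizer steps below.
train_runtime = 9686.8536           # seconds
train_steps_per_second = 0.459      # rounded in the report
print(train_steps_per_second * train_runtime)  # ≈ 4446, i.e. ~4450 steps over 5 epochs
# samples/s equals steps/s here because train_batch_size is 1.
```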
trainer_state.json ADDED
@@ -0,0 +1,1660 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
+ "eval_steps": 500,
6
+ "global_step": 4450,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.02247191011235955,
13
+ "grad_norm": 3.921875,
14
+ "learning_rate": 4.49438202247191e-06,
15
+ "loss": 2.2098,
16
+ "step": 20
17
+ },
18
+ {
19
+ "epoch": 0.0449438202247191,
20
+ "grad_norm": 2.03125,
21
+ "learning_rate": 8.98876404494382e-06,
22
+ "loss": 2.1052,
23
+ "step": 40
24
+ },
25
+ {
26
+ "epoch": 0.06741573033707865,
27
+ "grad_norm": 1.21875,
28
+ "learning_rate": 1.348314606741573e-05,
29
+ "loss": 2.1605,
30
+ "step": 60
31
+ },
32
+ {
33
+ "epoch": 0.0898876404494382,
34
+ "grad_norm": 1.0234375,
35
+ "learning_rate": 1.797752808988764e-05,
36
+ "loss": 1.8331,
37
+ "step": 80
38
+ },
39
+ {
40
+ "epoch": 0.11235955056179775,
41
+ "grad_norm": 0.61328125,
42
+ "learning_rate": 2.2471910112359552e-05,
43
+ "loss": 1.8186,
44
+ "step": 100
45
+ },
46
+ {
47
+ "epoch": 0.1348314606741573,
48
+ "grad_norm": 0.703125,
49
+ "learning_rate": 2.696629213483146e-05,
50
+ "loss": 1.5533,
51
+ "step": 120
52
+ },
53
+ {
54
+ "epoch": 0.15730337078651685,
55
+ "grad_norm": 0.4921875,
56
+ "learning_rate": 3.1460674157303374e-05,
57
+ "loss": 1.5419,
58
+ "step": 140
59
+ },
60
+ {
61
+ "epoch": 0.1797752808988764,
62
+ "grad_norm": 0.470703125,
63
+ "learning_rate": 3.595505617977528e-05,
64
+ "loss": 1.3374,
65
+ "step": 160
66
+ },
67
+ {
68
+ "epoch": 0.20224719101123595,
69
+ "grad_norm": 0.52734375,
70
+ "learning_rate": 4.044943820224719e-05,
71
+ "loss": 1.3418,
72
+ "step": 180
73
+ },
74
+ {
75
+ "epoch": 0.2247191011235955,
76
+ "grad_norm": 0.59375,
77
+ "learning_rate": 4.4943820224719104e-05,
78
+ "loss": 1.3921,
79
+ "step": 200
80
+ },
81
+ {
82
+ "epoch": 0.24719101123595505,
83
+ "grad_norm": 0.67578125,
84
+ "learning_rate": 4.943820224719101e-05,
85
+ "loss": 1.1401,
86
+ "step": 220
87
+ },
88
+ {
89
+ "epoch": 0.2696629213483146,
90
+ "grad_norm": 0.72265625,
91
+ "learning_rate": 5.393258426966292e-05,
92
+ "loss": 1.2139,
93
+ "step": 240
94
+ },
95
+ {
96
+ "epoch": 0.29213483146067415,
97
+ "grad_norm": 0.49609375,
98
+ "learning_rate": 5.8426966292134835e-05,
99
+ "loss": 1.171,
100
+ "step": 260
101
+ },
102
+ {
103
+ "epoch": 0.3146067415730337,
104
+ "grad_norm": 0.94921875,
105
+ "learning_rate": 6.292134831460675e-05,
106
+ "loss": 1.1424,
107
+ "step": 280
108
+ },
109
+ {
110
+ "epoch": 0.33707865168539325,
111
+ "grad_norm": 0.8046875,
112
+ "learning_rate": 6.741573033707866e-05,
113
+ "loss": 1.2171,
114
+ "step": 300
115
+ },
116
+ {
117
+ "epoch": 0.3595505617977528,
118
+ "grad_norm": 1.09375,
119
+ "learning_rate": 7.191011235955056e-05,
120
+ "loss": 1.1575,
121
+ "step": 320
122
+ },
123
+ {
124
+ "epoch": 0.38202247191011235,
125
+ "grad_norm": 1.4453125,
126
+ "learning_rate": 7.640449438202247e-05,
127
+ "loss": 1.2041,
128
+ "step": 340
129
+ },
130
+ {
131
+ "epoch": 0.4044943820224719,
132
+ "grad_norm": 1.0,
133
+ "learning_rate": 8.089887640449438e-05,
134
+ "loss": 1.106,
135
+ "step": 360
136
+ },
137
+ {
138
+ "epoch": 0.42696629213483145,
139
+ "grad_norm": 1.0390625,
140
+ "learning_rate": 8.53932584269663e-05,
141
+ "loss": 1.0728,
142
+ "step": 380
143
+ },
144
+ {
145
+ "epoch": 0.449438202247191,
146
+ "grad_norm": 1.0859375,
147
+ "learning_rate": 8.988764044943821e-05,
148
+ "loss": 0.9622,
149
+ "step": 400
150
+ },
151
+ {
152
+ "epoch": 0.47191011235955055,
153
+ "grad_norm": 0.99609375,
154
+ "learning_rate": 9.438202247191012e-05,
155
+ "loss": 1.0835,
156
+ "step": 420
157
+ },
158
+ {
159
+ "epoch": 0.4943820224719101,
160
+ "grad_norm": 0.8828125,
161
+ "learning_rate": 9.887640449438202e-05,
162
+ "loss": 1.0557,
163
+ "step": 440
164
+ },
165
+ {
166
+ "epoch": 0.5168539325842697,
167
+ "grad_norm": 1.0859375,
168
+ "learning_rate": 0.00010337078651685395,
169
+ "loss": 1.1037,
170
+ "step": 460
171
+ },
172
+ {
173
+ "epoch": 0.5393258426966292,
174
+ "grad_norm": 1.3203125,
175
+ "learning_rate": 0.00010786516853932584,
176
+ "loss": 1.058,
177
+ "step": 480
178
+ },
179
+ {
180
+ "epoch": 0.5617977528089888,
181
+ "grad_norm": 1.03125,
182
+ "learning_rate": 0.00011235955056179777,
183
+ "loss": 1.1904,
184
+ "step": 500
185
+ },
186
+ {
187
+ "epoch": 0.5617977528089888,
188
+ "eval_loss": 1.0617437362670898,
189
+ "eval_runtime": 206.3616,
190
+ "eval_samples_per_second": 1.856,
191
+ "eval_steps_per_second": 1.856,
192
+ "step": 500
193
+ },
194
+ {
195
+ "epoch": 0.5842696629213483,
196
+ "grad_norm": 0.60546875,
197
+ "learning_rate": 0.00011685393258426967,
198
+ "loss": 1.1192,
199
+ "step": 520
200
+ },
201
+ {
202
+ "epoch": 0.6067415730337079,
203
+ "grad_norm": 0.96484375,
204
+ "learning_rate": 0.00012134831460674158,
205
+ "loss": 1.0356,
206
+ "step": 540
207
+ },
208
+ {
209
+ "epoch": 0.6292134831460674,
210
+ "grad_norm": 1.53125,
211
+ "learning_rate": 0.0001258426966292135,
212
+ "loss": 1.0108,
213
+ "step": 560
214
+ },
215
+ {
216
+ "epoch": 0.651685393258427,
217
+ "grad_norm": 1.453125,
218
+ "learning_rate": 0.0001303370786516854,
219
+ "loss": 0.8835,
220
+ "step": 580
221
+ },
222
+ {
223
+ "epoch": 0.6741573033707865,
224
+ "grad_norm": 0.84765625,
225
+ "learning_rate": 0.00013483146067415732,
226
+ "loss": 0.9925,
227
+ "step": 600
228
+ },
229
+ {
230
+ "epoch": 0.6966292134831461,
231
+ "grad_norm": 1.0625,
232
+ "learning_rate": 0.00013932584269662923,
233
+ "loss": 0.9548,
234
+ "step": 620
235
+ },
236
+ {
237
+ "epoch": 0.7191011235955056,
238
+ "grad_norm": 0.9375,
239
+ "learning_rate": 0.00014382022471910112,
240
+ "loss": 1.113,
241
+ "step": 640
242
+ },
243
+ {
244
+ "epoch": 0.7415730337078652,
245
+ "grad_norm": 0.8125,
246
+ "learning_rate": 0.00014831460674157306,
247
+ "loss": 0.9504,
248
+ "step": 660
249
+ },
250
+ {
251
+ "epoch": 0.7640449438202247,
252
+ "grad_norm": 0.6953125,
253
+ "learning_rate": 0.00015280898876404494,
254
+ "loss": 1.1266,
255
+ "step": 680
256
+ },
257
+ {
258
+ "epoch": 0.7865168539325843,
259
+ "grad_norm": 1.2890625,
260
+ "learning_rate": 0.00015730337078651685,
261
+ "loss": 1.0707,
262
+ "step": 700
263
+ },
264
+ {
265
+ "epoch": 0.8089887640449438,
266
+ "grad_norm": 0.88671875,
267
+ "learning_rate": 0.00016179775280898877,
268
+ "loss": 0.9222,
269
+ "step": 720
270
+ },
271
+ {
272
+ "epoch": 0.8314606741573034,
273
+ "grad_norm": 0.79296875,
274
+ "learning_rate": 0.00016629213483146068,
275
+ "loss": 1.1053,
276
+ "step": 740
277
+ },
278
+ {
279
+ "epoch": 0.8539325842696629,
280
+ "grad_norm": 0.83984375,
281
+ "learning_rate": 0.0001707865168539326,
282
+ "loss": 1.25,
283
+ "step": 760
284
+ },
285
+ {
286
+ "epoch": 0.8764044943820225,
287
+ "grad_norm": 0.46875,
288
+ "learning_rate": 0.0001752808988764045,
289
+ "loss": 0.933,
290
+ "step": 780
291
+ },
292
+ {
293
+ "epoch": 0.898876404494382,
294
+ "grad_norm": 0.66015625,
295
+ "learning_rate": 0.00017977528089887642,
296
+ "loss": 0.885,
297
+ "step": 800
298
+ },
299
+ {
300
+ "epoch": 0.9213483146067416,
301
+ "grad_norm": 1.4140625,
302
+ "learning_rate": 0.00018426966292134833,
303
+ "loss": 0.9786,
304
+ "step": 820
305
+ },
306
+ {
307
+ "epoch": 0.9438202247191011,
308
+ "grad_norm": 0.96875,
309
+ "learning_rate": 0.00018876404494382024,
310
+ "loss": 0.9101,
311
+ "step": 840
312
+ },
313
+ {
314
+ "epoch": 0.9662921348314607,
315
+ "grad_norm": 0.80859375,
316
+ "learning_rate": 0.00019325842696629215,
317
+ "loss": 0.9005,
318
+ "step": 860
319
+ },
320
+ {
321
+ "epoch": 0.9887640449438202,
322
+ "grad_norm": 0.99609375,
323
+ "learning_rate": 0.00019775280898876404,
324
+ "loss": 0.9485,
325
+ "step": 880
326
+ },
327
+ {
328
+ "epoch": 1.0112359550561798,
329
+ "grad_norm": 0.9921875,
330
+ "learning_rate": 0.00019999610626011892,
331
+ "loss": 1.0071,
332
+ "step": 900
333
+ },
334
+ {
335
+ "epoch": 1.0337078651685394,
336
+ "grad_norm": 0.84375,
337
+ "learning_rate": 0.00019996495816039186,
338
+ "loss": 0.9138,
339
+ "step": 920
340
+ },
341
+ {
342
+ "epoch": 1.0561797752808988,
343
+ "grad_norm": 0.69140625,
344
+ "learning_rate": 0.00019990267166335664,
345
+ "loss": 0.7752,
346
+ "step": 940
347
+ },
348
+ {
349
+ "epoch": 1.0786516853932584,
350
+ "grad_norm": 0.92578125,
351
+ "learning_rate": 0.00019980926617082901,
352
+ "loss": 0.9331,
353
+ "step": 960
354
+ },
355
+ {
356
+ "epoch": 1.101123595505618,
357
+ "grad_norm": 1.046875,
358
+ "learning_rate": 0.00019968477077797781,
359
+ "loss": 1.0037,
360
+ "step": 980
361
+ },
362
+ {
363
+ "epoch": 1.1235955056179776,
364
+ "grad_norm": 0.56640625,
365
+ "learning_rate": 0.00019952922426426207,
366
+ "loss": 0.765,
367
+ "step": 1000
368
+ },
369
+ {
370
+ "epoch": 1.1235955056179776,
371
+ "eval_loss": 0.944207489490509,
372
+ "eval_runtime": 206.3524,
373
+ "eval_samples_per_second": 1.856,
374
+ "eval_steps_per_second": 1.856,
375
+ "step": 1000
376
+ },
377
+ {
378
+ "epoch": 1.146067415730337,
379
+ "grad_norm": 1.09375,
380
+ "learning_rate": 0.00019934267508135164,
381
+ "loss": 0.861,
382
+ "step": 1020
383
+ },
384
+ {
385
+ "epoch": 1.1685393258426966,
386
+ "grad_norm": 1.25,
387
+ "learning_rate": 0.00019912518133803465,
388
+ "loss": 0.8251,
389
+ "step": 1040
390
+ },
391
+ {
392
+ "epoch": 1.1910112359550562,
393
+ "grad_norm": 0.609375,
394
+ "learning_rate": 0.00019887681078211707,
395
+ "loss": 0.9779,
396
+ "step": 1060
397
+ },
398
+ {
399
+ "epoch": 1.2134831460674158,
400
+ "grad_norm": 0.765625,
401
+ "learning_rate": 0.00019859764077931978,
402
+ "loss": 0.8112,
403
+ "step": 1080
404
+ },
405
+ {
406
+ "epoch": 1.2359550561797752,
407
+ "grad_norm": 0.890625,
408
+ "learning_rate": 0.00019828775828917964,
409
+ "loss": 0.9084,
410
+ "step": 1100
411
+ },
412
+ {
413
+ "epoch": 1.2584269662921348,
414
+ "grad_norm": 1.15625,
415
+ "learning_rate": 0.00019794725983796218,
416
+ "loss": 0.8429,
417
+ "step": 1120
418
+ },
419
+ {
420
+ "epoch": 1.2808988764044944,
421
+ "grad_norm": 0.7890625,
422
+ "learning_rate": 0.00019757625148859441,
423
+ "loss": 0.8029,
424
+ "step": 1140
425
+ },
426
+ {
427
+ "epoch": 1.303370786516854,
428
+ "grad_norm": 1.0078125,
429
+ "learning_rate": 0.00019717484880762685,
430
+ "loss": 0.9478,
431
+ "step": 1160
432
+ },
433
+ {
434
+ "epoch": 1.3258426966292136,
435
+ "grad_norm": 0.9765625,
436
+ "learning_rate": 0.00019674317682923532,
437
+ "loss": 0.6985,
438
+ "step": 1180
439
+ },
440
+ {
441
+ "epoch": 1.348314606741573,
442
+ "grad_norm": 0.9609375,
443
+ "learning_rate": 0.00019628137001627383,
444
+ "loss": 0.8653,
445
+ "step": 1200
446
+ },
447
+ {
448
+ "epoch": 1.3707865168539326,
449
+ "grad_norm": 0.82421875,
450
+ "learning_rate": 0.00019578957221839014,
451
+ "loss": 0.891,
452
+ "step": 1220
453
+ },
454
+ {
455
+ "epoch": 1.3932584269662922,
456
+ "grad_norm": 1.0390625,
457
+ "learning_rate": 0.00019526793662721768,
458
+ "loss": 0.861,
459
+ "step": 1240
460
+ },
461
+ {
462
+ "epoch": 1.4157303370786516,
463
+ "grad_norm": 0.62890625,
464
+ "learning_rate": 0.00019471662572865736,
465
+ "loss": 0.7591,
466
+ "step": 1260
467
+ },
468
+ {
469
+ "epoch": 1.4382022471910112,
470
+ "grad_norm": 0.8828125,
471
+ "learning_rate": 0.00019413581125226438,
472
+ "loss": 0.7109,
473
+ "step": 1280
474
+ },
475
+ {
476
+ "epoch": 1.4606741573033708,
477
+ "grad_norm": 0.6875,
478
+ "learning_rate": 0.00019352567411775565,
479
+ "loss": 0.8947,
480
+ "step": 1300
481
+ },
482
+ {
483
+ "epoch": 1.4831460674157304,
484
+ "grad_norm": 0.88671875,
485
+ "learning_rate": 0.00019288640437865445,
486
+ "loss": 0.8514,
487
+ "step": 1320
488
+ },
489
+ {
490
+ "epoch": 1.50561797752809,
491
+ "grad_norm": 0.69140625,
492
+ "learning_rate": 0.0001922182011630902,
493
+ "loss": 0.7379,
494
+ "step": 1340
495
+ },
496
+ {
497
+ "epoch": 1.5280898876404494,
498
+ "grad_norm": 0.63671875,
499
+ "learning_rate": 0.00019152127261177126,
500
+ "loss": 0.6778,
501
+ "step": 1360
502
+ },
503
+ {
504
+ "epoch": 1.550561797752809,
505
+ "grad_norm": 0.65234375,
506
+ "learning_rate": 0.00019079583581315076,
507
+ "loss": 0.6591,
508
+ "step": 1380
509
+ },
510
+ {
511
+ "epoch": 1.5730337078651684,
512
+ "grad_norm": 0.96484375,
513
+ "learning_rate": 0.0001900421167358048,
514
+ "loss": 0.8635,
515
+ "step": 1400
516
+ },
517
+ {
518
+ "epoch": 1.595505617977528,
519
+ "grad_norm": 0.8125,
520
+ "learning_rate": 0.00018926035015804488,
521
+ "loss": 0.924,
522
+ "step": 1420
523
+ },
524
+ {
525
+ "epoch": 1.6179775280898876,
526
+ "grad_norm": 1.1953125,
527
+ "learning_rate": 0.00018845077959478613,
528
+ "loss": 0.8554,
529
+ "step": 1440
530
+ },
531
+ {
532
+ "epoch": 1.6404494382022472,
533
+ "grad_norm": 0.6171875,
534
+ "learning_rate": 0.00018761365722169403,
535
+ "loss": 0.9471,
536
+ "step": 1460
537
+ },
538
+ {
539
+ "epoch": 1.6629213483146068,
540
+ "grad_norm": 0.64453125,
541
+ "learning_rate": 0.00018674924379663338,
542
+ "loss": 0.9187,
543
+ "step": 1480
544
+ },
545
+ {
546
+ "epoch": 1.6853932584269664,
547
+ "grad_norm": 0.56640625,
548
+ "learning_rate": 0.00018585780857844418,
549
+ "loss": 0.782,
550
+ "step": 1500
551
+ },
552
+ {
553
+ "epoch": 1.6853932584269664,
554
+ "eval_loss": 0.8689672350883484,
555
+ "eval_runtime": 206.6921,
556
+ "eval_samples_per_second": 1.853,
557
+ "eval_steps_per_second": 1.853,
558
+ "step": 1500
559
+ },
560
+ {
561
+ "epoch": 1.7078651685393258,
562
+ "grad_norm": 0.890625,
563
+ "learning_rate": 0.00018493962924306912,
564
+ "loss": 0.8983,
565
+ "step": 1520
566
+ },
567
+ {
568
+ "epoch": 1.7303370786516854,
569
+ "grad_norm": 0.83984375,
570
+ "learning_rate": 0.0001839949917970596,
571
+ "loss": 0.5218,
572
+ "step": 1540
573
+ },
574
+ {
575
+ "epoch": 1.7528089887640448,
576
+ "grad_norm": 0.921875,
577
+ "learning_rate": 0.00018302419048848667,
578
+ "loss": 0.6711,
579
+ "step": 1560
580
+ },
581
+ {
582
+ "epoch": 1.7752808988764044,
583
+ "grad_norm": 0.7578125,
584
+ "learning_rate": 0.0001820275277152846,
585
+ "loss": 0.7932,
586
+ "step": 1580
587
+ },
588
+ {
589
+ "epoch": 1.797752808988764,
590
+ "grad_norm": 0.97265625,
591
+ "learning_rate": 0.00018100531393105623,
592
+ "loss": 0.7181,
593
+ "step": 1600
594
+ },
595
+ {
596
+ "epoch": 1.8202247191011236,
597
+ "grad_norm": 0.95703125,
598
+ "learning_rate": 0.00017995786754836863,
599
+ "loss": 0.8525,
600
+ "step": 1620
601
+ },
602
+ {
603
+ "epoch": 1.8426966292134832,
604
+ "grad_norm": 0.6484375,
605
+ "learning_rate": 0.00017888551483956987,
606
+ "loss": 0.6968,
607
+ "step": 1640
608
+ },
609
+ {
610
+ "epoch": 1.8651685393258428,
611
+ "grad_norm": 0.78515625,
612
+ "learning_rate": 0.00017778858983515743,
613
+ "loss": 0.902,
614
+ "step": 1660
615
+ },
616
+ {
617
+ "epoch": 1.8876404494382022,
618
+ "grad_norm": 0.875,
619
+ "learning_rate": 0.00017666743421972987,
620
+ "loss": 0.954,
621
+ "step": 1680
622
+ },
623
+ {
624
+ "epoch": 1.9101123595505618,
625
+ "grad_norm": 1.0546875,
626
+ "learning_rate": 0.0001755223972255546,
627
+ "loss": 0.791,
628
+ "step": 1700
629
+ },
630
+ {
631
+ "epoch": 1.9325842696629212,
632
+ "grad_norm": 0.765625,
633
+ "learning_rate": 0.00017435383552378428,
634
+ "loss": 0.77,
635
+ "step": 1720
636
+ },
637
+ {
638
+ "epoch": 1.9550561797752808,
639
+ "grad_norm": 1.25,
640
+ "learning_rate": 0.0001731621131133564,
641
+ "loss": 0.6294,
642
+ "step": 1740
643
+ },
644
+ {
645
+ "epoch": 1.9775280898876404,
646
+ "grad_norm": 0.70703125,
647
+ "learning_rate": 0.00017194760120760986,
648
+ "loss": 0.7982,
649
+ "step": 1760
650
+ },
651
+ {
652
+ "epoch": 2.0,
653
+ "grad_norm": 0.67578125,
654
+ "learning_rate": 0.00017071067811865476,
655
+ "loss": 0.7643,
656
+ "step": 1780
657
+ },
658
+ {
659
+ "epoch": 2.0224719101123596,
660
+ "grad_norm": 1.0234375,
661
+ "learning_rate": 0.0001694517291395307,
662
+ "loss": 0.5279,
663
+ "step": 1800
664
+ },
665
+ {
666
+ "epoch": 2.044943820224719,
667
+ "grad_norm": 1.078125,
668
+ "learning_rate": 0.00016817114642419067,
669
+ "loss": 0.6667,
670
+ "step": 1820
671
+ },
672
+ {
673
+ "epoch": 2.067415730337079,
674
+ "grad_norm": 0.9375,
675
+ "learning_rate": 0.00016686932886534781,
676
+ "loss": 0.6427,
677
+ "step": 1840
678
+ },
679
+ {
680
+ "epoch": 2.0898876404494384,
681
+ "grad_norm": 0.7890625,
682
+ "learning_rate": 0.00016554668197022295,
683
+ "loss": 0.633,
684
+ "step": 1860
685
+ },
686
+ {
687
+ "epoch": 2.1123595505617976,
688
+ "grad_norm": 0.7421875,
689
+ "learning_rate": 0.00016420361773423204,
690
+ "loss": 0.5623,
691
+ "step": 1880
692
+ },
693
+ {
694
+ "epoch": 2.134831460674157,
695
+ "grad_norm": 1.078125,
696
+ "learning_rate": 0.00016284055451265246,
697
+ "loss": 0.6311,
698
+ "step": 1900
699
+ },
700
+ {
701
+ "epoch": 2.157303370786517,
702
+ "grad_norm": 1.25,
703
+ "learning_rate": 0.00016145791689030795,
704
+ "loss": 0.7469,
705
+ "step": 1920
706
+ },
707
+ {
708
+ "epoch": 2.1797752808988764,
709
+ "grad_norm": 0.68359375,
710
+ "learning_rate": 0.0001600561355493137,
711
+ "loss": 0.7196,
712
+ "step": 1940
713
+ },
714
+ {
715
+ "epoch": 2.202247191011236,
716
+ "grad_norm": 0.95703125,
717
+ "learning_rate": 0.0001586356471349215,
718
+ "loss": 0.6328,
719
+ "step": 1960
720
+ },
721
+ {
722
+ "epoch": 2.2247191011235956,
723
+ "grad_norm": 0.9765625,
724
+ "learning_rate": 0.00015719689411950808,
725
+ "loss": 0.6349,
726
+ "step": 1980
727
+ },
728
+ {
729
+ "epoch": 2.247191011235955,
730
+ "grad_norm": 1.140625,
731
+ "learning_rate": 0.00015574032466474775,
732
+ "loss": 0.5591,
733
+ "step": 2000
734
+ },
735
+ {
736
+ "epoch": 2.247191011235955,
737
+ "eval_loss": 0.8647096753120422,
738
+ "eval_runtime": 206.6878,
739
+ "eval_samples_per_second": 1.853,
740
+ "eval_steps_per_second": 1.853,
741
+ "step": 2000
742
+ },
743
+ {
744
+ "epoch": 2.2696629213483144,
745
+ "grad_norm": 1.1328125,
746
+ "learning_rate": 0.00015426639248201313,
747
+ "loss": 0.5206,
748
+ "step": 2020
749
+ },
750
+ {
751
+ "epoch": 2.292134831460674,
752
+ "grad_norm": 0.8125,
753
+ "learning_rate": 0.0001527755566910474,
754
+ "loss": 0.7186,
755
+ "step": 2040
756
+ },
757
+ {
758
+ "epoch": 2.3146067415730336,
759
+ "grad_norm": 0.97265625,
760
+ "learning_rate": 0.00015126828167695146,
761
+ "loss": 0.6533,
762
+ "step": 2060
763
+ },
764
+ {
765
+ "epoch": 2.337078651685393,
766
+ "grad_norm": 1.4609375,
767
+ "learning_rate": 0.0001497450369455312,
768
+ "loss": 0.6324,
769
+ "step": 2080
770
+ },
771
+ {
772
+ "epoch": 2.359550561797753,
773
+ "grad_norm": 0.875,
774
+ "learning_rate": 0.00014820629697704965,
775
+ "loss": 0.5276,
776
+ "step": 2100
777
+ },
778
+ {
779
+ "epoch": 2.3820224719101124,
780
+ "grad_norm": 0.74609375,
781
+ "learning_rate": 0.00014665254107842964,
782
+ "loss": 0.612,
783
+ "step": 2120
784
+ },
785
+ {
786
+ "epoch": 2.404494382022472,
787
+ "grad_norm": 0.92578125,
788
+ "learning_rate": 0.00014508425323395317,
789
+ "loss": 0.614,
790
+ "step": 2140
791
+ },
792
+ {
793
+ "epoch": 2.4269662921348316,
794
+ "grad_norm": 0.9921875,
795
+ "learning_rate": 0.0001435019219545034,
796
+ "loss": 0.4988,
797
+ "step": 2160
798
+ },
799
+ {
800
+ "epoch": 2.449438202247191,
801
+ "grad_norm": 0.8515625,
802
+ "learning_rate": 0.00014190604012539684,
803
+ "loss": 0.6777,
804
+ "step": 2180
805
+ },
806
+ {
807
+ "epoch": 2.4719101123595504,
808
+ "grad_norm": 0.96875,
809
+ "learning_rate": 0.00014029710485285324,
810
+ "loss": 0.662,
811
+ "step": 2200
812
+ },
813
+ {
814
+ "epoch": 2.49438202247191,
815
+ "grad_norm": 0.74609375,
816
+ "learning_rate": 0.00013867561730915016,
817
+ "loss": 0.6087,
818
+ "step": 2220
819
+ },
820
+ {
821
+ "epoch": 2.5168539325842696,
822
+ "grad_norm": 0.875,
823
+ "learning_rate": 0.0001370420825765114,
824
+ "loss": 0.56,
825
+ "step": 2240
826
+ },
827
+ {
828
+ "epoch": 2.539325842696629,
829
+ "grad_norm": 1.2421875,
830
+ "learning_rate": 0.00013539700948977717,
831
+ "loss": 0.572,
832
+ "step": 2260
833
+ },
834
+ {
835
+ "epoch": 2.561797752808989,
836
+ "grad_norm": 1.421875,
837
+ "learning_rate": 0.00013374091047790585,
838
+ "loss": 0.7334,
839
+ "step": 2280
840
+ },
841
+ {
842
+ "epoch": 2.5842696629213484,
843
+ "grad_norm": 1.3125,
844
+ "learning_rate": 0.00013207430140435556,
845
+ "loss": 0.5377,
846
+ "step": 2300
847
+ },
848
+ {
849
+ "epoch": 2.606741573033708,
850
+ "grad_norm": 0.86328125,
851
+ "learning_rate": 0.00013039770140639654,
852
+ "loss": 0.6306,
853
+ "step": 2320
854
+ },
855
+ {
856
+ "epoch": 2.629213483146067,
857
+ "grad_norm": 1.3046875,
858
+ "learning_rate": 0.00012871163273340307,
859
+ "loss": 0.582,
860
+ "step": 2340
861
+ },
862
+ {
863
+ "epoch": 2.6516853932584272,
864
+ "grad_norm": 0.83203125,
865
+ "learning_rate": 0.00012701662058417688,
866
+ "loss": 0.6326,
867
+ "step": 2360
868
+ },
869
+ {
870
+ "epoch": 2.6741573033707864,
871
+ "grad_norm": 1.1640625,
872
+ "learning_rate": 0.00012531319294335086,
873
+ "loss": 0.6907,
874
+ "step": 2380
875
+ },
876
+ {
877
+ "epoch": 2.696629213483146,
878
+ "grad_norm": 1.2734375,
879
+ "learning_rate": 0.00012360188041692582,
880
+ "loss": 0.656,
881
+ "step": 2400
882
+ },
883
+ {
884
+ "epoch": 2.7191011235955056,
885
+ "grad_norm": 1.234375,
886
+ "learning_rate": 0.00012188321606699016,
887
+ "loss": 0.5817,
888
+ "step": 2420
889
+ },
890
+ {
891
+ "epoch": 2.741573033707865,
892
+ "grad_norm": 0.97265625,
893
+ "learning_rate": 0.00012015773524567479,
894
+ "loss": 0.5046,
895
+ "step": 2440
896
+ },
897
+ {
898
+ "epoch": 2.764044943820225,
899
+ "grad_norm": 1.3515625,
900
+ "learning_rate": 0.00011842597542839462,
901
+ "loss": 0.6293,
902
+ "step": 2460
903
+ },
904
+ {
905
+ "epoch": 2.7865168539325844,
906
+ "grad_norm": 1.1796875,
907
+ "learning_rate": 0.00011668847604642861,
908
+ "loss": 0.6067,
909
+ "step": 2480
910
+ },
911
+ {
912
+ "epoch": 2.808988764044944,
913
+ "grad_norm": 1.1484375,
914
+ "learning_rate": 0.00011494577831889067,
915
+ "loss": 0.5669,
916
+ "step": 2500
917
+ },
918
+ {
919
+ "epoch": 2.808988764044944,
920
+ "eval_loss": 0.8295581340789795,
921
+ "eval_runtime": 206.704,
922
+ "eval_samples_per_second": 1.853,
923
+ "eval_steps_per_second": 1.853,
924
+ "step": 2500
925
+ },
926
+ {
927
+ "epoch": 2.831460674157303,
928
+ "grad_norm": 1.1484375,
929
+ "learning_rate": 0.00011319842508414365,
930
+ "loss": 0.5429,
931
+ "step": 2520
932
+ },
933
+ {
934
+ "epoch": 2.853932584269663,
935
+ "grad_norm": 1.03125,
936
+ "learning_rate": 0.00011144696063070883,
937
+ "loss": 0.5481,
938
+ "step": 2540
939
+ },
940
+ {
941
+ "epoch": 2.8764044943820224,
942
+ "grad_norm": 0.78515625,
943
+ "learning_rate": 0.00010969193052772396,
944
+ "loss": 0.5308,
945
+ "step": 2560
946
+ },
947
+ {
948
+ "epoch": 2.898876404494382,
949
+ "grad_norm": 0.703125,
950
+ "learning_rate": 0.00010793388145500198,
951
+ "loss": 0.4527,
952
+ "step": 2580
953
+ },
954
+ {
955
+ "epoch": 2.9213483146067416,
956
+ "grad_norm": 1.390625,
957
+ "learning_rate": 0.00010617336103274424,
958
+ "loss": 0.5333,
959
+ "step": 2600
960
+ },
961
+ {
962
+ "epoch": 2.943820224719101,
963
+ "grad_norm": 0.9296875,
964
+ "learning_rate": 0.00010441091765096047,
965
+ "loss": 0.5886,
966
+ "step": 2620
967
+ },
968
+ {
969
+ "epoch": 2.966292134831461,
970
+ "grad_norm": 1.296875,
971
+ "learning_rate": 0.0001026471002986491,
972
+ "loss": 0.626,
973
+ "step": 2640
974
+ },
975
+ {
976
+ "epoch": 2.98876404494382,
977
+ "grad_norm": 0.953125,
978
+ "learning_rate": 0.00010088245839279082,
979
+ "loss": 0.6703,
980
+ "step": 2660
981
+ },
982
+ {
983
+ "epoch": 3.0112359550561796,
984
+ "grad_norm": 0.64453125,
985
+ "learning_rate": 9.911754160720923e-05,
986
+ "loss": 0.4819,
987
+ "step": 2680
988
+ },
989
+ {
990
+ "epoch": 3.033707865168539,
991
+ "grad_norm": 0.71875,
992
+ "learning_rate": 9.735289970135095e-05,
993
+ "loss": 0.4379,
994
+ "step": 2700
995
+ },
996
+ {
997
+ "epoch": 3.056179775280899,
998
+ "grad_norm": 1.7734375,
999
+ "learning_rate": 9.558908234903954e-05,
1000
+ "loss": 0.3811,
1001
+ "step": 2720
1002
+ },
1003
+ {
1004
+ "epoch": 3.0786516853932584,
1005
+ "grad_norm": 0.6953125,
1006
+ "learning_rate": 9.382663896725578e-05,
1007
+ "loss": 0.3855,
1008
+ "step": 2740
1009
+ },
1010
+ {
1011
+ "epoch": 3.101123595505618,
1012
+ "grad_norm": 1.4140625,
1013
+ "learning_rate": 9.206611854499805e-05,
1014
+ "loss": 0.4749,
1015
+ "step": 2760
1016
+ },
1017
+ {
1018
+ "epoch": 3.1235955056179776,
1019
+ "grad_norm": 0.609375,
1020
+ "learning_rate": 9.030806947227607e-05,
1021
+ "loss": 0.501,
1022
+ "step": 2780
1023
+ },
1024
+ {
1025
+ "epoch": 3.146067415730337,
1026
+ "grad_norm": 1.1015625,
1027
+ "learning_rate": 8.855303936929117e-05,
1028
+ "loss": 0.4239,
1029
+ "step": 2800
1030
+ },
1031
+ {
1032
+ "epoch": 3.168539325842697,
1033
+ "grad_norm": 0.84375,
1034
+ "learning_rate": 8.680157491585636e-05,
1035
+ "loss": 0.5388,
1036
+ "step": 2820
1037
+ },
1038
+ {
1039
+ "epoch": 3.191011235955056,
1040
+ "grad_norm": 1.2578125,
1041
+ "learning_rate": 8.505422168110934e-05,
1042
+ "loss": 0.3715,
1043
+ "step": 2840
1044
+ },
1045
+ {
1046
+ "epoch": 3.2134831460674156,
1047
+ "grad_norm": 0.94921875,
1048
+ "learning_rate": 8.331152395357141e-05,
1049
+ "loss": 0.4274,
1050
+ "step": 2860
1051
+ },
1052
+ {
1053
+ "epoch": 3.235955056179775,
1054
+ "grad_norm": 1.046875,
1055
+ "learning_rate": 8.157402457160539e-05,
1056
+ "loss": 0.4368,
1057
+ "step": 2880
1058
+ },
1059
+ {
1060
+ "epoch": 3.258426966292135,
1061
+ "grad_norm": 1.2421875,
1062
+ "learning_rate": 7.984226475432522e-05,
1063
+ "loss": 0.4026,
1064
+ "step": 2900
1065
+ },
1066
+ {
1067
+ "epoch": 3.2808988764044944,
1068
+ "grad_norm": 0.8359375,
1069
+ "learning_rate": 7.811678393300987e-05,
1070
+ "loss": 0.3971,
1071
+ "step": 2920
1072
+ },
1073
+ {
1074
+ "epoch": 3.303370786516854,
1075
+ "grad_norm": 0.94921875,
1076
+ "learning_rate": 7.63981195830742e-05,
1077
+ "loss": 0.395,
1078
+ "step": 2940
1079
+ },
1080
+ {
1081
+ "epoch": 3.3258426966292136,
1082
+ "grad_norm": 1.328125,
1083
+ "learning_rate": 7.468680705664914e-05,
1084
+ "loss": 0.4165,
1085
+ "step": 2960
1086
+ },
1087
+ {
1088
+ "epoch": 3.348314606741573,
1089
+ "grad_norm": 0.88671875,
1090
+ "learning_rate": 7.298337941582314e-05,
1091
+ "loss": 0.4071,
1092
+ "step": 2980
1093
+ },
1094
+ {
1095
+ "epoch": 3.370786516853933,
1096
+ "grad_norm": 1.4296875,
1097
+ "learning_rate": 7.128836726659696e-05,
1098
+ "loss": 0.4205,
1099
+ "step": 3000
1100
+ },
1101
+ {
1102
+ "epoch": 3.370786516853933,
1103
+ "eval_loss": 0.8820343613624573,
1104
+ "eval_runtime": 206.8369,
1105
+ "eval_samples_per_second": 1.852,
1106
+ "eval_steps_per_second": 1.852,
1107
+ "step": 3000
1108
+ },
1109
+ {
1110
+ "epoch": 3.393258426966292,
1111
+ "grad_norm": 1.46875,
1112
+ "learning_rate": 6.960229859360353e-05,
1113
+ "loss": 0.3759,
1114
+ "step": 3020
1115
+ },
1116
+ {
1117
+ "epoch": 3.4157303370786516,
1118
+ "grad_norm": 1.2265625,
1119
+ "learning_rate": 6.792569859564445e-05,
1120
+ "loss": 0.4457,
1121
+ "step": 3040
1122
+ },
1123
+ {
1124
+ "epoch": 3.438202247191011,
1125
+ "grad_norm": 1.5390625,
1126
+ "learning_rate": 6.625908952209418e-05,
1127
+ "loss": 0.4088,
1128
+ "step": 3060
1129
+ },
1130
+ {
1131
+ "epoch": 3.460674157303371,
1132
+ "grad_norm": 0.92578125,
1133
+ "learning_rate": 6.460299051022285e-05,
1134
+ "loss": 0.4221,
1135
+ "step": 3080
1136
+ },
1137
+ {
1138
+ "epoch": 3.4831460674157304,
1139
+ "grad_norm": 1.34375,
1140
+ "learning_rate": 6.295791742348865e-05,
1141
+ "loss": 0.4304,
1142
+ "step": 3100
1143
+ },
1144
+ {
1145
+ "epoch": 3.50561797752809,
1146
+ "grad_norm": 1.0,
1147
+ "learning_rate": 6.132438269084985e-05,
1148
+ "loss": 0.3612,
1149
+ "step": 3120
1150
+ },
1151
+ {
1152
+ "epoch": 3.5280898876404496,
1153
+ "grad_norm": 1.4609375,
1154
+ "learning_rate": 5.970289514714677e-05,
1155
+ "loss": 0.4692,
1156
+ "step": 3140
1157
+ },
1158
+ {
1159
+ "epoch": 3.550561797752809,
1160
+ "grad_norm": 1.578125,
1161
+ "learning_rate": 5.8093959874603176e-05,
1162
+ "loss": 0.4579,
1163
+ "step": 3160
1164
+ },
1165
+ {
1166
+ "epoch": 3.5730337078651684,
1167
+ "grad_norm": 0.66796875,
1168
+ "learning_rate": 5.649807804549663e-05,
1169
+ "loss": 0.3754,
1170
+ "step": 3180
1171
+ },
1172
+ {
1173
+ "epoch": 3.595505617977528,
1174
+ "grad_norm": 1.1328125,
1175
+ "learning_rate": 5.491574676604682e-05,
1176
+ "loss": 0.3685,
1177
+ "step": 3200
1178
+ },
1179
+ {
1180
+ "epoch": 3.6179775280898876,
1181
+ "grad_norm": 1.09375,
1182
+ "learning_rate": 5.334745892157035e-05,
1183
+ "loss": 0.3809,
1184
+ "step": 3220
1185
+ },
1186
+ {
1187
+ "epoch": 3.640449438202247,
1188
+ "grad_norm": 1.4453125,
1189
+ "learning_rate": 5.179370302295037e-05,
1190
+ "loss": 0.4809,
1191
+ "step": 3240
1192
+ },
1193
+ {
1194
+ "epoch": 3.662921348314607,
1195
+ "grad_norm": 1.015625,
1196
+ "learning_rate": 5.02549630544688e-05,
1197
+ "loss": 0.3798,
1198
+ "step": 3260
1199
+ },
1200
+ {
1201
+ "epoch": 3.6853932584269664,
1202
+ "grad_norm": 1.1796875,
1203
+ "learning_rate": 4.8731718323048516e-05,
1204
+ "loss": 0.4153,
1205
+ "step": 3280
1206
+ },
1207
+ {
1208
+ "epoch": 3.7078651685393256,
1209
+ "grad_norm": 0.98046875,
1210
+ "learning_rate": 4.722444330895256e-05,
1211
+ "loss": 0.4612,
1212
+ "step": 3300
1213
+ },
1214
+ {
1215
+ "epoch": 3.7303370786516856,
1216
+ "grad_norm": 1.4921875,
1217
+ "learning_rate": 4.573360751798689e-05,
1218
+ "loss": 0.469,
1219
+ "step": 3320
1220
+ },
1221
+ {
1222
+ "epoch": 3.752808988764045,
1223
+ "grad_norm": 1.8203125,
1224
+ "learning_rate": 4.425967533525229e-05,
1225
+ "loss": 0.4523,
1226
+ "step": 3340
1227
+ },
1228
+ {
1229
+ "epoch": 3.7752808988764044,
1230
+ "grad_norm": 1.359375,
1231
+ "learning_rate": 4.2803105880491925e-05,
1232
+ "loss": 0.4214,
1233
+ "step": 3360
1234
+ },
1235
+ {
1236
+ "epoch": 3.797752808988764,
1237
+ "grad_norm": 1.734375,
1238
+ "learning_rate": 4.136435286507849e-05,
1239
+ "loss": 0.4981,
1240
+ "step": 3380
1241
+ },
1242
+ {
1243
+ "epoch": 3.8202247191011236,
1244
+ "grad_norm": 1.3515625,
1245
+ "learning_rate": 3.994386445068632e-05,
1246
+ "loss": 0.4029,
1247
+ "step": 3400
1248
+ },
1249
+ {
1250
+ "epoch": 3.842696629213483,
1251
+ "grad_norm": 1.4609375,
1252
+ "learning_rate": 3.854208310969204e-05,
1253
+ "loss": 0.3747,
1254
+ "step": 3420
1255
+ },
1256
+ {
1257
+ "epoch": 3.865168539325843,
1258
+ "grad_norm": 1.1015625,
1259
+ "learning_rate": 3.715944548734755e-05,
1260
+ "loss": 0.4113,
1261
+ "step": 3440
1262
+ },
1263
+ {
1264
+ "epoch": 3.8876404494382024,
1265
+ "grad_norm": 0.52734375,
1266
+ "learning_rate": 3.5796382265767937e-05,
1267
+ "loss": 0.3896,
1268
+ "step": 3460
1269
+ },
1270
+ {
1271
+ "epoch": 3.9101123595505616,
1272
+ "grad_norm": 1.0625,
1273
+ "learning_rate": 3.445331802977709e-05,
1274
+ "loss": 0.4709,
1275
+ "step": 3480
1276
+ },
1277
+ {
1278
+ "epoch": 3.932584269662921,
1279
+ "grad_norm": 1.4296875,
1280
+ "learning_rate": 3.313067113465222e-05,
1281
+ "loss": 0.3812,
1282
+ "step": 3500
1283
+ },
1284
+ {
1285
+ "epoch": 3.932584269662921,
1286
+ "eval_loss": 0.8859002590179443,
1287
+ "eval_runtime": 206.8365,
1288
+ "eval_samples_per_second": 1.852,
1289
+ "eval_steps_per_second": 1.852,
1290
+ "step": 3500
1291
+ },
1292
+ {
1293
+ "epoch": 3.955056179775281,
1294
+ "grad_norm": 1.09375,
1295
+ "learning_rate": 3.182885357580934e-05,
1296
+ "loss": 0.3906,
1297
+ "step": 3520
1298
+ },
1299
+ {
1300
+ "epoch": 3.9775280898876404,
1301
+ "grad_norm": 1.0390625,
1302
+ "learning_rate": 3.054827086046931e-05,
1303
+ "loss": 0.3987,
1304
+ "step": 3540
1305
+ },
1306
+ {
1307
+ "epoch": 4.0,
1308
+ "grad_norm": 0.671875,
1309
+ "learning_rate": 2.9289321881345254e-05,
1310
+ "loss": 0.3499,
1311
+ "step": 3560
1312
+ },
1313
+ {
1314
+ "epoch": 4.022471910112359,
1315
+ "grad_norm": 0.79296875,
1316
+ "learning_rate": 2.8052398792390154e-05,
1317
+ "loss": 0.3292,
1318
+ "step": 3580
1319
+ },
1320
+ {
1321
+ "epoch": 4.044943820224719,
1322
+ "grad_norm": 0.8125,
1323
+ "learning_rate": 2.6837886886643614e-05,
1324
+ "loss": 0.3343,
1325
+ "step": 3600
1326
+ },
1327
+ {
1328
+ "epoch": 4.067415730337078,
1329
+ "grad_norm": 0.66015625,
1330
+ "learning_rate": 2.5646164476215716e-05,
1331
+ "loss": 0.3236,
1332
+ "step": 3620
1333
+ },
1334
+ {
1335
+ "epoch": 4.089887640449438,
1336
+ "grad_norm": 1.25,
1337
+ "learning_rate": 2.447760277444543e-05,
1338
+ "loss": 0.2892,
1339
+ "step": 3640
1340
+ },
1341
+ {
1342
+ "epoch": 4.112359550561798,
1343
+ "grad_norm": 1.3515625,
1344
+ "learning_rate": 2.3332565780270165e-05,
1345
+ "loss": 0.2801,
1346
+ "step": 3660
1347
+ },
1348
+ {
1349
+ "epoch": 4.134831460674158,
1350
+ "grad_norm": 1.3671875,
1351
+ "learning_rate": 2.2211410164842606e-05,
1352
+ "loss": 0.3082,
1353
+ "step": 3680
1354
+ },
1355
+ {
1356
+ "epoch": 4.157303370786517,
1357
+ "grad_norm": 1.46875,
1358
+ "learning_rate": 2.1114485160430132e-05,
1359
+ "loss": 0.3128,
1360
+ "step": 3700
1361
+ },
1362
+ {
1363
+ "epoch": 4.179775280898877,
1364
+ "grad_norm": 0.8515625,
1365
+ "learning_rate": 2.0042132451631378e-05,
1366
+ "loss": 0.3846,
1367
+ "step": 3720
1368
+ },
1369
+ {
1370
+ "epoch": 4.202247191011236,
1371
+ "grad_norm": 1.1328125,
1372
+ "learning_rate": 1.899468606894379e-05,
1373
+ "loss": 0.2718,
1374
+ "step": 3740
1375
+ },
1376
+ {
1377
+ "epoch": 4.224719101123595,
1378
+ "grad_norm": 0.80859375,
1379
+ "learning_rate": 1.7972472284715415e-05,
1380
+ "loss": 0.302,
1381
+ "step": 3760
1382
+ },
1383
+ {
1384
+ "epoch": 4.247191011235955,
1385
+ "grad_norm": 1.6953125,
1386
+ "learning_rate": 1.6975809511513353e-05,
1387
+ "loss": 0.3785,
1388
+ "step": 3780
1389
+ },
1390
+ {
1391
+ "epoch": 4.269662921348314,
1392
+ "grad_norm": 1.0078125,
1393
+ "learning_rate": 1.600500820294041e-05,
1394
+ "loss": 0.3845,
1395
+ "step": 3800
1396
+ },
1397
+ {
1398
+ "epoch": 4.292134831460674,
1399
+ "grad_norm": 1.6796875,
1400
+ "learning_rate": 1.5060370756930919e-05,
1401
+ "loss": 0.327,
1402
+ "step": 3820
1403
+ },
1404
+ {
1405
+ "epoch": 4.314606741573034,
1406
+ "grad_norm": 1.359375,
1407
+ "learning_rate": 1.414219142155585e-05,
1408
+ "loss": 0.3589,
1409
+ "step": 3840
1410
+ },
1411
+ {
1412
+ "epoch": 4.337078651685394,
1413
+ "grad_norm": 0.75,
1414
+ "learning_rate": 1.3250756203366632e-05,
1415
+ "loss": 0.4057,
1416
+ "step": 3860
1417
+ },
1418
+ {
1419
+ "epoch": 4.359550561797753,
1420
+ "grad_norm": 0.96484375,
1421
+ "learning_rate": 1.2386342778305993e-05,
1422
+ "loss": 0.3862,
1423
+ "step": 3880
1424
+ },
1425
+ {
1426
+ "epoch": 4.382022471910112,
1427
+ "grad_norm": 1.4921875,
1428
+ "learning_rate": 1.1549220405213878e-05,
1429
+ "loss": 0.3319,
1430
+ "step": 3900
1431
+ },
1432
+ {
1433
+ "epoch": 4.404494382022472,
1434
+ "grad_norm": 1.2890625,
1435
+ "learning_rate": 1.0739649841955136e-05,
1436
+ "loss": 0.2832,
1437
+ "step": 3920
1438
+ },
1439
+ {
1440
+ "epoch": 4.426966292134831,
1441
+ "grad_norm": 1.234375,
1442
+ "learning_rate": 9.957883264195223e-06,
1443
+ "loss": 0.2732,
1444
+ "step": 3940
1445
+ },
1446
+ {
1447
+ "epoch": 4.449438202247191,
1448
+ "grad_norm": 0.828125,
1449
+ "learning_rate": 9.20416418684924e-06,
1450
+ "loss": 0.2587,
1451
+ "step": 3960
1452
+ },
1453
+ {
1454
+ "epoch": 4.47191011235955,
1455
+ "grad_norm": 1.5234375,
1456
+ "learning_rate": 8.478727388228735e-06,
1457
+ "loss": 0.3469,
1458
+ "step": 3980
1459
+ },
1460
+ {
1461
+ "epoch": 4.49438202247191,
1462
+ "grad_norm": 0.90234375,
1463
+ "learning_rate": 7.781798836909826e-06,
1464
+ "loss": 0.3323,
1465
+ "step": 4000
1466
+ },
1467
+ {
1468
+ "epoch": 4.49438202247191,
1469
+ "eval_loss": 0.9360187649726868,
1470
+ "eval_runtime": 206.9644,
1471
+ "eval_samples_per_second": 1.851,
1472
+ "eval_steps_per_second": 1.851,
1473
+ "step": 4000
1474
+ },
1475
+ {
1476
+ "epoch": 4.51685393258427,
1477
+ "grad_norm": 2.046875,
1478
+ "learning_rate": 7.11359562134557e-06,
1479
+ "loss": 0.3441,
1480
+ "step": 4020
1481
+ },
1482
+ {
1483
+ "epoch": 4.539325842696629,
1484
+ "grad_norm": 1.296875,
1485
+ "learning_rate": 6.4743258822443695e-06,
1486
+ "loss": 0.3196,
1487
+ "step": 4040
1488
+ },
1489
+ {
1490
+ "epoch": 4.561797752808989,
1491
+ "grad_norm": 1.515625,
1492
+ "learning_rate": 5.8641887477356215e-06,
1493
+ "loss": 0.3226,
1494
+ "step": 4060
1495
+ },
1496
+ {
1497
+ "epoch": 4.584269662921348,
1498
+ "grad_norm": 1.6875,
1499
+ "learning_rate": 5.283374271342645e-06,
1500
+ "loss": 0.2859,
1501
+ "step": 4080
1502
+ },
1503
+ {
1504
+ "epoch": 4.606741573033708,
1505
+ "grad_norm": 0.87109375,
1506
+ "learning_rate": 4.732063372782336e-06,
1507
+ "loss": 0.3164,
1508
+ "step": 4100
1509
+ },
1510
+ {
1511
+ "epoch": 4.629213483146067,
1512
+ "grad_norm": 1.2734375,
1513
+ "learning_rate": 4.210427781609861e-06,
1514
+ "loss": 0.3275,
1515
+ "step": 4120
1516
+ },
1517
+ {
1518
+ "epoch": 4.651685393258427,
1519
+ "grad_norm": 1.25,
1520
+ "learning_rate": 3.718629983726185e-06,
1521
+ "loss": 0.367,
1522
+ "step": 4140
1523
+ },
1524
+ {
1525
+ "epoch": 4.674157303370786,
1526
+ "grad_norm": 1.6796875,
1527
+ "learning_rate": 3.256823170764689e-06,
1528
+ "loss": 0.3445,
1529
+ "step": 4160
1530
+ },
1531
+ {
1532
+ "epoch": 4.696629213483146,
1533
+ "grad_norm": 0.9453125,
1534
+ "learning_rate": 2.8251511923731655e-06,
1535
+ "loss": 0.4628,
1536
+ "step": 4180
1537
+ },
1538
+ {
1539
+ "epoch": 4.719101123595506,
1540
+ "grad_norm": 0.8515625,
1541
+ "learning_rate": 2.423748511405577e-06,
1542
+ "loss": 0.3252,
1543
+ "step": 4200
1544
+ },
1545
+ {
1546
+ "epoch": 4.741573033707866,
1547
+ "grad_norm": 0.90234375,
1548
+ "learning_rate": 2.052740162037814e-06,
1549
+ "loss": 0.3783,
1550
+ "step": 4220
1551
+ },
1552
+ {
1553
+ "epoch": 4.764044943820225,
1554
+ "grad_norm": 0.7421875,
1555
+ "learning_rate": 1.7122417108203726e-06,
1556
+ "loss": 0.294,
1557
+ "step": 4240
1558
+ },
1559
+ {
1560
+ "epoch": 4.786516853932584,
1561
+ "grad_norm": 0.75390625,
1562
+ "learning_rate": 1.4023592206802382e-06,
1563
+ "loss": 0.3194,
1564
+ "step": 4260
1565
+ },
1566
+ {
1567
+ "epoch": 4.808988764044944,
1568
+ "grad_norm": 1.375,
1569
+ "learning_rate": 1.1231892178829472e-06,
1570
+ "loss": 0.3145,
1571
+ "step": 4280
1572
+ },
1573
+ {
1574
+ "epoch": 4.831460674157303,
1575
+ "grad_norm": 1.4140625,
1576
+ "learning_rate": 8.74818661965382e-07,
1577
+ "loss": 0.3372,
1578
+ "step": 4300
1579
+ },
1580
+ {
1581
+ "epoch": 4.853932584269663,
1582
+ "grad_norm": 1.6640625,
1583
+ "learning_rate": 6.573249186483721e-07,
1584
+ "loss": 0.2791,
1585
+ "step": 4320
1586
+ },
1587
+ {
1588
+ "epoch": 4.876404494382022,
1589
+ "grad_norm": 0.8359375,
1590
+ "learning_rate": 4.707757357379383e-07,
1591
+ "loss": 0.2428,
1592
+ "step": 4340
1593
+ },
1594
+ {
1595
+ "epoch": 4.898876404494382,
1596
+ "grad_norm": 1.3671875,
1597
+ "learning_rate": 3.152292220222064e-07,
1598
+ "loss": 0.3225,
1599
+ "step": 4360
1600
+ },
1601
+ {
1602
+ "epoch": 4.921348314606742,
1603
+ "grad_norm": 1.0625,
1604
+ "learning_rate": 1.9073382917097483e-07,
1605
+ "loss": 0.3164,
1606
+ "step": 4380
1607
+ },
1608
+ {
1609
+ "epoch": 4.943820224719101,
1610
+ "grad_norm": 1.0546875,
1611
+ "learning_rate": 9.732833664334307e-08,
1612
+ "loss": 0.3571,
1613
+ "step": 4400
1614
+ },
1615
+ {
1616
+ "epoch": 4.966292134831461,
1617
+ "grad_norm": 0.87890625,
1618
+ "learning_rate": 3.5041839608151996e-08,
1619
+ "loss": 0.3002,
1620
+ "step": 4420
1621
+ },
1622
+ {
1623
+ "epoch": 4.98876404494382,
1624
+ "grad_norm": 0.6640625,
1625
+ "learning_rate": 3.893739881088987e-09,
1626
+ "loss": 0.366,
1627
+ "step": 4440
1628
+ },
1629
+ {
1630
+ "epoch": 5.0,
1631
+ "step": 4450,
1632
+ "total_flos": 1.024663401529344e+17,
1633
+ "train_loss": 0.6764815047617708,
1634
+ "train_runtime": 9686.8536,
1635
+ "train_samples_per_second": 0.459,
1636
+ "train_steps_per_second": 0.459
1637
+ }
1638
+ ],
1639
+ "logging_steps": 20,
1640
+ "max_steps": 4450,
1641
+ "num_input_tokens_seen": 0,
1642
+ "num_train_epochs": 5,
1643
+ "save_steps": 100,
1644
+ "stateful_callbacks": {
1645
+ "TrainerControl": {
1646
+ "args": {
1647
+ "should_epoch_stop": false,
1648
+ "should_evaluate": false,
1649
+ "should_log": false,
1650
+ "should_save": true,
1651
+ "should_training_stop": false
1652
+ },
1653
+ "attributes": {}
1654
+ }
1655
+ },
1656
+ "total_flos": 1.024663401529344e+17,
1657
+ "train_batch_size": 1,
1658
+ "trial_name": null,
1659
+ "trial_params": null
1660
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7797c8c3afe93974491cdc40e75b1366f5c59ec6a806b3f23f3877c5601245bf
+ size 5112