Ubuntu committed on
Commit
857379d
1 Parent(s): 618955c

Please enter the commit message for your changes. Lines starting
with '#' will be ignored, and an empty message aborts the commit.

Committer: Ubuntu <azureuser@myVm.pweh1myzxgie3nlmnfmiv00d1d.bx.internal.cloudapp.net>

On branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
deleted: README1.md
deleted: checkpoint-279/README.md
deleted: checkpoint-279/adapter_config.json
deleted: checkpoint-279/adapter_model.safetensors
deleted: checkpoint-279/global_step279/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
deleted: checkpoint-279/global_step279/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
deleted: checkpoint-279/global_step279/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
deleted: checkpoint-279/global_step279/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
deleted: checkpoint-279/global_step279/mp_rank_00_model_states.pt
deleted: checkpoint-279/latest
deleted: checkpoint-279/rng_state_0.pth
deleted: checkpoint-279/rng_state_1.pth
deleted: checkpoint-279/rng_state_2.pth
deleted: checkpoint-279/rng_state_3.pth
deleted: checkpoint-279/scheduler.pt
deleted: checkpoint-279/trainer_state.json
deleted: checkpoint-279/training_args.bin
deleted: checkpoint-279/zero_to_fp32.py
deleted: checkpoint-336/README.md
deleted: checkpoint-336/adapter_config.json
deleted: checkpoint-336/adapter_model.safetensors
deleted: checkpoint-336/global_step336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
deleted: checkpoint-336/global_step336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
deleted: checkpoint-336/global_step336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
deleted: checkpoint-336/global_step336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
deleted: checkpoint-336/global_step336/mp_rank_00_model_states.pt
deleted: checkpoint-336/latest
deleted: checkpoint-336/rng_state_0.pth
deleted: checkpoint-336/rng_state_1.pth
deleted: checkpoint-336/rng_state_2.pth
deleted: checkpoint-336/rng_state_3.pth
deleted: checkpoint-336/scheduler.pt
deleted: checkpoint-336/trainer_state.json
deleted: checkpoint-336/training_args.bin
deleted: checkpoint-336/zero_to_fp32.py
deleted: checkpoint-372/README.md
deleted: checkpoint-372/adapter_config.json
deleted: checkpoint-372/adapter_model.safetensors
deleted: checkpoint-372/global_step372/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
deleted: checkpoint-372/global_step372/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
deleted: checkpoint-372/global_step372/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
deleted: checkpoint-372/global_step372/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
deleted: checkpoint-372/global_step372/mp_rank_00_model_states.pt
deleted: checkpoint-372/latest
deleted: checkpoint-372/rng_state_0.pth
deleted: checkpoint-372/rng_state_1.pth
deleted: checkpoint-372/rng_state_2.pth
deleted: checkpoint-372/rng_state_3.pth
deleted: checkpoint-372/scheduler.pt
deleted: checkpoint-372/trainer_state.json
deleted: checkpoint-372/training_args.bin
deleted: checkpoint-372/zero_to_fp32.py
deleted: checkpoint-448/README.md
deleted: checkpoint-448/adapter_config.json
deleted: checkpoint-448/adapter_model.safetensors
deleted: checkpoint-448/global_step448/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
deleted: checkpoint-448/global_step448/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
deleted: checkpoint-448/global_step448/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
deleted: checkpoint-448/global_step448/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
deleted: checkpoint-448/global_step448/mp_rank_00_model_states.pt
deleted: checkpoint-448/latest
deleted: checkpoint-448/rng_state_0.pth
deleted: checkpoint-448/rng_state_1.pth
deleted: checkpoint-448/rng_state_2.pth
deleted: checkpoint-448/rng_state_3.pth
deleted: checkpoint-448/scheduler.pt
deleted: checkpoint-448/trainer_state.json
deleted: checkpoint-448/training_args.bin
deleted: checkpoint-448/zero_to_fp32.py
deleted: merged/config.json
deleted: merged/generation_config.json
deleted: merged/pytorch_model-00001-of-00002.bin
deleted: merged/pytorch_model-00002-of-00002.bin
deleted: merged/pytorch_model.bin.index.json
deleted: merged/special_tokens_map.json
deleted: merged/tokenizer.json
deleted: merged/tokenizer.model
deleted: merged/tokenizer_config.json
deleted: runs/Mar21_07-07-55_8205afe3ecd2/events.out.tfevents.1711004877.8205afe3ecd2.2618.0
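
For reference, a pruning commit like this one can also be produced programmatically rather than through raw git. The sketch below is illustrative only: the repo id is a placeholder, it assumes an authenticated environment (e.g. `huggingface-cli login`), and it uses the huggingface_hub commit API to delete the same directories in a single commit.

```python
# Illustrative sketch only: prune checkpoint directories from a model repo
# in one commit via the huggingface_hub API. "your-user/your-repo" is a
# placeholder repo id; authenticate first (e.g. `huggingface-cli login`).
from huggingface_hub import CommitOperationDelete, HfApi

api = HfApi()
api.create_commit(
    repo_id="your-user/your-repo",  # placeholder
    operations=[
        # is_folder=True removes an entire directory tree in the repo
        CommitOperationDelete(path_in_repo="checkpoint-279/", is_folder=True),
        CommitOperationDelete(path_in_repo="checkpoint-336/", is_folder=True),
        CommitOperationDelete(path_in_repo="checkpoint-372/", is_folder=True),
        CommitOperationDelete(path_in_repo="checkpoint-448/", is_folder=True),
        CommitOperationDelete(path_in_repo="merged/", is_folder=True),
        CommitOperationDelete(path_in_repo="runs/", is_folder=True),
        CommitOperationDelete(path_in_repo="README1.md"),
    ],
    commit_message="Remove intermediate checkpoints and merged weights",
)
```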

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. README1.md +0 -153
  2. checkpoint-279/README.md +0 -202
  3. checkpoint-279/adapter_config.json +0 -33
  4. checkpoint-279/adapter_model.safetensors +0 -3
  5. checkpoint-279/global_step279/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0 -3
  6. checkpoint-279/global_step279/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0 -3
  7. checkpoint-279/global_step279/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0 -3
  8. checkpoint-279/global_step279/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0 -3
  9. checkpoint-279/global_step279/mp_rank_00_model_states.pt +0 -3
  10. checkpoint-279/latest +0 -1
  11. checkpoint-279/rng_state_0.pth +0 -3
  12. checkpoint-279/rng_state_1.pth +0 -3
  13. checkpoint-279/rng_state_2.pth +0 -3
  14. checkpoint-279/rng_state_3.pth +0 -3
  15. checkpoint-279/scheduler.pt +0 -3
  16. checkpoint-279/trainer_state.json +0 -2070
  17. checkpoint-279/training_args.bin +0 -3
  18. checkpoint-279/zero_to_fp32.py +0 -592
  19. checkpoint-336/README.md +0 -202
  20. checkpoint-336/adapter_config.json +0 -33
  21. checkpoint-336/adapter_model.safetensors +0 -3
  22. checkpoint-336/global_step336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0 -3
  23. checkpoint-336/global_step336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0 -3
  24. checkpoint-336/global_step336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0 -3
  25. checkpoint-336/global_step336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0 -3
  26. checkpoint-336/global_step336/mp_rank_00_model_states.pt +0 -3
  27. checkpoint-336/latest +0 -1
  28. checkpoint-336/rng_state_0.pth +0 -3
  29. checkpoint-336/rng_state_1.pth +0 -3
  30. checkpoint-336/rng_state_2.pth +0 -3
  31. checkpoint-336/rng_state_3.pth +0 -3
  32. checkpoint-336/scheduler.pt +0 -3
  33. checkpoint-336/trainer_state.json +0 -2477
  34. checkpoint-336/training_args.bin +0 -3
  35. checkpoint-336/zero_to_fp32.py +0 -592
  36. checkpoint-372/README.md +0 -202
  37. checkpoint-372/adapter_config.json +0 -33
  38. checkpoint-372/adapter_model.safetensors +0 -3
  39. checkpoint-372/global_step372/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0 -3
  40. checkpoint-372/global_step372/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0 -3
  41. checkpoint-372/global_step372/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0 -3
  42. checkpoint-372/global_step372/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0 -3
  43. checkpoint-372/global_step372/mp_rank_00_model_states.pt +0 -3
  44. checkpoint-372/latest +0 -1
  45. checkpoint-372/rng_state_0.pth +0 -3
  46. checkpoint-372/rng_state_1.pth +0 -3
  47. checkpoint-372/rng_state_2.pth +0 -3
  48. checkpoint-372/rng_state_3.pth +0 -3
  49. checkpoint-372/scheduler.pt +0 -3
  50. checkpoint-372/trainer_state.json +0 -2753
README1.md DELETED
@@ -1,153 +0,0 @@
- ---
- license: other
- library_name: peft
- tags:
- - generated_from_trainer
- base_model: google/gemma-7b-it
- model-index:
- - name: out
-   results: []
- ---
-
-
- [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
- <details><summary>See axolotl config</summary>
-
- axolotl version: `0.4.0`
- ```yaml
- # use google/gemma-7b if you have access
- base_model: google/gemma-7b-it
- model_type: AutoModelForCausalLM
- tokenizer_type: AutoTokenizer
-
- load_in_8bit: false
- load_in_4bit: true
- strict: false
-
- # huggingface repo
- datasets:
-   - path: ./python-oasst/chunk_1.jsonl
-     type: oasst
- val_set_size: 0.1
- output_dir: ./out
-
- adapter: qlora
- lora_r: 32
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_linear: true
-
- sequence_len: 4096
- sample_packing: false
- pad_to_sequence_len: true
-
- wandb_project: gemma-7b-it
- wandb_entity:
- wandb_watch:
- wandb_name:
- wandb_log_model:
-
-
- gradient_accumulation_steps: 6
- micro_batch_size: 4
- num_epochs: 4
- optimizer: adamw_bnb_8bit
- lr_scheduler: cosine
- learning_rate: 0.0002
-
- train_on_inputs: true
- group_by_length: false
- bf16: auto
- fp16:
- tf32: false
-
- gradient_checkpointing: true
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
- logging_steps: 1
- xformers_attention:
- flash_attention: true
-
- warmup_ratio: 0.1
- evals_per_epoch: 4
- eval_table_size:
- eval_max_new_tokens: 128
- saves_per_epoch: 1
- debug:
- deepspeed: deepspeed_configs/zero1.json
- weight_decay: 0.0
- fsdp:
- fsdp_config:
- special_tokens:
-
- ```
-
- </details><br>
-
- # out
-
- This model is a fine-tuned version of [google/gemma-7b-it](https://huggingface.co/google/gemma-7b-it) on the None dataset.
- It achieves the following results on the evaluation set:
- - Loss: 1.1911
-
- ## Model description
-
- More information needed
-
- ## Intended uses & limitations
-
- More information needed
-
- ## Training and evaluation data
-
- More information needed
-
- ## Training procedure
-
- ### Training hyperparameters
-
- The following hyperparameters were used during training:
- - learning_rate: 0.0002
- - train_batch_size: 4
- - eval_batch_size: 4
- - seed: 42
- - distributed_type: multi-GPU
- - num_devices: 4
- - gradient_accumulation_steps: 6
- - total_train_batch_size: 96
- - total_eval_batch_size: 16
- - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- - lr_scheduler_type: cosine
- - lr_scheduler_warmup_steps: 9
- - num_epochs: 4
-
- ### Training results
-
- | Training Loss | Epoch | Step | Validation Loss |
- |:-------------:|:-----:|:----:|:---------------:|
- | 5.0474 | 0.01 | 1 | 5.9279 |
- | 1.2191 | 0.26 | 24 | 1.2947 |
- | 1.1165 | 0.51 | 48 | 1.1679 |
- | 1.0711 | 0.77 | 72 | 1.1377 |
- | 0.9546 | 1.02 | 96 | 1.1303 |
- | 0.9309 | 1.28 | 120 | 1.1298 |
- | 0.9588 | 1.54 | 144 | 1.1242 |
- | 0.8553 | 1.79 | 168 | 1.1259 |
- | 0.8231 | 2.05 | 192 | 1.1449 |
- | 0.8154 | 2.31 | 216 | 1.1514 |
- | 0.7354 | 2.56 | 240 | 1.1471 |
- | 0.7577 | 2.82 | 264 | 1.1479 |
- | 0.6647 | 3.07 | 288 | 1.1923 |
- | 0.6928 | 3.33 | 312 | 1.1856 |
- | 0.731 | 3.59 | 336 | 1.1890 |
- | 0.7193 | 3.84 | 360 | 1.1911 |
-
-
- ### Framework versions
-
- - PEFT 0.9.0
- - Transformers 4.39.0.dev0
- - Pytorch 2.1.2+cu118
- - Datasets 2.18.0
- - Tokenizers 0.15.0

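The deleted card above documents a QLoRA adapter for google/gemma-7b-it trained with axolotl (PEFT 0.9.0, Transformers 4.39.0.dev0). As a minimal sketch of how such an adapter is normally consumed — not this repo's documented usage, and with the adapter path as a placeholder for a directory holding adapter_config.json and adapter_model.safetensors:

```python
# Minimal sketch: load the base model and apply a LoRA adapter of the
# kind deleted in this commit. "path/to/adapter" is a placeholder.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "google/gemma-7b-it",
    torch_dtype=torch.bfloat16,  # the run trained in bf16
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it")
model = PeftModel.from_pretrained(base, "path/to/adapter")

# Folding the LoRA weights into the base yields a standalone model,
# presumably how the deleted merged/ directory was produced:
merged = model.merge_and_unload()
```
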
checkpoint-279/README.md DELETED
@@ -1,202 +0,0 @@
- ---
- library_name: peft
- base_model: google/gemma-7b-it
- ---
-
- # Model Card for Model ID
-
- <!-- Provide a quick summary of what the model is/does. -->
-
-
-
- ## Model Details
-
- ### Model Description
-
- <!-- Provide a longer summary of what this model is. -->
-
-
-
- - **Developed by:** [More Information Needed]
- - **Funded by [optional]:** [More Information Needed]
- - **Shared by [optional]:** [More Information Needed]
- - **Model type:** [More Information Needed]
- - **Language(s) (NLP):** [More Information Needed]
- - **License:** [More Information Needed]
- - **Finetuned from model [optional]:** [More Information Needed]
-
- ### Model Sources [optional]
-
- <!-- Provide the basic links for the model. -->
-
- - **Repository:** [More Information Needed]
- - **Paper [optional]:** [More Information Needed]
- - **Demo [optional]:** [More Information Needed]
-
- ## Uses
-
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
-
- ### Direct Use
-
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
-
- [More Information Needed]
-
- ### Downstream Use [optional]
-
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
-
- [More Information Needed]
-
- ### Out-of-Scope Use
-
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
-
- [More Information Needed]
-
- ## Bias, Risks, and Limitations
-
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
-
- [More Information Needed]
-
- ### Recommendations
-
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
-
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
-
- ## How to Get Started with the Model
-
- Use the code below to get started with the model.
-
- [More Information Needed]
-
- ## Training Details
-
- ### Training Data
-
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
-
- [More Information Needed]
-
- ### Training Procedure
-
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
-
- #### Preprocessing [optional]
-
- [More Information Needed]
-
-
- #### Training Hyperparameters
-
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
-
- #### Speeds, Sizes, Times [optional]
-
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
-
- [More Information Needed]
-
- ## Evaluation
-
- <!-- This section describes the evaluation protocols and provides the results. -->
-
- ### Testing Data, Factors & Metrics
-
- #### Testing Data
-
- <!-- This should link to a Dataset Card if possible. -->
-
- [More Information Needed]
-
- #### Factors
-
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
-
- [More Information Needed]
-
- #### Metrics
-
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
-
- [More Information Needed]
-
- ### Results
-
- [More Information Needed]
-
- #### Summary
-
-
-
- ## Model Examination [optional]
-
- <!-- Relevant interpretability work for the model goes here -->
-
- [More Information Needed]
-
- ## Environmental Impact
-
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
-
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
-
- - **Hardware Type:** [More Information Needed]
- - **Hours used:** [More Information Needed]
- - **Cloud Provider:** [More Information Needed]
- - **Compute Region:** [More Information Needed]
- - **Carbon Emitted:** [More Information Needed]
-
- ## Technical Specifications [optional]
-
- ### Model Architecture and Objective
-
- [More Information Needed]
-
- ### Compute Infrastructure
-
- [More Information Needed]
-
- #### Hardware
-
- [More Information Needed]
-
- #### Software
-
- [More Information Needed]
-
- ## Citation [optional]
-
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
-
- **BibTeX:**
-
- [More Information Needed]
-
- **APA:**
-
- [More Information Needed]
-
- ## Glossary [optional]
-
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
-
- [More Information Needed]
-
- ## More Information [optional]
-
- [More Information Needed]
-
- ## Model Card Authors [optional]
-
- [More Information Needed]
-
- ## Model Card Contact
-
- [More Information Needed]
- ### Framework versions
-
- - PEFT 0.9.0

checkpoint-279/adapter_config.json DELETED
@@ -1,33 +0,0 @@
- {
- "alpha_pattern": {},
- "auto_mapping": null,
- "base_model_name_or_path": "google/gemma-7b-it",
- "bias": "none",
- "fan_in_fan_out": null,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "loftq_config": {},
- "lora_alpha": 16,
- "lora_dropout": 0.05,
- "megatron_config": null,
- "megatron_core": "megatron.core",
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 32,
- "rank_pattern": {},
- "revision": null,
- "target_modules": [
- "down_proj",
- "o_proj",
- "k_proj",
- "q_proj",
- "gate_proj",
- "up_proj",
- "v_proj"
- ],
- "task_type": "CAUSAL_LM",
- "use_dora": false,
- "use_rslora": false
- }

checkpoint-279/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0831f70d185dae9ca69f58be3eab596067ac52e75e3e97b46d23ecd486b83942
- size 200068904

checkpoint-279/global_step279/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:90f2db91b1ca035dfa781beb0567b7cfaaf6646de04cc9a82d8e80069e7a5b09
- size 150126608

checkpoint-279/global_step279/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b9ec41ba7f5c3131e00c854ec2bbfca98e6a3321e5f2ddf6efdc6056fa008c5a
- size 150126672

checkpoint-279/global_step279/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:41f227ca1d3c19b4cd53567e28a2d395c2e804bd38dfd9bb3c937adab1daf5a3
- size 150126736

checkpoint-279/global_step279/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7e486ddf3459c4f6befb004a9374e7e4fb9bd64bba72dd2e6f7051ee89939988
- size 150126736

checkpoint-279/global_step279/mp_rank_00_model_states.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:feed7b7a8694c54651374fb581d67d60790a016e23023446231557add62ffc80
- size 1896781286

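The global_step279/ files above are DeepSpeed ZeRO shards: one bf16 optimizer-state file per data-parallel rank plus the module states, with zero_to_fp32.py shipped alongside each checkpoint for consolidation. A hedged sketch of that consolidation, assuming DeepSpeed is installed and using a placeholder checkpoint path:

```python
# Sketch: consolidate per-rank ZeRO shards like those deleted here into a
# single fp32 state dict. "checkpoint-279" is a placeholder path to the
# directory containing "latest" and "global_step279/".
from deepspeed.utils.zero_to_fp32 import (
    get_fp32_state_dict_from_zero_checkpoint,
)

state_dict = get_fp32_state_dict_from_zero_checkpoint("checkpoint-279")

# The bundled script exposes the same conversion on the command line, e.g.:
#   python checkpoint-279/zero_to_fp32.py checkpoint-279 pytorch_model.bin
```
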
checkpoint-279/latest DELETED
@@ -1 +0,0 @@
- global_step279

checkpoint-279/rng_state_0.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a84c3f9fa55e23a5c4d93b108c705b57ba9a5ed816191e6dfbb6e72ad2857e6d
- size 15024

checkpoint-279/rng_state_1.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:fbb1da31ff41578c72556d0a8b9b94abf6be26bf16b6456ecd87d2b611f5b9bd
- size 15024

checkpoint-279/rng_state_2.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:11a7b38529914886a43976df69af7f331315329e1d38788c57003ca4cd1a849f
- size 15024

checkpoint-279/rng_state_3.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7d65b4248464f467db8226c5cc4ba4aa32e06af0bf915b61ea8a2db71d16b5ce
- size 15024

checkpoint-279/scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:099f524a0aa9353b01bf7d70e5a899c6e8ee8efc46e982213631888df6e5111b
- size 1064

checkpoint-279/trainer_state.json DELETED
@@ -1,2070 +0,0 @@
- {
- "best_metric": null,
- "best_model_checkpoint": null,
- "epoch": 2.97864768683274,
- "eval_steps": 24,
- "global_step": 279,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {"epoch": 0.01, "grad_norm": 1.8206765789002874, "learning_rate": 2.2222222222222223e-05, "loss": 5.0474, "step": 1},
- {"epoch": 0.01, "eval_loss": 5.927858829498291, "eval_runtime": 117.3665, "eval_samples_per_second": 8.512, "eval_steps_per_second": 0.537, "step": 1},
- {"epoch": 0.02, "grad_norm": 1.9889295079554647, "learning_rate": 4.4444444444444447e-05, "loss": 5.5569, "step": 2},
- {"epoch": 0.03, "grad_norm": 1.8931443004310682, "learning_rate": 6.666666666666667e-05, "loss": 5.2383, "step": 3},
- {"epoch": 0.04, "grad_norm": 2.195266234429632, "learning_rate": 8.888888888888889e-05, "loss": 5.4943, "step": 4},
- {"epoch": 0.05, "grad_norm": 2.6001064132041503, "learning_rate": 0.00011111111111111112, "loss": 5.2602, "step": 5},
- {"epoch": 0.06, "grad_norm": 3.26301463076567, "learning_rate": 0.00013333333333333334, "loss": 4.8182, "step": 6},
- {"epoch": 0.07, "grad_norm": 3.476044691292363, "learning_rate": 0.00015555555555555556, "loss": 4.0432, "step": 7},
- {"epoch": 0.09, "grad_norm": 3.378803229553045, "learning_rate": 0.00017777777777777779, "loss": 3.5212, "step": 8},
- {"epoch": 0.1, "grad_norm": 3.9419449437137017, "learning_rate": 0.0002, "loss": 3.2239, "step": 9},
- {"epoch": 0.11, "grad_norm": 5.8833082175146485, "learning_rate": 0.00019999625498303932, "loss": 3.4319, "step": 10},
- {"epoch": 0.12, "grad_norm": 5.4690223843996515, "learning_rate": 0.0001999850202126604, "loss": 2.8167, "step": 11},
- {"epoch": 0.13, "grad_norm": 7.009614336449043, "learning_rate": 0.00019996629653035126, "loss": 2.7966, "step": 12},
- {"epoch": 0.14, "grad_norm": 6.254841874500106, "learning_rate": 0.0001999400853385221, "loss": 2.1336, "step": 13},
- {"epoch": 0.15, "grad_norm": 6.037710889841169, "learning_rate": 0.00019990638860040006, "loss": 1.85, "step": 14},
- {"epoch": 0.16, "grad_norm": 1.0500019118881985, "learning_rate": 0.00019986520883988232, "loss": 1.5964, "step": 15},
- {"epoch": 0.17, "grad_norm": 0.6169710624824223, "learning_rate": 0.00019981654914134686, "loss": 1.4307, "step": 16},
- {"epoch": 0.18, "grad_norm": 1.86114059095932, "learning_rate": 0.00019976041314942155, "loss": 1.4285, "step": 17},
- {"epoch": 0.19, "grad_norm": 1.6513877610200167, "learning_rate": 0.00019969680506871137, "loss": 1.4621, "step": 18},
- {"epoch": 0.2, "grad_norm": 1.4395882738454628, "learning_rate": 0.000199625729663483, "loss": 1.3561, "step": 19},
- {"epoch": 0.21, "grad_norm": 0.70847060238536, "learning_rate": 0.00019954719225730847, "loss": 1.3565, "step": 20},
- {"epoch": 0.22, "grad_norm": 0.4331630595385925, "learning_rate": 0.00019946119873266613, "loss": 1.3374, "step": 21},
- {"epoch": 0.23, "grad_norm": 0.5580281682185451, "learning_rate": 0.0001993677555305002, "loss": 1.313, "step": 22},
- {"epoch": 0.25, "grad_norm": 0.5217443953771937, "learning_rate": 0.00019926686964973813, "loss": 1.2541, "step": 23},
- {"epoch": 0.26, "grad_norm": 0.36823120314463453, "learning_rate": 0.00019915854864676664, "loss": 1.2191, "step": 24},
- {"epoch": 0.26, "eval_loss": 1.2946609258651733, "eval_runtime": 118.9039, "eval_samples_per_second": 8.402, "eval_steps_per_second": 0.53, "step": 24},
- {"epoch": 0.27, "grad_norm": 0.5797477063688413, "learning_rate": 0.0001990428006348656, "loss": 1.24, "step": 25},
- {"epoch": 0.28, "grad_norm": 0.41369538857234545, "learning_rate": 0.00019891963428360043, "loss": 1.209, "step": 26},
- {"epoch": 0.29, "grad_norm": 0.36666008426797836, "learning_rate": 0.00019878905881817252, "loss": 1.2543, "step": 27},
- {"epoch": 0.3, "grad_norm": 0.3976779691989045, "learning_rate": 0.00019865108401872857, "loss": 1.2431, "step": 28},
- {"epoch": 0.31, "grad_norm": 0.4992861718630414, "learning_rate": 0.00019850572021962788, "loss": 1.2471, "step": 29},
- {"epoch": 0.32, "grad_norm": 0.33729072192890136, "learning_rate": 0.00019835297830866826, "loss": 1.1933, "step": 30},
- {"epoch": 0.33, "grad_norm": 0.29373457949318904, "learning_rate": 0.00019819286972627066, "loss": 1.1761, "step": 31},
- {"epoch": 0.34, "grad_norm": 0.5339184947140588, "learning_rate": 0.0001980254064646223, "loss": 1.165, "step": 32},
- {"epoch": 0.35, "grad_norm": 0.38755069216510263, "learning_rate": 0.00019785060106677818, "loss": 1.1236, "step": 33},
- {"epoch": 0.36, "grad_norm": 0.338373181403367, "learning_rate": 0.00019766846662572191, "loss": 1.2102, "step": 34},
- {"epoch": 0.37, "grad_norm": 0.39237714718744304, "learning_rate": 0.00019747901678338496, "loss": 1.1642, "step": 35},
- {"epoch": 0.38, "grad_norm": 0.3614249847081747, "learning_rate": 0.00019728226572962473, "loss": 1.1387, "step": 36},
- {"epoch": 0.4, "grad_norm": 0.28278007479509987, "learning_rate": 0.00019707822820116193, "loss": 1.0939, "step": 37},
- {"epoch": 0.41, "grad_norm": 0.3008254873268798, "learning_rate": 0.00019686691948047664, "loss": 1.1346, "step": 38},
- {"epoch": 0.42, "grad_norm": 0.4263010439416343, "learning_rate": 0.0001966483553946637, "loss": 1.1015, "step": 39},
- {"epoch": 0.43, "grad_norm": 0.32725448028464205, "learning_rate": 0.00019642255231424729, "loss": 1.1324, "step": 40},
- {"epoch": 0.44, "grad_norm": 0.3028242900588441, "learning_rate": 0.00019618952715195475, "loss": 1.1147, "step": 41},
- {"epoch": 0.45, "grad_norm": 0.33893311928252234, "learning_rate": 0.00019594929736144976, "loss": 1.0978, "step": 42},
- {"epoch": 0.46, "grad_norm": 0.2786082334492372, "learning_rate": 0.0001957018809360251, "loss": 1.0933, "step": 43},
- {"epoch": 0.47, "grad_norm": 0.2732185168098956, "learning_rate": 0.00019544729640725498, "loss": 1.084, "step": 44},
- {"epoch": 0.48, "grad_norm": 0.33386436894143035, "learning_rate": 0.00019518556284360696, "loss": 1.0673, "step": 45},
- {"epoch": 0.49, "grad_norm": 0.2761688734050621, "learning_rate": 0.00019491669984901379, "loss": 1.0523, "step": 46},
- {"epoch": 0.5, "grad_norm": 0.3346957388610895, "learning_rate": 0.00019464072756140486, "loss": 1.0913, "step": 47},
- {"epoch": 0.51, "grad_norm": 0.30196058996924285, "learning_rate": 0.0001943576666511982, "loss": 1.1165, "step": 48},
- {"epoch": 0.51, "eval_loss": 1.167867660522461, "eval_runtime": 119.1485, "eval_samples_per_second": 8.384, "eval_steps_per_second": 0.529, "step": 48},
- {"epoch": 0.52, "grad_norm": 0.27445390350987153, "learning_rate": 0.00019406753831975203, "loss": 1.1069, "step": 49},
- {"epoch": 0.53, "grad_norm": 0.34729097228771255, "learning_rate": 0.00019377036429777672, "loss": 1.0567, "step": 50},
- {"epoch": 0.54, "grad_norm": 0.31314016575739406, "learning_rate": 0.0001934661668437073, "loss": 1.0875, "step": 51},
- {"epoch": 0.56, "grad_norm": 0.29140014335226905, "learning_rate": 0.0001931549687420364, "loss": 1.0929, "step": 52},
- {"epoch": 0.57, "grad_norm": 0.2638104110161505, "learning_rate": 0.00019283679330160726, "loss": 1.0963, "step": 53},
- {"epoch": 0.58, "grad_norm": 0.2833945318119855, "learning_rate": 0.0001925116643538684, "loss": 1.0535, "step": 54},
- {"epoch": 0.59, "grad_norm": 0.28672689795285417, "learning_rate": 0.0001921796062510882, "loss": 1.0699, "step": 55},
- {"epoch": 0.6, "grad_norm": 0.261255409262294, "learning_rate": 0.00019184064386453128, "loss": 1.0658, "step": 56},
- {"epoch": 0.61, "grad_norm": 0.24304864434604007, "learning_rate": 0.00019149480258259533, "loss": 1.0441, "step": 57},
- {"epoch": 0.62, "grad_norm": 0.2987107937915846, "learning_rate": 0.00019114210830890969, "loss": 1.0061, "step": 58},
- {"epoch": 0.63, "grad_norm": 0.2617045441373282, "learning_rate": 0.00019078258746039507, "loss": 1.0578, "step": 59},
- {"epoch": 0.64, "grad_norm": 0.2577955355987167, "learning_rate": 0.00019041626696528503, "loss": 1.0333, "step": 60},
- {"epoch": 0.65, "grad_norm": 0.2823058812174375, "learning_rate": 0.0001900431742611089, "loss": 1.0837, "step": 61},
- {"epoch": 0.66, "grad_norm": 0.30425238718712166, "learning_rate": 0.00018966333729263674, "loss": 1.0619, "step": 62},
- {"epoch": 0.67, "grad_norm": 0.29826831116146957, "learning_rate": 0.0001892767845097864, "loss": 1.056, "step": 63},
- {"epoch": 0.68, "grad_norm": 0.22990267950533677, "learning_rate": 0.00018888354486549237, "loss": 1.061, "step": 64},
- {"epoch": 0.69, "grad_norm": 0.27604852373975236, "learning_rate": 0.00018848364781353744, "loss": 1.0624, "step": 65},
- {"epoch": 0.7, "grad_norm": 0.302101014156969, "learning_rate": 0.00018807712330634642, "loss": 1.0965, "step": 66},
- {"epoch": 0.72, "grad_norm": 0.2532153192142023, "learning_rate": 0.00018766400179274286, "loss": 1.0972, "step": 67},
- {"epoch": 0.73, "grad_norm": 0.23803088057755897, "learning_rate": 0.00018724431421566823, "loss": 1.0823, "step": 68},
- {"epoch": 0.74, "grad_norm": 0.2200041903156331, "learning_rate": 0.0001868180920098644, "loss": 1.037, "step": 69},
- {"epoch": 0.75, "grad_norm": 0.31123761066229655, "learning_rate": 0.00018638536709951917, "loss": 1.0689, "step": 70},
- {"epoch": 0.76, "grad_norm": 0.2760757149384919, "learning_rate": 0.00018594617189587512, "loss": 1.0071, "step": 71},
- {"epoch": 0.77, "grad_norm": 0.2452672521810973, "learning_rate": 0.00018550053929480202, "loss": 1.0711, "step": 72},
- {"epoch": 0.77, "eval_loss": 1.1377497911453247, "eval_runtime": 119.461, "eval_samples_per_second": 8.363, "eval_steps_per_second": 0.527, "step": 72},
- {"epoch": 0.78, "grad_norm": 0.30897216290479246, "learning_rate": 0.0001850485026743328, "loss": 1.0508, "step": 73},
- {"epoch": 0.79, "grad_norm": 0.24165903393157925, "learning_rate": 0.00018459009589216364, "loss": 1.046, "step": 74},
- {"epoch": 0.8, "grad_norm": 0.2509819208307879, "learning_rate": 0.00018412535328311814, "loss": 1.0726, "step": 75},
- {"epoch": 0.81, "grad_norm": 0.26145395006758515, "learning_rate": 0.00018365430965657526, "loss": 0.9998, "step": 76},
- {"epoch": 0.82, "grad_norm": 0.26920709605794424, "learning_rate": 0.00018317700029386245, "loss": 1.065, "step": 77},
- {"epoch": 0.83, "grad_norm": 0.24226754926786417, "learning_rate": 0.0001826934609456129, "loss": 1.0489, "step": 78},
- {"epoch": 0.84, "grad_norm": 0.3022365661006827, "learning_rate": 0.00018220372782908777, "loss": 1.0372, "step": 79},
- {"epoch": 0.85, "grad_norm": 0.25795710005352673, "learning_rate": 0.00018170783762546365, "loss": 1.0128, "step": 80},
- {"epoch": 0.86, "grad_norm": 0.3490748875058354, "learning_rate": 0.00018120582747708502, "loss": 1.0168, "step": 81},
- {"epoch": 0.88, "grad_norm": 0.24938209735120945, "learning_rate": 0.00018069773498468223, "loss": 0.9586, "step": 82},
- {"epoch": 0.89, "grad_norm": 0.2527612545099894, "learning_rate": 0.00018018359820455536, "loss": 1.0385, "step": 83},
- {"epoch": 0.9, "grad_norm": 0.27528879975094916, "learning_rate": 0.0001796634556457236, "loss": 1.0328, "step": 84},
- {"epoch": 0.91, "grad_norm": 0.2605002777661913, "learning_rate": 0.0001791373462670411, "loss": 0.9966, "step": 85},
- {"epoch": 0.92, "grad_norm": 0.3117107796665858, "learning_rate": 0.00017860530947427875, "loss": 0.9772, "step": 86},
- {"epoch": 0.93, "grad_norm": 0.28336227154677734, "learning_rate": 0.0001780673851171728, "loss": 1.0724, "step": 87},
- {"epoch": 0.94, "grad_norm": 0.42707817919652674, "learning_rate": 0.0001775236134864401, "loss": 1.0038, "step": 88},
- {"epoch": 0.95, "grad_norm": 0.29236016959846456, "learning_rate": 0.0001769740353107602, "loss": 1.0083, "step": 89},
- {"epoch": 0.96, "grad_norm": 0.43295063403530637, "learning_rate": 0.00017641869175372493, "loss": 1.022, "step": 90},
- {"epoch": 0.97, "grad_norm": 0.3086663897043129, "learning_rate": 0.00017585762441075503, "loss": 1.0303, "step": 91},
- {"epoch": 0.98, "grad_norm": 0.2783768981163154, "learning_rate": 0.0001752908753059849, "loss": 1.061, "step": 92},
- {"epoch": 0.99, "grad_norm": 0.43168501819843275, "learning_rate": 0.00017471848688911464, "loss": 1.0631, "step": 93},
- {"epoch": 1.0, "grad_norm": 0.25487494913299935, "learning_rate": 0.0001741405020322309, "loss": 0.9858, "step": 94},
- {"epoch": 1.01, "grad_norm": 0.3229761094582219, "learning_rate": 0.00017355696402659548, "loss": 0.9495, "step": 95},
- {"epoch": 1.02, "grad_norm": 0.3178464701266748, "learning_rate": 0.000172967916579403, "loss": 0.9546, "step": 96},
- {"epoch": 1.02, "eval_loss": 1.1303094625473022, "eval_runtime": 119.6761, "eval_samples_per_second": 8.348, "eval_steps_per_second": 0.526, "step": 96},
- {"epoch": 1.04, "grad_norm": 0.2534616980189548, "learning_rate": 0.00017237340381050703, "loss": 0.9509, "step": 97},
- {"epoch": 1.05, "grad_norm": 0.2354382873554396, "learning_rate": 0.00017177347024911562, "loss": 0.9611, "step": 98},
- {"epoch": 1.06, "grad_norm": 0.2754259154521738, "learning_rate": 0.00017116816083045602, "loss": 0.9184, "step": 99},
- {"epoch": 1.07, "grad_norm": 0.25868181129480755, "learning_rate": 0.00017055752089240907, "loss": 0.957, "step": 100},
- {"epoch": 1.08, "grad_norm": 0.2383943586330267, "learning_rate": 0.00016994159617211317, "loss": 0.9638, "step": 101},
- {"epoch": 1.09, "grad_norm": 0.2706420372628291, "learning_rate": 0.0001693204328025389, "loss": 0.9115, "step": 102},
- {"epoch": 1.1, "grad_norm": 0.2751042656041904, "learning_rate": 0.0001686940773090333, "loss": 0.9277, "step": 103},
- {"epoch": 1.11, "grad_norm": 0.27700872737428867, "learning_rate": 0.00016806257660583534, "loss": 0.9248, "step": 104},
- {"epoch": 1.12, "grad_norm": 0.3350046312844708, "learning_rate": 0.00016742597799256182, "loss": 0.928, "step": 105},
- {"epoch": 1.13, "grad_norm": 0.4055944986440079, "learning_rate": 0.00016678432915066488, "loss": 0.9074, "step": 106},
- {"epoch": 1.14, "grad_norm": 0.2515177402600531, "learning_rate": 0.00016613767813986044, "loss": 0.9564, "step": 107},
- {"epoch": 1.15, "grad_norm": 0.2571149695502646, "learning_rate": 0.00016548607339452853, "loss": 0.93, "step": 108},
- {"epoch": 1.16, "grad_norm": 0.38608942941048996, "learning_rate": 0.0001648295637200856, "loss": 0.9281, "step": 109},
- {"epoch": 1.17, "grad_norm": 0.31939838976976676, "learning_rate": 0.000164168198289329, "loss": 0.9914, "step": 110},
- {"epoch": 1.19, "grad_norm": 0.30504937567650897, "learning_rate": 0.00016350202663875386, "loss": 0.9549, "step": 111},
- {"epoch": 1.2, "grad_norm": 0.3320388344291162, "learning_rate": 0.0001628310986648427, "loss": 0.9086, "step": 112},
- {"epoch": 1.21, "grad_norm": 0.27715569151296165, "learning_rate": 0.0001621554646203284, "loss": 0.8537, "step": 113},
- {"epoch": 1.22, "grad_norm": 0.278787508566418, "learning_rate": 0.0001614751751104301, "loss": 0.9354, "step": 114},
- {"epoch": 1.23, "grad_norm": 0.24483614460003267, "learning_rate": 0.00016079028108906282, "loss": 0.8996, "step": 115},
- {"epoch": 1.24, "grad_norm": 0.37520609596400134, "learning_rate": 0.0001601008338550211, "loss": 0.9514, "step": 116},
- {"epoch": 1.25, "grad_norm": 0.2565631505653599, "learning_rate": 0.00015940688504813662, "loss": 0.8984, "step": 117},
- {"epoch": 1.26, "grad_norm": 0.26348552476529935, "learning_rate": 0.00015870848664541044, "loss": 0.8941, "step": 118},
- {"epoch": 1.27, "grad_norm": 0.32431198985496534, "learning_rate": 0.00015800569095711982, "loss": 0.8876, "step": 119},
- {"epoch": 1.28, "grad_norm": 0.29308039763069227, "learning_rate": 0.00015729855062290022, "loss": 0.9309, "step": 120},
- {"epoch": 1.28, "eval_loss": 1.129751205444336, "eval_runtime": 119.1497, "eval_samples_per_second": 8.384, "eval_steps_per_second": 0.529, "step": 120},
- {"epoch": 1.29, "grad_norm": 0.2793291380060977, "learning_rate": 0.0001565871186078025, "loss": 0.9453, "step": 121},
- {"epoch": 1.3, "grad_norm": 0.28873644301555734, "learning_rate": 0.000155871448198326, "loss": 0.9243, "step": 122},
- {"epoch": 1.31, "grad_norm": 0.3086103724578039, "learning_rate": 0.00015515159299842707, "loss": 0.8877, "step": 123},
- {"epoch": 1.32, "grad_norm": 0.30407892484693505, "learning_rate": 0.00015442760692550443, "loss": 0.9448, "step": 124},
- {"epoch": 1.33, "grad_norm": 0.29771602861368474, "learning_rate": 0.00015369954420636048, "loss": 0.889, "step": 125},
- {"epoch": 1.35, "grad_norm": 0.30480490158838136, "learning_rate": 0.00015296745937313987, "loss": 0.9405, "step": 126},
- {"epoch": 1.36, "grad_norm": 0.2949192855418127, "learning_rate": 0.00015223140725924495, "loss": 0.9382, "step": 127},
- {"epoch": 1.37, "grad_norm": 0.2813631863132807, "learning_rate": 0.00015149144299522873, "loss": 0.9526, "step": 128},
- {"epoch": 1.38, "grad_norm": 0.28548924064070513, "learning_rate": 0.00015074762200466556, "loss": 0.9174, "step": 129},
- {"epoch": 1.39, "grad_norm": 0.28137053449960464, "learning_rate": 0.00015000000000000001, "loss": 0.9244, "step": 130},
- {"epoch": 1.4, "grad_norm": 0.2626750895717777, "learning_rate": 0.00014924863297837378, "loss": 0.9335, "step": 131},
- {"epoch": 1.41, "grad_norm": 0.26686502371015536, "learning_rate": 0.00014849357721743168, "loss": 0.8948, "step": 132},
- {"epoch": 1.42, "grad_norm": 0.3332273481179679, "learning_rate": 0.00014773488927110633, "loss": 0.9274, "step": 133},
- {"epoch": 1.43, "grad_norm": 0.2528048763375234, "learning_rate": 0.00014697262596538227, "loss": 0.8731, "step": 134},
- {"epoch": 1.44, "grad_norm": 0.27184211707488076, "learning_rate": 0.00014620684439403962, "loss": 0.9318, "step": 135},
- {"epoch": 1.45, "grad_norm": 0.3051111137538683, "learning_rate": 0.0001454376019143779, "loss": 0.9447, "step": 136},
- {"epoch": 1.46, "grad_norm": 0.28771401659835155, "learning_rate": 0.00014466495614291977, "loss": 0.9343, "step": 137},
- {"epoch": 1.47, "grad_norm": 0.28995797921621524, "learning_rate": 0.0001438889649510956, "loss": 0.8978, "step": 138},
- {"epoch": 1.48, "grad_norm": 0.2749930548874636, "learning_rate": 0.00014310968646090883, "loss": 0.924, "step": 139},
- {"epoch": 1.49, "grad_norm": 0.3097189537380989, "learning_rate": 0.0001423271790405828, "loss": 0.9574, "step": 140},
- {"epoch": 1.51, "grad_norm": 0.2449218990319832, "learning_rate": 0.00014154150130018866, "loss": 0.8475, "step": 141},
- {"epoch": 1.52, "grad_norm": 0.24856388098419674, "learning_rate": 0.0001407527120872557, "loss": 0.9381, "step": 142},
- {"epoch": 1.53, "grad_norm": 0.3169861882853132, "learning_rate": 0.00013996087048236358, "loss": 0.9141, "step": 143},
- {"epoch": 1.54, "grad_norm": 0.30689184261103974, "learning_rate": 0.00013916603579471705, "loss": 0.9588, "step": 144},
- {"epoch": 1.54, "eval_loss": 1.1242448091506958, "eval_runtime": 119.0725, "eval_samples_per_second": 8.39, "eval_steps_per_second": 0.529, "step": 144},
- {"epoch": 1.55, "grad_norm": 0.2961514212977567, "learning_rate": 0.00013836826755770384, "loss": 0.9371, "step": 145},
- {"epoch": 1.56, "grad_norm": 0.30790856503439346, "learning_rate": 0.00013756762552443553, "loss": 0.9612, "step": 146},
- {"epoch": 1.57, "grad_norm": 0.3517398492864053, "learning_rate": 0.000136764169663272, "loss": 0.9253, "step": 147},
- {"epoch": 1.58, "grad_norm": 0.26375798832515857, "learning_rate": 0.00013595796015332984, "loss": 0.8977, "step": 148},
- {"epoch": 1.59, "grad_norm": 0.274348892672977, "learning_rate": 0.00013514905737997473, "loss": 0.8817, "step": 149},
- {"epoch": 1.6, "grad_norm": 0.35917564750751624, "learning_rate": 0.00013433752193029886, "loss": 0.886, "step": 150},
- {"epoch": 1.61, "grad_norm": 0.38175124377914293, "learning_rate": 0.00013352341458858265, "loss": 0.8576, "step": 151},
- {"epoch": 1.62, "grad_norm": 0.249633953215678, "learning_rate": 0.00013270679633174218, "loss": 1.0066, "step": 152},
- {"epoch": 1.63, "grad_norm": 0.33494494430574784, "learning_rate": 0.00013188772832476188, "loss": 0.884, "step": 153},
- {"epoch": 1.64, "grad_norm": 0.4176467296744032, "learning_rate": 0.00013106627191611332, "loss": 0.9041, "step": 154},
- {"epoch": 1.65, "grad_norm": 0.27051479454532207, "learning_rate": 0.00013024248863316012, "loss": 0.8764, "step": 155},
- {"epoch": 1.67, "grad_norm": 0.29302599029848847, "learning_rate": 0.00012941644017754964, "loss": 0.9786, "step": 156},
- {"epoch": 1.68, "grad_norm": 0.3127378512248151, "learning_rate": 0.00012858818842059145, "loss": 0.9176, "step": 157},
- {"epoch": 1.69, "grad_norm": 0.40647077063662906, "learning_rate": 0.00012775779539862304, "loss": 0.9387, "step": 158},
- {"epoch": 1.7, "grad_norm": 0.29290601694481777, "learning_rate": 0.00012692532330836346, "loss": 0.9192, "step": 159},
- {"epoch": 1.71, "grad_norm": 0.2819168741245354, "learning_rate": 0.0001260908345022547, "loss": 0.9253, "step": 160},
- {"epoch": 1.72, "grad_norm": 0.3772714091394927, "learning_rate": 0.00012525439148379128, "loss": 0.9264, "step": 161},
- {"epoch": 1.73, "grad_norm": 0.29399851067321503, "learning_rate": 0.00012441605690283915, "loss": 0.9357, "step": 162},
- {"epoch": 1.74, "grad_norm": 0.2623180246832513, "learning_rate": 0.00012357589355094275, "loss": 0.8516, "step": 163},
- {"epoch": 1.75, "grad_norm": 0.27796942024085824, "learning_rate": 0.00012273396435662212, "loss": 0.9328, "step": 164},
- {"epoch": 1.76, "grad_norm": 0.3107670297529076, "learning_rate": 0.0001218903323806595, "loss": 0.8769, "step": 165},
- {"epoch": 1.77, "grad_norm": 0.2865573350738354, "learning_rate": 0.00012104506081137608, "loss": 0.9015, "step": 166},
- {"epoch": 1.78, "grad_norm": 0.30595087117636693, "learning_rate": 0.00012019821295989912, "loss": 0.94, "step": 167},
- {"epoch": 1.79, "grad_norm": 0.32540365653257874, "learning_rate": 0.00011934985225541998, "loss": 0.8553, "step": 168},
- {"epoch": 1.79, "eval_loss": 1.1259374618530273, "eval_runtime": 119.4351, "eval_samples_per_second": 8.364, "eval_steps_per_second": 0.527, "step": 168},
- {"epoch": 1.8, "grad_norm": 0.3058868303314457, "learning_rate": 0.00011850004224044315, "loss": 0.9074, "step": 169},
- {"epoch": 1.81, "grad_norm": 0.33266760488242775, "learning_rate": 0.0001176488465660271, "loss": 0.8799, "step": 170},
- {"epoch": 1.83, "grad_norm": 0.3101183375673487, "learning_rate": 0.00011679632898701649, "loss": 0.9004, "step": 171},
- {"epoch": 1.84, "grad_norm": 0.31535579418195775, "learning_rate": 0.00011594255335726724, "loss": 0.9238, "step": 172},
- {"epoch": 1.85, "grad_norm": 0.28341827112854334, "learning_rate": 0.00011508758362486358, "loss": 0.9138, "step": 173},
- {"epoch": 1.86, "grad_norm": 0.25699888796695625, "learning_rate": 0.00011423148382732853, "loss": 0.9175, "step": 174},
- {"epoch": 1.87, "grad_norm": 0.29504332662698246, "learning_rate": 0.0001133743180868273, "loss": 0.9023, "step": 175},
- {"epoch": 1.88, "grad_norm": 0.2993175263873948, "learning_rate": 0.0001125161506053646, "loss": 0.8893, "step": 176},
- {"epoch": 1.89, "grad_norm": 0.2762659379409218, "learning_rate": 0.00011165704565997593, "loss": 0.9071, "step": 177},
- {"epoch": 1.9, "grad_norm": 0.23620994229530515, "learning_rate": 0.00011079706759791311, "loss": 0.8796, "step": 178},
- {"epoch": 1.91, "grad_norm": 0.28317619721877, "learning_rate": 0.00010993628083182467, "loss": 0.8983, "step": 179},
- {"epoch": 1.92, "grad_norm": 0.3252854551640304, "learning_rate": 0.00010907474983493144, "loss": 0.8947, "step": 180},
- {"epoch": 1.93, "grad_norm": 0.2579136274422669, "learning_rate": 0.00010821253913619726, "loss": 0.8726, "step": 181},
- {"epoch": 1.94, "grad_norm": 0.27201912720918364, "learning_rate": 0.00010734971331549603, "loss": 0.891, "step": 182},
- {"epoch": 1.95, "grad_norm": 0.41257277193589503, "learning_rate": 0.0001064863369987743, "loss": 0.9188, "step": 183},
- {"epoch": 1.96, "grad_norm": 0.264920112831242, "learning_rate": 0.00010562247485321115, "loss": 0.8761, "step": 184},
- {"epoch": 1.98, "grad_norm": 0.28166441056422037, "learning_rate": 0.00010475819158237425, "loss": 0.8805, "step": 185},
- {"epoch": 1.99, "grad_norm": 0.2818961139392159, "learning_rate": 0.00010389355192137377, "loss": 0.8934, "step": 186},
- {"epoch": 2.0, "grad_norm": 0.27424787600345923, "learning_rate": 0.00010302862063201367, "loss": 0.9237, "step": 187},
- {"epoch": 2.01, "grad_norm": 0.25570082666079225, "learning_rate": 0.00010216346249794087, "loss": 0.8656, "step": 188},
- {"epoch": 2.02, "grad_norm": 0.2712359904481713, "learning_rate": 0.0001012981423197931, "loss": 0.7627, "step": 189},
- {"epoch": 2.03, "grad_norm": 0.25054404547068676, "learning_rate": 0.00010043272491034523, "loss": 0.8142, "step": 190},
- {"epoch": 2.04, "grad_norm": 0.28520868420260026, "learning_rate": 9.956727508965481e-05, "loss": 0.7953, "step": 191},
- {"epoch": 2.05, "grad_norm": 0.29413880984694873, "learning_rate": 9.870185768020693e-05, "loss": 0.8231, "step": 192},
- {"epoch": 2.05, "eval_loss": 1.144862413406372, "eval_runtime": 119.3004, "eval_samples_per_second": 8.374, "eval_steps_per_second": 0.528, "step": 192},
- {"epoch": 2.06, "grad_norm": 0.28378300985247035, "learning_rate": 9.783653750205915e-05, "loss": 0.7478, "step": 193},
- {"epoch": 2.07, "grad_norm": 0.31792721348179676, "learning_rate": 9.697137936798634e-05, "loss": 0.7961, "step": 194},
- {"epoch": 2.08, "grad_norm": 0.3291666436295964, "learning_rate": 9.610644807862625e-05, "loss": 0.7434, "step": 195},
- {"epoch": 2.09, "grad_norm": 0.301579259001567, "learning_rate": 9.524180841762577e-05, "loss": 0.7779, "step": 196},
- {"epoch": 2.1, "grad_norm": 0.30252161240414444, "learning_rate": 9.437752514678887e-05, "loss": 0.7689, "step": 197},
- {"epoch": 2.11, "grad_norm": 0.3350657085129171, "learning_rate": 9.35136630012257e-05, "loss": 0.7574, "step": 198},
- {"epoch": 2.12, "grad_norm": 0.3053109929956358, "learning_rate": 9.265028668450402e-05, "loss": 0.7729, "step": 199},
- {"epoch": 2.14, "grad_norm": 0.30367223609567207, "learning_rate": 9.178746086380275e-05, "loss": 0.8111, "step": 200},
- {"epoch": 2.15, "grad_norm": 0.3366440949136126, "learning_rate": 9.092525016506858e-05, "loss": 0.7986, "step": 201},
- {"epoch": 2.16, "grad_norm": 0.3228036608413652, "learning_rate": 9.006371916817534e-05, "loss": 0.8382, "step": 202},
- {"epoch": 2.17, "grad_norm": 0.2919040789403488, "learning_rate": 8.920293240208694e-05, "loss": 0.7696, "step": 203},
- {"epoch": 2.18, "grad_norm": 0.30084198177583166, "learning_rate": 8.83429543400241e-05, "loss": 0.7671, "step": 204},
- {"epoch": 2.19, "grad_norm": 0.33931609000743107, "learning_rate": 8.748384939463543e-05, "loss": 0.7553, "step": 205},
- {"epoch": 2.2, "grad_norm": 0.30413284924824485, "learning_rate": 8.662568191317273e-05, "loss": 0.7324, "step": 206},
- {"epoch": 2.21, "grad_norm": 0.3014038998090481, "learning_rate": 8.57685161726715e-05, "loss": 0.7567, "step": 207},
- {"epoch": 2.22, "grad_norm": 0.3176466329519527, "learning_rate": 8.491241637513644e-05, "loss": 0.8222, "step": 208},
- {"epoch": 2.23, "grad_norm": 0.29981213041628285, "learning_rate": 8.405744664273278e-05, "loss": 0.7077, "step": 209},
- {"epoch": 2.24, "grad_norm": 0.2937916452228122, "learning_rate": 8.320367101298351e-05, "loss": 0.7231, "step": 210},
- {"epoch": 2.25, "grad_norm": 0.32040684171320816, "learning_rate": 8.235115343397295e-05, "loss": 0.7556, "step": 211},
- {"epoch": 2.26, "grad_norm": 0.31083028085316033, "learning_rate": 8.149995775955686e-05, "loss": 0.7514, "step": 212},
- {"epoch": 2.27, "grad_norm": 0.3215465383581194, "learning_rate": 8.065014774458003e-05, "loss": 0.7933, "step": 213},
- {"epoch": 2.28, "grad_norm": 0.3081200259196015, "learning_rate": 7.980178704010089e-05, "loss": 0.8062, "step": 214},
- {"epoch": 2.3, "grad_norm": 0.3333248296288759, "learning_rate": 7.895493918862396e-05, "loss": 0.7784, "step": 215},
- {"epoch": 2.31, "grad_norm": 0.3301326097292383, "learning_rate": 7.810966761934053e-05, "loss": 0.8154, "step": 216},
- {"epoch": 2.31, "eval_loss": 1.1513652801513672, "eval_runtime": 119.4371, "eval_samples_per_second": 8.364, "eval_steps_per_second": 0.527, "step": 216},
- {"epoch": 2.32, "grad_norm": 0.3166760836422428, "learning_rate": 7.726603564337791e-05, "loss": 0.7486, "step": 217},
- {"epoch": 2.33, "grad_norm": 0.31309757318131876, "learning_rate": 7.642410644905726e-05, "loss": 0.771, "step": 218},
- {"epoch": 2.34, "grad_norm": 0.36968796131043985, "learning_rate": 7.558394309716088e-05, "loss": 0.8051, "step": 219},
- {"epoch": 2.35, "grad_norm": 0.27537675917328025, "learning_rate": 7.474560851620873e-05, "loss": 0.7536, "step": 220},
- {"epoch": 2.36, "grad_norm": 0.2878011945022053,
1634
- "learning_rate": 7.390916549774536e-05,
1635
- "loss": 0.8126,
1636
- "step": 221
1637
- },
1638
- {
1639
- "epoch": 2.37,
1640
- "grad_norm": 0.3172405217395398,
1641
- "learning_rate": 7.307467669163655e-05,
1642
- "loss": 0.8156,
1643
- "step": 222
1644
- },
1645
- {
1646
- "epoch": 2.38,
1647
- "grad_norm": 0.3183651086957915,
1648
- "learning_rate": 7.224220460137701e-05,
1649
- "loss": 0.7821,
1650
- "step": 223
1651
- },
1652
- {
1653
- "epoch": 2.39,
1654
- "grad_norm": 0.3318078467573977,
1655
- "learning_rate": 7.141181157940859e-05,
1656
- "loss": 0.7993,
1657
- "step": 224
1658
- },
1659
- {
1660
- "epoch": 2.4,
1661
- "grad_norm": 0.28446170407344085,
1662
- "learning_rate": 7.058355982245037e-05,
1663
- "loss": 0.7987,
1664
- "step": 225
1665
- },
1666
- {
1667
- "epoch": 2.41,
1668
- "grad_norm": 0.33568352702219995,
1669
- "learning_rate": 6.97575113668399e-05,
1670
- "loss": 0.773,
1671
- "step": 226
1672
- },
1673
- {
1674
- "epoch": 2.42,
1675
- "grad_norm": 0.30820575901544944,
1676
- "learning_rate": 6.893372808388675e-05,
1677
- "loss": 0.813,
1678
- "step": 227
1679
- },
1680
- {
1681
- "epoch": 2.43,
1682
- "grad_norm": 0.3121364386024255,
1683
- "learning_rate": 6.811227167523815e-05,
1684
- "loss": 0.7716,
1685
- "step": 228
1686
- },
1687
- {
1688
- "epoch": 2.44,
1689
- "grad_norm": 0.3211455560922844,
1690
- "learning_rate": 6.729320366825784e-05,
1691
- "loss": 0.7577,
1692
- "step": 229
1693
- },
1694
- {
1695
- "epoch": 2.46,
1696
- "grad_norm": 0.3315601260165869,
1697
- "learning_rate": 6.647658541141735e-05,
1698
- "loss": 0.779,
1699
- "step": 230
1700
- },
1701
- {
1702
- "epoch": 2.47,
1703
- "grad_norm": 0.35482236759964675,
1704
- "learning_rate": 6.566247806970119e-05,
1705
- "loss": 0.7936,
1706
- "step": 231
1707
- },
1708
- {
1709
- "epoch": 2.48,
1710
- "grad_norm": 0.3318703205331905,
1711
- "learning_rate": 6.485094262002529e-05,
1712
- "loss": 0.7721,
1713
- "step": 232
1714
- },
1715
- {
1716
- "epoch": 2.49,
1717
- "grad_norm": 0.313412585518615,
1718
- "learning_rate": 6.404203984667019e-05,
1719
- "loss": 0.7333,
1720
- "step": 233
1721
- },
1722
- {
1723
- "epoch": 2.5,
1724
- "grad_norm": 0.3389693444254627,
1725
- "learning_rate": 6.323583033672799e-05,
1726
- "loss": 0.6991,
1727
- "step": 234
1728
- },
1729
- {
1730
- "epoch": 2.51,
1731
- "grad_norm": 0.33056782619334757,
1732
- "learning_rate": 6.243237447556449e-05,
1733
- "loss": 0.7872,
1734
- "step": 235
1735
- },
1736
- {
1737
- "epoch": 2.52,
1738
- "grad_norm": 0.3064085209522584,
1739
- "learning_rate": 6.163173244229619e-05,
1740
- "loss": 0.7713,
1741
- "step": 236
1742
- },
1743
- {
1744
- "epoch": 2.53,
1745
- "grad_norm": 0.3109445125421656,
1746
- "learning_rate": 6.083396420528298e-05,
1747
- "loss": 0.8228,
1748
- "step": 237
1749
- },
1750
- {
1751
- "epoch": 2.54,
1752
- "grad_norm": 0.35767207742703394,
1753
- "learning_rate": 6.0039129517636435e-05,
1754
- "loss": 0.8167,
1755
- "step": 238
1756
- },
1757
- {
1758
- "epoch": 2.55,
1759
- "grad_norm": 0.32869196909020376,
1760
- "learning_rate": 5.924728791274432e-05,
1761
- "loss": 0.7893,
1762
- "step": 239
1763
- },
1764
- {
1765
- "epoch": 2.56,
1766
- "grad_norm": 0.31178216743238674,
1767
- "learning_rate": 5.845849869981137e-05,
1768
- "loss": 0.7354,
1769
- "step": 240
1770
- },
1771
- {
1772
- "epoch": 2.56,
1773
- "eval_loss": 1.1470853090286255,
1774
- "eval_runtime": 119.0749,
1775
- "eval_samples_per_second": 8.39,
1776
- "eval_steps_per_second": 0.529,
1777
- "step": 240
1778
- },
1779
- {
1780
- "epoch": 2.57,
1781
- "grad_norm": 0.3146586486940167,
1782
- "learning_rate": 5.7672820959417254e-05,
1783
- "loss": 0.785,
1784
- "step": 241
1785
- },
1786
- {
1787
- "epoch": 2.58,
1788
- "grad_norm": 0.3309473634570162,
1789
- "learning_rate": 5.68903135390912e-05,
1790
- "loss": 0.7007,
1791
- "step": 242
1792
- },
1793
- {
1794
- "epoch": 2.59,
1795
- "grad_norm": 0.2927704203363025,
1796
- "learning_rate": 5.611103504890444e-05,
1797
- "loss": 0.778,
1798
- "step": 243
1799
- },
1800
- {
1801
- "epoch": 2.6,
1802
- "grad_norm": 0.31346541530480915,
1803
- "learning_rate": 5.533504385708024e-05,
1804
- "loss": 0.7272,
1805
- "step": 244
1806
- },
1807
- {
1808
- "epoch": 2.62,
1809
- "grad_norm": 0.2996345434845278,
1810
- "learning_rate": 5.456239808562209e-05,
1811
- "loss": 0.8091,
1812
- "step": 245
1813
- },
1814
- {
1815
- "epoch": 2.63,
1816
- "grad_norm": 0.29407937930772826,
1817
- "learning_rate": 5.379315560596038e-05,
1818
- "loss": 0.7666,
1819
- "step": 246
1820
- },
1821
- {
1822
- "epoch": 2.64,
1823
- "grad_norm": 0.30530254935425627,
1824
- "learning_rate": 5.3027374034617785e-05,
1825
- "loss": 0.7982,
1826
- "step": 247
1827
- },
1828
- {
1829
- "epoch": 2.65,
1830
- "grad_norm": 0.3298149075133802,
1831
- "learning_rate": 5.226511072889371e-05,
1832
- "loss": 0.7962,
1833
- "step": 248
1834
- },
1835
- {
1836
- "epoch": 2.66,
1837
- "grad_norm": 0.33155001378615223,
1838
- "learning_rate": 5.1506422782568345e-05,
1839
- "loss": 0.8087,
1840
- "step": 249
1841
- },
1842
- {
1843
- "epoch": 2.67,
1844
- "grad_norm": 0.32891369446509405,
1845
- "learning_rate": 5.0751367021626215e-05,
1846
- "loss": 0.7702,
1847
- "step": 250
1848
- },
1849
- {
1850
- "epoch": 2.68,
1851
- "grad_norm": 0.3042328939887202,
1852
- "learning_rate": 5.000000000000002e-05,
1853
- "loss": 0.7924,
1854
- "step": 251
1855
- },
1856
- {
1857
- "epoch": 2.69,
1858
- "grad_norm": 0.3037799376581133,
1859
- "learning_rate": 4.9252377995334444e-05,
1860
- "loss": 0.7852,
1861
- "step": 252
1862
- },
1863
- {
1864
- "epoch": 2.7,
1865
- "grad_norm": 0.3435430445603929,
1866
- "learning_rate": 4.85085570047713e-05,
1867
- "loss": 0.7501,
1868
- "step": 253
1869
- },
1870
- {
1871
- "epoch": 2.71,
1872
- "grad_norm": 0.3072160193979946,
1873
- "learning_rate": 4.776859274075506e-05,
1874
- "loss": 0.7462,
1875
- "step": 254
1876
- },
1877
- {
1878
- "epoch": 2.72,
1879
- "grad_norm": 0.3223586439500028,
1880
- "learning_rate": 4.703254062686017e-05,
1881
- "loss": 0.775,
1882
- "step": 255
1883
- },
1884
- {
1885
- "epoch": 2.73,
1886
- "grad_norm": 0.3270406403084203,
1887
- "learning_rate": 4.630045579363957e-05,
1888
- "loss": 0.8306,
1889
- "step": 256
1890
- },
1891
- {
1892
- "epoch": 2.74,
1893
- "grad_norm": 0.3360192842512657,
1894
- "learning_rate": 4.557239307449561e-05,
1895
- "loss": 0.7697,
1896
- "step": 257
1897
- },
1898
- {
1899
- "epoch": 2.75,
1900
- "grad_norm": 0.34282816479900324,
1901
- "learning_rate": 4.484840700157295e-05,
1902
- "loss": 0.7654,
1903
- "step": 258
1904
- },
1905
- {
1906
- "epoch": 2.77,
1907
- "grad_norm": 0.30039142762313786,
1908
- "learning_rate": 4.412855180167406e-05,
1909
- "loss": 0.7703,
1910
- "step": 259
1911
- },
1912
- {
1913
- "epoch": 2.78,
1914
- "grad_norm": 0.34307884673711425,
1915
- "learning_rate": 4.3412881392197526e-05,
1916
- "loss": 0.7993,
1917
- "step": 260
1918
- },
1919
- {
1920
- "epoch": 2.79,
1921
- "grad_norm": 0.33685538845268104,
1922
- "learning_rate": 4.270144937709981e-05,
1923
- "loss": 0.7866,
1924
- "step": 261
1925
- },
1926
- {
1927
- "epoch": 2.8,
1928
- "grad_norm": 0.33166767859224683,
1929
- "learning_rate": 4.19943090428802e-05,
1930
- "loss": 0.8083,
1931
- "step": 262
1932
- },
1933
- {
1934
- "epoch": 2.81,
1935
- "grad_norm": 0.3086370003245581,
1936
- "learning_rate": 4.129151335458957e-05,
1937
- "loss": 0.7938,
1938
- "step": 263
1939
- },
1940
- {
1941
- "epoch": 2.82,
1942
- "grad_norm": 0.3715649674817313,
1943
- "learning_rate": 4.059311495186338e-05,
1944
- "loss": 0.7577,
1945
- "step": 264
1946
- },
1947
- {
1948
- "epoch": 2.82,
1949
- "eval_loss": 1.1478512287139893,
1950
- "eval_runtime": 119.1178,
1951
- "eval_samples_per_second": 8.387,
1952
- "eval_steps_per_second": 0.529,
1953
- "step": 264
1954
- },
1955
- {
1956
- "epoch": 2.83,
1957
- "grad_norm": 0.3298033298390841,
1958
- "learning_rate": 3.9899166144978904e-05,
1959
- "loss": 0.8296,
1960
- "step": 265
1961
- },
1962
- {
1963
- "epoch": 2.84,
1964
- "grad_norm": 0.3294808666769515,
1965
- "learning_rate": 3.920971891093718e-05,
1966
- "loss": 0.8206,
1967
- "step": 266
1968
- },
1969
- {
1970
- "epoch": 2.85,
1971
- "grad_norm": 0.3239672501165848,
1972
- "learning_rate": 3.852482488956992e-05,
1973
- "loss": 0.8116,
1974
- "step": 267
1975
- },
1976
- {
1977
- "epoch": 2.86,
1978
- "grad_norm": 0.3286742994048133,
1979
- "learning_rate": 3.784453537967161e-05,
1980
- "loss": 0.8096,
1981
- "step": 268
1982
- },
1983
- {
1984
- "epoch": 2.87,
1985
- "grad_norm": 0.31259050250842946,
1986
- "learning_rate": 3.7168901335157315e-05,
1987
- "loss": 0.7669,
1988
- "step": 269
1989
- },
1990
- {
1991
- "epoch": 2.88,
1992
- "grad_norm": 0.3308991711135206,
1993
- "learning_rate": 3.649797336124615e-05,
1994
- "loss": 0.8041,
1995
- "step": 270
1996
- },
1997
- {
1998
- "epoch": 2.89,
1999
- "grad_norm": 0.32757727002633424,
2000
- "learning_rate": 3.583180171067101e-05,
2001
- "loss": 0.7673,
2002
- "step": 271
2003
- },
2004
- {
2005
- "epoch": 2.9,
2006
- "grad_norm": 0.3342551756453125,
2007
- "learning_rate": 3.517043627991441e-05,
2008
- "loss": 0.8005,
2009
- "step": 272
2010
- },
2011
- {
2012
- "epoch": 2.91,
2013
- "grad_norm": 0.31643754309861705,
2014
- "learning_rate": 3.45139266054715e-05,
2015
- "loss": 0.787,
2016
- "step": 273
2017
- },
2018
- {
2019
- "epoch": 2.93,
2020
- "grad_norm": 0.3140452683879005,
2021
- "learning_rate": 3.3862321860139576e-05,
2022
- "loss": 0.7888,
2023
- "step": 274
2024
- },
2025
- {
2026
- "epoch": 2.94,
2027
- "grad_norm": 0.30706221155036223,
2028
- "learning_rate": 3.3215670849335155e-05,
2029
- "loss": 0.827,
2030
- "step": 275
2031
- },
2032
- {
2033
- "epoch": 2.95,
2034
- "grad_norm": 0.3185483102727301,
2035
- "learning_rate": 3.257402200743821e-05,
2036
- "loss": 0.7779,
2037
- "step": 276
2038
- },
2039
- {
2040
- "epoch": 2.96,
2041
- "grad_norm": 0.3032818796307545,
2042
- "learning_rate": 3.19374233941647e-05,
2043
- "loss": 0.7993,
2044
- "step": 277
2045
- },
2046
- {
2047
- "epoch": 2.97,
2048
- "grad_norm": 0.3057758504695884,
2049
- "learning_rate": 3.130592269096671e-05,
2050
- "loss": 0.768,
2051
- "step": 278
2052
- },
2053
- {
2054
- "epoch": 2.98,
2055
- "grad_norm": 0.3245404038219604,
2056
- "learning_rate": 3.0679567197461134e-05,
2057
- "loss": 0.7706,
2058
- "step": 279
2059
- }
2060
- ],
2061
- "logging_steps": 1,
2062
- "max_steps": 372,
2063
- "num_input_tokens_seen": 0,
2064
- "num_train_epochs": 4,
2065
- "save_steps": 93,
2066
- "total_flos": 5.168039211319689e+18,
2067
- "train_batch_size": 4,
2068
- "trial_name": null,
2069
- "trial_params": null
2070
- }
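
The tail of this deleted `trainer_state.json` is the standard `transformers` `Trainer` log: one `log_history` entry per optimizer step carrying `loss`, `grad_norm`, and `learning_rate`, plus periodic entries carrying `eval_loss`. Notably, the recorded `eval_loss` stays flat around 1.14-1.15 from step 192 onward while the train loss keeps falling. A minimal sketch for recovering both curves from such a file before a checkpoint is deleted; the path is an assumption, and any surviving `trainer_state.json` would do:

```python
import json

# Hypothetical path; substitute any checkpoint's trainer_state.json.
with open("checkpoint-279/trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print("final train loss:", train[-1])
print("eval losses:", evals)
```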
checkpoint-279/training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c56515a18cd914d4eee44c09952d3a756ea623b0b6e69e8dfaeb0dbc7b665f46
- size 6776
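
The three deleted lines above are a Git LFS pointer, not the binary itself: the repository tracked only the sha256 and size (6776 bytes) while the payload lived in LFS storage. `training_args.bin` is the serialized `TrainingArguments` object that `Trainer` writes next to each checkpoint; a hedged sketch of inspecting it, assuming the binary has been materialized with `git lfs pull` and a compatible `transformers` version is installed:

```python
import torch

# Unpickling needs transformers importable; torch >= 2.6 also needs weights_only=False.
args = torch.load("checkpoint-279/training_args.bin", weights_only=False)
print(args.learning_rate, args.num_train_epochs)
```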
checkpoint-279/zero_to_fp32.py DELETED
@@ -1,592 +0,0 @@
- #!/usr/bin/env python
-
- # Copyright (c) Microsoft Corporation.
- # SPDX-License-Identifier: Apache-2.0
-
- # DeepSpeed Team
-
- # This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
- # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
- # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
- # application.
- #
- # example: python zero_to_fp32.py . pytorch_model.bin
-
- import argparse
- import torch
- import glob
- import math
- import os
- import re
- from collections import OrderedDict
- from dataclasses import dataclass
-
- # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
- # DeepSpeed data structures it has to be available in the current python environment.
- from deepspeed.utils import logger
- from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
-                                             FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
-                                             FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
-
-
- @dataclass
- class zero_model_state:
-     buffers: dict()
-     param_shapes: dict()
-     shared_params: list
-     ds_version: int
-     frozen_param_shapes: dict()
-     frozen_param_fragments: dict()
-
-
- debug = 0
-
- # load to cpu
- device = torch.device('cpu')
-
-
- def atoi(text):
-     return int(text) if text.isdigit() else text
-
-
- def natural_keys(text):
-     '''
-     alist.sort(key=natural_keys) sorts in human order
-     http://nedbatchelder.com/blog/200712/human_sorting.html
-     (See Toothy's implementation in the comments)
-     '''
-     return [atoi(c) for c in re.split(r'(\d+)', text)]
-
-
- def get_model_state_file(checkpoint_dir, zero_stage):
-     if not os.path.isdir(checkpoint_dir):
-         raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
-
-     # there should be only one file
-     if zero_stage <= 2:
-         file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
-     elif zero_stage == 3:
-         file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
-
-     if not os.path.exists(file):
-         raise FileNotFoundError(f"can't find model states file at '{file}'")
-
-     return file
-
-
- def get_checkpoint_files(checkpoint_dir, glob_pattern):
-     # XXX: need to test that this simple glob rule works for multi-node setup too
-     ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
-
-     if len(ckpt_files) == 0:
-         raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
-
-     return ckpt_files
-
-
- def get_optim_files(checkpoint_dir):
-     return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
-
-
- def get_model_state_files(checkpoint_dir):
-     return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
-
-
- def parse_model_states(files):
-     zero_model_states = []
-     for file in files:
-         state_dict = torch.load(file, map_location=device)
-
-         if BUFFER_NAMES not in state_dict:
-             raise ValueError(f"{file} is not a model state checkpoint")
-         buffer_names = state_dict[BUFFER_NAMES]
-         if debug:
-             print("Found buffers:", buffer_names)
-
-         # recover just the buffers while restoring them to fp32 if they were saved in fp16
-         buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
-         param_shapes = state_dict[PARAM_SHAPES]
-
-         # collect parameters that are included in param_shapes
-         param_names = []
-         for s in param_shapes:
-             for name in s.keys():
-                 param_names.append(name)
-
-         # update with frozen parameters
-         frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
-         if frozen_param_shapes is not None:
-             if debug:
-                 print(f"Found frozen_param_shapes: {frozen_param_shapes}")
-             param_names += list(frozen_param_shapes.keys())
-
-         # handle shared params
-         shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
-
-         ds_version = state_dict.get(DS_VERSION, None)
-
-         frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
-
-         z_model_state = zero_model_state(buffers=buffers,
-                                          param_shapes=param_shapes,
-                                          shared_params=shared_params,
-                                          ds_version=ds_version,
-                                          frozen_param_shapes=frozen_param_shapes,
-                                          frozen_param_fragments=frozen_param_fragments)
-         zero_model_states.append(z_model_state)
-
-     return zero_model_states
-
-
- def parse_optim_states(files, ds_checkpoint_dir):
-
-     total_files = len(files)
-     state_dicts = []
-     for f in files:
-         state_dict = torch.load(f, map_location=device)
-         # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
-         # and also handle the case where it was already removed by another helper script
-         state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
-         state_dicts.append(state_dict)
-
-     if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
-         raise ValueError(f"{files[0]} is not a zero checkpoint")
-     zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
-     world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
-
-     # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
-     # parameters can be different from data parallelism for non-expert parameters. So we can just
-     # use the max of the partition_count to get the dp world_size.
-
-     if type(world_size) is list:
-         world_size = max(world_size)
-
-     if world_size != total_files:
-         raise ValueError(
-             f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
-             "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
-         )
-
-     # the groups are named differently in each stage
-     if zero_stage <= 2:
-         fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
-     elif zero_stage == 3:
-         fp32_groups_key = FP32_FLAT_GROUPS
-     else:
-         raise ValueError(f"unknown zero stage {zero_stage}")
-
-     if zero_stage <= 2:
-         fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
-     elif zero_stage == 3:
-         # if there is more than one param group, there will be multiple flattened tensors - one
-         # flattened tensor per group - for simplicity merge them into a single tensor
-         #
-         # XXX: could make the script more memory efficient for when there are multiple groups - it
-         # will require matching the sub-lists of param_shapes for each param group flattened tensor
-
-         fp32_flat_groups = [
-             torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
-         ]
-
-     return zero_stage, world_size, fp32_flat_groups
-
-
- def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
-     """
-     Returns fp32 state_dict reconstructed from ds checkpoint
-
-     Args:
-         - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
-
-     """
-     print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
-
-     optim_files = get_optim_files(ds_checkpoint_dir)
-     zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
-     print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
-
-     model_files = get_model_state_files(ds_checkpoint_dir)
-
-     zero_model_states = parse_model_states(model_files)
-     print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
-
-     if zero_stage <= 2:
-         return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
-     elif zero_stage == 3:
-         return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
-
-
- def _zero2_merge_frozen_params(state_dict, zero_model_states):
-     if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
-         return
-
-     frozen_param_shapes = zero_model_states[0].frozen_param_shapes
-     frozen_param_fragments = zero_model_states[0].frozen_param_fragments
-
-     if debug:
-         num_elem = sum(s.numel() for s in frozen_param_shapes.values())
-         print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
-
-     wanted_params = len(frozen_param_shapes)
-     wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
-     avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
-     print(f'Frozen params: Have {avail_numel} numels to process.')
-     print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
-
-     total_params = 0
-     total_numel = 0
-     for name, shape in frozen_param_shapes.items():
-         total_params += 1
-         unpartitioned_numel = shape.numel()
-         total_numel += unpartitioned_numel
-
-         state_dict[name] = frozen_param_fragments[name]
-
-         if debug:
-             print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
-
-     print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
-
-
- def _has_callable(obj, fn):
-     attr = getattr(obj, fn, None)
-     return callable(attr)
-
-
- def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
-     param_shapes = zero_model_states[0].param_shapes
-
-     # Reconstruction protocol:
-     #
-     # XXX: document this
-
-     if debug:
-         for i in range(world_size):
-             for j in range(len(fp32_flat_groups[0])):
-                 print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
-
-     # XXX: memory usage doubles here (zero2)
-     num_param_groups = len(fp32_flat_groups[0])
-     merged_single_partition_of_fp32_groups = []
-     for i in range(num_param_groups):
-         merged_partitions = [sd[i] for sd in fp32_flat_groups]
-         full_single_fp32_vector = torch.cat(merged_partitions, 0)
-         merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
-     avail_numel = sum(
-         [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
-
-     if debug:
-         wanted_params = sum([len(shapes) for shapes in param_shapes])
-         wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
-         # not asserting if there is a mismatch due to possible padding
-         print(f"Have {avail_numel} numels to process.")
-         print(f"Need {wanted_numel} numels in {wanted_params} params.")
-
-     # params
-     # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
-     # out-of-core computing solution
-     total_numel = 0
-     total_params = 0
-     for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
-         offset = 0
-         avail_numel = full_single_fp32_vector.numel()
-         for name, shape in shapes.items():
-
-             unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
-             total_numel += unpartitioned_numel
-             total_params += 1
-
-             if debug:
-                 print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
-             state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
-             offset += unpartitioned_numel
-
-         # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
-         # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
-         # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
-         # live optimizer object, so we are checking that the numbers are within the right range
-         align_to = 2 * world_size
-
-         def zero2_align(x):
-             return align_to * math.ceil(x / align_to)
-
-         if debug:
-             print(f"original offset={offset}, avail_numel={avail_numel}")
-
-         offset = zero2_align(offset)
-         avail_numel = zero2_align(avail_numel)
-
-         if debug:
-             print(f"aligned offset={offset}, avail_numel={avail_numel}")
-
-         # Sanity check
-         if offset != avail_numel:
-             raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
-
-     print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
-
-
- def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
-     state_dict = OrderedDict()
-
-     # buffers
-     buffers = zero_model_states[0].buffers
-     state_dict.update(buffers)
-     if debug:
-         print(f"added {len(buffers)} buffers")
-
-     _zero2_merge_frozen_params(state_dict, zero_model_states)
-
-     _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
-
-     # recover shared parameters
-     for pair in zero_model_states[0].shared_params:
-         if pair[1] in state_dict:
-             state_dict[pair[0]] = state_dict[pair[1]]
-
-     return state_dict
-
-
- def zero3_partitioned_param_info(unpartitioned_numel, world_size):
-     remainder = unpartitioned_numel % world_size
-     padding_numel = (world_size - remainder) if remainder else 0
-     partitioned_numel = math.ceil(unpartitioned_numel / world_size)
-     return partitioned_numel, padding_numel
-
-
- def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
-     if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
-         return
-
-     if debug:
-         for i in range(world_size):
-             num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
-             print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
-
-     frozen_param_shapes = zero_model_states[0].frozen_param_shapes
-     wanted_params = len(frozen_param_shapes)
-     wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
-     avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
-     print(f'Frozen params: Have {avail_numel} numels to process.')
-     print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
-
-     total_params = 0
-     total_numel = 0
-     for name, shape in zero_model_states[0].frozen_param_shapes.items():
-         total_params += 1
-         unpartitioned_numel = shape.numel()
-         total_numel += unpartitioned_numel
-
-         param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
-         state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
-
-         partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
-
-         if debug:
-             print(
-                 f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
-             )
-
-     print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
-
-
- def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
-     param_shapes = zero_model_states[0].param_shapes
-     avail_numel = fp32_flat_groups[0].numel() * world_size
-     # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
-     # param, re-consolidating each param, while dealing with padding if any
-
-     # merge list of dicts, preserving order
-     param_shapes = {k: v for d in param_shapes for k, v in d.items()}
-
-     if debug:
-         for i in range(world_size):
-             print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
-
-     wanted_params = len(param_shapes)
-     wanted_numel = sum(shape.numel() for shape in param_shapes.values())
-     # not asserting if there is a mismatch due to possible padding
-     avail_numel = fp32_flat_groups[0].numel() * world_size
-     print(f"Trainable params: Have {avail_numel} numels to process.")
-     print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
-
-     # params
-     # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
-     # out-of-core computing solution
-     offset = 0
-     total_numel = 0
-     total_params = 0
-     for name, shape in param_shapes.items():
-
-         unpartitioned_numel = shape.numel()
-         total_numel += unpartitioned_numel
-         total_params += 1
-
-         partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
-
-         if debug:
-             print(
-                 f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
-             )
-
-         # XXX: memory usage doubles here
-         state_dict[name] = torch.cat(
-             tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
-             0).narrow(0, 0, unpartitioned_numel).view(shape)
-         offset += partitioned_numel
-
-     offset *= world_size
-
-     # Sanity check
-     if offset != avail_numel:
-         raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
-
-     print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
-
-
- def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
-     state_dict = OrderedDict()
-
-     # buffers
-     buffers = zero_model_states[0].buffers
-     state_dict.update(buffers)
-     if debug:
-         print(f"added {len(buffers)} buffers")
-
-     _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
-
-     _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
-
-     # recover shared parameters
-     for pair in zero_model_states[0].shared_params:
-         if pair[1] in state_dict:
-             state_dict[pair[0]] = state_dict[pair[1]]
-
-     return state_dict
-
-
- def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
-     """
-     Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
-     ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
-     via a model hub.
-
-     Args:
-         - ``checkpoint_dir``: path to the desired checkpoint folder
-         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
-
-     Returns:
-         - pytorch ``state_dict``
-
-     Note: this approach may not work if your application doesn't have sufficient free CPU memory and
-     you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
-     the checkpoint.
-
-     A typical usage might be ::
-
-         from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
-         # do the training and checkpoint saving
-         state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
-         model = model.cpu() # move to cpu
-         model.load_state_dict(state_dict)
-         # submit to model hub or save the model to share with others
-
-     In this example the ``model`` will no longer be usable in the deepspeed context of the same
-     application. i.e. you will need to re-initialize the deepspeed engine, since
-     ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
-
-     If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
-
-     """
-     if tag is None:
-         latest_path = os.path.join(checkpoint_dir, 'latest')
-         if os.path.isfile(latest_path):
-             with open(latest_path, 'r') as fd:
-                 tag = fd.read().strip()
-         else:
-             raise ValueError(f"Unable to find 'latest' file at {latest_path}")
-
-     ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
-
-     if not os.path.isdir(ds_checkpoint_dir):
-         raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
-
-     return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
-
-
- def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
-     """
-     Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
-     loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
-
-     Args:
-         - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
-         - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
-         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
-     """
-
-     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
-     print(f"Saving fp32 state dict to {output_file}")
-     torch.save(state_dict, output_file)
-
-
- def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
-     """
-     1. Put the provided model to cpu
-     2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
-     3. Load it into the provided model
-
-     Args:
-         - ``model``: the model object to update
-         - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
-         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
-
-     Returns:
-         - ``model`: modified model
-
-     Make sure you have plenty of CPU memory available before you call this function. If you don't
-     have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
-     conveniently placed for you in the checkpoint folder.
-
-     A typical usage might be ::
-
-         from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
-         model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
-         # submit to model hub or save the model to share with others
-
-     Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
-     of the same application. i.e. you will need to re-initialize the deepspeed engine, since
-     ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
-
-     """
-     logger.info(f"Extracting fp32 weights")
-     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
-
-     logger.info(f"Overwriting model with fp32 weights")
-     model = model.cpu()
-     model.load_state_dict(state_dict, strict=False)
-
-     return model
-
-
- if __name__ == "__main__":
-
-     parser = argparse.ArgumentParser()
-     parser.add_argument("checkpoint_dir",
-                         type=str,
-                         help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
-     parser.add_argument(
-         "output_file",
-         type=str,
-         help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
-     parser.add_argument("-t",
-                         "--tag",
-                         type=str,
-                         default=None,
-                         help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
-     parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
-     args = parser.parse_args()
-
-     debug = args.debug
-
-     convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)
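
The deleted script is DeepSpeed's stock consolidation helper: it resolves the tag from the `latest` file, loads every per-rank `*_optim_states.pt` partition, and stitches the fp32 master weights back into a single state dict (ZeRO-1/2 by concatenating whole param-group partitions, ZeRO-3 by narrowing each parameter's slice out of every rank's flat tensor). A usage sketch against one of the checkpoints deleted in this commit; the output filename is an arbitrary choice, and the call mirrors what the script's own `__main__` does:

```python
# Run from a revision that still contains the checkpoint directory.
from zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

convert_zero_checkpoint_to_fp32_state_dict("checkpoint-279", "pytorch_model.bin")
```

Equivalently, `python zero_to_fp32.py checkpoint-279 pytorch_model.bin` from the repo root, as the script's own header comment documents.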
checkpoint-336/README.md DELETED
@@ -1,202 +0,0 @@
- ---
- library_name: peft
- base_model: google/gemma-2b
- ---
-
- # Model Card for Model ID
-
- <!-- Provide a quick summary of what the model is/does. -->
-
-
-
- ## Model Details
-
- ### Model Description
-
- <!-- Provide a longer summary of what this model is. -->
-
-
-
- - **Developed by:** [More Information Needed]
- - **Funded by [optional]:** [More Information Needed]
- - **Shared by [optional]:** [More Information Needed]
- - **Model type:** [More Information Needed]
- - **Language(s) (NLP):** [More Information Needed]
- - **License:** [More Information Needed]
- - **Finetuned from model [optional]:** [More Information Needed]
-
- ### Model Sources [optional]
-
- <!-- Provide the basic links for the model. -->
-
- - **Repository:** [More Information Needed]
- - **Paper [optional]:** [More Information Needed]
- - **Demo [optional]:** [More Information Needed]
-
- ## Uses
-
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
-
- ### Direct Use
-
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
-
- [More Information Needed]
-
- ### Downstream Use [optional]
-
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
-
- [More Information Needed]
-
- ### Out-of-Scope Use
-
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
-
- [More Information Needed]
-
- ## Bias, Risks, and Limitations
-
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
-
- [More Information Needed]
-
- ### Recommendations
-
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
-
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
-
- ## How to Get Started with the Model
-
- Use the code below to get started with the model.
-
- [More Information Needed]
-
- ## Training Details
-
- ### Training Data
-
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
-
- [More Information Needed]
-
- ### Training Procedure
-
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
-
- #### Preprocessing [optional]
-
- [More Information Needed]
-
-
- #### Training Hyperparameters
-
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
-
- #### Speeds, Sizes, Times [optional]
-
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
-
- [More Information Needed]
-
- ## Evaluation
-
- <!-- This section describes the evaluation protocols and provides the results. -->
-
- ### Testing Data, Factors & Metrics
-
- #### Testing Data
-
- <!-- This should link to a Dataset Card if possible. -->
-
- [More Information Needed]
-
- #### Factors
-
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
-
- [More Information Needed]
-
- #### Metrics
-
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
-
- [More Information Needed]
-
- ### Results
-
- [More Information Needed]
-
- #### Summary
-
-
-
- ## Model Examination [optional]
-
- <!-- Relevant interpretability work for the model goes here -->
-
- [More Information Needed]
-
- ## Environmental Impact
-
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
-
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
-
- - **Hardware Type:** [More Information Needed]
- - **Hours used:** [More Information Needed]
- - **Cloud Provider:** [More Information Needed]
- - **Compute Region:** [More Information Needed]
- - **Carbon Emitted:** [More Information Needed]
-
- ## Technical Specifications [optional]
-
- ### Model Architecture and Objective
-
- [More Information Needed]
-
- ### Compute Infrastructure
-
- [More Information Needed]
-
- #### Hardware
-
- [More Information Needed]
-
- #### Software
-
- [More Information Needed]
-
- ## Citation [optional]
-
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
-
- **BibTeX:**
-
- [More Information Needed]
-
- **APA:**
-
- [More Information Needed]
-
- ## Glossary [optional]
-
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
-
- [More Information Needed]
-
- ## More Information [optional]
-
- [More Information Needed]
-
- ## Model Card Authors [optional]
-
- [More Information Needed]
-
- ## Model Card Contact
-
- [More Information Needed]
- ### Framework versions
-
- - PEFT 0.9.0
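
The deleted card is the unmodified PEFT template, with every field left at "[More Information Needed]"; the only concrete facts it records are the base model (`google/gemma-2b`) and the framework version (PEFT 0.9.0). A hedged sketch of what its empty "How to Get Started" section would have contained, assuming the adapter directory is intact and PEFT >= 0.9 is installed:

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained("google/gemma-2b")
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
# Attaches the LoRA adapter described by the adapter_config.json below.
model = PeftModel.from_pretrained(base, "checkpoint-336")
```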
checkpoint-336/adapter_config.json DELETED
@@ -1,33 +0,0 @@
- {
- "alpha_pattern": {},
- "auto_mapping": null,
- "base_model_name_or_path": "google/gemma-2b",
- "bias": "none",
- "fan_in_fan_out": null,
- "inference_mode": true,
- "init_lora_weights": true,
- "layers_pattern": null,
- "layers_to_transform": null,
- "loftq_config": {},
- "lora_alpha": 16,
- "lora_dropout": 0.05,
- "megatron_config": null,
- "megatron_core": "megatron.core",
- "modules_to_save": null,
- "peft_type": "LORA",
- "r": 32,
- "rank_pattern": {},
- "revision": null,
- "target_modules": [
- "up_proj",
- "q_proj",
- "v_proj",
- "down_proj",
- "gate_proj",
- "k_proj",
- "o_proj"
- ],
- "task_type": "CAUSAL_LM",
- "use_dora": false,
- "use_rslora": false
- }
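
This deleted config maps one-to-one onto a PEFT `LoraConfig`: rank-32 LoRA with alpha 16 and dropout 0.05, applied to all seven attention and MLP projection matrices of the Gemma base model. A sketch of the equivalent construction (argument names follow PEFT 0.9.0, the version recorded in the README above):

```python
from peft import LoraConfig

config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["up_proj", "q_proj", "v_proj", "down_proj",
                    "gate_proj", "k_proj", "o_proj"],
)
```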
checkpoint-336/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:5ce5c9479f7b4e2f4f1c71ed29d0ec95f79e1731de4be9d3f7759abe3043fcdc
- size 78480320
checkpoint-336/global_step336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:5f204930ec4f2a105b656f8596b32abc5228db4def6b1aa8c6f63fe8c492820e
- size 58886928
checkpoint-336/global_step336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:bd427c55f17c0510ec2ed53fe5e319eb0a2c4761d4083df28d11ba7aa84e5a15
- size 58885968
checkpoint-336/global_step336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:6271e5b9edc1d160ad0326ac1a89d8d44ef09363904f40271525aff81aa3b01d
- size 58886992
checkpoint-336/global_step336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:5abf7c61e69335a8e881c7220e7017eb5372fdf817a3b0d26486e4faab795701
- size 58886032
checkpoint-336/global_step336/mp_rank_00_model_states.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:8dbda4b13cb1e71570782ac3ce184727dbacb34070d7b08deeb937890375555c
- size 1159049922
checkpoint-336/latest DELETED
@@ -1 +0,0 @@
- global_step336
checkpoint-336/rng_state_0.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:28b9cac536dcc2f0fcb0db1a7ed44d898a5e257f0e6a2dde4782893acb56ce7d
- size 15024
checkpoint-336/rng_state_1.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:d3ee31ce56c4f2248ab7aaf5beaf8d895447d28644df750b83cc2177262498de
- size 15024
checkpoint-336/rng_state_2.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a615e7b3e06287a0e82a15b753b1c48c658347992fbb7d59ee5836d824655ebd
- size 15024
checkpoint-336/rng_state_3.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:fe5c5388f4cf688aa51717160bed97071e825a07ba7d9a22897241c258de91d9
- size 15024
checkpoint-336/scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:033700c231840b794630147afe6dca04265ec61bb681c241b2e3012bcb9cc8a3
- size 1064
checkpoint-336/trainer_state.json DELETED
@@ -1,2477 +0,0 @@
- {
- "best_metric": 1.203959345817566,
- "best_model_checkpoint": "./out/checkpoint-112",
- "epoch": 2.991097922848665,
- "eval_steps": 28,
- "global_step": 336,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "epoch": 0.01,
- "grad_norm": 4.313233023002325,
- "learning_rate": 1.8181818181818182e-05,
- "loss": 1.9528,
- "step": 1
- },
- {
- "epoch": 0.01,
- "eval_loss": 2.1875686645507812,
- "eval_runtime": 12.8608,
- "eval_samples_per_second": 23.327,
- "eval_steps_per_second": 2.955,
- "step": 1
- },
- {
- "epoch": 0.02,
- "grad_norm": 4.039172290955229,
- "learning_rate": 3.6363636363636364e-05,
- "loss": 1.8358,
- "step": 2
- },
- {
- "epoch": 0.03,
- "grad_norm": 4.504705512003857,
- "learning_rate": 5.4545454545454546e-05,
- "loss": 2.0207,
- "step": 3
- },
- {
- "epoch": 0.04,
- "grad_norm": 4.591862504847867,
- "learning_rate": 7.272727272727273e-05,
- "loss": 1.979,
- "step": 4
- },
- {
- "epoch": 0.04,
- "grad_norm": 3.812893581399005,
- "learning_rate": 9.090909090909092e-05,
- "loss": 1.8356,
- "step": 5
- },
- {
- "epoch": 0.05,
- "grad_norm": 0.42886752872747064,
- "learning_rate": 0.00010909090909090909,
- "loss": 1.6722,
- "step": 6
- },
- {
- "epoch": 0.06,
- "grad_norm": 0.22497294481851865,
- "learning_rate": 0.00012727272727272728,
- "loss": 1.6711,
- "step": 7
- },
- {
- "epoch": 0.07,
- "grad_norm": 0.20955259847301927,
- "learning_rate": 0.00014545454545454546,
- "loss": 1.8546,
- "step": 8
- },
- {
- "epoch": 0.08,
- "grad_norm": 0.2200095325539683,
- "learning_rate": 0.00016363636363636366,
- "loss": 1.7538,
- "step": 9
- },
- {
- "epoch": 0.09,
- "grad_norm": 0.19187339879899318,
- "learning_rate": 0.00018181818181818183,
- "loss": 1.6137,
- "step": 10
- },
- {
- "epoch": 0.1,
- "grad_norm": 0.2113395673717837,
- "learning_rate": 0.0002,
- "loss": 1.5225,
- "step": 11
- },
- {
- "epoch": 0.11,
- "grad_norm": 0.17673768408382828,
- "learning_rate": 0.00019999741592564903,
- "loss": 1.5303,
- "step": 12
- },
- {
- "epoch": 0.12,
- "grad_norm": 0.24120852820548402,
- "learning_rate": 0.00019998966383614488,
- "loss": 1.5089,
- "step": 13
- },
- {
- "epoch": 0.12,
- "grad_norm": 0.3089489160535682,
- "learning_rate": 0.00019997674413212708,
- "loss": 1.4525,
- "step": 14
- },
- {
- "epoch": 0.13,
- "grad_norm": 0.2656143410731927,
- "learning_rate": 0.00019995865748130516,
- "loss": 1.4648,
- "step": 15
- },
- {
- "epoch": 0.14,
- "grad_norm": 3.769410316227205,
- "learning_rate": 0.0001999354048184241,
- "loss": 1.3439,
- "step": 16
- },
- {
- "epoch": 0.15,
- "grad_norm": 0.32102180658823753,
- "learning_rate": 0.00019990698734521613,
- "loss": 1.4644,
- "step": 17
- },
- {
- "epoch": 0.16,
- "grad_norm": 0.22094428128919438,
- "learning_rate": 0.0001998734065303385,
- "loss": 1.1927,
- "step": 18
- },
- {
- "epoch": 0.17,
- "grad_norm": 0.22344487218098863,
- "learning_rate": 0.00019983466410929764,
- "loss": 1.2916,
- "step": 19
- },
- {
- "epoch": 0.18,
- "grad_norm": 0.25036262498479456,
- "learning_rate": 0.0001997907620843595,
- "loss": 1.2982,
- "step": 20
- },
- {
- "epoch": 0.19,
- "grad_norm": 0.22671119151539426,
- "learning_rate": 0.00019974170272444604,
- "loss": 1.2146,
- "step": 21
- },
- {
- "epoch": 0.2,
- "grad_norm": 0.259249080403425,
- "learning_rate": 0.00019968748856501788,
- "loss": 1.2072,
- "step": 22
- },
- {
- "epoch": 0.2,
- "grad_norm": 0.23538477651406017,
- "learning_rate": 0.00019962812240794343,
- "loss": 1.3281,
- "step": 23
- },
- {
- "epoch": 0.21,
- "grad_norm": 0.2659115087625978,
- "learning_rate": 0.000199563607321354,
- "loss": 1.1396,
- "step": 24
- },
- {
- "epoch": 0.22,
- "grad_norm": 0.23617264858854836,
- "learning_rate": 0.0001994939466394851,
- "loss": 1.1389,
- "step": 25
- },
- {
- "epoch": 0.23,
- "grad_norm": 0.20514227454180176,
- "learning_rate": 0.00019941914396250446,
- "loss": 1.249,
- "step": 26
- },
- {
- "epoch": 0.24,
- "grad_norm": 0.19660894225830144,
- "learning_rate": 0.00019933920315632557,
- "loss": 1.1776,
- "step": 27
- },
- {
- "epoch": 0.25,
- "grad_norm": 0.2067663909729571,
- "learning_rate": 0.00019925412835240826,
- "loss": 1.1327,
- "step": 28
- },
- {
- "epoch": 0.25,
- "eval_loss": 1.2991960048675537,
- "eval_runtime": 13.153,
- "eval_samples_per_second": 22.808,
- "eval_steps_per_second": 2.889,
- "step": 28
- },
- {
- "epoch": 0.26,
- "grad_norm": 0.1816588361901526,
- "learning_rate": 0.0001991639239475448,
- "loss": 1.1247,
- "step": 29
- },
- {
- "epoch": 0.27,
- "grad_norm": 0.19626955153633807,
- "learning_rate": 0.00019906859460363307,
- "loss": 1.1212,
- "step": 30
- },
- {
- "epoch": 0.28,
- "grad_norm": 0.21084275590405852,
- "learning_rate": 0.00019896814524743528,
- "loss": 0.9927,
- "step": 31
- },
- {
- "epoch": 0.28,
- "grad_norm": 0.16560054949456768,
- "learning_rate": 0.0001988625810703235,
- "loss": 1.1249,
- "step": 32
251
- {
252
- "epoch": 0.29,
253
- "grad_norm": 0.14950879528294536,
254
- "learning_rate": 0.0001987519075280114,
255
- "loss": 1.1401,
256
- "step": 33
257
- },
258
- {
259
- "epoch": 0.3,
260
- "grad_norm": 0.1777966882651237,
261
- "learning_rate": 0.00019863613034027224,
262
- "loss": 1.0769,
263
- "step": 34
264
- },
265
- {
266
- "epoch": 0.31,
267
- "grad_norm": 0.1480537272052743,
268
- "learning_rate": 0.00019851525549064323,
269
- "loss": 1.0686,
270
- "step": 35
271
- },
272
- {
273
- "epoch": 0.32,
274
- "grad_norm": 0.16911906750319078,
275
- "learning_rate": 0.00019838928922611632,
276
- "loss": 1.0253,
277
- "step": 36
278
- },
279
- {
280
- "epoch": 0.33,
281
- "grad_norm": 0.15987682972555176,
282
- "learning_rate": 0.00019825823805681543,
283
- "loss": 1.0609,
284
- "step": 37
285
- },
286
- {
287
- "epoch": 0.34,
288
- "grad_norm": 0.15757332939676763,
289
- "learning_rate": 0.0001981221087556598,
290
- "loss": 1.1086,
291
- "step": 38
292
- },
293
- {
294
- "epoch": 0.35,
295
- "grad_norm": 0.13201845744757537,
296
- "learning_rate": 0.00019798090835801418,
297
- "loss": 1.073,
298
- "step": 39
299
- },
300
- {
301
- "epoch": 0.36,
302
- "grad_norm": 0.12544508015984754,
303
- "learning_rate": 0.00019783464416132506,
304
- "loss": 1.0633,
305
- "step": 40
306
- },
307
- {
308
- "epoch": 0.36,
309
- "grad_norm": 0.14645820383886451,
310
- "learning_rate": 0.00019768332372474366,
311
- "loss": 1.0653,
312
- "step": 41
313
- },
314
- {
315
- "epoch": 0.37,
316
- "grad_norm": 0.14814101902137117,
317
- "learning_rate": 0.00019752695486873517,
318
- "loss": 1.0937,
319
- "step": 42
320
- },
321
- {
322
- "epoch": 0.38,
323
- "grad_norm": 0.13888915595055443,
324
- "learning_rate": 0.00019736554567467452,
325
- "loss": 1.0462,
326
- "step": 43
327
- },
328
- {
329
- "epoch": 0.39,
330
- "grad_norm": 0.13185349806639524,
331
- "learning_rate": 0.00019719910448442893,
332
- "loss": 1.2177,
333
- "step": 44
334
- },
335
- {
336
- "epoch": 0.4,
337
- "grad_norm": 0.15271046712350847,
338
- "learning_rate": 0.00019702763989992662,
339
- "loss": 1.0237,
340
- "step": 45
341
- },
342
- {
343
- "epoch": 0.41,
344
- "grad_norm": 0.17053588557430902,
345
- "learning_rate": 0.00019685116078271223,
346
- "loss": 1.0038,
347
- "step": 46
348
- },
349
- {
350
- "epoch": 0.42,
351
- "grad_norm": 0.15641087356577812,
352
- "learning_rate": 0.00019666967625348906,
353
- "loss": 1.0886,
354
- "step": 47
355
- },
356
- {
357
- "epoch": 0.43,
358
- "grad_norm": 0.1544028594191567,
359
- "learning_rate": 0.00019648319569164736,
360
- "loss": 1.1378,
361
- "step": 48
362
- },
363
- {
364
- "epoch": 0.44,
365
- "grad_norm": 0.14794885994140625,
366
- "learning_rate": 0.00019629172873477995,
367
- "loss": 1.1495,
368
- "step": 49
369
- },
370
- {
371
- "epoch": 0.45,
372
- "grad_norm": 0.1577684884028266,
373
- "learning_rate": 0.0001960952852781838,
374
- "loss": 1.0782,
375
- "step": 50
376
- },
377
- {
378
- "epoch": 0.45,
379
- "grad_norm": 0.15961044045091288,
380
- "learning_rate": 0.0001958938754743489,
381
- "loss": 1.0107,
382
- "step": 51
383
- },
384
- {
385
- "epoch": 0.46,
386
- "grad_norm": 0.14486696586022083,
387
- "learning_rate": 0.0001956875097324334,
388
- "loss": 1.0494,
389
- "step": 52
390
- },
391
- {
392
- "epoch": 0.47,
393
- "grad_norm": 0.14250413725518896,
394
- "learning_rate": 0.00019547619871772574,
395
- "loss": 1.039,
396
- "step": 53
397
- },
398
- {
399
- "epoch": 0.48,
400
- "grad_norm": 0.1196720279125328,
401
- "learning_rate": 0.00019525995335109334,
402
- "loss": 1.0966,
403
- "step": 54
404
- },
405
- {
406
- "epoch": 0.49,
407
- "grad_norm": 0.14984795891635327,
408
- "learning_rate": 0.0001950387848084183,
409
- "loss": 1.0874,
410
- "step": 55
411
- },
412
- {
413
- "epoch": 0.5,
414
- "grad_norm": 0.14891088442480416,
415
- "learning_rate": 0.00019481270452001987,
416
- "loss": 1.097,
417
- "step": 56
418
- },
419
- {
420
- "epoch": 0.5,
421
- "eval_loss": 1.2264304161071777,
422
- "eval_runtime": 13.2279,
423
- "eval_samples_per_second": 22.679,
424
- "eval_steps_per_second": 2.873,
425
- "step": 56
426
- },
427
- {
428
- "epoch": 0.51,
429
- "grad_norm": 0.17814266552244534,
430
- "learning_rate": 0.00019458172417006347,
431
- "loss": 1.1372,
432
- "step": 57
433
- },
434
- {
435
- "epoch": 0.52,
436
- "grad_norm": 0.16125636132578247,
437
- "learning_rate": 0.00019434585569595708,
438
- "loss": 1.0623,
439
- "step": 58
440
- },
441
- {
442
- "epoch": 0.53,
443
- "grad_norm": 0.15203437202125702,
444
- "learning_rate": 0.00019410511128773418,
445
- "loss": 1.0399,
446
- "step": 59
447
- },
448
- {
449
- "epoch": 0.53,
450
- "grad_norm": 0.1677461135605213,
451
- "learning_rate": 0.0001938595033874238,
452
- "loss": 1.0884,
453
- "step": 60
454
- },
455
- {
456
- "epoch": 0.54,
457
- "grad_norm": 0.13564559875683407,
458
- "learning_rate": 0.0001936090446884074,
459
- "loss": 1.0176,
460
- "step": 61
461
- },
462
- {
463
- "epoch": 0.55,
464
- "grad_norm": 0.1521886500642157,
465
- "learning_rate": 0.00019335374813476302,
466
- "loss": 1.0146,
467
- "step": 62
468
- },
469
- {
470
- "epoch": 0.56,
471
- "grad_norm": 0.1410132122625916,
472
- "learning_rate": 0.00019309362692059617,
473
- "loss": 1.044,
474
- "step": 63
475
- },
476
- {
477
- "epoch": 0.57,
478
- "grad_norm": 0.15237848179385577,
479
- "learning_rate": 0.00019282869448935798,
480
- "loss": 1.0354,
481
- "step": 64
482
- },
483
- {
484
- "epoch": 0.58,
485
- "grad_norm": 0.13871660988504514,
486
- "learning_rate": 0.00019255896453315052,
487
- "loss": 1.0189,
488
- "step": 65
489
- },
490
- {
491
- "epoch": 0.59,
492
- "grad_norm": 0.14863047478901453,
493
- "learning_rate": 0.000192284450992019,
494
- "loss": 1.0704,
495
- "step": 66
496
- },
497
- {
498
- "epoch": 0.6,
499
- "grad_norm": 0.13794806124403974,
500
- "learning_rate": 0.0001920051680532314,
501
- "loss": 1.0996,
502
- "step": 67
503
- },
504
- {
505
- "epoch": 0.61,
506
- "grad_norm": 0.13030507705779365,
507
- "learning_rate": 0.00019172113015054532,
508
- "loss": 1.0015,
509
- "step": 68
510
- },
511
- {
512
- "epoch": 0.61,
513
- "grad_norm": 0.15092494718902358,
514
- "learning_rate": 0.0001914323519634619,
515
- "loss": 1.0822,
516
- "step": 69
517
- },
518
- {
519
- "epoch": 0.62,
520
- "grad_norm": 0.1350212989006066,
521
- "learning_rate": 0.00019113884841646736,
522
- "loss": 1.0197,
523
- "step": 70
524
- },
525
- {
526
- "epoch": 0.63,
527
- "grad_norm": 0.18991168066586347,
528
- "learning_rate": 0.00019084063467826137,
529
- "loss": 1.046,
530
- "step": 71
531
- },
532
- {
533
- "epoch": 0.64,
534
- "grad_norm": 0.14884381774710187,
535
- "learning_rate": 0.00019053772616097337,
536
- "loss": 1.0346,
537
- "step": 72
538
- },
539
- {
540
- "epoch": 0.65,
541
- "grad_norm": 0.15579311209945296,
542
- "learning_rate": 0.000190230138519366,
543
- "loss": 1.0505,
544
- "step": 73
545
- },
546
- {
547
- "epoch": 0.66,
548
- "grad_norm": 0.16015337150592127,
549
- "learning_rate": 0.000189917887650026,
550
- "loss": 1.0504,
551
- "step": 74
552
- },
553
- {
554
- "epoch": 0.67,
555
- "grad_norm": 0.1443969321518926,
556
- "learning_rate": 0.00018960098969054255,
557
- "loss": 1.0755,
558
- "step": 75
559
- },
560
- {
561
- "epoch": 0.68,
562
- "grad_norm": 0.15722162227095848,
563
- "learning_rate": 0.00018927946101867347,
564
- "loss": 1.0541,
565
- "step": 76
566
- },
567
- {
568
- "epoch": 0.69,
569
- "grad_norm": 0.17009697584926559,
570
- "learning_rate": 0.0001889533182514986,
571
- "loss": 1.0231,
572
- "step": 77
573
- },
574
- {
575
- "epoch": 0.69,
576
- "grad_norm": 0.1256822726781221,
577
- "learning_rate": 0.0001886225782445612,
578
- "loss": 0.8814,
579
- "step": 78
580
- },
581
- {
582
- "epoch": 0.7,
583
- "grad_norm": 0.14019958069756655,
584
- "learning_rate": 0.00018828725809099655,
585
- "loss": 1.0277,
586
- "step": 79
587
- },
588
- {
589
- "epoch": 0.71,
590
- "grad_norm": 0.17159459150063183,
591
- "learning_rate": 0.0001879473751206489,
592
- "loss": 0.9495,
593
- "step": 80
594
- },
595
- {
596
- "epoch": 0.72,
597
- "grad_norm": 0.146430011834186,
598
- "learning_rate": 0.00018760294689917553,
599
- "loss": 1.0598,
600
- "step": 81
601
- },
602
- {
603
- "epoch": 0.73,
604
- "grad_norm": 0.16834256802992476,
605
- "learning_rate": 0.00018725399122713912,
606
- "loss": 1.0237,
607
- "step": 82
608
- },
609
- {
610
- "epoch": 0.74,
611
- "grad_norm": 0.15663699267164208,
612
- "learning_rate": 0.00018690052613908772,
613
- "loss": 0.939,
614
- "step": 83
615
- },
616
- {
617
- "epoch": 0.75,
618
- "grad_norm": 0.15655985150409854,
619
- "learning_rate": 0.0001865425699026226,
620
- "loss": 1.0302,
621
- "step": 84
622
- },
623
- {
624
- "epoch": 0.75,
625
- "eval_loss": 1.2143030166625977,
626
- "eval_runtime": 13.2387,
627
- "eval_samples_per_second": 22.661,
628
- "eval_steps_per_second": 2.87,
629
- "step": 84
630
- },
631
- {
632
- "epoch": 0.76,
633
- "grad_norm": 0.15273470110260864,
634
- "learning_rate": 0.00018618014101745442,
635
- "loss": 1.0127,
636
- "step": 85
637
- },
638
- {
639
- "epoch": 0.77,
640
- "grad_norm": 0.1723243680259614,
641
- "learning_rate": 0.0001858132582144469,
642
- "loss": 0.9306,
643
- "step": 86
644
- },
645
- {
646
- "epoch": 0.77,
647
- "grad_norm": 0.14747098547446996,
648
- "learning_rate": 0.00018544194045464886,
649
- "loss": 1.0073,
650
- "step": 87
651
- },
652
- {
653
- "epoch": 0.78,
654
- "grad_norm": 0.17208333285514918,
655
- "learning_rate": 0.00018506620692831428,
656
- "loss": 1.0328,
657
- "step": 88
658
- },
659
- {
660
- "epoch": 0.79,
661
- "grad_norm": 0.14918051024971962,
662
- "learning_rate": 0.0001846860770539105,
663
- "loss": 1.0022,
664
- "step": 89
665
- },
666
- {
667
- "epoch": 0.8,
668
- "grad_norm": 0.156315164090714,
669
- "learning_rate": 0.00018430157047711474,
670
- "loss": 1.0293,
671
- "step": 90
672
- },
673
- {
674
- "epoch": 0.81,
675
- "grad_norm": 0.2013424548288477,
676
- "learning_rate": 0.00018391270706979862,
677
- "loss": 0.9395,
678
- "step": 91
679
- },
680
- {
681
- "epoch": 0.82,
682
- "grad_norm": 0.17909726353002614,
683
- "learning_rate": 0.00018351950692900126,
684
- "loss": 0.9756,
685
- "step": 92
686
- },
687
- {
688
- "epoch": 0.83,
689
- "grad_norm": 0.16939245158726288,
690
- "learning_rate": 0.00018312199037589068,
691
- "loss": 0.9576,
692
- "step": 93
693
- },
694
- {
695
- "epoch": 0.84,
696
- "grad_norm": 0.14685720680893694,
697
- "learning_rate": 0.00018272017795471345,
698
- "loss": 1.0045,
699
- "step": 94
700
- },
701
- {
702
- "epoch": 0.85,
703
- "grad_norm": 0.17464839085505987,
704
- "learning_rate": 0.000182314090431733,
705
- "loss": 0.9862,
706
- "step": 95
707
- },
708
- {
709
- "epoch": 0.85,
710
- "grad_norm": 0.16060904136932572,
711
- "learning_rate": 0.00018190374879415632,
712
- "loss": 1.0022,
713
- "step": 96
714
- },
715
- {
716
- "epoch": 0.86,
717
- "grad_norm": 0.18715193350083867,
718
- "learning_rate": 0.00018148917424904953,
719
- "loss": 1.042,
720
- "step": 97
721
- },
722
- {
723
- "epoch": 0.87,
724
- "grad_norm": 0.1675573400576595,
725
- "learning_rate": 0.0001810703882222415,
726
- "loss": 1.0047,
727
- "step": 98
728
- },
729
- {
730
- "epoch": 0.88,
731
- "grad_norm": 0.1871466286989249,
732
- "learning_rate": 0.00018064741235721687,
733
- "loss": 0.9834,
734
- "step": 99
735
- },
736
- {
737
- "epoch": 0.89,
738
- "grad_norm": 0.17453934867565302,
739
- "learning_rate": 0.00018022026851399737,
740
- "loss": 0.9649,
741
- "step": 100
742
- },
743
- {
744
- "epoch": 0.9,
745
- "grad_norm": 0.15960631507184767,
746
- "learning_rate": 0.0001797889787680119,
747
- "loss": 0.9673,
748
- "step": 101
749
- },
750
- {
751
- "epoch": 0.91,
752
- "grad_norm": 0.17844936635366368,
753
- "learning_rate": 0.00017935356540895597,
754
- "loss": 1.0951,
755
- "step": 102
756
- },
757
- {
758
- "epoch": 0.92,
759
- "grad_norm": 0.16733018789000254,
760
- "learning_rate": 0.00017891405093963938,
761
- "loss": 0.9954,
762
- "step": 103
763
- },
764
- {
765
- "epoch": 0.93,
766
- "grad_norm": 0.17305556075296993,
767
- "learning_rate": 0.00017847045807482345,
768
- "loss": 0.892,
769
- "step": 104
770
- },
771
- {
772
- "epoch": 0.93,
773
- "grad_norm": 0.17197614099805034,
774
- "learning_rate": 0.00017802280974004716,
775
- "loss": 1.0494,
776
- "step": 105
777
- },
778
- {
779
- "epoch": 0.94,
780
- "grad_norm": 0.18063836817127235,
781
- "learning_rate": 0.000177571129070442,
782
- "loss": 1.0264,
783
- "step": 106
784
- },
785
- {
786
- "epoch": 0.95,
787
- "grad_norm": 0.14597707005699143,
788
- "learning_rate": 0.00017711543940953668,
789
- "loss": 0.9532,
790
- "step": 107
791
- },
792
- {
793
- "epoch": 0.96,
794
- "grad_norm": 0.1422048149465345,
795
- "learning_rate": 0.00017665576430805053,
796
- "loss": 0.97,
797
- "step": 108
798
- },
799
- {
800
- "epoch": 0.97,
801
- "grad_norm": 0.18313914688655572,
802
- "learning_rate": 0.0001761921275226763,
803
- "loss": 0.9282,
804
- "step": 109
805
- },
806
- {
807
- "epoch": 0.98,
808
- "grad_norm": 0.200679751171441,
809
- "learning_rate": 0.00017572455301485249,
810
- "loss": 1.0,
811
- "step": 110
812
- },
813
- {
814
- "epoch": 0.99,
815
- "grad_norm": 0.17700985594898055,
816
- "learning_rate": 0.00017525306494952498,
817
- "loss": 1.0165,
818
- "step": 111
819
- },
820
- {
821
- "epoch": 1.0,
822
- "grad_norm": 0.19925777202726191,
823
- "learning_rate": 0.0001747776876938981,
824
- "loss": 1.0346,
825
- "step": 112
826
- },
827
- {
828
- "epoch": 1.0,
829
- "eval_loss": 1.203959345817566,
830
- "eval_runtime": 13.2547,
831
- "eval_samples_per_second": 22.634,
832
- "eval_steps_per_second": 2.867,
833
- "step": 112
834
- },
835
- {
836
- "epoch": 1.01,
837
- "grad_norm": 0.1606469603473709,
838
- "learning_rate": 0.00017429844581617532,
839
- "loss": 0.9832,
840
- "step": 113
841
- },
842
- {
843
- "epoch": 1.01,
844
- "grad_norm": 0.16403912763780054,
845
- "learning_rate": 0.00017381536408428948,
846
- "loss": 0.9346,
847
- "step": 114
848
- },
849
- {
850
- "epoch": 1.02,
851
- "grad_norm": 0.1936046893744468,
852
- "learning_rate": 0.00017332846746462288,
853
- "loss": 0.9382,
854
- "step": 115
855
- },
856
- {
857
- "epoch": 1.03,
858
- "grad_norm": 0.14250769247239573,
859
- "learning_rate": 0.0001728377811207168,
860
- "loss": 0.8914,
861
- "step": 116
862
- },
863
- {
864
- "epoch": 1.04,
865
- "grad_norm": 0.17889563599797687,
866
- "learning_rate": 0.00017234333041197126,
867
- "loss": 0.9736,
868
- "step": 117
869
- },
870
- {
871
- "epoch": 1.05,
872
- "grad_norm": 0.20288960866045594,
873
- "learning_rate": 0.00017184514089233405,
874
- "loss": 0.8477,
875
- "step": 118
876
- },
877
- {
878
- "epoch": 1.06,
879
- "grad_norm": 0.20926349930533472,
880
- "learning_rate": 0.00017134323830898037,
881
- "loss": 0.9933,
882
- "step": 119
883
- },
884
- {
885
- "epoch": 1.07,
886
- "grad_norm": 0.21316934416499642,
887
- "learning_rate": 0.00017083764860098205,
888
- "loss": 0.9168,
889
- "step": 120
890
- },
891
- {
892
- "epoch": 1.08,
893
- "grad_norm": 0.21654320387312692,
894
- "learning_rate": 0.0001703283978979671,
895
- "loss": 0.9584,
896
- "step": 121
897
- },
898
- {
899
- "epoch": 1.09,
900
- "grad_norm": 0.23789742308175463,
901
- "learning_rate": 0.00016981551251876904,
902
- "loss": 1.0298,
903
- "step": 122
904
- },
905
- {
906
- "epoch": 1.09,
907
- "grad_norm": 0.16433271793469648,
908
- "learning_rate": 0.00016929901897006698,
909
- "loss": 0.8833,
910
- "step": 123
911
- },
912
- {
913
- "epoch": 1.1,
914
- "grad_norm": 0.16908727866207868,
915
- "learning_rate": 0.0001687789439450156,
916
- "loss": 1.0675,
917
- "step": 124
918
- },
919
- {
920
- "epoch": 1.11,
921
- "grad_norm": 0.1670067931363302,
922
- "learning_rate": 0.00016825531432186543,
923
- "loss": 0.9515,
924
- "step": 125
925
- },
926
- {
927
- "epoch": 1.12,
928
- "grad_norm": 0.17777465531550865,
929
- "learning_rate": 0.00016772815716257412,
930
- "loss": 0.8929,
931
- "step": 126
932
- },
933
- {
934
- "epoch": 1.13,
935
- "grad_norm": 0.18442783204919333,
936
- "learning_rate": 0.00016719749971140754,
937
- "loss": 0.8388,
938
- "step": 127
939
- },
940
- {
941
- "epoch": 1.14,
942
- "grad_norm": 0.19073362304284272,
943
- "learning_rate": 0.0001666633693935319,
944
- "loss": 0.9584,
945
- "step": 128
946
- },
947
- {
948
- "epoch": 1.15,
949
- "grad_norm": 0.20189563405135308,
950
- "learning_rate": 0.00016612579381359622,
951
- "loss": 1.0264,
952
- "step": 129
953
- },
954
- {
955
- "epoch": 1.16,
956
- "grad_norm": 0.1694138210313381,
957
- "learning_rate": 0.00016558480075430594,
958
- "loss": 0.9592,
959
- "step": 130
960
- },
961
- {
962
- "epoch": 1.17,
963
- "grad_norm": 0.19195382946787184,
964
- "learning_rate": 0.00016504041817498678,
965
- "loss": 0.974,
966
- "step": 131
967
- },
968
- {
969
- "epoch": 1.18,
970
- "grad_norm": 0.20684215619155688,
971
- "learning_rate": 0.00016449267421013994,
972
- "loss": 0.8499,
973
- "step": 132
974
- },
975
- {
976
- "epoch": 1.18,
977
- "grad_norm": 0.22003490429847744,
978
- "learning_rate": 0.00016394159716798807,
979
- "loss": 0.9659,
980
- "step": 133
981
- },
982
- {
983
- "epoch": 1.19,
984
- "grad_norm": 0.21977918206745437,
985
- "learning_rate": 0.00016338721552901212,
986
- "loss": 0.9213,
987
- "step": 134
988
- },
989
- {
990
- "epoch": 1.2,
991
- "grad_norm": 0.2076993903333204,
992
- "learning_rate": 0.0001628295579444796,
993
- "loss": 0.8119,
994
- "step": 135
995
- },
996
- {
997
- "epoch": 1.21,
998
- "grad_norm": 0.2001771499954729,
999
- "learning_rate": 0.0001622686532349637,
1000
- "loss": 0.9183,
1001
- "step": 136
1002
- },
1003
- {
1004
- "epoch": 1.22,
1005
- "grad_norm": 0.18671550149366203,
1006
- "learning_rate": 0.00016170453038885394,
1007
- "loss": 0.8836,
1008
- "step": 137
1009
- },
1010
- {
1011
- "epoch": 1.23,
1012
- "grad_norm": 0.20867427207572573,
1013
- "learning_rate": 0.0001611372185608578,
1014
- "loss": 0.9964,
1015
- "step": 138
1016
- },
1017
- {
1018
- "epoch": 1.24,
1019
- "grad_norm": 0.20035138443113176,
1020
- "learning_rate": 0.0001605667470704942,
1021
- "loss": 0.9209,
1022
- "step": 139
1023
- },
1024
- {
1025
- "epoch": 1.25,
1026
- "grad_norm": 0.22696612020505577,
1027
- "learning_rate": 0.0001599931454005781,
1028
- "loss": 1.0162,
1029
- "step": 140
1030
- },
1031
- {
1032
- "epoch": 1.25,
1033
- "eval_loss": 1.2188584804534912,
1034
- "eval_runtime": 13.249,
1035
- "eval_samples_per_second": 22.643,
1036
- "eval_steps_per_second": 2.868,
1037
- "step": 140
1038
- },
1039
- {
1040
- "epoch": 1.26,
1041
- "grad_norm": 0.21554353495018647,
1042
- "learning_rate": 0.00015941644319569665,
1043
- "loss": 1.0487,
1044
- "step": 141
1045
- },
1046
- {
1047
- "epoch": 1.26,
1048
- "grad_norm": 0.22894492131909072,
1049
- "learning_rate": 0.00015883667026067745,
1050
- "loss": 0.9352,
1051
- "step": 142
1052
- },
1053
- {
1054
- "epoch": 1.27,
1055
- "grad_norm": 0.19145184577172686,
1056
- "learning_rate": 0.00015825385655904788,
1057
- "loss": 0.8878,
1058
- "step": 143
1059
- },
1060
- {
1061
- "epoch": 1.28,
1062
- "grad_norm": 0.22544664152936575,
1063
- "learning_rate": 0.00015766803221148673,
1064
- "loss": 1.0,
1065
- "step": 144
1066
- },
1067
- {
1068
- "epoch": 1.29,
1069
- "grad_norm": 0.26000661355557114,
1070
- "learning_rate": 0.00015707922749426737,
1071
- "loss": 0.9339,
1072
- "step": 145
1073
- },
1074
- {
1075
- "epoch": 1.3,
1076
- "grad_norm": 0.24433845134512236,
1077
- "learning_rate": 0.00015648747283769317,
1078
- "loss": 0.9474,
1079
- "step": 146
1080
- },
1081
- {
1082
- "epoch": 1.31,
1083
- "grad_norm": 0.21973931169609887,
1084
- "learning_rate": 0.00015589279882452476,
1085
- "loss": 0.9357,
1086
- "step": 147
1087
- },
1088
- {
1089
- "epoch": 1.32,
1090
- "grad_norm": 0.23929008733305812,
1091
- "learning_rate": 0.0001552952361883994,
1092
- "loss": 0.9985,
1093
- "step": 148
1094
- },
1095
- {
1096
- "epoch": 1.33,
1097
- "grad_norm": 0.23431856747573573,
1098
- "learning_rate": 0.00015469481581224272,
1099
- "loss": 0.8913,
1100
- "step": 149
1101
- },
1102
- {
1103
- "epoch": 1.34,
1104
- "grad_norm": 0.2233543327912565,
1105
- "learning_rate": 0.00015409156872667258,
1106
- "loss": 0.9877,
1107
- "step": 150
1108
- },
1109
- {
1110
- "epoch": 1.34,
1111
- "grad_norm": 0.21281207674183256,
1112
- "learning_rate": 0.0001534855261083954,
1113
- "loss": 0.9071,
1114
- "step": 151
1115
- },
1116
- {
1117
- "epoch": 1.35,
1118
- "grad_norm": 0.20314832700152685,
1119
- "learning_rate": 0.00015287671927859494,
1120
- "loss": 0.9373,
1121
- "step": 152
1122
- },
1123
- {
1124
- "epoch": 1.36,
1125
- "grad_norm": 0.19648565819019825,
1126
- "learning_rate": 0.00015226517970131343,
1127
- "loss": 0.9469,
1128
- "step": 153
1129
- },
1130
- {
1131
- "epoch": 1.37,
1132
- "grad_norm": 0.2262428264639853,
1133
- "learning_rate": 0.00015165093898182562,
1134
- "loss": 1.0066,
1135
- "step": 154
1136
- },
1137
- {
1138
- "epoch": 1.38,
1139
- "grad_norm": 0.22253433035020442,
1140
- "learning_rate": 0.00015103402886500525,
1141
- "loss": 0.8875,
1142
- "step": 155
1143
- },
1144
- {
1145
- "epoch": 1.39,
1146
- "grad_norm": 0.181161648904613,
1147
- "learning_rate": 0.00015041448123368455,
1148
- "loss": 0.9004,
1149
- "step": 156
1150
- },
1151
- {
1152
- "epoch": 1.4,
1153
- "grad_norm": 0.20968483802367816,
1154
- "learning_rate": 0.00014979232810700637,
1155
- "loss": 0.9133,
1156
- "step": 157
1157
- },
1158
- {
1159
- "epoch": 1.41,
1160
- "grad_norm": 0.20540509271288435,
1161
- "learning_rate": 0.0001491676016387694,
1162
- "loss": 0.8876,
1163
- "step": 158
1164
- },
1165
- {
1166
- "epoch": 1.42,
1167
- "grad_norm": 0.18762795731312454,
1168
- "learning_rate": 0.00014854033411576659,
1169
- "loss": 0.933,
1170
- "step": 159
1171
- },
1172
- {
1173
- "epoch": 1.42,
1174
- "grad_norm": 0.23223345997338857,
1175
- "learning_rate": 0.00014791055795611624,
1176
- "loss": 0.9182,
1177
- "step": 160
1178
- },
1179
- {
1180
- "epoch": 1.43,
1181
- "grad_norm": 0.21932384461027146,
1182
- "learning_rate": 0.00014727830570758678,
1183
- "loss": 0.9514,
1184
- "step": 161
1185
- },
1186
- {
1187
- "epoch": 1.44,
1188
- "grad_norm": 0.21819663730951108,
1189
- "learning_rate": 0.0001466436100459146,
1190
- "loss": 0.9162,
1191
- "step": 162
1192
- },
1193
- {
1194
- "epoch": 1.45,
1195
- "grad_norm": 0.2325813323476676,
1196
- "learning_rate": 0.00014600650377311522,
1197
- "loss": 0.9308,
1198
- "step": 163
1199
- },
1200
- {
1201
- "epoch": 1.46,
1202
- "grad_norm": 0.2568337182939043,
1203
- "learning_rate": 0.0001453670198157883,
1204
- "loss": 0.9995,
1205
- "step": 164
1206
- },
1207
- {
1208
- "epoch": 1.47,
1209
- "grad_norm": 0.22578454460723413,
1210
- "learning_rate": 0.00014472519122341566,
1211
- "loss": 0.9052,
1212
- "step": 165
1213
- },
1214
- {
1215
- "epoch": 1.48,
1216
- "grad_norm": 0.23564258958796755,
1217
- "learning_rate": 0.00014408105116665336,
1218
- "loss": 0.9714,
1219
- "step": 166
1220
- },
1221
- {
1222
- "epoch": 1.49,
1223
- "grad_norm": 0.24266133562839415,
1224
- "learning_rate": 0.00014343463293561734,
1225
- "loss": 0.9219,
1226
- "step": 167
1227
- },
1228
- {
1229
- "epoch": 1.5,
1230
- "grad_norm": 0.23472454708184465,
1231
- "learning_rate": 0.00014278596993816308,
1232
- "loss": 0.8762,
1233
- "step": 168
1234
- },
1235
- {
1236
- "epoch": 1.5,
1237
- "eval_loss": 1.2197421789169312,
1238
- "eval_runtime": 13.2616,
1239
- "eval_samples_per_second": 22.622,
1240
- "eval_steps_per_second": 2.865,
1241
- "step": 168
1242
- },
1243
- {
1244
- "epoch": 1.5,
1245
- "grad_norm": 0.23623633375452713,
1246
- "learning_rate": 0.00014213509569815884,
1247
- "loss": 0.8809,
1248
- "step": 169
1249
- },
1250
- {
1251
- "epoch": 1.51,
1252
- "grad_norm": 0.25344275204523486,
1253
- "learning_rate": 0.00014148204385375321,
1254
- "loss": 0.7972,
1255
- "step": 170
1256
- },
1257
- {
1258
- "epoch": 1.52,
1259
- "grad_norm": 0.23111396119549557,
1260
- "learning_rate": 0.0001408268481556366,
1261
- "loss": 0.8228,
1262
- "step": 171
1263
- },
1264
- {
1265
- "epoch": 1.53,
1266
- "grad_norm": 0.2510618369255398,
1267
- "learning_rate": 0.00014016954246529696,
1268
- "loss": 0.8849,
1269
- "step": 172
1270
- },
1271
- {
1272
- "epoch": 1.54,
1273
- "grad_norm": 0.2764366116622668,
1274
- "learning_rate": 0.0001395101607532698,
1275
- "loss": 0.8936,
1276
- "step": 173
1277
- },
1278
- {
1279
- "epoch": 1.55,
1280
- "grad_norm": 0.24325811719582827,
1281
- "learning_rate": 0.00013884873709738257,
1282
- "loss": 0.8602,
1283
- "step": 174
1284
- },
1285
- {
1286
- "epoch": 1.56,
1287
- "grad_norm": 0.213781513838486,
1288
- "learning_rate": 0.00013818530568099327,
1289
- "loss": 0.9492,
1290
- "step": 175
1291
- },
1292
- {
1293
- "epoch": 1.57,
1294
- "grad_norm": 0.2397396374239057,
1295
- "learning_rate": 0.00013751990079122412,
1296
- "loss": 1.0499,
1297
- "step": 176
1298
- },
1299
- {
1300
- "epoch": 1.58,
1301
- "grad_norm": 0.21579907170368723,
1302
- "learning_rate": 0.00013685255681718922,
1303
- "loss": 0.9438,
1304
- "step": 177
1305
- },
1306
- {
1307
- "epoch": 1.58,
1308
- "grad_norm": 0.2359312681928786,
1309
- "learning_rate": 0.0001361833082482175,
1310
- "loss": 0.9289,
1311
- "step": 178
1312
- },
1313
- {
1314
- "epoch": 1.59,
1315
- "grad_norm": 0.2618189093396496,
1316
- "learning_rate": 0.0001355121896720703,
1317
- "loss": 0.981,
1318
- "step": 179
1319
- },
1320
- {
1321
- "epoch": 1.6,
1322
- "grad_norm": 0.20876513773174135,
1323
- "learning_rate": 0.00013483923577315348,
1324
- "loss": 0.82,
1325
- "step": 180
1326
- },
1327
- {
1328
- "epoch": 1.61,
1329
- "grad_norm": 0.22162748553995645,
1330
- "learning_rate": 0.00013416448133072526,
1331
- "loss": 1.0131,
1332
- "step": 181
1333
- },
1334
- {
1335
- "epoch": 1.62,
1336
- "grad_norm": 0.20975549982451164,
1337
- "learning_rate": 0.00013348796121709862,
1338
- "loss": 0.8763,
1339
- "step": 182
1340
- },
1341
- {
1342
- "epoch": 1.63,
1343
- "grad_norm": 0.22840397707525473,
1344
- "learning_rate": 0.00013280971039583906,
1345
- "loss": 0.949,
1346
- "step": 183
1347
- },
1348
- {
1349
- "epoch": 1.64,
1350
- "grad_norm": 0.23384636230161737,
1351
- "learning_rate": 0.0001321297639199575,
1352
- "loss": 0.9567,
1353
- "step": 184
1354
- },
1355
- {
1356
- "epoch": 1.65,
1357
- "grad_norm": 0.22905979409902957,
1358
- "learning_rate": 0.000131448156930099,
1359
- "loss": 0.9153,
1360
- "step": 185
1361
- },
1362
- {
1363
- "epoch": 1.66,
1364
- "grad_norm": 0.27620894683694563,
1365
- "learning_rate": 0.0001307649246527263,
1366
- "loss": 0.8246,
1367
- "step": 186
1368
- },
1369
- {
1370
- "epoch": 1.66,
1371
- "grad_norm": 0.23004170633106227,
1372
- "learning_rate": 0.0001300801023982995,
1373
- "loss": 1.0181,
1374
- "step": 187
1375
- },
1376
- {
1377
- "epoch": 1.67,
1378
- "grad_norm": 0.2219849136264378,
1379
- "learning_rate": 0.00012939372555945112,
1380
- "loss": 0.9535,
1381
- "step": 188
1382
- },
1383
- {
1384
- "epoch": 1.68,
1385
- "grad_norm": 0.24458750452490116,
1386
- "learning_rate": 0.0001287058296091567,
1387
- "loss": 0.8968,
1388
- "step": 189
1389
- },
1390
- {
1391
- "epoch": 1.69,
1392
- "grad_norm": 0.2564337740159555,
1393
- "learning_rate": 0.00012801645009890195,
1394
- "loss": 0.7955,
1395
- "step": 190
1396
- },
1397
- {
1398
- "epoch": 1.7,
1399
- "grad_norm": 0.24100850371438767,
1400
- "learning_rate": 0.0001273256226568451,
1401
- "loss": 0.9235,
1402
- "step": 191
1403
- },
1404
- {
1405
- "epoch": 1.71,
1406
- "grad_norm": 0.24757089527873732,
1407
- "learning_rate": 0.00012663338298597563,
1408
- "loss": 1.007,
1409
- "step": 192
1410
- },
1411
- {
1412
- "epoch": 1.72,
1413
- "grad_norm": 0.24701038583742888,
1414
- "learning_rate": 0.00012593976686226904,
1415
- "loss": 0.9885,
1416
- "step": 193
1417
- },
1418
- {
1419
- "epoch": 1.73,
1420
- "grad_norm": 0.26373721125634964,
1421
- "learning_rate": 0.0001252448101328381,
1422
- "loss": 0.8785,
1423
- "step": 194
1424
- },
1425
- {
1426
- "epoch": 1.74,
1427
- "grad_norm": 0.2227761464470136,
1428
- "learning_rate": 0.00012454854871407994,
1429
- "loss": 0.8806,
1430
- "step": 195
1431
- },
1432
- {
1433
- "epoch": 1.74,
1434
- "grad_norm": 0.2283950634350429,
1435
- "learning_rate": 0.00012385101858982005,
1436
- "loss": 0.9053,
1437
- "step": 196
1438
- },
1439
- {
1440
- "epoch": 1.74,
1441
- "eval_loss": 1.2198154926300049,
1442
- "eval_runtime": 13.2208,
1443
- "eval_samples_per_second": 22.692,
1444
- "eval_steps_per_second": 2.874,
1445
- "step": 196
1446
- },
1447
- {
1448
- "epoch": 1.75,
1449
- "grad_norm": 0.23406423788354982,
1450
- "learning_rate": 0.00012315225580945252,
1451
- "loss": 0.9397,
1452
- "step": 197
1453
- },
1454
- {
1455
- "epoch": 1.76,
1456
- "grad_norm": 0.23807045727443327,
1457
- "learning_rate": 0.0001224522964860769,
1458
- "loss": 0.9712,
1459
- "step": 198
1460
- },
1461
- {
1462
- "epoch": 1.77,
1463
- "grad_norm": 0.2463614808838948,
1464
- "learning_rate": 0.00012175117679463187,
1465
- "loss": 0.8558,
1466
- "step": 199
1467
- },
1468
- {
1469
- "epoch": 1.78,
1470
- "grad_norm": 0.24737417059302014,
1471
- "learning_rate": 0.00012104893297002567,
1472
- "loss": 0.9723,
1473
- "step": 200
1474
- },
1475
- {
1476
- "epoch": 1.79,
1477
- "grad_norm": 0.243750688050595,
1478
- "learning_rate": 0.0001203456013052634,
1479
- "loss": 0.964,
1480
- "step": 201
1481
- },
1482
- {
1483
- "epoch": 1.8,
1484
- "grad_norm": 0.24572059557106538,
1485
- "learning_rate": 0.00011964121814957137,
1486
- "loss": 0.9109,
1487
- "step": 202
1488
- },
1489
- {
1490
- "epoch": 1.81,
1491
- "grad_norm": 0.24044117903962453,
1492
- "learning_rate": 0.00011893581990651848,
1493
- "loss": 1.0019,
1494
- "step": 203
1495
- },
1496
- {
1497
- "epoch": 1.82,
1498
- "grad_norm": 0.2737568489071465,
1499
- "learning_rate": 0.00011822944303213486,
1500
- "loss": 0.8893,
1501
- "step": 204
1502
- },
1503
- {
1504
- "epoch": 1.82,
1505
- "grad_norm": 0.24122455882790084,
1506
- "learning_rate": 0.00011752212403302784,
1507
- "loss": 0.9162,
1508
- "step": 205
1509
- },
1510
- {
1511
- "epoch": 1.83,
1512
- "grad_norm": 0.28991871401626856,
1513
- "learning_rate": 0.00011681389946449504,
1514
- "loss": 0.8555,
1515
- "step": 206
1516
- },
1517
- {
1518
- "epoch": 1.84,
1519
- "grad_norm": 0.23767408810646548,
1520
- "learning_rate": 0.00011610480592863531,
1521
- "loss": 0.9936,
1522
- "step": 207
1523
- },
1524
- {
1525
- "epoch": 1.85,
1526
- "grad_norm": 0.22614733706173062,
1527
- "learning_rate": 0.00011539488007245702,
1528
- "loss": 0.916,
1529
- "step": 208
1530
- },
1531
- {
1532
- "epoch": 1.86,
1533
- "grad_norm": 0.22471992425846515,
1534
- "learning_rate": 0.00011468415858598411,
1535
- "loss": 0.8872,
1536
- "step": 209
1537
- },
1538
- {
1539
- "epoch": 1.87,
1540
- "grad_norm": 0.22675717145909688,
1541
- "learning_rate": 0.00011397267820035986,
1542
- "loss": 0.8393,
1543
- "step": 210
1544
- },
1545
- {
1546
- "epoch": 1.88,
1547
- "grad_norm": 0.2727459336483823,
1548
- "learning_rate": 0.00011326047568594851,
1549
- "loss": 0.8265,
1550
- "step": 211
1551
- },
1552
- {
1553
- "epoch": 1.89,
1554
- "grad_norm": 0.25216778031670767,
1555
- "learning_rate": 0.00011254758785043515,
1556
- "loss": 0.9939,
1557
- "step": 212
1558
- },
1559
- {
1560
- "epoch": 1.9,
1561
- "grad_norm": 0.269147378424304,
1562
- "learning_rate": 0.0001118340515369232,
1563
- "loss": 0.9102,
1564
- "step": 213
1565
- },
1566
- {
1567
- "epoch": 1.91,
1568
- "grad_norm": 0.2216178370833471,
1569
- "learning_rate": 0.00011111990362203033,
1570
- "loss": 0.8778,
1571
- "step": 214
1572
- },
1573
- {
1574
- "epoch": 1.91,
1575
- "grad_norm": 0.2602474934716497,
1576
- "learning_rate": 0.00011040518101398276,
1577
- "loss": 0.9454,
1578
- "step": 215
1579
- },
1580
- {
1581
- "epoch": 1.92,
1582
- "grad_norm": 0.2658635078442998,
1583
- "learning_rate": 0.00010968992065070769,
1584
- "loss": 0.8098,
1585
- "step": 216
1586
- },
1587
- {
1588
- "epoch": 1.93,
1589
- "grad_norm": 0.20997905209488962,
1590
- "learning_rate": 0.00010897415949792427,
1591
- "loss": 0.9318,
1592
- "step": 217
1593
- },
1594
- {
1595
- "epoch": 1.94,
1596
- "grad_norm": 0.24752453752221557,
1597
- "learning_rate": 0.00010825793454723325,
1598
- "loss": 0.949,
1599
- "step": 218
1600
- },
1601
- {
1602
- "epoch": 1.95,
1603
- "grad_norm": 0.255579569750529,
1604
- "learning_rate": 0.0001075412828142051,
1605
- "loss": 0.915,
1606
- "step": 219
1607
- },
1608
- {
1609
- "epoch": 1.96,
1610
- "grad_norm": 0.23186981930561867,
1611
- "learning_rate": 0.0001068242413364671,
1612
- "loss": 0.9132,
1613
- "step": 220
1614
- },
1615
- {
1616
- "epoch": 1.97,
1617
- "grad_norm": 0.35685140391438824,
1618
- "learning_rate": 0.00010610684717178905,
1619
- "loss": 0.9398,
1620
- "step": 221
1621
- },
1622
- {
1623
- "epoch": 1.98,
1624
- "grad_norm": 0.27320389987223703,
1625
- "learning_rate": 0.00010538913739616816,
1626
- "loss": 0.857,
1627
- "step": 222
1628
- },
1629
- {
1630
- "epoch": 1.99,
1631
- "grad_norm": 0.2324276771141761,
1632
- "learning_rate": 0.00010467114910191289,
1633
- "loss": 0.8546,
1634
- "step": 223
1635
- },
1636
- {
1637
- "epoch": 1.99,
1638
- "grad_norm": 0.22820341349854167,
1639
- "learning_rate": 0.00010395291939572593,
1640
- "loss": 0.9301,
1641
- "step": 224
1642
- },
1643
- {
1644
- "epoch": 1.99,
1645
- "eval_loss": 1.2246263027191162,
1646
- "eval_runtime": 13.1981,
1647
- "eval_samples_per_second": 22.731,
1648
- "eval_steps_per_second": 2.879,
1649
- "step": 224
1650
- },
1651
- {
1652
- "epoch": 2.0,
1653
- "grad_norm": 0.2289800489154315,
1654
- "learning_rate": 0.00010323448539678653,
1655
- "loss": 0.9922,
1656
- "step": 225
1657
- },
1658
- {
1659
- "epoch": 2.01,
1660
- "grad_norm": 0.2673353778680862,
1661
- "learning_rate": 0.00010251588423483205,
1662
- "loss": 0.7779,
1663
- "step": 226
1664
- },
1665
- {
1666
- "epoch": 2.02,
1667
- "grad_norm": 0.2420933678952559,
1668
- "learning_rate": 0.0001017971530482392,
1669
- "loss": 0.8044,
1670
- "step": 227
1671
- },
1672
- {
1673
- "epoch": 2.03,
1674
- "grad_norm": 0.21799264660625498,
1675
- "learning_rate": 0.00010107832898210439,
1676
- "loss": 0.8773,
1677
- "step": 228
1678
- },
1679
- {
1680
- "epoch": 2.04,
1681
- "grad_norm": 0.21443255695871016,
1682
- "learning_rate": 0.00010035944918632429,
1683
- "loss": 0.9031,
1684
- "step": 229
1685
- },
1686
- {
1687
- "epoch": 2.05,
1688
- "grad_norm": 0.23983734165788242,
1689
- "learning_rate": 9.96405508136757e-05,
1690
- "loss": 0.9014,
1691
- "step": 230
1692
- },
1693
- {
1694
- "epoch": 2.06,
1695
- "grad_norm": 0.27915481475799336,
1696
- "learning_rate": 9.892167101789564e-05,
1697
- "loss": 0.8853,
1698
- "step": 231
1699
- },
1700
- {
1701
- "epoch": 2.07,
1702
- "grad_norm": 0.2688949371564916,
1703
- "learning_rate": 9.820284695176082e-05,
1704
- "loss": 0.8452,
1705
- "step": 232
1706
- },
1707
- {
1708
- "epoch": 2.07,
1709
- "grad_norm": 0.2623278518867105,
1710
- "learning_rate": 9.748411576516794e-05,
1711
- "loss": 0.8612,
1712
- "step": 233
1713
- },
1714
- {
1715
- "epoch": 2.08,
1716
- "grad_norm": 0.2710502639103885,
1717
- "learning_rate": 9.676551460321349e-05,
1718
- "loss": 0.8108,
1719
- "step": 234
1720
- },
1721
- {
1722
- "epoch": 2.09,
1723
- "grad_norm": 0.282572880285737,
1724
- "learning_rate": 9.60470806042741e-05,
1725
- "loss": 0.7866,
1726
- "step": 235
1727
- },
1728
- {
1729
- "epoch": 2.1,
1730
- "grad_norm": 0.2829396962922612,
1731
- "learning_rate": 9.532885089808713e-05,
1732
- "loss": 0.8557,
1733
- "step": 236
1734
- },
1735
- {
1736
- "epoch": 2.11,
1737
- "grad_norm": 0.2721172338857335,
1738
- "learning_rate": 9.461086260383187e-05,
1739
- "loss": 0.7933,
1740
- "step": 237
1741
- },
1742
- {
1743
- "epoch": 2.12,
1744
- "grad_norm": 0.29736638811364446,
1745
- "learning_rate": 9.389315282821097e-05,
1746
- "loss": 0.7674,
1747
- "step": 238
1748
- },
1749
- {
1750
- "epoch": 2.13,
1751
- "grad_norm": 0.28571679920981263,
1752
- "learning_rate": 9.317575866353292e-05,
1753
- "loss": 0.7442,
1754
- "step": 239
1755
- },
1756
- {
1757
- "epoch": 2.14,
1758
- "grad_norm": 0.264545167150173,
1759
- "learning_rate": 9.245871718579491e-05,
1760
- "loss": 0.8505,
1761
- "step": 240
1762
- },
1763
- {
1764
- "epoch": 2.15,
1765
- "grad_norm": 0.30691085134027757,
1766
- "learning_rate": 9.174206545276677e-05,
1767
- "loss": 0.7898,
1768
- "step": 241
1769
- },
1770
- {
1771
- "epoch": 2.15,
1772
- "grad_norm": 0.31375028121981235,
1773
- "learning_rate": 9.102584050207578e-05,
1774
- "loss": 0.7661,
1775
- "step": 242
1776
- },
1777
- {
1778
- "epoch": 2.16,
1779
- "grad_norm": 0.28421530221837016,
1780
- "learning_rate": 9.031007934929236e-05,
1781
- "loss": 0.8328,
1782
- "step": 243
1783
- },
1784
- {
1785
- "epoch": 2.17,
1786
- "grad_norm": 0.25601367811173414,
1787
- "learning_rate": 8.959481898601728e-05,
1788
- "loss": 0.8281,
1789
- "step": 244
1790
- },
1791
- {
1792
- "epoch": 2.18,
1793
- "grad_norm": 0.2983724947729522,
1794
- "learning_rate": 8.888009637796968e-05,
1795
- "loss": 0.8567,
1796
- "step": 245
1797
- },
1798
- {
1799
- "epoch": 2.19,
1800
- "grad_norm": 0.2545616786933236,
1801
- "learning_rate": 8.81659484630768e-05,
1802
- "loss": 0.9151,
1803
- "step": 246
1804
- },
1805
- {
1806
- "epoch": 2.2,
1807
- "grad_norm": 0.23873712362647942,
1808
- "learning_rate": 8.745241214956483e-05,
1809
- "loss": 0.8818,
1810
- "step": 247
1811
- },
1812
- {
1813
- "epoch": 2.21,
1814
- "grad_norm": 0.285331972404065,
1815
- "learning_rate": 8.673952431405148e-05,
1816
- "loss": 0.7983,
1817
- "step": 248
1818
- },
1819
- {
1820
- "epoch": 2.22,
1821
- "grad_norm": 0.23897707291689843,
1822
- "learning_rate": 8.602732179964017e-05,
1823
- "loss": 0.8758,
1824
- "step": 249
1825
- },
1826
- {
1827
- "epoch": 2.23,
1828
- "grad_norm": 0.2830966091447457,
1829
- "learning_rate": 8.531584141401591e-05,
1830
- "loss": 0.8714,
1831
- "step": 250
1832
- },
1833
- {
1834
- "epoch": 2.23,
1835
- "grad_norm": 0.28872599217076506,
1836
- "learning_rate": 8.4605119927543e-05,
1837
- "loss": 0.8387,
1838
- "step": 251
1839
- },
1840
- {
1841
- "epoch": 2.24,
1842
- "grad_norm": 0.2652236346400331,
1843
- "learning_rate": 8.38951940713647e-05,
1844
- "loss": 0.8232,
1845
- "step": 252
1846
- },
1847
- {
1848
- "epoch": 2.24,
1849
- "eval_loss": 1.2432794570922852,
1850
- "eval_runtime": 13.2405,
1851
- "eval_samples_per_second": 22.658,
1852
- "eval_steps_per_second": 2.87,
1853
- "step": 252
1854
- },
1855
- {
1856
- "epoch": 2.25,
1857
- "grad_norm": 0.299978013524394,
1858
- "learning_rate": 8.318610053550497e-05,
1859
- "loss": 0.7321,
1860
- "step": 253
1861
- },
1862
- {
1863
- "epoch": 2.26,
1864
- "grad_norm": 0.2740002835117391,
1865
- "learning_rate": 8.247787596697218e-05,
1866
- "loss": 0.7605,
1867
- "step": 254
1868
- },
1869
- {
1870
- "epoch": 2.27,
1871
- "grad_norm": 0.2848366030132808,
1872
- "learning_rate": 8.177055696786516e-05,
1873
- "loss": 0.8485,
1874
- "step": 255
1875
- },
1876
- {
1877
- "epoch": 2.28,
1878
- "grad_norm": 0.24847418856075218,
1879
- "learning_rate": 8.106418009348157e-05,
1880
- "loss": 0.7557,
1881
- "step": 256
1882
- },
1883
- {
1884
- "epoch": 2.29,
1885
- "grad_norm": 0.33515508602624905,
1886
- "learning_rate": 8.035878185042868e-05,
1887
- "loss": 0.8015,
1888
- "step": 257
1889
- },
1890
- {
1891
- "epoch": 2.3,
1892
- "grad_norm": 0.2905943721096322,
1893
- "learning_rate": 7.965439869473664e-05,
1894
- "loss": 0.8457,
1895
- "step": 258
1896
- },
1897
- {
1898
- "epoch": 2.31,
1899
- "grad_norm": 0.3140679719552616,
1900
- "learning_rate": 7.895106702997437e-05,
1901
- "loss": 0.8559,
1902
- "step": 259
1903
- },
1904
- {
1905
- "epoch": 2.31,
1906
- "grad_norm": 0.29745105018138573,
1907
- "learning_rate": 7.824882320536814e-05,
1908
- "loss": 0.7453,
1909
- "step": 260
1910
- },
1911
- {
1912
- "epoch": 2.32,
1913
- "grad_norm": 0.29818631731197365,
1914
- "learning_rate": 7.754770351392311e-05,
1915
- "loss": 0.8354,
1916
- "step": 261
1917
- },
1918
- {
1919
- "epoch": 2.33,
1920
- "grad_norm": 0.24721488944366407,
1921
- "learning_rate": 7.684774419054747e-05,
1922
- "loss": 0.7755,
1923
- "step": 262
1924
- },
1925
- {
1926
- "epoch": 2.34,
1927
- "grad_norm": 0.31210442779019465,
1928
- "learning_rate": 7.614898141017996e-05,
1929
- "loss": 0.7208,
1930
- "step": 263
1931
- },
1932
- {
1933
- "epoch": 2.35,
1934
- "grad_norm": 0.2873220240109992,
1935
- "learning_rate": 7.54514512859201e-05,
1936
- "loss": 0.7548,
1937
- "step": 264
1938
- },
1939
- {
1940
- "epoch": 2.36,
1941
- "grad_norm": 0.3006634171776217,
1942
- "learning_rate": 7.475518986716194e-05,
1943
- "loss": 0.7566,
1944
- "step": 265
1945
- },
1946
- {
1947
- "epoch": 2.37,
1948
- "grad_norm": 0.2799417613336026,
1949
- "learning_rate": 7.406023313773097e-05,
1950
- "loss": 0.727,
1951
- "step": 266
1952
- },
1953
- {
1954
- "epoch": 2.38,
1955
- "grad_norm": 0.2451761866231664,
1956
- "learning_rate": 7.336661701402439e-05,
1957
- "loss": 0.9641,
1958
- "step": 267
1959
- },
1960
- {
1961
- "epoch": 2.39,
1962
- "grad_norm": 0.305202611125298,
1963
- "learning_rate": 7.267437734315492e-05,
1964
- "loss": 0.7891,
1965
- "step": 268
1966
- },
1967
- {
1968
- "epoch": 2.39,
1969
- "grad_norm": 0.29107717848747816,
1970
- "learning_rate": 7.198354990109805e-05,
1971
- "loss": 0.9032,
1972
- "step": 269
1973
- },
1974
- {
1975
- "epoch": 2.4,
1976
- "grad_norm": 0.2688898665176787,
1977
- "learning_rate": 7.129417039084333e-05,
1978
- "loss": 0.8416,
1979
- "step": 270
1980
- },
1981
- {
1982
- "epoch": 2.41,
1983
- "grad_norm": 0.2814206029778395,
1984
- "learning_rate": 7.060627444054893e-05,
1985
- "loss": 0.8443,
1986
- "step": 271
1987
- },
1988
- {
1989
- "epoch": 2.42,
1990
- "grad_norm": 0.2862094867555512,
1991
- "learning_rate": 6.99198976017005e-05,
1992
- "loss": 0.8271,
1993
- "step": 272
1994
- },
1995
- {
1996
- "epoch": 2.43,
1997
- "grad_norm": 0.3214647340394826,
1998
- "learning_rate": 6.923507534727373e-05,
1999
- "loss": 0.7793,
2000
- "step": 273
2001
- },
2002
- {
2003
- "epoch": 2.44,
2004
- "grad_norm": 0.3033659714564417,
2005
- "learning_rate": 6.855184306990106e-05,
2006
- "loss": 0.7856,
2007
- "step": 274
2008
- },
2009
- {
2010
- "epoch": 2.45,
2011
- "grad_norm": 0.3024382342577774,
2012
- "learning_rate": 6.78702360800425e-05,
2013
- "loss": 0.8633,
2014
- "step": 275
2015
- },
2016
- {
2017
- "epoch": 2.46,
2018
- "grad_norm": 0.25803598196729505,
2019
- "learning_rate": 6.719028960416098e-05,
2020
- "loss": 0.8428,
2021
- "step": 276
2022
- },
2023
- {
2024
- "epoch": 2.47,
2025
- "grad_norm": 0.35469202971401803,
2026
- "learning_rate": 6.651203878290139e-05,
2027
- "loss": 0.8665,
2028
- "step": 277
2029
- },
2030
- {
2031
- "epoch": 2.47,
2032
- "grad_norm": 0.3122516837597691,
2033
- "learning_rate": 6.583551866927475e-05,
2034
- "loss": 0.8787,
2035
- "step": 278
2036
- },
2037
- {
2038
- "epoch": 2.48,
2039
- "grad_norm": 0.3305470786367901,
2040
- "learning_rate": 6.516076422684654e-05,
2041
- "loss": 0.8765,
2042
- "step": 279
2043
- },
2044
- {
2045
- "epoch": 2.49,
2046
- "grad_norm": 0.3324622666488467,
2047
- "learning_rate": 6.448781032792972e-05,
2048
- "loss": 0.8318,
2049
- "step": 280
2050
- },
2051
- {
2052
- "epoch": 2.49,
2053
- "eval_loss": 1.2546111345291138,
2054
- "eval_runtime": 13.2379,
2055
- "eval_samples_per_second": 22.662,
2056
- "eval_steps_per_second": 2.871,
2057
- "step": 280
2058
- },
2059
- {
2060
- "epoch": 2.5,
2061
- "grad_norm": 0.342341713579355,
2062
- "learning_rate": 6.381669175178248e-05,
2063
- "loss": 0.9517,
2064
- "step": 281
2065
- },
2066
- {
2067
- "epoch": 2.51,
2068
- "grad_norm": 0.33913458352374665,
2069
- "learning_rate": 6.31474431828108e-05,
2070
- "loss": 0.8564,
2071
- "step": 282
2072
- },
2073
- {
2074
- "epoch": 2.52,
2075
- "grad_norm": 0.30528689383480295,
2076
- "learning_rate": 6.248009920877592e-05,
2077
- "loss": 0.8199,
2078
- "step": 283
2079
- },
2080
- {
2081
- "epoch": 2.53,
2082
- "grad_norm": 0.29698648367254743,
2083
- "learning_rate": 6.181469431900672e-05,
2084
- "loss": 0.785,
2085
- "step": 284
2086
- },
2087
- {
2088
- "epoch": 2.54,
2089
- "grad_norm": 0.32239262939282626,
2090
- "learning_rate": 6.115126290261745e-05,
2091
- "loss": 0.7794,
2092
- "step": 285
2093
- },
2094
- {
2095
- "epoch": 2.55,
2096
- "grad_norm": 0.2694595905080167,
2097
- "learning_rate": 6.048983924673022e-05,
2098
- "loss": 0.8056,
2099
- "step": 286
2100
- },
2101
- {
2102
- "epoch": 2.55,
2103
- "grad_norm": 0.3045496751154443,
2104
- "learning_rate": 5.983045753470308e-05,
2105
- "loss": 0.8164,
2106
- "step": 287
2107
- },
2108
- {
2109
- "epoch": 2.56,
2110
- "grad_norm": 0.2927868214627918,
2111
- "learning_rate": 5.917315184436345e-05,
2112
- "loss": 0.8358,
2113
- "step": 288
2114
- },
2115
- {
2116
- "epoch": 2.57,
2117
- "grad_norm": 0.2931914055644858,
2118
- "learning_rate": 5.851795614624682e-05,
2119
- "loss": 0.8011,
2120
- "step": 289
2121
- },
2122
- {
2123
- "epoch": 2.58,
2124
- "grad_norm": 0.3158716819379082,
2125
- "learning_rate": 5.786490430184115e-05,
2126
- "loss": 0.8332,
2127
- "step": 290
2128
- },
2129
- {
2130
- "epoch": 2.59,
2131
- "grad_norm": 0.3482519147352008,
2132
- "learning_rate": 5.72140300618369e-05,
2133
- "loss": 0.7621,
2134
- "step": 291
2135
- },
2136
- {
2137
- "epoch": 2.6,
2138
- "grad_norm": 0.28652801822050894,
2139
- "learning_rate": 5.656536706438267e-05,
2140
- "loss": 0.77,
2141
- "step": 292
2142
- },
2143
- {
2144
- "epoch": 2.61,
2145
- "grad_norm": 0.29691290613407717,
2146
- "learning_rate": 5.591894883334667e-05,
2147
- "loss": 0.9394,
2148
- "step": 293
2149
- },
2150
- {
2151
- "epoch": 2.62,
2152
- "grad_norm": 0.26699581966985203,
2153
- "learning_rate": 5.5274808776584367e-05,
2154
- "loss": 0.7918,
2155
- "step": 294
2156
- },
2157
- {
2158
- "epoch": 2.63,
2159
- "grad_norm": 0.2926923719762685,
2160
- "learning_rate": 5.463298018421171e-05,
2161
- "loss": 0.8723,
2162
- "step": 295
2163
- },
2164
- {
2165
- "epoch": 2.64,
2166
- "grad_norm": 0.3403087263187063,
2167
- "learning_rate": 5.399349622688479e-05,
2168
- "loss": 0.8097,
2169
- "step": 296
2170
- },
2171
- {
2172
- "epoch": 2.64,
2173
- "grad_norm": 0.34261233464532476,
2174
- "learning_rate": 5.335638995408545e-05,
2175
- "loss": 0.9032,
2176
- "step": 297
2177
- },
2178
- {
2179
- "epoch": 2.65,
2180
- "grad_norm": 0.31315234759634086,
2181
- "learning_rate": 5.272169429241325e-05,
2182
- "loss": 0.82,
2183
- "step": 298
2184
- },
2185
- {
2186
- "epoch": 2.66,
2187
- "grad_norm": 0.3179759425444047,
2188
- "learning_rate": 5.208944204388377e-05,
2189
- "loss": 0.8864,
2190
- "step": 299
2191
- },
2192
- {
2193
- "epoch": 2.67,
2194
- "grad_norm": 0.3121296356843828,
2195
- "learning_rate": 5.145966588423341e-05,
2196
- "loss": 0.8258,
2197
- "step": 300
2198
- },
2199
- {
2200
- "epoch": 2.68,
2201
- "grad_norm": 0.268436849924173,
2202
- "learning_rate": 5.0832398361230596e-05,
2203
- "loss": 0.8906,
2204
- "step": 301
2205
- },
2206
- {
2207
- "epoch": 2.69,
2208
- "grad_norm": 0.2961161602467319,
2209
- "learning_rate": 5.020767189299369e-05,
2210
- "loss": 0.8828,
2211
- "step": 302
2212
- },
2213
- {
2214
- "epoch": 2.7,
2215
- "grad_norm": 0.27743957099992345,
2216
- "learning_rate": 4.9585518766315496e-05,
2217
- "loss": 0.8251,
2218
- "step": 303
2219
- },
2220
- {
2221
- "epoch": 2.71,
2222
- "grad_norm": 0.2949909861852426,
2223
- "learning_rate": 4.896597113499479e-05,
2224
- "loss": 0.7911,
2225
- "step": 304
2226
- },
2227
- {
2228
- "epoch": 2.72,
2229
- "grad_norm": 0.3161115451278363,
2230
- "learning_rate": 4.834906101817438e-05,
2231
- "loss": 0.8157,
2232
- "step": 305
2233
- },
2234
- {
2235
- "epoch": 2.72,
2236
- "grad_norm": 0.28720077046065867,
2237
- "learning_rate": 4.773482029868657e-05,
2238
- "loss": 0.82,
2239
- "step": 306
2240
- },
2241
- {
2242
- "epoch": 2.73,
2243
- "grad_norm": 0.4045319357608716,
2244
- "learning_rate": 4.712328072140505e-05,
2245
- "loss": 0.8414,
2246
- "step": 307
2247
- },
2248
- {
2249
- "epoch": 2.74,
2250
- "grad_norm": 0.3070232288390269,
2251
- "learning_rate": 4.651447389160458e-05,
2252
- "loss": 0.8427,
2253
- "step": 308
2254
- },
2255
- {
2256
- "epoch": 2.74,
2257
- "eval_loss": 1.2574400901794434,
2258
- "eval_runtime": 13.2473,
2259
- "eval_samples_per_second": 22.646,
2260
- "eval_steps_per_second": 2.869,
2261
- "step": 308
2262
- },
2263
- {
2264
- "epoch": 2.75,
2265
- "grad_norm": 0.3214782806968351,
2266
- "learning_rate": 4.5908431273327436e-05,
2267
- "loss": 0.8469,
2268
- "step": 309
2269
- },
2270
- {
2271
- "epoch": 2.76,
2272
- "grad_norm": 0.24241410698156174,
2273
- "learning_rate": 4.530518418775733e-05,
2274
- "loss": 0.8346,
2275
- "step": 310
2276
- },
2277
- {
2278
- "epoch": 2.77,
2279
- "grad_norm": 0.3303263594210879,
2280
- "learning_rate": 4.470476381160065e-05,
2281
- "loss": 0.8298,
2282
- "step": 311
2283
- },
2284
- {
2285
- "epoch": 2.78,
2286
- "grad_norm": 0.30711900849760865,
2287
- "learning_rate": 4.4107201175475275e-05,
2288
- "loss": 0.789,
2289
- "step": 312
2290
- },
2291
- {
2292
- "epoch": 2.79,
2293
- "grad_norm": 0.2954465859389713,
2294
- "learning_rate": 4.351252716230685e-05,
2295
- "loss": 0.8029,
2296
- "step": 313
2297
- },
2298
- {
2299
- "epoch": 2.8,
2300
- "grad_norm": 0.29925087091531116,
2301
- "learning_rate": 4.292077250573266e-05,
2302
- "loss": 0.8633,
2303
- "step": 314
2304
- },
2305
- {
2306
- "epoch": 2.8,
2307
- "grad_norm": 0.3177611223775825,
2308
- "learning_rate": 4.2331967788513295e-05,
2309
- "loss": 0.76,
2310
- "step": 315
2311
- },
2312
- {
2313
- "epoch": 2.81,
2314
- "grad_norm": 0.28642407848269513,
2315
- "learning_rate": 4.174614344095213e-05,
2316
- "loss": 0.823,
2317
- "step": 316
2318
- },
2319
- {
2320
- "epoch": 2.82,
2321
- "grad_norm": 0.3243224656005062,
2322
- "learning_rate": 4.116332973932256e-05,
2323
- "loss": 0.7831,
2324
- "step": 317
2325
- },
2326
- {
2327
- "epoch": 2.83,
2328
- "grad_norm": 0.34877334027822726,
2329
- "learning_rate": 4.058355680430337e-05,
2330
- "loss": 0.899,
2331
- "step": 318
2332
- },
2333
- {
2334
- "epoch": 2.84,
2335
- "grad_norm": 0.28640325479143114,
2336
- "learning_rate": 4.0006854599421926e-05,
2337
- "loss": 0.8292,
2338
- "step": 319
2339
- },
2340
- {
2341
- "epoch": 2.85,
2342
- "grad_norm": 0.3135316628014535,
2343
- "learning_rate": 3.943325292950579e-05,
2344
- "loss": 0.8731,
2345
- "step": 320
2346
- },
2347
- {
2348
- "epoch": 2.86,
2349
- "grad_norm": 0.2949970604257085,
2350
- "learning_rate": 3.886278143914219e-05,
2351
- "loss": 0.8402,
2352
- "step": 321
2353
- },
2354
- {
2355
- "epoch": 2.87,
2356
- "grad_norm": 0.30057896586780075,
2357
- "learning_rate": 3.829546961114607e-05,
2358
- "loss": 0.7713,
2359
- "step": 322
2360
- },
2361
- {
2362
- "epoch": 2.88,
2363
- "grad_norm": 0.3558574270285126,
2364
- "learning_rate": 3.773134676503629e-05,
2365
- "loss": 0.8435,
2366
- "step": 323
2367
- },
2368
- {
2369
- "epoch": 2.88,
2370
- "grad_norm": 0.29115288332943334,
2371
- "learning_rate": 3.7170442055520415e-05,
2372
- "loss": 0.9022,
2373
- "step": 324
2374
- },
2375
- {
2376
- "epoch": 2.89,
2377
- "grad_norm": 0.3192074718527619,
2378
- "learning_rate": 3.661278447098789e-05,
2379
- "loss": 0.7662,
2380
- "step": 325
2381
- },
2382
- {
2383
- "epoch": 2.9,
2384
- "grad_norm": 0.33335742888185405,
2385
- "learning_rate": 3.605840283201195e-05,
2386
- "loss": 0.8111,
2387
- "step": 326
2388
- },
2389
- {
2390
- "epoch": 2.91,
2391
- "grad_norm": 0.29748212071395186,
2392
- "learning_rate": 3.550732578986006e-05,
2393
- "loss": 0.7543,
2394
- "step": 327
2395
- },
2396
- {
2397
- "epoch": 2.92,
2398
- "grad_norm": 0.3680409192627914,
2399
- "learning_rate": 3.495958182501325e-05,
2400
- "loss": 0.8124,
2401
- "step": 328
2402
- },
2403
- {
2404
- "epoch": 2.93,
2405
- "grad_norm": 0.27807302364345643,
2406
- "learning_rate": 3.441519924569408e-05,
2407
- "loss": 0.7856,
2408
- "step": 329
2409
- },
2410
- {
2411
- "epoch": 2.94,
2412
- "grad_norm": 0.3050855823733691,
2413
- "learning_rate": 3.387420618640379e-05,
2414
- "loss": 0.8506,
2415
- "step": 330
2416
- },
2417
- {
2418
- "epoch": 2.95,
2419
- "grad_norm": 0.3322620263029238,
2420
- "learning_rate": 3.3336630606468134e-05,
2421
- "loss": 0.8771,
2422
- "step": 331
2423
- },
2424
- {
2425
- "epoch": 2.96,
2426
- "grad_norm": 0.3112008867427982,
2427
- "learning_rate": 3.280250028859248e-05,
2428
- "loss": 0.7785,
2429
- "step": 332
2430
- },
2431
- {
2432
- "epoch": 2.96,
2433
- "grad_norm": 0.2839548329095365,
2434
- "learning_rate": 3.227184283742591e-05,
2435
- "loss": 0.9153,
2436
- "step": 333
2437
- },
2438
- {
2439
- "epoch": 2.97,
2440
- "grad_norm": 0.34615397822650606,
2441
- "learning_rate": 3.174468567813461e-05,
2442
- "loss": 0.7753,
2443
- "step": 334
2444
- },
2445
- {
2446
- "epoch": 2.98,
2447
- "grad_norm": 0.34691866307772695,
2448
- "learning_rate": 3.122105605498442e-05,
2449
- "loss": 0.851,
2450
- "step": 335
2451
- },
2452
- {
2453
- "epoch": 2.99,
2454
- "grad_norm": 0.296369624391198,
2455
- "learning_rate": 3.070098102993302e-05,
2456
- "loss": 0.8572,
2457
- "step": 336
2458
- },
2459
- {
2460
- "epoch": 2.99,
2461
- "eval_loss": 1.2511259317398071,
2462
- "eval_runtime": 13.2202,
2463
- "eval_samples_per_second": 22.692,
2464
- "eval_steps_per_second": 2.874,
2465
- "step": 336
2466
- }
2467
- ],
2468
- "logging_steps": 1,
2469
- "max_steps": 448,
2470
- "num_input_tokens_seen": 0,
2471
- "num_train_epochs": 4,
2472
- "save_steps": 112,
2473
- "total_flos": 4.005448726012232e+17,
2474
- "train_batch_size": 2,
2475
- "trial_name": null,
2476
- "trial_params": null
2477
- }
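The deleted trainer_state.json above is plain JSON, so its log_history entries (per-step loss, grad_norm and learning_rate, plus periodic eval_loss records) are easy to inspect with the standard library. A minimal sketch, assuming a surviving local copy of the deleted file:

```python
import json

# Assumes a local copy of the file deleted in this commit.
with open("checkpoint-336/trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; the periodic evaluation entries carry "eval_loss".
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
print(f"{len(train)} logged steps; final eval_loss {evals[-1][1]:.4f} at step {evals[-1][0]}")
```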
 
 
 
 
checkpoint-336/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d400c16f982c36b10268ff7e69e878c44d11f5fb692a61770a8e1efb50d4491c
3
- size 6776
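The three lines above are a Git LFS pointer, not the file itself; the 6776-byte training_args.bin it references is a pickled transformers TrainingArguments object saved with torch.save. A minimal sketch of inspecting it, assuming the real LFS object was fetched (git lfs pull) before the deletion:

```python
import torch

# weights_only=False is needed because this is a pickled TrainingArguments
# object, not a plain tensor file.
args = torch.load("checkpoint-336/training_args.bin", weights_only=False)
# Should agree with the values recorded in trainer_state.json above,
# e.g. num_train_epochs=4 and a per-device train batch size of 2.
print(args.num_train_epochs, args.per_device_train_batch_size)
```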
 
 
 
 
checkpoint-336/zero_to_fp32.py DELETED
@@ -1,592 +0,0 @@
1
- #!/usr/bin/env python
2
-
3
- # Copyright (c) Microsoft Corporation.
4
- # SPDX-License-Identifier: Apache-2.0
5
-
6
- # DeepSpeed Team
7
-
8
- # This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
9
- # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
- # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
- # application.
12
- #
13
- # example: python zero_to_fp32.py . pytorch_model.bin
14
-
15
- import argparse
16
- import torch
17
- import glob
18
- import math
19
- import os
20
- import re
21
- from collections import OrderedDict
22
- from dataclasses import dataclass
23
-
24
- # while this script doesn't use deepspeed to recover data, the checkpoints are pickled with
25
- # DeepSpeed data structures, so deepspeed has to be available in the current python environment.
26
- from deepspeed.utils import logger
27
- from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
28
- FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
29
- FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
30
-
31
-
32
- @dataclass
33
- class zero_model_state:
34
- buffers: dict
35
- param_shapes: dict
36
- shared_params: list
37
- ds_version: int
38
- frozen_param_shapes: dict
39
- frozen_param_fragments: dict
40
-
41
-
42
- debug = 0
43
-
44
- # load to cpu
45
- device = torch.device('cpu')
46
-
47
-
48
- def atoi(text):
49
- return int(text) if text.isdigit() else text
50
-
51
-
52
- def natural_keys(text):
53
- '''
54
- alist.sort(key=natural_keys) sorts in human order
55
- http://nedbatchelder.com/blog/200712/human_sorting.html
56
- (See Toothy's implementation in the comments)
57
- '''
58
- return [atoi(c) for c in re.split(r'(\d+)', text)]
59
-
60
-
61
- def get_model_state_file(checkpoint_dir, zero_stage):
62
- if not os.path.isdir(checkpoint_dir):
63
- raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
64
-
65
- # there should be only one file
66
- if zero_stage <= 2:
67
- file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
68
- elif zero_stage == 3:
69
- file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
70
-
71
- if not os.path.exists(file):
72
- raise FileNotFoundError(f"can't find model states file at '{file}'")
73
-
74
- return file
75
-
76
-
77
- def get_checkpoint_files(checkpoint_dir, glob_pattern):
78
- # XXX: need to test that this simple glob rule works for multi-node setup too
79
- ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
80
-
81
- if len(ckpt_files) == 0:
82
- raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
83
-
84
- return ckpt_files
85
-
86
-
87
- def get_optim_files(checkpoint_dir):
88
- return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
89
-
90
-
91
- def get_model_state_files(checkpoint_dir):
92
- return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
93
-
94
-
95
- def parse_model_states(files):
96
- zero_model_states = []
97
- for file in files:
98
- state_dict = torch.load(file, map_location=device)
99
-
100
- if BUFFER_NAMES not in state_dict:
101
- raise ValueError(f"{file} is not a model state checkpoint")
102
- buffer_names = state_dict[BUFFER_NAMES]
103
- if debug:
104
- print("Found buffers:", buffer_names)
105
-
106
- # recover just the buffers while restoring them to fp32 if they were saved in fp16
107
- buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
108
- param_shapes = state_dict[PARAM_SHAPES]
109
-
110
- # collect parameters that are included in param_shapes
111
- param_names = []
112
- for s in param_shapes:
113
- for name in s.keys():
114
- param_names.append(name)
115
-
116
- # update with frozen parameters
117
- frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
118
- if frozen_param_shapes is not None:
119
- if debug:
120
- print(f"Found frozen_param_shapes: {frozen_param_shapes}")
121
- param_names += list(frozen_param_shapes.keys())
122
-
123
- # handle shared params
124
- shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
125
-
126
- ds_version = state_dict.get(DS_VERSION, None)
127
-
128
- frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
129
-
130
- z_model_state = zero_model_state(buffers=buffers,
131
- param_shapes=param_shapes,
132
- shared_params=shared_params,
133
- ds_version=ds_version,
134
- frozen_param_shapes=frozen_param_shapes,
135
- frozen_param_fragments=frozen_param_fragments)
136
- zero_model_states.append(z_model_state)
137
-
138
- return zero_model_states
139
-
140
-
141
- def parse_optim_states(files, ds_checkpoint_dir):
142
-
143
- total_files = len(files)
144
- state_dicts = []
145
- for f in files:
146
- state_dict = torch.load(f, map_location=device)
147
- # immediately discard the two potentially huge optimizer states, as we only care about the fp32 master weights
148
- # and also handle the case where it was already removed by another helper script
149
- state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
150
- state_dicts.append(state_dict)
151
-
152
- if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
153
- raise ValueError(f"{files[0]} is not a zero checkpoint")
154
- zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
155
- world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
156
-
157
- # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
158
- # parameters can be different from data parallelism for non-expert parameters. So we can just
159
- # use the max of the partition_count to get the dp world_size.
160
-
161
- if type(world_size) is list:
162
- world_size = max(world_size)
163
-
164
- if world_size != total_files:
165
- raise ValueError(
166
- f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
167
- "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
168
- )
169
-
170
- # the groups are named differently in each stage
171
- if zero_stage <= 2:
172
- fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
173
- elif zero_stage == 3:
174
- fp32_groups_key = FP32_FLAT_GROUPS
175
- else:
176
- raise ValueError(f"unknown zero stage {zero_stage}")
177
-
178
- if zero_stage <= 2:
179
- fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
180
- elif zero_stage == 3:
181
- # if there is more than one param group, there will be multiple flattened tensors - one
182
- # flattened tensor per group - for simplicity merge them into a single tensor
183
- #
184
- # XXX: could make the script more memory efficient for when there are multiple groups - it
185
- # will require matching the sub-lists of param_shapes for each param group flattened tensor
186
-
187
- fp32_flat_groups = [
188
- torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
189
- ]
190
-
191
- return zero_stage, world_size, fp32_flat_groups
192
-
193
-
194
- def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
195
- """
196
- Returns fp32 state_dict reconstructed from ds checkpoint
197
-
198
- Args:
199
- - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
200
-
201
- """
202
- print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
203
-
204
- optim_files = get_optim_files(ds_checkpoint_dir)
205
- zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
206
- print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
207
-
208
- model_files = get_model_state_files(ds_checkpoint_dir)
209
-
210
- zero_model_states = parse_model_states(model_files)
211
- print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
212
-
213
- if zero_stage <= 2:
214
- return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
215
- elif zero_stage == 3:
216
- return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
217
-
218
-
219
- def _zero2_merge_frozen_params(state_dict, zero_model_states):
220
- if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
221
- return
222
-
223
- frozen_param_shapes = zero_model_states[0].frozen_param_shapes
224
- frozen_param_fragments = zero_model_states[0].frozen_param_fragments
225
-
226
- if debug:
227
- num_elem = sum(s.numel() for s in frozen_param_shapes.values())
228
- print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
229
-
230
- wanted_params = len(frozen_param_shapes)
231
- wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
232
- avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
233
- print(f'Frozen params: Have {avail_numel} numels to process.')
234
- print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
235
-
236
- total_params = 0
237
- total_numel = 0
238
- for name, shape in frozen_param_shapes.items():
239
- total_params += 1
240
- unpartitioned_numel = shape.numel()
241
- total_numel += unpartitioned_numel
242
-
243
- state_dict[name] = frozen_param_fragments[name]
244
-
245
- if debug:
246
- print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
247
-
248
- print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
249
-
250
-
251
- def _has_callable(obj, fn):
252
- attr = getattr(obj, fn, None)
253
- return callable(attr)
254
-
255
-
256
- def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
257
- param_shapes = zero_model_states[0].param_shapes
258
-
259
- # Reconstruction protocol:
260
- #
261
- # XXX: document this
262
-
263
- if debug:
264
- for i in range(world_size):
265
- for j in range(len(fp32_flat_groups[0])):
266
- print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
267
-
268
- # XXX: memory usage doubles here (zero2)
269
- num_param_groups = len(fp32_flat_groups[0])
270
- merged_single_partition_of_fp32_groups = []
271
- for i in range(num_param_groups):
272
- merged_partitions = [sd[i] for sd in fp32_flat_groups]
273
- full_single_fp32_vector = torch.cat(merged_partitions, 0)
274
- merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
275
- avail_numel = sum(
276
- [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
277
-
278
- if debug:
279
- wanted_params = sum([len(shapes) for shapes in param_shapes])
280
- wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
281
- # not asserting if there is a mismatch due to possible padding
282
- print(f"Have {avail_numel} numels to process.")
283
- print(f"Need {wanted_numel} numels in {wanted_params} params.")
284
-
285
- # params
286
- # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
287
- # an out-of-core computing solution
288
- total_numel = 0
289
- total_params = 0
290
- for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
291
- offset = 0
292
- avail_numel = full_single_fp32_vector.numel()
293
- for name, shape in shapes.items():
294
-
295
- unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
296
- total_numel += unpartitioned_numel
297
- total_params += 1
298
-
299
- if debug:
300
- print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
301
- state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
302
- offset += unpartitioned_numel
303
-
304
- # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
305
- # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
306
- # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
307
- # live optimizer object, so we are checking that the numbers are within the right range
308
- align_to = 2 * world_size
309
-
310
- def zero2_align(x):
311
- return align_to * math.ceil(x / align_to)
312
-
313
- if debug:
314
- print(f"original offset={offset}, avail_numel={avail_numel}")
315
-
316
- offset = zero2_align(offset)
317
- avail_numel = zero2_align(avail_numel)
318
-
319
- if debug:
320
- print(f"aligned offset={offset}, avail_numel={avail_numel}")
321
-
322
- # Sanity check
323
- if offset != avail_numel:
324
- raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
325
-
326
- print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
327
-
328
-
329
- def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
330
- state_dict = OrderedDict()
331
-
332
- # buffers
333
- buffers = zero_model_states[0].buffers
334
- state_dict.update(buffers)
335
- if debug:
336
- print(f"added {len(buffers)} buffers")
337
-
338
- _zero2_merge_frozen_params(state_dict, zero_model_states)
339
-
340
- _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
341
-
342
- # recover shared parameters
343
- for pair in zero_model_states[0].shared_params:
344
- if pair[1] in state_dict:
345
- state_dict[pair[0]] = state_dict[pair[1]]
346
-
347
- return state_dict
348
-
349
-
350
- def zero3_partitioned_param_info(unpartitioned_numel, world_size):
351
- remainder = unpartitioned_numel % world_size
352
- padding_numel = (world_size - remainder) if remainder else 0
353
- partitioned_numel = math.ceil(unpartitioned_numel / world_size)
354
- return partitioned_numel, padding_numel
355
-
356
-
357
- def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
358
- if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
359
- return
360
-
361
- if debug:
362
- for i in range(world_size):
363
- num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
364
- print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
365
-
366
- frozen_param_shapes = zero_model_states[0].frozen_param_shapes
367
- wanted_params = len(frozen_param_shapes)
368
- wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
369
- avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
370
- print(f'Frozen params: Have {avail_numel} numels to process.')
371
- print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
372
-
373
- total_params = 0
374
- total_numel = 0
375
- for name, shape in zero_model_states[0].frozen_param_shapes.items():
376
- total_params += 1
377
- unpartitioned_numel = shape.numel()
378
- total_numel += unpartitioned_numel
379
-
380
- param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
381
- state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
382
-
383
- partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
384
-
385
- if debug:
386
- print(
387
- f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
388
- )
389
-
390
- print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
391
-
392
-
393
- def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
394
- param_shapes = zero_model_states[0].param_shapes
395
- avail_numel = fp32_flat_groups[0].numel() * world_size
396
- # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
397
- # param, re-consolidating each param, while dealing with padding if any
398
-
399
- # merge list of dicts, preserving order
400
- param_shapes = {k: v for d in param_shapes for k, v in d.items()}
401
-
402
- if debug:
403
- for i in range(world_size):
404
- print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
405
-
406
- wanted_params = len(param_shapes)
407
- wanted_numel = sum(shape.numel() for shape in param_shapes.values())
408
- # not asserting if there is a mismatch due to possible padding
409
- avail_numel = fp32_flat_groups[0].numel() * world_size
410
- print(f"Trainable params: Have {avail_numel} numels to process.")
411
- print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
412
-
413
- # params
414
- # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
415
- # an out-of-core computing solution
416
- offset = 0
417
- total_numel = 0
418
- total_params = 0
419
- for name, shape in param_shapes.items():
420
-
421
- unpartitioned_numel = shape.numel()
422
- total_numel += unpartitioned_numel
423
- total_params += 1
424
-
425
- partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
426
-
427
- if debug:
428
- print(
429
- f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
430
- )
431
-
432
- # XXX: memory usage doubles here
433
- state_dict[name] = torch.cat(
434
- tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
435
- 0).narrow(0, 0, unpartitioned_numel).view(shape)
436
- offset += partitioned_numel
437
-
438
- offset *= world_size
439
-
440
- # Sanity check
441
- if offset != avail_numel:
442
- raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
443
-
444
- print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
445
-
446
-
447
- def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
448
- state_dict = OrderedDict()
449
-
450
- # buffers
451
- buffers = zero_model_states[0].buffers
452
- state_dict.update(buffers)
453
- if debug:
454
- print(f"added {len(buffers)} buffers")
455
-
456
- _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
457
-
458
- _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
459
-
460
- # recover shared parameters
461
- for pair in zero_model_states[0].shared_params:
462
- if pair[1] in state_dict:
463
- state_dict[pair[0]] = state_dict[pair[1]]
464
-
465
- return state_dict
466
-
467
-
468
- def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
469
- """
470
- Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
471
- ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
472
- via a model hub.
473
-
474
- Args:
475
- - ``checkpoint_dir``: path to the desired checkpoint folder
476
- - ``tag``: checkpoint tag used as a unique identifier for the checkpoint. If not provided, the tag is read from the 'latest' file, e.g. ``global_step14``
477
-
478
- Returns:
479
- - pytorch ``state_dict``
480
-
481
- Note: this approach may not work if your application doesn't have sufficient free CPU memory and
482
- you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
483
- the checkpoint.
484
-
485
- A typical usage might be ::
486
-
487
- from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
488
- # do the training and checkpoint saving
489
- state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
490
- model = model.cpu() # move to cpu
491
- model.load_state_dict(state_dict)
492
- # submit to model hub or save the model to share with others
493
-
494
- In this example the ``model`` will no longer be usable in the deepspeed context of the same
495
- application, i.e. you will need to re-initialize the deepspeed engine, since
496
- ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
497
-
498
- If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
499
-
500
- """
501
- if tag is None:
502
- latest_path = os.path.join(checkpoint_dir, 'latest')
503
- if os.path.isfile(latest_path):
504
- with open(latest_path, 'r') as fd:
505
- tag = fd.read().strip()
506
- else:
507
- raise ValueError(f"Unable to find 'latest' file at {latest_path}")
508
-
509
- ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
510
-
511
- if not os.path.isdir(ds_checkpoint_dir):
512
- raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
513
-
514
- return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
515
-
516
-
517
- def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
518
- """
519
- Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
520
- loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
521
-
522
- Args:
523
- - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
524
- - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
525
- - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
526
- """
527
-
528
- state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
529
- print(f"Saving fp32 state dict to {output_file}")
530
- torch.save(state_dict, output_file)
531
-
532
-
533
- def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
534
- """
535
- 1. Put the provided model to cpu
536
- 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
537
- 3. Load it into the provided model
538
-
539
- Args:
540
- - ``model``: the model object to update
541
- - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
542
- - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
543
-
544
- Returns:
545
- - ``model``: modified model
546
-
547
- Make sure you have plenty of CPU memory available before you call this function. If you don't
548
- have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
549
- conveniently placed for you in the checkpoint folder.
550
-
551
- A typical usage might be ::
552
-
553
- from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
554
- model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
555
- # submit to model hub or save the model to share with others
556
-
557
- Note that once this has been run, the ``model`` will no longer be usable in the deepspeed context
558
- of the same application, i.e. you will need to re-initialize the deepspeed engine, since
559
- ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
560
-
561
- """
562
- logger.info(f"Extracting fp32 weights")
563
- state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
564
-
565
- logger.info(f"Overwriting model with fp32 weights")
566
- model = model.cpu()
567
- model.load_state_dict(state_dict, strict=False)
568
-
569
- return model
570
-
571
-
572
- if __name__ == "__main__":
573
-
574
- parser = argparse.ArgumentParser()
575
- parser.add_argument("checkpoint_dir",
576
- type=str,
577
- help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
578
- parser.add_argument(
579
- "output_file",
580
- type=str,
581
- help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
582
- parser.add_argument("-t",
583
- "--tag",
584
- type=str,
585
- default=None,
586
- help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
587
- parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
588
- args = parser.parse_args()
589
-
590
- debug = args.debug
591
-
592
- convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)
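Besides the CLI invocation shown in the header comment (python zero_to_fp32.py . pytorch_model.bin), the script above is importable, so the same conversion can be driven from Python. A minimal sketch using the script's own entry point on one of the checkpoints deleted in this commit (DeepSpeed must be installed, since the module imports it at load time):

```python
# Run from a directory containing zero_to_fp32.py and the checkpoint folder.
from zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

convert_zero_checkpoint_to_fp32_state_dict("checkpoint-336", "pytorch_model.bin")
# The resulting pytorch_model.bin loads without DeepSpeed:
#   model.load_state_dict(torch.load("pytorch_model.bin"))
```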
 
 
 
 
checkpoint-372/README.md DELETED
@@ -1,202 +0,0 @@
1
- ---
2
- library_name: peft
3
- base_model: google/gemma-7b-it
4
- ---
5
-
6
- # Model Card for Model ID
7
-
8
- <!-- Provide a quick summary of what the model is/does. -->
9
-
10
-
11
-
12
- ## Model Details
13
-
14
- ### Model Description
15
-
16
- <!-- Provide a longer summary of what this model is. -->
17
-
18
-
19
-
20
- - **Developed by:** [More Information Needed]
21
- - **Funded by [optional]:** [More Information Needed]
22
- - **Shared by [optional]:** [More Information Needed]
23
- - **Model type:** [More Information Needed]
24
- - **Language(s) (NLP):** [More Information Needed]
25
- - **License:** [More Information Needed]
26
- - **Finetuned from model [optional]:** [More Information Needed]
27
-
28
- ### Model Sources [optional]
29
-
30
- <!-- Provide the basic links for the model. -->
31
-
32
- - **Repository:** [More Information Needed]
33
- - **Paper [optional]:** [More Information Needed]
34
- - **Demo [optional]:** [More Information Needed]
35
-
36
- ## Uses
37
-
38
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
-
40
- ### Direct Use
41
-
42
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
-
44
- [More Information Needed]
45
-
46
- ### Downstream Use [optional]
47
-
48
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
-
50
- [More Information Needed]
51
-
52
- ### Out-of-Scope Use
53
-
54
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
-
56
- [More Information Needed]
57
-
58
- ## Bias, Risks, and Limitations
59
-
60
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
-
62
- [More Information Needed]
63
-
64
- ### Recommendations
65
-
66
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
-
68
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
-
70
- ## How to Get Started with the Model
71
-
72
- Use the code below to get started with the model.
73
-
74
- [More Information Needed]
75
-
76
- ## Training Details
77
-
78
- ### Training Data
79
-
80
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
-
82
- [More Information Needed]
83
-
84
- ### Training Procedure
85
-
86
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
-
88
- #### Preprocessing [optional]
89
-
90
- [More Information Needed]
91
-
92
-
93
- #### Training Hyperparameters
94
-
95
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
-
97
- #### Speeds, Sizes, Times [optional]
98
-
99
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
-
101
- [More Information Needed]
102
-
103
- ## Evaluation
104
-
105
- <!-- This section describes the evaluation protocols and provides the results. -->
106
-
107
- ### Testing Data, Factors & Metrics
108
-
109
- #### Testing Data
110
-
111
- <!-- This should link to a Dataset Card if possible. -->
112
-
113
- [More Information Needed]
114
-
115
- #### Factors
116
-
117
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
-
119
- [More Information Needed]
120
-
121
- #### Metrics
122
-
123
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
-
125
- [More Information Needed]
126
-
127
- ### Results
128
-
129
- [More Information Needed]
130
-
131
- #### Summary
132
-
133
-
134
-
135
- ## Model Examination [optional]
136
-
137
- <!-- Relevant interpretability work for the model goes here -->
138
-
139
- [More Information Needed]
140
-
141
- ## Environmental Impact
142
-
143
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
-
145
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
-
147
- - **Hardware Type:** [More Information Needed]
148
- - **Hours used:** [More Information Needed]
149
- - **Cloud Provider:** [More Information Needed]
150
- - **Compute Region:** [More Information Needed]
151
- - **Carbon Emitted:** [More Information Needed]
152
-
153
- ## Technical Specifications [optional]
154
-
155
- ### Model Architecture and Objective
156
-
157
- [More Information Needed]
158
-
159
- ### Compute Infrastructure
160
-
161
- [More Information Needed]
162
-
163
- #### Hardware
164
-
165
- [More Information Needed]
166
-
167
- #### Software
168
-
169
- [More Information Needed]
170
-
171
- ## Citation [optional]
172
-
173
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
-
175
- **BibTeX:**
176
-
177
- [More Information Needed]
178
-
179
- **APA:**
180
-
181
- [More Information Needed]
182
-
183
- ## Glossary [optional]
184
-
185
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
-
187
- [More Information Needed]
188
-
189
- ## More Information [optional]
190
-
191
- [More Information Needed]
192
-
193
- ## Model Card Authors [optional]
194
-
195
- [More Information Needed]
196
-
197
- ## Model Card Contact
198
-
199
- [More Information Needed]
200
- ### Framework versions
201
-
202
- - PEFT 0.9.0
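Although the card body is still the unfilled template, its front matter (library_name: peft, base_model: google/gemma-7b-it) is enough to load the adapter this checkpoint contained. A minimal sketch, assuming a local copy of the deleted checkpoint-372 folder:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# base_model taken from the card's front matter; the adapter path assumes
# a local copy of the checkpoint-372 folder deleted in this commit.
base = AutoModelForCausalLM.from_pretrained("google/gemma-7b-it")
tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it")
model = PeftModel.from_pretrained(base, "checkpoint-372")
```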
 
 
 
 
checkpoint-372/adapter_config.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": null,
4
- "base_model_name_or_path": "google/gemma-7b-it",
5
- "bias": "none",
6
- "fan_in_fan_out": null,
7
- "inference_mode": true,
8
- "init_lora_weights": true,
9
- "layers_pattern": null,
10
- "layers_to_transform": null,
11
- "loftq_config": {},
12
- "lora_alpha": 16,
13
- "lora_dropout": 0.05,
14
- "megatron_config": null,
15
- "megatron_core": "megatron.core",
16
- "modules_to_save": null,
17
- "peft_type": "LORA",
18
- "r": 32,
19
- "rank_pattern": {},
20
- "revision": null,
21
- "target_modules": [
22
- "down_proj",
23
- "o_proj",
24
- "k_proj",
25
- "q_proj",
26
- "gate_proj",
27
- "up_proj",
28
- "v_proj"
29
- ],
30
- "task_type": "CAUSAL_LM",
31
- "use_dora": false,
32
- "use_rslora": false
33
- }
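The fields in the deleted adapter_config.json map one-to-one onto peft's LoraConfig, so the same adapter setup can be restated in code. A sketch of the non-default values shown above:

```python
from peft import LoraConfig

# Values copied from the deleted adapter_config.json.
config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["down_proj", "o_proj", "k_proj", "q_proj",
                    "gate_proj", "up_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
```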
 
 
 
 
checkpoint-372/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c23b965687f7bf2e033e1e8051de69e24c99f3103c06606007e68485ebfabea
3
- size 200068904
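As with the other large files in this commit, only the LFS pointer is shown; the ~200 MB adapter_model.safetensors behind it is an ordinary safetensors file. A sketch of inspecting it directly, assuming the real object is on disk:

```python
from safetensors.torch import load_file

tensors = load_file("checkpoint-372/adapter_model.safetensors")
print(len(tensors), "LoRA tensors;", sum(t.numel() for t in tensors.values()), "parameters")
```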
 
 
 
 
checkpoint-372/global_step372/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:712292fc7e5a6d570c1376cc3be7e12dab2d34fb7ffe48281da38c8053603a39
3
- size 150126608
 
 
 
 
checkpoint-372/global_step372/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3c6bbd461df3af80fb33496e1907ef542102bfa96434d50c174fe80c0dd98e4
3
- size 150126672
 
 
 
 
checkpoint-372/global_step372/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:46fee0787345b6483d7a54f3ceeb3260a7a8bef008c22e24c18225027433ff01
3
- size 150126736
 
 
 
 
checkpoint-372/global_step372/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0fbe9084b027f2164f5fa8039ea7d37a722d0e0f9f70b2a76fa605e462a2ad6e
3
- size 150126736
 
 
 
 
checkpoint-372/global_step372/mp_rank_00_model_states.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a38ea9a669e473fff57e6c134dd6703ddacc9f123121be165e81bcdcad09513b
3
- size 1896781286
 
 
 
 
checkpoint-372/latest DELETED
@@ -1 +0,0 @@
1
- global_step372
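This one-line latest file is what the deleted zero_to_fp32.py reads to resolve its tag argument when none is given: it simply names the global-step subfolder holding the optimizer and model states. The same resolution step, sketched:

```python
import os

# Mirrors the tag-resolution logic in zero_to_fp32.py above.
checkpoint_dir = "checkpoint-372"
with open(os.path.join(checkpoint_dir, "latest")) as fd:
    tag = fd.read().strip()  # -> "global_step372"
print(os.path.join(checkpoint_dir, tag))  # folder with the *_optim_states.pt files
```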
 
 
checkpoint-372/rng_state_0.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ebe22192845fac896cd970f52665ebcfd6b5796077804b55f0d8830fcfa32be
3
- size 15024
 
 
 
 
checkpoint-372/rng_state_1.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5bbd2194b05d2155d794f7732bdab8deaa38ee92f4c49fa250d0c9f0fd5f532
3
- size 15024
 
 
 
 
checkpoint-372/rng_state_2.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f480e768d1b3f6474c222ba1e9d373d3fa99aeb3a944de3d1648ac20b4077d2a
3
- size 15024
 
 
 
 
checkpoint-372/rng_state_3.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:58480874185eed88dd61379fe4c13e95c8e8899caf976ff7beca6c2c29f825de
3
- size 15024
 
 
 
 
checkpoint-372/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0a1b84ad14ee5a7b2967e455e771394bef96a10da0abeba0c7fac61961ff2bf
3
- size 1064
 
 
 
 
checkpoint-372/trainer_state.json DELETED
@@ -1,2753 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 3.9715302491103204,
5
- "eval_steps": 24,
6
- "global_step": 372,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 0.01,
13
- "grad_norm": 1.8206765789002874,
14
- "learning_rate": 2.2222222222222223e-05,
15
- "loss": 5.0474,
16
- "step": 1
17
- },
18
- {
19
- "epoch": 0.01,
20
- "eval_loss": 5.927858829498291,
21
- "eval_runtime": 117.3665,
22
- "eval_samples_per_second": 8.512,
23
- "eval_steps_per_second": 0.537,
24
- "step": 1
25
- },
26
- {
27
- "epoch": 0.02,
28
- "grad_norm": 1.9889295079554647,
29
- "learning_rate": 4.4444444444444447e-05,
30
- "loss": 5.5569,
31
- "step": 2
32
- },
33
- {
34
- "epoch": 0.03,
35
- "grad_norm": 1.8931443004310682,
36
- "learning_rate": 6.666666666666667e-05,
37
- "loss": 5.2383,
38
- "step": 3
39
- },
40
- {
41
- "epoch": 0.04,
42
- "grad_norm": 2.195266234429632,
43
- "learning_rate": 8.888888888888889e-05,
44
- "loss": 5.4943,
45
- "step": 4
46
- },
47
- {
48
- "epoch": 0.05,
49
- "grad_norm": 2.6001064132041503,
50
- "learning_rate": 0.00011111111111111112,
51
- "loss": 5.2602,
52
- "step": 5
53
- },
54
- {
55
- "epoch": 0.06,
56
- "grad_norm": 3.26301463076567,
57
- "learning_rate": 0.00013333333333333334,
58
- "loss": 4.8182,
59
- "step": 6
60
- },
61
- {
62
- "epoch": 0.07,
63
- "grad_norm": 3.476044691292363,
64
- "learning_rate": 0.00015555555555555556,
65
- "loss": 4.0432,
66
- "step": 7
67
- },
68
- {
69
- "epoch": 0.09,
70
- "grad_norm": 3.378803229553045,
71
- "learning_rate": 0.00017777777777777779,
72
- "loss": 3.5212,
73
- "step": 8
74
- },
75
- {
76
- "epoch": 0.1,
77
- "grad_norm": 3.9419449437137017,
78
- "learning_rate": 0.0002,
79
- "loss": 3.2239,
80
- "step": 9
81
- },
82
- {
83
- "epoch": 0.11,
84
- "grad_norm": 5.8833082175146485,
85
- "learning_rate": 0.00019999625498303932,
86
- "loss": 3.4319,
87
- "step": 10
88
- },
89
- {
90
- "epoch": 0.12,
91
- "grad_norm": 5.4690223843996515,
92
- "learning_rate": 0.0001999850202126604,
93
- "loss": 2.8167,
94
- "step": 11
95
- },
96
- {
97
- "epoch": 0.13,
98
- "grad_norm": 7.009614336449043,
99
- "learning_rate": 0.00019996629653035126,
100
- "loss": 2.7966,
101
- "step": 12
102
- },
103
- {
104
- "epoch": 0.14,
105
- "grad_norm": 6.254841874500106,
106
- "learning_rate": 0.0001999400853385221,
107
- "loss": 2.1336,
108
- "step": 13
109
- },
110
- {
111
- "epoch": 0.15,
112
- "grad_norm": 6.037710889841169,
113
- "learning_rate": 0.00019990638860040006,
114
- "loss": 1.85,
115
- "step": 14
116
- },
117
- {
118
- "epoch": 0.16,
119
- "grad_norm": 1.0500019118881985,
120
- "learning_rate": 0.00019986520883988232,
121
- "loss": 1.5964,
122
- "step": 15
123
- },
124
- {
125
- "epoch": 0.17,
126
- "grad_norm": 0.6169710624824223,
127
- "learning_rate": 0.00019981654914134686,
128
- "loss": 1.4307,
129
- "step": 16
130
- },
131
- {
132
- "epoch": 0.18,
133
- "grad_norm": 1.86114059095932,
134
- "learning_rate": 0.00019976041314942155,
135
- "loss": 1.4285,
136
- "step": 17
137
- },
138
- {
139
- "epoch": 0.19,
140
- "grad_norm": 1.6513877610200167,
141
- "learning_rate": 0.00019969680506871137,
142
- "loss": 1.4621,
143
- "step": 18
144
- },
145
- {
146
- "epoch": 0.2,
147
- "grad_norm": 1.4395882738454628,
148
- "learning_rate": 0.000199625729663483,
149
- "loss": 1.3561,
150
- "step": 19
151
- },
152
- {
153
- "epoch": 0.21,
154
- "grad_norm": 0.70847060238536,
155
- "learning_rate": 0.00019954719225730847,
156
- "loss": 1.3565,
157
- "step": 20
158
- },
159
- {
160
- "epoch": 0.22,
161
- "grad_norm": 0.4331630595385925,
162
- "learning_rate": 0.00019946119873266613,
163
- "loss": 1.3374,
164
- "step": 21
165
- },
166
- {
167
- "epoch": 0.23,
168
- "grad_norm": 0.5580281682185451,
169
- "learning_rate": 0.0001993677555305002,
170
- "loss": 1.313,
171
- "step": 22
172
- },
173
- {
174
- "epoch": 0.25,
175
- "grad_norm": 0.5217443953771937,
176
- "learning_rate": 0.00019926686964973813,
177
- "loss": 1.2541,
178
- "step": 23
179
- },
180
- {
181
- "epoch": 0.26,
182
- "grad_norm": 0.36823120314463453,
183
- "learning_rate": 0.00019915854864676664,
184
- "loss": 1.2191,
185
- "step": 24
186
- },
187
- {
188
- "epoch": 0.26,
189
- "eval_loss": 1.2946609258651733,
190
- "eval_runtime": 118.9039,
191
- "eval_samples_per_second": 8.402,
192
- "eval_steps_per_second": 0.53,
193
- "step": 24
194
- },
195
- {
196
- "epoch": 0.27,
197
- "grad_norm": 0.5797477063688413,
198
- "learning_rate": 0.0001990428006348656,
199
- "loss": 1.24,
200
- "step": 25
201
- },
202
- {
203
- "epoch": 0.28,
204
- "grad_norm": 0.41369538857234545,
205
- "learning_rate": 0.00019891963428360043,
206
- "loss": 1.209,
207
- "step": 26
208
- },
209
- {
210
- "epoch": 0.29,
211
- "grad_norm": 0.36666008426797836,
212
- "learning_rate": 0.00019878905881817252,
213
- "loss": 1.2543,
214
- "step": 27
215
- },
216
- {
217
- "epoch": 0.3,
218
- "grad_norm": 0.3976779691989045,
219
- "learning_rate": 0.00019865108401872857,
220
- "loss": 1.2431,
221
- "step": 28
222
- },
223
- {
224
- "epoch": 0.31,
225
- "grad_norm": 0.4992861718630414,
226
- "learning_rate": 0.00019850572021962788,
227
- "loss": 1.2471,
228
- "step": 29
229
- },
230
- {
231
- "epoch": 0.32,
232
- "grad_norm": 0.33729072192890136,
233
- "learning_rate": 0.00019835297830866826,
234
- "loss": 1.1933,
235
- "step": 30
236
- },
237
- {
238
- "epoch": 0.33,
239
- "grad_norm": 0.29373457949318904,
240
- "learning_rate": 0.00019819286972627066,
241
- "loss": 1.1761,
242
- "step": 31
243
- },
244
- {
245
- "epoch": 0.34,
246
- "grad_norm": 0.5339184947140588,
247
- "learning_rate": 0.0001980254064646223,
248
- "loss": 1.165,
249
- "step": 32
250
- },
251
- {
252
- "epoch": 0.35,
253
- "grad_norm": 0.38755069216510263,
254
- "learning_rate": 0.00019785060106677818,
255
- "loss": 1.1236,
256
- "step": 33
257
- },
258
- {
259
- "epoch": 0.36,
260
- "grad_norm": 0.338373181403367,
261
- "learning_rate": 0.00019766846662572191,
262
- "loss": 1.2102,
263
- "step": 34
264
- },
265
- {
266
- "epoch": 0.37,
267
- "grad_norm": 0.39237714718744304,
268
- "learning_rate": 0.00019747901678338496,
269
- "loss": 1.1642,
270
- "step": 35
271
- },
272
- {
273
- "epoch": 0.38,
274
- "grad_norm": 0.3614249847081747,
275
- "learning_rate": 0.00019728226572962473,
276
- "loss": 1.1387,
277
- "step": 36
278
- },
279
- {
280
- "epoch": 0.4,
281
- "grad_norm": 0.28278007479509987,
282
- "learning_rate": 0.00019707822820116193,
283
- "loss": 1.0939,
284
- "step": 37
285
- },
286
- {
287
- "epoch": 0.41,
288
- "grad_norm": 0.3008254873268798,
289
- "learning_rate": 0.00019686691948047664,
290
- "loss": 1.1346,
291
- "step": 38
292
- },
293
- {
294
- "epoch": 0.42,
295
- "grad_norm": 0.4263010439416343,
296
- "learning_rate": 0.0001966483553946637,
297
- "loss": 1.1015,
298
- "step": 39
299
- },
300
- {
301
- "epoch": 0.43,
302
- "grad_norm": 0.32725448028464205,
303
- "learning_rate": 0.00019642255231424729,
304
- "loss": 1.1324,
305
- "step": 40
306
- },
307
- {
308
- "epoch": 0.44,
309
- "grad_norm": 0.3028242900588441,
310
- "learning_rate": 0.00019618952715195475,
311
- "loss": 1.1147,
312
- "step": 41
313
- },
314
- {
315
- "epoch": 0.45,
316
- "grad_norm": 0.33893311928252234,
317
- "learning_rate": 0.00019594929736144976,
318
- "loss": 1.0978,
319
- "step": 42
320
- },
321
- {
322
- "epoch": 0.46,
323
- "grad_norm": 0.2786082334492372,
324
- "learning_rate": 0.0001957018809360251,
325
- "loss": 1.0933,
326
- "step": 43
327
- },
328
- {
329
- "epoch": 0.47,
330
- "grad_norm": 0.2732185168098956,
331
- "learning_rate": 0.00019544729640725498,
332
- "loss": 1.084,
333
- "step": 44
334
- },
335
- {
336
- "epoch": 0.48,
337
- "grad_norm": 0.33386436894143035,
338
- "learning_rate": 0.00019518556284360696,
339
- "loss": 1.0673,
340
- "step": 45
341
- },
342
- {
343
- "epoch": 0.49,
344
- "grad_norm": 0.2761688734050621,
345
- "learning_rate": 0.00019491669984901379,
346
- "loss": 1.0523,
347
- "step": 46
348
- },
349
- {
350
- "epoch": 0.5,
351
- "grad_norm": 0.3346957388610895,
352
- "learning_rate": 0.00019464072756140486,
353
- "loss": 1.0913,
354
- "step": 47
355
- },
356
- {
357
- "epoch": 0.51,
358
- "grad_norm": 0.30196058996924285,
359
- "learning_rate": 0.0001943576666511982,
360
- "loss": 1.1165,
361
- "step": 48
362
- },
363
- {
364
- "epoch": 0.51,
365
- "eval_loss": 1.167867660522461,
366
- "eval_runtime": 119.1485,
367
- "eval_samples_per_second": 8.384,
368
- "eval_steps_per_second": 0.529,
369
- "step": 48
370
- },
371
- {
372
- "epoch": 0.52,
373
- "grad_norm": 0.27445390350987153,
374
- "learning_rate": 0.00019406753831975203,
375
- "loss": 1.1069,
376
- "step": 49
377
- },
378
- {
379
- "epoch": 0.53,
380
- "grad_norm": 0.34729097228771255,
381
- "learning_rate": 0.00019377036429777672,
382
- "loss": 1.0567,
383
- "step": 50
384
- },
385
- {
386
- "epoch": 0.54,
387
- "grad_norm": 0.31314016575739406,
388
- "learning_rate": 0.0001934661668437073,
389
- "loss": 1.0875,
390
- "step": 51
391
- },
392
- {
393
- "epoch": 0.56,
394
- "grad_norm": 0.29140014335226905,
395
- "learning_rate": 0.0001931549687420364,
396
- "loss": 1.0929,
397
- "step": 52
398
- },
399
- {
400
- "epoch": 0.57,
401
- "grad_norm": 0.2638104110161505,
402
- "learning_rate": 0.00019283679330160726,
403
- "loss": 1.0963,
404
- "step": 53
405
- },
406
- {
407
- "epoch": 0.58,
408
- "grad_norm": 0.2833945318119855,
409
- "learning_rate": 0.0001925116643538684,
410
- "loss": 1.0535,
411
- "step": 54
412
- },
413
- {
414
- "epoch": 0.59,
415
- "grad_norm": 0.28672689795285417,
416
- "learning_rate": 0.0001921796062510882,
417
- "loss": 1.0699,
418
- "step": 55
419
- },
420
- {
421
- "epoch": 0.6,
422
- "grad_norm": 0.261255409262294,
423
- "learning_rate": 0.00019184064386453128,
424
- "loss": 1.0658,
425
- "step": 56
426
- },
427
- {
428
- "epoch": 0.61,
429
- "grad_norm": 0.24304864434604007,
430
- "learning_rate": 0.00019149480258259533,
431
- "loss": 1.0441,
432
- "step": 57
433
- },
434
- {
435
- "epoch": 0.62,
436
- "grad_norm": 0.2987107937915846,
437
- "learning_rate": 0.00019114210830890969,
438
- "loss": 1.0061,
439
- "step": 58
440
- },
441
- {
442
- "epoch": 0.63,
443
- "grad_norm": 0.2617045441373282,
444
- "learning_rate": 0.00019078258746039507,
445
- "loss": 1.0578,
446
- "step": 59
447
- },
448
- {
449
- "epoch": 0.64,
450
- "grad_norm": 0.2577955355987167,
451
- "learning_rate": 0.00019041626696528503,
452
- "loss": 1.0333,
453
- "step": 60
454
- },
455
- {
456
- "epoch": 0.65,
457
- "grad_norm": 0.2823058812174375,
458
- "learning_rate": 0.0001900431742611089,
459
- "loss": 1.0837,
460
- "step": 61
461
- },
462
- {
463
- "epoch": 0.66,
464
- "grad_norm": 0.30425238718712166,
465
- "learning_rate": 0.00018966333729263674,
466
- "loss": 1.0619,
467
- "step": 62
468
- },
469
- {
470
- "epoch": 0.67,
471
- "grad_norm": 0.29826831116146957,
472
- "learning_rate": 0.0001892767845097864,
473
- "loss": 1.056,
474
- "step": 63
475
- },
476
- {
477
- "epoch": 0.68,
478
- "grad_norm": 0.22990267950533677,
479
- "learning_rate": 0.00018888354486549237,
480
- "loss": 1.061,
481
- "step": 64
482
- },
483
- {
484
- "epoch": 0.69,
485
- "grad_norm": 0.27604852373975236,
486
- "learning_rate": 0.00018848364781353744,
487
- "loss": 1.0624,
488
- "step": 65
489
- },
490
- {
491
- "epoch": 0.7,
492
- "grad_norm": 0.302101014156969,
493
- "learning_rate": 0.00018807712330634642,
494
- "loss": 1.0965,
495
- "step": 66
496
- },
497
- {
498
- "epoch": 0.72,
499
- "grad_norm": 0.2532153192142023,
500
- "learning_rate": 0.00018766400179274286,
501
- "loss": 1.0972,
502
- "step": 67
503
- },
504
- {
505
- "epoch": 0.73,
506
- "grad_norm": 0.23803088057755897,
507
- "learning_rate": 0.00018724431421566823,
508
- "loss": 1.0823,
509
- "step": 68
510
- },
511
- {
512
- "epoch": 0.74,
513
- "grad_norm": 0.2200041903156331,
514
- "learning_rate": 0.0001868180920098644,
515
- "loss": 1.037,
516
- "step": 69
517
- },
518
- {
519
- "epoch": 0.75,
520
- "grad_norm": 0.31123761066229655,
521
- "learning_rate": 0.00018638536709951917,
522
- "loss": 1.0689,
523
- "step": 70
524
- },
525
- {
526
- "epoch": 0.76,
527
- "grad_norm": 0.2760757149384919,
528
- "learning_rate": 0.00018594617189587512,
529
- "loss": 1.0071,
530
- "step": 71
531
- },
532
- {
533
- "epoch": 0.77,
534
- "grad_norm": 0.2452672521810973,
535
- "learning_rate": 0.00018550053929480202,
536
- "loss": 1.0711,
537
- "step": 72
538
- },
539
- {
540
- "epoch": 0.77,
541
- "eval_loss": 1.1377497911453247,
542
- "eval_runtime": 119.461,
543
- "eval_samples_per_second": 8.363,
544
- "eval_steps_per_second": 0.527,
545
- "step": 72
546
- },
547
- {
548
- "epoch": 0.78,
549
- "grad_norm": 0.30897216290479246,
550
- "learning_rate": 0.0001850485026743328,
551
- "loss": 1.0508,
552
- "step": 73
553
- },
554
- {
555
- "epoch": 0.79,
556
- "grad_norm": 0.24165903393157925,
557
- "learning_rate": 0.00018459009589216364,
558
- "loss": 1.046,
559
- "step": 74
560
- },
561
- {
562
- "epoch": 0.8,
563
- "grad_norm": 0.2509819208307879,
564
- "learning_rate": 0.00018412535328311814,
565
- "loss": 1.0726,
566
- "step": 75
567
- },
568
- {
569
- "epoch": 0.81,
570
- "grad_norm": 0.26145395006758515,
571
- "learning_rate": 0.00018365430965657526,
572
- "loss": 0.9998,
573
- "step": 76
574
- },
575
- {
576
- "epoch": 0.82,
577
- "grad_norm": 0.26920709605794424,
578
- "learning_rate": 0.00018317700029386245,
579
- "loss": 1.065,
580
- "step": 77
581
- },
582
- {
583
- "epoch": 0.83,
584
- "grad_norm": 0.24226754926786417,
585
- "learning_rate": 0.0001826934609456129,
586
- "loss": 1.0489,
587
- "step": 78
588
- },
589
- {
590
- "epoch": 0.84,
591
- "grad_norm": 0.3022365661006827,
592
- "learning_rate": 0.00018220372782908777,
593
- "loss": 1.0372,
594
- "step": 79
595
- },
596
- {
597
- "epoch": 0.85,
598
- "grad_norm": 0.25795710005352673,
599
- "learning_rate": 0.00018170783762546365,
600
- "loss": 1.0128,
601
- "step": 80
602
- },
603
- {
604
- "epoch": 0.86,
605
- "grad_norm": 0.3490748875058354,
606
- "learning_rate": 0.00018120582747708502,
607
- "loss": 1.0168,
608
- "step": 81
609
- },
610
- {
611
- "epoch": 0.88,
612
- "grad_norm": 0.24938209735120945,
613
- "learning_rate": 0.00018069773498468223,
614
- "loss": 0.9586,
615
- "step": 82
616
- },
617
- {
618
- "epoch": 0.89,
619
- "grad_norm": 0.2527612545099894,
620
- "learning_rate": 0.00018018359820455536,
621
- "loss": 1.0385,
622
- "step": 83
623
- },
624
- {
625
- "epoch": 0.9,
626
- "grad_norm": 0.27528879975094916,
627
- "learning_rate": 0.0001796634556457236,
628
- "loss": 1.0328,
629
- "step": 84
630
- },
631
- {
632
- "epoch": 0.91,
633
- "grad_norm": 0.2605002777661913,
634
- "learning_rate": 0.0001791373462670411,
635
- "loss": 0.9966,
636
- "step": 85
637
- },
638
- {
639
- "epoch": 0.92,
640
- "grad_norm": 0.3117107796665858,
641
- "learning_rate": 0.00017860530947427875,
642
- "loss": 0.9772,
643
- "step": 86
644
- },
645
- {
646
- "epoch": 0.93,
647
- "grad_norm": 0.28336227154677734,
648
- "learning_rate": 0.0001780673851171728,
649
- "loss": 1.0724,
650
- "step": 87
651
- },
652
- {
653
- "epoch": 0.94,
654
- "grad_norm": 0.42707817919652674,
655
- "learning_rate": 0.0001775236134864401,
656
- "loss": 1.0038,
657
- "step": 88
658
- },
659
- {
660
- "epoch": 0.95,
661
- "grad_norm": 0.29236016959846456,
662
- "learning_rate": 0.0001769740353107602,
663
- "loss": 1.0083,
664
- "step": 89
665
- },
666
- {
667
- "epoch": 0.96,
668
- "grad_norm": 0.43295063403530637,
669
- "learning_rate": 0.00017641869175372493,
670
- "loss": 1.022,
671
- "step": 90
672
- },
673
- {
674
- "epoch": 0.97,
675
- "grad_norm": 0.3086663897043129,
676
- "learning_rate": 0.00017585762441075503,
677
- "loss": 1.0303,
678
- "step": 91
679
- },
680
- {
681
- "epoch": 0.98,
682
- "grad_norm": 0.2783768981163154,
683
- "learning_rate": 0.0001752908753059849,
684
- "loss": 1.061,
685
- "step": 92
686
- },
687
- {
688
- "epoch": 0.99,
689
- "grad_norm": 0.43168501819843275,
690
- "learning_rate": 0.00017471848688911464,
691
- "loss": 1.0631,
692
- "step": 93
693
- },
694
- {
695
- "epoch": 1.0,
696
- "grad_norm": 0.25487494913299935,
697
- "learning_rate": 0.0001741405020322309,
698
- "loss": 0.9858,
699
- "step": 94
700
- },
701
- {
702
- "epoch": 1.01,
703
- "grad_norm": 0.3229761094582219,
704
- "learning_rate": 0.00017355696402659548,
705
- "loss": 0.9495,
706
- "step": 95
707
- },
708
- {
709
- "epoch": 1.02,
710
- "grad_norm": 0.3178464701266748,
711
- "learning_rate": 0.000172967916579403,
712
- "loss": 0.9546,
713
- "step": 96
714
- },
715
- {
716
- "epoch": 1.02,
717
- "eval_loss": 1.1303094625473022,
718
- "eval_runtime": 119.6761,
719
- "eval_samples_per_second": 8.348,
720
- "eval_steps_per_second": 0.526,
721
- "step": 96
722
- },
723
- {
724
- "epoch": 1.04,
725
- "grad_norm": 0.2534616980189548,
726
- "learning_rate": 0.00017237340381050703,
727
- "loss": 0.9509,
728
- "step": 97
729
- },
730
- {
731
- "epoch": 1.05,
732
- "grad_norm": 0.2354382873554396,
733
- "learning_rate": 0.00017177347024911562,
734
- "loss": 0.9611,
735
- "step": 98
736
- },
737
- {
738
- "epoch": 1.06,
739
- "grad_norm": 0.2754259154521738,
740
- "learning_rate": 0.00017116816083045602,
741
- "loss": 0.9184,
742
- "step": 99
743
- },
744
- {
745
- "epoch": 1.07,
746
- "grad_norm": 0.25868181129480755,
747
- "learning_rate": 0.00017055752089240907,
748
- "loss": 0.957,
749
- "step": 100
750
- },
751
- {
752
- "epoch": 1.08,
753
- "grad_norm": 0.2383943586330267,
754
- "learning_rate": 0.00016994159617211317,
755
- "loss": 0.9638,
756
- "step": 101
757
- },
758
- {
759
- "epoch": 1.09,
760
- "grad_norm": 0.2706420372628291,
761
- "learning_rate": 0.0001693204328025389,
762
- "loss": 0.9115,
763
- "step": 102
764
- },
765
- {
766
- "epoch": 1.1,
767
- "grad_norm": 0.2751042656041904,
768
- "learning_rate": 0.0001686940773090333,
769
- "loss": 0.9277,
770
- "step": 103
771
- },
772
- {
773
- "epoch": 1.11,
774
- "grad_norm": 0.27700872737428867,
775
- "learning_rate": 0.00016806257660583534,
776
- "loss": 0.9248,
777
- "step": 104
778
- },
779
- {
780
- "epoch": 1.12,
781
- "grad_norm": 0.3350046312844708,
782
- "learning_rate": 0.00016742597799256182,
783
- "loss": 0.928,
784
- "step": 105
785
- },
786
- {
787
- "epoch": 1.13,
788
- "grad_norm": 0.4055944986440079,
789
- "learning_rate": 0.00016678432915066488,
790
- "loss": 0.9074,
791
- "step": 106
792
- },
793
- {
794
- "epoch": 1.14,
795
- "grad_norm": 0.2515177402600531,
796
- "learning_rate": 0.00016613767813986044,
797
- "loss": 0.9564,
798
- "step": 107
799
- },
800
- {
801
- "epoch": 1.15,
802
- "grad_norm": 0.2571149695502646,
803
- "learning_rate": 0.00016548607339452853,
804
- "loss": 0.93,
805
- "step": 108
806
- },
807
- {
808
- "epoch": 1.16,
809
- "grad_norm": 0.38608942941048996,
810
- "learning_rate": 0.0001648295637200856,
811
- "loss": 0.9281,
812
- "step": 109
813
- },
814
- {
815
- "epoch": 1.17,
816
- "grad_norm": 0.31939838976976676,
817
- "learning_rate": 0.000164168198289329,
818
- "loss": 0.9914,
819
- "step": 110
820
- },
821
- {
822
- "epoch": 1.19,
823
- "grad_norm": 0.30504937567650897,
824
- "learning_rate": 0.00016350202663875386,
825
- "loss": 0.9549,
826
- "step": 111
827
- },
828
- {
829
- "epoch": 1.2,
830
- "grad_norm": 0.3320388344291162,
831
- "learning_rate": 0.0001628310986648427,
832
- "loss": 0.9086,
833
- "step": 112
834
- },
835
- {
836
- "epoch": 1.21,
837
- "grad_norm": 0.27715569151296165,
838
- "learning_rate": 0.0001621554646203284,
839
- "loss": 0.8537,
840
- "step": 113
841
- },
842
- {
843
- "epoch": 1.22,
844
- "grad_norm": 0.278787508566418,
845
- "learning_rate": 0.0001614751751104301,
846
- "loss": 0.9354,
847
- "step": 114
848
- },
849
- {
850
- "epoch": 1.23,
851
- "grad_norm": 0.24483614460003267,
852
- "learning_rate": 0.00016079028108906282,
853
- "loss": 0.8996,
854
- "step": 115
855
- },
856
- {
857
- "epoch": 1.24,
858
- "grad_norm": 0.37520609596400134,
859
- "learning_rate": 0.0001601008338550211,
860
- "loss": 0.9514,
861
- "step": 116
862
- },
863
- {
864
- "epoch": 1.25,
865
- "grad_norm": 0.2565631505653599,
866
- "learning_rate": 0.00015940688504813662,
867
- "loss": 0.8984,
868
- "step": 117
869
- },
870
- {
871
- "epoch": 1.26,
872
- "grad_norm": 0.26348552476529935,
873
- "learning_rate": 0.00015870848664541044,
874
- "loss": 0.8941,
875
- "step": 118
876
- },
877
- {
878
- "epoch": 1.27,
879
- "grad_norm": 0.32431198985496534,
880
- "learning_rate": 0.00015800569095711982,
881
- "loss": 0.8876,
882
- "step": 119
883
- },
884
- {
885
- "epoch": 1.28,
886
- "grad_norm": 0.29308039763069227,
887
- "learning_rate": 0.00015729855062290022,
888
- "loss": 0.9309,
889
- "step": 120
890
- },
891
- {
892
- "epoch": 1.28,
893
- "eval_loss": 1.129751205444336,
894
- "eval_runtime": 119.1497,
895
- "eval_samples_per_second": 8.384,
896
- "eval_steps_per_second": 0.529,
897
- "step": 120
898
- },
899
- {
900
- "epoch": 1.29,
901
- "grad_norm": 0.2793291380060977,
902
- "learning_rate": 0.0001565871186078025,
903
- "loss": 0.9453,
904
- "step": 121
905
- },
906
- {
907
- "epoch": 1.3,
908
- "grad_norm": 0.28873644301555734,
909
- "learning_rate": 0.000155871448198326,
910
- "loss": 0.9243,
911
- "step": 122
912
- },
913
- {
914
- "epoch": 1.31,
915
- "grad_norm": 0.3086103724578039,
916
- "learning_rate": 0.00015515159299842707,
917
- "loss": 0.8877,
918
- "step": 123
919
- },
920
- {
921
- "epoch": 1.32,
922
- "grad_norm": 0.30407892484693505,
923
- "learning_rate": 0.00015442760692550443,
924
- "loss": 0.9448,
925
- "step": 124
926
- },
927
- {
928
- "epoch": 1.33,
929
- "grad_norm": 0.29771602861368474,
930
- "learning_rate": 0.00015369954420636048,
931
- "loss": 0.889,
932
- "step": 125
933
- },
934
- {
935
- "epoch": 1.35,
936
- "grad_norm": 0.30480490158838136,
937
- "learning_rate": 0.00015296745937313987,
938
- "loss": 0.9405,
939
- "step": 126
940
- },
941
- {
942
- "epoch": 1.36,
943
- "grad_norm": 0.2949192855418127,
944
- "learning_rate": 0.00015223140725924495,
945
- "loss": 0.9382,
946
- "step": 127
947
- },
948
- {
949
- "epoch": 1.37,
950
- "grad_norm": 0.2813631863132807,
951
- "learning_rate": 0.00015149144299522873,
952
- "loss": 0.9526,
953
- "step": 128
954
- },
955
- {
956
- "epoch": 1.38,
957
- "grad_norm": 0.28548924064070513,
958
- "learning_rate": 0.00015074762200466556,
959
- "loss": 0.9174,
960
- "step": 129
961
- },
962
- {
963
- "epoch": 1.39,
964
- "grad_norm": 0.28137053449960464,
965
- "learning_rate": 0.00015000000000000001,
966
- "loss": 0.9244,
967
- "step": 130
968
- },
969
- {
970
- "epoch": 1.4,
971
- "grad_norm": 0.2626750895717777,
972
- "learning_rate": 0.00014924863297837378,
973
- "loss": 0.9335,
974
- "step": 131
975
- },
976
- {
977
- "epoch": 1.41,
978
- "grad_norm": 0.26686502371015536,
979
- "learning_rate": 0.00014849357721743168,
980
- "loss": 0.8948,
981
- "step": 132
982
- },
983
- {
984
- "epoch": 1.42,
985
- "grad_norm": 0.3332273481179679,
986
- "learning_rate": 0.00014773488927110633,
987
- "loss": 0.9274,
988
- "step": 133
989
- },
990
- {
991
- "epoch": 1.43,
992
- "grad_norm": 0.2528048763375234,
993
- "learning_rate": 0.00014697262596538227,
994
- "loss": 0.8731,
995
- "step": 134
996
- },
997
- {
998
- "epoch": 1.44,
999
- "grad_norm": 0.27184211707488076,
1000
- "learning_rate": 0.00014620684439403962,
1001
- "loss": 0.9318,
1002
- "step": 135
1003
- },
1004
- {
1005
- "epoch": 1.45,
1006
- "grad_norm": 0.3051111137538683,
1007
- "learning_rate": 0.0001454376019143779,
1008
- "loss": 0.9447,
1009
- "step": 136
1010
- },
1011
- {
1012
- "epoch": 1.46,
1013
- "grad_norm": 0.28771401659835155,
1014
- "learning_rate": 0.00014466495614291977,
1015
- "loss": 0.9343,
1016
- "step": 137
1017
- },
1018
- {
1019
- "epoch": 1.47,
1020
- "grad_norm": 0.28995797921621524,
1021
- "learning_rate": 0.0001438889649510956,
1022
- "loss": 0.8978,
1023
- "step": 138
1024
- },
1025
- {
1026
- "epoch": 1.48,
1027
- "grad_norm": 0.2749930548874636,
1028
- "learning_rate": 0.00014310968646090883,
1029
- "loss": 0.924,
1030
- "step": 139
1031
- },
1032
- {
1033
- "epoch": 1.49,
1034
- "grad_norm": 0.3097189537380989,
1035
- "learning_rate": 0.0001423271790405828,
1036
- "loss": 0.9574,
1037
- "step": 140
1038
- },
1039
- {
1040
- "epoch": 1.51,
1041
- "grad_norm": 0.2449218990319832,
1042
- "learning_rate": 0.00014154150130018866,
1043
- "loss": 0.8475,
1044
- "step": 141
1045
- },
1046
- {
1047
- "epoch": 1.52,
1048
- "grad_norm": 0.24856388098419674,
1049
- "learning_rate": 0.0001407527120872557,
1050
- "loss": 0.9381,
1051
- "step": 142
1052
- },
1053
- {
1054
- "epoch": 1.53,
1055
- "grad_norm": 0.3169861882853132,
1056
- "learning_rate": 0.00013996087048236358,
1057
- "loss": 0.9141,
1058
- "step": 143
1059
- },
1060
- {
1061
- "epoch": 1.54,
1062
- "grad_norm": 0.30689184261103974,
1063
- "learning_rate": 0.00013916603579471705,
1064
- "loss": 0.9588,
1065
- "step": 144
1066
- },
1067
- {
1068
- "epoch": 1.54,
1069
- "eval_loss": 1.1242448091506958,
1070
- "eval_runtime": 119.0725,
1071
- "eval_samples_per_second": 8.39,
1072
- "eval_steps_per_second": 0.529,
1073
- "step": 144
1074
- },
1075
- {
1076
- "epoch": 1.55,
1077
- "grad_norm": 0.2961514212977567,
1078
- "learning_rate": 0.00013836826755770384,
1079
- "loss": 0.9371,
1080
- "step": 145
1081
- },
1082
- {
1083
- "epoch": 1.56,
1084
- "grad_norm": 0.30790856503439346,
1085
- "learning_rate": 0.00013756762552443553,
1086
- "loss": 0.9612,
1087
- "step": 146
1088
- },
1089
- {
1090
- "epoch": 1.57,
1091
- "grad_norm": 0.3517398492864053,
1092
- "learning_rate": 0.000136764169663272,
1093
- "loss": 0.9253,
1094
- "step": 147
1095
- },
1096
- {
1097
- "epoch": 1.58,
1098
- "grad_norm": 0.26375798832515857,
1099
- "learning_rate": 0.00013595796015332984,
1100
- "loss": 0.8977,
1101
- "step": 148
1102
- },
1103
- {
1104
- "epoch": 1.59,
1105
- "grad_norm": 0.274348892672977,
1106
- "learning_rate": 0.00013514905737997473,
1107
- "loss": 0.8817,
1108
- "step": 149
1109
- },
1110
- {
1111
- "epoch": 1.6,
1112
- "grad_norm": 0.35917564750751624,
1113
- "learning_rate": 0.00013433752193029886,
1114
- "loss": 0.886,
1115
- "step": 150
1116
- },
1117
- {
1118
- "epoch": 1.61,
1119
- "grad_norm": 0.38175124377914293,
1120
- "learning_rate": 0.00013352341458858265,
1121
- "loss": 0.8576,
1122
- "step": 151
1123
- },
1124
- {
1125
- "epoch": 1.62,
1126
- "grad_norm": 0.249633953215678,
1127
- "learning_rate": 0.00013270679633174218,
1128
- "loss": 1.0066,
1129
- "step": 152
1130
- },
1131
- {
1132
- "epoch": 1.63,
1133
- "grad_norm": 0.33494494430574784,
1134
- "learning_rate": 0.00013188772832476188,
1135
- "loss": 0.884,
1136
- "step": 153
1137
- },
1138
- {
1139
- "epoch": 1.64,
1140
- "grad_norm": 0.4176467296744032,
1141
- "learning_rate": 0.00013106627191611332,
1142
- "loss": 0.9041,
1143
- "step": 154
1144
- },
1145
- {
1146
- "epoch": 1.65,
1147
- "grad_norm": 0.27051479454532207,
1148
- "learning_rate": 0.00013024248863316012,
1149
- "loss": 0.8764,
1150
- "step": 155
1151
- },
1152
- {
1153
- "epoch": 1.67,
1154
- "grad_norm": 0.29302599029848847,
1155
- "learning_rate": 0.00012941644017754964,
1156
- "loss": 0.9786,
1157
- "step": 156
1158
- },
1159
- {
1160
- "epoch": 1.68,
1161
- "grad_norm": 0.3127378512248151,
1162
- "learning_rate": 0.00012858818842059145,
1163
- "loss": 0.9176,
1164
- "step": 157
1165
- },
1166
- {
1167
- "epoch": 1.69,
1168
- "grad_norm": 0.40647077063662906,
1169
- "learning_rate": 0.00012775779539862304,
1170
- "loss": 0.9387,
1171
- "step": 158
1172
- },
1173
- {
1174
- "epoch": 1.7,
1175
- "grad_norm": 0.29290601694481777,
1176
- "learning_rate": 0.00012692532330836346,
1177
- "loss": 0.9192,
1178
- "step": 159
1179
- },
1180
- {
1181
- "epoch": 1.71,
1182
- "grad_norm": 0.2819168741245354,
1183
- "learning_rate": 0.0001260908345022547,
1184
- "loss": 0.9253,
1185
- "step": 160
1186
- },
1187
- {
1188
- "epoch": 1.72,
1189
- "grad_norm": 0.3772714091394927,
1190
- "learning_rate": 0.00012525439148379128,
1191
- "loss": 0.9264,
1192
- "step": 161
1193
- },
1194
- {
1195
- "epoch": 1.73,
1196
- "grad_norm": 0.29399851067321503,
1197
- "learning_rate": 0.00012441605690283915,
1198
- "loss": 0.9357,
1199
- "step": 162
1200
- },
1201
- {
1202
- "epoch": 1.74,
1203
- "grad_norm": 0.2623180246832513,
1204
- "learning_rate": 0.00012357589355094275,
1205
- "loss": 0.8516,
1206
- "step": 163
1207
- },
1208
- {
1209
- "epoch": 1.75,
1210
- "grad_norm": 0.27796942024085824,
1211
- "learning_rate": 0.00012273396435662212,
1212
- "loss": 0.9328,
1213
- "step": 164
1214
- },
1215
- {
1216
- "epoch": 1.76,
1217
- "grad_norm": 0.3107670297529076,
1218
- "learning_rate": 0.0001218903323806595,
1219
- "loss": 0.8769,
1220
- "step": 165
1221
- },
1222
- {
1223
- "epoch": 1.77,
1224
- "grad_norm": 0.2865573350738354,
1225
- "learning_rate": 0.00012104506081137608,
1226
- "loss": 0.9015,
1227
- "step": 166
1228
- },
1229
- {
1230
- "epoch": 1.78,
1231
- "grad_norm": 0.30595087117636693,
1232
- "learning_rate": 0.00012019821295989912,
1233
- "loss": 0.94,
1234
- "step": 167
1235
- },
1236
- {
1237
- "epoch": 1.79,
1238
- "grad_norm": 0.32540365653257874,
1239
- "learning_rate": 0.00011934985225541998,
1240
- "loss": 0.8553,
1241
- "step": 168
1242
- },
1243
- {
1244
- "epoch": 1.79,
1245
- "eval_loss": 1.1259374618530273,
1246
- "eval_runtime": 119.4351,
1247
- "eval_samples_per_second": 8.364,
1248
- "eval_steps_per_second": 0.527,
1249
- "step": 168
1250
- },
1251
- {
1252
- "epoch": 1.8,
1253
- "grad_norm": 0.3058868303314457,
1254
- "learning_rate": 0.00011850004224044315,
1255
- "loss": 0.9074,
1256
- "step": 169
1257
- },
1258
- {
1259
- "epoch": 1.81,
1260
- "grad_norm": 0.33266760488242775,
1261
- "learning_rate": 0.0001176488465660271,
1262
- "loss": 0.8799,
1263
- "step": 170
1264
- },
1265
- {
1266
- "epoch": 1.83,
1267
- "grad_norm": 0.3101183375673487,
1268
- "learning_rate": 0.00011679632898701649,
1269
- "loss": 0.9004,
1270
- "step": 171
1271
- },
1272
- {
1273
- "epoch": 1.84,
1274
- "grad_norm": 0.31535579418195775,
1275
- "learning_rate": 0.00011594255335726724,
1276
- "loss": 0.9238,
1277
- "step": 172
1278
- },
1279
- {
1280
- "epoch": 1.85,
1281
- "grad_norm": 0.28341827112854334,
1282
- "learning_rate": 0.00011508758362486358,
1283
- "loss": 0.9138,
1284
- "step": 173
1285
- },
1286
- {
1287
- "epoch": 1.86,
1288
- "grad_norm": 0.25699888796695625,
1289
- "learning_rate": 0.00011423148382732853,
1290
- "loss": 0.9175,
1291
- "step": 174
1292
- },
1293
- {
1294
- "epoch": 1.87,
1295
- "grad_norm": 0.29504332662698246,
1296
- "learning_rate": 0.0001133743180868273,
1297
- "loss": 0.9023,
1298
- "step": 175
1299
- },
1300
- {
1301
- "epoch": 1.88,
1302
- "grad_norm": 0.2993175263873948,
1303
- "learning_rate": 0.0001125161506053646,
1304
- "loss": 0.8893,
1305
- "step": 176
1306
- },
1307
- {
1308
- "epoch": 1.89,
1309
- "grad_norm": 0.2762659379409218,
1310
- "learning_rate": 0.00011165704565997593,
1311
- "loss": 0.9071,
1312
- "step": 177
1313
- },
1314
- {
1315
- "epoch": 1.9,
1316
- "grad_norm": 0.23620994229530515,
1317
- "learning_rate": 0.00011079706759791311,
1318
- "loss": 0.8796,
1319
- "step": 178
1320
- },
1321
- {
1322
- "epoch": 1.91,
1323
- "grad_norm": 0.28317619721877,
1324
- "learning_rate": 0.00010993628083182467,
1325
- "loss": 0.8983,
1326
- "step": 179
1327
- },
1328
- {
1329
- "epoch": 1.92,
1330
- "grad_norm": 0.3252854551640304,
1331
- "learning_rate": 0.00010907474983493144,
1332
- "loss": 0.8947,
1333
- "step": 180
1334
- },
1335
- {
1336
- "epoch": 1.93,
1337
- "grad_norm": 0.2579136274422669,
1338
- "learning_rate": 0.00010821253913619726,
1339
- "loss": 0.8726,
1340
- "step": 181
1341
- },
1342
- {
1343
- "epoch": 1.94,
1344
- "grad_norm": 0.27201912720918364,
1345
- "learning_rate": 0.00010734971331549603,
1346
- "loss": 0.891,
1347
- "step": 182
1348
- },
1349
- {
1350
- "epoch": 1.95,
1351
- "grad_norm": 0.41257277193589503,
1352
- "learning_rate": 0.0001064863369987743,
1353
- "loss": 0.9188,
1354
- "step": 183
1355
- },
1356
- {
1357
- "epoch": 1.96,
1358
- "grad_norm": 0.264920112831242,
1359
- "learning_rate": 0.00010562247485321115,
1360
- "loss": 0.8761,
1361
- "step": 184
1362
- },
1363
- {
1364
- "epoch": 1.98,
1365
- "grad_norm": 0.28166441056422037,
1366
- "learning_rate": 0.00010475819158237425,
1367
- "loss": 0.8805,
1368
- "step": 185
1369
- },
1370
- {
1371
- "epoch": 1.99,
1372
- "grad_norm": 0.2818961139392159,
1373
- "learning_rate": 0.00010389355192137377,
1374
- "loss": 0.8934,
1375
- "step": 186
1376
- },
1377
- {
1378
- "epoch": 2.0,
1379
- "grad_norm": 0.27424787600345923,
1380
- "learning_rate": 0.00010302862063201367,
1381
- "loss": 0.9237,
1382
- "step": 187
1383
- },
1384
- {
1385
- "epoch": 2.01,
1386
- "grad_norm": 0.25570082666079225,
1387
- "learning_rate": 0.00010216346249794087,
1388
- "loss": 0.8656,
1389
- "step": 188
1390
- },
1391
- {
1392
- "epoch": 2.02,
1393
- "grad_norm": 0.2712359904481713,
1394
- "learning_rate": 0.0001012981423197931,
1395
- "loss": 0.7627,
1396
- "step": 189
1397
- },
1398
- {
1399
- "epoch": 2.03,
1400
- "grad_norm": 0.25054404547068676,
1401
- "learning_rate": 0.00010043272491034523,
1402
- "loss": 0.8142,
1403
- "step": 190
1404
- },
1405
- {
1406
- "epoch": 2.04,
1407
- "grad_norm": 0.28520868420260026,
1408
- "learning_rate": 9.956727508965481e-05,
1409
- "loss": 0.7953,
1410
- "step": 191
1411
- },
1412
- {
1413
- "epoch": 2.05,
1414
- "grad_norm": 0.29413880984694873,
1415
- "learning_rate": 9.870185768020693e-05,
1416
- "loss": 0.8231,
1417
- "step": 192
1418
- },
1419
- {
1420
- "epoch": 2.05,
1421
- "eval_loss": 1.144862413406372,
1422
- "eval_runtime": 119.3004,
1423
- "eval_samples_per_second": 8.374,
1424
- "eval_steps_per_second": 0.528,
1425
- "step": 192
1426
- },
1427
- {
1428
- "epoch": 2.06,
1429
- "grad_norm": 0.28378300985247035,
1430
- "learning_rate": 9.783653750205915e-05,
1431
- "loss": 0.7478,
1432
- "step": 193
1433
- },
1434
- {
1435
- "epoch": 2.07,
1436
- "grad_norm": 0.31792721348179676,
1437
- "learning_rate": 9.697137936798634e-05,
1438
- "loss": 0.7961,
1439
- "step": 194
1440
- },
1441
- {
1442
- "epoch": 2.08,
1443
- "grad_norm": 0.3291666436295964,
1444
- "learning_rate": 9.610644807862625e-05,
1445
- "loss": 0.7434,
1446
- "step": 195
1447
- },
1448
- {
1449
- "epoch": 2.09,
1450
- "grad_norm": 0.301579259001567,
1451
- "learning_rate": 9.524180841762577e-05,
1452
- "loss": 0.7779,
1453
- "step": 196
1454
- },
1455
- {
1456
- "epoch": 2.1,
1457
- "grad_norm": 0.30252161240414444,
1458
- "learning_rate": 9.437752514678887e-05,
1459
- "loss": 0.7689,
1460
- "step": 197
1461
- },
1462
- {
1463
- "epoch": 2.11,
1464
- "grad_norm": 0.3350657085129171,
1465
- "learning_rate": 9.35136630012257e-05,
1466
- "loss": 0.7574,
1467
- "step": 198
1468
- },
1469
- {
1470
- "epoch": 2.12,
1471
- "grad_norm": 0.3053109929956358,
1472
- "learning_rate": 9.265028668450402e-05,
1473
- "loss": 0.7729,
1474
- "step": 199
1475
- },
1476
- {
1477
- "epoch": 2.14,
1478
- "grad_norm": 0.30367223609567207,
1479
- "learning_rate": 9.178746086380275e-05,
1480
- "loss": 0.8111,
1481
- "step": 200
1482
- },
1483
- {
1484
- "epoch": 2.15,
1485
- "grad_norm": 0.3366440949136126,
1486
- "learning_rate": 9.092525016506858e-05,
1487
- "loss": 0.7986,
1488
- "step": 201
1489
- },
1490
- {
1491
- "epoch": 2.16,
1492
- "grad_norm": 0.3228036608413652,
1493
- "learning_rate": 9.006371916817534e-05,
1494
- "loss": 0.8382,
1495
- "step": 202
1496
- },
1497
- {
1498
- "epoch": 2.17,
1499
- "grad_norm": 0.2919040789403488,
1500
- "learning_rate": 8.920293240208694e-05,
1501
- "loss": 0.7696,
1502
- "step": 203
1503
- },
1504
- {
1505
- "epoch": 2.18,
1506
- "grad_norm": 0.30084198177583166,
1507
- "learning_rate": 8.83429543400241e-05,
1508
- "loss": 0.7671,
1509
- "step": 204
1510
- },
1511
- {
1512
- "epoch": 2.19,
1513
- "grad_norm": 0.33931609000743107,
1514
- "learning_rate": 8.748384939463543e-05,
1515
- "loss": 0.7553,
1516
- "step": 205
1517
- },
1518
- {
1519
- "epoch": 2.2,
1520
- "grad_norm": 0.30413284924824485,
1521
- "learning_rate": 8.662568191317273e-05,
1522
- "loss": 0.7324,
1523
- "step": 206
1524
- },
1525
- {
1526
- "epoch": 2.21,
1527
- "grad_norm": 0.3014038998090481,
1528
- "learning_rate": 8.57685161726715e-05,
1529
- "loss": 0.7567,
1530
- "step": 207
1531
- },
1532
- {
1533
- "epoch": 2.22,
1534
- "grad_norm": 0.3176466329519527,
1535
- "learning_rate": 8.491241637513644e-05,
1536
- "loss": 0.8222,
1537
- "step": 208
1538
- },
1539
- {
1540
- "epoch": 2.23,
1541
- "grad_norm": 0.29981213041628285,
1542
- "learning_rate": 8.405744664273278e-05,
1543
- "loss": 0.7077,
1544
- "step": 209
1545
- },
1546
- {
1547
- "epoch": 2.24,
1548
- "grad_norm": 0.2937916452228122,
1549
- "learning_rate": 8.320367101298351e-05,
1550
- "loss": 0.7231,
1551
- "step": 210
1552
- },
1553
- {
1554
- "epoch": 2.25,
1555
- "grad_norm": 0.32040684171320816,
1556
- "learning_rate": 8.235115343397295e-05,
1557
- "loss": 0.7556,
1558
- "step": 211
1559
- },
1560
- {
1561
- "epoch": 2.26,
1562
- "grad_norm": 0.31083028085316033,
1563
- "learning_rate": 8.149995775955686e-05,
1564
- "loss": 0.7514,
1565
- "step": 212
1566
- },
1567
- {
1568
- "epoch": 2.27,
1569
- "grad_norm": 0.3215465383581194,
1570
- "learning_rate": 8.065014774458003e-05,
1571
- "loss": 0.7933,
1572
- "step": 213
1573
- },
1574
- {
1575
- "epoch": 2.28,
1576
- "grad_norm": 0.3081200259196015,
1577
- "learning_rate": 7.980178704010089e-05,
1578
- "loss": 0.8062,
1579
- "step": 214
1580
- },
1581
- {
1582
- "epoch": 2.3,
1583
- "grad_norm": 0.3333248296288759,
1584
- "learning_rate": 7.895493918862396e-05,
1585
- "loss": 0.7784,
1586
- "step": 215
1587
- },
1588
- {
1589
- "epoch": 2.31,
1590
- "grad_norm": 0.3301326097292383,
1591
- "learning_rate": 7.810966761934053e-05,
1592
- "loss": 0.8154,
1593
- "step": 216
1594
- },
1595
- {
1596
- "epoch": 2.31,
1597
- "eval_loss": 1.1513652801513672,
1598
- "eval_runtime": 119.4371,
1599
- "eval_samples_per_second": 8.364,
1600
- "eval_steps_per_second": 0.527,
1601
- "step": 216
1602
- },
1603
- {
1604
- "epoch": 2.32,
1605
- "grad_norm": 0.3166760836422428,
1606
- "learning_rate": 7.726603564337791e-05,
1607
- "loss": 0.7486,
1608
- "step": 217
1609
- },
1610
- {
1611
- "epoch": 2.33,
1612
- "grad_norm": 0.31309757318131876,
1613
- "learning_rate": 7.642410644905726e-05,
1614
- "loss": 0.771,
1615
- "step": 218
1616
- },
1617
- {
1618
- "epoch": 2.34,
1619
- "grad_norm": 0.36968796131043985,
1620
- "learning_rate": 7.558394309716088e-05,
1621
- "loss": 0.8051,
1622
- "step": 219
1623
- },
1624
- {
1625
- "epoch": 2.35,
1626
- "grad_norm": 0.27537675917328025,
1627
- "learning_rate": 7.474560851620873e-05,
1628
- "loss": 0.7536,
1629
- "step": 220
1630
- },
1631
- {
1632
- "epoch": 2.36,
1633
- "grad_norm": 0.2878011945022053,
1634
- "learning_rate": 7.390916549774536e-05,
1635
- "loss": 0.8126,
1636
- "step": 221
1637
- },
1638
- {
1639
- "epoch": 2.37,
1640
- "grad_norm": 0.3172405217395398,
1641
- "learning_rate": 7.307467669163655e-05,
1642
- "loss": 0.8156,
1643
- "step": 222
1644
- },
1645
- {
1646
- "epoch": 2.38,
1647
- "grad_norm": 0.3183651086957915,
1648
- "learning_rate": 7.224220460137701e-05,
1649
- "loss": 0.7821,
1650
- "step": 223
1651
- },
1652
- {
1653
- "epoch": 2.39,
1654
- "grad_norm": 0.3318078467573977,
1655
- "learning_rate": 7.141181157940859e-05,
1656
- "loss": 0.7993,
1657
- "step": 224
1658
- },
1659
- {
1660
- "epoch": 2.4,
1661
- "grad_norm": 0.28446170407344085,
1662
- "learning_rate": 7.058355982245037e-05,
1663
- "loss": 0.7987,
1664
- "step": 225
1665
- },
1666
- {
1667
- "epoch": 2.41,
1668
- "grad_norm": 0.33568352702219995,
1669
- "learning_rate": 6.97575113668399e-05,
1670
- "loss": 0.773,
1671
- "step": 226
1672
- },
1673
- {
1674
- "epoch": 2.42,
1675
- "grad_norm": 0.30820575901544944,
1676
- "learning_rate": 6.893372808388675e-05,
1677
- "loss": 0.813,
1678
- "step": 227
1679
- },
1680
- {
1681
- "epoch": 2.43,
1682
- "grad_norm": 0.3121364386024255,
1683
- "learning_rate": 6.811227167523815e-05,
1684
- "loss": 0.7716,
1685
- "step": 228
1686
- },
1687
- {
1688
- "epoch": 2.44,
1689
- "grad_norm": 0.3211455560922844,
1690
- "learning_rate": 6.729320366825784e-05,
1691
- "loss": 0.7577,
1692
- "step": 229
1693
- },
1694
- {
1695
- "epoch": 2.46,
1696
- "grad_norm": 0.3315601260165869,
1697
- "learning_rate": 6.647658541141735e-05,
1698
- "loss": 0.779,
1699
- "step": 230
1700
- },
1701
- {
1702
- "epoch": 2.47,
1703
- "grad_norm": 0.35482236759964675,
1704
- "learning_rate": 6.566247806970119e-05,
1705
- "loss": 0.7936,
1706
- "step": 231
1707
- },
1708
- {
1709
- "epoch": 2.48,
1710
- "grad_norm": 0.3318703205331905,
1711
- "learning_rate": 6.485094262002529e-05,
1712
- "loss": 0.7721,
1713
- "step": 232
1714
- },
1715
- {
1716
- "epoch": 2.49,
1717
- "grad_norm": 0.313412585518615,
1718
- "learning_rate": 6.404203984667019e-05,
1719
- "loss": 0.7333,
1720
- "step": 233
1721
- },
1722
- {
1723
- "epoch": 2.5,
1724
- "grad_norm": 0.3389693444254627,
1725
- "learning_rate": 6.323583033672799e-05,
1726
- "loss": 0.6991,
1727
- "step": 234
1728
- },
1729
- {
1730
- "epoch": 2.51,
1731
- "grad_norm": 0.33056782619334757,
1732
- "learning_rate": 6.243237447556449e-05,
1733
- "loss": 0.7872,
1734
- "step": 235
1735
- },
1736
- {
1737
- "epoch": 2.52,
1738
- "grad_norm": 0.3064085209522584,
1739
- "learning_rate": 6.163173244229619e-05,
1740
- "loss": 0.7713,
1741
- "step": 236
1742
- },
1743
- {
1744
- "epoch": 2.53,
1745
- "grad_norm": 0.3109445125421656,
1746
- "learning_rate": 6.083396420528298e-05,
1747
- "loss": 0.8228,
1748
- "step": 237
1749
- },
1750
- {
1751
- "epoch": 2.54,
1752
- "grad_norm": 0.35767207742703394,
1753
- "learning_rate": 6.0039129517636435e-05,
1754
- "loss": 0.8167,
1755
- "step": 238
1756
- },
1757
- {
1758
- "epoch": 2.55,
1759
- "grad_norm": 0.32869196909020376,
1760
- "learning_rate": 5.924728791274432e-05,
1761
- "loss": 0.7893,
1762
- "step": 239
1763
- },
1764
- {
1765
- "epoch": 2.56,
1766
- "grad_norm": 0.31178216743238674,
1767
- "learning_rate": 5.845849869981137e-05,
1768
- "loss": 0.7354,
1769
- "step": 240
1770
- },
1771
- {
1772
- "epoch": 2.56,
1773
- "eval_loss": 1.1470853090286255,
1774
- "eval_runtime": 119.0749,
1775
- "eval_samples_per_second": 8.39,
1776
- "eval_steps_per_second": 0.529,
1777
- "step": 240
1778
- },
1779
- {
1780
- "epoch": 2.57,
1781
- "grad_norm": 0.3146586486940167,
1782
- "learning_rate": 5.7672820959417254e-05,
1783
- "loss": 0.785,
1784
- "step": 241
1785
- },
1786
- {
1787
- "epoch": 2.58,
1788
- "grad_norm": 0.3309473634570162,
1789
- "learning_rate": 5.68903135390912e-05,
1790
- "loss": 0.7007,
1791
- "step": 242
1792
- },
1793
- {
1794
- "epoch": 2.59,
1795
- "grad_norm": 0.2927704203363025,
1796
- "learning_rate": 5.611103504890444e-05,
1797
- "loss": 0.778,
1798
- "step": 243
1799
- },
1800
- {
1801
- "epoch": 2.6,
1802
- "grad_norm": 0.31346541530480915,
1803
- "learning_rate": 5.533504385708024e-05,
1804
- "loss": 0.7272,
1805
- "step": 244
1806
- },
1807
- {
1808
- "epoch": 2.62,
1809
- "grad_norm": 0.2996345434845278,
1810
- "learning_rate": 5.456239808562209e-05,
1811
- "loss": 0.8091,
1812
- "step": 245
1813
- },
1814
- {
1815
- "epoch": 2.63,
1816
- "grad_norm": 0.29407937930772826,
1817
- "learning_rate": 5.379315560596038e-05,
1818
- "loss": 0.7666,
1819
- "step": 246
1820
- },
1821
- {
1822
- "epoch": 2.64,
1823
- "grad_norm": 0.30530254935425627,
1824
- "learning_rate": 5.3027374034617785e-05,
1825
- "loss": 0.7982,
1826
- "step": 247
1827
- },
1828
- {
1829
- "epoch": 2.65,
1830
- "grad_norm": 0.3298149075133802,
1831
- "learning_rate": 5.226511072889371e-05,
1832
- "loss": 0.7962,
1833
- "step": 248
1834
- },
1835
- {
1836
- "epoch": 2.66,
1837
- "grad_norm": 0.33155001378615223,
1838
- "learning_rate": 5.1506422782568345e-05,
1839
- "loss": 0.8087,
1840
- "step": 249
1841
- },
1842
- {
1843
- "epoch": 2.67,
1844
- "grad_norm": 0.32891369446509405,
1845
- "learning_rate": 5.0751367021626215e-05,
1846
- "loss": 0.7702,
1847
- "step": 250
1848
- },
1849
- {
1850
- "epoch": 2.68,
1851
- "grad_norm": 0.3042328939887202,
1852
- "learning_rate": 5.000000000000002e-05,
1853
- "loss": 0.7924,
1854
- "step": 251
1855
- },
1856
- {
1857
- "epoch": 2.69,
1858
- "grad_norm": 0.3037799376581133,
1859
- "learning_rate": 4.9252377995334444e-05,
1860
- "loss": 0.7852,
1861
- "step": 252
1862
- },
1863
- {
1864
- "epoch": 2.7,
1865
- "grad_norm": 0.3435430445603929,
1866
- "learning_rate": 4.85085570047713e-05,
1867
- "loss": 0.7501,
1868
- "step": 253
1869
- },
1870
- {
1871
- "epoch": 2.71,
1872
- "grad_norm": 0.3072160193979946,
1873
- "learning_rate": 4.776859274075506e-05,
1874
- "loss": 0.7462,
1875
- "step": 254
1876
- },
1877
- {
1878
- "epoch": 2.72,
1879
- "grad_norm": 0.3223586439500028,
1880
- "learning_rate": 4.703254062686017e-05,
1881
- "loss": 0.775,
1882
- "step": 255
1883
- },
1884
- {
1885
- "epoch": 2.73,
1886
- "grad_norm": 0.3270406403084203,
1887
- "learning_rate": 4.630045579363957e-05,
1888
- "loss": 0.8306,
1889
- "step": 256
1890
- },
1891
- {
1892
- "epoch": 2.74,
1893
- "grad_norm": 0.3360192842512657,
1894
- "learning_rate": 4.557239307449561e-05,
1895
- "loss": 0.7697,
1896
- "step": 257
1897
- },
1898
- {
1899
- "epoch": 2.75,
1900
- "grad_norm": 0.34282816479900324,
1901
- "learning_rate": 4.484840700157295e-05,
1902
- "loss": 0.7654,
1903
- "step": 258
1904
- },
1905
- {
1906
- "epoch": 2.77,
1907
- "grad_norm": 0.30039142762313786,
1908
- "learning_rate": 4.412855180167406e-05,
1909
- "loss": 0.7703,
1910
- "step": 259
1911
- },
1912
- {
1913
- "epoch": 2.78,
1914
- "grad_norm": 0.34307884673711425,
1915
- "learning_rate": 4.3412881392197526e-05,
1916
- "loss": 0.7993,
1917
- "step": 260
1918
- },
1919
- {
1920
- "epoch": 2.79,
1921
- "grad_norm": 0.33685538845268104,
1922
- "learning_rate": 4.270144937709981e-05,
1923
- "loss": 0.7866,
1924
- "step": 261
1925
- },
1926
- {
1927
- "epoch": 2.8,
1928
- "grad_norm": 0.33166767859224683,
1929
- "learning_rate": 4.19943090428802e-05,
1930
- "loss": 0.8083,
1931
- "step": 262
1932
- },
1933
- {
1934
- "epoch": 2.81,
1935
- "grad_norm": 0.3086370003245581,
1936
- "learning_rate": 4.129151335458957e-05,
1937
- "loss": 0.7938,
1938
- "step": 263
1939
- },
1940
- {
1941
- "epoch": 2.82,
1942
- "grad_norm": 0.3715649674817313,
1943
- "learning_rate": 4.059311495186338e-05,
1944
- "loss": 0.7577,
1945
- "step": 264
1946
- },
1947
- {
1948
- "epoch": 2.82,
1949
- "eval_loss": 1.1478512287139893,
1950
- "eval_runtime": 119.1178,
1951
- "eval_samples_per_second": 8.387,
1952
- "eval_steps_per_second": 0.529,
1953
- "step": 264
1954
- },
1955
- {
1956
- "epoch": 2.83,
1957
- "grad_norm": 0.3298033298390841,
1958
- "learning_rate": 3.9899166144978904e-05,
1959
- "loss": 0.8296,
1960
- "step": 265
1961
- },
1962
- {
1963
- "epoch": 2.84,
1964
- "grad_norm": 0.3294808666769515,
1965
- "learning_rate": 3.920971891093718e-05,
1966
- "loss": 0.8206,
1967
- "step": 266
1968
- },
1969
- {
1970
- "epoch": 2.85,
1971
- "grad_norm": 0.3239672501165848,
1972
- "learning_rate": 3.852482488956992e-05,
1973
- "loss": 0.8116,
1974
- "step": 267
1975
- },
1976
- {
1977
- "epoch": 2.86,
1978
- "grad_norm": 0.3286742994048133,
1979
- "learning_rate": 3.784453537967161e-05,
1980
- "loss": 0.8096,
1981
- "step": 268
1982
- },
1983
- {
1984
- "epoch": 2.87,
1985
- "grad_norm": 0.31259050250842946,
1986
- "learning_rate": 3.7168901335157315e-05,
1987
- "loss": 0.7669,
1988
- "step": 269
1989
- },
1990
- {
1991
- "epoch": 2.88,
1992
- "grad_norm": 0.3308991711135206,
1993
- "learning_rate": 3.649797336124615e-05,
1994
- "loss": 0.8041,
1995
- "step": 270
1996
- },
1997
- {
1998
- "epoch": 2.89,
1999
- "grad_norm": 0.32757727002633424,
2000
- "learning_rate": 3.583180171067101e-05,
2001
- "loss": 0.7673,
2002
- "step": 271
2003
- },
2004
- {
2005
- "epoch": 2.9,
2006
- "grad_norm": 0.3342551756453125,
2007
- "learning_rate": 3.517043627991441e-05,
2008
- "loss": 0.8005,
2009
- "step": 272
2010
- },
2011
- {
2012
- "epoch": 2.91,
2013
- "grad_norm": 0.31643754309861705,
2014
- "learning_rate": 3.45139266054715e-05,
2015
- "loss": 0.787,
2016
- "step": 273
2017
- },
2018
- {
2019
- "epoch": 2.93,
2020
- "grad_norm": 0.3140452683879005,
2021
- "learning_rate": 3.3862321860139576e-05,
2022
- "loss": 0.7888,
2023
- "step": 274
2024
- },
2025
- {
2026
- "epoch": 2.94,
2027
- "grad_norm": 0.30706221155036223,
2028
- "learning_rate": 3.3215670849335155e-05,
2029
- "loss": 0.827,
2030
- "step": 275
2031
- },
2032
- {
2033
- "epoch": 2.95,
2034
- "grad_norm": 0.3185483102727301,
2035
- "learning_rate": 3.257402200743821e-05,
2036
- "loss": 0.7779,
2037
- "step": 276
2038
- },
2039
- {
2040
- "epoch": 2.96,
2041
- "grad_norm": 0.3032818796307545,
2042
- "learning_rate": 3.19374233941647e-05,
2043
- "loss": 0.7993,
2044
- "step": 277
2045
- },
2046
- {
2047
- "epoch": 2.97,
2048
- "grad_norm": 0.3057758504695884,
2049
- "learning_rate": 3.130592269096671e-05,
2050
- "loss": 0.768,
2051
- "step": 278
2052
- },
2053
- {
2054
- "epoch": 2.98,
2055
- "grad_norm": 0.3245404038219604,
2056
- "learning_rate": 3.0679567197461134e-05,
2057
- "loss": 0.7706,
2058
- "step": 279
2059
- },
2060
- {
2061
- "epoch": 2.99,
2062
- "grad_norm": 0.3376535123919746,
2063
- "learning_rate": 3.005840382788685e-05,
2064
- "loss": 0.7825,
2065
- "step": 280
2066
- },
2067
- {
2068
- "epoch": 3.0,
2069
- "grad_norm": 0.34483227716329967,
2070
- "learning_rate": 2.944247910759097e-05,
2071
- "loss": 0.7725,
2072
- "step": 281
2073
- },
2074
- {
2075
- "epoch": 3.01,
2076
- "grad_norm": 0.30532824560617583,
2077
- "learning_rate": 2.8831839169543996e-05,
2078
- "loss": 0.7228,
2079
- "step": 282
2080
- },
2081
- {
2082
- "epoch": 3.02,
2083
- "grad_norm": 0.31015055336513103,
2084
- "learning_rate": 2.8226529750884402e-05,
2085
- "loss": 0.6507,
2086
- "step": 283
2087
- },
2088
- {
2089
- "epoch": 3.03,
2090
- "grad_norm": 0.3125936248719555,
2091
- "learning_rate": 2.7626596189492983e-05,
2092
- "loss": 0.734,
2093
- "step": 284
2094
- },
2095
- {
2096
- "epoch": 3.04,
2097
- "grad_norm": 0.32131974094099536,
2098
- "learning_rate": 2.7032083420597e-05,
2099
- "loss": 0.7168,
2100
- "step": 285
2101
- },
2102
- {
2103
- "epoch": 3.05,
2104
- "grad_norm": 0.3188571782464755,
2105
- "learning_rate": 2.6443035973404496e-05,
2106
- "loss": 0.6591,
2107
- "step": 286
2108
- },
2109
- {
2110
- "epoch": 3.06,
2111
- "grad_norm": 0.33177480330954007,
2112
- "learning_rate": 2.585949796776912e-05,
2113
- "loss": 0.6965,
2114
- "step": 287
2115
- },
2116
- {
2117
- "epoch": 3.07,
2118
- "grad_norm": 0.38345252271104163,
2119
- "learning_rate": 2.528151311088537e-05,
2120
- "loss": 0.6647,
2121
- "step": 288
2122
- },
2123
- {
2124
- "epoch": 3.07,
2125
- "eval_loss": 1.1923209428787231,
2126
- "eval_runtime": 118.9649,
2127
- "eval_samples_per_second": 8.397,
2128
- "eval_steps_per_second": 0.53,
2129
- "step": 288
2130
- },
2131
- {
2132
- "epoch": 3.09,
2133
- "grad_norm": 0.3519739075006545,
2134
- "learning_rate": 2.4709124694015116e-05,
2135
- "loss": 0.7169,
2136
- "step": 289
2137
- },
2138
- {
2139
- "epoch": 3.1,
2140
- "grad_norm": 0.34731968650913336,
2141
- "learning_rate": 2.4142375589244957e-05,
2142
- "loss": 0.7161,
2143
- "step": 290
2144
- },
2145
- {
2146
- "epoch": 3.11,
2147
- "grad_norm": 0.33778912337210615,
2148
- "learning_rate": 2.3581308246275103e-05,
2149
- "loss": 0.7155,
2150
- "step": 291
2151
- },
2152
- {
2153
- "epoch": 3.12,
2154
- "grad_norm": 0.3209719266692497,
2155
- "learning_rate": 2.302596468923981e-05,
2156
- "loss": 0.672,
2157
- "step": 292
2158
- },
2159
- {
2160
- "epoch": 3.13,
2161
- "grad_norm": 0.3450647699674907,
2162
- "learning_rate": 2.247638651355991e-05,
2163
- "loss": 0.696,
2164
- "step": 293
2165
- },
2166
- {
2167
- "epoch": 3.14,
2168
- "grad_norm": 0.34077271836911865,
2169
- "learning_rate": 2.1932614882827197e-05,
2170
- "loss": 0.6947,
2171
- "step": 294
2172
- },
2173
- {
2174
- "epoch": 3.15,
2175
- "grad_norm": 0.35952846733253163,
2176
- "learning_rate": 2.139469052572127e-05,
2177
- "loss": 0.6934,
2178
- "step": 295
2179
- },
2180
- {
2181
- "epoch": 3.16,
2182
- "grad_norm": 0.32425812401737625,
2183
- "learning_rate": 2.0862653732958915e-05,
2184
- "loss": 0.6803,
2185
- "step": 296
2186
- },
2187
- {
2188
- "epoch": 3.17,
2189
- "grad_norm": 0.3354776806257836,
2190
- "learning_rate": 2.03365443542764e-05,
2191
- "loss": 0.6847,
2192
- "step": 297
2193
- },
2194
- {
2195
- "epoch": 3.18,
2196
- "grad_norm": 0.35880539160768654,
2197
- "learning_rate": 1.981640179544466e-05,
2198
- "loss": 0.6447,
2199
- "step": 298
2200
- },
2201
- {
2202
- "epoch": 3.19,
2203
- "grad_norm": 0.3413724716725652,
2204
- "learning_rate": 1.93022650153178e-05,
2205
- "loss": 0.6224,
2206
- "step": 299
2207
- },
2208
- {
2209
- "epoch": 3.2,
2210
- "grad_norm": 0.3424458691012398,
2211
- "learning_rate": 1.879417252291502e-05,
2212
- "loss": 0.6699,
2213
- "step": 300
2214
- },
2215
- {
2216
- "epoch": 3.21,
2217
- "grad_norm": 0.3382667696983362,
2218
- "learning_rate": 1.829216237453637e-05,
2219
- "loss": 0.6797,
2220
- "step": 301
2221
- },
2222
- {
2223
- "epoch": 3.22,
2224
- "grad_norm": 0.324989093083984,
2225
- "learning_rate": 1.7796272170912253e-05,
2226
- "loss": 0.691,
2227
- "step": 302
2228
- },
2229
- {
2230
- "epoch": 3.23,
2231
- "grad_norm": 0.32204008590962435,
2232
- "learning_rate": 1.730653905438714e-05,
2233
- "loss": 0.6636,
2234
- "step": 303
2235
- },
2236
- {
2237
- "epoch": 3.25,
2238
- "grad_norm": 0.34081693500058535,
2239
- "learning_rate": 1.6822999706137567e-05,
2240
- "loss": 0.7285,
2241
- "step": 304
2242
- },
2243
- {
2244
- "epoch": 3.26,
2245
- "grad_norm": 0.3745986061815685,
2246
- "learning_rate": 1.634569034342476e-05,
2247
- "loss": 0.6896,
2248
- "step": 305
2249
- },
2250
- {
2251
- "epoch": 3.27,
2252
- "grad_norm": 0.3460331973628518,
2253
- "learning_rate": 1.587464671688187e-05,
2254
- "loss": 0.6988,
2255
- "step": 306
2256
- },
2257
- {
2258
- "epoch": 3.28,
2259
- "grad_norm": 0.3548738091123995,
2260
- "learning_rate": 1.5409904107836358e-05,
2261
- "loss": 0.7166,
2262
- "step": 307
2263
- },
2264
- {
2265
- "epoch": 3.29,
2266
- "grad_norm": 0.3616387758766049,
2267
- "learning_rate": 1.495149732566723e-05,
2268
- "loss": 0.7033,
2269
- "step": 308
2270
- },
2271
- {
2272
- "epoch": 3.3,
2273
- "grad_norm": 0.3358013853175185,
2274
- "learning_rate": 1.4499460705197998e-05,
2275
- "loss": 0.7096,
2276
- "step": 309
2277
- },
2278
- {
2279
- "epoch": 3.31,
2280
- "grad_norm": 0.34774337705479985,
2281
- "learning_rate": 1.4053828104124867e-05,
2282
- "loss": 0.6695,
2283
- "step": 310
2284
- },
2285
- {
2286
- "epoch": 3.32,
2287
- "grad_norm": 0.3273729493403579,
2288
- "learning_rate": 1.361463290048085e-05,
2289
- "loss": 0.6893,
2290
- "step": 311
2291
- },
2292
- {
2293
- "epoch": 3.33,
2294
- "grad_norm": 0.343572039488577,
2295
- "learning_rate": 1.3181907990135622e-05,
2296
- "loss": 0.6928,
2297
- "step": 312
2298
- },
2299
- {
2300
- "epoch": 3.33,
2301
- "eval_loss": 1.185637354850769,
2302
- "eval_runtime": 119.6535,
2303
- "eval_samples_per_second": 8.349,
2304
- "eval_steps_per_second": 0.527,
2305
- "step": 312
2306
- },
2307
- {
2308
- "epoch": 3.34,
2309
- "grad_norm": 0.36314106359094633,
2310
- "learning_rate": 1.2755685784331783e-05,
2311
- "loss": 0.6887,
2312
- "step": 313
2313
- },
2314
- {
2315
- "epoch": 3.35,
2316
- "grad_norm": 0.3178995942964272,
2317
- "learning_rate": 1.2335998207257137e-05,
2318
- "loss": 0.7163,
2319
- "step": 314
2320
- },
2321
- {
2322
- "epoch": 3.36,
2323
- "grad_norm": 0.3530883103274161,
2324
- "learning_rate": 1.1922876693653585e-05,
2325
- "loss": 0.6264,
2326
- "step": 315
2327
- },
2328
- {
2329
- "epoch": 3.37,
2330
- "grad_norm": 0.34700396777915954,
2331
- "learning_rate": 1.1516352186462586e-05,
2332
- "loss": 0.6414,
2333
- "step": 316
2334
- },
2335
- {
2336
- "epoch": 3.38,
2337
- "grad_norm": 0.3259373809872755,
2338
- "learning_rate": 1.1116455134507664e-05,
2339
- "loss": 0.6461,
2340
- "step": 317
2341
- },
2342
- {
2343
- "epoch": 3.4,
2344
- "grad_norm": 0.35544661568277985,
2345
- "learning_rate": 1.0723215490213634e-05,
2346
- "loss": 0.6723,
2347
- "step": 318
2348
- },
2349
- {
2350
- "epoch": 3.41,
2351
- "grad_norm": 0.3488692435606677,
2352
- "learning_rate": 1.0336662707363287e-05,
2353
- "loss": 0.7782,
2354
- "step": 319
2355
- },
2356
- {
2357
- "epoch": 3.42,
2358
- "grad_norm": 0.35693983497851145,
2359
- "learning_rate": 9.95682573889114e-06,
2360
- "loss": 0.6982,
2361
- "step": 320
2362
- },
2363
- {
2364
- "epoch": 3.43,
2365
- "grad_norm": 0.33248426163504435,
2366
- "learning_rate": 9.583733034714981e-06,
2367
- "loss": 0.6326,
2368
- "step": 321
2369
- },
2370
- {
2371
- "epoch": 3.44,
2372
- "grad_norm": 0.38375415594727097,
2373
- "learning_rate": 9.217412539604942e-06,
2374
- "loss": 0.6971,
2375
- "step": 322
2376
- },
2377
- {
2378
- "epoch": 3.45,
2379
- "grad_norm": 0.37036824832855825,
2380
- "learning_rate": 8.857891691090337e-06,
2381
- "loss": 0.698,
2382
- "step": 323
2383
- },
2384
- {
2385
- "epoch": 3.46,
2386
- "grad_norm": 0.3417675223689608,
2387
- "learning_rate": 8.505197417404687e-06,
2388
- "loss": 0.6836,
2389
- "step": 324
2390
- },
2391
- {
2392
- "epoch": 3.47,
2393
- "grad_norm": 0.3700366171554004,
2394
- "learning_rate": 8.15935613546872e-06,
2395
- "loss": 0.6465,
2396
- "step": 325
2397
- },
2398
- {
2399
- "epoch": 3.48,
2400
- "grad_norm": 0.3492600384568647,
2401
- "learning_rate": 7.820393748911791e-06,
2402
- "loss": 0.6808,
2403
- "step": 326
2404
- },
2405
- {
2406
- "epoch": 3.49,
2407
- "grad_norm": 0.36976467848962297,
2408
- "learning_rate": 7.488335646131628e-06,
2409
- "loss": 0.6615,
2410
- "step": 327
2411
- },
2412
- {
2413
- "epoch": 3.5,
2414
- "grad_norm": 0.3290900616093639,
2415
- "learning_rate": 7.163206698392744e-06,
2416
- "loss": 0.7275,
2417
- "step": 328
2418
- },
2419
- {
2420
- "epoch": 3.51,
2421
- "grad_norm": 0.3566655771775109,
2422
- "learning_rate": 6.845031257963619e-06,
2423
- "loss": 0.7116,
2424
- "step": 329
2425
- },
2426
- {
2427
- "epoch": 3.52,
2428
- "grad_norm": 0.320805379756211,
2429
- "learning_rate": 6.533833156292679e-06,
2430
- "loss": 0.6879,
2431
- "step": 330
2432
- },
2433
- {
2434
- "epoch": 3.53,
2435
- "grad_norm": 0.34337800662165496,
2436
- "learning_rate": 6.229635702223324e-06,
2437
- "loss": 0.7163,
2438
- "step": 331
2439
- },
2440
- {
2441
- "epoch": 3.54,
2442
- "grad_norm": 0.36367722946797926,
2443
- "learning_rate": 5.932461680248014e-06,
2444
- "loss": 0.7021,
2445
- "step": 332
2446
- },
2447
- {
2448
- "epoch": 3.56,
2449
- "grad_norm": 0.36522399153203633,
2450
- "learning_rate": 5.6423333488018095e-06,
2451
- "loss": 0.6861,
2452
- "step": 333
2453
- },
2454
- {
2455
- "epoch": 3.57,
2456
- "grad_norm": 0.365852241324378,
2457
- "learning_rate": 5.359272438595153e-06,
2458
- "loss": 0.6885,
2459
- "step": 334
2460
- },
2461
- {
2462
- "epoch": 3.58,
2463
- "grad_norm": 0.3614672831032901,
2464
- "learning_rate": 5.083300150986259e-06,
2465
- "loss": 0.6783,
2466
- "step": 335
2467
- },
2468
- {
2469
- "epoch": 3.59,
2470
- "grad_norm": 0.3642169019209535,
2471
- "learning_rate": 4.8144371563930476e-06,
2472
- "loss": 0.731,
2473
- "step": 336
2474
- },
2475
- {
2476
- "epoch": 3.59,
2477
- "eval_loss": 1.1889785528182983,
2478
- "eval_runtime": 119.1203,
2479
- "eval_samples_per_second": 8.386,
2480
- "eval_steps_per_second": 0.529,
2481
- "step": 336
2482
- },
2483
- {
2484
- "epoch": 3.6,
2485
- "grad_norm": 0.34161446199658746,
2486
- "learning_rate": 4.552703592745033e-06,
2487
- "loss": 0.7138,
2488
- "step": 337
2489
- },
2490
- {
2491
- "epoch": 3.61,
2492
- "grad_norm": 0.35734876291405965,
2493
- "learning_rate": 4.298119063974914e-06,
2494
- "loss": 0.6725,
2495
- "step": 338
2496
- },
2497
- {
2498
- "epoch": 3.62,
2499
- "grad_norm": 0.32914706405183236,
2500
- "learning_rate": 4.050702638550275e-06,
2501
- "loss": 0.6961,
2502
- "step": 339
2503
- },
2504
- {
2505
- "epoch": 3.63,
2506
- "grad_norm": 0.3570873592035516,
2507
- "learning_rate": 3.810472848045266e-06,
2508
- "loss": 0.6514,
2509
- "step": 340
2510
- },
2511
- {
2512
- "epoch": 3.64,
2513
- "grad_norm": 0.3597890086242411,
2514
- "learning_rate": 3.5774476857527107e-06,
2515
- "loss": 0.6962,
2516
- "step": 341
2517
- },
2518
- {
2519
- "epoch": 3.65,
2520
- "grad_norm": 0.38480539648857937,
2521
- "learning_rate": 3.3516446053363015e-06,
2522
- "loss": 0.6686,
2523
- "step": 342
2524
- },
2525
- {
2526
- "epoch": 3.66,
2527
- "grad_norm": 0.3525111961446683,
2528
- "learning_rate": 3.133080519523368e-06,
2529
- "loss": 0.6667,
2530
- "step": 343
2531
- },
2532
- {
2533
- "epoch": 3.67,
2534
- "grad_norm": 0.3735012861865468,
2535
- "learning_rate": 2.921771798838069e-06,
2536
- "loss": 0.7137,
2537
- "step": 344
2538
- },
2539
- {
2540
- "epoch": 3.68,
2541
- "grad_norm": 0.35587547424949395,
2542
- "learning_rate": 2.717734270375272e-06,
2543
- "loss": 0.6743,
2544
- "step": 345
2545
- },
2546
- {
2547
- "epoch": 3.69,
2548
- "grad_norm": 0.3657817512714091,
2549
- "learning_rate": 2.520983216615047e-06,
2550
- "loss": 0.6373,
2551
- "step": 346
2552
- },
2553
- {
2554
- "epoch": 3.7,
2555
- "grad_norm": 0.3509346866409975,
2556
- "learning_rate": 2.3315333742780942e-06,
2557
- "loss": 0.6703,
2558
- "step": 347
2559
- },
2560
- {
2561
- "epoch": 3.72,
2562
- "grad_norm": 0.3459857927296872,
2563
- "learning_rate": 2.1493989332218468e-06,
2564
- "loss": 0.7223,
2565
- "step": 348
2566
- },
2567
- {
2568
- "epoch": 3.73,
2569
- "grad_norm": 0.38016379979574644,
2570
- "learning_rate": 1.974593535377722e-06,
2571
- "loss": 0.7024,
2572
- "step": 349
2573
- },
2574
- {
2575
- "epoch": 3.74,
2576
- "grad_norm": 0.3191443635681887,
2577
- "learning_rate": 1.8071302737293295e-06,
2578
- "loss": 0.6933,
2579
- "step": 350
2580
- },
2581
- {
2582
- "epoch": 3.75,
2583
- "grad_norm": 0.3586330010035836,
2584
- "learning_rate": 1.6470216913317626e-06,
2585
- "loss": 0.7001,
2586
- "step": 351
2587
- },
2588
- {
2589
- "epoch": 3.76,
2590
- "grad_norm": 0.3850836045039222,
2591
- "learning_rate": 1.4942797803721543e-06,
2592
- "loss": 0.61,
2593
- "step": 352
2594
- },
2595
- {
2596
- "epoch": 3.77,
2597
- "grad_norm": 0.34888020988423163,
2598
- "learning_rate": 1.348915981271437e-06,
2599
- "loss": 0.6528,
2600
- "step": 353
2601
- },
2602
- {
2603
- "epoch": 3.78,
2604
- "grad_norm": 0.354531551308838,
2605
- "learning_rate": 1.2109411818274852e-06,
2606
- "loss": 0.7128,
2607
- "step": 354
2608
- },
2609
- {
2610
- "epoch": 3.79,
2611
- "grad_norm": 0.38330787603530414,
2612
- "learning_rate": 1.0803657163995895e-06,
2613
- "loss": 0.643,
2614
- "step": 355
2615
- },
2616
- {
2617
- "epoch": 3.8,
2618
- "grad_norm": 0.3432828481582299,
2619
- "learning_rate": 9.57199365134387e-07,
2620
- "loss": 0.6318,
2621
- "step": 356
2622
- },
2623
- {
2624
- "epoch": 3.81,
2625
- "grad_norm": 0.37153831756155836,
2626
- "learning_rate": 8.41451353233369e-07,
2627
- "loss": 0.694,
2628
- "step": 357
2629
- },
2630
- {
2631
- "epoch": 3.82,
2632
- "grad_norm": 0.3701195226107489,
2633
- "learning_rate": 7.331303502618903e-07,
2634
- "loss": 0.6561,
2635
- "step": 358
2636
- },
2637
- {
2638
- "epoch": 3.83,
2639
- "grad_norm": 0.3831847969050675,
2640
- "learning_rate": 6.322444694998319e-07,
2641
- "loss": 0.7167,
2642
- "step": 359
2643
- },
2644
- {
2645
- "epoch": 3.84,
2646
- "grad_norm": 0.3382934858827101,
2647
- "learning_rate": 5.388012673338661e-07,
2648
- "loss": 0.7193,
2649
- "step": 360
2650
- },
2651
- {
2652
- "epoch": 3.84,
2653
- "eval_loss": 1.191129207611084,
2654
- "eval_runtime": 119.5001,
2655
- "eval_samples_per_second": 8.36,
2656
- "eval_steps_per_second": 0.527,
2657
- "step": 360
2658
- },
2659
- {
2660
- "epoch": 3.85,
2661
- "grad_norm": 0.35121545616673777,
2662
- "learning_rate": 4.5280774269154115e-07,
2663
- "loss": 0.6838,
2664
- "step": 361
2665
- },
2666
- {
2667
- "epoch": 3.86,
2668
- "grad_norm": 0.3677398099025592,
2669
- "learning_rate": 3.742703365170241e-07,
2670
- "loss": 0.6364,
2671
- "step": 362
2672
- },
2673
- {
2674
- "epoch": 3.88,
2675
- "grad_norm": 0.36901780764712055,
2676
- "learning_rate": 3.0319493128866396e-07,
2677
- "loss": 0.6827,
2678
- "step": 363
2679
- },
2680
- {
2681
- "epoch": 3.89,
2682
- "grad_norm": 0.35056873374955655,
2683
- "learning_rate": 2.395868505784438e-07,
2684
- "loss": 0.6432,
2685
- "step": 364
2686
- },
2687
- {
2688
- "epoch": 3.9,
2689
- "grad_norm": 0.3627218430880944,
2690
- "learning_rate": 1.83450858653178e-07,
2691
- "loss": 0.7243,
2692
- "step": 365
2693
- },
2694
- {
2695
- "epoch": 3.91,
2696
- "grad_norm": 0.32428680992451525,
2697
- "learning_rate": 1.3479116011769767e-07,
2698
- "loss": 0.6574,
2699
- "step": 366
2700
- },
2701
- {
2702
- "epoch": 3.92,
2703
- "grad_norm": 0.349287012328286,
2704
- "learning_rate": 9.361139959993549e-08,
2705
- "loss": 0.7048,
2706
- "step": 367
2707
- },
2708
- {
2709
- "epoch": 3.93,
2710
- "grad_norm": 0.3701389678746645,
2711
- "learning_rate": 5.991466147791113e-08,
2712
- "loss": 0.6742,
2713
- "step": 368
2714
- },
2715
- {
2716
- "epoch": 3.94,
2717
- "grad_norm": 0.3562569316390435,
2718
- "learning_rate": 3.370346964876036e-08,
2719
- "loss": 0.6747,
2720
- "step": 369
2721
- },
2722
- {
2723
- "epoch": 3.95,
2724
- "grad_norm": 0.35498414175449955,
2725
- "learning_rate": 1.4979787339619578e-08,
2726
- "loss": 0.6466,
2727
- "step": 370
2728
- },
2729
- {
2730
- "epoch": 3.96,
2731
- "grad_norm": 0.353867118979457,
2732
- "learning_rate": 3.745016960665648e-09,
2733
- "loss": 0.6586,
2734
- "step": 371
2735
- },
2736
- {
2737
- "epoch": 3.97,
2738
- "grad_norm": 0.3411723289198462,
2739
- "learning_rate": 0.0,
2740
- "loss": 0.6657,
2741
- "step": 372
2742
- }
2743
- ],
2744
- "logging_steps": 1,
2745
- "max_steps": 372,
2746
- "num_input_tokens_seen": 0,
2747
- "num_train_epochs": 4,
2748
- "save_steps": 93,
2749
- "total_flos": 6.890718948426252e+18,
2750
- "train_batch_size": 4,
2751
- "trial_name": null,
2752
- "trial_params": null
2753
- }