xu3kev committed on
Commit
2bb583b
·
verified ·
1 Parent(s): 4dbf161

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/README.md +151 -0
  2. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/adapter_config.json +34 -0
  3. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/adapter_model.bin +3 -0
  4. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/README.md +202 -0
  5. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/adapter_config.json +34 -0
  6. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/adapter_model.safetensors +3 -0
  7. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  8. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  9. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  10. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  11. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  12. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  13. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  14. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  15. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/mp_rank_00_model_states.pt +3 -0
  16. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/latest +1 -0
  17. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/rng_state_0.pth +3 -0
  18. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/rng_state_1.pth +3 -0
  19. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/rng_state_2.pth +3 -0
  20. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/rng_state_3.pth +3 -0
  21. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/rng_state_4.pth +3 -0
  22. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/rng_state_5.pth +3 -0
  23. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/rng_state_6.pth +3 -0
  24. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/rng_state_7.pth +3 -0
  25. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/scheduler.pt +3 -0
  26. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/special_tokens_map.json +23 -0
  27. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/tokenizer.json +0 -0
  28. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/tokenizer_config.json +193 -0
  29. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/trainer_state.json +3210 -0
  30. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/training_args.bin +3 -0
  31. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/zero_to_fp32.py +592 -0
  32. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/README.md +202 -0
  33. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/adapter_config.json +34 -0
  34. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/adapter_model.safetensors +3 -0
  35. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  36. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  37. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  38. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  39. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  40. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  41. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  42. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  43. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/mp_rank_00_model_states.pt +3 -0
  44. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/latest +1 -0
  45. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/rng_state_0.pth +3 -0
  46. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/rng_state_1.pth +3 -0
  47. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/rng_state_2.pth +3 -0
  48. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/rng_state_3.pth +3 -0
  49. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/rng_state_4.pth +3 -0
  50. gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/rng_state_5.pth +3 -0
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/README.md ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ library_name: peft
4
+ tags:
5
+ - generated_from_trainer
6
+ base_model: deepseek-ai/deepseek-coder-33b-instruct
7
+ model-index:
8
+ - name: lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
16
+ <details><summary>See axolotl config</summary>
17
+
18
+ axolotl version: `0.4.0`
19
+ ```yaml
20
+ adapter: lora
21
+ base_model: deepseek-ai/deepseek-coder-33b-instruct
22
+ bf16: auto
23
+ dataset_prepared_path: ./logo_ds_preprocess_list_gpt35
24
+ datasets:
25
+ - path: ../logo/fix_logo_synthetic_training_data_full.json
26
+ type:
27
+ field_instruction: input
28
+ field_output: output
29
+ format: '### Instruction:
30
+
31
+ {input}
32
+
33
+ ### Response:
34
+
35
+ '
36
+ no_input_format: '{instruction}'
37
+ debug: null
38
+ deepspeed: ./deepspeed_configs/zero2.json
39
+ early_stopping_patience: null
40
+ eval_sample_packing: true
41
+ evals_per_epoch: 4
42
+ flash_attention: true
43
+ fp16: null
44
+ fsdp: null
45
+ fsdp_config: null
46
+ gradient_accumulation_steps: 2
47
+ gradient_checkpointing: true
48
+ group_by_length: false
49
+ is_llama_derived_model: true
50
+ learning_rate: 0.0002
51
+ load_in_4bit: false
52
+ load_in_8bit: true
53
+ local_rank: null
54
+ logging_steps: 1
55
+ lora_alpha: 1024
56
+ lora_dropout: 0.05
57
+ lora_fan_in_fan_out: null
58
+ lora_model_dir: null
59
+ lora_r: 1024
60
+ lora_target_linear: true
61
+ lr_scheduler: cosine
62
+ micro_batch_size: 4
63
+ model_type: AutoModelForCausalLM
64
+ num_epochs: 2
65
+ optimizer: adamw_bnb_8bit
66
+ output_dir: ./lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024
67
+ pad_to_sequence_len: true
68
+ resume_from_checkpoint: null
69
+ s2_attention: null
70
+ sample_packing: true
71
+ saves_per_epoch: 1
72
+ sequence_len: 1800
73
+ special_tokens:
74
+ bos_token: "<\uFF5Cbegin\u2581of\u2581sentence\uFF5C>"
75
+ eos_token: <|EOT|>
76
+ strict: true
77
+ tf32: false
78
+ tokenizer_type: AutoTokenizer
79
+ train_on_inputs: false
80
+ val_set_size: 0.05
81
+ wandb_entity: null
82
+ wandb_log_model: null
83
+ wandb_name: logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024
84
+ wandb_project: pbe-axo
85
+ wandb_watch: null
86
+ warmup_steps: 50
87
+ weight_decay: 0.0
88
+ xformers_attention: null
89
+
90
+ ```
91
+
92
+ </details><br>
93
+
94
+ # lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024
95
+
96
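+ This model is a fine-tuned version of [deepseek-ai/deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct) on a local dataset (`fix_logo_synthetic_training_data_full.json`, as listed under `datasets` in the axolotl config above).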
97
+ It achieves the following results on the evaluation set:
98
+ - Loss: 0.2380
99
+
100
+ ## Model description
101
+
102
+ More information needed
103
+
104
+ ## Intended uses & limitations
105
+
106
+ More information needed
107
+
108
+ ## Training and evaluation data
109
+
110
+ More information needed
111
+
112
+ ## Training procedure
113
+
114
+ ### Training hyperparameters
115
+
116
+ The following hyperparameters were used during training:
117
+ - learning_rate: 0.0002
118
+ - train_batch_size: 4
119
+ - eval_batch_size: 4
120
+ - seed: 42
121
+ - distributed_type: multi-GPU
122
+ - num_devices: 8
123
+ - gradient_accumulation_steps: 2
124
+ - total_train_batch_size: 64
125
+ - total_eval_batch_size: 32
126
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
127
+ - lr_scheduler_type: cosine
128
+ - lr_scheduler_warmup_steps: 50
129
+ - num_epochs: 2
130
+
131
+ ### Training results
132
+
133
+ | Training Loss | Epoch | Step | Validation Loss |
134
+ |:-------------:|:-----:|:----:|:---------------:|
135
+ | 2.1469 | 0.0 | 1 | 2.1795 |
136
+ | 0.3319 | 0.25 | 113 | 0.3324 |
137
+ | 0.2883 | 0.5 | 226 | 0.2976 |
138
+ | 0.2748 | 0.75 | 339 | 0.2785 |
139
+ | 0.2812 | 1.0 | 452 | 0.2612 |
140
+ | 0.2276 | 1.23 | 565 | 0.2523 |
141
+ | 0.2483 | 1.48 | 678 | 0.2440 |
142
+ | 0.1982 | 1.73 | 791 | 0.2380 |
143
+
144
+
145
+ ### Framework versions
146
+
147
+ - PEFT 0.10.0
148
+ - Transformers 4.40.0.dev0
149
+ - Pytorch 2.1.2+cu121
150
+ - Datasets 2.15.0
151
+ - Tokenizers 0.15.0
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "deepseek-ai/deepseek-coder-33b-instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": null,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 1024,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 1024,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "gate_proj",
25
+ "k_proj",
26
+ "up_proj",
27
+ "q_proj",
28
+ "down_proj",
29
+ "v_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ad79e5fb81e8dc630c9b53d4cc03ef5a40cda07b76bd882dde655b5082e8dee
3
+ size 15765481838
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: deepseek-ai/deepseek-coder-33b-instruct
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "deepseek-ai/deepseek-coder-33b-instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": null,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 1024,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 1024,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "gate_proj",
25
+ "k_proj",
26
+ "up_proj",
27
+ "q_proj",
28
+ "down_proj",
29
+ "v_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:954715f65f71a8e1f73755b14991824de2de206d91bd289dd1b0a8c0b9c74836
3
+ size 15765462656
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd05cbb029cea1e103a3f0d418ad90c6dbb28fc9251529cce0b84a8462bbd368
3
+ size 11824020348
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f9e1cc6bc6251bfe47d18c813d8ff0c9a6f5f5b5376b1e8d009a8e47d35b392
3
+ size 11824020540
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52a97e6b391e4adee8e6ccd964e94453ba176f900bc8a12b067a4d9a09d34943
3
+ size 11824020476
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50bf1a9f197b077b476d26152faaa2e8228022b5fe3f718261597ec8c566700d
3
+ size 11824020028
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6a632e7a6f5ccd8856abbf05d8b08b725e376e09812b421bee776add7699332
3
+ size 11824020476
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e980dd30724dd4b8c78c45ef86efc4a03afaebec8f18c6b8a69ad530aa3465a
3
+ size 11824020604
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d59303410adb2e24b0f9114b518e09f0e6cd15c1ac2b2dd22b990090d16d4287
3
+ size 11824020476
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d73b8cac39bde007b7ab03562a77bae535d3e58e30529f95f9d7730554257acf
3
+ size 11824020028
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/global_step451/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:670616a46d85e329435fc96c86b1706fe519bf1cd64deb5d9ce280940bc3a882
3
+ size 15781541500
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step451
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe694cdc55dd3cdb479144f59be96c91806afa53549f393e915933e3a9179402
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64edf4480002a76d50328e34f60d216d2a41a08fd1948260757d561ceaea5a45
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8e02399ba27844cadf8ff341e6a15801677129947df4d0f21a43c80aa42d671
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63d38cdb4ec7272c83f7a8a0d23357d4d3b30b4add272afa23ca198ea431c729
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f5f8dc5ab49a1f7f965170650d2ae1078d160bdb2616a9d2ba173fdfef2cbf1
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8049b251c21930a3bf8f68b218d94a1474ea9cde1bd411d4c0dd51f7aeea5f5
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85fac4fa6629a125ce826ed9762d2d438acaaf998c88bfdabf39c27ae11b9edf
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a5cef03958c01ab037ae8acce834a0dc18e393e372a888ff28a78f16571a8ef
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e64565a7cd3dda5d1c1496f437763d3a0989eb4ac891a6f6e96cfff34899123
3
+ size 1064
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin▁of▁sentence|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|EOT|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|end▁of▁sentence|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/tokenizer_config.json ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "32000": {
6
+ "content": "õ",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": false
12
+ },
13
+ "32001": {
14
+ "content": "÷",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": false
20
+ },
21
+ "32002": {
22
+ "content": "Á",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "32003": {
30
+ "content": "ý",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": false
36
+ },
37
+ "32004": {
38
+ "content": "À",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "32005": {
46
+ "content": "ÿ",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "32006": {
54
+ "content": "ø",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "32007": {
62
+ "content": "ú",
63
+ "lstrip": false,
64
+ "normalized": true,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "32008": {
70
+ "content": "þ",
71
+ "lstrip": false,
72
+ "normalized": true,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "32009": {
78
+ "content": "ü",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "32010": {
86
+ "content": "ù",
87
+ "lstrip": false,
88
+ "normalized": true,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "32011": {
94
+ "content": "ö",
95
+ "lstrip": false,
96
+ "normalized": true,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "32012": {
102
+ "content": "û",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "32013": {
110
+ "content": "<|begin▁of▁sentence|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "32014": {
118
+ "content": "<|end▁of▁sentence|>",
119
+ "lstrip": false,
120
+ "normalized": true,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "32015": {
126
+ "content": "<|fim▁hole|>",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "32016": {
134
+ "content": "<|fim▁begin|>",
135
+ "lstrip": false,
136
+ "normalized": true,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "32017": {
142
+ "content": "<|fim▁end|>",
143
+ "lstrip": false,
144
+ "normalized": true,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "32018": {
150
+ "content": "<pad>",
151
+ "lstrip": false,
152
+ "normalized": true,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "32019": {
158
+ "content": "<|User|>",
159
+ "lstrip": false,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "32020": {
166
+ "content": "<|Assistant|>",
167
+ "lstrip": false,
168
+ "normalized": true,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "32021": {
174
+ "content": "<|EOT|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": true
180
+ }
181
+ },
182
+ "bos_token": "<|begin▁of▁sentence|>",
183
+ "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}",
184
+ "clean_up_tokenization_spaces": false,
185
+ "eos_token": "<|EOT|>",
186
+ "legacy": true,
187
+ "model_max_length": 16384,
188
+ "pad_token": "<|end▁of▁sentence|>",
189
+ "sp_model_kwargs": {},
190
+ "tokenizer_class": "LlamaTokenizer",
191
+ "unk_token": null,
192
+ "use_default_system_prompt": false
193
+ }
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/trainer_state.json ADDED
@@ -0,0 +1,3210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9988925802879292,
5
+ "eval_steps": 113,
6
+ "global_step": 451,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "grad_norm": 3.428594393312656,
14
+ "learning_rate": 4.000000000000001e-06,
15
+ "loss": 2.1469,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.0,
20
+ "eval_loss": 2.179504632949829,
21
+ "eval_runtime": 172.7557,
22
+ "eval_samples_per_second": 13.447,
23
+ "eval_steps_per_second": 0.423,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.0,
28
+ "grad_norm": 3.7159828982580754,
29
+ "learning_rate": 8.000000000000001e-06,
30
+ "loss": 2.1946,
31
+ "step": 2
32
+ },
33
+ {
34
+ "epoch": 0.01,
35
+ "grad_norm": 3.643082734610657,
36
+ "learning_rate": 1.2e-05,
37
+ "loss": 2.232,
38
+ "step": 3
39
+ },
40
+ {
41
+ "epoch": 0.01,
42
+ "grad_norm": 3.75889245136616,
43
+ "learning_rate": 1.6000000000000003e-05,
44
+ "loss": 2.1482,
45
+ "step": 4
46
+ },
47
+ {
48
+ "epoch": 0.01,
49
+ "grad_norm": 3.121791718587376,
50
+ "learning_rate": 2e-05,
51
+ "loss": 1.9094,
52
+ "step": 5
53
+ },
54
+ {
55
+ "epoch": 0.01,
56
+ "grad_norm": 3.357880867767518,
57
+ "learning_rate": 2.4e-05,
58
+ "loss": 1.5871,
59
+ "step": 6
60
+ },
61
+ {
62
+ "epoch": 0.02,
63
+ "grad_norm": 3.946129682154882,
64
+ "learning_rate": 2.8000000000000003e-05,
65
+ "loss": 1.326,
66
+ "step": 7
67
+ },
68
+ {
69
+ "epoch": 0.02,
70
+ "grad_norm": 2.108395610487625,
71
+ "learning_rate": 3.2000000000000005e-05,
72
+ "loss": 1.0602,
73
+ "step": 8
74
+ },
75
+ {
76
+ "epoch": 0.02,
77
+ "grad_norm": 2.130447473758986,
78
+ "learning_rate": 3.6e-05,
79
+ "loss": 0.962,
80
+ "step": 9
81
+ },
82
+ {
83
+ "epoch": 0.02,
84
+ "grad_norm": 2.250625735269271,
85
+ "learning_rate": 4e-05,
86
+ "loss": 0.7239,
87
+ "step": 10
88
+ },
89
+ {
90
+ "epoch": 0.02,
91
+ "grad_norm": 2.086877732413652,
92
+ "learning_rate": 4.4000000000000006e-05,
93
+ "loss": 0.6204,
94
+ "step": 11
95
+ },
96
+ {
97
+ "epoch": 0.03,
98
+ "grad_norm": 1.1321791358110234,
99
+ "learning_rate": 4.8e-05,
100
+ "loss": 0.5483,
101
+ "step": 12
102
+ },
103
+ {
104
+ "epoch": 0.03,
105
+ "grad_norm": 0.8099243715089002,
106
+ "learning_rate": 5.2000000000000004e-05,
107
+ "loss": 0.5086,
108
+ "step": 13
109
+ },
110
+ {
111
+ "epoch": 0.03,
112
+ "grad_norm": 0.8079844588753853,
113
+ "learning_rate": 5.6000000000000006e-05,
114
+ "loss": 0.5112,
115
+ "step": 14
116
+ },
117
+ {
118
+ "epoch": 0.03,
119
+ "grad_norm": 0.9169759048137539,
120
+ "learning_rate": 6e-05,
121
+ "loss": 0.4889,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.04,
126
+ "grad_norm": 0.6917931287891316,
127
+ "learning_rate": 6.400000000000001e-05,
128
+ "loss": 0.4691,
129
+ "step": 16
130
+ },
131
+ {
132
+ "epoch": 0.04,
133
+ "grad_norm": 0.6294590670553853,
134
+ "learning_rate": 6.800000000000001e-05,
135
+ "loss": 0.4549,
136
+ "step": 17
137
+ },
138
+ {
139
+ "epoch": 0.04,
140
+ "grad_norm": 0.6001819771732335,
141
+ "learning_rate": 7.2e-05,
142
+ "loss": 0.4412,
143
+ "step": 18
144
+ },
145
+ {
146
+ "epoch": 0.04,
147
+ "grad_norm": 0.6389927422614288,
148
+ "learning_rate": 7.6e-05,
149
+ "loss": 0.4077,
150
+ "step": 19
151
+ },
152
+ {
153
+ "epoch": 0.04,
154
+ "grad_norm": 0.5185334799357709,
155
+ "learning_rate": 8e-05,
156
+ "loss": 0.4219,
157
+ "step": 20
158
+ },
159
+ {
160
+ "epoch": 0.05,
161
+ "grad_norm": 0.5274718517549238,
162
+ "learning_rate": 8.4e-05,
163
+ "loss": 0.4124,
164
+ "step": 21
165
+ },
166
+ {
167
+ "epoch": 0.05,
168
+ "grad_norm": 0.5333597700699511,
169
+ "learning_rate": 8.800000000000001e-05,
170
+ "loss": 0.4333,
171
+ "step": 22
172
+ },
173
+ {
174
+ "epoch": 0.05,
175
+ "grad_norm": 0.4817408646893783,
176
+ "learning_rate": 9.200000000000001e-05,
177
+ "loss": 0.4022,
178
+ "step": 23
179
+ },
180
+ {
181
+ "epoch": 0.05,
182
+ "grad_norm": 0.5041871581629092,
183
+ "learning_rate": 9.6e-05,
184
+ "loss": 0.3971,
185
+ "step": 24
186
+ },
187
+ {
188
+ "epoch": 0.06,
189
+ "grad_norm": 0.5892498993207921,
190
+ "learning_rate": 0.0001,
191
+ "loss": 0.3769,
192
+ "step": 25
193
+ },
194
+ {
195
+ "epoch": 0.06,
196
+ "grad_norm": 0.4448471314677223,
197
+ "learning_rate": 0.00010400000000000001,
198
+ "loss": 0.3905,
199
+ "step": 26
200
+ },
201
+ {
202
+ "epoch": 0.06,
203
+ "grad_norm": 0.41460492934076704,
204
+ "learning_rate": 0.00010800000000000001,
205
+ "loss": 0.3814,
206
+ "step": 27
207
+ },
208
+ {
209
+ "epoch": 0.06,
210
+ "grad_norm": 0.39759977947743247,
211
+ "learning_rate": 0.00011200000000000001,
212
+ "loss": 0.3675,
213
+ "step": 28
214
+ },
215
+ {
216
+ "epoch": 0.06,
217
+ "grad_norm": 0.42656614420804834,
218
+ "learning_rate": 0.000116,
219
+ "loss": 0.3949,
220
+ "step": 29
221
+ },
222
+ {
223
+ "epoch": 0.07,
224
+ "grad_norm": 0.3248894659390531,
225
+ "learning_rate": 0.00012,
226
+ "loss": 0.3734,
227
+ "step": 30
228
+ },
229
+ {
230
+ "epoch": 0.07,
231
+ "grad_norm": 0.35364315198613105,
232
+ "learning_rate": 0.000124,
233
+ "loss": 0.3452,
234
+ "step": 31
235
+ },
236
+ {
237
+ "epoch": 0.07,
238
+ "grad_norm": 0.3377660731261332,
239
+ "learning_rate": 0.00012800000000000002,
240
+ "loss": 0.365,
241
+ "step": 32
242
+ },
243
+ {
244
+ "epoch": 0.07,
245
+ "grad_norm": 0.3029616210555195,
246
+ "learning_rate": 0.000132,
247
+ "loss": 0.3502,
248
+ "step": 33
249
+ },
250
+ {
251
+ "epoch": 0.08,
252
+ "grad_norm": 0.43234275224940705,
253
+ "learning_rate": 0.00013600000000000003,
254
+ "loss": 0.3728,
255
+ "step": 34
256
+ },
257
+ {
258
+ "epoch": 0.08,
259
+ "grad_norm": 0.3831666942704003,
260
+ "learning_rate": 0.00014,
261
+ "loss": 0.3922,
262
+ "step": 35
263
+ },
264
+ {
265
+ "epoch": 0.08,
266
+ "grad_norm": 0.3421076533380305,
267
+ "learning_rate": 0.000144,
268
+ "loss": 0.3544,
269
+ "step": 36
270
+ },
271
+ {
272
+ "epoch": 0.08,
273
+ "grad_norm": 0.30146273904800347,
274
+ "learning_rate": 0.000148,
275
+ "loss": 0.3237,
276
+ "step": 37
277
+ },
278
+ {
279
+ "epoch": 0.08,
280
+ "grad_norm": 0.3234891150012619,
281
+ "learning_rate": 0.000152,
282
+ "loss": 0.3771,
283
+ "step": 38
284
+ },
285
+ {
286
+ "epoch": 0.09,
287
+ "grad_norm": 0.32536194802044366,
288
+ "learning_rate": 0.00015600000000000002,
289
+ "loss": 0.3719,
290
+ "step": 39
291
+ },
292
+ {
293
+ "epoch": 0.09,
294
+ "grad_norm": 0.32489341033312524,
295
+ "learning_rate": 0.00016,
296
+ "loss": 0.3716,
297
+ "step": 40
298
+ },
299
+ {
300
+ "epoch": 0.09,
301
+ "grad_norm": 0.3135978131489568,
302
+ "learning_rate": 0.000164,
303
+ "loss": 0.3818,
304
+ "step": 41
305
+ },
306
+ {
307
+ "epoch": 0.09,
308
+ "grad_norm": 0.28403149178513853,
309
+ "learning_rate": 0.000168,
310
+ "loss": 0.3576,
311
+ "step": 42
312
+ },
313
+ {
314
+ "epoch": 0.1,
315
+ "grad_norm": 0.30747408490186307,
316
+ "learning_rate": 0.000172,
317
+ "loss": 0.3801,
318
+ "step": 43
319
+ },
320
+ {
321
+ "epoch": 0.1,
322
+ "grad_norm": 0.3491185114838396,
323
+ "learning_rate": 0.00017600000000000002,
324
+ "loss": 0.3612,
325
+ "step": 44
326
+ },
327
+ {
328
+ "epoch": 0.1,
329
+ "grad_norm": 0.4062709802932099,
330
+ "learning_rate": 0.00018,
331
+ "loss": 0.3752,
332
+ "step": 45
333
+ },
334
+ {
335
+ "epoch": 0.1,
336
+ "grad_norm": 0.4174448098463489,
337
+ "learning_rate": 0.00018400000000000003,
338
+ "loss": 0.3705,
339
+ "step": 46
340
+ },
341
+ {
342
+ "epoch": 0.1,
343
+ "grad_norm": 0.30034892483536746,
344
+ "learning_rate": 0.000188,
345
+ "loss": 0.3504,
346
+ "step": 47
347
+ },
348
+ {
349
+ "epoch": 0.11,
350
+ "grad_norm": 0.24699047743823555,
351
+ "learning_rate": 0.000192,
352
+ "loss": 0.3622,
353
+ "step": 48
354
+ },
355
+ {
356
+ "epoch": 0.11,
357
+ "grad_norm": 0.2944161708858025,
358
+ "learning_rate": 0.000196,
359
+ "loss": 0.3462,
360
+ "step": 49
361
+ },
362
+ {
363
+ "epoch": 0.11,
364
+ "grad_norm": 0.24968532785704994,
365
+ "learning_rate": 0.0002,
366
+ "loss": 0.3074,
367
+ "step": 50
368
+ },
369
+ {
370
+ "epoch": 0.11,
371
+ "grad_norm": 0.4989866478305154,
372
+ "learning_rate": 0.00019999932018605637,
373
+ "loss": 0.3426,
374
+ "step": 51
375
+ },
376
+ {
377
+ "epoch": 0.12,
378
+ "grad_norm": 0.2729667152979656,
379
+ "learning_rate": 0.00019999728075346845,
380
+ "loss": 0.3452,
381
+ "step": 52
382
+ },
383
+ {
384
+ "epoch": 0.12,
385
+ "grad_norm": 0.26553719135078474,
386
+ "learning_rate": 0.00019999388172996495,
387
+ "loss": 0.3771,
388
+ "step": 53
389
+ },
390
+ {
391
+ "epoch": 0.12,
392
+ "grad_norm": 0.2563892875653707,
393
+ "learning_rate": 0.0001999891231617599,
394
+ "loss": 0.3709,
395
+ "step": 54
396
+ },
397
+ {
398
+ "epoch": 0.12,
399
+ "grad_norm": 0.40493567306146466,
400
+ "learning_rate": 0.0001999830051135521,
401
+ "loss": 0.3659,
402
+ "step": 55
403
+ },
404
+ {
405
+ "epoch": 0.12,
406
+ "grad_norm": 0.282074263507646,
407
+ "learning_rate": 0.00019997552766852432,
408
+ "loss": 0.3579,
409
+ "step": 56
410
+ },
411
+ {
412
+ "epoch": 0.13,
413
+ "grad_norm": 0.24830919197340875,
414
+ "learning_rate": 0.00019996669092834193,
415
+ "loss": 0.3164,
416
+ "step": 57
417
+ },
418
+ {
419
+ "epoch": 0.13,
420
+ "grad_norm": 0.31153665113975915,
421
+ "learning_rate": 0.0001999564950131517,
422
+ "loss": 0.3554,
423
+ "step": 58
424
+ },
425
+ {
426
+ "epoch": 0.13,
427
+ "grad_norm": 0.2883397422934534,
428
+ "learning_rate": 0.00019994494006158017,
429
+ "loss": 0.3318,
430
+ "step": 59
431
+ },
432
+ {
433
+ "epoch": 0.13,
434
+ "grad_norm": 0.29607763624002476,
435
+ "learning_rate": 0.00019993202623073172,
436
+ "loss": 0.3515,
437
+ "step": 60
438
+ },
439
+ {
440
+ "epoch": 0.14,
441
+ "grad_norm": 0.26631042457457627,
442
+ "learning_rate": 0.0001999177536961863,
443
+ "loss": 0.3404,
444
+ "step": 61
445
+ },
446
+ {
447
+ "epoch": 0.14,
448
+ "grad_norm": 0.30709845993919116,
449
+ "learning_rate": 0.00019990212265199738,
450
+ "loss": 0.3454,
451
+ "step": 62
452
+ },
453
+ {
454
+ "epoch": 0.14,
455
+ "grad_norm": 0.2689959836388791,
456
+ "learning_rate": 0.0001998851333106889,
457
+ "loss": 0.3448,
458
+ "step": 63
459
+ },
460
+ {
461
+ "epoch": 0.14,
462
+ "grad_norm": 0.28079893446027976,
463
+ "learning_rate": 0.00019986678590325273,
464
+ "loss": 0.3537,
465
+ "step": 64
466
+ },
467
+ {
468
+ "epoch": 0.14,
469
+ "grad_norm": 0.2777161046281277,
470
+ "learning_rate": 0.00019984708067914532,
471
+ "loss": 0.3512,
472
+ "step": 65
473
+ },
474
+ {
475
+ "epoch": 0.15,
476
+ "grad_norm": 0.2612936496089595,
477
+ "learning_rate": 0.0001998260179062844,
478
+ "loss": 0.3355,
479
+ "step": 66
480
+ },
481
+ {
482
+ "epoch": 0.15,
483
+ "grad_norm": 0.270747618988818,
484
+ "learning_rate": 0.0001998035978710453,
485
+ "loss": 0.3557,
486
+ "step": 67
487
+ },
488
+ {
489
+ "epoch": 0.15,
490
+ "grad_norm": 0.2524425572972277,
491
+ "learning_rate": 0.00019977982087825713,
492
+ "loss": 0.324,
493
+ "step": 68
494
+ },
495
+ {
496
+ "epoch": 0.15,
497
+ "grad_norm": 0.2655824178335376,
498
+ "learning_rate": 0.00019975468725119843,
499
+ "loss": 0.3274,
500
+ "step": 69
501
+ },
502
+ {
503
+ "epoch": 0.16,
504
+ "grad_norm": 0.29931436509996834,
505
+ "learning_rate": 0.000199728197331593,
506
+ "loss": 0.3733,
507
+ "step": 70
508
+ },
509
+ {
510
+ "epoch": 0.16,
511
+ "grad_norm": 0.2945779980864737,
512
+ "learning_rate": 0.00019970035147960524,
513
+ "loss": 0.3161,
514
+ "step": 71
515
+ },
516
+ {
517
+ "epoch": 0.16,
518
+ "grad_norm": 0.42061603746317366,
519
+ "learning_rate": 0.00019967115007383507,
520
+ "loss": 0.3486,
521
+ "step": 72
522
+ },
523
+ {
524
+ "epoch": 0.16,
525
+ "grad_norm": 0.24899746628510128,
526
+ "learning_rate": 0.000199640593511313,
527
+ "loss": 0.3428,
528
+ "step": 73
529
+ },
530
+ {
531
+ "epoch": 0.16,
532
+ "grad_norm": 0.2748778960772334,
533
+ "learning_rate": 0.00019960868220749448,
534
+ "loss": 0.3215,
535
+ "step": 74
536
+ },
537
+ {
538
+ "epoch": 0.17,
539
+ "grad_norm": 0.2826057447583215,
540
+ "learning_rate": 0.00019957541659625458,
541
+ "loss": 0.3663,
542
+ "step": 75
543
+ },
544
+ {
545
+ "epoch": 0.17,
546
+ "grad_norm": 0.26758814883425874,
547
+ "learning_rate": 0.00019954079712988183,
548
+ "loss": 0.3473,
549
+ "step": 76
550
+ },
551
+ {
552
+ "epoch": 0.17,
553
+ "grad_norm": 0.26191530390935236,
554
+ "learning_rate": 0.00019950482427907211,
555
+ "loss": 0.3464,
556
+ "step": 77
557
+ },
558
+ {
559
+ "epoch": 0.17,
560
+ "grad_norm": 0.27009995213133564,
561
+ "learning_rate": 0.00019946749853292232,
562
+ "loss": 0.3427,
563
+ "step": 78
564
+ },
565
+ {
566
+ "epoch": 0.17,
567
+ "grad_norm": 0.2828252836889427,
568
+ "learning_rate": 0.00019942882039892377,
569
+ "loss": 0.3369,
570
+ "step": 79
571
+ },
572
+ {
573
+ "epoch": 0.18,
574
+ "grad_norm": 0.28359570095347486,
575
+ "learning_rate": 0.00019938879040295508,
576
+ "loss": 0.3474,
577
+ "step": 80
578
+ },
579
+ {
580
+ "epoch": 0.18,
581
+ "grad_norm": 0.23726568176198512,
582
+ "learning_rate": 0.0001993474090892753,
583
+ "loss": 0.3348,
584
+ "step": 81
585
+ },
586
+ {
587
+ "epoch": 0.18,
588
+ "grad_norm": 0.28359002617947426,
589
+ "learning_rate": 0.00019930467702051628,
590
+ "loss": 0.3434,
591
+ "step": 82
592
+ },
593
+ {
594
+ "epoch": 0.18,
595
+ "grad_norm": 0.24507300294017476,
596
+ "learning_rate": 0.0001992605947776752,
597
+ "loss": 0.3211,
598
+ "step": 83
599
+ },
600
+ {
601
+ "epoch": 0.19,
602
+ "grad_norm": 0.2459171433488338,
603
+ "learning_rate": 0.00019921516296010644,
604
+ "loss": 0.3539,
605
+ "step": 84
606
+ },
607
+ {
608
+ "epoch": 0.19,
609
+ "grad_norm": 0.23982609581354497,
610
+ "learning_rate": 0.0001991683821855137,
611
+ "loss": 0.3367,
612
+ "step": 85
613
+ },
614
+ {
615
+ "epoch": 0.19,
616
+ "grad_norm": 0.24948097532302946,
617
+ "learning_rate": 0.00019912025308994148,
618
+ "loss": 0.3313,
619
+ "step": 86
620
+ },
621
+ {
622
+ "epoch": 0.19,
623
+ "grad_norm": 0.25673241215001325,
624
+ "learning_rate": 0.00019907077632776632,
625
+ "loss": 0.3384,
626
+ "step": 87
627
+ },
628
+ {
629
+ "epoch": 0.19,
630
+ "grad_norm": 0.2248520328170477,
631
+ "learning_rate": 0.00019901995257168807,
632
+ "loss": 0.3075,
633
+ "step": 88
634
+ },
635
+ {
636
+ "epoch": 0.2,
637
+ "grad_norm": 0.2785744560779341,
638
+ "learning_rate": 0.00019896778251272078,
639
+ "loss": 0.3505,
640
+ "step": 89
641
+ },
642
+ {
643
+ "epoch": 0.2,
644
+ "grad_norm": 0.2612264615045669,
645
+ "learning_rate": 0.00019891426686018305,
646
+ "loss": 0.3319,
647
+ "step": 90
648
+ },
649
+ {
650
+ "epoch": 0.2,
651
+ "grad_norm": 0.2588107693763323,
652
+ "learning_rate": 0.00019885940634168864,
653
+ "loss": 0.3036,
654
+ "step": 91
655
+ },
656
+ {
657
+ "epoch": 0.2,
658
+ "grad_norm": 0.2539991795599821,
659
+ "learning_rate": 0.0001988032017031364,
660
+ "loss": 0.3275,
661
+ "step": 92
662
+ },
663
+ {
664
+ "epoch": 0.21,
665
+ "grad_norm": 0.2228639788098274,
666
+ "learning_rate": 0.00019874565370870038,
667
+ "loss": 0.3128,
668
+ "step": 93
669
+ },
670
+ {
671
+ "epoch": 0.21,
672
+ "grad_norm": 0.2295330254763288,
673
+ "learning_rate": 0.00019868676314081904,
674
+ "loss": 0.3226,
675
+ "step": 94
676
+ },
677
+ {
678
+ "epoch": 0.21,
679
+ "grad_norm": 0.26774082666954935,
680
+ "learning_rate": 0.00019862653080018506,
681
+ "loss": 0.3438,
682
+ "step": 95
683
+ },
684
+ {
685
+ "epoch": 0.21,
686
+ "grad_norm": 0.2096704558396889,
687
+ "learning_rate": 0.0001985649575057341,
688
+ "loss": 0.3158,
689
+ "step": 96
690
+ },
691
+ {
692
+ "epoch": 0.21,
693
+ "grad_norm": 0.222343383957648,
694
+ "learning_rate": 0.00019850204409463385,
695
+ "loss": 0.3127,
696
+ "step": 97
697
+ },
698
+ {
699
+ "epoch": 0.22,
700
+ "grad_norm": 0.24355910663526567,
701
+ "learning_rate": 0.00019843779142227256,
702
+ "loss": 0.3278,
703
+ "step": 98
704
+ },
705
+ {
706
+ "epoch": 0.22,
707
+ "grad_norm": 0.2589007730313338,
708
+ "learning_rate": 0.00019837220036224756,
709
+ "loss": 0.3433,
710
+ "step": 99
711
+ },
712
+ {
713
+ "epoch": 0.22,
714
+ "grad_norm": 0.2048208439355147,
715
+ "learning_rate": 0.00019830527180635308,
716
+ "loss": 0.3038,
717
+ "step": 100
718
+ },
719
+ {
720
+ "epoch": 0.22,
721
+ "grad_norm": 0.22100911829011585,
722
+ "learning_rate": 0.00019823700666456853,
723
+ "loss": 0.3295,
724
+ "step": 101
725
+ },
726
+ {
727
+ "epoch": 0.23,
728
+ "grad_norm": 0.2865876628348663,
729
+ "learning_rate": 0.0001981674058650458,
730
+ "loss": 0.3374,
731
+ "step": 102
732
+ },
733
+ {
734
+ "epoch": 0.23,
735
+ "grad_norm": 0.23267706317431597,
736
+ "learning_rate": 0.00019809647035409672,
737
+ "loss": 0.3146,
738
+ "step": 103
739
+ },
740
+ {
741
+ "epoch": 0.23,
742
+ "grad_norm": 0.22896912398059824,
743
+ "learning_rate": 0.0001980242010961803,
744
+ "loss": 0.3302,
745
+ "step": 104
746
+ },
747
+ {
748
+ "epoch": 0.23,
749
+ "grad_norm": 0.21199810980682918,
750
+ "learning_rate": 0.00019795059907388952,
751
+ "loss": 0.3089,
752
+ "step": 105
753
+ },
754
+ {
755
+ "epoch": 0.23,
756
+ "grad_norm": 0.2314314536279127,
757
+ "learning_rate": 0.00019787566528793807,
758
+ "loss": 0.3219,
759
+ "step": 106
760
+ },
761
+ {
762
+ "epoch": 0.24,
763
+ "grad_norm": 0.22578014749699987,
764
+ "learning_rate": 0.00019779940075714648,
765
+ "loss": 0.3089,
766
+ "step": 107
767
+ },
768
+ {
769
+ "epoch": 0.24,
770
+ "grad_norm": 0.26739038717214403,
771
+ "learning_rate": 0.0001977218065184287,
772
+ "loss": 0.3314,
773
+ "step": 108
774
+ },
775
+ {
776
+ "epoch": 0.24,
777
+ "grad_norm": 0.27640406493967007,
778
+ "learning_rate": 0.00019764288362677753,
779
+ "loss": 0.315,
780
+ "step": 109
781
+ },
782
+ {
783
+ "epoch": 0.24,
784
+ "grad_norm": 0.275170691575301,
785
+ "learning_rate": 0.0001975626331552507,
786
+ "loss": 0.3192,
787
+ "step": 110
788
+ },
789
+ {
790
+ "epoch": 0.25,
791
+ "grad_norm": 0.28590415176453904,
792
+ "learning_rate": 0.00019748105619495594,
793
+ "loss": 0.3235,
794
+ "step": 111
795
+ },
796
+ {
797
+ "epoch": 0.25,
798
+ "grad_norm": 0.26618886451702856,
799
+ "learning_rate": 0.0001973981538550364,
800
+ "loss": 0.3235,
801
+ "step": 112
802
+ },
803
+ {
804
+ "epoch": 0.25,
805
+ "grad_norm": 0.22783361042781863,
806
+ "learning_rate": 0.00019731392726265537,
807
+ "loss": 0.3319,
808
+ "step": 113
809
+ },
810
+ {
811
+ "epoch": 0.25,
812
+ "eval_loss": 0.3324408233165741,
813
+ "eval_runtime": 173.7614,
814
+ "eval_samples_per_second": 13.369,
815
+ "eval_steps_per_second": 0.42,
816
+ "step": 113
817
+ },
818
+ {
819
+ "epoch": 0.25,
820
+ "grad_norm": 0.25003343596979083,
821
+ "learning_rate": 0.00019722837756298113,
822
+ "loss": 0.3269,
823
+ "step": 114
824
+ },
825
+ {
826
+ "epoch": 0.25,
827
+ "grad_norm": 0.2051643008869916,
828
+ "learning_rate": 0.0001971415059191712,
829
+ "loss": 0.3331,
830
+ "step": 115
831
+ },
832
+ {
833
+ "epoch": 0.26,
834
+ "grad_norm": 0.26679577582874325,
835
+ "learning_rate": 0.00019705331351235674,
836
+ "loss": 0.319,
837
+ "step": 116
838
+ },
839
+ {
840
+ "epoch": 0.26,
841
+ "grad_norm": 0.22959410656566043,
842
+ "learning_rate": 0.0001969638015416263,
843
+ "loss": 0.326,
844
+ "step": 117
845
+ },
846
+ {
847
+ "epoch": 0.26,
848
+ "grad_norm": 0.24732040292675736,
849
+ "learning_rate": 0.0001968729712240095,
850
+ "loss": 0.3456,
851
+ "step": 118
852
+ },
853
+ {
854
+ "epoch": 0.26,
855
+ "grad_norm": 0.21472772586501115,
856
+ "learning_rate": 0.00019678082379446078,
857
+ "loss": 0.3154,
858
+ "step": 119
859
+ },
860
+ {
861
+ "epoch": 0.27,
862
+ "grad_norm": 0.22004349633491344,
863
+ "learning_rate": 0.00019668736050584224,
864
+ "loss": 0.3296,
865
+ "step": 120
866
+ },
867
+ {
868
+ "epoch": 0.27,
869
+ "grad_norm": 0.20481829759367912,
870
+ "learning_rate": 0.00019659258262890683,
871
+ "loss": 0.3168,
872
+ "step": 121
873
+ },
874
+ {
875
+ "epoch": 0.27,
876
+ "grad_norm": 0.22596082871640896,
877
+ "learning_rate": 0.00019649649145228102,
878
+ "loss": 0.3198,
879
+ "step": 122
880
+ },
881
+ {
882
+ "epoch": 0.27,
883
+ "grad_norm": 0.22416198789548378,
884
+ "learning_rate": 0.00019639908828244718,
885
+ "loss": 0.3122,
886
+ "step": 123
887
+ },
888
+ {
889
+ "epoch": 0.27,
890
+ "grad_norm": 0.25019709447245486,
891
+ "learning_rate": 0.000196300374443726,
892
+ "loss": 0.3256,
893
+ "step": 124
894
+ },
895
+ {
896
+ "epoch": 0.28,
897
+ "grad_norm": 0.23470807250735432,
898
+ "learning_rate": 0.0001962003512782584,
899
+ "loss": 0.3245,
900
+ "step": 125
901
+ },
902
+ {
903
+ "epoch": 0.28,
904
+ "grad_norm": 0.24693935678122386,
905
+ "learning_rate": 0.00019609902014598718,
906
+ "loss": 0.3292,
907
+ "step": 126
908
+ },
909
+ {
910
+ "epoch": 0.28,
911
+ "grad_norm": 0.2260534058582815,
912
+ "learning_rate": 0.00019599638242463868,
913
+ "loss": 0.325,
914
+ "step": 127
915
+ },
916
+ {
917
+ "epoch": 0.28,
918
+ "grad_norm": 0.23667285933893983,
919
+ "learning_rate": 0.00019589243950970402,
920
+ "loss": 0.3337,
921
+ "step": 128
922
+ },
923
+ {
924
+ "epoch": 0.29,
925
+ "grad_norm": 0.21298574370036155,
926
+ "learning_rate": 0.00019578719281442003,
927
+ "loss": 0.3092,
928
+ "step": 129
929
+ },
930
+ {
931
+ "epoch": 0.29,
932
+ "grad_norm": 0.23372716379048458,
933
+ "learning_rate": 0.00019568064376975012,
934
+ "loss": 0.317,
935
+ "step": 130
936
+ },
937
+ {
938
+ "epoch": 0.29,
939
+ "grad_norm": 0.22334413345013637,
940
+ "learning_rate": 0.0001955727938243648,
941
+ "loss": 0.2834,
942
+ "step": 131
943
+ },
944
+ {
945
+ "epoch": 0.29,
946
+ "grad_norm": 0.2248381817587477,
947
+ "learning_rate": 0.00019546364444462207,
948
+ "loss": 0.302,
949
+ "step": 132
950
+ },
951
+ {
952
+ "epoch": 0.29,
953
+ "grad_norm": 0.2360215129073668,
954
+ "learning_rate": 0.00019535319711454728,
955
+ "loss": 0.3293,
956
+ "step": 133
957
+ },
958
+ {
959
+ "epoch": 0.3,
960
+ "grad_norm": 0.22623309491159815,
961
+ "learning_rate": 0.00019524145333581317,
962
+ "loss": 0.3273,
963
+ "step": 134
964
+ },
965
+ {
966
+ "epoch": 0.3,
967
+ "grad_norm": 0.2070555672669519,
968
+ "learning_rate": 0.00019512841462771924,
969
+ "loss": 0.2972,
970
+ "step": 135
971
+ },
972
+ {
973
+ "epoch": 0.3,
974
+ "grad_norm": 0.22460201914808312,
975
+ "learning_rate": 0.00019501408252717138,
976
+ "loss": 0.3019,
977
+ "step": 136
978
+ },
979
+ {
980
+ "epoch": 0.3,
981
+ "grad_norm": 0.2225333175600629,
982
+ "learning_rate": 0.00019489845858866066,
983
+ "loss": 0.2983,
984
+ "step": 137
985
+ },
986
+ {
987
+ "epoch": 0.31,
988
+ "grad_norm": 0.21017634852282913,
989
+ "learning_rate": 0.0001947815443842424,
990
+ "loss": 0.3002,
991
+ "step": 138
992
+ },
993
+ {
994
+ "epoch": 0.31,
995
+ "grad_norm": 0.24382065213677345,
996
+ "learning_rate": 0.00019466334150351476,
997
+ "loss": 0.3061,
998
+ "step": 139
999
+ },
1000
+ {
1001
+ "epoch": 0.31,
1002
+ "grad_norm": 0.22222229848717998,
1003
+ "learning_rate": 0.00019454385155359702,
1004
+ "loss": 0.3189,
1005
+ "step": 140
1006
+ },
1007
+ {
1008
+ "epoch": 0.31,
1009
+ "grad_norm": 0.21144991906498956,
1010
+ "learning_rate": 0.00019442307615910793,
1011
+ "loss": 0.3093,
1012
+ "step": 141
1013
+ },
1014
+ {
1015
+ "epoch": 0.31,
1016
+ "grad_norm": 0.20966684015092005,
1017
+ "learning_rate": 0.00019430101696214336,
1018
+ "loss": 0.2975,
1019
+ "step": 142
1020
+ },
1021
+ {
1022
+ "epoch": 0.32,
1023
+ "grad_norm": 0.2285163015371888,
1024
+ "learning_rate": 0.0001941776756222542,
1025
+ "loss": 0.3291,
1026
+ "step": 143
1027
+ },
1028
+ {
1029
+ "epoch": 0.32,
1030
+ "grad_norm": 0.22173438888731478,
1031
+ "learning_rate": 0.00019405305381642375,
1032
+ "loss": 0.3052,
1033
+ "step": 144
1034
+ },
1035
+ {
1036
+ "epoch": 0.32,
1037
+ "grad_norm": 0.23472615797287633,
1038
+ "learning_rate": 0.00019392715323904481,
1039
+ "loss": 0.3158,
1040
+ "step": 145
1041
+ },
1042
+ {
1043
+ "epoch": 0.32,
1044
+ "grad_norm": 0.19206597044565454,
1045
+ "learning_rate": 0.00019379997560189675,
1046
+ "loss": 0.3047,
1047
+ "step": 146
1048
+ },
1049
+ {
1050
+ "epoch": 0.33,
1051
+ "grad_norm": 0.21928888599234125,
1052
+ "learning_rate": 0.00019367152263412217,
1053
+ "loss": 0.3196,
1054
+ "step": 147
1055
+ },
1056
+ {
1057
+ "epoch": 0.33,
1058
+ "grad_norm": 0.21469742386937682,
1059
+ "learning_rate": 0.00019354179608220348,
1060
+ "loss": 0.2981,
1061
+ "step": 148
1062
+ },
1063
+ {
1064
+ "epoch": 0.33,
1065
+ "grad_norm": 0.39037984784670976,
1066
+ "learning_rate": 0.000193410797709939,
1067
+ "loss": 0.2962,
1068
+ "step": 149
1069
+ },
1070
+ {
1071
+ "epoch": 0.33,
1072
+ "grad_norm": 0.3224683875107671,
1073
+ "learning_rate": 0.00019327852929841916,
1074
+ "loss": 0.3268,
1075
+ "step": 150
1076
+ },
1077
+ {
1078
+ "epoch": 0.33,
1079
+ "grad_norm": 0.22377973835731155,
1080
+ "learning_rate": 0.0001931449926460022,
1081
+ "loss": 0.3093,
1082
+ "step": 151
1083
+ },
1084
+ {
1085
+ "epoch": 0.34,
1086
+ "grad_norm": 0.21525051451055552,
1087
+ "learning_rate": 0.00019301018956828964,
1088
+ "loss": 0.3173,
1089
+ "step": 152
1090
+ },
1091
+ {
1092
+ "epoch": 0.34,
1093
+ "grad_norm": 0.19693490433987484,
1094
+ "learning_rate": 0.00019287412189810172,
1095
+ "loss": 0.3069,
1096
+ "step": 153
1097
+ },
1098
+ {
1099
+ "epoch": 0.34,
1100
+ "grad_norm": 0.1985719221741534,
1101
+ "learning_rate": 0.00019273679148545245,
1102
+ "loss": 0.3239,
1103
+ "step": 154
1104
+ },
1105
+ {
1106
+ "epoch": 0.34,
1107
+ "grad_norm": 0.19043619193681682,
1108
+ "learning_rate": 0.00019259820019752443,
1109
+ "loss": 0.3041,
1110
+ "step": 155
1111
+ },
1112
+ {
1113
+ "epoch": 0.35,
1114
+ "grad_norm": 0.2029259146228578,
1115
+ "learning_rate": 0.0001924583499186434,
1116
+ "loss": 0.3049,
1117
+ "step": 156
1118
+ },
1119
+ {
1120
+ "epoch": 0.35,
1121
+ "grad_norm": 0.1913024173068621,
1122
+ "learning_rate": 0.00019231724255025284,
1123
+ "loss": 0.3263,
1124
+ "step": 157
1125
+ },
1126
+ {
1127
+ "epoch": 0.35,
1128
+ "grad_norm": 0.20418535912509125,
1129
+ "learning_rate": 0.00019217488001088784,
1130
+ "loss": 0.3093,
1131
+ "step": 158
1132
+ },
1133
+ {
1134
+ "epoch": 0.35,
1135
+ "grad_norm": 0.21506493193668452,
1136
+ "learning_rate": 0.00019203126423614916,
1137
+ "loss": 0.3386,
1138
+ "step": 159
1139
+ },
1140
+ {
1141
+ "epoch": 0.35,
1142
+ "grad_norm": 0.2078781708947467,
1143
+ "learning_rate": 0.00019188639717867696,
1144
+ "loss": 0.3098,
1145
+ "step": 160
1146
+ },
1147
+ {
1148
+ "epoch": 0.36,
1149
+ "grad_norm": 0.21203565844547,
1150
+ "learning_rate": 0.00019174028080812415,
1151
+ "loss": 0.3245,
1152
+ "step": 161
1153
+ },
1154
+ {
1155
+ "epoch": 0.36,
1156
+ "grad_norm": 0.22944231505967116,
1157
+ "learning_rate": 0.0001915929171111296,
1158
+ "loss": 0.3174,
1159
+ "step": 162
1160
+ },
1161
+ {
1162
+ "epoch": 0.36,
1163
+ "grad_norm": 0.19389795184150505,
1164
+ "learning_rate": 0.00019144430809129128,
1165
+ "loss": 0.2985,
1166
+ "step": 163
1167
+ },
1168
+ {
1169
+ "epoch": 0.36,
1170
+ "grad_norm": 0.2294568217196969,
1171
+ "learning_rate": 0.00019129445576913888,
1172
+ "loss": 0.2916,
1173
+ "step": 164
1174
+ },
1175
+ {
1176
+ "epoch": 0.37,
1177
+ "grad_norm": 0.21471312626755512,
1178
+ "learning_rate": 0.00019114336218210634,
1179
+ "loss": 0.3203,
1180
+ "step": 165
1181
+ },
1182
+ {
1183
+ "epoch": 0.37,
1184
+ "grad_norm": 0.200256096520887,
1185
+ "learning_rate": 0.00019099102938450416,
1186
+ "loss": 0.314,
1187
+ "step": 166
1188
+ },
1189
+ {
1190
+ "epoch": 0.37,
1191
+ "grad_norm": 0.20286230376549838,
1192
+ "learning_rate": 0.00019083745944749162,
1193
+ "loss": 0.2953,
1194
+ "step": 167
1195
+ },
1196
+ {
1197
+ "epoch": 0.37,
1198
+ "grad_norm": 0.22752630009694177,
1199
+ "learning_rate": 0.00019068265445904836,
1200
+ "loss": 0.3098,
1201
+ "step": 168
1202
+ },
1203
+ {
1204
+ "epoch": 0.37,
1205
+ "grad_norm": 0.1969764402722988,
1206
+ "learning_rate": 0.00019052661652394618,
1207
+ "loss": 0.2798,
1208
+ "step": 169
1209
+ },
1210
+ {
1211
+ "epoch": 0.38,
1212
+ "grad_norm": 0.21419927863312113,
1213
+ "learning_rate": 0.0001903693477637204,
1214
+ "loss": 0.315,
1215
+ "step": 170
1216
+ },
1217
+ {
1218
+ "epoch": 0.38,
1219
+ "grad_norm": 0.1978129640154948,
1220
+ "learning_rate": 0.00019021085031664087,
1221
+ "loss": 0.2769,
1222
+ "step": 171
1223
+ },
1224
+ {
1225
+ "epoch": 0.38,
1226
+ "grad_norm": 0.1832605646042805,
1227
+ "learning_rate": 0.00019005112633768313,
1228
+ "loss": 0.2787,
1229
+ "step": 172
1230
+ },
1231
+ {
1232
+ "epoch": 0.38,
1233
+ "grad_norm": 0.25130865190581947,
1234
+ "learning_rate": 0.00018989017799849896,
1235
+ "loss": 0.3042,
1236
+ "step": 173
1237
+ },
1238
+ {
1239
+ "epoch": 0.39,
1240
+ "grad_norm": 0.22333188194958617,
1241
+ "learning_rate": 0.0001897280074873868,
1242
+ "loss": 0.3366,
1243
+ "step": 174
1244
+ },
1245
+ {
1246
+ "epoch": 0.39,
1247
+ "grad_norm": 0.23303955610127655,
1248
+ "learning_rate": 0.00018956461700926215,
1249
+ "loss": 0.3069,
1250
+ "step": 175
1251
+ },
1252
+ {
1253
+ "epoch": 0.39,
1254
+ "grad_norm": 0.21946538873928723,
1255
+ "learning_rate": 0.00018940000878562758,
1256
+ "loss": 0.3026,
1257
+ "step": 176
1258
+ },
1259
+ {
1260
+ "epoch": 0.39,
1261
+ "grad_norm": 0.22034016868322132,
1262
+ "learning_rate": 0.00018923418505454237,
1263
+ "loss": 0.3031,
1264
+ "step": 177
1265
+ },
1266
+ {
1267
+ "epoch": 0.39,
1268
+ "grad_norm": 0.1918273691770089,
1269
+ "learning_rate": 0.00018906714807059218,
1270
+ "loss": 0.287,
1271
+ "step": 178
1272
+ },
1273
+ {
1274
+ "epoch": 0.4,
1275
+ "grad_norm": 0.1984010069624954,
1276
+ "learning_rate": 0.00018889890010485847,
1277
+ "loss": 0.3039,
1278
+ "step": 179
1279
+ },
1280
+ {
1281
+ "epoch": 0.4,
1282
+ "grad_norm": 0.19139523377149348,
1283
+ "learning_rate": 0.00018872944344488747,
1284
+ "loss": 0.3152,
1285
+ "step": 180
1286
+ },
1287
+ {
1288
+ "epoch": 0.4,
1289
+ "grad_norm": 0.18841860181390324,
1290
+ "learning_rate": 0.0001885587803946592,
1291
+ "loss": 0.3171,
1292
+ "step": 181
1293
+ },
1294
+ {
1295
+ "epoch": 0.4,
1296
+ "grad_norm": 0.19736334769102976,
1297
+ "learning_rate": 0.0001883869132745561,
1298
+ "loss": 0.2851,
1299
+ "step": 182
1300
+ },
1301
+ {
1302
+ "epoch": 0.41,
1303
+ "grad_norm": 0.19610383893082234,
1304
+ "learning_rate": 0.00018821384442133145,
1305
+ "loss": 0.307,
1306
+ "step": 183
1307
+ },
1308
+ {
1309
+ "epoch": 0.41,
1310
+ "grad_norm": 0.19693236953217128,
1311
+ "learning_rate": 0.00018803957618807764,
1312
+ "loss": 0.3219,
1313
+ "step": 184
1314
+ },
1315
+ {
1316
+ "epoch": 0.41,
1317
+ "grad_norm": 0.1978461815572098,
1318
+ "learning_rate": 0.0001878641109441942,
1319
+ "loss": 0.2936,
1320
+ "step": 185
1321
+ },
1322
+ {
1323
+ "epoch": 0.41,
1324
+ "grad_norm": 0.22453262012783057,
1325
+ "learning_rate": 0.00018768745107535542,
1326
+ "loss": 0.3225,
1327
+ "step": 186
1328
+ },
1329
+ {
1330
+ "epoch": 0.41,
1331
+ "grad_norm": 0.1951859636493649,
1332
+ "learning_rate": 0.00018750959898347825,
1333
+ "loss": 0.2892,
1334
+ "step": 187
1335
+ },
1336
+ {
1337
+ "epoch": 0.42,
1338
+ "grad_norm": 0.19051492201816908,
1339
+ "learning_rate": 0.00018733055708668926,
1340
+ "loss": 0.2922,
1341
+ "step": 188
1342
+ },
1343
+ {
1344
+ "epoch": 0.42,
1345
+ "grad_norm": 0.19631810708829814,
1346
+ "learning_rate": 0.00018715032781929208,
1347
+ "loss": 0.2928,
1348
+ "step": 189
1349
+ },
1350
+ {
1351
+ "epoch": 0.42,
1352
+ "grad_norm": 0.20533375929006709,
1353
+ "learning_rate": 0.00018696891363173405,
1354
+ "loss": 0.3212,
1355
+ "step": 190
1356
+ },
1357
+ {
1358
+ "epoch": 0.42,
1359
+ "grad_norm": 0.21698315667726276,
1360
+ "learning_rate": 0.00018678631699057302,
1361
+ "loss": 0.3419,
1362
+ "step": 191
1363
+ },
1364
+ {
1365
+ "epoch": 0.43,
1366
+ "grad_norm": 0.23742145444149265,
1367
+ "learning_rate": 0.00018660254037844388,
1368
+ "loss": 0.3081,
1369
+ "step": 192
1370
+ },
1371
+ {
1372
+ "epoch": 0.43,
1373
+ "grad_norm": 0.22429285552735956,
1374
+ "learning_rate": 0.00018641758629402467,
1375
+ "loss": 0.3132,
1376
+ "step": 193
1377
+ },
1378
+ {
1379
+ "epoch": 0.43,
1380
+ "grad_norm": 0.20957107721564286,
1381
+ "learning_rate": 0.00018623145725200278,
1382
+ "loss": 0.3176,
1383
+ "step": 194
1384
+ },
1385
+ {
1386
+ "epoch": 0.43,
1387
+ "grad_norm": 0.23119517784848204,
1388
+ "learning_rate": 0.0001860441557830405,
1389
+ "loss": 0.3174,
1390
+ "step": 195
1391
+ },
1392
+ {
1393
+ "epoch": 0.43,
1394
+ "grad_norm": 0.22081382473360098,
1395
+ "learning_rate": 0.00018585568443374087,
1396
+ "loss": 0.3029,
1397
+ "step": 196
1398
+ },
1399
+ {
1400
+ "epoch": 0.44,
1401
+ "grad_norm": 0.20857278454177905,
1402
+ "learning_rate": 0.00018566604576661288,
1403
+ "loss": 0.2803,
1404
+ "step": 197
1405
+ },
1406
+ {
1407
+ "epoch": 0.44,
1408
+ "grad_norm": 0.20277792365487685,
1409
+ "learning_rate": 0.00018547524236003674,
1410
+ "loss": 0.3032,
1411
+ "step": 198
1412
+ },
1413
+ {
1414
+ "epoch": 0.44,
1415
+ "grad_norm": 0.22068595819466755,
1416
+ "learning_rate": 0.0001852832768082288,
1417
+ "loss": 0.3196,
1418
+ "step": 199
1419
+ },
1420
+ {
1421
+ "epoch": 0.44,
1422
+ "grad_norm": 0.22826508701764245,
1423
+ "learning_rate": 0.00018509015172120621,
1424
+ "loss": 0.307,
1425
+ "step": 200
1426
+ },
1427
+ {
1428
+ "epoch": 0.45,
1429
+ "grad_norm": 0.23318100222090005,
1430
+ "learning_rate": 0.00018489586972475155,
1431
+ "loss": 0.3243,
1432
+ "step": 201
1433
+ },
1434
+ {
1435
+ "epoch": 0.45,
1436
+ "grad_norm": 0.20790305208366794,
1437
+ "learning_rate": 0.00018470043346037698,
1438
+ "loss": 0.3026,
1439
+ "step": 202
1440
+ },
1441
+ {
1442
+ "epoch": 0.45,
1443
+ "grad_norm": 0.23045901908374763,
1444
+ "learning_rate": 0.00018450384558528845,
1445
+ "loss": 0.3215,
1446
+ "step": 203
1447
+ },
1448
+ {
1449
+ "epoch": 0.45,
1450
+ "grad_norm": 0.1895969081217627,
1451
+ "learning_rate": 0.0001843061087723496,
1452
+ "loss": 0.2827,
1453
+ "step": 204
1454
+ },
1455
+ {
1456
+ "epoch": 0.45,
1457
+ "grad_norm": 0.21471234551245583,
1458
+ "learning_rate": 0.00018410722571004522,
1459
+ "loss": 0.2758,
1460
+ "step": 205
1461
+ },
1462
+ {
1463
+ "epoch": 0.46,
1464
+ "grad_norm": 0.19174460263968052,
1465
+ "learning_rate": 0.00018390719910244487,
1466
+ "loss": 0.2935,
1467
+ "step": 206
1468
+ },
1469
+ {
1470
+ "epoch": 0.46,
1471
+ "grad_norm": 0.2042082548764658,
1472
+ "learning_rate": 0.00018370603166916616,
1473
+ "loss": 0.3219,
1474
+ "step": 207
1475
+ },
1476
+ {
1477
+ "epoch": 0.46,
1478
+ "grad_norm": 0.20862412820843024,
1479
+ "learning_rate": 0.00018350372614533753,
1480
+ "loss": 0.3079,
1481
+ "step": 208
1482
+ },
1483
+ {
1484
+ "epoch": 0.46,
1485
+ "grad_norm": 0.19365588486473745,
1486
+ "learning_rate": 0.00018330028528156138,
1487
+ "loss": 0.2878,
1488
+ "step": 209
1489
+ },
1490
+ {
1491
+ "epoch": 0.47,
1492
+ "grad_norm": 0.20923759592087007,
1493
+ "learning_rate": 0.0001830957118438764,
1494
+ "loss": 0.3052,
1495
+ "step": 210
1496
+ },
1497
+ {
1498
+ "epoch": 0.47,
1499
+ "grad_norm": 0.19346679246935292,
1500
+ "learning_rate": 0.00018289000861372007,
1501
+ "loss": 0.2873,
1502
+ "step": 211
1503
+ },
1504
+ {
1505
+ "epoch": 0.47,
1506
+ "grad_norm": 0.20707125547144337,
1507
+ "learning_rate": 0.00018268317838789088,
1508
+ "loss": 0.2888,
1509
+ "step": 212
1510
+ },
1511
+ {
1512
+ "epoch": 0.47,
1513
+ "grad_norm": 0.210272509030541,
1514
+ "learning_rate": 0.00018247522397851028,
1515
+ "loss": 0.3063,
1516
+ "step": 213
1517
+ },
1518
+ {
1519
+ "epoch": 0.47,
1520
+ "grad_norm": 0.22635533463893145,
1521
+ "learning_rate": 0.0001822661482129844,
1522
+ "loss": 0.2844,
1523
+ "step": 214
1524
+ },
1525
+ {
1526
+ "epoch": 0.48,
1527
+ "grad_norm": 0.20691107915045812,
1528
+ "learning_rate": 0.00018205595393396568,
1529
+ "loss": 0.2986,
1530
+ "step": 215
1531
+ },
1532
+ {
1533
+ "epoch": 0.48,
1534
+ "grad_norm": 0.21549907787627992,
1535
+ "learning_rate": 0.00018184464399931412,
1536
+ "loss": 0.3098,
1537
+ "step": 216
1538
+ },
1539
+ {
1540
+ "epoch": 0.48,
1541
+ "grad_norm": 0.20061134281496176,
1542
+ "learning_rate": 0.00018163222128205853,
1543
+ "loss": 0.2871,
1544
+ "step": 217
1545
+ },
1546
+ {
1547
+ "epoch": 0.48,
1548
+ "grad_norm": 0.20502352214692726,
1549
+ "learning_rate": 0.00018141868867035745,
1550
+ "loss": 0.294,
1551
+ "step": 218
1552
+ },
1553
+ {
1554
+ "epoch": 0.49,
1555
+ "grad_norm": 0.2291358128642933,
1556
+ "learning_rate": 0.00018120404906745973,
1557
+ "loss": 0.2757,
1558
+ "step": 219
1559
+ },
1560
+ {
1561
+ "epoch": 0.49,
1562
+ "grad_norm": 0.1972835362678214,
1563
+ "learning_rate": 0.00018098830539166536,
1564
+ "loss": 0.3084,
1565
+ "step": 220
1566
+ },
1567
+ {
1568
+ "epoch": 0.49,
1569
+ "grad_norm": 0.21611715417590088,
1570
+ "learning_rate": 0.00018077146057628545,
1571
+ "loss": 0.2816,
1572
+ "step": 221
1573
+ },
1574
+ {
1575
+ "epoch": 0.49,
1576
+ "grad_norm": 0.21599858891116713,
1577
+ "learning_rate": 0.00018055351756960262,
1578
+ "loss": 0.3085,
1579
+ "step": 222
1580
+ },
1581
+ {
1582
+ "epoch": 0.49,
1583
+ "grad_norm": 0.19318954639097616,
1584
+ "learning_rate": 0.00018033447933483076,
1585
+ "loss": 0.2557,
1586
+ "step": 223
1587
+ },
1588
+ {
1589
+ "epoch": 0.5,
1590
+ "grad_norm": 0.18823294633329682,
1591
+ "learning_rate": 0.00018011434885007482,
1592
+ "loss": 0.2902,
1593
+ "step": 224
1594
+ },
1595
+ {
1596
+ "epoch": 0.5,
1597
+ "grad_norm": 0.21452443929587184,
1598
+ "learning_rate": 0.00017989312910829023,
1599
+ "loss": 0.311,
1600
+ "step": 225
1601
+ },
1602
+ {
1603
+ "epoch": 0.5,
1604
+ "grad_norm": 0.20666447125090429,
1605
+ "learning_rate": 0.00017967082311724227,
1606
+ "loss": 0.2883,
1607
+ "step": 226
1608
+ },
1609
+ {
1610
+ "epoch": 0.5,
1611
+ "eval_loss": 0.297645628452301,
1612
+ "eval_runtime": 173.6984,
1613
+ "eval_samples_per_second": 13.374,
1614
+ "eval_steps_per_second": 0.42,
1615
+ "step": 226
1616
+ },
1617
+ {
1618
+ "epoch": 0.5,
1619
+ "grad_norm": 0.21336924271339083,
1620
+ "learning_rate": 0.00017944743389946524,
1621
+ "loss": 0.3026,
1622
+ "step": 227
1623
+ },
1624
+ {
1625
+ "epoch": 0.5,
1626
+ "grad_norm": 0.20792583408257218,
1627
+ "learning_rate": 0.0001792229644922212,
1628
+ "loss": 0.2843,
1629
+ "step": 228
1630
+ },
1631
+ {
1632
+ "epoch": 0.51,
1633
+ "grad_norm": 0.2129371399765845,
1634
+ "learning_rate": 0.0001789974179474588,
1635
+ "loss": 0.3091,
1636
+ "step": 229
1637
+ },
1638
+ {
1639
+ "epoch": 0.51,
1640
+ "grad_norm": 0.19527091537284477,
1641
+ "learning_rate": 0.00017877079733177184,
1642
+ "loss": 0.297,
1643
+ "step": 230
1644
+ },
1645
+ {
1646
+ "epoch": 0.51,
1647
+ "grad_norm": 0.20516345655708346,
1648
+ "learning_rate": 0.00017854310572635733,
1649
+ "loss": 0.2935,
1650
+ "step": 231
1651
+ },
1652
+ {
1653
+ "epoch": 0.51,
1654
+ "grad_norm": 0.1969309791573733,
1655
+ "learning_rate": 0.00017831434622697385,
1656
+ "loss": 0.2898,
1657
+ "step": 232
1658
+ },
1659
+ {
1660
+ "epoch": 0.52,
1661
+ "grad_norm": 0.2696175229862278,
1662
+ "learning_rate": 0.0001780845219438994,
1663
+ "loss": 0.2924,
1664
+ "step": 233
1665
+ },
1666
+ {
1667
+ "epoch": 0.52,
1668
+ "grad_norm": 0.234973041162424,
1669
+ "learning_rate": 0.00017785363600188894,
1670
+ "loss": 0.3179,
1671
+ "step": 234
1672
+ },
1673
+ {
1674
+ "epoch": 0.52,
1675
+ "grad_norm": 0.23939104068785497,
1676
+ "learning_rate": 0.00017762169154013216,
1677
+ "loss": 0.2796,
1678
+ "step": 235
1679
+ },
1680
+ {
1681
+ "epoch": 0.52,
1682
+ "grad_norm": 0.20378572036784126,
1683
+ "learning_rate": 0.00017738869171221068,
1684
+ "loss": 0.2784,
1685
+ "step": 236
1686
+ },
1687
+ {
1688
+ "epoch": 0.52,
1689
+ "grad_norm": 0.19159802735904477,
1690
+ "learning_rate": 0.0001771546396860551,
1691
+ "loss": 0.2834,
1692
+ "step": 237
1693
+ },
1694
+ {
1695
+ "epoch": 0.53,
1696
+ "grad_norm": 0.23302725481391803,
1697
+ "learning_rate": 0.00017691953864390207,
1698
+ "loss": 0.2997,
1699
+ "step": 238
1700
+ },
1701
+ {
1702
+ "epoch": 0.53,
1703
+ "grad_norm": 0.23471887316578038,
1704
+ "learning_rate": 0.0001766833917822509,
1705
+ "loss": 0.3,
1706
+ "step": 239
1707
+ },
1708
+ {
1709
+ "epoch": 0.53,
1710
+ "grad_norm": 0.1796225243725005,
1711
+ "learning_rate": 0.00017644620231182015,
1712
+ "loss": 0.2901,
1713
+ "step": 240
1714
+ },
1715
+ {
1716
+ "epoch": 0.53,
1717
+ "grad_norm": 0.20037902218428075,
1718
+ "learning_rate": 0.00017620797345750403,
1719
+ "loss": 0.294,
1720
+ "step": 241
1721
+ },
1722
+ {
1723
+ "epoch": 0.54,
1724
+ "grad_norm": 0.20415485179217374,
1725
+ "learning_rate": 0.0001759687084583285,
1726
+ "loss": 0.3162,
1727
+ "step": 242
1728
+ },
1729
+ {
1730
+ "epoch": 0.54,
1731
+ "grad_norm": 0.18182779738483645,
1732
+ "learning_rate": 0.00017572841056740722,
1733
+ "loss": 0.275,
1734
+ "step": 243
1735
+ },
1736
+ {
1737
+ "epoch": 0.54,
1738
+ "grad_norm": 0.17275776985467062,
1739
+ "learning_rate": 0.00017548708305189722,
1740
+ "loss": 0.2592,
1741
+ "step": 244
1742
+ },
1743
+ {
1744
+ "epoch": 0.54,
1745
+ "grad_norm": 0.18388792575433974,
1746
+ "learning_rate": 0.00017524472919295487,
1747
+ "loss": 0.2998,
1748
+ "step": 245
1749
+ },
1750
+ {
1751
+ "epoch": 0.54,
1752
+ "grad_norm": 0.1821375376929017,
1753
+ "learning_rate": 0.00017500135228569068,
1754
+ "loss": 0.2586,
1755
+ "step": 246
1756
+ },
1757
+ {
1758
+ "epoch": 0.55,
1759
+ "grad_norm": 0.18937126829543396,
1760
+ "learning_rate": 0.00017475695563912505,
1761
+ "loss": 0.2858,
1762
+ "step": 247
1763
+ },
1764
+ {
1765
+ "epoch": 0.55,
1766
+ "grad_norm": 0.1833938565560613,
1767
+ "learning_rate": 0.00017451154257614287,
1768
+ "loss": 0.271,
1769
+ "step": 248
1770
+ },
1771
+ {
1772
+ "epoch": 0.55,
1773
+ "grad_norm": 0.20382927797400074,
1774
+ "learning_rate": 0.0001742651164334486,
1775
+ "loss": 0.2931,
1776
+ "step": 249
1777
+ },
1778
+ {
1779
+ "epoch": 0.55,
1780
+ "grad_norm": 0.23510651870506527,
1781
+ "learning_rate": 0.00017401768056152085,
1782
+ "loss": 0.3078,
1783
+ "step": 250
1784
+ },
1785
+ {
1786
+ "epoch": 0.56,
1787
+ "grad_norm": 0.2191619740250893,
1788
+ "learning_rate": 0.00017376923832456665,
1789
+ "loss": 0.3111,
1790
+ "step": 251
1791
+ },
1792
+ {
1793
+ "epoch": 0.56,
1794
+ "grad_norm": 0.20648235757385622,
1795
+ "learning_rate": 0.00017351979310047602,
1796
+ "loss": 0.2816,
1797
+ "step": 252
1798
+ },
1799
+ {
1800
+ "epoch": 0.56,
1801
+ "grad_norm": 0.19445561313531604,
1802
+ "learning_rate": 0.00017326934828077573,
1803
+ "loss": 0.2894,
1804
+ "step": 253
1805
+ },
1806
+ {
1807
+ "epoch": 0.56,
1808
+ "grad_norm": 0.19169798824869538,
1809
+ "learning_rate": 0.00017301790727058345,
1810
+ "loss": 0.2802,
1811
+ "step": 254
1812
+ },
1813
+ {
1814
+ "epoch": 0.56,
1815
+ "grad_norm": 0.2006703950203184,
1816
+ "learning_rate": 0.0001727654734885612,
1817
+ "loss": 0.2749,
1818
+ "step": 255
1819
+ },
1820
+ {
1821
+ "epoch": 0.57,
1822
+ "grad_norm": 0.21061482297381626,
1823
+ "learning_rate": 0.0001725120503668691,
1824
+ "loss": 0.3042,
1825
+ "step": 256
1826
+ },
1827
+ {
1828
+ "epoch": 0.57,
1829
+ "grad_norm": 0.20740853426573325,
1830
+ "learning_rate": 0.00017225764135111868,
1831
+ "loss": 0.3025,
1832
+ "step": 257
1833
+ },
1834
+ {
1835
+ "epoch": 0.57,
1836
+ "grad_norm": 0.1945016623980029,
1837
+ "learning_rate": 0.00017200224990032576,
1838
+ "loss": 0.2964,
1839
+ "step": 258
1840
+ },
1841
+ {
1842
+ "epoch": 0.57,
1843
+ "grad_norm": 0.21095668740779128,
1844
+ "learning_rate": 0.00017174587948686374,
1845
+ "loss": 0.3047,
1846
+ "step": 259
1847
+ },
1848
+ {
1849
+ "epoch": 0.58,
1850
+ "grad_norm": 0.18724414137242903,
1851
+ "learning_rate": 0.00017148853359641626,
1852
+ "loss": 0.2678,
1853
+ "step": 260
1854
+ },
1855
+ {
1856
+ "epoch": 0.58,
1857
+ "grad_norm": 0.18247589724576602,
1858
+ "learning_rate": 0.00017123021572792982,
1859
+ "loss": 0.2796,
1860
+ "step": 261
1861
+ },
1862
+ {
1863
+ "epoch": 0.58,
1864
+ "grad_norm": 0.20187877784359254,
1865
+ "learning_rate": 0.00017097092939356623,
1866
+ "loss": 0.2819,
1867
+ "step": 262
1868
+ },
1869
+ {
1870
+ "epoch": 0.58,
1871
+ "grad_norm": 0.21960455156021205,
1872
+ "learning_rate": 0.00017071067811865476,
1873
+ "loss": 0.2849,
1874
+ "step": 263
1875
+ },
1876
+ {
1877
+ "epoch": 0.58,
1878
+ "grad_norm": 0.21920804195394356,
1879
+ "learning_rate": 0.00017044946544164433,
1880
+ "loss": 0.286,
1881
+ "step": 264
1882
+ },
1883
+ {
1884
+ "epoch": 0.59,
1885
+ "grad_norm": 0.2153094965416609,
1886
+ "learning_rate": 0.00017018729491405536,
1887
+ "loss": 0.2728,
1888
+ "step": 265
1889
+ },
1890
+ {
1891
+ "epoch": 0.59,
1892
+ "grad_norm": 0.18899974451413623,
1893
+ "learning_rate": 0.00016992417010043142,
1894
+ "loss": 0.2643,
1895
+ "step": 266
1896
+ },
1897
+ {
1898
+ "epoch": 0.59,
1899
+ "grad_norm": 0.20452371771386804,
1900
+ "learning_rate": 0.00016966009457829086,
1901
+ "loss": 0.2805,
1902
+ "step": 267
1903
+ },
1904
+ {
1905
+ "epoch": 0.59,
1906
+ "grad_norm": 0.19546419928152936,
1907
+ "learning_rate": 0.0001693950719380782,
1908
+ "loss": 0.2749,
1909
+ "step": 268
1910
+ },
1911
+ {
1912
+ "epoch": 0.6,
1913
+ "grad_norm": 0.19627215184487343,
1914
+ "learning_rate": 0.00016912910578311503,
1915
+ "loss": 0.273,
1916
+ "step": 269
1917
+ },
1918
+ {
1919
+ "epoch": 0.6,
1920
+ "grad_norm": 0.18828419766156645,
1921
+ "learning_rate": 0.00016886219972955146,
1922
+ "loss": 0.273,
1923
+ "step": 270
1924
+ },
1925
+ {
1926
+ "epoch": 0.6,
1927
+ "grad_norm": 0.19027059707604657,
1928
+ "learning_rate": 0.00016859435740631658,
1929
+ "loss": 0.3046,
1930
+ "step": 271
1931
+ },
1932
+ {
1933
+ "epoch": 0.6,
1934
+ "grad_norm": 0.18426233664144975,
1935
+ "learning_rate": 0.00016832558245506935,
1936
+ "loss": 0.2643,
1937
+ "step": 272
1938
+ },
1939
+ {
1940
+ "epoch": 0.6,
1941
+ "grad_norm": 0.18658729778193736,
1942
+ "learning_rate": 0.00016805587853014895,
1943
+ "loss": 0.285,
1944
+ "step": 273
1945
+ },
1946
+ {
1947
+ "epoch": 0.61,
1948
+ "grad_norm": 0.1796032355554721,
1949
+ "learning_rate": 0.00016778524929852512,
1950
+ "loss": 0.261,
1951
+ "step": 274
1952
+ },
1953
+ {
1954
+ "epoch": 0.61,
1955
+ "grad_norm": 0.19790847765561645,
1956
+ "learning_rate": 0.0001675136984397484,
1957
+ "loss": 0.3036,
1958
+ "step": 275
1959
+ },
1960
+ {
1961
+ "epoch": 0.61,
1962
+ "grad_norm": 0.2013507251688789,
1963
+ "learning_rate": 0.0001672412296459,
1964
+ "loss": 0.2929,
1965
+ "step": 276
1966
+ },
1967
+ {
1968
+ "epoch": 0.61,
1969
+ "grad_norm": 0.20727255991158675,
1970
+ "learning_rate": 0.00016696784662154163,
1971
+ "loss": 0.28,
1972
+ "step": 277
1973
+ },
1974
+ {
1975
+ "epoch": 0.62,
1976
+ "grad_norm": 0.22175173491924988,
1977
+ "learning_rate": 0.0001666935530836651,
1978
+ "loss": 0.2953,
1979
+ "step": 278
1980
+ },
1981
+ {
1982
+ "epoch": 0.62,
1983
+ "grad_norm": 0.2110368794764076,
1984
+ "learning_rate": 0.00016641835276164183,
1985
+ "loss": 0.3012,
1986
+ "step": 279
1987
+ },
1988
+ {
1989
+ "epoch": 0.62,
1990
+ "grad_norm": 0.1839097327175649,
1991
+ "learning_rate": 0.00016614224939717217,
1992
+ "loss": 0.2985,
1993
+ "step": 280
1994
+ },
1995
+ {
1996
+ "epoch": 0.62,
1997
+ "grad_norm": 0.18075701410157072,
1998
+ "learning_rate": 0.00016586524674423446,
1999
+ "loss": 0.2614,
2000
+ "step": 281
2001
+ },
2002
+ {
2003
+ "epoch": 0.62,
2004
+ "grad_norm": 0.19006328055997038,
2005
+ "learning_rate": 0.00016558734856903404,
2006
+ "loss": 0.2741,
2007
+ "step": 282
2008
+ },
2009
+ {
2010
+ "epoch": 0.63,
2011
+ "grad_norm": 0.1936501426398552,
2012
+ "learning_rate": 0.00016530855864995195,
2013
+ "loss": 0.2486,
2014
+ "step": 283
2015
+ },
2016
+ {
2017
+ "epoch": 0.63,
2018
+ "grad_norm": 0.2146360520459362,
2019
+ "learning_rate": 0.0001650288807774937,
2020
+ "loss": 0.2949,
2021
+ "step": 284
2022
+ },
2023
+ {
2024
+ "epoch": 0.63,
2025
+ "grad_norm": 0.1934454081094374,
2026
+ "learning_rate": 0.00016474831875423767,
2027
+ "loss": 0.25,
2028
+ "step": 285
2029
+ },
2030
+ {
2031
+ "epoch": 0.63,
2032
+ "grad_norm": 0.20138529225313043,
2033
+ "learning_rate": 0.0001644668763947833,
2034
+ "loss": 0.2826,
2035
+ "step": 286
2036
+ },
2037
+ {
2038
+ "epoch": 0.64,
2039
+ "grad_norm": 0.20336180418412392,
2040
+ "learning_rate": 0.00016418455752569943,
2041
+ "loss": 0.281,
2042
+ "step": 287
2043
+ },
2044
+ {
2045
+ "epoch": 0.64,
2046
+ "grad_norm": 0.2366660122305481,
2047
+ "learning_rate": 0.00016390136598547217,
2048
+ "loss": 0.2665,
2049
+ "step": 288
2050
+ },
2051
+ {
2052
+ "epoch": 0.64,
2053
+ "grad_norm": 0.18478597928089585,
2054
+ "learning_rate": 0.00016361730562445263,
2055
+ "loss": 0.3022,
2056
+ "step": 289
2057
+ },
2058
+ {
2059
+ "epoch": 0.64,
2060
+ "grad_norm": 0.18679948920602824,
2061
+ "learning_rate": 0.0001633323803048047,
2062
+ "loss": 0.2844,
2063
+ "step": 290
2064
+ },
2065
+ {
2066
+ "epoch": 0.64,
2067
+ "grad_norm": 0.19949957744833288,
2068
+ "learning_rate": 0.00016304659390045252,
2069
+ "loss": 0.2912,
2070
+ "step": 291
2071
+ },
2072
+ {
2073
+ "epoch": 0.65,
2074
+ "grad_norm": 0.20044341182168152,
2075
+ "learning_rate": 0.0001627599502970277,
2076
+ "loss": 0.2729,
2077
+ "step": 292
2078
+ },
2079
+ {
2080
+ "epoch": 0.65,
2081
+ "grad_norm": 0.18558206726419454,
2082
+ "learning_rate": 0.00016247245339181662,
2083
+ "loss": 0.2693,
2084
+ "step": 293
2085
+ },
2086
+ {
2087
+ "epoch": 0.65,
2088
+ "grad_norm": 0.2083655759577377,
2089
+ "learning_rate": 0.00016218410709370736,
2090
+ "loss": 0.3022,
2091
+ "step": 294
2092
+ },
2093
+ {
2094
+ "epoch": 0.65,
2095
+ "grad_norm": 0.21456219690969847,
2096
+ "learning_rate": 0.00016189491532313664,
2097
+ "loss": 0.2933,
2098
+ "step": 295
2099
+ },
2100
+ {
2101
+ "epoch": 0.66,
2102
+ "grad_norm": 0.2316084982608295,
2103
+ "learning_rate": 0.00016160488201203644,
2104
+ "loss": 0.2631,
2105
+ "step": 296
2106
+ },
2107
+ {
2108
+ "epoch": 0.66,
2109
+ "grad_norm": 0.20943766303868608,
2110
+ "learning_rate": 0.00016131401110378043,
2111
+ "loss": 0.2847,
2112
+ "step": 297
2113
+ },
2114
+ {
2115
+ "epoch": 0.66,
2116
+ "grad_norm": 0.2346281512353159,
2117
+ "learning_rate": 0.00016102230655313076,
2118
+ "loss": 0.2898,
2119
+ "step": 298
2120
+ },
2121
+ {
2122
+ "epoch": 0.66,
2123
+ "grad_norm": 0.18690479698056445,
2124
+ "learning_rate": 0.0001607297723261837,
2125
+ "loss": 0.2761,
2126
+ "step": 299
2127
+ },
2128
+ {
2129
+ "epoch": 0.66,
2130
+ "grad_norm": 0.22244239877246086,
2131
+ "learning_rate": 0.00016043641240031623,
2132
+ "loss": 0.2794,
2133
+ "step": 300
2134
+ },
2135
+ {
2136
+ "epoch": 0.67,
2137
+ "grad_norm": 0.19513520777123738,
2138
+ "learning_rate": 0.00016014223076413173,
2139
+ "loss": 0.2757,
2140
+ "step": 301
2141
+ },
2142
+ {
2143
+ "epoch": 0.67,
2144
+ "grad_norm": 0.1954004471181246,
2145
+ "learning_rate": 0.00015984723141740576,
2146
+ "loss": 0.2713,
2147
+ "step": 302
2148
+ },
2149
+ {
2150
+ "epoch": 0.67,
2151
+ "grad_norm": 0.21399036326669596,
2152
+ "learning_rate": 0.00015955141837103168,
2153
+ "loss": 0.2767,
2154
+ "step": 303
2155
+ },
2156
+ {
2157
+ "epoch": 0.67,
2158
+ "grad_norm": 0.2045546981741005,
2159
+ "learning_rate": 0.0001592547956469662,
2160
+ "loss": 0.2807,
2161
+ "step": 304
2162
+ },
2163
+ {
2164
+ "epoch": 0.68,
2165
+ "grad_norm": 0.21321779886240727,
2166
+ "learning_rate": 0.00015895736727817455,
2167
+ "loss": 0.2899,
2168
+ "step": 305
2169
+ },
2170
+ {
2171
+ "epoch": 0.68,
2172
+ "grad_norm": 0.20121770170139855,
2173
+ "learning_rate": 0.00015865913730857582,
2174
+ "loss": 0.2706,
2175
+ "step": 306
2176
+ },
2177
+ {
2178
+ "epoch": 0.68,
2179
+ "grad_norm": 0.21901656836549904,
2180
+ "learning_rate": 0.00015836010979298782,
2181
+ "loss": 0.3043,
2182
+ "step": 307
2183
+ },
2184
+ {
2185
+ "epoch": 0.68,
2186
+ "grad_norm": 0.19476889300368513,
2187
+ "learning_rate": 0.0001580602887970721,
2188
+ "loss": 0.294,
2189
+ "step": 308
2190
+ },
2191
+ {
2192
+ "epoch": 0.68,
2193
+ "grad_norm": 0.1864916489760236,
2194
+ "learning_rate": 0.00015775967839727842,
2195
+ "loss": 0.2789,
2196
+ "step": 309
2197
+ },
2198
+ {
2199
+ "epoch": 0.69,
2200
+ "grad_norm": 0.2005263777787887,
2201
+ "learning_rate": 0.0001574582826807897,
2202
+ "loss": 0.2786,
2203
+ "step": 310
2204
+ },
2205
+ {
2206
+ "epoch": 0.69,
2207
+ "grad_norm": 0.20330910409013883,
2208
+ "learning_rate": 0.0001571561057454661,
2209
+ "loss": 0.2856,
2210
+ "step": 311
2211
+ },
2212
+ {
2213
+ "epoch": 0.69,
2214
+ "grad_norm": 0.2075041911121307,
2215
+ "learning_rate": 0.00015685315169978954,
2216
+ "loss": 0.301,
2217
+ "step": 312
2218
+ },
2219
+ {
2220
+ "epoch": 0.69,
2221
+ "grad_norm": 0.198545224318664,
2222
+ "learning_rate": 0.0001565494246628077,
2223
+ "loss": 0.2667,
2224
+ "step": 313
2225
+ },
2226
+ {
2227
+ "epoch": 0.7,
2228
+ "grad_norm": 0.18262873322406858,
2229
+ "learning_rate": 0.0001562449287640781,
2230
+ "loss": 0.2761,
2231
+ "step": 314
2232
+ },
2233
+ {
2234
+ "epoch": 0.7,
2235
+ "grad_norm": 0.20682904427400386,
2236
+ "learning_rate": 0.0001559396681436118,
2237
+ "loss": 0.2798,
2238
+ "step": 315
2239
+ },
2240
+ {
2241
+ "epoch": 0.7,
2242
+ "grad_norm": 0.17100330949740156,
2243
+ "learning_rate": 0.00015563364695181741,
2244
+ "loss": 0.2638,
2245
+ "step": 316
2246
+ },
2247
+ {
2248
+ "epoch": 0.7,
2249
+ "grad_norm": 0.19147233406100644,
2250
+ "learning_rate": 0.00015532686934944438,
2251
+ "loss": 0.2772,
2252
+ "step": 317
2253
+ },
2254
+ {
2255
+ "epoch": 0.7,
2256
+ "grad_norm": 0.20859821686266017,
2257
+ "learning_rate": 0.00015501933950752656,
2258
+ "loss": 0.2899,
2259
+ "step": 318
2260
+ },
2261
+ {
2262
+ "epoch": 0.71,
2263
+ "grad_norm": 0.22660943071408993,
2264
+ "learning_rate": 0.00015471106160732542,
2265
+ "loss": 0.2809,
2266
+ "step": 319
2267
+ },
2268
+ {
2269
+ "epoch": 0.71,
2270
+ "grad_norm": 0.1970595992988207,
2271
+ "learning_rate": 0.00015440203984027324,
2272
+ "loss": 0.2664,
2273
+ "step": 320
2274
+ },
2275
+ {
2276
+ "epoch": 0.71,
2277
+ "grad_norm": 0.21441465355834557,
2278
+ "learning_rate": 0.00015409227840791617,
2279
+ "loss": 0.2872,
2280
+ "step": 321
2281
+ },
2282
+ {
2283
+ "epoch": 0.71,
2284
+ "grad_norm": 0.1928160770028818,
2285
+ "learning_rate": 0.000153781781521857,
2286
+ "loss": 0.2932,
2287
+ "step": 322
2288
+ },
2289
+ {
2290
+ "epoch": 0.72,
2291
+ "grad_norm": 0.18232874606722474,
2292
+ "learning_rate": 0.00015347055340369804,
2293
+ "loss": 0.2865,
2294
+ "step": 323
2295
+ },
2296
+ {
2297
+ "epoch": 0.72,
2298
+ "grad_norm": 0.18944518082470313,
2299
+ "learning_rate": 0.00015315859828498354,
2300
+ "loss": 0.2895,
2301
+ "step": 324
2302
+ },
2303
+ {
2304
+ "epoch": 0.72,
2305
+ "grad_norm": 0.18688411692002266,
2306
+ "learning_rate": 0.00015284592040714227,
2307
+ "loss": 0.3068,
2308
+ "step": 325
2309
+ },
2310
+ {
2311
+ "epoch": 0.72,
2312
+ "grad_norm": 0.15360887670295487,
2313
+ "learning_rate": 0.00015253252402142988,
2314
+ "loss": 0.2541,
2315
+ "step": 326
2316
+ },
2317
+ {
2318
+ "epoch": 0.72,
2319
+ "grad_norm": 0.17886321037354558,
2320
+ "learning_rate": 0.00015221841338887104,
2321
+ "loss": 0.2735,
2322
+ "step": 327
2323
+ },
2324
+ {
2325
+ "epoch": 0.73,
2326
+ "grad_norm": 0.18210509607368616,
2327
+ "learning_rate": 0.0001519035927802015,
2328
+ "loss": 0.2674,
2329
+ "step": 328
2330
+ },
2331
+ {
2332
+ "epoch": 0.73,
2333
+ "grad_norm": 0.2113800439853146,
2334
+ "learning_rate": 0.00015158806647581002,
2335
+ "loss": 0.2611,
2336
+ "step": 329
2337
+ },
2338
+ {
2339
+ "epoch": 0.73,
2340
+ "grad_norm": 0.21664373470568593,
2341
+ "learning_rate": 0.00015127183876568022,
2342
+ "loss": 0.2734,
2343
+ "step": 330
2344
+ },
2345
+ {
2346
+ "epoch": 0.73,
2347
+ "grad_norm": 0.21152226314764647,
2348
+ "learning_rate": 0.0001509549139493323,
2349
+ "loss": 0.2605,
2350
+ "step": 331
2351
+ },
2352
+ {
2353
+ "epoch": 0.74,
2354
+ "grad_norm": 0.20787608314066813,
2355
+ "learning_rate": 0.0001506372963357644,
2356
+ "loss": 0.2829,
2357
+ "step": 332
2358
+ },
2359
+ {
2360
+ "epoch": 0.74,
2361
+ "grad_norm": 0.20890612340451087,
2362
+ "learning_rate": 0.00015031899024339415,
2363
+ "loss": 0.2761,
2364
+ "step": 333
2365
+ },
2366
+ {
2367
+ "epoch": 0.74,
2368
+ "grad_norm": 0.18123961862890825,
2369
+ "learning_rate": 0.00015000000000000001,
2370
+ "loss": 0.2625,
2371
+ "step": 334
2372
+ },
2373
+ {
2374
+ "epoch": 0.74,
2375
+ "grad_norm": 0.20005061805451332,
2376
+ "learning_rate": 0.00014968032994266224,
2377
+ "loss": 0.2739,
2378
+ "step": 335
2379
+ },
2380
+ {
2381
+ "epoch": 0.74,
2382
+ "grad_norm": 0.22011995491515385,
2383
+ "learning_rate": 0.00014935998441770407,
2384
+ "loss": 0.2769,
2385
+ "step": 336
2386
+ },
2387
+ {
2388
+ "epoch": 0.75,
2389
+ "grad_norm": 0.1759458248293069,
2390
+ "learning_rate": 0.00014903896778063267,
2391
+ "loss": 0.2751,
2392
+ "step": 337
2393
+ },
2394
+ {
2395
+ "epoch": 0.75,
2396
+ "grad_norm": 0.17963701036394614,
2397
+ "learning_rate": 0.00014871728439607966,
2398
+ "loss": 0.2861,
2399
+ "step": 338
2400
+ },
2401
+ {
2402
+ "epoch": 0.75,
2403
+ "grad_norm": 0.22395035633400334,
2404
+ "learning_rate": 0.00014839493863774212,
2405
+ "loss": 0.2748,
2406
+ "step": 339
2407
+ },
2408
+ {
2409
+ "epoch": 0.75,
2410
+ "eval_loss": 0.27847620844841003,
2411
+ "eval_runtime": 174.097,
2412
+ "eval_samples_per_second": 13.343,
2413
+ "eval_steps_per_second": 0.419,
2414
+ "step": 339
2415
+ },
2416
+ {
2417
+ "epoch": 0.75,
2418
+ "grad_norm": 0.19931473943900235,
2419
+ "learning_rate": 0.00014807193488832282,
2420
+ "loss": 0.261,
2421
+ "step": 340
2422
+ },
2423
+ {
2424
+ "epoch": 0.76,
2425
+ "grad_norm": 0.1895068181039295,
2426
+ "learning_rate": 0.00014774827753947088,
2427
+ "loss": 0.2666,
2428
+ "step": 341
2429
+ },
2430
+ {
2431
+ "epoch": 0.76,
2432
+ "grad_norm": 0.19682173311096884,
2433
+ "learning_rate": 0.00014742397099172183,
2434
+ "loss": 0.2564,
2435
+ "step": 342
2436
+ },
2437
+ {
2438
+ "epoch": 0.76,
2439
+ "grad_norm": 0.2183356533233642,
2440
+ "learning_rate": 0.00014709901965443794,
2441
+ "loss": 0.2904,
2442
+ "step": 343
2443
+ },
2444
+ {
2445
+ "epoch": 0.76,
2446
+ "grad_norm": 0.2113562507034691,
2447
+ "learning_rate": 0.00014677342794574817,
2448
+ "loss": 0.2915,
2449
+ "step": 344
2450
+ },
2451
+ {
2452
+ "epoch": 0.76,
2453
+ "grad_norm": 0.2034784414842117,
2454
+ "learning_rate": 0.00014644720029248829,
2455
+ "loss": 0.2717,
2456
+ "step": 345
2457
+ },
2458
+ {
2459
+ "epoch": 0.77,
2460
+ "grad_norm": 0.1967572374697097,
2461
+ "learning_rate": 0.00014612034113014035,
2462
+ "loss": 0.2887,
2463
+ "step": 346
2464
+ },
2465
+ {
2466
+ "epoch": 0.77,
2467
+ "grad_norm": 0.1899953535101574,
2468
+ "learning_rate": 0.00014579285490277274,
2469
+ "loss": 0.2922,
2470
+ "step": 347
2471
+ },
2472
+ {
2473
+ "epoch": 0.77,
2474
+ "grad_norm": 0.19441384522232566,
2475
+ "learning_rate": 0.0001454647460629795,
2476
+ "loss": 0.2785,
2477
+ "step": 348
2478
+ },
2479
+ {
2480
+ "epoch": 0.77,
2481
+ "grad_norm": 0.20710674744690838,
2482
+ "learning_rate": 0.00014513601907181992,
2483
+ "loss": 0.2929,
2484
+ "step": 349
2485
+ },
2486
+ {
2487
+ "epoch": 0.78,
2488
+ "grad_norm": 0.18640646337351455,
2489
+ "learning_rate": 0.00014480667839875786,
2490
+ "loss": 0.261,
2491
+ "step": 350
2492
+ },
2493
+ {
2494
+ "epoch": 0.78,
2495
+ "grad_norm": 0.1754057901024035,
2496
+ "learning_rate": 0.00014447672852160095,
2497
+ "loss": 0.267,
2498
+ "step": 351
2499
+ },
2500
+ {
2501
+ "epoch": 0.78,
2502
+ "grad_norm": 0.1845116395890587,
2503
+ "learning_rate": 0.0001441461739264397,
2504
+ "loss": 0.2608,
2505
+ "step": 352
2506
+ },
2507
+ {
2508
+ "epoch": 0.78,
2509
+ "grad_norm": 0.1832842348380918,
2510
+ "learning_rate": 0.00014381501910758662,
2511
+ "loss": 0.264,
2512
+ "step": 353
2513
+ },
2514
+ {
2515
+ "epoch": 0.78,
2516
+ "grad_norm": 0.21078762275275081,
2517
+ "learning_rate": 0.00014348326856751496,
2518
+ "loss": 0.2903,
2519
+ "step": 354
2520
+ },
2521
+ {
2522
+ "epoch": 0.79,
2523
+ "grad_norm": 0.2040423439668467,
2524
+ "learning_rate": 0.00014315092681679755,
2525
+ "loss": 0.2866,
2526
+ "step": 355
2527
+ },
2528
+ {
2529
+ "epoch": 0.79,
2530
+ "grad_norm": 0.20916646529152538,
2531
+ "learning_rate": 0.00014281799837404552,
2532
+ "loss": 0.2669,
2533
+ "step": 356
2534
+ },
2535
+ {
2536
+ "epoch": 0.79,
2537
+ "grad_norm": 0.20829199241699958,
2538
+ "learning_rate": 0.00014248448776584688,
2539
+ "loss": 0.2773,
2540
+ "step": 357
2541
+ },
2542
+ {
2543
+ "epoch": 0.79,
2544
+ "grad_norm": 0.22863270132432637,
2545
+ "learning_rate": 0.0001421503995267048,
2546
+ "loss": 0.2691,
2547
+ "step": 358
2548
+ },
2549
+ {
2550
+ "epoch": 0.8,
2551
+ "grad_norm": 0.19107899619014693,
2552
+ "learning_rate": 0.00014181573819897617,
2553
+ "loss": 0.2854,
2554
+ "step": 359
2555
+ },
2556
+ {
2557
+ "epoch": 0.8,
2558
+ "grad_norm": 0.18310920202899897,
2559
+ "learning_rate": 0.00014148050833280977,
2560
+ "loss": 0.2523,
2561
+ "step": 360
2562
+ },
2563
+ {
2564
+ "epoch": 0.8,
2565
+ "grad_norm": 0.19142180009852441,
2566
+ "learning_rate": 0.00014114471448608426,
2567
+ "loss": 0.2668,
2568
+ "step": 361
2569
+ },
2570
+ {
2571
+ "epoch": 0.8,
2572
+ "grad_norm": 0.17916118207750328,
2573
+ "learning_rate": 0.0001408083612243465,
2574
+ "loss": 0.2826,
2575
+ "step": 362
2576
+ },
2577
+ {
2578
+ "epoch": 0.8,
2579
+ "grad_norm": 0.1795532305403206,
2580
+ "learning_rate": 0.0001404714531207492,
2581
+ "loss": 0.2876,
2582
+ "step": 363
2583
+ },
2584
+ {
2585
+ "epoch": 0.81,
2586
+ "grad_norm": 0.19812580507096825,
2587
+ "learning_rate": 0.0001401339947559889,
2588
+ "loss": 0.2813,
2589
+ "step": 364
2590
+ },
2591
+ {
2592
+ "epoch": 0.81,
2593
+ "grad_norm": 0.16985158453017796,
2594
+ "learning_rate": 0.00013979599071824362,
2595
+ "loss": 0.2824,
2596
+ "step": 365
2597
+ },
2598
+ {
2599
+ "epoch": 0.81,
2600
+ "grad_norm": 0.20447909651191687,
2601
+ "learning_rate": 0.00013945744560311057,
2602
+ "loss": 0.2567,
2603
+ "step": 366
2604
+ },
2605
+ {
2606
+ "epoch": 0.81,
2607
+ "grad_norm": 0.20178806044954373,
2608
+ "learning_rate": 0.0001391183640135435,
2609
+ "loss": 0.2755,
2610
+ "step": 367
2611
+ },
2612
+ {
2613
+ "epoch": 0.82,
2614
+ "grad_norm": 0.2265042443851448,
2615
+ "learning_rate": 0.00013877875055979023,
2616
+ "loss": 0.2905,
2617
+ "step": 368
2618
+ },
2619
+ {
2620
+ "epoch": 0.82,
2621
+ "grad_norm": 0.22127748544952333,
2622
+ "learning_rate": 0.00013843860985933003,
2623
+ "loss": 0.2624,
2624
+ "step": 369
2625
+ },
2626
+ {
2627
+ "epoch": 0.82,
2628
+ "grad_norm": 0.20805077265871227,
2629
+ "learning_rate": 0.00013809794653681074,
2630
+ "loss": 0.2396,
2631
+ "step": 370
2632
+ },
2633
+ {
2634
+ "epoch": 0.82,
2635
+ "grad_norm": 0.21236429860308073,
2636
+ "learning_rate": 0.00013775676522398588,
2637
+ "loss": 0.2916,
2638
+ "step": 371
2639
+ },
2640
+ {
2641
+ "epoch": 0.82,
2642
+ "grad_norm": 0.19893678645018886,
2643
+ "learning_rate": 0.00013741507055965168,
2644
+ "loss": 0.2551,
2645
+ "step": 372
2646
+ },
2647
+ {
2648
+ "epoch": 0.83,
2649
+ "grad_norm": 0.18256634499151977,
2650
+ "learning_rate": 0.00013707286718958413,
2651
+ "loss": 0.2633,
2652
+ "step": 373
2653
+ },
2654
+ {
2655
+ "epoch": 0.83,
2656
+ "grad_norm": 0.18221655160895703,
2657
+ "learning_rate": 0.00013673015976647568,
2658
+ "loss": 0.2672,
2659
+ "step": 374
2660
+ },
2661
+ {
2662
+ "epoch": 0.83,
2663
+ "grad_norm": 0.19429764298431815,
2664
+ "learning_rate": 0.00013638695294987204,
2665
+ "loss": 0.2417,
2666
+ "step": 375
2667
+ },
2668
+ {
2669
+ "epoch": 0.83,
2670
+ "grad_norm": 0.18793583918739298,
2671
+ "learning_rate": 0.0001360432514061087,
2672
+ "loss": 0.2613,
2673
+ "step": 376
2674
+ },
2675
+ {
2676
+ "epoch": 0.83,
2677
+ "grad_norm": 0.2011435035031983,
2678
+ "learning_rate": 0.00013569905980824788,
2679
+ "loss": 0.2685,
2680
+ "step": 377
2681
+ },
2682
+ {
2683
+ "epoch": 0.84,
2684
+ "grad_norm": 0.2052637622415086,
2685
+ "learning_rate": 0.00013535438283601435,
2686
+ "loss": 0.2959,
2687
+ "step": 378
2688
+ },
2689
+ {
2690
+ "epoch": 0.84,
2691
+ "grad_norm": 0.1945089020971851,
2692
+ "learning_rate": 0.00013500922517573245,
2693
+ "loss": 0.2482,
2694
+ "step": 379
2695
+ },
2696
+ {
2697
+ "epoch": 0.84,
2698
+ "grad_norm": 0.20812822716018067,
2699
+ "learning_rate": 0.00013466359152026195,
2700
+ "loss": 0.2741,
2701
+ "step": 380
2702
+ },
2703
+ {
2704
+ "epoch": 0.84,
2705
+ "grad_norm": 0.22384533508071883,
2706
+ "learning_rate": 0.0001343174865689344,
2707
+ "loss": 0.2914,
2708
+ "step": 381
2709
+ },
2710
+ {
2711
+ "epoch": 0.85,
2712
+ "grad_norm": 0.19177959970339475,
2713
+ "learning_rate": 0.0001339709150274893,
2714
+ "loss": 0.2694,
2715
+ "step": 382
2716
+ },
2717
+ {
2718
+ "epoch": 0.85,
2719
+ "grad_norm": 0.18196106404878304,
2720
+ "learning_rate": 0.0001336238816080099,
2721
+ "loss": 0.2649,
2722
+ "step": 383
2723
+ },
2724
+ {
2725
+ "epoch": 0.85,
2726
+ "grad_norm": 0.2526481416447456,
2727
+ "learning_rate": 0.00013327639102885937,
2728
+ "loss": 0.2813,
2729
+ "step": 384
2730
+ },
2731
+ {
2732
+ "epoch": 0.85,
2733
+ "grad_norm": 0.26250954621985834,
2734
+ "learning_rate": 0.0001329284480146166,
2735
+ "loss": 0.2793,
2736
+ "step": 385
2737
+ },
2738
+ {
2739
+ "epoch": 0.85,
2740
+ "grad_norm": 0.19318862080071506,
2741
+ "learning_rate": 0.00013258005729601177,
2742
+ "loss": 0.2441,
2743
+ "step": 386
2744
+ },
2745
+ {
2746
+ "epoch": 0.86,
2747
+ "grad_norm": 0.21447364833995924,
2748
+ "learning_rate": 0.00013223122360986225,
2749
+ "loss": 0.2887,
2750
+ "step": 387
2751
+ },
2752
+ {
2753
+ "epoch": 0.86,
2754
+ "grad_norm": 0.18021814111984105,
2755
+ "learning_rate": 0.00013188195169900813,
2756
+ "loss": 0.2722,
2757
+ "step": 388
2758
+ },
2759
+ {
2760
+ "epoch": 0.86,
2761
+ "grad_norm": 0.1986609859985159,
2762
+ "learning_rate": 0.0001315322463122477,
2763
+ "loss": 0.2638,
2764
+ "step": 389
2765
+ },
2766
+ {
2767
+ "epoch": 0.86,
2768
+ "grad_norm": 0.24674716367898283,
2769
+ "learning_rate": 0.00013118211220427298,
2770
+ "loss": 0.2753,
2771
+ "step": 390
2772
+ },
2773
+ {
2774
+ "epoch": 0.87,
2775
+ "grad_norm": 0.22439353357611744,
2776
+ "learning_rate": 0.0001308315541356049,
2777
+ "loss": 0.2853,
2778
+ "step": 391
2779
+ },
2780
+ {
2781
+ "epoch": 0.87,
2782
+ "grad_norm": 0.17776179888083857,
2783
+ "learning_rate": 0.00013048057687252865,
2784
+ "loss": 0.2441,
2785
+ "step": 392
2786
+ },
2787
+ {
2788
+ "epoch": 0.87,
2789
+ "grad_norm": 0.21002490821212472,
2790
+ "learning_rate": 0.00013012918518702914,
2791
+ "loss": 0.2882,
2792
+ "step": 393
2793
+ },
2794
+ {
2795
+ "epoch": 0.87,
2796
+ "grad_norm": 0.18514107951857695,
2797
+ "learning_rate": 0.00012977738385672557,
2798
+ "loss": 0.2643,
2799
+ "step": 394
2800
+ },
2801
+ {
2802
+ "epoch": 0.87,
2803
+ "grad_norm": 0.18577497415553973,
2804
+ "learning_rate": 0.000129425177664807,
2805
+ "loss": 0.2588,
2806
+ "step": 395
2807
+ },
2808
+ {
2809
+ "epoch": 0.88,
2810
+ "grad_norm": 0.21465706083350347,
2811
+ "learning_rate": 0.00012907257139996704,
2812
+ "loss": 0.287,
2813
+ "step": 396
2814
+ },
2815
+ {
2816
+ "epoch": 0.88,
2817
+ "grad_norm": 0.1742663966229961,
2818
+ "learning_rate": 0.0001287195698563388,
2819
+ "loss": 0.2668,
2820
+ "step": 397
2821
+ },
2822
+ {
2823
+ "epoch": 0.88,
2824
+ "grad_norm": 0.20553702826350395,
2825
+ "learning_rate": 0.0001283661778334297,
2826
+ "loss": 0.2739,
2827
+ "step": 398
2828
+ },
2829
+ {
2830
+ "epoch": 0.88,
2831
+ "grad_norm": 0.16765771945192084,
2832
+ "learning_rate": 0.0001280124001360562,
2833
+ "loss": 0.2594,
2834
+ "step": 399
2835
+ },
2836
+ {
2837
+ "epoch": 0.89,
2838
+ "grad_norm": 0.1943563901020515,
2839
+ "learning_rate": 0.0001276582415742786,
2840
+ "loss": 0.2708,
2841
+ "step": 400
2842
+ },
2843
+ {
2844
+ "epoch": 0.89,
2845
+ "grad_norm": 0.1896178424172107,
2846
+ "learning_rate": 0.0001273037069633354,
2847
+ "loss": 0.2639,
2848
+ "step": 401
2849
+ },
2850
+ {
2851
+ "epoch": 0.89,
2852
+ "grad_norm": 1.0712870595212587,
2853
+ "learning_rate": 0.00012694880112357808,
2854
+ "loss": 0.2765,
2855
+ "step": 402
2856
+ },
2857
+ {
2858
+ "epoch": 0.89,
2859
+ "grad_norm": 0.2092081458414992,
2860
+ "learning_rate": 0.00012659352888040547,
2861
+ "loss": 0.2589,
2862
+ "step": 403
2863
+ },
2864
+ {
2865
+ "epoch": 0.89,
2866
+ "grad_norm": 0.21211010351459872,
2867
+ "learning_rate": 0.0001262378950641979,
2868
+ "loss": 0.285,
2869
+ "step": 404
2870
+ },
2871
+ {
2872
+ "epoch": 0.9,
2873
+ "grad_norm": 0.23377789093835202,
2874
+ "learning_rate": 0.00012588190451025207,
2875
+ "loss": 0.3038,
2876
+ "step": 405
2877
+ },
2878
+ {
2879
+ "epoch": 0.9,
2880
+ "grad_norm": 0.24727367430206143,
2881
+ "learning_rate": 0.00012552556205871478,
2882
+ "loss": 0.2577,
2883
+ "step": 406
2884
+ },
2885
+ {
2886
+ "epoch": 0.9,
2887
+ "grad_norm": 0.1800512798597796,
2888
+ "learning_rate": 0.00012516887255451735,
2889
+ "loss": 0.2392,
2890
+ "step": 407
2891
+ },
2892
+ {
2893
+ "epoch": 0.9,
2894
+ "grad_norm": 0.2114523591213948,
2895
+ "learning_rate": 0.00012481184084730976,
2896
+ "loss": 0.27,
2897
+ "step": 408
2898
+ },
2899
+ {
2900
+ "epoch": 0.91,
2901
+ "grad_norm": 0.22674225384993477,
2902
+ "learning_rate": 0.0001244544717913947,
2903
+ "loss": 0.2372,
2904
+ "step": 409
2905
+ },
2906
+ {
2907
+ "epoch": 0.91,
2908
+ "grad_norm": 0.17864125878915538,
2909
+ "learning_rate": 0.00012409677024566144,
2910
+ "loss": 0.2601,
2911
+ "step": 410
2912
+ },
2913
+ {
2914
+ "epoch": 0.91,
2915
+ "grad_norm": 0.1904684467485375,
2916
+ "learning_rate": 0.00012373874107352004,
2917
+ "loss": 0.2647,
2918
+ "step": 411
2919
+ },
2920
+ {
2921
+ "epoch": 0.91,
2922
+ "grad_norm": 0.17298512767214175,
2923
+ "learning_rate": 0.0001233803891428349,
2924
+ "loss": 0.2438,
2925
+ "step": 412
2926
+ },
2927
+ {
2928
+ "epoch": 0.91,
2929
+ "grad_norm": 0.17544690229663243,
2930
+ "learning_rate": 0.00012302171932585885,
2931
+ "loss": 0.2585,
2932
+ "step": 413
2933
+ },
2934
+ {
2935
+ "epoch": 0.92,
2936
+ "grad_norm": 0.20074139062085292,
2937
+ "learning_rate": 0.0001226627364991667,
2938
+ "loss": 0.2575,
2939
+ "step": 414
2940
+ },
2941
+ {
2942
+ "epoch": 0.92,
2943
+ "grad_norm": 0.18591883669880138,
2944
+ "learning_rate": 0.0001223034455435891,
2945
+ "loss": 0.2593,
2946
+ "step": 415
2947
+ },
2948
+ {
2949
+ "epoch": 0.92,
2950
+ "grad_norm": 0.20298533039339473,
2951
+ "learning_rate": 0.00012194385134414608,
2952
+ "loss": 0.2779,
2953
+ "step": 416
2954
+ },
2955
+ {
2956
+ "epoch": 0.92,
2957
+ "grad_norm": 0.19947012767946487,
2958
+ "learning_rate": 0.00012158395878998063,
2959
+ "loss": 0.2776,
2960
+ "step": 417
2961
+ },
2962
+ {
2963
+ "epoch": 0.93,
2964
+ "grad_norm": 0.18367461189103365,
2965
+ "learning_rate": 0.00012122377277429231,
2966
+ "loss": 0.2934,
2967
+ "step": 418
2968
+ },
2969
+ {
2970
+ "epoch": 0.93,
2971
+ "grad_norm": 0.18639410238342366,
2972
+ "learning_rate": 0.00012086329819427065,
2973
+ "loss": 0.2848,
2974
+ "step": 419
2975
+ },
2976
+ {
2977
+ "epoch": 0.93,
2978
+ "grad_norm": 0.18355116451736245,
2979
+ "learning_rate": 0.00012050253995102854,
2980
+ "loss": 0.2864,
2981
+ "step": 420
2982
+ },
2983
+ {
2984
+ "epoch": 0.93,
2985
+ "grad_norm": 0.16618761241876043,
2986
+ "learning_rate": 0.00012014150294953563,
2987
+ "loss": 0.2722,
2988
+ "step": 421
2989
+ },
2990
+ {
2991
+ "epoch": 0.93,
2992
+ "grad_norm": 0.1679981782550472,
2993
+ "learning_rate": 0.00011978019209855174,
2994
+ "loss": 0.2417,
2995
+ "step": 422
2996
+ },
2997
+ {
2998
+ "epoch": 0.94,
2999
+ "grad_norm": 0.17373986628880816,
3000
+ "learning_rate": 0.00011941861231055994,
3001
+ "loss": 0.2464,
3002
+ "step": 423
3003
+ },
3004
+ {
3005
+ "epoch": 0.94,
3006
+ "grad_norm": 0.16872090115264807,
3007
+ "learning_rate": 0.0001190567685016998,
3008
+ "loss": 0.2541,
3009
+ "step": 424
3010
+ },
3011
+ {
3012
+ "epoch": 0.94,
3013
+ "grad_norm": 0.17551340253440462,
3014
+ "learning_rate": 0.00011869466559170073,
3015
+ "loss": 0.2521,
3016
+ "step": 425
3017
+ },
3018
+ {
3019
+ "epoch": 0.94,
3020
+ "grad_norm": 0.18233468138593362,
3021
+ "learning_rate": 0.00011833230850381487,
3022
+ "loss": 0.2712,
3023
+ "step": 426
3024
+ },
3025
+ {
3026
+ "epoch": 0.95,
3027
+ "grad_norm": 0.18504780357931186,
3028
+ "learning_rate": 0.00011796970216475018,
3029
+ "loss": 0.2754,
3030
+ "step": 427
3031
+ },
3032
+ {
3033
+ "epoch": 0.95,
3034
+ "grad_norm": 0.18478336853897961,
3035
+ "learning_rate": 0.00011760685150460362,
3036
+ "loss": 0.2592,
3037
+ "step": 428
3038
+ },
3039
+ {
3040
+ "epoch": 0.95,
3041
+ "grad_norm": 0.19917097640828008,
3042
+ "learning_rate": 0.00011724376145679394,
3043
+ "loss": 0.2855,
3044
+ "step": 429
3045
+ },
3046
+ {
3047
+ "epoch": 0.95,
3048
+ "grad_norm": 0.19144241808727208,
3049
+ "learning_rate": 0.00011688043695799468,
3050
+ "loss": 0.2502,
3051
+ "step": 430
3052
+ },
3053
+ {
3054
+ "epoch": 0.95,
3055
+ "grad_norm": 0.18405239160044018,
3056
+ "learning_rate": 0.00011651688294806706,
3057
+ "loss": 0.2477,
3058
+ "step": 431
3059
+ },
3060
+ {
3061
+ "epoch": 0.96,
3062
+ "grad_norm": 0.1796721265270086,
3063
+ "learning_rate": 0.00011615310436999279,
3064
+ "loss": 0.249,
3065
+ "step": 432
3066
+ },
3067
+ {
3068
+ "epoch": 0.96,
3069
+ "grad_norm": 0.198923200273133,
3070
+ "learning_rate": 0.00011578910616980683,
3071
+ "loss": 0.2559,
3072
+ "step": 433
3073
+ },
3074
+ {
3075
+ "epoch": 0.96,
3076
+ "grad_norm": 0.2056661440897021,
3077
+ "learning_rate": 0.00011542489329653024,
3078
+ "loss": 0.2645,
3079
+ "step": 434
3080
+ },
3081
+ {
3082
+ "epoch": 0.96,
3083
+ "grad_norm": 0.21059350822537876,
3084
+ "learning_rate": 0.00011506047070210282,
3085
+ "loss": 0.2747,
3086
+ "step": 435
3087
+ },
3088
+ {
3089
+ "epoch": 0.97,
3090
+ "grad_norm": 0.19445339084850757,
3091
+ "learning_rate": 0.00011469584334131578,
3092
+ "loss": 0.2732,
3093
+ "step": 436
3094
+ },
3095
+ {
3096
+ "epoch": 0.97,
3097
+ "grad_norm": 0.1978300160069901,
3098
+ "learning_rate": 0.0001143310161717444,
3099
+ "loss": 0.2653,
3100
+ "step": 437
3101
+ },
3102
+ {
3103
+ "epoch": 0.97,
3104
+ "grad_norm": 0.1992099703929807,
3105
+ "learning_rate": 0.00011396599415368061,
3106
+ "loss": 0.2775,
3107
+ "step": 438
3108
+ },
3109
+ {
3110
+ "epoch": 0.97,
3111
+ "grad_norm": 0.20064532246856512,
3112
+ "learning_rate": 0.00011360078225006562,
3113
+ "loss": 0.2642,
3114
+ "step": 439
3115
+ },
3116
+ {
3117
+ "epoch": 0.97,
3118
+ "grad_norm": 0.19513907362313387,
3119
+ "learning_rate": 0.00011323538542642227,
3120
+ "loss": 0.2554,
3121
+ "step": 440
3122
+ },
3123
+ {
3124
+ "epoch": 0.98,
3125
+ "grad_norm": 0.191943659814926,
3126
+ "learning_rate": 0.00011286980865078763,
3127
+ "loss": 0.2778,
3128
+ "step": 441
3129
+ },
3130
+ {
3131
+ "epoch": 0.98,
3132
+ "grad_norm": 0.1921885618320068,
3133
+ "learning_rate": 0.0001125040568936456,
3134
+ "loss": 0.2734,
3135
+ "step": 442
3136
+ },
3137
+ {
3138
+ "epoch": 0.98,
3139
+ "grad_norm": 0.19140570504247198,
3140
+ "learning_rate": 0.00011213813512785898,
3141
+ "loss": 0.2809,
3142
+ "step": 443
3143
+ },
3144
+ {
3145
+ "epoch": 0.98,
3146
+ "grad_norm": 0.1772706721475436,
3147
+ "learning_rate": 0.00011177204832860213,
3148
+ "loss": 0.2558,
3149
+ "step": 444
3150
+ },
3151
+ {
3152
+ "epoch": 0.99,
3153
+ "grad_norm": 0.17875694766580855,
3154
+ "learning_rate": 0.00011140580147329338,
3155
+ "loss": 0.2569,
3156
+ "step": 445
3157
+ },
3158
+ {
3159
+ "epoch": 0.99,
3160
+ "grad_norm": 0.1891065652583873,
3161
+ "learning_rate": 0.000111039399541527,
3162
+ "loss": 0.271,
3163
+ "step": 446
3164
+ },
3165
+ {
3166
+ "epoch": 0.99,
3167
+ "grad_norm": 0.17377167224369017,
3168
+ "learning_rate": 0.00011067284751500583,
3169
+ "loss": 0.2711,
3170
+ "step": 447
3171
+ },
3172
+ {
3173
+ "epoch": 0.99,
3174
+ "grad_norm": 0.16281001225472738,
3175
+ "learning_rate": 0.00011030615037747353,
3176
+ "loss": 0.2464,
3177
+ "step": 448
3178
+ },
3179
+ {
3180
+ "epoch": 0.99,
3181
+ "grad_norm": 0.17949964198592142,
3182
+ "learning_rate": 0.0001099393131146466,
3183
+ "loss": 0.2514,
3184
+ "step": 449
3185
+ },
3186
+ {
3187
+ "epoch": 1.0,
3188
+ "grad_norm": 0.18463419257880492,
3189
+ "learning_rate": 0.00010957234071414674,
3190
+ "loss": 0.2799,
3191
+ "step": 450
3192
+ },
3193
+ {
3194
+ "epoch": 1.0,
3195
+ "grad_norm": 0.18137088353673406,
3196
+ "learning_rate": 0.00010920523816543309,
3197
+ "loss": 0.2615,
3198
+ "step": 451
3199
+ }
3200
+ ],
3201
+ "logging_steps": 1,
3202
+ "max_steps": 902,
3203
+ "num_input_tokens_seen": 0,
3204
+ "num_train_epochs": 2,
3205
+ "save_steps": 451,
3206
+ "total_flos": 1.317682551726696e+19,
3207
+ "train_batch_size": 4,
3208
+ "trial_name": null,
3209
+ "trial_params": null
3210
+ }
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b552ebcfa11608a328cb2f41b5014ee3e0856e9b78f1f4b8f616860c15d52362
3
+ size 7288
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-451/zero_to_fp32.py ADDED
@@ -0,0 +1,592 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example: python zero_to_fp32.py . pytorch_model.bin
14
+
15
+ import argparse
16
+ import torch
17
+ import glob
18
+ import math
19
+ import os
20
+ import re
21
+ from collections import OrderedDict
22
+ from dataclasses import dataclass
23
+
24
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
25
+ # DeepSpeed data structures it has to be available in the current python environment.
26
+ from deepspeed.utils import logger
27
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
28
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
29
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
30
+
31
+
32
@dataclass
class zero_model_state:
    """Container for the pieces of one ``*_model_states.pt`` file that ``parse_model_states`` extracts."""
    # buffer name -> tensor (converted to fp32 by parse_model_states)
    buffers: dict
    # value of the checkpoint's PARAM_SHAPES entry; the merge helpers iterate it as a sequence of
    # name -> shape mappings (presumably one mapping per param group - confirm against DeepSpeed)
    param_shapes: list
    # list of [key, value] pairs built from the checkpoint's "shared_params" mapping
    shared_params: list
    # value of the checkpoint's DS_VERSION entry, or None when the entry is absent
    # NOTE(review): annotated as int, but it is only ever printed as `deepspeed=={ds_version}` - confirm the type
    ds_version: int
    # name -> shape for frozen parameters, or None when the checkpoint has no such entry
    frozen_param_shapes: dict
    # name -> tensor for frozen parameters, or None when the checkpoint has no such entry
    frozen_param_fragments: dict
40
+
41
+
42
# Diagnostic switch: any truthy value turns on the extra `if debug:` prints in this script.
debug = 0

# Checkpoint tensors are mapped onto the CPU when loaded (passed as `map_location` to torch.load).
device = torch.device("cpu")
46
+
47
+
48
def atoi(text):
    """
    Convert ``text`` to an ``int`` when it consists solely of decimal digits, otherwise return it unchanged.

    ``str.isdecimal`` is used instead of ``str.isdigit``: ``isdigit`` is also true for characters such as
    superscript digits (e.g. ``'²'``), which ``int()`` rejects with ``ValueError``.

    Args:
        text: a string chunk (typically one piece produced by ``natural_keys``).

    Returns:
        ``int(text)`` for an all-decimal-digit string, else ``text`` itself (including the empty string).
    """
    return int(text) if text.isdecimal() else text
50
+
51
+
52
def natural_keys(text):
    """
    Sort key producing "human" ordering, where embedded numbers compare by value.

    The string is split into alternating non-digit / digit runs and every run is passed through
    ``atoi``, so digit runs become integers. Use as ``alist.sort(key=natural_keys)``.

    Reference: http://nedbatchelder.com/blog/200712/human_sorting.html
    (see Toothy's implementation in the comments)
    """
    chunks = re.split(r'(\d+)', text)
    return list(map(atoi, chunks))
59
+
60
+
61
def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the rank-0 model-states file inside ``checkpoint_dir``.

    Args:
        checkpoint_dir: directory expected to hold the model-states file.
        zero_stage: ZeRO stage number; it selects which file name is looked up.

    Returns:
        Path of the existing model-states file.

    Raises:
        FileNotFoundError: ``checkpoint_dir`` is not a directory, or the expected file is missing.
        ValueError: ``zero_stage`` is greater than 3 (previously this surfaced as an
            ``UnboundLocalError`` because no file name was ever assigned).
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # same message as the stage check in parse_optim_states
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
75
+
76
+
77
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """
    Return the files in ``checkpoint_dir`` matching ``glob_pattern``, ordered with ``natural_keys``.

    Raises:
        FileNotFoundError: nothing matched the pattern.
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    search_path = os.path.join(checkpoint_dir, glob_pattern)
    matches = glob.glob(search_path)

    if not matches:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    matches.sort(key=natural_keys)
    return matches
85
+
86
+
87
def get_optim_files(checkpoint_dir):
    """Return the ``*_optim_states.pt`` files found in ``checkpoint_dir`` (via ``get_checkpoint_files``)."""
    optim_pattern = "*_optim_states.pt"
    return get_checkpoint_files(checkpoint_dir, optim_pattern)
89
+
90
+
91
def get_model_state_files(checkpoint_dir):
    """Return the ``*_model_states.pt`` files found in ``checkpoint_dir`` (via ``get_checkpoint_files``)."""
    model_pattern = "*_model_states.pt"
    return get_checkpoint_files(checkpoint_dir, model_pattern)
93
+
94
+
95
def parse_model_states(files):
    """
    Load each model-states file and collect what is needed to rebuild the fp32 state dict.

    Args:
        files: paths of model-states checkpoint files.

    Returns:
        A list with one ``zero_model_state`` per input file, in input order.

    Raises:
        ValueError: a file has no ``BUFFER_NAMES`` entry, i.e. it is not a model state checkpoint.
    """
    zero_model_states = []
    for file in files:
        # NOTE(security): the checkpoints are pickled, so torch.load can execute code embedded in the
        # file - only run this script on checkpoints from a trusted source.
        state_dict = torch.load(file, map_location=device)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional: both entries stay None when the checkpoint lacks them
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if debug and frozen_param_shapes is not None:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states
139
+
140
+
141
def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load the optimizer-state files and pull out the fp32 flat groups of every rank.

    Args:
        files: paths of the ``*_optim_states.pt`` files.
        ds_checkpoint_dir: checkpoint directory, used only in the error message.

    Returns:
        Tuple ``(zero_stage, world_size, fp32_flat_groups)`` with one ``fp32_flat_groups`` entry per file.

    Raises:
        ValueError: the first file is not a zero checkpoint, the number of files differs from the
            recorded partition count, or the zero stage is not one this script handles.
    """
    total_files = len(files)

    state_dicts = []
    for path in files:
        loaded = torch.load(path, map_location=device)
        # only the fp32 master weights are needed, so drop the (potentially huge) nested optimizer
        # state right away; the default of None covers the case where another helper script has
        # already removed it
        loaded["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(loaded)

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are stored under a different key in each stage
    if zero_stage <= 2:
        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups
192
+
193
+
194
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
    """
    Rebuild and return the fp32 state_dict stored in a DeepSpeed checkpoint folder.

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)

    The optimizer files determine the zero stage and world size; the result is then produced by the
    stage-specific reconstruction routine.
    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
                                                                  ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # pick the stage-specific reconstruction routine
    if zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
    if zero_stage <= 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
217
+
218
+
219
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """
    Copy the frozen-parameter tensors recorded by the first model state into ``state_dict``.

    Only ``zero_model_states[0]`` is consulted. Nothing happens when it has no frozen parameter
    shapes (``None`` or empty). ``state_dict`` is updated in place; progress is printed.
    """
    first_state = zero_model_states[0]
    if first_state.frozen_param_shapes is None or len(first_state.frozen_param_shapes) == 0:
        return

    frozen_param_shapes = first_state.frozen_param_shapes
    frozen_param_fragments = first_state.frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    params_done = 0
    numel_done = 0
    for param_name, full_shape in frozen_param_shapes.items():
        params_done += 1
        param_numel = full_shape.numel()
        numel_done += param_numel

        # the stored fragment is used as-is for the parameter
        state_dict[param_name] = frozen_param_fragments[param_name]

        if debug:
            print(f"{param_name} full shape: {full_shape} unpartitioned numel {param_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {params_done} params {numel_done} elements")
249
+
250
+
251
+ def _has_callable(obj, fn):
252
+ attr = getattr(obj, fn, None)
253
+ return callable(attr)
254
+
255
+
256
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild the trainable parameters from the per-rank fp32 flat groups and store them in ``state_dict``.

    For every param group the partitions of all ranks are concatenated into one flat vector; each
    parameter listed in ``zero_model_states[0].param_shapes`` is then cut out of that vector at a
    running offset and viewed with its recorded shape. ``state_dict`` is updated in place.

    Raises:
        ValueError: after aligning both to ``2 * world_size``, the number of consumed elements
            differs from the number of available elements of a group.
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for rank in range(world_size):
            for group in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{rank}][{group}].shape={fp32_flat_groups[rank][group].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_groups = [
        torch.cat([rank_groups[group] for rank_groups in fp32_flat_groups], 0) for group in range(num_param_groups)
    ]
    avail_numel = sum(flat.numel() for flat in merged_groups)

    if debug:
        wanted_params = sum(len(shapes) for shapes in param_shapes)
        wanted_numel = sum(sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes)
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, flat_vector in zip(param_shapes, merged_groups):
        offset = 0
        avail_numel = flat_vector.numel()
        for name, shape in shapes.items():
            # a shape either exposes numel() itself or is a plain sequence of dimensions
            if _has_callable(shape, 'numel'):
                unpartitioned_numel = shape.numel()
            else:
                unpartitioned_numel = math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = flat_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
        # live optimizer object, so we are checking that the numbers are within the right range
        align_to = 2 * world_size

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        # round both counts up to the next multiple of align_to
        offset = align_to * math.ceil(offset / align_to)
        avail_numel = align_to * math.ceil(avail_numel / align_to)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
327
+
328
+
329
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
    """Assemble the consolidated fp32 ``state_dict`` for a ZeRO-2 checkpoint.

    The result is filled in this order: rank 0's buffers, frozen params, trainable params, and
    finally aliases for shared parameters.

    Args:
        - ``world_size``: number of ranks that produced the checkpoint
        - ``fp32_flat_groups``: per-rank flat fp32 groups, forwarded to the trainable-param merge
        - ``zero_model_states``: per-rank model states; buffers and shared params come from rank 0

    Returns:
        - an ``OrderedDict`` mapping names to tensors
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers
    buffers = rank0_state.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    _zero2_merge_frozen_params(state_dict, zero_model_states)
    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: each pair is (alias name, name it shares storage with);
    # the alias is only added when its source made it into the state dict
    for pair in rank0_state.shared_params:
        alias, source = pair[0], pair[1]
        if source in state_dict:
            state_dict[alias] = state_dict[source]

    return state_dict
348
+
349
+
350
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Compute how a parameter of ``unpartitioned_numel`` elements is split across ranks in ZeRO-3.

    Args:
        - ``unpartitioned_numel``: total number of elements of the full parameter
        - ``world_size``: number of ranks the parameter is partitioned over

    Returns:
        - ``(partitioned_numel, padding_numel)``: the per-rank element count
          (``unpartitioned_numel / world_size`` rounded up) and the number of padding elements
          needed to make the total a multiple of ``world_size`` (``0`` when it already is)

    Raises:
        - ``ZeroDivisionError``: if ``world_size`` is ``0``
    """
    # Exact integer arithmetic: the float division in ``math.ceil(a / b)`` silently loses
    # precision once the element count exceeds 2**53.
    quotient, remainder = divmod(unpartitioned_numel, world_size)
    if remainder:
        return quotient + 1, world_size - remainder
    return quotient, 0
355
+
356
+
357
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """Reconstruct the frozen parameters of a ZeRO-3 checkpoint into ``state_dict``.

    Each frozen parameter is rebuilt by concatenating its fragment from every rank's model state,
    trimming the result to the parameter's real element count, and viewing it with its full shape.
    Nothing is done when rank 0 reports no frozen parameter shapes.

    Args:
        - ``state_dict``: mapping that is updated in place with ``name -> tensor`` entries
        - ``world_size``: number of ranks
        - ``zero_model_states``: per-rank model states providing ``frozen_param_shapes`` (rank 0)
          and ``frozen_param_fragments`` (every rank)
    """
    rank0_shapes = zero_model_states[0].frozen_param_shapes
    if rank0_shapes is None or len(rank0_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

        wanted_params = len(rank0_shapes)
        wanted_numel = sum(s.numel() for s in rank0_shapes.values())
        # estimate: rank 0's fragment sizes multiplied by the number of ranks
        avail_numel = sum(p.numel() for p in zero_model_states[0].frozen_param_fragments.values()) * world_size
        print(f'Frozen params: Have {avail_numel} numels to process.')
        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in rank0_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # join the per-rank fragments, then drop whatever lies past the real element count
        fragments = [model_state.frozen_param_fragments[name] for model_state in zero_model_states]
        joined = torch.cat(fragments, 0)
        state_dict[name] = joined.narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
391
+
392
+
393
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Reconstruct the trainable fp32 parameters of a ZeRO-3 checkpoint into ``state_dict``.

    Reconstruction protocol: for ZeRO-3 the partitions are zipped together at the boundary of each
    param. For every parameter, the slice ``[offset, offset + partitioned_numel)`` is taken from each
    rank's flat tensor, the slices are concatenated in rank order, and the result is trimmed to the
    parameter's real element count (dropping padding, if any) and viewed with its full shape.

    Args:
        - ``state_dict``: mapping that is updated in place with ``name -> tensor`` entries
        - ``world_size``: number of ranks
        - ``fp32_flat_groups``: indexed by rank, each entry a single flat tensor
        - ``zero_model_states``: per-rank model states; only ``zero_model_states[0].param_shapes`` is read

    Raises:
        - ``ValueError``: if the elements consumed across all ranks differ from
          ``fp32_flat_groups[0].numel() * world_size``
    """
    shape_groups = zero_model_states[0].param_shapes
    avail_numel = fp32_flat_groups[0].numel() * world_size

    # merge list of dicts, preserving order
    param_shapes = {}
    for group in shape_groups:
        for key, value in group.items():
            param_shapes[key] = value

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

        wanted_params = len(param_shapes)
        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Trainable params: Have {avail_numel} numels to process.")
        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        per_rank_slices = tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size))
        gathered = torch.cat(per_rank_slices, 0)
        state_dict[name] = gathered.narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # offset counted elements within one rank's flat tensor; scale it to all ranks
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
445
+
446
+
447
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
    """Assemble the consolidated fp32 ``state_dict`` for a ZeRO-3 checkpoint.

    The result is filled in this order: rank 0's buffers, frozen params, trainable params, and
    finally aliases for shared parameters.

    Args:
        - ``world_size``: number of ranks that produced the checkpoint
        - ``fp32_flat_groups``: per-rank flat fp32 tensors, forwarded to the trainable-param merge
        - ``zero_model_states``: per-rank model states; buffers and shared params come from rank 0

    Returns:
        - an ``OrderedDict`` mapping names to tensors
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers
    buffers = rank0_state.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: each pair is (alias name, name it shares storage with);
    # the alias is only added when its source made it into the state dict
    for pair in rank0_state.shared_params:
        alias, source = pair[0], pair[1]
        if source in state_dict:
            state_dict[alias] = state_dict[source]

    return state_dict
466
+
467
+
468
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
    """
    Convert a ZeRO 2 or 3 checkpoint into a single consolidated fp32 ``state_dict`` that can be
    passed to ``load_state_dict()``, used for training without DeepSpeed, or shared with others,
    for example via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for the checkpoint, e.g. ``global_step14``.
          If not provided, the tag is read from the file named ``latest`` inside ``checkpoint_dir``

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: ``tag`` was not given and ``checkpoint_dir`` has no ``latest`` file
        - ``FileNotFoundError``: the folder ``checkpoint_dir/tag`` does not exist

    Note: this approach may not work if your application doesn't have sufficient free CPU memory;
    in that case use the offline approach with the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application, i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        # fall back to the tag recorded in the 'latest' marker file
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
515
+
516
+
517
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
    """
    Convert a ZeRO 2 or 3 checkpoint into a single consolidated fp32 ``state_dict`` file that can
    be loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without
    DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder (the one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for the checkpoint, e.g. ``global_step14``.
          If not provided, the tag is read from the file named ``latest`` in the checkpoint folder
    """
    # consolidate first, so the "Saving" message is only printed once extraction has succeeded
    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    print(f"Saving fp32 state dict to {output_file}")
    torch.save(fp32_weights, output_file)
531
+
532
+
533
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note that once this has been run, the ``model`` will no longer be usable in the deepspeed
    context of the same application, i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    # plain string literals: these messages have no placeholders, so no f-prefix is needed
    logger.info("Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    model = model.cpu()
    # strict=False: keys that do not match between the model and the state dict are tolerated
    model.load_state_dict(state_dict, strict=False)

    return model
570
+
571
+
572
if __name__ == "__main__":
    # Command-line entry point: convert the checkpoint found under ``checkpoint_dir`` into a
    # single fp32 state_dict file written to ``output_file``.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "checkpoint_dir",
        type=str,
        help="path to the desired checkpoint folder, e.g., path/checkpoint-12",
    )
    parser.add_argument(
        "output_file",
        type=str,
        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)",
    )
    parser.add_argument(
        "-t",
        "--tag",
        type=str,
        default=None,
        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1",
    )
    parser.add_argument(
        "-d",
        "--debug",
        action='store_true',
        help="enable debug",
    )
    args = parser.parse_args()

    # module-level flag consulted by the merge helpers (``if debug:``) to print extra details
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: deepseek-ai/deepseek-coder-33b-instruct
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "deepseek-ai/deepseek-coder-33b-instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": null,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 1024,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 1024,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "gate_proj",
25
+ "k_proj",
26
+ "up_proj",
27
+ "q_proj",
28
+ "down_proj",
29
+ "v_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbbadbc4bf37e0222adccdf3d2d066f81dc6d0efb7afe26f84b75ecaa117586d
3
+ size 15765462656
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5086401edf65104d1bcf5d744e1263cb25b005d20daa1a8cc75d48371471038f
3
+ size 11824020348
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e47a9e394930a95f340d95cfef65ed7714d27cbc3d6beca582dc06352e1e508c
3
+ size 11824020540
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7ec27de69ce70b2754b7633cc97a7986ea4f9db12778e4f944ba843a6e4fe25
3
+ size 11824020476
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:217c443cf640fdecaf22c834225533e81fd121134f107acdd54259f7109f1c06
3
+ size 11824020028
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c2d16b4eee453bec2af1483e26cb0b005c17a7d8616db3a778140a04f56d27a
3
+ size 11824020476
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18544f59b04c982b21b048d852b437a918aab4a68aca7d654cb4c4c2c8648d05
3
+ size 11824020604
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9812bbbc58d8ef9a84ec6e21adb5049ca812e975b607bb297b303f4c3992d722
3
+ size 11824020476
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9adbbd4b5b7eacbd6b6beb2c9dee1012be19e39b615145de02373331ca7f8db
3
+ size 11824020028
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/global_step902/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d3e12c7bdac60ad4ee1180532ca232aa8d25ad01b8acbfcb3d9d2adc2413747
3
+ size 15781541500
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step902
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c182cf493d9a8058062c1e747325e0e5dd031ae4ff9bbd18f4c10c399af56e53
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcb90a861818194df582d66fa9fafbaa624cead2e57c1a7307ef83dc83beffe1
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8f15ffb9acdb56b0f31adb982eb91c7de346d1f4a1befb5a9f15401c52aaec7
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:169980460024c37a998fb6e4e78282a2d3605625e536060dc505d00385e220df
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a96e7e0e55477b4c628620559ece87571fa101ef2644938d6ce0d70f9c4f3eca
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_gpt35i_lr_0.0002_alpha_1024_r_1024/checkpoint-902/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70df896ab6205936d69533ca98e1213326e93a1b05511ff51bc549103e435543
3
+ size 15984