xu3kev committed on
Commit
7555715
1 Parent(s): 06386c6

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/README.md +152 -0
  2. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/adapter_config.json +34 -0
  3. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/adapter_model.bin +3 -0
  4. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/README.md +202 -0
  5. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/adapter_config.json +34 -0
  6. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/adapter_model.safetensors +3 -0
  7. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  8. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  9. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  10. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  11. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  12. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  13. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  14. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  15. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/mp_rank_00_model_states.pt +3 -0
  16. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/latest +1 -0
  17. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/rng_state_0.pth +3 -0
  18. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/rng_state_1.pth +3 -0
  19. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/rng_state_2.pth +3 -0
  20. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/rng_state_3.pth +3 -0
  21. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/rng_state_4.pth +3 -0
  22. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/rng_state_5.pth +3 -0
  23. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/rng_state_6.pth +3 -0
  24. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/rng_state_7.pth +3 -0
  25. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/scheduler.pt +3 -0
  26. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/special_tokens_map.json +23 -0
  27. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/tokenizer.json +0 -0
  28. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/tokenizer_config.json +193 -0
  29. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/trainer_state.json +2357 -0
  30. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/training_args.bin +3 -0
  31. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/zero_to_fp32.py +592 -0
  32. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/README.md +202 -0
  33. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/adapter_config.json +34 -0
  34. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/adapter_model.safetensors +3 -0
  35. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  36. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  37. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  38. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  39. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  40. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  41. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  42. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  43. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/mp_rank_00_model_states.pt +3 -0
  44. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/latest +1 -0
  45. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/rng_state_0.pth +3 -0
  46. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/rng_state_1.pth +3 -0
  47. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/rng_state_2.pth +3 -0
  48. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/rng_state_3.pth +3 -0
  49. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/rng_state_4.pth +3 -0
  50. gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/rng_state_5.pth +3 -0
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/README.md ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ library_name: peft
4
+ tags:
5
+ - generated_from_trainer
6
+ base_model: deepseek-ai/deepseek-coder-33b-instruct
7
+ model-index:
8
+ - name: lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
16
+ <details><summary>See axolotl config</summary>
17
+
18
+ axolotl version: `0.4.0`
19
+ ```yaml
20
+ adapter: lora
21
+ base_model: deepseek-ai/deepseek-coder-33b-instruct
22
+ bf16: auto
23
+ dataset_prepared_path: ./logo_ds_preprocess_list_gpt35
24
+ datasets:
25
+ - path: ../logo/fix_deepseek_synthetic_training_data_full.jsonl
26
+ type:
27
+ field_instruction: input
28
+ field_output: output
29
+ format: '### Instruction:
30
+
31
+ {input}
32
+
33
+ ### Response:
34
+
35
+ '
36
+ no_input_format: '{instruction}'
37
+ debug: null
38
+ deepspeed: ./deepspeed_configs/zero2.json
39
+ early_stopping_patience: null
40
+ eval_sample_packing: true
41
+ evals_per_epoch: 4
42
+ flash_attention: true
43
+ fp16: null
44
+ fsdp: null
45
+ fsdp_config: null
46
+ gradient_accumulation_steps: 2
47
+ gradient_checkpointing: true
48
+ group_by_length: false
49
+ is_llama_derived_model: true
50
+ learning_rate: 0.0002
51
+ load_in_4bit: false
52
+ load_in_8bit: true
53
+ local_rank: null
54
+ logging_steps: 1
55
+ lora_alpha: 512
56
+ lora_dropout: 0.05
57
+ lora_fan_in_fan_out: null
58
+ lora_model_dir: null
59
+ lora_r: 512
60
+ lora_target_linear: true
61
+ lr_scheduler: cosine
62
+ micro_batch_size: 4
63
+ model_type: AutoModelForCausalLM
64
+ num_epochs: 2
65
+ optimizer: adamw_bnb_8bit
66
+ output_dir: ./lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512
67
+ pad_to_sequence_len: true
68
+ resume_from_checkpoint: null
69
+ s2_attention: null
70
+ sample_packing: true
71
+ saves_per_epoch: 1
72
+ sequence_len: 1800
73
+ special_tokens:
74
+ bos_token: "<\uFF5Cbegin\u2581of\u2581sentence\uFF5C>"
75
+ eos_token: <|EOT|>
76
+ strict: true
77
+ tf32: false
78
+ tokenizer_type: AutoTokenizer
79
+ train_on_inputs: false
80
+ val_set_size: 0.05
81
+ wandb_entity: null
82
+ wandb_log_model: null
83
+ wandb_name: logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512
84
+ wandb_project: pbe-axo
85
+ wandb_watch: null
86
+ warmup_steps: 50
87
+ weight_decay: 0.0
88
+ xformers_attention: null
89
+
90
+ ```
91
+
92
+ </details><br>
93
+
94
+ # lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512
95
+
96
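+ This model is a fine-tuned version of [deepseek-ai/deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct) on a local synthetic dataset (`../logo/fix_deepseek_synthetic_training_data_full.jsonl`, as listed in the axolotl config above).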
97
+ It achieves the following results on the evaluation set:
98
+ - Loss: 0.3745
99
+
100
+ ## Model description
101
+
102
+ More information needed
103
+
104
+ ## Intended uses & limitations
105
+
106
+ More information needed
107
+
108
+ ## Training and evaluation data
109
+
110
+ More information needed
111
+
112
+ ## Training procedure
113
+
114
+ ### Training hyperparameters
115
+
116
+ The following hyperparameters were used during training:
117
+ - learning_rate: 0.0002
118
+ - train_batch_size: 4
119
+ - eval_batch_size: 4
120
+ - seed: 42
121
+ - distributed_type: multi-GPU
122
+ - num_devices: 8
123
+ - gradient_accumulation_steps: 2
124
+ - total_train_batch_size: 64
125
+ - total_eval_batch_size: 32
126
+ - optimizer: 8-bit AdamW (`adamw_bnb_8bit`, per the axolotl config) with betas=(0.9,0.999) and epsilon=1e-08
127
+ - lr_scheduler_type: cosine
128
+ - lr_scheduler_warmup_steps: 50
129
+ - num_epochs: 2
130
+
131
+ ### Training results
132
+
133
+ | Training Loss | Epoch | Step | Validation Loss |
134
+ |:-------------:|:-----:|:----:|:---------------:|
135
+ | 1.8824 | 0.0 | 1 | 1.9415 |
136
+ | 0.4244 | 0.25 | 82 | 0.4359 |
137
+ | 0.4116 | 0.5 | 164 | 0.4136 |
138
+ | 0.4143 | 0.75 | 246 | 0.4024 |
139
+ | 0.3856 | 1.0 | 328 | 0.3903 |
140
+ | 0.3595 | 1.23 | 410 | 0.3845 |
141
+ | 0.3511 | 1.47 | 492 | 0.3793 |
142
+ | 0.3728 | 1.72 | 574 | 0.3756 |
143
+ | 0.3314 | 1.97 | 656 | 0.3745 |
144
+
145
+
146
+ ### Framework versions
147
+
148
+ - PEFT 0.10.0
149
+ - Transformers 4.40.0.dev0
150
+ - Pytorch 2.1.2+cu121
151
+ - Datasets 2.15.0
152
+ - Tokenizers 0.15.0
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "deepseek-ai/deepseek-coder-33b-instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": null,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 512,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 512,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "gate_proj",
24
+ "v_proj",
25
+ "down_proj",
26
+ "up_proj",
27
+ "o_proj",
28
+ "q_proj",
29
+ "k_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:810f90d58d8d95e32df14a90c77b0090deaacf5138d8fba196b5ac1f7592c597
3
+ size 7882811310
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: deepseek-ai/deepseek-coder-33b-instruct
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "deepseek-ai/deepseek-coder-33b-instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": null,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 512,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 512,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "gate_proj",
24
+ "v_proj",
25
+ "down_proj",
26
+ "up_proj",
27
+ "o_proj",
28
+ "q_proj",
29
+ "k_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d0d9e93115b81e46028f1cd37c60fc8e373f67ffa1899ba63f328c3a61cf2c1
3
+ size 7882790952
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0cf66ad7187aca241c6d073ffe94ecdd2c13451c7f6d50c49eed84d96b07662
3
+ size 5912017776
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d426b486e5f1051e174ded2c6ded63b5b62e37c432dd9a5aeba5e48bd1629ec
3
+ size 5912017968
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c93c4849fc33629c795e5c85506e989bc7aca84e568fc4b199f7d4b759701a1
3
+ size 5912017904
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7e9c528ed3437ba9f639c3bbcf956a81f33c1a300fedf779a972b8d2322625d
3
+ size 5912017456
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8782441998d87f02c57949224b07ba0af30b4c1cde3c367c30e2f997a6f80652
3
+ size 5912017904
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcf8a092935465003e6241abf4bb309937a6e8b238f491278e9a92316c4c2ab0
3
+ size 5912018032
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3337d39ad278f1f31bf7c75203f9b0c96554348b9bbd8190f2f5b598aa85a0f1
3
+ size 5912017904
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26517fd5c5d7f332c4a4bea48122fd997ed522e74d817e5c5e152b7d2c90d1b0
3
+ size 5912017456
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/global_step328/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62e3cb9ce974fb1de83091394aaadfa4f65bf3291aaa064e0e2d8bcb9d55b898
3
+ size 7898870908
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step328
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a352c182b9710826f0022a36cda6c73ec3f55fb8a9f7d1bf5457aa604b227543
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf162381c8eda30138a5cc4f06ba60af2992dcc796d94ebb76768cbe5002a42c
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:650434f97263f72a07ae9b01a2538795aabe4478c250f67ab7abe050c22b1da5
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5170c6239bf02e19975139f52918f0cbfa17c56afb857c7f50779b54a80e439a
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cb35e799a14b30fb847e0d02d064425a25f5ce1e4ecde2fdff7ab27a96e7cfa
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e729767affb0a6530ff09e29abac3e6c0b93fcdff168b1e976df6fa26c8aea6e
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e2e71e48f9c2474cb68cbe667b43fe7cc3c8c58d5a529047bffbb8bff31ca4e
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95a9cb685e0b146fb5f5b0b6d1320bc23add31072c50c1ea02cc210946d41a88
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4444a078a732932553a8ad0a46109c3c1f8f5e5a23dd95a7245c651cd75b5d02
3
+ size 1064
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin▁of▁sentence|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|EOT|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|end▁of▁sentence|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/tokenizer_config.json ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "32000": {
6
+ "content": "õ",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": false
12
+ },
13
+ "32001": {
14
+ "content": "÷",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": false
20
+ },
21
+ "32002": {
22
+ "content": "Á",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "32003": {
30
+ "content": "ý",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": false
36
+ },
37
+ "32004": {
38
+ "content": "À",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "32005": {
46
+ "content": "ÿ",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "32006": {
54
+ "content": "ø",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "32007": {
62
+ "content": "ú",
63
+ "lstrip": false,
64
+ "normalized": true,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "32008": {
70
+ "content": "þ",
71
+ "lstrip": false,
72
+ "normalized": true,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "32009": {
78
+ "content": "ü",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "32010": {
86
+ "content": "ù",
87
+ "lstrip": false,
88
+ "normalized": true,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "32011": {
94
+ "content": "ö",
95
+ "lstrip": false,
96
+ "normalized": true,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "32012": {
102
+ "content": "û",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "32013": {
110
+ "content": "<|begin▁of▁sentence|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "32014": {
118
+ "content": "<|end▁of▁sentence|>",
119
+ "lstrip": false,
120
+ "normalized": true,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "32015": {
126
+ "content": "<|fim▁hole|>",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "32016": {
134
+ "content": "<|fim▁begin|>",
135
+ "lstrip": false,
136
+ "normalized": true,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "32017": {
142
+ "content": "<|fim▁end|>",
143
+ "lstrip": false,
144
+ "normalized": true,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "32018": {
150
+ "content": "<pad>",
151
+ "lstrip": false,
152
+ "normalized": true,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "32019": {
158
+ "content": "<|User|>",
159
+ "lstrip": false,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "32020": {
166
+ "content": "<|Assistant|>",
167
+ "lstrip": false,
168
+ "normalized": true,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "32021": {
174
+ "content": "<|EOT|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": true
180
+ }
181
+ },
182
+ "bos_token": "<|begin▁of▁sentence|>",
183
+ "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}",
184
+ "clean_up_tokenization_spaces": false,
185
+ "eos_token": "<|EOT|>",
186
+ "legacy": true,
187
+ "model_max_length": 16384,
188
+ "pad_token": "<|end▁of▁sentence|>",
189
+ "sp_model_kwargs": {},
190
+ "tokenizer_class": "LlamaTokenizer",
191
+ "unk_token": null,
192
+ "use_default_system_prompt": false
193
+ }
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/trainer_state.json ADDED
@@ -0,0 +1,2357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9984779299847792,
5
+ "eval_steps": 82,
6
+ "global_step": 328,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "grad_norm": 0.7203346695870653,
14
+ "learning_rate": 4.000000000000001e-06,
15
+ "loss": 1.8824,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.0,
20
+ "eval_loss": 1.9414628744125366,
21
+ "eval_runtime": 123.2242,
22
+ "eval_samples_per_second": 12.871,
23
+ "eval_steps_per_second": 0.406,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.01,
28
+ "grad_norm": 0.8066382973847509,
29
+ "learning_rate": 8.000000000000001e-06,
30
+ "loss": 2.0085,
31
+ "step": 2
32
+ },
33
+ {
34
+ "epoch": 0.01,
35
+ "grad_norm": 0.6936845903233333,
36
+ "learning_rate": 1.2e-05,
37
+ "loss": 1.848,
38
+ "step": 3
39
+ },
40
+ {
41
+ "epoch": 0.01,
42
+ "grad_norm": 0.7777025171228136,
43
+ "learning_rate": 1.6000000000000003e-05,
44
+ "loss": 1.8716,
45
+ "step": 4
46
+ },
47
+ {
48
+ "epoch": 0.02,
49
+ "grad_norm": 0.7156607956242002,
50
+ "learning_rate": 2e-05,
51
+ "loss": 1.7856,
52
+ "step": 5
53
+ },
54
+ {
55
+ "epoch": 0.02,
56
+ "grad_norm": 0.6445740194290953,
57
+ "learning_rate": 2.4e-05,
58
+ "loss": 1.5502,
59
+ "step": 6
60
+ },
61
+ {
62
+ "epoch": 0.02,
63
+ "grad_norm": 0.6763034633014766,
64
+ "learning_rate": 2.8000000000000003e-05,
65
+ "loss": 1.427,
66
+ "step": 7
67
+ },
68
+ {
69
+ "epoch": 0.02,
70
+ "grad_norm": 0.6361172883549601,
71
+ "learning_rate": 3.2000000000000005e-05,
72
+ "loss": 1.2594,
73
+ "step": 8
74
+ },
75
+ {
76
+ "epoch": 0.03,
77
+ "grad_norm": 0.4873251442447332,
78
+ "learning_rate": 3.6e-05,
79
+ "loss": 1.0563,
80
+ "step": 9
81
+ },
82
+ {
83
+ "epoch": 0.03,
84
+ "grad_norm": 0.9955711639466034,
85
+ "learning_rate": 4e-05,
86
+ "loss": 0.9543,
87
+ "step": 10
88
+ },
89
+ {
90
+ "epoch": 0.03,
91
+ "grad_norm": 0.44869550002102815,
92
+ "learning_rate": 4.4000000000000006e-05,
93
+ "loss": 0.8686,
94
+ "step": 11
95
+ },
96
+ {
97
+ "epoch": 0.04,
98
+ "grad_norm": 0.3476322680141856,
99
+ "learning_rate": 4.8e-05,
100
+ "loss": 0.779,
101
+ "step": 12
102
+ },
103
+ {
104
+ "epoch": 0.04,
105
+ "grad_norm": 0.2946964438123855,
106
+ "learning_rate": 5.2000000000000004e-05,
107
+ "loss": 0.6666,
108
+ "step": 13
109
+ },
110
+ {
111
+ "epoch": 0.04,
112
+ "grad_norm": 0.23276548871050887,
113
+ "learning_rate": 5.6000000000000006e-05,
114
+ "loss": 0.6025,
115
+ "step": 14
116
+ },
117
+ {
118
+ "epoch": 0.05,
119
+ "grad_norm": 0.17634054850907374,
120
+ "learning_rate": 6e-05,
121
+ "loss": 0.563,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.05,
126
+ "grad_norm": 0.17254736906459509,
127
+ "learning_rate": 6.400000000000001e-05,
128
+ "loss": 0.558,
129
+ "step": 16
130
+ },
131
+ {
132
+ "epoch": 0.05,
133
+ "grad_norm": 0.1816173148836372,
134
+ "learning_rate": 6.800000000000001e-05,
135
+ "loss": 0.5486,
136
+ "step": 17
137
+ },
138
+ {
139
+ "epoch": 0.05,
140
+ "grad_norm": 0.18575965325903093,
141
+ "learning_rate": 7.2e-05,
142
+ "loss": 0.5235,
143
+ "step": 18
144
+ },
145
+ {
146
+ "epoch": 0.06,
147
+ "grad_norm": 0.15455046469681186,
148
+ "learning_rate": 7.6e-05,
149
+ "loss": 0.5642,
150
+ "step": 19
151
+ },
152
+ {
153
+ "epoch": 0.06,
154
+ "grad_norm": 0.1586319562620695,
155
+ "learning_rate": 8e-05,
156
+ "loss": 0.5296,
157
+ "step": 20
158
+ },
159
+ {
160
+ "epoch": 0.06,
161
+ "grad_norm": 0.13858657550627276,
162
+ "learning_rate": 8.4e-05,
163
+ "loss": 0.4948,
164
+ "step": 21
165
+ },
166
+ {
167
+ "epoch": 0.07,
168
+ "grad_norm": 0.14394025858439913,
169
+ "learning_rate": 8.800000000000001e-05,
170
+ "loss": 0.5314,
171
+ "step": 22
172
+ },
173
+ {
174
+ "epoch": 0.07,
175
+ "grad_norm": 0.11233355109777812,
176
+ "learning_rate": 9.200000000000001e-05,
177
+ "loss": 0.4759,
178
+ "step": 23
179
+ },
180
+ {
181
+ "epoch": 0.07,
182
+ "grad_norm": 0.1064765648066197,
183
+ "learning_rate": 9.6e-05,
184
+ "loss": 0.4779,
185
+ "step": 24
186
+ },
187
+ {
188
+ "epoch": 0.08,
189
+ "grad_norm": 0.10971288771683063,
190
+ "learning_rate": 0.0001,
191
+ "loss": 0.5432,
192
+ "step": 25
193
+ },
194
+ {
195
+ "epoch": 0.08,
196
+ "grad_norm": 0.10634648696354684,
197
+ "learning_rate": 0.00010400000000000001,
198
+ "loss": 0.5006,
199
+ "step": 26
200
+ },
201
+ {
202
+ "epoch": 0.08,
203
+ "grad_norm": 0.11003325135876459,
204
+ "learning_rate": 0.00010800000000000001,
205
+ "loss": 0.4854,
206
+ "step": 27
207
+ },
208
+ {
209
+ "epoch": 0.09,
210
+ "grad_norm": 0.16688339973054586,
211
+ "learning_rate": 0.00011200000000000001,
212
+ "loss": 0.4548,
213
+ "step": 28
214
+ },
215
+ {
216
+ "epoch": 0.09,
217
+ "grad_norm": 0.112873717299172,
218
+ "learning_rate": 0.000116,
219
+ "loss": 0.4637,
220
+ "step": 29
221
+ },
222
+ {
223
+ "epoch": 0.09,
224
+ "grad_norm": 0.10229636034261401,
225
+ "learning_rate": 0.00012,
226
+ "loss": 0.4782,
227
+ "step": 30
228
+ },
229
+ {
230
+ "epoch": 0.09,
231
+ "grad_norm": 0.11176276273674135,
232
+ "learning_rate": 0.000124,
233
+ "loss": 0.5223,
234
+ "step": 31
235
+ },
236
+ {
237
+ "epoch": 0.1,
238
+ "grad_norm": 0.11584883864473663,
239
+ "learning_rate": 0.00012800000000000002,
240
+ "loss": 0.4187,
241
+ "step": 32
242
+ },
243
+ {
244
+ "epoch": 0.1,
245
+ "grad_norm": 0.08376085678053431,
246
+ "learning_rate": 0.000132,
247
+ "loss": 0.4819,
248
+ "step": 33
249
+ },
250
+ {
251
+ "epoch": 0.1,
252
+ "grad_norm": 0.08394222532536022,
253
+ "learning_rate": 0.00013600000000000003,
254
+ "loss": 0.4558,
255
+ "step": 34
256
+ },
257
+ {
258
+ "epoch": 0.11,
259
+ "grad_norm": 0.08730185448660513,
260
+ "learning_rate": 0.00014,
261
+ "loss": 0.5022,
262
+ "step": 35
263
+ },
264
+ {
265
+ "epoch": 0.11,
266
+ "grad_norm": 0.07280665284119933,
267
+ "learning_rate": 0.000144,
268
+ "loss": 0.4709,
269
+ "step": 36
270
+ },
271
+ {
272
+ "epoch": 0.11,
273
+ "grad_norm": 0.07480352067161859,
274
+ "learning_rate": 0.000148,
275
+ "loss": 0.4421,
276
+ "step": 37
277
+ },
278
+ {
279
+ "epoch": 0.12,
280
+ "grad_norm": 0.07355901855302331,
281
+ "learning_rate": 0.000152,
282
+ "loss": 0.4393,
283
+ "step": 38
284
+ },
285
+ {
286
+ "epoch": 0.12,
287
+ "grad_norm": 0.0717117924993047,
288
+ "learning_rate": 0.00015600000000000002,
289
+ "loss": 0.4793,
290
+ "step": 39
291
+ },
292
+ {
293
+ "epoch": 0.12,
294
+ "grad_norm": 0.06778395499705651,
295
+ "learning_rate": 0.00016,
296
+ "loss": 0.4445,
297
+ "step": 40
298
+ },
299
+ {
300
+ "epoch": 0.12,
301
+ "grad_norm": 0.06555787382950577,
302
+ "learning_rate": 0.000164,
303
+ "loss": 0.4186,
304
+ "step": 41
305
+ },
306
+ {
307
+ "epoch": 0.13,
308
+ "grad_norm": 0.07303309157575222,
309
+ "learning_rate": 0.000168,
310
+ "loss": 0.4254,
311
+ "step": 42
312
+ },
313
+ {
314
+ "epoch": 0.13,
315
+ "grad_norm": 0.08106284215420356,
316
+ "learning_rate": 0.000172,
317
+ "loss": 0.4365,
318
+ "step": 43
319
+ },
320
+ {
321
+ "epoch": 0.13,
322
+ "grad_norm": 0.07975875448010136,
323
+ "learning_rate": 0.00017600000000000002,
324
+ "loss": 0.4717,
325
+ "step": 44
326
+ },
327
+ {
328
+ "epoch": 0.14,
329
+ "grad_norm": 0.08433876009904785,
330
+ "learning_rate": 0.00018,
331
+ "loss": 0.4351,
332
+ "step": 45
333
+ },
334
+ {
335
+ "epoch": 0.14,
336
+ "grad_norm": 0.09208191082018911,
337
+ "learning_rate": 0.00018400000000000003,
338
+ "loss": 0.4467,
339
+ "step": 46
340
+ },
341
+ {
342
+ "epoch": 0.14,
343
+ "grad_norm": 0.07933523549211162,
344
+ "learning_rate": 0.000188,
345
+ "loss": 0.479,
346
+ "step": 47
347
+ },
348
+ {
349
+ "epoch": 0.15,
350
+ "grad_norm": 0.0790637214240238,
351
+ "learning_rate": 0.000192,
352
+ "loss": 0.4273,
353
+ "step": 48
354
+ },
355
+ {
356
+ "epoch": 0.15,
357
+ "grad_norm": 0.08280301354612359,
358
+ "learning_rate": 0.000196,
359
+ "loss": 0.4763,
360
+ "step": 49
361
+ },
362
+ {
363
+ "epoch": 0.15,
364
+ "grad_norm": 0.07042898026998311,
365
+ "learning_rate": 0.0002,
366
+ "loss": 0.4884,
367
+ "step": 50
368
+ },
369
+ {
370
+ "epoch": 0.16,
371
+ "grad_norm": 0.07740076063925797,
372
+ "learning_rate": 0.00019999865623437013,
373
+ "loss": 0.4263,
374
+ "step": 51
375
+ },
376
+ {
377
+ "epoch": 0.16,
378
+ "grad_norm": 0.07410948641111367,
379
+ "learning_rate": 0.00019999462497359466,
380
+ "loss": 0.4618,
381
+ "step": 52
382
+ },
383
+ {
384
+ "epoch": 0.16,
385
+ "grad_norm": 0.06942695365949933,
386
+ "learning_rate": 0.00019998790632601496,
387
+ "loss": 0.4476,
388
+ "step": 53
389
+ },
390
+ {
391
+ "epoch": 0.16,
392
+ "grad_norm": 0.07740921900572059,
393
+ "learning_rate": 0.0001999785004721968,
394
+ "loss": 0.4579,
395
+ "step": 54
396
+ },
397
+ {
398
+ "epoch": 0.17,
399
+ "grad_norm": 0.07356813699321713,
400
+ "learning_rate": 0.00019996640766492543,
401
+ "loss": 0.4794,
402
+ "step": 55
403
+ },
404
+ {
405
+ "epoch": 0.17,
406
+ "grad_norm": 0.0868751406582716,
407
+ "learning_rate": 0.00019995162822919883,
408
+ "loss": 0.4884,
409
+ "step": 56
410
+ },
411
+ {
412
+ "epoch": 0.17,
413
+ "grad_norm": 0.07343471336657725,
414
+ "learning_rate": 0.00019993416256221895,
415
+ "loss": 0.4239,
416
+ "step": 57
417
+ },
418
+ {
419
+ "epoch": 0.18,
420
+ "grad_norm": 0.06349845655369611,
421
+ "learning_rate": 0.00019991401113338104,
422
+ "loss": 0.4011,
423
+ "step": 58
424
+ },
425
+ {
426
+ "epoch": 0.18,
427
+ "grad_norm": 0.06045312891933264,
428
+ "learning_rate": 0.00019989117448426108,
429
+ "loss": 0.393,
430
+ "step": 59
431
+ },
432
+ {
433
+ "epoch": 0.18,
434
+ "grad_norm": 0.07037779637162567,
435
+ "learning_rate": 0.00019986565322860115,
436
+ "loss": 0.4539,
437
+ "step": 60
438
+ },
439
+ {
440
+ "epoch": 0.19,
441
+ "grad_norm": 0.0735017342117541,
442
+ "learning_rate": 0.00019983744805229296,
443
+ "loss": 0.448,
444
+ "step": 61
445
+ },
446
+ {
447
+ "epoch": 0.19,
448
+ "grad_norm": 0.06709936204936039,
449
+ "learning_rate": 0.00019980655971335945,
450
+ "loss": 0.4521,
451
+ "step": 62
452
+ },
453
+ {
454
+ "epoch": 0.19,
455
+ "grad_norm": 0.059016546103329105,
456
+ "learning_rate": 0.00019977298904193437,
457
+ "loss": 0.4583,
458
+ "step": 63
459
+ },
460
+ {
461
+ "epoch": 0.19,
462
+ "grad_norm": 0.061842136466940804,
463
+ "learning_rate": 0.00019973673694024,
464
+ "loss": 0.419,
465
+ "step": 64
466
+ },
467
+ {
468
+ "epoch": 0.2,
469
+ "grad_norm": 0.07109183306258049,
470
+ "learning_rate": 0.00019969780438256293,
471
+ "loss": 0.4232,
472
+ "step": 65
473
+ },
474
+ {
475
+ "epoch": 0.2,
476
+ "grad_norm": 0.06594182395203928,
477
+ "learning_rate": 0.0001996561924152278,
478
+ "loss": 0.4367,
479
+ "step": 66
480
+ },
481
+ {
482
+ "epoch": 0.2,
483
+ "grad_norm": 0.06374736388917707,
484
+ "learning_rate": 0.0001996119021565693,
485
+ "loss": 0.4636,
486
+ "step": 67
487
+ },
488
+ {
489
+ "epoch": 0.21,
490
+ "grad_norm": 0.057076115956103006,
491
+ "learning_rate": 0.0001995649347969019,
492
+ "loss": 0.4361,
493
+ "step": 68
494
+ },
495
+ {
496
+ "epoch": 0.21,
497
+ "grad_norm": 0.05831060742074982,
498
+ "learning_rate": 0.00019951529159848805,
499
+ "loss": 0.4528,
500
+ "step": 69
501
+ },
502
+ {
503
+ "epoch": 0.21,
504
+ "grad_norm": 0.06976258952637959,
505
+ "learning_rate": 0.00019946297389550433,
506
+ "loss": 0.4205,
507
+ "step": 70
508
+ },
509
+ {
510
+ "epoch": 0.22,
511
+ "grad_norm": 0.09732026476620399,
512
+ "learning_rate": 0.00019940798309400526,
513
+ "loss": 0.4938,
514
+ "step": 71
515
+ },
516
+ {
517
+ "epoch": 0.22,
518
+ "grad_norm": 0.06811348765878435,
519
+ "learning_rate": 0.0001993503206718859,
520
+ "loss": 0.4462,
521
+ "step": 72
522
+ },
523
+ {
524
+ "epoch": 0.22,
525
+ "grad_norm": 0.06034104871370953,
526
+ "learning_rate": 0.00019928998817884182,
527
+ "loss": 0.4333,
528
+ "step": 73
529
+ },
530
+ {
531
+ "epoch": 0.23,
532
+ "grad_norm": 0.06738721791669006,
533
+ "learning_rate": 0.00019922698723632767,
534
+ "loss": 0.4641,
535
+ "step": 74
536
+ },
537
+ {
538
+ "epoch": 0.23,
539
+ "grad_norm": 0.059717789218296215,
540
+ "learning_rate": 0.00019916131953751342,
541
+ "loss": 0.4192,
542
+ "step": 75
543
+ },
544
+ {
545
+ "epoch": 0.23,
546
+ "grad_norm": 0.06161983068534973,
547
+ "learning_rate": 0.00019909298684723904,
548
+ "loss": 0.4283,
549
+ "step": 76
550
+ },
551
+ {
552
+ "epoch": 0.23,
553
+ "grad_norm": 0.0713429247511406,
554
+ "learning_rate": 0.00019902199100196697,
555
+ "loss": 0.4252,
556
+ "step": 77
557
+ },
558
+ {
559
+ "epoch": 0.24,
560
+ "grad_norm": 0.06767486775319806,
561
+ "learning_rate": 0.00019894833390973266,
562
+ "loss": 0.4442,
563
+ "step": 78
564
+ },
565
+ {
566
+ "epoch": 0.24,
567
+ "grad_norm": 0.06013764629410644,
568
+ "learning_rate": 0.00019887201755009357,
569
+ "loss": 0.4243,
570
+ "step": 79
571
+ },
572
+ {
573
+ "epoch": 0.24,
574
+ "grad_norm": 0.05939317773206885,
575
+ "learning_rate": 0.0001987930439740757,
576
+ "loss": 0.4287,
577
+ "step": 80
578
+ },
579
+ {
580
+ "epoch": 0.25,
581
+ "grad_norm": 0.058121288373969834,
582
+ "learning_rate": 0.00019871141530411853,
583
+ "loss": 0.4224,
584
+ "step": 81
585
+ },
586
+ {
587
+ "epoch": 0.25,
588
+ "grad_norm": 0.06367682996438209,
589
+ "learning_rate": 0.0001986271337340182,
590
+ "loss": 0.4244,
591
+ "step": 82
592
+ },
593
+ {
594
+ "epoch": 0.25,
595
+ "eval_loss": 0.43593671917915344,
596
+ "eval_runtime": 123.6408,
597
+ "eval_samples_per_second": 12.827,
598
+ "eval_steps_per_second": 0.404,
599
+ "step": 82
600
+ },
601
+ {
602
+ "epoch": 0.25,
603
+ "grad_norm": 0.05887554673201214,
604
+ "learning_rate": 0.00019854020152886814,
605
+ "loss": 0.4152,
606
+ "step": 83
607
+ },
608
+ {
609
+ "epoch": 0.26,
610
+ "grad_norm": 0.07021748228779248,
611
+ "learning_rate": 0.0001984506210249986,
612
+ "loss": 0.4428,
613
+ "step": 84
614
+ },
615
+ {
616
+ "epoch": 0.26,
617
+ "grad_norm": 0.058579498783159746,
618
+ "learning_rate": 0.00019835839462991361,
619
+ "loss": 0.4382,
620
+ "step": 85
621
+ },
622
+ {
623
+ "epoch": 0.26,
624
+ "grad_norm": 0.06631961598874808,
625
+ "learning_rate": 0.00019826352482222638,
626
+ "loss": 0.4242,
627
+ "step": 86
628
+ },
629
+ {
630
+ "epoch": 0.26,
631
+ "grad_norm": 0.057575046759170044,
632
+ "learning_rate": 0.00019816601415159263,
633
+ "loss": 0.3953,
634
+ "step": 87
635
+ },
636
+ {
637
+ "epoch": 0.27,
638
+ "grad_norm": 0.06366723108031043,
639
+ "learning_rate": 0.0001980658652386421,
640
+ "loss": 0.4127,
641
+ "step": 88
642
+ },
643
+ {
644
+ "epoch": 0.27,
645
+ "grad_norm": 0.06394637911494824,
646
+ "learning_rate": 0.00019796308077490817,
647
+ "loss": 0.4352,
648
+ "step": 89
649
+ },
650
+ {
651
+ "epoch": 0.27,
652
+ "grad_norm": 0.061983815621212,
653
+ "learning_rate": 0.00019785766352275542,
654
+ "loss": 0.425,
655
+ "step": 90
656
+ },
657
+ {
658
+ "epoch": 0.28,
659
+ "grad_norm": 0.06475206249797272,
660
+ "learning_rate": 0.00019774961631530545,
661
+ "loss": 0.4163,
662
+ "step": 91
663
+ },
664
+ {
665
+ "epoch": 0.28,
666
+ "grad_norm": 0.08320454397021351,
667
+ "learning_rate": 0.00019763894205636072,
668
+ "loss": 0.4577,
669
+ "step": 92
670
+ },
671
+ {
672
+ "epoch": 0.28,
673
+ "grad_norm": 0.06396276892871468,
674
+ "learning_rate": 0.00019752564372032657,
675
+ "loss": 0.4285,
676
+ "step": 93
677
+ },
678
+ {
679
+ "epoch": 0.29,
680
+ "grad_norm": 0.06492567947523852,
681
+ "learning_rate": 0.00019740972435213115,
682
+ "loss": 0.4306,
683
+ "step": 94
684
+ },
685
+ {
686
+ "epoch": 0.29,
687
+ "grad_norm": 0.05926229232420999,
688
+ "learning_rate": 0.00019729118706714375,
689
+ "loss": 0.3974,
690
+ "step": 95
691
+ },
692
+ {
693
+ "epoch": 0.29,
694
+ "grad_norm": 0.09188441911085327,
695
+ "learning_rate": 0.00019717003505109095,
696
+ "loss": 0.4821,
697
+ "step": 96
698
+ },
699
+ {
700
+ "epoch": 0.3,
701
+ "grad_norm": 0.061060087834010235,
702
+ "learning_rate": 0.00019704627155997108,
703
+ "loss": 0.4238,
704
+ "step": 97
705
+ },
706
+ {
707
+ "epoch": 0.3,
708
+ "grad_norm": 0.06816036342760105,
709
+ "learning_rate": 0.00019691989991996663,
710
+ "loss": 0.4126,
711
+ "step": 98
712
+ },
713
+ {
714
+ "epoch": 0.3,
715
+ "grad_norm": 0.06089974832622745,
716
+ "learning_rate": 0.0001967909235273549,
717
+ "loss": 0.395,
718
+ "step": 99
719
+ },
720
+ {
721
+ "epoch": 0.3,
722
+ "grad_norm": 0.061666173586949515,
723
+ "learning_rate": 0.00019665934584841682,
724
+ "loss": 0.3862,
725
+ "step": 100
726
+ },
727
+ {
728
+ "epoch": 0.31,
729
+ "grad_norm": 0.07033284541990681,
730
+ "learning_rate": 0.00019652517041934356,
731
+ "loss": 0.4289,
732
+ "step": 101
733
+ },
734
+ {
735
+ "epoch": 0.31,
736
+ "grad_norm": 0.056671446602724315,
737
+ "learning_rate": 0.00019638840084614182,
738
+ "loss": 0.414,
739
+ "step": 102
740
+ },
741
+ {
742
+ "epoch": 0.31,
743
+ "grad_norm": 0.07404529543224442,
744
+ "learning_rate": 0.00019624904080453655,
745
+ "loss": 0.4478,
746
+ "step": 103
747
+ },
748
+ {
749
+ "epoch": 0.32,
750
+ "grad_norm": 0.0637624026837155,
751
+ "learning_rate": 0.00019610709403987246,
752
+ "loss": 0.4376,
753
+ "step": 104
754
+ },
755
+ {
756
+ "epoch": 0.32,
757
+ "grad_norm": 0.061950221318366465,
758
+ "learning_rate": 0.00019596256436701324,
759
+ "loss": 0.4347,
760
+ "step": 105
761
+ },
762
+ {
763
+ "epoch": 0.32,
764
+ "grad_norm": 0.06399298551166889,
765
+ "learning_rate": 0.000195815455670239,
766
+ "loss": 0.4621,
767
+ "step": 106
768
+ },
769
+ {
770
+ "epoch": 0.33,
771
+ "grad_norm": 0.057580014538137055,
772
+ "learning_rate": 0.00019566577190314197,
773
+ "loss": 0.4269,
774
+ "step": 107
775
+ },
776
+ {
777
+ "epoch": 0.33,
778
+ "grad_norm": 0.05728079704466134,
779
+ "learning_rate": 0.0001955135170885202,
780
+ "loss": 0.4347,
781
+ "step": 108
782
+ },
783
+ {
784
+ "epoch": 0.33,
785
+ "grad_norm": 0.059588937505571996,
786
+ "learning_rate": 0.00019535869531826937,
787
+ "loss": 0.4258,
788
+ "step": 109
789
+ },
790
+ {
791
+ "epoch": 0.33,
792
+ "grad_norm": 0.05908603891068658,
793
+ "learning_rate": 0.00019520131075327298,
794
+ "loss": 0.4087,
795
+ "step": 110
796
+ },
797
+ {
798
+ "epoch": 0.34,
799
+ "grad_norm": 0.05691278978874001,
800
+ "learning_rate": 0.00019504136762329047,
801
+ "loss": 0.3762,
802
+ "step": 111
803
+ },
804
+ {
805
+ "epoch": 0.34,
806
+ "grad_norm": 0.0610389288690018,
807
+ "learning_rate": 0.00019487887022684336,
808
+ "loss": 0.4387,
809
+ "step": 112
810
+ },
811
+ {
812
+ "epoch": 0.34,
813
+ "grad_norm": 0.058057719333723924,
814
+ "learning_rate": 0.00019471382293110003,
815
+ "loss": 0.4317,
816
+ "step": 113
817
+ },
818
+ {
819
+ "epoch": 0.35,
820
+ "grad_norm": 0.05928092573363259,
821
+ "learning_rate": 0.00019454623017175812,
822
+ "loss": 0.4094,
823
+ "step": 114
824
+ },
825
+ {
826
+ "epoch": 0.35,
827
+ "grad_norm": 0.0633442269536897,
828
+ "learning_rate": 0.00019437609645292546,
829
+ "loss": 0.4754,
830
+ "step": 115
831
+ },
832
+ {
833
+ "epoch": 0.35,
834
+ "grad_norm": 0.07352467420423782,
835
+ "learning_rate": 0.0001942034263469989,
836
+ "loss": 0.4745,
837
+ "step": 116
838
+ },
839
+ {
840
+ "epoch": 0.36,
841
+ "grad_norm": 0.06479997938550956,
842
+ "learning_rate": 0.00019402822449454153,
843
+ "loss": 0.4011,
844
+ "step": 117
845
+ },
846
+ {
847
+ "epoch": 0.36,
848
+ "grad_norm": 0.05494756255932164,
849
+ "learning_rate": 0.00019385049560415794,
850
+ "loss": 0.4117,
851
+ "step": 118
852
+ },
853
+ {
854
+ "epoch": 0.36,
855
+ "grad_norm": 0.053848208229852666,
856
+ "learning_rate": 0.00019367024445236754,
857
+ "loss": 0.4021,
858
+ "step": 119
859
+ },
860
+ {
861
+ "epoch": 0.37,
862
+ "grad_norm": 0.0513948430468449,
863
+ "learning_rate": 0.00019348747588347637,
864
+ "loss": 0.3913,
865
+ "step": 120
866
+ },
867
+ {
868
+ "epoch": 0.37,
869
+ "grad_norm": 0.05968265805277767,
870
+ "learning_rate": 0.00019330219480944694,
871
+ "loss": 0.3852,
872
+ "step": 121
873
+ },
874
+ {
875
+ "epoch": 0.37,
876
+ "grad_norm": 0.06143883724069487,
877
+ "learning_rate": 0.00019311440620976597,
878
+ "loss": 0.4239,
879
+ "step": 122
880
+ },
881
+ {
882
+ "epoch": 0.37,
883
+ "grad_norm": 0.06702756184099444,
884
+ "learning_rate": 0.0001929241151313108,
885
+ "loss": 0.4179,
886
+ "step": 123
887
+ },
888
+ {
889
+ "epoch": 0.38,
890
+ "grad_norm": 0.06273285137098253,
891
+ "learning_rate": 0.00019273132668821364,
892
+ "loss": 0.4174,
893
+ "step": 124
894
+ },
895
+ {
896
+ "epoch": 0.38,
897
+ "grad_norm": 0.06682279666625734,
898
+ "learning_rate": 0.00019253604606172417,
899
+ "loss": 0.4315,
900
+ "step": 125
901
+ },
902
+ {
903
+ "epoch": 0.38,
904
+ "grad_norm": 0.056506240030426425,
905
+ "learning_rate": 0.00019233827850007027,
906
+ "loss": 0.404,
907
+ "step": 126
908
+ },
909
+ {
910
+ "epoch": 0.39,
911
+ "grad_norm": 0.07527434233699179,
912
+ "learning_rate": 0.00019213802931831696,
913
+ "loss": 0.4528,
914
+ "step": 127
915
+ },
916
+ {
917
+ "epoch": 0.39,
918
+ "grad_norm": 0.05535480690320134,
919
+ "learning_rate": 0.00019193530389822363,
920
+ "loss": 0.4107,
921
+ "step": 128
922
+ },
923
+ {
924
+ "epoch": 0.39,
925
+ "grad_norm": 0.06557369752989106,
926
+ "learning_rate": 0.00019173010768809933,
927
+ "loss": 0.4045,
928
+ "step": 129
929
+ },
930
+ {
931
+ "epoch": 0.4,
932
+ "grad_norm": 0.06007600934975136,
933
+ "learning_rate": 0.0001915224462026563,
934
+ "loss": 0.4147,
935
+ "step": 130
936
+ },
937
+ {
938
+ "epoch": 0.4,
939
+ "grad_norm": 0.0686353460299127,
940
+ "learning_rate": 0.00019131232502286188,
941
+ "loss": 0.434,
942
+ "step": 131
943
+ },
944
+ {
945
+ "epoch": 0.4,
946
+ "grad_norm": 0.06286385911412697,
947
+ "learning_rate": 0.0001910997497957885,
948
+ "loss": 0.4113,
949
+ "step": 132
950
+ },
951
+ {
952
+ "epoch": 0.4,
953
+ "grad_norm": 0.0687787746229253,
954
+ "learning_rate": 0.00019088472623446183,
955
+ "loss": 0.4319,
956
+ "step": 133
957
+ },
958
+ {
959
+ "epoch": 0.41,
960
+ "grad_norm": 0.061249142328772585,
961
+ "learning_rate": 0.00019066726011770726,
962
+ "loss": 0.4015,
963
+ "step": 134
964
+ },
965
+ {
966
+ "epoch": 0.41,
967
+ "grad_norm": 0.06508152766794473,
968
+ "learning_rate": 0.0001904473572899947,
969
+ "loss": 0.4377,
970
+ "step": 135
971
+ },
972
+ {
973
+ "epoch": 0.41,
974
+ "grad_norm": 0.06376427224221563,
975
+ "learning_rate": 0.00019022502366128135,
976
+ "loss": 0.4146,
977
+ "step": 136
978
+ },
979
+ {
980
+ "epoch": 0.42,
981
+ "grad_norm": 0.067625252327034,
982
+ "learning_rate": 0.00019000026520685302,
983
+ "loss": 0.3926,
984
+ "step": 137
985
+ },
986
+ {
987
+ "epoch": 0.42,
988
+ "grad_norm": 0.05130591917745107,
989
+ "learning_rate": 0.0001897730879671634,
990
+ "loss": 0.4104,
991
+ "step": 138
992
+ },
993
+ {
994
+ "epoch": 0.42,
995
+ "grad_norm": 0.07002413081513567,
996
+ "learning_rate": 0.00018954349804767184,
997
+ "loss": 0.4046,
998
+ "step": 139
999
+ },
1000
+ {
1001
+ "epoch": 0.43,
1002
+ "grad_norm": 0.06309452264198372,
1003
+ "learning_rate": 0.00018931150161867916,
1004
+ "loss": 0.3931,
1005
+ "step": 140
1006
+ },
1007
+ {
1008
+ "epoch": 0.43,
1009
+ "grad_norm": 0.07066138455391835,
1010
+ "learning_rate": 0.00018907710491516199,
1011
+ "loss": 0.3959,
1012
+ "step": 141
1013
+ },
1014
+ {
1015
+ "epoch": 0.43,
1016
+ "grad_norm": 0.06617122671784828,
1017
+ "learning_rate": 0.0001888403142366049,
1018
+ "loss": 0.4461,
1019
+ "step": 142
1020
+ },
1021
+ {
1022
+ "epoch": 0.44,
1023
+ "grad_norm": 0.06188850665120826,
1024
+ "learning_rate": 0.00018860113594683148,
1025
+ "loss": 0.4476,
1026
+ "step": 143
1027
+ },
1028
+ {
1029
+ "epoch": 0.44,
1030
+ "grad_norm": 0.0675458938713568,
1031
+ "learning_rate": 0.00018835957647383303,
1032
+ "loss": 0.3886,
1033
+ "step": 144
1034
+ },
1035
+ {
1036
+ "epoch": 0.44,
1037
+ "grad_norm": 0.059099748414052966,
1038
+ "learning_rate": 0.00018811564230959588,
1039
+ "loss": 0.4102,
1040
+ "step": 145
1041
+ },
1042
+ {
1043
+ "epoch": 0.44,
1044
+ "grad_norm": 0.06146419412677516,
1045
+ "learning_rate": 0.00018786934000992688,
1046
+ "loss": 0.4053,
1047
+ "step": 146
1048
+ },
1049
+ {
1050
+ "epoch": 0.45,
1051
+ "grad_norm": 0.07052260373164539,
1052
+ "learning_rate": 0.00018762067619427746,
1053
+ "loss": 0.434,
1054
+ "step": 147
1055
+ },
1056
+ {
1057
+ "epoch": 0.45,
1058
+ "grad_norm": 0.06405607662330068,
1059
+ "learning_rate": 0.00018736965754556528,
1060
+ "loss": 0.3973,
1061
+ "step": 148
1062
+ },
1063
+ {
1064
+ "epoch": 0.45,
1065
+ "grad_norm": 0.05848109820491341,
1066
+ "learning_rate": 0.00018711629080999504,
1067
+ "loss": 0.4038,
1068
+ "step": 149
1069
+ },
1070
+ {
1071
+ "epoch": 0.46,
1072
+ "grad_norm": 0.05734865806517323,
1073
+ "learning_rate": 0.00018686058279687698,
1074
+ "loss": 0.3684,
1075
+ "step": 150
1076
+ },
1077
+ {
1078
+ "epoch": 0.46,
1079
+ "grad_norm": 0.05745744787313202,
1080
+ "learning_rate": 0.00018660254037844388,
1081
+ "loss": 0.4407,
1082
+ "step": 151
1083
+ },
1084
+ {
1085
+ "epoch": 0.46,
1086
+ "grad_norm": 0.060533314386101295,
1087
+ "learning_rate": 0.00018634217048966637,
1088
+ "loss": 0.4396,
1089
+ "step": 152
1090
+ },
1091
+ {
1092
+ "epoch": 0.47,
1093
+ "grad_norm": 0.05855819895205811,
1094
+ "learning_rate": 0.0001860794801280666,
1095
+ "loss": 0.4292,
1096
+ "step": 153
1097
+ },
1098
+ {
1099
+ "epoch": 0.47,
1100
+ "grad_norm": 0.05617334346350608,
1101
+ "learning_rate": 0.0001858144763535302,
1102
+ "loss": 0.4304,
1103
+ "step": 154
1104
+ },
1105
+ {
1106
+ "epoch": 0.47,
1107
+ "grad_norm": 0.053335701901412944,
1108
+ "learning_rate": 0.0001855471662881164,
1109
+ "loss": 0.4728,
1110
+ "step": 155
1111
+ },
1112
+ {
1113
+ "epoch": 0.47,
1114
+ "grad_norm": 0.05628407823572177,
1115
+ "learning_rate": 0.00018527755711586678,
1116
+ "loss": 0.4509,
1117
+ "step": 156
1118
+ },
1119
+ {
1120
+ "epoch": 0.48,
1121
+ "grad_norm": 0.06041813245213351,
1122
+ "learning_rate": 0.00018500565608261214,
1123
+ "loss": 0.4288,
1124
+ "step": 157
1125
+ },
1126
+ {
1127
+ "epoch": 0.48,
1128
+ "grad_norm": 0.06769536961037438,
1129
+ "learning_rate": 0.00018473147049577774,
1130
+ "loss": 0.4135,
1131
+ "step": 158
1132
+ },
1133
+ {
1134
+ "epoch": 0.48,
1135
+ "grad_norm": 0.05838015088221951,
1136
+ "learning_rate": 0.00018445500772418697,
1137
+ "loss": 0.4207,
1138
+ "step": 159
1139
+ },
1140
+ {
1141
+ "epoch": 0.49,
1142
+ "grad_norm": 0.06853778581855352,
1143
+ "learning_rate": 0.00018417627519786315,
1144
+ "loss": 0.4347,
1145
+ "step": 160
1146
+ },
1147
+ {
1148
+ "epoch": 0.49,
1149
+ "grad_norm": 0.053945190939189674,
1150
+ "learning_rate": 0.00018389528040783012,
1151
+ "loss": 0.3815,
1152
+ "step": 161
1153
+ },
1154
+ {
1155
+ "epoch": 0.49,
1156
+ "grad_norm": 0.06710865737011762,
1157
+ "learning_rate": 0.00018361203090591071,
1158
+ "loss": 0.4304,
1159
+ "step": 162
1160
+ },
1161
+ {
1162
+ "epoch": 0.5,
1163
+ "grad_norm": 0.05631006315671297,
1164
+ "learning_rate": 0.00018332653430452376,
1165
+ "loss": 0.4053,
1166
+ "step": 163
1167
+ },
1168
+ {
1169
+ "epoch": 0.5,
1170
+ "grad_norm": 0.0657154008394696,
1171
+ "learning_rate": 0.00018303879827647975,
1172
+ "loss": 0.4116,
1173
+ "step": 164
1174
+ },
1175
+ {
1176
+ "epoch": 0.5,
1177
+ "eval_loss": 0.41363829374313354,
1178
+ "eval_runtime": 123.9822,
1179
+ "eval_samples_per_second": 12.792,
1180
+ "eval_steps_per_second": 0.403,
1181
+ "step": 164
1182
+ },
1183
+ {
1184
+ "epoch": 0.5,
1185
+ "grad_norm": 0.05630640994563289,
1186
+ "learning_rate": 0.00018274883055477436,
1187
+ "loss": 0.3659,
1188
+ "step": 165
1189
+ },
1190
+ {
1191
+ "epoch": 0.51,
1192
+ "grad_norm": 0.07061221564045389,
1193
+ "learning_rate": 0.00018245663893238075,
1194
+ "loss": 0.428,
1195
+ "step": 166
1196
+ },
1197
+ {
1198
+ "epoch": 0.51,
1199
+ "grad_norm": 0.06110655042267304,
1200
+ "learning_rate": 0.00018216223126204007,
1201
+ "loss": 0.365,
1202
+ "step": 167
1203
+ },
1204
+ {
1205
+ "epoch": 0.51,
1206
+ "grad_norm": 0.060738092388703646,
1207
+ "learning_rate": 0.00018186561545605054,
1208
+ "loss": 0.441,
1209
+ "step": 168
1210
+ },
1211
+ {
1212
+ "epoch": 0.51,
1213
+ "grad_norm": 0.06073954329607645,
1214
+ "learning_rate": 0.00018156679948605467,
1215
+ "loss": 0.3883,
1216
+ "step": 169
1217
+ },
1218
+ {
1219
+ "epoch": 0.52,
1220
+ "grad_norm": 0.05611059253769085,
1221
+ "learning_rate": 0.00018126579138282503,
1222
+ "loss": 0.4182,
1223
+ "step": 170
1224
+ },
1225
+ {
1226
+ "epoch": 0.52,
1227
+ "grad_norm": 0.053806034345734145,
1228
+ "learning_rate": 0.0001809625992360485,
1229
+ "loss": 0.3848,
1230
+ "step": 171
1231
+ },
1232
+ {
1233
+ "epoch": 0.52,
1234
+ "grad_norm": 0.05630738374493018,
1235
+ "learning_rate": 0.00018065723119410884,
1236
+ "loss": 0.422,
1237
+ "step": 172
1238
+ },
1239
+ {
1240
+ "epoch": 0.53,
1241
+ "grad_norm": 0.06406207738713686,
1242
+ "learning_rate": 0.00018034969546386757,
1243
+ "loss": 0.4312,
1244
+ "step": 173
1245
+ },
1246
+ {
1247
+ "epoch": 0.53,
1248
+ "grad_norm": 0.0625195584390969,
1249
+ "learning_rate": 0.0001800400003104436,
1250
+ "loss": 0.4223,
1251
+ "step": 174
1252
+ },
1253
+ {
1254
+ "epoch": 0.53,
1255
+ "grad_norm": 0.06574492523626345,
1256
+ "learning_rate": 0.00017972815405699103,
1257
+ "loss": 0.4135,
1258
+ "step": 175
1259
+ },
1260
+ {
1261
+ "epoch": 0.54,
1262
+ "grad_norm": 0.0635055622467547,
1263
+ "learning_rate": 0.00017941416508447536,
1264
+ "loss": 0.4158,
1265
+ "step": 176
1266
+ },
1267
+ {
1268
+ "epoch": 0.54,
1269
+ "grad_norm": 0.05639940105674086,
1270
+ "learning_rate": 0.0001790980418314484,
1271
+ "loss": 0.3788,
1272
+ "step": 177
1273
+ },
1274
+ {
1275
+ "epoch": 0.54,
1276
+ "grad_norm": 0.06299319728622466,
1277
+ "learning_rate": 0.00017877979279382135,
1278
+ "loss": 0.4172,
1279
+ "step": 178
1280
+ },
1281
+ {
1282
+ "epoch": 0.54,
1283
+ "grad_norm": 0.05622383832182509,
1284
+ "learning_rate": 0.0001784594265246366,
1285
+ "loss": 0.4151,
1286
+ "step": 179
1287
+ },
1288
+ {
1289
+ "epoch": 0.55,
1290
+ "grad_norm": 0.05523518934013776,
1291
+ "learning_rate": 0.0001781369516338378,
1292
+ "loss": 0.3873,
1293
+ "step": 180
1294
+ },
1295
+ {
1296
+ "epoch": 0.55,
1297
+ "grad_norm": 0.05855116687583789,
1298
+ "learning_rate": 0.00017781237678803847,
1299
+ "loss": 0.4393,
1300
+ "step": 181
1301
+ },
1302
+ {
1303
+ "epoch": 0.55,
1304
+ "grad_norm": 0.056738829979401585,
1305
+ "learning_rate": 0.000177485710710289,
1306
+ "loss": 0.395,
1307
+ "step": 182
1308
+ },
1309
+ {
1310
+ "epoch": 0.56,
1311
+ "grad_norm": 0.058119623880280614,
1312
+ "learning_rate": 0.00017715696217984235,
1313
+ "loss": 0.4027,
1314
+ "step": 183
1315
+ },
1316
+ {
1317
+ "epoch": 0.56,
1318
+ "grad_norm": 0.06259885435698509,
1319
+ "learning_rate": 0.00017682614003191807,
1320
+ "loss": 0.4108,
1321
+ "step": 184
1322
+ },
1323
+ {
1324
+ "epoch": 0.56,
1325
+ "grad_norm": 0.059063802224138755,
1326
+ "learning_rate": 0.00017649325315746478,
1327
+ "loss": 0.3947,
1328
+ "step": 185
1329
+ },
1330
+ {
1331
+ "epoch": 0.57,
1332
+ "grad_norm": 0.0643644517640597,
1333
+ "learning_rate": 0.0001761583105029213,
1334
+ "loss": 0.4166,
1335
+ "step": 186
1336
+ },
1337
+ {
1338
+ "epoch": 0.57,
1339
+ "grad_norm": 0.05772573542573537,
1340
+ "learning_rate": 0.00017582132106997616,
1341
+ "loss": 0.4128,
1342
+ "step": 187
1343
+ },
1344
+ {
1345
+ "epoch": 0.57,
1346
+ "grad_norm": 0.06044984356521842,
1347
+ "learning_rate": 0.00017548229391532572,
1348
+ "loss": 0.3901,
1349
+ "step": 188
1350
+ },
1351
+ {
1352
+ "epoch": 0.58,
1353
+ "grad_norm": 0.06785343818729309,
1354
+ "learning_rate": 0.00017514123815043074,
1355
+ "loss": 0.4466,
1356
+ "step": 189
1357
+ },
1358
+ {
1359
+ "epoch": 0.58,
1360
+ "grad_norm": 0.06074826525980157,
1361
+ "learning_rate": 0.00017479816294127152,
1362
+ "loss": 0.4048,
1363
+ "step": 190
1364
+ },
1365
+ {
1366
+ "epoch": 0.58,
1367
+ "grad_norm": 0.055450179120944854,
1368
+ "learning_rate": 0.0001744530775081015,
1369
+ "loss": 0.4067,
1370
+ "step": 191
1371
+ },
1372
+ {
1373
+ "epoch": 0.58,
1374
+ "grad_norm": 0.06191386552231799,
1375
+ "learning_rate": 0.0001741059911251997,
1376
+ "loss": 0.3905,
1377
+ "step": 192
1378
+ },
1379
+ {
1380
+ "epoch": 0.59,
1381
+ "grad_norm": 0.06155751624181917,
1382
+ "learning_rate": 0.000173756913120621,
1383
+ "loss": 0.4079,
1384
+ "step": 193
1385
+ },
1386
+ {
1387
+ "epoch": 0.59,
1388
+ "grad_norm": 0.05284583709899453,
1389
+ "learning_rate": 0.00017340585287594604,
1390
+ "loss": 0.3658,
1391
+ "step": 194
1392
+ },
1393
+ {
1394
+ "epoch": 0.59,
1395
+ "grad_norm": 0.060942615927008996,
1396
+ "learning_rate": 0.0001730528198260285,
1397
+ "loss": 0.4263,
1398
+ "step": 195
1399
+ },
1400
+ {
1401
+ "epoch": 0.6,
1402
+ "grad_norm": 0.05811937550409047,
1403
+ "learning_rate": 0.00017269782345874203,
1404
+ "loss": 0.4251,
1405
+ "step": 196
1406
+ },
1407
+ {
1408
+ "epoch": 0.6,
1409
+ "grad_norm": 0.06501573238693972,
1410
+ "learning_rate": 0.00017234087331472497,
1411
+ "loss": 0.4214,
1412
+ "step": 197
1413
+ },
1414
+ {
1415
+ "epoch": 0.6,
1416
+ "grad_norm": 0.05800268755856704,
1417
+ "learning_rate": 0.00017198197898712404,
1418
+ "loss": 0.386,
1419
+ "step": 198
1420
+ },
1421
+ {
1422
+ "epoch": 0.61,
1423
+ "grad_norm": 0.06111127493805823,
1424
+ "learning_rate": 0.00017162115012133643,
1425
+ "loss": 0.3904,
1426
+ "step": 199
1427
+ },
1428
+ {
1429
+ "epoch": 0.61,
1430
+ "grad_norm": 0.0755165163264209,
1431
+ "learning_rate": 0.00017125839641475072,
1432
+ "loss": 0.416,
1433
+ "step": 200
1434
+ },
1435
+ {
1436
+ "epoch": 0.61,
1437
+ "grad_norm": 0.0614412379262829,
1438
+ "learning_rate": 0.00017089372761648616,
1439
+ "loss": 0.4366,
1440
+ "step": 201
1441
+ },
1442
+ {
1443
+ "epoch": 0.61,
1444
+ "grad_norm": 0.07701476201626284,
1445
+ "learning_rate": 0.00017052715352713075,
1446
+ "loss": 0.4025,
1447
+ "step": 202
1448
+ },
1449
+ {
1450
+ "epoch": 0.62,
1451
+ "grad_norm": 0.0689546923159957,
1452
+ "learning_rate": 0.00017015868399847768,
1453
+ "loss": 0.4259,
1454
+ "step": 203
1455
+ },
1456
+ {
1457
+ "epoch": 0.62,
1458
+ "grad_norm": 0.07053930076122024,
1459
+ "learning_rate": 0.00016978832893326074,
1460
+ "loss": 0.4116,
1461
+ "step": 204
1462
+ },
1463
+ {
1464
+ "epoch": 0.62,
1465
+ "grad_norm": 0.06512051456558368,
1466
+ "learning_rate": 0.00016941609828488807,
1467
+ "loss": 0.4218,
1468
+ "step": 205
1469
+ },
1470
+ {
1471
+ "epoch": 0.63,
1472
+ "grad_norm": 0.058682027518078904,
1473
+ "learning_rate": 0.0001690420020571747,
1474
+ "loss": 0.3955,
1475
+ "step": 206
1476
+ },
1477
+ {
1478
+ "epoch": 0.63,
1479
+ "grad_norm": 0.07259273598587179,
1480
+ "learning_rate": 0.0001686660503040737,
1481
+ "loss": 0.3792,
1482
+ "step": 207
1483
+ },
1484
+ {
1485
+ "epoch": 0.63,
1486
+ "grad_norm": 0.06267618142936672,
1487
+ "learning_rate": 0.00016828825312940592,
1488
+ "loss": 0.3887,
1489
+ "step": 208
1490
+ },
1491
+ {
1492
+ "epoch": 0.64,
1493
+ "grad_norm": 0.07240394294151951,
1494
+ "learning_rate": 0.0001679086206865886,
1495
+ "loss": 0.375,
1496
+ "step": 209
1497
+ },
1498
+ {
1499
+ "epoch": 0.64,
1500
+ "grad_norm": 0.0694963821841272,
1501
+ "learning_rate": 0.00016752716317836229,
1502
+ "loss": 0.4211,
1503
+ "step": 210
1504
+ },
1505
+ {
1506
+ "epoch": 0.64,
1507
+ "grad_norm": 0.06447800965085054,
1508
+ "learning_rate": 0.0001671438908565167,
1509
+ "loss": 0.4229,
1510
+ "step": 211
1511
+ },
1512
+ {
1513
+ "epoch": 0.65,
1514
+ "grad_norm": 0.054857241792797835,
1515
+ "learning_rate": 0.00016675881402161536,
1516
+ "loss": 0.3748,
1517
+ "step": 212
1518
+ },
1519
+ {
1520
+ "epoch": 0.65,
1521
+ "grad_norm": 0.05883060900951831,
1522
+ "learning_rate": 0.0001663719430227186,
1523
+ "loss": 0.4485,
1524
+ "step": 213
1525
+ },
1526
+ {
1527
+ "epoch": 0.65,
1528
+ "grad_norm": 0.060169692906174264,
1529
+ "learning_rate": 0.00016598328825710533,
1530
+ "loss": 0.436,
1531
+ "step": 214
1532
+ },
1533
+ {
1534
+ "epoch": 0.65,
1535
+ "grad_norm": 0.05919837924005844,
1536
+ "learning_rate": 0.000165592860169994,
1537
+ "loss": 0.3961,
1538
+ "step": 215
1539
+ },
1540
+ {
1541
+ "epoch": 0.66,
1542
+ "grad_norm": 0.05110857539220564,
1543
+ "learning_rate": 0.00016520066925426144,
1544
+ "loss": 0.3799,
1545
+ "step": 216
1546
+ },
1547
+ {
1548
+ "epoch": 0.66,
1549
+ "grad_norm": 0.05769915730650853,
1550
+ "learning_rate": 0.0001648067260501611,
1551
+ "loss": 0.3719,
1552
+ "step": 217
1553
+ },
1554
+ {
1555
+ "epoch": 0.66,
1556
+ "grad_norm": 0.05551832763917467,
1557
+ "learning_rate": 0.0001644110411450398,
1558
+ "loss": 0.4024,
1559
+ "step": 218
1560
+ },
1561
+ {
1562
+ "epoch": 0.67,
1563
+ "grad_norm": 0.054021371704014956,
1564
+ "learning_rate": 0.00016401362517305296,
1565
+ "loss": 0.3698,
1566
+ "step": 219
1567
+ },
1568
+ {
1569
+ "epoch": 0.67,
1570
+ "grad_norm": 0.05808525374932993,
1571
+ "learning_rate": 0.00016361448881487914,
1572
+ "loss": 0.3967,
1573
+ "step": 220
1574
+ },
1575
+ {
1576
+ "epoch": 0.67,
1577
+ "grad_norm": 0.06541846663067528,
1578
+ "learning_rate": 0.00016321364279743266,
1579
+ "loss": 0.4085,
1580
+ "step": 221
1581
+ },
1582
+ {
1583
+ "epoch": 0.68,
1584
+ "grad_norm": 0.055548248357870096,
1585
+ "learning_rate": 0.0001628110978935756,
1586
+ "loss": 0.3794,
1587
+ "step": 222
1588
+ },
1589
+ {
1590
+ "epoch": 0.68,
1591
+ "grad_norm": 0.05620210350185819,
1592
+ "learning_rate": 0.00016240686492182804,
1593
+ "loss": 0.3827,
1594
+ "step": 223
1595
+ },
1596
+ {
1597
+ "epoch": 0.68,
1598
+ "grad_norm": 0.05662188754425024,
1599
+ "learning_rate": 0.00016200095474607753,
1600
+ "loss": 0.39,
1601
+ "step": 224
1602
+ },
1603
+ {
1604
+ "epoch": 0.68,
1605
+ "grad_norm": 0.06430927392872963,
1606
+ "learning_rate": 0.00016159337827528685,
1607
+ "loss": 0.4126,
1608
+ "step": 225
1609
+ },
1610
+ {
1611
+ "epoch": 0.69,
1612
+ "grad_norm": 0.06050671506887589,
1613
+ "learning_rate": 0.0001611841464632011,
1614
+ "loss": 0.3814,
1615
+ "step": 226
1616
+ },
1617
+ {
1618
+ "epoch": 0.69,
1619
+ "grad_norm": 0.06516196863889696,
1620
+ "learning_rate": 0.0001607732703080532,
1621
+ "loss": 0.4103,
1622
+ "step": 227
1623
+ },
1624
+ {
1625
+ "epoch": 0.69,
1626
+ "grad_norm": 0.05716853723416032,
1627
+ "learning_rate": 0.00016036076085226814,
1628
+ "loss": 0.365,
1629
+ "step": 228
1630
+ },
1631
+ {
1632
+ "epoch": 0.7,
1633
+ "grad_norm": 0.0589005227934801,
1634
+ "learning_rate": 0.0001599466291821666,
1635
+ "loss": 0.3729,
1636
+ "step": 229
1637
+ },
1638
+ {
1639
+ "epoch": 0.7,
1640
+ "grad_norm": 0.06695814310295387,
1641
+ "learning_rate": 0.0001595308864276666,
1642
+ "loss": 0.3924,
1643
+ "step": 230
1644
+ },
1645
+ {
1646
+ "epoch": 0.7,
1647
+ "grad_norm": 0.060756148638168354,
1648
+ "learning_rate": 0.0001591135437619847,
1649
+ "loss": 0.3809,
1650
+ "step": 231
1651
+ },
1652
+ {
1653
+ "epoch": 0.71,
1654
+ "grad_norm": 0.05215608115737446,
1655
+ "learning_rate": 0.0001586946124013354,
1656
+ "loss": 0.3965,
1657
+ "step": 232
1658
+ },
1659
+ {
1660
+ "epoch": 0.71,
1661
+ "grad_norm": 0.05758081920924926,
1662
+ "learning_rate": 0.0001582741036046301,
1663
+ "loss": 0.3929,
1664
+ "step": 233
1665
+ },
1666
+ {
1667
+ "epoch": 0.71,
1668
+ "grad_norm": 0.05701472413667785,
1669
+ "learning_rate": 0.00015785202867317407,
1670
+ "loss": 0.3777,
1671
+ "step": 234
1672
+ },
1673
+ {
1674
+ "epoch": 0.72,
1675
+ "grad_norm": 0.06022974376834673,
1676
+ "learning_rate": 0.00015742839895036305,
1677
+ "loss": 0.3971,
1678
+ "step": 235
1679
+ },
1680
+ {
1681
+ "epoch": 0.72,
1682
+ "grad_norm": 0.06966467282493215,
1683
+ "learning_rate": 0.00015700322582137827,
1684
+ "loss": 0.4362,
1685
+ "step": 236
1686
+ },
1687
+ {
1688
+ "epoch": 0.72,
1689
+ "grad_norm": 0.057523669707635096,
1690
+ "learning_rate": 0.0001565765207128805,
1691
+ "loss": 0.389,
1692
+ "step": 237
1693
+ },
1694
+ {
1695
+ "epoch": 0.72,
1696
+ "grad_norm": 0.061725975621654716,
1697
+ "learning_rate": 0.0001561482950927029,
1698
+ "loss": 0.4457,
1699
+ "step": 238
1700
+ },
1701
+ {
1702
+ "epoch": 0.73,
1703
+ "grad_norm": 0.05848389898056001,
1704
+ "learning_rate": 0.00015571856046954285,
1705
+ "loss": 0.4014,
1706
+ "step": 239
1707
+ },
1708
+ {
1709
+ "epoch": 0.73,
1710
+ "grad_norm": 0.05421736988927547,
1711
+ "learning_rate": 0.00015528732839265272,
1712
+ "loss": 0.3917,
1713
+ "step": 240
1714
+ },
1715
+ {
1716
+ "epoch": 0.73,
1717
+ "grad_norm": 0.07861308281441043,
1718
+ "learning_rate": 0.0001548546104515294,
1719
+ "loss": 0.409,
1720
+ "step": 241
1721
+ },
1722
+ {
1723
+ "epoch": 0.74,
1724
+ "grad_norm": 0.058381273544545885,
1725
+ "learning_rate": 0.00015442041827560274,
1726
+ "loss": 0.3927,
1727
+ "step": 242
1728
+ },
1729
+ {
1730
+ "epoch": 0.74,
1731
+ "grad_norm": 0.06398677449972948,
1732
+ "learning_rate": 0.00015398476353392323,
1733
+ "loss": 0.3869,
1734
+ "step": 243
1735
+ },
1736
+ {
1737
+ "epoch": 0.74,
1738
+ "grad_norm": 0.058201546135551674,
1739
+ "learning_rate": 0.00015354765793484834,
1740
+ "loss": 0.3948,
1741
+ "step": 244
1742
+ },
1743
+ {
1744
+ "epoch": 0.75,
1745
+ "grad_norm": 0.05210488390120664,
1746
+ "learning_rate": 0.00015310911322572753,
1747
+ "loss": 0.4184,
1748
+ "step": 245
1749
+ },
1750
+ {
1751
+ "epoch": 0.75,
1752
+ "grad_norm": 0.06515345694366666,
1753
+ "learning_rate": 0.000152669141192587,
1754
+ "loss": 0.4143,
1755
+ "step": 246
1756
+ },
1757
+ {
1758
+ "epoch": 0.75,
1759
+ "eval_loss": 0.40236926078796387,
1760
+ "eval_runtime": 124.4453,
1761
+ "eval_samples_per_second": 12.745,
1762
+ "eval_steps_per_second": 0.402,
1763
+ "step": 246
1764
+ },
1765
+ {
1766
+ "epoch": 0.75,
1767
+ "grad_norm": 0.06418700258936089,
1768
+ "learning_rate": 0.00015222775365981273,
1769
+ "loss": 0.4216,
1770
+ "step": 247
1771
+ },
1772
+ {
1773
+ "epoch": 0.75,
1774
+ "grad_norm": 0.057372949478391395,
1775
+ "learning_rate": 0.00015178496248983254,
1776
+ "loss": 0.3997,
1777
+ "step": 248
1778
+ },
1779
+ {
1780
+ "epoch": 0.76,
1781
+ "grad_norm": 0.06880826702576336,
1782
+ "learning_rate": 0.00015134077958279765,
1783
+ "loss": 0.4295,
1784
+ "step": 249
1785
+ },
1786
+ {
1787
+ "epoch": 0.76,
1788
+ "grad_norm": 0.055593196615394004,
1789
+ "learning_rate": 0.00015089521687626243,
1790
+ "loss": 0.411,
1791
+ "step": 250
1792
+ },
1793
+ {
1794
+ "epoch": 0.76,
1795
+ "grad_norm": 0.05511861346121708,
1796
+ "learning_rate": 0.000150448286344864,
1797
+ "loss": 0.401,
1798
+ "step": 251
1799
+ },
1800
+ {
1801
+ "epoch": 0.77,
1802
+ "grad_norm": 0.07014458527477682,
1803
+ "learning_rate": 0.00015000000000000001,
1804
+ "loss": 0.382,
1805
+ "step": 252
1806
+ },
1807
+ {
1808
+ "epoch": 0.77,
1809
+ "grad_norm": 0.05457886481391161,
1810
+ "learning_rate": 0.00014955036988950618,
1811
+ "loss": 0.3736,
1812
+ "step": 253
1813
+ },
1814
+ {
1815
+ "epoch": 0.77,
1816
+ "grad_norm": 0.05889065936490638,
1817
+ "learning_rate": 0.00014909940809733222,
1818
+ "loss": 0.4199,
1819
+ "step": 254
1820
+ },
1821
+ {
1822
+ "epoch": 0.78,
1823
+ "grad_norm": 0.07098120686793513,
1824
+ "learning_rate": 0.00014864712674321734,
1825
+ "loss": 0.4303,
1826
+ "step": 255
1827
+ },
1828
+ {
1829
+ "epoch": 0.78,
1830
+ "grad_norm": 0.05568381009416088,
1831
+ "learning_rate": 0.00014819353798236427,
1832
+ "loss": 0.4221,
1833
+ "step": 256
1834
+ },
1835
+ {
1836
+ "epoch": 0.78,
1837
+ "grad_norm": 0.054097940137764075,
1838
+ "learning_rate": 0.00014773865400511272,
1839
+ "loss": 0.4087,
1840
+ "step": 257
1841
+ },
1842
+ {
1843
+ "epoch": 0.79,
1844
+ "grad_norm": 0.05779850643824985,
1845
+ "learning_rate": 0.00014728248703661182,
1846
+ "loss": 0.4293,
1847
+ "step": 258
1848
+ },
1849
+ {
1850
+ "epoch": 0.79,
1851
+ "grad_norm": 0.051717105638374755,
1852
+ "learning_rate": 0.00014682504933649144,
1853
+ "loss": 0.3857,
1854
+ "step": 259
1855
+ },
1856
+ {
1857
+ "epoch": 0.79,
1858
+ "grad_norm": 0.055928705445817024,
1859
+ "learning_rate": 0.00014636635319853275,
1860
+ "loss": 0.4268,
1861
+ "step": 260
1862
+ },
1863
+ {
1864
+ "epoch": 0.79,
1865
+ "grad_norm": 0.053618998300878094,
1866
+ "learning_rate": 0.00014590641095033787,
1867
+ "loss": 0.3847,
1868
+ "step": 261
1869
+ },
1870
+ {
1871
+ "epoch": 0.8,
1872
+ "grad_norm": 0.06096446521204243,
1873
+ "learning_rate": 0.00014544523495299842,
1874
+ "loss": 0.4154,
1875
+ "step": 262
1876
+ },
1877
+ {
1878
+ "epoch": 0.8,
1879
+ "grad_norm": 0.07227680725494777,
1880
+ "learning_rate": 0.0001449828376007636,
1881
+ "loss": 0.4213,
1882
+ "step": 263
1883
+ },
1884
+ {
1885
+ "epoch": 0.8,
1886
+ "grad_norm": 0.054914340367891584,
1887
+ "learning_rate": 0.0001445192313207067,
1888
+ "loss": 0.3747,
1889
+ "step": 264
1890
+ },
1891
+ {
1892
+ "epoch": 0.81,
1893
+ "grad_norm": 0.061131620471654886,
1894
+ "learning_rate": 0.0001440544285723915,
1895
+ "loss": 0.363,
1896
+ "step": 265
1897
+ },
1898
+ {
1899
+ "epoch": 0.81,
1900
+ "grad_norm": 0.06725204951613678,
1901
+ "learning_rate": 0.00014358844184753712,
1902
+ "loss": 0.3789,
1903
+ "step": 266
1904
+ },
1905
+ {
1906
+ "epoch": 0.81,
1907
+ "grad_norm": 0.05234337600175846,
1908
+ "learning_rate": 0.00014312128366968243,
1909
+ "loss": 0.3919,
1910
+ "step": 267
1911
+ },
1912
+ {
1913
+ "epoch": 0.82,
1914
+ "grad_norm": 0.050646805688266065,
1915
+ "learning_rate": 0.00014265296659384956,
1916
+ "loss": 0.3917,
1917
+ "step": 268
1918
+ },
1919
+ {
1920
+ "epoch": 0.82,
1921
+ "grad_norm": 0.06770924955750708,
1922
+ "learning_rate": 0.00014218350320620624,
1923
+ "loss": 0.3646,
1924
+ "step": 269
1925
+ },
1926
+ {
1927
+ "epoch": 0.82,
1928
+ "grad_norm": 0.059036905615901046,
1929
+ "learning_rate": 0.0001417129061237278,
1930
+ "loss": 0.4116,
1931
+ "step": 270
1932
+ },
1933
+ {
1934
+ "epoch": 0.82,
1935
+ "grad_norm": 0.0563149482067058,
1936
+ "learning_rate": 0.00014124118799385796,
1937
+ "loss": 0.3883,
1938
+ "step": 271
1939
+ },
1940
+ {
1941
+ "epoch": 0.83,
1942
+ "grad_norm": 0.05101811434687905,
1943
+ "learning_rate": 0.00014076836149416887,
1944
+ "loss": 0.3849,
1945
+ "step": 272
1946
+ },
1947
+ {
1948
+ "epoch": 0.83,
1949
+ "grad_norm": 0.05877249197444632,
1950
+ "learning_rate": 0.0001402944393320206,
1951
+ "loss": 0.3934,
1952
+ "step": 273
1953
+ },
1954
+ {
1955
+ "epoch": 0.83,
1956
+ "grad_norm": 0.060264919177227615,
1957
+ "learning_rate": 0.00013981943424421932,
1958
+ "loss": 0.3974,
1959
+ "step": 274
1960
+ },
1961
+ {
1962
+ "epoch": 0.84,
1963
+ "grad_norm": 0.05389918370856248,
1964
+ "learning_rate": 0.00013934335899667527,
1965
+ "loss": 0.3673,
1966
+ "step": 275
1967
+ },
1968
+ {
1969
+ "epoch": 0.84,
1970
+ "grad_norm": 0.06003023462725107,
1971
+ "learning_rate": 0.00013886622638405952,
1972
+ "loss": 0.3816,
1973
+ "step": 276
1974
+ },
1975
+ {
1976
+ "epoch": 0.84,
1977
+ "grad_norm": 0.06410228363296318,
1978
+ "learning_rate": 0.00013838804922946027,
1979
+ "loss": 0.3726,
1980
+ "step": 277
1981
+ },
1982
+ {
1983
+ "epoch": 0.85,
1984
+ "grad_norm": 0.05930094712489676,
1985
+ "learning_rate": 0.00013790884038403795,
1986
+ "loss": 0.3935,
1987
+ "step": 278
1988
+ },
1989
+ {
1990
+ "epoch": 0.85,
1991
+ "grad_norm": 0.060659826195864366,
1992
+ "learning_rate": 0.00013742861272668012,
1993
+ "loss": 0.3836,
1994
+ "step": 279
1995
+ },
1996
+ {
1997
+ "epoch": 0.85,
1998
+ "grad_norm": 0.05760349296820562,
1999
+ "learning_rate": 0.00013694737916365517,
2000
+ "loss": 0.3787,
2001
+ "step": 280
2002
+ },
2003
+ {
2004
+ "epoch": 0.86,
2005
+ "grad_norm": 0.0571696327815726,
2006
+ "learning_rate": 0.00013646515262826552,
2007
+ "loss": 0.3989,
2008
+ "step": 281
2009
+ },
2010
+ {
2011
+ "epoch": 0.86,
2012
+ "grad_norm": 0.05528789919119492,
2013
+ "learning_rate": 0.0001359819460805001,
2014
+ "loss": 0.3878,
2015
+ "step": 282
2016
+ },
2017
+ {
2018
+ "epoch": 0.86,
2019
+ "grad_norm": 0.0556735022387613,
2020
+ "learning_rate": 0.0001354977725066859,
2021
+ "loss": 0.3946,
2022
+ "step": 283
2023
+ },
2024
+ {
2025
+ "epoch": 0.86,
2026
+ "grad_norm": 0.0518340727564826,
2027
+ "learning_rate": 0.00013501264491913906,
2028
+ "loss": 0.4182,
2029
+ "step": 284
2030
+ },
2031
+ {
2032
+ "epoch": 0.87,
2033
+ "grad_norm": 0.05087174550075183,
2034
+ "learning_rate": 0.0001345265763558152,
2035
+ "loss": 0.3745,
2036
+ "step": 285
2037
+ },
2038
+ {
2039
+ "epoch": 0.87,
2040
+ "grad_norm": 0.059018891463680286,
2041
+ "learning_rate": 0.00013403957987995882,
2042
+ "loss": 0.4826,
2043
+ "step": 286
2044
+ },
2045
+ {
2046
+ "epoch": 0.87,
2047
+ "grad_norm": 0.05644881722680517,
2048
+ "learning_rate": 0.0001335516685797525,
2049
+ "loss": 0.4224,
2050
+ "step": 287
2051
+ },
2052
+ {
2053
+ "epoch": 0.88,
2054
+ "grad_norm": 0.05366777723333598,
2055
+ "learning_rate": 0.00013306285556796495,
2056
+ "loss": 0.3873,
2057
+ "step": 288
2058
+ },
2059
+ {
2060
+ "epoch": 0.88,
2061
+ "grad_norm": 0.055307686375362285,
2062
+ "learning_rate": 0.00013257315398159864,
2063
+ "loss": 0.4107,
2064
+ "step": 289
2065
+ },
2066
+ {
2067
+ "epoch": 0.88,
2068
+ "grad_norm": 0.05402204621055011,
2069
+ "learning_rate": 0.00013208257698153677,
2070
+ "loss": 0.371,
2071
+ "step": 290
2072
+ },
2073
+ {
2074
+ "epoch": 0.89,
2075
+ "grad_norm": 0.05528634732726841,
2076
+ "learning_rate": 0.00013159113775218964,
2077
+ "loss": 0.3754,
2078
+ "step": 291
2079
+ },
2080
+ {
2081
+ "epoch": 0.89,
2082
+ "grad_norm": 0.06412117904903747,
2083
+ "learning_rate": 0.00013109884950114007,
2084
+ "loss": 0.3863,
2085
+ "step": 292
2086
+ },
2087
+ {
2088
+ "epoch": 0.89,
2089
+ "grad_norm": 0.06473008528972156,
2090
+ "learning_rate": 0.00013060572545878875,
2091
+ "loss": 0.3987,
2092
+ "step": 293
2093
+ },
2094
+ {
2095
+ "epoch": 0.89,
2096
+ "grad_norm": 0.05924591082609274,
2097
+ "learning_rate": 0.00013011177887799845,
2098
+ "loss": 0.4012,
2099
+ "step": 294
2100
+ },
2101
+ {
2102
+ "epoch": 0.9,
2103
+ "grad_norm": 0.05708955974814907,
2104
+ "learning_rate": 0.00012961702303373795,
2105
+ "loss": 0.3873,
2106
+ "step": 295
2107
+ },
2108
+ {
2109
+ "epoch": 0.9,
2110
+ "grad_norm": 0.05797668418156991,
2111
+ "learning_rate": 0.00012912147122272523,
2112
+ "loss": 0.3948,
2113
+ "step": 296
2114
+ },
2115
+ {
2116
+ "epoch": 0.9,
2117
+ "grad_norm": 0.05490087707362648,
2118
+ "learning_rate": 0.00012862513676307008,
2119
+ "loss": 0.4133,
2120
+ "step": 297
2121
+ },
2122
+ {
2123
+ "epoch": 0.91,
2124
+ "grad_norm": 0.04985772375715609,
2125
+ "learning_rate": 0.00012812803299391628,
2126
+ "loss": 0.343,
2127
+ "step": 298
2128
+ },
2129
+ {
2130
+ "epoch": 0.91,
2131
+ "grad_norm": 0.053384085479669836,
2132
+ "learning_rate": 0.00012763017327508305,
2133
+ "loss": 0.3968,
2134
+ "step": 299
2135
+ },
2136
+ {
2137
+ "epoch": 0.91,
2138
+ "grad_norm": 0.054182728844989564,
2139
+ "learning_rate": 0.0001271315709867059,
2140
+ "loss": 0.3691,
2141
+ "step": 300
2142
+ },
2143
+ {
2144
+ "epoch": 0.92,
2145
+ "grad_norm": 0.05366480319869371,
2146
+ "learning_rate": 0.00012663223952887723,
2147
+ "loss": 0.3641,
2148
+ "step": 301
2149
+ },
2150
+ {
2151
+ "epoch": 0.92,
2152
+ "grad_norm": 0.05825052467442966,
2153
+ "learning_rate": 0.00012613219232128608,
2154
+ "loss": 0.4246,
2155
+ "step": 302
2156
+ },
2157
+ {
2158
+ "epoch": 0.92,
2159
+ "grad_norm": 0.05914668021956808,
2160
+ "learning_rate": 0.00012563144280285741,
2161
+ "loss": 0.4466,
2162
+ "step": 303
2163
+ },
2164
+ {
2165
+ "epoch": 0.93,
2166
+ "grad_norm": 0.05311433022303428,
2167
+ "learning_rate": 0.00012513000443139112,
2168
+ "loss": 0.3524,
2169
+ "step": 304
2170
+ },
2171
+ {
2172
+ "epoch": 0.93,
2173
+ "grad_norm": 0.06271465514138942,
2174
+ "learning_rate": 0.00012462789068320017,
2175
+ "loss": 0.3966,
2176
+ "step": 305
2177
+ },
2178
+ {
2179
+ "epoch": 0.93,
2180
+ "grad_norm": 0.061437635915296405,
2181
+ "learning_rate": 0.00012412511505274844,
2182
+ "loss": 0.4154,
2183
+ "step": 306
2184
+ },
2185
+ {
2186
+ "epoch": 0.93,
2187
+ "grad_norm": 0.061713719162943416,
2188
+ "learning_rate": 0.00012362169105228826,
2189
+ "loss": 0.3878,
2190
+ "step": 307
2191
+ },
2192
+ {
2193
+ "epoch": 0.94,
2194
+ "grad_norm": 0.06620759210040088,
2195
+ "learning_rate": 0.000123117632211497,
2196
+ "loss": 0.4021,
2197
+ "step": 308
2198
+ },
2199
+ {
2200
+ "epoch": 0.94,
2201
+ "grad_norm": 0.059779518980777165,
2202
+ "learning_rate": 0.00012261295207711346,
2203
+ "loss": 0.3804,
2204
+ "step": 309
2205
+ },
2206
+ {
2207
+ "epoch": 0.94,
2208
+ "grad_norm": 0.06040057457515796,
2209
+ "learning_rate": 0.0001221076642125742,
2210
+ "loss": 0.3961,
2211
+ "step": 310
2212
+ },
2213
+ {
2214
+ "epoch": 0.95,
2215
+ "grad_norm": 0.053782038877654016,
2216
+ "learning_rate": 0.00012160178219764837,
2217
+ "loss": 0.3713,
2218
+ "step": 311
2219
+ },
2220
+ {
2221
+ "epoch": 0.95,
2222
+ "grad_norm": 0.05671465286518603,
2223
+ "learning_rate": 0.00012109531962807332,
2224
+ "loss": 0.3944,
2225
+ "step": 312
2226
+ },
2227
+ {
2228
+ "epoch": 0.95,
2229
+ "grad_norm": 0.050310610945257364,
2230
+ "learning_rate": 0.00012058829011518896,
2231
+ "loss": 0.3897,
2232
+ "step": 313
2233
+ },
2234
+ {
2235
+ "epoch": 0.96,
2236
+ "grad_norm": 0.05388334304160659,
2237
+ "learning_rate": 0.00012008070728557186,
2238
+ "loss": 0.4021,
2239
+ "step": 314
2240
+ },
2241
+ {
2242
+ "epoch": 0.96,
2243
+ "grad_norm": 0.05025295406407605,
2244
+ "learning_rate": 0.00011957258478066931,
2245
+ "loss": 0.3834,
2246
+ "step": 315
2247
+ },
2248
+ {
2249
+ "epoch": 0.96,
2250
+ "grad_norm": 0.04961052380756055,
2251
+ "learning_rate": 0.00011906393625643244,
2252
+ "loss": 0.3577,
2253
+ "step": 316
2254
+ },
2255
+ {
2256
+ "epoch": 0.96,
2257
+ "grad_norm": 0.055687911797134414,
2258
+ "learning_rate": 0.00011855477538294935,
2259
+ "loss": 0.3783,
2260
+ "step": 317
2261
+ },
2262
+ {
2263
+ "epoch": 0.97,
2264
+ "grad_norm": 0.04787231501340896,
2265
+ "learning_rate": 0.00011804511584407763,
2266
+ "loss": 0.3389,
2267
+ "step": 318
2268
+ },
2269
+ {
2270
+ "epoch": 0.97,
2271
+ "grad_norm": 0.05861699517422006,
2272
+ "learning_rate": 0.00011753497133707679,
2273
+ "loss": 0.3999,
2274
+ "step": 319
2275
+ },
2276
+ {
2277
+ "epoch": 0.97,
2278
+ "grad_norm": 0.05407827216847258,
2279
+ "learning_rate": 0.00011702435557223987,
2280
+ "loss": 0.3368,
2281
+ "step": 320
2282
+ },
2283
+ {
2284
+ "epoch": 0.98,
2285
+ "grad_norm": 0.055141560736813096,
2286
+ "learning_rate": 0.00011651328227252517,
2287
+ "loss": 0.3174,
2288
+ "step": 321
2289
+ },
2290
+ {
2291
+ "epoch": 0.98,
2292
+ "grad_norm": 0.0580826161428443,
2293
+ "learning_rate": 0.00011600176517318741,
2294
+ "loss": 0.3683,
2295
+ "step": 322
2296
+ },
2297
+ {
2298
+ "epoch": 0.98,
2299
+ "grad_norm": 0.05553542087792669,
2300
+ "learning_rate": 0.00011548981802140848,
2301
+ "loss": 0.3767,
2302
+ "step": 323
2303
+ },
2304
+ {
2305
+ "epoch": 0.99,
2306
+ "grad_norm": 0.05569746872191059,
2307
+ "learning_rate": 0.00011497745457592816,
2308
+ "loss": 0.3447,
2309
+ "step": 324
2310
+ },
2311
+ {
2312
+ "epoch": 0.99,
2313
+ "grad_norm": 0.05769366910268819,
2314
+ "learning_rate": 0.00011446468860667421,
2315
+ "loss": 0.3777,
2316
+ "step": 325
2317
+ },
2318
+ {
2319
+ "epoch": 0.99,
2320
+ "grad_norm": 0.05852472491421206,
2321
+ "learning_rate": 0.00011395153389439233,
2322
+ "loss": 0.4017,
2323
+ "step": 326
2324
+ },
2325
+ {
2326
+ "epoch": 1.0,
2327
+ "grad_norm": 0.05111687503639324,
2328
+ "learning_rate": 0.00011343800423027582,
2329
+ "loss": 0.38,
2330
+ "step": 327
2331
+ },
2332
+ {
2333
+ "epoch": 1.0,
2334
+ "grad_norm": 0.05289578020364973,
2335
+ "learning_rate": 0.0001129241134155949,
2336
+ "loss": 0.3856,
2337
+ "step": 328
2338
+ },
2339
+ {
2340
+ "epoch": 1.0,
2341
+ "eval_loss": 0.39025306701660156,
2342
+ "eval_runtime": 123.5882,
2343
+ "eval_samples_per_second": 12.833,
2344
+ "eval_steps_per_second": 0.405,
2345
+ "step": 328
2346
+ }
2347
+ ],
2348
+ "logging_steps": 1,
2349
+ "max_steps": 656,
2350
+ "num_input_tokens_seen": 0,
2351
+ "num_train_epochs": 2,
2352
+ "save_steps": 328,
2353
+ "total_flos": 8.661792109281411e+18,
2354
+ "train_batch_size": 4,
2355
+ "trial_name": null,
2356
+ "trial_params": null
2357
+ }
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c7069f6ea8100401f859df9be683a86238eadf385bf57f46282a090e50b3943
3
+ size 7224
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-328/zero_to_fp32.py ADDED
@@ -0,0 +1,592 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example: python zero_to_fp32.py . pytorch_model.bin
14
+
15
+ import argparse
16
+ import torch
17
+ import glob
18
+ import math
19
+ import os
20
+ import re
21
+ from collections import OrderedDict
22
+ from dataclasses import dataclass
23
+
24
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
25
+ # DeepSpeed data structures it has to be available in the current python environment.
26
+ from deepspeed.utils import logger
27
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
28
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
29
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
30
+
31
+
32
@dataclass
class zero_model_state:
    """Plain record holding the pieces of one parsed model-states checkpoint.

    The annotations use the ``dict`` type itself; the previous ``dict()``
    spelling evaluated to an empty dict *instance*, which is not a type and
    confuses type checkers and any tooling that introspects dataclass fields.

    NOTE(review): the field meanings below are inferred from the field names
    only -- confirm against the code that constructs this record.
    """
    # presumably buffer name -> tensor; verify against the constructor call site
    buffers: dict
    # presumably parameter name -> shape; verify against the constructor call site
    param_shapes: dict
    shared_params: list
    # NOTE(review): annotated as int in the original; confirm the actual type stored
    ds_version: int
    # presumably frozen-parameter name -> shape; verify against the constructor call site
    frozen_param_shapes: dict
    # presumably frozen-parameter name -> tensor fragment; verify against the constructor call site
    frozen_param_fragments: dict
40
+
41
+
42
+ debug = 0
43
+
44
+ # load to cpu
45
+ device = torch.device('cpu')
46
+
47
+
48
+ def atoi(text):
49
+ return int(text) if text.isdigit() else text
50
+
51
+
52
+ def natural_keys(text):
53
+ '''
54
+ alist.sort(key=natural_keys) sorts in human order
55
+ http://nedbatchelder.com/blog/200712/human_sorting.html
56
+ (See Toothy's implementation in the comments)
57
+ '''
58
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
59
+
60
+
61
+ def get_model_state_file(checkpoint_dir, zero_stage):
62
+ if not os.path.isdir(checkpoint_dir):
63
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
64
+
65
+ # there should be only one file
66
+ if zero_stage <= 2:
67
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
68
+ elif zero_stage == 3:
69
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
70
+
71
+ if not os.path.exists(file):
72
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
73
+
74
+ return file
75
+
76
+
77
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
78
+ # XXX: need to test that this simple glob rule works for multi-node setup too
79
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
80
+
81
+ if len(ckpt_files) == 0:
82
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
83
+
84
+ return ckpt_files
85
+
86
+
87
+ def get_optim_files(checkpoint_dir):
88
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
89
+
90
+
91
+ def get_model_state_files(checkpoint_dir):
92
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
93
+
94
+
95
+ def parse_model_states(files):
96
+ zero_model_states = []
97
+ for file in files:
98
+ state_dict = torch.load(file, map_location=device)
99
+
100
+ if BUFFER_NAMES not in state_dict:
101
+ raise ValueError(f"{file} is not a model state checkpoint")
102
+ buffer_names = state_dict[BUFFER_NAMES]
103
+ if debug:
104
+ print("Found buffers:", buffer_names)
105
+
106
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
107
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
108
+ param_shapes = state_dict[PARAM_SHAPES]
109
+
110
+ # collect parameters that are included in param_shapes
111
+ param_names = []
112
+ for s in param_shapes:
113
+ for name in s.keys():
114
+ param_names.append(name)
115
+
116
+ # update with frozen parameters
117
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
118
+ if frozen_param_shapes is not None:
119
+ if debug:
120
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
121
+ param_names += list(frozen_param_shapes.keys())
122
+
123
+ # handle shared params
124
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
125
+
126
+ ds_version = state_dict.get(DS_VERSION, None)
127
+
128
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
129
+
130
+ z_model_state = zero_model_state(buffers=buffers,
131
+ param_shapes=param_shapes,
132
+ shared_params=shared_params,
133
+ ds_version=ds_version,
134
+ frozen_param_shapes=frozen_param_shapes,
135
+ frozen_param_fragments=frozen_param_fragments)
136
+ zero_model_states.append(z_model_state)
137
+
138
+ return zero_model_states
139
+
140
+
141
+ def parse_optim_states(files, ds_checkpoint_dir):
142
+
143
+ total_files = len(files)
144
+ state_dicts = []
145
+ for f in files:
146
+ state_dict = torch.load(f, map_location=device)
147
+ # immediately discard the two potentially huge optimizer states, as we only care about the fp32 master weights
148
+ # and also handle the case where it was already removed by another helper script
149
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
150
+ state_dicts.append(state_dict)
151
+
152
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
153
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
154
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
155
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
156
+
157
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
158
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
159
+ # use the max of the partition_count to get the dp world_size.
160
+
161
+ if type(world_size) is list:
162
+ world_size = max(world_size)
163
+
164
+ if world_size != total_files:
165
+ raise ValueError(
166
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
167
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
168
+ )
169
+
170
+ # the groups are named differently in each stage
171
+ if zero_stage <= 2:
172
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
173
+ elif zero_stage == 3:
174
+ fp32_groups_key = FP32_FLAT_GROUPS
175
+ else:
176
+ raise ValueError(f"unknown zero stage {zero_stage}")
177
+
178
+ if zero_stage <= 2:
179
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
180
+ elif zero_stage == 3:
181
+ # if there is more than one param group, there will be multiple flattened tensors - one
182
+ # flattened tensor per group - for simplicity merge them into a single tensor
183
+ #
184
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
185
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
186
+
187
+ fp32_flat_groups = [
188
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
189
+ ]
190
+
191
+ return zero_stage, world_size, fp32_flat_groups
192
+
193
+
194
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
195
+ """
196
+ Returns fp32 state_dict reconstructed from ds checkpoint
197
+
198
+ Args:
199
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
200
+
201
+ """
202
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
203
+
204
+ optim_files = get_optim_files(ds_checkpoint_dir)
205
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
206
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
207
+
208
+ model_files = get_model_state_files(ds_checkpoint_dir)
209
+
210
+ zero_model_states = parse_model_states(model_files)
211
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
212
+
213
+ if zero_stage <= 2:
214
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
215
+ elif zero_stage == 3:
216
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
217
+
218
+
219
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
220
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
221
+ return
222
+
223
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
224
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
225
+
226
+ if debug:
227
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
228
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
229
+
230
+ wanted_params = len(frozen_param_shapes)
231
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
232
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
233
+ print(f'Frozen params: Have {avail_numel} numels to process.')
234
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
235
+
236
+ total_params = 0
237
+ total_numel = 0
238
+ for name, shape in frozen_param_shapes.items():
239
+ total_params += 1
240
+ unpartitioned_numel = shape.numel()
241
+ total_numel += unpartitioned_numel
242
+
243
+ state_dict[name] = frozen_param_fragments[name]
244
+
245
+ if debug:
246
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
247
+
248
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
249
+
250
+
251
+ def _has_callable(obj, fn):
252
+ attr = getattr(obj, fn, None)
253
+ return callable(attr)
254
+
255
+
256
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
257
+ param_shapes = zero_model_states[0].param_shapes
258
+
259
+ # Reconstruction protocol:
260
+ #
261
+ # XXX: document this
262
+
263
+ if debug:
264
+ for i in range(world_size):
265
+ for j in range(len(fp32_flat_groups[0])):
266
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
267
+
268
+ # XXX: memory usage doubles here (zero2)
269
+ num_param_groups = len(fp32_flat_groups[0])
270
+ merged_single_partition_of_fp32_groups = []
271
+ for i in range(num_param_groups):
272
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
273
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
274
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
275
+ avail_numel = sum(
276
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
277
+
278
+ if debug:
279
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
280
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
281
+ # not asserting if there is a mismatch due to possible padding
282
+ print(f"Have {avail_numel} numels to process.")
283
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
284
+
285
+ # params
286
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
287
+ # out-of-core computing solution
288
+ total_numel = 0
289
+ total_params = 0
290
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
291
+ offset = 0
292
+ avail_numel = full_single_fp32_vector.numel()
293
+ for name, shape in shapes.items():
294
+
295
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
296
+ total_numel += unpartitioned_numel
297
+ total_params += 1
298
+
299
+ if debug:
300
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
301
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
302
+ offset += unpartitioned_numel
303
+
304
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
305
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
306
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
307
+ # live optimizer object, so we are checking that the numbers are within the right range
308
+ align_to = 2 * world_size
309
+
310
+ def zero2_align(x):
311
+ return align_to * math.ceil(x / align_to)
312
+
313
+ if debug:
314
+ print(f"original offset={offset}, avail_numel={avail_numel}")
315
+
316
+ offset = zero2_align(offset)
317
+ avail_numel = zero2_align(avail_numel)
318
+
319
+ if debug:
320
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
321
+
322
+ # Sanity check
323
+ if offset != avail_numel:
324
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
325
+
326
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
327
+
328
+
329
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
330
+ state_dict = OrderedDict()
331
+
332
+ # buffers
333
+ buffers = zero_model_states[0].buffers
334
+ state_dict.update(buffers)
335
+ if debug:
336
+ print(f"added {len(buffers)} buffers")
337
+
338
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
339
+
340
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
341
+
342
+ # recover shared parameters
343
+ for pair in zero_model_states[0].shared_params:
344
+ if pair[1] in state_dict:
345
+ state_dict[pair[0]] = state_dict[pair[1]]
346
+
347
+ return state_dict
348
+
349
+
350
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
351
+ remainder = unpartitioned_numel % world_size
352
+ padding_numel = (world_size - remainder) if remainder else 0
353
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
354
+ return partitioned_numel, padding_numel
355
+
356
+
357
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
358
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
359
+ return
360
+
361
+ if debug:
362
+ for i in range(world_size):
363
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
364
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
365
+
366
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
367
+ wanted_params = len(frozen_param_shapes)
368
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
369
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
370
+ print(f'Frozen params: Have {avail_numel} numels to process.')
371
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
372
+
373
+ total_params = 0
374
+ total_numel = 0
375
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
376
+ total_params += 1
377
+ unpartitioned_numel = shape.numel()
378
+ total_numel += unpartitioned_numel
379
+
380
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
381
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
382
+
383
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
384
+
385
+ if debug:
386
+ print(
387
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
388
+ )
389
+
390
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
391
+
392
+
393
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
394
+ param_shapes = zero_model_states[0].param_shapes
395
+ avail_numel = fp32_flat_groups[0].numel() * world_size
396
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
397
+ # param, re-consolidating each param, while dealing with padding if any
398
+
399
+ # merge list of dicts, preserving order
400
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
401
+
402
+ if debug:
403
+ for i in range(world_size):
404
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
405
+
406
+ wanted_params = len(param_shapes)
407
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
408
+ # not asserting if there is a mismatch due to possible padding
409
+ avail_numel = fp32_flat_groups[0].numel() * world_size
410
+ print(f"Trainable params: Have {avail_numel} numels to process.")
411
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
412
+
413
+ # params
414
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
415
+ # out-of-core computing solution
416
+ offset = 0
417
+ total_numel = 0
418
+ total_params = 0
419
+ for name, shape in param_shapes.items():
420
+
421
+ unpartitioned_numel = shape.numel()
422
+ total_numel += unpartitioned_numel
423
+ total_params += 1
424
+
425
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
426
+
427
+ if debug:
428
+ print(
429
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
430
+ )
431
+
432
+ # XXX: memory usage doubles here
433
+ state_dict[name] = torch.cat(
434
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
435
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
436
+ offset += partitioned_numel
437
+
438
+ offset *= world_size
439
+
440
+ # Sanity check
441
+ if offset != avail_numel:
442
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
443
+
444
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
445
+
446
+
447
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
448
+ state_dict = OrderedDict()
449
+
450
+ # buffers
451
+ buffers = zero_model_states[0].buffers
452
+ state_dict.update(buffers)
453
+ if debug:
454
+ print(f"added {len(buffers)} buffers")
455
+
456
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
457
+
458
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
459
+
460
+ # recover shared parameters
461
+ for pair in zero_model_states[0].shared_params:
462
+ if pair[1] in state_dict:
463
+ state_dict[pair[0]] = state_dict[pair[1]]
464
+
465
+ return state_dict
466
+
467
+
468
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
469
+ """
470
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
471
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
472
+ via a model hub.
473
+
474
+ Args:
475
+ - ``checkpoint_dir``: path to the desired checkpoint folder
476
+ - ``tag``: checkpoint tag used as a unique identifier for the checkpoint, e.g., ``global_step14``. If not provided, will attempt to load the tag from the 'latest' file.
477
+
478
+ Returns:
479
+ - pytorch ``state_dict``
480
+
481
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
482
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
483
+ the checkpoint.
484
+
485
+ A typical usage might be ::
486
+
487
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
488
+ # do the training and checkpoint saving
489
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
490
+ model = model.cpu() # move to cpu
491
+ model.load_state_dict(state_dict)
492
+ # submit to model hub or save the model to share with others
493
+
494
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
495
+ application. i.e. you will need to re-initialize the deepspeed engine, since
496
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
497
+
498
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
499
+
500
+ """
501
+ if tag is None:
502
+ latest_path = os.path.join(checkpoint_dir, 'latest')
503
+ if os.path.isfile(latest_path):
504
+ with open(latest_path, 'r') as fd:
505
+ tag = fd.read().strip()
506
+ else:
507
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
508
+
509
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
510
+
511
+ if not os.path.isdir(ds_checkpoint_dir):
512
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
513
+
514
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
515
+
516
+
517
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
518
+ """
519
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
520
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
521
+
522
+ Args:
523
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
524
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
525
+ - ``tag``: checkpoint tag used as a unique identifier for the checkpoint, e.g., ``global_step14``. If not provided, will attempt to load the tag from the file named ``latest`` in the checkpoint folder.
526
+ """
527
+
528
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
529
+ print(f"Saving fp32 state dict to {output_file}")
530
+ torch.save(state_dict, output_file)
531
+
532
+
533
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
534
+ """
535
+ 1. Put the provided model to cpu
536
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
537
+ 3. Load it into the provided model
538
+
539
+ Args:
540
+ - ``model``: the model object to update
541
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
542
+ - ``tag``: checkpoint tag used as a unique identifier for the checkpoint, e.g., ``global_step14``. If not provided, will attempt to load the tag from the file named ``latest`` in the checkpoint folder.
543
+
544
+ Returns:
545
+ - ``model``: modified model
546
+
547
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
548
+ have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
549
+ conveniently placed for you in the checkpoint folder.
550
+
551
+ A typical usage might be ::
552
+
553
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
554
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
555
+ # submit to model hub or save the model to share with others
556
+
557
+ Note that once this has been run, the ``model`` will no longer be usable in the deepspeed context
558
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
559
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
560
+
561
+ """
562
+ logger.info(f"Extracting fp32 weights")
563
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
564
+
565
+ logger.info(f"Overwriting model with fp32 weights")
566
+ model = model.cpu()
567
+ model.load_state_dict(state_dict, strict=False)
568
+
569
+ return model
570
+
571
+
572
+ if __name__ == "__main__":
573
+
574
+ parser = argparse.ArgumentParser()
575
+ parser.add_argument("checkpoint_dir",
576
+ type=str,
577
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
578
+ parser.add_argument(
579
+ "output_file",
580
+ type=str,
581
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
582
+ parser.add_argument("-t",
583
+ "--tag",
584
+ type=str,
585
+ default=None,
586
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
587
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
588
+ args = parser.parse_args()
589
+
590
+ debug = args.debug
591
+
592
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: deepseek-ai/deepseek-coder-33b-instruct
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "deepseek-ai/deepseek-coder-33b-instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": null,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 512,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 512,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "gate_proj",
24
+ "v_proj",
25
+ "down_proj",
26
+ "up_proj",
27
+ "o_proj",
28
+ "q_proj",
29
+ "k_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f4825f0cc5886b23fa55b664e84a2a8c454df7eaf2d1e60e9bf3e89712edbeb
3
+ size 7882790952
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:251c0ef5d177b36da6957b367c79ef95b4b0a8e909685cc0fd8e80fdd79d8fcc
3
+ size 5912017776
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65e0c97038ccf1be90045db12d9df93437425b0843d23f4ebae42ad75e7b43e9
3
+ size 5912017968
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df3fa01d941bf9c093646cefb6b450206e659e22bb2060875324119b04641953
3
+ size 5912017904
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61a8503509ccd10ac688d55b6b653f961abf16f83f0caf4ff48ba6276c170931
3
+ size 5912017456
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57da307cad84ae9fa928466e8dd4127be4fab1eea56cbcbd7dc9e2bc16e6cafe
3
+ size 5912017904
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5479286828bbf125efdc0e6efad8825858ca31b3cbb1fcb497d5e87c5eaefe77
3
+ size 5912018032
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:787f7be13cb5342bf4b594d94dd6c1592748ca757ec10f64bd924e3581ce9ebe
3
+ size 5912017904
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5cfe57125c2469d9921af0d1c939250eedb8cd01acbb1f5f1f7580241d390cf
3
+ size 5912017456
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/global_step656/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3993b55bf28d4cc7248d3be9c8ea5147ae7065f1d38731b7658f55bc354e39f1
3
+ size 7898870908
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step656
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ede1adf7bd5e316a79c69a737a77ffe40386468221df1d951a791fd8ea90c2e1
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f29e3c0d6b52d1682d5fa36eb0bea3979b971344c3b7bcfa7a8a69de2aa5523
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13e9078596afd111a96a356f098c49ae54d4262c4bec87c305412b399b0a4892
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8426d1884f494562c6c52394f6dc2891855b054586a3ddf0708e18c826ccc50
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b7e68bba25832a428e48e1146ef66ce72755893af3706ce3d84f7f4d3894d3e
3
+ size 15984
gcp_models/lora-logo_fix_full_deepseek33b_ds33i_lr_0.0002_alpha_512_r_512/checkpoint-656/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e952cbb371f2547cc3c81e411209b949ebe232fb1e045cc24332638332b7a9c3
3
+ size 15984