inflaton committed
Commit 69fc39f · 1 Parent(s): bdc91f5

fine-tuned qwen2 72b

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. llama-factory/saves/Qwen2-72B-Instruct/README.md +87 -0
  2. llama-factory/saves/Qwen2-72B-Instruct/adapter_config.json +34 -0
  3. llama-factory/saves/Qwen2-72B-Instruct/adapter_model.safetensors +3 -0
  4. llama-factory/saves/Qwen2-72B-Instruct/added_tokens.json +5 -0
  5. llama-factory/saves/Qwen2-72B-Instruct/all_results.json +12 -0
  6. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/README.md +202 -0
  7. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/adapter_config.json +34 -0
  8. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/adapter_model.safetensors +3 -0
  9. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/added_tokens.json +5 -0
  10. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/merges.txt +0 -0
  11. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/optimizer.pt +3 -0
  12. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/rng_state.pth +3 -0
  13. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/scheduler.pt +3 -0
  14. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/special_tokens_map.json +20 -0
  15. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/tokenizer.json +0 -0
  16. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/tokenizer_config.json +44 -0
  17. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/trainer_state.json +1623 -0
  18. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/training_args.bin +3 -0
  19. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/vocab.json +0 -0
  20. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/README.md +202 -0
  21. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/adapter_config.json +34 -0
  22. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/adapter_model.safetensors +3 -0
  23. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/added_tokens.json +5 -0
  24. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/merges.txt +0 -0
  25. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/optimizer.pt +3 -0
  26. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/rng_state.pth +3 -0
  27. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/scheduler.pt +3 -0
  28. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/special_tokens_map.json +20 -0
  29. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/tokenizer.json +0 -0
  30. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/tokenizer_config.json +44 -0
  31. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/trainer_state.json +1729 -0
  32. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/training_args.bin +3 -0
  33. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/vocab.json +0 -0
  34. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/README.md +202 -0
  35. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/adapter_config.json +34 -0
  36. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/adapter_model.safetensors +3 -0
  37. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/added_tokens.json +5 -0
  38. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/merges.txt +0 -0
  39. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/optimizer.pt +3 -0
  40. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/rng_state.pth +3 -0
  41. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/scheduler.pt +3 -0
  42. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/special_tokens_map.json +20 -0
  43. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/tokenizer.json +0 -0
  44. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/tokenizer_config.json +44 -0
  45. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/trainer_state.json +1835 -0
  46. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/training_args.bin +3 -0
  47. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/vocab.json +0 -0
  48. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1260/README.md +202 -0
  49. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1260/adapter_config.json +34 -0
  50. llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1260/adapter_model.safetensors +3 -0
llama-factory/saves/Qwen2-72B-Instruct/README.md ADDED
@@ -0,0 +1,87 @@
+ ---
+ base_model: Qwen/Qwen2-72B-Instruct
+ library_name: peft
+ license: other
+ tags:
+ - llama-factory
+ - lora
+ - generated_from_trainer
+ model-index:
+ - name: Qwen2-72B-Instruct
+ results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # Qwen2-72B-Instruct
+
+ This model is a fine-tuned version of [Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) on the alpaca_mac dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 2.6303
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0001
+ - train_batch_size: 2
+ - eval_batch_size: 1
+ - seed: 42
+ - gradient_accumulation_steps: 8
+ - total_train_batch_size: 16
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 6.0
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:------:|:----:|:---------------:|
+ | 1.5022 | 0.2499 | 70 | 1.4518 |
+ | 1.4121 | 0.4998 | 140 | 1.3727 |
+ | 1.4038 | 0.7497 | 210 | 1.3051 |
+ | 1.2739 | 0.9996 | 280 | 1.2890 |
+ | 1.1436 | 1.2494 | 350 | 1.3195 |
+ | 1.0783 | 1.4993 | 420 | 1.3106 |
+ | 1.1219 | 1.7492 | 490 | 1.3045 |
+ | 1.0966 | 1.9991 | 560 | 1.3094 |
+ | 0.6134 | 2.2490 | 630 | 1.4946 |
+ | 0.6342 | 2.4989 | 700 | 1.4859 |
+ | 0.6665 | 2.7488 | 770 | 1.5236 |
+ | 0.6101 | 2.9987 | 840 | 1.5220 |
+ | 0.2467 | 3.2485 | 910 | 1.8390 |
+ | 0.2284 | 3.4984 | 980 | 1.8253 |
+ | 0.2839 | 3.7483 | 1050 | 1.8688 |
+ | 0.2111 | 3.9982 | 1120 | 1.8910 |
+ | 0.0753 | 4.2481 | 1190 | 2.2224 |
+ | 0.072 | 4.4980 | 1260 | 2.3093 |
+ | 0.0351 | 4.7479 | 1330 | 2.2221 |
+ | 0.0644 | 4.9978 | 1400 | 2.2804 |
+ | 0.0257 | 5.2477 | 1470 | 2.5593 |
+ | 0.0249 | 5.4975 | 1540 | 2.6220 |
+ | 0.0238 | 5.7474 | 1610 | 2.6189 |
+ | 0.0262 | 5.9973 | 1680 | 2.6303 |
+
+
+ ### Framework versions
+
+ - PEFT 0.11.1
+ - Transformers 4.43.3
+ - Pytorch 2.4.0+cu121
+ - Datasets 2.19.1
+ - Tokenizers 0.19.1
llama-factory/saves/Qwen2-72B-Instruct/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen2-72B-Instruct",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.0,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "up_proj",
+ "gate_proj",
+ "v_proj",
+ "q_proj",
+ "down_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+ }
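For reference, a sketch (an assumption, not taken from this commit) of the equivalent PEFT config in Python; with lora_alpha=16 over r=8 the effective LoRA scaling is alpha/r = 2:

```python
# Sketch: an equivalent peft.LoraConfig to the adapter_config.json above.
from peft import LoraConfig

lora_cfg = LoraConfig(
    r=8,               # LoRA rank
    lora_alpha=16,     # scaling = lora_alpha / r = 2.0
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["k_proj", "up_proj", "gate_proj", "v_proj", "q_proj", "down_proj", "o_proj"],
)
```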
llama-factory/saves/Qwen2-72B-Instruct/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c70b713d1dd86c53aeb06439656b440c5511b68ae3a93663b7bc8f4dc16d0bcd
+ size 421218912
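What the diff shows here is a Git LFS pointer file, not the weights themselves: the ~421 MB safetensors blob is stored in LFS and addressed by its SHA-256. As a sketch (assuming the blob has been pulled locally), a downloaded file can be checked against the pointer's oid:

```python
# Sketch: verify a pulled LFS object against the oid in its pointer file.
import hashlib

def sha256_of(path: str) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
            h.update(chunk)
    return h.hexdigest()

expected = "c70b713d1dd86c53aeb06439656b440c5511b68ae3a93663b7bc8f4dc16d0bcd"
actual = sha256_of("llama-factory/saves/Qwen2-72B-Instruct/adapter_model.safetensors")
assert actual == expected, "LFS object does not match its pointer"
```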
llama-factory/saves/Qwen2-72B-Instruct/added_tokens.json ADDED
@@ -0,0 +1,5 @@
+ {
+ "<|endoftext|>": 151643,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644
+ }
llama-factory/saves/Qwen2-72B-Instruct/all_results.json ADDED
@@ -0,0 +1,12 @@
+ {
+ "epoch": 5.997322623828648,
+ "eval_loss": 2.630276679992676,
+ "eval_runtime": 17.6344,
+ "eval_samples_per_second": 2.609,
+ "eval_steps_per_second": 2.609,
+ "total_flos": 1.7687758754493235e+18,
+ "train_loss": 0.5969825589208908,
+ "train_runtime": 20472.2071,
+ "train_samples_per_second": 1.314,
+ "train_steps_per_second": 0.082
+ }
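One way to read the final eval_loss, as a sketch rather than an output of the run: for a causal LM the reported cross-entropy loss maps to perplexity via exp(loss).

```python
# Sketch: eval_loss above is token-level cross-entropy; exp(loss) is perplexity.
import math

eval_loss = 2.630276679992676
print(f"eval perplexity ≈ {math.exp(eval_loss):.2f}")  # ≈ 13.88
```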
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ base_model: Qwen/Qwen2-72B-Instruct
+ library_name: peft
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.11.1
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen2-72B-Instruct",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.0,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "up_proj",
+ "gate_proj",
+ "v_proj",
+ "q_proj",
+ "down_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+ }
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9f9ee749b7ec9f57969e61f47f449dedddab9fd8a5c86414dc13c660b0e5f594
+ size 421218912
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/added_tokens.json ADDED
@@ -0,0 +1,5 @@
+ {
+ "<|endoftext|>": 151643,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644
+ }
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:56daaff59f546cafcc65377710ce5c1e896cb4af5f540665e525035fbe0e30cc
+ size 843085810
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1d10ad6c90de119ae0a3eeecda7366634ce107ef267b5fdd3ddd046f0a5ae742
+ size 14244
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d92c8dde1a48ab9074d104d6a69af052b31b74322f7d2541206e582ecea745b7
+ size 1064
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
+ {
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>"
+ ],
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "151643": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151644": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151645": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>"
+ ],
+ "bos_token": null,
+ "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|im_end|>",
+ "errors": "replace",
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "padding_side": "right",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+ }
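The chat_template above is the ChatML format used by Qwen2. As a sketch (the local path is illustrative, matching this commit's layout), apply_chat_template renders a conversation through it:

```python
# Sketch: render a conversation through the ChatML chat_template above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050")
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
print(tok.apply_chat_template(messages, tokenize=False))
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant
```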
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/trainer_state.json ADDED
@@ -0,0 +1,1623 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.748326639892905,
+ "eval_steps": 70,
+ "global_step": 1050,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0178491744756805,
+ "grad_norm": 1.8217403888702393,
+ "learning_rate": 2.9761904761904763e-06,
+ "loss": 2.7425,
+ "step": 5
+ },
+ {
+ "epoch": 0.035698348951361,
+ "grad_norm": 2.104698419570923,
+ "learning_rate": 5.9523809523809525e-06,
+ "loss": 2.861,
+ "step": 10
+ },
+ {
+ "epoch": 0.0535475234270415,
+ "grad_norm": 2.7389333248138428,
+ "learning_rate": 8.92857142857143e-06,
+ "loss": 2.8281,
+ "step": 15
+ },
+ {
+ "epoch": 0.071396697902722,
+ "grad_norm": 3.9298207759857178,
+ "learning_rate": 1.1904761904761905e-05,
+ "loss": 3.1888,
+ "step": 20
+ },
+ {
+ "epoch": 0.0892458723784025,
+ "grad_norm": 2.648014783859253,
+ "learning_rate": 1.4880952380952381e-05,
+ "loss": 2.6461,
+ "step": 25
+ },
+ {
+ "epoch": 0.107095046854083,
+ "grad_norm": 1.587472915649414,
+ "learning_rate": 1.785714285714286e-05,
+ "loss": 2.3212,
+ "step": 30
+ },
+ {
+ "epoch": 0.1249442213297635,
+ "grad_norm": 0.8390935063362122,
+ "learning_rate": 2.0833333333333336e-05,
+ "loss": 1.8036,
+ "step": 35
+ },
+ {
+ "epoch": 0.142793395805444,
+ "grad_norm": 0.46670979261398315,
+ "learning_rate": 2.380952380952381e-05,
+ "loss": 1.5552,
+ "step": 40
+ },
+ {
+ "epoch": 0.1606425702811245,
+ "grad_norm": 0.45171597599983215,
+ "learning_rate": 2.6785714285714288e-05,
+ "loss": 1.6626,
+ "step": 45
+ },
+ {
+ "epoch": 0.178491744756805,
+ "grad_norm": 0.5605499744415283,
+ "learning_rate": 2.9761904761904762e-05,
+ "loss": 1.4897,
+ "step": 50
+ },
+ {
+ "epoch": 0.1963409192324855,
+ "grad_norm": 0.5553259253501892,
+ "learning_rate": 3.273809523809524e-05,
+ "loss": 1.5373,
+ "step": 55
+ },
+ {
+ "epoch": 0.214190093708166,
+ "grad_norm": 0.6260251402854919,
+ "learning_rate": 3.571428571428572e-05,
+ "loss": 1.4779,
+ "step": 60
+ },
+ {
+ "epoch": 0.2320392681838465,
+ "grad_norm": 0.6063796877861023,
+ "learning_rate": 3.8690476190476195e-05,
+ "loss": 1.483,
+ "step": 65
+ },
+ {
+ "epoch": 0.249888442659527,
+ "grad_norm": 0.5549850463867188,
+ "learning_rate": 4.166666666666667e-05,
+ "loss": 1.5022,
+ "step": 70
+ },
+ {
+ "epoch": 0.249888442659527,
+ "eval_loss": 1.451762318611145,
+ "eval_runtime": 17.7549,
+ "eval_samples_per_second": 2.591,
+ "eval_steps_per_second": 2.591,
+ "step": 70
+ },
+ {
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.482930988073349,
+ "learning_rate": 4.464285714285715e-05,
+ "loss": 1.4256,
+ "step": 75
+ },
+ {
+ "epoch": 0.285586791610888,
+ "grad_norm": 0.4240593910217285,
+ "learning_rate": 4.761904761904762e-05,
+ "loss": 1.3655,
+ "step": 80
+ },
+ {
+ "epoch": 0.3034359660865685,
+ "grad_norm": 0.4872314929962158,
+ "learning_rate": 5.05952380952381e-05,
+ "loss": 1.4478,
+ "step": 85
+ },
+ {
+ "epoch": 0.321285140562249,
+ "grad_norm": 0.42132768034935,
+ "learning_rate": 5.3571428571428575e-05,
+ "loss": 1.3305,
+ "step": 90
+ },
+ {
+ "epoch": 0.3391343150379295,
+ "grad_norm": 0.6932046413421631,
+ "learning_rate": 5.6547619047619046e-05,
+ "loss": 1.4279,
+ "step": 95
+ },
+ {
+ "epoch": 0.35698348951361,
+ "grad_norm": 0.6714524626731873,
+ "learning_rate": 5.9523809523809524e-05,
+ "loss": 1.4967,
+ "step": 100
+ },
+ {
+ "epoch": 0.3748326639892905,
+ "grad_norm": 0.5682816505432129,
+ "learning_rate": 6.25e-05,
+ "loss": 1.4739,
+ "step": 105
+ },
+ {
+ "epoch": 0.392681838464971,
+ "grad_norm": 0.7795937657356262,
+ "learning_rate": 6.547619047619048e-05,
+ "loss": 1.3751,
+ "step": 110
+ },
+ {
+ "epoch": 0.4105310129406515,
+ "grad_norm": 0.8056842088699341,
+ "learning_rate": 6.845238095238096e-05,
+ "loss": 1.3699,
+ "step": 115
+ },
+ {
+ "epoch": 0.428380187416332,
+ "grad_norm": 0.8373801112174988,
+ "learning_rate": 7.142857142857143e-05,
+ "loss": 1.4696,
+ "step": 120
+ },
+ {
+ "epoch": 0.4462293618920125,
+ "grad_norm": 1.0051416158676147,
+ "learning_rate": 7.440476190476191e-05,
+ "loss": 1.4059,
+ "step": 125
+ },
+ {
+ "epoch": 0.464078536367693,
+ "grad_norm": 0.5304180383682251,
+ "learning_rate": 7.738095238095239e-05,
+ "loss": 1.3072,
+ "step": 130
+ },
+ {
+ "epoch": 0.4819277108433735,
+ "grad_norm": 0.8797634243965149,
+ "learning_rate": 8.035714285714287e-05,
+ "loss": 1.4132,
+ "step": 135
+ },
+ {
+ "epoch": 0.499776885319054,
+ "grad_norm": 0.9049625396728516,
+ "learning_rate": 8.333333333333334e-05,
+ "loss": 1.4121,
+ "step": 140
+ },
+ {
+ "epoch": 0.499776885319054,
+ "eval_loss": 1.3727394342422485,
+ "eval_runtime": 17.745,
+ "eval_samples_per_second": 2.592,
+ "eval_steps_per_second": 2.592,
+ "step": 140
+ },
+ {
+ "epoch": 0.5176260597947345,
+ "grad_norm": 0.6793915033340454,
+ "learning_rate": 8.630952380952382e-05,
+ "loss": 1.3109,
+ "step": 145
+ },
+ {
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.7171015739440918,
+ "learning_rate": 8.92857142857143e-05,
+ "loss": 1.3781,
+ "step": 150
+ },
+ {
+ "epoch": 0.5533244087460955,
+ "grad_norm": 0.6738716959953308,
+ "learning_rate": 9.226190476190478e-05,
+ "loss": 1.3564,
+ "step": 155
+ },
+ {
+ "epoch": 0.571173583221776,
+ "grad_norm": 0.699975311756134,
+ "learning_rate": 9.523809523809524e-05,
+ "loss": 1.2387,
+ "step": 160
+ },
+ {
+ "epoch": 0.5890227576974565,
+ "grad_norm": 0.7659904956817627,
+ "learning_rate": 9.821428571428572e-05,
+ "loss": 1.3042,
+ "step": 165
+ },
+ {
+ "epoch": 0.606871932173137,
+ "grad_norm": 0.9782125353813171,
+ "learning_rate": 9.999956828659095e-05,
+ "loss": 1.3709,
+ "step": 170
+ },
+ {
+ "epoch": 0.6247211066488175,
+ "grad_norm": 1.0532957315444946,
+ "learning_rate": 9.999471159635539e-05,
+ "loss": 1.3844,
+ "step": 175
+ },
+ {
+ "epoch": 0.642570281124498,
+ "grad_norm": 0.7373877167701721,
+ "learning_rate": 9.998445910004082e-05,
+ "loss": 1.2852,
+ "step": 180
+ },
+ {
+ "epoch": 0.6604194556001785,
+ "grad_norm": 1.0207768678665161,
+ "learning_rate": 9.996881190417393e-05,
+ "loss": 1.4652,
+ "step": 185
+ },
+ {
+ "epoch": 0.678268630075859,
+ "grad_norm": 0.7943917512893677,
+ "learning_rate": 9.994777169751806e-05,
+ "loss": 1.3743,
+ "step": 190
+ },
+ {
+ "epoch": 0.6961178045515395,
+ "grad_norm": 0.7461659908294678,
+ "learning_rate": 9.992134075089084e-05,
+ "loss": 1.2423,
+ "step": 195
+ },
+ {
+ "epoch": 0.71396697902722,
+ "grad_norm": 0.9689913988113403,
+ "learning_rate": 9.988952191691925e-05,
+ "loss": 1.3113,
+ "step": 200
+ },
+ {
+ "epoch": 0.7318161535029005,
+ "grad_norm": 0.766276478767395,
+ "learning_rate": 9.985231862973168e-05,
+ "loss": 1.3524,
+ "step": 205
+ },
+ {
+ "epoch": 0.749665327978581,
+ "grad_norm": 0.6728419661521912,
+ "learning_rate": 9.980973490458728e-05,
+ "loss": 1.4038,
+ "step": 210
+ },
+ {
+ "epoch": 0.749665327978581,
+ "eval_loss": 1.3051044940948486,
+ "eval_runtime": 17.7559,
+ "eval_samples_per_second": 2.591,
+ "eval_steps_per_second": 2.591,
+ "step": 210
+ },
+ {
+ "epoch": 0.7675145024542614,
+ "grad_norm": 1.0456575155258179,
+ "learning_rate": 9.976177533744261e-05,
+ "loss": 1.3626,
+ "step": 215
+ },
+ {
+ "epoch": 0.785363676929942,
+ "grad_norm": 0.9017456769943237,
+ "learning_rate": 9.97084451044556e-05,
+ "loss": 1.3232,
+ "step": 220
+ },
+ {
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.9113703966140747,
+ "learning_rate": 9.964974996142698e-05,
+ "loss": 1.2826,
+ "step": 225
+ },
+ {
+ "epoch": 0.821062025881303,
+ "grad_norm": 0.7177279591560364,
+ "learning_rate": 9.958569624317893e-05,
+ "loss": 1.2794,
+ "step": 230
+ },
+ {
+ "epoch": 0.8389112003569835,
+ "grad_norm": 0.9058728814125061,
+ "learning_rate": 9.951629086287151e-05,
+ "loss": 1.3853,
+ "step": 235
+ },
+ {
+ "epoch": 0.856760374832664,
+ "grad_norm": 0.6813459992408752,
+ "learning_rate": 9.944154131125642e-05,
+ "loss": 1.3533,
+ "step": 240
+ },
+ {
+ "epoch": 0.8746095493083444,
+ "grad_norm": 0.7113555073738098,
+ "learning_rate": 9.936145565586871e-05,
+ "loss": 1.3395,
+ "step": 245
+ },
+ {
+ "epoch": 0.892458723784025,
+ "grad_norm": 1.243597149848938,
+ "learning_rate": 9.927604254015585e-05,
+ "loss": 1.443,
+ "step": 250
+ },
+ {
+ "epoch": 0.9103078982597055,
+ "grad_norm": 0.8651953339576721,
+ "learning_rate": 9.918531118254507e-05,
+ "loss": 1.398,
+ "step": 255
+ },
+ {
+ "epoch": 0.928157072735386,
+ "grad_norm": 0.8877395987510681,
+ "learning_rate": 9.90892713754483e-05,
+ "loss": 1.346,
+ "step": 260
+ },
+ {
+ "epoch": 0.9460062472110665,
+ "grad_norm": 0.8857008814811707,
+ "learning_rate": 9.898793348420536e-05,
+ "loss": 1.3921,
+ "step": 265
+ },
+ {
+ "epoch": 0.963855421686747,
+ "grad_norm": 0.8319969177246094,
+ "learning_rate": 9.888130844596524e-05,
+ "loss": 1.3838,
+ "step": 270
+ },
+ {
+ "epoch": 0.9817045961624274,
+ "grad_norm": 0.7452044486999512,
+ "learning_rate": 9.876940776850569e-05,
+ "loss": 1.3529,
+ "step": 275
+ },
+ {
+ "epoch": 0.999553770638108,
+ "grad_norm": 0.7535015940666199,
+ "learning_rate": 9.865224352899119e-05,
+ "loss": 1.2739,
+ "step": 280
+ },
+ {
+ "epoch": 0.999553770638108,
+ "eval_loss": 1.289029836654663,
+ "eval_runtime": 17.7491,
+ "eval_samples_per_second": 2.592,
+ "eval_steps_per_second": 2.592,
+ "step": 280
+ },
+ {
+ "epoch": 1.0174029451137885,
+ "grad_norm": 0.7779117226600647,
+ "learning_rate": 9.852982837266955e-05,
+ "loss": 1.2339,
+ "step": 285
+ },
+ {
+ "epoch": 1.035252119589469,
+ "grad_norm": 0.8113610744476318,
+ "learning_rate": 9.840217551150706e-05,
+ "loss": 1.0982,
+ "step": 290
+ },
+ {
+ "epoch": 1.0531012940651494,
+ "grad_norm": 1.004701852798462,
+ "learning_rate": 9.826929872276255e-05,
+ "loss": 1.2537,
+ "step": 295
+ },
+ {
+ "epoch": 1.07095046854083,
+ "grad_norm": 1.524734616279602,
+ "learning_rate": 9.81312123475006e-05,
+ "loss": 1.1664,
+ "step": 300
+ },
+ {
+ "epoch": 1.0887996430165106,
+ "grad_norm": 1.5680856704711914,
+ "learning_rate": 9.798793128904356e-05,
+ "loss": 1.08,
+ "step": 305
+ },
+ {
+ "epoch": 1.106648817492191,
+ "grad_norm": 1.4838035106658936,
+ "learning_rate": 9.78394710113631e-05,
+ "loss": 1.1029,
+ "step": 310
+ },
+ {
+ "epoch": 1.1244979919678715,
+ "grad_norm": 1.522316575050354,
+ "learning_rate": 9.768584753741134e-05,
+ "loss": 1.1524,
+ "step": 315
+ },
+ {
+ "epoch": 1.142347166443552,
+ "grad_norm": 1.3976528644561768,
+ "learning_rate": 9.752707744739145e-05,
+ "loss": 1.1328,
+ "step": 320
+ },
+ {
+ "epoch": 1.1601963409192324,
+ "grad_norm": 1.4764764308929443,
+ "learning_rate": 9.736317787696816e-05,
+ "loss": 1.1174,
+ "step": 325
+ },
+ {
+ "epoch": 1.178045515394913,
+ "grad_norm": 1.3623173236846924,
+ "learning_rate": 9.719416651541839e-05,
+ "loss": 1.0493,
+ "step": 330
+ },
+ {
+ "epoch": 1.1958946898705936,
+ "grad_norm": 1.3625001907348633,
+ "learning_rate": 9.702006160372209e-05,
+ "loss": 1.0479,
+ "step": 335
+ },
+ {
+ "epoch": 1.213743864346274,
+ "grad_norm": 1.7509726285934448,
+ "learning_rate": 9.684088193259355e-05,
+ "loss": 1.1043,
+ "step": 340
+ },
+ {
+ "epoch": 1.2315930388219545,
+ "grad_norm": 1.5920188426971436,
+ "learning_rate": 9.665664684045333e-05,
+ "loss": 1.1096,
+ "step": 345
+ },
+ {
+ "epoch": 1.249442213297635,
+ "grad_norm": 1.6554943323135376,
+ "learning_rate": 9.646737621134112e-05,
+ "loss": 1.1436,
+ "step": 350
+ },
+ {
+ "epoch": 1.249442213297635,
+ "eval_loss": 1.3194608688354492,
+ "eval_runtime": 17.7382,
+ "eval_samples_per_second": 2.593,
+ "eval_steps_per_second": 2.593,
+ "step": 350
+ },
+ {
+ "epoch": 1.2672913877733154,
+ "grad_norm": 1.881818175315857,
+ "learning_rate": 9.627309047276974e-05,
+ "loss": 1.0549,
+ "step": 355
+ },
+ {
+ "epoch": 1.285140562248996,
+ "grad_norm": 1.8770464658737183,
+ "learning_rate": 9.607381059352038e-05,
+ "loss": 1.1576,
+ "step": 360
+ },
+ {
+ "epoch": 1.3029897367246766,
+ "grad_norm": 1.6901912689208984,
+ "learning_rate": 9.586955808137958e-05,
+ "loss": 1.1246,
+ "step": 365
+ },
+ {
+ "epoch": 1.320838911200357,
+ "grad_norm": 1.7667070627212524,
+ "learning_rate": 9.566035498081784e-05,
+ "loss": 1.125,
+ "step": 370
+ },
+ {
+ "epoch": 1.3386880856760375,
+ "grad_norm": 1.6150933504104614,
+ "learning_rate": 9.544622387061055e-05,
+ "loss": 1.1687,
+ "step": 375
+ },
+ {
+ "epoch": 1.356537260151718,
+ "grad_norm": 1.5824884176254272,
+ "learning_rate": 9.522718786140097e-05,
+ "loss": 0.9699,
+ "step": 380
+ },
+ {
+ "epoch": 1.3743864346273984,
+ "grad_norm": 1.5410280227661133,
+ "learning_rate": 9.500327059320606e-05,
+ "loss": 1.1379,
+ "step": 385
+ },
+ {
+ "epoch": 1.392235609103079,
+ "grad_norm": 2.264235496520996,
+ "learning_rate": 9.477449623286505e-05,
+ "loss": 1.0511,
+ "step": 390
+ },
+ {
+ "epoch": 1.4100847835787595,
+ "grad_norm": 1.7440612316131592,
+ "learning_rate": 9.454088947143116e-05,
+ "loss": 1.0003,
+ "step": 395
+ },
+ {
+ "epoch": 1.42793395805444,
+ "grad_norm": 1.770466923713684,
+ "learning_rate": 9.430247552150673e-05,
+ "loss": 1.1631,
+ "step": 400
+ },
+ {
+ "epoch": 1.4457831325301205,
+ "grad_norm": 1.9537169933319092,
+ "learning_rate": 9.405928011452211e-05,
+ "loss": 1.045,
+ "step": 405
+ },
+ {
+ "epoch": 1.463632307005801,
+ "grad_norm": 1.452445387840271,
+ "learning_rate": 9.381132949795861e-05,
+ "loss": 1.0511,
+ "step": 410
+ },
+ {
+ "epoch": 1.4814814814814814,
+ "grad_norm": 2.176547050476074,
+ "learning_rate": 9.35586504325155e-05,
+ "loss": 1.1637,
+ "step": 415
+ },
+ {
+ "epoch": 1.499330655957162,
+ "grad_norm": 2.15567684173584,
+ "learning_rate": 9.330127018922194e-05,
+ "loss": 1.0783,
+ "step": 420
+ },
+ {
+ "epoch": 1.499330655957162,
+ "eval_loss": 1.3106330633163452,
+ "eval_runtime": 17.7447,
+ "eval_samples_per_second": 2.592,
+ "eval_steps_per_second": 2.592,
+ "step": 420
+ },
+ {
+ "epoch": 1.5171798304328425,
+ "grad_norm": 1.6800014972686768,
+ "learning_rate": 9.303921654649362e-05,
+ "loss": 1.0406,
+ "step": 425
+ },
+ {
+ "epoch": 1.5350290049085231,
+ "grad_norm": 1.926607370376587,
+ "learning_rate": 9.277251778713474e-05,
+ "loss": 1.1469,
+ "step": 430
+ },
+ {
+ "epoch": 1.5528781793842035,
+ "grad_norm": 1.7155028581619263,
+ "learning_rate": 9.250120269528546e-05,
+ "loss": 1.0453,
+ "step": 435
+ },
+ {
+ "epoch": 1.5707273538598838,
+ "grad_norm": 1.9001247882843018,
+ "learning_rate": 9.22253005533154e-05,
+ "loss": 1.0611,
+ "step": 440
+ },
+ {
+ "epoch": 1.5885765283355644,
+ "grad_norm": 2.2804248332977295,
+ "learning_rate": 9.194484113866313e-05,
+ "loss": 1.082,
+ "step": 445
+ },
+ {
+ "epoch": 1.606425702811245,
+ "grad_norm": 1.9318439960479736,
+ "learning_rate": 9.165985472062246e-05,
+ "loss": 1.2404,
+ "step": 450
+ },
+ {
+ "epoch": 1.6242748772869255,
+ "grad_norm": 1.6018136739730835,
+ "learning_rate": 9.137037205707552e-05,
+ "loss": 1.0436,
+ "step": 455
+ },
+ {
+ "epoch": 1.6421240517626061,
+ "grad_norm": 2.1986541748046875,
+ "learning_rate": 9.107642439117321e-05,
+ "loss": 1.1227,
+ "step": 460
+ },
+ {
+ "epoch": 1.6599732262382865,
+ "grad_norm": 1.5558295249938965,
+ "learning_rate": 9.077804344796302e-05,
+ "loss": 1.0858,
+ "step": 465
+ },
+ {
+ "epoch": 1.6778224007139668,
+ "grad_norm": 1.8423618078231812,
+ "learning_rate": 9.04752614309652e-05,
+ "loss": 1.0998,
+ "step": 470
+ },
+ {
+ "epoch": 1.6956715751896474,
+ "grad_norm": 1.9065622091293335,
+ "learning_rate": 9.01681110186971e-05,
+ "loss": 1.0433,
+ "step": 475
+ },
+ {
+ "epoch": 1.713520749665328,
+ "grad_norm": 2.0103020668029785,
+ "learning_rate": 8.985662536114613e-05,
+ "loss": 1.0798,
+ "step": 480
+ },
+ {
+ "epoch": 1.7313699241410085,
+ "grad_norm": 1.5299313068389893,
+ "learning_rate": 8.954083807619208e-05,
+ "loss": 1.1012,
+ "step": 485
+ },
+ {
+ "epoch": 1.7492190986166891,
+ "grad_norm": 1.6331924200057983,
+ "learning_rate": 8.922078324597879e-05,
+ "loss": 1.1219,
+ "step": 490
+ },
+ {
+ "epoch": 1.7492190986166891,
+ "eval_loss": 1.3044873476028442,
+ "eval_runtime": 17.7401,
+ "eval_samples_per_second": 2.593,
+ "eval_steps_per_second": 2.593,
+ "step": 490
+ },
+ {
+ "epoch": 1.7670682730923695,
+ "grad_norm": 1.6050705909729004,
+ "learning_rate": 8.889649541323574e-05,
+ "loss": 1.16,
+ "step": 495
+ },
+ {
+ "epoch": 1.7849174475680498,
+ "grad_norm": 1.7604998350143433,
+ "learning_rate": 8.856800957755e-05,
+ "loss": 1.091,
+ "step": 500
+ },
+ {
+ "epoch": 1.8027666220437304,
+ "grad_norm": 1.6485258340835571,
+ "learning_rate": 8.823536119158864e-05,
+ "loss": 1.072,
+ "step": 505
+ },
+ {
+ "epoch": 1.820615796519411,
+ "grad_norm": 1.8173716068267822,
+ "learning_rate": 8.789858615727265e-05,
+ "loss": 1.0635,
+ "step": 510
+ },
+ {
+ "epoch": 1.8384649709950915,
+ "grad_norm": 1.468127965927124,
+ "learning_rate": 8.755772082190194e-05,
+ "loss": 1.0258,
+ "step": 515
+ },
+ {
+ "epoch": 1.8563141454707721,
+ "grad_norm": 1.4476536512374878,
+ "learning_rate": 8.721280197423258e-05,
+ "loss": 1.2011,
+ "step": 520
+ },
+ {
+ "epoch": 1.8741633199464525,
+ "grad_norm": 2.054915189743042,
+ "learning_rate": 8.68638668405062e-05,
+ "loss": 1.0539,
+ "step": 525
+ },
+ {
+ "epoch": 1.8920124944221328,
+ "grad_norm": 1.8471094369888306,
+ "learning_rate": 8.651095308043232e-05,
+ "loss": 1.0948,
+ "step": 530
+ },
+ {
+ "epoch": 1.9098616688978134,
+ "grad_norm": 1.7790355682373047,
+ "learning_rate": 8.61540987831238e-05,
+ "loss": 1.1245,
+ "step": 535
+ },
+ {
+ "epoch": 1.927710843373494,
+ "grad_norm": 1.6644902229309082,
+ "learning_rate": 8.579334246298593e-05,
+ "loss": 1.2039,
+ "step": 540
+ },
+ {
+ "epoch": 1.9455600178491745,
+ "grad_norm": 1.9952303171157837,
+ "learning_rate": 8.542872305555978e-05,
+ "loss": 1.1077,
+ "step": 545
+ },
+ {
+ "epoch": 1.9634091923248551,
+ "grad_norm": 2.225977659225464,
+ "learning_rate": 8.50602799133199e-05,
+ "loss": 1.0603,
+ "step": 550
+ },
+ {
+ "epoch": 1.9812583668005355,
+ "grad_norm": 1.777342438697815,
+ "learning_rate": 8.468805280142709e-05,
+ "loss": 1.1376,
+ "step": 555
+ },
+ {
+ "epoch": 1.9991075412762158,
+ "grad_norm": 2.2195017337799072,
+ "learning_rate": 8.43120818934367e-05,
+ "loss": 1.0966,
+ "step": 560
+ },
+ {
+ "epoch": 1.9991075412762158,
+ "eval_loss": 1.3094360828399658,
+ "eval_runtime": 17.7539,
+ "eval_samples_per_second": 2.591,
+ "eval_steps_per_second": 2.591,
+ "step": 560
+ },
+ {
+ "epoch": 2.0169567157518964,
+ "grad_norm": 2.012312173843384,
+ "learning_rate": 8.393240776696274e-05,
+ "loss": 0.6867,
+ "step": 565
+ },
+ {
+ "epoch": 2.034805890227577,
+ "grad_norm": 3.092951774597168,
+ "learning_rate": 8.354907139929851e-05,
+ "loss": 0.6025,
+ "step": 570
+ },
+ {
+ "epoch": 2.0526550647032575,
+ "grad_norm": 4.8303399085998535,
+ "learning_rate": 8.316211416299397e-05,
+ "loss": 0.6497,
+ "step": 575
+ },
+ {
+ "epoch": 2.070504239178938,
+ "grad_norm": 3.1457698345184326,
+ "learning_rate": 8.27715778213905e-05,
+ "loss": 0.5803,
+ "step": 580
+ },
+ {
+ "epoch": 2.0883534136546187,
+ "grad_norm": 2.5240321159362793,
+ "learning_rate": 8.237750452411353e-05,
+ "loss": 0.494,
+ "step": 585
+ },
+ {
+ "epoch": 2.106202588130299,
+ "grad_norm": 2.630946636199951,
+ "learning_rate": 8.197993680252334e-05,
+ "loss": 0.6428,
+ "step": 590
+ },
+ {
+ "epoch": 2.1240517626059794,
+ "grad_norm": 2.9942588806152344,
+ "learning_rate": 8.157891756512488e-05,
+ "loss": 0.6612,
+ "step": 595
+ },
+ {
+ "epoch": 2.14190093708166,
+ "grad_norm": 2.8771650791168213,
+ "learning_rate": 8.117449009293668e-05,
+ "loss": 0.5783,
+ "step": 600
+ },
+ {
+ "epoch": 2.1597501115573405,
+ "grad_norm": 3.1111013889312744,
+ "learning_rate": 8.076669803481965e-05,
+ "loss": 0.5799,
+ "step": 605
+ },
+ {
+ "epoch": 2.177599286033021,
+ "grad_norm": 3.715027093887329,
+ "learning_rate": 8.035558540276618e-05,
+ "loss": 0.5344,
+ "step": 610
+ },
+ {
+ "epoch": 2.1954484605087012,
+ "grad_norm": 2.936890125274658,
+ "learning_rate": 7.994119656715002e-05,
+ "loss": 0.5605,
+ "step": 615
+ },
+ {
+ "epoch": 2.213297634984382,
+ "grad_norm": 2.79441499710083,
+ "learning_rate": 7.952357625193749e-05,
+ "loss": 0.5923,
+ "step": 620
+ },
+ {
+ "epoch": 2.2311468094600624,
+ "grad_norm": 3.444474697113037,
+ "learning_rate": 7.91027695298606e-05,
+ "loss": 0.6067,
+ "step": 625
+ },
+ {
+ "epoch": 2.248995983935743,
+ "grad_norm": 3.034071445465088,
+ "learning_rate": 7.86788218175523e-05,
+ "loss": 0.6134,
+ "step": 630
+ },
+ {
+ "epoch": 2.248995983935743,
+ "eval_loss": 1.4945974349975586,
+ "eval_runtime": 17.7423,
+ "eval_samples_per_second": 2.593,
+ "eval_steps_per_second": 2.593,
+ "step": 630
+ },
+ {
+ "epoch": 2.2668451584114235,
+ "grad_norm": 3.0743188858032227,
+ "learning_rate": 7.8251778870645e-05,
+ "loss": 0.5798,
+ "step": 635
+ },
+ {
+ "epoch": 2.284694332887104,
+ "grad_norm": 3.250493049621582,
+ "learning_rate": 7.782168677883206e-05,
+ "loss": 0.5705,
+ "step": 640
+ },
+ {
+ "epoch": 2.3025435073627847,
+ "grad_norm": 2.4863390922546387,
+ "learning_rate": 7.738859196089358e-05,
+ "loss": 0.6119,
+ "step": 645
+ },
+ {
+ "epoch": 2.320392681838465,
+ "grad_norm": 3.1027884483337402,
+ "learning_rate": 7.695254115968648e-05,
+ "loss": 0.6352,
+ "step": 650
+ },
+ {
+ "epoch": 2.3382418563141454,
+ "grad_norm": 2.840583562850952,
+ "learning_rate": 7.651358143709972e-05,
+ "loss": 0.6341,
+ "step": 655
+ },
+ {
+ "epoch": 2.356091030789826,
+ "grad_norm": 3.057770252227783,
+ "learning_rate": 7.60717601689749e-05,
+ "loss": 0.6695,
+ "step": 660
+ },
+ {
+ "epoch": 2.3739402052655065,
+ "grad_norm": 3.563372850418091,
+ "learning_rate": 7.562712503999327e-05,
+ "loss": 0.5715,
+ "step": 665
+ },
+ {
+ "epoch": 2.391789379741187,
+ "grad_norm": 3.2286486625671387,
+ "learning_rate": 7.517972403852905e-05,
+ "loss": 0.7753,
+ "step": 670
+ },
+ {
+ "epoch": 2.4096385542168672,
+ "grad_norm": 2.9088051319122314,
+ "learning_rate": 7.472960545147038e-05,
+ "loss": 0.5529,
+ "step": 675
+ },
+ {
+ "epoch": 2.427487728692548,
+ "grad_norm": 2.9432833194732666,
+ "learning_rate": 7.427681785900761e-05,
+ "loss": 0.5715,
+ "step": 680
+ },
+ {
+ "epoch": 2.4453369031682284,
+ "grad_norm": 2.483222723007202,
+ "learning_rate": 7.382141012939034e-05,
+ "loss": 0.6085,
+ "step": 685
+ },
+ {
+ "epoch": 2.463186077643909,
+ "grad_norm": 2.9013617038726807,
+ "learning_rate": 7.33634314136531e-05,
+ "loss": 0.627,
+ "step": 690
+ },
+ {
+ "epoch": 2.4810352521195895,
+ "grad_norm": 2.746309995651245,
+ "learning_rate": 7.290293114031061e-05,
+ "loss": 0.6403,
+ "step": 695
+ },
+ {
+ "epoch": 2.49888442659527,
+ "grad_norm": 2.8350794315338135,
+ "learning_rate": 7.243995901002312e-05,
+ "loss": 0.6342,
+ "step": 700
+ },
+ {
+ "epoch": 2.49888442659527,
+ "eval_loss": 1.4858874082565308,
+ "eval_runtime": 17.7385,
+ "eval_samples_per_second": 2.593,
+ "eval_steps_per_second": 2.593,
+ "step": 700
+ },
+ {
+ "epoch": 2.5167336010709507,
+ "grad_norm": 3.006899833679199,
+ "learning_rate": 7.197456499023225e-05,
+ "loss": 0.5921,
+ "step": 705
+ },
+ {
+ "epoch": 2.534582775546631,
+ "grad_norm": 2.9739573001861572,
+ "learning_rate": 7.150679930976825e-05,
+ "loss": 0.5873,
+ "step": 710
+ },
+ {
+ "epoch": 2.5524319500223114,
+ "grad_norm": 3.7028846740722656,
+ "learning_rate": 7.103671245342887e-05,
+ "loss": 0.6661,
+ "step": 715
+ },
+ {
+ "epoch": 2.570281124497992,
+ "grad_norm": 3.090599775314331,
+ "learning_rate": 7.056435515653059e-05,
+ "loss": 0.5388,
+ "step": 720
+ },
+ {
+ "epoch": 2.5881302989736725,
+ "grad_norm": 2.799252986907959,
+ "learning_rate": 7.008977839943299e-05,
+ "loss": 0.6641,
+ "step": 725
+ },
+ {
+ "epoch": 2.605979473449353,
+ "grad_norm": 2.8093032836914062,
+ "learning_rate": 6.961303340203653e-05,
+ "loss": 0.6221,
+ "step": 730
+ },
+ {
+ "epoch": 2.6238286479250332,
+ "grad_norm": 3.6351985931396484,
+ "learning_rate": 6.91341716182545e-05,
+ "loss": 0.599,
+ "step": 735
+ },
+ {
+ "epoch": 2.641677822400714,
+ "grad_norm": 2.6190829277038574,
+ "learning_rate": 6.86532447304597e-05,
+ "loss": 0.6047,
+ "step": 740
+ },
+ {
+ "epoch": 2.6595269968763944,
+ "grad_norm": 3.227262020111084,
+ "learning_rate": 6.817030464390656e-05,
+ "loss": 0.614,
+ "step": 745
+ },
+ {
+ "epoch": 2.677376171352075,
+ "grad_norm": 2.5810439586639404,
+ "learning_rate": 6.768540348112907e-05,
+ "loss": 0.6367,
+ "step": 750
+ },
+ {
+ "epoch": 2.6952253458277555,
+ "grad_norm": 3.030888557434082,
+ "learning_rate": 6.719859357631535e-05,
+ "loss": 0.5681,
+ "step": 755
+ },
+ {
+ "epoch": 2.713074520303436,
+ "grad_norm": 3.1176657676696777,
+ "learning_rate": 6.670992746965938e-05,
+ "loss": 0.5723,
+ "step": 760
+ },
+ {
+ "epoch": 2.7309236947791167,
+ "grad_norm": 3.0151100158691406,
+ "learning_rate": 6.621945790169036e-05,
+ "loss": 0.6385,
+ "step": 765
+ },
+ {
+ "epoch": 2.748772869254797,
+ "grad_norm": 3.4799766540527344,
+ "learning_rate": 6.572723780758069e-05,
+ "loss": 0.6665,
+ "step": 770
+ },
+ {
+ "epoch": 2.748772869254797,
+ "eval_loss": 1.5236101150512695,
+ "eval_runtime": 17.7462,
+ "eval_samples_per_second": 2.592,
+ "eval_steps_per_second": 2.592,
+ "step": 770
+ },
+ {
+ "epoch": 2.7666220437304774,
+ "grad_norm": 3.1448163986206055,
+ "learning_rate": 6.523332031143272e-05,
+ "loss": 0.6083,
+ "step": 775
+ },
+ {
+ "epoch": 2.784471218206158,
+ "grad_norm": 2.874833106994629,
+ "learning_rate": 6.473775872054521e-05,
+ "loss": 0.6493,
+ "step": 780
+ },
+ {
+ "epoch": 2.8023203926818385,
+ "grad_norm": 3.2550127506256104,
+ "learning_rate": 6.424060651966007e-05,
+ "loss": 0.5722,
+ "step": 785
+ },
+ {
+ "epoch": 2.820169567157519,
+ "grad_norm": 3.066908121109009,
+ "learning_rate": 6.374191736518974e-05,
+ "loss": 0.611,
+ "step": 790
+ },
+ {
+ "epoch": 2.8380187416331992,
+ "grad_norm": 3.05871319770813,
+ "learning_rate": 6.324174507942637e-05,
+ "loss": 0.6202,
+ "step": 795
+ },
+ {
+ "epoch": 2.85586791610888,
+ "grad_norm": 3.2599833011627197,
+ "learning_rate": 6.274014364473274e-05,
+ "loss": 0.5593,
+ "step": 800
+ },
+ {
+ "epoch": 2.8737170905845604,
+ "grad_norm": 2.897418260574341,
+ "learning_rate": 6.22371671977162e-05,
+ "loss": 0.7415,
+ "step": 805
+ },
+ {
+ "epoch": 2.891566265060241,
+ "grad_norm": 3.032317876815796,
+ "learning_rate": 6.173287002338577e-05,
+ "loss": 0.6544,
+ "step": 810
+ },
+ {
+ "epoch": 2.9094154395359215,
+ "grad_norm": 2.7111008167266846,
+ "learning_rate": 6.122730654929334e-05,
+ "loss": 0.6421,
+ "step": 815
+ },
+ {
+ "epoch": 2.927264614011602,
+ "grad_norm": 2.7735886573791504,
+ "learning_rate": 6.072053133965938e-05,
+ "loss": 0.6332,
+ "step": 820
+ },
+ {
+ "epoch": 2.9451137884872827,
+ "grad_norm": 3.4417500495910645,
+ "learning_rate": 6.021259908948402e-05,
+ "loss": 0.6508,
+ "step": 825
+ },
+ {
+ "epoch": 2.962962962962963,
+ "grad_norm": 3.432999849319458,
+ "learning_rate": 5.970356461864391e-05,
+ "loss": 0.621,
+ "step": 830
+ },
+ {
+ "epoch": 2.9808121374386434,
+ "grad_norm": 3.470132827758789,
+ "learning_rate": 5.919348286597569e-05,
+ "loss": 0.6347,
+ "step": 835
+ },
+ {
+ "epoch": 2.998661311914324,
+ "grad_norm": 3.153116226196289,
+ "learning_rate": 5.868240888334653e-05,
+ "loss": 0.6101,
+ "step": 840
+ },
+ {
+ "epoch": 2.998661311914324,
+ "eval_loss": 1.5220016241073608,
+ "eval_runtime": 17.7399,
+ "eval_samples_per_second": 2.593,
+ "eval_steps_per_second": 2.593,
+ "step": 840
+ },
+ {
+ "epoch": 3.0165104863900045,
+ "grad_norm": 2.5395278930664062,
+ "learning_rate": 5.8170397829712485e-05,
+ "loss": 0.4183,
+ "step": 845
+ },
+ {
+ "epoch": 3.034359660865685,
+ "grad_norm": 2.833970308303833,
+ "learning_rate": 5.765750496516547e-05,
+ "loss": 0.1667,
+ "step": 850
+ },
+ {
+ "epoch": 3.0522088353413657,
+ "grad_norm": 3.447057008743286,
+ "learning_rate": 5.714378564496901e-05,
+ "loss": 0.255,
+ "step": 855
+ },
+ {
+ "epoch": 3.070058009817046,
+ "grad_norm": 3.9993224143981934,
+ "learning_rate": 5.6629295313583974e-05,
+ "loss": 0.2424,
+ "step": 860
+ },
+ {
+ "epoch": 3.0879071842927264,
+ "grad_norm": 3.626281499862671,
+ "learning_rate": 5.611408949868457e-05,
+ "loss": 0.2097,
+ "step": 865
+ },
+ {
+ "epoch": 3.105756358768407,
+ "grad_norm": 2.693284034729004,
+ "learning_rate": 5.559822380516539e-05,
+ "loss": 0.2271,
+ "step": 870
+ },
+ {
+ "epoch": 3.1236055332440875,
+ "grad_norm": 2.439389705657959,
+ "learning_rate": 5.5081753909140096e-05,
+ "loss": 0.1982,
+ "step": 875
+ },
+ {
+ "epoch": 3.141454707719768,
+ "grad_norm": 2.6163575649261475,
+ "learning_rate": 5.456473555193242e-05,
+ "loss": 0.2192,
+ "step": 880
+ },
+ {
+ "epoch": 3.1593038821954487,
+ "grad_norm": 2.405829668045044,
+ "learning_rate": 5.404722453406017e-05,
+ "loss": 0.2097,
+ "step": 885
+ },
+ {
+ "epoch": 3.177153056671129,
+ "grad_norm": 2.819413423538208,
+ "learning_rate": 5.3529276709212816e-05,
+ "loss": 0.2213,
+ "step": 890
+ },
+ {
+ "epoch": 3.1950022311468094,
+ "grad_norm": 3.6370203495025635,
+ "learning_rate": 5.30109479782233e-05,
+ "loss": 0.2559,
+ "step": 895
+ },
+ {
+ "epoch": 3.21285140562249,
+ "grad_norm": 3.4090726375579834,
+ "learning_rate": 5.249229428303486e-05,
+ "loss": 0.1955,
+ "step": 900
+ },
+ {
+ "epoch": 3.2307005800981705,
+ "grad_norm": 2.8171908855438232,
+ "learning_rate": 5.197337160066331e-05,
+ "loss": 0.2642,
+ "step": 905
+ },
+ {
+ "epoch": 3.248549754573851,
+ "grad_norm": 3.926447629928589,
+ "learning_rate": 5.145423593715557e-05,
+ "loss": 0.2467,
+ "step": 910
+ },
+ {
+ "epoch": 3.248549754573851,
+ "eval_loss": 1.8390079736709595,
+ "eval_runtime": 17.7348,
+ "eval_samples_per_second": 2.594,
+ "eval_steps_per_second": 2.594,
+ "step": 910
+ },
+ {
+ "epoch": 3.266398929049531,
+ "grad_norm": 2.7143030166625977,
+ "learning_rate": 5.0934943321545115e-05,
+ "loss": 0.2239,
+ "step": 915
+ },
+ {
+ "epoch": 3.284248103525212,
+ "grad_norm": 2.717496871948242,
+ "learning_rate": 5.041554979980486e-05,
+ "loss": 0.1545,
+ "step": 920
+ },
+ {
+ "epoch": 3.3020972780008924,
+ "grad_norm": 3.516397714614868,
+ "learning_rate": 4.9896111428798254e-05,
+ "loss": 0.2819,
+ "step": 925
+ },
+ {
+ "epoch": 3.319946452476573,
+ "grad_norm": 3.3290677070617676,
+ "learning_rate": 4.9376684270229254e-05,
+ "loss": 0.3043,
+ "step": 930
+ },
+ {
+ "epoch": 3.3377956269522535,
+ "grad_norm": 2.914736032485962,
+ "learning_rate": 4.8857324384591653e-05,
+ "loss": 0.2494,
+ "step": 935
+ },
+ {
+ "epoch": 3.355644801427934,
+ "grad_norm": 3.37791109085083,
+ "learning_rate": 4.8338087825118675e-05,
+ "loss": 0.2271,
+ "step": 940
+ },
+ {
+ "epoch": 3.3734939759036147,
+ "grad_norm": 3.295100688934326,
+ "learning_rate": 4.781903063173321e-05,
+ "loss": 0.242,
+ "step": 945
1437
+ },
1438
+ {
1439
+ "epoch": 3.391343150379295,
1440
+ "grad_norm": 2.5792458057403564,
1441
+ "learning_rate": 4.730020882499964e-05,
1442
+ "loss": 0.2244,
1443
+ "step": 950
1444
+ },
1445
+ {
1446
+ "epoch": 3.4091923248549754,
1447
+ "grad_norm": 3.0014591217041016,
1448
+ "learning_rate": 4.678167840007767e-05,
1449
+ "loss": 0.2552,
1450
+ "step": 955
1451
+ },
1452
+ {
1453
+ "epoch": 3.427041499330656,
1454
+ "grad_norm": 3.207282066345215,
1455
+ "learning_rate": 4.626349532067879e-05,
1456
+ "loss": 0.2542,
1457
+ "step": 960
1458
+ },
1459
+ {
1460
+ "epoch": 3.4448906738063365,
1461
+ "grad_norm": 3.85109543800354,
1462
+ "learning_rate": 4.574571551302647e-05,
1463
+ "loss": 0.3249,
1464
+ "step": 965
1465
+ },
1466
+ {
1467
+ "epoch": 3.462739848282017,
1468
+ "grad_norm": 3.3335843086242676,
1469
+ "learning_rate": 4.522839485981994e-05,
1470
+ "loss": 0.2729,
1471
+ "step": 970
1472
+ },
1473
+ {
1474
+ "epoch": 3.480589022757697,
1475
+ "grad_norm": 2.885708808898926,
1476
+ "learning_rate": 4.471158919420312e-05,
1477
+ "loss": 0.2595,
1478
+ "step": 975
1479
+ },
1480
+ {
1481
+ "epoch": 3.498438197233378,
1482
+ "grad_norm": 3.215789556503296,
1483
+ "learning_rate": 4.4195354293738484e-05,
1484
+ "loss": 0.2284,
1485
+ "step": 980
1486
+ },
1487
+ {
1488
+ "epoch": 3.498438197233378,
1489
+ "eval_loss": 1.82525634765625,
1490
+ "eval_runtime": 17.7537,
1491
+ "eval_samples_per_second": 2.591,
1492
+ "eval_steps_per_second": 2.591,
1493
+ "step": 980
1494
+ },
1495
+ {
1496
+ "epoch": 3.5162873717090584,
1497
+ "grad_norm": 3.4772818088531494,
1498
+ "learning_rate": 4.367974587438733e-05,
1499
+ "loss": 0.1947,
1500
+ "step": 985
1501
+ },
1502
+ {
1503
+ "epoch": 3.534136546184739,
1504
+ "grad_norm": 2.6401774883270264,
1505
+ "learning_rate": 4.316481958449634e-05,
1506
+ "loss": 0.2352,
1507
+ "step": 990
1508
+ },
1509
+ {
1510
+ "epoch": 3.5519857206604195,
1511
+ "grad_norm": 3.997591733932495,
1512
+ "learning_rate": 4.2650630998791615e-05,
1513
+ "loss": 0.2047,
1514
+ "step": 995
1515
+ },
1516
+ {
1517
+ "epoch": 3.5698348951361,
1518
+ "grad_norm": 2.5615384578704834,
1519
+ "learning_rate": 4.213723561238074e-05,
1520
+ "loss": 0.2369,
1521
+ "step": 1000
1522
+ },
1523
+ {
1524
+ "epoch": 3.5876840696117807,
1525
+ "grad_norm": 2.5114736557006836,
1526
+ "learning_rate": 4.162468883476319e-05,
1527
+ "loss": 0.2416,
1528
+ "step": 1005
1529
+ },
1530
+ {
1531
+ "epoch": 3.605533244087461,
1532
+ "grad_norm": 4.23993444442749,
1533
+ "learning_rate": 4.111304598385018e-05,
1534
+ "loss": 0.2353,
1535
+ "step": 1010
1536
+ },
1537
+ {
1538
+ "epoch": 3.6233824185631414,
1539
+ "grad_norm": 3.239319324493408,
1540
+ "learning_rate": 4.060236227999441e-05,
1541
+ "loss": 0.2155,
1542
+ "step": 1015
1543
+ },
1544
+ {
1545
+ "epoch": 3.641231593038822,
1546
+ "grad_norm": 2.030393600463867,
1547
+ "learning_rate": 4.0092692840030134e-05,
1548
+ "loss": 0.2241,
1549
+ "step": 1020
1550
+ },
1551
+ {
1552
+ "epoch": 3.6590807675145025,
1553
+ "grad_norm": 3.636963367462158,
1554
+ "learning_rate": 3.9584092671324606e-05,
1555
+ "loss": 0.2408,
1556
+ "step": 1025
1557
+ },
1558
+ {
1559
+ "epoch": 3.676929941990183,
1560
+ "grad_norm": 4.295063495635986,
1561
+ "learning_rate": 3.907661666584131e-05,
1562
+ "loss": 0.2423,
1563
+ "step": 1030
1564
+ },
1565
+ {
1566
+ "epoch": 3.694779116465863,
1567
+ "grad_norm": 3.268596887588501,
1568
+ "learning_rate": 3.857031959421553e-05,
1569
+ "loss": 0.2581,
1570
+ "step": 1035
1571
+ },
1572
+ {
1573
+ "epoch": 3.7126282909415442,
1574
+ "grad_norm": 3.0428457260131836,
1575
+ "learning_rate": 3.806525609984312e-05,
1576
+ "loss": 0.206,
1577
+ "step": 1040
1578
+ },
1579
+ {
1580
+ "epoch": 3.7304774654172244,
1581
+ "grad_norm": 3.523777484893799,
1582
+ "learning_rate": 3.7561480692983006e-05,
1583
+ "loss": 0.1956,
1584
+ "step": 1045
1585
+ },
1586
+ {
1587
+ "epoch": 3.748326639892905,
1588
+ "grad_norm": 2.972714900970459,
1589
+ "learning_rate": 3.705904774487396e-05,
1590
+ "loss": 0.2839,
1591
+ "step": 1050
1592
+ },
1593
+ {
1594
+ "epoch": 3.748326639892905,
1595
+ "eval_loss": 1.8687995672225952,
1596
+ "eval_runtime": 17.732,
1597
+ "eval_samples_per_second": 2.594,
1598
+ "eval_steps_per_second": 2.594,
1599
+ "step": 1050
1600
+ }
1601
+ ],
1602
+ "logging_steps": 5,
1603
+ "max_steps": 1680,
1604
+ "num_input_tokens_seen": 0,
1605
+ "num_train_epochs": 6,
1606
+ "save_steps": 70,
1607
+ "stateful_callbacks": {
1608
+ "TrainerControl": {
1609
+ "args": {
1610
+ "should_epoch_stop": false,
1611
+ "should_evaluate": false,
1612
+ "should_log": false,
1613
+ "should_save": true,
1614
+ "should_training_stop": false
1615
+ },
1616
+ "attributes": {}
1617
+ }
1618
+ },
1619
+ "total_flos": 1.106779976195113e+18,
1620
+ "train_batch_size": 2,
1621
+ "trial_name": null,
1622
+ "trial_params": null
1623
+ }
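Each `trainer_state.json` in these checkpoints is plain HF Trainer log data, so the loss curves can be pulled straight out of it. A minimal sketch, assuming the checkpoint directory has been fetched locally (the path matches the tree in this commit):

```python
import json

# Read the Trainer state saved with checkpoint-1050 and separate the
# training-loss entries from the eval-loss entries in log_history.
with open(
    "llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/trainer_state.json"
) as f:
    state = json.load(f)

train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

for step, loss in evals:
    print(f"step {step:>4}  eval_loss {loss:.4f}")
```

Worth noting from the log itself: eval_loss bottoms out near 1.29 around step 280 (end of epoch 1) and climbs to about 1.87 by step 1050 while the training loss falls to roughly 0.2, so the later checkpoints in this run are increasingly overfit and checkpoint selection should lean on the eval curve rather than the final step.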
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b061b26ebc79da396fea201dbc3aded12f572b2061bb961d9cec13867ed1c18f
3
+ size 5368
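The three-line `version`/`oid`/`size` stubs shown for every binary file here are Git LFS pointers, not the artifacts themselves; after cloning, `git lfs pull` replaces them with the real blobs. A small hedged sketch for telling the two apart before trying to deserialize anything (the 1 KiB cutoff follows the LFS pointer spec, as far as I know):

```python
from pathlib import Path

LFS_SPEC = "version https://git-lfs.github.com/spec/v1"

def is_lfs_pointer(path: str) -> bool:
    # Pointer files are tiny text stubs (the spec keeps them under 1 KiB)
    # whose first line names the LFS spec version.
    p = Path(path)
    if p.stat().st_size >= 1024:
        return False
    return p.read_text(errors="ignore").startswith(LFS_SPEC)

print(is_lfs_pointer(
    "llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/training_args.bin"
))  # True until `git lfs pull` materializes the real 5368-byte file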
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1050/vocab.json ADDED
The diff for this file is too large to render. See raw diff
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/README.md ADDED
@@ -0,0 +1,202 @@
1
+ ---
2
+ base_model: Qwen/Qwen2-72B-Instruct
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.11.1
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/adapter_config.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2-72B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "up_proj",
25
+ "gate_proj",
26
+ "v_proj",
27
+ "q_proj",
28
+ "down_proj",
29
+ "o_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
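This `adapter_config.json` describes a rank-8 LoRA (alpha 16, so an effective update scale of alpha/r = 2, no dropout) over every attention and MLP projection of the base model. A minimal loading sketch, assuming the checkpoint directory and the 72B base weights are available locally and that enough memory is on hand; PEFT reads the config above on its own:

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Load the frozen base model, then attach the rank-8 LoRA adapter;
# PeftModel picks up adapter_config.json from the checkpoint directory.
base = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-72B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model = PeftModel.from_pretrained(
    base,
    "llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120",
)
model.eval()
```

As a sanity check, the ~421 MB `adapter_model.safetensors` above is about what rank-8 factors over all seven target projections of the 80-layer, 8192-hidden base come to in fp32, so the pointer sizes are consistent with this config.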
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b932053fa22010ce7dfdda17f007812702e1b8f0ba3e562ae64a76f7560ec2c9
3
+ size 421218912
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/added_tokens.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "<|endoftext|>": 151643,
3
+ "<|im_end|>": 151645,
4
+ "<|im_start|>": 151644
5
+ }
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/merges.txt ADDED
The diff for this file is too large to render. See raw diff
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33be79f5e6b9458cb25b023d754de2daa677eb354a8c56629b263a43d5e1cca6
3
+ size 843085810
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9afbf853352cfbcfea61884ff6a2ddcd2aee1ce8618589cf5b56912c1b160011
3
+ size 14244
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ca9ae0ba3d67685eade4c511fdd7b532faa6ab6e28687db11fdc378288b76e9
3
+ size 1064
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "eos_token": {
7
+ "content": "<|im_end|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "pad_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ }
20
+ }
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": [
30
+ "<|im_start|>",
31
+ "<|im_end|>"
32
+ ],
33
+ "bos_token": null,
34
+ "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
35
+ "clean_up_tokenization_spaces": false,
36
+ "eos_token": "<|im_end|>",
37
+ "errors": "replace",
38
+ "model_max_length": 131072,
39
+ "pad_token": "<|endoftext|>",
40
+ "padding_side": "right",
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "Qwen2Tokenizer",
43
+ "unk_token": null
44
+ }
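The `chat_template` above is the usual Qwen2 ChatML layout, with one quirk worth knowing: it injects the default system prompt when the conversation has none, and it appends the `<|im_start|>assistant` header itself after every user turn, so `add_generation_prompt` has no effect on it. A quick hedged sketch of what it renders, assuming the checkpoint directory is local:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120"
)

# The template wraps each turn in <|im_start|>role ... <|im_end|> and
# leaves the prompt open at the assistant header for generation.
prompt = tok.apply_chat_template(
    [{"role": "user", "content": "Hello!"}], tokenize=False
)
print(prompt)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant
```

Note that `eos_token` is `<|im_end|>` and the pad token is `<|endoftext|>`, matching the `special_tokens_map.json` saved alongside this config.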
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/trainer_state.json ADDED
@@ -0,0 +1,1729 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.998215082552432,
5
+ "eval_steps": 70,
6
+ "global_step": 1120,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0178491744756805,
13
+ "grad_norm": 1.8217403888702393,
14
+ "learning_rate": 2.9761904761904763e-06,
15
+ "loss": 2.7425,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.035698348951361,
20
+ "grad_norm": 2.104698419570923,
21
+ "learning_rate": 5.9523809523809525e-06,
22
+ "loss": 2.861,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.0535475234270415,
27
+ "grad_norm": 2.7389333248138428,
28
+ "learning_rate": 8.92857142857143e-06,
29
+ "loss": 2.8281,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.071396697902722,
34
+ "grad_norm": 3.9298207759857178,
35
+ "learning_rate": 1.1904761904761905e-05,
36
+ "loss": 3.1888,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.0892458723784025,
41
+ "grad_norm": 2.648014783859253,
42
+ "learning_rate": 1.4880952380952381e-05,
43
+ "loss": 2.6461,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.107095046854083,
48
+ "grad_norm": 1.587472915649414,
49
+ "learning_rate": 1.785714285714286e-05,
50
+ "loss": 2.3212,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.1249442213297635,
55
+ "grad_norm": 0.8390935063362122,
56
+ "learning_rate": 2.0833333333333336e-05,
57
+ "loss": 1.8036,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.142793395805444,
62
+ "grad_norm": 0.46670979261398315,
63
+ "learning_rate": 2.380952380952381e-05,
64
+ "loss": 1.5552,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.1606425702811245,
69
+ "grad_norm": 0.45171597599983215,
70
+ "learning_rate": 2.6785714285714288e-05,
71
+ "loss": 1.6626,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 0.178491744756805,
76
+ "grad_norm": 0.5605499744415283,
77
+ "learning_rate": 2.9761904761904762e-05,
78
+ "loss": 1.4897,
79
+ "step": 50
80
+ },
81
+ {
82
+ "epoch": 0.1963409192324855,
83
+ "grad_norm": 0.5553259253501892,
84
+ "learning_rate": 3.273809523809524e-05,
85
+ "loss": 1.5373,
86
+ "step": 55
87
+ },
88
+ {
89
+ "epoch": 0.214190093708166,
90
+ "grad_norm": 0.6260251402854919,
91
+ "learning_rate": 3.571428571428572e-05,
92
+ "loss": 1.4779,
93
+ "step": 60
94
+ },
95
+ {
96
+ "epoch": 0.2320392681838465,
97
+ "grad_norm": 0.6063796877861023,
98
+ "learning_rate": 3.8690476190476195e-05,
99
+ "loss": 1.483,
100
+ "step": 65
101
+ },
102
+ {
103
+ "epoch": 0.249888442659527,
104
+ "grad_norm": 0.5549850463867188,
105
+ "learning_rate": 4.166666666666667e-05,
106
+ "loss": 1.5022,
107
+ "step": 70
108
+ },
109
+ {
110
+ "epoch": 0.249888442659527,
111
+ "eval_loss": 1.451762318611145,
112
+ "eval_runtime": 17.7549,
113
+ "eval_samples_per_second": 2.591,
114
+ "eval_steps_per_second": 2.591,
115
+ "step": 70
116
+ },
117
+ {
118
+ "epoch": 0.2677376171352075,
119
+ "grad_norm": 0.482930988073349,
120
+ "learning_rate": 4.464285714285715e-05,
121
+ "loss": 1.4256,
122
+ "step": 75
123
+ },
124
+ {
125
+ "epoch": 0.285586791610888,
126
+ "grad_norm": 0.4240593910217285,
127
+ "learning_rate": 4.761904761904762e-05,
128
+ "loss": 1.3655,
129
+ "step": 80
130
+ },
131
+ {
132
+ "epoch": 0.3034359660865685,
133
+ "grad_norm": 0.4872314929962158,
134
+ "learning_rate": 5.05952380952381e-05,
135
+ "loss": 1.4478,
136
+ "step": 85
137
+ },
138
+ {
139
+ "epoch": 0.321285140562249,
140
+ "grad_norm": 0.42132768034935,
141
+ "learning_rate": 5.3571428571428575e-05,
142
+ "loss": 1.3305,
143
+ "step": 90
144
+ },
145
+ {
146
+ "epoch": 0.3391343150379295,
147
+ "grad_norm": 0.6932046413421631,
148
+ "learning_rate": 5.6547619047619046e-05,
149
+ "loss": 1.4279,
150
+ "step": 95
151
+ },
152
+ {
153
+ "epoch": 0.35698348951361,
154
+ "grad_norm": 0.6714524626731873,
155
+ "learning_rate": 5.9523809523809524e-05,
156
+ "loss": 1.4967,
157
+ "step": 100
158
+ },
159
+ {
160
+ "epoch": 0.3748326639892905,
161
+ "grad_norm": 0.5682816505432129,
162
+ "learning_rate": 6.25e-05,
163
+ "loss": 1.4739,
164
+ "step": 105
165
+ },
166
+ {
167
+ "epoch": 0.392681838464971,
168
+ "grad_norm": 0.7795937657356262,
169
+ "learning_rate": 6.547619047619048e-05,
170
+ "loss": 1.3751,
171
+ "step": 110
172
+ },
173
+ {
174
+ "epoch": 0.4105310129406515,
175
+ "grad_norm": 0.8056842088699341,
176
+ "learning_rate": 6.845238095238096e-05,
177
+ "loss": 1.3699,
178
+ "step": 115
179
+ },
180
+ {
181
+ "epoch": 0.428380187416332,
182
+ "grad_norm": 0.8373801112174988,
183
+ "learning_rate": 7.142857142857143e-05,
184
+ "loss": 1.4696,
185
+ "step": 120
186
+ },
187
+ {
188
+ "epoch": 0.4462293618920125,
189
+ "grad_norm": 1.0051416158676147,
190
+ "learning_rate": 7.440476190476191e-05,
191
+ "loss": 1.4059,
192
+ "step": 125
193
+ },
194
+ {
195
+ "epoch": 0.464078536367693,
196
+ "grad_norm": 0.5304180383682251,
197
+ "learning_rate": 7.738095238095239e-05,
198
+ "loss": 1.3072,
199
+ "step": 130
200
+ },
201
+ {
202
+ "epoch": 0.4819277108433735,
203
+ "grad_norm": 0.8797634243965149,
204
+ "learning_rate": 8.035714285714287e-05,
205
+ "loss": 1.4132,
206
+ "step": 135
207
+ },
208
+ {
209
+ "epoch": 0.499776885319054,
210
+ "grad_norm": 0.9049625396728516,
211
+ "learning_rate": 8.333333333333334e-05,
212
+ "loss": 1.4121,
213
+ "step": 140
214
+ },
215
+ {
216
+ "epoch": 0.499776885319054,
217
+ "eval_loss": 1.3727394342422485,
218
+ "eval_runtime": 17.745,
219
+ "eval_samples_per_second": 2.592,
220
+ "eval_steps_per_second": 2.592,
221
+ "step": 140
222
+ },
223
+ {
224
+ "epoch": 0.5176260597947345,
225
+ "grad_norm": 0.6793915033340454,
226
+ "learning_rate": 8.630952380952382e-05,
227
+ "loss": 1.3109,
228
+ "step": 145
229
+ },
230
+ {
231
+ "epoch": 0.535475234270415,
232
+ "grad_norm": 0.7171015739440918,
233
+ "learning_rate": 8.92857142857143e-05,
234
+ "loss": 1.3781,
235
+ "step": 150
236
+ },
237
+ {
238
+ "epoch": 0.5533244087460955,
239
+ "grad_norm": 0.6738716959953308,
240
+ "learning_rate": 9.226190476190478e-05,
241
+ "loss": 1.3564,
242
+ "step": 155
243
+ },
244
+ {
245
+ "epoch": 0.571173583221776,
246
+ "grad_norm": 0.699975311756134,
247
+ "learning_rate": 9.523809523809524e-05,
248
+ "loss": 1.2387,
249
+ "step": 160
250
+ },
251
+ {
252
+ "epoch": 0.5890227576974565,
253
+ "grad_norm": 0.7659904956817627,
254
+ "learning_rate": 9.821428571428572e-05,
255
+ "loss": 1.3042,
256
+ "step": 165
257
+ },
258
+ {
259
+ "epoch": 0.606871932173137,
260
+ "grad_norm": 0.9782125353813171,
261
+ "learning_rate": 9.999956828659095e-05,
262
+ "loss": 1.3709,
263
+ "step": 170
264
+ },
265
+ {
266
+ "epoch": 0.6247211066488175,
267
+ "grad_norm": 1.0532957315444946,
268
+ "learning_rate": 9.999471159635539e-05,
269
+ "loss": 1.3844,
270
+ "step": 175
271
+ },
272
+ {
273
+ "epoch": 0.642570281124498,
274
+ "grad_norm": 0.7373877167701721,
275
+ "learning_rate": 9.998445910004082e-05,
276
+ "loss": 1.2852,
277
+ "step": 180
278
+ },
279
+ {
280
+ "epoch": 0.6604194556001785,
281
+ "grad_norm": 1.0207768678665161,
282
+ "learning_rate": 9.996881190417393e-05,
283
+ "loss": 1.4652,
284
+ "step": 185
285
+ },
286
+ {
287
+ "epoch": 0.678268630075859,
288
+ "grad_norm": 0.7943917512893677,
289
+ "learning_rate": 9.994777169751806e-05,
290
+ "loss": 1.3743,
291
+ "step": 190
292
+ },
293
+ {
294
+ "epoch": 0.6961178045515395,
295
+ "grad_norm": 0.7461659908294678,
296
+ "learning_rate": 9.992134075089084e-05,
297
+ "loss": 1.2423,
298
+ "step": 195
299
+ },
300
+ {
301
+ "epoch": 0.71396697902722,
302
+ "grad_norm": 0.9689913988113403,
303
+ "learning_rate": 9.988952191691925e-05,
304
+ "loss": 1.3113,
305
+ "step": 200
306
+ },
307
+ {
308
+ "epoch": 0.7318161535029005,
309
+ "grad_norm": 0.766276478767395,
310
+ "learning_rate": 9.985231862973168e-05,
311
+ "loss": 1.3524,
312
+ "step": 205
313
+ },
314
+ {
315
+ "epoch": 0.749665327978581,
316
+ "grad_norm": 0.6728419661521912,
317
+ "learning_rate": 9.980973490458728e-05,
318
+ "loss": 1.4038,
319
+ "step": 210
320
+ },
321
+ {
322
+ "epoch": 0.749665327978581,
323
+ "eval_loss": 1.3051044940948486,
324
+ "eval_runtime": 17.7559,
325
+ "eval_samples_per_second": 2.591,
326
+ "eval_steps_per_second": 2.591,
327
+ "step": 210
328
+ },
329
+ {
330
+ "epoch": 0.7675145024542614,
331
+ "grad_norm": 1.0456575155258179,
332
+ "learning_rate": 9.976177533744261e-05,
333
+ "loss": 1.3626,
334
+ "step": 215
335
+ },
336
+ {
337
+ "epoch": 0.785363676929942,
338
+ "grad_norm": 0.9017456769943237,
339
+ "learning_rate": 9.97084451044556e-05,
340
+ "loss": 1.3232,
341
+ "step": 220
342
+ },
343
+ {
344
+ "epoch": 0.8032128514056225,
345
+ "grad_norm": 0.9113703966140747,
346
+ "learning_rate": 9.964974996142698e-05,
347
+ "loss": 1.2826,
348
+ "step": 225
349
+ },
350
+ {
351
+ "epoch": 0.821062025881303,
352
+ "grad_norm": 0.7177279591560364,
353
+ "learning_rate": 9.958569624317893e-05,
354
+ "loss": 1.2794,
355
+ "step": 230
356
+ },
357
+ {
358
+ "epoch": 0.8389112003569835,
359
+ "grad_norm": 0.9058728814125061,
360
+ "learning_rate": 9.951629086287151e-05,
361
+ "loss": 1.3853,
362
+ "step": 235
363
+ },
364
+ {
365
+ "epoch": 0.856760374832664,
366
+ "grad_norm": 0.6813459992408752,
367
+ "learning_rate": 9.944154131125642e-05,
368
+ "loss": 1.3533,
369
+ "step": 240
370
+ },
371
+ {
372
+ "epoch": 0.8746095493083444,
373
+ "grad_norm": 0.7113555073738098,
374
+ "learning_rate": 9.936145565586871e-05,
375
+ "loss": 1.3395,
376
+ "step": 245
377
+ },
378
+ {
379
+ "epoch": 0.892458723784025,
380
+ "grad_norm": 1.243597149848938,
381
+ "learning_rate": 9.927604254015585e-05,
382
+ "loss": 1.443,
383
+ "step": 250
384
+ },
385
+ {
386
+ "epoch": 0.9103078982597055,
387
+ "grad_norm": 0.8651953339576721,
388
+ "learning_rate": 9.918531118254507e-05,
389
+ "loss": 1.398,
390
+ "step": 255
391
+ },
392
+ {
393
+ "epoch": 0.928157072735386,
394
+ "grad_norm": 0.8877395987510681,
395
+ "learning_rate": 9.90892713754483e-05,
396
+ "loss": 1.346,
397
+ "step": 260
398
+ },
399
+ {
400
+ "epoch": 0.9460062472110665,
401
+ "grad_norm": 0.8857008814811707,
402
+ "learning_rate": 9.898793348420536e-05,
403
+ "loss": 1.3921,
404
+ "step": 265
405
+ },
406
+ {
407
+ "epoch": 0.963855421686747,
408
+ "grad_norm": 0.8319969177246094,
409
+ "learning_rate": 9.888130844596524e-05,
410
+ "loss": 1.3838,
411
+ "step": 270
412
+ },
413
+ {
414
+ "epoch": 0.9817045961624274,
415
+ "grad_norm": 0.7452044486999512,
416
+ "learning_rate": 9.876940776850569e-05,
417
+ "loss": 1.3529,
418
+ "step": 275
419
+ },
420
+ {
421
+ "epoch": 0.999553770638108,
422
+ "grad_norm": 0.7535015940666199,
423
+ "learning_rate": 9.865224352899119e-05,
424
+ "loss": 1.2739,
425
+ "step": 280
426
+ },
427
+ {
428
+ "epoch": 0.999553770638108,
429
+ "eval_loss": 1.289029836654663,
430
+ "eval_runtime": 17.7491,
431
+ "eval_samples_per_second": 2.592,
432
+ "eval_steps_per_second": 2.592,
433
+ "step": 280
434
+ },
435
+ {
436
+ "epoch": 1.0174029451137885,
437
+ "grad_norm": 0.7779117226600647,
438
+ "learning_rate": 9.852982837266955e-05,
439
+ "loss": 1.2339,
440
+ "step": 285
441
+ },
442
+ {
443
+ "epoch": 1.035252119589469,
444
+ "grad_norm": 0.8113610744476318,
445
+ "learning_rate": 9.840217551150706e-05,
446
+ "loss": 1.0982,
447
+ "step": 290
448
+ },
449
+ {
450
+ "epoch": 1.0531012940651494,
451
+ "grad_norm": 1.004701852798462,
452
+ "learning_rate": 9.826929872276255e-05,
453
+ "loss": 1.2537,
454
+ "step": 295
455
+ },
456
+ {
457
+ "epoch": 1.07095046854083,
458
+ "grad_norm": 1.524734616279602,
459
+ "learning_rate": 9.81312123475006e-05,
460
+ "loss": 1.1664,
461
+ "step": 300
462
+ },
463
+ {
464
+ "epoch": 1.0887996430165106,
465
+ "grad_norm": 1.5680856704711914,
466
+ "learning_rate": 9.798793128904356e-05,
467
+ "loss": 1.08,
468
+ "step": 305
469
+ },
470
+ {
471
+ "epoch": 1.106648817492191,
472
+ "grad_norm": 1.4838035106658936,
473
+ "learning_rate": 9.78394710113631e-05,
474
+ "loss": 1.1029,
475
+ "step": 310
476
+ },
477
+ {
478
+ "epoch": 1.1244979919678715,
479
+ "grad_norm": 1.522316575050354,
480
+ "learning_rate": 9.768584753741134e-05,
481
+ "loss": 1.1524,
482
+ "step": 315
483
+ },
484
+ {
485
+ "epoch": 1.142347166443552,
486
+ "grad_norm": 1.3976528644561768,
487
+ "learning_rate": 9.752707744739145e-05,
488
+ "loss": 1.1328,
489
+ "step": 320
490
+ },
491
+ {
492
+ "epoch": 1.1601963409192324,
493
+ "grad_norm": 1.4764764308929443,
494
+ "learning_rate": 9.736317787696816e-05,
495
+ "loss": 1.1174,
496
+ "step": 325
497
+ },
498
+ {
499
+ "epoch": 1.178045515394913,
500
+ "grad_norm": 1.3623173236846924,
501
+ "learning_rate": 9.719416651541839e-05,
502
+ "loss": 1.0493,
503
+ "step": 330
504
+ },
505
+ {
506
+ "epoch": 1.1958946898705936,
507
+ "grad_norm": 1.3625001907348633,
508
+ "learning_rate": 9.702006160372209e-05,
509
+ "loss": 1.0479,
510
+ "step": 335
511
+ },
512
+ {
513
+ "epoch": 1.213743864346274,
514
+ "grad_norm": 1.7509726285934448,
515
+ "learning_rate": 9.684088193259355e-05,
516
+ "loss": 1.1043,
517
+ "step": 340
518
+ },
519
+ {
520
+ "epoch": 1.2315930388219545,
521
+ "grad_norm": 1.5920188426971436,
522
+ "learning_rate": 9.665664684045333e-05,
523
+ "loss": 1.1096,
524
+ "step": 345
525
+ },
526
+ {
527
+ "epoch": 1.249442213297635,
528
+ "grad_norm": 1.6554943323135376,
529
+ "learning_rate": 9.646737621134112e-05,
530
+ "loss": 1.1436,
531
+ "step": 350
532
+ },
533
+ {
534
+ "epoch": 1.249442213297635,
535
+ "eval_loss": 1.3194608688354492,
536
+ "eval_runtime": 17.7382,
537
+ "eval_samples_per_second": 2.593,
538
+ "eval_steps_per_second": 2.593,
539
+ "step": 350
540
+ },
541
+ {
542
+ "epoch": 1.2672913877733154,
543
+ "grad_norm": 1.881818175315857,
544
+ "learning_rate": 9.627309047276974e-05,
545
+ "loss": 1.0549,
546
+ "step": 355
547
+ },
548
+ {
549
+ "epoch": 1.285140562248996,
550
+ "grad_norm": 1.8770464658737183,
551
+ "learning_rate": 9.607381059352038e-05,
552
+ "loss": 1.1576,
553
+ "step": 360
554
+ },
555
+ {
556
+ "epoch": 1.3029897367246766,
557
+ "grad_norm": 1.6901912689208984,
558
+ "learning_rate": 9.586955808137958e-05,
559
+ "loss": 1.1246,
560
+ "step": 365
561
+ },
562
+ {
563
+ "epoch": 1.320838911200357,
564
+ "grad_norm": 1.7667070627212524,
565
+ "learning_rate": 9.566035498081784e-05,
566
+ "loss": 1.125,
567
+ "step": 370
568
+ },
569
+ {
570
+ "epoch": 1.3386880856760375,
571
+ "grad_norm": 1.6150933504104614,
572
+ "learning_rate": 9.544622387061055e-05,
573
+ "loss": 1.1687,
574
+ "step": 375
575
+ },
576
+ {
577
+ "epoch": 1.356537260151718,
578
+ "grad_norm": 1.5824884176254272,
579
+ "learning_rate": 9.522718786140097e-05,
580
+ "loss": 0.9699,
581
+ "step": 380
582
+ },
583
+ {
584
+ "epoch": 1.3743864346273984,
585
+ "grad_norm": 1.5410280227661133,
586
+ "learning_rate": 9.500327059320606e-05,
587
+ "loss": 1.1379,
588
+ "step": 385
589
+ },
590
+ {
591
+ "epoch": 1.392235609103079,
592
+ "grad_norm": 2.264235496520996,
593
+ "learning_rate": 9.477449623286505e-05,
594
+ "loss": 1.0511,
595
+ "step": 390
596
+ },
597
+ {
598
+ "epoch": 1.4100847835787595,
599
+ "grad_norm": 1.7440612316131592,
600
+ "learning_rate": 9.454088947143116e-05,
601
+ "loss": 1.0003,
602
+ "step": 395
603
+ },
604
+ {
605
+ "epoch": 1.42793395805444,
606
+ "grad_norm": 1.770466923713684,
607
+ "learning_rate": 9.430247552150673e-05,
608
+ "loss": 1.1631,
609
+ "step": 400
610
+ },
611
+ {
612
+ "epoch": 1.4457831325301205,
613
+ "grad_norm": 1.9537169933319092,
614
+ "learning_rate": 9.405928011452211e-05,
615
+ "loss": 1.045,
616
+ "step": 405
617
+ },
618
+ {
619
+ "epoch": 1.463632307005801,
620
+ "grad_norm": 1.452445387840271,
621
+ "learning_rate": 9.381132949795861e-05,
622
+ "loss": 1.0511,
623
+ "step": 410
624
+ },
625
+ {
626
+ "epoch": 1.4814814814814814,
627
+ "grad_norm": 2.176547050476074,
628
+ "learning_rate": 9.35586504325155e-05,
629
+ "loss": 1.1637,
630
+ "step": 415
631
+ },
632
+ {
633
+ "epoch": 1.499330655957162,
634
+ "grad_norm": 2.15567684173584,
635
+ "learning_rate": 9.330127018922194e-05,
636
+ "loss": 1.0783,
637
+ "step": 420
638
+ },
639
+ {
640
+ "epoch": 1.499330655957162,
641
+ "eval_loss": 1.3106330633163452,
642
+ "eval_runtime": 17.7447,
643
+ "eval_samples_per_second": 2.592,
644
+ "eval_steps_per_second": 2.592,
645
+ "step": 420
646
+ },
647
+ {
648
+ "epoch": 1.5171798304328425,
649
+ "grad_norm": 1.6800014972686768,
650
+ "learning_rate": 9.303921654649362e-05,
651
+ "loss": 1.0406,
652
+ "step": 425
653
+ },
654
+ {
655
+ "epoch": 1.5350290049085231,
656
+ "grad_norm": 1.926607370376587,
657
+ "learning_rate": 9.277251778713474e-05,
658
+ "loss": 1.1469,
659
+ "step": 430
660
+ },
661
+ {
662
+ "epoch": 1.5528781793842035,
663
+ "grad_norm": 1.7155028581619263,
664
+ "learning_rate": 9.250120269528546e-05,
665
+ "loss": 1.0453,
666
+ "step": 435
667
+ },
668
+ {
669
+ "epoch": 1.5707273538598838,
670
+ "grad_norm": 1.9001247882843018,
671
+ "learning_rate": 9.22253005533154e-05,
672
+ "loss": 1.0611,
673
+ "step": 440
674
+ },
675
+ {
676
+ "epoch": 1.5885765283355644,
677
+ "grad_norm": 2.2804248332977295,
678
+ "learning_rate": 9.194484113866313e-05,
679
+ "loss": 1.082,
680
+ "step": 445
681
+ },
682
+ {
683
+ "epoch": 1.606425702811245,
684
+ "grad_norm": 1.9318439960479736,
685
+ "learning_rate": 9.165985472062246e-05,
686
+ "loss": 1.2404,
687
+ "step": 450
688
+ },
689
+ {
690
+ "epoch": 1.6242748772869255,
691
+ "grad_norm": 1.6018136739730835,
692
+ "learning_rate": 9.137037205707552e-05,
693
+ "loss": 1.0436,
694
+ "step": 455
695
+ },
696
+ {
697
+ "epoch": 1.6421240517626061,
698
+ "grad_norm": 2.1986541748046875,
699
+ "learning_rate": 9.107642439117321e-05,
700
+ "loss": 1.1227,
701
+ "step": 460
702
+ },
703
+ {
704
+ "epoch": 1.6599732262382865,
705
+ "grad_norm": 1.5558295249938965,
706
+ "learning_rate": 9.077804344796302e-05,
707
+ "loss": 1.0858,
708
+ "step": 465
709
+ },
710
+ {
711
+ "epoch": 1.6778224007139668,
712
+ "grad_norm": 1.8423618078231812,
713
+ "learning_rate": 9.04752614309652e-05,
714
+ "loss": 1.0998,
715
+ "step": 470
716
+ },
717
+ {
718
+ "epoch": 1.6956715751896474,
719
+ "grad_norm": 1.9065622091293335,
720
+ "learning_rate": 9.01681110186971e-05,
721
+ "loss": 1.0433,
722
+ "step": 475
723
+ },
724
+ {
725
+ "epoch": 1.713520749665328,
726
+ "grad_norm": 2.0103020668029785,
727
+ "learning_rate": 8.985662536114613e-05,
728
+ "loss": 1.0798,
729
+ "step": 480
730
+ },
731
+ {
732
+ "epoch": 1.7313699241410085,
733
+ "grad_norm": 1.5299313068389893,
734
+ "learning_rate": 8.954083807619208e-05,
735
+ "loss": 1.1012,
736
+ "step": 485
737
+ },
738
+ {
739
+ "epoch": 1.7492190986166891,
740
+ "grad_norm": 1.6331924200057983,
741
+ "learning_rate": 8.922078324597879e-05,
742
+ "loss": 1.1219,
743
+ "step": 490
744
+ },
745
+ {
746
+ "epoch": 1.7492190986166891,
747
+ "eval_loss": 1.3044873476028442,
748
+ "eval_runtime": 17.7401,
749
+ "eval_samples_per_second": 2.593,
750
+ "eval_steps_per_second": 2.593,
751
+ "step": 490
752
+ },
753
+ {
754
+ "epoch": 1.7670682730923695,
755
+ "grad_norm": 1.6050705909729004,
756
+ "learning_rate": 8.889649541323574e-05,
757
+ "loss": 1.16,
758
+ "step": 495
759
+ },
760
+ {
761
+ "epoch": 1.7849174475680498,
762
+ "grad_norm": 1.7604998350143433,
763
+ "learning_rate": 8.856800957755e-05,
764
+ "loss": 1.091,
765
+ "step": 500
766
+ },
767
+ {
768
+ "epoch": 1.8027666220437304,
769
+ "grad_norm": 1.6485258340835571,
770
+ "learning_rate": 8.823536119158864e-05,
771
+ "loss": 1.072,
772
+ "step": 505
773
+ },
774
+ {
775
+ "epoch": 1.820615796519411,
776
+ "grad_norm": 1.8173716068267822,
777
+ "learning_rate": 8.789858615727265e-05,
778
+ "loss": 1.0635,
779
+ "step": 510
780
+ },
781
+ {
782
+ "epoch": 1.8384649709950915,
783
+ "grad_norm": 1.468127965927124,
784
+ "learning_rate": 8.755772082190194e-05,
785
+ "loss": 1.0258,
786
+ "step": 515
787
+ },
788
+ {
789
+ "epoch": 1.8563141454707721,
790
+ "grad_norm": 1.4476536512374878,
791
+ "learning_rate": 8.721280197423258e-05,
792
+ "loss": 1.2011,
793
+ "step": 520
794
+ },
795
+ {
796
+ "epoch": 1.8741633199464525,
797
+ "grad_norm": 2.054915189743042,
798
+ "learning_rate": 8.68638668405062e-05,
799
+ "loss": 1.0539,
800
+ "step": 525
801
+ },
802
+ {
803
+ "epoch": 1.8920124944221328,
804
+ "grad_norm": 1.8471094369888306,
805
+ "learning_rate": 8.651095308043232e-05,
806
+ "loss": 1.0948,
807
+ "step": 530
808
+ },
809
+ {
810
+ "epoch": 1.9098616688978134,
811
+ "grad_norm": 1.7790355682373047,
812
+ "learning_rate": 8.61540987831238e-05,
813
+ "loss": 1.1245,
814
+ "step": 535
815
+ },
816
+ {
817
+ "epoch": 1.927710843373494,
818
+ "grad_norm": 1.6644902229309082,
819
+ "learning_rate": 8.579334246298593e-05,
820
+ "loss": 1.2039,
821
+ "step": 540
822
+ },
823
+ {
824
+ "epoch": 1.9455600178491745,
825
+ "grad_norm": 1.9952303171157837,
826
+ "learning_rate": 8.542872305555978e-05,
827
+ "loss": 1.1077,
828
+ "step": 545
829
+ },
830
+ {
831
+ "epoch": 1.9634091923248551,
832
+ "grad_norm": 2.225977659225464,
833
+ "learning_rate": 8.50602799133199e-05,
834
+ "loss": 1.0603,
835
+ "step": 550
836
+ },
837
+ {
838
+ "epoch": 1.9812583668005355,
839
+ "grad_norm": 1.777342438697815,
840
+ "learning_rate": 8.468805280142709e-05,
841
+ "loss": 1.1376,
842
+ "step": 555
843
+ },
844
+ {
845
+ "epoch": 1.9991075412762158,
846
+ "grad_norm": 2.2195017337799072,
847
+ "learning_rate": 8.43120818934367e-05,
848
+ "loss": 1.0966,
849
+ "step": 560
850
+ },
851
+ {
852
+ "epoch": 1.9991075412762158,
853
+ "eval_loss": 1.3094360828399658,
854
+ "eval_runtime": 17.7539,
855
+ "eval_samples_per_second": 2.591,
856
+ "eval_steps_per_second": 2.591,
857
+ "step": 560
858
+ },
859
+ {
860
+ "epoch": 2.0169567157518964,
861
+ "grad_norm": 2.012312173843384,
862
+ "learning_rate": 8.393240776696274e-05,
863
+ "loss": 0.6867,
864
+ "step": 565
865
+ },
866
+ {
867
+ "epoch": 2.034805890227577,
868
+ "grad_norm": 3.092951774597168,
869
+ "learning_rate": 8.354907139929851e-05,
870
+ "loss": 0.6025,
871
+ "step": 570
872
+ },
873
+ {
874
+ "epoch": 2.0526550647032575,
875
+ "grad_norm": 4.8303399085998535,
876
+ "learning_rate": 8.316211416299397e-05,
877
+ "loss": 0.6497,
878
+ "step": 575
879
+ },
880
+ {
881
+ "epoch": 2.070504239178938,
882
+ "grad_norm": 3.1457698345184326,
883
+ "learning_rate": 8.27715778213905e-05,
884
+ "loss": 0.5803,
885
+ "step": 580
886
+ },
887
+ {
888
+ "epoch": 2.0883534136546187,
889
+ "grad_norm": 2.5240321159362793,
890
+ "learning_rate": 8.237750452411353e-05,
891
+ "loss": 0.494,
892
+ "step": 585
893
+ },
894
+ {
895
+ "epoch": 2.106202588130299,
896
+ "grad_norm": 2.630946636199951,
897
+ "learning_rate": 8.197993680252334e-05,
898
+ "loss": 0.6428,
899
+ "step": 590
900
+ },
901
+ {
902
+ "epoch": 2.1240517626059794,
903
+ "grad_norm": 2.9942588806152344,
904
+ "learning_rate": 8.157891756512488e-05,
905
+ "loss": 0.6612,
906
+ "step": 595
907
+ },
908
+ {
909
+ "epoch": 2.14190093708166,
910
+ "grad_norm": 2.8771650791168213,
911
+ "learning_rate": 8.117449009293668e-05,
912
+ "loss": 0.5783,
913
+ "step": 600
914
+ },
915
+ {
916
+ "epoch": 2.1597501115573405,
917
+ "grad_norm": 3.1111013889312744,
918
+ "learning_rate": 8.076669803481965e-05,
919
+ "loss": 0.5799,
920
+ "step": 605
921
+ },
922
+ {
923
+ "epoch": 2.177599286033021,
924
+ "grad_norm": 3.715027093887329,
925
+ "learning_rate": 8.035558540276618e-05,
926
+ "loss": 0.5344,
927
+ "step": 610
928
+ },
929
+ {
930
+ "epoch": 2.1954484605087012,
931
+ "grad_norm": 2.936890125274658,
932
+ "learning_rate": 7.994119656715002e-05,
933
+ "loss": 0.5605,
934
+ "step": 615
935
+ },
936
+ {
937
+ "epoch": 2.213297634984382,
938
+ "grad_norm": 2.79441499710083,
939
+ "learning_rate": 7.952357625193749e-05,
940
+ "loss": 0.5923,
941
+ "step": 620
942
+ },
943
+ {
944
+ "epoch": 2.2311468094600624,
945
+ "grad_norm": 3.444474697113037,
946
+ "learning_rate": 7.91027695298606e-05,
947
+ "loss": 0.6067,
948
+ "step": 625
949
+ },
950
+ {
951
+ "epoch": 2.248995983935743,
952
+ "grad_norm": 3.034071445465088,
953
+ "learning_rate": 7.86788218175523e-05,
954
+ "loss": 0.6134,
955
+ "step": 630
956
+ },
957
+ {
958
+ "epoch": 2.248995983935743,
959
+ "eval_loss": 1.4945974349975586,
960
+ "eval_runtime": 17.7423,
961
+ "eval_samples_per_second": 2.593,
962
+ "eval_steps_per_second": 2.593,
963
+ "step": 630
964
+ },
965
+ {
966
+ "epoch": 2.2668451584114235,
967
+ "grad_norm": 3.0743188858032227,
968
+ "learning_rate": 7.8251778870645e-05,
969
+ "loss": 0.5798,
970
+ "step": 635
971
+ },
972
+ {
973
+ "epoch": 2.284694332887104,
974
+ "grad_norm": 3.250493049621582,
975
+ "learning_rate": 7.782168677883206e-05,
976
+ "loss": 0.5705,
977
+ "step": 640
978
+ },
979
+ {
980
+ "epoch": 2.3025435073627847,
981
+ "grad_norm": 2.4863390922546387,
982
+ "learning_rate": 7.738859196089358e-05,
983
+ "loss": 0.6119,
984
+ "step": 645
985
+ },
986
+ {
987
+ "epoch": 2.320392681838465,
988
+ "grad_norm": 3.1027884483337402,
989
+ "learning_rate": 7.695254115968648e-05,
990
+ "loss": 0.6352,
991
+ "step": 650
992
+ },
993
+ {
994
+ "epoch": 2.3382418563141454,
995
+ "grad_norm": 2.840583562850952,
996
+ "learning_rate": 7.651358143709972e-05,
997
+ "loss": 0.6341,
998
+ "step": 655
999
+ },
1000
+ {
1001
+ "epoch": 2.356091030789826,
1002
+ "grad_norm": 3.057770252227783,
1003
+ "learning_rate": 7.60717601689749e-05,
1004
+ "loss": 0.6695,
1005
+ "step": 660
1006
+ },
1007
+ {
1008
+ "epoch": 2.3739402052655065,
1009
+ "grad_norm": 3.563372850418091,
1010
+ "learning_rate": 7.562712503999327e-05,
1011
+ "loss": 0.5715,
1012
+ "step": 665
1013
+ },
1014
+ {
1015
+ "epoch": 2.391789379741187,
1016
+ "grad_norm": 3.2286486625671387,
1017
+ "learning_rate": 7.517972403852905e-05,
1018
+ "loss": 0.7753,
1019
+ "step": 670
1020
+ },
1021
+ {
1022
+ "epoch": 2.4096385542168672,
1023
+ "grad_norm": 2.9088051319122314,
1024
+ "learning_rate": 7.472960545147038e-05,
1025
+ "loss": 0.5529,
1026
+ "step": 675
1027
+ },
1028
+ {
1029
+ "epoch": 2.427487728692548,
1030
+ "grad_norm": 2.9432833194732666,
1031
+ "learning_rate": 7.427681785900761e-05,
1032
+ "loss": 0.5715,
1033
+ "step": 680
1034
+ },
1035
+ {
1036
+ "epoch": 2.4453369031682284,
1037
+ "grad_norm": 2.483222723007202,
1038
+ "learning_rate": 7.382141012939034e-05,
1039
+ "loss": 0.6085,
1040
+ "step": 685
1041
+ },
1042
+ {
1043
+ "epoch": 2.463186077643909,
1044
+ "grad_norm": 2.9013617038726807,
1045
+ "learning_rate": 7.33634314136531e-05,
1046
+ "loss": 0.627,
1047
+ "step": 690
1048
+ },
1049
+ {
1050
+ "epoch": 2.4810352521195895,
1051
+ "grad_norm": 2.746309995651245,
1052
+ "learning_rate": 7.290293114031061e-05,
1053
+ "loss": 0.6403,
1054
+ "step": 695
1055
+ },
1056
+ {
1057
+ "epoch": 2.49888442659527,
1058
+ "grad_norm": 2.8350794315338135,
1059
+ "learning_rate": 7.243995901002312e-05,
1060
+ "loss": 0.6342,
1061
+ "step": 700
1062
+ },
1063
+ {
1064
+ "epoch": 2.49888442659527,
1065
+ "eval_loss": 1.4858874082565308,
1066
+ "eval_runtime": 17.7385,
1067
+ "eval_samples_per_second": 2.593,
1068
+ "eval_steps_per_second": 2.593,
1069
+ "step": 700
1070
+ },
1071
+ {
1072
+ "epoch": 2.5167336010709507,
1073
+ "grad_norm": 3.006899833679199,
1074
+ "learning_rate": 7.197456499023225e-05,
1075
+ "loss": 0.5921,
1076
+ "step": 705
1077
+ },
1078
+ {
1079
+ "epoch": 2.534582775546631,
1080
+ "grad_norm": 2.9739573001861572,
1081
+ "learning_rate": 7.150679930976825e-05,
1082
+ "loss": 0.5873,
1083
+ "step": 710
1084
+ },
1085
+ {
1086
+ "epoch": 2.5524319500223114,
1087
+ "grad_norm": 3.7028846740722656,
1088
+ "learning_rate": 7.103671245342887e-05,
1089
+ "loss": 0.6661,
1090
+ "step": 715
1091
+ },
1092
+ {
1093
+ "epoch": 2.570281124497992,
1094
+ "grad_norm": 3.090599775314331,
1095
+ "learning_rate": 7.056435515653059e-05,
1096
+ "loss": 0.5388,
1097
+ "step": 720
1098
+ },
1099
+ {
1100
+ "epoch": 2.5881302989736725,
1101
+ "grad_norm": 2.799252986907959,
1102
+ "learning_rate": 7.008977839943299e-05,
1103
+ "loss": 0.6641,
1104
+ "step": 725
1105
+ },
1106
+ {
1107
+ "epoch": 2.605979473449353,
1108
+ "grad_norm": 2.8093032836914062,
1109
+ "learning_rate": 6.961303340203653e-05,
1110
+ "loss": 0.6221,
1111
+ "step": 730
1112
+ },
1113
+ {
1114
+ "epoch": 2.6238286479250332,
1115
+ "grad_norm": 3.6351985931396484,
1116
+ "learning_rate": 6.91341716182545e-05,
1117
+ "loss": 0.599,
1118
+ "step": 735
1119
+ },
1120
+ {
1121
+ "epoch": 2.641677822400714,
1122
+ "grad_norm": 2.6190829277038574,
1123
+ "learning_rate": 6.86532447304597e-05,
1124
+ "loss": 0.6047,
1125
+ "step": 740
1126
+ },
1127
+ {
1128
+ "epoch": 2.6595269968763944,
1129
+ "grad_norm": 3.227262020111084,
1130
+ "learning_rate": 6.817030464390656e-05,
1131
+ "loss": 0.614,
1132
+ "step": 745
1133
+ },
1134
+ {
1135
+ "epoch": 2.677376171352075,
1136
+ "grad_norm": 2.5810439586639404,
1137
+ "learning_rate": 6.768540348112907e-05,
1138
+ "loss": 0.6367,
1139
+ "step": 750
1140
+ },
1141
+ {
1142
+ "epoch": 2.6952253458277555,
1143
+ "grad_norm": 3.030888557434082,
1144
+ "learning_rate": 6.719859357631535e-05,
1145
+ "loss": 0.5681,
1146
+ "step": 755
1147
+ },
1148
+ {
1149
+ "epoch": 2.713074520303436,
1150
+ "grad_norm": 3.1176657676696777,
1151
+ "learning_rate": 6.670992746965938e-05,
1152
+ "loss": 0.5723,
1153
+ "step": 760
1154
+ },
1155
+ {
1156
+ "epoch": 2.7309236947791167,
1157
+ "grad_norm": 3.0151100158691406,
1158
+ "learning_rate": 6.621945790169036e-05,
1159
+ "loss": 0.6385,
1160
+ "step": 765
1161
+ },
1162
+ {
1163
+ "epoch": 2.748772869254797,
1164
+ "grad_norm": 3.4799766540527344,
1165
+ "learning_rate": 6.572723780758069e-05,
1166
+ "loss": 0.6665,
1167
+ "step": 770
1168
+ },
1169
+ {
1170
+ "epoch": 2.748772869254797,
1171
+ "eval_loss": 1.5236101150512695,
1172
+ "eval_runtime": 17.7462,
1173
+ "eval_samples_per_second": 2.592,
1174
+ "eval_steps_per_second": 2.592,
1175
+ "step": 770
1176
+ },
1177
+ {
1178
+ "epoch": 2.7666220437304774,
1179
+ "grad_norm": 3.1448163986206055,
1180
+ "learning_rate": 6.523332031143272e-05,
1181
+ "loss": 0.6083,
1182
+ "step": 775
1183
+ },
1184
+ {
1185
+ "epoch": 2.784471218206158,
1186
+ "grad_norm": 2.874833106994629,
1187
+ "learning_rate": 6.473775872054521e-05,
1188
+ "loss": 0.6493,
1189
+ "step": 780
1190
+ },
1191
+ {
1192
+ "epoch": 2.8023203926818385,
1193
+ "grad_norm": 3.2550127506256104,
1194
+ "learning_rate": 6.424060651966007e-05,
1195
+ "loss": 0.5722,
1196
+ "step": 785
1197
+ },
1198
+ {
1199
+ "epoch": 2.820169567157519,
1200
+ "grad_norm": 3.066908121109009,
1201
+ "learning_rate": 6.374191736518974e-05,
1202
+ "loss": 0.611,
1203
+ "step": 790
1204
+ },
1205
+ {
1206
+ "epoch": 2.8380187416331992,
1207
+ "grad_norm": 3.05871319770813,
1208
+ "learning_rate": 6.324174507942637e-05,
1209
+ "loss": 0.6202,
1210
+ "step": 795
1211
+ },
1212
+ {
1213
+ "epoch": 2.85586791610888,
1214
+ "grad_norm": 3.2599833011627197,
1215
+ "learning_rate": 6.274014364473274e-05,
1216
+ "loss": 0.5593,
1217
+ "step": 800
1218
+ },
1219
+ {
1220
+ "epoch": 2.8737170905845604,
1221
+ "grad_norm": 2.897418260574341,
1222
+ "learning_rate": 6.22371671977162e-05,
1223
+ "loss": 0.7415,
1224
+ "step": 805
1225
+ },
1226
+ {
1227
+ "epoch": 2.891566265060241,
1228
+ "grad_norm": 3.032317876815796,
1229
+ "learning_rate": 6.173287002338577e-05,
1230
+ "loss": 0.6544,
1231
+ "step": 810
1232
+ },
1233
+ {
1234
+ "epoch": 2.9094154395359215,
1235
+ "grad_norm": 2.7111008167266846,
1236
+ "learning_rate": 6.122730654929334e-05,
1237
+ "loss": 0.6421,
1238
+ "step": 815
1239
+ },
1240
+ {
1241
+ "epoch": 2.927264614011602,
1242
+ "grad_norm": 2.7735886573791504,
1243
+ "learning_rate": 6.072053133965938e-05,
1244
+ "loss": 0.6332,
1245
+ "step": 820
1246
+ },
1247
+ {
1248
+ "epoch": 2.9451137884872827,
1249
+ "grad_norm": 3.4417500495910645,
1250
+ "learning_rate": 6.021259908948402e-05,
1251
+ "loss": 0.6508,
1252
+ "step": 825
1253
+ },
1254
+ {
1255
+ "epoch": 2.962962962962963,
1256
+ "grad_norm": 3.432999849319458,
1257
+ "learning_rate": 5.970356461864391e-05,
1258
+ "loss": 0.621,
1259
+ "step": 830
1260
+ },
1261
+ {
1262
+ "epoch": 2.9808121374386434,
1263
+ "grad_norm": 3.470132827758789,
1264
+ "learning_rate": 5.919348286597569e-05,
1265
+ "loss": 0.6347,
1266
+ "step": 835
1267
+ },
1268
+ {
1269
+ "epoch": 2.998661311914324,
1270
+ "grad_norm": 3.153116226196289,
1271
+ "learning_rate": 5.868240888334653e-05,
1272
+ "loss": 0.6101,
1273
+ "step": 840
1274
+ },
1275
+ {
1276
+ "epoch": 2.998661311914324,
1277
+ "eval_loss": 1.5220016241073608,
1278
+ "eval_runtime": 17.7399,
1279
+ "eval_samples_per_second": 2.593,
1280
+ "eval_steps_per_second": 2.593,
1281
+ "step": 840
1282
+ },
1283
+ {
1284
+ "epoch": 3.0165104863900045,
1285
+ "grad_norm": 2.5395278930664062,
1286
+ "learning_rate": 5.8170397829712485e-05,
1287
+ "loss": 0.4183,
1288
+ "step": 845
1289
+ },
1290
+ {
1291
+ "epoch": 3.034359660865685,
1292
+ "grad_norm": 2.833970308303833,
1293
+ "learning_rate": 5.765750496516547e-05,
1294
+ "loss": 0.1667,
1295
+ "step": 850
1296
+ },
1297
+ {
1298
+ "epoch": 3.0522088353413657,
1299
+ "grad_norm": 3.447057008743286,
1300
+ "learning_rate": 5.714378564496901e-05,
1301
+ "loss": 0.255,
1302
+ "step": 855
1303
+ },
1304
+ {
1305
+ "epoch": 3.070058009817046,
1306
+ "grad_norm": 3.9993224143981934,
1307
+ "learning_rate": 5.6629295313583974e-05,
1308
+ "loss": 0.2424,
1309
+ "step": 860
1310
+ },
1311
+ {
1312
+ "epoch": 3.0879071842927264,
1313
+ "grad_norm": 3.626281499862671,
1314
+ "learning_rate": 5.611408949868457e-05,
1315
+ "loss": 0.2097,
1316
+ "step": 865
1317
+ },
1318
+ {
1319
+ "epoch": 3.105756358768407,
1320
+ "grad_norm": 2.693284034729004,
1321
+ "learning_rate": 5.559822380516539e-05,
1322
+ "loss": 0.2271,
1323
+ "step": 870
1324
+ },
1325
+ {
1326
+ "epoch": 3.1236055332440875,
1327
+ "grad_norm": 2.439389705657959,
1328
+ "learning_rate": 5.5081753909140096e-05,
1329
+ "loss": 0.1982,
1330
+ "step": 875
1331
+ },
1332
+ {
1333
+ "epoch": 3.141454707719768,
1334
+ "grad_norm": 2.6163575649261475,
1335
+ "learning_rate": 5.456473555193242e-05,
1336
+ "loss": 0.2192,
1337
+ "step": 880
1338
+ },
1339
+ {
1340
+ "epoch": 3.1593038821954487,
1341
+ "grad_norm": 2.405829668045044,
1342
+ "learning_rate": 5.404722453406017e-05,
1343
+ "loss": 0.2097,
1344
+ "step": 885
1345
+ },
1346
+ {
1347
+ "epoch": 3.177153056671129,
1348
+ "grad_norm": 2.819413423538208,
1349
+ "learning_rate": 5.3529276709212816e-05,
1350
+ "loss": 0.2213,
1351
+ "step": 890
1352
+ },
1353
+ {
1354
+ "epoch": 3.1950022311468094,
1355
+ "grad_norm": 3.6370203495025635,
1356
+ "learning_rate": 5.30109479782233e-05,
1357
+ "loss": 0.2559,
1358
+ "step": 895
1359
+ },
1360
+ {
1361
+ "epoch": 3.21285140562249,
1362
+ "grad_norm": 3.4090726375579834,
1363
+ "learning_rate": 5.249229428303486e-05,
1364
+ "loss": 0.1955,
1365
+ "step": 900
1366
+ },
1367
+ {
1368
+ "epoch": 3.2307005800981705,
1369
+ "grad_norm": 2.8171908855438232,
1370
+ "learning_rate": 5.197337160066331e-05,
1371
+ "loss": 0.2642,
1372
+ "step": 905
1373
+ },
1374
+ {
1375
+ "epoch": 3.248549754573851,
1376
+ "grad_norm": 3.926447629928589,
1377
+ "learning_rate": 5.145423593715557e-05,
1378
+ "loss": 0.2467,
1379
+ "step": 910
1380
+ },
1381
+ {
1382
+ "epoch": 3.248549754573851,
1383
+ "eval_loss": 1.8390079736709595,
1384
+ "eval_runtime": 17.7348,
1385
+ "eval_samples_per_second": 2.594,
1386
+ "eval_steps_per_second": 2.594,
1387
+ "step": 910
1388
+ },
1389
+ {
1390
+ "epoch": 3.266398929049531,
1391
+ "grad_norm": 2.7143030166625977,
1392
+ "learning_rate": 5.0934943321545115e-05,
1393
+ "loss": 0.2239,
1394
+ "step": 915
1395
+ },
1396
+ {
1397
+ "epoch": 3.284248103525212,
1398
+ "grad_norm": 2.717496871948242,
1399
+ "learning_rate": 5.041554979980486e-05,
1400
+ "loss": 0.1545,
1401
+ "step": 920
1402
+ },
1403
+ {
1404
+ "epoch": 3.3020972780008924,
1405
+ "grad_norm": 3.516397714614868,
1406
+ "learning_rate": 4.9896111428798254e-05,
1407
+ "loss": 0.2819,
1408
+ "step": 925
1409
+ },
1410
+ {
1411
+ "epoch": 3.319946452476573,
1412
+ "grad_norm": 3.3290677070617676,
1413
+ "learning_rate": 4.9376684270229254e-05,
1414
+ "loss": 0.3043,
1415
+ "step": 930
1416
+ },
1417
+ {
1418
+ "epoch": 3.3377956269522535,
1419
+ "grad_norm": 2.914736032485962,
1420
+ "learning_rate": 4.8857324384591653e-05,
1421
+ "loss": 0.2494,
1422
+ "step": 935
1423
+ },
1424
+ {
1425
+ "epoch": 3.355644801427934,
1426
+ "grad_norm": 3.37791109085083,
1427
+ "learning_rate": 4.8338087825118675e-05,
1428
+ "loss": 0.2271,
1429
+ "step": 940
1430
+ },
1431
+ {
1432
+ "epoch": 3.3734939759036147,
1433
+ "grad_norm": 3.295100688934326,
1434
+ "learning_rate": 4.781903063173321e-05,
1435
+ "loss": 0.242,
1436
+ "step": 945
1437
+ },
1438
+ {
1439
+ "epoch": 3.391343150379295,
1440
+ "grad_norm": 2.5792458057403564,
1441
+ "learning_rate": 4.730020882499964e-05,
1442
+ "loss": 0.2244,
1443
+ "step": 950
1444
+ },
1445
+ {
1446
+ "epoch": 3.4091923248549754,
1447
+ "grad_norm": 3.0014591217041016,
1448
+ "learning_rate": 4.678167840007767e-05,
1449
+ "loss": 0.2552,
1450
+ "step": 955
1451
+ },
1452
+ {
1453
+ "epoch": 3.427041499330656,
1454
+ "grad_norm": 3.207282066345215,
1455
+ "learning_rate": 4.626349532067879e-05,
1456
+ "loss": 0.2542,
1457
+ "step": 960
1458
+ },
1459
+ {
1460
+ "epoch": 3.4448906738063365,
1461
+ "grad_norm": 3.85109543800354,
1462
+ "learning_rate": 4.574571551302647e-05,
1463
+ "loss": 0.3249,
1464
+ "step": 965
1465
+ },
1466
+ {
1467
+ "epoch": 3.462739848282017,
1468
+ "grad_norm": 3.3335843086242676,
1469
+ "learning_rate": 4.522839485981994e-05,
1470
+ "loss": 0.2729,
1471
+ "step": 970
1472
+ },
1473
+ {
1474
+ "epoch": 3.480589022757697,
1475
+ "grad_norm": 2.885708808898926,
1476
+ "learning_rate": 4.471158919420312e-05,
1477
+ "loss": 0.2595,
1478
+ "step": 975
1479
+ },
1480
+ {
1481
+ "epoch": 3.498438197233378,
1482
+ "grad_norm": 3.215789556503296,
1483
+ "learning_rate": 4.4195354293738484e-05,
1484
+ "loss": 0.2284,
1485
+ "step": 980
1486
+ },
1487
+ {
1488
+ "epoch": 3.498438197233378,
1489
+ "eval_loss": 1.82525634765625,
1490
+ "eval_runtime": 17.7537,
1491
+ "eval_samples_per_second": 2.591,
1492
+ "eval_steps_per_second": 2.591,
1493
+ "step": 980
1494
+ },
1495
+ {
1496
+ "epoch": 3.5162873717090584,
1497
+ "grad_norm": 3.4772818088531494,
1498
+ "learning_rate": 4.367974587438733e-05,
1499
+ "loss": 0.1947,
1500
+ "step": 985
1501
+ },
1502
+ {
1503
+ "epoch": 3.534136546184739,
1504
+ "grad_norm": 2.6401774883270264,
1505
+ "learning_rate": 4.316481958449634e-05,
1506
+ "loss": 0.2352,
1507
+ "step": 990
1508
+ },
1509
+ {
1510
+ "epoch": 3.5519857206604195,
1511
+ "grad_norm": 3.997591733932495,
1512
+ "learning_rate": 4.2650630998791615e-05,
1513
+ "loss": 0.2047,
1514
+ "step": 995
1515
+ },
1516
+ {
1517
+ "epoch": 3.5698348951361,
1518
+ "grad_norm": 2.5615384578704834,
1519
+ "learning_rate": 4.213723561238074e-05,
1520
+ "loss": 0.2369,
1521
+ "step": 1000
1522
+ },
1523
+ {
1524
+ "epoch": 3.5876840696117807,
1525
+ "grad_norm": 2.5114736557006836,
1526
+ "learning_rate": 4.162468883476319e-05,
1527
+ "loss": 0.2416,
1528
+ "step": 1005
1529
+ },
1530
+ {
1531
+ "epoch": 3.605533244087461,
1532
+ "grad_norm": 4.23993444442749,
1533
+ "learning_rate": 4.111304598385018e-05,
1534
+ "loss": 0.2353,
1535
+ "step": 1010
1536
+ },
1537
+ {
1538
+ "epoch": 3.6233824185631414,
1539
+ "grad_norm": 3.239319324493408,
1540
+ "learning_rate": 4.060236227999441e-05,
1541
+ "loss": 0.2155,
1542
+ "step": 1015
1543
+ },
1544
+ {
1545
+ "epoch": 3.641231593038822,
1546
+ "grad_norm": 2.030393600463867,
1547
+ "learning_rate": 4.0092692840030134e-05,
1548
+ "loss": 0.2241,
1549
+ "step": 1020
1550
+ },
1551
+ {
1552
+ "epoch": 3.6590807675145025,
1553
+ "grad_norm": 3.636963367462158,
1554
+ "learning_rate": 3.9584092671324606e-05,
1555
+ "loss": 0.2408,
1556
+ "step": 1025
1557
+ },
1558
+ {
1559
+ "epoch": 3.676929941990183,
1560
+ "grad_norm": 4.295063495635986,
1561
+ "learning_rate": 3.907661666584131e-05,
1562
+ "loss": 0.2423,
1563
+ "step": 1030
1564
+ },
1565
+ {
1566
+ "epoch": 3.694779116465863,
1567
+ "grad_norm": 3.268596887588501,
1568
+ "learning_rate": 3.857031959421553e-05,
1569
+ "loss": 0.2581,
1570
+ "step": 1035
1571
+ },
1572
+ {
1573
+ "epoch": 3.7126282909415442,
1574
+ "grad_norm": 3.0428457260131836,
1575
+ "learning_rate": 3.806525609984312e-05,
1576
+ "loss": 0.206,
1577
+ "step": 1040
1578
+ },
1579
+ {
1580
+ "epoch": 3.7304774654172244,
1581
+ "grad_norm": 3.523777484893799,
1582
+ "learning_rate": 3.7561480692983006e-05,
1583
+ "loss": 0.1956,
1584
+ "step": 1045
1585
+ },
1586
+ {
1587
+ "epoch": 3.748326639892905,
1588
+ "grad_norm": 2.972714900970459,
1589
+ "learning_rate": 3.705904774487396e-05,
1590
+ "loss": 0.2839,
1591
+ "step": 1050
1592
+ },
1593
+ {
1594
+ "epoch": 3.748326639892905,
1595
+ "eval_loss": 1.8687995672225952,
1596
+ "eval_runtime": 17.732,
1597
+ "eval_samples_per_second": 2.594,
1598
+ "eval_steps_per_second": 2.594,
1599
+ "step": 1050
1600
+ },
1601
+ {
1602
+ "epoch": 3.7661758143685855,
1603
+ "grad_norm": 3.9769251346588135,
1604
+ "learning_rate": 3.655801148186655e-05,
1605
+ "loss": 0.2433,
1606
+ "step": 1055
1607
+ },
1608
+ {
1609
+ "epoch": 3.784024988844266,
1610
+ "grad_norm": 3.03606915473938,
1611
+ "learning_rate": 3.6058425979570485e-05,
1612
+ "loss": 0.2085,
1613
+ "step": 1060
1614
+ },
1615
+ {
1616
+ "epoch": 3.8018741633199467,
1617
+ "grad_norm": 3.5858893394470215,
1618
+ "learning_rate": 3.556034515701852e-05,
1619
+ "loss": 0.2277,
1620
+ "step": 1065
1621
+ },
1622
+ {
1623
+ "epoch": 3.819723337795627,
1624
+ "grad_norm": 2.5949602127075195,
1625
+ "learning_rate": 3.506382277084696e-05,
1626
+ "loss": 0.2497,
1627
+ "step": 1070
1628
+ },
1629
+ {
1630
+ "epoch": 3.8375725122713074,
1631
+ "grad_norm": 2.8706088066101074,
1632
+ "learning_rate": 3.4568912409493945e-05,
1633
+ "loss": 0.2462,
1634
+ "step": 1075
1635
+ },
1636
+ {
1637
+ "epoch": 3.855421686746988,
1638
+ "grad_norm": 3.238346576690674,
1639
+ "learning_rate": 3.4075667487415785e-05,
1640
+ "loss": 0.2004,
1641
+ "step": 1080
1642
+ },
1643
+ {
1644
+ "epoch": 3.8732708612226685,
1645
+ "grad_norm": 3.36478590965271,
1646
+ "learning_rate": 3.358414123932195e-05,
1647
+ "loss": 0.226,
1648
+ "step": 1085
1649
+ },
1650
+ {
1651
+ "epoch": 3.891120035698349,
1652
+ "grad_norm": 3.0954155921936035,
1653
+ "learning_rate": 3.3094386714429724e-05,
1654
+ "loss": 0.2114,
1655
+ "step": 1090
1656
+ },
1657
+ {
1658
+ "epoch": 3.908969210174029,
1659
+ "grad_norm": 3.016141891479492,
1660
+ "learning_rate": 3.2606456770738636e-05,
1661
+ "loss": 0.2694,
1662
+ "step": 1095
1663
+ },
1664
+ {
1665
+ "epoch": 3.9268183846497102,
1666
+ "grad_norm": 2.976658821105957,
1667
+ "learning_rate": 3.212040406932569e-05,
1668
+ "loss": 0.1828,
1669
+ "step": 1100
1670
+ },
1671
+ {
1672
+ "epoch": 3.9446675591253904,
1673
+ "grad_norm": 2.8186426162719727,
1674
+ "learning_rate": 3.163628106866172e-05,
1675
+ "loss": 0.1451,
1676
+ "step": 1105
1677
+ },
1678
+ {
1679
+ "epoch": 3.962516733601071,
1680
+ "grad_norm": 2.959024429321289,
1681
+ "learning_rate": 3.115414001894974e-05,
1682
+ "loss": 0.2349,
1683
+ "step": 1110
1684
+ },
1685
+ {
1686
+ "epoch": 3.9803659080767515,
1687
+ "grad_norm": 2.9852728843688965,
1688
+ "learning_rate": 3.067403295648566e-05,
1689
+ "loss": 0.2235,
1690
+ "step": 1115
1691
+ },
1692
+ {
1693
+ "epoch": 3.998215082552432,
1694
+ "grad_norm": 2.79172945022583,
1695
+ "learning_rate": 3.019601169804216e-05,
1696
+ "loss": 0.2111,
1697
+ "step": 1120
1698
+ },
1699
+ {
1700
+ "epoch": 3.998215082552432,
1701
+ "eval_loss": 1.891045093536377,
1702
+ "eval_runtime": 17.7382,
1703
+ "eval_samples_per_second": 2.593,
1704
+ "eval_steps_per_second": 2.593,
1705
+ "step": 1120
1706
+ }
1707
+ ],
1708
+ "logging_steps": 5,
1709
+ "max_steps": 1680,
1710
+ "num_input_tokens_seen": 0,
1711
+ "num_train_epochs": 6,
1712
+ "save_steps": 70,
1713
+ "stateful_callbacks": {
1714
+ "TrainerControl": {
1715
+ "args": {
1716
+ "should_epoch_stop": false,
1717
+ "should_evaluate": false,
1718
+ "should_log": false,
1719
+ "should_save": true,
1720
+ "should_training_stop": false
1721
+ },
1722
+ "attributes": {}
1723
+ }
1724
+ },
1725
+ "total_flos": 1.1790762820593254e+18,
1726
+ "train_batch_size": 2,
1727
+ "trial_name": null,
1728
+ "trial_params": null
1729
+ }
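Worth noting from the trainer_state.json log above: training loss keeps falling (from roughly 1.4 in epoch 1 to about 0.2 by epoch 4) while eval_loss bottoms out near 1.29-1.31 around epochs 1-2 and then climbs to about 1.89 by epoch 4, so the LoRA adapter appears to start overfitting well before the configured 6 epochs. A minimal sketch (stdlib only; the path assumes this repo's layout) for pulling that trajectory out of the file:

```python
# Minimal sketch: read the checkpoint's trainer_state.json and print the
# eval-loss trajectory, which is logged every 70 steps per the config above.
import json

with open("llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_loss" in entry:  # eval entries carry eval_loss; train entries carry loss
        print(f"step {entry['step']:4d}  epoch {entry['epoch']:.2f}  eval_loss {entry['eval_loss']:.4f}")
```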
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b061b26ebc79da396fea201dbc3aded12f572b2061bb961d9cec13867ed1c18f
3
+ size 5368
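The three lines above are a Git LFS pointer, not the binary itself; `git lfs pull` fetches the real file. Once fetched, training_args.bin is a pickled TrainingArguments object written by the HF Trainer, so it can be inspected with a short sketch like this (the weights_only=False flag is an assumption for recent PyTorch versions, whose torch.load defaults to weights-only unpickling):

```python
# Hedged sketch: inspect the pickled TrainingArguments. Requires `git lfs pull`
# first so training_args.bin is the actual binary rather than the pointer above.
import torch

args = torch.load(
    "llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/training_args.bin",
    weights_only=False,  # it is a pickled object, not a plain tensor dict
)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)
```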
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1120/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/README.md ADDED
@@ -0,0 +1,202 @@
1
+ ---
2
+ base_model: Qwen/Qwen2-72B-Instruct
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.11.1
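The model card above is still the unfilled PEFT template (its quick-start section reads "[More Information Needed]"), so here is a hedged quick-start sketch: load the base model, attach this LoRA checkpoint, and generate. The dtype and device_map choices are assumptions, and a 72B base model realistically needs multiple GPUs or quantization:

```python
# Hedged quick-start sketch for this LoRA checkpoint. Not from the model card;
# paths follow this repo's layout, dtype/device_map are assumptions.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

ckpt = "llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190"
tok = AutoTokenizer.from_pretrained(ckpt)
base = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-72B-Instruct", torch_dtype=torch.bfloat16, device_map="auto"
)
model = PeftModel.from_pretrained(base, ckpt)

prompt = tok.apply_chat_template(
    [{"role": "user", "content": "Say hi."}], tokenize=False
)
inputs = tok(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=64)
print(tok.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
```

For standalone deployment the adapter can also be folded into the base weights with model.merge_and_unload(), the standard PEFT call for removing the LoRA wrapper.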
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/adapter_config.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2-72B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "up_proj",
25
+ "gate_proj",
26
+ "v_proj",
27
+ "q_proj",
28
+ "down_proj",
29
+ "o_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
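For reference, the adapter_config.json above describes a rank-8 LoRA with alpha 16 (an effective scaling of alpha/r = 2.0) applied to all seven linear projections of each transformer block, attention (q/k/v/o_proj) and MLP (gate/up/down_proj) alike. A hedged sketch of the equivalent PEFT LoraConfig:

```python
# Hedged sketch: the PEFT LoraConfig matching the adapter_config.json above.
from peft import LoraConfig

lora_cfg = LoraConfig(
    r=8,                 # LoRA rank
    lora_alpha=16,       # scaling = lora_alpha / r = 2.0
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
        "gate_proj", "up_proj", "down_proj",      # MLP projections
    ],
)
```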
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffc414420e2ebd13164f6070829e0f38f1c5adb0b36096321c79b3394feb3e76
3
+ size 421218912
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/added_tokens.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "<|endoftext|>": 151643,
3
+ "<|im_end|>": 151645,
4
+ "<|im_start|>": 151644
5
+ }
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df468211295bce1660f965172990782116fc79224957b04f9e2cbb63b0d5130c
3
+ size 843085810
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ec28ea0c416565eeac14a0e9c944f185ac250f4ed4bd15c84ff77ed78ba9301
3
+ size 14244
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:286bdc0977b4416304a0005b04424c27c5335a69c4eac00cd2a430d254b088ef
3
+ size 1064
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "eos_token": {
7
+ "content": "<|im_end|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "pad_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ }
20
+ }
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": [
30
+ "<|im_start|>",
31
+ "<|im_end|>"
32
+ ],
33
+ "bos_token": null,
34
+ "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
35
+ "clean_up_tokenization_spaces": false,
36
+ "eos_token": "<|im_end|>",
37
+ "errors": "replace",
38
+ "model_max_length": 131072,
39
+ "pad_token": "<|endoftext|>",
40
+ "padding_side": "right",
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "Qwen2Tokenizer",
43
+ "unk_token": null
44
+ }
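The chat_template field above is the standard ChatML format with a default system prompt; note that this particular template appends the <|im_start|>assistant header after every user turn by itself, so add_generation_prompt is not needed. A hedged sketch of how the tokenizer renders a conversation with it:

```python
# Hedged sketch: render a one-turn conversation with the chat_template above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190")
prompt = tok.apply_chat_template(
    [{"role": "user", "content": "Hello!"}],
    tokenize=False,
)
print(prompt)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant
```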
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/trainer_state.json ADDED
@@ -0,0 +1,1835 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 4.248103525211959,
5
+ "eval_steps": 70,
6
+ "global_step": 1190,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0178491744756805,
13
+ "grad_norm": 1.8217403888702393,
14
+ "learning_rate": 2.9761904761904763e-06,
15
+ "loss": 2.7425,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.035698348951361,
20
+ "grad_norm": 2.104698419570923,
21
+ "learning_rate": 5.9523809523809525e-06,
22
+ "loss": 2.861,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.0535475234270415,
27
+ "grad_norm": 2.7389333248138428,
28
+ "learning_rate": 8.92857142857143e-06,
29
+ "loss": 2.8281,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.071396697902722,
34
+ "grad_norm": 3.9298207759857178,
35
+ "learning_rate": 1.1904761904761905e-05,
36
+ "loss": 3.1888,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.0892458723784025,
41
+ "grad_norm": 2.648014783859253,
42
+ "learning_rate": 1.4880952380952381e-05,
43
+ "loss": 2.6461,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.107095046854083,
48
+ "grad_norm": 1.587472915649414,
49
+ "learning_rate": 1.785714285714286e-05,
50
+ "loss": 2.3212,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.1249442213297635,
55
+ "grad_norm": 0.8390935063362122,
56
+ "learning_rate": 2.0833333333333336e-05,
57
+ "loss": 1.8036,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.142793395805444,
62
+ "grad_norm": 0.46670979261398315,
63
+ "learning_rate": 2.380952380952381e-05,
64
+ "loss": 1.5552,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.1606425702811245,
69
+ "grad_norm": 0.45171597599983215,
70
+ "learning_rate": 2.6785714285714288e-05,
71
+ "loss": 1.6626,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 0.178491744756805,
76
+ "grad_norm": 0.5605499744415283,
77
+ "learning_rate": 2.9761904761904762e-05,
78
+ "loss": 1.4897,
79
+ "step": 50
80
+ },
81
+ {
82
+ "epoch": 0.1963409192324855,
83
+ "grad_norm": 0.5553259253501892,
84
+ "learning_rate": 3.273809523809524e-05,
85
+ "loss": 1.5373,
86
+ "step": 55
87
+ },
88
+ {
89
+ "epoch": 0.214190093708166,
90
+ "grad_norm": 0.6260251402854919,
91
+ "learning_rate": 3.571428571428572e-05,
92
+ "loss": 1.4779,
93
+ "step": 60
94
+ },
95
+ {
96
+ "epoch": 0.2320392681838465,
97
+ "grad_norm": 0.6063796877861023,
98
+ "learning_rate": 3.8690476190476195e-05,
99
+ "loss": 1.483,
100
+ "step": 65
101
+ },
102
+ {
103
+ "epoch": 0.249888442659527,
104
+ "grad_norm": 0.5549850463867188,
105
+ "learning_rate": 4.166666666666667e-05,
106
+ "loss": 1.5022,
107
+ "step": 70
108
+ },
109
+ {
110
+ "epoch": 0.249888442659527,
111
+ "eval_loss": 1.451762318611145,
112
+ "eval_runtime": 17.7549,
113
+ "eval_samples_per_second": 2.591,
114
+ "eval_steps_per_second": 2.591,
115
+ "step": 70
116
+ },
117
+ {
118
+ "epoch": 0.2677376171352075,
119
+ "grad_norm": 0.482930988073349,
120
+ "learning_rate": 4.464285714285715e-05,
121
+ "loss": 1.4256,
122
+ "step": 75
123
+ },
124
+ {
125
+ "epoch": 0.285586791610888,
126
+ "grad_norm": 0.4240593910217285,
127
+ "learning_rate": 4.761904761904762e-05,
128
+ "loss": 1.3655,
129
+ "step": 80
130
+ },
131
+ {
132
+ "epoch": 0.3034359660865685,
133
+ "grad_norm": 0.4872314929962158,
134
+ "learning_rate": 5.05952380952381e-05,
135
+ "loss": 1.4478,
136
+ "step": 85
137
+ },
138
+ {
139
+ "epoch": 0.321285140562249,
140
+ "grad_norm": 0.42132768034935,
141
+ "learning_rate": 5.3571428571428575e-05,
142
+ "loss": 1.3305,
143
+ "step": 90
144
+ },
145
+ {
146
+ "epoch": 0.3391343150379295,
147
+ "grad_norm": 0.6932046413421631,
148
+ "learning_rate": 5.6547619047619046e-05,
149
+ "loss": 1.4279,
150
+ "step": 95
151
+ },
152
+ {
153
+ "epoch": 0.35698348951361,
154
+ "grad_norm": 0.6714524626731873,
155
+ "learning_rate": 5.9523809523809524e-05,
156
+ "loss": 1.4967,
157
+ "step": 100
158
+ },
159
+ {
160
+ "epoch": 0.3748326639892905,
161
+ "grad_norm": 0.5682816505432129,
162
+ "learning_rate": 6.25e-05,
163
+ "loss": 1.4739,
164
+ "step": 105
165
+ },
166
+ {
167
+ "epoch": 0.392681838464971,
168
+ "grad_norm": 0.7795937657356262,
169
+ "learning_rate": 6.547619047619048e-05,
170
+ "loss": 1.3751,
171
+ "step": 110
172
+ },
173
+ {
174
+ "epoch": 0.4105310129406515,
175
+ "grad_norm": 0.8056842088699341,
176
+ "learning_rate": 6.845238095238096e-05,
177
+ "loss": 1.3699,
178
+ "step": 115
179
+ },
180
+ {
181
+ "epoch": 0.428380187416332,
182
+ "grad_norm": 0.8373801112174988,
183
+ "learning_rate": 7.142857142857143e-05,
184
+ "loss": 1.4696,
185
+ "step": 120
186
+ },
187
+ {
188
+ "epoch": 0.4462293618920125,
189
+ "grad_norm": 1.0051416158676147,
190
+ "learning_rate": 7.440476190476191e-05,
191
+ "loss": 1.4059,
192
+ "step": 125
193
+ },
194
+ {
195
+ "epoch": 0.464078536367693,
196
+ "grad_norm": 0.5304180383682251,
197
+ "learning_rate": 7.738095238095239e-05,
198
+ "loss": 1.3072,
199
+ "step": 130
200
+ },
201
+ {
202
+ "epoch": 0.4819277108433735,
203
+ "grad_norm": 0.8797634243965149,
204
+ "learning_rate": 8.035714285714287e-05,
205
+ "loss": 1.4132,
206
+ "step": 135
207
+ },
208
+ {
209
+ "epoch": 0.499776885319054,
210
+ "grad_norm": 0.9049625396728516,
211
+ "learning_rate": 8.333333333333334e-05,
212
+ "loss": 1.4121,
213
+ "step": 140
214
+ },
215
+ {
216
+ "epoch": 0.499776885319054,
217
+ "eval_loss": 1.3727394342422485,
218
+ "eval_runtime": 17.745,
219
+ "eval_samples_per_second": 2.592,
220
+ "eval_steps_per_second": 2.592,
221
+ "step": 140
222
+ },
223
+ {
224
+ "epoch": 0.5176260597947345,
225
+ "grad_norm": 0.6793915033340454,
226
+ "learning_rate": 8.630952380952382e-05,
227
+ "loss": 1.3109,
228
+ "step": 145
229
+ },
230
+ {
231
+ "epoch": 0.535475234270415,
232
+ "grad_norm": 0.7171015739440918,
233
+ "learning_rate": 8.92857142857143e-05,
234
+ "loss": 1.3781,
235
+ "step": 150
236
+ },
237
+ {
238
+ "epoch": 0.5533244087460955,
239
+ "grad_norm": 0.6738716959953308,
240
+ "learning_rate": 9.226190476190478e-05,
241
+ "loss": 1.3564,
242
+ "step": 155
243
+ },
244
+ {
245
+ "epoch": 0.571173583221776,
246
+ "grad_norm": 0.699975311756134,
247
+ "learning_rate": 9.523809523809524e-05,
248
+ "loss": 1.2387,
249
+ "step": 160
250
+ },
251
+ {
252
+ "epoch": 0.5890227576974565,
253
+ "grad_norm": 0.7659904956817627,
254
+ "learning_rate": 9.821428571428572e-05,
255
+ "loss": 1.3042,
256
+ "step": 165
257
+ },
258
+ {
259
+ "epoch": 0.606871932173137,
260
+ "grad_norm": 0.9782125353813171,
261
+ "learning_rate": 9.999956828659095e-05,
262
+ "loss": 1.3709,
263
+ "step": 170
264
+ },
265
+ {
266
+ "epoch": 0.6247211066488175,
267
+ "grad_norm": 1.0532957315444946,
268
+ "learning_rate": 9.999471159635539e-05,
269
+ "loss": 1.3844,
270
+ "step": 175
271
+ },
272
+ {
273
+ "epoch": 0.642570281124498,
274
+ "grad_norm": 0.7373877167701721,
275
+ "learning_rate": 9.998445910004082e-05,
276
+ "loss": 1.2852,
277
+ "step": 180
278
+ },
279
+ {
280
+ "epoch": 0.6604194556001785,
281
+ "grad_norm": 1.0207768678665161,
282
+ "learning_rate": 9.996881190417393e-05,
283
+ "loss": 1.4652,
284
+ "step": 185
285
+ },
286
+ {
287
+ "epoch": 0.678268630075859,
288
+ "grad_norm": 0.7943917512893677,
289
+ "learning_rate": 9.994777169751806e-05,
290
+ "loss": 1.3743,
291
+ "step": 190
292
+ },
293
+ {
294
+ "epoch": 0.6961178045515395,
295
+ "grad_norm": 0.7461659908294678,
296
+ "learning_rate": 9.992134075089084e-05,
297
+ "loss": 1.2423,
298
+ "step": 195
299
+ },
300
+ {
301
+ "epoch": 0.71396697902722,
302
+ "grad_norm": 0.9689913988113403,
303
+ "learning_rate": 9.988952191691925e-05,
304
+ "loss": 1.3113,
305
+ "step": 200
306
+ },
307
+ {
308
+ "epoch": 0.7318161535029005,
309
+ "grad_norm": 0.766276478767395,
310
+ "learning_rate": 9.985231862973168e-05,
311
+ "loss": 1.3524,
312
+ "step": 205
313
+ },
314
+ {
315
+ "epoch": 0.749665327978581,
316
+ "grad_norm": 0.6728419661521912,
317
+ "learning_rate": 9.980973490458728e-05,
318
+ "loss": 1.4038,
319
+ "step": 210
320
+ },
321
+ {
322
+ "epoch": 0.749665327978581,
323
+ "eval_loss": 1.3051044940948486,
324
+ "eval_runtime": 17.7559,
325
+ "eval_samples_per_second": 2.591,
326
+ "eval_steps_per_second": 2.591,
327
+ "step": 210
328
+ },
329
+ {
330
+ "epoch": 0.7675145024542614,
331
+ "grad_norm": 1.0456575155258179,
332
+ "learning_rate": 9.976177533744261e-05,
333
+ "loss": 1.3626,
334
+ "step": 215
335
+ },
336
+ {
337
+ "epoch": 0.785363676929942,
338
+ "grad_norm": 0.9017456769943237,
339
+ "learning_rate": 9.97084451044556e-05,
340
+ "loss": 1.3232,
341
+ "step": 220
342
+ },
343
+ {
344
+ "epoch": 0.8032128514056225,
345
+ "grad_norm": 0.9113703966140747,
346
+ "learning_rate": 9.964974996142698e-05,
347
+ "loss": 1.2826,
348
+ "step": 225
349
+ },
350
+ {
351
+ "epoch": 0.821062025881303,
352
+ "grad_norm": 0.7177279591560364,
353
+ "learning_rate": 9.958569624317893e-05,
354
+ "loss": 1.2794,
355
+ "step": 230
356
+ },
357
+ {
358
+ "epoch": 0.8389112003569835,
359
+ "grad_norm": 0.9058728814125061,
360
+ "learning_rate": 9.951629086287151e-05,
361
+ "loss": 1.3853,
362
+ "step": 235
363
+ },
364
+ {
365
+ "epoch": 0.856760374832664,
366
+ "grad_norm": 0.6813459992408752,
367
+ "learning_rate": 9.944154131125642e-05,
368
+ "loss": 1.3533,
369
+ "step": 240
370
+ },
371
+ {
372
+ "epoch": 0.8746095493083444,
373
+ "grad_norm": 0.7113555073738098,
374
+ "learning_rate": 9.936145565586871e-05,
375
+ "loss": 1.3395,
376
+ "step": 245
377
+ },
378
+ {
379
+ "epoch": 0.892458723784025,
380
+ "grad_norm": 1.243597149848938,
381
+ "learning_rate": 9.927604254015585e-05,
382
+ "loss": 1.443,
383
+ "step": 250
384
+ },
385
+ {
386
+ "epoch": 0.9103078982597055,
387
+ "grad_norm": 0.8651953339576721,
388
+ "learning_rate": 9.918531118254507e-05,
389
+ "loss": 1.398,
390
+ "step": 255
391
+ },
392
+ {
393
+ "epoch": 0.928157072735386,
394
+ "grad_norm": 0.8877395987510681,
395
+ "learning_rate": 9.90892713754483e-05,
396
+ "loss": 1.346,
397
+ "step": 260
398
+ },
399
+ {
400
+ "epoch": 0.9460062472110665,
401
+ "grad_norm": 0.8857008814811707,
402
+ "learning_rate": 9.898793348420536e-05,
403
+ "loss": 1.3921,
404
+ "step": 265
405
+ },
406
+ {
407
+ "epoch": 0.963855421686747,
408
+ "grad_norm": 0.8319969177246094,
409
+ "learning_rate": 9.888130844596524e-05,
410
+ "loss": 1.3838,
411
+ "step": 270
412
+ },
413
+ {
414
+ "epoch": 0.9817045961624274,
415
+ "grad_norm": 0.7452044486999512,
416
+ "learning_rate": 9.876940776850569e-05,
417
+ "loss": 1.3529,
418
+ "step": 275
419
+ },
420
+ {
421
+ "epoch": 0.999553770638108,
422
+ "grad_norm": 0.7535015940666199,
423
+ "learning_rate": 9.865224352899119e-05,
424
+ "loss": 1.2739,
425
+ "step": 280
426
+ },
427
+ {
428
+ "epoch": 0.999553770638108,
429
+ "eval_loss": 1.289029836654663,
430
+ "eval_runtime": 17.7491,
431
+ "eval_samples_per_second": 2.592,
432
+ "eval_steps_per_second": 2.592,
433
+ "step": 280
434
+ },
435
+ {
436
+ "epoch": 1.0174029451137885,
437
+ "grad_norm": 0.7779117226600647,
438
+ "learning_rate": 9.852982837266955e-05,
439
+ "loss": 1.2339,
440
+ "step": 285
441
+ },
442
+ {
443
+ "epoch": 1.035252119589469,
444
+ "grad_norm": 0.8113610744476318,
445
+ "learning_rate": 9.840217551150706e-05,
446
+ "loss": 1.0982,
447
+ "step": 290
448
+ },
449
+ {
450
+ "epoch": 1.0531012940651494,
451
+ "grad_norm": 1.004701852798462,
452
+ "learning_rate": 9.826929872276255e-05,
453
+ "loss": 1.2537,
454
+ "step": 295
455
+ },
456
+ {
457
+ "epoch": 1.07095046854083,
458
+ "grad_norm": 1.524734616279602,
459
+ "learning_rate": 9.81312123475006e-05,
460
+ "loss": 1.1664,
461
+ "step": 300
462
+ },
463
+ {
464
+ "epoch": 1.0887996430165106,
465
+ "grad_norm": 1.5680856704711914,
466
+ "learning_rate": 9.798793128904356e-05,
467
+ "loss": 1.08,
468
+ "step": 305
469
+ },
470
+ {
471
+ "epoch": 1.106648817492191,
472
+ "grad_norm": 1.4838035106658936,
473
+ "learning_rate": 9.78394710113631e-05,
474
+ "loss": 1.1029,
475
+ "step": 310
476
+ },
477
+ {
478
+ "epoch": 1.1244979919678715,
479
+ "grad_norm": 1.522316575050354,
480
+ "learning_rate": 9.768584753741134e-05,
481
+ "loss": 1.1524,
482
+ "step": 315
483
+ },
484
+ {
485
+ "epoch": 1.142347166443552,
486
+ "grad_norm": 1.3976528644561768,
487
+ "learning_rate": 9.752707744739145e-05,
488
+ "loss": 1.1328,
489
+ "step": 320
490
+ },
491
+ {
492
+ "epoch": 1.1601963409192324,
493
+ "grad_norm": 1.4764764308929443,
494
+ "learning_rate": 9.736317787696816e-05,
495
+ "loss": 1.1174,
496
+ "step": 325
497
+ },
498
+ {
499
+ "epoch": 1.178045515394913,
500
+ "grad_norm": 1.3623173236846924,
501
+ "learning_rate": 9.719416651541839e-05,
502
+ "loss": 1.0493,
503
+ "step": 330
504
+ },
505
+ {
506
+ "epoch": 1.1958946898705936,
507
+ "grad_norm": 1.3625001907348633,
508
+ "learning_rate": 9.702006160372209e-05,
509
+ "loss": 1.0479,
510
+ "step": 335
511
+ },
512
+ {
513
+ "epoch": 1.213743864346274,
514
+ "grad_norm": 1.7509726285934448,
515
+ "learning_rate": 9.684088193259355e-05,
516
+ "loss": 1.1043,
517
+ "step": 340
518
+ },
519
+ {
520
+ "epoch": 1.2315930388219545,
521
+ "grad_norm": 1.5920188426971436,
522
+ "learning_rate": 9.665664684045333e-05,
523
+ "loss": 1.1096,
524
+ "step": 345
525
+ },
526
+ {
527
+ "epoch": 1.249442213297635,
528
+ "grad_norm": 1.6554943323135376,
529
+ "learning_rate": 9.646737621134112e-05,
530
+ "loss": 1.1436,
531
+ "step": 350
532
+ },
533
+ {
534
+ "epoch": 1.249442213297635,
535
+ "eval_loss": 1.3194608688354492,
536
+ "eval_runtime": 17.7382,
537
+ "eval_samples_per_second": 2.593,
538
+ "eval_steps_per_second": 2.593,
539
+ "step": 350
540
+ },
541
+ {
542
+ "epoch": 1.2672913877733154,
543
+ "grad_norm": 1.881818175315857,
544
+ "learning_rate": 9.627309047276974e-05,
545
+ "loss": 1.0549,
546
+ "step": 355
547
+ },
548
+ {
549
+ "epoch": 1.285140562248996,
550
+ "grad_norm": 1.8770464658737183,
551
+ "learning_rate": 9.607381059352038e-05,
552
+ "loss": 1.1576,
553
+ "step": 360
554
+ },
555
+ {
556
+ "epoch": 1.3029897367246766,
557
+ "grad_norm": 1.6901912689208984,
558
+ "learning_rate": 9.586955808137958e-05,
559
+ "loss": 1.1246,
560
+ "step": 365
561
+ },
562
+ {
563
+ "epoch": 1.320838911200357,
564
+ "grad_norm": 1.7667070627212524,
565
+ "learning_rate": 9.566035498081784e-05,
566
+ "loss": 1.125,
567
+ "step": 370
568
+ },
569
+ {
570
+ "epoch": 1.3386880856760375,
571
+ "grad_norm": 1.6150933504104614,
572
+ "learning_rate": 9.544622387061055e-05,
573
+ "loss": 1.1687,
574
+ "step": 375
575
+ },
576
+ {
577
+ "epoch": 1.356537260151718,
578
+ "grad_norm": 1.5824884176254272,
579
+ "learning_rate": 9.522718786140097e-05,
580
+ "loss": 0.9699,
581
+ "step": 380
582
+ },
583
+ {
584
+ "epoch": 1.3743864346273984,
585
+ "grad_norm": 1.5410280227661133,
586
+ "learning_rate": 9.500327059320606e-05,
587
+ "loss": 1.1379,
588
+ "step": 385
589
+ },
590
+ {
591
+ "epoch": 1.392235609103079,
592
+ "grad_norm": 2.264235496520996,
593
+ "learning_rate": 9.477449623286505e-05,
594
+ "loss": 1.0511,
595
+ "step": 390
596
+ },
597
+ {
598
+ "epoch": 1.4100847835787595,
599
+ "grad_norm": 1.7440612316131592,
600
+ "learning_rate": 9.454088947143116e-05,
601
+ "loss": 1.0003,
602
+ "step": 395
603
+ },
604
+ {
605
+ "epoch": 1.42793395805444,
606
+ "grad_norm": 1.770466923713684,
607
+ "learning_rate": 9.430247552150673e-05,
608
+ "loss": 1.1631,
609
+ "step": 400
610
+ },
611
+ {
612
+ "epoch": 1.4457831325301205,
613
+ "grad_norm": 1.9537169933319092,
614
+ "learning_rate": 9.405928011452211e-05,
615
+ "loss": 1.045,
616
+ "step": 405
617
+ },
618
+ {
619
+ "epoch": 1.463632307005801,
620
+ "grad_norm": 1.452445387840271,
621
+ "learning_rate": 9.381132949795861e-05,
622
+ "loss": 1.0511,
623
+ "step": 410
624
+ },
625
+ {
626
+ "epoch": 1.4814814814814814,
627
+ "grad_norm": 2.176547050476074,
628
+ "learning_rate": 9.35586504325155e-05,
629
+ "loss": 1.1637,
630
+ "step": 415
631
+ },
632
+ {
633
+ "epoch": 1.499330655957162,
634
+ "grad_norm": 2.15567684173584,
635
+ "learning_rate": 9.330127018922194e-05,
636
+ "loss": 1.0783,
637
+ "step": 420
638
+ },
639
+ {
640
+ "epoch": 1.499330655957162,
641
+ "eval_loss": 1.3106330633163452,
642
+ "eval_runtime": 17.7447,
643
+ "eval_samples_per_second": 2.592,
644
+ "eval_steps_per_second": 2.592,
645
+ "step": 420
646
+ },
647
+ {
648
+ "epoch": 1.5171798304328425,
649
+ "grad_norm": 1.6800014972686768,
650
+ "learning_rate": 9.303921654649362e-05,
651
+ "loss": 1.0406,
652
+ "step": 425
653
+ },
654
+ {
655
+ "epoch": 1.5350290049085231,
656
+ "grad_norm": 1.926607370376587,
657
+ "learning_rate": 9.277251778713474e-05,
658
+ "loss": 1.1469,
659
+ "step": 430
660
+ },
661
+ {
662
+ "epoch": 1.5528781793842035,
663
+ "grad_norm": 1.7155028581619263,
664
+ "learning_rate": 9.250120269528546e-05,
665
+ "loss": 1.0453,
666
+ "step": 435
667
+ },
668
+ {
669
+ "epoch": 1.5707273538598838,
670
+ "grad_norm": 1.9001247882843018,
671
+ "learning_rate": 9.22253005533154e-05,
672
+ "loss": 1.0611,
673
+ "step": 440
674
+ },
675
+ {
676
+ "epoch": 1.5885765283355644,
677
+ "grad_norm": 2.2804248332977295,
678
+ "learning_rate": 9.194484113866313e-05,
679
+ "loss": 1.082,
680
+ "step": 445
681
+ },
682
+ {
683
+ "epoch": 1.606425702811245,
684
+ "grad_norm": 1.9318439960479736,
685
+ "learning_rate": 9.165985472062246e-05,
686
+ "loss": 1.2404,
687
+ "step": 450
688
+ },
689
+ {
690
+ "epoch": 1.6242748772869255,
691
+ "grad_norm": 1.6018136739730835,
692
+ "learning_rate": 9.137037205707552e-05,
693
+ "loss": 1.0436,
694
+ "step": 455
695
+ },
696
+ {
697
+ "epoch": 1.6421240517626061,
698
+ "grad_norm": 2.1986541748046875,
699
+ "learning_rate": 9.107642439117321e-05,
700
+ "loss": 1.1227,
701
+ "step": 460
702
+ },
703
+ {
704
+ "epoch": 1.6599732262382865,
705
+ "grad_norm": 1.5558295249938965,
706
+ "learning_rate": 9.077804344796302e-05,
707
+ "loss": 1.0858,
708
+ "step": 465
709
+ },
710
+ {
711
+ "epoch": 1.6778224007139668,
712
+ "grad_norm": 1.8423618078231812,
713
+ "learning_rate": 9.04752614309652e-05,
714
+ "loss": 1.0998,
715
+ "step": 470
716
+ },
717
+ {
718
+ "epoch": 1.6956715751896474,
719
+ "grad_norm": 1.9065622091293335,
720
+ "learning_rate": 9.01681110186971e-05,
721
+ "loss": 1.0433,
722
+ "step": 475
723
+ },
724
+ {
725
+ "epoch": 1.713520749665328,
726
+ "grad_norm": 2.0103020668029785,
727
+ "learning_rate": 8.985662536114613e-05,
728
+ "loss": 1.0798,
729
+ "step": 480
730
+ },
731
+ {
732
+ "epoch": 1.7313699241410085,
733
+ "grad_norm": 1.5299313068389893,
734
+ "learning_rate": 8.954083807619208e-05,
735
+ "loss": 1.1012,
736
+ "step": 485
737
+ },
738
+ {
739
+ "epoch": 1.7492190986166891,
740
+ "grad_norm": 1.6331924200057983,
741
+ "learning_rate": 8.922078324597879e-05,
742
+ "loss": 1.1219,
743
+ "step": 490
744
+ },
745
+ {
746
+ "epoch": 1.7492190986166891,
747
+ "eval_loss": 1.3044873476028442,
748
+ "eval_runtime": 17.7401,
749
+ "eval_samples_per_second": 2.593,
750
+ "eval_steps_per_second": 2.593,
751
+ "step": 490
752
+ },
753
+ {
754
+ "epoch": 1.7670682730923695,
755
+ "grad_norm": 1.6050705909729004,
756
+ "learning_rate": 8.889649541323574e-05,
757
+ "loss": 1.16,
758
+ "step": 495
759
+ },
760
+ {
761
+ "epoch": 1.7849174475680498,
762
+ "grad_norm": 1.7604998350143433,
763
+ "learning_rate": 8.856800957755e-05,
764
+ "loss": 1.091,
765
+ "step": 500
766
+ },
767
+ {
768
+ "epoch": 1.8027666220437304,
769
+ "grad_norm": 1.6485258340835571,
770
+ "learning_rate": 8.823536119158864e-05,
771
+ "loss": 1.072,
772
+ "step": 505
773
+ },
774
+ {
775
+ "epoch": 1.820615796519411,
776
+ "grad_norm": 1.8173716068267822,
777
+ "learning_rate": 8.789858615727265e-05,
778
+ "loss": 1.0635,
779
+ "step": 510
780
+ },
781
+ {
782
+ "epoch": 1.8384649709950915,
783
+ "grad_norm": 1.468127965927124,
784
+ "learning_rate": 8.755772082190194e-05,
785
+ "loss": 1.0258,
786
+ "step": 515
787
+ },
788
+ {
789
+ "epoch": 1.8563141454707721,
790
+ "grad_norm": 1.4476536512374878,
791
+ "learning_rate": 8.721280197423258e-05,
792
+ "loss": 1.2011,
793
+ "step": 520
794
+ },
795
+ {
796
+ "epoch": 1.8741633199464525,
797
+ "grad_norm": 2.054915189743042,
798
+ "learning_rate": 8.68638668405062e-05,
799
+ "loss": 1.0539,
800
+ "step": 525
801
+ },
802
+ {
803
+ "epoch": 1.8920124944221328,
804
+ "grad_norm": 1.8471094369888306,
805
+ "learning_rate": 8.651095308043232e-05,
806
+ "loss": 1.0948,
807
+ "step": 530
808
+ },
809
+ {
810
+ "epoch": 1.9098616688978134,
811
+ "grad_norm": 1.7790355682373047,
812
+ "learning_rate": 8.61540987831238e-05,
813
+ "loss": 1.1245,
814
+ "step": 535
815
+ },
816
+ {
817
+ "epoch": 1.927710843373494,
818
+ "grad_norm": 1.6644902229309082,
819
+ "learning_rate": 8.579334246298593e-05,
820
+ "loss": 1.2039,
821
+ "step": 540
822
+ },
823
+ {
824
+ "epoch": 1.9455600178491745,
825
+ "grad_norm": 1.9952303171157837,
826
+ "learning_rate": 8.542872305555978e-05,
827
+ "loss": 1.1077,
828
+ "step": 545
829
+ },
830
+ {
831
+ "epoch": 1.9634091923248551,
832
+ "grad_norm": 2.225977659225464,
833
+ "learning_rate": 8.50602799133199e-05,
834
+ "loss": 1.0603,
835
+ "step": 550
836
+ },
837
+ {
838
+ "epoch": 1.9812583668005355,
839
+ "grad_norm": 1.777342438697815,
840
+ "learning_rate": 8.468805280142709e-05,
841
+ "loss": 1.1376,
842
+ "step": 555
843
+ },
844
+ {
845
+ "epoch": 1.9991075412762158,
846
+ "grad_norm": 2.2195017337799072,
847
+ "learning_rate": 8.43120818934367e-05,
848
+ "loss": 1.0966,
849
+ "step": 560
850
+ },
851
+ {
852
+ "epoch": 1.9991075412762158,
853
+ "eval_loss": 1.3094360828399658,
854
+ "eval_runtime": 17.7539,
855
+ "eval_samples_per_second": 2.591,
856
+ "eval_steps_per_second": 2.591,
857
+ "step": 560
858
+ },
859
+ {
860
+ "epoch": 2.0169567157518964,
861
+ "grad_norm": 2.012312173843384,
862
+ "learning_rate": 8.393240776696274e-05,
863
+ "loss": 0.6867,
864
+ "step": 565
865
+ },
866
+ {
867
+ "epoch": 2.034805890227577,
868
+ "grad_norm": 3.092951774597168,
869
+ "learning_rate": 8.354907139929851e-05,
870
+ "loss": 0.6025,
871
+ "step": 570
872
+ },
873
+ {
874
+ "epoch": 2.0526550647032575,
875
+ "grad_norm": 4.8303399085998535,
876
+ "learning_rate": 8.316211416299397e-05,
877
+ "loss": 0.6497,
878
+ "step": 575
879
+ },
880
+ {
881
+ "epoch": 2.070504239178938,
882
+ "grad_norm": 3.1457698345184326,
883
+ "learning_rate": 8.27715778213905e-05,
884
+ "loss": 0.5803,
885
+ "step": 580
886
+ },
887
+ {
888
+ "epoch": 2.0883534136546187,
889
+ "grad_norm": 2.5240321159362793,
890
+ "learning_rate": 8.237750452411353e-05,
891
+ "loss": 0.494,
892
+ "step": 585
893
+ },
894
+ {
895
+ "epoch": 2.106202588130299,
896
+ "grad_norm": 2.630946636199951,
897
+ "learning_rate": 8.197993680252334e-05,
898
+ "loss": 0.6428,
899
+ "step": 590
900
+ },
901
+ {
902
+ "epoch": 2.1240517626059794,
903
+ "grad_norm": 2.9942588806152344,
904
+ "learning_rate": 8.157891756512488e-05,
905
+ "loss": 0.6612,
906
+ "step": 595
907
+ },
908
+ {
909
+ "epoch": 2.14190093708166,
910
+ "grad_norm": 2.8771650791168213,
911
+ "learning_rate": 8.117449009293668e-05,
912
+ "loss": 0.5783,
913
+ "step": 600
914
+ },
915
+ {
916
+ "epoch": 2.1597501115573405,
917
+ "grad_norm": 3.1111013889312744,
918
+ "learning_rate": 8.076669803481965e-05,
919
+ "loss": 0.5799,
920
+ "step": 605
921
+ },
922
+ {
923
+ "epoch": 2.177599286033021,
924
+ "grad_norm": 3.715027093887329,
925
+ "learning_rate": 8.035558540276618e-05,
926
+ "loss": 0.5344,
927
+ "step": 610
928
+ },
929
+ {
930
+ "epoch": 2.1954484605087012,
931
+ "grad_norm": 2.936890125274658,
932
+ "learning_rate": 7.994119656715002e-05,
933
+ "loss": 0.5605,
934
+ "step": 615
935
+ },
936
+ {
937
+ "epoch": 2.213297634984382,
938
+ "grad_norm": 2.79441499710083,
939
+ "learning_rate": 7.952357625193749e-05,
940
+ "loss": 0.5923,
941
+ "step": 620
942
+ },
943
+ {
944
+ "epoch": 2.2311468094600624,
945
+ "grad_norm": 3.444474697113037,
946
+ "learning_rate": 7.91027695298606e-05,
947
+ "loss": 0.6067,
948
+ "step": 625
949
+ },
950
+ {
951
+ "epoch": 2.248995983935743,
952
+ "grad_norm": 3.034071445465088,
953
+ "learning_rate": 7.86788218175523e-05,
954
+ "loss": 0.6134,
955
+ "step": 630
956
+ },
957
+ {
958
+ "epoch": 2.248995983935743,
959
+ "eval_loss": 1.4945974349975586,
960
+ "eval_runtime": 17.7423,
961
+ "eval_samples_per_second": 2.593,
962
+ "eval_steps_per_second": 2.593,
963
+ "step": 630
964
+ },
965
+ {
966
+ "epoch": 2.2668451584114235,
967
+ "grad_norm": 3.0743188858032227,
968
+ "learning_rate": 7.8251778870645e-05,
969
+ "loss": 0.5798,
970
+ "step": 635
971
+ },
972
+ {
973
+ "epoch": 2.284694332887104,
974
+ "grad_norm": 3.250493049621582,
975
+ "learning_rate": 7.782168677883206e-05,
976
+ "loss": 0.5705,
977
+ "step": 640
978
+ },
979
+ {
980
+ "epoch": 2.3025435073627847,
981
+ "grad_norm": 2.4863390922546387,
982
+ "learning_rate": 7.738859196089358e-05,
983
+ "loss": 0.6119,
984
+ "step": 645
985
+ },
986
+ {
987
+ "epoch": 2.320392681838465,
988
+ "grad_norm": 3.1027884483337402,
989
+ "learning_rate": 7.695254115968648e-05,
990
+ "loss": 0.6352,
991
+ "step": 650
992
+ },
993
+ {
994
+ "epoch": 2.3382418563141454,
995
+ "grad_norm": 2.840583562850952,
996
+ "learning_rate": 7.651358143709972e-05,
997
+ "loss": 0.6341,
998
+ "step": 655
999
+ },
1000
+ {
1001
+ "epoch": 2.356091030789826,
1002
+ "grad_norm": 3.057770252227783,
1003
+ "learning_rate": 7.60717601689749e-05,
1004
+ "loss": 0.6695,
1005
+ "step": 660
1006
+ },
1007
+ {
1008
+ "epoch": 2.3739402052655065,
1009
+ "grad_norm": 3.563372850418091,
1010
+ "learning_rate": 7.562712503999327e-05,
1011
+ "loss": 0.5715,
1012
+ "step": 665
1013
+ },
1014
+ {
1015
+ "epoch": 2.391789379741187,
1016
+ "grad_norm": 3.2286486625671387,
1017
+ "learning_rate": 7.517972403852905e-05,
1018
+ "loss": 0.7753,
1019
+ "step": 670
1020
+ },
1021
+ {
1022
+ "epoch": 2.4096385542168672,
1023
+ "grad_norm": 2.9088051319122314,
1024
+ "learning_rate": 7.472960545147038e-05,
1025
+ "loss": 0.5529,
1026
+ "step": 675
1027
+ },
1028
+ {
1029
+ "epoch": 2.427487728692548,
1030
+ "grad_norm": 2.9432833194732666,
1031
+ "learning_rate": 7.427681785900761e-05,
1032
+ "loss": 0.5715,
1033
+ "step": 680
1034
+ },
1035
+ {
1036
+ "epoch": 2.4453369031682284,
1037
+ "grad_norm": 2.483222723007202,
1038
+ "learning_rate": 7.382141012939034e-05,
1039
+ "loss": 0.6085,
1040
+ "step": 685
1041
+ },
1042
+ {
1043
+ "epoch": 2.463186077643909,
1044
+ "grad_norm": 2.9013617038726807,
1045
+ "learning_rate": 7.33634314136531e-05,
1046
+ "loss": 0.627,
1047
+ "step": 690
1048
+ },
1049
+ {
1050
+ "epoch": 2.4810352521195895,
1051
+ "grad_norm": 2.746309995651245,
1052
+ "learning_rate": 7.290293114031061e-05,
1053
+ "loss": 0.6403,
1054
+ "step": 695
1055
+ },
1056
+ {
1057
+ "epoch": 2.49888442659527,
1058
+ "grad_norm": 2.8350794315338135,
+ "learning_rate": 7.243995901002312e-05,
+ "loss": 0.6342,
+ "step": 700
+ },
+ {
+ "epoch": 2.49888442659527,
+ "eval_loss": 1.4858874082565308,
+ "eval_runtime": 17.7385,
+ "eval_samples_per_second": 2.593,
+ "eval_steps_per_second": 2.593,
+ "step": 700
+ },
+ {
+ "epoch": 2.5167336010709507,
+ "grad_norm": 3.006899833679199,
+ "learning_rate": 7.197456499023225e-05,
+ "loss": 0.5921,
+ "step": 705
+ },
+ {
+ "epoch": 2.534582775546631,
+ "grad_norm": 2.9739573001861572,
+ "learning_rate": 7.150679930976825e-05,
+ "loss": 0.5873,
+ "step": 710
+ },
+ {
+ "epoch": 2.5524319500223114,
+ "grad_norm": 3.7028846740722656,
+ "learning_rate": 7.103671245342887e-05,
+ "loss": 0.6661,
+ "step": 715
+ },
+ {
+ "epoch": 2.570281124497992,
+ "grad_norm": 3.090599775314331,
+ "learning_rate": 7.056435515653059e-05,
+ "loss": 0.5388,
+ "step": 720
+ },
+ {
+ "epoch": 2.5881302989736725,
+ "grad_norm": 2.799252986907959,
+ "learning_rate": 7.008977839943299e-05,
+ "loss": 0.6641,
+ "step": 725
+ },
+ {
+ "epoch": 2.605979473449353,
+ "grad_norm": 2.8093032836914062,
+ "learning_rate": 6.961303340203653e-05,
+ "loss": 0.6221,
+ "step": 730
+ },
+ {
+ "epoch": 2.6238286479250332,
+ "grad_norm": 3.6351985931396484,
+ "learning_rate": 6.91341716182545e-05,
+ "loss": 0.599,
+ "step": 735
+ },
+ {
+ "epoch": 2.641677822400714,
+ "grad_norm": 2.6190829277038574,
+ "learning_rate": 6.86532447304597e-05,
+ "loss": 0.6047,
+ "step": 740
+ },
+ {
+ "epoch": 2.6595269968763944,
+ "grad_norm": 3.227262020111084,
+ "learning_rate": 6.817030464390656e-05,
+ "loss": 0.614,
+ "step": 745
+ },
+ {
+ "epoch": 2.677376171352075,
+ "grad_norm": 2.5810439586639404,
+ "learning_rate": 6.768540348112907e-05,
+ "loss": 0.6367,
+ "step": 750
+ },
+ {
+ "epoch": 2.6952253458277555,
+ "grad_norm": 3.030888557434082,
+ "learning_rate": 6.719859357631535e-05,
+ "loss": 0.5681,
+ "step": 755
+ },
+ {
+ "epoch": 2.713074520303436,
+ "grad_norm": 3.1176657676696777,
+ "learning_rate": 6.670992746965938e-05,
+ "loss": 0.5723,
+ "step": 760
+ },
+ {
+ "epoch": 2.7309236947791167,
+ "grad_norm": 3.0151100158691406,
+ "learning_rate": 6.621945790169036e-05,
+ "loss": 0.6385,
+ "step": 765
+ },
+ {
+ "epoch": 2.748772869254797,
+ "grad_norm": 3.4799766540527344,
+ "learning_rate": 6.572723780758069e-05,
+ "loss": 0.6665,
+ "step": 770
+ },
+ {
+ "epoch": 2.748772869254797,
+ "eval_loss": 1.5236101150512695,
+ "eval_runtime": 17.7462,
+ "eval_samples_per_second": 2.592,
+ "eval_steps_per_second": 2.592,
+ "step": 770
+ },
+ {
+ "epoch": 2.7666220437304774,
+ "grad_norm": 3.1448163986206055,
+ "learning_rate": 6.523332031143272e-05,
+ "loss": 0.6083,
+ "step": 775
+ },
+ {
+ "epoch": 2.784471218206158,
+ "grad_norm": 2.874833106994629,
+ "learning_rate": 6.473775872054521e-05,
+ "loss": 0.6493,
+ "step": 780
+ },
+ {
+ "epoch": 2.8023203926818385,
+ "grad_norm": 3.2550127506256104,
+ "learning_rate": 6.424060651966007e-05,
+ "loss": 0.5722,
+ "step": 785
+ },
+ {
+ "epoch": 2.820169567157519,
+ "grad_norm": 3.066908121109009,
+ "learning_rate": 6.374191736518974e-05,
+ "loss": 0.611,
+ "step": 790
+ },
+ {
+ "epoch": 2.8380187416331992,
+ "grad_norm": 3.05871319770813,
+ "learning_rate": 6.324174507942637e-05,
+ "loss": 0.6202,
+ "step": 795
+ },
+ {
+ "epoch": 2.85586791610888,
+ "grad_norm": 3.2599833011627197,
+ "learning_rate": 6.274014364473274e-05,
+ "loss": 0.5593,
+ "step": 800
+ },
+ {
+ "epoch": 2.8737170905845604,
+ "grad_norm": 2.897418260574341,
+ "learning_rate": 6.22371671977162e-05,
+ "loss": 0.7415,
+ "step": 805
+ },
+ {
+ "epoch": 2.891566265060241,
+ "grad_norm": 3.032317876815796,
+ "learning_rate": 6.173287002338577e-05,
+ "loss": 0.6544,
+ "step": 810
+ },
+ {
+ "epoch": 2.9094154395359215,
+ "grad_norm": 2.7111008167266846,
+ "learning_rate": 6.122730654929334e-05,
+ "loss": 0.6421,
+ "step": 815
+ },
+ {
+ "epoch": 2.927264614011602,
+ "grad_norm": 2.7735886573791504,
+ "learning_rate": 6.072053133965938e-05,
+ "loss": 0.6332,
+ "step": 820
+ },
+ {
+ "epoch": 2.9451137884872827,
+ "grad_norm": 3.4417500495910645,
+ "learning_rate": 6.021259908948402e-05,
+ "loss": 0.6508,
+ "step": 825
+ },
+ {
+ "epoch": 2.962962962962963,
+ "grad_norm": 3.432999849319458,
+ "learning_rate": 5.970356461864391e-05,
+ "loss": 0.621,
+ "step": 830
+ },
+ {
+ "epoch": 2.9808121374386434,
+ "grad_norm": 3.470132827758789,
+ "learning_rate": 5.919348286597569e-05,
+ "loss": 0.6347,
+ "step": 835
+ },
+ {
+ "epoch": 2.998661311914324,
+ "grad_norm": 3.153116226196289,
+ "learning_rate": 5.868240888334653e-05,
+ "loss": 0.6101,
+ "step": 840
+ },
+ {
+ "epoch": 2.998661311914324,
+ "eval_loss": 1.5220016241073608,
+ "eval_runtime": 17.7399,
+ "eval_samples_per_second": 2.593,
+ "eval_steps_per_second": 2.593,
+ "step": 840
+ },
+ {
+ "epoch": 3.0165104863900045,
+ "grad_norm": 2.5395278930664062,
+ "learning_rate": 5.8170397829712485e-05,
+ "loss": 0.4183,
+ "step": 845
+ },
+ {
+ "epoch": 3.034359660865685,
+ "grad_norm": 2.833970308303833,
+ "learning_rate": 5.765750496516547e-05,
+ "loss": 0.1667,
+ "step": 850
+ },
+ {
+ "epoch": 3.0522088353413657,
+ "grad_norm": 3.447057008743286,
+ "learning_rate": 5.714378564496901e-05,
+ "loss": 0.255,
+ "step": 855
+ },
+ {
+ "epoch": 3.070058009817046,
+ "grad_norm": 3.9993224143981934,
+ "learning_rate": 5.6629295313583974e-05,
+ "loss": 0.2424,
+ "step": 860
+ },
+ {
+ "epoch": 3.0879071842927264,
+ "grad_norm": 3.626281499862671,
+ "learning_rate": 5.611408949868457e-05,
+ "loss": 0.2097,
+ "step": 865
+ },
+ {
+ "epoch": 3.105756358768407,
+ "grad_norm": 2.693284034729004,
+ "learning_rate": 5.559822380516539e-05,
+ "loss": 0.2271,
+ "step": 870
+ },
+ {
+ "epoch": 3.1236055332440875,
+ "grad_norm": 2.439389705657959,
+ "learning_rate": 5.5081753909140096e-05,
+ "loss": 0.1982,
+ "step": 875
+ },
+ {
+ "epoch": 3.141454707719768,
+ "grad_norm": 2.6163575649261475,
+ "learning_rate": 5.456473555193242e-05,
+ "loss": 0.2192,
+ "step": 880
+ },
+ {
+ "epoch": 3.1593038821954487,
+ "grad_norm": 2.405829668045044,
+ "learning_rate": 5.404722453406017e-05,
+ "loss": 0.2097,
+ "step": 885
+ },
+ {
+ "epoch": 3.177153056671129,
+ "grad_norm": 2.819413423538208,
+ "learning_rate": 5.3529276709212816e-05,
+ "loss": 0.2213,
+ "step": 890
+ },
+ {
+ "epoch": 3.1950022311468094,
+ "grad_norm": 3.6370203495025635,
+ "learning_rate": 5.30109479782233e-05,
+ "loss": 0.2559,
+ "step": 895
+ },
+ {
+ "epoch": 3.21285140562249,
+ "grad_norm": 3.4090726375579834,
+ "learning_rate": 5.249229428303486e-05,
+ "loss": 0.1955,
+ "step": 900
+ },
+ {
+ "epoch": 3.2307005800981705,
+ "grad_norm": 2.8171908855438232,
+ "learning_rate": 5.197337160066331e-05,
+ "loss": 0.2642,
+ "step": 905
+ },
+ {
+ "epoch": 3.248549754573851,
+ "grad_norm": 3.926447629928589,
+ "learning_rate": 5.145423593715557e-05,
+ "loss": 0.2467,
+ "step": 910
+ },
+ {
+ "epoch": 3.248549754573851,
+ "eval_loss": 1.8390079736709595,
+ "eval_runtime": 17.7348,
+ "eval_samples_per_second": 2.594,
+ "eval_steps_per_second": 2.594,
+ "step": 910
+ },
+ {
+ "epoch": 3.266398929049531,
+ "grad_norm": 2.7143030166625977,
+ "learning_rate": 5.0934943321545115e-05,
+ "loss": 0.2239,
+ "step": 915
+ },
+ {
+ "epoch": 3.284248103525212,
+ "grad_norm": 2.717496871948242,
+ "learning_rate": 5.041554979980486e-05,
+ "loss": 0.1545,
+ "step": 920
+ },
+ {
+ "epoch": 3.3020972780008924,
+ "grad_norm": 3.516397714614868,
+ "learning_rate": 4.9896111428798254e-05,
+ "loss": 0.2819,
+ "step": 925
+ },
+ {
+ "epoch": 3.319946452476573,
+ "grad_norm": 3.3290677070617676,
+ "learning_rate": 4.9376684270229254e-05,
+ "loss": 0.3043,
+ "step": 930
+ },
+ {
+ "epoch": 3.3377956269522535,
+ "grad_norm": 2.914736032485962,
+ "learning_rate": 4.8857324384591653e-05,
+ "loss": 0.2494,
+ "step": 935
+ },
+ {
+ "epoch": 3.355644801427934,
+ "grad_norm": 3.37791109085083,
+ "learning_rate": 4.8338087825118675e-05,
+ "loss": 0.2271,
+ "step": 940
+ },
+ {
+ "epoch": 3.3734939759036147,
+ "grad_norm": 3.295100688934326,
+ "learning_rate": 4.781903063173321e-05,
+ "loss": 0.242,
+ "step": 945
+ },
+ {
+ "epoch": 3.391343150379295,
+ "grad_norm": 2.5792458057403564,
+ "learning_rate": 4.730020882499964e-05,
+ "loss": 0.2244,
+ "step": 950
+ },
+ {
+ "epoch": 3.4091923248549754,
+ "grad_norm": 3.0014591217041016,
+ "learning_rate": 4.678167840007767e-05,
+ "loss": 0.2552,
+ "step": 955
+ },
+ {
+ "epoch": 3.427041499330656,
+ "grad_norm": 3.207282066345215,
+ "learning_rate": 4.626349532067879e-05,
+ "loss": 0.2542,
+ "step": 960
+ },
+ {
+ "epoch": 3.4448906738063365,
+ "grad_norm": 3.85109543800354,
+ "learning_rate": 4.574571551302647e-05,
+ "loss": 0.3249,
+ "step": 965
+ },
+ {
+ "epoch": 3.462739848282017,
+ "grad_norm": 3.3335843086242676,
+ "learning_rate": 4.522839485981994e-05,
+ "loss": 0.2729,
+ "step": 970
+ },
+ {
+ "epoch": 3.480589022757697,
+ "grad_norm": 2.885708808898926,
+ "learning_rate": 4.471158919420312e-05,
+ "loss": 0.2595,
+ "step": 975
+ },
+ {
+ "epoch": 3.498438197233378,
+ "grad_norm": 3.215789556503296,
+ "learning_rate": 4.4195354293738484e-05,
+ "loss": 0.2284,
+ "step": 980
+ },
+ {
+ "epoch": 3.498438197233378,
+ "eval_loss": 1.82525634765625,
+ "eval_runtime": 17.7537,
+ "eval_samples_per_second": 2.591,
+ "eval_steps_per_second": 2.591,
+ "step": 980
+ },
+ {
+ "epoch": 3.5162873717090584,
+ "grad_norm": 3.4772818088531494,
+ "learning_rate": 4.367974587438733e-05,
+ "loss": 0.1947,
+ "step": 985
+ },
+ {
+ "epoch": 3.534136546184739,
+ "grad_norm": 2.6401774883270264,
+ "learning_rate": 4.316481958449634e-05,
+ "loss": 0.2352,
+ "step": 990
+ },
+ {
+ "epoch": 3.5519857206604195,
+ "grad_norm": 3.997591733932495,
+ "learning_rate": 4.2650630998791615e-05,
+ "loss": 0.2047,
+ "step": 995
+ },
+ {
+ "epoch": 3.5698348951361,
+ "grad_norm": 2.5615384578704834,
+ "learning_rate": 4.213723561238074e-05,
+ "loss": 0.2369,
+ "step": 1000
+ },
+ {
+ "epoch": 3.5876840696117807,
+ "grad_norm": 2.5114736557006836,
+ "learning_rate": 4.162468883476319e-05,
+ "loss": 0.2416,
+ "step": 1005
+ },
+ {
+ "epoch": 3.605533244087461,
+ "grad_norm": 4.23993444442749,
+ "learning_rate": 4.111304598385018e-05,
+ "loss": 0.2353,
+ "step": 1010
+ },
+ {
+ "epoch": 3.6233824185631414,
+ "grad_norm": 3.239319324493408,
+ "learning_rate": 4.060236227999441e-05,
+ "loss": 0.2155,
+ "step": 1015
+ },
+ {
+ "epoch": 3.641231593038822,
+ "grad_norm": 2.030393600463867,
+ "learning_rate": 4.0092692840030134e-05,
+ "loss": 0.2241,
+ "step": 1020
+ },
+ {
+ "epoch": 3.6590807675145025,
+ "grad_norm": 3.636963367462158,
+ "learning_rate": 3.9584092671324606e-05,
+ "loss": 0.2408,
+ "step": 1025
+ },
+ {
+ "epoch": 3.676929941990183,
+ "grad_norm": 4.295063495635986,
+ "learning_rate": 3.907661666584131e-05,
+ "loss": 0.2423,
+ "step": 1030
+ },
+ {
+ "epoch": 3.694779116465863,
+ "grad_norm": 3.268596887588501,
+ "learning_rate": 3.857031959421553e-05,
+ "loss": 0.2581,
+ "step": 1035
+ },
+ {
+ "epoch": 3.7126282909415442,
+ "grad_norm": 3.0428457260131836,
+ "learning_rate": 3.806525609984312e-05,
+ "loss": 0.206,
+ "step": 1040
+ },
+ {
+ "epoch": 3.7304774654172244,
+ "grad_norm": 3.523777484893799,
+ "learning_rate": 3.7561480692983006e-05,
+ "loss": 0.1956,
+ "step": 1045
+ },
+ {
+ "epoch": 3.748326639892905,
+ "grad_norm": 2.972714900970459,
+ "learning_rate": 3.705904774487396e-05,
+ "loss": 0.2839,
+ "step": 1050
+ },
+ {
+ "epoch": 3.748326639892905,
+ "eval_loss": 1.8687995672225952,
+ "eval_runtime": 17.732,
+ "eval_samples_per_second": 2.594,
+ "eval_steps_per_second": 2.594,
+ "step": 1050
+ },
+ {
+ "epoch": 3.7661758143685855,
+ "grad_norm": 3.9769251346588135,
+ "learning_rate": 3.655801148186655e-05,
+ "loss": 0.2433,
+ "step": 1055
+ },
+ {
+ "epoch": 3.784024988844266,
+ "grad_norm": 3.03606915473938,
+ "learning_rate": 3.6058425979570485e-05,
+ "loss": 0.2085,
+ "step": 1060
+ },
+ {
+ "epoch": 3.8018741633199467,
+ "grad_norm": 3.5858893394470215,
+ "learning_rate": 3.556034515701852e-05,
+ "loss": 0.2277,
+ "step": 1065
+ },
+ {
+ "epoch": 3.819723337795627,
+ "grad_norm": 2.5949602127075195,
+ "learning_rate": 3.506382277084696e-05,
+ "loss": 0.2497,
+ "step": 1070
+ },
+ {
+ "epoch": 3.8375725122713074,
+ "grad_norm": 2.8706088066101074,
+ "learning_rate": 3.4568912409493945e-05,
+ "loss": 0.2462,
+ "step": 1075
+ },
+ {
+ "epoch": 3.855421686746988,
+ "grad_norm": 3.238346576690674,
+ "learning_rate": 3.4075667487415785e-05,
+ "loss": 0.2004,
+ "step": 1080
+ },
+ {
+ "epoch": 3.8732708612226685,
+ "grad_norm": 3.36478590965271,
+ "learning_rate": 3.358414123932195e-05,
+ "loss": 0.226,
+ "step": 1085
+ },
+ {
+ "epoch": 3.891120035698349,
+ "grad_norm": 3.0954155921936035,
+ "learning_rate": 3.3094386714429724e-05,
+ "loss": 0.2114,
+ "step": 1090
+ },
+ {
+ "epoch": 3.908969210174029,
+ "grad_norm": 3.016141891479492,
+ "learning_rate": 3.2606456770738636e-05,
+ "loss": 0.2694,
+ "step": 1095
+ },
+ {
+ "epoch": 3.9268183846497102,
+ "grad_norm": 2.976658821105957,
+ "learning_rate": 3.212040406932569e-05,
+ "loss": 0.1828,
+ "step": 1100
+ },
+ {
+ "epoch": 3.9446675591253904,
+ "grad_norm": 2.8186426162719727,
+ "learning_rate": 3.163628106866172e-05,
+ "loss": 0.1451,
+ "step": 1105
+ },
+ {
+ "epoch": 3.962516733601071,
+ "grad_norm": 2.959024429321289,
+ "learning_rate": 3.115414001894974e-05,
+ "loss": 0.2349,
+ "step": 1110
+ },
+ {
+ "epoch": 3.9803659080767515,
+ "grad_norm": 2.9852728843688965,
+ "learning_rate": 3.067403295648566e-05,
+ "loss": 0.2235,
+ "step": 1115
+ },
+ {
+ "epoch": 3.998215082552432,
+ "grad_norm": 2.79172945022583,
+ "learning_rate": 3.019601169804216e-05,
+ "loss": 0.2111,
+ "step": 1120
+ },
+ {
+ "epoch": 3.998215082552432,
+ "eval_loss": 1.891045093536377,
+ "eval_runtime": 17.7382,
+ "eval_samples_per_second": 2.593,
+ "eval_steps_per_second": 2.593,
+ "step": 1120
+ },
+ {
+ "epoch": 4.016064257028113,
+ "grad_norm": 1.1968103647232056,
+ "learning_rate": 2.9720127835276256e-05,
+ "loss": 0.1074,
+ "step": 1125
+ },
+ {
+ "epoch": 4.033913431503793,
+ "grad_norm": 1.4865480661392212,
+ "learning_rate": 2.9246432729161055e-05,
+ "loss": 0.0628,
+ "step": 1130
+ },
+ {
+ "epoch": 4.051762605979474,
+ "grad_norm": 2.913541078567505,
+ "learning_rate": 2.8774977504442647e-05,
+ "loss": 0.0615,
+ "step": 1135
+ },
+ {
+ "epoch": 4.069611780455154,
+ "grad_norm": 2.1043801307678223,
+ "learning_rate": 2.8305813044122097e-05,
+ "loss": 0.0658,
+ "step": 1140
+ },
+ {
+ "epoch": 4.087460954930834,
+ "grad_norm": 1.942076325416565,
+ "learning_rate": 2.7838989983964065e-05,
+ "loss": 0.0458,
+ "step": 1145
+ },
+ {
+ "epoch": 4.105310129406515,
+ "grad_norm": 2.3953213691711426,
+ "learning_rate": 2.737455870703155e-05,
+ "loss": 0.0877,
+ "step": 1150
+ },
+ {
+ "epoch": 4.123159303882195,
+ "grad_norm": 1.9993913173675537,
+ "learning_rate": 2.6912569338248315e-05,
+ "loss": 0.0567,
+ "step": 1155
+ },
+ {
+ "epoch": 4.141008478357876,
+ "grad_norm": 2.4731192588806152,
+ "learning_rate": 2.645307173898901e-05,
+ "loss": 0.0817,
+ "step": 1160
+ },
+ {
+ "epoch": 4.158857652833556,
+ "grad_norm": 2.3913474082946777,
+ "learning_rate": 2.5996115501697694e-05,
+ "loss": 0.0517,
+ "step": 1165
+ },
+ {
+ "epoch": 4.176706827309237,
+ "grad_norm": 4.154366493225098,
+ "learning_rate": 2.5541749944535554e-05,
+ "loss": 0.0649,
+ "step": 1170
+ },
+ {
+ "epoch": 4.1945560017849175,
+ "grad_norm": 1.4376811981201172,
+ "learning_rate": 2.5090024106057962e-05,
+ "loss": 0.0613,
+ "step": 1175
+ },
+ {
+ "epoch": 4.212405176260598,
+ "grad_norm": 2.038010835647583,
+ "learning_rate": 2.464098673992205e-05,
+ "loss": 0.0763,
+ "step": 1180
+ },
+ {
+ "epoch": 4.230254350736279,
+ "grad_norm": 1.862741470336914,
+ "learning_rate": 2.4194686309624663e-05,
+ "loss": 0.0733,
+ "step": 1185
+ },
+ {
+ "epoch": 4.248103525211959,
+ "grad_norm": 2.7354800701141357,
+ "learning_rate": 2.3751170983272e-05,
+ "loss": 0.0753,
+ "step": 1190
+ },
+ {
+ "epoch": 4.248103525211959,
+ "eval_loss": 2.2224178314208984,
+ "eval_runtime": 17.7489,
+ "eval_samples_per_second": 2.592,
+ "eval_steps_per_second": 2.592,
+ "step": 1190
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 1680,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 70,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.2526229849269862e+18,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+ }
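Worth noting from the log above: training loss keeps falling across epochs (roughly 0.6 in epoch 2 down to below 0.1 in epoch 4) while eval_loss rises from about 1.49 at step 700 to about 2.22 at step 1190, a classic overfitting pattern. A minimal sketch for extracting that eval curve from a checkpoint's trainer_state.json, assuming the standard transformers Trainer layout in which these records sit under the "log_history" key:

```python
import json

# Illustrative path; every checkpoint-* directory in this commit carries this file.
path = "llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/trainer_state.json"

with open(path) as f:
    state = json.load(f)

# Trainer interleaves train and eval records in "log_history";
# eval records are the ones that carry an "eval_loss" key.
evals = [(rec["step"], rec["eval_loss"])
         for rec in state["log_history"] if "eval_loss" in rec]

for step, loss in evals:
    print(f"step {step:>5}: eval_loss {loss:.4f}")
```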
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b061b26ebc79da396fea201dbc3aded12f572b2061bb961d9cec13867ed1c18f
+ size 5368
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1190/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1260/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ base_model: Qwen/Qwen2-72B-Instruct
+ library_name: peft
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.11.1
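Since the model card's "How to Get Started" section is still a stub, here is a minimal, untested loading sketch assembled from what this commit does confirm: the base model is Qwen/Qwen2-72B-Instruct and the checkpoint directory holds a PEFT LoRA adapter (the card pins PEFT 0.11.1). The adapter path is simply the directory from this commit; `device_map="auto"` is an assumption, and a 72B base model will in practice need multiple GPUs or offloading.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "Qwen/Qwen2-72B-Instruct"  # from adapter_config.json below
adapter = "llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1260"  # this checkpoint

tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(
    base_id, device_map="auto", torch_dtype="auto"
)
model = PeftModel.from_pretrained(model, adapter)  # attach the LoRA weights

# Qwen2-Instruct is a chat model, so format the prompt with the chat template.
messages = [{"role": "user", "content": "Hello!"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
outputs = model.generate(inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```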
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1260/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen2-72B-Instruct",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.0,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "up_proj",
+ "gate_proj",
+ "v_proj",
+ "q_proj",
+ "down_proj",
+ "o_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+ }
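For reference, the adapter_config.json above amounts to a rank-8 LoRA with alpha 16 and no dropout on every attention and MLP projection. A reconstruction of the equivalent `peft.LoraConfig` (not code from this repo, just the same fields spelled out for anyone reproducing the setup):

```python
from peft import LoraConfig

# Field values copied from adapter_config.json above.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
        "gate_proj", "up_proj", "down_proj",     # MLP projections
    ],
)
```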
llama-factory/saves/Qwen2-72B-Instruct/checkpoint-1260/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7200021ca104f600d101aef74dc134de2b16a8f030fb7ea6b6deaef2ce249e68
+ size 421218912