TokenBender committed on
Commit
973ffd8
1 Parent(s): c27fe32

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +61 -0
  2. adapter_config.json +31 -0
  3. adapter_model.safetensors +3 -0
  4. added_tokens.json +4 -0
  5. checkpoint-1386/README.md +202 -0
  6. checkpoint-1386/adapter_config.json +31 -0
  7. checkpoint-1386/adapter_model.safetensors +3 -0
  8. checkpoint-1386/added_tokens.json +4 -0
  9. checkpoint-1386/merges.txt +0 -0
  10. checkpoint-1386/optimizer.pt +3 -0
  11. checkpoint-1386/rng_state.pth +3 -0
  12. checkpoint-1386/scheduler.pt +3 -0
  13. checkpoint-1386/special_tokens_map.json +28 -0
  14. checkpoint-1386/tokenizer.json +0 -0
  15. checkpoint-1386/tokenizer_config.json +338 -0
  16. checkpoint-1386/trainer_state.json +0 -0
  17. checkpoint-1386/training_args.bin +3 -0
  18. checkpoint-1386/vocab.json +0 -0
  19. checkpoint-462/README.md +202 -0
  20. checkpoint-462/adapter_config.json +31 -0
  21. checkpoint-462/adapter_model.safetensors +3 -0
  22. checkpoint-462/added_tokens.json +4 -0
  23. checkpoint-462/merges.txt +0 -0
  24. checkpoint-462/optimizer.pt +3 -0
  25. checkpoint-462/rng_state.pth +3 -0
  26. checkpoint-462/scheduler.pt +3 -0
  27. checkpoint-462/special_tokens_map.json +28 -0
  28. checkpoint-462/tokenizer.json +0 -0
  29. checkpoint-462/tokenizer_config.json +338 -0
  30. checkpoint-462/trainer_state.json +3255 -0
  31. checkpoint-462/training_args.bin +3 -0
  32. checkpoint-462/vocab.json +0 -0
  33. checkpoint-924/README.md +202 -0
  34. checkpoint-924/adapter_config.json +31 -0
  35. checkpoint-924/adapter_model.safetensors +3 -0
  36. checkpoint-924/added_tokens.json +4 -0
  37. checkpoint-924/merges.txt +0 -0
  38. checkpoint-924/optimizer.pt +3 -0
  39. checkpoint-924/rng_state.pth +3 -0
  40. checkpoint-924/scheduler.pt +3 -0
  41. checkpoint-924/special_tokens_map.json +28 -0
  42. checkpoint-924/tokenizer.json +0 -0
  43. checkpoint-924/tokenizer_config.json +338 -0
  44. checkpoint-924/trainer_state.json +0 -0
  45. checkpoint-924/training_args.bin +3 -0
  46. checkpoint-924/vocab.json +0 -0
  47. merges.txt +0 -0
  48. special_tokens_map.json +28 -0
  49. tokenizer.json +0 -0
  50. tokenizer_config.json +338 -0
README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: bigcode-openrail-m
3
+ library_name: peft
4
+ tags:
5
+ - trl
6
+ - sft
7
+ - generated_from_trainer
8
+ base_model: bigcode/starcoder2-7b
9
+ datasets:
10
+ - generator
11
+ model-index:
12
+ - name: starcoder2_15B_codefeedback
13
+ results: []
14
+ ---
15
+
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
+ # starcoder2_15B_codefeedback
20
+
21
+ This model is a fine-tuned version of [bigcode/starcoder2-7b](https://huggingface.co/bigcode/starcoder2-7b) on the generator dataset.
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 2e-05
41
+ - train_batch_size: 16
42
+ - eval_batch_size: 8
43
+ - seed: 42
44
+ - gradient_accumulation_steps: 2
45
+ - total_train_batch_size: 32
46
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
+ - lr_scheduler_type: cosine
48
+ - lr_scheduler_warmup_ratio: 0.03
49
+ - num_epochs: 1
50
+
51
+ ### Training results
52
+
53
+
54
+
55
+ ### Framework versions
56
+
57
+ - PEFT 0.7.1
58
+ - Transformers 4.39.0.dev0
59
+ - Pytorch 2.2.0
60
+ - Datasets 2.16.1
61
+ - Tokenizers 0.15.2
adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "bigcode/starcoder2-7b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.05,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 32,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "q_proj",
23
+ "o_proj",
24
+ "up_proj",
25
+ "down_proj",
26
+ "v_proj",
27
+ "gate_proj",
28
+ "k_proj"
29
+ ],
30
+ "task_type": "CAUSAL_LM"
31
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb3aebb538254009f40575b5b13b58eee1ddc7c5c149b1422531c9c10ddfc595
3
+ size 58754872
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<|im_end|>": 49153,
3
+ "<|im_start|>": 49152
4
+ }
checkpoint-1386/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: bigcode/starcoder2-7b
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.7.1
checkpoint-1386/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "bigcode/starcoder2-7b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.05,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 32,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "q_proj",
23
+ "o_proj",
24
+ "up_proj",
25
+ "down_proj",
26
+ "v_proj",
27
+ "gate_proj",
28
+ "k_proj"
29
+ ],
30
+ "task_type": "CAUSAL_LM"
31
+ }
checkpoint-1386/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99cba23e772563448e379b89fc406bfade485345a77cc2a2023227fcf55f4f9f
3
+ size 58754872
checkpoint-1386/added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<|im_end|>": 49153,
3
+ "<|im_start|>": 49152
4
+ }
checkpoint-1386/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1386/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74660045c4e4288dfd044594b505b24a287b53a6208d9688b8eae20f96e1419b
3
+ size 117659642
checkpoint-1386/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:338708309e0a78710fd3f2fc6b8c345a1f1f18b0c7090d2f2199e4f2d12ad207
3
+ size 14244
checkpoint-1386/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bb6d991e5a5c6abc18487a917136e4d40a9ab7b7072b0d65e22efab061943a6
3
+ size 1064
checkpoint-1386/special_tokens_map.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|im_start|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|im_end|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": "<|im_start|>",
19
+ "eos_token": "<|im_end|>",
20
+ "pad_token": "<|im_end|>",
21
+ "unk_token": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ }
28
+ }
checkpoint-1386/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1386/tokenizer_config.json ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<fim_prefix>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<fim_middle>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<fim_suffix>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<fim_pad>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<repo_name>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<file_sep>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<issue_start>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_comment>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_closed>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<jupyter_start>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_text>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_code>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_output>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_script>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<empty_output>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<code_to_intermediate>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "17": {
141
+ "content": "<intermediate_to_code>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "18": {
149
+ "content": "<pr>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "19": {
157
+ "content": "<pr_status>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "20": {
165
+ "content": "<pr_is_merged>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "21": {
173
+ "content": "<pr_base>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "22": {
181
+ "content": "<pr_file>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "23": {
189
+ "content": "<pr_base_code>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "24": {
197
+ "content": "<pr_diff>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "25": {
205
+ "content": "<pr_diff_hunk>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "26": {
213
+ "content": "<pr_comment>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "27": {
221
+ "content": "<pr_event_id>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "28": {
229
+ "content": "<pr_review>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "29": {
237
+ "content": "<pr_review_state>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "30": {
245
+ "content": "<pr_review_comment>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "31": {
253
+ "content": "<pr_in_reply_to_review_id>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "32": {
261
+ "content": "<pr_in_reply_to_comment_id>",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "33": {
269
+ "content": "<pr_diff_hunk_comment_line>",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "34": {
277
+ "content": "<NAME>",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "35": {
285
+ "content": "<EMAIL>",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "36": {
293
+ "content": "<KEY>",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "37": {
301
+ "content": "<PASSWORD>",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "49152": {
309
+ "content": "<|im_start|>",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "49153": {
317
+ "content": "<|im_end|>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ }
324
+ },
325
+ "additional_special_tokens": [
326
+ "<|im_start|>",
327
+ "<|im_end|>"
328
+ ],
329
+ "bos_token": "<|im_start|>",
330
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
331
+ "clean_up_tokenization_spaces": true,
332
+ "eos_token": "<|im_end|>",
333
+ "model_max_length": 1000000000000000019884624838656,
334
+ "pad_token": "<|im_end|>",
335
+ "tokenizer_class": "GPT2Tokenizer",
336
+ "unk_token": "<|endoftext|>",
337
+ "vocab_size": 49152
338
+ }
checkpoint-1386/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1386/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eafc7d0eb7f055ddd6fbba193c9843badab904036b02ccac0ca4e09c80561ff
3
+ size 4920
checkpoint-1386/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-462/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: bigcode/starcoder2-7b
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.7.1
checkpoint-462/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "bigcode/starcoder2-7b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.05,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 32,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "q_proj",
23
+ "o_proj",
24
+ "up_proj",
25
+ "down_proj",
26
+ "v_proj",
27
+ "gate_proj",
28
+ "k_proj"
29
+ ],
30
+ "task_type": "CAUSAL_LM"
31
+ }
checkpoint-462/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:201203d8d97734f78d70b3817563572ef89eb3b0e4405916d8ccfe04129319fd
3
+ size 58754872
checkpoint-462/added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<|im_end|>": 49153,
3
+ "<|im_start|>": 49152
4
+ }
checkpoint-462/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-462/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7158209076c41e69f6b886800804e2cdf2f2eb47c3155b57d8c7801a5525374
3
+ size 117659642
checkpoint-462/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be4806c65ff8d6e5ec5727649c945584857c52aecdd4b84c4725d411effd940f
3
+ size 14244
checkpoint-462/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89260cbaba7b81ebbf8a6fe57d787fcfc2dead8a8891d1c6700a773847c47ad7
3
+ size 1064
checkpoint-462/special_tokens_map.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|im_start|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|im_end|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": "<|im_start|>",
19
+ "eos_token": "<|im_end|>",
20
+ "pad_token": "<|im_end|>",
21
+ "unk_token": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ }
28
+ }
checkpoint-462/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-462/tokenizer_config.json ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<fim_prefix>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<fim_middle>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<fim_suffix>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<fim_pad>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<repo_name>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<file_sep>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<issue_start>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_comment>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_closed>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<jupyter_start>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_text>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_code>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_output>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_script>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<empty_output>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<code_to_intermediate>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "17": {
141
+ "content": "<intermediate_to_code>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "18": {
149
+ "content": "<pr>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "19": {
157
+ "content": "<pr_status>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "20": {
165
+ "content": "<pr_is_merged>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "21": {
173
+ "content": "<pr_base>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "22": {
181
+ "content": "<pr_file>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "23": {
189
+ "content": "<pr_base_code>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "24": {
197
+ "content": "<pr_diff>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "25": {
205
+ "content": "<pr_diff_hunk>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "26": {
213
+ "content": "<pr_comment>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "27": {
221
+ "content": "<pr_event_id>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "28": {
229
+ "content": "<pr_review>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "29": {
237
+ "content": "<pr_review_state>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "30": {
245
+ "content": "<pr_review_comment>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "31": {
253
+ "content": "<pr_in_reply_to_review_id>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "32": {
261
+ "content": "<pr_in_reply_to_comment_id>",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "33": {
269
+ "content": "<pr_diff_hunk_comment_line>",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "34": {
277
+ "content": "<NAME>",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "35": {
285
+ "content": "<EMAIL>",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "36": {
293
+ "content": "<KEY>",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "37": {
301
+ "content": "<PASSWORD>",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "49152": {
309
+ "content": "<|im_start|>",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "49153": {
317
+ "content": "<|im_end|>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ }
324
+ },
325
+ "additional_special_tokens": [
326
+ "<|im_start|>",
327
+ "<|im_end|>"
328
+ ],
329
+ "bos_token": "<|im_start|>",
330
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
331
+ "clean_up_tokenization_spaces": true,
332
+ "eos_token": "<|im_end|>",
333
+ "model_max_length": 1000000000000000019884624838656,
334
+ "pad_token": "<|im_end|>",
335
+ "tokenizer_class": "GPT2Tokenizer",
336
+ "unk_token": "<|endoftext|>",
337
+ "vocab_size": 49152
338
+ }
checkpoint-462/trainer_state.json ADDED
@@ -0,0 +1,3255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.30039011703511054,
5
+ "eval_steps": 500,
6
+ "global_step": 462,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "grad_norm": 0.0303955078125,
14
+ "learning_rate": 4.2553191489361704e-07,
15
+ "loss": 1.11,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.0,
20
+ "grad_norm": 0.0294189453125,
21
+ "learning_rate": 8.510638297872341e-07,
22
+ "loss": 0.9825,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.0,
27
+ "grad_norm": 0.028076171875,
28
+ "learning_rate": 1.276595744680851e-06,
29
+ "loss": 1.0375,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.0,
34
+ "grad_norm": 0.036865234375,
35
+ "learning_rate": 1.7021276595744682e-06,
36
+ "loss": 1.042,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.0,
41
+ "grad_norm": 0.0311279296875,
42
+ "learning_rate": 2.1276595744680853e-06,
43
+ "loss": 0.9769,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.0,
48
+ "grad_norm": 0.036865234375,
49
+ "learning_rate": 2.553191489361702e-06,
50
+ "loss": 0.9316,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.0,
55
+ "grad_norm": 0.0299072265625,
56
+ "learning_rate": 2.978723404255319e-06,
57
+ "loss": 1.0077,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.01,
62
+ "grad_norm": 0.03369140625,
63
+ "learning_rate": 3.4042553191489363e-06,
64
+ "loss": 1.0346,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.01,
69
+ "grad_norm": 0.0301513671875,
70
+ "learning_rate": 3.8297872340425535e-06,
71
+ "loss": 1.0193,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.01,
76
+ "grad_norm": 0.0267333984375,
77
+ "learning_rate": 4.255319148936171e-06,
78
+ "loss": 1.0297,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.01,
83
+ "grad_norm": 0.0291748046875,
84
+ "learning_rate": 4.680851063829788e-06,
85
+ "loss": 1.0868,
86
+ "step": 11
87
+ },
88
+ {
89
+ "epoch": 0.01,
90
+ "grad_norm": 0.0296630859375,
91
+ "learning_rate": 5.106382978723404e-06,
92
+ "loss": 1.0641,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 0.01,
97
+ "grad_norm": 0.028564453125,
98
+ "learning_rate": 5.531914893617022e-06,
99
+ "loss": 0.9389,
100
+ "step": 13
101
+ },
102
+ {
103
+ "epoch": 0.01,
104
+ "grad_norm": 0.03515625,
105
+ "learning_rate": 5.957446808510638e-06,
106
+ "loss": 1.033,
107
+ "step": 14
108
+ },
109
+ {
110
+ "epoch": 0.01,
111
+ "grad_norm": 0.0283203125,
112
+ "learning_rate": 6.382978723404256e-06,
113
+ "loss": 1.0216,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.01,
118
+ "grad_norm": 0.0322265625,
119
+ "learning_rate": 6.808510638297873e-06,
120
+ "loss": 1.1086,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.01,
125
+ "grad_norm": 0.0322265625,
126
+ "learning_rate": 7.234042553191491e-06,
127
+ "loss": 1.0577,
128
+ "step": 17
129
+ },
130
+ {
131
+ "epoch": 0.01,
132
+ "grad_norm": 0.030517578125,
133
+ "learning_rate": 7.659574468085107e-06,
134
+ "loss": 1.0733,
135
+ "step": 18
136
+ },
137
+ {
138
+ "epoch": 0.01,
139
+ "grad_norm": 0.0303955078125,
140
+ "learning_rate": 8.085106382978723e-06,
141
+ "loss": 0.9865,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 0.01,
146
+ "grad_norm": 0.0291748046875,
147
+ "learning_rate": 8.510638297872341e-06,
148
+ "loss": 1.0125,
149
+ "step": 20
150
+ },
151
+ {
152
+ "epoch": 0.01,
153
+ "grad_norm": 0.034423828125,
154
+ "learning_rate": 8.936170212765958e-06,
155
+ "loss": 1.1245,
156
+ "step": 21
157
+ },
158
+ {
159
+ "epoch": 0.01,
160
+ "grad_norm": 0.0260009765625,
161
+ "learning_rate": 9.361702127659576e-06,
162
+ "loss": 1.0024,
163
+ "step": 22
164
+ },
165
+ {
166
+ "epoch": 0.01,
167
+ "grad_norm": 0.0306396484375,
168
+ "learning_rate": 9.787234042553192e-06,
169
+ "loss": 1.0131,
170
+ "step": 23
171
+ },
172
+ {
173
+ "epoch": 0.02,
174
+ "grad_norm": 0.033447265625,
175
+ "learning_rate": 1.0212765957446808e-05,
176
+ "loss": 1.0171,
177
+ "step": 24
178
+ },
179
+ {
180
+ "epoch": 0.02,
181
+ "grad_norm": 0.033203125,
182
+ "learning_rate": 1.0638297872340426e-05,
183
+ "loss": 0.9613,
184
+ "step": 25
185
+ },
186
+ {
187
+ "epoch": 0.02,
188
+ "grad_norm": 0.03857421875,
189
+ "learning_rate": 1.1063829787234044e-05,
190
+ "loss": 1.1312,
191
+ "step": 26
192
+ },
193
+ {
194
+ "epoch": 0.02,
195
+ "grad_norm": 0.036865234375,
196
+ "learning_rate": 1.1489361702127662e-05,
197
+ "loss": 1.0187,
198
+ "step": 27
199
+ },
200
+ {
201
+ "epoch": 0.02,
202
+ "grad_norm": 0.03515625,
203
+ "learning_rate": 1.1914893617021277e-05,
204
+ "loss": 0.9934,
205
+ "step": 28
206
+ },
207
+ {
208
+ "epoch": 0.02,
209
+ "grad_norm": 0.036376953125,
210
+ "learning_rate": 1.2340425531914895e-05,
211
+ "loss": 1.0872,
212
+ "step": 29
213
+ },
214
+ {
215
+ "epoch": 0.02,
216
+ "grad_norm": 0.035888671875,
217
+ "learning_rate": 1.2765957446808513e-05,
218
+ "loss": 0.9591,
219
+ "step": 30
220
+ },
221
+ {
222
+ "epoch": 0.02,
223
+ "grad_norm": 0.033203125,
224
+ "learning_rate": 1.3191489361702127e-05,
225
+ "loss": 0.9589,
226
+ "step": 31
227
+ },
228
+ {
229
+ "epoch": 0.02,
230
+ "grad_norm": 0.040771484375,
231
+ "learning_rate": 1.3617021276595745e-05,
232
+ "loss": 1.0093,
233
+ "step": 32
234
+ },
235
+ {
236
+ "epoch": 0.02,
237
+ "grad_norm": 0.0400390625,
238
+ "learning_rate": 1.4042553191489363e-05,
239
+ "loss": 1.0195,
240
+ "step": 33
241
+ },
242
+ {
243
+ "epoch": 0.02,
244
+ "grad_norm": 0.038330078125,
245
+ "learning_rate": 1.4468085106382981e-05,
246
+ "loss": 0.8936,
247
+ "step": 34
248
+ },
249
+ {
250
+ "epoch": 0.02,
251
+ "grad_norm": 0.04443359375,
252
+ "learning_rate": 1.4893617021276596e-05,
253
+ "loss": 0.9958,
254
+ "step": 35
255
+ },
256
+ {
257
+ "epoch": 0.02,
258
+ "grad_norm": 0.045654296875,
259
+ "learning_rate": 1.5319148936170214e-05,
260
+ "loss": 0.9279,
261
+ "step": 36
262
+ },
263
+ {
264
+ "epoch": 0.02,
265
+ "grad_norm": 0.03662109375,
266
+ "learning_rate": 1.5744680851063832e-05,
267
+ "loss": 1.0153,
268
+ "step": 37
269
+ },
270
+ {
271
+ "epoch": 0.02,
272
+ "grad_norm": 0.04638671875,
273
+ "learning_rate": 1.6170212765957446e-05,
274
+ "loss": 0.9862,
275
+ "step": 38
276
+ },
277
+ {
278
+ "epoch": 0.03,
279
+ "grad_norm": 0.042236328125,
280
+ "learning_rate": 1.6595744680851064e-05,
281
+ "loss": 1.0962,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 0.03,
286
+ "grad_norm": 0.040771484375,
287
+ "learning_rate": 1.7021276595744682e-05,
288
+ "loss": 0.956,
289
+ "step": 40
290
+ },
291
+ {
292
+ "epoch": 0.03,
293
+ "grad_norm": 0.035888671875,
294
+ "learning_rate": 1.74468085106383e-05,
295
+ "loss": 1.0559,
296
+ "step": 41
297
+ },
298
+ {
299
+ "epoch": 0.03,
300
+ "grad_norm": 0.04736328125,
301
+ "learning_rate": 1.7872340425531915e-05,
302
+ "loss": 1.0014,
303
+ "step": 42
304
+ },
305
+ {
306
+ "epoch": 0.03,
307
+ "grad_norm": 0.051025390625,
308
+ "learning_rate": 1.8297872340425533e-05,
309
+ "loss": 1.0252,
310
+ "step": 43
311
+ },
312
+ {
313
+ "epoch": 0.03,
314
+ "grad_norm": 0.04931640625,
315
+ "learning_rate": 1.872340425531915e-05,
316
+ "loss": 0.9541,
317
+ "step": 44
318
+ },
319
+ {
320
+ "epoch": 0.03,
321
+ "grad_norm": 0.0556640625,
322
+ "learning_rate": 1.914893617021277e-05,
323
+ "loss": 0.9603,
324
+ "step": 45
325
+ },
326
+ {
327
+ "epoch": 0.03,
328
+ "grad_norm": 0.0419921875,
329
+ "learning_rate": 1.9574468085106384e-05,
330
+ "loss": 1.0601,
331
+ "step": 46
332
+ },
333
+ {
334
+ "epoch": 0.03,
335
+ "grad_norm": 0.0478515625,
336
+ "learning_rate": 2e-05,
337
+ "loss": 0.9919,
338
+ "step": 47
339
+ },
340
+ {
341
+ "epoch": 0.03,
342
+ "grad_norm": 0.047119140625,
343
+ "learning_rate": 1.9999977801976743e-05,
344
+ "loss": 1.0247,
345
+ "step": 48
346
+ },
347
+ {
348
+ "epoch": 0.03,
349
+ "grad_norm": 0.048095703125,
350
+ "learning_rate": 1.999991120800551e-05,
351
+ "loss": 0.9936,
352
+ "step": 49
353
+ },
354
+ {
355
+ "epoch": 0.03,
356
+ "grad_norm": 0.05419921875,
357
+ "learning_rate": 1.9999800218381958e-05,
358
+ "loss": 1.0315,
359
+ "step": 50
360
+ },
361
+ {
362
+ "epoch": 0.03,
363
+ "grad_norm": 0.0478515625,
364
+ "learning_rate": 1.9999644833598836e-05,
365
+ "loss": 0.9392,
366
+ "step": 51
367
+ },
368
+ {
369
+ "epoch": 0.03,
370
+ "grad_norm": 0.0546875,
371
+ "learning_rate": 1.9999445054345993e-05,
372
+ "loss": 1.0716,
373
+ "step": 52
374
+ },
375
+ {
376
+ "epoch": 0.03,
377
+ "grad_norm": 0.05224609375,
378
+ "learning_rate": 1.9999200881510366e-05,
379
+ "loss": 0.9724,
380
+ "step": 53
381
+ },
382
+ {
383
+ "epoch": 0.04,
384
+ "grad_norm": 0.04736328125,
385
+ "learning_rate": 1.999891231617599e-05,
386
+ "loss": 0.9966,
387
+ "step": 54
388
+ },
389
+ {
390
+ "epoch": 0.04,
391
+ "grad_norm": 0.049072265625,
392
+ "learning_rate": 1.9998579359623977e-05,
393
+ "loss": 0.969,
394
+ "step": 55
395
+ },
396
+ {
397
+ "epoch": 0.04,
398
+ "grad_norm": 0.051513671875,
399
+ "learning_rate": 1.9998202013332525e-05,
400
+ "loss": 0.972,
401
+ "step": 56
402
+ },
403
+ {
404
+ "epoch": 0.04,
405
+ "grad_norm": 0.043701171875,
406
+ "learning_rate": 1.99977802789769e-05,
407
+ "loss": 0.9705,
408
+ "step": 57
409
+ },
410
+ {
411
+ "epoch": 0.04,
412
+ "grad_norm": 0.044189453125,
413
+ "learning_rate": 1.999731415842944e-05,
414
+ "loss": 1.002,
415
+ "step": 58
416
+ },
417
+ {
418
+ "epoch": 0.04,
419
+ "grad_norm": 0.039794921875,
420
+ "learning_rate": 1.9996803653759534e-05,
421
+ "loss": 0.9508,
422
+ "step": 59
423
+ },
424
+ {
425
+ "epoch": 0.04,
426
+ "grad_norm": 0.03759765625,
427
+ "learning_rate": 1.9996248767233616e-05,
428
+ "loss": 0.9232,
429
+ "step": 60
430
+ },
431
+ {
432
+ "epoch": 0.04,
433
+ "grad_norm": 0.0390625,
434
+ "learning_rate": 1.9995649501315172e-05,
435
+ "loss": 1.0054,
436
+ "step": 61
437
+ },
438
+ {
439
+ "epoch": 0.04,
440
+ "grad_norm": 0.034423828125,
441
+ "learning_rate": 1.9995005858664696e-05,
442
+ "loss": 0.9685,
443
+ "step": 62
444
+ },
445
+ {
446
+ "epoch": 0.04,
447
+ "grad_norm": 0.03369140625,
448
+ "learning_rate": 1.9994317842139715e-05,
449
+ "loss": 0.9313,
450
+ "step": 63
451
+ },
452
+ {
453
+ "epoch": 0.04,
454
+ "grad_norm": 0.0311279296875,
455
+ "learning_rate": 1.9993585454794748e-05,
456
+ "loss": 0.9463,
457
+ "step": 64
458
+ },
459
+ {
460
+ "epoch": 0.04,
461
+ "grad_norm": 0.0311279296875,
462
+ "learning_rate": 1.9992808699881303e-05,
463
+ "loss": 0.9049,
464
+ "step": 65
465
+ },
466
+ {
467
+ "epoch": 0.04,
468
+ "grad_norm": 0.0322265625,
469
+ "learning_rate": 1.999198758084787e-05,
470
+ "loss": 0.9088,
471
+ "step": 66
472
+ },
473
+ {
474
+ "epoch": 0.04,
475
+ "grad_norm": 0.033203125,
476
+ "learning_rate": 1.9991122101339885e-05,
477
+ "loss": 0.9369,
478
+ "step": 67
479
+ },
480
+ {
481
+ "epoch": 0.04,
482
+ "grad_norm": 0.0478515625,
483
+ "learning_rate": 1.9990212265199738e-05,
484
+ "loss": 0.9902,
485
+ "step": 68
486
+ },
487
+ {
488
+ "epoch": 0.04,
489
+ "grad_norm": 0.03466796875,
490
+ "learning_rate": 1.9989258076466743e-05,
491
+ "loss": 0.9569,
492
+ "step": 69
493
+ },
494
+ {
495
+ "epoch": 0.05,
496
+ "grad_norm": 0.042724609375,
497
+ "learning_rate": 1.998825953937712e-05,
498
+ "loss": 0.9779,
499
+ "step": 70
500
+ },
501
+ {
502
+ "epoch": 0.05,
503
+ "grad_norm": 0.0380859375,
504
+ "learning_rate": 1.9987216658363983e-05,
505
+ "loss": 0.9505,
506
+ "step": 71
507
+ },
508
+ {
509
+ "epoch": 0.05,
510
+ "grad_norm": 0.036376953125,
511
+ "learning_rate": 1.9986129438057306e-05,
512
+ "loss": 0.9374,
513
+ "step": 72
514
+ },
515
+ {
516
+ "epoch": 0.05,
517
+ "grad_norm": 0.0361328125,
518
+ "learning_rate": 1.998499788328392e-05,
519
+ "loss": 1.0086,
520
+ "step": 73
521
+ },
522
+ {
523
+ "epoch": 0.05,
524
+ "grad_norm": 0.034912109375,
525
+ "learning_rate": 1.9983821999067478e-05,
526
+ "loss": 1.046,
527
+ "step": 74
528
+ },
529
+ {
530
+ "epoch": 0.05,
531
+ "grad_norm": 0.031005859375,
532
+ "learning_rate": 1.998260179062844e-05,
533
+ "loss": 0.9375,
534
+ "step": 75
535
+ },
536
+ {
537
+ "epoch": 0.05,
538
+ "grad_norm": 0.032958984375,
539
+ "learning_rate": 1.9981337263384057e-05,
540
+ "loss": 0.9514,
541
+ "step": 76
542
+ },
543
+ {
544
+ "epoch": 0.05,
545
+ "grad_norm": 0.031982421875,
546
+ "learning_rate": 1.9980028422948323e-05,
547
+ "loss": 0.8629,
548
+ "step": 77
549
+ },
550
+ {
551
+ "epoch": 0.05,
552
+ "grad_norm": 0.03125,
553
+ "learning_rate": 1.9978675275131975e-05,
554
+ "loss": 0.933,
555
+ "step": 78
556
+ },
557
+ {
558
+ "epoch": 0.05,
559
+ "grad_norm": 0.0299072265625,
560
+ "learning_rate": 1.9977277825942453e-05,
561
+ "loss": 0.9408,
562
+ "step": 79
563
+ },
564
+ {
565
+ "epoch": 0.05,
566
+ "grad_norm": 0.031494140625,
567
+ "learning_rate": 1.997583608158388e-05,
568
+ "loss": 1.0041,
569
+ "step": 80
570
+ },
571
+ {
572
+ "epoch": 0.05,
573
+ "grad_norm": 0.031982421875,
574
+ "learning_rate": 1.997435004845703e-05,
575
+ "loss": 0.9605,
576
+ "step": 81
577
+ },
578
+ {
579
+ "epoch": 0.05,
580
+ "grad_norm": 0.03271484375,
581
+ "learning_rate": 1.99728197331593e-05,
582
+ "loss": 0.9256,
583
+ "step": 82
584
+ },
585
+ {
586
+ "epoch": 0.05,
587
+ "grad_norm": 0.034912109375,
588
+ "learning_rate": 1.9971245142484693e-05,
589
+ "loss": 1.0026,
590
+ "step": 83
591
+ },
592
+ {
593
+ "epoch": 0.05,
594
+ "grad_norm": 0.031494140625,
595
+ "learning_rate": 1.996962628342376e-05,
596
+ "loss": 0.9789,
597
+ "step": 84
598
+ },
599
+ {
600
+ "epoch": 0.06,
601
+ "grad_norm": 0.0341796875,
602
+ "learning_rate": 1.99679631631636e-05,
603
+ "loss": 0.9437,
604
+ "step": 85
605
+ },
606
+ {
607
+ "epoch": 0.06,
608
+ "grad_norm": 0.030517578125,
609
+ "learning_rate": 1.996625578908781e-05,
610
+ "loss": 0.9487,
611
+ "step": 86
612
+ },
613
+ {
614
+ "epoch": 0.06,
615
+ "grad_norm": 0.033447265625,
616
+ "learning_rate": 1.9964504168776454e-05,
617
+ "loss": 0.9645,
618
+ "step": 87
619
+ },
620
+ {
621
+ "epoch": 0.06,
622
+ "grad_norm": 0.03271484375,
623
+ "learning_rate": 1.9962708310006032e-05,
624
+ "loss": 0.9967,
625
+ "step": 88
626
+ },
627
+ {
628
+ "epoch": 0.06,
629
+ "grad_norm": 0.0296630859375,
630
+ "learning_rate": 1.996086822074945e-05,
631
+ "loss": 1.0195,
632
+ "step": 89
633
+ },
634
+ {
635
+ "epoch": 0.06,
636
+ "grad_norm": 0.030517578125,
637
+ "learning_rate": 1.9958983909175977e-05,
638
+ "loss": 0.8769,
639
+ "step": 90
640
+ },
641
+ {
642
+ "epoch": 0.06,
643
+ "grad_norm": 0.031494140625,
644
+ "learning_rate": 1.995705538365121e-05,
645
+ "loss": 0.8407,
646
+ "step": 91
647
+ },
648
+ {
649
+ "epoch": 0.06,
650
+ "grad_norm": 0.033203125,
651
+ "learning_rate": 1.995508265273704e-05,
652
+ "loss": 0.9368,
653
+ "step": 92
654
+ },
655
+ {
656
+ "epoch": 0.06,
657
+ "grad_norm": 0.031982421875,
658
+ "learning_rate": 1.9953065725191613e-05,
659
+ "loss": 0.9308,
660
+ "step": 93
661
+ },
662
+ {
663
+ "epoch": 0.06,
664
+ "grad_norm": 0.03076171875,
665
+ "learning_rate": 1.9951004609969286e-05,
666
+ "loss": 0.9235,
667
+ "step": 94
668
+ },
669
+ {
670
+ "epoch": 0.06,
671
+ "grad_norm": 0.032958984375,
672
+ "learning_rate": 1.9948899316220603e-05,
673
+ "loss": 0.9008,
674
+ "step": 95
675
+ },
676
+ {
677
+ "epoch": 0.06,
678
+ "grad_norm": 0.03173828125,
679
+ "learning_rate": 1.9946749853292233e-05,
680
+ "loss": 0.9735,
681
+ "step": 96
682
+ },
683
+ {
684
+ "epoch": 0.06,
685
+ "grad_norm": 0.033447265625,
686
+ "learning_rate": 1.994455623072694e-05,
687
+ "loss": 0.9328,
688
+ "step": 97
689
+ },
690
+ {
691
+ "epoch": 0.06,
692
+ "grad_norm": 0.033203125,
693
+ "learning_rate": 1.994231845826354e-05,
694
+ "loss": 0.8967,
695
+ "step": 98
696
+ },
697
+ {
698
+ "epoch": 0.06,
699
+ "grad_norm": 0.03173828125,
700
+ "learning_rate": 1.994003654583686e-05,
701
+ "loss": 0.8363,
702
+ "step": 99
703
+ },
704
+ {
705
+ "epoch": 0.07,
706
+ "grad_norm": 0.033447265625,
707
+ "learning_rate": 1.993771050357769e-05,
708
+ "loss": 0.9072,
709
+ "step": 100
710
+ },
711
+ {
712
+ "epoch": 0.07,
713
+ "grad_norm": 0.03369140625,
714
+ "learning_rate": 1.9935340341812737e-05,
715
+ "loss": 0.9502,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 0.07,
720
+ "grad_norm": 0.03271484375,
721
+ "learning_rate": 1.993292607106458e-05,
722
+ "loss": 0.8794,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 0.07,
727
+ "grad_norm": 0.03466796875,
728
+ "learning_rate": 1.9930467702051632e-05,
729
+ "loss": 0.9601,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 0.07,
734
+ "grad_norm": 0.0341796875,
735
+ "learning_rate": 1.9927965245688073e-05,
736
+ "loss": 0.9099,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 0.07,
741
+ "grad_norm": 0.033935546875,
742
+ "learning_rate": 1.9925418713083824e-05,
743
+ "loss": 0.929,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 0.07,
748
+ "grad_norm": 0.033447265625,
749
+ "learning_rate": 1.992282811554448e-05,
750
+ "loss": 0.9046,
751
+ "step": 106
752
+ },
753
+ {
754
+ "epoch": 0.07,
755
+ "grad_norm": 0.031005859375,
756
+ "learning_rate": 1.9920193464571277e-05,
757
+ "loss": 0.9393,
758
+ "step": 107
759
+ },
760
+ {
761
+ "epoch": 0.07,
762
+ "grad_norm": 0.03515625,
763
+ "learning_rate": 1.9917514771861015e-05,
764
+ "loss": 0.9933,
765
+ "step": 108
766
+ },
767
+ {
768
+ "epoch": 0.07,
769
+ "grad_norm": 0.035400390625,
770
+ "learning_rate": 1.9914792049306034e-05,
771
+ "loss": 0.8865,
772
+ "step": 109
773
+ },
774
+ {
775
+ "epoch": 0.07,
776
+ "grad_norm": 0.032958984375,
777
+ "learning_rate": 1.9912025308994146e-05,
778
+ "loss": 0.9158,
779
+ "step": 110
780
+ },
781
+ {
782
+ "epoch": 0.07,
783
+ "grad_norm": 0.035888671875,
784
+ "learning_rate": 1.990921456320859e-05,
785
+ "loss": 0.9143,
786
+ "step": 111
787
+ },
788
+ {
789
+ "epoch": 0.07,
790
+ "grad_norm": 0.03369140625,
791
+ "learning_rate": 1.9906359824427953e-05,
792
+ "loss": 0.9707,
793
+ "step": 112
794
+ },
795
+ {
796
+ "epoch": 0.07,
797
+ "grad_norm": 0.0341796875,
798
+ "learning_rate": 1.9903461105326155e-05,
799
+ "loss": 0.8894,
800
+ "step": 113
801
+ },
802
+ {
803
+ "epoch": 0.07,
804
+ "grad_norm": 0.0341796875,
805
+ "learning_rate": 1.9900518418772364e-05,
806
+ "loss": 0.966,
807
+ "step": 114
808
+ },
809
+ {
810
+ "epoch": 0.07,
811
+ "grad_norm": 0.035888671875,
812
+ "learning_rate": 1.989753177783094e-05,
813
+ "loss": 0.9201,
814
+ "step": 115
815
+ },
816
+ {
817
+ "epoch": 0.08,
818
+ "grad_norm": 0.0341796875,
819
+ "learning_rate": 1.9894501195761393e-05,
820
+ "loss": 0.9299,
821
+ "step": 116
822
+ },
823
+ {
824
+ "epoch": 0.08,
825
+ "grad_norm": 0.03369140625,
826
+ "learning_rate": 1.9891426686018308e-05,
827
+ "loss": 0.8812,
828
+ "step": 117
829
+ },
830
+ {
831
+ "epoch": 0.08,
832
+ "grad_norm": 0.035888671875,
833
+ "learning_rate": 1.9888308262251286e-05,
834
+ "loss": 0.9995,
835
+ "step": 118
836
+ },
837
+ {
838
+ "epoch": 0.08,
839
+ "grad_norm": 0.031982421875,
840
+ "learning_rate": 1.9885145938304905e-05,
841
+ "loss": 0.8804,
842
+ "step": 119
843
+ },
844
+ {
845
+ "epoch": 0.08,
846
+ "grad_norm": 0.03955078125,
847
+ "learning_rate": 1.988193972821863e-05,
848
+ "loss": 0.9021,
849
+ "step": 120
850
+ },
851
+ {
852
+ "epoch": 0.08,
853
+ "grad_norm": 0.039306640625,
854
+ "learning_rate": 1.987868964622676e-05,
855
+ "loss": 0.8066,
856
+ "step": 121
857
+ },
858
+ {
859
+ "epoch": 0.08,
860
+ "grad_norm": 0.0361328125,
861
+ "learning_rate": 1.9875395706758388e-05,
862
+ "loss": 0.909,
863
+ "step": 122
864
+ },
865
+ {
866
+ "epoch": 0.08,
867
+ "grad_norm": 0.037109375,
868
+ "learning_rate": 1.987205792443729e-05,
869
+ "loss": 0.8611,
870
+ "step": 123
871
+ },
872
+ {
873
+ "epoch": 0.08,
874
+ "grad_norm": 0.037841796875,
875
+ "learning_rate": 1.9868676314081907e-05,
876
+ "loss": 0.9249,
877
+ "step": 124
878
+ },
879
+ {
880
+ "epoch": 0.08,
881
+ "grad_norm": 0.0322265625,
882
+ "learning_rate": 1.986525089070525e-05,
883
+ "loss": 0.837,
884
+ "step": 125
885
+ },
886
+ {
887
+ "epoch": 0.08,
888
+ "grad_norm": 0.03515625,
889
+ "learning_rate": 1.986178166951484e-05,
890
+ "loss": 0.8653,
891
+ "step": 126
892
+ },
893
+ {
894
+ "epoch": 0.08,
895
+ "grad_norm": 0.03662109375,
896
+ "learning_rate": 1.9858268665912653e-05,
897
+ "loss": 0.9011,
898
+ "step": 127
899
+ },
900
+ {
901
+ "epoch": 0.08,
902
+ "grad_norm": 0.035888671875,
903
+ "learning_rate": 1.9854711895495034e-05,
904
+ "loss": 0.9942,
905
+ "step": 128
906
+ },
907
+ {
908
+ "epoch": 0.08,
909
+ "grad_norm": 0.032958984375,
910
+ "learning_rate": 1.985111137405264e-05,
911
+ "loss": 0.9303,
912
+ "step": 129
913
+ },
914
+ {
915
+ "epoch": 0.08,
916
+ "grad_norm": 0.03369140625,
917
+ "learning_rate": 1.9847467117570364e-05,
918
+ "loss": 0.9206,
919
+ "step": 130
920
+ },
921
+ {
922
+ "epoch": 0.09,
923
+ "grad_norm": 0.033935546875,
924
+ "learning_rate": 1.9843779142227258e-05,
925
+ "loss": 0.8366,
926
+ "step": 131
927
+ },
928
+ {
929
+ "epoch": 0.09,
930
+ "grad_norm": 0.0380859375,
931
+ "learning_rate": 1.9840047464396477e-05,
932
+ "loss": 0.8988,
933
+ "step": 132
934
+ },
935
+ {
936
+ "epoch": 0.09,
937
+ "grad_norm": 0.06005859375,
938
+ "learning_rate": 1.98362721006452e-05,
939
+ "loss": 0.9719,
940
+ "step": 133
941
+ },
942
+ {
943
+ "epoch": 0.09,
944
+ "grad_norm": 0.03466796875,
945
+ "learning_rate": 1.983245306773454e-05,
946
+ "loss": 0.9629,
947
+ "step": 134
948
+ },
949
+ {
950
+ "epoch": 0.09,
951
+ "grad_norm": 0.0380859375,
952
+ "learning_rate": 1.98285903826195e-05,
953
+ "loss": 0.8384,
954
+ "step": 135
955
+ },
956
+ {
957
+ "epoch": 0.09,
958
+ "grad_norm": 0.034423828125,
959
+ "learning_rate": 1.9824684062448876e-05,
960
+ "loss": 0.8031,
961
+ "step": 136
962
+ },
963
+ {
964
+ "epoch": 0.09,
965
+ "grad_norm": 0.036376953125,
966
+ "learning_rate": 1.982073412456518e-05,
967
+ "loss": 0.8623,
968
+ "step": 137
969
+ },
970
+ {
971
+ "epoch": 0.09,
972
+ "grad_norm": 0.033935546875,
973
+ "learning_rate": 1.981674058650458e-05,
974
+ "loss": 0.8357,
975
+ "step": 138
976
+ },
977
+ {
978
+ "epoch": 0.09,
979
+ "grad_norm": 0.036376953125,
980
+ "learning_rate": 1.98127034659968e-05,
981
+ "loss": 0.9306,
982
+ "step": 139
983
+ },
984
+ {
985
+ "epoch": 0.09,
986
+ "grad_norm": 0.03564453125,
987
+ "learning_rate": 1.9808622780965064e-05,
988
+ "loss": 0.9464,
989
+ "step": 140
990
+ },
991
+ {
992
+ "epoch": 0.09,
993
+ "grad_norm": 0.033935546875,
994
+ "learning_rate": 1.9804498549526e-05,
995
+ "loss": 0.9146,
996
+ "step": 141
997
+ },
998
+ {
999
+ "epoch": 0.09,
1000
+ "grad_norm": 0.034912109375,
1001
+ "learning_rate": 1.980033078998956e-05,
1002
+ "loss": 0.8999,
1003
+ "step": 142
1004
+ },
1005
+ {
1006
+ "epoch": 0.09,
1007
+ "grad_norm": 0.03564453125,
1008
+ "learning_rate": 1.9796119520858957e-05,
1009
+ "loss": 0.9932,
1010
+ "step": 143
1011
+ },
1012
+ {
1013
+ "epoch": 0.09,
1014
+ "grad_norm": 0.035888671875,
1015
+ "learning_rate": 1.9791864760830554e-05,
1016
+ "loss": 0.8976,
1017
+ "step": 144
1018
+ },
1019
+ {
1020
+ "epoch": 0.09,
1021
+ "grad_norm": 0.03369140625,
1022
+ "learning_rate": 1.9787566528793806e-05,
1023
+ "loss": 0.9024,
1024
+ "step": 145
1025
+ },
1026
+ {
1027
+ "epoch": 0.09,
1028
+ "grad_norm": 0.033447265625,
1029
+ "learning_rate": 1.9783224843831162e-05,
1030
+ "loss": 0.8262,
1031
+ "step": 146
1032
+ },
1033
+ {
1034
+ "epoch": 0.1,
1035
+ "grad_norm": 0.036376953125,
1036
+ "learning_rate": 1.977883972521799e-05,
1037
+ "loss": 0.9491,
1038
+ "step": 147
1039
+ },
1040
+ {
1041
+ "epoch": 0.1,
1042
+ "grad_norm": 0.0361328125,
1043
+ "learning_rate": 1.9774411192422486e-05,
1044
+ "loss": 0.9347,
1045
+ "step": 148
1046
+ },
1047
+ {
1048
+ "epoch": 0.1,
1049
+ "grad_norm": 0.0390625,
1050
+ "learning_rate": 1.9769939265105573e-05,
1051
+ "loss": 0.8401,
1052
+ "step": 149
1053
+ },
1054
+ {
1055
+ "epoch": 0.1,
1056
+ "grad_norm": 0.03466796875,
1057
+ "learning_rate": 1.976542396312085e-05,
1058
+ "loss": 0.8949,
1059
+ "step": 150
1060
+ },
1061
+ {
1062
+ "epoch": 0.1,
1063
+ "grad_norm": 0.03369140625,
1064
+ "learning_rate": 1.976086530651447e-05,
1065
+ "loss": 0.8675,
1066
+ "step": 151
1067
+ },
1068
+ {
1069
+ "epoch": 0.1,
1070
+ "grad_norm": 0.0322265625,
1071
+ "learning_rate": 1.975626331552507e-05,
1072
+ "loss": 0.8617,
1073
+ "step": 152
1074
+ },
1075
+ {
1076
+ "epoch": 0.1,
1077
+ "grad_norm": 0.034423828125,
1078
+ "learning_rate": 1.9751618010583665e-05,
1079
+ "loss": 0.8374,
1080
+ "step": 153
1081
+ },
1082
+ {
1083
+ "epoch": 0.1,
1084
+ "grad_norm": 0.036865234375,
1085
+ "learning_rate": 1.974692941231357e-05,
1086
+ "loss": 0.8396,
1087
+ "step": 154
1088
+ },
1089
+ {
1090
+ "epoch": 0.1,
1091
+ "grad_norm": 0.034912109375,
1092
+ "learning_rate": 1.974219754153032e-05,
1093
+ "loss": 0.9553,
1094
+ "step": 155
1095
+ },
1096
+ {
1097
+ "epoch": 0.1,
1098
+ "grad_norm": 0.0341796875,
1099
+ "learning_rate": 1.9737422419241538e-05,
1100
+ "loss": 0.8821,
1101
+ "step": 156
1102
+ },
1103
+ {
1104
+ "epoch": 0.1,
1105
+ "grad_norm": 0.03466796875,
1106
+ "learning_rate": 1.9732604066646882e-05,
1107
+ "loss": 0.8778,
1108
+ "step": 157
1109
+ },
1110
+ {
1111
+ "epoch": 0.1,
1112
+ "grad_norm": 0.03955078125,
1113
+ "learning_rate": 1.9727742505137936e-05,
1114
+ "loss": 0.8552,
1115
+ "step": 158
1116
+ },
1117
+ {
1118
+ "epoch": 0.1,
1119
+ "grad_norm": 0.03515625,
1120
+ "learning_rate": 1.9722837756298112e-05,
1121
+ "loss": 0.9358,
1122
+ "step": 159
1123
+ },
1124
+ {
1125
+ "epoch": 0.1,
1126
+ "grad_norm": 0.03466796875,
1127
+ "learning_rate": 1.9717889841902553e-05,
1128
+ "loss": 0.9171,
1129
+ "step": 160
1130
+ },
1131
+ {
1132
+ "epoch": 0.1,
1133
+ "grad_norm": 0.033447265625,
1134
+ "learning_rate": 1.971289878391804e-05,
1135
+ "loss": 0.8395,
1136
+ "step": 161
1137
+ },
1138
+ {
1139
+ "epoch": 0.11,
1140
+ "grad_norm": 0.035400390625,
1141
+ "learning_rate": 1.97078646045029e-05,
1142
+ "loss": 0.8955,
1143
+ "step": 162
1144
+ },
1145
+ {
1146
+ "epoch": 0.11,
1147
+ "grad_norm": 0.0419921875,
1148
+ "learning_rate": 1.9702787326006906e-05,
1149
+ "loss": 0.8192,
1150
+ "step": 163
1151
+ },
1152
+ {
1153
+ "epoch": 0.11,
1154
+ "grad_norm": 0.037353515625,
1155
+ "learning_rate": 1.9697666970971153e-05,
1156
+ "loss": 0.8264,
1157
+ "step": 164
1158
+ },
1159
+ {
1160
+ "epoch": 0.11,
1161
+ "grad_norm": 0.03857421875,
1162
+ "learning_rate": 1.9692503562128004e-05,
1163
+ "loss": 0.9093,
1164
+ "step": 165
1165
+ },
1166
+ {
1167
+ "epoch": 0.11,
1168
+ "grad_norm": 0.044189453125,
1169
+ "learning_rate": 1.9687297122400952e-05,
1170
+ "loss": 0.9446,
1171
+ "step": 166
1172
+ },
1173
+ {
1174
+ "epoch": 0.11,
1175
+ "grad_norm": 0.03466796875,
1176
+ "learning_rate": 1.9682047674904527e-05,
1177
+ "loss": 0.8802,
1178
+ "step": 167
1179
+ },
1180
+ {
1181
+ "epoch": 0.11,
1182
+ "grad_norm": 0.03564453125,
1183
+ "learning_rate": 1.9676755242944202e-05,
1184
+ "loss": 0.9152,
1185
+ "step": 168
1186
+ },
1187
+ {
1188
+ "epoch": 0.11,
1189
+ "grad_norm": 0.032958984375,
1190
+ "learning_rate": 1.9671419850016283e-05,
1191
+ "loss": 0.8396,
1192
+ "step": 169
1193
+ },
1194
+ {
1195
+ "epoch": 0.11,
1196
+ "grad_norm": 0.0341796875,
1197
+ "learning_rate": 1.9666041519807802e-05,
1198
+ "loss": 0.7976,
1199
+ "step": 170
1200
+ },
1201
+ {
1202
+ "epoch": 0.11,
1203
+ "grad_norm": 0.036376953125,
1204
+ "learning_rate": 1.966062027619643e-05,
1205
+ "loss": 0.8979,
1206
+ "step": 171
1207
+ },
1208
+ {
1209
+ "epoch": 0.11,
1210
+ "grad_norm": 0.032470703125,
1211
+ "learning_rate": 1.9655156143250328e-05,
1212
+ "loss": 0.8632,
1213
+ "step": 172
1214
+ },
1215
+ {
1216
+ "epoch": 0.11,
1217
+ "grad_norm": 0.0390625,
1218
+ "learning_rate": 1.96496491452281e-05,
1219
+ "loss": 0.9456,
1220
+ "step": 173
1221
+ },
1222
+ {
1223
+ "epoch": 0.11,
1224
+ "grad_norm": 0.0361328125,
1225
+ "learning_rate": 1.9644099306578636e-05,
1226
+ "loss": 0.837,
1227
+ "step": 174
1228
+ },
1229
+ {
1230
+ "epoch": 0.11,
1231
+ "grad_norm": 0.031982421875,
1232
+ "learning_rate": 1.9638506651941024e-05,
1233
+ "loss": 0.7911,
1234
+ "step": 175
1235
+ },
1236
+ {
1237
+ "epoch": 0.11,
1238
+ "grad_norm": 0.03564453125,
1239
+ "learning_rate": 1.963287120614444e-05,
1240
+ "loss": 0.8926,
1241
+ "step": 176
1242
+ },
1243
+ {
1244
+ "epoch": 0.12,
1245
+ "grad_norm": 0.0341796875,
1246
+ "learning_rate": 1.9627192994208038e-05,
1247
+ "loss": 0.8054,
1248
+ "step": 177
1249
+ },
1250
+ {
1251
+ "epoch": 0.12,
1252
+ "grad_norm": 0.034423828125,
1253
+ "learning_rate": 1.962147204134083e-05,
1254
+ "loss": 0.9226,
1255
+ "step": 178
1256
+ },
1257
+ {
1258
+ "epoch": 0.12,
1259
+ "grad_norm": 0.035400390625,
1260
+ "learning_rate": 1.9615708372941588e-05,
1261
+ "loss": 0.8987,
1262
+ "step": 179
1263
+ },
1264
+ {
1265
+ "epoch": 0.12,
1266
+ "grad_norm": 0.040283203125,
1267
+ "learning_rate": 1.960990201459872e-05,
1268
+ "loss": 0.8729,
1269
+ "step": 180
1270
+ },
1271
+ {
1272
+ "epoch": 0.12,
1273
+ "grad_norm": 0.039306640625,
1274
+ "learning_rate": 1.960405299209016e-05,
1275
+ "loss": 0.9454,
1276
+ "step": 181
1277
+ },
1278
+ {
1279
+ "epoch": 0.12,
1280
+ "grad_norm": 0.035888671875,
1281
+ "learning_rate": 1.9598161331383258e-05,
1282
+ "loss": 0.9157,
1283
+ "step": 182
1284
+ },
1285
+ {
1286
+ "epoch": 0.12,
1287
+ "grad_norm": 0.03857421875,
1288
+ "learning_rate": 1.9592227058634655e-05,
1289
+ "loss": 0.8724,
1290
+ "step": 183
1291
+ },
1292
+ {
1293
+ "epoch": 0.12,
1294
+ "grad_norm": 0.0361328125,
1295
+ "learning_rate": 1.958625020019018e-05,
1296
+ "loss": 0.8446,
1297
+ "step": 184
1298
+ },
1299
+ {
1300
+ "epoch": 0.12,
1301
+ "grad_norm": 0.0341796875,
1302
+ "learning_rate": 1.9580230782584722e-05,
1303
+ "loss": 0.8441,
1304
+ "step": 185
1305
+ },
1306
+ {
1307
+ "epoch": 0.12,
1308
+ "grad_norm": 0.037841796875,
1309
+ "learning_rate": 1.957416883254211e-05,
1310
+ "loss": 0.9078,
1311
+ "step": 186
1312
+ },
1313
+ {
1314
+ "epoch": 0.12,
1315
+ "grad_norm": 0.037109375,
1316
+ "learning_rate": 1.9568064376975013e-05,
1317
+ "loss": 0.9075,
1318
+ "step": 187
1319
+ },
1320
+ {
1321
+ "epoch": 0.12,
1322
+ "grad_norm": 0.036376953125,
1323
+ "learning_rate": 1.956191744298479e-05,
1324
+ "loss": 0.8932,
1325
+ "step": 188
1326
+ },
1327
+ {
1328
+ "epoch": 0.12,
1329
+ "grad_norm": 0.03466796875,
1330
+ "learning_rate": 1.955572805786141e-05,
1331
+ "loss": 0.8577,
1332
+ "step": 189
1333
+ },
1334
+ {
1335
+ "epoch": 0.12,
1336
+ "grad_norm": 0.03564453125,
1337
+ "learning_rate": 1.9549496249083288e-05,
1338
+ "loss": 0.8257,
1339
+ "step": 190
1340
+ },
1341
+ {
1342
+ "epoch": 0.12,
1343
+ "grad_norm": 0.03369140625,
1344
+ "learning_rate": 1.954322204431719e-05,
1345
+ "loss": 0.7848,
1346
+ "step": 191
1347
+ },
1348
+ {
1349
+ "epoch": 0.12,
1350
+ "grad_norm": 0.037353515625,
1351
+ "learning_rate": 1.953690547141811e-05,
1352
+ "loss": 0.8617,
1353
+ "step": 192
1354
+ },
1355
+ {
1356
+ "epoch": 0.13,
1357
+ "grad_norm": 0.034912109375,
1358
+ "learning_rate": 1.953054655842913e-05,
1359
+ "loss": 0.7992,
1360
+ "step": 193
1361
+ },
1362
+ {
1363
+ "epoch": 0.13,
1364
+ "grad_norm": 0.03515625,
1365
+ "learning_rate": 1.9524145333581315e-05,
1366
+ "loss": 0.8101,
1367
+ "step": 194
1368
+ },
1369
+ {
1370
+ "epoch": 0.13,
1371
+ "grad_norm": 0.038330078125,
1372
+ "learning_rate": 1.951770182529357e-05,
1373
+ "loss": 0.8669,
1374
+ "step": 195
1375
+ },
1376
+ {
1377
+ "epoch": 0.13,
1378
+ "grad_norm": 0.03564453125,
1379
+ "learning_rate": 1.951121606217252e-05,
1380
+ "loss": 0.8589,
1381
+ "step": 196
1382
+ },
1383
+ {
1384
+ "epoch": 0.13,
1385
+ "grad_norm": 0.036376953125,
1386
+ "learning_rate": 1.9504688073012397e-05,
1387
+ "loss": 0.9205,
1388
+ "step": 197
1389
+ },
1390
+ {
1391
+ "epoch": 0.13,
1392
+ "grad_norm": 0.039794921875,
1393
+ "learning_rate": 1.9498117886794885e-05,
1394
+ "loss": 0.9052,
1395
+ "step": 198
1396
+ },
1397
+ {
1398
+ "epoch": 0.13,
1399
+ "grad_norm": 0.037109375,
1400
+ "learning_rate": 1.9491505532689017e-05,
1401
+ "loss": 0.8167,
1402
+ "step": 199
1403
+ },
1404
+ {
1405
+ "epoch": 0.13,
1406
+ "grad_norm": 0.03662109375,
1407
+ "learning_rate": 1.948485104005103e-05,
1408
+ "loss": 0.9358,
1409
+ "step": 200
1410
+ },
1411
+ {
1412
+ "epoch": 0.13,
1413
+ "grad_norm": 0.03759765625,
1414
+ "learning_rate": 1.947815443842424e-05,
1415
+ "loss": 0.8639,
1416
+ "step": 201
1417
+ },
1418
+ {
1419
+ "epoch": 0.13,
1420
+ "grad_norm": 0.034423828125,
1421
+ "learning_rate": 1.9471415757538918e-05,
1422
+ "loss": 0.8684,
1423
+ "step": 202
1424
+ },
1425
+ {
1426
+ "epoch": 0.13,
1427
+ "grad_norm": 0.032470703125,
1428
+ "learning_rate": 1.946463502731213e-05,
1429
+ "loss": 0.7762,
1430
+ "step": 203
1431
+ },
1432
+ {
1433
+ "epoch": 0.13,
1434
+ "grad_norm": 0.034912109375,
1435
+ "learning_rate": 1.9457812277847645e-05,
1436
+ "loss": 0.8664,
1437
+ "step": 204
1438
+ },
1439
+ {
1440
+ "epoch": 0.13,
1441
+ "grad_norm": 0.038330078125,
1442
+ "learning_rate": 1.945094753943577e-05,
1443
+ "loss": 0.9964,
1444
+ "step": 205
1445
+ },
1446
+ {
1447
+ "epoch": 0.13,
1448
+ "grad_norm": 0.037353515625,
1449
+ "learning_rate": 1.944404084255324e-05,
1450
+ "loss": 0.8768,
1451
+ "step": 206
1452
+ },
1453
+ {
1454
+ "epoch": 0.13,
1455
+ "grad_norm": 0.0380859375,
1456
+ "learning_rate": 1.9437092217863043e-05,
1457
+ "loss": 0.8999,
1458
+ "step": 207
1459
+ },
1460
+ {
1461
+ "epoch": 0.14,
1462
+ "grad_norm": 0.036376953125,
1463
+ "learning_rate": 1.9430101696214335e-05,
1464
+ "loss": 0.8437,
1465
+ "step": 208
1466
+ },
1467
+ {
1468
+ "epoch": 0.14,
1469
+ "grad_norm": 0.037841796875,
1470
+ "learning_rate": 1.9423069308642267e-05,
1471
+ "loss": 0.8273,
1472
+ "step": 209
1473
+ },
1474
+ {
1475
+ "epoch": 0.14,
1476
+ "grad_norm": 0.04052734375,
1477
+ "learning_rate": 1.9415995086367858e-05,
1478
+ "loss": 0.9275,
1479
+ "step": 210
1480
+ },
1481
+ {
1482
+ "epoch": 0.14,
1483
+ "grad_norm": 0.036865234375,
1484
+ "learning_rate": 1.940887906079786e-05,
1485
+ "loss": 0.8938,
1486
+ "step": 211
1487
+ },
1488
+ {
1489
+ "epoch": 0.14,
1490
+ "grad_norm": 0.033203125,
1491
+ "learning_rate": 1.9401721263524616e-05,
1492
+ "loss": 0.8414,
1493
+ "step": 212
1494
+ },
1495
+ {
1496
+ "epoch": 0.14,
1497
+ "grad_norm": 0.037109375,
1498
+ "learning_rate": 1.9394521726325907e-05,
1499
+ "loss": 0.9055,
1500
+ "step": 213
1501
+ },
1502
+ {
1503
+ "epoch": 0.14,
1504
+ "grad_norm": 0.042236328125,
1505
+ "learning_rate": 1.938728048116484e-05,
1506
+ "loss": 0.9002,
1507
+ "step": 214
1508
+ },
1509
+ {
1510
+ "epoch": 0.14,
1511
+ "grad_norm": 0.034912109375,
1512
+ "learning_rate": 1.9379997560189677e-05,
1513
+ "loss": 0.8598,
1514
+ "step": 215
1515
+ },
1516
+ {
1517
+ "epoch": 0.14,
1518
+ "grad_norm": 0.038330078125,
1519
+ "learning_rate": 1.9372672995733706e-05,
1520
+ "loss": 0.8557,
1521
+ "step": 216
1522
+ },
1523
+ {
1524
+ "epoch": 0.14,
1525
+ "grad_norm": 0.036376953125,
1526
+ "learning_rate": 1.9365306820315104e-05,
1527
+ "loss": 0.9001,
1528
+ "step": 217
1529
+ },
1530
+ {
1531
+ "epoch": 0.14,
1532
+ "grad_norm": 0.037109375,
1533
+ "learning_rate": 1.9357899066636774e-05,
1534
+ "loss": 0.842,
1535
+ "step": 218
1536
+ },
1537
+ {
1538
+ "epoch": 0.14,
1539
+ "grad_norm": 0.036865234375,
1540
+ "learning_rate": 1.935044976758621e-05,
1541
+ "loss": 0.8759,
1542
+ "step": 219
1543
+ },
1544
+ {
1545
+ "epoch": 0.14,
1546
+ "grad_norm": 0.03759765625,
1547
+ "learning_rate": 1.9342958956235365e-05,
1548
+ "loss": 0.8306,
1549
+ "step": 220
1550
+ },
1551
+ {
1552
+ "epoch": 0.14,
1553
+ "grad_norm": 0.03759765625,
1554
+ "learning_rate": 1.933542666584047e-05,
1555
+ "loss": 0.8322,
1556
+ "step": 221
1557
+ },
1558
+ {
1559
+ "epoch": 0.14,
1560
+ "grad_norm": 0.03515625,
1561
+ "learning_rate": 1.9327852929841918e-05,
1562
+ "loss": 0.8149,
1563
+ "step": 222
1564
+ },
1565
+ {
1566
+ "epoch": 0.14,
1567
+ "grad_norm": 0.03955078125,
1568
+ "learning_rate": 1.9320237781864106e-05,
1569
+ "loss": 0.8458,
1570
+ "step": 223
1571
+ },
1572
+ {
1573
+ "epoch": 0.15,
1574
+ "grad_norm": 0.03759765625,
1575
+ "learning_rate": 1.9312581255715276e-05,
1576
+ "loss": 0.84,
1577
+ "step": 224
1578
+ },
1579
+ {
1580
+ "epoch": 0.15,
1581
+ "grad_norm": 0.038818359375,
1582
+ "learning_rate": 1.9304883385387383e-05,
1583
+ "loss": 0.8254,
1584
+ "step": 225
1585
+ },
1586
+ {
1587
+ "epoch": 0.15,
1588
+ "grad_norm": 0.03466796875,
1589
+ "learning_rate": 1.9297144205055925e-05,
1590
+ "loss": 0.8898,
1591
+ "step": 226
1592
+ },
1593
+ {
1594
+ "epoch": 0.15,
1595
+ "grad_norm": 0.037353515625,
1596
+ "learning_rate": 1.9289363749079798e-05,
1597
+ "loss": 0.8231,
1598
+ "step": 227
1599
+ },
1600
+ {
1601
+ "epoch": 0.15,
1602
+ "grad_norm": 0.041015625,
1603
+ "learning_rate": 1.928154205200116e-05,
1604
+ "loss": 0.8764,
1605
+ "step": 228
1606
+ },
1607
+ {
1608
+ "epoch": 0.15,
1609
+ "grad_norm": 0.037353515625,
1610
+ "learning_rate": 1.9273679148545246e-05,
1611
+ "loss": 0.8436,
1612
+ "step": 229
1613
+ },
1614
+ {
1615
+ "epoch": 0.15,
1616
+ "grad_norm": 0.037841796875,
1617
+ "learning_rate": 1.9265775073620244e-05,
1618
+ "loss": 0.8622,
1619
+ "step": 230
1620
+ },
1621
+ {
1622
+ "epoch": 0.15,
1623
+ "grad_norm": 0.036865234375,
1624
+ "learning_rate": 1.9257829862317118e-05,
1625
+ "loss": 0.8484,
1626
+ "step": 231
1627
+ },
1628
+ {
1629
+ "epoch": 0.15,
1630
+ "grad_norm": 0.037841796875,
1631
+ "learning_rate": 1.9249843549909467e-05,
1632
+ "loss": 0.8765,
1633
+ "step": 232
1634
+ },
1635
+ {
1636
+ "epoch": 0.15,
1637
+ "grad_norm": 0.03759765625,
1638
+ "learning_rate": 1.9241816171853362e-05,
1639
+ "loss": 0.8762,
1640
+ "step": 233
1641
+ },
1642
+ {
1643
+ "epoch": 0.15,
1644
+ "grad_norm": 0.03955078125,
1645
+ "learning_rate": 1.9233747763787187e-05,
1646
+ "loss": 0.8716,
1647
+ "step": 234
1648
+ },
1649
+ {
1650
+ "epoch": 0.15,
1651
+ "grad_norm": 0.04443359375,
1652
+ "learning_rate": 1.9225638361531482e-05,
1653
+ "loss": 0.8453,
1654
+ "step": 235
1655
+ },
1656
+ {
1657
+ "epoch": 0.15,
1658
+ "grad_norm": 0.037109375,
1659
+ "learning_rate": 1.9217488001088784e-05,
1660
+ "loss": 0.7992,
1661
+ "step": 236
1662
+ },
1663
+ {
1664
+ "epoch": 0.15,
1665
+ "grad_norm": 0.036865234375,
1666
+ "learning_rate": 1.920929671864348e-05,
1667
+ "loss": 0.9607,
1668
+ "step": 237
1669
+ },
1670
+ {
1671
+ "epoch": 0.15,
1672
+ "grad_norm": 0.037841796875,
1673
+ "learning_rate": 1.920106455056162e-05,
1674
+ "loss": 0.8416,
1675
+ "step": 238
1676
+ },
1677
+ {
1678
+ "epoch": 0.16,
1679
+ "grad_norm": 0.0380859375,
1680
+ "learning_rate": 1.9192791533390778e-05,
1681
+ "loss": 0.7983,
1682
+ "step": 239
1683
+ },
1684
+ {
1685
+ "epoch": 0.16,
1686
+ "grad_norm": 0.040283203125,
1687
+ "learning_rate": 1.9184477703859876e-05,
1688
+ "loss": 0.8942,
1689
+ "step": 240
1690
+ },
1691
+ {
1692
+ "epoch": 0.16,
1693
+ "grad_norm": 0.037841796875,
1694
+ "learning_rate": 1.9176123098879035e-05,
1695
+ "loss": 0.8849,
1696
+ "step": 241
1697
+ },
1698
+ {
1699
+ "epoch": 0.16,
1700
+ "grad_norm": 0.037109375,
1701
+ "learning_rate": 1.9167727755539393e-05,
1702
+ "loss": 0.83,
1703
+ "step": 242
1704
+ },
1705
+ {
1706
+ "epoch": 0.16,
1707
+ "grad_norm": 0.036865234375,
1708
+ "learning_rate": 1.9159291711112962e-05,
1709
+ "loss": 0.7999,
1710
+ "step": 243
1711
+ },
1712
+ {
1713
+ "epoch": 0.16,
1714
+ "grad_norm": 0.035400390625,
1715
+ "learning_rate": 1.9150815003052436e-05,
1716
+ "loss": 0.8281,
1717
+ "step": 244
1718
+ },
1719
+ {
1720
+ "epoch": 0.16,
1721
+ "grad_norm": 0.038818359375,
1722
+ "learning_rate": 1.9142297668991053e-05,
1723
+ "loss": 0.884,
1724
+ "step": 245
1725
+ },
1726
+ {
1727
+ "epoch": 0.16,
1728
+ "grad_norm": 0.044189453125,
1729
+ "learning_rate": 1.913373974674241e-05,
1730
+ "loss": 0.8701,
1731
+ "step": 246
1732
+ },
1733
+ {
1734
+ "epoch": 0.16,
1735
+ "grad_norm": 0.039306640625,
1736
+ "learning_rate": 1.9125141274300293e-05,
1737
+ "loss": 0.8734,
1738
+ "step": 247
1739
+ },
1740
+ {
1741
+ "epoch": 0.16,
1742
+ "grad_norm": 0.03857421875,
1743
+ "learning_rate": 1.9116502289838524e-05,
1744
+ "loss": 0.8851,
1745
+ "step": 248
1746
+ },
1747
+ {
1748
+ "epoch": 0.16,
1749
+ "grad_norm": 0.044189453125,
1750
+ "learning_rate": 1.910782283171078e-05,
1751
+ "loss": 0.9402,
1752
+ "step": 249
1753
+ },
1754
+ {
1755
+ "epoch": 0.16,
1756
+ "grad_norm": 0.038818359375,
1757
+ "learning_rate": 1.909910293845042e-05,
1758
+ "loss": 0.831,
1759
+ "step": 250
1760
+ },
1761
+ {
1762
+ "epoch": 0.16,
1763
+ "grad_norm": 0.038330078125,
1764
+ "learning_rate": 1.909034264877032e-05,
1765
+ "loss": 0.8093,
1766
+ "step": 251
1767
+ },
1768
+ {
1769
+ "epoch": 0.16,
1770
+ "grad_norm": 0.039794921875,
1771
+ "learning_rate": 1.9081542001562713e-05,
1772
+ "loss": 0.9085,
1773
+ "step": 252
1774
+ },
1775
+ {
1776
+ "epoch": 0.16,
1777
+ "grad_norm": 0.038330078125,
1778
+ "learning_rate": 1.9072701035898985e-05,
1779
+ "loss": 0.8466,
1780
+ "step": 253
1781
+ },
1782
+ {
1783
+ "epoch": 0.17,
1784
+ "grad_norm": 0.0419921875,
1785
+ "learning_rate": 1.906381979102953e-05,
1786
+ "loss": 0.8938,
1787
+ "step": 254
1788
+ },
1789
+ {
1790
+ "epoch": 0.17,
1791
+ "grad_norm": 0.0400390625,
1792
+ "learning_rate": 1.9054898306383568e-05,
1793
+ "loss": 0.8787,
1794
+ "step": 255
1795
+ },
1796
+ {
1797
+ "epoch": 0.17,
1798
+ "grad_norm": 0.040283203125,
1799
+ "learning_rate": 1.904593662156896e-05,
1800
+ "loss": 0.882,
1801
+ "step": 256
1802
+ },
1803
+ {
1804
+ "epoch": 0.17,
1805
+ "grad_norm": 0.03759765625,
1806
+ "learning_rate": 1.903693477637204e-05,
1807
+ "loss": 0.7803,
1808
+ "step": 257
1809
+ },
1810
+ {
1811
+ "epoch": 0.17,
1812
+ "grad_norm": 0.037353515625,
1813
+ "learning_rate": 1.902789281075745e-05,
1814
+ "loss": 0.8078,
1815
+ "step": 258
1816
+ },
1817
+ {
1818
+ "epoch": 0.17,
1819
+ "grad_norm": 0.03857421875,
1820
+ "learning_rate": 1.9018810764867935e-05,
1821
+ "loss": 0.8318,
1822
+ "step": 259
1823
+ },
1824
+ {
1825
+ "epoch": 0.17,
1826
+ "grad_norm": 0.04150390625,
1827
+ "learning_rate": 1.900968867902419e-05,
1828
+ "loss": 0.8728,
1829
+ "step": 260
1830
+ },
1831
+ {
1832
+ "epoch": 0.17,
1833
+ "grad_norm": 0.0390625,
1834
+ "learning_rate": 1.9000526593724678e-05,
1835
+ "loss": 0.836,
1836
+ "step": 261
1837
+ },
1838
+ {
1839
+ "epoch": 0.17,
1840
+ "grad_norm": 0.0380859375,
1841
+ "learning_rate": 1.8991324549645424e-05,
1842
+ "loss": 0.9197,
1843
+ "step": 262
1844
+ },
1845
+ {
1846
+ "epoch": 0.17,
1847
+ "grad_norm": 0.04150390625,
1848
+ "learning_rate": 1.898208258763987e-05,
1849
+ "loss": 0.7965,
1850
+ "step": 263
1851
+ },
1852
+ {
1853
+ "epoch": 0.17,
1854
+ "grad_norm": 0.041015625,
1855
+ "learning_rate": 1.897280074873868e-05,
1856
+ "loss": 0.8078,
1857
+ "step": 264
1858
+ },
1859
+ {
1860
+ "epoch": 0.17,
1861
+ "grad_norm": 0.042236328125,
1862
+ "learning_rate": 1.8963479074149537e-05,
1863
+ "loss": 0.9035,
1864
+ "step": 265
1865
+ },
1866
+ {
1867
+ "epoch": 0.17,
1868
+ "grad_norm": 0.040771484375,
1869
+ "learning_rate": 1.8954117605257e-05,
1870
+ "loss": 0.8515,
1871
+ "step": 266
1872
+ },
1873
+ {
1874
+ "epoch": 0.17,
1875
+ "grad_norm": 0.041015625,
1876
+ "learning_rate": 1.8944716383622288e-05,
1877
+ "loss": 0.8147,
1878
+ "step": 267
1879
+ },
1880
+ {
1881
+ "epoch": 0.17,
1882
+ "grad_norm": 0.040771484375,
1883
+ "learning_rate": 1.8935275450983102e-05,
1884
+ "loss": 0.8121,
1885
+ "step": 268
1886
+ },
1887
+ {
1888
+ "epoch": 0.17,
1889
+ "grad_norm": 0.04150390625,
1890
+ "learning_rate": 1.8925794849253462e-05,
1891
+ "loss": 0.858,
1892
+ "step": 269
1893
+ },
1894
+ {
1895
+ "epoch": 0.18,
1896
+ "grad_norm": 0.044921875,
1897
+ "learning_rate": 1.8916274620523482e-05,
1898
+ "loss": 0.8502,
1899
+ "step": 270
1900
+ },
1901
+ {
1902
+ "epoch": 0.18,
1903
+ "grad_norm": 0.04150390625,
1904
+ "learning_rate": 1.8906714807059218e-05,
1905
+ "loss": 0.8438,
1906
+ "step": 271
1907
+ },
1908
+ {
1909
+ "epoch": 0.18,
1910
+ "grad_norm": 0.0390625,
1911
+ "learning_rate": 1.889711545130246e-05,
1912
+ "loss": 0.8464,
1913
+ "step": 272
1914
+ },
1915
+ {
1916
+ "epoch": 0.18,
1917
+ "grad_norm": 0.05712890625,
1918
+ "learning_rate": 1.8887476595870558e-05,
1919
+ "loss": 0.8227,
1920
+ "step": 273
1921
+ },
1922
+ {
1923
+ "epoch": 0.18,
1924
+ "grad_norm": 0.036865234375,
1925
+ "learning_rate": 1.887779828355621e-05,
1926
+ "loss": 0.8546,
1927
+ "step": 274
1928
+ },
1929
+ {
1930
+ "epoch": 0.18,
1931
+ "grad_norm": 0.037109375,
1932
+ "learning_rate": 1.8868080557327305e-05,
1933
+ "loss": 0.8932,
1934
+ "step": 275
1935
+ },
1936
+ {
1937
+ "epoch": 0.18,
1938
+ "grad_norm": 0.041748046875,
1939
+ "learning_rate": 1.8858323460326704e-05,
1940
+ "loss": 0.889,
1941
+ "step": 276
1942
+ },
1943
+ {
1944
+ "epoch": 0.18,
1945
+ "grad_norm": 0.039306640625,
1946
+ "learning_rate": 1.8848527035872057e-05,
1947
+ "loss": 0.8174,
1948
+ "step": 277
1949
+ },
1950
+ {
1951
+ "epoch": 0.18,
1952
+ "grad_norm": 0.038818359375,
1953
+ "learning_rate": 1.883869132745561e-05,
1954
+ "loss": 0.8183,
1955
+ "step": 278
1956
+ },
1957
+ {
1958
+ "epoch": 0.18,
1959
+ "grad_norm": 0.0390625,
1960
+ "learning_rate": 1.8828816378744035e-05,
1961
+ "loss": 0.8924,
1962
+ "step": 279
1963
+ },
1964
+ {
1965
+ "epoch": 0.18,
1966
+ "grad_norm": 0.038330078125,
1967
+ "learning_rate": 1.8818902233578188e-05,
1968
+ "loss": 0.7906,
1969
+ "step": 280
1970
+ },
1971
+ {
1972
+ "epoch": 0.18,
1973
+ "grad_norm": 0.04248046875,
1974
+ "learning_rate": 1.8808948935972965e-05,
1975
+ "loss": 0.8118,
1976
+ "step": 281
1977
+ },
1978
+ {
1979
+ "epoch": 0.18,
1980
+ "grad_norm": 0.03857421875,
1981
+ "learning_rate": 1.8798956530117058e-05,
1982
+ "loss": 0.8512,
1983
+ "step": 282
1984
+ },
1985
+ {
1986
+ "epoch": 0.18,
1987
+ "grad_norm": 0.044677734375,
1988
+ "learning_rate": 1.8788925060372806e-05,
1989
+ "loss": 0.8224,
1990
+ "step": 283
1991
+ },
1992
+ {
1993
+ "epoch": 0.18,
1994
+ "grad_norm": 0.047607421875,
1995
+ "learning_rate": 1.8778854571275972e-05,
1996
+ "loss": 0.8207,
1997
+ "step": 284
1998
+ },
1999
+ {
2000
+ "epoch": 0.19,
2001
+ "grad_norm": 0.038330078125,
2002
+ "learning_rate": 1.876874510753554e-05,
2003
+ "loss": 0.8011,
2004
+ "step": 285
2005
+ },
2006
+ {
2007
+ "epoch": 0.19,
2008
+ "grad_norm": 0.04296875,
2009
+ "learning_rate": 1.875859671403354e-05,
2010
+ "loss": 0.8132,
2011
+ "step": 286
2012
+ },
2013
+ {
2014
+ "epoch": 0.19,
2015
+ "grad_norm": 0.042236328125,
2016
+ "learning_rate": 1.874840943582482e-05,
2017
+ "loss": 0.9056,
2018
+ "step": 287
2019
+ },
2020
+ {
2021
+ "epoch": 0.19,
2022
+ "grad_norm": 0.0400390625,
2023
+ "learning_rate": 1.8738183318136867e-05,
2024
+ "loss": 0.8353,
2025
+ "step": 288
2026
+ },
2027
+ {
2028
+ "epoch": 0.19,
2029
+ "grad_norm": 0.040771484375,
2030
+ "learning_rate": 1.872791840636961e-05,
2031
+ "loss": 0.7943,
2032
+ "step": 289
2033
+ },
2034
+ {
2035
+ "epoch": 0.19,
2036
+ "grad_norm": 0.045654296875,
2037
+ "learning_rate": 1.871761474609519e-05,
2038
+ "loss": 0.8207,
2039
+ "step": 290
2040
+ },
2041
+ {
2042
+ "epoch": 0.19,
2043
+ "grad_norm": 0.04345703125,
2044
+ "learning_rate": 1.8707272383057785e-05,
2045
+ "loss": 0.8415,
2046
+ "step": 291
2047
+ },
2048
+ {
2049
+ "epoch": 0.19,
2050
+ "grad_norm": 0.04052734375,
2051
+ "learning_rate": 1.8696891363173405e-05,
2052
+ "loss": 0.797,
2053
+ "step": 292
2054
+ },
2055
+ {
2056
+ "epoch": 0.19,
2057
+ "grad_norm": 0.046142578125,
2058
+ "learning_rate": 1.8686471732529667e-05,
2059
+ "loss": 0.8248,
2060
+ "step": 293
2061
+ },
2062
+ {
2063
+ "epoch": 0.19,
2064
+ "grad_norm": 0.041259765625,
2065
+ "learning_rate": 1.8676013537385614e-05,
2066
+ "loss": 0.76,
2067
+ "step": 294
2068
+ },
2069
+ {
2070
+ "epoch": 0.19,
2071
+ "grad_norm": 0.04150390625,
2072
+ "learning_rate": 1.8665516824171497e-05,
2073
+ "loss": 0.8362,
2074
+ "step": 295
2075
+ },
2076
+ {
2077
+ "epoch": 0.19,
2078
+ "grad_norm": 0.040771484375,
2079
+ "learning_rate": 1.865498163948858e-05,
2080
+ "loss": 0.8093,
2081
+ "step": 296
2082
+ },
2083
+ {
2084
+ "epoch": 0.19,
2085
+ "grad_norm": 0.0380859375,
2086
+ "learning_rate": 1.864440803010891e-05,
2087
+ "loss": 0.7735,
2088
+ "step": 297
2089
+ },
2090
+ {
2091
+ "epoch": 0.19,
2092
+ "grad_norm": 0.041015625,
2093
+ "learning_rate": 1.863379604297513e-05,
2094
+ "loss": 0.8824,
2095
+ "step": 298
2096
+ },
2097
+ {
2098
+ "epoch": 0.19,
2099
+ "grad_norm": 0.039794921875,
2100
+ "learning_rate": 1.862314572520028e-05,
2101
+ "loss": 0.8157,
2102
+ "step": 299
2103
+ },
2104
+ {
2105
+ "epoch": 0.2,
2106
+ "grad_norm": 0.03857421875,
2107
+ "learning_rate": 1.861245712406755e-05,
2108
+ "loss": 0.8084,
2109
+ "step": 300
2110
+ },
2111
+ {
2112
+ "epoch": 0.2,
2113
+ "grad_norm": 0.049072265625,
2114
+ "learning_rate": 1.86017302870301e-05,
2115
+ "loss": 0.7976,
2116
+ "step": 301
2117
+ },
2118
+ {
2119
+ "epoch": 0.2,
2120
+ "grad_norm": 0.041259765625,
2121
+ "learning_rate": 1.8590965261710856e-05,
2122
+ "loss": 0.8406,
2123
+ "step": 302
2124
+ },
2125
+ {
2126
+ "epoch": 0.2,
2127
+ "grad_norm": 0.041015625,
2128
+ "learning_rate": 1.858016209590227e-05,
2129
+ "loss": 0.8145,
2130
+ "step": 303
2131
+ },
2132
+ {
2133
+ "epoch": 0.2,
2134
+ "grad_norm": 0.038818359375,
2135
+ "learning_rate": 1.8569320837566128e-05,
2136
+ "loss": 0.8142,
2137
+ "step": 304
2138
+ },
2139
+ {
2140
+ "epoch": 0.2,
2141
+ "grad_norm": 0.04052734375,
2142
+ "learning_rate": 1.8558441534833327e-05,
2143
+ "loss": 0.8894,
2144
+ "step": 305
2145
+ },
2146
+ {
2147
+ "epoch": 0.2,
2148
+ "grad_norm": 0.04296875,
2149
+ "learning_rate": 1.8547524236003675e-05,
2150
+ "loss": 0.8793,
2151
+ "step": 306
2152
+ },
2153
+ {
2154
+ "epoch": 0.2,
2155
+ "grad_norm": 0.0380859375,
2156
+ "learning_rate": 1.8536568989545662e-05,
2157
+ "loss": 0.868,
2158
+ "step": 307
2159
+ },
2160
+ {
2161
+ "epoch": 0.2,
2162
+ "grad_norm": 0.0419921875,
2163
+ "learning_rate": 1.8525575844096243e-05,
2164
+ "loss": 0.8572,
2165
+ "step": 308
2166
+ },
2167
+ {
2168
+ "epoch": 0.2,
2169
+ "grad_norm": 0.04931640625,
2170
+ "learning_rate": 1.8514544848460653e-05,
2171
+ "loss": 0.7933,
2172
+ "step": 309
2173
+ },
2174
+ {
2175
+ "epoch": 0.2,
2176
+ "grad_norm": 0.0390625,
2177
+ "learning_rate": 1.8503476051612138e-05,
2178
+ "loss": 0.8017,
2179
+ "step": 310
2180
+ },
2181
+ {
2182
+ "epoch": 0.2,
2183
+ "grad_norm": 0.042724609375,
2184
+ "learning_rate": 1.8492369502691785e-05,
2185
+ "loss": 0.8317,
2186
+ "step": 311
2187
+ },
2188
+ {
2189
+ "epoch": 0.2,
2190
+ "grad_norm": 0.04052734375,
2191
+ "learning_rate": 1.8481225251008284e-05,
2192
+ "loss": 0.8201,
2193
+ "step": 312
2194
+ },
2195
+ {
2196
+ "epoch": 0.2,
2197
+ "grad_norm": 0.041748046875,
2198
+ "learning_rate": 1.8470043346037698e-05,
2199
+ "loss": 0.8258,
2200
+ "step": 313
2201
+ },
2202
+ {
2203
+ "epoch": 0.2,
2204
+ "grad_norm": 0.0419921875,
2205
+ "learning_rate": 1.8458823837423274e-05,
2206
+ "loss": 0.8402,
2207
+ "step": 314
2208
+ },
2209
+ {
2210
+ "epoch": 0.2,
2211
+ "grad_norm": 0.044921875,
2212
+ "learning_rate": 1.8447566774975187e-05,
2213
+ "loss": 0.9293,
2214
+ "step": 315
2215
+ },
2216
+ {
2217
+ "epoch": 0.21,
2218
+ "grad_norm": 0.048583984375,
2219
+ "learning_rate": 1.8436272208670346e-05,
2220
+ "loss": 0.8716,
2221
+ "step": 316
2222
+ },
2223
+ {
2224
+ "epoch": 0.21,
2225
+ "grad_norm": 0.0458984375,
2226
+ "learning_rate": 1.842494018865216e-05,
2227
+ "loss": 0.8868,
2228
+ "step": 317
2229
+ },
2230
+ {
2231
+ "epoch": 0.21,
2232
+ "grad_norm": 0.047607421875,
2233
+ "learning_rate": 1.841357076523032e-05,
2234
+ "loss": 0.9027,
2235
+ "step": 318
2236
+ },
2237
+ {
2238
+ "epoch": 0.21,
2239
+ "grad_norm": 0.044189453125,
2240
+ "learning_rate": 1.840216398888057e-05,
2241
+ "loss": 0.7936,
2242
+ "step": 319
2243
+ },
2244
+ {
2245
+ "epoch": 0.21,
2246
+ "grad_norm": 0.0458984375,
2247
+ "learning_rate": 1.8390719910244487e-05,
2248
+ "loss": 0.8498,
2249
+ "step": 320
2250
+ },
2251
+ {
2252
+ "epoch": 0.21,
2253
+ "grad_norm": 0.044677734375,
2254
+ "learning_rate": 1.8379238580129256e-05,
2255
+ "loss": 0.798,
2256
+ "step": 321
2257
+ },
2258
+ {
2259
+ "epoch": 0.21,
2260
+ "grad_norm": 0.044677734375,
2261
+ "learning_rate": 1.836772004950744e-05,
2262
+ "loss": 0.8746,
2263
+ "step": 322
2264
+ },
2265
+ {
2266
+ "epoch": 0.21,
2267
+ "grad_norm": 0.04541015625,
2268
+ "learning_rate": 1.8356164369516772e-05,
2269
+ "loss": 0.8658,
2270
+ "step": 323
2271
+ },
2272
+ {
2273
+ "epoch": 0.21,
2274
+ "grad_norm": 0.0400390625,
2275
+ "learning_rate": 1.834457159145989e-05,
2276
+ "loss": 0.8299,
2277
+ "step": 324
2278
+ },
2279
+ {
2280
+ "epoch": 0.21,
2281
+ "grad_norm": 0.0458984375,
2282
+ "learning_rate": 1.8332941766804152e-05,
2283
+ "loss": 0.8723,
2284
+ "step": 325
2285
+ },
2286
+ {
2287
+ "epoch": 0.21,
2288
+ "grad_norm": 0.04150390625,
2289
+ "learning_rate": 1.832127494718138e-05,
2290
+ "loss": 0.8311,
2291
+ "step": 326
2292
+ },
2293
+ {
2294
+ "epoch": 0.21,
2295
+ "grad_norm": 0.0439453125,
2296
+ "learning_rate": 1.830957118438764e-05,
2297
+ "loss": 0.8159,
2298
+ "step": 327
2299
+ },
2300
+ {
2301
+ "epoch": 0.21,
2302
+ "grad_norm": 0.044921875,
2303
+ "learning_rate": 1.829783053038301e-05,
2304
+ "loss": 0.8351,
2305
+ "step": 328
2306
+ },
2307
+ {
2308
+ "epoch": 0.21,
2309
+ "grad_norm": 0.0419921875,
2310
+ "learning_rate": 1.8286053037291356e-05,
2311
+ "loss": 0.7679,
2312
+ "step": 329
2313
+ },
2314
+ {
2315
+ "epoch": 0.21,
2316
+ "grad_norm": 0.04345703125,
2317
+ "learning_rate": 1.8274238757400096e-05,
2318
+ "loss": 0.7848,
2319
+ "step": 330
2320
+ },
2321
+ {
2322
+ "epoch": 0.22,
2323
+ "grad_norm": 0.04150390625,
2324
+ "learning_rate": 1.826238774315995e-05,
2325
+ "loss": 0.8741,
2326
+ "step": 331
2327
+ },
2328
+ {
2329
+ "epoch": 0.22,
2330
+ "grad_norm": 0.042236328125,
2331
+ "learning_rate": 1.8250500047184744e-05,
2332
+ "loss": 0.8517,
2333
+ "step": 332
2334
+ },
2335
+ {
2336
+ "epoch": 0.22,
2337
+ "grad_norm": 0.046875,
2338
+ "learning_rate": 1.8238575722251144e-05,
2339
+ "loss": 0.8602,
2340
+ "step": 333
2341
+ },
2342
+ {
2343
+ "epoch": 0.22,
2344
+ "grad_norm": 0.041259765625,
2345
+ "learning_rate": 1.8226614821298444e-05,
2346
+ "loss": 0.8087,
2347
+ "step": 334
2348
+ },
2349
+ {
2350
+ "epoch": 0.22,
2351
+ "grad_norm": 0.042724609375,
2352
+ "learning_rate": 1.821461739742831e-05,
2353
+ "loss": 0.8301,
2354
+ "step": 335
2355
+ },
2356
+ {
2357
+ "epoch": 0.22,
2358
+ "grad_norm": 0.046875,
2359
+ "learning_rate": 1.820258350390456e-05,
2360
+ "loss": 0.8342,
2361
+ "step": 336
2362
+ },
2363
+ {
2364
+ "epoch": 0.22,
2365
+ "grad_norm": 0.043701171875,
2366
+ "learning_rate": 1.819051319415293e-05,
2367
+ "loss": 0.8249,
2368
+ "step": 337
2369
+ },
2370
+ {
2371
+ "epoch": 0.22,
2372
+ "grad_norm": 0.041748046875,
2373
+ "learning_rate": 1.817840652176082e-05,
2374
+ "loss": 0.7909,
2375
+ "step": 338
2376
+ },
2377
+ {
2378
+ "epoch": 0.22,
2379
+ "grad_norm": 0.04248046875,
2380
+ "learning_rate": 1.8166263540477068e-05,
2381
+ "loss": 0.8071,
2382
+ "step": 339
2383
+ },
2384
+ {
2385
+ "epoch": 0.22,
2386
+ "grad_norm": 0.043212890625,
2387
+ "learning_rate": 1.815408430421171e-05,
2388
+ "loss": 0.7983,
2389
+ "step": 340
2390
+ },
2391
+ {
2392
+ "epoch": 0.22,
2393
+ "grad_norm": 0.041748046875,
2394
+ "learning_rate": 1.8141868867035745e-05,
2395
+ "loss": 0.7877,
2396
+ "step": 341
2397
+ },
2398
+ {
2399
+ "epoch": 0.22,
2400
+ "grad_norm": 0.04443359375,
2401
+ "learning_rate": 1.8129617283180878e-05,
2402
+ "loss": 0.9056,
2403
+ "step": 342
2404
+ },
2405
+ {
2406
+ "epoch": 0.22,
2407
+ "grad_norm": 0.043212890625,
2408
+ "learning_rate": 1.81173296070393e-05,
2409
+ "loss": 0.8708,
2410
+ "step": 343
2411
+ },
2412
+ {
2413
+ "epoch": 0.22,
2414
+ "grad_norm": 0.04541015625,
2415
+ "learning_rate": 1.8105005893163436e-05,
2416
+ "loss": 0.8387,
2417
+ "step": 344
2418
+ },
2419
+ {
2420
+ "epoch": 0.22,
2421
+ "grad_norm": 0.042724609375,
2422
+ "learning_rate": 1.8092646196265705e-05,
2423
+ "loss": 0.8578,
2424
+ "step": 345
2425
+ },
2426
+ {
2427
+ "epoch": 0.22,
2428
+ "grad_norm": 0.042236328125,
2429
+ "learning_rate": 1.808025057121827e-05,
2430
+ "loss": 0.8642,
2431
+ "step": 346
2432
+ },
2433
+ {
2434
+ "epoch": 0.23,
2435
+ "grad_norm": 0.048095703125,
2436
+ "learning_rate": 1.8067819073052813e-05,
2437
+ "loss": 0.8058,
2438
+ "step": 347
2439
+ },
2440
+ {
2441
+ "epoch": 0.23,
2442
+ "grad_norm": 0.041259765625,
2443
+ "learning_rate": 1.8055351756960262e-05,
2444
+ "loss": 0.8128,
2445
+ "step": 348
2446
+ },
2447
+ {
2448
+ "epoch": 0.23,
2449
+ "grad_norm": 0.04296875,
2450
+ "learning_rate": 1.804284867829058e-05,
2451
+ "loss": 0.8387,
2452
+ "step": 349
2453
+ },
2454
+ {
2455
+ "epoch": 0.23,
2456
+ "grad_norm": 0.0458984375,
2457
+ "learning_rate": 1.8030309892552488e-05,
2458
+ "loss": 0.9106,
2459
+ "step": 350
2460
+ },
2461
+ {
2462
+ "epoch": 0.23,
2463
+ "grad_norm": 0.041259765625,
2464
+ "learning_rate": 1.801773545541324e-05,
2465
+ "loss": 0.752,
2466
+ "step": 351
2467
+ },
2468
+ {
2469
+ "epoch": 0.23,
2470
+ "grad_norm": 0.044189453125,
2471
+ "learning_rate": 1.800512542269836e-05,
2472
+ "loss": 0.881,
2473
+ "step": 352
2474
+ },
2475
+ {
2476
+ "epoch": 0.23,
2477
+ "grad_norm": 0.045166015625,
2478
+ "learning_rate": 1.7992479850391416e-05,
2479
+ "loss": 0.8004,
2480
+ "step": 353
2481
+ },
2482
+ {
2483
+ "epoch": 0.23,
2484
+ "grad_norm": 0.046875,
2485
+ "learning_rate": 1.797979879463375e-05,
2486
+ "loss": 0.8075,
2487
+ "step": 354
2488
+ },
2489
+ {
2490
+ "epoch": 0.23,
2491
+ "grad_norm": 0.044677734375,
2492
+ "learning_rate": 1.796708231172423e-05,
2493
+ "loss": 0.8315,
2494
+ "step": 355
2495
+ },
2496
+ {
2497
+ "epoch": 0.23,
2498
+ "grad_norm": 0.041259765625,
2499
+ "learning_rate": 1.795433045811901e-05,
2500
+ "loss": 0.8506,
2501
+ "step": 356
2502
+ },
2503
+ {
2504
+ "epoch": 0.23,
2505
+ "grad_norm": 0.051025390625,
2506
+ "learning_rate": 1.7941543290431286e-05,
2507
+ "loss": 0.8314,
2508
+ "step": 357
2509
+ },
2510
+ {
2511
+ "epoch": 0.23,
2512
+ "grad_norm": 0.045166015625,
2513
+ "learning_rate": 1.792872086543103e-05,
2514
+ "loss": 0.7697,
2515
+ "step": 358
2516
+ },
2517
+ {
2518
+ "epoch": 0.23,
2519
+ "grad_norm": 0.04541015625,
2520
+ "learning_rate": 1.7915863240044727e-05,
2521
+ "loss": 0.9001,
2522
+ "step": 359
2523
+ },
2524
+ {
2525
+ "epoch": 0.23,
2526
+ "grad_norm": 0.04443359375,
2527
+ "learning_rate": 1.7902970471355162e-05,
2528
+ "loss": 0.7685,
2529
+ "step": 360
2530
+ },
2531
+ {
2532
+ "epoch": 0.23,
2533
+ "grad_norm": 0.0458984375,
2534
+ "learning_rate": 1.7890042616601125e-05,
2535
+ "loss": 0.8105,
2536
+ "step": 361
2537
+ },
2538
+ {
2539
+ "epoch": 0.24,
2540
+ "grad_norm": 0.044677734375,
2541
+ "learning_rate": 1.7877079733177185e-05,
2542
+ "loss": 0.9061,
2543
+ "step": 362
2544
+ },
2545
+ {
2546
+ "epoch": 0.24,
2547
+ "grad_norm": 0.043212890625,
2548
+ "learning_rate": 1.7864081878633414e-05,
2549
+ "loss": 0.813,
2550
+ "step": 363
2551
+ },
2552
+ {
2553
+ "epoch": 0.24,
2554
+ "grad_norm": 0.04345703125,
2555
+ "learning_rate": 1.785104911067515e-05,
2556
+ "loss": 0.8197,
2557
+ "step": 364
2558
+ },
2559
+ {
2560
+ "epoch": 0.24,
2561
+ "grad_norm": 0.044189453125,
2562
+ "learning_rate": 1.783798148716273e-05,
2563
+ "loss": 0.894,
2564
+ "step": 365
2565
+ },
2566
+ {
2567
+ "epoch": 0.24,
2568
+ "grad_norm": 0.04296875,
2569
+ "learning_rate": 1.782487906611124e-05,
2570
+ "loss": 0.7809,
2571
+ "step": 366
2572
+ },
2573
+ {
2574
+ "epoch": 0.24,
2575
+ "grad_norm": 0.04296875,
2576
+ "learning_rate": 1.781174190569024e-05,
2577
+ "loss": 0.8428,
2578
+ "step": 367
2579
+ },
2580
+ {
2581
+ "epoch": 0.24,
2582
+ "grad_norm": 0.049072265625,
2583
+ "learning_rate": 1.7798570064223536e-05,
2584
+ "loss": 0.8276,
2585
+ "step": 368
2586
+ },
2587
+ {
2588
+ "epoch": 0.24,
2589
+ "grad_norm": 0.04541015625,
2590
+ "learning_rate": 1.7785363600188894e-05,
2591
+ "loss": 0.7937,
2592
+ "step": 369
2593
+ },
2594
+ {
2595
+ "epoch": 0.24,
2596
+ "grad_norm": 0.03955078125,
2597
+ "learning_rate": 1.7772122572217796e-05,
2598
+ "loss": 0.7835,
2599
+ "step": 370
2600
+ },
2601
+ {
2602
+ "epoch": 0.24,
2603
+ "grad_norm": 0.04833984375,
2604
+ "learning_rate": 1.7758847039095167e-05,
2605
+ "loss": 0.8456,
2606
+ "step": 371
2607
+ },
2608
+ {
2609
+ "epoch": 0.24,
2610
+ "grad_norm": 0.0439453125,
2611
+ "learning_rate": 1.774553705975913e-05,
2612
+ "loss": 0.8483,
2613
+ "step": 372
2614
+ },
2615
+ {
2616
+ "epoch": 0.24,
2617
+ "grad_norm": 0.041259765625,
2618
+ "learning_rate": 1.773219269330073e-05,
2619
+ "loss": 0.7902,
2620
+ "step": 373
2621
+ },
2622
+ {
2623
+ "epoch": 0.24,
2624
+ "grad_norm": 0.04296875,
2625
+ "learning_rate": 1.7718813998963678e-05,
2626
+ "loss": 0.8734,
2627
+ "step": 374
2628
+ },
2629
+ {
2630
+ "epoch": 0.24,
2631
+ "grad_norm": 0.044921875,
2632
+ "learning_rate": 1.7705401036144086e-05,
2633
+ "loss": 0.8646,
2634
+ "step": 375
2635
+ },
2636
+ {
2637
+ "epoch": 0.24,
2638
+ "grad_norm": 0.04345703125,
2639
+ "learning_rate": 1.7691953864390208e-05,
2640
+ "loss": 0.8005,
2641
+ "step": 376
2642
+ },
2643
+ {
2644
+ "epoch": 0.25,
2645
+ "grad_norm": 0.0419921875,
2646
+ "learning_rate": 1.7678472543402166e-05,
2647
+ "loss": 0.8701,
2648
+ "step": 377
2649
+ },
2650
+ {
2651
+ "epoch": 0.25,
2652
+ "grad_norm": 0.045166015625,
2653
+ "learning_rate": 1.7664957133031705e-05,
2654
+ "loss": 0.8099,
2655
+ "step": 378
2656
+ },
2657
+ {
2658
+ "epoch": 0.25,
2659
+ "grad_norm": 0.057373046875,
2660
+ "learning_rate": 1.7651407693281896e-05,
2661
+ "loss": 0.8524,
2662
+ "step": 379
2663
+ },
2664
+ {
2665
+ "epoch": 0.25,
2666
+ "grad_norm": 0.05224609375,
2667
+ "learning_rate": 1.7637824284306898e-05,
2668
+ "loss": 0.8456,
2669
+ "step": 380
2670
+ },
2671
+ {
2672
+ "epoch": 0.25,
2673
+ "grad_norm": 0.05078125,
2674
+ "learning_rate": 1.762420696641167e-05,
2675
+ "loss": 0.7977,
2676
+ "step": 381
2677
+ },
2678
+ {
2679
+ "epoch": 0.25,
2680
+ "grad_norm": 0.044189453125,
2681
+ "learning_rate": 1.7610555800051727e-05,
2682
+ "loss": 0.7834,
2683
+ "step": 382
2684
+ },
2685
+ {
2686
+ "epoch": 0.25,
2687
+ "grad_norm": 0.045166015625,
2688
+ "learning_rate": 1.759687084583285e-05,
2689
+ "loss": 0.7946,
2690
+ "step": 383
2691
+ },
2692
+ {
2693
+ "epoch": 0.25,
2694
+ "grad_norm": 0.04443359375,
2695
+ "learning_rate": 1.7583152164510827e-05,
2696
+ "loss": 0.7456,
2697
+ "step": 384
2698
+ },
2699
+ {
2700
+ "epoch": 0.25,
2701
+ "grad_norm": 0.044677734375,
2702
+ "learning_rate": 1.7569399816991174e-05,
2703
+ "loss": 0.8358,
2704
+ "step": 385
2705
+ },
2706
+ {
2707
+ "epoch": 0.25,
2708
+ "grad_norm": 0.047119140625,
2709
+ "learning_rate": 1.7555613864328876e-05,
2710
+ "loss": 0.7976,
2711
+ "step": 386
2712
+ },
2713
+ {
2714
+ "epoch": 0.25,
2715
+ "grad_norm": 0.09423828125,
2716
+ "learning_rate": 1.754179436772812e-05,
2717
+ "loss": 0.9486,
2718
+ "step": 387
2719
+ },
2720
+ {
2721
+ "epoch": 0.25,
2722
+ "grad_norm": 0.046142578125,
2723
+ "learning_rate": 1.7527941388542006e-05,
2724
+ "loss": 0.7898,
2725
+ "step": 388
2726
+ },
2727
+ {
2728
+ "epoch": 0.25,
2729
+ "grad_norm": 0.04150390625,
2730
+ "learning_rate": 1.751405498827228e-05,
2731
+ "loss": 0.7644,
2732
+ "step": 389
2733
+ },
2734
+ {
2735
+ "epoch": 0.25,
2736
+ "grad_norm": 0.04931640625,
2737
+ "learning_rate": 1.7500135228569067e-05,
2738
+ "loss": 0.8363,
2739
+ "step": 390
2740
+ },
2741
+ {
2742
+ "epoch": 0.25,
2743
+ "grad_norm": 0.047119140625,
2744
+ "learning_rate": 1.748618217123061e-05,
2745
+ "loss": 0.801,
2746
+ "step": 391
2747
+ },
2748
+ {
2749
+ "epoch": 0.25,
2750
+ "grad_norm": 0.044677734375,
2751
+ "learning_rate": 1.7472195878202955e-05,
2752
+ "loss": 0.8487,
2753
+ "step": 392
2754
+ },
2755
+ {
2756
+ "epoch": 0.26,
2757
+ "grad_norm": 0.046142578125,
2758
+ "learning_rate": 1.7458176411579715e-05,
2759
+ "loss": 0.8884,
2760
+ "step": 393
2761
+ },
2762
+ {
2763
+ "epoch": 0.26,
2764
+ "grad_norm": 0.0439453125,
2765
+ "learning_rate": 1.7444123833601784e-05,
2766
+ "loss": 0.8484,
2767
+ "step": 394
2768
+ },
2769
+ {
2770
+ "epoch": 0.26,
2771
+ "grad_norm": 0.043701171875,
2772
+ "learning_rate": 1.743003820665705e-05,
2773
+ "loss": 0.8325,
2774
+ "step": 395
2775
+ },
2776
+ {
2777
+ "epoch": 0.26,
2778
+ "grad_norm": 0.048095703125,
2779
+ "learning_rate": 1.741591959328013e-05,
2780
+ "loss": 0.8061,
2781
+ "step": 396
2782
+ },
2783
+ {
2784
+ "epoch": 0.26,
2785
+ "grad_norm": 0.047607421875,
2786
+ "learning_rate": 1.7401768056152083e-05,
2787
+ "loss": 0.7888,
2788
+ "step": 397
2789
+ },
2790
+ {
2791
+ "epoch": 0.26,
2792
+ "grad_norm": 0.047119140625,
2793
+ "learning_rate": 1.7387583658100144e-05,
2794
+ "loss": 0.8564,
2795
+ "step": 398
2796
+ },
2797
+ {
2798
+ "epoch": 0.26,
2799
+ "grad_norm": 0.044677734375,
2800
+ "learning_rate": 1.737336646209742e-05,
2801
+ "loss": 0.8412,
2802
+ "step": 399
2803
+ },
2804
+ {
2805
+ "epoch": 0.26,
2806
+ "grad_norm": 0.0439453125,
2807
+ "learning_rate": 1.7359116531262654e-05,
2808
+ "loss": 0.9182,
2809
+ "step": 400
2810
+ },
2811
+ {
2812
+ "epoch": 0.26,
2813
+ "grad_norm": 0.047119140625,
2814
+ "learning_rate": 1.73448339288599e-05,
2815
+ "loss": 0.8653,
2816
+ "step": 401
2817
+ },
2818
+ {
2819
+ "epoch": 0.26,
2820
+ "grad_norm": 0.0478515625,
2821
+ "learning_rate": 1.7330518718298263e-05,
2822
+ "loss": 0.8174,
2823
+ "step": 402
2824
+ },
2825
+ {
2826
+ "epoch": 0.26,
2827
+ "grad_norm": 0.05859375,
2828
+ "learning_rate": 1.7316170963131627e-05,
2829
+ "loss": 0.8621,
2830
+ "step": 403
2831
+ },
2832
+ {
2833
+ "epoch": 0.26,
2834
+ "grad_norm": 0.047119140625,
2835
+ "learning_rate": 1.7301790727058344e-05,
2836
+ "loss": 0.7991,
2837
+ "step": 404
2838
+ },
2839
+ {
2840
+ "epoch": 0.26,
2841
+ "grad_norm": 0.046142578125,
2842
+ "learning_rate": 1.728737807392098e-05,
2843
+ "loss": 0.8706,
2844
+ "step": 405
2845
+ },
2846
+ {
2847
+ "epoch": 0.26,
2848
+ "grad_norm": 0.046142578125,
2849
+ "learning_rate": 1.727293306770602e-05,
2850
+ "loss": 0.824,
2851
+ "step": 406
2852
+ },
2853
+ {
2854
+ "epoch": 0.26,
2855
+ "grad_norm": 0.0498046875,
2856
+ "learning_rate": 1.7258455772543573e-05,
2857
+ "loss": 0.9865,
2858
+ "step": 407
2859
+ },
2860
+ {
2861
+ "epoch": 0.27,
2862
+ "grad_norm": 0.0537109375,
2863
+ "learning_rate": 1.7243946252707115e-05,
2864
+ "loss": 0.844,
2865
+ "step": 408
2866
+ },
2867
+ {
2868
+ "epoch": 0.27,
2869
+ "grad_norm": 0.04052734375,
2870
+ "learning_rate": 1.7229404572613174e-05,
2871
+ "loss": 0.7566,
2872
+ "step": 409
2873
+ },
2874
+ {
2875
+ "epoch": 0.27,
2876
+ "grad_norm": 0.0439453125,
2877
+ "learning_rate": 1.721483079682106e-05,
2878
+ "loss": 0.8393,
2879
+ "step": 410
2880
+ },
2881
+ {
2882
+ "epoch": 0.27,
2883
+ "grad_norm": 0.0458984375,
2884
+ "learning_rate": 1.7200224990032577e-05,
2885
+ "loss": 0.7992,
2886
+ "step": 411
2887
+ },
2888
+ {
2889
+ "epoch": 0.27,
2890
+ "grad_norm": 0.04443359375,
2891
+ "learning_rate": 1.7185587217091727e-05,
2892
+ "loss": 0.8862,
2893
+ "step": 412
2894
+ },
2895
+ {
2896
+ "epoch": 0.27,
2897
+ "grad_norm": 0.046630859375,
2898
+ "learning_rate": 1.7170917542984445e-05,
2899
+ "loss": 0.8859,
2900
+ "step": 413
2901
+ },
2902
+ {
2903
+ "epoch": 0.27,
2904
+ "grad_norm": 0.050537109375,
2905
+ "learning_rate": 1.7156216032838275e-05,
2906
+ "loss": 0.8738,
2907
+ "step": 414
2908
+ },
2909
+ {
2910
+ "epoch": 0.27,
2911
+ "grad_norm": 0.047119140625,
2912
+ "learning_rate": 1.7141482751922117e-05,
2913
+ "loss": 0.8702,
2914
+ "step": 415
2915
+ },
2916
+ {
2917
+ "epoch": 0.27,
2918
+ "grad_norm": 0.044677734375,
2919
+ "learning_rate": 1.7126717765645908e-05,
2920
+ "loss": 0.8496,
2921
+ "step": 416
2922
+ },
2923
+ {
2924
+ "epoch": 0.27,
2925
+ "grad_norm": 0.047607421875,
2926
+ "learning_rate": 1.7111921139560356e-05,
2927
+ "loss": 0.8402,
2928
+ "step": 417
2929
+ },
2930
+ {
2931
+ "epoch": 0.27,
2932
+ "grad_norm": 0.044677734375,
2933
+ "learning_rate": 1.7097092939356622e-05,
2934
+ "loss": 0.8719,
2935
+ "step": 418
2936
+ },
2937
+ {
2938
+ "epoch": 0.27,
2939
+ "grad_norm": 0.05126953125,
2940
+ "learning_rate": 1.7082233230866064e-05,
2941
+ "loss": 0.865,
2942
+ "step": 419
2943
+ },
2944
+ {
2945
+ "epoch": 0.27,
2946
+ "grad_norm": 0.0517578125,
2947
+ "learning_rate": 1.7067342080059904e-05,
2948
+ "loss": 0.8876,
2949
+ "step": 420
2950
+ },
2951
+ {
2952
+ "epoch": 0.27,
2953
+ "grad_norm": 0.0478515625,
2954
+ "learning_rate": 1.7052419553048965e-05,
2955
+ "loss": 0.8594,
2956
+ "step": 421
2957
+ },
2958
+ {
2959
+ "epoch": 0.27,
2960
+ "grad_norm": 0.041015625,
2961
+ "learning_rate": 1.703746571608337e-05,
2962
+ "loss": 0.7774,
2963
+ "step": 422
2964
+ },
2965
+ {
2966
+ "epoch": 0.28,
2967
+ "grad_norm": 0.04638671875,
2968
+ "learning_rate": 1.7022480635552243e-05,
2969
+ "loss": 0.8357,
2970
+ "step": 423
2971
+ },
2972
+ {
2973
+ "epoch": 0.28,
2974
+ "grad_norm": 0.044189453125,
2975
+ "learning_rate": 1.700746437798342e-05,
2976
+ "loss": 0.8365,
2977
+ "step": 424
2978
+ },
2979
+ {
2980
+ "epoch": 0.28,
2981
+ "grad_norm": 0.0517578125,
2982
+ "learning_rate": 1.6992417010043144e-05,
2983
+ "loss": 0.7916,
2984
+ "step": 425
2985
+ },
2986
+ {
2987
+ "epoch": 0.28,
2988
+ "grad_norm": 0.0439453125,
2989
+ "learning_rate": 1.6977338598535776e-05,
2990
+ "loss": 0.886,
2991
+ "step": 426
2992
+ },
2993
+ {
2994
+ "epoch": 0.28,
2995
+ "grad_norm": 0.04638671875,
2996
+ "learning_rate": 1.696222921040351e-05,
2997
+ "loss": 0.8391,
2998
+ "step": 427
2999
+ },
3000
+ {
3001
+ "epoch": 0.28,
3002
+ "grad_norm": 0.045166015625,
3003
+ "learning_rate": 1.6947088912726054e-05,
3004
+ "loss": 0.8403,
3005
+ "step": 428
3006
+ },
3007
+ {
3008
+ "epoch": 0.28,
3009
+ "grad_norm": 0.046875,
3010
+ "learning_rate": 1.693191777272034e-05,
3011
+ "loss": 0.8048,
3012
+ "step": 429
3013
+ },
3014
+ {
3015
+ "epoch": 0.28,
3016
+ "grad_norm": 0.049072265625,
3017
+ "learning_rate": 1.6916715857740234e-05,
3018
+ "loss": 0.7742,
3019
+ "step": 430
3020
+ },
3021
+ {
3022
+ "epoch": 0.28,
3023
+ "grad_norm": 0.045166015625,
3024
+ "learning_rate": 1.690148323527623e-05,
3025
+ "loss": 0.7859,
3026
+ "step": 431
3027
+ },
3028
+ {
3029
+ "epoch": 0.28,
3030
+ "grad_norm": 0.045166015625,
3031
+ "learning_rate": 1.688621997295515e-05,
3032
+ "loss": 0.7956,
3033
+ "step": 432
3034
+ },
3035
+ {
3036
+ "epoch": 0.28,
3037
+ "grad_norm": 0.04443359375,
3038
+ "learning_rate": 1.6870926138539837e-05,
3039
+ "loss": 0.8672,
3040
+ "step": 433
3041
+ },
3042
+ {
3043
+ "epoch": 0.28,
3044
+ "grad_norm": 0.060546875,
3045
+ "learning_rate": 1.6855601799928877e-05,
3046
+ "loss": 0.848,
3047
+ "step": 434
3048
+ },
3049
+ {
3050
+ "epoch": 0.28,
3051
+ "grad_norm": 0.046875,
3052
+ "learning_rate": 1.6840247025156272e-05,
3053
+ "loss": 0.8125,
3054
+ "step": 435
3055
+ },
3056
+ {
3057
+ "epoch": 0.28,
3058
+ "grad_norm": 0.052001953125,
3059
+ "learning_rate": 1.6824861882391154e-05,
3060
+ "loss": 0.8359,
3061
+ "step": 436
3062
+ },
3063
+ {
3064
+ "epoch": 0.28,
3065
+ "grad_norm": 0.048583984375,
3066
+ "learning_rate": 1.6809446439937472e-05,
3067
+ "loss": 0.877,
3068
+ "step": 437
3069
+ },
3070
+ {
3071
+ "epoch": 0.28,
3072
+ "grad_norm": 0.04833984375,
3073
+ "learning_rate": 1.6794000766233697e-05,
3074
+ "loss": 0.8408,
3075
+ "step": 438
3076
+ },
3077
+ {
3078
+ "epoch": 0.29,
3079
+ "grad_norm": 0.04833984375,
3080
+ "learning_rate": 1.6778524929852513e-05,
3081
+ "loss": 0.8381,
3082
+ "step": 439
3083
+ },
3084
+ {
3085
+ "epoch": 0.29,
3086
+ "grad_norm": 0.052490234375,
3087
+ "learning_rate": 1.676301899950052e-05,
3088
+ "loss": 0.782,
3089
+ "step": 440
3090
+ },
3091
+ {
3092
+ "epoch": 0.29,
3093
+ "grad_norm": 0.04345703125,
3094
+ "learning_rate": 1.674748304401791e-05,
3095
+ "loss": 0.8621,
3096
+ "step": 441
3097
+ },
3098
+ {
3099
+ "epoch": 0.29,
3100
+ "grad_norm": 0.04345703125,
3101
+ "learning_rate": 1.673191713237819e-05,
3102
+ "loss": 0.8012,
3103
+ "step": 442
3104
+ },
3105
+ {
3106
+ "epoch": 0.29,
3107
+ "grad_norm": 0.05322265625,
3108
+ "learning_rate": 1.671632133368785e-05,
3109
+ "loss": 0.8245,
3110
+ "step": 443
3111
+ },
3112
+ {
3113
+ "epoch": 0.29,
3114
+ "grad_norm": 0.0478515625,
3115
+ "learning_rate": 1.670069571718607e-05,
3116
+ "loss": 0.7882,
3117
+ "step": 444
3118
+ },
3119
+ {
3120
+ "epoch": 0.29,
3121
+ "grad_norm": 0.045166015625,
3122
+ "learning_rate": 1.6685040352244414e-05,
3123
+ "loss": 0.8387,
3124
+ "step": 445
3125
+ },
3126
+ {
3127
+ "epoch": 0.29,
3128
+ "grad_norm": 0.055419921875,
3129
+ "learning_rate": 1.666935530836651e-05,
3130
+ "loss": 0.7766,
3131
+ "step": 446
3132
+ },
3133
+ {
3134
+ "epoch": 0.29,
3135
+ "grad_norm": 0.04931640625,
3136
+ "learning_rate": 1.665364065518775e-05,
3137
+ "loss": 0.8204,
3138
+ "step": 447
3139
+ },
3140
+ {
3141
+ "epoch": 0.29,
3142
+ "grad_norm": 0.046630859375,
3143
+ "learning_rate": 1.6637896462474986e-05,
3144
+ "loss": 0.8133,
3145
+ "step": 448
3146
+ },
3147
+ {
3148
+ "epoch": 0.29,
3149
+ "grad_norm": 0.048828125,
3150
+ "learning_rate": 1.662212280012621e-05,
3151
+ "loss": 0.85,
3152
+ "step": 449
3153
+ },
3154
+ {
3155
+ "epoch": 0.29,
3156
+ "grad_norm": 0.045654296875,
3157
+ "learning_rate": 1.660631973817024e-05,
3158
+ "loss": 0.8247,
3159
+ "step": 450
3160
+ },
3161
+ {
3162
+ "epoch": 0.29,
3163
+ "grad_norm": 0.048583984375,
3164
+ "learning_rate": 1.6590487346766426e-05,
3165
+ "loss": 0.8977,
3166
+ "step": 451
3167
+ },
3168
+ {
3169
+ "epoch": 0.29,
3170
+ "grad_norm": 0.05712890625,
3171
+ "learning_rate": 1.657462569620433e-05,
3172
+ "loss": 0.8456,
3173
+ "step": 452
3174
+ },
3175
+ {
3176
+ "epoch": 0.29,
3177
+ "grad_norm": 0.048583984375,
3178
+ "learning_rate": 1.6558734856903406e-05,
3179
+ "loss": 0.8369,
3180
+ "step": 453
3181
+ },
3182
+ {
3183
+ "epoch": 0.3,
3184
+ "grad_norm": 0.04638671875,
3185
+ "learning_rate": 1.6542814899412694e-05,
3186
+ "loss": 0.8055,
3187
+ "step": 454
3188
+ },
3189
+ {
3190
+ "epoch": 0.3,
3191
+ "grad_norm": 0.04443359375,
3192
+ "learning_rate": 1.6526865894410526e-05,
3193
+ "loss": 0.8358,
3194
+ "step": 455
3195
+ },
3196
+ {
3197
+ "epoch": 0.3,
3198
+ "grad_norm": 0.046630859375,
3199
+ "learning_rate": 1.651088791270416e-05,
3200
+ "loss": 0.8094,
3201
+ "step": 456
3202
+ },
3203
+ {
3204
+ "epoch": 0.3,
3205
+ "grad_norm": 0.04833984375,
3206
+ "learning_rate": 1.6494881025229535e-05,
3207
+ "loss": 0.8518,
3208
+ "step": 457
3209
+ },
3210
+ {
3211
+ "epoch": 0.3,
3212
+ "grad_norm": 0.048583984375,
3213
+ "learning_rate": 1.647884530305089e-05,
3214
+ "loss": 0.9644,
3215
+ "step": 458
3216
+ },
3217
+ {
3218
+ "epoch": 0.3,
3219
+ "grad_norm": 0.046630859375,
3220
+ "learning_rate": 1.6462780817360502e-05,
3221
+ "loss": 0.8415,
3222
+ "step": 459
3223
+ },
3224
+ {
3225
+ "epoch": 0.3,
3226
+ "grad_norm": 0.050537109375,
3227
+ "learning_rate": 1.644668763947833e-05,
3228
+ "loss": 0.8764,
3229
+ "step": 460
3230
+ },
3231
+ {
3232
+ "epoch": 0.3,
3233
+ "grad_norm": 0.044921875,
3234
+ "learning_rate": 1.6430565840851723e-05,
3235
+ "loss": 0.7737,
3236
+ "step": 461
3237
+ },
3238
+ {
3239
+ "epoch": 0.3,
3240
+ "grad_norm": 0.047607421875,
3241
+ "learning_rate": 1.641441549305509e-05,
3242
+ "loss": 0.7559,
3243
+ "step": 462
3244
+ }
3245
+ ],
3246
+ "logging_steps": 1,
3247
+ "max_steps": 1538,
3248
+ "num_input_tokens_seen": 0,
3249
+ "num_train_epochs": 1,
3250
+ "save_steps": 462,
3251
+ "total_flos": 1.2674443629129892e+18,
3252
+ "train_batch_size": 16,
3253
+ "trial_name": null,
3254
+ "trial_params": null
3255
+ }
checkpoint-462/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eafc7d0eb7f055ddd6fbba193c9843badab904036b02ccac0ca4e09c80561ff
3
+ size 4920
checkpoint-462/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-924/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: bigcode/starcoder2-7b
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.7.1
checkpoint-924/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "bigcode/starcoder2-7b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.05,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 32,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "q_proj",
23
+ "o_proj",
24
+ "up_proj",
25
+ "down_proj",
26
+ "v_proj",
27
+ "gate_proj",
28
+ "k_proj"
29
+ ],
30
+ "task_type": "CAUSAL_LM"
31
+ }
checkpoint-924/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78b63b59ed9ed000f56690a521b5eff91964a1e2362220712081edccd009b6ac
3
+ size 58754872
checkpoint-924/added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<|im_end|>": 49153,
3
+ "<|im_start|>": 49152
4
+ }
checkpoint-924/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-924/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d492a3e562b0c6646d72e52b0b4b4d82799fb73f90995069f423a5edd30a000
3
+ size 117659642
checkpoint-924/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75612fb77dcc76b6de4ba90bbf652bf828e8f8a1187cae4af3104be6e5fd40b2
3
+ size 14244
checkpoint-924/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a00867a3d51c43ebb23804d536dbd898b85106a0f372bc48dd9bf872a8df9e05
3
+ size 1064
checkpoint-924/special_tokens_map.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|im_start|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|im_end|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": "<|im_start|>",
19
+ "eos_token": "<|im_end|>",
20
+ "pad_token": "<|im_end|>",
21
+ "unk_token": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ }
28
+ }
checkpoint-924/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-924/tokenizer_config.json ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<fim_prefix>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<fim_middle>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<fim_suffix>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<fim_pad>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<repo_name>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<file_sep>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<issue_start>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_comment>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_closed>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<jupyter_start>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_text>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_code>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_output>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_script>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<empty_output>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<code_to_intermediate>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "17": {
141
+ "content": "<intermediate_to_code>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "18": {
149
+ "content": "<pr>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "19": {
157
+ "content": "<pr_status>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "20": {
165
+ "content": "<pr_is_merged>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "21": {
173
+ "content": "<pr_base>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "22": {
181
+ "content": "<pr_file>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "23": {
189
+ "content": "<pr_base_code>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "24": {
197
+ "content": "<pr_diff>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "25": {
205
+ "content": "<pr_diff_hunk>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "26": {
213
+ "content": "<pr_comment>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "27": {
221
+ "content": "<pr_event_id>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "28": {
229
+ "content": "<pr_review>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "29": {
237
+ "content": "<pr_review_state>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "30": {
245
+ "content": "<pr_review_comment>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "31": {
253
+ "content": "<pr_in_reply_to_review_id>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "32": {
261
+ "content": "<pr_in_reply_to_comment_id>",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "33": {
269
+ "content": "<pr_diff_hunk_comment_line>",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "34": {
277
+ "content": "<NAME>",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "35": {
285
+ "content": "<EMAIL>",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "36": {
293
+ "content": "<KEY>",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "37": {
301
+ "content": "<PASSWORD>",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "49152": {
309
+ "content": "<|im_start|>",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "49153": {
317
+ "content": "<|im_end|>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ }
324
+ },
325
+ "additional_special_tokens": [
326
+ "<|im_start|>",
327
+ "<|im_end|>"
328
+ ],
329
+ "bos_token": "<|im_start|>",
330
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
331
+ "clean_up_tokenization_spaces": true,
332
+ "eos_token": "<|im_end|>",
333
+ "model_max_length": 1000000000000000019884624838656,
334
+ "pad_token": "<|im_end|>",
335
+ "tokenizer_class": "GPT2Tokenizer",
336
+ "unk_token": "<|endoftext|>",
337
+ "vocab_size": 49152
338
+ }
checkpoint-924/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-924/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eafc7d0eb7f055ddd6fbba193c9843badab904036b02ccac0ca4e09c80561ff
3
+ size 4920
checkpoint-924/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|im_start|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|im_end|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": "<|im_start|>",
19
+ "eos_token": "<|im_end|>",
20
+ "pad_token": "<|im_end|>",
21
+ "unk_token": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ }
28
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<fim_prefix>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<fim_middle>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<fim_suffix>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<fim_pad>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<repo_name>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<file_sep>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<issue_start>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_comment>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_closed>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<jupyter_start>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_text>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_code>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_output>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_script>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<empty_output>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<code_to_intermediate>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "17": {
141
+ "content": "<intermediate_to_code>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "18": {
149
+ "content": "<pr>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "19": {
157
+ "content": "<pr_status>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "20": {
165
+ "content": "<pr_is_merged>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "21": {
173
+ "content": "<pr_base>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "22": {
181
+ "content": "<pr_file>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "23": {
189
+ "content": "<pr_base_code>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "24": {
197
+ "content": "<pr_diff>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "25": {
205
+ "content": "<pr_diff_hunk>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "26": {
213
+ "content": "<pr_comment>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "27": {
221
+ "content": "<pr_event_id>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "28": {
229
+ "content": "<pr_review>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "29": {
237
+ "content": "<pr_review_state>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "30": {
245
+ "content": "<pr_review_comment>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "31": {
253
+ "content": "<pr_in_reply_to_review_id>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "32": {
261
+ "content": "<pr_in_reply_to_comment_id>",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "33": {
269
+ "content": "<pr_diff_hunk_comment_line>",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "34": {
277
+ "content": "<NAME>",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "35": {
285
+ "content": "<EMAIL>",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "36": {
293
+ "content": "<KEY>",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "37": {
301
+ "content": "<PASSWORD>",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "49152": {
309
+ "content": "<|im_start|>",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "49153": {
317
+ "content": "<|im_end|>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ }
324
+ },
325
+ "additional_special_tokens": [
326
+ "<|im_start|>",
327
+ "<|im_end|>"
328
+ ],
329
+ "bos_token": "<|im_start|>",
330
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
331
+ "clean_up_tokenization_spaces": true,
332
+ "eos_token": "<|im_end|>",
333
+ "model_max_length": 1000000000000000019884624838656,
334
+ "pad_token": "<|im_end|>",
335
+ "tokenizer_class": "GPT2Tokenizer",
336
+ "unk_token": "<|endoftext|>",
337
+ "vocab_size": 49152
338
+ }