goldsounds committed on
Commit
7111b87
1 Parent(s): 0fb5b8d

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +248 -0
  2. adapter_config.json +32 -0
  3. adapter_model.safetensors +3 -0
  4. all_results.json +13 -0
  5. checkpoint-1000/README.md +204 -0
  6. checkpoint-1000/adapter_config.json +32 -0
  7. checkpoint-1000/adapter_model.safetensors +3 -0
  8. checkpoint-1000/merges.txt +0 -0
  9. checkpoint-1000/optimizer.pt +3 -0
  10. checkpoint-1000/rng_state.pth +3 -0
  11. checkpoint-1000/scheduler.pt +3 -0
  12. checkpoint-1000/special_tokens_map.json +64 -0
  13. checkpoint-1000/tokenizer_config.json +361 -0
  14. checkpoint-1000/trainer_state.json +1501 -0
  15. checkpoint-1000/training_args.bin +3 -0
  16. checkpoint-1000/vocab.json +0 -0
  17. checkpoint-10000/README.md +204 -0
  18. checkpoint-10000/adapter_config.json +32 -0
  19. checkpoint-10000/adapter_model.safetensors +3 -0
  20. checkpoint-10000/merges.txt +0 -0
  21. checkpoint-10000/optimizer.pt +3 -0
  22. checkpoint-10000/rng_state.pth +3 -0
  23. checkpoint-10000/scheduler.pt +3 -0
  24. checkpoint-10000/special_tokens_map.json +64 -0
  25. checkpoint-10000/tokenizer_config.json +361 -0
  26. checkpoint-10000/trainer_state.json +0 -0
  27. checkpoint-10000/training_args.bin +3 -0
  28. checkpoint-10000/vocab.json +0 -0
  29. checkpoint-10500/README.md +204 -0
  30. checkpoint-10500/adapter_config.json +32 -0
  31. checkpoint-10500/adapter_model.safetensors +3 -0
  32. checkpoint-10500/merges.txt +0 -0
  33. checkpoint-10500/optimizer.pt +3 -0
  34. checkpoint-10500/rng_state.pth +3 -0
  35. checkpoint-10500/scheduler.pt +3 -0
  36. checkpoint-10500/special_tokens_map.json +64 -0
  37. checkpoint-10500/tokenizer_config.json +361 -0
  38. checkpoint-10500/trainer_state.json +0 -0
  39. checkpoint-10500/training_args.bin +3 -0
  40. checkpoint-10500/vocab.json +0 -0
  41. checkpoint-11000/README.md +204 -0
  42. checkpoint-11000/adapter_config.json +32 -0
  43. checkpoint-11000/adapter_model.safetensors +3 -0
  44. checkpoint-11000/merges.txt +0 -0
  45. checkpoint-11000/optimizer.pt +3 -0
  46. checkpoint-11000/rng_state.pth +3 -0
  47. checkpoint-11000/scheduler.pt +3 -0
  48. checkpoint-11000/special_tokens_map.json +64 -0
  49. checkpoint-11000/tokenizer_config.json +361 -0
  50. checkpoint-11000/trainer_state.json +0 -0
README.md ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ library_name: peft
4
+ tags:
5
+ - llama-factory
6
+ - lora
7
+ - generated_from_trainer
8
+ base_model: bigcode/starcoder2-7b
9
+ model-index:
10
+ - name: sft
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # sft
18
+
19
+ This model is a fine-tuned version of [bigcode/starcoder2-7b](https://huggingface.co/bigcode/starcoder2-7b) on the starcoder_jetpack dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.6761
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 5e-05
41
+ - train_batch_size: 1
42
+ - eval_batch_size: 1
43
+ - seed: 42
44
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
+ - lr_scheduler_type: cosine
46
+ - num_epochs: 3.0
47
+ - mixed_precision_training: Native AMP
48
+
49
+ ### Training results
50
+
51
+ | Training Loss | Epoch | Step | Validation Loss |
52
+ |:-------------:|:-----:|:-----:|:---------------:|
53
+ | 1.2469 | 0.02 | 100 | 0.8479 |
54
+ | 1.3584 | 0.03 | 200 | 0.8368 |
55
+ | 1.0651 | 0.05 | 300 | 0.8281 |
56
+ | 0.9209 | 0.06 | 400 | 0.8185 |
57
+ | 0.8306 | 0.08 | 500 | 0.8132 |
58
+ | 0.9175 | 0.1 | 600 | 0.8064 |
59
+ | 0.8157 | 0.11 | 700 | 0.8023 |
60
+ | 0.9469 | 0.13 | 800 | 0.7996 |
61
+ | 0.8872 | 0.14 | 900 | 0.7955 |
62
+ | 0.8842 | 0.16 | 1000 | 0.7913 |
63
+ | 0.7909 | 0.18 | 1100 | 0.7863 |
64
+ | 0.8196 | 0.19 | 1200 | 0.7844 |
65
+ | 0.9341 | 0.21 | 1300 | 0.7825 |
66
+ | 0.8801 | 0.22 | 1400 | 0.7787 |
67
+ | 0.9109 | 0.24 | 1500 | 0.7777 |
68
+ | 0.8964 | 0.26 | 1600 | 0.7759 |
69
+ | 0.9265 | 0.27 | 1700 | 0.7742 |
70
+ | 0.8632 | 0.29 | 1800 | 0.7699 |
71
+ | 1.0081 | 0.3 | 1900 | 0.7693 |
72
+ | 0.7651 | 0.32 | 2000 | 0.7664 |
73
+ | 1.0037 | 0.34 | 2100 | 0.7654 |
74
+ | 0.8768 | 0.35 | 2200 | 0.7642 |
75
+ | 0.8052 | 0.37 | 2300 | 0.7618 |
76
+ | 0.7271 | 0.38 | 2400 | 0.7595 |
77
+ | 0.9615 | 0.4 | 2500 | 0.7582 |
78
+ | 0.8284 | 0.42 | 2600 | 0.7555 |
79
+ | 0.8631 | 0.43 | 2700 | 0.7540 |
80
+ | 1.0507 | 0.45 | 2800 | 0.7518 |
81
+ | 0.8247 | 0.46 | 2900 | 0.7512 |
82
+ | 0.9835 | 0.48 | 3000 | 0.7496 |
83
+ | 0.8407 | 0.49 | 3100 | 0.7496 |
84
+ | 0.7417 | 0.51 | 3200 | 0.7467 |
85
+ | 0.7449 | 0.53 | 3300 | 0.7473 |
86
+ | 0.8562 | 0.54 | 3400 | 0.7437 |
87
+ | 0.9222 | 0.56 | 3500 | 0.7429 |
88
+ | 0.9242 | 0.57 | 3600 | 0.7413 |
89
+ | 0.8092 | 0.59 | 3700 | 0.7403 |
90
+ | 0.7279 | 0.61 | 3800 | 0.7394 |
91
+ | 0.7774 | 0.62 | 3900 | 0.7385 |
92
+ | 0.8942 | 0.64 | 4000 | 0.7364 |
93
+ | 0.9286 | 0.65 | 4100 | 0.7348 |
94
+ | 0.7703 | 0.67 | 4200 | 0.7354 |
95
+ | 0.8322 | 0.69 | 4300 | 0.7330 |
96
+ | 0.9851 | 0.7 | 4400 | 0.7324 |
97
+ | 0.8712 | 0.72 | 4500 | 0.7317 |
98
+ | 0.7871 | 0.73 | 4600 | 0.7310 |
99
+ | 0.7156 | 0.75 | 4700 | 0.7284 |
100
+ | 0.7856 | 0.77 | 4800 | 0.7277 |
101
+ | 0.7906 | 0.78 | 4900 | 0.7255 |
102
+ | 0.7917 | 0.8 | 5000 | 0.7250 |
103
+ | 0.6395 | 0.81 | 5100 | 0.7237 |
104
+ | 0.7567 | 0.83 | 5200 | 0.7232 |
105
+ | 0.8551 | 0.85 | 5300 | 0.7220 |
106
+ | 0.7392 | 0.86 | 5400 | 0.7226 |
107
+ | 0.9312 | 0.88 | 5500 | 0.7205 |
108
+ | 0.8323 | 0.89 | 5600 | 0.7196 |
109
+ | 0.7312 | 0.91 | 5700 | 0.7197 |
110
+ | 1.0 | 0.93 | 5800 | 0.7182 |
111
+ | 0.6164 | 0.94 | 5900 | 0.7177 |
112
+ | 0.7484 | 0.96 | 6000 | 0.7147 |
113
+ | 0.7924 | 0.97 | 6100 | 0.7144 |
114
+ | 0.9389 | 0.99 | 6200 | 0.7145 |
115
+ | 0.7108 | 1.01 | 6300 | 0.7136 |
116
+ | 0.8076 | 1.02 | 6400 | 0.7154 |
117
+ | 0.7232 | 1.04 | 6500 | 0.7147 |
118
+ | 0.6456 | 1.05 | 6600 | 0.7122 |
119
+ | 0.5862 | 1.07 | 6700 | 0.7133 |
120
+ | 0.6935 | 1.09 | 6800 | 0.7112 |
121
+ | 0.7522 | 1.1 | 6900 | 0.7103 |
122
+ | 1.0525 | 1.12 | 7000 | 0.7106 |
123
+ | 0.8285 | 1.13 | 7100 | 0.7099 |
124
+ | 0.6116 | 1.15 | 7200 | 0.7079 |
125
+ | 0.5617 | 1.17 | 7300 | 0.7087 |
126
+ | 0.6514 | 1.18 | 7400 | 0.7072 |
127
+ | 0.6729 | 1.2 | 7500 | 0.7052 |
128
+ | 0.6401 | 1.21 | 7600 | 0.7055 |
129
+ | 0.8089 | 1.23 | 7700 | 0.7052 |
130
+ | 0.8166 | 1.25 | 7800 | 0.7041 |
131
+ | 0.8685 | 1.26 | 7900 | 0.7026 |
132
+ | 0.6945 | 1.28 | 8000 | 0.7043 |
133
+ | 0.6955 | 1.29 | 8100 | 0.7010 |
134
+ | 0.734 | 1.31 | 8200 | 0.7022 |
135
+ | 0.5586 | 1.32 | 8300 | 0.7017 |
136
+ | 0.7299 | 1.34 | 8400 | 0.6999 |
137
+ | 1.089 | 1.36 | 8500 | 0.6994 |
138
+ | 0.5733 | 1.37 | 8600 | 0.6994 |
139
+ | 0.5409 | 1.39 | 8700 | 0.6987 |
140
+ | 0.8848 | 1.4 | 8800 | 0.6976 |
141
+ | 0.5739 | 1.42 | 8900 | 0.6971 |
142
+ | 0.728 | 1.44 | 9000 | 0.6963 |
143
+ | 0.7503 | 1.45 | 9100 | 0.6953 |
144
+ | 0.706 | 1.47 | 9200 | 0.6951 |
145
+ | 0.7926 | 1.48 | 9300 | 0.6945 |
146
+ | 0.6019 | 1.5 | 9400 | 0.6945 |
147
+ | 0.6707 | 1.52 | 9500 | 0.6943 |
148
+ | 0.7269 | 1.53 | 9600 | 0.6940 |
149
+ | 0.7216 | 1.55 | 9700 | 0.6923 |
150
+ | 0.6394 | 1.56 | 9800 | 0.6920 |
151
+ | 0.7608 | 1.58 | 9900 | 0.6909 |
152
+ | 1.034 | 1.6 | 10000 | 0.6908 |
153
+ | 0.7934 | 1.61 | 10100 | 0.6892 |
154
+ | 0.627 | 1.63 | 10200 | 0.6902 |
155
+ | 0.5849 | 1.64 | 10300 | 0.6897 |
156
+ | 0.7257 | 1.66 | 10400 | 0.6889 |
157
+ | 0.8931 | 1.68 | 10500 | 0.6890 |
158
+ | 0.6831 | 1.69 | 10600 | 0.6875 |
159
+ | 0.4995 | 1.71 | 10700 | 0.6879 |
160
+ | 0.757 | 1.72 | 10800 | 0.6873 |
161
+ | 0.4664 | 1.74 | 10900 | 0.6876 |
162
+ | 0.78 | 1.76 | 11000 | 0.6865 |
163
+ | 0.5564 | 1.77 | 11100 | 0.6865 |
164
+ | 0.7858 | 1.79 | 11200 | 0.6858 |
165
+ | 0.6989 | 1.8 | 11300 | 0.6851 |
166
+ | 0.705 | 1.82 | 11400 | 0.6841 |
167
+ | 0.5795 | 1.84 | 11500 | 0.6842 |
168
+ | 0.6989 | 1.85 | 11600 | 0.6837 |
169
+ | 0.6877 | 1.87 | 11700 | 0.6838 |
170
+ | 0.6484 | 1.88 | 11800 | 0.6835 |
171
+ | 0.8525 | 1.9 | 11900 | 0.6832 |
172
+ | 0.7547 | 1.92 | 12000 | 0.6823 |
173
+ | 0.8118 | 1.93 | 12100 | 0.6819 |
174
+ | 0.8859 | 1.95 | 12200 | 0.6823 |
175
+ | 0.738 | 1.96 | 12300 | 0.6811 |
176
+ | 0.7051 | 1.98 | 12400 | 0.6816 |
177
+ | 0.5598 | 2.0 | 12500 | 0.6802 |
178
+ | 0.6194 | 2.01 | 12600 | 0.6812 |
179
+ | 0.7101 | 2.03 | 12700 | 0.6817 |
180
+ | 0.7027 | 2.04 | 12800 | 0.6815 |
181
+ | 0.9432 | 2.06 | 12900 | 0.6810 |
182
+ | 0.5931 | 2.08 | 13000 | 0.6817 |
183
+ | 0.5412 | 2.09 | 13100 | 0.6810 |
184
+ | 0.6237 | 2.11 | 13200 | 0.6815 |
185
+ | 0.5871 | 2.12 | 13300 | 0.6812 |
186
+ | 0.8331 | 2.14 | 13400 | 0.6817 |
187
+ | 0.4528 | 2.15 | 13500 | 0.6812 |
188
+ | 0.6292 | 2.17 | 13600 | 0.6814 |
189
+ | 0.6219 | 2.19 | 13700 | 0.6800 |
190
+ | 0.6281 | 2.2 | 13800 | 0.6798 |
191
+ | 0.6949 | 2.22 | 13900 | 0.6803 |
192
+ | 0.6701 | 2.23 | 14000 | 0.6791 |
193
+ | 0.6467 | 2.25 | 14100 | 0.6795 |
194
+ | 0.6579 | 2.27 | 14200 | 0.6800 |
195
+ | 0.5978 | 2.28 | 14300 | 0.6802 |
196
+ | 0.7032 | 2.3 | 14400 | 0.6793 |
197
+ | 0.6347 | 2.31 | 14500 | 0.6787 |
198
+ | 0.9034 | 2.33 | 14600 | 0.6788 |
199
+ | 0.6166 | 2.35 | 14700 | 0.6781 |
200
+ | 0.7327 | 2.36 | 14800 | 0.6786 |
201
+ | 0.7159 | 2.38 | 14900 | 0.6777 |
202
+ | 0.6283 | 2.39 | 15000 | 0.6779 |
203
+ | 0.6113 | 2.41 | 15100 | 0.6776 |
204
+ | 0.5951 | 2.43 | 15200 | 0.6781 |
205
+ | 0.6747 | 2.44 | 15300 | 0.6777 |
206
+ | 0.5935 | 2.46 | 15400 | 0.6779 |
207
+ | 0.6435 | 2.47 | 15500 | 0.6776 |
208
+ | 0.637 | 2.49 | 15600 | 0.6772 |
209
+ | 0.4617 | 2.51 | 15700 | 0.6774 |
210
+ | 0.7937 | 2.52 | 15800 | 0.6771 |
211
+ | 0.7187 | 2.54 | 15900 | 0.6768 |
212
+ | 0.657 | 2.55 | 16000 | 0.6767 |
213
+ | 0.8606 | 2.57 | 16100 | 0.6767 |
214
+ | 0.4392 | 2.59 | 16200 | 0.6768 |
215
+ | 0.5675 | 2.6 | 16300 | 0.6769 |
216
+ | 0.6454 | 2.62 | 16400 | 0.6768 |
217
+ | 0.5787 | 2.63 | 16500 | 0.6767 |
218
+ | 0.6111 | 2.65 | 16600 | 0.6766 |
219
+ | 0.6106 | 2.67 | 16700 | 0.6767 |
220
+ | 0.5947 | 2.68 | 16800 | 0.6763 |
221
+ | 0.5576 | 2.7 | 16900 | 0.6763 |
222
+ | 0.659 | 2.71 | 17000 | 0.6762 |
223
+ | 0.787 | 2.73 | 17100 | 0.6761 |
224
+ | 0.5503 | 2.75 | 17200 | 0.6760 |
225
+ | 0.5558 | 2.76 | 17300 | 0.6760 |
226
+ | 0.6324 | 2.78 | 17400 | 0.6761 |
227
+ | 0.5846 | 2.79 | 17500 | 0.6761 |
228
+ | 0.9542 | 2.81 | 17600 | 0.6760 |
229
+ | 0.5755 | 2.83 | 17700 | 0.6761 |
230
+ | 0.7841 | 2.84 | 17800 | 0.6761 |
231
+ | 0.5662 | 2.86 | 17900 | 0.6761 |
232
+ | 0.8085 | 2.87 | 18000 | 0.6761 |
233
+ | 0.7389 | 2.89 | 18100 | 0.6761 |
234
+ | 0.736 | 2.91 | 18200 | 0.6761 |
235
+ | 0.5604 | 2.92 | 18300 | 0.6761 |
236
+ | 0.6156 | 2.94 | 18400 | 0.6761 |
237
+ | 0.5473 | 2.95 | 18500 | 0.6761 |
238
+ | 0.7286 | 2.97 | 18600 | 0.6761 |
239
+ | 0.5932 | 2.98 | 18700 | 0.6761 |
240
+
241
+
242
+ ### Framework versions
243
+
244
+ - PEFT 0.9.0
245
+ - Transformers 4.39.0.dev0
246
+ - Pytorch 2.1.0+cu121
247
+ - Datasets 2.18.0
248
+ - Tokenizers 0.15.2
adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "bigcode/starcoder2-7b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 64,
13
+ "lora_dropout": 0.15,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 32,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "c_fc",
23
+ "o_proj",
24
+ "q_proj",
25
+ "v_proj",
26
+ "k_proj",
27
+ "c_proj"
28
+ ],
29
+ "task_type": "CAUSAL_LM",
30
+ "use_dora": false,
31
+ "use_rslora": false
32
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc16c87cc4e55e8b52d845df95957993632cd645300dde18af970b85430a49d3
3
+ size 306235552
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_loss": 0.6760696172714233,
4
+ "eval_runtime": 96.721,
5
+ "eval_samples_per_second": 7.206,
6
+ "eval_steps_per_second": 7.206,
7
+ "perplexity": 1.9661348634697817,
8
+ "total_flos": 8.111041940658586e+17,
9
+ "train_loss": 0.7353765367668783,
10
+ "train_runtime": 28019.6609,
11
+ "train_samples_per_second": 0.671,
12
+ "train_steps_per_second": 0.671
13
+ }
checkpoint-1000/README.md ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: bigcode/starcoder2-7b
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+
201
+
202
+ ### Framework versions
203
+
204
+ - PEFT 0.9.0
checkpoint-1000/adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "bigcode/starcoder2-7b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 64,
13
+ "lora_dropout": 0.15,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 32,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "c_fc",
23
+ "o_proj",
24
+ "q_proj",
25
+ "v_proj",
26
+ "k_proj",
27
+ "c_proj"
28
+ ],
29
+ "task_type": "CAUSAL_LM",
30
+ "use_dora": false,
31
+ "use_rslora": false
32
+ }
checkpoint-1000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cf012786c99967b1faabcd02528ccb46bd8e1dda55139ee8c61333a072a9f7c
3
+ size 306235552
checkpoint-1000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:009f146f4255b0a1b95a91fd2fb1bf70dc089a13ad98bd2f64673ea1217bfca0
3
+ size 612692114
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84fe6feed7b7aac04ce2bfaadc40072bafdaf974be4de840016f8ea7305b750a
3
+ size 14244
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b83b0d57be939973ba63fb96361823572996faa011b323939493bdbb88e03d73
3
+ size 1064
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<fim_prefix>",
5
+ "<fim_middle>",
6
+ "<fim_suffix>",
7
+ "<fim_pad>",
8
+ "<repo_name>",
9
+ "<file_sep>",
10
+ "<issue_start>",
11
+ "<issue_comment>",
12
+ "<issue_closed>",
13
+ "<jupyter_start>",
14
+ "<jupyter_text>",
15
+ "<jupyter_code>",
16
+ "<jupyter_output>",
17
+ "<jupyter_script>",
18
+ "<empty_output>",
19
+ "<code_to_intermediate>",
20
+ "<intermediate_to_code>",
21
+ "<pr>",
22
+ "<pr_status>",
23
+ "<pr_is_merged>",
24
+ "<pr_base>",
25
+ "<pr_file>",
26
+ "<pr_base_code>",
27
+ "<pr_diff>",
28
+ "<pr_diff_hunk>",
29
+ "<pr_comment>",
30
+ "<pr_event_id>",
31
+ "<pr_review>",
32
+ "<pr_review_state>",
33
+ "<pr_review_comment>",
34
+ "<pr_in_reply_to_review_id>",
35
+ "<pr_in_reply_to_comment_id>",
36
+ "<pr_diff_hunk_comment_line>",
37
+ "<NAME>",
38
+ "<EMAIL>",
39
+ "<KEY>",
40
+ "<PASSWORD>"
41
+ ],
42
+ "bos_token": {
43
+ "content": "<|endoftext|>",
44
+ "lstrip": false,
45
+ "normalized": false,
46
+ "rstrip": false,
47
+ "single_word": false
48
+ },
49
+ "eos_token": {
50
+ "content": "<|endoftext|>",
51
+ "lstrip": false,
52
+ "normalized": false,
53
+ "rstrip": false,
54
+ "single_word": false
55
+ },
56
+ "pad_token": "<|endoftext|>",
57
+ "unk_token": {
58
+ "content": "<|endoftext|>",
59
+ "lstrip": false,
60
+ "normalized": false,
61
+ "rstrip": false,
62
+ "single_word": false
63
+ }
64
+ }
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<fim_prefix>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<fim_middle>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<fim_suffix>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<fim_pad>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "5": {
46
+ "content": "<repo_name>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "6": {
54
+ "content": "<file_sep>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "7": {
62
+ "content": "<issue_start>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "8": {
70
+ "content": "<issue_comment>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "9": {
78
+ "content": "<issue_closed>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "10": {
86
+ "content": "<jupyter_start>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "11": {
94
+ "content": "<jupyter_text>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "12": {
102
+ "content": "<jupyter_code>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "13": {
110
+ "content": "<jupyter_output>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "14": {
118
+ "content": "<jupyter_script>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "15": {
126
+ "content": "<empty_output>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": true
132
+ },
133
+ "16": {
134
+ "content": "<code_to_intermediate>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": true
140
+ },
141
+ "17": {
142
+ "content": "<intermediate_to_code>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": true
148
+ },
149
+ "18": {
150
+ "content": "<pr>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": true
156
+ },
157
+ "19": {
158
+ "content": "<pr_status>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": true
164
+ },
165
+ "20": {
166
+ "content": "<pr_is_merged>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": true
172
+ },
173
+ "21": {
174
+ "content": "<pr_base>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": true
180
+ },
181
+ "22": {
182
+ "content": "<pr_file>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "23": {
190
+ "content": "<pr_base_code>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "24": {
198
+ "content": "<pr_diff>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "25": {
206
+ "content": "<pr_diff_hunk>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "26": {
214
+ "content": "<pr_comment>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "27": {
222
+ "content": "<pr_event_id>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "28": {
230
+ "content": "<pr_review>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "29": {
238
+ "content": "<pr_review_state>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "30": {
246
+ "content": "<pr_review_comment>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "31": {
254
+ "content": "<pr_in_reply_to_review_id>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "32": {
262
+ "content": "<pr_in_reply_to_comment_id>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "33": {
270
+ "content": "<pr_diff_hunk_comment_line>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "34": {
278
+ "content": "<NAME>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "35": {
286
+ "content": "<EMAIL>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "36": {
294
+ "content": "<KEY>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "37": {
302
+ "content": "<PASSWORD>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ }
309
+ },
310
+ "additional_special_tokens": [
311
+ "<|endoftext|>",
312
+ "<fim_prefix>",
313
+ "<fim_middle>",
314
+ "<fim_suffix>",
315
+ "<fim_pad>",
316
+ "<repo_name>",
317
+ "<file_sep>",
318
+ "<issue_start>",
319
+ "<issue_comment>",
320
+ "<issue_closed>",
321
+ "<jupyter_start>",
322
+ "<jupyter_text>",
323
+ "<jupyter_code>",
324
+ "<jupyter_output>",
325
+ "<jupyter_script>",
326
+ "<empty_output>",
327
+ "<code_to_intermediate>",
328
+ "<intermediate_to_code>",
329
+ "<pr>",
330
+ "<pr_status>",
331
+ "<pr_is_merged>",
332
+ "<pr_base>",
333
+ "<pr_file>",
334
+ "<pr_base_code>",
335
+ "<pr_diff>",
336
+ "<pr_diff_hunk>",
337
+ "<pr_comment>",
338
+ "<pr_event_id>",
339
+ "<pr_review>",
340
+ "<pr_review_state>",
341
+ "<pr_review_comment>",
342
+ "<pr_in_reply_to_review_id>",
343
+ "<pr_in_reply_to_comment_id>",
344
+ "<pr_diff_hunk_comment_line>",
345
+ "<NAME>",
346
+ "<EMAIL>",
347
+ "<KEY>",
348
+ "<PASSWORD>"
349
+ ],
350
+ "bos_token": "<|endoftext|>",
351
+ "clean_up_tokenization_spaces": true,
352
+ "eos_token": "<|endoftext|>",
353
+ "errors": "replace",
354
+ "model_max_length": 1000000000000000019884624838656,
355
+ "pad_token": "<|endoftext|>",
356
+ "padding_side": "right",
357
+ "split_special_tokens": false,
358
+ "tokenizer_class": "GPT2Tokenizer",
359
+ "unk_token": "<|endoftext|>",
360
+ "vocab_size": 49152
361
+ }
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,1501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7912728190422058,
3
+ "best_model_checkpoint": "saves/starcoder2-7b/lora/sft/checkpoint-1000",
4
+ "epoch": 0.1596169193934557,
5
+ "eval_steps": 100,
6
+ "global_step": 1000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "grad_norm": 0.628385603427887,
14
+ "learning_rate": 4.999999126897802e-05,
15
+ "loss": 1.2582,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.0,
20
+ "grad_norm": 1.0855119228363037,
21
+ "learning_rate": 4.999996507591817e-05,
22
+ "loss": 0.801,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.0,
27
+ "grad_norm": 1.5689586400985718,
28
+ "learning_rate": 4.9999921420838745e-05,
29
+ "loss": 1.067,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.0,
34
+ "grad_norm": 2.0851330757141113,
35
+ "learning_rate": 4.999986030377024e-05,
36
+ "loss": 1.2953,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.0,
41
+ "grad_norm": 1.397479772567749,
42
+ "learning_rate": 4.999978172475535e-05,
43
+ "loss": 0.9826,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.0,
48
+ "grad_norm": 1.344118595123291,
49
+ "learning_rate": 4.9999685683848954e-05,
50
+ "loss": 0.9485,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.01,
55
+ "grad_norm": 1.158163070678711,
56
+ "learning_rate": 4.9999596278606616e-05,
57
+ "loss": 0.8103,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.01,
62
+ "grad_norm": 1.602233648300171,
63
+ "learning_rate": 4.999946880647276e-05,
64
+ "loss": 0.8648,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.01,
69
+ "grad_norm": 1.557242751121521,
70
+ "learning_rate": 4.999932387266596e-05,
71
+ "loss": 1.0198,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 0.01,
76
+ "grad_norm": 1.36068856716156,
77
+ "learning_rate": 4.999916147728746e-05,
78
+ "loss": 0.9367,
79
+ "step": 50
80
+ },
81
+ {
82
+ "epoch": 0.01,
83
+ "grad_norm": 1.3263639211654663,
84
+ "learning_rate": 4.999898162045068e-05,
85
+ "loss": 0.9695,
86
+ "step": 55
87
+ },
88
+ {
89
+ "epoch": 0.01,
90
+ "grad_norm": 1.333601474761963,
91
+ "learning_rate": 4.999878430228126e-05,
92
+ "loss": 1.1509,
93
+ "step": 60
94
+ },
95
+ {
96
+ "epoch": 0.01,
97
+ "grad_norm": 1.4753800630569458,
98
+ "learning_rate": 4.999856952291702e-05,
99
+ "loss": 1.1461,
100
+ "step": 65
101
+ },
102
+ {
103
+ "epoch": 0.01,
104
+ "grad_norm": 1.5096240043640137,
105
+ "learning_rate": 4.9998337282507965e-05,
106
+ "loss": 1.1722,
107
+ "step": 70
108
+ },
109
+ {
110
+ "epoch": 0.01,
111
+ "grad_norm": 1.189892053604126,
112
+ "learning_rate": 4.999808758121633e-05,
113
+ "loss": 1.1834,
114
+ "step": 75
115
+ },
116
+ {
117
+ "epoch": 0.01,
118
+ "grad_norm": 0.9292634725570679,
119
+ "learning_rate": 4.999782041921651e-05,
120
+ "loss": 0.9498,
121
+ "step": 80
122
+ },
123
+ {
124
+ "epoch": 0.01,
125
+ "grad_norm": 2.1775777339935303,
126
+ "learning_rate": 4.9997535796695134e-05,
127
+ "loss": 0.9346,
128
+ "step": 85
129
+ },
130
+ {
131
+ "epoch": 0.01,
132
+ "grad_norm": 1.6854296922683716,
133
+ "learning_rate": 4.999723371385099e-05,
134
+ "loss": 1.119,
135
+ "step": 90
136
+ },
137
+ {
138
+ "epoch": 0.02,
139
+ "grad_norm": 1.4571490287780762,
140
+ "learning_rate": 4.999691417089507e-05,
141
+ "loss": 0.8671,
142
+ "step": 95
143
+ },
144
+ {
145
+ "epoch": 0.02,
146
+ "grad_norm": 1.277044653892517,
147
+ "learning_rate": 4.999657716805059e-05,
148
+ "loss": 1.2469,
149
+ "step": 100
150
+ },
151
+ {
152
+ "epoch": 0.02,
153
+ "eval_loss": 0.8478816747665405,
154
+ "eval_runtime": 96.2736,
155
+ "eval_samples_per_second": 7.24,
156
+ "eval_steps_per_second": 7.24,
157
+ "step": 100
158
+ },
159
+ {
160
+ "epoch": 0.02,
161
+ "grad_norm": 0.6687743067741394,
162
+ "learning_rate": 4.9996222705552933e-05,
163
+ "loss": 0.735,
164
+ "step": 105
165
+ },
166
+ {
167
+ "epoch": 0.02,
168
+ "grad_norm": 1.3488354682922363,
169
+ "learning_rate": 4.9995850783649665e-05,
170
+ "loss": 0.8344,
171
+ "step": 110
172
+ },
173
+ {
174
+ "epoch": 0.02,
175
+ "grad_norm": 1.1043323278427124,
176
+ "learning_rate": 4.9995461402600593e-05,
177
+ "loss": 0.8254,
178
+ "step": 115
179
+ },
180
+ {
181
+ "epoch": 0.02,
182
+ "grad_norm": 0.9382895827293396,
183
+ "learning_rate": 4.9995054562677684e-05,
184
+ "loss": 0.9179,
185
+ "step": 120
186
+ },
187
+ {
188
+ "epoch": 0.02,
189
+ "grad_norm": 1.2824612855911255,
190
+ "learning_rate": 4.9994630264165107e-05,
191
+ "loss": 0.8663,
192
+ "step": 125
193
+ },
194
+ {
195
+ "epoch": 0.02,
196
+ "grad_norm": 1.0491925477981567,
197
+ "learning_rate": 4.999418850735923e-05,
198
+ "loss": 0.9247,
199
+ "step": 130
200
+ },
201
+ {
202
+ "epoch": 0.02,
203
+ "grad_norm": 1.3642233610153198,
204
+ "learning_rate": 4.99937292925686e-05,
205
+ "loss": 0.8253,
206
+ "step": 135
207
+ },
208
+ {
209
+ "epoch": 0.02,
210
+ "grad_norm": 3.747757911682129,
211
+ "learning_rate": 4.9993252620113976e-05,
212
+ "loss": 1.0245,
213
+ "step": 140
214
+ },
215
+ {
216
+ "epoch": 0.02,
217
+ "grad_norm": 1.299494981765747,
218
+ "learning_rate": 4.999275849032832e-05,
219
+ "loss": 0.8723,
220
+ "step": 145
221
+ },
222
+ {
223
+ "epoch": 0.02,
224
+ "grad_norm": 1.7195830345153809,
225
+ "learning_rate": 4.999224690355675e-05,
226
+ "loss": 1.0524,
227
+ "step": 150
228
+ },
229
+ {
230
+ "epoch": 0.02,
231
+ "grad_norm": 0.9922987222671509,
232
+ "learning_rate": 4.9991717860156616e-05,
233
+ "loss": 0.9502,
234
+ "step": 155
235
+ },
236
+ {
237
+ "epoch": 0.03,
238
+ "grad_norm": 1.0577458143234253,
239
+ "learning_rate": 4.9991171360497437e-05,
240
+ "loss": 1.0115,
241
+ "step": 160
242
+ },
243
+ {
244
+ "epoch": 0.03,
245
+ "grad_norm": 1.0001195669174194,
246
+ "learning_rate": 4.999060740496093e-05,
247
+ "loss": 1.1999,
248
+ "step": 165
249
+ },
250
+ {
251
+ "epoch": 0.03,
252
+ "grad_norm": 1.2456804513931274,
253
+ "learning_rate": 4.999002599394102e-05,
254
+ "loss": 0.8882,
255
+ "step": 170
256
+ },
257
+ {
258
+ "epoch": 0.03,
259
+ "grad_norm": 1.0445325374603271,
260
+ "learning_rate": 4.9989427127843814e-05,
261
+ "loss": 1.0615,
262
+ "step": 175
263
+ },
264
+ {
265
+ "epoch": 0.03,
266
+ "grad_norm": 1.2410887479782104,
267
+ "learning_rate": 4.9988810807087584e-05,
268
+ "loss": 1.1068,
269
+ "step": 180
270
+ },
271
+ {
272
+ "epoch": 0.03,
273
+ "grad_norm": 0.8935971260070801,
274
+ "learning_rate": 4.998817703210285e-05,
275
+ "loss": 0.6683,
276
+ "step": 185
277
+ },
278
+ {
279
+ "epoch": 0.03,
280
+ "grad_norm": 1.1614488363265991,
281
+ "learning_rate": 4.9987525803332265e-05,
282
+ "loss": 0.7446,
283
+ "step": 190
284
+ },
285
+ {
286
+ "epoch": 0.03,
287
+ "grad_norm": 0.9392004013061523,
288
+ "learning_rate": 4.998685712123072e-05,
289
+ "loss": 0.7397,
290
+ "step": 195
291
+ },
292
+ {
293
+ "epoch": 0.03,
294
+ "grad_norm": 1.0314444303512573,
295
+ "learning_rate": 4.9986170986265266e-05,
296
+ "loss": 1.3584,
297
+ "step": 200
298
+ },
299
+ {
300
+ "epoch": 0.03,
301
+ "eval_loss": 0.8368077278137207,
302
+ "eval_runtime": 96.5262,
303
+ "eval_samples_per_second": 7.221,
304
+ "eval_steps_per_second": 7.221,
305
+ "step": 200
306
+ },
307
+ {
308
+ "epoch": 0.03,
309
+ "grad_norm": 0.8964811563491821,
310
+ "learning_rate": 4.998546739891516e-05,
311
+ "loss": 0.9546,
312
+ "step": 205
313
+ },
314
+ {
315
+ "epoch": 0.03,
316
+ "grad_norm": 1.0679796934127808,
317
+ "learning_rate": 4.998474635967185e-05,
318
+ "loss": 0.864,
319
+ "step": 210
320
+ },
321
+ {
322
+ "epoch": 0.03,
323
+ "grad_norm": 1.2340985536575317,
324
+ "learning_rate": 4.998400786903896e-05,
325
+ "loss": 0.885,
326
+ "step": 215
327
+ },
328
+ {
329
+ "epoch": 0.04,
330
+ "grad_norm": 1.7219617366790771,
331
+ "learning_rate": 4.9983251927532315e-05,
332
+ "loss": 1.1069,
333
+ "step": 220
334
+ },
335
+ {
336
+ "epoch": 0.04,
337
+ "grad_norm": 1.1480705738067627,
338
+ "learning_rate": 4.9982478535679924e-05,
339
+ "loss": 1.0416,
340
+ "step": 225
341
+ },
342
+ {
343
+ "epoch": 0.04,
344
+ "grad_norm": 1.515589714050293,
345
+ "learning_rate": 4.9981687694021996e-05,
346
+ "loss": 1.1844,
347
+ "step": 230
348
+ },
349
+ {
350
+ "epoch": 0.04,
351
+ "grad_norm": 1.6687963008880615,
352
+ "learning_rate": 4.998087940311091e-05,
353
+ "loss": 0.8664,
354
+ "step": 235
355
+ },
356
+ {
357
+ "epoch": 0.04,
358
+ "grad_norm": 1.9256645441055298,
359
+ "learning_rate": 4.998005366351125e-05,
360
+ "loss": 1.0125,
361
+ "step": 240
362
+ },
363
+ {
364
+ "epoch": 0.04,
365
+ "grad_norm": 1.2500052452087402,
366
+ "learning_rate": 4.997921047579978e-05,
367
+ "loss": 1.1374,
368
+ "step": 245
369
+ },
370
+ {
371
+ "epoch": 0.04,
372
+ "grad_norm": 1.0543216466903687,
373
+ "learning_rate": 4.9978349840565434e-05,
374
+ "loss": 0.8502,
375
+ "step": 250
376
+ },
377
+ {
378
+ "epoch": 0.04,
379
+ "grad_norm": 1.3009012937545776,
380
+ "learning_rate": 4.997747175840937e-05,
381
+ "loss": 1.0357,
382
+ "step": 255
383
+ },
384
+ {
385
+ "epoch": 0.04,
386
+ "grad_norm": 0.8456661105155945,
387
+ "learning_rate": 4.997657622994491e-05,
388
+ "loss": 0.6883,
389
+ "step": 260
390
+ },
391
+ {
392
+ "epoch": 0.04,
393
+ "grad_norm": 0.5856515765190125,
394
+ "learning_rate": 4.9975663255797555e-05,
395
+ "loss": 0.7656,
396
+ "step": 265
397
+ },
398
+ {
399
+ "epoch": 0.04,
400
+ "grad_norm": 0.973818302154541,
401
+ "learning_rate": 4.997473283660501e-05,
402
+ "loss": 0.823,
403
+ "step": 270
404
+ },
405
+ {
406
+ "epoch": 0.04,
407
+ "grad_norm": 0.9960187673568726,
408
+ "learning_rate": 4.997378497301715e-05,
409
+ "loss": 0.8726,
410
+ "step": 275
411
+ },
412
+ {
413
+ "epoch": 0.04,
414
+ "grad_norm": 1.2900679111480713,
415
+ "learning_rate": 4.997281966569604e-05,
416
+ "loss": 0.9781,
417
+ "step": 280
418
+ },
419
+ {
420
+ "epoch": 0.05,
421
+ "grad_norm": 1.828894853591919,
422
+ "learning_rate": 4.9971836915315926e-05,
423
+ "loss": 0.8932,
424
+ "step": 285
425
+ },
426
+ {
427
+ "epoch": 0.05,
428
+ "grad_norm": 1.239621877670288,
429
+ "learning_rate": 4.9970836722563256e-05,
430
+ "loss": 1.2022,
431
+ "step": 290
432
+ },
433
+ {
434
+ "epoch": 0.05,
435
+ "grad_norm": 1.0117149353027344,
436
+ "learning_rate": 4.996981908813664e-05,
437
+ "loss": 0.8032,
438
+ "step": 295
439
+ },
440
+ {
441
+ "epoch": 0.05,
442
+ "grad_norm": 0.8861119747161865,
443
+ "learning_rate": 4.996878401274687e-05,
444
+ "loss": 1.0651,
445
+ "step": 300
446
+ },
447
+ {
448
+ "epoch": 0.05,
449
+ "eval_loss": 0.8281473517417908,
450
+ "eval_runtime": 96.5283,
451
+ "eval_samples_per_second": 7.221,
452
+ "eval_steps_per_second": 7.221,
453
+ "step": 300
454
+ },
455
+ {
456
+ "epoch": 0.05,
457
+ "grad_norm": 0.8583046197891235,
458
+ "learning_rate": 4.996773149711693e-05,
459
+ "loss": 0.8784,
460
+ "step": 305
461
+ },
462
+ {
463
+ "epoch": 0.05,
464
+ "grad_norm": 2.5717499256134033,
465
+ "learning_rate": 4.9966661541981984e-05,
466
+ "loss": 0.8395,
467
+ "step": 310
468
+ },
469
+ {
470
+ "epoch": 0.05,
471
+ "grad_norm": 0.982342004776001,
472
+ "learning_rate": 4.9965574148089376e-05,
473
+ "loss": 0.9869,
474
+ "step": 315
475
+ },
476
+ {
477
+ "epoch": 0.05,
478
+ "grad_norm": 0.9000777006149292,
479
+ "learning_rate": 4.9964469316198633e-05,
480
+ "loss": 0.8435,
481
+ "step": 320
482
+ },
483
+ {
484
+ "epoch": 0.05,
485
+ "grad_norm": 0.8733209371566772,
486
+ "learning_rate": 4.9963347047081464e-05,
487
+ "loss": 0.7281,
488
+ "step": 325
489
+ },
490
+ {
491
+ "epoch": 0.05,
492
+ "grad_norm": 3.323739767074585,
493
+ "learning_rate": 4.9962207341521746e-05,
494
+ "loss": 1.1013,
495
+ "step": 330
496
+ },
497
+ {
498
+ "epoch": 0.05,
499
+ "grad_norm": 1.7102876901626587,
500
+ "learning_rate": 4.996105020031554e-05,
501
+ "loss": 0.8276,
502
+ "step": 335
503
+ },
504
+ {
505
+ "epoch": 0.05,
506
+ "grad_norm": 0.9196123480796814,
507
+ "learning_rate": 4.995987562427109e-05,
508
+ "loss": 0.8274,
509
+ "step": 340
510
+ },
511
+ {
512
+ "epoch": 0.06,
513
+ "grad_norm": 1.210099458694458,
514
+ "learning_rate": 4.995868361420883e-05,
515
+ "loss": 1.3257,
516
+ "step": 345
517
+ },
518
+ {
519
+ "epoch": 0.06,
520
+ "grad_norm": 0.8923581838607788,
521
+ "learning_rate": 4.9957474170961335e-05,
522
+ "loss": 0.6815,
523
+ "step": 350
524
+ },
525
+ {
526
+ "epoch": 0.06,
527
+ "grad_norm": 0.9576735496520996,
528
+ "learning_rate": 4.9956247295373396e-05,
529
+ "loss": 1.23,
530
+ "step": 355
531
+ },
532
+ {
533
+ "epoch": 0.06,
534
+ "grad_norm": 1.3774089813232422,
535
+ "learning_rate": 4.995500298830196e-05,
536
+ "loss": 1.0556,
537
+ "step": 360
538
+ },
539
+ {
540
+ "epoch": 0.06,
541
+ "grad_norm": 1.1523677110671997,
542
+ "learning_rate": 4.995374125061614e-05,
543
+ "loss": 1.1787,
544
+ "step": 365
545
+ },
546
+ {
547
+ "epoch": 0.06,
548
+ "grad_norm": 0.8310608863830566,
549
+ "learning_rate": 4.9952462083197246e-05,
550
+ "loss": 0.8525,
551
+ "step": 370
552
+ },
553
+ {
554
+ "epoch": 0.06,
555
+ "grad_norm": 0.9814196825027466,
556
+ "learning_rate": 4.9951165486938765e-05,
557
+ "loss": 0.8522,
558
+ "step": 375
559
+ },
560
+ {
561
+ "epoch": 0.06,
562
+ "grad_norm": 0.9878122210502625,
563
+ "learning_rate": 4.994985146274633e-05,
564
+ "loss": 0.6618,
565
+ "step": 380
566
+ },
567
+ {
568
+ "epoch": 0.06,
569
+ "grad_norm": 1.2652586698532104,
570
+ "learning_rate": 4.994852001153777e-05,
571
+ "loss": 1.0489,
572
+ "step": 385
573
+ },
574
+ {
575
+ "epoch": 0.06,
576
+ "grad_norm": 1.2940975427627563,
577
+ "learning_rate": 4.994717113424307e-05,
578
+ "loss": 1.104,
579
+ "step": 390
580
+ },
581
+ {
582
+ "epoch": 0.06,
583
+ "grad_norm": 0.9636249542236328,
584
+ "learning_rate": 4.99458048318044e-05,
585
+ "loss": 0.9228,
586
+ "step": 395
587
+ },
588
+ {
589
+ "epoch": 0.06,
590
+ "grad_norm": 0.8122813105583191,
591
+ "learning_rate": 4.994442110517611e-05,
592
+ "loss": 0.9209,
593
+ "step": 400
594
+ },
595
+ {
596
+ "epoch": 0.06,
597
+ "eval_loss": 0.8184689879417419,
598
+ "eval_runtime": 96.4572,
599
+ "eval_samples_per_second": 7.226,
600
+ "eval_steps_per_second": 7.226,
601
+ "step": 400
602
+ },
603
+ {
604
+ "epoch": 0.06,
605
+ "grad_norm": 0.8742052912712097,
606
+ "learning_rate": 4.99430199553247e-05,
607
+ "loss": 0.9608,
608
+ "step": 405
609
+ },
610
+ {
611
+ "epoch": 0.07,
612
+ "grad_norm": 0.5679522752761841,
613
+ "learning_rate": 4.9941601383228835e-05,
614
+ "loss": 0.5963,
615
+ "step": 410
616
+ },
617
+ {
618
+ "epoch": 0.07,
619
+ "grad_norm": 1.0234627723693848,
620
+ "learning_rate": 4.994016538987938e-05,
621
+ "loss": 0.8642,
622
+ "step": 415
623
+ },
624
+ {
625
+ "epoch": 0.07,
626
+ "grad_norm": 0.8581897616386414,
627
+ "learning_rate": 4.993871197627934e-05,
628
+ "loss": 0.8993,
629
+ "step": 420
630
+ },
631
+ {
632
+ "epoch": 0.07,
633
+ "grad_norm": 1.4666485786437988,
634
+ "learning_rate": 4.9937241143443904e-05,
635
+ "loss": 0.8565,
636
+ "step": 425
637
+ },
638
+ {
639
+ "epoch": 0.07,
640
+ "grad_norm": 1.1166578531265259,
641
+ "learning_rate": 4.993575289240041e-05,
642
+ "loss": 0.881,
643
+ "step": 430
644
+ },
645
+ {
646
+ "epoch": 0.07,
647
+ "grad_norm": 1.303992748260498,
648
+ "learning_rate": 4.9934247224188393e-05,
649
+ "loss": 0.9962,
650
+ "step": 435
651
+ },
652
+ {
653
+ "epoch": 0.07,
654
+ "grad_norm": 0.9011989235877991,
655
+ "learning_rate": 4.993272413985952e-05,
656
+ "loss": 0.9316,
657
+ "step": 440
658
+ },
659
+ {
660
+ "epoch": 0.07,
661
+ "grad_norm": 0.8321458101272583,
662
+ "learning_rate": 4.993118364047764e-05,
663
+ "loss": 0.7889,
664
+ "step": 445
665
+ },
666
+ {
667
+ "epoch": 0.07,
668
+ "grad_norm": 0.7780352234840393,
669
+ "learning_rate": 4.992962572711877e-05,
670
+ "loss": 0.8287,
671
+ "step": 450
672
+ },
673
+ {
674
+ "epoch": 0.07,
675
+ "grad_norm": 0.9090210199356079,
676
+ "learning_rate": 4.992805040087108e-05,
677
+ "loss": 0.7018,
678
+ "step": 455
679
+ },
680
+ {
681
+ "epoch": 0.07,
682
+ "grad_norm": 0.8694137334823608,
683
+ "learning_rate": 4.9926457662834906e-05,
684
+ "loss": 0.8484,
685
+ "step": 460
686
+ },
687
+ {
688
+ "epoch": 0.07,
689
+ "grad_norm": 0.6327371001243591,
690
+ "learning_rate": 4.992484751412274e-05,
691
+ "loss": 0.716,
692
+ "step": 465
693
+ },
694
+ {
695
+ "epoch": 0.08,
696
+ "grad_norm": 1.200668215751648,
697
+ "learning_rate": 4.9923219955859254e-05,
698
+ "loss": 0.9525,
699
+ "step": 470
700
+ },
701
+ {
702
+ "epoch": 0.08,
703
+ "grad_norm": 0.8530198931694031,
704
+ "learning_rate": 4.9921574989181266e-05,
705
+ "loss": 0.744,
706
+ "step": 475
707
+ },
708
+ {
709
+ "epoch": 0.08,
710
+ "grad_norm": 1.168479323387146,
711
+ "learning_rate": 4.991991261523775e-05,
712
+ "loss": 0.729,
713
+ "step": 480
714
+ },
715
+ {
716
+ "epoch": 0.08,
717
+ "grad_norm": 0.9499714970588684,
718
+ "learning_rate": 4.9918232835189834e-05,
719
+ "loss": 0.7725,
720
+ "step": 485
721
+ },
722
+ {
723
+ "epoch": 0.08,
724
+ "grad_norm": 0.8434467911720276,
725
+ "learning_rate": 4.991653565021084e-05,
726
+ "loss": 1.1558,
727
+ "step": 490
728
+ },
729
+ {
730
+ "epoch": 0.08,
731
+ "grad_norm": 0.7665804624557495,
732
+ "learning_rate": 4.99148210614862e-05,
733
+ "loss": 1.0208,
734
+ "step": 495
735
+ },
736
+ {
737
+ "epoch": 0.08,
738
+ "grad_norm": 0.5782546401023865,
739
+ "learning_rate": 4.991308907021353e-05,
740
+ "loss": 0.8306,
741
+ "step": 500
742
+ },
743
+ {
744
+ "epoch": 0.08,
745
+ "eval_loss": 0.8132078051567078,
746
+ "eval_runtime": 96.433,
747
+ "eval_samples_per_second": 7.228,
748
+ "eval_steps_per_second": 7.228,
749
+ "step": 500
750
+ },
751
+ {
752
+ "epoch": 0.08,
753
+ "grad_norm": 1.0821778774261475,
754
+ "learning_rate": 4.9911339677602584e-05,
755
+ "loss": 0.9503,
756
+ "step": 505
757
+ },
758
+ {
759
+ "epoch": 0.08,
760
+ "grad_norm": 0.5409029126167297,
761
+ "learning_rate": 4.99095728848753e-05,
762
+ "loss": 0.8586,
763
+ "step": 510
764
+ },
765
+ {
766
+ "epoch": 0.08,
767
+ "grad_norm": 0.9011789560317993,
768
+ "learning_rate": 4.990778869326575e-05,
769
+ "loss": 0.7981,
770
+ "step": 515
771
+ },
772
+ {
773
+ "epoch": 0.08,
774
+ "grad_norm": 1.0092263221740723,
775
+ "learning_rate": 4.990598710402013e-05,
776
+ "loss": 1.0174,
777
+ "step": 520
778
+ },
779
+ {
780
+ "epoch": 0.08,
781
+ "grad_norm": 1.4362307786941528,
782
+ "learning_rate": 4.9904168118396844e-05,
783
+ "loss": 0.8373,
784
+ "step": 525
785
+ },
786
+ {
787
+ "epoch": 0.08,
788
+ "grad_norm": 2.1772639751434326,
789
+ "learning_rate": 4.9902331737666414e-05,
790
+ "loss": 0.9599,
791
+ "step": 530
792
+ },
793
+ {
794
+ "epoch": 0.09,
795
+ "grad_norm": 0.9610542058944702,
796
+ "learning_rate": 4.990047796311151e-05,
797
+ "loss": 0.6895,
798
+ "step": 535
799
+ },
800
+ {
801
+ "epoch": 0.09,
802
+ "grad_norm": 0.9922348260879517,
803
+ "learning_rate": 4.989860679602698e-05,
804
+ "loss": 0.7315,
805
+ "step": 540
806
+ },
807
+ {
808
+ "epoch": 0.09,
809
+ "grad_norm": 1.2409151792526245,
810
+ "learning_rate": 4.9896718237719785e-05,
811
+ "loss": 0.8574,
812
+ "step": 545
813
+ },
814
+ {
815
+ "epoch": 0.09,
816
+ "grad_norm": 1.016333818435669,
817
+ "learning_rate": 4.9894812289509046e-05,
818
+ "loss": 1.1248,
819
+ "step": 550
820
+ },
821
+ {
822
+ "epoch": 0.09,
823
+ "grad_norm": 0.9131489396095276,
824
+ "learning_rate": 4.989288895272604e-05,
825
+ "loss": 0.9847,
826
+ "step": 555
827
+ },
828
+ {
829
+ "epoch": 0.09,
830
+ "grad_norm": 1.215469479560852,
831
+ "learning_rate": 4.989094822871419e-05,
832
+ "loss": 0.912,
833
+ "step": 560
834
+ },
835
+ {
836
+ "epoch": 0.09,
837
+ "grad_norm": 1.0536105632781982,
838
+ "learning_rate": 4.988899011882903e-05,
839
+ "loss": 0.8425,
840
+ "step": 565
841
+ },
842
+ {
843
+ "epoch": 0.09,
844
+ "grad_norm": 1.9705311059951782,
845
+ "learning_rate": 4.988701462443829e-05,
846
+ "loss": 0.9385,
847
+ "step": 570
848
+ },
849
+ {
850
+ "epoch": 0.09,
851
+ "grad_norm": 1.2488442659378052,
852
+ "learning_rate": 4.98850217469218e-05,
853
+ "loss": 0.7865,
854
+ "step": 575
855
+ },
856
+ {
857
+ "epoch": 0.09,
858
+ "grad_norm": 1.7318600416183472,
859
+ "learning_rate": 4.988301148767157e-05,
860
+ "loss": 0.8231,
861
+ "step": 580
862
+ },
863
+ {
864
+ "epoch": 0.09,
865
+ "grad_norm": 0.8247858881950378,
866
+ "learning_rate": 4.9880983848091704e-05,
867
+ "loss": 0.8553,
868
+ "step": 585
869
+ },
870
+ {
871
+ "epoch": 0.09,
872
+ "grad_norm": 0.858172595500946,
873
+ "learning_rate": 4.987893882959849e-05,
874
+ "loss": 1.3952,
875
+ "step": 590
876
+ },
877
+ {
878
+ "epoch": 0.09,
879
+ "grad_norm": 1.2286418676376343,
880
+ "learning_rate": 4.987687643362033e-05,
881
+ "loss": 0.837,
882
+ "step": 595
883
+ },
884
+ {
885
+ "epoch": 0.1,
886
+ "grad_norm": 1.034350872039795,
887
+ "learning_rate": 4.9874796661597765e-05,
888
+ "loss": 0.9175,
889
+ "step": 600
890
+ },
891
+ {
892
+ "epoch": 0.1,
893
+ "eval_loss": 0.8063747882843018,
894
+ "eval_runtime": 96.4224,
895
+ "eval_samples_per_second": 7.229,
896
+ "eval_steps_per_second": 7.229,
897
+ "step": 600
898
+ },
899
+ {
900
+ "epoch": 0.1,
901
+ "grad_norm": 0.7192366123199463,
902
+ "learning_rate": 4.987269951498348e-05,
903
+ "loss": 0.8563,
904
+ "step": 605
905
+ },
906
+ {
907
+ "epoch": 0.1,
908
+ "grad_norm": 1.2645854949951172,
909
+ "learning_rate": 4.98705849952423e-05,
910
+ "loss": 0.6663,
911
+ "step": 610
912
+ },
913
+ {
914
+ "epoch": 0.1,
915
+ "grad_norm": 1.0610381364822388,
916
+ "learning_rate": 4.9868453103851176e-05,
917
+ "loss": 0.8452,
918
+ "step": 615
919
+ },
920
+ {
921
+ "epoch": 0.1,
922
+ "grad_norm": 0.8550002574920654,
923
+ "learning_rate": 4.986630384229919e-05,
924
+ "loss": 0.8894,
925
+ "step": 620
926
+ },
927
+ {
928
+ "epoch": 0.1,
929
+ "grad_norm": 0.7490519285202026,
930
+ "learning_rate": 4.986413721208757e-05,
931
+ "loss": 0.9106,
932
+ "step": 625
933
+ },
934
+ {
935
+ "epoch": 0.1,
936
+ "grad_norm": 0.557860255241394,
937
+ "learning_rate": 4.986195321472965e-05,
938
+ "loss": 0.685,
939
+ "step": 630
940
+ },
941
+ {
942
+ "epoch": 0.1,
943
+ "grad_norm": 0.7450752258300781,
944
+ "learning_rate": 4.9859751851750934e-05,
945
+ "loss": 0.8472,
946
+ "step": 635
947
+ },
948
+ {
949
+ "epoch": 0.1,
950
+ "grad_norm": 1.176376461982727,
951
+ "learning_rate": 4.985753312468903e-05,
952
+ "loss": 1.0197,
953
+ "step": 640
954
+ },
955
+ {
956
+ "epoch": 0.1,
957
+ "grad_norm": 1.0625300407409668,
958
+ "learning_rate": 4.985529703509367e-05,
959
+ "loss": 0.9685,
960
+ "step": 645
961
+ },
962
+ {
963
+ "epoch": 0.1,
964
+ "grad_norm": 0.8808372616767883,
965
+ "learning_rate": 4.985304358452672e-05,
966
+ "loss": 0.8612,
967
+ "step": 650
968
+ },
969
+ {
970
+ "epoch": 0.1,
971
+ "grad_norm": 0.8110201954841614,
972
+ "learning_rate": 4.985077277456218e-05,
973
+ "loss": 0.8401,
974
+ "step": 655
975
+ },
976
+ {
977
+ "epoch": 0.11,
978
+ "grad_norm": 0.9364888072013855,
979
+ "learning_rate": 4.984848460678618e-05,
980
+ "loss": 0.6197,
981
+ "step": 660
982
+ },
983
+ {
984
+ "epoch": 0.11,
985
+ "grad_norm": 1.0113518238067627,
986
+ "learning_rate": 4.984617908279694e-05,
987
+ "loss": 0.9889,
988
+ "step": 665
989
+ },
990
+ {
991
+ "epoch": 0.11,
992
+ "grad_norm": 1.1148868799209595,
993
+ "learning_rate": 4.984385620420485e-05,
994
+ "loss": 0.9558,
995
+ "step": 670
996
+ },
997
+ {
998
+ "epoch": 0.11,
999
+ "grad_norm": 0.9506175518035889,
1000
+ "learning_rate": 4.984151597263238e-05,
1001
+ "loss": 0.7323,
1002
+ "step": 675
1003
+ },
1004
+ {
1005
+ "epoch": 0.11,
1006
+ "grad_norm": 1.0044193267822266,
1007
+ "learning_rate": 4.983915838971415e-05,
1008
+ "loss": 0.7504,
1009
+ "step": 680
1010
+ },
1011
+ {
1012
+ "epoch": 0.11,
1013
+ "grad_norm": 2.2674214839935303,
1014
+ "learning_rate": 4.9836783457096875e-05,
1015
+ "loss": 1.032,
1016
+ "step": 685
1017
+ },
1018
+ {
1019
+ "epoch": 0.11,
1020
+ "grad_norm": 1.4945333003997803,
1021
+ "learning_rate": 4.983439117643942e-05,
1022
+ "loss": 1.0359,
1023
+ "step": 690
1024
+ },
1025
+ {
1026
+ "epoch": 0.11,
1027
+ "grad_norm": 0.9860715866088867,
1028
+ "learning_rate": 4.9831981549412744e-05,
1029
+ "loss": 1.1152,
1030
+ "step": 695
1031
+ },
1032
+ {
1033
+ "epoch": 0.11,
1034
+ "grad_norm": 0.8287227153778076,
1035
+ "learning_rate": 4.982955457769992e-05,
1036
+ "loss": 0.8157,
1037
+ "step": 700
1038
+ },
1039
+ {
1040
+ "epoch": 0.11,
1041
+ "eval_loss": 0.8022791743278503,
1042
+ "eval_runtime": 96.5324,
1043
+ "eval_samples_per_second": 7.22,
1044
+ "eval_steps_per_second": 7.22,
1045
+ "step": 700
1046
+ },
1047
+ {
1048
+ "epoch": 0.11,
1049
+ "grad_norm": 0.9216273427009583,
1050
+ "learning_rate": 4.9827110262996144e-05,
1051
+ "loss": 0.8395,
1052
+ "step": 705
1053
+ },
1054
+ {
1055
+ "epoch": 0.11,
1056
+ "grad_norm": 0.7642357349395752,
1057
+ "learning_rate": 4.982464860700874e-05,
1058
+ "loss": 0.8817,
1059
+ "step": 710
1060
+ },
1061
+ {
1062
+ "epoch": 0.11,
1063
+ "grad_norm": 0.8851175308227539,
1064
+ "learning_rate": 4.982216961145711e-05,
1065
+ "loss": 0.8558,
1066
+ "step": 715
1067
+ },
1068
+ {
1069
+ "epoch": 0.11,
1070
+ "grad_norm": 0.44226109981536865,
1071
+ "learning_rate": 4.98196732780728e-05,
1072
+ "loss": 0.882,
1073
+ "step": 720
1074
+ },
1075
+ {
1076
+ "epoch": 0.12,
1077
+ "grad_norm": 0.8005027174949646,
1078
+ "learning_rate": 4.981715960859945e-05,
1079
+ "loss": 0.8835,
1080
+ "step": 725
1081
+ },
1082
+ {
1083
+ "epoch": 0.12,
1084
+ "grad_norm": 0.7451304793357849,
1085
+ "learning_rate": 4.981462860479281e-05,
1086
+ "loss": 0.8551,
1087
+ "step": 730
1088
+ },
1089
+ {
1090
+ "epoch": 0.12,
1091
+ "grad_norm": 1.1069347858428955,
1092
+ "learning_rate": 4.9812080268420745e-05,
1093
+ "loss": 0.999,
1094
+ "step": 735
1095
+ },
1096
+ {
1097
+ "epoch": 0.12,
1098
+ "grad_norm": 0.8892244100570679,
1099
+ "learning_rate": 4.980951460126322e-05,
1100
+ "loss": 1.012,
1101
+ "step": 740
1102
+ },
1103
+ {
1104
+ "epoch": 0.12,
1105
+ "grad_norm": 0.8935977816581726,
1106
+ "learning_rate": 4.9806931605112305e-05,
1107
+ "loss": 0.9911,
1108
+ "step": 745
1109
+ },
1110
+ {
1111
+ "epoch": 0.12,
1112
+ "grad_norm": 0.8456961512565613,
1113
+ "learning_rate": 4.9804331281772176e-05,
1114
+ "loss": 0.7595,
1115
+ "step": 750
1116
+ },
1117
+ {
1118
+ "epoch": 0.12,
1119
+ "grad_norm": 0.78443443775177,
1120
+ "learning_rate": 4.980171363305911e-05,
1121
+ "loss": 0.8308,
1122
+ "step": 755
1123
+ },
1124
+ {
1125
+ "epoch": 0.12,
1126
+ "grad_norm": 1.0028038024902344,
1127
+ "learning_rate": 4.979907866080149e-05,
1128
+ "loss": 0.9637,
1129
+ "step": 760
1130
+ },
1131
+ {
1132
+ "epoch": 0.12,
1133
+ "grad_norm": 1.1801577806472778,
1134
+ "learning_rate": 4.9796426366839786e-05,
1135
+ "loss": 0.6159,
1136
+ "step": 765
1137
+ },
1138
+ {
1139
+ "epoch": 0.12,
1140
+ "grad_norm": 0.8370681405067444,
1141
+ "learning_rate": 4.979375675302659e-05,
1142
+ "loss": 0.9276,
1143
+ "step": 770
1144
+ },
1145
+ {
1146
+ "epoch": 0.12,
1147
+ "grad_norm": 0.8605382442474365,
1148
+ "learning_rate": 4.979106982122658e-05,
1149
+ "loss": 1.1077,
1150
+ "step": 775
1151
+ },
1152
+ {
1153
+ "epoch": 0.12,
1154
+ "grad_norm": 0.7788259387016296,
1155
+ "learning_rate": 4.978836557331652e-05,
1156
+ "loss": 0.8172,
1157
+ "step": 780
1158
+ },
1159
+ {
1160
+ "epoch": 0.13,
1161
+ "grad_norm": 1.4312686920166016,
1162
+ "learning_rate": 4.978564401118528e-05,
1163
+ "loss": 0.8759,
1164
+ "step": 785
1165
+ },
1166
+ {
1167
+ "epoch": 0.13,
1168
+ "grad_norm": 0.9109662175178528,
1169
+ "learning_rate": 4.978290513673381e-05,
1170
+ "loss": 0.947,
1171
+ "step": 790
1172
+ },
1173
+ {
1174
+ "epoch": 0.13,
1175
+ "grad_norm": 1.1819065809249878,
1176
+ "learning_rate": 4.9780148951875195e-05,
1177
+ "loss": 0.7364,
1178
+ "step": 795
1179
+ },
1180
+ {
1181
+ "epoch": 0.13,
1182
+ "grad_norm": 0.9400575160980225,
1183
+ "learning_rate": 4.977737545853455e-05,
1184
+ "loss": 0.9469,
1185
+ "step": 800
1186
+ },
1187
+ {
1188
+ "epoch": 0.13,
1189
+ "eval_loss": 0.7995806932449341,
1190
+ "eval_runtime": 96.5877,
1191
+ "eval_samples_per_second": 7.216,
1192
+ "eval_steps_per_second": 7.216,
1193
+ "step": 800
1194
+ },
1195
+ {
1196
+ "epoch": 0.13,
1197
+ "grad_norm": 1.693812370300293,
1198
+ "learning_rate": 4.9774584658649126e-05,
1199
+ "loss": 0.9433,
1200
+ "step": 805
1201
+ },
1202
+ {
1203
+ "epoch": 0.13,
1204
+ "grad_norm": 1.0892895460128784,
1205
+ "learning_rate": 4.9771776554168234e-05,
1206
+ "loss": 0.7027,
1207
+ "step": 810
1208
+ },
1209
+ {
1210
+ "epoch": 0.13,
1211
+ "grad_norm": 0.9118362665176392,
1212
+ "learning_rate": 4.976895114705329e-05,
1213
+ "loss": 0.9468,
1214
+ "step": 815
1215
+ },
1216
+ {
1217
+ "epoch": 0.13,
1218
+ "grad_norm": 0.8032681345939636,
1219
+ "learning_rate": 4.976610843927779e-05,
1220
+ "loss": 0.7927,
1221
+ "step": 820
1222
+ },
1223
+ {
1224
+ "epoch": 0.13,
1225
+ "grad_norm": 1.168225646018982,
1226
+ "learning_rate": 4.976324843282732e-05,
1227
+ "loss": 0.9673,
1228
+ "step": 825
1229
+ },
1230
+ {
1231
+ "epoch": 0.13,
1232
+ "grad_norm": 1.077602744102478,
1233
+ "learning_rate": 4.976037112969953e-05,
1234
+ "loss": 0.9156,
1235
+ "step": 830
1236
+ },
1237
+ {
1238
+ "epoch": 0.13,
1239
+ "grad_norm": 0.8643108606338501,
1240
+ "learning_rate": 4.9757476531904165e-05,
1241
+ "loss": 0.6999,
1242
+ "step": 835
1243
+ },
1244
+ {
1245
+ "epoch": 0.13,
1246
+ "grad_norm": 0.933397650718689,
1247
+ "learning_rate": 4.975456464146306e-05,
1248
+ "loss": 0.8828,
1249
+ "step": 840
1250
+ },
1251
+ {
1252
+ "epoch": 0.13,
1253
+ "grad_norm": 0.7036295533180237,
1254
+ "learning_rate": 4.975163546041011e-05,
1255
+ "loss": 0.8709,
1256
+ "step": 845
1257
+ },
1258
+ {
1259
+ "epoch": 0.14,
1260
+ "grad_norm": 0.5974694490432739,
1261
+ "learning_rate": 4.974868899079128e-05,
1262
+ "loss": 0.7594,
1263
+ "step": 850
1264
+ },
1265
+ {
1266
+ "epoch": 0.14,
1267
+ "grad_norm": 0.7244943380355835,
1268
+ "learning_rate": 4.974572523466465e-05,
1269
+ "loss": 0.8714,
1270
+ "step": 855
1271
+ },
1272
+ {
1273
+ "epoch": 0.14,
1274
+ "grad_norm": 0.5783522725105286,
1275
+ "learning_rate": 4.9742744194100345e-05,
1276
+ "loss": 0.8941,
1277
+ "step": 860
1278
+ },
1279
+ {
1280
+ "epoch": 0.14,
1281
+ "grad_norm": 0.7480617761611938,
1282
+ "learning_rate": 4.973974587118055e-05,
1283
+ "loss": 0.9798,
1284
+ "step": 865
1285
+ },
1286
+ {
1287
+ "epoch": 0.14,
1288
+ "grad_norm": 0.7548874020576477,
1289
+ "learning_rate": 4.973673026799956e-05,
1290
+ "loss": 0.7767,
1291
+ "step": 870
1292
+ },
1293
+ {
1294
+ "epoch": 0.14,
1295
+ "grad_norm": 0.7075071930885315,
1296
+ "learning_rate": 4.97336973866637e-05,
1297
+ "loss": 0.7779,
1298
+ "step": 875
1299
+ },
1300
+ {
1301
+ "epoch": 0.14,
1302
+ "grad_norm": 0.7042987942695618,
1303
+ "learning_rate": 4.97306472292914e-05,
1304
+ "loss": 0.8249,
1305
+ "step": 880
1306
+ },
1307
+ {
1308
+ "epoch": 0.14,
1309
+ "grad_norm": 1.0242459774017334,
1310
+ "learning_rate": 4.972757979801313e-05,
1311
+ "loss": 0.9223,
1312
+ "step": 885
1313
+ },
1314
+ {
1315
+ "epoch": 0.14,
1316
+ "grad_norm": 0.6138095259666443,
1317
+ "learning_rate": 4.9724495094971436e-05,
1318
+ "loss": 0.9842,
1319
+ "step": 890
1320
+ },
1321
+ {
1322
+ "epoch": 0.14,
1323
+ "grad_norm": 0.7905042767524719,
1324
+ "learning_rate": 4.9721393122320925e-05,
1325
+ "loss": 0.8738,
1326
+ "step": 895
1327
+ },
1328
+ {
1329
+ "epoch": 0.14,
1330
+ "grad_norm": 0.9658048748970032,
1331
+ "learning_rate": 4.9718273882228265e-05,
1332
+ "loss": 0.8872,
1333
+ "step": 900
1334
+ },
1335
+ {
1336
+ "epoch": 0.14,
1337
+ "eval_loss": 0.7954564690589905,
1338
+ "eval_runtime": 96.643,
1339
+ "eval_samples_per_second": 7.212,
1340
+ "eval_steps_per_second": 7.212,
1341
+ "step": 900
1342
+ },
1343
+ {
1344
+ "epoch": 0.14,
1345
+ "grad_norm": 0.8425014019012451,
1346
+ "learning_rate": 4.97151373768722e-05,
1347
+ "loss": 0.778,
1348
+ "step": 905
1349
+ },
1350
+ {
1351
+ "epoch": 0.15,
1352
+ "grad_norm": 0.5527231693267822,
1353
+ "learning_rate": 4.971198360844351e-05,
1354
+ "loss": 0.8332,
1355
+ "step": 910
1356
+ },
1357
+ {
1358
+ "epoch": 0.15,
1359
+ "grad_norm": 0.7870334386825562,
1360
+ "learning_rate": 4.9708812579145056e-05,
1361
+ "loss": 0.9265,
1362
+ "step": 915
1363
+ },
1364
+ {
1365
+ "epoch": 0.15,
1366
+ "grad_norm": 0.9935321807861328,
1367
+ "learning_rate": 4.970562429119173e-05,
1368
+ "loss": 0.7243,
1369
+ "step": 920
1370
+ },
1371
+ {
1372
+ "epoch": 0.15,
1373
+ "grad_norm": 0.9546892046928406,
1374
+ "learning_rate": 4.970241874681051e-05,
1375
+ "loss": 0.9908,
1376
+ "step": 925
1377
+ },
1378
+ {
1379
+ "epoch": 0.15,
1380
+ "grad_norm": 0.7340118885040283,
1381
+ "learning_rate": 4.969919594824039e-05,
1382
+ "loss": 0.7932,
1383
+ "step": 930
1384
+ },
1385
+ {
1386
+ "epoch": 0.15,
1387
+ "grad_norm": 5.1686015129089355,
1388
+ "learning_rate": 4.9695955897732453e-05,
1389
+ "loss": 0.9842,
1390
+ "step": 935
1391
+ },
1392
+ {
1393
+ "epoch": 0.15,
1394
+ "grad_norm": 0.9721456170082092,
1395
+ "learning_rate": 4.9692698597549815e-05,
1396
+ "loss": 0.9271,
1397
+ "step": 940
1398
+ },
1399
+ {
1400
+ "epoch": 0.15,
1401
+ "grad_norm": 0.6477334499359131,
1402
+ "learning_rate": 4.9689424049967623e-05,
1403
+ "loss": 0.934,
1404
+ "step": 945
1405
+ },
1406
+ {
1407
+ "epoch": 0.15,
1408
+ "grad_norm": 1.0759055614471436,
1409
+ "learning_rate": 4.968613225727311e-05,
1410
+ "loss": 1.0465,
1411
+ "step": 950
1412
+ },
1413
+ {
1414
+ "epoch": 0.15,
1415
+ "grad_norm": 0.7222158908843994,
1416
+ "learning_rate": 4.968282322176552e-05,
1417
+ "loss": 0.7732,
1418
+ "step": 955
1419
+ },
1420
+ {
1421
+ "epoch": 0.15,
1422
+ "grad_norm": 0.8591343760490417,
1423
+ "learning_rate": 4.9679496945756155e-05,
1424
+ "loss": 0.9062,
1425
+ "step": 960
1426
+ },
1427
+ {
1428
+ "epoch": 0.15,
1429
+ "grad_norm": 1.8495111465454102,
1430
+ "learning_rate": 4.967615343156837e-05,
1431
+ "loss": 0.8861,
1432
+ "step": 965
1433
+ },
1434
+ {
1435
+ "epoch": 0.15,
1436
+ "grad_norm": 0.6847331523895264,
1437
+ "learning_rate": 4.967279268153753e-05,
1438
+ "loss": 0.8001,
1439
+ "step": 970
1440
+ },
1441
+ {
1442
+ "epoch": 0.16,
1443
+ "grad_norm": 0.690113365650177,
1444
+ "learning_rate": 4.9669414698011074e-05,
1445
+ "loss": 0.7378,
1446
+ "step": 975
1447
+ },
1448
+ {
1449
+ "epoch": 0.16,
1450
+ "grad_norm": 0.8349626064300537,
1451
+ "learning_rate": 4.9666019483348456e-05,
1452
+ "loss": 0.7193,
1453
+ "step": 980
1454
+ },
1455
+ {
1456
+ "epoch": 0.16,
1457
+ "grad_norm": 0.6444108486175537,
1458
+ "learning_rate": 4.966260703992116e-05,
1459
+ "loss": 0.8729,
1460
+ "step": 985
1461
+ },
1462
+ {
1463
+ "epoch": 0.16,
1464
+ "grad_norm": 0.9515655040740967,
1465
+ "learning_rate": 4.965917737011274e-05,
1466
+ "loss": 0.7532,
1467
+ "step": 990
1468
+ },
1469
+ {
1470
+ "epoch": 0.16,
1471
+ "grad_norm": 0.8138986229896545,
1472
+ "learning_rate": 4.965573047631873e-05,
1473
+ "loss": 1.0124,
1474
+ "step": 995
1475
+ },
1476
+ {
1477
+ "epoch": 0.16,
1478
+ "grad_norm": 1.0182080268859863,
1479
+ "learning_rate": 4.9652266360946745e-05,
1480
+ "loss": 0.8842,
1481
+ "step": 1000
1482
+ },
1483
+ {
1484
+ "epoch": 0.16,
1485
+ "eval_loss": 0.7912728190422058,
1486
+ "eval_runtime": 96.5004,
1487
+ "eval_samples_per_second": 7.223,
1488
+ "eval_steps_per_second": 7.223,
1489
+ "step": 1000
1490
+ }
1491
+ ],
1492
+ "logging_steps": 5,
1493
+ "max_steps": 18795,
1494
+ "num_input_tokens_seen": 0,
1495
+ "num_train_epochs": 3,
1496
+ "save_steps": 500,
1497
+ "total_flos": 4.3155317587968e+16,
1498
+ "train_batch_size": 1,
1499
+ "trial_name": null,
1500
+ "trial_params": null
1501
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aba1ff1fdda3287196c6ef142366a8ee27aa213077a93b2e39492dd1603ae72f
3
+ size 5048
checkpoint-1000/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10000/README.md ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: bigcode/starcoder2-7b
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+
201
+
202
+ ### Framework versions
203
+
204
+ - PEFT 0.9.0
checkpoint-10000/adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "bigcode/starcoder2-7b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 64,
13
+ "lora_dropout": 0.15,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 32,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "c_fc",
23
+ "o_proj",
24
+ "q_proj",
25
+ "v_proj",
26
+ "k_proj",
27
+ "c_proj"
28
+ ],
29
+ "task_type": "CAUSAL_LM",
30
+ "use_dora": false,
31
+ "use_rslora": false
32
+ }
checkpoint-10000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b69b187b6aed04aa1dee26fa2c5a53a3703730adbc1241ba98d35e7b14bbcf0
3
+ size 306235552
checkpoint-10000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97a2e05763c3411f9f0e9b159d66c459c6c0753f6c4de5affec3a7ad0cb37acf
3
+ size 612692114
checkpoint-10000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ad759fd9a3102b74e5668b6462bee2ef2882f610228e5d30d85770644c87369
3
+ size 14244
checkpoint-10000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f14fe74239f29afb85e174adc4b5e87cdbe8d520f9eaece0a2fdea90450a5bdf
3
+ size 1064
checkpoint-10000/special_tokens_map.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<fim_prefix>",
5
+ "<fim_middle>",
6
+ "<fim_suffix>",
7
+ "<fim_pad>",
8
+ "<repo_name>",
9
+ "<file_sep>",
10
+ "<issue_start>",
11
+ "<issue_comment>",
12
+ "<issue_closed>",
13
+ "<jupyter_start>",
14
+ "<jupyter_text>",
15
+ "<jupyter_code>",
16
+ "<jupyter_output>",
17
+ "<jupyter_script>",
18
+ "<empty_output>",
19
+ "<code_to_intermediate>",
20
+ "<intermediate_to_code>",
21
+ "<pr>",
22
+ "<pr_status>",
23
+ "<pr_is_merged>",
24
+ "<pr_base>",
25
+ "<pr_file>",
26
+ "<pr_base_code>",
27
+ "<pr_diff>",
28
+ "<pr_diff_hunk>",
29
+ "<pr_comment>",
30
+ "<pr_event_id>",
31
+ "<pr_review>",
32
+ "<pr_review_state>",
33
+ "<pr_review_comment>",
34
+ "<pr_in_reply_to_review_id>",
35
+ "<pr_in_reply_to_comment_id>",
36
+ "<pr_diff_hunk_comment_line>",
37
+ "<NAME>",
38
+ "<EMAIL>",
39
+ "<KEY>",
40
+ "<PASSWORD>"
41
+ ],
42
+ "bos_token": {
43
+ "content": "<|endoftext|>",
44
+ "lstrip": false,
45
+ "normalized": false,
46
+ "rstrip": false,
47
+ "single_word": false
48
+ },
49
+ "eos_token": {
50
+ "content": "<|endoftext|>",
51
+ "lstrip": false,
52
+ "normalized": false,
53
+ "rstrip": false,
54
+ "single_word": false
55
+ },
56
+ "pad_token": "<|endoftext|>",
57
+ "unk_token": {
58
+ "content": "<|endoftext|>",
59
+ "lstrip": false,
60
+ "normalized": false,
61
+ "rstrip": false,
62
+ "single_word": false
63
+ }
64
+ }
checkpoint-10000/tokenizer_config.json ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<fim_prefix>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<fim_middle>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<fim_suffix>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<fim_pad>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "5": {
46
+ "content": "<repo_name>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "6": {
54
+ "content": "<file_sep>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "7": {
62
+ "content": "<issue_start>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "8": {
70
+ "content": "<issue_comment>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "9": {
78
+ "content": "<issue_closed>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "10": {
86
+ "content": "<jupyter_start>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "11": {
94
+ "content": "<jupyter_text>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "12": {
102
+ "content": "<jupyter_code>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "13": {
110
+ "content": "<jupyter_output>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "14": {
118
+ "content": "<jupyter_script>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "15": {
126
+ "content": "<empty_output>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": true
132
+ },
133
+ "16": {
134
+ "content": "<code_to_intermediate>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": true
140
+ },
141
+ "17": {
142
+ "content": "<intermediate_to_code>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": true
148
+ },
149
+ "18": {
150
+ "content": "<pr>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": true
156
+ },
157
+ "19": {
158
+ "content": "<pr_status>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": true
164
+ },
165
+ "20": {
166
+ "content": "<pr_is_merged>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": true
172
+ },
173
+ "21": {
174
+ "content": "<pr_base>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": true
180
+ },
181
+ "22": {
182
+ "content": "<pr_file>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "23": {
190
+ "content": "<pr_base_code>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "24": {
198
+ "content": "<pr_diff>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "25": {
206
+ "content": "<pr_diff_hunk>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "26": {
214
+ "content": "<pr_comment>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "27": {
222
+ "content": "<pr_event_id>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "28": {
230
+ "content": "<pr_review>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "29": {
238
+ "content": "<pr_review_state>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "30": {
246
+ "content": "<pr_review_comment>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "31": {
254
+ "content": "<pr_in_reply_to_review_id>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "32": {
262
+ "content": "<pr_in_reply_to_comment_id>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "33": {
270
+ "content": "<pr_diff_hunk_comment_line>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "34": {
278
+ "content": "<NAME>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "35": {
286
+ "content": "<EMAIL>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "36": {
294
+ "content": "<KEY>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "37": {
302
+ "content": "<PASSWORD>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ }
309
+ },
310
+ "additional_special_tokens": [
311
+ "<|endoftext|>",
312
+ "<fim_prefix>",
313
+ "<fim_middle>",
314
+ "<fim_suffix>",
315
+ "<fim_pad>",
316
+ "<repo_name>",
317
+ "<file_sep>",
318
+ "<issue_start>",
319
+ "<issue_comment>",
320
+ "<issue_closed>",
321
+ "<jupyter_start>",
322
+ "<jupyter_text>",
323
+ "<jupyter_code>",
324
+ "<jupyter_output>",
325
+ "<jupyter_script>",
326
+ "<empty_output>",
327
+ "<code_to_intermediate>",
328
+ "<intermediate_to_code>",
329
+ "<pr>",
330
+ "<pr_status>",
331
+ "<pr_is_merged>",
332
+ "<pr_base>",
333
+ "<pr_file>",
334
+ "<pr_base_code>",
335
+ "<pr_diff>",
336
+ "<pr_diff_hunk>",
337
+ "<pr_comment>",
338
+ "<pr_event_id>",
339
+ "<pr_review>",
340
+ "<pr_review_state>",
341
+ "<pr_review_comment>",
342
+ "<pr_in_reply_to_review_id>",
343
+ "<pr_in_reply_to_comment_id>",
344
+ "<pr_diff_hunk_comment_line>",
345
+ "<NAME>",
346
+ "<EMAIL>",
347
+ "<KEY>",
348
+ "<PASSWORD>"
349
+ ],
350
+ "bos_token": "<|endoftext|>",
351
+ "clean_up_tokenization_spaces": true,
352
+ "eos_token": "<|endoftext|>",
353
+ "errors": "replace",
354
+ "model_max_length": 1000000000000000019884624838656,
355
+ "pad_token": "<|endoftext|>",
356
+ "padding_side": "right",
357
+ "split_special_tokens": false,
358
+ "tokenizer_class": "GPT2Tokenizer",
359
+ "unk_token": "<|endoftext|>",
360
+ "vocab_size": 49152
361
+ }
checkpoint-10000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aba1ff1fdda3287196c6ef142366a8ee27aa213077a93b2e39492dd1603ae72f
3
+ size 5048
checkpoint-10000/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10500/README.md ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: bigcode/starcoder2-7b
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+
201
+
202
+ ### Framework versions
203
+
204
+ - PEFT 0.9.0
checkpoint-10500/adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "bigcode/starcoder2-7b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 64,
13
+ "lora_dropout": 0.15,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 32,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "c_fc",
23
+ "o_proj",
24
+ "q_proj",
25
+ "v_proj",
26
+ "k_proj",
27
+ "c_proj"
28
+ ],
29
+ "task_type": "CAUSAL_LM",
30
+ "use_dora": false,
31
+ "use_rslora": false
32
+ }
checkpoint-10500/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a24f1b860c0718a0f599e20cfcde60dba4e6264959675f3b050fd71f6424047
3
+ size 306235552
checkpoint-10500/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c83ba95cd90b11b3f8bdfa26bd0eaf2475e8e2a9b6427b1a2f127eeaba7e2f0d
3
+ size 612692114
checkpoint-10500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebc4d68c6ba09b134d762ec466777351973214f1a16cc6cd31114ef38347d4ae
3
+ size 14244
checkpoint-10500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0ae137ef9ab2f88c67af62bdf6ab3d615076a309f9c848b2969ed4df1a7b1d6
3
+ size 1064
checkpoint-10500/special_tokens_map.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<fim_prefix>",
5
+ "<fim_middle>",
6
+ "<fim_suffix>",
7
+ "<fim_pad>",
8
+ "<repo_name>",
9
+ "<file_sep>",
10
+ "<issue_start>",
11
+ "<issue_comment>",
12
+ "<issue_closed>",
13
+ "<jupyter_start>",
14
+ "<jupyter_text>",
15
+ "<jupyter_code>",
16
+ "<jupyter_output>",
17
+ "<jupyter_script>",
18
+ "<empty_output>",
19
+ "<code_to_intermediate>",
20
+ "<intermediate_to_code>",
21
+ "<pr>",
22
+ "<pr_status>",
23
+ "<pr_is_merged>",
24
+ "<pr_base>",
25
+ "<pr_file>",
26
+ "<pr_base_code>",
27
+ "<pr_diff>",
28
+ "<pr_diff_hunk>",
29
+ "<pr_comment>",
30
+ "<pr_event_id>",
31
+ "<pr_review>",
32
+ "<pr_review_state>",
33
+ "<pr_review_comment>",
34
+ "<pr_in_reply_to_review_id>",
35
+ "<pr_in_reply_to_comment_id>",
36
+ "<pr_diff_hunk_comment_line>",
37
+ "<NAME>",
38
+ "<EMAIL>",
39
+ "<KEY>",
40
+ "<PASSWORD>"
41
+ ],
42
+ "bos_token": {
43
+ "content": "<|endoftext|>",
44
+ "lstrip": false,
45
+ "normalized": false,
46
+ "rstrip": false,
47
+ "single_word": false
48
+ },
49
+ "eos_token": {
50
+ "content": "<|endoftext|>",
51
+ "lstrip": false,
52
+ "normalized": false,
53
+ "rstrip": false,
54
+ "single_word": false
55
+ },
56
+ "pad_token": "<|endoftext|>",
57
+ "unk_token": {
58
+ "content": "<|endoftext|>",
59
+ "lstrip": false,
60
+ "normalized": false,
61
+ "rstrip": false,
62
+ "single_word": false
63
+ }
64
+ }
checkpoint-10500/tokenizer_config.json ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<fim_prefix>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<fim_middle>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<fim_suffix>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<fim_pad>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "5": {
46
+ "content": "<repo_name>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "6": {
54
+ "content": "<file_sep>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "7": {
62
+ "content": "<issue_start>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "8": {
70
+ "content": "<issue_comment>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "9": {
78
+ "content": "<issue_closed>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "10": {
86
+ "content": "<jupyter_start>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "11": {
94
+ "content": "<jupyter_text>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "12": {
102
+ "content": "<jupyter_code>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "13": {
110
+ "content": "<jupyter_output>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "14": {
118
+ "content": "<jupyter_script>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "15": {
126
+ "content": "<empty_output>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": true
132
+ },
133
+ "16": {
134
+ "content": "<code_to_intermediate>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": true
140
+ },
141
+ "17": {
142
+ "content": "<intermediate_to_code>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": true
148
+ },
149
+ "18": {
150
+ "content": "<pr>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": true
156
+ },
157
+ "19": {
158
+ "content": "<pr_status>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": true
164
+ },
165
+ "20": {
166
+ "content": "<pr_is_merged>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": true
172
+ },
173
+ "21": {
174
+ "content": "<pr_base>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": true
180
+ },
181
+ "22": {
182
+ "content": "<pr_file>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "23": {
190
+ "content": "<pr_base_code>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "24": {
198
+ "content": "<pr_diff>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "25": {
206
+ "content": "<pr_diff_hunk>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "26": {
214
+ "content": "<pr_comment>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "27": {
222
+ "content": "<pr_event_id>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "28": {
230
+ "content": "<pr_review>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "29": {
238
+ "content": "<pr_review_state>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "30": {
246
+ "content": "<pr_review_comment>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "31": {
254
+ "content": "<pr_in_reply_to_review_id>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "32": {
262
+ "content": "<pr_in_reply_to_comment_id>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "33": {
270
+ "content": "<pr_diff_hunk_comment_line>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "34": {
278
+ "content": "<NAME>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "35": {
286
+ "content": "<EMAIL>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "36": {
294
+ "content": "<KEY>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "37": {
302
+ "content": "<PASSWORD>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ }
309
+ },
310
+ "additional_special_tokens": [
311
+ "<|endoftext|>",
312
+ "<fim_prefix>",
313
+ "<fim_middle>",
314
+ "<fim_suffix>",
315
+ "<fim_pad>",
316
+ "<repo_name>",
317
+ "<file_sep>",
318
+ "<issue_start>",
319
+ "<issue_comment>",
320
+ "<issue_closed>",
321
+ "<jupyter_start>",
322
+ "<jupyter_text>",
323
+ "<jupyter_code>",
324
+ "<jupyter_output>",
325
+ "<jupyter_script>",
326
+ "<empty_output>",
327
+ "<code_to_intermediate>",
328
+ "<intermediate_to_code>",
329
+ "<pr>",
330
+ "<pr_status>",
331
+ "<pr_is_merged>",
332
+ "<pr_base>",
333
+ "<pr_file>",
334
+ "<pr_base_code>",
335
+ "<pr_diff>",
336
+ "<pr_diff_hunk>",
337
+ "<pr_comment>",
338
+ "<pr_event_id>",
339
+ "<pr_review>",
340
+ "<pr_review_state>",
341
+ "<pr_review_comment>",
342
+ "<pr_in_reply_to_review_id>",
343
+ "<pr_in_reply_to_comment_id>",
344
+ "<pr_diff_hunk_comment_line>",
345
+ "<NAME>",
346
+ "<EMAIL>",
347
+ "<KEY>",
348
+ "<PASSWORD>"
349
+ ],
350
+ "bos_token": "<|endoftext|>",
351
+ "clean_up_tokenization_spaces": true,
352
+ "eos_token": "<|endoftext|>",
353
+ "errors": "replace",
354
+ "model_max_length": 1000000000000000019884624838656,
355
+ "pad_token": "<|endoftext|>",
356
+ "padding_side": "right",
357
+ "split_special_tokens": false,
358
+ "tokenizer_class": "GPT2Tokenizer",
359
+ "unk_token": "<|endoftext|>",
360
+ "vocab_size": 49152
361
+ }
checkpoint-10500/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aba1ff1fdda3287196c6ef142366a8ee27aa213077a93b2e39492dd1603ae72f
3
+ size 5048
checkpoint-10500/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-11000/README.md ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: bigcode/starcoder2-7b
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+
201
+
202
+ ### Framework versions
203
+
204
+ - PEFT 0.9.0
checkpoint-11000/adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "bigcode/starcoder2-7b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 64,
13
+ "lora_dropout": 0.15,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 32,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "c_fc",
23
+ "o_proj",
24
+ "q_proj",
25
+ "v_proj",
26
+ "k_proj",
27
+ "c_proj"
28
+ ],
29
+ "task_type": "CAUSAL_LM",
30
+ "use_dora": false,
31
+ "use_rslora": false
32
+ }
checkpoint-11000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9019013901b60deee9f2f66de29e225413c7311fe2715374be32987436a4e8e8
3
+ size 306235552
checkpoint-11000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-11000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fdb206d0d46198a2c519e1c9cd4fffe525cfd4c1af825722561b1213a4ea7e6
3
+ size 612692114
checkpoint-11000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e03cd57181c93286c55a2285d990849817b30e7f22c295a8fc298e711a5bf77
3
+ size 14244
checkpoint-11000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0e2ae236fbb9e6aa2493215f88876392eb92278816ada028bd155f1921609e2
3
+ size 1064
checkpoint-11000/special_tokens_map.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<fim_prefix>",
5
+ "<fim_middle>",
6
+ "<fim_suffix>",
7
+ "<fim_pad>",
8
+ "<repo_name>",
9
+ "<file_sep>",
10
+ "<issue_start>",
11
+ "<issue_comment>",
12
+ "<issue_closed>",
13
+ "<jupyter_start>",
14
+ "<jupyter_text>",
15
+ "<jupyter_code>",
16
+ "<jupyter_output>",
17
+ "<jupyter_script>",
18
+ "<empty_output>",
19
+ "<code_to_intermediate>",
20
+ "<intermediate_to_code>",
21
+ "<pr>",
22
+ "<pr_status>",
23
+ "<pr_is_merged>",
24
+ "<pr_base>",
25
+ "<pr_file>",
26
+ "<pr_base_code>",
27
+ "<pr_diff>",
28
+ "<pr_diff_hunk>",
29
+ "<pr_comment>",
30
+ "<pr_event_id>",
31
+ "<pr_review>",
32
+ "<pr_review_state>",
33
+ "<pr_review_comment>",
34
+ "<pr_in_reply_to_review_id>",
35
+ "<pr_in_reply_to_comment_id>",
36
+ "<pr_diff_hunk_comment_line>",
37
+ "<NAME>",
38
+ "<EMAIL>",
39
+ "<KEY>",
40
+ "<PASSWORD>"
41
+ ],
42
+ "bos_token": {
43
+ "content": "<|endoftext|>",
44
+ "lstrip": false,
45
+ "normalized": false,
46
+ "rstrip": false,
47
+ "single_word": false
48
+ },
49
+ "eos_token": {
50
+ "content": "<|endoftext|>",
51
+ "lstrip": false,
52
+ "normalized": false,
53
+ "rstrip": false,
54
+ "single_word": false
55
+ },
56
+ "pad_token": "<|endoftext|>",
57
+ "unk_token": {
58
+ "content": "<|endoftext|>",
59
+ "lstrip": false,
60
+ "normalized": false,
61
+ "rstrip": false,
62
+ "single_word": false
63
+ }
64
+ }
checkpoint-11000/tokenizer_config.json ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<fim_prefix>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<fim_middle>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<fim_suffix>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<fim_pad>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "5": {
46
+ "content": "<repo_name>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "6": {
54
+ "content": "<file_sep>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "7": {
62
+ "content": "<issue_start>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "8": {
70
+ "content": "<issue_comment>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "9": {
78
+ "content": "<issue_closed>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "10": {
86
+ "content": "<jupyter_start>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "11": {
94
+ "content": "<jupyter_text>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "12": {
102
+ "content": "<jupyter_code>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "13": {
110
+ "content": "<jupyter_output>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "14": {
118
+ "content": "<jupyter_script>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "15": {
126
+ "content": "<empty_output>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": true
132
+ },
133
+ "16": {
134
+ "content": "<code_to_intermediate>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": true
140
+ },
141
+ "17": {
142
+ "content": "<intermediate_to_code>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": true
148
+ },
149
+ "18": {
150
+ "content": "<pr>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": true
156
+ },
157
+ "19": {
158
+ "content": "<pr_status>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": true
164
+ },
165
+ "20": {
166
+ "content": "<pr_is_merged>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": true
172
+ },
173
+ "21": {
174
+ "content": "<pr_base>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": true
180
+ },
181
+ "22": {
182
+ "content": "<pr_file>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "23": {
190
+ "content": "<pr_base_code>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "24": {
198
+ "content": "<pr_diff>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "25": {
206
+ "content": "<pr_diff_hunk>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "26": {
214
+ "content": "<pr_comment>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "27": {
222
+ "content": "<pr_event_id>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "28": {
230
+ "content": "<pr_review>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "29": {
238
+ "content": "<pr_review_state>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "30": {
246
+ "content": "<pr_review_comment>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "31": {
254
+ "content": "<pr_in_reply_to_review_id>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "32": {
262
+ "content": "<pr_in_reply_to_comment_id>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "33": {
270
+ "content": "<pr_diff_hunk_comment_line>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "34": {
278
+ "content": "<NAME>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "35": {
286
+ "content": "<EMAIL>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "36": {
294
+ "content": "<KEY>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "37": {
302
+ "content": "<PASSWORD>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ }
309
+ },
310
+ "additional_special_tokens": [
311
+ "<|endoftext|>",
312
+ "<fim_prefix>",
313
+ "<fim_middle>",
314
+ "<fim_suffix>",
315
+ "<fim_pad>",
316
+ "<repo_name>",
317
+ "<file_sep>",
318
+ "<issue_start>",
319
+ "<issue_comment>",
320
+ "<issue_closed>",
321
+ "<jupyter_start>",
322
+ "<jupyter_text>",
323
+ "<jupyter_code>",
324
+ "<jupyter_output>",
325
+ "<jupyter_script>",
326
+ "<empty_output>",
327
+ "<code_to_intermediate>",
328
+ "<intermediate_to_code>",
329
+ "<pr>",
330
+ "<pr_status>",
331
+ "<pr_is_merged>",
332
+ "<pr_base>",
333
+ "<pr_file>",
334
+ "<pr_base_code>",
335
+ "<pr_diff>",
336
+ "<pr_diff_hunk>",
337
+ "<pr_comment>",
338
+ "<pr_event_id>",
339
+ "<pr_review>",
340
+ "<pr_review_state>",
341
+ "<pr_review_comment>",
342
+ "<pr_in_reply_to_review_id>",
343
+ "<pr_in_reply_to_comment_id>",
344
+ "<pr_diff_hunk_comment_line>",
345
+ "<NAME>",
346
+ "<EMAIL>",
347
+ "<KEY>",
348
+ "<PASSWORD>"
349
+ ],
350
+ "bos_token": "<|endoftext|>",
351
+ "clean_up_tokenization_spaces": true,
352
+ "eos_token": "<|endoftext|>",
353
+ "errors": "replace",
354
+ "model_max_length": 1000000000000000019884624838656,
355
+ "pad_token": "<|endoftext|>",
356
+ "padding_side": "right",
357
+ "split_special_tokens": false,
358
+ "tokenizer_class": "GPT2Tokenizer",
359
+ "unk_token": "<|endoftext|>",
360
+ "vocab_size": 49152
361
+ }
checkpoint-11000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff