P0u4a committed · verified
Commit 2ad4b4e · Parent(s): 8c07f60

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full list.

Files changed (50)
  1. .gitattributes +11 -0
  2. README.md +62 -0
  3. adapter_config.json +42 -0
  4. adapter_model.safetensors +3 -0
  5. added_tokens.json +24 -0
  6. chat_template.jinja +54 -0
  7. checkpoint-100/README.md +209 -0
  8. checkpoint-100/adapter_config.json +42 -0
  9. checkpoint-100/adapter_model.safetensors +3 -0
  10. checkpoint-100/added_tokens.json +24 -0
  11. checkpoint-100/chat_template.jinja +54 -0
  12. checkpoint-100/merges.txt +0 -0
  13. checkpoint-100/optimizer.pt +3 -0
  14. checkpoint-100/rng_state.pth +3 -0
  15. checkpoint-100/scaler.pt +3 -0
  16. checkpoint-100/scheduler.pt +3 -0
  17. checkpoint-100/special_tokens_map.json +31 -0
  18. checkpoint-100/tokenizer.json +3 -0
  19. checkpoint-100/tokenizer_config.json +207 -0
  20. checkpoint-100/trainer_state.json +144 -0
  21. checkpoint-100/training_args.bin +3 -0
  22. checkpoint-100/vocab.json +0 -0
  23. checkpoint-1000/README.md +209 -0
  24. checkpoint-1000/adapter_config.json +42 -0
  25. checkpoint-1000/adapter_model.safetensors +3 -0
  26. checkpoint-1000/added_tokens.json +24 -0
  27. checkpoint-1000/chat_template.jinja +54 -0
  28. checkpoint-1000/merges.txt +0 -0
  29. checkpoint-1000/optimizer.pt +3 -0
  30. checkpoint-1000/rng_state.pth +3 -0
  31. checkpoint-1000/scaler.pt +3 -0
  32. checkpoint-1000/scheduler.pt +3 -0
  33. checkpoint-1000/special_tokens_map.json +31 -0
  34. checkpoint-1000/tokenizer.json +3 -0
  35. checkpoint-1000/tokenizer_config.json +207 -0
  36. checkpoint-1000/trainer_state.json +1134 -0
  37. checkpoint-1000/training_args.bin +3 -0
  38. checkpoint-1000/vocab.json +0 -0
  39. checkpoint-200/README.md +209 -0
  40. checkpoint-200/adapter_config.json +42 -0
  41. checkpoint-200/adapter_model.safetensors +3 -0
  42. checkpoint-200/added_tokens.json +24 -0
  43. checkpoint-200/chat_template.jinja +54 -0
  44. checkpoint-200/merges.txt +0 -0
  45. checkpoint-200/optimizer.pt +3 -0
  46. checkpoint-200/rng_state.pth +3 -0
  47. checkpoint-200/scaler.pt +3 -0
  48. checkpoint-200/scheduler.pt +3 -0
  49. checkpoint-200/special_tokens_map.json +31 -0
  50. checkpoint-200/tokenizer.json +3 -0
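The commit message above says the folder was pushed with `huggingface_hub`. A minimal sketch of that kind of upload follows; the local folder path and the target repo id are assumptions for illustration, and large files such as `tokenizer.json` are stored through Git LFS according to the `.gitattributes` rules added in this commit.

```python
# Sketch: folder upload of the kind described by the commit message.
# The repo id and local path are assumptions, not confirmed by the commit.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./maincoder-1b-toolcalling-lora",   # local training output (assumed path)
    repo_id="P0u4a/maincoder-1b-toolcalling-lora",   # assumed repo id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```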
.gitattributes CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ checkpoint-300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ checkpoint-700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ checkpoint-900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,62 @@
+ ---
+ base_model: Maincode/Maincoder-1B
+ library_name: peft
+ model_name: maincoder-1b-toolcalling-lora
+ tags:
+ - base_model:adapter:Maincode/Maincoder-1B
+ - lora
+ - sft
+ - transformers
+ - trl
+ licence: license
+ pipeline_tag: text-generation
+ ---
+
+ # Model Card for maincoder-1b-toolcalling-lora
+
+ This model is a fine-tuned version of [Maincode/Maincoder-1B](https://huggingface.co/Maincode/Maincoder-1B).
+ It has been trained using [TRL](https://github.com/huggingface/trl).
+
+ ## Quick start
+
+ ```python
+ from transformers import pipeline
+
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+ generator = pipeline("text-generation", model="None", device="cuda")
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+ print(output["generated_text"])
+ ```
+
+ ## Training procedure
+
+
+
+
+ This model was trained with SFT.
+
+ ### Framework versions
+
+ - PEFT 0.17.1
+ - TRL: 0.21.0
+ - Transformers: 4.57.3
+ - Pytorch: 2.10.0+cu128
+ - Datasets: 4.8.4
+ - Tokenizers: 0.22.2
+
+ ## Citations
+
+
+
+ Cite TRL as:
+
+ ```bibtex
+ @misc{vonwerra2022trl,
+ title = {{TRL: Transformer Reinforcement Learning}},
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+ year = 2020,
+ journal = {GitHub repository},
+ publisher = {GitHub},
+ howpublished = {\url{https://github.com/huggingface/trl}}
+ }
+ ```
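Note that the auto-generated quick start above calls `pipeline(..., model="None", ...)`, which is an unfilled placeholder. Since the repository holds a LoRA adapter for Maincode/Maincoder-1B rather than full model weights, a loading sketch using `transformers` plus `peft` is shown below; the adapter repo id is an assumption and should be replaced with the actual one.

```python
# Sketch: load the base model and attach this LoRA adapter with PEFT.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "Maincode/Maincoder-1B"
adapter_id = "P0u4a/maincoder-1b-toolcalling-lora"  # assumed repo id, not confirmed by the commit

tokenizer = AutoTokenizer.from_pretrained(adapter_id)
model = AutoModelForCausalLM.from_pretrained(base_id, device_map="auto")
model = PeftModel.from_pretrained(model, adapter_id)

messages = [{"role": "user", "content": "What's the weather in Melbourne?"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
outputs = model.generate(inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```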
adapter_config.json ADDED
@@ -0,0 +1,42 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "Maincode/Maincoder-1B",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "up_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj",
+ "down_proj",
+ "k_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+ }
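The adapter_config.json above corresponds to a PEFT `LoraConfig` with rank 16, alpha 32, dropout 0.05, and LoRA applied to every attention and MLP projection. A minimal sketch of how such a configuration is typically built for training follows; only the values shown in the file are taken from the commit, the surrounding code is illustrative.

```python
# Sketch: the PEFT LoraConfig implied by adapter_config.json above.
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

base = AutoModelForCausalLM.from_pretrained("Maincode/Maincoder-1B")
model = get_peft_model(base, lora_config)
model.print_trainable_parameters()  # only the LoRA matrices are trainable
```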
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:262aa7df6404543e20f0dde11b7971643ca568b6701eefa4e08c221725518604
+ size 55111408
added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "</tool_call>": 151658,
+ "<tool_call>": 151657,
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|file_sep|>": 151664,
+ "<|fim_middle|>": 151660,
+ "<|fim_pad|>": 151662,
+ "<|fim_prefix|>": 151659,
+ "<|fim_suffix|>": 151661,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|repo_name|>": 151663,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652
+ }
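added_tokens.json maps the ChatML and tool-calling markers to fixed ids (for example `<tool_call>` → 151657). A small sketch for checking these mappings against a loaded tokenizer follows; it assumes the base repo's tokenizer matches the tokenizer files committed here.

```python
# Sketch: verify the token-id mapping recorded in added_tokens.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Maincode/Maincoder-1B")  # assumed to match the committed tokenizer
for token in ["<tool_call>", "</tool_call>", "<|im_start|>", "<|im_end|>"]:
    print(token, tok.convert_tokens_to_ids(token))
# Expected per added_tokens.json: 151657, 151658, 151644, 151645
```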
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
+ {%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0]['role'] == 'system' %}
+ {{- messages[0]['content'] }}
+ {%- else %}
+ {{- 'You are a helpful assistant.' }}
+ {%- endif %}
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+ {%- if messages[0]['role'] == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+ {%- else %}
+ {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {{- '<|im_start|>' + message.role }}
+ {%- if message.content %}
+ {{- '\n' + message.content }}
+ {%- endif %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if tool_call.function is defined %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {{- tool_call.arguments | tojson }}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- endif %}
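The template renders tool schemas inside `<tools>...</tools>` in the system turn, expects assistant tool calls as `<tool_call>{...}</tool_call>` JSON blocks, and folds `tool` messages into `<tool_response>` blocks within a user turn. A short rendering sketch follows; the `get_weather` tool and the tokenizer repo id are illustrative assumptions.

```python
# Sketch: render the tool-calling chat template above for a single user turn.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Maincode/Maincoder-1B")  # assumed repo id

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # hypothetical tool for illustration
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

messages = [{"role": "user", "content": "What's the weather in Melbourne?"}]

# Produces the <|im_start|>system ... <tools>...</tools> prompt and a trailing
# <|im_start|>assistant generation prompt, per the template above.
prompt = tokenizer.apply_chat_template(
    messages, tools=tools, add_generation_prompt=True, tokenize=False
)
print(prompt)
```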
checkpoint-100/README.md ADDED
@@ -0,0 +1,209 @@
1
+ ---
2
+ base_model: Maincode/Maincoder-1B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Maincode/Maincoder-1B
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.17.1
checkpoint-100/adapter_config.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Maincode/Maincoder-1B",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 16,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "v_proj",
29
+ "up_proj",
30
+ "o_proj",
31
+ "q_proj",
32
+ "gate_proj",
33
+ "down_proj",
34
+ "k_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
checkpoint-100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:093e09efc7c93ba64eb2139609390e23ff5574401240c0df3ce85350084e924f
+ size 55111408
checkpoint-100/added_tokens.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoint-100/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-100/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8789dc37c86a726128729e4c2470ec93bb4af3c89ed3e49f04adb9b0fe38eda5
+ size 110484131
checkpoint-100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c81353d18360a112220e3590e9331869a8adf8395d1f4eb39a29ff95fe588d0
+ size 14645
checkpoint-100/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4393a84a3109995aa1202073b039b12062e3189ed89aa0b94ef0510ba843009
+ size 1383
checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:027203cd324846f900163fad9eecbe4aeb19c35a55cd188f85a03ff3e0f62d81
+ size 1465
checkpoint-100/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-100/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+ size 11421896
checkpoint-100/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|endoftext|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 32768,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,144 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.2222222222222222,
6
+ "eval_steps": 50,
7
+ "global_step": 100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.022222222222222223,
14
+ "grad_norm": 0.6193130612373352,
15
+ "learning_rate": 9.910000000000001e-05,
16
+ "loss": 1.654,
17
+ "mean_token_accuracy": 0.6766653224825859,
18
+ "num_tokens": 23330.0,
19
+ "step": 10
20
+ },
21
+ {
22
+ "epoch": 0.044444444444444446,
23
+ "grad_norm": 0.5931444764137268,
24
+ "learning_rate": 9.81e-05,
25
+ "loss": 1.0695,
26
+ "mean_token_accuracy": 0.7898772016167641,
27
+ "num_tokens": 46070.0,
28
+ "step": 20
29
+ },
30
+ {
31
+ "epoch": 0.06666666666666667,
32
+ "grad_norm": 0.3904027044773102,
33
+ "learning_rate": 9.71e-05,
34
+ "loss": 0.7556,
35
+ "mean_token_accuracy": 0.8578989505767822,
36
+ "num_tokens": 68719.0,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.08888888888888889,
41
+ "grad_norm": 0.32560470700263977,
42
+ "learning_rate": 9.61e-05,
43
+ "loss": 0.6186,
44
+ "mean_token_accuracy": 0.8813577085733414,
45
+ "num_tokens": 92126.0,
46
+ "step": 40
47
+ },
48
+ {
49
+ "epoch": 0.1111111111111111,
50
+ "grad_norm": 0.30384671688079834,
51
+ "learning_rate": 9.51e-05,
52
+ "loss": 0.5761,
53
+ "mean_token_accuracy": 0.8921546742320061,
54
+ "num_tokens": 114738.0,
55
+ "step": 50
56
+ },
57
+ {
58
+ "epoch": 0.1111111111111111,
59
+ "eval_loss": 0.5330827236175537,
60
+ "eval_mean_token_accuracy": 0.9009262701869011,
61
+ "eval_num_tokens": 114738.0,
62
+ "eval_runtime": 40.5094,
63
+ "eval_samples_per_second": 4.937,
64
+ "eval_steps_per_second": 4.937,
65
+ "step": 50
66
+ },
67
+ {
68
+ "epoch": 0.13333333333333333,
69
+ "grad_norm": 0.40981799364089966,
70
+ "learning_rate": 9.41e-05,
71
+ "loss": 0.4978,
72
+ "mean_token_accuracy": 0.9065196111798286,
73
+ "num_tokens": 136310.0,
74
+ "step": 60
75
+ },
76
+ {
77
+ "epoch": 0.15555555555555556,
78
+ "grad_norm": 0.5167299509048462,
79
+ "learning_rate": 9.310000000000001e-05,
80
+ "loss": 0.4466,
81
+ "mean_token_accuracy": 0.9203566908836365,
82
+ "num_tokens": 158193.0,
83
+ "step": 70
84
+ },
85
+ {
86
+ "epoch": 0.17777777777777778,
87
+ "grad_norm": 0.5062010884284973,
88
+ "learning_rate": 9.21e-05,
89
+ "loss": 0.4003,
90
+ "mean_token_accuracy": 0.9284946233034134,
91
+ "num_tokens": 180767.0,
92
+ "step": 80
93
+ },
94
+ {
95
+ "epoch": 0.2,
96
+ "grad_norm": 0.38172802329063416,
97
+ "learning_rate": 9.11e-05,
98
+ "loss": 0.3457,
99
+ "mean_token_accuracy": 0.9404646947979927,
100
+ "num_tokens": 203203.0,
101
+ "step": 90
102
+ },
103
+ {
104
+ "epoch": 0.2222222222222222,
105
+ "grad_norm": 0.4985530376434326,
106
+ "learning_rate": 9.010000000000001e-05,
107
+ "loss": 0.3271,
108
+ "mean_token_accuracy": 0.9434564337134361,
109
+ "num_tokens": 226070.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "epoch": 0.2222222222222222,
114
+ "eval_loss": 0.3216829001903534,
115
+ "eval_mean_token_accuracy": 0.9481457704305649,
116
+ "eval_num_tokens": 226070.0,
117
+ "eval_runtime": 41.0245,
118
+ "eval_samples_per_second": 4.875,
119
+ "eval_steps_per_second": 4.875,
120
+ "step": 100
121
+ }
122
+ ],
123
+ "logging_steps": 10,
124
+ "max_steps": 1000,
125
+ "num_input_tokens_seen": 0,
126
+ "num_train_epochs": 3,
127
+ "save_steps": 100,
128
+ "stateful_callbacks": {
129
+ "TrainerControl": {
130
+ "args": {
131
+ "should_epoch_stop": false,
132
+ "should_evaluate": false,
133
+ "should_log": false,
134
+ "should_save": true,
135
+ "should_training_stop": false
136
+ },
137
+ "attributes": {}
138
+ }
139
+ },
140
+ "total_flos": 1094077520640000.0,
141
+ "train_batch_size": 1,
142
+ "trial_name": null,
143
+ "trial_params": null
144
+ }
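checkpoint-100/trainer_state.json records per-device batch size 1, logging every 10 steps, evaluation every 50, checkpoints every 100, and a 1,000-step run; the logged learning rate (9.91e-05 at step 10) is consistent with a peak of roughly 1e-4 decaying linearly, and the scaler.pt files suggest fp16 mixed precision. A hedged sketch of TRL SFT settings that would reproduce this schedule follows; the inferred values are marked as assumptions.

```python
# Sketch: TRL SFT settings consistent with checkpoint-100/trainer_state.json.
# Batch size, logging/eval/save intervals, and max_steps are read off the log;
# learning_rate, scheduler, and fp16 are inferences, not confirmed settings.
from trl import SFTConfig, SFTTrainer

training_args = SFTConfig(
    output_dir="maincoder-1b-toolcalling-lora",
    per_device_train_batch_size=1,
    learning_rate=1e-4,          # assumed peak LR (9.91e-05 logged at step 10)
    lr_scheduler_type="linear",  # assumed from the steady per-step decay
    max_steps=1000,
    num_train_epochs=3,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_steps=100,
    fp16=True,                   # inferred from scaler.pt in each checkpoint
)

# trainer = SFTTrainer(model=model, args=training_args, train_dataset=..., eval_dataset=...)
# trainer.train()
```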
checkpoint-100/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4f24a1d1fcf3561539dba436a511a781c825b96e542ffcf6aba86ad917df122
+ size 6225
checkpoint-100/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/README.md ADDED
@@ -0,0 +1,209 @@
1
+ ---
2
+ base_model: Maincode/Maincoder-1B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Maincode/Maincoder-1B
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.17.1
checkpoint-1000/adapter_config.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Maincode/Maincoder-1B",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 16,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "v_proj",
29
+ "up_proj",
30
+ "o_proj",
31
+ "q_proj",
32
+ "gate_proj",
33
+ "down_proj",
34
+ "k_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
checkpoint-1000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:262aa7df6404543e20f0dde11b7971643ca568b6701eefa4e08c221725518604
+ size 55111408
checkpoint-1000/added_tokens.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoint-1000/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-1000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1528f3832f145fd518620483eec0423a1f1ae65064f157f2e3172977db3b3ff5
+ size 110484131
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:98ff3eb75f21b56c3fc12c648f5272ffc2d353868f9bc476a8d8c21d094f426e
+ size 14645
checkpoint-1000/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:14ae2a2128444abab378aa06c09a61a84665f758fcc19fc46f5789b0bc1b5665
+ size 1383
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9008af82ea2c682c9b64de9d5bd271649247b8bd3c3761983a4def45d64125f2
+ size 1465
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-1000/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+ size 11421896
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|endoftext|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 32768,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,1134 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.2222222222222223,
6
+ "eval_steps": 50,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.022222222222222223,
14
+ "grad_norm": 0.6193130612373352,
15
+ "learning_rate": 9.910000000000001e-05,
16
+ "loss": 1.654,
17
+ "mean_token_accuracy": 0.6766653224825859,
18
+ "num_tokens": 23330.0,
19
+ "step": 10
20
+ },
21
+ {
22
+ "epoch": 0.044444444444444446,
23
+ "grad_norm": 0.5931444764137268,
24
+ "learning_rate": 9.81e-05,
25
+ "loss": 1.0695,
26
+ "mean_token_accuracy": 0.7898772016167641,
27
+ "num_tokens": 46070.0,
28
+ "step": 20
29
+ },
30
+ {
31
+ "epoch": 0.06666666666666667,
32
+ "grad_norm": 0.3904027044773102,
33
+ "learning_rate": 9.71e-05,
34
+ "loss": 0.7556,
35
+ "mean_token_accuracy": 0.8578989505767822,
36
+ "num_tokens": 68719.0,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.08888888888888889,
41
+ "grad_norm": 0.32560470700263977,
42
+ "learning_rate": 9.61e-05,
43
+ "loss": 0.6186,
44
+ "mean_token_accuracy": 0.8813577085733414,
45
+ "num_tokens": 92126.0,
46
+ "step": 40
47
+ },
48
+ {
49
+ "epoch": 0.1111111111111111,
50
+ "grad_norm": 0.30384671688079834,
51
+ "learning_rate": 9.51e-05,
52
+ "loss": 0.5761,
53
+ "mean_token_accuracy": 0.8921546742320061,
54
+ "num_tokens": 114738.0,
55
+ "step": 50
56
+ },
57
+ {
58
+ "epoch": 0.1111111111111111,
59
+ "eval_loss": 0.5330827236175537,
60
+ "eval_mean_token_accuracy": 0.9009262701869011,
61
+ "eval_num_tokens": 114738.0,
62
+ "eval_runtime": 40.5094,
63
+ "eval_samples_per_second": 4.937,
64
+ "eval_steps_per_second": 4.937,
65
+ "step": 50
66
+ },
67
+ {
68
+ "epoch": 0.13333333333333333,
69
+ "grad_norm": 0.40981799364089966,
70
+ "learning_rate": 9.41e-05,
71
+ "loss": 0.4978,
72
+ "mean_token_accuracy": 0.9065196111798286,
73
+ "num_tokens": 136310.0,
74
+ "step": 60
75
+ },
76
+ {
77
+ "epoch": 0.15555555555555556,
78
+ "grad_norm": 0.5167299509048462,
79
+ "learning_rate": 9.310000000000001e-05,
80
+ "loss": 0.4466,
81
+ "mean_token_accuracy": 0.9203566908836365,
82
+ "num_tokens": 158193.0,
83
+ "step": 70
84
+ },
85
+ {
86
+ "epoch": 0.17777777777777778,
87
+ "grad_norm": 0.5062010884284973,
88
+ "learning_rate": 9.21e-05,
89
+ "loss": 0.4003,
90
+ "mean_token_accuracy": 0.9284946233034134,
91
+ "num_tokens": 180767.0,
92
+ "step": 80
93
+ },
94
+ {
95
+ "epoch": 0.2,
96
+ "grad_norm": 0.38172802329063416,
97
+ "learning_rate": 9.11e-05,
98
+ "loss": 0.3457,
99
+ "mean_token_accuracy": 0.9404646947979927,
100
+ "num_tokens": 203203.0,
101
+ "step": 90
102
+ },
103
+ {
104
+ "epoch": 0.2222222222222222,
105
+ "grad_norm": 0.4985530376434326,
106
+ "learning_rate": 9.010000000000001e-05,
107
+ "loss": 0.3271,
108
+ "mean_token_accuracy": 0.9434564337134361,
109
+ "num_tokens": 226070.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "epoch": 0.2222222222222222,
114
+ "eval_loss": 0.3216829001903534,
115
+ "eval_mean_token_accuracy": 0.9481457704305649,
116
+ "eval_num_tokens": 226070.0,
117
+ "eval_runtime": 41.0245,
118
+ "eval_samples_per_second": 4.875,
119
+ "eval_steps_per_second": 4.875,
120
+ "step": 100
121
+ },
122
+ {
123
+ "epoch": 0.24444444444444444,
124
+ "grad_norm": 0.3317733108997345,
125
+ "learning_rate": 8.910000000000001e-05,
126
+ "loss": 0.2834,
127
+ "mean_token_accuracy": 0.9527689620852471,
128
+ "num_tokens": 251659.0,
129
+ "step": 110
130
+ },
131
+ {
132
+ "epoch": 0.26666666666666666,
133
+ "grad_norm": 0.28577539324760437,
134
+ "learning_rate": 8.81e-05,
135
+ "loss": 0.262,
136
+ "mean_token_accuracy": 0.9557135447859764,
137
+ "num_tokens": 276141.0,
138
+ "step": 120
139
+ },
140
+ {
141
+ "epoch": 0.28888888888888886,
142
+ "grad_norm": 0.4030509293079376,
143
+ "learning_rate": 8.71e-05,
144
+ "loss": 0.26,
145
+ "mean_token_accuracy": 0.9594775453209877,
146
+ "num_tokens": 299250.0,
147
+ "step": 130
148
+ },
149
+ {
150
+ "epoch": 0.3111111111111111,
151
+ "grad_norm": 0.4423465132713318,
152
+ "learning_rate": 8.61e-05,
153
+ "loss": 0.2557,
154
+ "mean_token_accuracy": 0.9583914607763291,
155
+ "num_tokens": 323528.0,
156
+ "step": 140
157
+ },
158
+ {
159
+ "epoch": 0.3333333333333333,
160
+ "grad_norm": 0.49233391880989075,
161
+ "learning_rate": 8.510000000000001e-05,
162
+ "loss": 0.2444,
163
+ "mean_token_accuracy": 0.9617681756615639,
164
+ "num_tokens": 346496.0,
165
+ "step": 150
166
+ },
167
+ {
168
+ "epoch": 0.3333333333333333,
169
+ "eval_loss": 0.24680288136005402,
170
+ "eval_mean_token_accuracy": 0.963321838080883,
171
+ "eval_num_tokens": 346496.0,
172
+ "eval_runtime": 41.1525,
173
+ "eval_samples_per_second": 4.86,
174
+ "eval_steps_per_second": 4.86,
175
+ "step": 150
176
+ },
177
+ {
178
+ "epoch": 0.35555555555555557,
179
+ "grad_norm": 0.2696438133716583,
180
+ "learning_rate": 8.41e-05,
181
+ "loss": 0.2196,
182
+ "mean_token_accuracy": 0.9656250089406967,
183
+ "num_tokens": 371258.0,
184
+ "step": 160
185
+ },
186
+ {
187
+ "epoch": 0.37777777777777777,
188
+ "grad_norm": 0.525949239730835,
189
+ "learning_rate": 8.31e-05,
190
+ "loss": 0.2122,
191
+ "mean_token_accuracy": 0.9669450834393502,
192
+ "num_tokens": 395722.0,
193
+ "step": 170
194
+ },
195
+ {
196
+ "epoch": 0.4,
197
+ "grad_norm": 0.38804805278778076,
198
+ "learning_rate": 8.21e-05,
199
+ "loss": 0.2097,
200
+ "mean_token_accuracy": 0.9699977666139603,
201
+ "num_tokens": 418830.0,
202
+ "step": 180
203
+ },
204
+ {
205
+ "epoch": 0.4222222222222222,
206
+ "grad_norm": 0.38709455728530884,
207
+ "learning_rate": 8.11e-05,
208
+ "loss": 0.1914,
209
+ "mean_token_accuracy": 0.9692254871129989,
210
+ "num_tokens": 443015.0,
211
+ "step": 190
212
+ },
213
+ {
214
+ "epoch": 0.4444444444444444,
215
+ "grad_norm": 0.28362640738487244,
216
+ "learning_rate": 8.010000000000001e-05,
217
+ "loss": 0.2051,
218
+ "mean_token_accuracy": 0.968667496740818,
219
+ "num_tokens": 465941.0,
220
+ "step": 200
221
+ },
222
+ {
223
+ "epoch": 0.4444444444444444,
224
+ "eval_loss": 0.21475830674171448,
225
+ "eval_mean_token_accuracy": 0.9686776822805405,
226
+ "eval_num_tokens": 465941.0,
227
+ "eval_runtime": 40.8312,
228
+ "eval_samples_per_second": 4.898,
229
+ "eval_steps_per_second": 4.898,
230
+ "step": 200
231
+ },
232
+ {
233
+ "epoch": 0.4666666666666667,
234
+ "grad_norm": 0.17446358501911163,
235
+ "learning_rate": 7.910000000000001e-05,
236
+ "loss": 0.1992,
237
+ "mean_token_accuracy": 0.9691774398088455,
238
+ "num_tokens": 489869.0,
239
+ "step": 210
240
+ },
241
+ {
242
+ "epoch": 0.4888888888888889,
243
+ "grad_norm": 0.47806766629219055,
244
+ "learning_rate": 7.81e-05,
245
+ "loss": 0.18,
246
+ "mean_token_accuracy": 0.9718323066830635,
247
+ "num_tokens": 514551.0,
248
+ "step": 220
249
+ },
250
+ {
251
+ "epoch": 0.5111111111111111,
252
+ "grad_norm": 0.4450798034667969,
253
+ "learning_rate": 7.71e-05,
254
+ "loss": 0.1818,
255
+ "mean_token_accuracy": 0.9731900364160537,
256
+ "num_tokens": 539497.0,
257
+ "step": 230
258
+ },
259
+ {
260
+ "epoch": 0.5333333333333333,
261
+ "grad_norm": 0.3654688596725464,
262
+ "learning_rate": 7.61e-05,
263
+ "loss": 0.2072,
264
+ "mean_token_accuracy": 0.9681414648890495,
265
+ "num_tokens": 560454.0,
266
+ "step": 240
267
+ },
268
+ {
269
+ "epoch": 0.5555555555555556,
270
+ "grad_norm": 0.26568618416786194,
271
+ "learning_rate": 7.510000000000001e-05,
272
+ "loss": 0.1858,
273
+ "mean_token_accuracy": 0.9720202058553695,
274
+ "num_tokens": 584595.0,
275
+ "step": 250
276
+ },
277
+ {
278
+ "epoch": 0.5555555555555556,
279
+ "eval_loss": 0.19870425760746002,
280
+ "eval_mean_token_accuracy": 0.9720033320784569,
281
+ "eval_num_tokens": 584595.0,
282
+ "eval_runtime": 40.9104,
283
+ "eval_samples_per_second": 4.889,
284
+ "eval_steps_per_second": 4.889,
285
+ "step": 250
286
+ },
287
+ {
288
+ "epoch": 0.5777777777777777,
289
+ "grad_norm": 0.21654823422431946,
290
+ "learning_rate": 7.41e-05,
291
+ "loss": 0.1765,
292
+ "mean_token_accuracy": 0.9739127770066262,
293
+ "num_tokens": 608352.0,
294
+ "step": 260
295
+ },
296
+ {
297
+ "epoch": 0.6,
298
+ "grad_norm": 0.3529929220676422,
299
+ "learning_rate": 7.31e-05,
300
+ "loss": 0.1726,
301
+ "mean_token_accuracy": 0.9745927527546883,
302
+ "num_tokens": 632550.0,
303
+ "step": 270
304
+ },
305
+ {
306
+ "epoch": 0.6222222222222222,
307
+ "grad_norm": 0.2798812985420227,
308
+ "learning_rate": 7.21e-05,
309
+ "loss": 0.1814,
310
+ "mean_token_accuracy": 0.9733540549874306,
311
+ "num_tokens": 655957.0,
312
+ "step": 280
313
+ },
314
+ {
315
+ "epoch": 0.6444444444444445,
316
+ "grad_norm": 0.28510963916778564,
317
+ "learning_rate": 7.11e-05,
318
+ "loss": 0.1789,
319
+ "mean_token_accuracy": 0.9735615655779839,
320
+ "num_tokens": 680743.0,
321
+ "step": 290
322
+ },
323
+ {
324
+ "epoch": 0.6666666666666666,
325
+ "grad_norm": 0.22315602004528046,
326
+ "learning_rate": 7.01e-05,
327
+ "loss": 0.1777,
328
+ "mean_token_accuracy": 0.9739167138934135,
329
+ "num_tokens": 704844.0,
330
+ "step": 300
331
+ },
332
+ {
333
+ "epoch": 0.6666666666666666,
334
+ "eval_loss": 0.18998412787914276,
335
+ "eval_mean_token_accuracy": 0.9746258324384689,
336
+ "eval_num_tokens": 704844.0,
337
+ "eval_runtime": 40.6816,
338
+ "eval_samples_per_second": 4.916,
339
+ "eval_steps_per_second": 4.916,
340
+ "step": 300
341
+ },
342
+ {
343
+ "epoch": 0.6888888888888889,
344
+ "grad_norm": 0.25007760524749756,
345
+ "learning_rate": 6.91e-05,
346
+ "loss": 0.1913,
347
+ "mean_token_accuracy": 0.9710126295685768,
348
+ "num_tokens": 724403.0,
349
+ "step": 310
350
+ },
351
+ {
352
+ "epoch": 0.7111111111111111,
353
+ "grad_norm": 0.19620826840400696,
354
+ "learning_rate": 6.81e-05,
355
+ "loss": 0.1782,
356
+ "mean_token_accuracy": 0.9740939602255821,
357
+ "num_tokens": 748460.0,
358
+ "step": 320
359
+ },
360
+ {
361
+ "epoch": 0.7333333333333333,
362
+ "grad_norm": 0.2543444037437439,
363
+ "learning_rate": 6.71e-05,
364
+ "loss": 0.1825,
365
+ "mean_token_accuracy": 0.972811259329319,
366
+ "num_tokens": 769992.0,
367
+ "step": 330
368
+ },
369
+ {
370
+ "epoch": 0.7555555555555555,
371
+ "grad_norm": 0.12709860503673553,
372
+ "learning_rate": 6.610000000000001e-05,
373
+ "loss": 0.1828,
374
+ "mean_token_accuracy": 0.9728233471512795,
375
+ "num_tokens": 792260.0,
376
+ "step": 340
377
+ },
378
+ {
379
+ "epoch": 0.7777777777777778,
380
+ "grad_norm": 0.24585242569446564,
381
+ "learning_rate": 6.510000000000001e-05,
382
+ "loss": 0.1798,
383
+ "mean_token_accuracy": 0.9745096892118454,
384
+ "num_tokens": 815593.0,
385
+ "step": 350
386
+ },
387
+ {
388
+ "epoch": 0.7777777777777778,
389
+ "eval_loss": 0.18533459305763245,
390
+ "eval_mean_token_accuracy": 0.9745631077885628,
391
+ "eval_num_tokens": 815593.0,
392
+ "eval_runtime": 40.662,
393
+ "eval_samples_per_second": 4.919,
394
+ "eval_steps_per_second": 4.919,
395
+ "step": 350
396
+ },
397
+ {
398
+ "epoch": 0.8,
399
+ "grad_norm": 0.22581206262111664,
400
+ "learning_rate": 6.41e-05,
401
+ "loss": 0.1807,
402
+ "mean_token_accuracy": 0.9736085414886475,
403
+ "num_tokens": 837982.0,
404
+ "step": 360
405
+ },
406
+ {
407
+ "epoch": 0.8222222222222222,
408
+ "grad_norm": 0.2912260591983795,
409
+ "learning_rate": 6.31e-05,
410
+ "loss": 0.1668,
411
+ "mean_token_accuracy": 0.9759353816509246,
412
+ "num_tokens": 860757.0,
413
+ "step": 370
414
+ },
415
+ {
416
+ "epoch": 0.8444444444444444,
417
+ "grad_norm": 0.1845020055770874,
418
+ "learning_rate": 6.21e-05,
419
+ "loss": 0.1839,
420
+ "mean_token_accuracy": 0.9732057675719261,
421
+ "num_tokens": 881639.0,
422
+ "step": 380
423
+ },
424
+ {
425
+ "epoch": 0.8666666666666667,
426
+ "grad_norm": 0.2601327896118164,
427
+ "learning_rate": 6.110000000000001e-05,
428
+ "loss": 0.1652,
429
+ "mean_token_accuracy": 0.9748824626207352,
430
+ "num_tokens": 905643.0,
431
+ "step": 390
432
+ },
433
+ {
434
+ "epoch": 0.8888888888888888,
435
+ "grad_norm": 0.16098515689373016,
436
+ "learning_rate": 6.0100000000000004e-05,
437
+ "loss": 0.171,
438
+ "mean_token_accuracy": 0.9756452158093453,
439
+ "num_tokens": 929665.0,
440
+ "step": 400
441
+ },
442
+ {
443
+ "epoch": 0.8888888888888888,
444
+ "eval_loss": 0.18200141191482544,
445
+ "eval_mean_token_accuracy": 0.9752507287263871,
446
+ "eval_num_tokens": 929665.0,
447
+ "eval_runtime": 40.698,
448
+ "eval_samples_per_second": 4.914,
449
+ "eval_steps_per_second": 4.914,
450
+ "step": 400
451
+ },
452
+ {
453
+ "epoch": 0.9111111111111111,
454
+ "grad_norm": 0.16805872321128845,
455
+ "learning_rate": 5.91e-05,
456
+ "loss": 0.1718,
457
+ "mean_token_accuracy": 0.9748784646391868,
458
+ "num_tokens": 951866.0,
459
+ "step": 410
460
+ },
461
+ {
462
+ "epoch": 0.9333333333333333,
463
+ "grad_norm": 0.14501382410526276,
464
+ "learning_rate": 5.8099999999999996e-05,
465
+ "loss": 0.1662,
466
+ "mean_token_accuracy": 0.9753882080316544,
467
+ "num_tokens": 975946.0,
468
+ "step": 420
469
+ },
470
+ {
471
+ "epoch": 0.9555555555555556,
472
+ "grad_norm": 0.17429794371128082,
473
+ "learning_rate": 5.71e-05,
474
+ "loss": 0.1719,
475
+ "mean_token_accuracy": 0.9735226526856422,
476
+ "num_tokens": 997417.0,
477
+ "step": 430
478
+ },
479
+ {
480
+ "epoch": 0.9777777777777777,
481
+ "grad_norm": 0.21708282828330994,
482
+ "learning_rate": 5.610000000000001e-05,
483
+ "loss": 0.1679,
484
+ "mean_token_accuracy": 0.9750987365841866,
485
+ "num_tokens": 1021762.0,
486
+ "step": 440
487
+ },
488
+ {
489
+ "epoch": 1.0,
490
+ "grad_norm": 0.11724573373794556,
491
+ "learning_rate": 5.5100000000000004e-05,
492
+ "loss": 0.1648,
493
+ "mean_token_accuracy": 0.9775774568319321,
494
+ "num_tokens": 1046469.0,
495
+ "step": 450
496
+ },
497
+ {
498
+ "epoch": 1.0,
499
+ "eval_loss": 0.18049028515815735,
500
+ "eval_mean_token_accuracy": 0.9776099815964698,
501
+ "eval_num_tokens": 1046469.0,
502
+ "eval_runtime": 40.8205,
503
+ "eval_samples_per_second": 4.899,
504
+ "eval_steps_per_second": 4.899,
505
+ "step": 450
506
+ },
507
+ {
508
+ "epoch": 1.0222222222222221,
509
+ "grad_norm": 0.13565418124198914,
510
+ "learning_rate": 5.410000000000001e-05,
511
+ "loss": 0.161,
512
+ "mean_token_accuracy": 0.9767818033695221,
513
+ "num_tokens": 1070354.0,
514
+ "step": 460
515
+ },
516
+ {
517
+ "epoch": 1.0444444444444445,
518
+ "grad_norm": 0.14690855145454407,
519
+ "learning_rate": 5.31e-05,
520
+ "loss": 0.1616,
521
+ "mean_token_accuracy": 0.9762020215392113,
522
+ "num_tokens": 1095160.0,
523
+ "step": 470
524
+ },
525
+ {
526
+ "epoch": 1.0666666666666667,
527
+ "grad_norm": 0.17638139426708221,
528
+ "learning_rate": 5.2100000000000006e-05,
529
+ "loss": 0.1633,
530
+ "mean_token_accuracy": 0.9768583804368973,
531
+ "num_tokens": 1118117.0,
532
+ "step": 480
533
+ },
534
+ {
535
+ "epoch": 1.0888888888888888,
536
+ "grad_norm": 0.18202361464500427,
537
+ "learning_rate": 5.11e-05,
538
+ "loss": 0.165,
539
+ "mean_token_accuracy": 0.9771157950162888,
540
+ "num_tokens": 1140967.0,
541
+ "step": 490
542
+ },
543
+ {
544
+ "epoch": 1.1111111111111112,
545
+ "grad_norm": 0.1454939991235733,
546
+ "learning_rate": 5.0100000000000005e-05,
547
+ "loss": 0.1647,
548
+ "mean_token_accuracy": 0.9771055221557617,
549
+ "num_tokens": 1165040.0,
550
+ "step": 500
551
+ },
552
+ {
553
+ "epoch": 1.1111111111111112,
554
+ "eval_loss": 0.17858880758285522,
555
+ "eval_mean_token_accuracy": 0.9758283907175064,
556
+ "eval_num_tokens": 1165040.0,
557
+ "eval_runtime": 40.5796,
558
+ "eval_samples_per_second": 4.929,
559
+ "eval_steps_per_second": 4.929,
560
+ "step": 500
561
+ },
562
+ {
563
+ "epoch": 1.1333333333333333,
564
+ "grad_norm": 0.18715986609458923,
565
+ "learning_rate": 4.91e-05,
566
+ "loss": 0.1647,
567
+ "mean_token_accuracy": 0.9763637855648994,
568
+ "num_tokens": 1188748.0,
569
+ "step": 510
570
+ },
571
+ {
572
+ "epoch": 1.1555555555555554,
573
+ "grad_norm": 0.1779111623764038,
574
+ "learning_rate": 4.8100000000000004e-05,
575
+ "loss": 0.1696,
576
+ "mean_token_accuracy": 0.9764641597867012,
577
+ "num_tokens": 1211411.0,
578
+ "step": 520
579
+ },
580
+ {
581
+ "epoch": 1.1777777777777778,
582
+ "grad_norm": 0.19597475230693817,
583
+ "learning_rate": 4.71e-05,
584
+ "loss": 0.1707,
585
+ "mean_token_accuracy": 0.976306001842022,
586
+ "num_tokens": 1233545.0,
587
+ "step": 530
588
+ },
589
+ {
590
+ "epoch": 1.2,
591
+ "grad_norm": 0.15781964361667633,
592
+ "learning_rate": 4.61e-05,
593
+ "loss": 0.1617,
594
+ "mean_token_accuracy": 0.9768658638000488,
595
+ "num_tokens": 1257597.0,
596
+ "step": 540
597
+ },
598
+ {
599
+ "epoch": 1.2222222222222223,
600
+ "grad_norm": 0.16846869885921478,
601
+ "learning_rate": 4.5100000000000005e-05,
602
+ "loss": 0.1643,
603
+ "mean_token_accuracy": 0.9779975593090058,
604
+ "num_tokens": 1280857.0,
605
+ "step": 550
606
+ },
607
+ {
608
+ "epoch": 1.2222222222222223,
609
+ "eval_loss": 0.17742373049259186,
610
+ "eval_mean_token_accuracy": 0.975758826136589,
611
+ "eval_num_tokens": 1280857.0,
612
+ "eval_runtime": 40.7021,
613
+ "eval_samples_per_second": 4.914,
614
+ "eval_steps_per_second": 4.914,
615
+ "step": 550
616
+ },
617
+ {
618
+ "epoch": 1.2444444444444445,
619
+ "grad_norm": 0.09051565825939178,
620
+ "learning_rate": 4.41e-05,
621
+ "loss": 0.1672,
622
+ "mean_token_accuracy": 0.9775181457400322,
623
+ "num_tokens": 1304923.0,
624
+ "step": 560
625
+ },
626
+ {
627
+ "epoch": 1.2666666666666666,
628
+ "grad_norm": 0.12825968861579895,
629
+ "learning_rate": 4.3100000000000004e-05,
630
+ "loss": 0.1709,
631
+ "mean_token_accuracy": 0.976058243215084,
632
+ "num_tokens": 1326584.0,
633
+ "step": 570
634
+ },
635
+ {
636
+ "epoch": 1.2888888888888888,
637
+ "grad_norm": 0.11256464570760727,
638
+ "learning_rate": 4.21e-05,
639
+ "loss": 0.1566,
640
+ "mean_token_accuracy": 0.9770786449313164,
641
+ "num_tokens": 1351586.0,
642
+ "step": 580
643
+ },
644
+ {
645
+ "epoch": 1.3111111111111111,
646
+ "grad_norm": 0.19633373618125916,
647
+ "learning_rate": 4.11e-05,
648
+ "loss": 0.1815,
649
+ "mean_token_accuracy": 0.9755366548895836,
650
+ "num_tokens": 1372532.0,
651
+ "step": 590
652
+ },
653
+ {
654
+ "epoch": 1.3333333333333333,
655
+ "grad_norm": 0.1573602855205536,
656
+ "learning_rate": 4.0100000000000006e-05,
657
+ "loss": 0.1666,
658
+ "mean_token_accuracy": 0.9768186181783676,
659
+ "num_tokens": 1395159.0,
660
+ "step": 600
661
+ },
662
+ {
663
+ "epoch": 1.3333333333333333,
664
+ "eval_loss": 0.17687298357486725,
665
+ "eval_mean_token_accuracy": 0.9778035506606102,
666
+ "eval_num_tokens": 1395159.0,
667
+ "eval_runtime": 40.6718,
668
+ "eval_samples_per_second": 4.917,
669
+ "eval_steps_per_second": 4.917,
670
+ "step": 600
671
+ },
672
+ {
673
+ "epoch": 1.3555555555555556,
674
+ "grad_norm": 0.13229654729366302,
675
+ "learning_rate": 3.91e-05,
676
+ "loss": 0.1554,
677
+ "mean_token_accuracy": 0.9773815423250198,
678
+ "num_tokens": 1419268.0,
679
+ "step": 610
680
+ },
681
+ {
682
+ "epoch": 1.3777777777777778,
683
+ "grad_norm": 0.13749773800373077,
684
+ "learning_rate": 3.8100000000000005e-05,
685
+ "loss": 0.1619,
686
+ "mean_token_accuracy": 0.9785586386919022,
687
+ "num_tokens": 1443302.0,
688
+ "step": 620
689
+ },
690
+ {
691
+ "epoch": 1.4,
692
+ "grad_norm": 0.1410885453224182,
693
+ "learning_rate": 3.71e-05,
694
+ "loss": 0.1665,
695
+ "mean_token_accuracy": 0.9780510842800141,
696
+ "num_tokens": 1465561.0,
697
+ "step": 630
698
+ },
699
+ {
700
+ "epoch": 1.4222222222222223,
701
+ "grad_norm": 0.191091850399971,
702
+ "learning_rate": 3.61e-05,
703
+ "loss": 0.1655,
704
+ "mean_token_accuracy": 0.9782152384519577,
705
+ "num_tokens": 1488865.0,
706
+ "step": 640
707
+ },
708
+ {
709
+ "epoch": 1.4444444444444444,
710
+ "grad_norm": 0.19034996628761292,
711
+ "learning_rate": 3.51e-05,
712
+ "loss": 0.1692,
713
+ "mean_token_accuracy": 0.9753783032298088,
714
+ "num_tokens": 1510010.0,
715
+ "step": 650
716
+ },
717
+ {
718
+ "epoch": 1.4444444444444444,
719
+ "eval_loss": 0.17606773972511292,
720
+ "eval_mean_token_accuracy": 0.9765824827551842,
721
+ "eval_num_tokens": 1510010.0,
722
+ "eval_runtime": 40.6251,
723
+ "eval_samples_per_second": 4.923,
724
+ "eval_steps_per_second": 4.923,
725
+ "step": 650
726
+ },
727
+ {
728
+ "epoch": 1.4666666666666668,
729
+ "grad_norm": 0.1246759369969368,
730
+ "learning_rate": 3.41e-05,
731
+ "loss": 0.1587,
732
+ "mean_token_accuracy": 0.9783496215939522,
733
+ "num_tokens": 1533645.0,
734
+ "step": 660
735
+ },
736
+ {
737
+ "epoch": 1.488888888888889,
738
+ "grad_norm": 0.14433911442756653,
739
+ "learning_rate": 3.3100000000000005e-05,
740
+ "loss": 0.1751,
741
+ "mean_token_accuracy": 0.9757299274206161,
742
+ "num_tokens": 1555222.0,
743
+ "step": 670
744
+ },
745
+ {
746
+ "epoch": 1.511111111111111,
747
+ "grad_norm": 0.09897135943174362,
748
+ "learning_rate": 3.21e-05,
749
+ "loss": 0.1538,
750
+ "mean_token_accuracy": 0.9790621817111969,
751
+ "num_tokens": 1578797.0,
752
+ "step": 680
753
+ },
754
+ {
755
+ "epoch": 1.5333333333333332,
756
+ "grad_norm": 0.11254730075597763,
757
+ "learning_rate": 3.1100000000000004e-05,
758
+ "loss": 0.1506,
759
+ "mean_token_accuracy": 0.9802307024598121,
760
+ "num_tokens": 1604438.0,
761
+ "step": 690
762
+ },
763
+ {
764
+ "epoch": 1.5555555555555556,
765
+ "grad_norm": 0.10534244775772095,
766
+ "learning_rate": 3.01e-05,
767
+ "loss": 0.1642,
768
+ "mean_token_accuracy": 0.9772371336817741,
769
+ "num_tokens": 1626780.0,
770
+ "step": 700
771
+ },
772
+ {
773
+ "epoch": 1.5555555555555556,
774
+ "eval_loss": 0.17524176836013794,
775
+ "eval_mean_token_accuracy": 0.9759097599983215,
776
+ "eval_num_tokens": 1626780.0,
777
+ "eval_runtime": 41.004,
778
+ "eval_samples_per_second": 4.878,
779
+ "eval_steps_per_second": 4.878,
780
+ "step": 700
781
+ },
782
+ {
783
+ "epoch": 1.5777777777777777,
784
+ "grad_norm": 0.14336709678173065,
785
+ "learning_rate": 2.91e-05,
786
+ "loss": 0.1595,
787
+ "mean_token_accuracy": 0.977231676876545,
788
+ "num_tokens": 1649439.0,
789
+ "step": 710
790
+ },
791
+ {
792
+ "epoch": 1.6,
793
+ "grad_norm": 0.18425902724266052,
794
+ "learning_rate": 2.8100000000000005e-05,
795
+ "loss": 0.1646,
796
+ "mean_token_accuracy": 0.9778214260935784,
797
+ "num_tokens": 1671958.0,
798
+ "step": 720
799
+ },
800
+ {
801
+ "epoch": 1.6222222222222222,
802
+ "grad_norm": 0.12844131886959076,
803
+ "learning_rate": 2.7100000000000005e-05,
804
+ "loss": 0.1693,
805
+ "mean_token_accuracy": 0.9771967604756355,
806
+ "num_tokens": 1694994.0,
807
+ "step": 730
808
+ },
809
+ {
810
+ "epoch": 1.6444444444444444,
811
+ "grad_norm": 0.12891170382499695,
812
+ "learning_rate": 2.61e-05,
813
+ "loss": 0.161,
814
+ "mean_token_accuracy": 0.9793439760804177,
815
+ "num_tokens": 1719686.0,
816
+ "step": 740
817
+ },
818
+ {
819
+ "epoch": 1.6666666666666665,
820
+ "grad_norm": 0.12332133948802948,
821
+ "learning_rate": 2.51e-05,
822
+ "loss": 0.1555,
823
+ "mean_token_accuracy": 0.9788334503769874,
824
+ "num_tokens": 1744365.0,
825
+ "step": 750
826
+ },
827
+ {
828
+ "epoch": 1.6666666666666665,
829
+ "eval_loss": 0.1749415248632431,
830
+ "eval_mean_token_accuracy": 0.9778647214174271,
831
+ "eval_num_tokens": 1744365.0,
832
+ "eval_runtime": 40.6207,
833
+ "eval_samples_per_second": 4.924,
834
+ "eval_steps_per_second": 4.924,
835
+ "step": 750
836
+ },
837
+ {
838
+ "epoch": 1.6888888888888889,
839
+ "grad_norm": 0.20146825909614563,
840
+ "learning_rate": 2.41e-05,
841
+ "loss": 0.1559,
842
+ "mean_token_accuracy": 0.9788876891136169,
843
+ "num_tokens": 1768804.0,
844
+ "step": 760
845
+ },
846
+ {
847
+ "epoch": 1.7111111111111112,
848
+ "grad_norm": 0.12395230680704117,
849
+ "learning_rate": 2.3100000000000002e-05,
850
+ "loss": 0.1606,
851
+ "mean_token_accuracy": 0.9783805578947067,
852
+ "num_tokens": 1791264.0,
853
+ "step": 770
854
+ },
855
+ {
856
+ "epoch": 1.7333333333333334,
857
+ "grad_norm": 0.15062493085861206,
858
+ "learning_rate": 2.2100000000000002e-05,
859
+ "loss": 0.1594,
860
+ "mean_token_accuracy": 0.9800337478518486,
861
+ "num_tokens": 1815439.0,
862
+ "step": 780
863
+ },
864
+ {
865
+ "epoch": 1.7555555555555555,
866
+ "grad_norm": 0.14727267622947693,
867
+ "learning_rate": 2.11e-05,
868
+ "loss": 0.1609,
869
+ "mean_token_accuracy": 0.9797701701521874,
870
+ "num_tokens": 1839541.0,
871
+ "step": 790
872
+ },
873
+ {
874
+ "epoch": 1.7777777777777777,
875
+ "grad_norm": 0.14255313575267792,
876
+ "learning_rate": 2.01e-05,
877
+ "loss": 0.1632,
878
+ "mean_token_accuracy": 0.9785327047109604,
879
+ "num_tokens": 1862487.0,
880
+ "step": 800
881
+ },
882
+ {
883
+ "epoch": 1.7777777777777777,
884
+ "eval_loss": 0.1741703748703003,
885
+ "eval_mean_token_accuracy": 0.9780414113402367,
886
+ "eval_num_tokens": 1862487.0,
887
+ "eval_runtime": 40.5514,
888
+ "eval_samples_per_second": 4.932,
889
+ "eval_steps_per_second": 4.932,
890
+ "step": 800
891
+ },
892
+ {
893
+ "epoch": 1.8,
894
+ "grad_norm": 0.13947232067584991,
895
+ "learning_rate": 1.91e-05,
896
+ "loss": 0.1633,
897
+ "mean_token_accuracy": 0.9781667277216911,
898
+ "num_tokens": 1887435.0,
899
+ "step": 810
900
+ },
901
+ {
902
+ "epoch": 1.8222222222222222,
903
+ "grad_norm": 0.13715006411075592,
904
+ "learning_rate": 1.81e-05,
905
+ "loss": 0.1758,
906
+ "mean_token_accuracy": 0.9765221685171127,
907
+ "num_tokens": 1908438.0,
908
+ "step": 820
909
+ },
910
+ {
911
+ "epoch": 1.8444444444444446,
912
+ "grad_norm": 0.14724566042423248,
913
+ "learning_rate": 1.7100000000000002e-05,
914
+ "loss": 0.1636,
915
+ "mean_token_accuracy": 0.9784929260611535,
916
+ "num_tokens": 1931471.0,
917
+ "step": 830
918
+ },
919
+ {
920
+ "epoch": 1.8666666666666667,
921
+ "grad_norm": 0.12384811788797379,
922
+ "learning_rate": 1.6100000000000002e-05,
923
+ "loss": 0.15,
924
+ "mean_token_accuracy": 0.9788558304309845,
925
+ "num_tokens": 1955944.0,
926
+ "step": 840
927
+ },
928
+ {
929
+ "epoch": 1.8888888888888888,
930
+ "grad_norm": 0.2311498075723648,
931
+ "learning_rate": 1.51e-05,
932
+ "loss": 0.1565,
933
+ "mean_token_accuracy": 0.977969428896904,
934
+ "num_tokens": 1978822.0,
935
+ "step": 850
936
+ },
937
+ {
938
+ "epoch": 1.8888888888888888,
939
+ "eval_loss": 0.1739804595708847,
940
+ "eval_mean_token_accuracy": 0.9762575566768646,
941
+ "eval_num_tokens": 1978822.0,
942
+ "eval_runtime": 40.9407,
943
+ "eval_samples_per_second": 4.885,
944
+ "eval_steps_per_second": 4.885,
945
+ "step": 850
946
+ },
947
+ {
948
+ "epoch": 1.911111111111111,
949
+ "grad_norm": 0.15526416897773743,
950
+ "learning_rate": 1.4099999999999999e-05,
951
+ "loss": 0.1574,
952
+ "mean_token_accuracy": 0.9787370949983597,
953
+ "num_tokens": 2002989.0,
954
+ "step": 860
955
+ },
956
+ {
957
+ "epoch": 1.9333333333333333,
958
+ "grad_norm": 0.13134504854679108,
959
+ "learning_rate": 1.3100000000000002e-05,
960
+ "loss": 0.1766,
961
+ "mean_token_accuracy": 0.9767401814460754,
962
+ "num_tokens": 2025070.0,
963
+ "step": 870
964
+ },
965
+ {
966
+ "epoch": 1.9555555555555557,
967
+ "grad_norm": 0.24305500090122223,
968
+ "learning_rate": 1.2100000000000001e-05,
969
+ "loss": 0.1571,
970
+ "mean_token_accuracy": 0.978802427649498,
971
+ "num_tokens": 2048924.0,
972
+ "step": 880
973
+ },
974
+ {
975
+ "epoch": 1.9777777777777779,
976
+ "grad_norm": 0.16169771552085876,
977
+ "learning_rate": 1.11e-05,
978
+ "loss": 0.1554,
979
+ "mean_token_accuracy": 0.9774592310190201,
980
+ "num_tokens": 2072389.0,
981
+ "step": 890
982
+ },
983
+ {
984
+ "epoch": 2.0,
985
+ "grad_norm": 0.11251533031463623,
986
+ "learning_rate": 1.0100000000000002e-05,
987
+ "loss": 0.1694,
988
+ "mean_token_accuracy": 0.9790247350931167,
989
+ "num_tokens": 2092938.0,
990
+ "step": 900
991
+ },
992
+ {
993
+ "epoch": 2.0,
994
+ "eval_loss": 0.1734553575515747,
995
+ "eval_mean_token_accuracy": 0.9805351263284683,
996
+ "eval_num_tokens": 2092938.0,
997
+ "eval_runtime": 40.8549,
998
+ "eval_samples_per_second": 4.895,
999
+ "eval_steps_per_second": 4.895,
1000
+ "step": 900
1001
+ },
1002
+ {
1003
+ "epoch": 2.022222222222222,
1004
+ "grad_norm": 0.14182798564434052,
1005
+ "learning_rate": 9.100000000000001e-06,
1006
+ "loss": 0.1593,
1007
+ "mean_token_accuracy": 0.9806110426783562,
1008
+ "num_tokens": 2117439.0,
1009
+ "step": 910
1010
+ },
1011
+ {
1012
+ "epoch": 2.0444444444444443,
1013
+ "grad_norm": 0.13165000081062317,
1014
+ "learning_rate": 8.1e-06,
1015
+ "loss": 0.1532,
1016
+ "mean_token_accuracy": 0.980982506275177,
1017
+ "num_tokens": 2143284.0,
1018
+ "step": 920
1019
+ },
1020
+ {
1021
+ "epoch": 2.066666666666667,
1022
+ "grad_norm": 0.13505029678344727,
1023
+ "learning_rate": 7.1e-06,
1024
+ "loss": 0.1602,
1025
+ "mean_token_accuracy": 0.9789509087800979,
1026
+ "num_tokens": 2166692.0,
1027
+ "step": 930
1028
+ },
1029
+ {
1030
+ "epoch": 2.088888888888889,
1031
+ "grad_norm": 0.16785164177417755,
1032
+ "learning_rate": 6.1e-06,
1033
+ "loss": 0.1637,
1034
+ "mean_token_accuracy": 0.9785576567053795,
1035
+ "num_tokens": 2189796.0,
1036
+ "step": 940
1037
+ },
1038
+ {
1039
+ "epoch": 2.111111111111111,
1040
+ "grad_norm": 0.1329684853553772,
1041
+ "learning_rate": 5.1e-06,
1042
+ "loss": 0.1497,
1043
+ "mean_token_accuracy": 0.9809095978736877,
1044
+ "num_tokens": 2215533.0,
1045
+ "step": 950
1046
+ },
1047
+ {
1048
+ "epoch": 2.111111111111111,
1049
+ "eval_loss": 0.17318324744701385,
1050
+ "eval_mean_token_accuracy": 0.9781925734877587,
1051
+ "eval_num_tokens": 2215533.0,
1052
+ "eval_runtime": 40.6397,
1053
+ "eval_samples_per_second": 4.921,
1054
+ "eval_steps_per_second": 4.921,
1055
+ "step": 950
1056
+ },
1057
+ {
1058
+ "epoch": 2.1333333333333333,
1059
+ "grad_norm": 0.15546564757823944,
1060
+ "learning_rate": 4.1000000000000006e-06,
1061
+ "loss": 0.1598,
1062
+ "mean_token_accuracy": 0.979330213367939,
1063
+ "num_tokens": 2238719.0,
1064
+ "step": 960
1065
+ },
1066
+ {
1067
+ "epoch": 2.1555555555555554,
1068
+ "grad_norm": 0.20469367504119873,
1069
+ "learning_rate": 3.1e-06,
1070
+ "loss": 0.1589,
1071
+ "mean_token_accuracy": 0.9795133844017982,
1072
+ "num_tokens": 2260964.0,
1073
+ "step": 970
1074
+ },
1075
+ {
1076
+ "epoch": 2.1777777777777776,
1077
+ "grad_norm": 0.14832386374473572,
1078
+ "learning_rate": 2.1000000000000002e-06,
1079
+ "loss": 0.158,
1080
+ "mean_token_accuracy": 0.9798341304063797,
1081
+ "num_tokens": 2284590.0,
1082
+ "step": 980
1083
+ },
1084
+ {
1085
+ "epoch": 2.2,
1086
+ "grad_norm": 0.1651660054922104,
1087
+ "learning_rate": 1.1e-06,
1088
+ "loss": 0.1664,
1089
+ "mean_token_accuracy": 0.9788132831454277,
1090
+ "num_tokens": 2306316.0,
1091
+ "step": 990
1092
+ },
1093
+ {
1094
+ "epoch": 2.2222222222222223,
1095
+ "grad_norm": 0.1315845400094986,
1096
+ "learning_rate": 1.0000000000000001e-07,
1097
+ "loss": 0.1641,
1098
+ "mean_token_accuracy": 0.978704534471035,
1099
+ "num_tokens": 2328618.0,
1100
+ "step": 1000
1101
+ },
1102
+ {
1103
+ "epoch": 2.2222222222222223,
1104
+ "eval_loss": 0.1731773316860199,
1105
+ "eval_mean_token_accuracy": 0.9781978166103363,
1106
+ "eval_num_tokens": 2328618.0,
1107
+ "eval_runtime": 40.7486,
1108
+ "eval_samples_per_second": 4.908,
1109
+ "eval_steps_per_second": 4.908,
1110
+ "step": 1000
1111
+ }
1112
+ ],
1113
+ "logging_steps": 10,
1114
+ "max_steps": 1000,
1115
+ "num_input_tokens_seen": 0,
1116
+ "num_train_epochs": 3,
1117
+ "save_steps": 100,
1118
+ "stateful_callbacks": {
1119
+ "TrainerControl": {
1120
+ "args": {
1121
+ "should_epoch_stop": false,
1122
+ "should_evaluate": false,
1123
+ "should_log": false,
1124
+ "should_save": true,
1125
+ "should_training_stop": true
1126
+ },
1127
+ "attributes": {}
1128
+ }
1129
+ },
1130
+ "total_flos": 1.1269467899136e+16,
1131
+ "train_batch_size": 1,
1132
+ "trial_name": null,
1133
+ "trial_params": null
1134
+ }
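The `trainer_state.json` above logs a training entry every 10 steps and an evaluation entry every 50 steps across 1000 steps (about 2.22 epochs). A minimal sketch, assuming the checkpoint directory has been downloaded locally, of pulling the loss curves out of `log_history`:

```python
import json

# Assumed local path to the downloaded checkpoint directory.
with open("checkpoint-1000/trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print("last train loss:", train[-1])  # e.g. (1000, 0.1641)
print("last eval loss:", evals[-1])   # e.g. (1000, 0.1731...)
```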
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4f24a1d1fcf3561539dba436a511a781c825b96e542ffcf6aba86ad917df122
3
+ size 6225
checkpoint-1000/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-200/README.md ADDED
@@ -0,0 +1,209 @@
1
+ ---
2
+ base_model: Maincode/Maincoder-1B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Maincode/Maincoder-1B
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.17.1
checkpoint-200/adapter_config.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Maincode/Maincoder-1B",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 16,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "v_proj",
29
+ "up_proj",
30
+ "o_proj",
31
+ "q_proj",
32
+ "gate_proj",
33
+ "down_proj",
34
+ "k_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
checkpoint-200/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dcb540ff4dc2540e2bd363ce28028787b6b6b4f6698a8537fe7d6f58e0f4e74
3
+ size 55111408
checkpoint-200/added_tokens.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoint-200/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-200/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d4a3eacd2d7009dd07e34c04f23b77a586b8ab047caa19c0a5fbb3835385cbd
3
+ size 110484131
checkpoint-200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:475ac8dac7f22b85ca45f829097f3c4fa9f2b7ba63ccc7dba229264b9b144b4f
3
+ size 14645
checkpoint-200/scaler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:124625e167eb28acbfc793cfcb3e8a08b32e7fea06501462bc9e420a5e1beb2a
3
+ size 1383
checkpoint-200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:793b45e356907c0dd0ce0a30dee131ac2edb92e56cb56cf8c95af4feabe3d5a8
3
+ size 1465
checkpoint-200/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-200/tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896