jprivera44 committed
Commit e0c406b · verified · 1 Parent(s): 873f673

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ checkpoint-241/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,208 @@
+ ---
+ base_model: unsloth/Llama-3.3-70B-Instruct
+ library_name: peft
+ pipeline_tag: text-generation
+ tags:
+ - base_model:adapter:unsloth/Llama-3.3-70B-Instruct
+ - lora
+ - transformers
+ - unsloth
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.18.1
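The card's "How to Get Started with the Model" section above is still a placeholder. A minimal loading sketch for an adapter like this one, assuming only what the commit itself states (the base model from adapter_config.json, PEFT 0.18.1 from the card); the adapter path is a hypothetical placeholder for wherever this repo is downloaded:

```python
# Minimal sketch, not code shipped with this repo: attach the LoRA adapter to its base model.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL = "unsloth/Llama-3.3-70B-Instruct"  # base_model_name_or_path in adapter_config.json
ADAPTER_PATH = "path/to/this/repo"             # hypothetical: a repo id or local checkout

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.bfloat16, device_map="auto"
)
model = PeftModel.from_pretrained(model, ADAPTER_PATH)  # loads adapter_model.safetensors

messages = [{"role": "user", "content": "Hello!"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output = model.generate(inputs, max_new_tokens=64)
print(tokenizer.decode(output[0][inputs.shape[1]:], skip_special_tokens=True))
```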
adapter_config.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "alora_invocation_tokens": null,
+   "alpha_pattern": {},
+   "arrow_config": null,
+   "auto_mapping": {
+     "base_model_class": "LlamaForCausalLM",
+     "parent_library": "transformers.models.llama.modeling_llama",
+     "unsloth_fixed": true
+   },
+   "base_model_name_or_path": "unsloth/Llama-3.3-70B-Instruct",
+   "bias": "none",
+   "corda_config": null,
+   "ensure_weight_tying": false,
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 64,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "peft_version": "0.18.1",
+   "qalora_group_size": 16,
+   "r": 64,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "up_proj",
+     "down_proj",
+     "o_proj",
+     "q_proj",
+     "gate_proj",
+     "v_proj",
+     "k_proj"
+   ],
+   "target_parameters": null,
+   "task_type": "CAUSAL_LM",
+   "trainable_token_indices": null,
+   "use_dora": false,
+   "use_qalora": false,
+   "use_rslora": false
+ }
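For orientation, the adapter hyperparameters above map onto a PEFT `LoraConfig` roughly as in this sketch (a reconstruction from the JSON, not code from the repo):

```python
# Sketch: a LoraConfig mirroring the key fields of adapter_config.json above.
from peft import LoraConfig

config = LoraConfig(
    r=64,                  # "r": the LoRA rank
    lora_alpha=64,         # "lora_alpha": scaling numerator, so alpha / r = 1.0
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[       # every attention and MLP projection in the Llama block
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)
```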
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:78b4ed940017aa43d9cbfa0752eb1a280f73c5264996fefd6c0844d61cc89f7a
+ size 3313653480
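Note that what's committed here is a Git LFS pointer, not the weights themselves; the `size` field says the real safetensors file is about 3.3 GB. If it's ever useful, such a pointer can be parsed with a few lines (a sketch, not part of this repo):

```python
# Sketch: parse a git-lfs pointer file into its version/oid/size fields.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:78b4ed940017aa43d9cbfa0752eb1a280f73c5264996fefd6c0844d61cc89f7a
size 3313653480"""
info = parse_lfs_pointer(pointer)
print(int(info["size"]))  # 3313653480 bytes, roughly 3.3 GB
```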
chat_template.jinja ADDED
@@ -0,0 +1,109 @@
+ {{- bos_token }}
+ {%- if custom_tools is defined %}
+ {%- set tools = custom_tools %}
+ {%- endif %}
+ {%- if not tools_in_user_message is defined %}
+ {%- set tools_in_user_message = true %}
+ {%- endif %}
+ {%- if not date_string is defined %}
+ {%- set date_string = "26 Jul 2024" %}
+ {%- endif %}
+ {%- if not tools is defined %}
+ {%- set tools = none %}
+ {%- endif %}
+
+ {#- This block extracts the system message, so we can slot it into the right place. #}
+ {%- if messages[0]['role'] == 'system' %}
+ {%- set system_message = messages[0]['content']|trim %}
+ {%- set messages = messages[1:] %}
+ {%- else %}
+ {%- set system_message = "" %}
+ {%- endif %}
+
+ {#- System message + builtin tools #}
+ {{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+ {%- if builtin_tools is defined or tools is not none %}
+ {{- "Environment: ipython\n" }}
+ {%- endif %}
+ {%- if builtin_tools is defined %}
+ {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
+ {%- endif %}
+ {{- "Cutting Knowledge Date: December 2023\n" }}
+ {{- "Today Date: " + date_string + "\n\n" }}
+ {%- if tools is not none and not tools_in_user_message %}
+ {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+ {{- "Do not use variables.\n\n" }}
+ {%- for t in tools %}
+ {{- t | tojson(indent=4) }}
+ {{- "\n\n" }}
+ {%- endfor %}
+ {%- endif %}
+ {{- system_message }}
+ {{- "<|eot_id|>" }}
+
+ {#- Custom tools are passed in a user message with some extra guidance #}
+ {%- if tools_in_user_message and not tools is none %}
+ {#- Extract the first user message so we can plug it in here #}
+ {%- if messages | length != 0 %}
+ {%- set first_user_message = messages[0]['content']|trim %}
+ {%- set messages = messages[1:] %}
+ {%- else %}
+ {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+ {%- endif %}
+ {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+ {{- "Given the following functions, please respond with a JSON for a function call " }}
+ {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+ {{- "Do not use variables.\n\n" }}
+ {%- for t in tools %}
+ {{- t | tojson(indent=4) }}
+ {{- "\n\n" }}
+ {%- endfor %}
+ {{- first_user_message + "<|eot_id|>"}}
+ {%- endif %}
+
+ {%- for message in messages %}
+ {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+ {%- elif 'tool_calls' in message %}
+ {%- if not message.tool_calls|length == 1 %}
+ {{- raise_exception("This model only supports single tool-calls at once!") }}
+ {%- endif %}
+ {%- set tool_call = message.tool_calls[0].function %}
+ {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+ {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+ {%- for arg_name, arg_val in tool_call.arguments | items %}
+ {{- arg_name + '="' + arg_val + '"' }}
+ {%- if not loop.last %}
+ {{- ", " }}
+ {%- endif %}
+ {%- endfor %}
+ {{- ")" }}
+ {%- else %}
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+ {{- '{"name": "' + tool_call.name + '", ' }}
+ {{- '"parameters": ' }}
+ {{- tool_call.arguments | tojson }}
+ {{- "}" }}
+ {%- endif %}
+ {%- if builtin_tools is defined %}
+ {#- This means we're in ipython mode #}
+ {{- "<|eom_id|>" }}
+ {%- else %}
+ {{- "<|eot_id|>" }}
+ {%- endif %}
+ {%- elif message.role == "tool" or message.role == "ipython" %}
+ {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+ {%- if message.content is mapping or message.content is iterable %}
+ {{- message.content | tojson }}
+ {%- else %}
+ {{- message.content }}
+ {%- endif %}
+ {{- "<|eot_id|>" }}
+ {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+ {%- endif %}
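This is the stock Llama 3.x chat template: it slots the system message (plus tool definitions, when given) into a system header, renders each turn between `<|start_header_id|>`/`<|eot_id|>` markers, and optionally appends an open assistant header. A quick sketch of exercising it through `tokenizer.apply_chat_template`, assuming the base model's tokenizer:

```python
# Sketch: render the chat template above for a plain (tool-free) conversation.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.3-70B-Instruct")
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What does a LoRA adapter do?"},
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)
# Expected shape: bos token, system header with the knowledge-cutoff and date lines,
# the system message, then the user turn, then an open assistant header.
```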
checkpoint-241/README.md ADDED
@@ -0,0 +1,208 @@
(contents identical to README.md above)
checkpoint-241/adapter_config.json ADDED
@@ -0,0 +1,50 @@
(contents identical to adapter_config.json above)
checkpoint-241/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:78b4ed940017aa43d9cbfa0752eb1a280f73c5264996fefd6c0844d61cc89f7a
+ size 3313653480
checkpoint-241/chat_template.jinja ADDED
@@ -0,0 +1,109 @@
(contents identical to chat_template.jinja above)
checkpoint-241/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:695cd7a8f68ce2a582c208593bc32b75450fbcf0da8c649de7ff6627f4b93b30
+ size 6627963827
checkpoint-241/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:61c19bab1174704a4a4441475683bf1270277af15d2e2c95e964789128e482c4
+ size 14645
checkpoint-241/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3bd216a79bf36a073b8de03df89b5078ad5217e1c0c2703644ab0fbbdc132b05
+ size 1465
checkpoint-241/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f2f90a0ee1b41702c7b233b02234294a53bc0684a08d3bcd8c8ff702e9a12f64
+ size 17210019
checkpoint-241/tokenizer_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "backend": "tokenizers",
+   "bos_token": "<|begin_of_text|>",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<|eot_id|>",
+   "from_slow": true,
+   "is_local": false,
+   "legacy": false,
+   "model_input_names": [
+     "input_ids",
+     "attention_mask"
+   ],
+   "model_max_length": 131072,
+   "pad_token": "<|finetune_right_pad_id|>",
+   "padding_side": "right",
+   "tokenizer_class": "TokenizersBackend",
+   "unk_token": null
+ }
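Worth noting in this config: padding uses the dedicated `<|finetune_right_pad_id|>` token on the right, the common setup for causal-LM fine-tuning. A small sketch of the effect, assuming the base model's tokenizer:

```python
# Sketch: the padding behaviour implied by tokenizer_config.json above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("unsloth/Llama-3.3-70B-Instruct")
tok.pad_token = "<|finetune_right_pad_id|>"  # "pad_token" in the config
tok.padding_side = "right"                   # "padding_side" in the config

batch = tok(["short", "a somewhat longer example"], padding=True, return_tensors="pt")
print(batch["input_ids"].shape)  # both rows padded out to the longer sequence
```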
checkpoint-241/trainer_state.json ADDED
@@ -0,0 +1,1721 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 241,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.004149377593360996,
14
+ "grad_norm": 0.5114469528198242,
15
+ "learning_rate": 2e-05,
16
+ "loss": 0.7995174527168274,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.008298755186721992,
21
+ "grad_norm": 0.5205491185188293,
22
+ "learning_rate": 2e-05,
23
+ "loss": 0.8812965750694275,
24
+ "step": 2
25
+ },
26
+ {
27
+ "epoch": 0.012448132780082987,
28
+ "grad_norm": 0.6057224273681641,
29
+ "learning_rate": 2e-05,
30
+ "loss": 0.8402022123336792,
31
+ "step": 3
32
+ },
33
+ {
34
+ "epoch": 0.016597510373443983,
35
+ "grad_norm": 0.5623906254768372,
36
+ "learning_rate": 2e-05,
37
+ "loss": 0.8188848495483398,
38
+ "step": 4
39
+ },
40
+ {
41
+ "epoch": 0.02074688796680498,
42
+ "grad_norm": 0.574876606464386,
43
+ "learning_rate": 2e-05,
44
+ "loss": 0.8380811214447021,
45
+ "step": 5
46
+ },
47
+ {
48
+ "epoch": 0.024896265560165973,
49
+ "grad_norm": 0.4625989496707916,
50
+ "learning_rate": 2e-05,
51
+ "loss": 0.7132218480110168,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.029045643153526972,
56
+ "grad_norm": 0.5183306336402893,
57
+ "learning_rate": 2e-05,
58
+ "loss": 0.8268325328826904,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 0.03319502074688797,
63
+ "grad_norm": 0.4928549826145172,
64
+ "learning_rate": 2e-05,
65
+ "loss": 0.7686080932617188,
66
+ "step": 8
67
+ },
68
+ {
69
+ "epoch": 0.03734439834024896,
70
+ "grad_norm": 0.4636511206626892,
71
+ "learning_rate": 2e-05,
72
+ "loss": 0.8444753289222717,
73
+ "step": 9
74
+ },
75
+ {
76
+ "epoch": 0.04149377593360996,
77
+ "grad_norm": 0.5008803606033325,
78
+ "learning_rate": 2e-05,
79
+ "loss": 0.6671140789985657,
80
+ "step": 10
81
+ },
82
+ {
83
+ "epoch": 0.04564315352697095,
84
+ "grad_norm": 0.49685290455818176,
85
+ "learning_rate": 2e-05,
86
+ "loss": 0.7625027894973755,
87
+ "step": 11
88
+ },
89
+ {
90
+ "epoch": 0.04979253112033195,
91
+ "grad_norm": 0.5161386728286743,
92
+ "learning_rate": 2e-05,
93
+ "loss": 0.5999635457992554,
94
+ "step": 12
95
+ },
96
+ {
97
+ "epoch": 0.05394190871369295,
98
+ "grad_norm": 0.46996110677719116,
99
+ "learning_rate": 2e-05,
100
+ "loss": 0.7389070987701416,
101
+ "step": 13
102
+ },
103
+ {
104
+ "epoch": 0.058091286307053944,
105
+ "grad_norm": 0.45131370425224304,
106
+ "learning_rate": 2e-05,
107
+ "loss": 0.6111957430839539,
108
+ "step": 14
109
+ },
110
+ {
111
+ "epoch": 0.06224066390041494,
112
+ "grad_norm": 0.4911205470561981,
113
+ "learning_rate": 2e-05,
114
+ "loss": 0.5750669240951538,
115
+ "step": 15
116
+ },
117
+ {
118
+ "epoch": 0.06639004149377593,
119
+ "grad_norm": 0.46468034386634827,
120
+ "learning_rate": 2e-05,
121
+ "loss": 0.6607809066772461,
122
+ "step": 16
123
+ },
124
+ {
125
+ "epoch": 0.07053941908713693,
126
+ "grad_norm": 0.5140272378921509,
127
+ "learning_rate": 2e-05,
128
+ "loss": 0.8089659214019775,
129
+ "step": 17
130
+ },
131
+ {
132
+ "epoch": 0.07468879668049792,
133
+ "grad_norm": 0.49761149287223816,
134
+ "learning_rate": 2e-05,
135
+ "loss": 0.8017055988311768,
136
+ "step": 18
137
+ },
138
+ {
139
+ "epoch": 0.07883817427385892,
140
+ "grad_norm": 0.45623964071273804,
141
+ "learning_rate": 2e-05,
142
+ "loss": 0.725612223148346,
143
+ "step": 19
144
+ },
145
+ {
146
+ "epoch": 0.08298755186721991,
147
+ "grad_norm": 0.4778558015823364,
148
+ "learning_rate": 2e-05,
149
+ "loss": 0.6465242505073547,
150
+ "step": 20
151
+ },
152
+ {
153
+ "epoch": 0.08713692946058091,
154
+ "grad_norm": 0.4813624620437622,
155
+ "learning_rate": 2e-05,
156
+ "loss": 0.6812542676925659,
157
+ "step": 21
158
+ },
159
+ {
160
+ "epoch": 0.0912863070539419,
161
+ "grad_norm": 0.45828455686569214,
162
+ "learning_rate": 2e-05,
163
+ "loss": 0.6355943083763123,
164
+ "step": 22
165
+ },
166
+ {
167
+ "epoch": 0.0954356846473029,
168
+ "grad_norm": 0.39770182967185974,
169
+ "learning_rate": 2e-05,
170
+ "loss": 0.734164297580719,
171
+ "step": 23
172
+ },
173
+ {
174
+ "epoch": 0.0995850622406639,
175
+ "grad_norm": 0.515662431716919,
176
+ "learning_rate": 2e-05,
177
+ "loss": 0.775545060634613,
178
+ "step": 24
179
+ },
180
+ {
181
+ "epoch": 0.1037344398340249,
182
+ "grad_norm": 0.4875846207141876,
183
+ "learning_rate": 2e-05,
184
+ "loss": 0.7608263492584229,
185
+ "step": 25
186
+ },
187
+ {
188
+ "epoch": 0.1078838174273859,
189
+ "grad_norm": 0.4272926449775696,
190
+ "learning_rate": 2e-05,
191
+ "loss": 0.655767560005188,
192
+ "step": 26
193
+ },
194
+ {
195
+ "epoch": 0.11203319502074689,
196
+ "grad_norm": 0.47189342975616455,
197
+ "learning_rate": 2e-05,
198
+ "loss": 0.6984891295433044,
199
+ "step": 27
200
+ },
201
+ {
202
+ "epoch": 0.11618257261410789,
203
+ "grad_norm": 0.49677926301956177,
204
+ "learning_rate": 2e-05,
205
+ "loss": 0.6952549815177917,
206
+ "step": 28
207
+ },
208
+ {
209
+ "epoch": 0.12033195020746888,
210
+ "grad_norm": 0.5341811776161194,
211
+ "learning_rate": 2e-05,
212
+ "loss": 0.6844781041145325,
213
+ "step": 29
214
+ },
215
+ {
216
+ "epoch": 0.12448132780082988,
217
+ "grad_norm": 0.49139678478240967,
218
+ "learning_rate": 2e-05,
219
+ "loss": 0.7043532729148865,
220
+ "step": 30
221
+ },
222
+ {
223
+ "epoch": 0.12863070539419086,
224
+ "grad_norm": 0.42113780975341797,
225
+ "learning_rate": 2e-05,
226
+ "loss": 0.6791371703147888,
227
+ "step": 31
228
+ },
229
+ {
230
+ "epoch": 0.13278008298755187,
231
+ "grad_norm": 0.490699827671051,
232
+ "learning_rate": 2e-05,
233
+ "loss": 0.66917484998703,
234
+ "step": 32
235
+ },
236
+ {
237
+ "epoch": 0.13692946058091288,
238
+ "grad_norm": 0.48269012570381165,
239
+ "learning_rate": 2e-05,
240
+ "loss": 0.6663049459457397,
241
+ "step": 33
242
+ },
243
+ {
244
+ "epoch": 0.14107883817427386,
245
+ "grad_norm": 0.4833972454071045,
246
+ "learning_rate": 2e-05,
247
+ "loss": 0.7479192018508911,
248
+ "step": 34
249
+ },
250
+ {
251
+ "epoch": 0.14522821576763487,
252
+ "grad_norm": 0.4521920382976532,
253
+ "learning_rate": 2e-05,
254
+ "loss": 0.5006750822067261,
255
+ "step": 35
256
+ },
257
+ {
258
+ "epoch": 0.14937759336099585,
259
+ "grad_norm": 0.4805753231048584,
260
+ "learning_rate": 2e-05,
261
+ "loss": 0.7437685132026672,
262
+ "step": 36
263
+ },
264
+ {
265
+ "epoch": 0.15352697095435686,
266
+ "grad_norm": 0.4702300429344177,
267
+ "learning_rate": 2e-05,
268
+ "loss": 0.7820006608963013,
269
+ "step": 37
270
+ },
271
+ {
272
+ "epoch": 0.15767634854771784,
273
+ "grad_norm": 0.4416898190975189,
274
+ "learning_rate": 2e-05,
275
+ "loss": 0.5911201238632202,
276
+ "step": 38
277
+ },
278
+ {
279
+ "epoch": 0.16182572614107885,
280
+ "grad_norm": 0.46818608045578003,
281
+ "learning_rate": 2e-05,
282
+ "loss": 0.6237752437591553,
283
+ "step": 39
284
+ },
285
+ {
286
+ "epoch": 0.16597510373443983,
287
+ "grad_norm": 0.38742795586586,
288
+ "learning_rate": 2e-05,
289
+ "loss": 0.6044095754623413,
290
+ "step": 40
291
+ },
292
+ {
293
+ "epoch": 0.17012448132780084,
294
+ "grad_norm": 0.4806065857410431,
295
+ "learning_rate": 2e-05,
296
+ "loss": 0.6341798901557922,
297
+ "step": 41
298
+ },
299
+ {
300
+ "epoch": 0.17427385892116182,
301
+ "grad_norm": 0.4329955279827118,
302
+ "learning_rate": 2e-05,
303
+ "loss": 0.621407687664032,
304
+ "step": 42
305
+ },
306
+ {
307
+ "epoch": 0.17842323651452283,
308
+ "grad_norm": 0.46890074014663696,
309
+ "learning_rate": 2e-05,
310
+ "loss": 0.7025566697120667,
311
+ "step": 43
312
+ },
313
+ {
314
+ "epoch": 0.1825726141078838,
315
+ "grad_norm": 0.4821957051753998,
316
+ "learning_rate": 2e-05,
317
+ "loss": 0.6547812819480896,
318
+ "step": 44
319
+ },
320
+ {
321
+ "epoch": 0.18672199170124482,
322
+ "grad_norm": 0.4716266691684723,
323
+ "learning_rate": 2e-05,
324
+ "loss": 0.6434807777404785,
325
+ "step": 45
326
+ },
327
+ {
328
+ "epoch": 0.1908713692946058,
329
+ "grad_norm": 0.5017584562301636,
330
+ "learning_rate": 2e-05,
331
+ "loss": 0.6461539268493652,
332
+ "step": 46
333
+ },
334
+ {
335
+ "epoch": 0.1950207468879668,
336
+ "grad_norm": 0.4837803244590759,
337
+ "learning_rate": 2e-05,
338
+ "loss": 0.6638780236244202,
339
+ "step": 47
340
+ },
341
+ {
342
+ "epoch": 0.1991701244813278,
343
+ "grad_norm": 0.4523409605026245,
344
+ "learning_rate": 2e-05,
345
+ "loss": 0.5731872916221619,
346
+ "step": 48
347
+ },
348
+ {
349
+ "epoch": 0.2033195020746888,
350
+ "grad_norm": 0.46308189630508423,
351
+ "learning_rate": 2e-05,
352
+ "loss": 0.6024616956710815,
353
+ "step": 49
354
+ },
355
+ {
356
+ "epoch": 0.2074688796680498,
357
+ "grad_norm": 0.4565693140029907,
358
+ "learning_rate": 2e-05,
359
+ "loss": 0.5795129537582397,
360
+ "step": 50
361
+ },
362
+ {
363
+ "epoch": 0.21161825726141079,
364
+ "grad_norm": 0.48081323504447937,
365
+ "learning_rate": 2e-05,
366
+ "loss": 0.6645175814628601,
367
+ "step": 51
368
+ },
369
+ {
370
+ "epoch": 0.2157676348547718,
371
+ "grad_norm": 0.4649989902973175,
372
+ "learning_rate": 2e-05,
373
+ "loss": 0.6339988112449646,
374
+ "step": 52
375
+ },
376
+ {
377
+ "epoch": 0.21991701244813278,
378
+ "grad_norm": 0.45999905467033386,
379
+ "learning_rate": 2e-05,
380
+ "loss": 0.6070005297660828,
381
+ "step": 53
382
+ },
383
+ {
384
+ "epoch": 0.22406639004149378,
385
+ "grad_norm": 0.43405112624168396,
386
+ "learning_rate": 2e-05,
387
+ "loss": 0.6078118085861206,
388
+ "step": 54
389
+ },
390
+ {
391
+ "epoch": 0.22821576763485477,
392
+ "grad_norm": 0.557212233543396,
393
+ "learning_rate": 2e-05,
394
+ "loss": 0.6502783894538879,
395
+ "step": 55
396
+ },
397
+ {
398
+ "epoch": 0.23236514522821577,
399
+ "grad_norm": 0.4206949472427368,
400
+ "learning_rate": 2e-05,
401
+ "loss": 0.5604119896888733,
402
+ "step": 56
403
+ },
404
+ {
405
+ "epoch": 0.23651452282157676,
406
+ "grad_norm": 0.4931945502758026,
407
+ "learning_rate": 2e-05,
408
+ "loss": 0.5463195443153381,
409
+ "step": 57
410
+ },
411
+ {
412
+ "epoch": 0.24066390041493776,
413
+ "grad_norm": 0.44888630509376526,
414
+ "learning_rate": 2e-05,
415
+ "loss": 0.49333369731903076,
416
+ "step": 58
417
+ },
418
+ {
419
+ "epoch": 0.24481327800829875,
420
+ "grad_norm": 0.4515199363231659,
421
+ "learning_rate": 2e-05,
422
+ "loss": 0.66854327917099,
423
+ "step": 59
424
+ },
425
+ {
426
+ "epoch": 0.24896265560165975,
427
+ "grad_norm": 0.46686026453971863,
428
+ "learning_rate": 2e-05,
429
+ "loss": 0.5279274582862854,
430
+ "step": 60
431
+ },
432
+ {
433
+ "epoch": 0.25311203319502074,
434
+ "grad_norm": 0.46663975715637207,
435
+ "learning_rate": 2e-05,
436
+ "loss": 0.6141489148139954,
437
+ "step": 61
438
+ },
439
+ {
440
+ "epoch": 0.2572614107883817,
441
+ "grad_norm": 0.45049089193344116,
442
+ "learning_rate": 2e-05,
443
+ "loss": 0.6643646955490112,
444
+ "step": 62
445
+ },
446
+ {
447
+ "epoch": 0.26141078838174275,
448
+ "grad_norm": 0.49262335896492004,
449
+ "learning_rate": 2e-05,
450
+ "loss": 0.6589719653129578,
451
+ "step": 63
452
+ },
453
+ {
454
+ "epoch": 0.26556016597510373,
455
+ "grad_norm": 0.5234288573265076,
456
+ "learning_rate": 2e-05,
457
+ "loss": 0.6250555515289307,
458
+ "step": 64
459
+ },
460
+ {
461
+ "epoch": 0.2697095435684647,
462
+ "grad_norm": 0.4657873809337616,
463
+ "learning_rate": 2e-05,
464
+ "loss": 0.5761417150497437,
465
+ "step": 65
466
+ },
467
+ {
468
+ "epoch": 0.27385892116182575,
469
+ "grad_norm": 2.8522613048553467,
470
+ "learning_rate": 2e-05,
471
+ "loss": 0.6810148358345032,
472
+ "step": 66
473
+ },
474
+ {
475
+ "epoch": 0.27800829875518673,
476
+ "grad_norm": 0.45667174458503723,
477
+ "learning_rate": 2e-05,
478
+ "loss": 0.5667203664779663,
479
+ "step": 67
480
+ },
481
+ {
482
+ "epoch": 0.2821576763485477,
483
+ "grad_norm": 0.48965880274772644,
484
+ "learning_rate": 2e-05,
485
+ "loss": 0.6057634949684143,
486
+ "step": 68
487
+ },
488
+ {
489
+ "epoch": 0.2863070539419087,
490
+ "grad_norm": 0.4700252115726471,
491
+ "learning_rate": 2e-05,
492
+ "loss": 0.5498369932174683,
493
+ "step": 69
494
+ },
495
+ {
496
+ "epoch": 0.29045643153526973,
497
+ "grad_norm": 0.4457707703113556,
498
+ "learning_rate": 2e-05,
499
+ "loss": 0.5500881671905518,
500
+ "step": 70
501
+ },
502
+ {
503
+ "epoch": 0.2946058091286307,
504
+ "grad_norm": 0.5242801904678345,
505
+ "learning_rate": 2e-05,
506
+ "loss": 0.6648991703987122,
507
+ "step": 71
508
+ },
509
+ {
510
+ "epoch": 0.2987551867219917,
511
+ "grad_norm": 0.4845593273639679,
512
+ "learning_rate": 2e-05,
513
+ "loss": 0.6495253443717957,
514
+ "step": 72
515
+ },
516
+ {
517
+ "epoch": 0.3029045643153527,
518
+ "grad_norm": 0.4535577595233917,
519
+ "learning_rate": 2e-05,
520
+ "loss": 0.6440762281417847,
521
+ "step": 73
522
+ },
523
+ {
524
+ "epoch": 0.3070539419087137,
525
+ "grad_norm": 0.4424896240234375,
526
+ "learning_rate": 2e-05,
527
+ "loss": 0.5427602529525757,
528
+ "step": 74
529
+ },
530
+ {
531
+ "epoch": 0.3112033195020747,
532
+ "grad_norm": 0.4791293144226074,
533
+ "learning_rate": 2e-05,
534
+ "loss": 0.6312339901924133,
535
+ "step": 75
536
+ },
537
+ {
538
+ "epoch": 0.3153526970954357,
539
+ "grad_norm": 0.49440717697143555,
540
+ "learning_rate": 2e-05,
541
+ "loss": 0.7304765582084656,
542
+ "step": 76
543
+ },
544
+ {
545
+ "epoch": 0.31950207468879666,
546
+ "grad_norm": 0.47376683354377747,
547
+ "learning_rate": 2e-05,
548
+ "loss": 0.5550855994224548,
549
+ "step": 77
550
+ },
551
+ {
552
+ "epoch": 0.3236514522821577,
553
+ "grad_norm": 0.5386195182800293,
554
+ "learning_rate": 2e-05,
555
+ "loss": 0.7627665996551514,
556
+ "step": 78
557
+ },
558
+ {
559
+ "epoch": 0.3278008298755187,
560
+ "grad_norm": 0.5139470100402832,
561
+ "learning_rate": 2e-05,
562
+ "loss": 0.7294001579284668,
563
+ "step": 79
564
+ },
565
+ {
566
+ "epoch": 0.33195020746887965,
567
+ "grad_norm": 0.5727441310882568,
568
+ "learning_rate": 2e-05,
569
+ "loss": 0.6094337105751038,
570
+ "step": 80
571
+ },
572
+ {
573
+ "epoch": 0.3360995850622407,
574
+ "grad_norm": 0.4475933313369751,
575
+ "learning_rate": 2e-05,
576
+ "loss": 0.6689184904098511,
577
+ "step": 81
578
+ },
579
+ {
580
+ "epoch": 0.34024896265560167,
581
+ "grad_norm": 0.48615196347236633,
582
+ "learning_rate": 2e-05,
583
+ "loss": 0.5170673727989197,
584
+ "step": 82
585
+ },
586
+ {
587
+ "epoch": 0.34439834024896265,
588
+ "grad_norm": 0.4444977939128876,
589
+ "learning_rate": 2e-05,
590
+ "loss": 0.5426638126373291,
591
+ "step": 83
592
+ },
593
+ {
594
+ "epoch": 0.34854771784232363,
595
+ "grad_norm": 0.4532429873943329,
596
+ "learning_rate": 2e-05,
597
+ "loss": 0.5246436595916748,
598
+ "step": 84
599
+ },
600
+ {
601
+ "epoch": 0.35269709543568467,
602
+ "grad_norm": 0.5425305962562561,
603
+ "learning_rate": 2e-05,
604
+ "loss": 0.7444034814834595,
605
+ "step": 85
606
+ },
607
+ {
608
+ "epoch": 0.35684647302904565,
609
+ "grad_norm": 0.4604993164539337,
610
+ "learning_rate": 2e-05,
611
+ "loss": 0.6390590071678162,
612
+ "step": 86
613
+ },
614
+ {
615
+ "epoch": 0.36099585062240663,
616
+ "grad_norm": 0.4503551423549652,
617
+ "learning_rate": 2e-05,
618
+ "loss": 0.7437008023262024,
619
+ "step": 87
620
+ },
621
+ {
622
+ "epoch": 0.3651452282157676,
623
+ "grad_norm": 0.473531037569046,
624
+ "learning_rate": 2e-05,
625
+ "loss": 0.5801289677619934,
626
+ "step": 88
627
+ },
628
+ {
629
+ "epoch": 0.36929460580912865,
630
+ "grad_norm": 0.43614616990089417,
631
+ "learning_rate": 2e-05,
632
+ "loss": 0.5945846438407898,
633
+ "step": 89
634
+ },
635
+ {
636
+ "epoch": 0.37344398340248963,
637
+ "grad_norm": 0.5157416462898254,
638
+ "learning_rate": 2e-05,
639
+ "loss": 0.5870503187179565,
640
+ "step": 90
641
+ },
642
+ {
643
+ "epoch": 0.3775933609958506,
644
+ "grad_norm": 0.4724714756011963,
645
+ "learning_rate": 2e-05,
646
+ "loss": 0.7136172652244568,
647
+ "step": 91
648
+ },
649
+ {
650
+ "epoch": 0.3817427385892116,
651
+ "grad_norm": 0.49608129262924194,
652
+ "learning_rate": 2e-05,
653
+ "loss": 0.5707521438598633,
654
+ "step": 92
655
+ },
656
+ {
657
+ "epoch": 0.38589211618257263,
658
+ "grad_norm": 0.4372619390487671,
659
+ "learning_rate": 2e-05,
660
+ "loss": 0.6751445531845093,
661
+ "step": 93
662
+ },
663
+ {
664
+ "epoch": 0.3900414937759336,
665
+ "grad_norm": 0.8502039909362793,
666
+ "learning_rate": 2e-05,
667
+ "loss": 0.7432682514190674,
668
+ "step": 94
669
+ },
670
+ {
671
+ "epoch": 0.3941908713692946,
672
+ "grad_norm": 0.43237465620040894,
673
+ "learning_rate": 2e-05,
674
+ "loss": 0.5463064908981323,
675
+ "step": 95
676
+ },
677
+ {
678
+ "epoch": 0.3983402489626556,
679
+ "grad_norm": 0.4683166444301605,
680
+ "learning_rate": 2e-05,
681
+ "loss": 0.5722454190254211,
682
+ "step": 96
683
+ },
684
+ {
685
+ "epoch": 0.4024896265560166,
686
+ "grad_norm": 0.49307140707969666,
687
+ "learning_rate": 2e-05,
688
+ "loss": 0.7676360011100769,
689
+ "step": 97
690
+ },
691
+ {
692
+ "epoch": 0.4066390041493776,
693
+ "grad_norm": 0.45873740315437317,
694
+ "learning_rate": 2e-05,
695
+ "loss": 0.7670221328735352,
696
+ "step": 98
697
+ },
698
+ {
699
+ "epoch": 0.4107883817427386,
700
+ "grad_norm": 0.522739589214325,
701
+ "learning_rate": 2e-05,
702
+ "loss": 0.6198732256889343,
703
+ "step": 99
704
+ },
705
+ {
706
+ "epoch": 0.4149377593360996,
707
+ "grad_norm": 0.513500988483429,
708
+ "learning_rate": 2e-05,
709
+ "loss": 0.6557285189628601,
710
+ "step": 100
711
+ },
712
+ {
713
+ "epoch": 0.4190871369294606,
714
+ "grad_norm": 0.5162559747695923,
715
+ "learning_rate": 2e-05,
716
+ "loss": 0.6777411699295044,
717
+ "step": 101
718
+ },
719
+ {
720
+ "epoch": 0.42323651452282157,
721
+ "grad_norm": 0.4742807447910309,
722
+ "learning_rate": 2e-05,
723
+ "loss": 0.5189216732978821,
724
+ "step": 102
725
+ },
726
+ {
727
+ "epoch": 0.42738589211618255,
728
+ "grad_norm": 0.3864991068840027,
729
+ "learning_rate": 2e-05,
730
+ "loss": 0.5397198796272278,
731
+ "step": 103
732
+ },
733
+ {
734
+ "epoch": 0.4315352697095436,
735
+ "grad_norm": 0.44808462262153625,
736
+ "learning_rate": 2e-05,
737
+ "loss": 0.5719993710517883,
738
+ "step": 104
739
+ },
740
+ {
741
+ "epoch": 0.43568464730290457,
742
+ "grad_norm": 0.5047919154167175,
743
+ "learning_rate": 2e-05,
744
+ "loss": 0.7246726751327515,
745
+ "step": 105
746
+ },
747
+ {
748
+ "epoch": 0.43983402489626555,
749
+ "grad_norm": 0.4501510262489319,
750
+ "learning_rate": 2e-05,
751
+ "loss": 0.5421350598335266,
752
+ "step": 106
753
+ },
754
+ {
755
+ "epoch": 0.44398340248962653,
756
+ "grad_norm": 0.5187399983406067,
757
+ "learning_rate": 2e-05,
758
+ "loss": 0.6851190328598022,
759
+ "step": 107
760
+ },
761
+ {
762
+ "epoch": 0.44813278008298757,
763
+ "grad_norm": 0.4442541003227234,
764
+ "learning_rate": 2e-05,
765
+ "loss": 0.7323095798492432,
766
+ "step": 108
767
+ },
768
+ {
769
+ "epoch": 0.45228215767634855,
770
+ "grad_norm": 0.4546023905277252,
771
+ "learning_rate": 2e-05,
772
+ "loss": 0.5949406027793884,
773
+ "step": 109
774
+ },
775
+ {
776
+ "epoch": 0.45643153526970953,
777
+ "grad_norm": 0.43765076994895935,
778
+ "learning_rate": 2e-05,
779
+ "loss": 0.5195109248161316,
780
+ "step": 110
781
+ },
782
+ {
783
+ "epoch": 0.4605809128630705,
784
+ "grad_norm": 0.6012418866157532,
785
+ "learning_rate": 2e-05,
786
+ "loss": 0.5891928672790527,
787
+ "step": 111
788
+ },
789
+ {
790
+ "epoch": 0.46473029045643155,
791
+ "grad_norm": 0.5350989699363708,
792
+ "learning_rate": 2e-05,
793
+ "loss": 0.7073556184768677,
794
+ "step": 112
795
+ },
796
+ {
797
+ "epoch": 0.46887966804979253,
798
+ "grad_norm": 0.40423402190208435,
799
+ "learning_rate": 2e-05,
800
+ "loss": 0.6081284284591675,
801
+ "step": 113
802
+ },
803
+ {
804
+ "epoch": 0.4730290456431535,
805
+ "grad_norm": 0.48459556698799133,
806
+ "learning_rate": 2e-05,
807
+ "loss": 0.7626031637191772,
808
+ "step": 114
809
+ },
810
+ {
811
+ "epoch": 0.47717842323651455,
812
+ "grad_norm": 0.5132282972335815,
813
+ "learning_rate": 2e-05,
814
+ "loss": 0.7070454359054565,
815
+ "step": 115
816
+ },
817
+ {
818
+ "epoch": 0.48132780082987553,
819
+ "grad_norm": 0.40754643082618713,
820
+ "learning_rate": 2e-05,
821
+ "loss": 0.7881268858909607,
822
+ "step": 116
823
+ },
824
+ {
825
+ "epoch": 0.4854771784232365,
826
+ "grad_norm": 0.46227574348449707,
827
+ "learning_rate": 2e-05,
828
+ "loss": 0.5589393973350525,
829
+ "step": 117
830
+ },
831
+ {
832
+ "epoch": 0.4896265560165975,
833
+ "grad_norm": 0.458891898393631,
834
+ "learning_rate": 2e-05,
835
+ "loss": 0.6076244711875916,
836
+ "step": 118
837
+ },
838
+ {
839
+ "epoch": 0.49377593360995853,
840
+ "grad_norm": 0.4314862787723541,
841
+ "learning_rate": 2e-05,
842
+ "loss": 0.58890700340271,
843
+ "step": 119
844
+ },
845
+ {
846
+ "epoch": 0.4979253112033195,
847
+ "grad_norm": 0.4849430322647095,
848
+ "learning_rate": 2e-05,
849
+ "loss": 0.7297042012214661,
850
+ "step": 120
851
+ },
852
+ {
853
+ "epoch": 0.5020746887966805,
854
+ "grad_norm": 0.4734286963939667,
855
+ "learning_rate": 2e-05,
856
+ "loss": 0.7929898500442505,
857
+ "step": 121
858
+ },
859
+ {
860
+ "epoch": 0.5062240663900415,
861
+ "grad_norm": 0.4982983469963074,
862
+ "learning_rate": 2e-05,
863
+ "loss": 0.6973749399185181,
864
+ "step": 122
865
+ },
866
+ {
867
+ "epoch": 0.5103734439834025,
868
+ "grad_norm": 0.4555007517337799,
869
+ "learning_rate": 2e-05,
870
+ "loss": 0.6363988518714905,
871
+ "step": 123
872
+ },
873
+ {
874
+ "epoch": 0.5145228215767634,
875
+ "grad_norm": 0.469707190990448,
876
+ "learning_rate": 2e-05,
877
+ "loss": 0.6936283111572266,
878
+ "step": 124
879
+ },
880
+ {
881
+ "epoch": 0.5186721991701245,
882
+ "grad_norm": 0.45310160517692566,
883
+ "learning_rate": 2e-05,
884
+ "loss": 0.8045607209205627,
885
+ "step": 125
886
+ },
887
+ {
888
+ "epoch": 0.5228215767634855,
889
+ "grad_norm": 0.5117340087890625,
890
+ "learning_rate": 2e-05,
891
+ "loss": 0.5602521300315857,
892
+ "step": 126
893
+ },
894
+ {
895
+ "epoch": 0.5269709543568465,
896
+ "grad_norm": 0.4890298545360565,
897
+ "learning_rate": 2e-05,
898
+ "loss": 0.5749447345733643,
899
+ "step": 127
900
+ },
901
+ {
902
+ "epoch": 0.5311203319502075,
903
+ "grad_norm": 0.4680368900299072,
904
+ "learning_rate": 2e-05,
905
+ "loss": 0.6603504419326782,
906
+ "step": 128
907
+ },
908
+ {
909
+ "epoch": 0.5352697095435685,
910
+ "grad_norm": 0.4364625811576843,
911
+ "learning_rate": 2e-05,
912
+ "loss": 0.6615546941757202,
913
+ "step": 129
914
+ },
915
+ {
916
+ "epoch": 0.5394190871369294,
917
+ "grad_norm": 0.44393712282180786,
918
+ "learning_rate": 2e-05,
919
+ "loss": 0.7206588387489319,
920
+ "step": 130
921
+ },
922
+ {
923
+ "epoch": 0.5435684647302904,
924
+ "grad_norm": 0.4770648777484894,
925
+ "learning_rate": 2e-05,
926
+ "loss": 0.5122599005699158,
927
+ "step": 131
928
+ },
929
+ {
930
+ "epoch": 0.5477178423236515,
931
+ "grad_norm": 0.4254826307296753,
932
+ "learning_rate": 2e-05,
933
+ "loss": 0.5919891595840454,
934
+ "step": 132
935
+ },
936
+ {
937
+ "epoch": 0.5518672199170125,
938
+ "grad_norm": 0.49948850274086,
939
+ "learning_rate": 2e-05,
940
+ "loss": 0.7168218493461609,
941
+ "step": 133
942
+ },
943
+ {
944
+ "epoch": 0.5560165975103735,
945
+ "grad_norm": 0.46940577030181885,
946
+ "learning_rate": 2e-05,
947
+ "loss": 0.559630274772644,
948
+ "step": 134
949
+ },
950
+ {
951
+ "epoch": 0.5601659751037344,
952
+ "grad_norm": 0.38155895471572876,
953
+ "learning_rate": 2e-05,
954
+ "loss": 0.35719043016433716,
955
+ "step": 135
956
+ },
957
+ {
958
+ "epoch": 0.5643153526970954,
959
+ "grad_norm": 0.446111798286438,
960
+ "learning_rate": 2e-05,
961
+ "loss": 0.5944488644599915,
962
+ "step": 136
963
+ },
964
+ {
965
+ "epoch": 0.5684647302904564,
966
+ "grad_norm": 0.44898721575737,
967
+ "learning_rate": 2e-05,
968
+ "loss": 0.6778333187103271,
969
+ "step": 137
970
+ },
971
+ {
972
+ "epoch": 0.5726141078838174,
973
+ "grad_norm": 0.4727020263671875,
974
+ "learning_rate": 2e-05,
975
+ "loss": 0.6683153510093689,
976
+ "step": 138
977
+ },
978
+ {
979
+ "epoch": 0.5767634854771784,
980
+ "grad_norm": 0.4775353968143463,
981
+ "learning_rate": 2e-05,
982
+ "loss": 0.7357037663459778,
983
+ "step": 139
984
+ },
985
+ {
986
+ "epoch": 0.5809128630705395,
987
+ "grad_norm": 0.5201453566551208,
988
+ "learning_rate": 2e-05,
989
+ "loss": 0.5672426819801331,
990
+ "step": 140
991
+ },
992
+ {
993
+ "epoch": 0.5850622406639004,
994
+ "grad_norm": 0.4446447491645813,
995
+ "learning_rate": 2e-05,
996
+ "loss": 0.6665009260177612,
997
+ "step": 141
998
+ },
999
+ {
1000
+ "epoch": 0.5892116182572614,
1001
+ "grad_norm": 0.44674625992774963,
1002
+ "learning_rate": 2e-05,
1003
+ "loss": 0.6256436705589294,
1004
+ "step": 142
1005
+ },
1006
+ {
1007
+ "epoch": 0.5933609958506224,
1008
+ "grad_norm": 0.48278629779815674,
1009
+ "learning_rate": 2e-05,
1010
+ "loss": 0.652278482913971,
1011
+ "step": 143
1012
+ },
1013
+ {
1014
+ "epoch": 0.5975103734439834,
1015
+ "grad_norm": 0.4608626067638397,
1016
+ "learning_rate": 2e-05,
1017
+ "loss": 0.687121570110321,
1018
+ "step": 144
1019
+ },
1020
+ {
1021
+ "epoch": 0.6016597510373444,
1022
+ "grad_norm": 0.5146644711494446,
1023
+ "learning_rate": 2e-05,
1024
+ "loss": 0.7759085297584534,
1025
+ "step": 145
1026
+ },
1027
+ {
1028
+ "epoch": 0.6058091286307054,
1029
+ "grad_norm": 0.4703519344329834,
1030
+ "learning_rate": 2e-05,
1031
+ "loss": 0.6268375515937805,
1032
+ "step": 146
1033
+ },
1034
+ {
1035
+ "epoch": 0.6099585062240664,
1036
+ "grad_norm": 0.4373490512371063,
1037
+ "learning_rate": 2e-05,
1038
+ "loss": 0.7350006699562073,
1039
+ "step": 147
1040
+ },
1041
+ {
1042
+ "epoch": 0.6141078838174274,
1043
+ "grad_norm": 0.48525917530059814,
1044
+ "learning_rate": 2e-05,
1045
+ "loss": 0.6609182357788086,
1046
+ "step": 148
1047
+ },
1048
+ {
1049
+ "epoch": 0.6182572614107884,
1050
+ "grad_norm": 0.509609043598175,
1051
+ "learning_rate": 2e-05,
1052
+ "loss": 0.7720542550086975,
1053
+ "step": 149
1054
+ },
1055
+ {
1056
+ "epoch": 0.6224066390041494,
1057
+ "grad_norm": 0.46813687682151794,
1058
+ "learning_rate": 2e-05,
1059
+ "loss": 0.658400297164917,
1060
+ "step": 150
1061
+ },
1062
+ {
1063
+ "epoch": 0.6265560165975104,
1064
+ "grad_norm": 0.48811477422714233,
1065
+ "learning_rate": 2e-05,
1066
+ "loss": 0.6340473890304565,
1067
+ "step": 151
1068
+ },
1069
+ {
1070
+ "epoch": 0.6307053941908713,
1071
+ "grad_norm": 0.48529860377311707,
1072
+ "learning_rate": 2e-05,
1073
+ "loss": 0.7543718218803406,
1074
+ "step": 152
1075
+ },
1076
+ {
+ "epoch": 0.6348547717842323,
+ "grad_norm": 0.4565221965312958,
+ "learning_rate": 2e-05,
+ "loss": 0.5810791254043579,
+ "step": 153
+ },
+ {
+ "epoch": 0.6390041493775933,
+ "grad_norm": 0.4667608141899109,
+ "learning_rate": 2e-05,
+ "loss": 0.5940293669700623,
+ "step": 154
+ },
+ {
+ "epoch": 0.6431535269709544,
+ "grad_norm": 0.476724773645401,
+ "learning_rate": 2e-05,
+ "loss": 0.5076797604560852,
+ "step": 155
+ },
+ {
+ "epoch": 0.6473029045643154,
+ "grad_norm": 0.48997762799263,
+ "learning_rate": 2e-05,
+ "loss": 0.5588229894638062,
+ "step": 156
+ },
+ {
+ "epoch": 0.6514522821576764,
+ "grad_norm": 0.4687066674232483,
+ "learning_rate": 2e-05,
+ "loss": 0.7414963245391846,
+ "step": 157
+ },
+ {
+ "epoch": 0.6556016597510373,
+ "grad_norm": 0.5096819400787354,
+ "learning_rate": 2e-05,
+ "loss": 0.6766090393066406,
+ "step": 158
+ },
+ {
+ "epoch": 0.6597510373443983,
+ "grad_norm": 0.40396353602409363,
+ "learning_rate": 2e-05,
+ "loss": 0.5890622735023499,
+ "step": 159
+ },
+ {
+ "epoch": 0.6639004149377593,
+ "grad_norm": 0.46985870599746704,
+ "learning_rate": 2e-05,
+ "loss": 0.5969380140304565,
+ "step": 160
+ },
+ {
+ "epoch": 0.6680497925311203,
+ "grad_norm": 0.49084073305130005,
+ "learning_rate": 2e-05,
+ "loss": 0.6371229887008667,
+ "step": 161
+ },
+ {
+ "epoch": 0.6721991701244814,
+ "grad_norm": 0.4466313123703003,
+ "learning_rate": 2e-05,
+ "loss": 0.6732550263404846,
+ "step": 162
+ },
+ {
+ "epoch": 0.6763485477178424,
+ "grad_norm": 0.4656016528606415,
+ "learning_rate": 2e-05,
+ "loss": 0.7082672119140625,
+ "step": 163
+ },
+ {
+ "epoch": 0.6804979253112033,
+ "grad_norm": 0.43604540824890137,
+ "learning_rate": 2e-05,
+ "loss": 0.5961745977401733,
+ "step": 164
+ },
+ {
+ "epoch": 0.6846473029045643,
+ "grad_norm": 0.45962008833885193,
+ "learning_rate": 2e-05,
+ "loss": 0.5974591374397278,
+ "step": 165
+ },
+ {
+ "epoch": 0.6887966804979253,
+ "grad_norm": 0.4566839635372162,
+ "learning_rate": 2e-05,
+ "loss": 0.5828849673271179,
+ "step": 166
+ },
+ {
+ "epoch": 0.6929460580912863,
+ "grad_norm": 0.38006696105003357,
+ "learning_rate": 2e-05,
+ "loss": 0.6747267246246338,
+ "step": 167
+ },
+ {
+ "epoch": 0.6970954356846473,
+ "grad_norm": 0.439981609582901,
+ "learning_rate": 2e-05,
+ "loss": 0.7797038555145264,
+ "step": 168
+ },
+ {
+ "epoch": 0.7012448132780082,
+ "grad_norm": 0.47687003016471863,
+ "learning_rate": 2e-05,
+ "loss": 0.570720911026001,
+ "step": 169
+ },
+ {
+ "epoch": 0.7053941908713693,
+ "grad_norm": 0.4829600155353546,
+ "learning_rate": 2e-05,
+ "loss": 0.5899892449378967,
+ "step": 170
+ },
+ {
+ "epoch": 0.7095435684647303,
+ "grad_norm": 0.4642188847064972,
+ "learning_rate": 2e-05,
+ "loss": 0.6866733431816101,
+ "step": 171
+ },
+ {
+ "epoch": 0.7136929460580913,
+ "grad_norm": 0.4619278013706207,
+ "learning_rate": 2e-05,
+ "loss": 0.5310846567153931,
+ "step": 172
+ },
+ {
+ "epoch": 0.7178423236514523,
+ "grad_norm": 0.40906423330307007,
+ "learning_rate": 2e-05,
+ "loss": 0.6505522131919861,
+ "step": 173
+ },
+ {
+ "epoch": 0.7219917012448133,
+ "grad_norm": 0.47687482833862305,
+ "learning_rate": 2e-05,
+ "loss": 0.6477482318878174,
+ "step": 174
+ },
+ {
+ "epoch": 0.7261410788381742,
+ "grad_norm": 0.4249359369277954,
+ "learning_rate": 2e-05,
+ "loss": 0.542078971862793,
+ "step": 175
+ },
+ {
+ "epoch": 0.7302904564315352,
+ "grad_norm": 0.4437820315361023,
+ "learning_rate": 2e-05,
+ "loss": 0.7326051592826843,
+ "step": 176
+ },
+ {
+ "epoch": 0.7344398340248963,
+ "grad_norm": 0.47250184416770935,
+ "learning_rate": 2e-05,
+ "loss": 0.7204862236976624,
+ "step": 177
+ },
+ {
+ "epoch": 0.7385892116182573,
+ "grad_norm": 0.45673149824142456,
+ "learning_rate": 2e-05,
+ "loss": 0.6894567608833313,
+ "step": 178
+ },
+ {
+ "epoch": 0.7427385892116183,
+ "grad_norm": 0.4065015912055969,
+ "learning_rate": 2e-05,
+ "loss": 0.5020947456359863,
+ "step": 179
+ },
+ {
+ "epoch": 0.7468879668049793,
+ "grad_norm": 0.480761855840683,
+ "learning_rate": 2e-05,
+ "loss": 0.652772843837738,
+ "step": 180
+ },
+ {
+ "epoch": 0.7510373443983402,
+ "grad_norm": 0.4796382784843445,
+ "learning_rate": 2e-05,
+ "loss": 0.5466834306716919,
+ "step": 181
+ },
+ {
+ "epoch": 0.7551867219917012,
+ "grad_norm": 0.427696168422699,
+ "learning_rate": 2e-05,
+ "loss": 0.46073320508003235,
+ "step": 182
+ },
+ {
+ "epoch": 0.7593360995850622,
+ "grad_norm": 0.4324597716331482,
+ "learning_rate": 2e-05,
+ "loss": 0.6211638450622559,
+ "step": 183
+ },
+ {
+ "epoch": 0.7634854771784232,
+ "grad_norm": 0.47733691334724426,
+ "learning_rate": 2e-05,
+ "loss": 0.6684774160385132,
+ "step": 184
+ },
+ {
+ "epoch": 0.7676348547717843,
+ "grad_norm": 0.431084007024765,
+ "learning_rate": 2e-05,
+ "loss": 0.6145834922790527,
+ "step": 185
+ },
+ {
+ "epoch": 0.7717842323651453,
+ "grad_norm": 0.5007755160331726,
+ "learning_rate": 2e-05,
+ "loss": 0.6526326537132263,
+ "step": 186
+ },
+ {
+ "epoch": 0.7759336099585062,
+ "grad_norm": 0.4393167793750763,
+ "learning_rate": 2e-05,
+ "loss": 0.6100775599479675,
+ "step": 187
+ },
+ {
+ "epoch": 0.7800829875518672,
+ "grad_norm": 0.4865422248840332,
+ "learning_rate": 2e-05,
+ "loss": 0.7980203032493591,
+ "step": 188
+ },
+ {
+ "epoch": 0.7842323651452282,
+ "grad_norm": 0.4837598502635956,
+ "learning_rate": 2e-05,
+ "loss": 0.5299490690231323,
+ "step": 189
+ },
+ {
+ "epoch": 0.7883817427385892,
+ "grad_norm": 0.5101847052574158,
+ "learning_rate": 2e-05,
+ "loss": 0.636174201965332,
+ "step": 190
+ },
+ {
+ "epoch": 0.7925311203319502,
+ "grad_norm": 0.481587678194046,
+ "learning_rate": 2e-05,
+ "loss": 0.584964394569397,
+ "step": 191
+ },
+ {
+ "epoch": 0.7966804979253111,
+ "grad_norm": 0.4833771288394928,
+ "learning_rate": 2e-05,
+ "loss": 0.660033643245697,
+ "step": 192
+ },
+ {
+ "epoch": 0.8008298755186722,
+ "grad_norm": 0.47723522782325745,
+ "learning_rate": 2e-05,
+ "loss": 0.5514160394668579,
+ "step": 193
+ },
+ {
+ "epoch": 0.8049792531120332,
+ "grad_norm": 0.46386954188346863,
+ "learning_rate": 2e-05,
+ "loss": 0.5447302460670471,
+ "step": 194
+ },
+ {
+ "epoch": 0.8091286307053942,
+ "grad_norm": 0.47975945472717285,
+ "learning_rate": 2e-05,
+ "loss": 0.6700522303581238,
+ "step": 195
+ },
+ {
+ "epoch": 0.8132780082987552,
+ "grad_norm": 0.45628130435943604,
+ "learning_rate": 2e-05,
+ "loss": 0.725788950920105,
+ "step": 196
+ },
+ {
+ "epoch": 0.8174273858921162,
+ "grad_norm": 0.5276447534561157,
+ "learning_rate": 2e-05,
+ "loss": 0.4795994460582733,
+ "step": 197
+ },
+ {
+ "epoch": 0.8215767634854771,
+ "grad_norm": 0.4197767376899719,
+ "learning_rate": 2e-05,
+ "loss": 0.5689822435379028,
+ "step": 198
+ },
+ {
+ "epoch": 0.8257261410788381,
+ "grad_norm": 0.4988608956336975,
+ "learning_rate": 2e-05,
+ "loss": 0.5570112466812134,
+ "step": 199
+ },
+ {
+ "epoch": 0.8298755186721992,
+ "grad_norm": 0.43889400362968445,
+ "learning_rate": 2e-05,
+ "loss": 0.5546621680259705,
+ "step": 200
+ },
+ {
+ "epoch": 0.8340248962655602,
+ "grad_norm": 0.4966701865196228,
+ "learning_rate": 2e-05,
+ "loss": 0.7806369066238403,
+ "step": 201
+ },
+ {
+ "epoch": 0.8381742738589212,
+ "grad_norm": 0.444965124130249,
+ "learning_rate": 2e-05,
+ "loss": 0.6175658702850342,
+ "step": 202
+ },
+ {
+ "epoch": 0.8423236514522822,
+ "grad_norm": 0.47721561789512634,
+ "learning_rate": 2e-05,
+ "loss": 0.608608603477478,
+ "step": 203
+ },
+ {
+ "epoch": 0.8464730290456431,
+ "grad_norm": 0.41363325715065,
+ "learning_rate": 2e-05,
+ "loss": 0.5362960696220398,
+ "step": 204
+ },
+ {
+ "epoch": 0.8506224066390041,
+ "grad_norm": 0.4979526102542877,
+ "learning_rate": 2e-05,
+ "loss": 0.6923606395721436,
+ "step": 205
+ },
+ {
+ "epoch": 0.8547717842323651,
+ "grad_norm": 0.4715823829174042,
+ "learning_rate": 2e-05,
+ "loss": 0.5849528312683105,
+ "step": 206
+ },
+ {
+ "epoch": 0.8589211618257261,
+ "grad_norm": 0.43941834568977356,
+ "learning_rate": 2e-05,
+ "loss": 0.5507952570915222,
+ "step": 207
+ },
+ {
+ "epoch": 0.8630705394190872,
+ "grad_norm": 0.6943396925926208,
+ "learning_rate": 2e-05,
+ "loss": 0.6139302253723145,
+ "step": 208
+ },
+ {
+ "epoch": 0.8672199170124482,
+ "grad_norm": 0.4135432541370392,
+ "learning_rate": 2e-05,
+ "loss": 0.6495124697685242,
+ "step": 209
+ },
+ {
+ "epoch": 0.8713692946058091,
+ "grad_norm": 0.4735243320465088,
+ "learning_rate": 2e-05,
+ "loss": 0.6073355674743652,
+ "step": 210
+ },
+ {
+ "epoch": 0.8755186721991701,
+ "grad_norm": 0.5081479549407959,
+ "learning_rate": 2e-05,
+ "loss": 0.5338884592056274,
+ "step": 211
+ },
+ {
+ "epoch": 0.8796680497925311,
+ "grad_norm": 0.44402876496315,
+ "learning_rate": 2e-05,
+ "loss": 0.5649405717849731,
+ "step": 212
+ },
+ {
+ "epoch": 0.8838174273858921,
+ "grad_norm": 0.4597266614437103,
+ "learning_rate": 2e-05,
+ "loss": 0.851700484752655,
+ "step": 213
+ },
+ {
+ "epoch": 0.8879668049792531,
+ "grad_norm": 0.49691715836524963,
+ "learning_rate": 2e-05,
+ "loss": 0.6800894141197205,
+ "step": 214
+ },
+ {
+ "epoch": 0.8921161825726142,
+ "grad_norm": 0.4347255825996399,
+ "learning_rate": 2e-05,
+ "loss": 0.6838465332984924,
+ "step": 215
+ },
+ {
+ "epoch": 0.8962655601659751,
+ "grad_norm": 0.4532018303871155,
+ "learning_rate": 2e-05,
+ "loss": 0.6527755856513977,
+ "step": 216
+ },
+ {
+ "epoch": 0.9004149377593361,
+ "grad_norm": 0.5003204941749573,
+ "learning_rate": 2e-05,
+ "loss": 0.6630940437316895,
+ "step": 217
+ },
+ {
+ "epoch": 0.9045643153526971,
+ "grad_norm": 0.4661204218864441,
+ "learning_rate": 2e-05,
+ "loss": 0.693079948425293,
+ "step": 218
+ },
+ {
+ "epoch": 0.9087136929460581,
+ "grad_norm": 0.4552728235721588,
+ "learning_rate": 2e-05,
+ "loss": 0.6484197974205017,
+ "step": 219
+ },
+ {
+ "epoch": 0.9128630705394191,
+ "grad_norm": 0.4681585133075714,
+ "learning_rate": 2e-05,
+ "loss": 0.6020994186401367,
+ "step": 220
+ },
+ {
+ "epoch": 0.91701244813278,
+ "grad_norm": 0.41022825241088867,
+ "learning_rate": 2e-05,
+ "loss": 0.530207097530365,
+ "step": 221
+ },
+ {
+ "epoch": 0.921161825726141,
+ "grad_norm": 0.39006152749061584,
+ "learning_rate": 2e-05,
+ "loss": 0.445180743932724,
+ "step": 222
+ },
+ {
+ "epoch": 0.9253112033195021,
+ "grad_norm": 0.4057929217815399,
+ "learning_rate": 2e-05,
+ "loss": 0.5387605428695679,
+ "step": 223
+ },
+ {
+ "epoch": 0.9294605809128631,
+ "grad_norm": 0.42876264452934265,
+ "learning_rate": 2e-05,
+ "loss": 0.5825240015983582,
+ "step": 224
+ },
+ {
+ "epoch": 0.9336099585062241,
+ "grad_norm": 0.48948875069618225,
+ "learning_rate": 2e-05,
+ "loss": 0.6396217942237854,
+ "step": 225
+ },
+ {
+ "epoch": 0.9377593360995851,
+ "grad_norm": 0.4649500548839569,
+ "learning_rate": 2e-05,
+ "loss": 0.4400583505630493,
+ "step": 226
+ },
+ {
+ "epoch": 0.941908713692946,
+ "grad_norm": 0.43061113357543945,
+ "learning_rate": 2e-05,
+ "loss": 0.5668185353279114,
+ "step": 227
+ },
+ {
+ "epoch": 0.946058091286307,
+ "grad_norm": 0.37659695744514465,
+ "learning_rate": 2e-05,
+ "loss": 0.3734014630317688,
+ "step": 228
+ },
+ {
+ "epoch": 0.950207468879668,
+ "grad_norm": 0.5160449743270874,
+ "learning_rate": 2e-05,
+ "loss": 0.7836225032806396,
+ "step": 229
+ },
+ {
+ "epoch": 0.9543568464730291,
+ "grad_norm": 0.5332698822021484,
+ "learning_rate": 2e-05,
+ "loss": 0.6564600467681885,
+ "step": 230
+ },
+ {
+ "epoch": 0.9585062240663901,
+ "grad_norm": 0.48597726225852966,
+ "learning_rate": 2e-05,
+ "loss": 0.7620537281036377,
+ "step": 231
+ },
+ {
+ "epoch": 0.9626556016597511,
+ "grad_norm": 0.437928169965744,
+ "learning_rate": 2e-05,
+ "loss": 0.5499407052993774,
+ "step": 232
+ },
+ {
+ "epoch": 0.966804979253112,
+ "grad_norm": 0.4861524701118469,
+ "learning_rate": 2e-05,
+ "loss": 0.6248472332954407,
+ "step": 233
+ },
+ {
+ "epoch": 0.970954356846473,
+ "grad_norm": 0.4638573229312897,
+ "learning_rate": 2e-05,
+ "loss": 0.5971051454544067,
+ "step": 234
+ },
+ {
+ "epoch": 0.975103734439834,
+ "grad_norm": 0.4368666410446167,
+ "learning_rate": 2e-05,
+ "loss": 0.5971348285675049,
+ "step": 235
+ },
+ {
+ "epoch": 0.979253112033195,
+ "grad_norm": 0.4261365830898285,
+ "learning_rate": 2e-05,
+ "loss": 0.5625735521316528,
+ "step": 236
+ },
+ {
+ "epoch": 0.983402489626556,
+ "grad_norm": 0.47601279616355896,
+ "learning_rate": 2e-05,
+ "loss": 0.518233597278595,
+ "step": 237
+ },
+ {
+ "epoch": 0.9875518672199171,
+ "grad_norm": 0.4935397803783417,
+ "learning_rate": 2e-05,
+ "loss": 0.7158107161521912,
+ "step": 238
+ },
+ {
+ "epoch": 0.991701244813278,
+ "grad_norm": 0.456167072057724,
+ "learning_rate": 2e-05,
+ "loss": 0.6627569198608398,
+ "step": 239
+ },
+ {
+ "epoch": 0.995850622406639,
+ "grad_norm": 0.4805908799171448,
+ "learning_rate": 2e-05,
+ "loss": 0.6887528896331787,
+ "step": 240
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.6356716156005859,
+ "learning_rate": 2e-05,
+ "loss": 0.65900057554245,
+ "step": 241
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 241,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.1048678841008456e+18,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+ }
checkpoint-241/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b03f7a5042997611c859c4828c7ddb2832a3adf3f268061d4844c0f03f64964
+ size 5265
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f2f90a0ee1b41702c7b233b02234294a53bc0684a08d3bcd8c8ff702e9a12f64
+ size 17210019
tokenizer_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "backend": "tokenizers",
+ "bos_token": "<|begin_of_text|>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "from_slow": true,
+ "is_local": false,
+ "legacy": false,
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|finetune_right_pad_id|>",
+ "padding_side": "right",
+ "tokenizer_class": "TokenizersBackend",
+ "unk_token": null
+ }
training_config.yaml ADDED
@@ -0,0 +1,25 @@
+ run_id: mo10_code_monitor
+ data:
+   path: experiments/260419_mo10/data/mo10_train.jsonl
+ model:
+   name: unsloth/Llama-3.3-70B-Instruct
+ training:
+   epochs: 1
+   batch_size: 8
+   gradient_accumulation_steps: 1
+   learning_rate: 2.0e-05
+   adapter_path: experiments/260409_b200_unsloth/output/mo9c
+   shuffle_seed: 42
+   max_seq_length: 4096
+   save_total_limit: 1
+ lora:
+   rank: 64
+   alpha: 64
+   dropout: 0.0
+   target_modules: all-linear
+ logging:
+   wandb_project: collusion-mo-finetune
+   wandb_run_name: mo10_code_monitor
+   require_wandb: true
+   log_every_n_steps: 1
+   save_every_n_steps: 500